Skip to content

polyfuzz.models.RapidFuzz

Calculate the Edit Distance between lists of strings using RapidFuzz's process function

We are using RapidFuzz instead of FuzzyWuzzy since it is much faster and does not require the more restrictive GPL license

Parameters:

Name Type Description Default
n_jobs int

Number of parallel processes; use -1 to use all cores.

1
score_cutoff float

The minimum similarity for which to return a good match. Should be between 0 and 1.

0
scorer Callable

The scorer function to be used to calculate the edit distance. Options: * fuzz.ratio * fuzz.partial_ratio * fuzz.token_sort_ratio * fuzz.partial_token_sort_ratio * fuzz.token_set_ratio * fuzz.partial_token_set_ratio * fuzz.token_ratio * fuzz.partial_token_ratio * fuzz.WRatio * fuzz.QRatio. See https://maxbachmann.github.io/rapidfuzz/usage/fuzz/ for an extensive description of the scoring methods.

fuzz.WRatio
model_id str

The name of the particular instance, used when comparing models

None

Usage:

from rapidfuzz import fuzz
model = RapidFuzz(n_jobs=-1, score_cutoff=0.5, scorer=fuzz.WRatio)
Source code in polyfuzz\models\_rapidfuzz.py
class RapidFuzz(BaseMatcher):
    """
    Calculate the Edit Distance between lists of strings using RapidFuzz's process function

    We are using RapidFuzz instead of FuzzyWuzzy since it is much faster
    and does not require the more restrictive GPL license

    Arguments:
        n_jobs: Number of parallel processes, use -1 to use all cores
        score_cutoff: The minimum similarity for which to return a good match.
                      Should be between 0 and 1.
        scorer: The scorer function to be used to calculate the edit distance
                Options:
                    * fuzz.ratio
                    * fuzz.partial_ratio
                    * fuzz.token_sort_ratio
                    * fuzz.partial_token_sort_ratio
                    * fuzz.token_set_ratio
                    * fuzz.partial_token_set_ratio
                    * fuzz.token_ratio
                    * fuzz.partial_token_ratio
                    * fuzz.WRatio
                    * fuzz.QRatio
                See https://maxbachmann.github.io/rapidfuzz/usage/fuzz/ for an extensive
                description of the scoring methods.
        model_id: The name of the particular instance, used when comparing models

    Usage:

    ```python
    from rapidfuzz import fuzz
    model = RapidFuzz(n_jobs=-1, score_cutoff=0.5, scorer=fuzz.WRatio)
    ```
    """
    def __init__(self,
                 n_jobs: int = 1,
                 score_cutoff: float = 0,
                 scorer: Callable = fuzz.WRatio,
                 model_id: str = None):
        super().__init__(model_id)
        self.type = "EditDistance"
        # The public cutoff is expressed on a 0-1 scale; it is multiplied by 100
        # here and the scores are divided by 100 again before being returned.
        self.score_cutoff = score_cutoff * 100
        self.scorer = scorer
        # True only while matching a list against itself (see `match`).
        self.equal_lists = False

        self.n_jobs = cpu_count() if n_jobs == -1 else n_jobs

    def match(self,
              from_list: List[str],
              to_list: List[str] = None,
              **kwargs) -> pd.DataFrame:
        """ Calculate the edit distances between two lists of strings
        by dispatching one task per string of `from_list` through
        `Parallel` with `n_jobs` workers.

        Arguments:
            from_list: The list from which you want mappings
            to_list: The list where you want to map to. If None, `from_list`
                     is matched against itself and each string is excluded
                     from its own candidates.
            **kwargs: Accepted but not used by this matcher

        Returns:
            matches: The best matches between the lists of strings, as a
                     DataFrame with the columns "From", "To" and "Similarity"

        Usage:

        ```python
        from rapidfuzz import fuzz
        model = RapidFuzz(n_jobs=-1, score_cutoff=0.5, scorer=fuzz.WRatio)
        matches = model.match(["string_one", "string_two"],
                              ["string_three", "string_four"])
        ```
        """
        # Recompute the flag on every call: previously it stayed True after a
        # self-match, so a later two-list match on the same instance still
        # tried to remove each source string from `to_list`.
        self.equal_lists = to_list is None
        if self.equal_lists:
            to_list = list(from_list)

        # The progress bar is disabled, so tqdm only forwards the items.
        matches = Parallel(n_jobs=self.n_jobs)(delayed(self._calculate_edit_distance)
                                               (from_string, to_list)
                                               for from_string in tqdm(from_list, total=len(from_list),
                                                                       disable=True))
        matches = pd.DataFrame(matches, columns=['From', "To", "Similarity"])
        return matches

    def _calculate_edit_distance(self,
                                 from_string: str,
                                 to_list: List[str]) -> Tuple[str, Union[str, None], float]:
        """ Calculate the edit distance between a string and a list

        Arguments:
            from_string: The string for which the best match is searched
            to_list: The candidate strings; this list is never modified

        Returns:
            A tuple of the source string, its best match (None when no
            candidate reaches the score cutoff) and the similarity on a
            0-1 scale (0. when there is no match)
        """
        candidates = to_list
        if self.equal_lists:
            # Work on a private copy. Removing from the shared list in place made
            # the candidates shrink from task to task, so the outcome depended on
            # the order in which tasks ran and on whether workers shared the list.
            candidates = list(to_list)
            try:
                # Drop a single occurrence so the string is not matched with itself.
                candidates.remove(from_string)
            except ValueError:
                pass

        match = process.extractOne(from_string, candidates,
                                   score_cutoff=self.score_cutoff,
                                   scorer=self.scorer)

        if match:
            return from_string, match[0], match[1] / 100
        else:
            return from_string, None, 0.

match(self, from_list, to_list=None, **kwargs)

Calculate the edit distances between two lists of strings by parallelizing the calculation and passing the lists in batches.

Parameters:

Name Type Description Default
from_list List[str]

The list from which you want mappings

required
to_list List[str]

The list where you want to map to

None

Returns:

Type Description
matches

The best matches between the lists of strings

Usage:

from rapidfuzz import fuzz
model = RapidFuzz(n_jobs=-1, score_cutoff=0.5, scorer=fuzz.WRatio)
matches = model.match(["string_one", "string_two"],
                      ["string_three", "string_four"])
Source code in polyfuzz\models\_rapidfuzz.py
def match(self,
          from_list: List[str],
          to_list: List[str] = None,
          **kwargs) -> pd.DataFrame:
    """ Calculate the edit distances between two lists of strings
    by dispatching one task per string of `from_list` through
    `Parallel` with `n_jobs` workers.

    Arguments:
        from_list: The list from which you want mappings
        to_list: The list where you want to map to. If None, a copy of
                 `from_list` is used as the target list.
        **kwargs: Accepted but not used by this matcher

    Returns:
        matches: The best matches between the lists of strings, as a
                 DataFrame with the columns "From", "To" and "Similarity"

    Usage:

    ```python
    from rapidfuzz import fuzz
    model = RapidFuzz(n_jobs=-1, score_cutoff=0.5, scorer=fuzz.WRatio)
    matches = model.match(["string_one", "string_two"],
                          ["string_three", "string_four"])
    ```
    """
    # Recompute the flag on every call: previously it stayed True after a
    # self-match and leaked into later two-list matches on the same instance.
    self.equal_lists = to_list is None
    if self.equal_lists:
        # list(...) copies any sequence, not only objects that provide .copy()
        to_list = list(from_list)

    # The progress bar is disabled, so tqdm only forwards the items.
    matches = Parallel(n_jobs=self.n_jobs)(delayed(self._calculate_edit_distance)
                                           (from_string, to_list)
                                           for from_string in tqdm(from_list, total=len(from_list),
                                                                   disable=True))
    matches = pd.DataFrame(matches, columns=['From', "To", "Similarity"])
    return matches