polyfuzz.models.EditDistance
¶
Calculate the Edit Distance between lists of strings using any distance/similarity based scorer
Parameters:
Name | Type | Description | Default |
---|---|---|---|
n_jobs |
int |
Nr of parallel processes, use -1 to use all cores |
1 |
scorer |
Callable |
The scorer function to be used to calculate the edit distance. This function should give back a float between 0 and 1, and work as follows: scorer("string_one", "string_two") |
fuzz.ratio |
model_id |
str |
The name of the particular instance, used when comparing models |
None |
normalize |
bool |
Whether to min-max normalize the similarity scores returned by match |
True |
Usage:
from rapidfuzz import fuzz
model = EditDistance(n_jobs=-1, scorer=fuzz.WRatio)
Source code in polyfuzz\models\_distance.py
class EditDistance(BaseMatcher):
    """
    Calculate the Edit Distance between lists of strings using any distance/similarity based scorer

    Arguments:
        n_jobs: Nr of parallel processes, use -1 to use all cores
        scorer: The scorer function to be used to calculate the edit distance.
                This function should give back a float between 0 and 1, and work as follows:
                    scorer("string_one", "string_two")
        model_id: The name of the particular instance, used when comparing models
        normalize: Whether to min-max rescale the "Similarity" column of each
                   `match` result so that it spans 0 to 1

    Usage:

    ```python
    from rapidfuzz import fuzz
    model = EditDistance(n_jobs=-1, scorer=fuzz.WRatio)
    ```
    """
    def __init__(self,
                 n_jobs: int = 1,
                 scorer: Callable = fuzz.ratio,
                 model_id: str = None,
                 normalize: bool = True):
        super().__init__(model_id)
        self.type = "EditDistance"
        self.scorer = scorer
        self.normalize = normalize

        # True only while matching a list against itself; refreshed by every match() call
        self.equal_lists = False

        # -1 is shorthand for "use every available core"
        self.n_jobs = cpu_count() if n_jobs == -1 else n_jobs

    def match(self,
              from_list: List[str],
              to_list: List[str] = None,
              **kwargs) -> pd.DataFrame:
        """ Calculate the edit distances between two lists of strings
        by parallelizing the calculation and passing the lists in
        batches.

        Arguments:
            from_list: The list from which you want mappings
            to_list: The list where you want to map to. If None, `from_list`
                     is matched against itself and every string is compared
                     with the other entries of the list.

        Returns:
            matches: The best matches between the lists of strings, as a
                     DataFrame with the columns "From", "To" and "Similarity".
                     With `normalize=True` the similarities are min-max
                     rescaled; if all scores are identical (e.g. a single
                     row) rescaling is undefined and the raw scores are kept.

        Usage:

        ```python
        from rapidfuzz import fuzz
        model = EditDistance(n_jobs=-1, scorer=fuzz.WRatio)
        matches = model.match(["string_one", "string_two"],
                              ["string_three", "string_four"])
        ```
        """
        # Recompute the flag on every call: a stale True from an earlier
        # self-match must not leak into a later two-list match.
        self.equal_lists = to_list is None
        if self.equal_lists:
            expected_iterations = int(len(from_list) / 2)
            to_list = list(from_list)
        else:
            expected_iterations = len(from_list)

        matches = Parallel(n_jobs=self.n_jobs)(delayed(self._calculate_edit_distance)
                                               (from_string, to_list)
                                               for from_string in tqdm(from_list, total=expected_iterations,
                                                                       disable=True))
        matches = pd.DataFrame(matches, columns=['From', "To", "Similarity"])

        if self.normalize and not matches.empty:
            scores = matches["Similarity"]
            lowest, highest = scores.min(), scores.max()
            # A zero range would divide by zero and turn every score into NaN,
            # so only rescale when there is an actual spread.
            if highest > lowest:
                matches["Similarity"] = (scores - lowest) / (highest - lowest)
        return matches

    def _calculate_edit_distance(self,
                                 from_string: str,
                                 to_list: List[str]) -> Tuple[str, str, float]:
        """ Calculate the edit distance between a string and a list

        Arguments:
            from_string: The string to find the best match for
            to_list: The candidate strings; never modified by this method

        Returns:
            A tuple (from_string, best matching string, score). If there is
            no candidate left to compare with, the match is None with a
            score of 0.0.
        """
        candidates = to_list
        if self.equal_lists:
            # Drop one occurrence of the string itself, but on a private copy:
            # the same list object is handed to every task, so removing from
            # it in place would shrink the candidates of all later strings.
            candidates = list(to_list)
            if from_string in candidates:
                candidates.remove(from_string)

        if not candidates:
            # np.argmax raises on an empty sequence; report "no match" instead
            return from_string, None, 0.0

        scores = [self.scorer(from_string, candidate) for candidate in candidates]
        best = int(np.argmax(scores))
        return from_string, candidates[best], scores[best]
match(self, from_list, to_list=None, **kwargs)
¶
Calculate the edit distances between two lists of strings by parallelizing the calculation and passing the lists in batches.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
from_list |
List[str] |
The list from which you want mappings |
required |
to_list |
List[str] |
The list where you want to map to |
None |
Returns:
Type | Description |
---|---|
pd.DataFrame |
The best matches between the lists of strings |
Usage:
from rapidfuzz import fuzz
model = EditDistance(n_jobs=-1, scorer=fuzz.WRatio)
matches = model.match(["string_one", "string_two"],
["string_three", "string_four"])
Source code in polyfuzz\models\_distance.py
def match(self,
          from_list: List[str],
          to_list: List[str] = None,
          **kwargs) -> pd.DataFrame:
    """ Calculate the edit distances between two lists of strings
    by parallelizing the calculation and passing the lists in
    batches.

    Arguments:
        from_list: The list from which you want mappings
        to_list: The list where you want to map to. If None, `from_list`
                 is matched against a copy of itself.

    Returns:
        matches: The best matches between the lists of strings, as a
                 DataFrame with the columns "From", "To" and "Similarity".
                 With `normalize=True` the similarities are min-max
                 rescaled; if all scores are identical (e.g. a single
                 row) rescaling is undefined and the raw scores are kept.

    Usage:

    ```python
    from rapidfuzz import fuzz
    model = EditDistance(n_jobs=-1, scorer=fuzz.WRatio)
    matches = model.match(["string_one", "string_two"],
                          ["string_three", "string_four"])
    ```
    """
    # Recompute the flag on every call: a stale True from an earlier
    # self-match must not leak into a later two-list match.
    self.equal_lists = to_list is None
    if self.equal_lists:
        expected_iterations = int(len(from_list) / 2)
        to_list = list(from_list)
    else:
        expected_iterations = len(from_list)

    matches = Parallel(n_jobs=self.n_jobs)(delayed(self._calculate_edit_distance)
                                           (from_string, to_list)
                                           for from_string in tqdm(from_list, total=expected_iterations,
                                                                   disable=True))
    matches = pd.DataFrame(matches, columns=['From', "To", "Similarity"])

    if self.normalize and not matches.empty:
        scores = matches["Similarity"]
        lowest, highest = scores.min(), scores.max()
        # A zero range would divide by zero and turn every score into NaN,
        # so only rescale when there is an actual spread.
        if highest > lowest:
            matches["Similarity"] = (scores - lowest) / (highest - lowest)
    return matches