polyfuzz.polyfuzz.PolyFuzz
¶
PolyFuzz class for Fuzzy string matching, grouping, and evaluation.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
method |
Union[str, polyfuzz.models._base.BaseMatcher, List[polyfuzz.models._base.BaseMatcher]] |
the method(s) used for matching. For quick selection of models select one of the following: "EditDistance", "TF-IDF" or "Embeddings". If you want more control over the models above, pass in a model from polyfuzz.models. For examples, see usage below. |
'TF-IDF' |
verbose |
bool |
Changes the verbosity of the model, Set to True if you want to track the stages of the model. |
False |
Usage:
For basic, out-of-the-box usage, run the code below. You can replace "TF-IDF" with either "EditDistance" or "Embeddings" for quick access to these models:
import polyfuzz as pf
model = pf.PolyFuzz("TF-IDF")
If you want more control over the String Matching models, you can load in these models separately:
tfidf = TFIDF(n_gram_range=(3, 3), min_similarity=0, model_id="TF-IDF-Sklearn")
model = pf.PolyFuzz(tfidf)
You can also select multiple models in order to compare performance:
tfidf = TFIDF(n_gram_range=(3, 3), min_similarity=0, model_id="TF-IDF-Sklearn")
edit = EditDistance(n_jobs=-1)
model = pf.PolyFuzz([tfidf, edit])
You can use embedding model, like Flair:
from flair.embeddings import WordEmbeddings, TransformerWordEmbeddings
fasttext_embedding = WordEmbeddings('news')
bert_embedding = TransformerWordEmbeddings('bert-base-multilingual-cased')
embedding = Embeddings([fasttext_embedding, bert_embedding ], min_similarity=0.0)
model = pf.PolyFuzz(embedding)
Source code in polyfuzz\polyfuzz.py
class PolyFuzz:
"""
PolyFuzz class for Fuzzy string matching, grouping, and evaluation.
Arguments:
method: the method(s) used for matching. For quick selection of models
select one of the following: "EditDistance", "TF-IDF" or "Embeddings".
If you want more control over the models above, pass
in a model from polyfuzz.models. For examples, see
usage below.
verbose: Changes the verbosity of the model, Set to True if you want
to track the stages of the model.
Usage:
For basic, out-of-the-box usage, run the code below. You can replace "TF-IDF"
with either "EditDistance" or "Embeddings" for quick access to these models:
```python
import polyfuzz as pf
model = pf.PolyFuzz("TF-IDF")
```
If you want more control over the String Matching models, you can load
in these models separately:
```python
tfidf = TFIDF(n_gram_range=(3, 3), min_similarity=0, model_id="TF-IDF-Sklearn")
model = pf.PolyFuzz(tfidf)
```
You can also select multiple models in order to compare performance:
```python
tfidf = TFIDF(n_gram_range=(3, 3), min_similarity=0, model_id="TF-IDF-Sklearn")
edit = EditDistance(n_jobs=-1)
model = pf.PolyFuzz([tfidf, edit])
```
You can use embedding model, like Flair:
```python
from flair.embeddings import WordEmbeddings, TransformerWordEmbeddings
fasttext_embedding = WordEmbeddings('news')
bert_embedding = TransformerWordEmbeddings('bert-base-multilingual-cased')
embedding = Embeddings([fasttext_embedding, bert_embedding ], min_similarity=0.0)
model = pf.PolyFuzz(embedding)
```
"""
def __init__(self,
method: Union[str,
BaseMatcher,
List[BaseMatcher]] = "TF-IDF",
verbose: bool = False):
self.method = method
self.matches = None
# Metrics
self.min_precisions = None
self.recalls = None
self.average_precisions = None
# Cluster
self.clusters = None
self.cluster_mappings = None
self.grouped_matches = None
if verbose:
logger.setLevel(logging.DEBUG)
else:
logger.setLevel(logging.WARNING)
def match(self,
from_list: List[str],
to_list: List[str] = None,
top_n: int = 1):
""" Match the from_list of strings to the to_list of strings with whatever models
you have initialized
Arguments:
from_list: The list from which you want mappings.
If you want to map items within a list, and not map the
items to themselves, you can supply only the `from_list` and
ignore the `to_list`.
to_list: The list where you want to map to
top_n: The number of matches you want returned. This is currently only implemented
for `polyfuzz.models.TFIDF` and `polyfuzz.models.Embeddings` as they
can computationally handle more comparisons.
Updates:
self.matches: A dictionary with the matches from all models, can
be accessed with `model.get_all_matches` or
`model.get_match("TF-IDF")`
Usage:
After having initialized your models, you can pass through lists of strings:
```python
import polyfuzz as pf
model = pf.PolyFuzz("TF-IDF", model_id="TF-IDF")
model.match(from_list = ["string_one", "string_two"],
to_list = ["string_three", "string_four"])
```
You can access the results matches with `model.get_all_matches` or a specific
model with `model.get_match("TF-IDF")` based on their model_id.
"""
# Standard models - quick access
if isinstance(self.method, str):
if self.method in ["TF-IDF", "TFIDF"]:
self.method = TFIDF(min_similarity=0, top_n=top_n)
self.matches = {"TF-IDF": self.method.match(from_list, to_list)}
elif self.method in ["EditDistance", "Edit Distance"]:
self.method = RapidFuzz()
self.matches = {"EditDistance": self.method.match(from_list, to_list)}
elif self.method in ["Embeddings", "Embedding"]:
self.method = Embeddings(min_similarity=0, top_n=top_n)
self.matches = {"Embeddings": self.method.match(from_list, to_list)}
else:
raise ValueError("Please instantiate the model with one of the following methods: \n"
"* 'TF-IDF'\n"
"* 'EditDistance'\n"
"* 'Embeddings'\n")
logger.info(f"Ran model with model id = {self.method}")
# Custom models
elif isinstance(self.method, BaseMatcher):
self.matches = {self.method.model_id: self.method.match(from_list, to_list)}
logging.info(f"Ran model with model id = {self.method.model_id}")
# Multiple custom models
elif isinstance(self.method, Iterable):
self._update_model_ids()
self.matches = {}
for model in self.method:
self.matches[model.model_id] = model.match(from_list, to_list)
logging.info(f"Ran model with model id = {model.model_id}")
return self
def fit(self,
from_list: List[str],
to_list: List[str] = None):
""" Fit one or model distance models on `from_list` if no `to_list` is given
or fit them on `to_list` if both `from_list` and `to_list` are given.
Typically, the `to_list` will be tracked as the list that we want to transform
our `from_list` to. In other words, it is the golden list of words that we
want the words in the `from_list` mapped to.
However, you can also choose a single `from_list` and leave `to_list` empty
to map all words from within `from_list` to each other. Then, `from_list`
will be tracked instead as the golden list of words.
Thus, if you want to train on a single list instead, use only `from_list`
and keep `to_list` empty.
Arguments:
from_list: The list from which you want mappings.
If you want to map items within a list, and not map the
items to themselves, you can supply only the `from_list` and
ignore the `to_list`.
to_list: The list where you want to map to
Usage:
After having initialized your models, you can pass through lists of strings:
```python
import polyfuzz as pf
model = pf.PolyFuzz("TF-IDF", model_id="TF-IDF")
model.fit(from_list = ["string_one", "string_two"],
to_list = ["string_three", "string_four"])
```
Now, whenever you apply `.transform(new_list)`, the `new_list` will be mapped
to the words in `to_list`.
You can also fit on a single list of words:
```python
import polyfuzz as pf
model = pf.PolyFuzz("TF-IDF", model_id="TF-IDF")
model.fit(["string_three", "string_four"])
```
"""
self.match(from_list, to_list)
if to_list is not None:
self.to_list = to_list
else:
self.to_list = from_list
return self
def transform(self, from_list: List[str]) -> Mapping[str, pd.DataFrame]:
""" After fitting your model, match all words in `from_list`
to the words that were fitted on previously.
Arguments:
from_list: The list from which you want mappings.
Usage:
After having initialized your models, you can pass through lists of strings:
```python
import polyfuzz as pf
model = pf.PolyFuzz("TF-IDF", model_id="TF-IDF")
model.fit(["input_string_1", "input_string2"])
```
Then, you can transform and normalize new strings:
```python
results = model.transform(["input_string_1", "input_string2"])
```
"""
all_matches = {}
if isinstance(self.method, BaseMatcher):
matches = self.method.match(from_list, self.to_list, re_train=False)
all_matches[self.method.type] = matches
elif isinstance(self.method, Iterable):
for model in self.method:
all_matches[model.type] = model.match(from_list, self.to_list, re_train=False)
return all_matches
def fit_transform(self,
from_list: List[str],
to_list: List[str] = None) -> Mapping[str, pd.DataFrame]:
""" Fit and transform lists of words on one or more distance models.
Typically, the `to_list` will be tracked as the list that we want to transform
our `from_list` to. In other words, it is the golden list of words that we
want the words in the `from_list` mapped to.
However, you can also choose a single `from_list` and leave `to_list` empty
to map all words from within `from_list` to each other. Then, `from_list`
will be tracked instead as the golden list of words.
Arguments:
from_list: The list from which you want mappings.
If you want to map items within a list, and not map the
items to themselves, you can supply only the `from_list` and
ignore the `to_list`.
to_list: The list where you want to map to
Usage:
After having initialized your models, you can pass through lists of strings:
```python
import polyfuzz as pf
model = pf.PolyFuzz("TF-IDF", model_id="TF-IDF")
results = model.fit_transform(from_list = ["string_one", "string_two"],
to_list = ["string_three", "string_four"])
```
You can also fit and transform a single list of words:
```python
import polyfuzz as pf
model = pf.PolyFuzz("TF-IDF", model_id="TF-IDF")
results = model.fit_transform(["string_three", "string_four"])
```
"""
self.fit(from_list, to_list)
return self.transform(from_list)
def visualize_precision_recall(self,
kde: bool = False,
save_path: str = None
):
""" Calculate and visualize precision-recall curves
A minimum similarity score might be used to identify
when a match could be considered to be correct. For example,
we can assume that if a similarity score pass 0.95 we are
quite confident that the matches are correct. This minimum
similarity score can be defined as **precision** since it shows
you how precise we believe the matches are at a minimum.
**Recall** can then be defined as as the percentage of matches
found at a certain minimum similarity score. A high recall means
that for a certain minimum precision score, we find many matches.
Arguments:
kde: whether to also visualize the kde plot
save_path: the path to save the resulting image to
Usage:
```python
import polyfuzz as pf
model = pf.PolyFuzz("TF-IDF", model_id="TF-IDF")
model.match(from_list = ["string_one", "string_two"],
to_list = ["string_three", "string_four"])
model.visualize_precision_recall(save_path="results.png")
```
"""
check_matches(self)
self.min_precisions = {}
self.recalls = {}
self.average_precisions = {}
for name, match in self.matches.items():
min_precision, recall, average_precision = precision_recall_curve(match)
self.min_precisions[name] = min_precision
self.recalls[name] = recall
self.average_precisions[name] = average_precision
visualize_precision_recall(self.matches, self.min_precisions, self.recalls, kde, save_path)
def group(self,
model: Union[str, BaseMatcher] = None,
link_min_similarity: float = 0.75,
group_all_strings: bool = False):
""" From the matches, group the `To` matches together using single linkage
Arguments:
model: you can choose one of the models in `polyfuzz.models` to be used as a grouper
link_min_similarity: the minimum similarity between strings before they are grouped
in a single linkage fashion
group_all_strings: if you want to compare a list of strings with itself and then cluster
those strings, set this to True. Otherwise, only the strings that
were mapped To are clustered.
Updates:
self.matches: Adds a column `Group` that is the grouped version of the `To` column
"""
check_matches(self)
self.clusters = {}
self.cluster_mappings = {}
# Standard models - quick access
if isinstance(model, str):
if model in ["TF-IDF", "TFIDF"]:
model = TFIDF(n_gram_range=(3, 3), min_similarity=link_min_similarity)
elif self.method in ["EditDistance", "Edit Distance"]:
model = RapidFuzz()
elif self.method in ["Embeddings", "Embedding"]:
model = Embeddings(min_similarity=link_min_similarity)
else:
raise ValueError("Please instantiate the model with one of the following methods: \n"
"* 'TF-IDF'\n"
"* 'EditDistance'\n"
"* 'Embeddings'\n"
"* Or None if you want to automatically use TF-IDF")
# Use TF-IDF if no model is specified
elif not model:
model = TFIDF(n_gram_range=(3, 3), min_similarity=link_min_similarity)
# Group per model
for name, match in self.matches.items():
self._create_groups(name, model, link_min_similarity, group_all_strings)
def get_ids(self) -> Union[str, List[str], None]:
""" Get all model ids for easier access """
check_matches(self)
if isinstance(self.method, str):
return self.method
elif isinstance(self.method, Iterable):
return [model.model_id for model in self.method]
return None
def get_matches(self, model_id: str = None) -> Union[pd.DataFrame,
Mapping[str, pd.DataFrame]]:
""" Get the matches from one or more models"""
check_matches(self)
if len(self.matches) == 1:
return list(self.matches.values())[0]
elif len(self.matches) > 1 and model_id:
return self.matches[model_id]
return self.matches
def get_clusters(self, model_id: str = None) -> Mapping[str, List[str]]:
""" Get the groupings/clusters from a single model
Arguments:
model_id: the model id of the model if you have specified multiple models
"""
check_matches(self)
check_grouped(self)
if len(self.matches) == 1:
return list(self.clusters.values())[0]
elif len(self.matches) > 1 and model_id:
return self.clusters[model_id]
return self.clusters
def get_cluster_mappings(self, name: str = None) -> Mapping[str, int]:
""" Get the mappings from the `To` column to its respective column """
check_matches(self)
check_grouped(self)
if len(self.matches) == 1:
return list(self.cluster_mappings.values())[0]
elif len(self.matches) > 1 and name:
return self.cluster_mappings[name]
return self.cluster_mappings
def save(self, path: str) -> None:
""" Saves the model to the specified path
Arguments:
path: the location and name of the file you want to save
Usage:
```python
model.save("my_model")
```
"""
with open(path, 'wb') as file:
joblib.dump(self, file)
@classmethod
def load(cls, path: str):
""" Loads the model from the specified path
Arguments:
path: the location and name of the PolyFuzz file you want to load
Usage:
```python
PolyFuzz.load("my_model")
```
"""
with open(path, 'rb') as file:
model = joblib.load(file)
return model
def _create_groups(self,
name: str,
model: BaseMatcher,
link_min_similarity: float,
group_all_strings: bool):
""" Create groups based on either the To mappings if you compare two different lists of strings, or
the From mappings if you compare lists of strings that are equal (set group_all_strings to True)
"""
if group_all_strings:
strings = list(self.matches[name].From.dropna().unique())
else:
strings = list(self.matches[name].To.dropna().unique())
# Create clusters
matches = model.match(strings)
clusters, cluster_id_map, cluster_name_map = single_linkage(matches, link_min_similarity)
# Map the `to` list to groups
df = self.matches[name]
df["Group"] = df['To'].map(cluster_name_map).fillna(df['To'])
self.matches[name] = df
# Track clusters and their ids
self.clusters[name] = clusters
self.cluster_mappings[name] = cluster_id_map
def _update_model_ids(self):
""" Update model ids such that there is no overlap between ids """
# Give models a model_id if it didn't already exist
for index, model in enumerate(self.method):
if not model.model_id:
model.model_id = f"Model {index}"
# Update duplicate names
model_ids = [model.model_id for model in self.method]
if len(set(model_ids)) != len(model_ids):
for index, model in enumerate(self.method):
model.model_id = f"Model {index}"
fit(self, from_list, to_list=None)
¶
Fit one or model distance models on from_list
if no to_list
is given
or fit them on to_list
if both from_list
and to_list
are given.
Typically, the to_list
will be tracked as the list that we want to transform
our from_list
to. In other words, it is the golden list of words that we
want the words in the from_list
mapped to.
However, you can also choose a single from_list
and leave to_list
empty
to map all words from within from_list
to each other. Then, from_list
will be tracked instead as the golden list of words.
Thus, if you want to train on a single list instead, use only from_list
and keep to_list
empty.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
from_list |
List[str] |
The list from which you want mappings.
If you want to map items within a list, and not map the
items to themselves, you can supply only the |
required |
to_list |
List[str] |
The list where you want to map to |
None |
Usage:
After having initialized your models, you can pass through lists of strings:
import polyfuzz as pf
model = pf.PolyFuzz("TF-IDF", model_id="TF-IDF")
model.fit(from_list = ["string_one", "string_two"],
to_list = ["string_three", "string_four"])
Now, whenever you apply .transform(new_list)
, the new_list
will be mapped
to the words in to_list
.
You can also fit on a single list of words:
import polyfuzz as pf
model = pf.PolyFuzz("TF-IDF", model_id="TF-IDF")
model.fit(["string_three", "string_four"])
Source code in polyfuzz\polyfuzz.py
def fit(self,
from_list: List[str],
to_list: List[str] = None):
""" Fit one or model distance models on `from_list` if no `to_list` is given
or fit them on `to_list` if both `from_list` and `to_list` are given.
Typically, the `to_list` will be tracked as the list that we want to transform
our `from_list` to. In other words, it is the golden list of words that we
want the words in the `from_list` mapped to.
However, you can also choose a single `from_list` and leave `to_list` empty
to map all words from within `from_list` to each other. Then, `from_list`
will be tracked instead as the golden list of words.
Thus, if you want to train on a single list instead, use only `from_list`
and keep `to_list` empty.
Arguments:
from_list: The list from which you want mappings.
If you want to map items within a list, and not map the
items to themselves, you can supply only the `from_list` and
ignore the `to_list`.
to_list: The list where you want to map to
Usage:
After having initialized your models, you can pass through lists of strings:
```python
import polyfuzz as pf
model = pf.PolyFuzz("TF-IDF", model_id="TF-IDF")
model.fit(from_list = ["string_one", "string_two"],
to_list = ["string_three", "string_four"])
```
Now, whenever you apply `.transform(new_list)`, the `new_list` will be mapped
to the words in `to_list`.
You can also fit on a single list of words:
```python
import polyfuzz as pf
model = pf.PolyFuzz("TF-IDF", model_id="TF-IDF")
model.fit(["string_three", "string_four"])
```
"""
self.match(from_list, to_list)
if to_list is not None:
self.to_list = to_list
else:
self.to_list = from_list
return self
fit_transform(self, from_list, to_list=None)
¶
Fit and transform lists of words on one or more distance models.
Typically, the to_list
will be tracked as the list that we want to transform
our from_list
to. In other words, it is the golden list of words that we
want the words in the from_list
mapped to.
However, you can also choose a single from_list
and leave to_list
empty
to map all words from within from_list
to each other. Then, from_list
will be tracked instead as the golden list of words.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
from_list |
List[str] |
The list from which you want mappings.
If you want to map items within a list, and not map the
items to themselves, you can supply only the |
required |
to_list |
List[str] |
The list where you want to map to |
None |
Usage:
After having initialized your models, you can pass through lists of strings:
import polyfuzz as pf
model = pf.PolyFuzz("TF-IDF", model_id="TF-IDF")
results = model.fit_transform(from_list = ["string_one", "string_two"],
to_list = ["string_three", "string_four"])
You can also fit and transform a single list of words:
import polyfuzz as pf
model = pf.PolyFuzz("TF-IDF", model_id="TF-IDF")
results = model.fit_transform(["string_three", "string_four"])
Source code in polyfuzz\polyfuzz.py
def fit_transform(self,
from_list: List[str],
to_list: List[str] = None) -> Mapping[str, pd.DataFrame]:
""" Fit and transform lists of words on one or more distance models.
Typically, the `to_list` will be tracked as the list that we want to transform
our `from_list` to. In other words, it is the golden list of words that we
want the words in the `from_list` mapped to.
However, you can also choose a single `from_list` and leave `to_list` empty
to map all words from within `from_list` to each other. Then, `from_list`
will be tracked instead as the golden list of words.
Arguments:
from_list: The list from which you want mappings.
If you want to map items within a list, and not map the
items to themselves, you can supply only the `from_list` and
ignore the `to_list`.
to_list: The list where you want to map to
Usage:
After having initialized your models, you can pass through lists of strings:
```python
import polyfuzz as pf
model = pf.PolyFuzz("TF-IDF", model_id="TF-IDF")
results = model.fit_transform(from_list = ["string_one", "string_two"],
to_list = ["string_three", "string_four"])
```
You can also fit and transform a single list of words:
```python
import polyfuzz as pf
model = pf.PolyFuzz("TF-IDF", model_id="TF-IDF")
results = model.fit_transform(["string_three", "string_four"])
```
"""
self.fit(from_list, to_list)
return self.transform(from_list)
get_cluster_mappings(self, name=None)
¶
Get the mappings from the To
column to its respective column
Source code in polyfuzz\polyfuzz.py
def get_cluster_mappings(self, name: str = None) -> Mapping[str, int]:
""" Get the mappings from the `To` column to its respective column """
check_matches(self)
check_grouped(self)
if len(self.matches) == 1:
return list(self.cluster_mappings.values())[0]
elif len(self.matches) > 1 and name:
return self.cluster_mappings[name]
return self.cluster_mappings
get_clusters(self, model_id=None)
¶
Get the groupings/clusters from a single model
Parameters:
Name | Type | Description | Default |
---|---|---|---|
model_id |
str |
the model id of the model if you have specified multiple models |
None |
Source code in polyfuzz\polyfuzz.py
def get_clusters(self, model_id: str = None) -> Mapping[str, List[str]]:
""" Get the groupings/clusters from a single model
Arguments:
model_id: the model id of the model if you have specified multiple models
"""
check_matches(self)
check_grouped(self)
if len(self.matches) == 1:
return list(self.clusters.values())[0]
elif len(self.matches) > 1 and model_id:
return self.clusters[model_id]
return self.clusters
get_ids(self)
¶
Get all model ids for easier access
Source code in polyfuzz\polyfuzz.py
def get_ids(self) -> Union[str, List[str], None]:
""" Get all model ids for easier access """
check_matches(self)
if isinstance(self.method, str):
return self.method
elif isinstance(self.method, Iterable):
return [model.model_id for model in self.method]
return None
get_matches(self, model_id=None)
¶
Get the matches from one or more models
Source code in polyfuzz\polyfuzz.py
def get_matches(self, model_id: str = None) -> Union[pd.DataFrame,
Mapping[str, pd.DataFrame]]:
""" Get the matches from one or more models"""
check_matches(self)
if len(self.matches) == 1:
return list(self.matches.values())[0]
elif len(self.matches) > 1 and model_id:
return self.matches[model_id]
return self.matches
group(self, model=None, link_min_similarity=0.75, group_all_strings=False)
¶
From the matches, group the To
matches together using single linkage
Parameters:
Name | Type | Description | Default |
---|---|---|---|
model |
Union[str, polyfuzz.models._base.BaseMatcher] |
you can choose one of the models in |
None |
link_min_similarity |
float |
the minimum similarity between strings before they are grouped in a single linkage fashion |
0.75 |
group_all_strings |
bool |
if you want to compare a list of strings with itself and then cluster those strings, set this to True. Otherwise, only the strings that were mapped To are clustered. |
False |
Updates:
self.matches: Adds a column Group
that is the grouped version of the To
column
Source code in polyfuzz\polyfuzz.py
def group(self,
model: Union[str, BaseMatcher] = None,
link_min_similarity: float = 0.75,
group_all_strings: bool = False):
""" From the matches, group the `To` matches together using single linkage
Arguments:
model: you can choose one of the models in `polyfuzz.models` to be used as a grouper
link_min_similarity: the minimum similarity between strings before they are grouped
in a single linkage fashion
group_all_strings: if you want to compare a list of strings with itself and then cluster
those strings, set this to True. Otherwise, only the strings that
were mapped To are clustered.
Updates:
self.matches: Adds a column `Group` that is the grouped version of the `To` column
"""
check_matches(self)
self.clusters = {}
self.cluster_mappings = {}
# Standard models - quick access
if isinstance(model, str):
if model in ["TF-IDF", "TFIDF"]:
model = TFIDF(n_gram_range=(3, 3), min_similarity=link_min_similarity)
elif self.method in ["EditDistance", "Edit Distance"]:
model = RapidFuzz()
elif self.method in ["Embeddings", "Embedding"]:
model = Embeddings(min_similarity=link_min_similarity)
else:
raise ValueError("Please instantiate the model with one of the following methods: \n"
"* 'TF-IDF'\n"
"* 'EditDistance'\n"
"* 'Embeddings'\n"
"* Or None if you want to automatically use TF-IDF")
# Use TF-IDF if no model is specified
elif not model:
model = TFIDF(n_gram_range=(3, 3), min_similarity=link_min_similarity)
# Group per model
for name, match in self.matches.items():
self._create_groups(name, model, link_min_similarity, group_all_strings)
load(path)
classmethod
¶
Loads the model from the specified path
Parameters:
Name | Type | Description | Default |
---|---|---|---|
path |
str |
the location and name of the PolyFuzz file you want to load |
required |
Usage:
PolyFuzz.load("my_model")
Source code in polyfuzz\polyfuzz.py
@classmethod
def load(cls, path: str):
""" Loads the model from the specified path
Arguments:
path: the location and name of the PolyFuzz file you want to load
Usage:
```python
PolyFuzz.load("my_model")
```
"""
with open(path, 'rb') as file:
model = joblib.load(file)
return model
match(self, from_list, to_list=None, top_n=1)
¶
Match the from_list of strings to the to_list of strings with whatever models you have initialized
Parameters:
Name | Type | Description | Default |
---|---|---|---|
from_list |
List[str] |
The list from which you want mappings.
If you want to map items within a list, and not map the
items to themselves, you can supply only the |
required |
to_list |
List[str] |
The list where you want to map to |
None |
top_n |
int |
The number of matches you want returned. This is currently only implemented
for |
1 |
Updates
self.matches: A dictionary with the matches from all models, can
be accessed with model.get_all_matches
or
model.get_match("TF-IDF")
Usage:
After having initialized your models, you can pass through lists of strings:
import polyfuzz as pf
model = pf.PolyFuzz("TF-IDF", model_id="TF-IDF")
model.match(from_list = ["string_one", "string_two"],
to_list = ["string_three", "string_four"])
You can access the results matches with model.get_all_matches
or a specific
model with model.get_match("TF-IDF")
based on their model_id.
Source code in polyfuzz\polyfuzz.py
def match(self,
from_list: List[str],
to_list: List[str] = None,
top_n: int = 1):
""" Match the from_list of strings to the to_list of strings with whatever models
you have initialized
Arguments:
from_list: The list from which you want mappings.
If you want to map items within a list, and not map the
items to themselves, you can supply only the `from_list` and
ignore the `to_list`.
to_list: The list where you want to map to
top_n: The number of matches you want returned. This is currently only implemented
for `polyfuzz.models.TFIDF` and `polyfuzz.models.Embeddings` as they
can computationally handle more comparisons.
Updates:
self.matches: A dictionary with the matches from all models, can
be accessed with `model.get_all_matches` or
`model.get_match("TF-IDF")`
Usage:
After having initialized your models, you can pass through lists of strings:
```python
import polyfuzz as pf
model = pf.PolyFuzz("TF-IDF", model_id="TF-IDF")
model.match(from_list = ["string_one", "string_two"],
to_list = ["string_three", "string_four"])
```
You can access the results matches with `model.get_all_matches` or a specific
model with `model.get_match("TF-IDF")` based on their model_id.
"""
# Standard models - quick access
if isinstance(self.method, str):
if self.method in ["TF-IDF", "TFIDF"]:
self.method = TFIDF(min_similarity=0, top_n=top_n)
self.matches = {"TF-IDF": self.method.match(from_list, to_list)}
elif self.method in ["EditDistance", "Edit Distance"]:
self.method = RapidFuzz()
self.matches = {"EditDistance": self.method.match(from_list, to_list)}
elif self.method in ["Embeddings", "Embedding"]:
self.method = Embeddings(min_similarity=0, top_n=top_n)
self.matches = {"Embeddings": self.method.match(from_list, to_list)}
else:
raise ValueError("Please instantiate the model with one of the following methods: \n"
"* 'TF-IDF'\n"
"* 'EditDistance'\n"
"* 'Embeddings'\n")
logger.info(f"Ran model with model id = {self.method}")
# Custom models
elif isinstance(self.method, BaseMatcher):
self.matches = {self.method.model_id: self.method.match(from_list, to_list)}
logging.info(f"Ran model with model id = {self.method.model_id}")
# Multiple custom models
elif isinstance(self.method, Iterable):
self._update_model_ids()
self.matches = {}
for model in self.method:
self.matches[model.model_id] = model.match(from_list, to_list)
logging.info(f"Ran model with model id = {model.model_id}")
return self
save(self, path)
¶
Saves the model to the specified path
Parameters:
Name | Type | Description | Default |
---|---|---|---|
path |
str |
the location and name of the file you want to save |
required |
Usage:
model.save("my_model")
Source code in polyfuzz\polyfuzz.py
def save(self, path: str) -> None:
""" Saves the model to the specified path
Arguments:
path: the location and name of the file you want to save
Usage:
```python
model.save("my_model")
```
"""
with open(path, 'wb') as file:
joblib.dump(self, file)
transform(self, from_list)
¶
After fitting your model, match all words in from_list
to the words that were fitted on previously.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
from_list |
List[str] |
The list from which you want mappings. |
required |
Usage:
After having initialized your models, you can pass through lists of strings:
import polyfuzz as pf
model = pf.PolyFuzz("TF-IDF", model_id="TF-IDF")
model.fit(["input_string_1", "input_string2"])
Then, you can transform and normalize new strings:
results = model.transform(["input_string_1", "input_string2"])
Source code in polyfuzz\polyfuzz.py
def transform(self, from_list: List[str]) -> Mapping[str, pd.DataFrame]:
""" After fitting your model, match all words in `from_list`
to the words that were fitted on previously.
Arguments:
from_list: The list from which you want mappings.
Usage:
After having initialized your models, you can pass through lists of strings:
```python
import polyfuzz as pf
model = pf.PolyFuzz("TF-IDF", model_id="TF-IDF")
model.fit(["input_string_1", "input_string2"])
```
Then, you can transform and normalize new strings:
```python
results = model.transform(["input_string_1", "input_string2"])
```
"""
all_matches = {}
if isinstance(self.method, BaseMatcher):
matches = self.method.match(from_list, self.to_list, re_train=False)
all_matches[self.method.type] = matches
elif isinstance(self.method, Iterable):
for model in self.method:
all_matches[model.type] = model.match(from_list, self.to_list, re_train=False)
return all_matches
visualize_precision_recall(self, kde=False, save_path=None)
¶
Calculate and visualize precision-recall curves
A minimum similarity score might be used to identify when a match could be considered to be correct. For example, we can assume that if a similarity score pass 0.95 we are quite confident that the matches are correct. This minimum similarity score can be defined as precision since it shows you how precise we believe the matches are at a minimum.
Recall can then be defined as as the percentage of matches found at a certain minimum similarity score. A high recall means that for a certain minimum precision score, we find many matches.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
kde |
bool |
whether to also visualize the kde plot |
False |
save_path |
str |
the path to save the resulting image to |
None |
Usage:
import polyfuzz as pf
model = pf.PolyFuzz("TF-IDF", model_id="TF-IDF")
model.match(from_list = ["string_one", "string_two"],
to_list = ["string_three", "string_four"])
model.visualize_precision_recall(save_path="results.png")
Source code in polyfuzz\polyfuzz.py
def visualize_precision_recall(self,
kde: bool = False,
save_path: str = None
):
""" Calculate and visualize precision-recall curves
A minimum similarity score might be used to identify
when a match could be considered to be correct. For example,
we can assume that if a similarity score pass 0.95 we are
quite confident that the matches are correct. This minimum
similarity score can be defined as **precision** since it shows
you how precise we believe the matches are at a minimum.
**Recall** can then be defined as as the percentage of matches
found at a certain minimum similarity score. A high recall means
that for a certain minimum precision score, we find many matches.
Arguments:
kde: whether to also visualize the kde plot
save_path: the path to save the resulting image to
Usage:
```python
import polyfuzz as pf
model = pf.PolyFuzz("TF-IDF", model_id="TF-IDF")
model.match(from_list = ["string_one", "string_two"],
to_list = ["string_three", "string_four"])
model.visualize_precision_recall(save_path="results.png")
```
"""
check_matches(self)
self.min_precisions = {}
self.recalls = {}
self.average_precisions = {}
for name, match in self.matches.items():
min_precision, recall, average_precision = precision_recall_curve(match)
self.min_precisions[name] = min_precision
self.recalls[name] = recall
self.average_precisions[name] = average_precision
visualize_precision_recall(self.matches, self.min_precisions, self.recalls, kde, save_path)