polyfuzz.models.TFIDF
¶
A character based n-gram TF-IDF to approximate edit distance
We turn a string into, typically of length 3, n-grams. For example, using 3-grams of the "hotel" we get ['hot', 'ote', 'tel']. These are then used as input for a TfidfVectorizer in order to create a vector for each word. Then, we simply apply cosine similarity through k-NN
Parameters:
Name | Type | Description | Default |
---|---|---|---|
n_gram_range |
Tuple[int, int] |
The n_gram_range on a character-level |
(3, 3) |
clean_string |
bool |
Whether to clean the string such that only alphanumerical characters are kept |
True |
min_similarity |
float |
The minimum similarity between strings, otherwise return 0 similarity |
0.75 |
top_n |
int |
The number of matches you want returned |
1 |
cosine_method |
str |
The method/package for calculating the cosine similarity. Options: * sparse * sklearn * knn
|
'sparse' |
model_id |
str |
The name of the particular instance, used when comparing models |
None |
Usage:
from polymatcher.models import TFIDF
model = TFIDF(n_gram_range=(3, 3), clean_string=True, use_knn=False)
Source code in polyfuzz\models\_tfidf.py
class TFIDF(BaseMatcher):
"""
A character based n-gram TF-IDF to approximate edit distance
We turn a string into, typically of length 3, n-grams. For example,
using 3-grams of the "hotel" we get ['hot', 'ote', 'tel']. These are then
used as input for a TfidfVectorizer in order to create a vector for each
word. Then, we simply apply cosine similarity through k-NN
Arguments:
n_gram_range: The n_gram_range on a character-level
clean_string: Whether to clean the string such that only alphanumerical characters are kept
min_similarity: The minimum similarity between strings, otherwise return 0 similarity
top_n: The number of matches you want returned
cosine_method: The method/package for calculating the cosine similarity.
Options:
* sparse
* sklearn
* knn
sparse is the fastest and most memory efficient but requires a
package that might be difficult to install
sklearn is a bit slower than sparse and requires significantly more memory as
the distance matrix is not sparse
knn uses 1-nearest neighbor to extract the most similar strings
it is significantly slower than both methods but requires little memory
model_id: The name of the particular instance, used when comparing models
Usage:
```python
from polymatcher.models import TFIDF
model = TFIDF(n_gram_range=(3, 3), clean_string=True, use_knn=False)
```
"""
def __init__(self,
n_gram_range: Tuple[int, int] = (3, 3),
clean_string: bool = True,
min_similarity: float = 0.75,
top_n: int = 1,
cosine_method: str = "sparse",
model_id: str = None):
super().__init__(model_id)
self.type = "TF-IDF"
self.n_gram_range = n_gram_range
self.clean_string = clean_string
self.min_similarity = min_similarity
self.cosine_method = cosine_method
self.top_n = top_n
self.vectorizer = None
self.tf_idf_to = None
def match(self,
from_list: List[str],
to_list: List[str] = None,
re_train: bool = True) -> pd.DataFrame:
""" Match two lists of strings to each other and return the most similar strings
Arguments:
from_list: The list from which you want mappings
to_list: The list where you want to map to
re_train: Whether to re-train the model with new embeddings
Set this to False if you want to use this model in production
Returns:
matches: The best matches between the lists of strings
Usage:
```python
from polymatcher.models import TFIDF
model = TFIDF()
matches = model.match(["string_one", "string_two"],
["string_three", "string_four"])
```
"""
tf_idf_from, tf_idf_to = self._extract_tf_idf(from_list, to_list, re_train)
matches = cosine_similarity(tf_idf_from, tf_idf_to,
from_list, to_list,
self.min_similarity,
top_n=self.top_n,
method=self.cosine_method)
return matches
def _extract_tf_idf(self,
from_list: List[str],
to_list: List[str] = None,
re_train: bool = True) -> Tuple[np.ndarray, np.ndarray]:
""" Calculate distances between TF-IDF vectors of from_list and to_list """
if to_list:
if re_train:
self.vectorizer = TfidfVectorizer(min_df=1, analyzer=self._create_ngrams).fit(to_list + from_list)
self.tf_idf_to = self.vectorizer.transform(to_list)
tf_idf_from = self.vectorizer.transform(from_list)
else:
if re_train:
self.vectorizer = TfidfVectorizer(min_df=1, analyzer=self._create_ngrams).fit(from_list)
self.tf_idf_to = self.vectorizer.transform(from_list)
tf_idf_from = self.tf_idf_to
return tf_idf_from, self.tf_idf_to
def _create_ngrams(self, string: str) -> List[str]:
""" Create n_grams from a string
Steps:
* Extract character-level ngrams with `self.n_gram_range` (both ends inclusive)
* Remove n-grams that have a whitespace in them
"""
if self.clean_string:
string = _clean_string(string)
result = []
for n in range(self.n_gram_range[0], self.n_gram_range[1]+1):
ngrams = zip(*[string[i:] for i in range(n)])
ngrams = [''.join(ngram) for ngram in ngrams if ' ' not in ngram]
result.extend(ngrams)
return result
match(self, from_list, to_list=None, re_train=True)
¶
Match two lists of strings to each other and return the most similar strings
Parameters:
Name | Type | Description | Default |
---|---|---|---|
from_list |
List[str] |
The list from which you want mappings |
required |
to_list |
List[str] |
The list where you want to map to |
None |
re_train |
bool |
Whether to re-train the model with new embeddings Set this to False if you want to use this model in production |
True |
Returns:
Type | Description |
---|---|
matches |
The best matches between the lists of strings |
Usage:
from polymatcher.models import TFIDF
model = TFIDF()
matches = model.match(["string_one", "string_two"],
["string_three", "string_four"])
Source code in polyfuzz\models\_tfidf.py
def match(self,
from_list: List[str],
to_list: List[str] = None,
re_train: bool = True) -> pd.DataFrame:
""" Match two lists of strings to each other and return the most similar strings
Arguments:
from_list: The list from which you want mappings
to_list: The list where you want to map to
re_train: Whether to re-train the model with new embeddings
Set this to False if you want to use this model in production
Returns:
matches: The best matches between the lists of strings
Usage:
```python
from polymatcher.models import TFIDF
model = TFIDF()
matches = model.match(["string_one", "string_two"],
["string_three", "string_four"])
```
"""
tf_idf_from, tf_idf_to = self._extract_tf_idf(from_list, to_list, re_train)
matches = cosine_similarity(tf_idf_from, tf_idf_to,
from_list, to_list,
self.min_similarity,
top_n=self.top_n,
method=self.cosine_method)
return matches