Skip to content

polyfuzz.models.cosine_similarity

Calculate similarity between two matrices/vectors and return best matches

Parameters:

Name Type Description Default
from_vector ndarray

the matrix or vector representing the embedded strings to map from

required
to_vector ndarray

the matrix or vector representing the embedded strings to map to

required
from_list List[str]

The list from which you want mappings

required
to_list List[str]

The list where you want to map to

required
min_similarity float

The minimum similarity between strings, otherwise return 0 similarity

0.75
top_n int

The number of best matches you want returned

1
method str

The method/package for calculating the cosine similarity. Options: "sparse", "sklearn", "knn". Sparse is the fastest and most memory efficient, but requires a package that might be difficult to install. Sklearn is a bit slower than sparse and requires significantly more memory, as the distance matrix is not sparse. Knn uses 1-nearest neighbor to extract the most similar strings; it is significantly slower than both other methods but requires little memory.

'sparse'

Returns:

Type Description
matches

The best matches between the lists of strings

Usage:

Make sure to fill the to_vector and from_vector with vector representations of to_list and from_list respectively:

from polyfuzz.models._utils import cosine_similarity
matches = cosine_similarity(from_vector, to_vector, from_list, to_list, method="sparse")
Source code in polyfuzz\models\_utils.py
def cosine_similarity(from_vector: np.ndarray,
                      to_vector: np.ndarray,
                      from_list: List[str],
                      to_list: List[str],
                      min_similarity: float = 0.75,
                      top_n: int = 1,
                      method: str = "sparse") -> pd.DataFrame:
    """ Calculate similarity between two matrices/vectors and return best matches

    Arguments:
        from_vector: the matrix or vector representing the embedded strings to map from
        to_vector: the matrix or vector representing the embedded strings to map to
        from_list: The list from which you want mappings
        to_list: The list where you want to map to. If None, `from_list` is
                 matched against itself and exact self-matches are excluded.
        min_similarity: The minimum similarity between strings, otherwise return 0 similarity
        top_n: The number of best matches you want returned
        method: The method/package for calculating the cosine similarity.
                Options: "sparse", "sklearn", "knn".
                Sparse is the fastest and most memory efficient but requires a
                package that might be difficult to install.
                Sklearn is a bit slower than sparse and requires significantly more
                memory as the distance matrix is not sparse.
                Knn uses 1-nearest neighbor to extract the most similar strings;
                it is significantly slower than both methods but requires little memory.

    Returns:
        matches: The best matches between the lists of strings

    Usage:

    Make sure to fill the `to_vector` and `from_vector` with vector representations
    of `to_list` and `from_list` respectively:

    ```python
    from polyfuzz.models._utils import cosine_similarity
    matches = cosine_similarity(from_vector, to_vector, from_list, to_list, method="sparse")
    ```
    """
    # Cannot return more matches than there are unique candidate strings
    if to_list is not None:
        if top_n > len(set(to_list)):
            top_n = len(set(to_list))

    # Slower but uses less memory
    if method == "knn":

        if to_list is None:
            # Self-matching: ask for one extra neighbor and drop the first
            # column, which is each string matched with itself (distance 0)
            knn = NearestNeighbors(n_neighbors=top_n+1, n_jobs=-1, metric='cosine').fit(to_vector)
            distances, indices = knn.kneighbors(from_vector)
            distances = distances[:, 1:]
            indices = indices[:, 1:]
        else:
            knn = NearestNeighbors(n_neighbors=top_n, n_jobs=-1, metric='cosine').fit(to_vector)
            distances, indices = knn.kneighbors(from_vector)

        # Convert cosine distance back to cosine similarity, rounded to 3 decimals
        similarities = [np.round(1 - distances[:, i], 3) for i in range(distances.shape[1])]

    # Fast, but has some installation issues
    elif _HAVE_SPARSE_DOT and method == "sparse":
        if isinstance(to_vector, np.ndarray):
            to_vector = csr_matrix(to_vector)
        if isinstance(from_vector, np.ndarray):
            from_vector = csr_matrix(from_vector)

        # There is a bug with awesome_cossim_topn that when to_vector and from_vector
        # have the same shape, setting topn to 1 does not work. Apparently, you need
        # to set it to at least 2 for it to work
        similarity_matrix = awesome_cossim_topn(from_vector, to_vector.T, top_n+1, min_similarity)

        if to_list is None:
            # Self-matching: zero out the diagonal so a string never matches itself
            similarity_matrix = similarity_matrix.tolil()
            similarity_matrix.setdiag(0.)
            similarity_matrix = similarity_matrix.tocsr()

        indices = _top_n_idx_sparse(similarity_matrix, top_n)
        similarities = _top_n_similarities_sparse(similarity_matrix, indices)
        # Rows with fewer than top_n candidates are NaN-padded; map them to index 0.
        # NOTE: the aliases `np.float`/`np.int` were removed in NumPy 1.24 —
        # the builtins `float`/`int` are the documented replacements.
        indices = np.array(np.nan_to_num(np.array(indices, dtype=float), nan=0), dtype=int)

    # Faster than knn and slower than sparse but uses more memory
    else:
        similarity_matrix = scikit_cosine_similarity(from_vector, to_vector)

        if to_list is None:
            # Self-matching: zero out the diagonal so a string never matches itself
            np.fill_diagonal(similarity_matrix, 0)

        indices = np.flip(np.argsort(similarity_matrix, axis=-1), axis=1)[:, :top_n]
        similarities = np.flip(np.sort(similarity_matrix, axis=-1), axis=1)[:, :top_n]
        similarities = [np.round(similarities[:, i], 3) for i in range(similarities.shape[1])]

    # Convert results to df; when self-matching, the candidates are from_list itself
    if to_list is None:
        to_list = from_list.copy()

    columns = (["From"] +
               ["To" if i == 0 else f"To_{i+1}" for i in range(top_n)] +
               ["Similarity" if i == 0 else f"Similarity_{i+1}" for i in range(top_n)])
    matches = [[to_list[idx] for idx in indices[:, i]] for i in range(indices.shape[1])]
    matches = pd.DataFrame(np.vstack(([from_list], matches, similarities)).T, columns=columns)

    # Update column order so each To_k sits next to its Similarity_k
    columns = [["From", "To", "Similarity"]] + [[f"To_{i+2}", f"Similarity_{i+2}"] for i in range((top_n-1))]
    matches = matches.loc[:, [title for column in columns for title in column]]

    # Update types: matches below the similarity cut-off get 0 similarity and no match
    for column in matches.columns:
        if "Similarity" in column:
            matches[column] = matches[column].astype(float)
            matches.loc[matches[column] < 0.001, column] = float(0)
            matches.loc[matches[column] < 0.001, column.replace("Similarity", "To")] = None

    return matches