polyfuzz.metrics

precision_recall_curve(matches, precision_steps=0.01)

Show source code in polyfuzz\metrics.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
def precision_recall_curve(matches: pd.DataFrame,
                           precision_steps: float = 0.01) -> Tuple[List[float],
                                                                   List[float],
                                                                   List[float]]:
    """ Calculate precision recall curve based on minimum similarity between strings

    A minimum similarity score might be used to identify
    when a match could be considered to be correct. For example,
    we can assume that if a similarity score passes 0.95 we are
    quite confident that the matches are correct. This minimum
    similarity score can be defined as **precision** since it shows
    you how precise we believe the matches are at a minimum.

    **Recall** can then be defined as the percentage of matches
    found at a certain minimum similarity score. A high recall means
    that for a certain minimum precision score, we find many matches.

    Arguments:
        matches: contains the columns *From*, *To*, and *Similarity* used for calculating
                 precision, recall, and average precision
        precision_steps: the incremental steps in minimum precision

    Returns:
        min_precisions: minimum precision steps
        recall: recall per minimum precision step
        average_precision: average precision per minimum precision step

    Raises:
        ValueError: if `matches` is empty, since recall is undefined
                    without at least one candidate match
    """
    if len(matches) == 0:
        raise ValueError("`matches` must contain at least one row to "
                         "compute a precision-recall curve.")

    min_precisions = list(np.arange(0., 1 + precision_steps, precision_steps))
    average_precision = []
    recall = []
    similarities = matches.Similarity.values
    total = len(matches)

    for min_precision in min_precisions:
        # Matches that survive the current minimum-similarity threshold
        selection = similarities[similarities >= min_precision]
        recall.append(len(selection) / total)

        # np.mean of an empty selection emits a RuntimeWarning and yields NaN;
        # the NaN is the intended "no matches at this threshold" marker, so
        # only the warning is silenced here.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=RuntimeWarning)
            average_precision.append(float(np.mean(selection)))

    return min_precisions, recall, average_precision

Calculate precision recall curve based on minimum similarity between strings

A minimum similarity score might be used to identify when a match could be considered to be correct. For example, we can assume that if a similarity score passes 0.95 we are quite confident that the matches are correct. This minimum similarity score can be defined as precision since it shows you how precise we believe the matches are at a minimum.

Recall can then be defined as the percentage of matches found at a certain minimum similarity score. A high recall means that for a certain minimum precision score, we find many matches.

Parameters

Name Type Description Default
matches DataFrame contains the columns From, To, and Similarity used for calculating precision, recall, and average precision required
precision_steps float the incremental steps in minimum precision 0.01

Returns

Type Description
Tuple[List[float], List[float], List[float]] min_precisions: minimum precision steps recall: recall per minimum precision step average_precision: average precision per minimum precision step

visualize_precision_recall(matches, min_precisions, recall, kde=True, save_path=None)

Show source code in polyfuzz\metrics.py
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
def visualize_precision_recall(matches: Mapping[str, pd.DataFrame],
                               min_precisions: Mapping[str, List[float]],
                               recall: Mapping[str, List[float]],
                               kde: bool = True,
                               save_path: str = None):
    """ Visualize the precision recall curve for one or more models

    Draws a precision-recall curve (left panel) and, optionally, a KDE plot
    of the similarity-score distribution (right panel) for each model.

    Arguments:
        matches: contains the columns *From*, *To*, and *Similarity* used for calculating
                 precision, recall, and average precision per model
        min_precisions: minimum precision steps per model
        recall: recall per minimum precision step per model
        kde: whether to also visualize the kde plot
        save_path: the path to save the resulting image to

    Usage:

    ```python
    visualize_precision_recall(matches, min_precisions, recall, save_path="data/results.png")
    ```
    """
    SMALL_SIZE = 10
    MEDIUM_SIZE = 12
    BIGGER_SIZE = 14

    plt.rc('font', size=SMALL_SIZE)  # controls default text sizes
    plt.rc('axes', titlesize=SMALL_SIZE)  # fontsize of the axes title
    plt.rc('axes', labelsize=MEDIUM_SIZE)  # fontsize of the x and y labels
    plt.rc('xtick', labelsize=SMALL_SIZE)  # fontsize of the tick labels
    plt.rc('ytick', labelsize=SMALL_SIZE)  # fontsize of the tick labels
    plt.rc('legend', fontsize=SMALL_SIZE)  # legend fontsize
    plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

    # Allow a single model to be passed in directly (not wrapped in a dict)
    if not isinstance(matches, dict):
        matches = {"Model": matches}
        min_precisions = {"Model": min_precisions}
        recall = {"Model": recall}

    # Create single dataset of similarity score for all models
    distribution_data = [(matches[name].Similarity.values, [name for _ in range(len(matches[name]))]) for name in
                         matches.keys()]
    distribution_data = pd.DataFrame(np.hstack(distribution_data).T, columns=["Similarity", "Model"])
    distribution_data.Similarity = distribution_data.Similarity.astype(float)
    model_names = list(matches.keys())

    # Create layout: two panels separated by a spacer column whose width
    # collapses to 0 when only a single model is shown
    cmap = get_cmap('Accent')
    fig = plt.figure(figsize=(20, 5))

    if len(model_names) == 1:
        middle = 0
    else:
        middle = .1

    if kde:
        widths = [1.5, middle, 1.5]
    else:
        widths = [1.5, middle, 0]

    heights = [1.5]
    gs = gridspec.GridSpec(1, 3, width_ratios=widths, height_ratios=heights)
    ax1 = plt.subplot(gs[:, 0])

    if kde:
        ax2 = plt.subplot(gs[:, 2], sharex=ax1)

    # Precision-recall curve
    for color, model_name in zip(cmap.colors, model_names):
        ax1.plot(min_precisions[model_name], recall[model_name], color=color)
    ax1.set_ylim(bottom=0, top=1)
    ax1.set_xlim(left=0, right=1)
    ax1.spines['right'].set_visible(False)
    ax1.spines['top'].set_visible(False)
    ax1.set_xlabel(r"$\bf{Precision}$" + "\n(Minimum Similarity)")
    ax1.set_ylabel(r"$\bf{Recall}$" + "\n(Percentage Matched)")


    # Similarity Histogram
    if kde:
        for color, model_name in zip(cmap.colors, model_names):
            sns.kdeplot(matches[model_name]["Similarity"], fill=True, ax=ax2, color=color)
        ax2.yaxis.set_label_position("right")
        ax2.yaxis.tick_right()
        ax2.set_xlabel(r"$\bf{Similarity}$")
        ax2.set_ylabel("")
        ax2.set_xlim(left=-0, right=1)
        plt.setp([ax2], title='Score Frequency - KDE')

    # Titles — the original branched on len(model_names) here too, but both
    # kde branches produced identical output, so a single check suffices
    if kde:
        fig.suptitle('Score Metrics', size=20, y=1, x=0.5)
        plt.setp([ax1], title='Precision-Recall Curve')
    else:
        fig.suptitle('Precision-Recall Curve', size=20, y=1, x=0.45)

    # Custom Legend — only useful when several models are compared
    if len(model_names) > 1:
        custom_lines = [Line2D([0], [0], color=color, lw=4) for color, model_name in zip(cmap.colors, model_names)]
        ax1.legend(custom_lines, model_names, bbox_to_anchor=(1.05, .61, .7, .902), loc=3,
                   ncol=1, borderaxespad=0., frameon=True, fontsize=10)

    if save_path:
        plt.savefig(save_path, dpi=300)

Visualize the precision recall curve for one or more models

Parameters

Name Type Description Default
matches Mapping[str, pandas.core.frame.DataFrame] contains the columns From, To, and Similarity used for calculating precision, recall, and average precision per model required
min_precisions Mapping[str, List[float]] minimum precision steps per model required
recall Mapping[str, List[float]] recall per minimum precision step per model required
kde bool whether to also visualize the kde plot True
save_path str the path to save the resulting image to None

Usage:

visualize_precision_recall(matches, min_precisions, recall, save_path="data/results.png")