# polyfuzz.metrics

## precision_recall_curve(matches, precision_steps=0.01)
Calculate the precision-recall curve based on the minimum similarity between strings.

A minimum similarity score can be used to decide when a match is considered correct. For example, we can assume that if a similarity score passes 0.95, we are quite confident that the matches are correct. This minimum similarity score can be defined as **precision** since it shows how precise we believe the matches are at a minimum.

**Recall** can then be defined as the percentage of matches found at a certain minimum similarity score. A high recall means that for a certain minimum precision score, we find many matches.
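As a quick worked example with invented similarity scores (these numbers are purely illustrative, not library output):

```python
# Invented similarity scores for four matches
similarities = [0.40, 0.80, 0.95, 0.99]
min_precision = 0.90

# Keep only matches at or above the minimum similarity
selection = [s for s in similarities if s >= min_precision]  # [0.95, 0.99]

recall = len(selection) / len(similarities)          # 0.5: half the matches remain
average_precision = sum(selection) / len(selection)  # 0.97
```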
Parameters:

Name | Type | Description | Default
---|---|---|---
`matches` | `DataFrame` | contains the columns *From*, *To*, and *Similarity* used for calculating precision, recall, and average precision | *required*
`precision_steps` | `float` | the incremental steps in minimum precision | `0.01`
Returns:

Name | Description
---|---
`min_precisions` | minimum precision steps
`recall` | recall per minimum precision step
`average_precision` | average precision per minimum precision step
Source code in polyfuzz\metrics.py
```python
def precision_recall_curve(matches: pd.DataFrame,
                           precision_steps: float = 0.01) -> Tuple[List[float],
                                                                   List[float],
                                                                   List[float]]:
    """ Calculate the precision-recall curve based on the minimum similarity between strings

    A minimum similarity score might be used to identify
    when a match could be considered correct. For example,
    we can assume that if a similarity score passes 0.95 we are
    quite confident that the matches are correct. This minimum
    similarity score can be defined as **precision** since it shows
    you how precise we believe the matches are at a minimum.

    **Recall** can then be defined as the percentage of matches
    found at a certain minimum similarity score. A high recall means
    that for a certain minimum precision score, we find many matches.

    Arguments:
        matches: contains the columns *From*, *To*, and *Similarity* used for calculating
                 precision, recall, and average precision
        precision_steps: the incremental steps in minimum precision

    Returns:
        min_precisions: minimum precision steps
        recall: recall per minimum precision step
        average_precision: average precision per minimum precision step
    """
    min_precisions = list(np.arange(0., 1 + precision_steps, precision_steps))
    average_precision = []
    recall = []
    similarities = matches.Similarity.values
    total = len(matches)

    for min_precision in min_precisions:
        selection = similarities[similarities >= min_precision]
        recall.append(len(selection) / total)

        # Suppress the mean-of-empty-slice warning when no match passes the threshold
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=RuntimeWarning)
            average_precision.append(float(np.mean(selection)))

    return min_precisions, recall, average_precision
```
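A minimal usage sketch, assuming only what the signature above documents; the matches below are invented toy data rather than output from a real model:

```python
import pandas as pd

from polyfuzz.metrics import precision_recall_curve

# Toy matches; only the *Similarity* column drives the computation
matches = pd.DataFrame({
    "From": ["apple", "aple", "banan", "pear"],
    "To": ["apple", "apple", "banana", "bear"],
    "Similarity": [1.00, 0.92, 0.88, 0.55],
})

min_precisions, recall, average_precision = precision_recall_curve(matches, precision_steps=0.05)

# At a minimum precision of 0.9, only the first two matches remain, so recall = 0.5
```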
## visualize_precision_recall(matches, min_precisions, recall, kde=True, save_path=None)
Visualize the precision-recall curve for one or more models.
Parameters:

Name | Type | Description | Default
---|---|---|---
`matches` | `Mapping[str, pandas.core.frame.DataFrame]` | contains the columns *From*, *To*, and *Similarity* used for calculating precision, recall, and average precision per model | *required*
`min_precisions` | `Mapping[str, List[float]]` | minimum precision steps per model | *required*
`recall` | `Mapping[str, List[float]]` | recall per minimum precision step per model | *required*
`kde` | `bool` | whether to also visualize the kde plot | `True`
`save_path` | `str` | the path to save the resulting image to | `None`
Usage:

```python
visualize_precision_recall(matches, min_precisions, recall, save_path="data/results.png")
```
Source code in polyfuzz\metrics.py
````python
def visualize_precision_recall(matches: Mapping[str, pd.DataFrame],
                               min_precisions: Mapping[str, List[float]],
                               recall: Mapping[str, List[float]],
                               kde: bool = True,
                               save_path: str = None):
    """ Visualize the precision-recall curve for one or more models

    Arguments:
        matches: contains the columns *From*, *To*, and *Similarity* used for calculating
                 precision, recall, and average precision per model
        min_precisions: minimum precision steps per model
        recall: recall per minimum precision step per model
        kde: whether to also visualize the kde plot
        save_path: the path to save the resulting image to

    Usage:

    ```python
    visualize_precision_recall(matches, min_precisions, recall, save_path="data/results.png")
    ```
    """
    SMALL_SIZE = 10
    MEDIUM_SIZE = 12
    BIGGER_SIZE = 14

    plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
    plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
    plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
    plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
    plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
    plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
    plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

    # Wrap single-model input so that all inputs are mappings from model name to data
    if not isinstance(matches, dict):
        matches = {"Model": matches}
        min_precisions = {"Model": min_precisions}
        recall = {"Model": recall}

    # Create a single dataset of similarity scores for all models
    distribution_data = [(matches[name].Similarity.values,
                          [name for _ in range(len(matches[name]))]) for name in matches.keys()]
    distribution_data = pd.DataFrame(np.hstack(distribution_data).T, columns=["Similarity", "Model"])
    distribution_data.Similarity = distribution_data.Similarity.astype(float)
    model_names = list(matches.keys())

    # Create layout
    cmap = get_cmap('Accent')
    fig = plt.figure(figsize=(20, 5))

    if len(model_names) == 1:
        middle = 0
    else:
        middle = .1

    if kde:
        widths = [1.5, middle, 1.5]
    else:
        widths = [1.5, middle, 0]
    heights = [1.5]

    gs = gridspec.GridSpec(1, 3, width_ratios=widths, height_ratios=heights)
    ax1 = plt.subplot(gs[:, 0])
    if kde:
        ax2 = plt.subplot(gs[:, 2], sharex=ax1)

    # Precision-recall curve
    for color, model_name in zip(cmap.colors, model_names):
        ax1.plot(min_precisions[model_name], recall[model_name], color=color)
    ax1.set_ylim(bottom=0, top=1)
    ax1.set_xlim(left=0, right=1)
    ax1.spines['right'].set_visible(False)
    ax1.spines['top'].set_visible(False)
    ax1.set_xlabel(r"$\bf{Precision}$" + "\n(Minimum Similarity)")
    ax1.set_ylabel(r"$\bf{Recall}$" + "\n(Percentage Matched)")

    # Similarity histogram (KDE)
    if kde:
        for color, model_name in zip(cmap.colors, model_names):
            sns.kdeplot(matches[model_name]["Similarity"], fill=True, ax=ax2, color=color)
        ax2.yaxis.set_label_position("right")
        ax2.yaxis.tick_right()
        ax2.set_xlabel(r"$\bf{Similarity}$")
        ax2.set_ylabel("")
        ax2.set_xlim(left=0, right=1)
        plt.setp([ax2], title='Score Frequency - KDE')

    # Titles
    if kde:
        fig.suptitle('Score Metrics', size=20, y=1, x=0.5)
        plt.setp([ax1], title='Precision-Recall Curve')
    else:
        fig.suptitle('Precision-Recall Curve', size=20, y=1, x=0.45)

    # Custom legend
    if len(model_names) > 1:
        custom_lines = [Line2D([0], [0], color=color, lw=4) for color, _ in zip(cmap.colors, model_names)]
        ax1.legend(custom_lines, model_names, bbox_to_anchor=(1.05, .61, .7, .902), loc=3,
                   ncol=1, borderaxespad=0., frameon=True, fontsize=10)

    if save_path:
        plt.savefig(save_path, dpi=300)
````
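Putting both functions together, a hedged end-to-end sketch: the model names and similarity values below are fabricated, and in practice the DataFrames would come from matched PolyFuzz models:

```python
import pandas as pd

from polyfuzz.metrics import precision_recall_curve, visualize_precision_recall

# Fabricated match results for two hypothetical models
matches = {
    "TF-IDF": pd.DataFrame({
        "From": ["apple", "aple", "banan"],
        "To": ["apple", "apple", "banana"],
        "Similarity": [1.00, 0.91, 0.84],
    }),
    "EditDistance": pd.DataFrame({
        "From": ["apple", "aple", "banan"],
        "To": ["apple", "apple", "banana"],
        "Similarity": [1.00, 0.80, 0.83],
    }),
}

# Compute the curve per model, then plot all models in one figure
min_precisions, recall = {}, {}
for name, df in matches.items():
    min_precisions[name], recall[name], _ = precision_recall_curve(df)

visualize_precision_recall(matches, min_precisions, recall, save_path="results.png")
```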