FBetaMeasure metric with one value per key #5638

Merged 15 commits on Jun 2, 2022
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,10 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]

- Added metric `FBetaVerboseMeasure`, which extends `FBetaMeasure` to ensure compatibility with logging plugins and adds an `index_to_label` option.

## [v2.9.3](https://github.com/allenai/allennlp/releases/tag/v2.9.3) - 2022-04-13

### Added
1 change: 1 addition & 0 deletions allennlp/training/metrics/__init__.py
@@ -17,6 +17,7 @@
    DEFAULT_EVALB_DIR,
)
from allennlp.training.metrics.fbeta_measure import FBetaMeasure
from allennlp.training.metrics.fbeta_verbose_measure import FBetaVerboseMeasure
from allennlp.training.metrics.fbeta_multi_label_measure import (
    FBetaMultiLabelMeasure,
    F1MultiLabelMeasure,
3 changes: 1 addition & 2 deletions allennlp/training/metrics/fbeta_measure.py
@@ -178,8 +178,7 @@ def get_metric(self, reset: bool = False):
        If `self.average` is not `None`, you will get `float` instead of `List[float]`.
        """
        if self._true_positive_sum is None:
-           raise RuntimeError("You never call this metric before.")
-
+           raise RuntimeError("You have never called this metric before.")
        else:
            tp_sum = self._true_positive_sum
            pred_sum = self._pred_sum
154 changes: 154 additions & 0 deletions allennlp/training/metrics/fbeta_verbose_measure.py
@@ -0,0 +1,154 @@
from typing import List, Dict

from allennlp.common.util import nan_safe_tensor_divide
from allennlp.training.metrics.metric import Metric
from allennlp.training.metrics.fbeta_measure import FBetaMeasure


@Metric.register("fbeta_verbose")
class FBetaVerboseMeasure(FBetaMeasure):
"""Compute precision, recall, F-measure and support for each class.

This is basically the same as `FBetaMeasure` (the super class)
with two differences:
- it always returns a dictionary of floats, while `FBetaMeasure`
can return a dictionary of lists (one element for each class).
- it always returns precision, recall and F-measure for each
class and also three averaged values for each metric: micro,
macro and weighted averages.

The returned dictionary contains keys with the following format:
<class>-precision : `float`
<class>-recall : `float`
<class>-fscore : `float`
<avg>-precision : `float`
<avg>-recall : `float`
<avg>-fscore : `float`
where <class> is the index (or the label if `index_to_label` is given)
of each class; and <avg> is `micro`, `macro` and `weighted`, one for
each kind of average.

The precision is the ratio `tp / (tp + fp)` where `tp` is the number of
true positives and `fp` the number of false positives. The precision is
intuitively the ability of the classifier not to label as positive a sample
that is negative.

The recall is the ratio `tp / (tp + fn)` where `tp` is the number of
true positives and `fn` the number of false negatives. The recall is
intuitively the ability of the classifier to find all the positive samples.

The F-beta score can be interpreted as a weighted harmonic mean of
the precision and recall, where an F-beta score reaches its best
value at 1 and worst score at 0.

If we have precision and recall, the F-beta score is simply:
`F-beta = (1 + beta ** 2) * precision * recall / (beta ** 2 * precision + recall)`

The F-beta score weights recall more than precision by a factor of
`beta`. `beta == 1.0` means recall and precision are equally important.

The support is the number of occurrences of each class in `y_true`.

# Parameters

beta : `float`, optional (default = `1.0`)
The strength of recall versus precision in the F-score.

labels : `List[int]`, optional
The set of labels to include. Labels present in the data can be excluded,
for example, to calculate a multi-class average ignoring a majority
negative class. Labels not present in the data will result in 0
components in a macro or weighted average.

index_to_label : `Dict[int, str]`, optional
A dictionary mapping indices to the corresponding label.
If this map is giving, the provided metrics include the label
instead of the index for each class.
"""

    def __init__(
        self,
        beta: float = 1.0,
        labels: List[int] = None,
        index_to_label: Dict[int, str] = None,
    ) -> None:
        super().__init__(beta=beta, average=None, labels=labels)
        self._index_to_label = index_to_label

    def get_metric(self, reset: bool = False):
        """
        # Returns

        <class>-precision : `float`
        <class>-recall : `float`
        <class>-fscore : `float`
        <avg>-precision : `float`
        <avg>-recall : `float`
        <avg>-fscore : `float`

        where <class> is the index (or the label if `index_to_label` is given)
        of each class; and <avg> is `micro`, `macro` or `weighted`, one for
        each kind of average.
        """
        if self._true_positive_sum is None or self._pred_sum is None or self._true_sum is None:
            raise RuntimeError("You have never called this metric before.")

        tp_sum = self._true_positive_sum
        pred_sum = self._pred_sum
        true_sum = self._true_sum

        if self._labels is not None:
            # Retain only selected labels and order them
            tp_sum = tp_sum[self._labels]
            pred_sum = pred_sum[self._labels]  # type: ignore
            true_sum = true_sum[self._labels]  # type: ignore

        beta2 = self._beta**2

        # Finally, we have all our sufficient statistics.
        precision = nan_safe_tensor_divide(tp_sum, pred_sum)
        recall = nan_safe_tensor_divide(tp_sum, true_sum)
        fscore = nan_safe_tensor_divide(
            (1 + beta2) * precision * recall, beta2 * precision + recall
        )

        all_metrics = {}
        for c, (p, r, f) in enumerate(zip(precision.tolist(), recall.tolist(), fscore.tolist())):
            label = str(c)
            if self._index_to_label:
                label = self._index_to_label[c]
            all_metrics[f"{label}-precision"] = p
            all_metrics[f"{label}-recall"] = r
            all_metrics[f"{label}-fscore"] = f

        # macro average
        all_metrics["macro-precision"] = precision.mean().item()
        all_metrics["macro-recall"] = recall.mean().item()
        all_metrics["macro-fscore"] = fscore.mean().item()

        # weighted average
        weights = true_sum
        weights_sum = true_sum.sum()  # type: ignore
        all_metrics["weighted-precision"] = nan_safe_tensor_divide(
            (weights * precision).sum(), weights_sum
        ).item()
        all_metrics["weighted-recall"] = nan_safe_tensor_divide(
            (weights * recall).sum(), weights_sum
        ).item()
        all_metrics["weighted-fscore"] = nan_safe_tensor_divide(
            (weights * fscore).sum(), weights_sum
        ).item()

        # micro average
        micro_precision = nan_safe_tensor_divide(tp_sum.sum(), pred_sum.sum())
        micro_recall = nan_safe_tensor_divide(tp_sum.sum(), true_sum.sum())
        all_metrics["micro-precision"] = micro_precision.item()
        all_metrics["micro-recall"] = micro_recall.item()
        all_metrics["micro-fscore"] = nan_safe_tensor_divide(
            (1 + beta2) * micro_precision * micro_recall, beta2 * micro_precision + micro_recall
        ).item()

        if reset:
            self.reset()

        return all_metrics
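As a quick sanity check of the F-beta formula quoted in the class docstring, here is a small worked example in plain Python; the numbers are illustrative and not taken from the PR:

```python
# Worked check of F-beta = (1 + beta**2) * P * R / (beta**2 * P + R).
# With beta = 1.0 this reduces to the harmonic mean of precision and recall.
beta = 1.0
precision, recall = 0.5, 1.0
fbeta = (1 + beta ** 2) * precision * recall / (beta ** 2 * precision + recall)
print(fbeta)  # 0.666..., i.e. 2 * 0.5 * 1.0 / (0.5 + 1.0)
```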
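And a minimal usage sketch of the new metric, assuming the `__call__(predictions, gold_labels, mask)` signature inherited from `FBetaMeasure`; the tensors and the `index_to_label` mapping below are illustrative, not part of the PR:

```python
import torch

from allennlp.training.metrics import FBetaVerboseMeasure

# Map class indices to readable labels so keys read "pos-fscore" instead of "1-fscore".
metric = FBetaVerboseMeasure(beta=1.0, index_to_label={0: "neg", 1: "pos"})

# Class scores for a batch of 4 instances over 2 classes, plus gold class indices.
predictions = torch.tensor([[0.9, 0.1], [0.2, 0.8], [0.6, 0.4], [0.3, 0.7]])
gold_labels = torch.tensor([0, 1, 1, 1])

metric(predictions, gold_labels)

# Every value is a plain float keyed as "<label>-<metric>" or "<avg>-<metric>"
# (micro, macro, weighted), so logging plugins that expect scalar metrics can
# consume the dictionary directly.
for name, value in metric.get_metric(reset=True).items():
    print(f"{name}: {value:.3f}")
```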