From c723469584dfb538b25e86bcedbd9ee6ed9d408b Mon Sep 17 00:00:00 2001
From: Tom Aarsen
Date: Wed, 20 Oct 2021 16:54:18 +0200
Subject: [PATCH 1/6] Add Precision, Recall, F-measure, Confusion Matrix and per-tag evaluation to Taggers

And add precision, recall and f-measure to ConfusionMatrix.
Includes large doctests, and some small doctest fixes throughout the tag module
---
 nltk/metrics/confusionmatrix.py | 75 ++++++
 nltk/tag/__init__.py | 6 +-
 nltk/tag/api.py | 244 +++++++++++++++++-
 nltk/tag/brill.py | 2 +-
 nltk/tag/sequential.py | 4 +-
 nltk/test/metrics.doctest | 21 ++
 nltk/test/tag.doctest | 432 ++++++++++++++++++++++++++++++++
 7 files changed, 775 insertions(+), 9 deletions(-)

diff --git a/nltk/metrics/confusionmatrix.py b/nltk/metrics/confusionmatrix.py
index 1dc7121082..a2b196e27e 100644
--- a/nltk/metrics/confusionmatrix.py
+++ b/nltk/metrics/confusionmatrix.py
@@ -201,6 +201,81 @@ def key(self):
 return str
+ def recall(self, value):
+ """Given a value in the confusion matrix, return the recall
+ that corresponds to this value. The recall is defined as:
+
+ - *r* = true positive / (true positive + false negative)
+
+ and can loosely be considered the ratio of how often ``value``
+ was predicted correctly relative to how often ``value`` was
+ the true result.
+
+ :param value: value used in the ConfusionMatrix
+ :return: the recall corresponding to ``value``.
+ :rtype: float
+ """
+ # Number of times `value` was correct, and also predicted
+ TP = self[value, value]
+ # Number of times `value` was correct
+ TP_FN = sum(self[value, pred_value] for pred_value in self._values)
+ if TP_FN == 0:
+ return 0.0
+ return TP / TP_FN
+
+ def precision(self, value):
+ """Given a value in the confusion matrix, return the precision
+ that corresponds to this value. The precision is defined as:
+
+ - *p* = true positive / (true positive + false positive)
+
+ and can loosely be considered the ratio of how often ``value``
+ was predicted correctly relative to the number of predictions
+ for ``value``.
+
+ :param value: value used in the ConfusionMatrix
+ :return: the precision corresponding to ``value``.
+ :rtype: float
+ """
+ # Number of times `value` was correct, and also predicted
+ TP = self[value, value]
+ # Number of times `value` was predicted
+ TP_FP = sum(self[real_value, value] for real_value in self._values)
+ if TP_FP == 0:
+ return 0.0
+ return TP / TP_FP
+
+ def f_measure(self, value, alpha=0.5):
+ """
+ Given a value used in the confusion matrix, return the f-measure
+ that corresponds to this value. The f-measure is the harmonic mean
+ of the ``precision`` and ``recall``, weighted by ``alpha``.
+ In particular, given the precision *p* and recall *r* defined by:
+
+ - *p* = true positive / (true positive + false positive)
+ - *r* = true positive / (true positive + false negative)
+
+ The f-measure is:
+
+ - *1/(alpha/p + (1-alpha)/r)*
+
+ With ``alpha = 0.5``, this reduces to:
+
+ - *2pr / (p + r)*
+
+ :param value: value used in the ConfusionMatrix
+ :param alpha: Ratio of the cost of false negatives relative to false
+ positives. Defaults to 0.5, where the costs are equal.
+ :type alpha: float
+ :return: the F-measure corresponding to ``value``.
+ :rtype: float + """ + p = self.precision(value) + r = self.recall(value) + if p == 0.0 or r == 0.0: + return 0.0 + return 1.0 / (alpha / p + (1 - alpha) / r) + def demo(): reference = "DET NN VB DET JJ NN NN IN DET NN".split() diff --git a/nltk/tag/__init__.py b/nltk/tag/__init__.py index ce7610e171..c3d2a94e0b 100644 --- a/nltk/tag/__init__.py +++ b/nltk/tag/__init__.py @@ -21,7 +21,7 @@ An off-the-shelf tagger is available for English. It uses the Penn Treebank tagset: >>> from nltk import pos_tag, word_tokenize - >>> pos_tag(word_tokenize("John's big idea isn't all that bad.")) + >>> pos_tag(word_tokenize("John's big idea isn't all that bad.")) # doctest: +NORMALIZE_WHITESPACE [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is', 'VBZ'), ("n't", 'RB'), ('all', 'PDT'), ('that', 'DT'), ('bad', 'JJ'), ('.', '.')] @@ -144,10 +144,10 @@ def pos_tag(tokens, tagset=None, lang="eng"): >>> from nltk.tag import pos_tag >>> from nltk.tokenize import word_tokenize - >>> pos_tag(word_tokenize("John's big idea isn't all that bad.")) + >>> pos_tag(word_tokenize("John's big idea isn't all that bad.")) # doctest: +NORMALIZE_WHITESPACE [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is', 'VBZ'), ("n't", 'RB'), ('all', 'PDT'), ('that', 'DT'), ('bad', 'JJ'), ('.', '.')] - >>> pos_tag(word_tokenize("John's big idea isn't all that bad."), tagset='universal') + >>> pos_tag(word_tokenize("John's big idea isn't all that bad."), tagset='universal') # doctest: +NORMALIZE_WHITESPACE [('John', 'NOUN'), ("'s", 'PRT'), ('big', 'ADJ'), ('idea', 'NOUN'), ('is', 'VERB'), ("n't", 'ADV'), ('all', 'DET'), ('that', 'DET'), ('bad', 'ADJ'), ('.', '.')] diff --git a/nltk/tag/api.py b/nltk/tag/api.py index 9ef6513549..b769522025 100644 --- a/nltk/tag/api.py +++ b/nltk/tag/api.py @@ -11,10 +11,12 @@ information, such as its part of speech. """ from abc import ABCMeta, abstractmethod +from functools import lru_cache from itertools import chain +from typing import Dict from nltk.internals import overridden -from nltk.metrics import accuracy +from nltk.metrics import ConfusionMatrix, accuracy from nltk.tag.util import untag @@ -47,7 +49,7 @@ def tag(self, tokens): def tag_sents(self, sentences): """ - Apply ``self.tag()`` to each element of *sentences*. I.e.: + Apply ``self.tag()`` to each element of *sentences*. I.e.:: return [self.tag(sent) for sent in sentences] """ @@ -59,8 +61,8 @@ def evaluate(self, gold): Strip the tags from the gold standard text, retag it using the tagger, then compute the accuracy score. - :type gold: list(list(tuple(str, str))) :param gold: The list of tagged sentences to score the tagger on. + :type gold: list(list(tuple(str, str))) :rtype: float """ @@ -69,6 +71,242 @@ def evaluate(self, gold): test_tokens = list(chain.from_iterable(tagged_sents)) return accuracy(gold_tokens, test_tokens) + @lru_cache(maxsize=1) + def _confusion_cached(self, gold): + """ + Inner function used after ``gold`` is converted to a + ``tuple(tuple(tuple(str, str)))``. That way, we can use caching on + creating a ConfusionMatrix. + + :param gold: The list of tagged sentences to run the tagger with, + also used as the reference values in the generated confusion matrix. 
+ :type gold: tuple(tuple(tuple(str, str))) + :rtype: ConfusionMatrix + """ + + tagged_sents = self.tag_sents(untag(sent) for sent in gold) + gold_tokens = [token for _word, token in chain.from_iterable(gold)] + test_tokens = [token for _word, token in chain.from_iterable(tagged_sents)] + return ConfusionMatrix(gold_tokens, test_tokens) + + def confusion(self, gold): + """ + Return a ConfusionMatrix with the tags from ``gold`` as the reference + values, with the predictions from ``tag_sents`` as the predicted values. + + >>> from nltk.tag import PerceptronTagger + >>> from nltk.corpus import treebank + >>> tagger = PerceptronTagger() + >>> gold_data = treebank.tagged_sents()[:10] + >>> print(tagger.confusion(gold_data)) + | - | + | N | + | O P | + | N J J N N P P R R V V V V V W | + | ' E C C D E I J J J M N N N O R P R B R T V B B B B B D ` | + | ' , - . C D T X N J R S D N P S S P $ B R P O B D G N P Z T ` | + -------+----------------------------------------------------------------------------------------------+ + '' | <1> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + , | .<15> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + -NONE- | . . <.> . . 2 . . . 2 . . . 5 1 . . . . 2 . . . . . . . . . . . | + . | . . .<10> . . . . . . . . . . . . . . . . . . . . . . . . . . . | + CC | . . . . <1> . . . . . . . . . . . . . . . . . . . . . . . . . . | + CD | . . . . . <5> . . . . . . . . . . . . . . . . . . . . . . . . . | + DT | . . . . . .<20> . . . . . . . . . . . . . . . . . . . . . . . . | + EX | . . . . . . . <1> . . . . . . . . . . . . . . . . . . . . . . . | + IN | . . . . . . . .<22> . . . . . . . . . . 3 . . . . . . . . . . . | + JJ | . . . . . . . . .<16> . . . . 1 . . . . 1 . . . . . . . . . . . | + JJR | . . . . . . . . . . <.> . . . . . . . . . . . . . . . . . . . . | + JJS | . . . . . . . . . . . <1> . . . . . . . . . . . . . . . . . . . | + MD | . . . . . . . . . . . . <1> . . . . . . . . . . . . . . . . . . | + NN | . . . . . . . . . . . . .<28> 1 1 . . . . . . . . . . . . . . . | + NNP | . . . . . . . . . . . . . .<25> . . . . . . . . . . . . . . . . | + NNS | . . . . . . . . . . . . . . .<19> . . . . . . . . . . . . . . . | + POS | . . . . . . . . . . . . . . . . <1> . . . . . . . . . . . . . . | + PRP | . . . . . . . . . . . . . . . . . <4> . . . . . . . . . . . . . | + PRP$ | . . . . . . . . . . . . . . . . . . <2> . . . . . . . . . . . . | + RB | . . . . . . . . . . . . . . . . . . . <4> . . . . . . . . . . . | + RBR | . . . . . . . . . . 1 . . . . . . . . . <1> . . . . . . . . . . | + RP | . . . . . . . . . . . . . . . . . . . . . <1> . . . . . . . . . | + TO | . . . . . . . . . . . . . . . . . . . . . . <5> . . . . . . . . | + VB | . . . . . . . . . . . . . . . . . . . . . . . <3> . . . . . . . | + VBD | . . . . . . . . . . . . . 1 . . . . . . . . . . <6> . . . . . . | + VBG | . . . . . . . . . . . . . 1 . . . . . . . . . . . <4> . . . . . | + VBN | . . . . . . . . . . . . . . . . . . . . . . . . 1 . <4> . . . . | + VBP | . . . . . . . . . . . . . . . . . . . . . . . . . . . <3> . . . | + VBZ | . . . . . . . . . . . . . . . . . . . . . . . . . . . . <7> . . | + WDT | . . . . . . . . 2 . . . . . . . . . . . . . . . . . . . . <.> . | + `` | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
<1>|
+ -------+----------------------------------------------------------------------------------------------+
+ (row = reference; col = test)
+
+
+ :param gold: The list of tagged sentences to run the tagger with,
+ also used as the reference values in the generated confusion matrix.
+ :type gold: list(list(tuple(str, str)))
+ :rtype: ConfusionMatrix
+ """
+
+ return self._confusion_cached(tuple(tuple(sent) for sent in gold))
+
+ def recall(self, gold) -> Dict[str, float]:
+ """
+ Compute the recall for each tag from ``gold`` or from running ``tag``
+ on the tokenized sentences from ``gold``. Then, return the dictionary
+ with mappings from tag to recall. The recall is defined as:
+
+ - *r* = true positive / (true positive + false negative)
+
+ :param gold: The list of tagged sentences to score the tagger on.
+ :type gold: list(list(tuple(str, str)))
+ :return: A mapping from tags to recall
+ :rtype: Dict[str, float]
+ """
+
+ cm = self.confusion(gold)
+ return {tag: cm.recall(tag) for tag in cm._values}
+
+ def precision(self, gold):
+ """
+ Compute the precision for each tag from ``gold`` or from running ``tag``
+ on the tokenized sentences from ``gold``. Then, return the dictionary
+ with mappings from tag to precision. The precision is defined as:
+
+ - *p* = true positive / (true positive + false positive)
+
+ :param gold: The list of tagged sentences to score the tagger on.
+ :type gold: list(list(tuple(str, str)))
+ :return: A mapping from tags to precision
+ :rtype: Dict[str, float]
+ """
+
+ cm = self.confusion(gold)
+ return {tag: cm.precision(tag) for tag in cm._values}
+
+ def f_measure(self, gold, alpha=0.5):
+ """
+ Compute the f-measure for each tag from ``gold`` or from running ``tag``
+ on the tokenized sentences from ``gold``. Then, return the dictionary
+ with mappings from tag to f-measure. The f-measure is the harmonic mean
+ of the ``precision`` and ``recall``, weighted by ``alpha``.
+ In particular, given the precision *p* and recall *r* defined by:
+
+ - *p* = true positive / (true positive + false positive)
+ - *r* = true positive / (true positive + false negative)
+
+ The f-measure is:
+
+ - *1/(alpha/p + (1-alpha)/r)*
+
+ With ``alpha = 0.5``, this reduces to:
+
+ - *2pr / (p + r)*
+
+ :param gold: The list of tagged sentences to score the tagger on.
+ :type gold: list(list(tuple(str, str)))
+ :param alpha: Ratio of the cost of false negatives relative to false
+ positives. Defaults to 0.5, where the costs are equal.
+ :type alpha: float
+ :return: A mapping from tags to f-measure
+ :rtype: Dict[str, float]
+ """
+ cm = self.confusion(gold)
+ return {tag: cm.f_measure(tag, alpha) for tag in cm._values}
+
+ def evaluate_per_tag(self, gold, alpha=0.5, truncate=None, sort_by_count=False):
+ """Tabulate the **recall**, **precision** and **f-measure**
+ for each tag from ``gold`` or from running ``tag`` on the tokenized
+ sentences from ``gold``.
+
+ >>> from nltk.tag import PerceptronTagger
+ >>> from nltk.corpus import treebank
+ >>> tagger = PerceptronTagger()
+ >>> gold_data = treebank.tagged_sents()[:10]
+ >>> print(tagger.evaluate_per_tag(gold_data))
+ Tag | Prec. | Recall | F-measure
+ -------+--------+--------+-----------
+ '' | 1.0000 | 1.0000 | 1.0000
+ , | 1.0000 | 1.0000 | 1.0000
+ -NONE- | 0.0000 | 0.0000 | 0.0000
+ . 
| 1.0000 | 1.0000 | 1.0000 + CC | 1.0000 | 1.0000 | 1.0000 + CD | 0.7143 | 1.0000 | 0.8333 + DT | 1.0000 | 1.0000 | 1.0000 + EX | 1.0000 | 1.0000 | 1.0000 + IN | 0.9167 | 0.8800 | 0.8980 + JJ | 0.8889 | 0.8889 | 0.8889 + JJR | 0.0000 | 0.0000 | 0.0000 + JJS | 1.0000 | 1.0000 | 1.0000 + MD | 1.0000 | 1.0000 | 1.0000 + NN | 0.8000 | 0.9333 | 0.8615 + NNP | 0.8929 | 1.0000 | 0.9434 + NNS | 0.9500 | 1.0000 | 0.9744 + POS | 1.0000 | 1.0000 | 1.0000 + PRP | 1.0000 | 1.0000 | 1.0000 + PRP$ | 1.0000 | 1.0000 | 1.0000 + RB | 0.4000 | 1.0000 | 0.5714 + RBR | 1.0000 | 0.5000 | 0.6667 + RP | 1.0000 | 1.0000 | 1.0000 + TO | 1.0000 | 1.0000 | 1.0000 + VB | 1.0000 | 1.0000 | 1.0000 + VBD | 0.8571 | 0.8571 | 0.8571 + VBG | 1.0000 | 0.8000 | 0.8889 + VBN | 1.0000 | 0.8000 | 0.8889 + VBP | 1.0000 | 1.0000 | 1.0000 + VBZ | 1.0000 | 1.0000 | 1.0000 + WDT | 0.0000 | 0.0000 | 0.0000 + `` | 1.0000 | 1.0000 | 1.0000 + + + :param gold: The list of tagged sentences to score the tagger on. + :type gold: list(list(tuple(str, str))) + :param alpha: Ratio of the cost of false negative compared to false + positives, as used in the f-measure computation. Defaults to 0.5, + where the costs are equal. + :type alpha: float + :param truncate: If specified, then only show the specified + number of values. Any sorting (e.g., sort_by_count) + will be performed before truncation. Defaults to None + :type truncate: int, optional + :param sort_by_count: Whether to sort the outputs on number of + occurrences of that tag in the ``gold`` data, defaults to False + :type sort_by_count: bool, optional + :return: A tabulated recall, precision and f-measure string + :rtype: str + """ + # Gather Confusion Matrix and metrics + cm = self.confusion(gold) + recalls = self.recall(gold) + precisions = self.precision(gold) + f_measures = self.f_measure(gold) + + tags = cm._values + + # Apply keyword parameters + if sort_by_count: + tags = sorted(tags, key=lambda v: -sum(cm._confusion[cm._indices[v]])) + if truncate: + tags = tags[:truncate] + + tag_column_len = max(max(len(tag) for tag in tags), 3) + + # Construct the header + s = ( + f"{' ' * (tag_column_len - 3)}Tag | Prec. | Recall | F-measure\n" + f"{'-' * tag_column_len}-+--------+--------+-----------\n" + ) + + # Construct the body + for tag in tags: + s += ( + f"{tag:>{tag_column_len}} | " + f"{precisions[tag]:<6.4f} | " + f"{recalls[tag]:<6.4f} | " + f"{f_measures[tag]:.4f}\n" + ) + + return s + def _check_params(self, train, model): if (train and model) or (not train and not model): raise ValueError("Must specify either training data or trained model.") diff --git a/nltk/tag/brill.py b/nltk/tag/brill.py index b0b7607eac..05a8dd5f43 100644 --- a/nltk/tag/brill.py +++ b/nltk/tag/brill.py @@ -329,7 +329,7 @@ def print_train_stats(): ) print( "TRAIN ({tokencount:7d} tokens) initial {initialerrors:5d} {initialacc:.4f} " - "final: {finalerrors:5d} {finalacc:.4f} ".format(**train_stats) + "final: {finalerrors:5d} {finalacc:.4f}".format(**train_stats) ) head = "#ID | Score (train) | #Rules | Template" print(head, "\n", "-" * len(head), sep="") diff --git a/nltk/tag/sequential.py b/nltk/tag/sequential.py index 3576d2d8a9..09745d0a91 100644 --- a/nltk/tag/sequential.py +++ b/nltk/tag/sequential.py @@ -337,7 +337,7 @@ class UnigramTagger(NgramTagger): >>> test_sent = brown.sents(categories='news')[0] >>> unigram_tagger = UnigramTagger(brown.tagged_sents(categories='news')[:500]) >>> for tok, tag in unigram_tagger.tag(test_sent): - ... print("({}, {}), ".format(tok, tag)) + ... 
print("({}, {}), ".format(tok, tag)) # doctest: +NORMALIZE_WHITESPACE (The, AT), (Fulton, NP-TL), (County, NN-TL), (Grand, JJ-TL), (Jury, NN-TL), (said, VBD), (Friday, NR), (an, AT), (investigation, NN), (of, IN), (Atlanta's, NP$), (recent, JJ), @@ -515,7 +515,7 @@ class RegexpTagger(SequentialBackoffTagger): ... ]) >>> regexp_tagger - >>> regexp_tagger.tag(test_sent) + >>> regexp_tagger.tag(test_sent) # doctest: +NORMALIZE_WHITESPACE [('The', 'AT'), ('Fulton', 'NN'), ('County', 'NN'), ('Grand', 'NN'), ('Jury', 'NN'), ('said', 'NN'), ('Friday', 'NN'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'NN'), ("Atlanta's", 'NNS'), ('recent', 'NN'), ('primary', 'NN'), ('election', 'NN'), diff --git a/nltk/test/metrics.doctest b/nltk/test/metrics.doctest index 42df6f42e9..d4852c12f1 100644 --- a/nltk/test/metrics.doctest +++ b/nltk/test/metrics.doctest @@ -217,6 +217,27 @@ Confusion Matrix 10: h +For "e", the number of true positives should be 6, while the number of false negatives is 3. +So, the recall ought to be 6 / (6 + 3): + + >>> cm.recall("e") # doctest: +ELLIPSIS + 0.666666... + +For "e", the false positive is just 1, so the precision should be 6 / (6 + 1): + + >>> cm.precision("e") # doctest: +ELLIPSIS + 0.857142... + +The f-measure with default value of ``alpha = 0.5`` should then be: + +* *1/(alpha/p + (1-alpha)/r) =* +* *1/(0.5/p + 0.5/r) =* +* *2pr / (p + r) =* +* *2 * 0.857142... * 0.666666... / (0.857142... + 0.666666...) =* +* *0.749999...* + + >>> cm.f_measure("e") # doctest: +ELLIPSIS + 0.749999... -------------------- Association measures diff --git a/nltk/test/tag.doctest b/nltk/test/tag.doctest index ca893b8f90..a78ab4d3f8 100644 --- a/nltk/test/tag.doctest +++ b/nltk/test/tag.doctest @@ -1,6 +1,438 @@ .. Copyright (C) 2001-2021 NLTK Project .. For license information, see LICENSE.TXT +Evaluation of Taggers +===================== + +Evaluating the standard NLTK PerceptronTagger using Accuracy, +Precision, Recall and F-measure for each of the tags. + + >>> from nltk.tag import PerceptronTagger + >>> from nltk.corpus import treebank + >>> tagger = PerceptronTagger() + >>> gold_data = treebank.tagged_sents()[10:20] + >>> print(tagger.evaluate(gold_data)) # doctest: +ELLIPSIS + 0.885931... + + >>> print(tagger.evaluate_per_tag(gold_data)) + Tag | Prec. | Recall | F-measure + -------+--------+--------+----------- + '' | 1.0000 | 1.0000 | 1.0000 + , | 1.0000 | 1.0000 | 1.0000 + -NONE- | 0.0000 | 0.0000 | 0.0000 + . | 1.0000 | 1.0000 | 1.0000 + : | 1.0000 | 1.0000 | 1.0000 + CC | 1.0000 | 1.0000 | 1.0000 + CD | 0.7647 | 1.0000 | 0.8667 + DT | 1.0000 | 1.0000 | 1.0000 + IN | 1.0000 | 1.0000 | 1.0000 + JJ | 0.5882 | 0.8333 | 0.6897 + JJR | 1.0000 | 1.0000 | 1.0000 + JJS | 1.0000 | 1.0000 | 1.0000 + NN | 0.7647 | 0.9630 | 0.8525 + NNP | 0.8929 | 1.0000 | 0.9434 + NNS | 1.0000 | 1.0000 | 1.0000 + POS | 1.0000 | 1.0000 | 1.0000 + PRP | 1.0000 | 1.0000 | 1.0000 + RB | 0.8000 | 1.0000 | 0.8889 + RBR | 0.0000 | 0.0000 | 0.0000 + TO | 1.0000 | 1.0000 | 1.0000 + VB | 1.0000 | 1.0000 | 1.0000 + VBD | 0.8571 | 0.9231 | 0.8889 + VBG | 1.0000 | 1.0000 | 1.0000 + VBN | 0.8333 | 0.5556 | 0.6667 + VBP | 0.5714 | 0.8000 | 0.6667 + VBZ | 1.0000 | 1.0000 | 1.0000 + WP | 1.0000 | 1.0000 | 1.0000 + `` | 1.0000 | 1.0000 | 1.0000 + + +List only the 10 most common tags: + + >>> print(tagger.evaluate_per_tag(gold_data, truncate=10, sort_by_count=True)) + Tag | Prec. 
| Recall | F-measure + -------+--------+--------+----------- + IN | 1.0000 | 1.0000 | 1.0000 + DT | 1.0000 | 1.0000 | 1.0000 + NN | 0.7647 | 0.9630 | 0.8525 + NNP | 0.8929 | 1.0000 | 0.9434 + NNS | 1.0000 | 1.0000 | 1.0000 + -NONE- | 0.0000 | 0.0000 | 0.0000 + CD | 0.7647 | 1.0000 | 0.8667 + VBD | 0.8571 | 0.9231 | 0.8889 + JJ | 0.5882 | 0.8333 | 0.6897 + , | 1.0000 | 1.0000 | 1.0000 + + +Similarly, we can display the confusion matrix for this tagger. + + >>> print(tagger.confusion(gold_data)) + | - | + | N | + | O | + | N J J N N P P R V V V V V | + | ' E C C D I J J J N N N O R R B T V B B B B B W ` | + | ' , - . : C D T N J R S N P S S P B R O B D G N P Z P ` | + -------+-------------------------------------------------------------------------------------+ + '' | <3> . . . . . . . . . . . . . . . . . . . . . . . . . . . | + , | .<11> . . . . . . . . . . . . . . . . . . . . . . . . . . | + -NONE- | . . <.> . . . 4 . . 4 . . 7 2 . . . 1 . . . . . . 3 . . . | + . | . . .<10> . . . . . . . . . . . . . . . . . . . . . . . . | + : | . . . . <1> . . . . . . . . . . . . . . . . . . . . . . . | + CC | . . . . . <5> . . . . . . . . . . . . . . . . . . . . . . | + CD | . . . . . .<13> . . . . . . . . . . . . . . . . . . . . . | + DT | . . . . . . .<28> . . . . . . . . . . . . . . . . . . . . | + IN | . . . . . . . .<34> . . . . . . . . . . . . . . . . . . . | + JJ | . . . . . . . . .<10> . . . 1 . . . . 1 . . . . . . . . . | + JJR | . . . . . . . . . . <1> . . . . . . . . . . . . . . . . . | + JJS | . . . . . . . . . . . <1> . . . . . . . . . . . . . . . . | + NN | . . . . . . . . . 1 . .<26> . . . . . . . . . . . . . . . | + NNP | . . . . . . . . . . . . .<25> . . . . . . . . . . . . . . | + NNS | . . . . . . . . . . . . . .<22> . . . . . . . . . . . . . | + POS | . . . . . . . . . . . . . . . <1> . . . . . . . . . . . . | + PRP | . . . . . . . . . . . . . . . . <3> . . . . . . . . . . . | + RB | . . . . . . . . . . . . . . . . . <4> . . . . . . . . . . | + RBR | . . . . . . . . . . . . . . . . . . <.> . . . . . . . . . | + TO | . . . . . . . . . . . . . . . . . . . <2> . . . . . . . . | + VB | . . . . . . . . . . . . . . . . . . . . <1> . . . . . . . | + VBD | . . . . . . . . . . . . . . . . . . . . .<12> . 1 . . . . | + VBG | . . . . . . . . . . . . . . . . . . . . . . <3> . . . . . | + VBN | . . . . . . . . . 2 . . . . . . . . . . . 2 . <5> . . . . | + VBP | . . . . . . . . . . . . 1 . . . . . . . . . . . <4> . . . | + VBZ | . . . . . . . . . . . . . . . . . . . . . . . . . <2> . . | + WP | . . . . . . . . . . . . . . . . . . . . . . . . . . <3> . | + `` | . . . . . . . . . . . . . . . . . . . . . . . . . . . <3>| + -------+-------------------------------------------------------------------------------------+ + (row = reference; col = test) + + +Brill Trainer with evaluation +============================= + + >>> # Perform the relevant imports. + >>> from nltk.tbl.template import Template + >>> from nltk.tag.brill import Pos, Word + >>> from nltk.tag import untag, RegexpTagger, BrillTaggerTrainer, UnigramTagger + + >>> # Load some data + >>> from nltk.corpus import treebank + >>> training_data = treebank.tagged_sents()[:100] + >>> baseline_data = treebank.tagged_sents()[100:200] + >>> gold_data = treebank.tagged_sents()[200:300] + >>> testing_data = [untag(s) for s in gold_data] + + >>> backoff = RegexpTagger([ + ... (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers + ... (r'(The|the|A|a|An|an)$', 'AT'), # articles + ... (r'.*able$', 'JJ'), # adjectives + ... 
(r'.*ness$', 'NN'), # nouns formed from adjectives + ... (r'.*ly$', 'RB'), # adverbs + ... (r'.*s$', 'NNS'), # plural nouns + ... (r'.*ing$', 'VBG'), # gerunds + ... (r'.*ed$', 'VBD'), # past tense verbs + ... (r'.*', 'NN') # nouns (default) + ... ]) + +We've now created a simple ``RegexpTagger``, which tags according to the regular expression +rules it has been supplied. This tagger in and of itself does not have a great accuracy. + + >>> backoff.evaluate(gold_data) #doctest: +ELLIPSIS + 0.245014... + +Neither does a simple ``UnigramTagger``. This tagger is trained on some data, +and will then first try to match unigrams (i.e. tokens) of the sentence it has +to tag to the learned data. + + >>> unigram_tagger = UnigramTagger(baseline_data) + >>> unigram_tagger.evaluate(gold_data) #doctest: +ELLIPSIS + 0.581196... + +The lackluster accuracy here can be explained with the following example: + + >>> unigram_tagger.tag(["I", "would", "like", "this", "sentence", "to", "be", "tagged"]) + [('I', 'NNP'), ('would', 'MD'), ('like', None), ('this', 'DT'), ('sentence', None), + ('to', 'TO'), ('be', 'VB'), ('tagged', None)] + +As you can see, many tokens are tagged as ``None``, as these tokens are OOV (out of vocabulary). +The ``UnigramTagger`` has never seen them, and as a result they are not in its database of known terms. + +In practice, a ``UnigramTagger`` is exclusively used in conjunction with a *backoff*. Our real +baseline which will use such a backoff. We'll create a ``UnigramTagger`` like before, but now +the ``RegexpTagger`` will be used as a backoff for the situations where the ``UnigramTagger`` +encounters an OOV token. + + >>> baseline = UnigramTagger(baseline_data, backoff=backoff) + >>> baseline.evaluate(gold_data) #doctest: +ELLIPSIS + 0.7537647... + +That is already much better. We can investigate the performance further by running +``evaluate_per_tag``. This method will output the *Precision*, *Recall* and *F-measure* +of each tag. + + >>> print(baseline.evaluate_per_tag(gold_data, sort_by_count=True)) + Tag | Prec. | Recall | F-measure + -------+--------+--------+----------- + NNP | 0.9674 | 0.2738 | 0.4269 + NN | 0.4111 | 0.9136 | 0.5670 + IN | 0.9383 | 0.9580 | 0.9480 + DT | 0.9819 | 0.8859 | 0.9314 + JJ | 0.8167 | 0.2970 | 0.4356 + NNS | 0.7393 | 0.9630 | 0.8365 + -NONE- | 1.0000 | 0.8345 | 0.9098 + , | 1.0000 | 1.0000 | 1.0000 + . | 1.0000 | 1.0000 | 1.0000 + VBD | 0.6429 | 0.8804 | 0.7431 + CD | 1.0000 | 0.9872 | 0.9935 + CC | 1.0000 | 0.9355 | 0.9667 + VB | 0.7778 | 0.3684 | 0.5000 + VBN | 0.9375 | 0.3000 | 0.4545 + RB | 0.7778 | 0.7447 | 0.7609 + TO | 1.0000 | 1.0000 | 1.0000 + VBZ | 0.9643 | 0.6429 | 0.7714 + VBG | 0.6415 | 0.9444 | 0.7640 + PRP$ | 1.0000 | 1.0000 | 1.0000 + PRP | 1.0000 | 0.5556 | 0.7143 + MD | 1.0000 | 1.0000 | 1.0000 + VBP | 0.6471 | 0.5789 | 0.6111 + POS | 1.0000 | 1.0000 | 1.0000 + $ | 1.0000 | 0.8182 | 0.9000 + '' | 1.0000 | 1.0000 | 1.0000 + : | 1.0000 | 1.0000 | 1.0000 + WDT | 0.4000 | 0.2000 | 0.2667 + `` | 1.0000 | 1.0000 | 1.0000 + JJR | 1.0000 | 0.5000 | 0.6667 + NNPS | 0.0000 | 0.0000 | 0.0000 + RBR | 1.0000 | 1.0000 | 1.0000 + -LRB- | 0.0000 | 0.0000 | 0.0000 + -RRB- | 0.0000 | 0.0000 | 0.0000 + RP | 0.6667 | 0.6667 | 0.6667 + EX | 0.5000 | 0.5000 | 0.5000 + JJS | 0.0000 | 0.0000 | 0.0000 + WP | 1.0000 | 1.0000 | 1.0000 + PDT | 0.0000 | 0.0000 | 0.0000 + AT | 0.0000 | 0.0000 | 0.0000 + + +It's clear that although the precision of tagging `"NNP"` is high, the recall is very low. 
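+As a quick sanity check, the per-tag ``precision`` and ``recall`` helpers report
+the same numbers (a sketch assuming the same ``baseline`` and ``gold_data`` as
+above, with the expected values copied from the table):
+
+ >>> print(round(baseline.precision(gold_data)["NNP"], 4))
+ 0.9674
+ >>> print(round(baseline.recall(gold_data)["NNP"], 4))
+ 0.2738
+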
+With other words, we're missing a lot of cases where the true label is `"NNP"`. We can see +a similar effect with `"JJ"`. + +We can also see a very expected result: The precision of `"NN"` is low, while the recall +is high. If a term is OOV (i.e. ``UnigramTagger`` defers it to ``RegexpTagger``) and +``RegexpTagger`` doesn't have a good rule for it, then it will be tagged as `"NN"`. So, +we catch almost all tokens that are truly labeled as `"NN"`, but we also tag as `"NN"` +for many tokens that shouldn't be `"NN"`. + +This method gives us some insight in what parts of the tagger needs more attention, and why. +However, it doesn't tell us what the terms with true label `"NNP"` or `"JJ"` are actually +tagged as. +To help that, we can create a confusion matrix. + + >>> print(baseline.confusion(gold_data)) + | - | + | - N - | + | L O R N P | + | R N R J J N N N P P P R R V V V V V W | + | ' B E B A C C D E I J J J M N N P N D O R P R B R T V B B B B B D W ` | + | $ ' , - - - . : T C D T X N J R S D N P S S T S P $ B R P O B D G N P Z T P ` | + -------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+ + $ | <9> . . . . . . . . . . . . . . . . . 2 . . . . . . . . . . . . . . . . . . . . | + '' | . <10> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + , | . .<115> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + -LRB- | . . . <.> . . . . . . . . . . . . . . 3 . . . . . . . . . . . . . . . . . . . . | + -NONE- | . . . .<121> . . . . . . . . . . . . . 24 . . . . . . . . . . . . . . . . . . . . | + -RRB- | . . . . . <.> . . . . . . . . . . . . 3 . . . . . . . . . . . . . . . . . . . . | + . | . . . . . .<100> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + : | . . . . . . . <10> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + AT | . . . . . . . . <.> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + CC | . . . . . . . . . <58> . . . . . . . . 4 . . . . . . . . . . . . . . . . . . . . | + CD | . . . . . . . . . . <77> . . . . . . . 1 . . . . . . . . . . . . . . . . . . . . | + DT | . . . . . . . . 1 . .<163> . 4 . . . . 13 . . . . . . . . . . . . . . . . . 3 . . | + EX | . . . . . . . . . . . . <1> . . . . . 1 . . . . . . . . . . . . . . . . . . . . | + IN | . . . . . . . . . . . . .<228> . . . . 8 . . . . . . . . . . . . . 2 . . . . . . | + JJ | . . . . . . . . . . . . . . <49> . . . 86 2 . 4 . . . . 6 . . . . 12 3 . 3 . . . . | + JJR | . . . . . . . . . . . . . . . <3> . . 3 . . . . . . . . . . . . . . . . . . . . | + JJS | . . . . . . . . . . . . . . . . <.> . 2 . . . . . . . . . . . . . . . . . . . . | + MD | . . . . . . . . . . . . . . . . . <19> . . . . . . . . . . . . . . . . . . . . . | + NN | . . . . . . . . . . . . . . 9 . . .<296> . . 5 . . . . . . . . 5 . 9 . . . . . . | + NNP | . . . . . . . . . . . 2 . . . . . . 199 <89> . 26 . . . . 2 . . . . 2 5 . . . . . . | + NNPS | . . . . . . . . . . . . . . . . . . . 1 <.> 3 . . . . . . . . . . . . . . . . . | + NNS | . . . . . . . . . . . . . . . . . . 5 . .<156> . . . . . . . . . . . . . 1 . . . | + PDT | . . . . . . . . . . . 1 . . . . . . . . . . <.> . . . . . . . . . . . . . . . . | + POS | . . . . . . . . . . . . . . . . . . . . . . . <14> . . . . . . . . . . . . . . . | + PRP | . . . . . . . . . . . . . . . . . . 10 . . 2 . . <15> . . . . . . . . . . . . . . | + PRP$ | . . . . . . . . . . . . . . . . . . . . . . . . . <28> . . 
. . . . . . . . . . . | + RB | . . . . . . . . . . . . 1 4 . . . . 6 . . . . . . . <35> . 1 . . . . . . . . . . | + RBR | . . . . . . . . . . . . . . . . . . . . . . . . . . . <4> . . . . . . . . . . . | + RP | . . . . . . . . . . . . . . . . . . . . . . . . . . 1 . <2> . . . . . . . . . . | + TO | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <47> . . . . . . . . . | + VB | . . . . . . . . . . . . . . 2 . . . 30 . . . . . . . 1 . . . <21> . . . 3 . . . . | + VBD | . . . . . . . . . . . . . . . . . . 10 . . . . . . . . . . . . <81> . 1 . . . . . | + VBG | . . . . . . . . . . . . . . . . . . 2 . . . . . . . . . . . . . <34> . . . . . . | + VBN | . . . . . . . . . . . . . . . . . . 4 . . . . . . . . . . . . 31 . <15> . . . . . | + VBP | . . . . . . . . . . . . . . . . . . 7 . . . . . . . . . . . 1 . . . <11> . . . . | + VBZ | . . . . . . . . . . . . . . . . . . . . . 15 . . . . . . . . . . . . . <27> . . . | + WDT | . . . . . . . . . . . . . 7 . . . . 1 . . . . . . . . . . . . . . . . . <2> . . | + WP | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <2> . | + `` | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <10>| + -------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+ + (row = reference; col = test) + + +Once again we can see that `"NN"` is the default if the tagger isn't sure. Beyond that, +we can see why the recall for `"NNP"` is so low: these tokens are often tagged as `"NN"`. +This effect can also be seen for `"JJ"`, where the majority of tokens that ought to be +tagged as `"JJ"` are actually tagged as `"NN"` by our tagger. + +This tagger will only serve as a baseline for the ``BrillTaggerTrainer``, which uses +templates to attempt to improve the performance of the tagger. + + >>> # Set up templates + >>> Template._cleartemplates() #clear any templates created in earlier tests + >>> templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))] + + >>> # Construct a BrillTaggerTrainer + >>> tt = BrillTaggerTrainer(baseline, templates, trace=3) + >>> tagger1 = tt.train(training_data, max_rules=10) + TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: None) + Finding initial useful rules... + Found 618 useful rules. 
+ + B | + S F r O | Score = Fixed - Broken + c i o t | R Fixed = num tags changed incorrect -> correct + o x k h | u Broken = num tags changed correct -> incorrect + r e e e | l Other = num tags changed incorrect -> incorrect + e d n r | e + ------------------+------------------------------------------------------- + 13 14 1 4 | NN->VB if Pos:TO@[-1] + 8 8 0 0 | NN->VB if Pos:MD@[-1] + 7 10 3 22 | NN->IN if Pos:NNS@[-1] + 5 5 0 0 | NN->VBP if Pos:PRP@[-1] + 5 5 0 0 | VBD->VBN if Pos:VBZ@[-1] + 5 5 0 0 | NNS->NN if Pos:IN@[-1] & Word:asbestos@[0] + 4 4 0 0 | NN->-NONE- if Pos:WP@[-1] + 4 4 0 3 | NN->NNP if Pos:-NONE-@[-1] + 4 6 2 2 | NN->NNP if Pos:NNP@[-1] + 4 4 0 0 | NNS->VBZ if Pos:PRP@[-1] + + >>> tagger1.rules()[1:3] + (Rule('000', 'NN', 'VB', [(Pos([-1]),'MD')]), Rule('000', 'NN', 'IN', [(Pos([-1]),'NNS')])) + + >>> tagger1.print_template_statistics(printunused=False) + TEMPLATE STATISTICS (TRAIN) 2 templates, 10 rules) + TRAIN ( 2417 tokens) initial 555 0.7704 final: 496 0.7948 + #ID | Score (train) | #Rules | Template + -------------------------------------------- + 000 | 54 0.915 | 9 0.900 | Template(Pos([-1])) + 001 | 5 0.085 | 1 0.100 | Template(Pos([-1]),Word([0])) + + + + >>> tagger1.evaluate(gold_data) # doctest: +ELLIPSIS + 0.769230... + + >>> print(tagger1.evaluate_per_tag(gold_data, sort_by_count=True)) + Tag | Prec. | Recall | F-measure + -------+--------+--------+----------- + NNP | 0.8298 | 0.3600 | 0.5021 + NN | 0.4435 | 0.8364 | 0.5797 + IN | 0.8476 | 0.9580 | 0.8994 + DT | 0.9819 | 0.8859 | 0.9314 + JJ | 0.8167 | 0.2970 | 0.4356 + NNS | 0.7464 | 0.9630 | 0.8410 + -NONE- | 1.0000 | 0.8414 | 0.9139 + , | 1.0000 | 1.0000 | 1.0000 + . | 1.0000 | 1.0000 | 1.0000 + VBD | 0.6723 | 0.8696 | 0.7583 + CD | 1.0000 | 0.9872 | 0.9935 + CC | 1.0000 | 0.9355 | 0.9667 + VB | 0.8103 | 0.8246 | 0.8174 + VBN | 0.9130 | 0.4200 | 0.5753 + RB | 0.7778 | 0.7447 | 0.7609 + TO | 1.0000 | 1.0000 | 1.0000 + VBZ | 0.9667 | 0.6905 | 0.8056 + VBG | 0.6415 | 0.9444 | 0.7640 + PRP$ | 1.0000 | 1.0000 | 1.0000 + PRP | 1.0000 | 0.5556 | 0.7143 + MD | 1.0000 | 1.0000 | 1.0000 + VBP | 0.6316 | 0.6316 | 0.6316 + POS | 1.0000 | 1.0000 | 1.0000 + $ | 1.0000 | 0.8182 | 0.9000 + '' | 1.0000 | 1.0000 | 1.0000 + : | 1.0000 | 1.0000 | 1.0000 + WDT | 0.4000 | 0.2000 | 0.2667 + `` | 1.0000 | 1.0000 | 1.0000 + JJR | 1.0000 | 0.5000 | 0.6667 + NNPS | 0.0000 | 0.0000 | 0.0000 + RBR | 1.0000 | 1.0000 | 1.0000 + -LRB- | 0.0000 | 0.0000 | 0.0000 + -RRB- | 0.0000 | 0.0000 | 0.0000 + RP | 0.6667 | 0.6667 | 0.6667 + EX | 0.5000 | 0.5000 | 0.5000 + JJS | 0.0000 | 0.0000 | 0.0000 + WP | 1.0000 | 1.0000 | 1.0000 + PDT | 0.0000 | 0.0000 | 0.0000 + AT | 0.0000 | 0.0000 | 0.0000 + + + >>> print(tagger1.confusion(gold_data)) + | - | + | - N - | + | L O R N P | + | R N R J J N N N P P P R R V V V V V W | + | ' B E B A C C D E I J J J M N N P N D O R P R B R T V B B B B B D W ` | + | $ ' , - - - . : T C D T X N J R S D N P S S T S P $ B R P O B D G N P Z T P ` | + -------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+ + $ | <9> . . . . . . . . . . . . . . . . . 1 . . . . . . . . . . . 1 . . . . . . . . | + '' | . <10> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + , | . .<115> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + -LRB- | . . . <.> . . . . . . . . . 1 . . . . 2 . . . . . . . . . . . . . . . . . . . . | + -NONE- | . . . .<122> . . . . . . . . 1 . . . . 
22 . . . . . . . . . . . . . . . . . . . . | + -RRB- | . . . . . <.> . . . . . . . . . . . . 2 1 . . . . . . . . . . . . . . . . . . . | + . | . . . . . .<100> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + : | . . . . . . . <10> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + AT | . . . . . . . . <.> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | + CC | . . . . . . . . . <58> . . . . . . . . 2 1 . . . . . . . . . . . . . . 1 . . . . | + CD | . . . . . . . . . . <77> . . . . . . . 1 . . . . . . . . . . . . . . . . . . . . | + DT | . . . . . . . . 1 . .<163> . 5 . . . . 12 . . . . . . . . . . . . . . . . . 3 . . | + EX | . . . . . . . . . . . . <1> . . . . . 1 . . . . . . . . . . . . . . . . . . . . | + IN | . . . . . . . . . . . . .<228> . . . . 8 . . . . . . . . . . . . . 2 . . . . . . | + JJ | . . . . . . . . . . . . . 4 <49> . . . 79 4 . 4 . . . . 6 . . . 1 12 3 . 3 . . . . | + JJR | . . . . . . . . . . . . . 2 . <3> . . 1 . . . . . . . . . . . . . . . . . . . . | + JJS | . . . . . . . . . . . . . . . . <.> . 2 . . . . . . . . . . . . . . . . . . . . | + MD | . . . . . . . . . . . . . . . . . <19> . . . . . . . . . . . . . . . . . . . . . | + NN | . . . . . . . . . . . . . 7 9 . . .<271> 16 . 5 . . . . . . . . 7 . 9 . . . . . . | + NNP | . . . . . . . . . . . 2 . 7 . . . . 163<117> . 26 . . . . 2 . . . 1 2 5 . . . . . . | + NNPS | . . . . . . . . . . . . . . . . . . . 1 <.> 3 . . . . . . . . . . . . . . . . . | + NNS | . . . . . . . . . . . . . . . . . . 5 . .<156> . . . . . . . . . . . . . 1 . . . | + PDT | . . . . . . . . . . . 1 . . . . . . . . . . <.> . . . . . . . . . . . . . . . . | + POS | . . . . . . . . . . . . . . . . . . . . . . . <14> . . . . . . . . . . . . . . . | + PRP | . . . . . . . . . . . . . . . . . . 10 . . 2 . . <15> . . . . . . . . . . . . . . | + PRP$ | . . . . . . . . . . . . . . . . . . . . . . . . . <28> . . . . . . . . . . . . . | + RB | . . . . . . . . . . . . 1 4 . . . . 6 . . . . . . . <35> . 1 . . . . . . . . . . | + RBR | . . . . . . . . . . . . . . . . . . . . . . . . . . . <4> . . . . . . . . . . . | + RP | . . . . . . . . . . . . . . . . . . . . . . . . . . 1 . <2> . . . . . . . . . . | + TO | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <47> . . . . . . . . . | + VB | . . . . . . . . . . . . . . 2 . . . 4 . . . . . . . 1 . . . <47> . . . 3 . . . . | + VBD | . . . . . . . . . . . . . 1 . . . . 8 1 . . . . . . . . . . . <80> . 2 . . . . . | + VBG | . . . . . . . . . . . . . . . . . . 2 . . . . . . . . . . . . . <34> . . . . . . | + VBN | . . . . . . . . . . . . . . . . . . 4 . . . . . . . . . . . . 25 . <21> . . . . . | + VBP | . . . . . . . . . . . . . 2 . . . . 4 . . . . . . . . . . . 1 . . . <12> . . . . | + VBZ | . . . . . . . . . . . . . . . . . . . . . 13 . . . . . . . . . . . . . <29> . . . | + WDT | . . . . . . . . . . . . . 7 . . . . 1 . . . . . . . . . . . . . . . . . <2> . . | + WP | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <2> . | + `` | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
<10>| + -------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+ + (row = reference; col = test) + + + >>> tagged, test_stats = tagger1.batch_tag_incremental(testing_data, gold_data) + >>> tagged[33][12:] + [('foreign', 'NN'), ('debt', 'NN'), ('of', 'IN'), ('$', '$'), ('64', 'CD'), + ('billion', 'CD'), ('*U*', '-NONE-'), ('--', ':'), ('the', 'DT'), ('third-highest', 'NN'), + ('in', 'IN'), ('the', 'DT'), ('developing', 'VBG'), ('world', 'NN'), ('.', '.')] + Regression Tests ~~~~~~~~~~~~~~~~ From 67f6dfd66a15312991a5ce43da045d6ef9899a2f Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Wed, 20 Oct 2021 17:21:23 +0200 Subject: [PATCH 2/6] Move evaluation of ConfusionMatrix into nltk\metrics\confusionmatrix.py --- nltk/metrics/confusionmatrix.py | 61 +++++++++++++++++++++++++++++++++ nltk/tag/api.py | 32 +---------------- 2 files changed, 62 insertions(+), 31 deletions(-) diff --git a/nltk/metrics/confusionmatrix.py b/nltk/metrics/confusionmatrix.py index a2b196e27e..05f1291b0e 100644 --- a/nltk/metrics/confusionmatrix.py +++ b/nltk/metrics/confusionmatrix.py @@ -276,6 +276,65 @@ def f_measure(self, value, alpha=0.5): return 0.0 return 1.0 / (alpha / p + (1 - alpha) / r) + def evaluate(self, alpha=0.5, truncate=None, sort_by_count=False): + """ + Tabulate the **recall**, **precision** and **f-measure** + for each value in this confusion matrix. + + >>> reference = "DET NN VB DET JJ NN NN IN DET NN".split() + >>> test = "DET VB VB DET NN NN NN IN DET NN".split() + >>> cm = ConfusionMatrix(reference, test) + >>> print(cm.evaluate()) + Tag | Prec. | Recall | F-measure + ----+--------+--------+----------- + DET | 1.0000 | 1.0000 | 1.0000 + IN | 1.0000 | 1.0000 | 1.0000 + JJ | 0.0000 | 0.0000 | 0.0000 + NN | 0.7500 | 0.7500 | 0.7500 + VB | 0.5000 | 1.0000 | 0.6667 + + + :param alpha: Ratio of the cost of false negative compared to false + positives, as used in the f-measure computation. Defaults to 0.5, + where the costs are equal. + :type alpha: float + :param truncate: If specified, then only show the specified + number of values. Any sorting (e.g., sort_by_count) + will be performed before truncation. Defaults to None + :type truncate: int, optional + :param sort_by_count: Whether to sort the outputs on frequency + in the reference label. Defaults to False. + :type sort_by_count: bool, optional + :return: A tabulated recall, precision and f-measure string + :rtype: str + """ + tags = self._values + + # Apply keyword parameters + if sort_by_count: + tags = sorted(tags, key=lambda v: -sum(self._confusion[self._indices[v]])) + if truncate: + tags = tags[:truncate] + + tag_column_len = max(max(len(tag) for tag in tags), 3) + + # Construct the header + s = ( + f"{' ' * (tag_column_len - 3)}Tag | Prec. 
| Recall | F-measure\n" + f"{'-' * tag_column_len}-+--------+--------+-----------\n" + ) + + # Construct the body + for tag in tags: + s += ( + f"{tag:>{tag_column_len}} | " + f"{self.precision(tag):<6.4f} | " + f"{self.recall(tag):<6.4f} | " + f"{self.f_measure(tag, alpha=alpha):.4f}\n" + ) + + return s + def demo(): reference = "DET NN VB DET JJ NN NN IN DET NN".split() @@ -286,6 +345,8 @@ def demo(): print(ConfusionMatrix(reference, test)) print(ConfusionMatrix(reference, test).pretty_format(sort_by_count=True)) + print(ConfusionMatrix(reference, test).recall("VB")) + if __name__ == "__main__": demo() diff --git a/nltk/tag/api.py b/nltk/tag/api.py index b769522025..805337c71c 100644 --- a/nltk/tag/api.py +++ b/nltk/tag/api.py @@ -274,38 +274,8 @@ def evaluate_per_tag(self, gold, alpha=0.5, truncate=None, sort_by_count=False): :return: A tabulated recall, precision and f-measure string :rtype: str """ - # Gather Confusion Matrix and metrics cm = self.confusion(gold) - recalls = self.recall(gold) - precisions = self.precision(gold) - f_measures = self.f_measure(gold) - - tags = cm._values - - # Apply keyword parameters - if sort_by_count: - tags = sorted(tags, key=lambda v: -sum(cm._confusion[cm._indices[v]])) - if truncate: - tags = tags[:truncate] - - tag_column_len = max(max(len(tag) for tag in tags), 3) - - # Construct the header - s = ( - f"{' ' * (tag_column_len - 3)}Tag | Prec. | Recall | F-measure\n" - f"{'-' * tag_column_len}-+--------+--------+-----------\n" - ) - - # Construct the body - for tag in tags: - s += ( - f"{tag:>{tag_column_len}} | " - f"{precisions[tag]:<6.4f} | " - f"{recalls[tag]:<6.4f} | " - f"{f_measures[tag]:.4f}\n" - ) - - return s + return cm.evaluate(alpha=alpha, truncate=truncate, sort_by_count=sort_by_count) def _check_params(self, train, model): if (train and model) or (not train and not model): From ed4286e2afc580546a6ccb193422cdcd8acab454 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Thu, 21 Oct 2021 19:39:12 +0200 Subject: [PATCH 3/6] Add self as author in significantly updated files --- nltk/metrics/confusionmatrix.py | 1 + nltk/tag/api.py | 1 + 2 files changed, 2 insertions(+) diff --git a/nltk/metrics/confusionmatrix.py b/nltk/metrics/confusionmatrix.py index 05f1291b0e..5fbcbe3493 100644 --- a/nltk/metrics/confusionmatrix.py +++ b/nltk/metrics/confusionmatrix.py @@ -3,6 +3,7 @@ # Copyright (C) 2001-2021 NLTK Project # Author: Edward Loper # Steven Bird +# Tom Aarsen <> # URL: # For license information, see LICENSE.TXT diff --git a/nltk/tag/api.py b/nltk/tag/api.py index 805337c71c..fd028d1629 100644 --- a/nltk/tag/api.py +++ b/nltk/tag/api.py @@ -3,6 +3,7 @@ # Copyright (C) 2001-2021 NLTK Project # Author: Edward Loper # Steven Bird (minor additions) +# Tom Aarsen <> # URL: # For license information, see LICENSE.TXT From f622d9924595bf8dd54deca76e7e34a236f0ccea Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Mon, 25 Oct 2021 11:24:30 +0200 Subject: [PATCH 4/6] Deprecate tagger evaluate(gold) in favor of accuracy(gold) --- nltk/tag/__init__.py | 2 +- nltk/tag/api.py | 6 +++++- nltk/tag/brill_trainer.py | 6 +++--- nltk/tag/crf.py | 4 ++-- nltk/tag/perceptron.py | 2 +- nltk/tag/tnt.py | 8 ++++---- nltk/tbl/demo.py | 4 ++-- nltk/test/tag.doctest | 10 +++++----- 8 files changed, 23 insertions(+), 19 deletions(-) diff --git a/nltk/tag/__init__.py b/nltk/tag/__init__.py index c3d2a94e0b..36446de271 100644 --- a/nltk/tag/__init__.py +++ b/nltk/tag/__init__.py @@ -57,7 +57,7 @@ We evaluate a tagger on data that was not seen during training: - >>> 
tagger.evaluate(brown.tagged_sents(categories='news')[500:600]) + >>> tagger.accuracy(brown.tagged_sents(categories='news')[500:600]) 0.7... For more information, please consult chapter 5 of the NLTK Book. diff --git a/nltk/tag/api.py b/nltk/tag/api.py index fd028d1629..25ffd1e0a4 100644 --- a/nltk/tag/api.py +++ b/nltk/tag/api.py @@ -16,7 +16,7 @@ from itertools import chain from typing import Dict -from nltk.internals import overridden +from nltk.internals import deprecated, overridden from nltk.metrics import ConfusionMatrix, accuracy from nltk.tag.util import untag @@ -56,7 +56,11 @@ def tag_sents(self, sentences): """ return [self.tag(sent) for sent in sentences] + @deprecated("Use accuracy(gold) instead.") def evaluate(self, gold): + return self.accuracy(gold) + + def accuracy(self, gold): """ Score the accuracy of the tagger against the gold standard. Strip the tags from the gold standard text, retag it using diff --git a/nltk/tag/brill_trainer.py b/nltk/tag/brill_trainer.py index 3e18a22b17..b57bda2431 100644 --- a/nltk/tag/brill_trainer.py +++ b/nltk/tag/brill_trainer.py @@ -124,7 +124,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None): >>> baseline = backoff #see NOTE1 - >>> baseline.evaluate(gold_data) #doctest: +ELLIPSIS + >>> baseline.accuracy(gold_data) #doctest: +ELLIPSIS 0.2450142... >>> # Set up templates @@ -174,7 +174,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None): - >>> tagger1.evaluate(gold_data) # doctest: +ELLIPSIS + >>> tagger1.accuracy(gold_data) # doctest: +ELLIPSIS 0.43996... >>> tagged, test_stats = tagger1.batch_tag_incremental(testing_data, gold_data) @@ -211,7 +211,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None): 18 18 0 0 | CD->-NONE- if Pos:NN@[-1] & Word:0@[0] 18 18 0 0 | NN->CC if Pos:NN@[-1] & Word:and@[0] - >>> tagger2.evaluate(gold_data) # doctest: +ELLIPSIS + >>> tagger2.accuracy(gold_data) # doctest: +ELLIPSIS 0.44159544... 
>>> tagger2.rules()[2:4] (Rule('001', 'NN', '.', [(Pos([-1]),'NN'), (Word([0]),'.')]), Rule('001', 'NN', 'IN', [(Pos([-1]),'NN'), (Word([0]),'of')])) diff --git a/nltk/tag/crf.py b/nltk/tag/crf.py index cdcb4623be..5b1964e43f 100644 --- a/nltk/tag/crf.py +++ b/nltk/tag/crf.py @@ -35,13 +35,13 @@ class CRFTagger(TaggerI): [[('dog', 'Noun'), ('is', 'Verb'), ('good', 'Adj')], [('Cat', 'Noun'), ('eat', 'Verb'), ('meat', 'Noun')]] >>> gold_sentences = [[('dog','Noun'),('is','Verb'),('good','Adj')] , [('Cat','Noun'),('eat','Verb'), ('meat','Noun')]] - >>> ct.evaluate(gold_sentences) + >>> ct.accuracy(gold_sentences) 1.0 Setting learned model file >>> ct = CRFTagger() >>> ct.set_model_file('model.crf.tagger') - >>> ct.evaluate(gold_sentences) + >>> ct.accuracy(gold_sentences) 1.0 """ diff --git a/nltk/tag/perceptron.py b/nltk/tag/perceptron.py index a18c0c2069..02ff0865b8 100644 --- a/nltk/tag/perceptron.py +++ b/nltk/tag/perceptron.py @@ -363,7 +363,7 @@ def _get_pretrain_model(): print("Size of training and testing (sentence)", len(training), len(testing)) # Train and save the model tagger.train(training, PICKLE) - print("Accuracy : ", tagger.evaluate(testing)) + print("Accuracy : ", tagger.accuracy(testing)) if __name__ == "__main__": diff --git a/nltk/tag/tnt.py b/nltk/tag/tnt.py index 9174e498c7..e4cbf74b3e 100755 --- a/nltk/tag/tnt.py +++ b/nltk/tag/tnt.py @@ -492,7 +492,7 @@ def demo2(): s.train(d[(11) * 100 :]) for i in range(10): - tacc = t.evaluate(d[i * 100 : ((i + 1) * 100)]) + tacc = t.accuracy(d[i * 100 : ((i + 1) * 100)]) tp_un = t.unknown / (t.known + t.unknown) tp_kn = t.known / (t.known + t.unknown) t.unknown = 0 @@ -504,7 +504,7 @@ def demo2(): print("Percentage unknown:", tp_un) print("Accuracy over known words:", (tacc / tp_kn)) - sacc = s.evaluate(d[i * 100 : ((i + 1) * 100)]) + sacc = s.accuracy(d[i * 100 : ((i + 1) * 100)]) sp_un = s.unknown / (s.known + s.unknown) sp_kn = s.known / (s.known + s.unknown) s.unknown = 0 @@ -550,14 +550,14 @@ def demo3(): t.train(dtrain) s.train(etrain) - tacc = t.evaluate(dtest) + tacc = t.accuracy(dtest) tp_un = t.unknown / (t.known + t.unknown) tp_kn = t.known / (t.known + t.unknown) tknown += tp_kn t.unknown = 0 t.known = 0 - sacc = s.evaluate(etest) + sacc = s.accuracy(etest) sp_un = s.unknown / (s.known + s.unknown) sp_kn = s.known / (s.known + s.unknown) sknown += sp_kn diff --git a/nltk/tbl/demo.py b/nltk/tbl/demo.py index 613351b03a..dabaf9b330 100644 --- a/nltk/tbl/demo.py +++ b/nltk/tbl/demo.py @@ -261,7 +261,7 @@ def postag( if gold_data: print( " Accuracy on test set: {:0.4f}".format( - baseline_tagger.evaluate(gold_data) + baseline_tagger.accuracy(gold_data) ) ) @@ -274,7 +274,7 @@ def postag( brill_tagger = trainer.train(training_data, max_rules, min_score, min_acc) print(f"Trained tbl tagger in {time.time() - tbrill:0.2f} seconds") if gold_data: - print(" Accuracy on test set: %.4f" % brill_tagger.evaluate(gold_data)) + print(" Accuracy on test set: %.4f" % brill_tagger.accuracy(gold_data)) # printing the learned rules, if learned silently if trace == 1: diff --git a/nltk/test/tag.doctest b/nltk/test/tag.doctest index a78ab4d3f8..beda200e0c 100644 --- a/nltk/test/tag.doctest +++ b/nltk/test/tag.doctest @@ -11,7 +11,7 @@ Precision, Recall and F-measure for each of the tags. >>> from nltk.corpus import treebank >>> tagger = PerceptronTagger() >>> gold_data = treebank.tagged_sents()[10:20] - >>> print(tagger.evaluate(gold_data)) # doctest: +ELLIPSIS + >>> print(tagger.accuracy(gold_data)) # doctest: +ELLIPSIS 0.885931... 
>>> print(tagger.evaluate_per_tag(gold_data)) @@ -136,7 +136,7 @@ Brill Trainer with evaluation We've now created a simple ``RegexpTagger``, which tags according to the regular expression rules it has been supplied. This tagger in and of itself does not have a great accuracy. - >>> backoff.evaluate(gold_data) #doctest: +ELLIPSIS + >>> backoff.accuracy(gold_data) #doctest: +ELLIPSIS 0.245014... Neither does a simple ``UnigramTagger``. This tagger is trained on some data, @@ -144,7 +144,7 @@ and will then first try to match unigrams (i.e. tokens) of the sentence it has to tag to the learned data. >>> unigram_tagger = UnigramTagger(baseline_data) - >>> unigram_tagger.evaluate(gold_data) #doctest: +ELLIPSIS + >>> unigram_tagger.accuracy(gold_data) #doctest: +ELLIPSIS 0.581196... The lackluster accuracy here can be explained with the following example: @@ -162,7 +162,7 @@ the ``RegexpTagger`` will be used as a backoff for the situations where the ``Un encounters an OOV token. >>> baseline = UnigramTagger(baseline_data, backoff=backoff) - >>> baseline.evaluate(gold_data) #doctest: +ELLIPSIS + >>> baseline.accuracy(gold_data) #doctest: +ELLIPSIS 0.7537647... That is already much better. We can investigate the performance further by running @@ -329,7 +329,7 @@ templates to attempt to improve the performance of the tagger. - >>> tagger1.evaluate(gold_data) # doctest: +ELLIPSIS + >>> tagger1.accuracy(gold_data) # doctest: +ELLIPSIS 0.769230... >>> print(tagger1.evaluate_per_tag(gold_data, sort_by_count=True)) From 147d0fbf151606dd37e7cb3e2a6e3745b455b56e Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Mon, 25 Oct 2021 11:59:29 +0200 Subject: [PATCH 5/6] Missed one case of Tagger evaluate still being used - fixed now --- nltk/test/probability.doctest | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nltk/test/probability.doctest b/nltk/test/probability.doctest index a955a5f87f..fd91a1f595 100644 --- a/nltk/test/probability.doctest +++ b/nltk/test/probability.doctest @@ -139,7 +139,7 @@ And now we can test the estimators >>> def train_and_test(est): ... hmm = trainer.train_supervised(train_corpus, estimator=est) - ... print('%.2f%%' % (100 * hmm.evaluate(test_corpus))) + ... print('%.2f%%' % (100 * hmm.accuracy(test_corpus))) Maximum Likelihood Estimation ----------------------------- From a1adb5a8cfe6d5c17e62579b894dd270ea5944b9 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Mon, 25 Oct 2021 12:04:09 +0200 Subject: [PATCH 6/6] Deprecate ChunkParser's evaluate(gold) in favor of accuracy(gold) --- nltk/chunk/api.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/nltk/chunk/api.py b/nltk/chunk/api.py index de4bb93958..56a63d2743 100644 --- a/nltk/chunk/api.py +++ b/nltk/chunk/api.py @@ -11,6 +11,7 @@ ##////////////////////////////////////////////////////// from nltk.chunk.util import ChunkScore +from nltk.internals import deprecated from nltk.parse import ParserI @@ -34,7 +35,11 @@ def parse(self, tokens): """ raise NotImplementedError() + @deprecated("Use accuracy(gold) instead.") def evaluate(self, gold): + return self.accuracy(gold) + + def accuracy(self, gold): """ Score the accuracy of the chunker against the gold standard. Remove the chunking the gold standard text, rechunk it using
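
To tie the series together, here is a minimal end-to-end sketch of the metrics
surface it adds (assuming the patches above are applied; the expected numbers
match the ``ConfusionMatrix.evaluate`` doctest from patch 2):

    from nltk.metrics import ConfusionMatrix

    # Same toy data as the demo in nltk/metrics/confusionmatrix.py
    reference = "DET NN VB DET JJ NN NN IN DET NN".split()
    test = "DET VB VB DET NN NN NN IN DET NN".split()
    cm = ConfusionMatrix(reference, test)

    # Per-value metrics introduced in patch 1
    print(cm.precision("NN"))  # 3 correct NN predictions out of 4 -> 0.75
    print(cm.recall("NN"))     # 3 of the 4 true NN tokens recovered -> 0.75
    print(cm.f_measure("VB"))  # harmonic mean of p=0.5 and r=1.0 -> 0.666...

    # Tabulated per-value overview introduced in patch 2
    print(cm.evaluate(sort_by_count=True))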