
Commit a28d256

Add Precision, Recall, F-measure, Confusion Matrix to Taggers (#2862)
* Add Precision, Recall, F-measure, Confusion Matrix and per-tag evaluation to Taggers, and add precision, recall and f-measure to ConfusionMatrix. Includes large doctests, and some small doctest fixes throughout the tag module
* Move evaluation of ConfusionMatrix into nltk/metrics/confusionmatrix.py
* Add self as author in significantly updated files
* Deprecate tagger evaluate(gold) in favor of accuracy(gold)
* Missed one case of Tagger evaluate still being used - fixed now
* Deprecate ChunkParser's evaluate(gold) in favor of accuracy(gold)

Co-authored-by: Steven Bird <stevenbird1@gmail.com>
1 parent 72d9885 commit a28d256

12 files changed: +833 -25 lines

nltk/chunk/api.py  (+5)

@@ -11,6 +11,7 @@
 ##//////////////////////////////////////////////////////
 
 from nltk.chunk.util import ChunkScore
+from nltk.internals import deprecated
 from nltk.parse import ParserI
 
 
@@ -34,7 +35,11 @@ def parse(self, tokens):
         """
         raise NotImplementedError()
 
+    @deprecated("Use accuracy(gold) instead.")
     def evaluate(self, gold):
+        return self.accuracy(gold)
+
+    def accuracy(self, gold):
         """
         Score the accuracy of the chunker against the gold standard.
         Remove the chunking the gold standard text, rechunk it using
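With this change, ChunkParserI.evaluate(gold) keeps working but is deprecated: it now simply delegates to the new accuracy(gold) method. A minimal migration sketch follows; the RegexpParser grammar and the conll2000 slice are illustrative assumptions, not code from this commit.

    from nltk import RegexpParser
    from nltk.corpus import conll2000

    # Hypothetical chunker and gold data, only to show the call sites.
    chunker = RegexpParser("NP: {<DT>?<JJ>*<NN>}")
    gold = conll2000.chunked_sents("test.txt", chunk_types=["NP"])[:50]

    score = chunker.accuracy(gold)   # preferred spelling after this commit
    score = chunker.evaluate(gold)   # still works, but warns to use accuracy(gold) instead
    print(score)                     # ChunkScore: IOB accuracy, precision, recall, f-measure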

nltk/metrics/confusionmatrix.py  (+137)

@@ -3,6 +3,7 @@
 # Copyright (C) 2001-2021 NLTK Project
 # Author: Edward Loper <edloper@gmail.com>
 #         Steven Bird <stevenbird1@gmail.com>
+#         Tom Aarsen <>
 # URL: <https://www.nltk.org/>
 # For license information, see LICENSE.TXT
 
@@ -201,6 +202,140 @@ def key(self):
 
         return str
 
+    def recall(self, value):
+        """Given a value in the confusion matrix, return the recall
+        that corresponds to this value. The recall is defined as:
+
+        - *r* = true positive / (true positive + false negative)
+
+        and can loosely be considered the ratio of how often ``value``
+        was predicted correctly relative to how often ``value`` was
+        the true result.
+
+        :param value: value used in the ConfusionMatrix
+        :return: the recall corresponding to ``value``.
+        :rtype: float
+        """
+        # Number of times `value` was correct, and also predicted
+        TP = self[value, value]
+        # Number of times `value` was correct
+        TP_FN = sum(self[value, pred_value] for pred_value in self._values)
+        if TP_FN == 0:
+            return 0.0
+        return TP / TP_FN
+
+    def precision(self, value):
+        """Given a value in the confusion matrix, return the precision
+        that corresponds to this value. The precision is defined as:
+
+        - *p* = true positive / (true positive + false positive)
+
+        and can loosely be considered the ratio of how often ``value``
+        was predicted correctly relative to the number of predictions
+        for ``value``.
+
+        :param value: value used in the ConfusionMatrix
+        :return: the precision corresponding to ``value``.
+        :rtype: float
+        """
+        # Number of times `value` was correct, and also predicted
+        TP = self[value, value]
+        # Number of times `value` was predicted
+        TP_FP = sum(self[real_value, value] for real_value in self._values)
+        if TP_FP == 0:
+            return 0.0
+        return TP / TP_FP
+
+    def f_measure(self, value, alpha=0.5):
+        """
+        Given a value used in the confusion matrix, return the f-measure
+        that corresponds to this value. The f-measure is the harmonic mean
+        of the ``precision`` and ``recall``, weighted by ``alpha``.
+        In particular, given the precision *p* and recall *r* defined by:
+
+        - *p* = true positive / (true positive + false positive)
+        - *r* = true positive / (true positive + false negative)
+
+        The f-measure is:
+
+        - *1/(alpha/p + (1-alpha)/r)*
+
+        With ``alpha = 0.5``, this reduces to:
+
+        - *2pr / (p + r)*
+
+        :param value: value used in the ConfusionMatrix
+        :param alpha: Ratio of the cost of false negative compared to false
+            positives. Defaults to 0.5, where the costs are equal.
+        :type alpha: float
+        :return: the F-measure corresponding to ``value``.
+        :rtype: float
+        """
+        p = self.precision(value)
+        r = self.recall(value)
+        if p == 0.0 or r == 0.0:
+            return 0.0
+        return 1.0 / (alpha / p + (1 - alpha) / r)
+
+    def evaluate(self, alpha=0.5, truncate=None, sort_by_count=False):
+        """
+        Tabulate the **recall**, **precision** and **f-measure**
+        for each value in this confusion matrix.
+
+        >>> reference = "DET NN VB DET JJ NN NN IN DET NN".split()
+        >>> test = "DET VB VB DET NN NN NN IN DET NN".split()
+        >>> cm = ConfusionMatrix(reference, test)
+        >>> print(cm.evaluate())
+        Tag | Prec.  | Recall | F-measure
+        ----+--------+--------+-----------
+        DET | 1.0000 | 1.0000 | 1.0000
+         IN | 1.0000 | 1.0000 | 1.0000
+         JJ | 0.0000 | 0.0000 | 0.0000
+         NN | 0.7500 | 0.7500 | 0.7500
+         VB | 0.5000 | 1.0000 | 0.6667
+        <BLANKLINE>
+
+        :param alpha: Ratio of the cost of false negative compared to false
+            positives, as used in the f-measure computation. Defaults to 0.5,
+            where the costs are equal.
+        :type alpha: float
+        :param truncate: If specified, then only show the specified
+            number of values. Any sorting (e.g., sort_by_count)
+            will be performed before truncation. Defaults to None
+        :type truncate: int, optional
+        :param sort_by_count: Whether to sort the outputs on frequency
+            in the reference label. Defaults to False.
+        :type sort_by_count: bool, optional
+        :return: A tabulated recall, precision and f-measure string
+        :rtype: str
+        """
+        tags = self._values
+
+        # Apply keyword parameters
+        if sort_by_count:
+            tags = sorted(tags, key=lambda v: -sum(self._confusion[self._indices[v]]))
+        if truncate:
+            tags = tags[:truncate]
+
+        tag_column_len = max(max(len(tag) for tag in tags), 3)
+
+        # Construct the header
+        s = (
+            f"{' ' * (tag_column_len - 3)}Tag | Prec.  | Recall | F-measure\n"
+            f"{'-' * tag_column_len}-+--------+--------+-----------\n"
+        )
+
+        # Construct the body
+        for tag in tags:
+            s += (
+                f"{tag:>{tag_column_len}} | "
+                f"{self.precision(tag):<6.4f} | "
+                f"{self.recall(tag):<6.4f} | "
+                f"{self.f_measure(tag, alpha=alpha):.4f}\n"
+            )
+
+        return s
+
 
 def demo():
     reference = "DET NN VB DET JJ NN NN IN DET NN".split()
@@ -211,6 +346,8 @@ def demo():
     print(ConfusionMatrix(reference, test))
     print(ConfusionMatrix(reference, test).pretty_format(sort_by_count=True))
 
+    print(ConfusionMatrix(reference, test).recall("VB"))
+
 
 if __name__ == "__main__":
     demo()
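Taken together, recall(), precision(), f_measure() and evaluate() make ConfusionMatrix usable for per-tag scoring on its own. A short sketch against the same toy data as the doctest above; the VB numbers follow directly from the definitions: one of the two VB predictions is correct (p = 0.5), the single true VB is recovered (r = 1.0), and with alpha = 0.5 the f-measure reduces to 2pr/(p + r) = 2/3.

    from nltk.metrics import ConfusionMatrix

    reference = "DET NN VB DET JJ NN NN IN DET NN".split()
    test = "DET VB VB DET NN NN NN IN DET NN".split()
    cm = ConfusionMatrix(reference, test)

    cm.precision("VB")   # 0.5     -> 1 correct VB out of 2 VB predictions
    cm.recall("VB")      # 1.0     -> 1 correct VB out of 1 true VB
    cm.f_measure("VB")   # ~0.6667 -> 2 * 0.5 * 1.0 / (0.5 + 1.0)

    # Tabulate every tag, most frequent reference tags first, top 3 only.
    print(cm.evaluate(sort_by_count=True, truncate=3))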

nltk/tag/__init__.py  (+4 -4)

@@ -21,7 +21,7 @@
 An off-the-shelf tagger is available for English. It uses the Penn Treebank tagset:
 
     >>> from nltk import pos_tag, word_tokenize
-    >>> pos_tag(word_tokenize("John's big idea isn't all that bad."))
+    >>> pos_tag(word_tokenize("John's big idea isn't all that bad.")) # doctest: +NORMALIZE_WHITESPACE
     [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is', 'VBZ'),
     ("n't", 'RB'), ('all', 'PDT'), ('that', 'DT'), ('bad', 'JJ'), ('.', '.')]
 
@@ -57,7 +57,7 @@
 
 We evaluate a tagger on data that was not seen during training:
 
-    >>> tagger.evaluate(brown.tagged_sents(categories='news')[500:600])
+    >>> tagger.accuracy(brown.tagged_sents(categories='news')[500:600])
     0.7...
 
 For more information, please consult chapter 5 of the NLTK Book.
@@ -144,10 +144,10 @@ def pos_tag(tokens, tagset=None, lang="eng"):
 
         >>> from nltk.tag import pos_tag
         >>> from nltk.tokenize import word_tokenize
-        >>> pos_tag(word_tokenize("John's big idea isn't all that bad."))
+        >>> pos_tag(word_tokenize("John's big idea isn't all that bad.")) # doctest: +NORMALIZE_WHITESPACE
         [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is', 'VBZ'),
         ("n't", 'RB'), ('all', 'PDT'), ('that', 'DT'), ('bad', 'JJ'), ('.', '.')]
-        >>> pos_tag(word_tokenize("John's big idea isn't all that bad."), tagset='universal')
+        >>> pos_tag(word_tokenize("John's big idea isn't all that bad."), tagset='universal') # doctest: +NORMALIZE_WHITESPACE
         [('John', 'NOUN'), ("'s", 'PRT'), ('big', 'ADJ'), ('idea', 'NOUN'), ('is', 'VERB'),
         ("n't", 'ADV'), ('all', 'DET'), ('that', 'DET'), ('bad', 'ADJ'), ('.', '.')]
 
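The tagger-side files that expose this per-tag evaluation are not part of the excerpt above, so the sketch below builds the confusion matrix by hand from a trained tagger's output. The UnigramTagger/DefaultTagger setup and the brown corpus slices are assumptions for illustration, not code from this commit.

    from nltk import DefaultTagger, UnigramTagger
    from nltk.corpus import brown
    from nltk.metrics import ConfusionMatrix

    train = brown.tagged_sents(categories="news")[:500]
    gold = brown.tagged_sents(categories="news")[500:600]
    tagger = UnigramTagger(train, backoff=DefaultTagger("NN"))

    print(tagger.accuracy(gold))  # replaces the now-deprecated tagger.evaluate(gold)

    # Per-tag precision, recall and f-measure: align gold tags with predictions.
    untagged = [[word for (word, _) in sent] for sent in gold]
    gold_tags = [tag for sent in gold for (_, tag) in sent]
    pred_tags = [tag for sent in tagger.tag_sents(untagged) for (_, tag) in sent]
    print(ConfusionMatrix(gold_tags, pred_tags).evaluate(sort_by_count=True, truncate=10))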
