refactor: perform linting for punkt.py #2830

Merged
merged 4 commits on Oct 9, 2021
2 changes: 1 addition & 1 deletion nltk/tokenize/__init__.py
@@ -123,7 +123,7 @@ def word_tokenize(text, language="english", preserve_line=False):
:type text: str
:param language: the model name in the Punkt corpus
:type language: str
:param preserve_line: An option to keep the preserve the sentence and not sentence tokenize it.
:param preserve_line: A flag to decide whether to sentence tokenize the text or not.
:type preserve_line: bool
"""
sentences = [text] if preserve_line else sent_tokenize(text, language)
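For reference, a minimal sketch of how the flag behaves in practice, assuming NLTK is installed and the punkt model has been downloaded via nltk.download("punkt"):

from nltk import word_tokenize

text = "Good muffins cost $3.88 in New York. Please buy me two."
# Default behaviour: sentence-tokenize first, then word-tokenize each sentence.
print(word_tokenize(text))
# preserve_line=True skips sentence splitting and tokenizes the text as a single line.
print(word_tokenize(text, preserve_line=True))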
120 changes: 58 additions & 62 deletions nltk/tokenize/punkt.py
@@ -308,18 +308,18 @@ def period_context_re(self):
# ////////////////////////////////////////////////////////////


def _pair_iter(it):
def _pair_iter(iterator):
"""
Yields pairs of tokens from the given iterator such that each input
token will appear as the first element in a yielded tuple. The last
pair will have None as its second element.
"""
it = iter(it)
iterator = iter(iterator)
try:
prev = next(it)
prev = next(iterator)
except StopIteration:
return
for el in it:
for el in iterator:
yield (prev, el)
prev = el
yield (prev, None)
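A quick illustration of the pairing behaviour the docstring describes, importing the private helper purely for demonstration:

from nltk.tokenize.punkt import _pair_iter  # private helper, imported here only to illustrate

print(list(_pair_iter(["a", "b", "c"])))
# -> [('a', 'b'), ('b', 'c'), ('c', None)]
print(list(_pair_iter([])))
# -> []  (the StopIteration guard makes an empty input yield nothing)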
@@ -369,18 +369,18 @@ def add_ortho_context(self, typ, flag):
self.ortho_context[typ] |= flag

def _debug_ortho_context(self, typ):
c = self.ortho_context[typ]
if c & _ORTHO_BEG_UC:
context = self.ortho_context[typ]
if context & _ORTHO_BEG_UC:
yield "BEG-UC"
if c & _ORTHO_MID_UC:
if context & _ORTHO_MID_UC:
yield "MID-UC"
if c & _ORTHO_UNK_UC:
if context & _ORTHO_UNK_UC:
yield "UNK-UC"
if c & _ORTHO_BEG_LC:
if context & _ORTHO_BEG_LC:
yield "BEG-LC"
if c & _ORTHO_MID_LC:
if context & _ORTHO_MID_LC:
yield "MID-LC"
if c & _ORTHO_UNK_LC:
if context & _ORTHO_UNK_LC:
yield "UNK-LC"


@@ -401,8 +401,8 @@ def __init__(self, tok, **params):
self.type = self._get_type(tok)
self.period_final = tok.endswith(".")

for p in self._properties:
setattr(self, p, None)
for prop in self._properties:
setattr(self, prop, None)
for k in params:
setattr(self, k, params[k])

@@ -456,7 +456,7 @@ def first_lower(self):
def first_case(self):
if self.first_lower:
return "lower"
elif self.first_upper:
if self.first_upper:
return "upper"
return "none"

@@ -570,8 +570,8 @@ def _tokenize_words(self, plaintext):
yield self._Token(tok, parastart=parastart, linestart=True)
parastart = False

for t in line_toks:
yield self._Token(t)
for tok in line_toks:
yield self._Token(tok)
else:
parastart = True

@@ -817,16 +817,16 @@ def finalize_training(self, verbose=False):
collocations and sentence starters.
"""
self._params.clear_sent_starters()
for typ, ll in self._find_sent_starters():
for typ, log_likelihood in self._find_sent_starters():
self._params.sent_starters.add(typ)
if verbose:
print(f" Sent Starter: [{ll:6.4f}] {typ!r}")
print(f" Sent Starter: [{log_likelihood:6.4f}] {typ!r}")

self._params.clear_collocations()
for (typ1, typ2), ll in self._find_collocations():
for (typ1, typ2), log_likelihood in self._find_collocations():
self._params.collocations.add((typ1, typ2))
if verbose:
print(f" Collocation: [{ll:6.4f}] {typ1!r}+{typ2!r}")
print(f" Collocation: [{log_likelihood:6.4f}] {typ1!r}+{typ2!r}")

self._finalized = True

@@ -971,11 +971,11 @@ def _reclassify_abbrev_types(self, types):
# Let <a> be the candidate without the period, and <b>
# be the period. Find a log likelihood ratio that
# indicates whether <ab> occurs as a single unit (high
# value of ll), or as two independent units <a> and
# <b> (low value of ll).
# value of log_likelihood), or as two independent units <a> and
# <b> (low value of log_likelihood).
count_with_period = self._type_fdist[typ + "."]
count_without_period = self._type_fdist[typ]
ll = self._dunning_log_likelihood(
log_likelihood = self._dunning_log_likelihood(
count_with_period + count_without_period,
self._num_period_toks,
count_with_period,
@@ -992,7 +992,7 @@ def _reclassify_abbrev_types(self, types):
f_penalty = int(self.IGNORE_ABBREV_PENALTY) or math.pow(
num_nonperiods, -count_without_period
)
score = ll * f_length * f_periods * f_penalty
score = log_likelihood * f_length * f_periods * f_penalty

yield typ, score, is_add
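Spelling out the scoring that this hunk renames: each candidate type receives the Dunning log-likelihood scaled by three factors,

score = \ell \cdot f_{length} \cdot f_{periods} \cdot f_{penalty}

where \ell is the log_likelihood computed above, f_{penalty} = 1 when IGNORE_ABBREV_PENALTY is set and n_{nonperiods}^{-c_{without\,period}} otherwise (as in the visible lines), and f_{length} and f_{periods} are defined in the lines collapsed out of this hunk.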

@@ -1004,7 +1004,7 @@ def find_abbrev_types(self):
"""
self._params.clear_abbrevs()
tokens = (typ for typ in self._type_fdist if typ and typ.endswith("."))
for abbr, score, is_add in self._reclassify_abbrev_types(tokens):
for abbr, score, _is_add in self._reclassify_abbrev_types(tokens):
if score >= self.ABBREV:
self._params.abbrev_types.add(abbr)

@@ -1046,7 +1046,7 @@ def _is_rare_abbrev_type(self, cur_tok, next_tok):
# and (iii) never occurs with an uppercase letter
# sentence-internally.
# [xx] should the check for (ii) be modified??
elif next_tok.first_lower:
if next_tok.first_lower:
typ2 = next_tok.type_no_sentperiod
typ2ortho_context = self._params.ortho_context[typ2]
if (typ2ortho_context & _ORTHO_BEG_UC) and not (
@@ -1090,19 +1090,19 @@ def _col_log_likelihood(count_a, count_b, count_ab, N):
p1 = count_ab / count_a
try:
p2 = (count_b - count_ab) / (N - count_a)
except ZeroDivisionError as e:
except ZeroDivisionError:
p2 = 1

try:
summand1 = count_ab * math.log(p) + (count_a - count_ab) * math.log(1.0 - p)
except ValueError as e:
except ValueError:
summand1 = 0

try:
summand2 = (count_b - count_ab) * math.log(p) + (
N - count_a - count_b + count_ab
) * math.log(1.0 - p)
except ValueError as e:
except ValueError:
summand2 = 0

if count_a == count_ab or p1 <= 0 or p1 >= 1:
@@ -1164,14 +1164,14 @@ def _find_collocations(self):
and self.MIN_COLLOC_FREQ < col_count <= min(typ1_count, typ2_count)
):

ll = self._col_log_likelihood(
log_likelihood = self._col_log_likelihood(
typ1_count, typ2_count, col_count, self._type_fdist.N()
)
# Filter out the not-so-collocative
if ll >= self.COLLOCATION and (
if log_likelihood >= self.COLLOCATION and (
self._type_fdist.N() / typ1_count > typ2_count / col_count
):
yield (typ1, typ2), ll
yield (typ1, typ2), log_likelihood
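For readers checking the math: the summands visible in _col_log_likelihood above are the binomial log-likelihood terms of a Dunning-style ratio test. Writing \log L(k, n, x) = k \log x + (n - k) \log(1 - x), the visible lines compute

summand_1 = \log L(c_{ab}, c_a, p) \quad and \quad summand_2 = \log L(c_b - c_{ab}, N - c_a, p),

with p the pooled null-hypothesis estimate defined earlier in the function, p_1 = c_{ab} / c_a and p_2 = (c_b - c_{ab}) / (N - c_a); the except blocks fall back to safe values (p_2 = 1, or a zero summand) where a division or logarithm would be undefined. A candidate pair is then kept only if the resulting ratio reaches COLLOCATION and the pair co-occurs more often than independence would predict (the N / c_{typ1} > c_{typ2} / c_{col} check above).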

# ////////////////////////////////////////////////////////////
# { Sentence-Starter Finder
@@ -1206,19 +1206,19 @@ def _find_sent_starters(self):
# needed after freq_threshold
continue

ll = self._col_log_likelihood(
log_likelihood = self._col_log_likelihood(
self._sentbreak_count,
typ_count,
typ_at_break_count,
self._type_fdist.N(),
)

if (
ll >= self.SENT_STARTER
log_likelihood >= self.SENT_STARTER
and self._type_fdist.N() / self._sentbreak_count
> typ_count / typ_at_break_count
):
yield typ, ll
yield typ, log_likelihood
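After training, the output of this finder and of the collocation finder above lands on the parameters object; a small inspection sketch, assuming a PunktTrainer instance named trainer has already been trained:

params = trainer.get_params()
print(sorted(params.sent_starters)[:10])   # types that frequently start sentences
print(sorted(params.collocations)[:10])    # (type1, type2) pairs kept as collocations
print(sorted(params.abbrev_types)[:10])    # abbreviations found by the abbreviation finder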

def _get_sentbreak_count(self, tokens):
"""
@@ -1321,8 +1321,8 @@ def span_tokenize(self, text, realign_boundaries=True):
slices = self._slices_from_text(text)
if realign_boundaries:
slices = self._realign_boundaries(text, slices)
for sl in slices:
yield (sl.start, sl.stop)
for sentence in slices:
yield (sentence.start, sentence.stop)
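A minimal sketch of the spans this yields, assuming the pretrained English punkt pickle is available through nltk.data:

import nltk

tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
text = "Hello there. How are you?"
for start, stop in tokenizer.span_tokenize(text):
    print((start, stop), repr(text[start:stop]))
# On the pretrained model this should print spans covering "Hello there." and "How are you?"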

def sentences_from_text(self, text, realign_boundaries=True):
"""
@@ -1362,31 +1362,31 @@ def _realign_boundaries(self, text, slices):
["(Sent1.)", "Sent2."].
"""
realign = 0
for sl1, sl2 in _pair_iter(slices):
sl1 = slice(sl1.start + realign, sl1.stop)
if not sl2:
if text[sl1]:
yield sl1
for sentence1, sentence2 in _pair_iter(slices):
sentence1 = slice(sentence1.start + realign, sentence1.stop)
if not sentence2:
if text[sentence1]:
yield sentence1
continue

m = self._lang_vars.re_boundary_realignment.match(text[sl2])
m = self._lang_vars.re_boundary_realignment.match(text[sentence2])
if m:
yield slice(sl1.start, sl2.start + len(m.group(0).rstrip()))
yield slice(sentence1.start, sentence2.start + len(m.group(0).rstrip()))
realign = m.end()
else:
realign = 0
if text[sl1]:
yield sl1
if text[sentence1]:
yield sentence1
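To see the realignment in action, reusing the docstring's own example (again assuming the pretrained English tokenizer, loaded as in the span_tokenize sketch above):

import nltk

tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
text = "(Sent1.) Sent2."
# Per the docstring above, realignment should give ['(Sent1.)', 'Sent2.'],
# keeping the trailing ')' with the sentence whose period precedes it.
print(tokenizer.tokenize(text, realign_boundaries=True))
print(tokenizer.tokenize(text, realign_boundaries=False))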

def text_contains_sentbreak(self, text):
"""
Returns True if the given text includes a sentence break.
"""
found = False # used to ignore last token
for t in self._annotate_tokens(self._tokenize_words(text)):
for tok in self._annotate_tokens(self._tokenize_words(text)):
if found:
return True
if t.sentbreak:
if tok.sentbreak:
found = True
return False

@@ -1448,15 +1448,15 @@ def _build_sentence_list(self, text, tokens):
pos = 0

# A regular expression that finds pieces of whitespace:
WS_REGEXP = re.compile(r"\s*")
white_space_regexp = re.compile(r"\s*")

sentence = ""
for aug_tok in tokens:
tok = aug_tok.tok

# Find the whitespace before this token, and update pos.
ws = WS_REGEXP.match(text, pos).group()
pos += len(ws)
white_space = white_space_regexp.match(text, pos).group()
pos += len(white_space)

# Some of the rules used by the punkt word tokenizer
# strip whitespace out of the text, resulting in tokens
@@ -1477,7 +1477,7 @@ def _build_sentence_list(self, text, tokens):
# sentence, then include any whitespace that separated it
# from the previous token.
if sentence:
sentence += ws
sentence += white_space
sentence += tok

# If we're at a sentence break, then start a new sentence.
@@ -1519,9 +1519,9 @@ def _annotate_second_pass(self, tokens):
tokens, making use of the orthographic heuristic (4.1.1), collocation
heuristic (4.1.2) and frequent sentence starter heuristic (4.1.3).
"""
for t1, t2 in _pair_iter(tokens):
self._second_pass_annotation(t1, t2)
yield t1
for token1, token2 in _pair_iter(tokens):
self._second_pass_annotation(token1, token2)
yield token1

def _second_pass_annotation(self, aug_tok1, aug_tok2):
"""
@@ -1532,13 +1532,10 @@ def _second_pass_annotation(self, aug_tok1, aug_tok2):
if not aug_tok2:
return

tok = aug_tok1.tok
if not aug_tok1.period_final:
# We only care about words ending in periods.
return

typ = aug_tok1.type_no_period
next_tok = aug_tok2.tok
next_typ = aug_tok2.type_no_sentperiod
tok_is_initial = aug_tok1.is_initial

@@ -1588,8 +1585,7 @@ def _second_pass_annotation(self, aug_tok1, aug_tok2):
aug_tok1.abbr = True
if tok_is_initial:
return REASON_INITIAL_WITH_ORTHOGRAPHIC_HEURISTIC
else:
return REASON_NUMBER_WITH_ORTHOGRAPHIC_HEURISTIC
return REASON_NUMBER_WITH_ORTHOGRAPHIC_HEURISTIC

# Special heuristic for initials: if orthographic
# heuristic is unknown, and next word is always
@@ -1665,5 +1661,5 @@ def demo(text, tok_cls=PunktSentenceTokenizer, train_cls=PunktTrainer):
trainer.INCLUDE_ALL_COLLOCS = True
trainer.train(text)
sbd = tok_cls(trainer.get_params())
for l in sbd.sentences_from_text(text):
print(cleanup(l))
for sentence in sbd.sentences_from_text(text):
print(cleanup(sentence))
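The demo above doubles as the basic training recipe; a self-contained sketch of the same pattern, where my_corpus.txt is a hypothetical plain-text training file:

from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer

with open("my_corpus.txt", encoding="utf-8") as corpus_file:  # hypothetical corpus
    corpus_text = corpus_file.read()

trainer = PunktTrainer()
trainer.INCLUDE_ALL_COLLOCS = True   # consider every collocation candidate, as demo() does
trainer.train(corpus_text)

tokenizer = PunktSentenceTokenizer(trainer.get_params())
for sentence in tokenizer.sentences_from_text(corpus_text[:1000]):
    print(sentence)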