refactor: perform linting for punkt.py (#2830)
* refactor: perform linting for punkt.py

* fix failed test-cases

* fix naming for variables
12mohaned committed Oct 9, 2021
1 parent c05b0e7 commit 82ceb20
120 changes: 58 additions & 62 deletions nltk/tokenize/punkt.py
@@ -308,18 +308,18 @@ def period_context_re(self):
# ////////////////////////////////////////////////////////////


-def _pair_iter(it):
+def _pair_iter(iterator):
"""
Yields pairs of tokens from the given iterator such that each input
token will appear as the first element in a yielded tuple. The last
pair will have None as its second element.
"""
-it = iter(it)
+iterator = iter(iterator)
try:
-prev = next(it)
+prev = next(iterator)
except StopIteration:
return
-for el in it:
+for el in iterator:
yield (prev, el)
prev = el
yield (prev, None)
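For illustration (not part of the commit), the pairing behavior of _pair_iter:

# Each element appears once as the first member of a pair; the final
# pair is padded with None so the last element is still visited.
list(_pair_iter([1, 2, 3]))   # -> [(1, 2), (2, 3), (3, None)]
list(_pair_iter([]))          # -> [] (empty input yields no pairs)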
@@ -369,18 +369,18 @@ def add_ortho_context(self, typ, flag):
self.ortho_context[typ] |= flag

def _debug_ortho_context(self, typ):
-c = self.ortho_context[typ]
-if c & _ORTHO_BEG_UC:
+context = self.ortho_context[typ]
+if context & _ORTHO_BEG_UC:
yield "BEG-UC"
-if c & _ORTHO_MID_UC:
+if context & _ORTHO_MID_UC:
yield "MID-UC"
-if c & _ORTHO_UNK_UC:
+if context & _ORTHO_UNK_UC:
yield "UNK-UC"
-if c & _ORTHO_BEG_LC:
+if context & _ORTHO_BEG_LC:
yield "BEG-LC"
-if c & _ORTHO_MID_LC:
+if context & _ORTHO_MID_LC:
yield "MID-LC"
-if c & _ORTHO_UNK_LC:
+if context & _ORTHO_UNK_LC:
yield "UNK-LC"


@@ -401,8 +401,8 @@ def __init__(self, tok, **params):
self.type = self._get_type(tok)
self.period_final = tok.endswith(".")

-for p in self._properties:
-setattr(self, p, None)
+for prop in self._properties:
+setattr(self, prop, None)
for k in params:
setattr(self, k, params[k])

@@ -456,7 +456,7 @@ def first_lower(self):
def first_case(self):
if self.first_lower:
return "lower"
-elif self.first_upper:
+if self.first_upper:
return "upper"
return "none"

@@ -570,8 +570,8 @@ def _tokenize_words(self, plaintext):
yield self._Token(tok, parastart=parastart, linestart=True)
parastart = False

-for t in line_toks:
-yield self._Token(t)
+for tok in line_toks:
+yield self._Token(tok)
else:
parastart = True

@@ -817,16 +817,16 @@ def finalize_training(self, verbose=False):
collocations and sentence starters.
"""
self._params.clear_sent_starters()
-for typ, ll in self._find_sent_starters():
+for typ, log_likelihood in self._find_sent_starters():
self._params.sent_starters.add(typ)
if verbose:
print(f" Sent Starter: [{ll:6.4f}] {typ!r}")
print(f" Sent Starter: [{log_likelihood:6.4f}] {typ!r}")

self._params.clear_collocations()
-for (typ1, typ2), ll in self._find_collocations():
+for (typ1, typ2), log_likelihood in self._find_collocations():
self._params.collocations.add((typ1, typ2))
if verbose:
print(f" Collocation: [{ll:6.4f}] {typ1!r}+{typ2!r}")
print(f" Collocation: [{log_likelihood:6.4f}] {typ1!r}+{typ2!r}")

self._finalized = True
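A hedged usage sketch: these verbose lines appear when training finalizes, assuming train() forwards its verbose flag to finalize_training():

from nltk.tokenize.punkt import PunktTrainer

trainer = PunktTrainer()
trainer.train(corpus_text, verbose=True)  # corpus_text: any plain-text sample you supply
params = trainer.get_params()             # prints "Sent Starter: ..." and "Collocation: ..." lines above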

@@ -971,11 +971,11 @@ def _reclassify_abbrev_types(self, types):
# Let <a> be the candidate without the period, and <b>
# be the period. Find a log likelihood ratio that
# indicates whether <ab> occurs as a single unit (high
-# value of ll), or as two independent units <a> and
-# <b> (low value of ll).
+# value of log_likelihood), or as two independent units <a> and
+# <b> (low value of log_likelihood).
count_with_period = self._type_fdist[typ + "."]
count_without_period = self._type_fdist[typ]
-ll = self._dunning_log_likelihood(
+log_likelihood = self._dunning_log_likelihood(
count_with_period + count_without_period,
self._num_period_toks,
count_with_period,
@@ -992,7 +992,7 @@ def _reclassify_abbrev_types(self, types):
f_penalty = int(self.IGNORE_ABBREV_PENALTY) or math.pow(
num_nonperiods, -count_without_period
)
-score = ll * f_length * f_periods * f_penalty
+score = log_likelihood * f_length * f_periods * f_penalty

yield typ, score, is_add
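A rough reading of the score with hypothetical counts; f_length and f_periods are defined above this hunk, so their formulas here are assumptions based on the surrounding code:

import math

typ = "etc"                   # hypothetical candidate, final period stripped
count_without_period = 2      # bare "etc" seen twice
num_nonperiods = 3            # non-period characters in "etc"
f_length = math.exp(-num_nonperiods)                          # assumed: favors short types
f_periods = 1                                                 # assumed: internal periods + 1
f_penalty = math.pow(num_nonperiods, -count_without_period)   # 3**-2, as in this hunk
# score = log_likelihood * f_length * f_periods * f_penalty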

Expand All @@ -1004,7 +1004,7 @@ def find_abbrev_types(self):
"""
self._params.clear_abbrevs()
tokens = (typ for typ in self._type_fdist if typ and typ.endswith("."))
-for abbr, score, is_add in self._reclassify_abbrev_types(tokens):
+for abbr, score, _is_add in self._reclassify_abbrev_types(tokens):
if score >= self.ABBREV:
self._params.abbrev_types.add(abbr)

@@ -1046,7 +1046,7 @@ def _is_rare_abbrev_type(self, cur_tok, next_tok):
# and (iii) never occurs with an uppercase letter
# sentence-internally.
# [xx] should the check for (ii) be modified??
-elif next_tok.first_lower:
+if next_tok.first_lower:
typ2 = next_tok.type_no_sentperiod
typ2ortho_context = self._params.ortho_context[typ2]
if (typ2ortho_context & _ORTHO_BEG_UC) and not (
@@ -1090,19 +1090,19 @@ def _col_log_likelihood(count_a, count_b, count_ab, N):
p1 = count_ab / count_a
try:
p2 = (count_b - count_ab) / (N - count_a)
-except ZeroDivisionError as e:
+except ZeroDivisionError:
p2 = 1

try:
summand1 = count_ab * math.log(p) + (count_a - count_ab) * math.log(1.0 - p)
-except ValueError as e:
+except ValueError:
summand1 = 0

try:
summand2 = (count_b - count_ab) * math.log(p) + (
N - count_a - count_b + count_ab
) * math.log(1.0 - p)
-except ValueError as e:
+except ValueError:
summand2 = 0

if count_a == count_ab or p1 <= 0 or p1 >= 1:
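For intuition, a self-contained sketch of the same Dunning (1993) log-likelihood ratio, omitting the zero/one guards that the method handles with try/except above:

import math

def col_ll(count_a, count_b, count_ab, N):
    p = count_b / N                            # null: P(b|a) == P(b|not a)
    p1 = count_ab / count_a                    # alternative: P(b|a)
    p2 = (count_b - count_ab) / (N - count_a)  # alternative: P(b|not a)

    def binom_log(k, n, x):                    # log-likelihood of k successes in n trials
        return k * math.log(x) + (n - k) * math.log(1.0 - x)

    null = binom_log(count_ab, count_a, p) + binom_log(count_b - count_ab, N - count_a, p)
    alt = binom_log(count_ab, count_a, p1) + binom_log(count_b - count_ab, N - count_a, p2)
    return -2.0 * (null - alt)

col_ll(50, 40, 20, 1000)  # large ratio: "b" follows "a" far more often than chance predicts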
@@ -1164,14 +1164,14 @@ def _find_collocations(self):
and self.MIN_COLLOC_FREQ < col_count <= min(typ1_count, typ2_count)
):

-ll = self._col_log_likelihood(
+log_likelihood = self._col_log_likelihood(
typ1_count, typ2_count, col_count, self._type_fdist.N()
)
# Filter out the not-so-collocative
-if ll >= self.COLLOCATION and (
+if log_likelihood >= self.COLLOCATION and (
self._type_fdist.N() / typ1_count > typ2_count / col_count
):
-yield (typ1, typ2), ll
+yield (typ1, typ2), log_likelihood

# ////////////////////////////////////////////////////////////
# { Sentence-Starter Finder
@@ -1206,19 +1206,19 @@ def _find_sent_starters(self):
# needed after freq_threshold
continue

-ll = self._col_log_likelihood(
+log_likelihood = self._col_log_likelihood(
self._sentbreak_count,
typ_count,
typ_at_break_count,
self._type_fdist.N(),
)

if (
-ll >= self.SENT_STARTER
+log_likelihood >= self.SENT_STARTER
and self._type_fdist.N() / self._sentbreak_count
> typ_count / typ_at_break_count
):
-yield typ, ll
+yield typ, log_likelihood

def _get_sentbreak_count(self, tokens):
"""
@@ -1321,8 +1321,8 @@ def span_tokenize(self, text, realign_boundaries=True):
slices = self._slices_from_text(text)
if realign_boundaries:
slices = self._realign_boundaries(text, slices)
-for sl in slices:
-yield (sl.start, sl.stop)
+for sentence in slices:
+yield (sentence.start, sentence.stop)

def sentences_from_text(self, text, realign_boundaries=True):
"""
@@ -1362,31 +1362,31 @@ def _realign_boundaries(self, text, slices):
["(Sent1.)", "Sent2."].
"""
realign = 0
-for sl1, sl2 in _pair_iter(slices):
-sl1 = slice(sl1.start + realign, sl1.stop)
-if not sl2:
-if text[sl1]:
-yield sl1
+for sentence1, sentence2 in _pair_iter(slices):
+sentence1 = slice(sentence1.start + realign, sentence1.stop)
+if not sentence2:
+if text[sentence1]:
+yield sentence1
continue

-m = self._lang_vars.re_boundary_realignment.match(text[sl2])
+m = self._lang_vars.re_boundary_realignment.match(text[sentence2])
if m:
-yield slice(sl1.start, sl2.start + len(m.group(0).rstrip()))
+yield slice(sentence1.start, sentence2.start + len(m.group(0).rstrip()))
realign = m.end()
else:
realign = 0
-if text[sl1]:
-yield sl1
+if text[sentence1]:
+yield sentence1

def text_contains_sentbreak(self, text):
"""
Returns True if the given text includes a sentence break.
"""
found = False # used to ignore last token
-for t in self._annotate_tokens(self._tokenize_words(text)):
+for tok in self._annotate_tokens(self._tokenize_words(text)):
if found:
return True
-if t.sentbreak:
+if tok.sentbreak:
found = True
return False

@@ -1448,15 +1448,15 @@ def _build_sentence_list(self, text, tokens):
pos = 0

# A regular expression that finds pieces of whitespace:
-WS_REGEXP = re.compile(r"\s*")
+white_space_regexp = re.compile(r"\s*")

sentence = ""
for aug_tok in tokens:
tok = aug_tok.tok

# Find the whitespace before this token, and update pos.
-ws = WS_REGEXP.match(text, pos).group()
-pos += len(ws)
+white_space = white_space_regexp.match(text, pos).group()
+pos += len(white_space)

# Some of the rules used by the punkt word tokenizer
# strip whitespace out of the text, resulting in tokens
Expand All @@ -1477,7 +1477,7 @@ def _build_sentence_list(self, text, tokens):
# sentence, then include any whitespace that separated it
# from the previous token.
if sentence:
-sentence += ws
+sentence += white_space
sentence += tok

# If we're at a sentence break, then start a new sentence.
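The pos bookkeeping relies on matching at an explicit offset; a tiny stdlib-only illustration:

import re
white_space_regexp = re.compile(r"\s*")

text = "one   two"
pos = 3                                            # just past "one"
ws = white_space_regexp.match(text, pos).group()   # "   " (r"\s*" always matches, possibly empty)
pos += len(ws)                                     # pos == 6, the start of "two"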
@@ -1519,9 +1519,9 @@ def _annotate_second_pass(self, tokens):
tokens, making use of the orthographic heuristic (4.1.1), collocation
heuristic (4.1.2) and frequent sentence starter heuristic (4.1.3).
"""
-for t1, t2 in _pair_iter(tokens):
-self._second_pass_annotation(t1, t2)
-yield t1
+for token1, token2 in _pair_iter(tokens):
+self._second_pass_annotation(token1, token2)
+yield token1

def _second_pass_annotation(self, aug_tok1, aug_tok2):
"""
Expand All @@ -1532,13 +1532,10 @@ def _second_pass_annotation(self, aug_tok1, aug_tok2):
if not aug_tok2:
return

-tok = aug_tok1.tok
if not aug_tok1.period_final:
# We only care about words ending in periods.
return

typ = aug_tok1.type_no_period
-next_tok = aug_tok2.tok
next_typ = aug_tok2.type_no_sentperiod
tok_is_initial = aug_tok1.is_initial

@@ -1588,8 +1585,7 @@ def _second_pass_annotation(self, aug_tok1, aug_tok2):
aug_tok1.abbr = True
if tok_is_initial:
return REASON_INITIAL_WITH_ORTHOGRAPHIC_HEURISTIC
-else:
-return REASON_NUMBER_WITH_ORTHOGRAPHIC_HEURISTIC
+return REASON_NUMBER_WITH_ORTHOGRAPHIC_HEURISTIC

# Special heuristic for initials: if orthographic
# heuristic is unknown, and next word is always
@@ -1665,5 +1661,5 @@ def demo(text, tok_cls=PunktSentenceTokenizer, train_cls=PunktTrainer):
trainer.INCLUDE_ALL_COLLOCS = True
trainer.train(text)
sbd = tok_cls(trainer.get_params())
-for l in sbd.sentences_from_text(text):
-print(cleanup(l))
+for sentence in sbd.sentences_from_text(text):
+print(cleanup(sentence))
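A hypothetical call (file name assumed):

with open("sample_corpus.txt", encoding="utf8") as f:
    demo(f.read())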
