refactor: perform linting for punkt.py (#2830)
* refactor: perform linting for punkt.py

* fix failed test-cases

* fix naming for variables
12mohaned committed Oct 9, 2021
1 parent c05b0e7 commit 82ceb20
120 changes: 58 additions & 62 deletions nltk/tokenize/punkt.py
@@ -308,18 +308,18 @@ def period_context_re(self):
# ////////////////////////////////////////////////////////////


-def _pair_iter(it):
+def _pair_iter(iterator):
"""
Yields pairs of tokens from the given iterator such that each input
token will appear as the first element in a yielded tuple. The last
pair will have None as its second element.
"""
-it = iter(it)
+iterator = iter(iterator)
try:
-prev = next(it)
+prev = next(iterator)
except StopIteration:
return
-for el in it:
+for el in iterator:
yield (prev, el)
prev = el
yield (prev, None)
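For illustration (not part of the commit), the pairing behavior of _pair_iter:

# Each element appears once as the first member of a pair; the final
# pair is padded with None so the last element is still visited.
list(_pair_iter([1, 2, 3]))   # -> [(1, 2), (2, 3), (3, None)]
list(_pair_iter([]))          # -> [] (empty input yields no pairs)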
@@ -369,18 +369,18 @@ def add_ortho_context(self, typ, flag):
self.ortho_context[typ] |= flag

def _debug_ortho_context(self, typ):
-c = self.ortho_context[typ]
-if c & _ORTHO_BEG_UC:
+context = self.ortho_context[typ]
+if context & _ORTHO_BEG_UC:
yield "BEG-UC"
-if c & _ORTHO_MID_UC:
+if context & _ORTHO_MID_UC:
yield "MID-UC"
-if c & _ORTHO_UNK_UC:
+if context & _ORTHO_UNK_UC:
yield "UNK-UC"
-if c & _ORTHO_BEG_LC:
+if context & _ORTHO_BEG_LC:
yield "BEG-LC"
-if c & _ORTHO_MID_LC:
+if context & _ORTHO_MID_LC:
yield "MID-LC"
-if c & _ORTHO_UNK_LC:
+if context & _ORTHO_UNK_LC:
yield "UNK-LC"


@@ -401,8 +401,8 @@ def __init__(self, tok, **params):
self.type = self._get_type(tok)
self.period_final = tok.endswith(".")

-for p in self._properties:
-setattr(self, p, None)
+for prop in self._properties:
+setattr(self, prop, None)
for k in params:
setattr(self, k, params[k])

@@ -456,7 +456,7 @@ def first_lower(self):
def first_case(self):
if self.first_lower:
return "lower"
-elif self.first_upper:
+if self.first_upper:
return "upper"
return "none"

@@ -570,8 +570,8 @@ def _tokenize_words(self, plaintext):
yield self._Token(tok, parastart=parastart, linestart=True)
parastart = False

-for t in line_toks:
-yield self._Token(t)
+for tok in line_toks:
+yield self._Token(tok)
else:
parastart = True

@@ -817,16 +817,16 @@ def finalize_training(self, verbose=False):
collocations and sentence starters.
"""
self._params.clear_sent_starters()
-for typ, ll in self._find_sent_starters():
+for typ, log_likelihood in self._find_sent_starters():
self._params.sent_starters.add(typ)
if verbose:
print(f" Sent Starter: [{ll:6.4f}] {typ!r}")
print(f" Sent Starter: [{log_likelihood:6.4f}] {typ!r}")

self._params.clear_collocations()
-for (typ1, typ2), ll in self._find_collocations():
+for (typ1, typ2), log_likelihood in self._find_collocations():
self._params.collocations.add((typ1, typ2))
if verbose:
print(f" Collocation: [{ll:6.4f}] {typ1!r}+{typ2!r}")
print(f" Collocation: [{log_likelihood:6.4f}] {typ1!r}+{typ2!r}")

self._finalized = True
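A hedged usage sketch: these verbose lines appear when training finalizes, assuming train() forwards its verbose flag to finalize_training():

from nltk.tokenize.punkt import PunktTrainer

trainer = PunktTrainer()
trainer.train(corpus_text, verbose=True)  # corpus_text: any plain-text sample you supply
params = trainer.get_params()             # prints "Sent Starter: ..." and "Collocation: ..." lines above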

@@ -971,11 +971,11 @@ def _reclassify_abbrev_types(self, types):
# Let <a> be the candidate without the period, and <b>
# be the period. Find a log likelihood ratio that
# indicates whether <ab> occurs as a single unit (high
-# value of ll), or as two independent units <a> and
-# <b> (low value of ll).
+# value of log_likelihood), or as two independent units <a> and
+# <b> (low value of log_likelihood).
count_with_period = self._type_fdist[typ + "."]
count_without_period = self._type_fdist[typ]
-ll = self._dunning_log_likelihood(
+log_likelihood = self._dunning_log_likelihood(
count_with_period + count_without_period,
self._num_period_toks,
count_with_period,
@@ -992,7 +992,7 @@ def _reclassify_abbrev_types(self, types):
f_penalty = int(self.IGNORE_ABBREV_PENALTY) or math.pow(
num_nonperiods, -count_without_period
)
-score = ll * f_length * f_periods * f_penalty
+score = log_likelihood * f_length * f_periods * f_penalty

yield typ, score, is_add
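A rough reading of the score with hypothetical counts; f_length and f_periods are defined above this hunk, so their formulas here are assumptions based on the surrounding code:

import math

typ = "etc"                   # hypothetical candidate, final period stripped
count_without_period = 2      # bare "etc" seen twice
num_nonperiods = 3            # non-period characters in "etc"
f_length = math.exp(-num_nonperiods)                          # assumed: favors short types
f_periods = 1                                                 # assumed: internal periods + 1
f_penalty = math.pow(num_nonperiods, -count_without_period)   # 3**-2, as in this hunk
# score = log_likelihood * f_length * f_periods * f_penalty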

Expand All @@ -1004,7 +1004,7 @@ def find_abbrev_types(self):
"""
self._params.clear_abbrevs()
tokens = (typ for typ in self._type_fdist if typ and typ.endswith("."))
-for abbr, score, is_add in self._reclassify_abbrev_types(tokens):
+for abbr, score, _is_add in self._reclassify_abbrev_types(tokens):
if score >= self.ABBREV:
self._params.abbrev_types.add(abbr)

@@ -1046,7 +1046,7 @@ def _is_rare_abbrev_type(self, cur_tok, next_tok):
# and (iii) never occurs with an uppercase letter
# sentence-internally.
# [xx] should the check for (ii) be modified??
-elif next_tok.first_lower:
+if next_tok.first_lower:
typ2 = next_tok.type_no_sentperiod
typ2ortho_context = self._params.ortho_context[typ2]
if (typ2ortho_context & _ORTHO_BEG_UC) and not (
@@ -1090,19 +1090,19 @@ def _col_log_likelihood(count_a, count_b, count_ab, N):
p1 = count_ab / count_a
try:
p2 = (count_b - count_ab) / (N - count_a)
-except ZeroDivisionError as e:
+except ZeroDivisionError:
p2 = 1

try:
summand1 = count_ab * math.log(p) + (count_a - count_ab) * math.log(1.0 - p)
-except ValueError as e:
+except ValueError:
summand1 = 0

try:
summand2 = (count_b - count_ab) * math.log(p) + (
N - count_a - count_b + count_ab
) * math.log(1.0 - p)
-except ValueError as e:
+except ValueError:
summand2 = 0

if count_a == count_ab or p1 <= 0 or p1 >= 1:
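For intuition, a self-contained sketch of the same Dunning (1993) log-likelihood ratio, omitting the zero/one guards that the method handles with try/except above:

import math

def col_ll(count_a, count_b, count_ab, N):
    p = count_b / N                            # null: P(b|a) == P(b|not a)
    p1 = count_ab / count_a                    # alternative: P(b|a)
    p2 = (count_b - count_ab) / (N - count_a)  # alternative: P(b|not a)

    def binom_log(k, n, x):                    # log-likelihood of k successes in n trials
        return k * math.log(x) + (n - k) * math.log(1.0 - x)

    null = binom_log(count_ab, count_a, p) + binom_log(count_b - count_ab, N - count_a, p)
    alt = binom_log(count_ab, count_a, p1) + binom_log(count_b - count_ab, N - count_a, p2)
    return -2.0 * (null - alt)

col_ll(50, 40, 20, 1000)  # large ratio: "b" follows "a" far more often than chance predicts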
@@ -1164,14 +1164,14 @@ def _find_collocations(self):
and self.MIN_COLLOC_FREQ < col_count <= min(typ1_count, typ2_count)
):

-ll = self._col_log_likelihood(
+log_likelihood = self._col_log_likelihood(
typ1_count, typ2_count, col_count, self._type_fdist.N()
)
# Filter out the not-so-collocative
-if ll >= self.COLLOCATION and (
+if log_likelihood >= self.COLLOCATION and (
self._type_fdist.N() / typ1_count > typ2_count / col_count
):
-yield (typ1, typ2), ll
+yield (typ1, typ2), log_likelihood

# ////////////////////////////////////////////////////////////
# { Sentence-Starter Finder
@@ -1206,19 +1206,19 @@ def _find_sent_starters(self):
# needed after freq_threshold
continue

-ll = self._col_log_likelihood(
+log_likelihood = self._col_log_likelihood(
self._sentbreak_count,
typ_count,
typ_at_break_count,
self._type_fdist.N(),
)

if (
-ll >= self.SENT_STARTER
+log_likelihood >= self.SENT_STARTER
and self._type_fdist.N() / self._sentbreak_count
> typ_count / typ_at_break_count
):
-yield typ, ll
+yield typ, log_likelihood

def _get_sentbreak_count(self, tokens):
"""
@@ -1321,8 +1321,8 @@ def span_tokenize(self, text, realign_boundaries=True):
slices = self._slices_from_text(text)
if realign_boundaries:
slices = self._realign_boundaries(text, slices)
-for sl in slices:
-yield (sl.start, sl.stop)
+for sentence in slices:
+yield (sentence.start, sentence.stop)

def sentences_from_text(self, text, realign_boundaries=True):
"""
@@ -1362,31 +1362,31 @@ def _realign_boundaries(self, text, slices):
["(Sent1.)", "Sent2."].
"""
realign = 0
-for sl1, sl2 in _pair_iter(slices):
-sl1 = slice(sl1.start + realign, sl1.stop)
-if not sl2:
-if text[sl1]:
-yield sl1
+for sentence1, sentence2 in _pair_iter(slices):
+sentence1 = slice(sentence1.start + realign, sentence1.stop)
+if not sentence2:
+if text[sentence1]:
+yield sentence1
continue

-m = self._lang_vars.re_boundary_realignment.match(text[sl2])
+m = self._lang_vars.re_boundary_realignment.match(text[sentence2])
if m:
-yield slice(sl1.start, sl2.start + len(m.group(0).rstrip()))
+yield slice(sentence1.start, sentence2.start + len(m.group(0).rstrip()))
realign = m.end()
else:
realign = 0
-if text[sl1]:
-yield sl1
+if text[sentence1]:
+yield sentence1

def text_contains_sentbreak(self, text):
"""
Returns True if the given text includes a sentence break.
"""
found = False # used to ignore last token
-for t in self._annotate_tokens(self._tokenize_words(text)):
+for tok in self._annotate_tokens(self._tokenize_words(text)):
if found:
return True
-if t.sentbreak:
+if tok.sentbreak:
found = True
return False

@@ -1448,15 +1448,15 @@ def _build_sentence_list(self, text, tokens):
pos = 0

# A regular expression that finds pieces of whitespace:
-WS_REGEXP = re.compile(r"\s*")
+white_space_regexp = re.compile(r"\s*")

sentence = ""
for aug_tok in tokens:
tok = aug_tok.tok

# Find the whitespace before this token, and update pos.
-ws = WS_REGEXP.match(text, pos).group()
-pos += len(ws)
+white_space = white_space_regexp.match(text, pos).group()
+pos += len(white_space)

# Some of the rules used by the punkt word tokenizer
# strip whitespace out of the text, resulting in tokens
Expand All @@ -1477,7 +1477,7 @@ def _build_sentence_list(self, text, tokens):
# sentence, then include any whitespace that separated it
# from the previous token.
if sentence:
-sentence += ws
+sentence += white_space
sentence += tok

# If we're at a sentence break, then start a new sentence.
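The pos bookkeeping relies on matching at an explicit offset; a tiny stdlib-only illustration:

import re
white_space_regexp = re.compile(r"\s*")

text = "one   two"
pos = 3                                            # just past "one"
ws = white_space_regexp.match(text, pos).group()   # "   " (r"\s*" always matches, possibly empty)
pos += len(ws)                                     # pos == 6, the start of "two"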
@@ -1519,9 +1519,9 @@ def _annotate_second_pass(self, tokens):
tokens, making use of the orthographic heuristic (4.1.1), collocation
heuristic (4.1.2) and frequent sentence starter heuristic (4.1.3).
"""
-for t1, t2 in _pair_iter(tokens):
-self._second_pass_annotation(t1, t2)
-yield t1
+for token1, token2 in _pair_iter(tokens):
+self._second_pass_annotation(token1, token2)
+yield token1

def _second_pass_annotation(self, aug_tok1, aug_tok2):
"""
Expand All @@ -1532,13 +1532,10 @@ def _second_pass_annotation(self, aug_tok1, aug_tok2):
if not aug_tok2:
return

-tok = aug_tok1.tok
if not aug_tok1.period_final:
# We only care about words ending in periods.
return

typ = aug_tok1.type_no_period
-next_tok = aug_tok2.tok
next_typ = aug_tok2.type_no_sentperiod
tok_is_initial = aug_tok1.is_initial

@@ -1588,8 +1585,7 @@ def _second_pass_annotation(self, aug_tok1, aug_tok2):
aug_tok1.abbr = True
if tok_is_initial:
return REASON_INITIAL_WITH_ORTHOGRAPHIC_HEURISTIC
-else:
-return REASON_NUMBER_WITH_ORTHOGRAPHIC_HEURISTIC
+return REASON_NUMBER_WITH_ORTHOGRAPHIC_HEURISTIC

# Special heuristic for initials: if orthographic
# heuristic is unknown, and next word is always
@@ -1665,5 +1661,5 @@ def demo(text, tok_cls=PunktSentenceTokenizer, train_cls=PunktTrainer):
trainer.INCLUDE_ALL_COLLOCS = True
trainer.train(text)
sbd = tok_cls(trainer.get_params())
-for l in sbd.sentences_from_text(text):
-print(cleanup(l))
+for sentence in sbd.sentences_from_text(text):
+print(cleanup(sentence))
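A hypothetical call (file name assumed):

with open("sample_corpus.txt", encoding="utf8") as f:
    demo(f.read())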
