Skip to content

Commit

Permalink
Merge pull request #3172 from BroMattMiller/feature/Text-concordance-…
Browse files Browse the repository at this point in the history
…line-alignment

Align text.ConcordanceIndex.find_concordance()
  • Loading branch information
stevenbird committed Dec 17, 2023
2 parents 59a1dbc + ce6a0a2 commit 5f69622
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 4 deletions.
1 change: 1 addition & 0 deletions AUTHORS.md
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@
- David McClosky
- Xinfan Meng
- Dmitrijs Milajevs
- Matt Miller
- Margaret Mitchell
- Tomonori Nagano
- Jason Narad
Expand Down
13 changes: 9 additions & 4 deletions nltk/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

import re
import sys
import unicodedata
from collections import Counter, defaultdict, namedtuple
from functools import reduce
from math import log
Expand All @@ -27,7 +28,7 @@
from nltk.probability import ConditionalFreqDist as CFD
from nltk.probability import FreqDist
from nltk.tokenize import sent_tokenize
from nltk.util import LazyConcatenation, tokenwrap
from nltk.util import LazyConcatenation, cut_string, tokenwrap

ConcordanceLine = namedtuple(
"ConcordanceLine",
Expand Down Expand Up @@ -193,7 +194,9 @@ def find_concordance(self, word, width=80):
else:
phrase = [word]

half_width = (width - len(" ".join(phrase)) - 2) // 2
phrase_str = " ".join(phrase)
phrase_len = sum(1 for char in phrase_str if not unicodedata.combining(char))
half_width = (width - phrase_len - 2) // 2
context = width // 4 # approx number of words of context

# Find the instances of the word to create the ConcordanceLine
Expand All @@ -209,8 +212,10 @@ def find_concordance(self, word, width=80):
left_context = self._tokens[max(0, i - context) : i]
right_context = self._tokens[i + len(phrase) : i + context]
# Create the pretty lines with the query_word in the middle.
left_print = " ".join(left_context)[-half_width:]
right_print = " ".join(right_context)[:half_width]
left_print = cut_string(" ".join(left_context), -half_width).rjust(
half_width
)
right_print = cut_string(" ".join(right_context), half_width)
# The WYSIWYG line of the concordance.
line_print = " ".join([left_print, query_word, right_print])
# Create the ConcordanceLine
Expand Down
36 changes: 36 additions & 0 deletions nltk/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import pydoc
import re
import textwrap
import unicodedata
import warnings
from collections import defaultdict, deque
from itertools import chain, combinations, islice, tee
Expand Down Expand Up @@ -139,6 +140,41 @@ def tokenwrap(tokens, separator=" ", width=70):
return "\n".join(textwrap.wrap(separator.join(tokens), width=width))


def cut_string(s, width=70):
    """
    Cut off and return a given display width of a string.

    Return the same as ``s[:width]`` if ``width >= 0`` or ``s[width:]`` if
    ``width < 0``, as long as ``s`` has no unicode combining characters.
    If it has combining characters, they are not counted towards the
    width (a character is treated as combining when
    ``unicodedata.combining`` returns a non-zero class for it), so that the
    returned string's visible width matches the called-for width.
    Combining characters stay attached to the base character they follow:
    a cut never separates the last kept base character from the combining
    characters that come directly after it.

    If ``s`` has fewer than ``abs(width)`` non-combining characters, the
    whole string is returned.

    :param s: the string to cut
    :type s: str
    :param width: the display width; positive values keep the start of
        the string, negative values keep the end
    :type width: int
    :return: the leading (``width > 0``) or trailing (``width < 0``) part
        of ``s`` containing at most ``abs(width)`` non-combining characters
    :rtype: str
    """
    if width == 0:
        # Nothing is requested; do not even keep leading combining marks.
        return ""

    wanted = abs(width)
    visible = 0

    if width < 0:
        # Walk backwards; combining marks are passed before their base
        # character, so cutting at the base character keeps them together.
        for idx in range(len(s) - 1, -1, -1):
            if not unicodedata.combining(s[idx]):
                visible += 1
                if visible == wanted:
                    return s[idx:]
        return s

    # Walk forwards and cut just before the first base character that
    # would exceed the width, so that combining marks trailing the last
    # kept base character are included in the result.
    for idx, char in enumerate(s):
        if not unicodedata.combining(char):
            if visible == wanted:
                return s[:idx]
            visible += 1
    return s


##########################################################################
# Indexing
##########################################################################
Expand Down

0 comments on commit 5f69622

Please sign in to comment.