Skip to content

Commit

Permalink
fix: Levenshtein distance for duplicated letters (#2849)
Browse files Browse the repository at this point in the history
  • Loading branch information
p9f committed Oct 27, 2021
1 parent 00a5d80 commit ad3c84c
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 9 deletions.
21 changes: 12 additions & 9 deletions nltk/metrics/distance.py
Expand Up @@ -99,24 +99,27 @@ def edit_distance(s1, s2, substitution_cost=1, transpositions=False):
last_left_t = _last_left_t_init(sigma)

# iterate over the array
for i in range(len1):
last_right = 0
for j in range(len2):
last_left = last_left_t[s2[j]]
# i and j start from 1 rather than 0 to stay close to the Wikipedia pseudo-code
# see https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
for i in range(1, len1 + 1):
last_right_buf = 0
for j in range(1, len2 + 1):
last_left = last_left_t[s2[j - 1]]
last_right = last_right_buf
if s1[i - 1] == s2[j - 1]:
last_right_buf = j
_edit_dist_step(
lev,
i + 1,
j + 1,
i,
j,
s1,
s2,
last_left,
last_right,
substitution_cost=substitution_cost,
transpositions=transpositions,
)
if s1[i] == s2[j]:
last_right = j + 1
last_left_t[s1[i]] = i + 1
last_left_t[s1[i - 1]] = i
return lev[len1][len2]


Expand Down
6 changes: 6 additions & 0 deletions nltk/test/unit/test_distance.py
Expand Up @@ -94,6 +94,12 @@ class TestEditDistance:
# (but cost 5 if substitution_cost=2)
("kitten", "sitting", 1, (3, 3)),
("kitten", "sitting", 2, (5, 5)),
#
# duplicated letter
# e.g. "duplicated" -I-> "duuplicated" (a single inserted "u")
("duplicated", "duuplicated", 1, (1, 1)),
("duplicated", "duuplicated", 2, (1, 1)),
("very duplicated", "very duuplicateed", 2, (2, 2)),
],
)
def test_with_transpositions(
Expand Down

0 comments on commit ad3c84c

Please sign in to comment.