diff --git a/nltk/metrics/distance.py b/nltk/metrics/distance.py index 1f3211bd03..c5af07cb38 100644 --- a/nltk/metrics/distance.py +++ b/nltk/metrics/distance.py @@ -99,14 +99,19 @@ def edit_distance(s1, s2, substitution_cost=1, transpositions=False): last_left_t = _last_left_t_init(sigma) # iterate over the array - for i in range(len1): - last_right = 0 - for j in range(len2): - last_left = last_left_t[s2[j]] + # i and j start from 1 and not 0 to stay close to the Wikipedia pseudo-code + # see https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance + for i in range(1, len1 + 1): + last_right_buf = 0 + for j in range(1, len2 + 1): + last_left = last_left_t[s2[j - 1]] + last_right = last_right_buf + if s1[i - 1] == s2[j - 1]: + last_right_buf = j _edit_dist_step( lev, - i + 1, - j + 1, + i, + j, s1, s2, last_left, @@ -114,9 +119,7 @@ def edit_distance(s1, s2, substitution_cost=1, transpositions=False): substitution_cost=substitution_cost, transpositions=transpositions, ) - if s1[i] == s2[j]: - last_right = j + 1 - last_left_t[s1[i]] = i + 1 + last_left_t[s1[i - 1]] = i return lev[len1][len2] diff --git a/nltk/test/unit/test_distance.py b/nltk/test/unit/test_distance.py index 23a5b9c4fc..96d814d0b8 100644 --- a/nltk/test/unit/test_distance.py +++ b/nltk/test/unit/test_distance.py @@ -94,6 +94,12 @@ class TestEditDistance: # (but cost 5 if substitution_cost=2) ("kitten", "sitting", 1, (3, 3)), ("kitten", "sitting", 2, (5, 5)), + # + # duplicated letter + # e.g. "duplicated" -D-> "duuplicated" + ("duplicated", "duuplicated", 1, (1, 1)), + ("duplicated", "duuplicated", 2, (1, 1)), + ("very duplicated", "very duuplicateed", 2, (2, 2)), ], ) def test_with_transpositions(