From ab53eb8474b8b3cbf7e74c46c1a824ddfc4ca33d Mon Sep 17 00:00:00 2001 From: guillaume Date: Thu, 7 Oct 2021 12:20:25 +0100 Subject: [PATCH 1/2] fix: levenstein distance for duplicated letters Fix a bug where a duplicated letter was not contributing to the distance, if transposition was set to true and duplicated letter was the left argument. ```python3 edit_distance("duuplicated", "duplicated", transpositions=False) edit_distance("duplicated", "duuplicated", transpositions=True) edit_distance("duuplicated", "duplicated", transpositions=True) # all return 1 - correct edit_distance("duplicated", "duuplicated", transpositions=True) # returns 0 - incorrect ``` I believe it is a bug introduced three weeks ago by PR [2736]. The fix make nltk implementation closer to the [wikipedia] pseudo code, which should make further reviews / iteration easier I believe. [2736]: https://github.com/nltk/nltk/pull/2736 [wikipedia]: https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Distance_with_adjacent_transpositions --- nltk/metrics/distance.py | 9 +++++---- nltk/test/unit/test_distance.py | 6 ++++++ 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/nltk/metrics/distance.py b/nltk/metrics/distance.py index c0da4a1753..42f4272439 100644 --- a/nltk/metrics/distance.py +++ b/nltk/metrics/distance.py @@ -100,9 +100,12 @@ def edit_distance(s1, s2, substitution_cost=1, transpositions=False): # iterate over the array for i in range(len1): - last_right = 0 + last_right_buf = 0 for j in range(len2): last_left = last_left_t[s2[j]] + last_right = last_right_buf + if s1[i] == s2[j]: + last_right_buf = j + 1 _edit_dist_step( lev, i + 1, @@ -114,9 +117,7 @@ def edit_distance(s1, s2, substitution_cost=1, transpositions=False): substitution_cost=substitution_cost, transpositions=transpositions, ) - if s1[i] == s2[j]: - last_right = j + 1 - last_left_t[s1[i]] = i + 1 + last_left_t[s1[i]] = i + 1 return lev[len1][len2] diff --git a/nltk/test/unit/test_distance.py b/nltk/test/unit/test_distance.py index bea1b542c2..d39415e8df 100644 --- a/nltk/test/unit/test_distance.py +++ b/nltk/test/unit/test_distance.py @@ -94,6 +94,12 @@ class TestEditDistance: # (but cost 5 if substitution_cost=2) ("kitten", "sitting", 1, (3, 3)), ("kitten", "sitting", 2, (5, 5)), + # + # duplicated letter + # e.g. "duplicated" -D-> "duplicated" + ("duplicated", "duuplicated", 1, (1, 1)), + ("duplicated", "duuplicated", 2, (1, 1)), + ("very duplicated", "very duuplicateed", 2, (2, 2)), ], ) def test_with_transpositions( From de73ed2d3010b8255f3af23d328f1039a846e7e2 Mon Sep 17 00:00:00 2001 From: guillaume Date: Tue, 12 Oct 2021 13:59:00 +0100 Subject: [PATCH 2/2] refactor: make edit distance closer to wikipedia pseudo code Start i / j loops from 1 and not 0 to make the code closer to [wikipedia] pseudo code, as requested by this pull request comment [0]. [wikipedia]: https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Distance_with_adjacent_transpositions [0]: https://github.com/nltk/nltk/pull/2849#issuecomment-940880097 --- nltk/metrics/distance.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/nltk/metrics/distance.py b/nltk/metrics/distance.py index 42f4272439..f8760e7322 100644 --- a/nltk/metrics/distance.py +++ b/nltk/metrics/distance.py @@ -99,17 +99,19 @@ def edit_distance(s1, s2, substitution_cost=1, transpositions=False): last_left_t = _last_left_t_init(sigma) # iterate over the array - for i in range(len1): + # i and j start from 1 and not 0 to stay close to the wikipedia pseudo-code + # see https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance + for i in range(1, len1 + 1): last_right_buf = 0 - for j in range(len2): - last_left = last_left_t[s2[j]] + for j in range(1, len2 + 1): + last_left = last_left_t[s2[j - 1]] last_right = last_right_buf - if s1[i] == s2[j]: - last_right_buf = j + 1 + if s1[i - 1] == s2[j - 1]: + last_right_buf = j _edit_dist_step( lev, - i + 1, - j + 1, + i, + j, s1, s2, last_left, @@ -117,7 +119,7 @@ def edit_distance(s1, s2, substitution_cost=1, transpositions=False): substitution_cost=substitution_cost, transpositions=transpositions, ) - last_left_t[s1[i]] = i + 1 + last_left_t[s1[i - 1]] = i return lev[len1][len2]