From 9bd512eb5cbd6785e819331653ff7d908866f777 Mon Sep 17 00:00:00 2001 From: antoine Date: Tue, 22 Jun 2021 16:29:15 -0400 Subject: [PATCH 1/7] Edit_distance now computes the actual Damerau-Levenshtein edit-distance --- nltk/metrics/distance.py | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/nltk/metrics/distance.py b/nltk/metrics/distance.py index 7aee73bd6c..1a00d3b585 100644 --- a/nltk/metrics/distance.py +++ b/nltk/metrics/distance.py @@ -35,7 +35,11 @@ def _edit_dist_init(len1, len2): return lev -def _edit_dist_step(lev, i, j, s1, s2, substitution_cost=1, transpositions=False): +def _last_left_t_init(sigma): + return {c: 0 for c in sigma} + + +def _edit_dist_step(lev, i, j, s1, s2, last_left, last_right, substitution_cost=1, transpositions=False): c1 = s1[i - 1] c2 = s2[j - 1] @@ -48,9 +52,8 @@ def _edit_dist_step(lev, i, j, s1, s2, substitution_cost=1, transpositions=False # transposition d = c + 1 # never picked by default - if transpositions and i > 1 and j > 1: - if s1[i - 2] == c2 and s2[j - 2] == c1: - d = lev[i - 2][j - 2] + 1 + if transpositions and last_left > 0 and last_right > 0: + d = lev[last_left - 1][last_right - 1] + i - last_left + j - last_right - 1 # pick the cheapest lev[i][j] = min(a, b, c, d) @@ -86,18 +89,33 @@ def edit_distance(s1, s2, substitution_cost=1, transpositions=False): len2 = len(s2) lev = _edit_dist_init(len1 + 1, len2 + 1) + # retrieve alphabet + sigma = set() + sigma.update(s1) + sigma.update(s2) + + # set up table to remember positions of last seen occurrence in s1 + last_left_t = _last_left_t_init(sigma) + # iterate over the array for i in range(len1): + last_right = 0 for j in range(len2): + last_left = last_left_t[s2[j]] _edit_dist_step( lev, i + 1, j + 1, s1, s2, + last_left, + last_right, substitution_cost=substitution_cost, transpositions=transpositions, ) + if s1[i] == s2[j]: + last_right = j + 1 + last_left_t[s2[j]] = i + 1 return lev[len1][len2] From 
abcb431acb19d6464a1782e393d62bd35e264deb Mon Sep 17 00:00:00 2001 From: antoine Date: Tue, 22 Jun 2021 17:40:33 -0400 Subject: [PATCH 2/7] adapted edit_distance_align to the changes in _edit_distance_step --- nltk/metrics/distance.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nltk/metrics/distance.py b/nltk/metrics/distance.py index 1a00d3b585..51817fd4f5 100644 --- a/nltk/metrics/distance.py +++ b/nltk/metrics/distance.py @@ -181,6 +181,8 @@ def edit_distance_align(s1, s2, substitution_cost=1): j + 1, s1, s2, + 0, + 0, substitution_cost=substitution_cost, transpositions=False, ) From 48c15495a2a01671b9c1d9959341a17088f05e77 Mon Sep 17 00:00:00 2001 From: antoine Date: Mon, 13 Sep 2021 18:27:34 -0400 Subject: [PATCH 3/7] +couple unit test for the levensthein edit distance with vs without transpositions --- nltk/test/unit/test_distance.py | 34 +++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 nltk/test/unit/test_distance.py diff --git a/nltk/test/unit/test_distance.py b/nltk/test/unit/test_distance.py new file mode 100644 index 0000000000..d4233f06ad --- /dev/null +++ b/nltk/test/unit/test_distance.py @@ -0,0 +1,34 @@ +import unittest +from nltk.metrics.distance import edit_distance + + +class TestEditDistanceABC(unittest.TestCase): + + def setUp(self): + self.left = "abc" + self.right = "ca" + + def test_with_transpositions(self): + self.assertEqual(edit_distance(self.left, self.right, transpositions=True), 2) + self.assertEqual(edit_distance(self.right, self.left, transpositions=True), 2) + + def test_without_transpositions(self): + self.assertEqual(edit_distance(self.left, self.right, transpositions=False), 3) + self.assertEqual(edit_distance(self.right, self.left, transpositions=False), 3) + + +class TestEditDistanceWithHigherSubCost(unittest.TestCase): + + def setUp(self): + self.left = "wants" + self.right = "Wasp" + self.sub_cost = 2 + + def test_with_transpositions(self): + 
self.assertEqual(edit_distance(self.left, self.right, substitution_cost=2, transpositions=True), 4) + self.assertEqual(edit_distance(self.right, self.left, substitution_cost=2, transpositions=True), 4) + + def test_without_transpositions(self): + self.assertEqual(edit_distance(self.left, self.right, substitution_cost=2, transpositions=False), 5) + self.assertEqual(edit_distance(self.right, self.left, substitution_cost=2, transpositions=False), 5) + From b11966d37e42e5275012ee51fec68e94668699e0 Mon Sep 17 00:00:00 2001 From: antoine Date: Mon, 13 Sep 2021 20:34:58 -0400 Subject: [PATCH 4/7] pre commit fails when pushing --- nltk/test/unit/test_distance.py | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/nltk/test/unit/test_distance.py b/nltk/test/unit/test_distance.py index d4233f06ad..4c1dfdca0a 100644 --- a/nltk/test/unit/test_distance.py +++ b/nltk/test/unit/test_distance.py @@ -1,9 +1,9 @@ import unittest + from nltk.metrics.distance import edit_distance class TestEditDistanceABC(unittest.TestCase): - def setUp(self): self.left = "abc" self.right = "ca" @@ -18,17 +18,35 @@ def test_without_transpositions(self): class TestEditDistanceWithHigherSubCost(unittest.TestCase): - def setUp(self): self.left = "wants" self.right = "Wasp" self.sub_cost = 2 def test_with_transpositions(self): - self.assertEqual(edit_distance(self.left, self.right, substitution_cost=2, transpositions=True), 4) - self.assertEqual(edit_distance(self.right, self.left, substitution_cost=2, transpositions=True), 4) + self.assertEqual( + edit_distance( + self.left, self.right, substitution_cost=2, transpositions=True + ), + 4, + ) + self.assertEqual( + edit_distance( + self.right, self.left, substitution_cost=2, transpositions=True + ), + 4, + ) def test_without_transpositions(self): - self.assertEqual(edit_distance(self.left, self.right, substitution_cost=2, transpositions=False), 5) - self.assertEqual(edit_distance(self.right, self.left, 
substitution_cost=2, transpositions=False), 5) - + self.assertEqual( + edit_distance( + self.left, self.right, substitution_cost=2, transpositions=False + ), + 5, + ) + self.assertEqual( + edit_distance( + self.right, self.left, substitution_cost=2, transpositions=False + ), + 5, + ) From f157b3fc46b57f0d8cca09f01693a3f79af142e4 Mon Sep 17 00:00:00 2001 From: antoine Date: Mon, 13 Sep 2021 20:41:14 -0400 Subject: [PATCH 5/7] commiting to run pre-commit hooks --- nltk/metrics/distance.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nltk/metrics/distance.py b/nltk/metrics/distance.py index 19845c965e..9a07f495ee 100644 --- a/nltk/metrics/distance.py +++ b/nltk/metrics/distance.py @@ -38,7 +38,9 @@ def _last_left_t_init(sigma): return {c: 0 for c in sigma} -def _edit_dist_step(lev, i, j, s1, s2, last_left, last_right, substitution_cost=1, transpositions=False): +def _edit_dist_step( + lev, i, j, s1, s2, last_left, last_right, substitution_cost=1, transpositions=False +): c1 = s1[i - 1] c2 = s2[j - 1] From 39e414c35b1780fc2af5f90584f8f3ec84244b0f Mon Sep 17 00:00:00 2001 From: antoine Date: Tue, 14 Sep 2021 21:08:58 -0400 Subject: [PATCH 6/7] fixed edit distance unit tests and edit distance with transpositions --- nltk/metrics/distance.py | 4 +-- nltk/test/unit/test_distance.py | 64 +++++++++++++++++++-------------- 2 files changed, 39 insertions(+), 29 deletions(-) diff --git a/nltk/metrics/distance.py b/nltk/metrics/distance.py index 9a07f495ee..c0239953f8 100644 --- a/nltk/metrics/distance.py +++ b/nltk/metrics/distance.py @@ -53,7 +53,7 @@ def _edit_dist_step( # transposition d = c + 1 # never picked by default - if transpositions and last_left > 0 and last_right > 0: + if (transpositions and (last_left > 0)) and (last_right > 0): d = lev[last_left - 1][last_right - 1] + i - last_left + j - last_right - 1 # pick the cheapest @@ -116,7 +116,7 @@ def edit_distance(s1, s2, substitution_cost=1, transpositions=False): ) if s1[i] == s2[j]: 
last_right = j + 1 - last_left_t[s2[j]] = i + 1 + last_left_t[s1[i]] = i + 1 return lev[len1][len2] diff --git a/nltk/test/unit/test_distance.py b/nltk/test/unit/test_distance.py index 4c1dfdca0a..7c78c725cf 100644 --- a/nltk/test/unit/test_distance.py +++ b/nltk/test/unit/test_distance.py @@ -3,50 +3,60 @@ from nltk.metrics.distance import edit_distance -class TestEditDistanceABC(unittest.TestCase): - def setUp(self): - self.left = "abc" - self.right = "ca" - - def test_with_transpositions(self): - self.assertEqual(edit_distance(self.left, self.right, transpositions=True), 2) - self.assertEqual(edit_distance(self.right, self.left, transpositions=True), 2) - - def test_without_transpositions(self): - self.assertEqual(edit_distance(self.left, self.right, transpositions=False), 3) - self.assertEqual(edit_distance(self.right, self.left, transpositions=False), 3) - - -class TestEditDistanceWithHigherSubCost(unittest.TestCase): - def setUp(self): - self.left = "wants" - self.right = "Wasp" - self.sub_cost = 2 +class EditDistanceTestPattern: + def __init__( + self, s1, s2, expected_transpositions, expected_no_transpositions, sub_cost=1 + ): + self.s1 = s1 + self.s2 = s2 + self.expected_tr = expected_transpositions + self.expected_no_tr = expected_no_transpositions + self.sub_cost = sub_cost def test_with_transpositions(self): self.assertEqual( edit_distance( - self.left, self.right, substitution_cost=2, transpositions=True + self.s1, self.s2, substitution_cost=self.sub_cost, transpositions=True ), - 4, + self.expected_tr, ) self.assertEqual( edit_distance( - self.right, self.left, substitution_cost=2, transpositions=True + self.s2, self.s1, substitution_cost=self.sub_cost, transpositions=True ), - 4, + self.expected_tr, ) def test_without_transpositions(self): self.assertEqual( edit_distance( - self.left, self.right, substitution_cost=2, transpositions=False + self.s1, self.s2, substitution_cost=self.sub_cost, transpositions=False ), - 5, + self.expected_no_tr, ) 
self.assertEqual( edit_distance( - self.right, self.left, substitution_cost=2, transpositions=False + self.s2, self.s1, substitution_cost=self.sub_cost, transpositions=False ), - 5, + self.expected_no_tr, ) + + +class TestEditDistanceABC(unittest.TestCase, EditDistanceTestPattern): + def __init__(self, *args, **k_args): + EditDistanceTestPattern.__init__(self, "abc", "ca", 2, 3) + unittest.TestCase.__init__(self, *args, **k_args) + + +class TestEditDistanceWithHigherSubCost(unittest.TestCase, EditDistanceTestPattern): + def __init__(self, *args, **k_args): + EditDistanceTestPattern.__init__(self, "wants", "swim", 6, 7, sub_cost=2) + unittest.TestCase.__init__(self, *args, **k_args) + + +class TestEditDistanceWithNoTranspositionBenefit( + unittest.TestCase, EditDistanceTestPattern +): + def __init__(self, *args, **k_args): + EditDistanceTestPattern.__init__(self, "wants", "wasp", 3, 3) + unittest.TestCase.__init__(self, *args, **k_args) From 62a1a8ed284b791e1b6354101d3990f142788ef4 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Wed, 15 Sep 2021 10:56:57 +0200 Subject: [PATCH 7/7] Added and pytest-ified edit_distance tests --- nltk/metrics/distance.py | 2 +- nltk/test/unit/test_distance.py | 173 +++++++++++++++++++++----------- 2 files changed, 118 insertions(+), 57 deletions(-) diff --git a/nltk/metrics/distance.py b/nltk/metrics/distance.py index c0239953f8..c0da4a1753 100644 --- a/nltk/metrics/distance.py +++ b/nltk/metrics/distance.py @@ -53,7 +53,7 @@ def _edit_dist_step( # transposition d = c + 1 # never picked by default - if (transpositions and (last_left > 0)) and (last_right > 0): + if transpositions and last_left > 0 and last_right > 0: d = lev[last_left - 1][last_right - 1] + i - last_left + j - last_right - 1 # pick the cheapest diff --git a/nltk/test/unit/test_distance.py b/nltk/test/unit/test_distance.py index 7c78c725cf..bea1b542c2 100644 --- a/nltk/test/unit/test_distance.py +++ b/nltk/test/unit/test_distance.py @@ -1,62 +1,123 @@ -import unittest 
+from typing import Tuple + +import pytest from nltk.metrics.distance import edit_distance -class EditDistanceTestPattern: - def __init__( - self, s1, s2, expected_transpositions, expected_no_transpositions, sub_cost=1 +class TestEditDistance: + @pytest.mark.parametrize( + "left,right,substitution_cost,expecteds", + [ + # Allowing transpositions reduces the number of edits required. + # with transpositions: + # e.g. "abc" -D-> "ac" -T-> "ca": 2 steps + # + # without transpositions: + # e.g. "abc" -D-> "ab" -D-> "a" -I-> "ca": 3 steps + ("abc", "ca", 1, (2, 3)), + ("abc", "ca", 5, (2, 3)), # Doesn't *require* substitutions + # Note, a substitution_cost higher than 2 doesn't make much + # sense, as a deletion + insertion is identical, and always + # costs 2. + # + # + # Transpositions don't always reduce the number of edits required: + # with or without transpositions: + # e.g. "wants" -D-> "wats" -D-> "was" -I-> "wasp": 3 steps + ("wants", "wasp", 1, (3, 3)), + ("wants", "wasp", 5, (3, 3)), # Doesn't *require* substitutions + # + # + # Ought to have the same results with and without transpositions + # with or without transpositions: + # e.g. "rain" -S-> "sain" -S-> "shin" -I-> "shine": 3 steps + # (but cost 5 if substitution_cost=2) + ("rain", "shine", 1, (3, 3)), + ("rain", "shine", 2, (5, 5)), # Does *require* substitutions + # + # + # Several potentially interesting typos + # with transpositions: + # e.g. "acbdef" -T-> "abcdef": 1 step + # + # without transpositions: + # e.g. "acbdef" -D-> "abdef" -I-> "abcdef": 2 steps + ("acbdef", "abcdef", 1, (1, 2)), + ("acbdef", "abcdef", 2, (1, 2)), # Doesn't *require* substitutions + # + # + # with transpositions: + # e.g. "lnaguaeg" -T-> "languaeg" -T-> "language": 2 steps + # + # without transpositions: + # e.g.
"lnaguaeg" -D-> "laguaeg" -I-> "languaeg" -D-> "languag" -I-> "language": 4 steps + ("lnaguaeg", "language", 1, (2, 4)), + ("lnaguaeg", "language", 2, (2, 4)), # Doesn't *require* substitutions + # + # + # with transpositions: + # e.g. "lnaugage" -T-> "lanugage" -T-> "language": 2 steps + # + # without transpositions: + # e.g. "lnaugage" -S-> "lnangage" -D-> "langage" -I-> "language": 3 steps + # (but one substitution, so a cost of 4 if substitution_cost = 2) + ("lnaugage", "language", 1, (2, 3)), + ("lnaugage", "language", 2, (2, 4)), + # Does *require* substitutions if no transpositions + # + # + # with transpositions: + # e.g. "lngauage" -T-> "lnaguage" -T-> "language": 2 steps + # without transpositions: + # e.g. "lngauage" -I-> "langauage" -D-> "language": 2 steps + ("lngauage", "language", 1, (2, 2)), + ("lngauage", "language", 2, (2, 2)), # Doesn't *require* substitutions + # + # + # with or without transpositions: + # e.g. "wants" -S-> "sants" -S-> "swnts" -S-> "swits" -S-> "swims" -D-> "swim": 5 steps + # + # with substitution_cost=2 and transpositions: + # e.g. "wants" -D-> "wnts" -D-> "wts" -D-> "ws" -T-> "sw" + # -I-> "swi" -I-> "swim": 6 steps + # + # with substitution_cost=2 and no transpositions: + # e.g. "wants" -I-> "swants" -D-> "swant" -D-> "swan" -D-> "swa" -D-> "sw" + # -I-> "swi" -I-> "swim": 7 steps + ("wants", "swim", 1, (5, 5)), + ("wants", "swim", 2, (6, 7)), + # + # + # with or without transpositions: + # e.g.
"kitten" -S-> "sitten" -S-> "sittin" -I-> "sitting": 3 steps + # (but cost 5 if substitution_cost=2) + ("kitten", "sitting", 1, (3, 3)), + ("kitten", "sitting", 2, (5, 5)), + ], + ) + def test_with_transpositions( + self, left: str, right: str, substitution_cost: int, expecteds: Tuple[int, int] ): - self.s1 = s1 - self.s2 = s2 - self.expected_tr = expected_transpositions - self.expected_no_tr = expected_no_transpositions - self.sub_cost = sub_cost - - def test_with_transpositions(self): - self.assertEqual( - edit_distance( - self.s1, self.s2, substitution_cost=self.sub_cost, transpositions=True - ), - self.expected_tr, - ) - self.assertEqual( - edit_distance( - self.s2, self.s1, substitution_cost=self.sub_cost, transpositions=True - ), - self.expected_tr, - ) - - def test_without_transpositions(self): - self.assertEqual( - edit_distance( - self.s1, self.s2, substitution_cost=self.sub_cost, transpositions=False - ), - self.expected_no_tr, - ) - self.assertEqual( - edit_distance( - self.s2, self.s1, substitution_cost=self.sub_cost, transpositions=False - ), - self.expected_no_tr, - ) - - -class TestEditDistanceABC(unittest.TestCase, EditDistanceTestPattern): - def __init__(self, *args, **k_args): - EditDistanceTestPattern.__init__(self, "abc", "ca", 2, 3) - unittest.TestCase.__init__(self, *args, **k_args) - - -class TestEditDistanceWithHigherSubCost(unittest.TestCase, EditDistanceTestPattern): - def __init__(self, *args, **k_args): - EditDistanceTestPattern.__init__(self, "wants", "swim", 6, 7, sub_cost=2) - unittest.TestCase.__init__(self, *args, **k_args) - + """Test `edit_distance` between two strings, given some `substitution_cost`, + and whether transpositions are allowed.
-class TestEditDistanceWithNoTranspositionBenefit( - unittest.TestCase, EditDistanceTestPattern -): - def __init__(self, *args, **k_args): - EditDistanceTestPattern.__init__(self, "wants", "wasp", 3, 3) - unittest.TestCase.__init__(self, *args, **k_args) + Args: + left (str): First input string to `edit_distance`. + right (str): Second input string to `edit_distance`. + substitution_cost (int): The cost of a substitution action in `edit_distance`. + expecteds (Tuple[int, int]): A tuple of expected outputs, such that `expecteds[0]` is + the expected output with `transpositions=True`, and `expecteds[1]` is + the expected output with `transpositions=False`. + """ + # Test the input strings in both orderings + for s1, s2 in ((left, right), (right, left)): + # zip with [True, False] to get the transpositions value + for expected, transpositions in zip(expecteds, [True, False]): + predicted = edit_distance( + s1, + s2, + substitution_cost=substitution_cost, + transpositions=transpositions, + ) + assert predicted == expected