Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Edit_distance now computes the actual Damerau-Levenshtein edit-distance #2736

30 changes: 26 additions & 4 deletions nltk/metrics/distance.py
Expand Up @@ -34,7 +34,13 @@ def _edit_dist_init(len1, len2):
return lev


def _edit_dist_step(lev, i, j, s1, s2, substitution_cost=1, transpositions=False):
def _last_left_t_init(sigma):
return {c: 0 for c in sigma}


def _edit_dist_step(
lev, i, j, s1, s2, last_left, last_right, substitution_cost=1, transpositions=False
):
c1 = s1[i - 1]
c2 = s2[j - 1]

Expand All @@ -47,9 +53,8 @@ def _edit_dist_step(lev, i, j, s1, s2, substitution_cost=1, transpositions=False

# transposition
d = c + 1 # never picked by default
if transpositions and i > 1 and j > 1:
if s1[i - 2] == c2 and s2[j - 2] == c1:
d = lev[i - 2][j - 2] + 1
if transpositions and last_left > 0 and last_right > 0:
d = lev[last_left - 1][last_right - 1] + i - last_left + j - last_right - 1

# pick the cheapest
lev[i][j] = min(a, b, c, d)
Expand Down Expand Up @@ -85,18 +90,33 @@ def edit_distance(s1, s2, substitution_cost=1, transpositions=False):
len2 = len(s2)
lev = _edit_dist_init(len1 + 1, len2 + 1)

# retrieve alphabet
sigma = set()
sigma.update(s1)
sigma.update(s2)

# set up table to remember positions of last seen occurrence in s1
last_left_t = _last_left_t_init(sigma)

# iterate over the array
for i in range(len1):
last_right = 0
for j in range(len2):
last_left = last_left_t[s2[j]]
_edit_dist_step(
lev,
i + 1,
j + 1,
s1,
s2,
last_left,
last_right,
substitution_cost=substitution_cost,
transpositions=transpositions,
)
if s1[i] == s2[j]:
last_right = j + 1
last_left_t[s1[i]] = i + 1
return lev[len1][len2]


Expand Down Expand Up @@ -162,6 +182,8 @@ def edit_distance_align(s1, s2, substitution_cost=1):
j + 1,
s1,
s2,
0,
0,
substitution_cost=substitution_cost,
transpositions=False,
)
Expand Down
123 changes: 123 additions & 0 deletions nltk/test/unit/test_distance.py
@@ -0,0 +1,123 @@
from typing import Tuple

import pytest

from nltk.metrics.distance import edit_distance


class TestEditDistance:
    """Unit tests for `edit_distance`, with and without transpositions."""

    @pytest.mark.parametrize(
        "left,right,substitution_cost,expecteds",
        [
            # Legend for the worked examples below:
            #   -D-> deletion, -I-> insertion, -S-> substitution,
            #   -T-> transposition of two adjacent characters.
            # Each `expecteds` tuple is
            #   (distance with transpositions, distance without).
            #
            # Allowing transpositions can reduce the number of edits.
            # with transpositions:
            #   "abc" -D-> "ac" -T-> "ca": 2 steps
            # without transpositions:
            #   "abc" -D-> "ab" -D-> "a" -I-> "ca": 3 steps
            ("abc", "ca", 1, (2, 3)),
            ("abc", "ca", 5, (2, 3)),  # Doesn't *require* substitutions
            # Note: a substitution_cost higher than 2 doesn't make much
            # sense, since a deletion followed by an insertion has the
            # same effect and always costs 2.
            #
            #
            # Transpositions don't always reduce the number of edits.
            # with or without transpositions:
            #   "wants" -D-> "wats" -D-> "was" -I-> "wasp": 3 steps
            ("wants", "wasp", 1, (3, 3)),
            ("wants", "wasp", 5, (3, 3)),  # Doesn't *require* substitutions
            #
            #
            # Same result with and without transpositions.
            # with or without transpositions:
            #   "rain" -S-> "sain" -S-> "shin" -I-> "shine": 3 steps
            #   (but cost 5 if substitution_cost=2)
            ("rain", "shine", 1, (3, 3)),
            ("rain", "shine", 2, (5, 5)),  # Does *require* substitutions
            #
            #
            # Several potentially interesting typos.
            # with transpositions:
            #   "acbdef" -T-> "abcdef": 1 step
            # without transpositions:
            #   "acbdef" -D-> "abdef" -I-> "abcdef": 2 steps
            ("acbdef", "abcdef", 1, (1, 2)),
            ("acbdef", "abcdef", 2, (1, 2)),  # Doesn't *require* substitutions
            #
            #
            # with transpositions:
            #   "lnaguaeg" -T-> "languaeg" -T-> "language": 2 steps
            # without transpositions:
            #   "lnaguaeg" -D-> "laguaeg" -I-> "languaeg"
            #              -D-> "languag" -I-> "language": 4 steps
            ("lnaguaeg", "language", 1, (2, 4)),
            ("lnaguaeg", "language", 2, (2, 4)),  # Doesn't *require* substitutions
            #
            #
            # with transpositions:
            #   "lnaugage" -T-> "lanugage" -T-> "language": 2 steps
            # without transpositions:
            #   "lnaugage" -S-> "lnangage" -D-> "langage" -I-> "language": 3 steps
            #   (one substitution, so a cost of 4 if substitution_cost=2)
            ("lnaugage", "language", 1, (2, 3)),
            ("lnaugage", "language", 2, (2, 4)),
            # Does *require* substitutions if no transpositions
            #
            #
            # with transpositions:
            #   "lngauage" -T-> "lnaguage" -T-> "language": 2 steps
            # without transpositions:
            #   "lngauage" -I-> "langauage" -D-> "language": 2 steps
            ("lngauage", "language", 1, (2, 2)),
            ("lngauage", "language", 2, (2, 2)),  # Doesn't *require* substitutions
            #
            #
            # with or without transpositions:
            #   "wants" -S-> "sants" -S-> "swnts" -S-> "swits"
            #           -S-> "swims" -D-> "swim": 5 steps
            #
            # with substitution_cost=2 and transpositions:
            #   "wants" -D-> "wnts" -D-> "wts" -D-> "ws" -T-> "sw"
            #           -I-> "swi" -I-> "swim": 6 steps
            #
            # with substitution_cost=2 and no transpositions:
            #   "wants" -I-> "swants" -D-> "swant" -D-> "swan" -D-> "swa"
            #           -D-> "sw" -I-> "swi" -I-> "swim": 7 steps
            ("wants", "swim", 1, (5, 5)),
            ("wants", "swim", 2, (6, 7)),
            #
            #
            # with or without transpositions:
            #   "kitten" -S-> "sitten" -S-> "sittin" -I-> "sitting": 3 steps
            #   (but cost 5 if substitution_cost=2)
            ("kitten", "sitting", 1, (3, 3)),
            ("kitten", "sitting", 2, (5, 5)),
        ],
    )
    def test_with_transpositions(
        self, left: str, right: str, substitution_cost: int, expecteds: Tuple[int, int]
    ):
        """Check `edit_distance` for one string pair under both transposition modes.

        The pair is evaluated in both argument orders, and every result is
        compared against the matching entry of `expecteds`.

        Args:
            left (str): First input string to `edit_distance`.
            right (str): Second input string to `edit_distance`.
            substitution_cost (int): Cost charged for a substitution.
            expecteds (Tuple[int, int]): Expected distances, where
                `expecteds[0]` applies to `transpositions=True` and
                `expecteds[1]` applies to `transpositions=False`.
        """
        # Both argument orders are checked against the same expected values.
        orderings = ((left, right), (right, left))
        for first, second in orderings:
            # Pair each expected value with the transpositions flag it belongs to.
            for want, allow_transpositions in zip(expecteds, (True, False)):
                got = edit_distance(
                    first,
                    second,
                    substitution_cost=substitution_cost,
                    transpositions=allow_transpositions,
                )
                assert got == want