Skip to content

Commit

Permalink
Edit_distance now computes the actual Damerau-Levenshtein edit-distan…
Browse files Browse the repository at this point in the history
…ce (#2736)

* Edit_distance now computes the actual Damerau-Levenshtein edit-distance

* adapted edit_distance_align to the changes in _edit_distance_step

* Added a couple of unit tests for the Levenshtein edit distance with vs. without transpositions

* pre commit fails when pushing

* committing to run pre-commit hooks

* fixed edit distance unit tests and edit distance with transpositions

* Added and pytest-ified edit_distance tests

Co-authored-by: Tom Aarsen <Cubiegamedev@gmail.com>
  • Loading branch information
avena554 and tomaarsen committed Sep 18, 2021
1 parent 53dbaa5 commit 77b5945
Show file tree
Hide file tree
Showing 2 changed files with 149 additions and 4 deletions.
30 changes: 26 additions & 4 deletions nltk/metrics/distance.py
Expand Up @@ -34,7 +34,13 @@ def _edit_dist_init(len1, len2):
return lev


def _edit_dist_step(lev, i, j, s1, s2, substitution_cost=1, transpositions=False):
def _last_left_t_init(sigma):
    """Build the initial "last seen in s1" table for the given alphabet.

    :param sigma: iterable of hashable symbols (the alphabet of both strings)
    :return: a new dict mapping every symbol of ``sigma`` to ``0``

    The value ``0`` acts as the "not seen yet" marker: callers store 1-based
    positions in this table, and the transposition step only fires when the
    looked-up position is greater than zero.
    """
    # The shared default is the immutable int 0, so dict.fromkeys is safe here.
    return dict.fromkeys(sigma, 0)


def _edit_dist_step(
lev, i, j, s1, s2, last_left, last_right, substitution_cost=1, transpositions=False
):
c1 = s1[i - 1]
c2 = s2[j - 1]

Expand All @@ -47,9 +53,8 @@ def _edit_dist_step(lev, i, j, s1, s2, substitution_cost=1, transpositions=False

# transposition
d = c + 1 # never picked by default
if transpositions and i > 1 and j > 1:
if s1[i - 2] == c2 and s2[j - 2] == c1:
d = lev[i - 2][j - 2] + 1
if transpositions and last_left > 0 and last_right > 0:
d = lev[last_left - 1][last_right - 1] + i - last_left + j - last_right - 1

# pick the cheapest
lev[i][j] = min(a, b, c, d)
Expand Down Expand Up @@ -85,18 +90,33 @@ def edit_distance(s1, s2, substitution_cost=1, transpositions=False):
len2 = len(s2)
lev = _edit_dist_init(len1 + 1, len2 + 1)

# retrieve alphabet
sigma = set()
sigma.update(s1)
sigma.update(s2)

# set up table to remember positions of last seen occurrence in s1
last_left_t = _last_left_t_init(sigma)

# iterate over the array
for i in range(len1):
last_right = 0
for j in range(len2):
last_left = last_left_t[s2[j]]
_edit_dist_step(
lev,
i + 1,
j + 1,
s1,
s2,
last_left,
last_right,
substitution_cost=substitution_cost,
transpositions=transpositions,
)
if s1[i] == s2[j]:
last_right = j + 1
last_left_t[s1[i]] = i + 1
return lev[len1][len2]


Expand Down Expand Up @@ -162,6 +182,8 @@ def edit_distance_align(s1, s2, substitution_cost=1):
j + 1,
s1,
s2,
0,
0,
substitution_cost=substitution_cost,
transpositions=False,
)
Expand Down
123 changes: 123 additions & 0 deletions nltk/test/unit/test_distance.py
@@ -0,0 +1,123 @@
from typing import Tuple

import pytest

from nltk.metrics.distance import edit_distance


class TestEditDistance:
    """Tests for `edit_distance` with and without transpositions."""

    @pytest.mark.parametrize(
        "left,right,substitution_cost,expecteds",
        [
            # Legend for the derivations below:
            #   -D-> deletion, -I-> insertion, -S-> substitution,
            #   -T-> transposition of two adjacent characters.
            #
            # Allowing transpositions reduces the number of edits required.
            # with transpositions:
            # e.g. "abc" -D-> "ac" -T-> "ca": 2 steps
            #
            # without transpositions:
            # e.g. "abc" -D-> "ab" -D-> "a" -I-> "ca": 3 steps
            ("abc", "ca", 1, (2, 3)),
            ("abc", "ca", 5, (2, 3)),  # Doesn't *require* substitutions
            # Note, a substitution_cost of higher than 2 doesn't make much
            # sense, as a deletion + insertion is identical, and always
            # costs 2.
            #
            #
            # Transpositions don't always reduce the number of edits required:
            # with or without transpositions:
            # e.g. "wants" -D-> "wats" -D-> "was" -I-> "wasp": 3 steps
            ("wants", "wasp", 1, (3, 3)),
            ("wants", "wasp", 5, (3, 3)),  # Doesn't *require* substitutions
            #
            #
            # Ought to have the same results with and without transpositions
            # with or without transpositions:
            # e.g. "rain" -S-> "sain" -S-> "shin" -I-> "shine": 3 steps
            # (but cost 5 if substitution_cost=2)
            ("rain", "shine", 1, (3, 3)),
            ("rain", "shine", 2, (5, 5)),  # Does *require* substitutions
            #
            #
            # Several potentially interesting typos
            # with transpositions:
            # e.g. "acbdef" -T-> "abcdef": 1 step
            #
            # without transpositions:
            # e.g. "acbdef" -D-> "abdef" -I-> "abcdef": 2 steps
            ("acbdef", "abcdef", 1, (1, 2)),
            ("acbdef", "abcdef", 2, (1, 2)),  # Doesn't *require* substitutions
            #
            #
            # with transpositions:
            # e.g. "lnaguaeg" -T-> "languaeg" -T-> "language": 2 steps
            #
            # without transpositions:
            # e.g. "lnaguaeg" -D-> "laguaeg" -I-> "languaeg" -D-> "languag" -I-> "language": 4 steps
            ("lnaguaeg", "language", 1, (2, 4)),
            ("lnaguaeg", "language", 2, (2, 4)),  # Doesn't *require* substitutions
            #
            #
            # with transpositions:
            # e.g. "lnaugage" -T-> "lanugage" -T-> "language": 2 steps
            #
            # without transpositions:
            # e.g. "lnaugage" -S-> "lnangage" -D-> "langage" -I-> "language": 3 steps
            # (but one substitution, so a cost of 4 if substitution_cost = 2)
            ("lnaugage", "language", 1, (2, 3)),
            ("lnaugage", "language", 2, (2, 4)),
            # Does *require* substitutions if no transpositions
            #
            #
            # with transpositions:
            # e.g. "lngauage" -T-> "lnaguage" -T-> "language": 2 steps
            # without transpositions:
            # e.g. "lngauage" -I-> "langauage" -D-> "language": 2 steps
            ("lngauage", "language", 1, (2, 2)),
            ("lngauage", "language", 2, (2, 2)),  # Doesn't *require* substitutions
            #
            #
            # with or without transpositions:
            # e.g. "wants" -S-> "sants" -S-> "swnts" -S-> "swits" -S-> "swims" -D-> "swim": 5 steps
            #
            # with substitution_cost=2 and transpositions:
            # e.g. "wants" -D-> "wnts" -D-> "wts" -D-> "ws" -T-> "sw"
            # -I-> "swi" -I-> "swim": 6 steps
            #
            # with substitution_cost=2 and no transpositions:
            # e.g. "wants" -I-> "swants" -D-> "swant" -D-> "swan" -D-> "swa" -D-> "sw"
            # -I-> "swi" -I-> "swim": 7 steps
            ("wants", "swim", 1, (5, 5)),
            ("wants", "swim", 2, (6, 7)),
            #
            #
            # with or without transpositions:
            # e.g. "kitten" -S-> "sitten" -S-> "sittin" -I-> "sitting": 3 steps
            # (but cost 5 if substitution_cost=2)
            ("kitten", "sitting", 1, (3, 3)),
            ("kitten", "sitting", 2, (5, 5)),
        ],
    )
    def test_with_transpositions(
        self, left: str, right: str, substitution_cost: int, expecteds: Tuple[int, int]
    ):
        """Test `edit_distance` between two strings, given some `substitution_cost`,
        and whether transpositions are allowed.

        Args:
            left (str): First input string to `edit_distance`.
            right (str): Second input string to `edit_distance`.
            substitution_cost (int): The cost of a substitution action in `edit_distance`.
            expecteds (Tuple[int, int]): A tuple of expected outputs, such that `expecteds[0]` is
                the expected output with `transpositions=True`, and `expecteds[1]` is
                the expected output with `transpositions=False`.
        """
        # Test the input strings in both orderings: every expected value must
        # hold regardless of which string is passed first.
        for s1, s2 in ((left, right), (right, left)):
            # zip with [True, False] to get the transpositions value
            for expected, transpositions in zip(expecteds, [True, False]):
                predicted = edit_distance(
                    s1,
                    s2,
                    substitution_cost=substitution_cost,
                    transpositions=transpositions,
                )
                # Spell out the failing combination; the parametrize id alone
                # does not reveal the ordering or the transpositions flag.
                assert predicted == expected, (
                    f"edit_distance({s1!r}, {s2!r}, "
                    f"substitution_cost={substitution_cost}, "
                    f"transpositions={transpositions}) returned {predicted}, "
                    f"expected {expected}"
                )

0 comments on commit 77b5945

Please sign in to comment.