Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Edit_distance now computes the actual Damerau-Levenshtein edit-distan…
…ce (#2736) * Edit_distance now computes the actual Damerau-Levenshtein edit-distance * adapted edit_distance_align to the changes in _edit_distance_step * +couple unit test for the levensthein edit distance with vs without transpositions * pre commit fails when pushing * commiting to run pre-commit hooks * fixed edit distance unit tests and edit distance with transpositions * Added and pytest-ified edit_distance tests Co-authored-by: Tom Aarsen <Cubiegamedev@gmail.com>
- Loading branch information
Showing
2 changed files
with
149 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,123 @@ | ||
from typing import Tuple | ||
|
||
import pytest | ||
|
||
from nltk.metrics.distance import edit_distance | ||
|
||
|
||
class TestEditDistance: | ||
@pytest.mark.parametrize( | ||
"left,right,substitution_cost,expecteds", | ||
[ | ||
# Allowing transpositions reduces the number of edits required. | ||
# with transpositions: | ||
# e.g. "abc" -T-> "cba" -D-> "ca": 2 steps | ||
# | ||
# without transpositions: | ||
# e.g. "abc" -D-> "ab" -D-> "a" -I-> "ca": 3 steps | ||
("abc", "ca", 1, (2, 3)), | ||
("abc", "ca", 5, (2, 3)), # Doesn't *require* substitutions | ||
# Note, a substition_cost of higher than 2 doesn't make much | ||
# sense, as a deletion + insertion is identical, and always | ||
# costs 2. | ||
# | ||
# | ||
# Transpositions don't always reduce the number of edits required: | ||
# with or without transpositions: | ||
# e.g. "wants" -D-> "wats" -D-> "was" -I-> "wasp": 3 steps | ||
("wants", "wasp", 1, (3, 3)), | ||
("wants", "wasp", 5, (3, 3)), # Doesn't *require* substitutions | ||
# | ||
# | ||
# Ought to have the same results with and without transpositions | ||
# with or without transpositions: | ||
# e.g. "rain" -S-> "sain" -S-> "shin" -I-> "shine": 3 steps | ||
# (but cost 5 if substitution_cost=2) | ||
("rain", "shine", 1, (3, 3)), | ||
("rain", "shine", 2, (5, 5)), # Does *require* substitutions | ||
# | ||
# | ||
# Several potentially interesting typos | ||
# with transpositions: | ||
# e.g. "acbdef" -T-> "abcdef": 1 step | ||
# | ||
# without transpositions: | ||
# e.g. "acbdef" -D-> "abdef" -I-> "abcdef": 2 steps | ||
("acbdef", "abcdef", 1, (1, 2)), | ||
("acbdef", "abcdef", 2, (1, 2)), # Doesn't *require* substitutions | ||
# | ||
# | ||
# with transpositions: | ||
# e.g. "lnaguaeg" -T-> "languaeg" -T-> "language": 2 steps | ||
# | ||
# without transpositions: | ||
# e.g. "lnaguaeg" -D-> "laguaeg" -I-> "languaeg" -D-> "languag" -I-> "language": 4 steps | ||
("lnaguaeg", "language", 1, (2, 4)), | ||
("lnaguaeg", "language", 2, (2, 4)), # Doesn't *require* substitutions | ||
# | ||
# | ||
# with transpositions: | ||
# e.g. "lnaugage" -T-> "lanugage" -T-> "language": 2 steps | ||
# | ||
# without transpositions: | ||
# e.g. "lnaugage" -S-> "lnangage" -D-> "langage" -I-> "language": 3 steps | ||
# (but one substitution, so a cost of 4 if substition_cost = 2) | ||
("lnaugage", "language", 1, (2, 3)), | ||
("lnaugage", "language", 2, (2, 4)), | ||
# Does *require* substitutions if no transpositions | ||
# | ||
# | ||
# with transpositions: | ||
# e.g. "lngauage" -T-> "lnaguage" -T-> "language": 2 steps | ||
# without transpositions: | ||
# e.g. "lngauage" -I-> "lanaguage" -D-> "language": 2 steps | ||
("lngauage", "language", 1, (2, 2)), | ||
("lngauage", "language", 2, (2, 2)), # Doesn't *require* substitutions | ||
# | ||
# | ||
# with or without transpositions: | ||
# e.g. "wants" -S-> "sants" -S-> "swnts" -S-> "swits" -S-> "swims" -D-> "swim": 5 steps | ||
# | ||
# with substitution_cost=2 and transpositions: | ||
# e.g. "wants" -T-> "santw" -D-> "sntw" -D-> "stw" -D-> "sw" | ||
# -I-> "swi" -I-> "swim": 6 steps | ||
# | ||
# with substitution_cost=2 and no transpositions: | ||
# e.g. "wants" -I-> "swants" -D-> "swant" -D-> "swan" -D-> "swa" -D-> "sw" | ||
# -I-> "swi" -I-> "swim": 7 steps | ||
("wants", "swim", 1, (5, 5)), | ||
("wants", "swim", 2, (6, 7)), | ||
# | ||
# | ||
# with or without transpositions: | ||
# e.g. "kitten" -S-> "sitten" -s-> "sittin" -I-> "sitting": 3 steps | ||
# (but cost 5 if substitution_cost=2) | ||
("kitten", "sitting", 1, (3, 3)), | ||
("kitten", "sitting", 2, (5, 5)), | ||
], | ||
) | ||
def test_with_transpositions( | ||
self, left: str, right: str, substitution_cost: int, expecteds: Tuple[int, int] | ||
): | ||
"""Test `edit_distance` between two strings, given some `substitution_cost`, | ||
and whether transpositions are allowed. | ||
Args: | ||
left (str): First input string to `edit_distance`. | ||
right (str): Second input string to `edit_distance`. | ||
substitution_cost (int): The cost of a substitution action in `edit_distance`. | ||
expecteds (Tuple[int, int]): A tuple of expected outputs, such that `expecteds[0]` is | ||
the expected output with `transpositions=True`, and `expecteds[1]` is | ||
the expected output with `transpositions=False`. | ||
""" | ||
# Test the input strings in both orderings | ||
for s1, s2 in ((left, right), (right, left)): | ||
# zip with [True, False] to get the transpositions value | ||
for expected, transpositions in zip(expecteds, [True, False]): | ||
predicted = edit_distance( | ||
s1, | ||
s2, | ||
substitution_cost=substitution_cost, | ||
transpositions=transpositions, | ||
) | ||
assert predicted == expected |