Skip to content

Commit

Permalink
Fix decode error for bllip parser (#2897)
Browse files Browse the repository at this point in the history
* Fix decode error for bllip parser
* add bllipparser as a dependency to tests, otherwise skip
  • Loading branch information
dannysepler committed Dec 16, 2021
1 parent a28d256 commit 59aa3fb
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 5 deletions.
1 change: 1 addition & 0 deletions .gitignore
Expand Up @@ -13,6 +13,7 @@ web/_build
# Test artifacts and coverage reports
*.tox
*.errs
.hypothesis
.noseids
.coverage*
nltk/test/*.html
Expand Down
10 changes: 5 additions & 5 deletions nltk/parse/bllip.py
Expand Up @@ -100,11 +100,11 @@ def _ensure_bllip_import_or_error(ie=ie):
def _ensure_ascii(words):
try:
for i, word in enumerate(words):
word.decode("ascii")
except UnicodeDecodeError as e:
word.encode("ascii")
except UnicodeEncodeError as e:
raise ValueError(
"Token %d (%r) is non-ASCII. BLLIP Parser "
"currently doesn't support non-ASCII inputs." % (i, word)
f"Token {i} ({word!r}) is non-ASCII. BLLIP Parser "
"currently doesn't support non-ASCII inputs."
) from e


Expand Down Expand Up @@ -163,7 +163,7 @@ def __init__(
self.rrp.load_reranker_model(
features_filename=reranker_features,
weights_filename=reranker_weights,
**reranker_options
**reranker_options,
)

def parse(self, sentence):
Expand Down
42 changes: 42 additions & 0 deletions nltk/test/unit/test_bllip.py
@@ -0,0 +1,42 @@
import pytest

from nltk.data import find
from nltk.parse.bllip import BllipParser
from nltk.tree import Tree


@pytest.fixture(scope="module")
def parser():
    """Build one ``BllipParser`` for the whole test module.

    The ``models/bllip_wsj_no_aux`` resource is located through
    ``nltk.data.find`` and its path is handed to
    ``BllipParser.from_unified_model_dir``.
    """
    return BllipParser.from_unified_model_dir(
        find("models/bllip_wsj_no_aux").path
    )


def setup_module():
    """Skip this module's tests when ``bllipparser`` cannot be imported."""
    pytest.importorskip(modname="bllipparser")


class TestBllipParser:
    """Tests for ``BllipParser``, driven by the ``parser`` fixture."""

    def test_parser_loads_a_valid_tree(self, parser):
        """The first parse of a plain sentence is the expected ``Tree``."""
        parsed = parser.parse("I saw the man with the telescope")
        tree = next(parsed)

        # Written line by line so the nesting indentation that
        # ``Tree.pformat`` emits (two spaces per level) is explicit and
        # cannot be lost to surrounding whitespace changes.
        expected = "\n".join(
            [
                "(S1",
                "  (S",
                "    (NP (PRP I))",
                "    (VP",
                "      (VBD saw)",
                "      (NP (DT the) (NN man))",
                "      (PP (IN with) (NP (DT the) (NN telescope))))))",
            ]
        )

        assert isinstance(tree, Tree)
        assert tree.pformat() == expected

    def test_tagged_parse_finds_matching_element(self, parser):
        """A single pre-tagged token yields a tree that keeps that tag."""
        tagged_tree = next(parser.tagged_parse([("telescope", "NN")]))

        assert isinstance(tagged_tree, Tree)
        assert tagged_tree.pformat() == "(S1 (NP (NN telescope)))"

0 comments on commit 59aa3fb

Please sign in to comment.