# Patch commentary (precedes the first "diff --git" header; not part of any hunk).
#
# .gitignore
#   Adds ".hypothesis" to the "Test artifacts and coverage reports" entries.
#
# nltk/parse/bllip.py
#   _ensure_ascii: checks each token with word.encode("ascii") and catches
#   UnicodeEncodeError (previously word.decode / UnicodeDecodeError), and builds
#   the ValueError message with an f-string instead of %-formatting.
#   __init__: adds a trailing comma after **reranker_options.
#
# nltk/test/unit/test_bllip.py (new file)
#   Module-scoped "parser" fixture builds a BllipParser from the directory
#   found at "models/bllip_wsj_no_aux"; setup_module() calls
#   pytest.importorskip("bllipparser"). Two tests compare Tree.pformat()
#   output for parse() and tagged_parse().
#   The unused "parsed = parser.parse(...)" assignment in
#   test_tagged_parse_finds_matching_element is dropped, so the hunk adds
#   41 lines rather than 42.
#
# NOTE(review): indentation of the context lines in the bllip.py hunks was
# reconstructed from whitespace-collapsed text -- confirm against the target
# tree before applying.
# NOTE(review): the post-image blob id on the test file's "index" line was
# recorded for the 42-line version -- regenerate the patch if the id matters.
diff --git a/.gitignore b/.gitignore
index be11c74413..0a4bcc8975 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,6 +13,7 @@ web/_build
 # Test artifacts and coverage reports
 *.tox
 *.errs
+.hypothesis
 .noseids
 .coverage*
 nltk/test/*.html
diff --git a/nltk/parse/bllip.py b/nltk/parse/bllip.py
index e897513530..fdecfb3543 100644
--- a/nltk/parse/bllip.py
+++ b/nltk/parse/bllip.py
@@ -100,11 +100,11 @@ def _ensure_bllip_import_or_error(ie=ie):
 def _ensure_ascii(words):
     try:
         for i, word in enumerate(words):
-            word.decode("ascii")
-    except UnicodeDecodeError as e:
+            word.encode("ascii")
+    except UnicodeEncodeError as e:
         raise ValueError(
-            "Token %d (%r) is non-ASCII. BLLIP Parser "
-            "currently doesn't support non-ASCII inputs." % (i, word)
+            f"Token {i} ({word!r}) is non-ASCII. BLLIP Parser "
+            "currently doesn't support non-ASCII inputs."
         ) from e
 
 
@@ -163,7 +163,7 @@ def __init__(
             self.rrp.load_reranker_model(
                 features_filename=reranker_features,
                 weights_filename=reranker_weights,
-                **reranker_options
+                **reranker_options,
             )
 
     def parse(self, sentence):
diff --git a/nltk/test/unit/test_bllip.py b/nltk/test/unit/test_bllip.py
new file mode 100644
index 0000000000..09d992b9b2
--- /dev/null
+++ b/nltk/test/unit/test_bllip.py
@@ -0,0 +1,41 @@
+import pytest
+
+from nltk.data import find
+from nltk.parse.bllip import BllipParser
+from nltk.tree import Tree
+
+
+@pytest.fixture(scope="module")
+def parser():
+    model_dir = find("models/bllip_wsj_no_aux").path
+    return BllipParser.from_unified_model_dir(model_dir)
+
+
+def setup_module():
+    pytest.importorskip("bllipparser")
+
+
+class TestBllipParser:
+    def test_parser_loads_a_valid_tree(self, parser):
+        parsed = parser.parse("I saw the man with the telescope")
+        tree = next(parsed)
+
+        assert isinstance(tree, Tree)
+        assert (
+            tree.pformat()
+            == """
+(S1
+  (S
+    (NP (PRP I))
+    (VP
+      (VBD saw)
+      (NP (DT the) (NN man))
+      (PP (IN with) (NP (DT the) (NN telescope))))))
+""".strip()
+        )
+
+    def test_tagged_parse_finds_matching_element(self, parser):
+        tagged_tree = next(parser.tagged_parse([("telescope", "NN")]))
+
+        assert isinstance(tagged_tree, Tree)
+        assert tagged_tree.pformat() == "(S1 (NP (NN telescope)))"