Skip to content

Commit

Permalink
Allow empty string in CFG's + more (#2888)
Browse files Browse the repository at this point in the history
* Allow for empty string terminals

* Move toy pcfg's into pcfg_demo

* Deal with removing toy_pcfg1 and 2 from nltk/grammar.py

* Added doctests for empty strings/productions and their consequences

* Reworked RecursionError

* Add doctest for generate RecursionError

* Fixed issue in incorrect explanation of doctest

* Add as contributor
  • Loading branch information
tomaarsen committed Nov 18, 2021
1 parent 68e4e58 commit 7fb092a
Show file tree
Hide file tree
Showing 7 changed files with 161 additions and 53 deletions.
86 changes: 43 additions & 43 deletions nltk/grammar.py
Expand Up @@ -5,6 +5,7 @@
# Edward Loper <edloper@gmail.com>
# Jason Narad <jason.narad@gmail.com>
# Peter Ljunglöf <peter.ljunglof@heatherleaf.se>
# Tom Aarsen <>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
#
Expand Down Expand Up @@ -1317,7 +1318,7 @@ def _read_fcfg_production(input, fstruct_reader):

_ARROW_RE = re.compile(r"\s* -> \s*", re.VERBOSE)
_PROBABILITY_RE = re.compile(r"( \[ [\d\.]+ \] ) \s*", re.VERBOSE)
_TERMINAL_RE = re.compile(r'( "[^"]+" | \'[^\']+\' ) \s*', re.VERBOSE)
_TERMINAL_RE = re.compile(r'( "[^"]*" | \'[^\']*\' ) \s*', re.VERBOSE)
_DISJUNCTION_RE = re.compile(r"\| \s*", re.VERBOSE)


Expand Down Expand Up @@ -1531,48 +1532,6 @@ def cfg_demo():
print()


toy_pcfg1 = PCFG.fromstring(
"""
S -> NP VP [1.0]
NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
Det -> 'the' [0.8] | 'my' [0.2]
N -> 'man' [0.5] | 'telescope' [0.5]
VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
V -> 'ate' [0.35] | 'saw' [0.65]
PP -> P NP [1.0]
P -> 'with' [0.61] | 'under' [0.39]
"""
)

toy_pcfg2 = PCFG.fromstring(
"""
S -> NP VP [1.0]
VP -> V NP [.59]
VP -> V [.40]
VP -> VP PP [.01]
NP -> Det N [.41]
NP -> Name [.28]
NP -> NP PP [.31]
PP -> P NP [1.0]
V -> 'saw' [.21]
V -> 'ate' [.51]
V -> 'ran' [.28]
N -> 'boy' [.11]
N -> 'cookie' [.12]
N -> 'table' [.13]
N -> 'telescope' [.14]
N -> 'hill' [.5]
Name -> 'Jack' [.52]
Name -> 'Bob' [.48]
P -> 'with' [.61]
P -> 'under' [.39]
Det -> 'the' [.41]
Det -> 'a' [.31]
Det -> 'my' [.28]
"""
)


def pcfg_demo():
"""
A demonstration showing how a ``PCFG`` can be created and used.
Expand All @@ -1582,6 +1541,47 @@ def pcfg_demo():
from nltk.corpus import treebank
from nltk.parse import pchart

toy_pcfg1 = PCFG.fromstring(
"""
S -> NP VP [1.0]
NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
Det -> 'the' [0.8] | 'my' [0.2]
N -> 'man' [0.5] | 'telescope' [0.5]
VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
V -> 'ate' [0.35] | 'saw' [0.65]
PP -> P NP [1.0]
P -> 'with' [0.61] | 'under' [0.39]
"""
)

toy_pcfg2 = PCFG.fromstring(
"""
S -> NP VP [1.0]
VP -> V NP [.59]
VP -> V [.40]
VP -> VP PP [.01]
NP -> Det N [.41]
NP -> Name [.28]
NP -> NP PP [.31]
PP -> P NP [1.0]
V -> 'saw' [.21]
V -> 'ate' [.51]
V -> 'ran' [.28]
N -> 'boy' [.11]
N -> 'cookie' [.12]
N -> 'table' [.13]
N -> 'telescope' [.14]
N -> 'hill' [.5]
Name -> 'Jack' [.52]
Name -> 'Bob' [.48]
P -> 'with' [.61]
P -> 'under' [.39]
Det -> 'the' [.41]
Det -> 'a' [.31]
Det -> 'my' [.28]
"""
)

pcfg_prods = toy_pcfg1.productions()

pcfg_prod = pcfg_prods[2]
Expand Down
13 changes: 5 additions & 8 deletions nltk/parse/generate.py
Expand Up @@ -42,14 +42,11 @@ def _generate_all(grammar, items, depth):
for frag1 in _generate_one(grammar, items[0], depth):
for frag2 in _generate_all(grammar, items[1:], depth):
yield frag1 + frag2
except RuntimeError as _error:
if _error.message == "maximum recursion depth exceeded":
# Helpful error message while still showing the recursion stack.
raise RuntimeError(
"The grammar has rule(s) that yield infinite recursion!!"
) from _error
else:
raise
except RecursionError as error:
# Helpful error message while still showing the recursion stack.
raise RuntimeError(
"The grammar has rule(s) that yield infinite recursion!"
) from error
else:
yield []

Expand Down
1 change: 1 addition & 0 deletions nltk/parse/util.py
@@ -1,6 +1,7 @@
# Natural Language Toolkit: Parser Utility Functions
#
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# Tom Aarsen <>
#
# Copyright (C) 2001-2021 NLTK Project
# URL: <https://www.nltk.org/>
Expand Down
43 changes: 42 additions & 1 deletion nltk/parse/viterbi.py
Expand Up @@ -337,9 +337,50 @@ def demo():
import time

from nltk import tokenize
from nltk.grammar import toy_pcfg1, toy_pcfg2
from nltk.grammar import PCFG
from nltk.parse import ViterbiParser

toy_pcfg1 = PCFG.fromstring(
"""
S -> NP VP [1.0]
NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
Det -> 'the' [0.8] | 'my' [0.2]
N -> 'man' [0.5] | 'telescope' [0.5]
VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
V -> 'ate' [0.35] | 'saw' [0.65]
PP -> P NP [1.0]
P -> 'with' [0.61] | 'under' [0.39]
"""
)

toy_pcfg2 = PCFG.fromstring(
"""
S -> NP VP [1.0]
VP -> V NP [.59]
VP -> V [.40]
VP -> VP PP [.01]
NP -> Det N [.41]
NP -> Name [.28]
NP -> NP PP [.31]
PP -> P NP [1.0]
V -> 'saw' [.21]
V -> 'ate' [.51]
V -> 'ran' [.28]
N -> 'boy' [.11]
N -> 'cookie' [.12]
N -> 'table' [.13]
N -> 'telescope' [.14]
N -> 'hill' [.5]
Name -> 'Jack' [.52]
Name -> 'Bob' [.48]
P -> 'with' [.61]
P -> 'under' [.39]
Det -> 'the' [.41]
Det -> 'a' [.31]
Det -> 'my' [.28]
"""
)

# Define two demos. Each demo has a sentence and a grammar.
demos = [
("I saw the man with my telescope", toy_pcfg1),
Expand Down
12 changes: 12 additions & 0 deletions nltk/test/generate.doctest
Expand Up @@ -64,3 +64,15 @@ The number of sentences of different max depths:
114
>>> len(list(generate(grammar)))
114

Infinite grammars will raise a RuntimeError (chained from the underlying RecursionError) when not bounded by some ``depth``:

>>> grammar = CFG.fromstring("""
... S -> A B
... A -> B
... B -> "b" | A
... """)
>>> list(generate(grammar))
Traceback (most recent call last):
...
RuntimeError: The grammar has rule(s) that yield infinite recursion!
21 changes: 21 additions & 0 deletions nltk/test/grammar.doctest
Expand Up @@ -46,3 +46,24 @@ Chomsky Normal Form grammar (Test for bug 474)
>>> g = CFG.fromstring("VP^<TOP> -> VBP NP^<VP-TOP>")
>>> g.productions()[0].lhs()
VP^<TOP>

Grammars can contain both empty strings and empty productions:

>>> from nltk.grammar import CFG
>>> from nltk.parse.generate import generate
>>> grammar = CFG.fromstring("""
... S -> A B
... A -> 'a'
... # An empty string:
... B -> 'b' | ''
... """)
>>> list(generate(grammar))
[['a', 'b'], ['a', '']]
>>> grammar = CFG.fromstring("""
... S -> A B
... A -> 'a'
... # An empty production:
... B -> 'b' |
... """)
>>> list(generate(grammar))
[['a', 'b'], ['a']]
38 changes: 37 additions & 1 deletion nltk/test/parse.doctest
Expand Up @@ -545,7 +545,43 @@ Unit tests for the Probabilistic CFG class

>>> from nltk.corpus import treebank
>>> from itertools import islice
>>> from nltk.grammar import PCFG, induce_pcfg, toy_pcfg1, toy_pcfg2
>>> from nltk.grammar import PCFG, induce_pcfg
>>> toy_pcfg1 = PCFG.fromstring("""
... S -> NP VP [1.0]
... NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
... Det -> 'the' [0.8] | 'my' [0.2]
... N -> 'man' [0.5] | 'telescope' [0.5]
... VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
... V -> 'ate' [0.35] | 'saw' [0.65]
... PP -> P NP [1.0]
... P -> 'with' [0.61] | 'under' [0.39]
... """)

>>> toy_pcfg2 = PCFG.fromstring("""
... S -> NP VP [1.0]
... VP -> V NP [.59]
... VP -> V [.40]
... VP -> VP PP [.01]
... NP -> Det N [.41]
... NP -> Name [.28]
... NP -> NP PP [.31]
... PP -> P NP [1.0]
... V -> 'saw' [.21]
... V -> 'ate' [.51]
... V -> 'ran' [.28]
... N -> 'boy' [.11]
... N -> 'cookie' [.12]
... N -> 'table' [.13]
... N -> 'telescope' [.14]
... N -> 'hill' [.5]
... Name -> 'Jack' [.52]
... Name -> 'Bob' [.48]
... P -> 'with' [.61]
... P -> 'under' [.39]
... Det -> 'the' [.41]
... Det -> 'a' [.31]
... Det -> 'my' [.28]
... """)

Create a set of PCFG productions.

Expand Down

0 comments on commit 7fb092a

Please sign in to comment.