Skip to content

Commit

Permalink
Partition tree.py module into tree package + pickle fix (#2863)
Browse files Browse the repository at this point in the history
* Resolved un-pickling issue with ParentedTrees for Python 3.7+

* Split tree.py up into its own module

* Fixed improper import from tree/probabilistic.py

* Import tree in the list of packages, instead of modules

* The label for a node doesn't need to be a str, it can be Any

* Renamed `nltk.tree.parse` to `nltk.tree.parsing` to avoid accidental overwrites

* Moved TreePrettyPrinter to tree package

The old import still works, but throws a warning. Perfect!

* Improved import issue with prettyprinter

* Moved treetransforms functions to tree package

* Heavily shrunk treetransforms.py

* Prevent shallow copy on ParentedTree - added doctest w. documentation

* Deprecate GhostScript in favor of svgling

See #2875
  • Loading branch information
tomaarsen committed Nov 12, 2021
1 parent 7d3d6a4 commit 68e4e58
Show file tree
Hide file tree
Showing 14 changed files with 2,019 additions and 1,682 deletions.
4 changes: 2 additions & 2 deletions nltk/__init__.py
Expand Up @@ -135,7 +135,6 @@ def _fake_Popen(*args, **kwargs):
from nltk.grammar import *
from nltk.probability import *
from nltk.text import *
from nltk.tree import *
from nltk.util import *
from nltk.jsontags import *

Expand All @@ -151,6 +150,7 @@ def _fake_Popen(*args, **kwargs):
from nltk.tag import *
from nltk.tokenize import *
from nltk.translate import *
from nltk.tree import *
from nltk.sem import *
from nltk.stem import *

Expand Down Expand Up @@ -200,7 +200,7 @@ def _fake_Popen(*args, **kwargs):
from nltk import ccg, chunk, classify, collocations
from nltk import data, featstruct, grammar, help, inference, metrics
from nltk import misc, parse, probability, sem, stem, wsd
from nltk import tag, tbl, text, tokenize, translate, tree, treetransforms, util
from nltk import tag, tbl, text, tokenize, translate, tree, util


# FIXME: override any accidentally imported demo, see https://github.com/nltk/nltk/issues/2116
Expand Down
78 changes: 72 additions & 6 deletions nltk/test/tree.doctest
Expand Up @@ -113,7 +113,7 @@ type:
>>> print(tree)
(VP (V enjoyed) (NP my cookie))
>>> print(type(tree))
<class 'nltk.tree.ImmutableTree'>
<class 'nltk.tree.immutable.ImmutableTree'>
>>> tree[1] = 'x'
Traceback (most recent call last):
. . .
Expand Down Expand Up @@ -345,7 +345,7 @@ Parented trees can be created from strings using the classmethod
>>> print(ptree)
(VP (VERB saw) (NP (DET the) (NOUN dog)))
>>> print(type(ptree))
<class 'nltk.tree.ParentedTree'>
<class 'nltk.tree.parented.ParentedTree'>

Parented trees can also be created by using the classmethod
`ParentedTree.convert` to convert another type of tree to a parented
Expand All @@ -356,7 +356,7 @@ tree:
>>> print(ptree)
(VP (VERB saw) (NP (DET the) (NOUN dog)))
>>> print(type(ptree))
<class 'nltk.tree.ParentedTree'>
<class 'nltk.tree.parented.ParentedTree'>

.. clean-up:

Expand Down Expand Up @@ -802,13 +802,59 @@ Test that a tree can not be given multiple parents:

[more to be written]

Shallow copying can be tricky for Tree and several of its subclasses.
For shallow copies of Tree, only the root node is reconstructed, while
all the children are shared between the two trees. Modify the children
of one tree - and the shallowly copied tree will also update.

>>> from nltk.tree import Tree, ParentedTree, MultiParentedTree
>>> tree = Tree.fromstring("(TOP (S (NP (NNP Bell,)) (NP (NP (DT a) (NN company)) (SBAR (WHNP (WDT which)) (S (VP (VBZ is) (VP (VBN based) (PP (IN in) (NP (NNP LA,)))))))) (VP (VBZ makes) (CC and) (VBZ distributes) (NP (NN computer))) (. products.)))")
>>> copy_tree = tree.copy(deep=False)
>>> tree == copy_tree # Ensure identical labels and nodes
True
>>> id(copy_tree[0]) == id(tree[0]) # Ensure shallow copy - the children are the same objects in memory
True

For ParentedTree objects, this behaviour is not possible. With a shallow
copy, the children of the root node would be reused for both the original
and the shallow copy. For this to be possible, some children would need
to have multiple parents. As this is forbidden for ParentedTree objects,
attempting to make a shallow copy will cause a warning, and a deep copy
is made instead.

>>> ptree = ParentedTree.fromstring("(TOP (S (NP (NNP Bell,)) (NP (NP (DT a) (NN company)) (SBAR (WHNP (WDT which)) (S (VP (VBZ is) (VP (VBN based) (PP (IN in) (NP (NNP LA,)))))))) (VP (VBZ makes) (CC and) (VBZ distributes) (NP (NN computer))) (. products.)))")
>>> copy_ptree = ptree.copy(deep=False)
>>> copy_ptree == ptree # Ensure identical labels and nodes
True
>>> id(copy_ptree[0]) != id(ptree[0]) # Shallow copying isn't supported - it defaults to deep copy.
True

For MultiParentedTree objects, the single-parent restriction that applies
to ParentedTree objects does not exist. Shallow copying a
MultiParentedTree gives the children of the root node two parents:
the original root and the newly copied root.

>>> mptree = MultiParentedTree.fromstring("(TOP (S (NP (NNP Bell,)) (NP (NP (DT a) (NN company)) (SBAR (WHNP (WDT which)) (S (VP (VBZ is) (VP (VBN based) (PP (IN in) (NP (NNP LA,)))))))) (VP (VBZ makes) (CC and) (VBZ distributes) (NP (NN computer))) (. products.)))")
>>> len(mptree[0].parents())
1
>>> copy_mptree = mptree.copy(deep=False)
>>> copy_mptree == mptree # Ensure identical labels and nodes
True
>>> len(mptree[0].parents())
2
>>> len(copy_mptree[0].parents())
2

Shallow copying a MultiParentedTree is similar to creating a second root
which is identically labeled as the root on which the copy method was called.


ImmutableParentedTree Regression Tests
--------------------------------------

>>> iptree = ImmutableParentedTree.convert(ptree)
>>> type(iptree)
<class 'nltk.tree.ImmutableParentedTree'>
<class 'nltk.tree.immutable.ImmutableParentedTree'>
>>> del iptree[0]
Traceback (most recent call last):
. . .
Expand Down Expand Up @@ -1110,7 +1156,7 @@ ImmutableMultiParentedTree Regression Tests

>>> imptree = ImmutableMultiParentedTree.convert(mptree)
>>> type(imptree)
<class 'nltk.tree.ImmutableMultiParentedTree'>
<class 'nltk.tree.immutable.ImmutableMultiParentedTree'>
>>> del imptree[0]
Traceback (most recent call last):
. . .
Expand All @@ -1137,7 +1183,7 @@ ProbabilisticTree Regression Tests

>>> imprtree = ImmutableProbabilisticTree.convert(prtree)
>>> type(imprtree)
<class 'nltk.tree.ImmutableProbabilisticTree'>
<class 'nltk.tree.immutable.ImmutableProbabilisticTree'>
>>> del imprtree[0]
Traceback (most recent call last):
. . .
Expand All @@ -1155,3 +1201,23 @@ This used to discard the ``(B b)`` subtree (fixed in svn 6270):

>>> print(Tree.fromstring('((A a) (B b))'))
( (A a) (B b))

Pickling ParentedTree instances didn't work for Python 3.7 onwards (See #2478)

>>> import pickle
>>> tree = ParentedTree.fromstring('(S (NN x) (NP x) (NN x))')
>>> print(tree)
(S (NN x) (NP x) (NN x))

>>> pickled = pickle.dumps(tree)
>>> tree_loaded = pickle.loads(pickled)
>>> print(tree_loaded)
(S (NN x) (NP x) (NN x))

ParentedTree used to be impossible to (deep)copy. (See #1324)

>>> from nltk.tree import ParentedTree
>>> import copy
>>> tree = ParentedTree.fromstring("(TOP (S (NP (NNP Bell,)) (NP (NP (DT a) (NN company)) (SBAR (WHNP (WDT which)) (S (VP (VBZ is) (VP (VBN based) (PP (IN in) (NP (NNP LA,)))))))) (VP (VBZ makes) (CC and) (VBZ distributes) (NP (NN computer))) (. products.)))")
>>> tree == copy.deepcopy(tree) == copy.copy(tree) == tree.copy(deep=True) == tree.copy()
True
61 changes: 56 additions & 5 deletions nltk/test/treeprettyprinter.doctest
@@ -1,12 +1,11 @@
.. Copyright (C) 2001-2021 NLTK Project
.. For license information, see LICENSE.TXT

========================================================
Unit tests for nltk.treeprettyprinter.TreePrettyPrinter
========================================================
=========================================================
Unit tests for nltk.tree.prettyprinter.TreePrettyPrinter
=========================================================

>>> from nltk.tree import Tree
>>> from nltk.treeprettyprinter import TreePrettyPrinter
>>> from nltk.tree import Tree, TreePrettyPrinter

Tree nr 2170 from nltk.corpus.treebank:

Expand Down Expand Up @@ -124,3 +123,55 @@ A discontinuous tree:
noun verb prep det noun verb verb verb punct verb vg verb punct
│ │ │ │ │ │ │ │ │ │ │ │ │
Ze had met haar moeder kunnen gaan winkelen , zwemmen of terrassen .

Importing TreePrettyPrinter
---------------------------

First of all, a simple tree will be constructed::

>>> from nltk.tree import Tree
>>> tree = Tree.fromstring('(S (NP Mary) (VP walks))')

We'll use this sample tree to show that each method of importing `TreePrettyPrinter` works correctly:

- Recommended::

>>> from nltk.tree import TreePrettyPrinter
>>> print(TreePrettyPrinter(tree).text())
S
____|____
NP VP
| |
Mary walks

- Alternative but valid options::

>>> from nltk import TreePrettyPrinter
>>> print(TreePrettyPrinter(tree).text())
S
____|____
NP VP
| |
Mary walks

>>> from nltk.tree.prettyprinter import TreePrettyPrinter
>>> print(TreePrettyPrinter(tree).text())
S
____|____
NP VP
| |
Mary walks

- Deprecated, do not use::

>>> from nltk.treeprettyprinter import TreePrettyPrinter
>>> print(TreePrettyPrinter(tree).text())
S
____|____
NP VP
| |
Mary walks

This method will throw a DeprecationWarning::

Import `TreePrettyPrinter` using `from nltk.tree import TreePrettyPrinter` instead.
3 changes: 1 addition & 2 deletions nltk/test/treetransforms.doctest
Expand Up @@ -6,8 +6,7 @@ Unit tests for the TreeTransformation class
-------------------------------------------

>>> from copy import deepcopy
>>> from nltk.tree import *
>>> from nltk.treetransforms import *
>>> from nltk.tree import Tree, collapse_unary, chomsky_normal_form, un_chomsky_normal_form

>>> tree_string = "(TOP (S (S (VP (VBN Turned) (ADVP (RB loose)) (PP (IN in) (NP (NP (NNP Shane) (NNP Longman) (POS 's)) (NN trading) (NN room))))) (, ,) (NP (DT the) (NN yuppie) (NNS dealers)) (VP (AUX do) (NP (NP (RB little)) (ADJP (RB right)))) (. .)))"

Expand Down
35 changes: 35 additions & 0 deletions nltk/tree/__init__.py
@@ -0,0 +1,35 @@
# Natural Language Toolkit: Tree Package
#
# Copyright (C) 2001-2021 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# Peter Ljunglöf <peter.ljunglof@gu.se>
# Tom Aarsen <>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
NLTK Tree Package
This package may be used for representing hierarchical language
structures, such as syntax trees and morphological trees.
"""

# TODO: add LabelledTree (can be used for dependency trees)

from nltk.tree.immutable import (
ImmutableMultiParentedTree,
ImmutableParentedTree,
ImmutableProbabilisticTree,
ImmutableTree,
)
from nltk.tree.parented import MultiParentedTree, ParentedTree
from nltk.tree.parsing import bracket_parse, sinica_parse
from nltk.tree.prettyprinter import TreePrettyPrinter
from nltk.tree.probabilistic import ProbabilisticTree
from nltk.tree.transforms import (
chomsky_normal_form,
collapse_unary,
un_chomsky_normal_form,
)
from nltk.tree.tree import Tree
124 changes: 124 additions & 0 deletions nltk/tree/immutable.py
@@ -0,0 +1,124 @@
# Natural Language Toolkit: Text Trees
#
# Copyright (C) 2001-2021 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# Peter Ljunglöf <peter.ljunglof@gu.se>
# Tom Aarsen <>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

from nltk.probability import ProbabilisticMixIn
from nltk.tree.parented import MultiParentedTree, ParentedTree
from nltk.tree.tree import Tree


class ImmutableTree(Tree):
    """
    A ``Tree`` whose label and children cannot be changed after construction.

    The hash of the tree is computed once in ``__init__`` from the label and
    the children, so every operation that would alter either of them raises
    ``ValueError`` instead of silently invalidating that hash.
    """

    def __init__(self, node, children=None):
        """
        Build the tree and precompute its hash.

        :param node: The node label.
        :param children: The child nodes, passed through to ``Tree.__init__``.
        :raise ValueError: If the label or any child is not hashable.
        """
        super().__init__(node, children)
        # Precompute our hash value.  This ensures that we're really
        # immutable.  It also means we only have to calculate it once.
        try:
            self._hash = hash((self._label, tuple(self)))
        except (TypeError, ValueError) as e:
            raise ValueError(
                "%s: node value and children must be immutable" % type(self).__name__
            ) from e

    def __setitem__(self, index, value):
        raise ValueError("%s may not be modified" % type(self).__name__)

    # ``__setslice__`` / ``__delslice__`` are legacy slice hooks; they are
    # kept so that explicit calls to them are rejected as well.
    def __setslice__(self, i, j, value):
        raise ValueError("%s may not be modified" % type(self).__name__)

    def __delitem__(self, index):
        raise ValueError("%s may not be modified" % type(self).__name__)

    def __delslice__(self, i, j):
        raise ValueError("%s may not be modified" % type(self).__name__)

    def __iadd__(self, other):
        raise ValueError("%s may not be modified" % type(self).__name__)

    def __imul__(self, other):
        raise ValueError("%s may not be modified" % type(self).__name__)

    def append(self, v):
        raise ValueError("%s may not be modified" % type(self).__name__)

    def extend(self, v):
        raise ValueError("%s may not be modified" % type(self).__name__)

    def insert(self, index, value):
        # Without this override the inherited list ``insert`` would add a
        # child and leave the precomputed hash out of date.
        raise ValueError("%s may not be modified" % type(self).__name__)

    def clear(self):
        # Without this override the inherited list ``clear`` would drop all
        # children and leave the precomputed hash out of date.
        raise ValueError("%s may not be modified" % type(self).__name__)

    def pop(self, v=None):
        raise ValueError("%s may not be modified" % type(self).__name__)

    def remove(self, v):
        raise ValueError("%s may not be modified" % type(self).__name__)

    def reverse(self):
        raise ValueError("%s may not be modified" % type(self).__name__)

    def sort(self, *, key=None, reverse=False):
        # Accept the same keyword arguments as ``list.sort`` so that callers
        # get the documented ValueError rather than a TypeError.
        raise ValueError("%s may not be modified" % type(self).__name__)

    def __hash__(self):
        """Return the hash value that was precomputed in ``__init__``."""
        return self._hash

    def set_label(self, value):
        """
        Set the node label.  This will only succeed the first time the
        node label is set, which should occur in ImmutableTree.__init__().

        :param value: The label to assign.
        :raise ValueError: If a label has already been assigned.
        """
        if hasattr(self, "_label"):
            raise ValueError("%s may not be modified" % type(self).__name__)
        self._label = value


class ImmutableProbabilisticTree(ImmutableTree, ProbabilisticMixIn):
    """
    An ``ImmutableTree`` that additionally carries a probability.

    The probability is handled by ``ProbabilisticMixIn`` and takes part in
    the precomputed hash of the tree.
    """

    def __init__(self, node, children=None, **prob_kwargs):
        """
        :param node: The node label.
        :param children: The child nodes.
        :param prob_kwargs: Keyword arguments forwarded unchanged to
            ``ProbabilisticMixIn.__init__``.
        """
        ImmutableTree.__init__(self, node, children)
        ProbabilisticMixIn.__init__(self, **prob_kwargs)
        # Replace the hash set by ImmutableTree.__init__ with one that also
        # covers the probability.
        hash_key = (self._label, tuple(self), self.prob())
        self._hash = hash(hash_key)

    # We have to patch up these methods to make them work right:
    def _frozen_class(self):
        # NOTE(review): presumably consulted by the base Tree when freezing;
        # confirm against nltk.tree.tree.
        return ImmutableProbabilisticTree

    def __repr__(self):
        tree_repr = Tree.__repr__(self)
        return f"{tree_repr} [{self.prob()}]"

    def __str__(self):
        formatted = self.pformat(margin=60)
        return f"{formatted} [{self.prob()}]"

    def copy(self, deep=False):
        """
        Return a copy of this tree.

        :param deep: If true, rebuild the whole tree through ``convert``;
            otherwise construct a new root of the same type that reuses
            this tree's children and probability.
        """
        tree_type = type(self)
        if deep:
            return tree_type.convert(self)
        return tree_type(self._label, self, prob=self.prob())

    @classmethod
    def convert(cls, val):
        """
        Recursively convert ``val`` into an instance of ``cls``.

        Values that are not ``Tree`` instances are returned unchanged.  A
        tree that is not a ``ProbabilisticMixIn`` is given probability 1.0.
        """
        if not isinstance(val, Tree):
            return val
        children = [cls.convert(child) for child in val]
        prob = val.prob() if isinstance(val, ProbabilisticMixIn) else 1.0
        return cls(val._label, children, prob=prob)


class ImmutableParentedTree(ImmutableTree, ParentedTree):
    """
    A ``ParentedTree`` that may not be modified after construction.

    ``ImmutableTree`` is listed first among the bases, so the mutating
    methods it overrides take precedence and raise ``ValueError``.
    """


class ImmutableMultiParentedTree(ImmutableTree, MultiParentedTree):
    """
    A ``MultiParentedTree`` that may not be modified after construction.

    ``ImmutableTree`` is listed first among the bases, so the mutating
    methods it overrides take precedence and raise ``ValueError``.
    """


__all__ = [
"ImmutableProbabilisticTree",
"ImmutableTree",
"ImmutableParentedTree",
"ImmutableMultiParentedTree",
]

0 comments on commit 68e4e58

Please sign in to comment.