Removed hundreds of formatting warnings for nltk.org (#2859)
* Removed 500+ warnings when building website documentation

* Improved formatting for website news titles

* Updated some of the IBM documentation to include description lists
tomaarsen committed Oct 19, 2021
1 parent bec8910 commit 4a130f1
Showing 89 changed files with 879 additions and 776 deletions.
6 changes: 4 additions & 2 deletions nltk/chunk/regexp.py
@@ -78,12 +78,14 @@ def __init__(self, chunk_struct, debug_level=1):
:param debug_level: The level of debugging which should be
applied to transformations on the ``ChunkString``. The
valid levels are:
- 0: no checks
- 1: full check on to_chunkstruct
- 2: full check on to_chunkstruct and cursory check after
  each transformation.
- 3: full check on to_chunkstruct and full check after
  each transformation.
We recommend you use at least level 1. You should
probably use level 3 if you use any non-standard
subclasses of ``RegexpChunkRule``.
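A minimal usage sketch (not part of this diff; the tree and tags are illustrative, and ``ChunkString`` is normally driven by ``RegexpChunkParser`` rather than used directly)::

    from nltk import Tree
    from nltk.chunk.regexp import ChunkString

    # A flat tree of (word, tag) leaves, as produced by a tagger.
    tagged = Tree('S', [('the', 'DT'), ('little', 'JJ'), ('cat', 'NN')])

    # debug_level=3: full check on to_chunkstruct and after each transformation.
    cs = ChunkString(tagged, debug_level=3)
    print(cs.to_chunkstruct())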
15 changes: 8 additions & 7 deletions nltk/classify/senna.py
@@ -21,19 +21,20 @@
misalignment errors.
The input is:
- path to the directory that contains SENNA executables. If the path is incorrect,
  Senna will automatically search for the executable file specified in the SENNA environment variable
- List of the operations needed to be performed.
- (optionally) the encoding of the input data (default: utf-8)
Note: Unit tests for this module can be found in test/unit/test_senna.py
>>> from nltk.classify import Senna
>>> pipeline = Senna('/usr/share/senna-v3.0', ['pos', 'chk', 'ner'])
>>> sent = 'Dusseldorf is an international business center'.split()
>>> [(token['word'], token['chk'], token['ner'], token['pos']) for token in pipeline.tag(sent)] # doctest: +SKIP
[('Dusseldorf', 'B-NP', 'B-LOC', 'NNP'), ('is', 'B-VP', 'O', 'VBZ'), ('an', 'B-NP', 'O', 'DT'),
('international', 'I-NP', 'O', 'JJ'), ('business', 'I-NP', 'O', 'NN'), ('center', 'I-NP', 'O', 'NN')]
"""

from os import environ, path, sep
10 changes: 6 additions & 4 deletions nltk/cluster/__init__.py
@@ -47,10 +47,11 @@
not significantly increase.
They all extend the ClusterI interface which defines common operations
-available with each clusterer. These operations include.
+available with each clusterer. These operations include:

- cluster: clusters a sequence of vectors
- classify: assign a vector to a cluster
- classification_probdist: give the probability distribution over cluster memberships
The current existing classifiers also extend cluster.VectorSpace, an
abstract class which allows for singular value decomposition (SVD) and vector
@@ -61,6 +62,7 @@
hypersphere.
Usage example (see also demo())::
from nltk import cluster
from nltk.cluster import euclidean_distance
from numpy import array
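The example above is truncated here; a runnable sketch along the same lines (the sample vectors and cluster count are illustrative)::

    from numpy import array
    from nltk import cluster
    from nltk.cluster import euclidean_distance

    vectors = [array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0]]]

    # cluster: fit two means and assign each vector to one of them
    clusterer = cluster.KMeansClusterer(2, euclidean_distance)
    assignments = clusterer.cluster(vectors, assign_clusters=True)

    # classify: place a new vector into an existing cluster
    print(assignments, clusterer.classify(array([3, 3])))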
3 changes: 2 additions & 1 deletion nltk/cluster/util.py
@@ -125,7 +125,7 @@ def euclidean_distance(u, v):
def cosine_distance(u, v):
"""
Returns 1 minus the cosine of the angle between vectors v and u. This is
-    equal to 1 - (u.v / |u||v|).
+    equal to ``1 - (u.v / |u||v|)``.
"""
return 1 - (numpy.dot(u, v) / (sqrt(numpy.dot(u, u)) * sqrt(numpy.dot(v, v))))
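For instance, identical vectors are at distance 0 and orthogonal vectors at distance 1::

    from numpy import array
    from nltk.cluster.util import cosine_distance

    print(cosine_distance(array([1, 0]), array([1, 0])))  # 0.0
    print(cosine_distance(array([1, 0]), array([0, 1])))  # 1.0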

@@ -221,6 +221,7 @@ def groups(self, n):
def show(self, leaf_labels=[]):
"""
Print the dendrogram in ASCII art to standard out.
:param leaf_labels: an optional list of strings to use for labeling the
leaves
:type leaf_labels: list
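A sketch of how a dendrogram typically gets printed, via the group-average agglomerative clusterer (data and labels are illustrative)::

    from numpy import array
    from nltk.cluster import GAAClusterer

    vectors = [array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0]]]
    clusterer = GAAClusterer(2)
    clusterer.cluster(vectors, True)

    # Label the four leaves of the dendrogram in input order.
    clusterer.dendrogram().show(leaf_labels=['a', 'b', 'c', 'd'])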
3 changes: 2 additions & 1 deletion nltk/corpus/reader/api.py
@@ -52,6 +52,7 @@ def __init__(self, root, fileids, encoding="utf8", tagset=None):
:param encoding: The default unicode encoding for the files
that make up the corpus. The value of ``encoding`` can be any
of the following:
- A string: ``encoding`` is the encoding name for all files.
- A dictionary: ``encoding[file_id]`` is the encoding
name for the file whose identifier is ``file_id``. If
@@ -67,7 +68,7 @@ def __init__(self, root, fileids, encoding="utf8", tagset=None):
processed using non-unicode byte strings.
:param tagset: The name of the tagset used by this corpus, to be used
for normalizing or converting the POS tags returned by the
-    tagged_...() methods.
+    ``tagged_...()`` methods.
"""
# Convert the root to a path pointer, if necessary.
if isinstance(root, str) and not isinstance(root, PathPointer):
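For example, with a plaintext reader (the corpus path and file pattern are hypothetical)::

    from nltk.corpus.reader import PlaintextCorpusReader

    # A single string applies one encoding to every file in the corpus.
    reader = PlaintextCorpusReader('/path/to/corpus', r'.*\.txt', encoding='latin-1')
    print(reader.words())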
16 changes: 8 additions & 8 deletions nltk/corpus/reader/bracket_parse.py
@@ -46,12 +46,12 @@ def __init__(
:param comment_char: The character which can appear at the start of
a line to indicate that the rest of the line is a comment.
:param detect_blocks: The method that is used to find blocks
    in the corpus; can be 'unindented_paren' (every unindented
    parenthesis starts a new parse) or 'sexpr' (brackets are
    matched).
:param tagset: The name of the tagset used by this corpus, to be used
    for normalizing or converting the POS tags returned by the
-    tagged_...() methods.
+    ``tagged_...()`` methods.
"""
# FIXME: Why is it inheriting from SyntaxCorpusReader but initializing
# from CorpusReader?
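A sketch of constructing the reader directly (root and file pattern are hypothetical)::

    from nltk.corpus.reader import BracketParseCorpusReader

    reader = BracketParseCorpusReader(
        '/path/to/treebank', r'.*\.mrg',
        detect_blocks='unindented_paren',  # every unindented '(' starts a parse
    )
    print(reader.parsed_sents()[0])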
@@ -167,10 +167,10 @@ def parsed_paras(self, fileids=None, categories=None):
class AlpinoCorpusReader(BracketParseCorpusReader):
"""
Reader for the Alpino Dutch Treebank.
-This corpus has a lexical breakdown structure embedded, as read by _parse
+This corpus has a lexical breakdown structure embedded, as read by `_parse`
Unfortunately this puts punctuation and some other words out of the sentence
-order in the xml element tree. This is no good for tag_ and word_
-_tag and _word will be overridden to use a non-default new parameter 'ordered'
+order in the xml element tree. This is no good for `tag_` and `word_`
+`_tag` and `_word` will be overridden to use a non-default new parameter 'ordered'
to the overridden _normalize function. The _parse function can then remain
untouched.
"""
4 changes: 2 additions & 2 deletions nltk/corpus/reader/cmudict.py
@@ -54,7 +54,7 @@ class CMUDictCorpusReader(CorpusReader):
def entries(self):
"""
:return: the cmudict lexicon as a list of entries
    containing (word, transcriptions) tuples.
"""
return concat(
[
@@ -72,7 +72,7 @@ def words(self):
def dict(self):
"""
:return: the cmudict lexicon as a dictionary, whose keys are
    lowercase words and whose values are lists of pronunciations.
"""
return dict(Index(self.entries()))
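Both views are available through the bundled loader (assumes the cmudict data is installed)::

    from nltk.corpus import cmudict

    pron = cmudict.dict()          # word -> list of transcriptions
    print(pron['fire'])            # both pronunciations of 'fire'
    print(cmudict.entries()[:2])   # (word, transcription) pairs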

21 changes: 13 additions & 8 deletions nltk/corpus/reader/framenet.py
@@ -772,7 +772,7 @@ class AttrDict(dict):

"""A class that wraps a dict and allows accessing the keys of the
dict as if they were attributes. Taken from here:
    https://stackoverflow.com/a/14620633/8879
>>> foo = {'a':1, 'b':2, 'c':3}
>>> bar = AttrDict(foo)
@@ -1350,8 +1350,7 @@ def doc(self, fn_docid):
- 'frameID' : (only if status is 'MANUAL')
- 'frameName': (only if status is 'MANUAL')
- 'layer' : a list of labels for the layer
-    - Each item in the layer is a dict containing the
-      following keys:
+    - Each item in the layer is a dict containing the following keys:
- '_type': 'layer'
- 'rank'
- 'name'
@@ -1533,6 +1532,7 @@ def frame(self, fn_fid_or_fname, ignorekeys=[]):
- 'FE' : a dict containing the Frame Elements that are part of this frame
The keys in this dict are the names of the FEs (e.g. 'Body_system')
and the values are dicts containing the following keys
- 'definition' : The definition of the FE
- 'name' : The name of the FE e.g. 'Body_system'
- 'ID' : The id number
@@ -1706,19 +1706,24 @@ def lu(self, fn_luid, ignorekeys=[], luName=None, frameID=None, frameName=None):
- 'lexemes' : a list of dicts describing the lemma of this LU.
Each dict in the list contains these keys:
- 'POS' : part of speech e.g. 'N'
- 'name' : either single-lexeme e.g. 'merger' or
multi-lexeme e.g. 'a little'
- 'order': the order of the lexeme in the lemma (starting from 1)
- 'headword': a boolean ('true' or 'false')
- 'breakBefore': Can this lexeme be separated from the previous lexeme?
Consider: "take over.v" as in:
Consider: "take over.v" as in::
Germany took over the Netherlands in 2 days.
Germany took the Netherlands over in 2 days.
In this case, 'breakBefore' would be "true" for the lexeme
"over". Contrast this with "take after.v" as in:
"over". Contrast this with "take after.v" as in::
Mary takes after her grandmother.
*Mary takes her grandmother after.
In this case, 'breakBefore' would be "false" for the lexeme "after"
- 'lemmaID' : Can be used to connect lemmas in different LUs
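A sketch of inspecting these lexeme keys (assumes the framenet_v15 data is installed; LU 256 is 'foresee.v' in FrameNet 1.5)::

    from nltk.corpus import framenet as fn

    lu = fn.lu(256)
    for lexeme in lu.lexemes:
        print(lexeme.name, lexeme.POS, lexeme.breakBefore, lexeme.headword)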
@@ -2518,11 +2523,11 @@ def frame_relation_types(self):
def frame_relations(self, frame=None, frame2=None, type=None):
"""
:param frame: (optional) frame object, name, or ID; only relations involving
    this frame will be returned
:param frame2: (optional; 'frame' must be a different frame) only show relations
    between the two specified frames, in either direction
:param type: (optional) frame relation type (name or object); show only relations
    of this type
:type frame: int or str or AttrDict
:return: A list of all of the frame relations in framenet
:rtype: list(dict)
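For instance (the frame and relation-type names are illustrative; assumes the FrameNet data is installed)::

    from nltk.corpus import framenet as fn

    # All Inheritance relations involving the 'Causation' frame.
    for rel in fn.frame_relations(frame='Causation', type='Inheritance'):
        print(rel)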
2 changes: 1 addition & 1 deletion nltk/corpus/reader/ieer.py
@@ -14,7 +14,7 @@
This corpus contains the NEWSWIRE development test data for the
NIST 1999 IE-ER Evaluation. The files were taken from the
-subdirectory: /ie_er_99/english/devtest/newswire/*.ref.nwt
+subdirectory: ``/ie_er_99/english/devtest/newswire/*.ref.nwt``
and filenames were shortened.
The corpus contains the following files: APW_19980314, APW_19980424,
1 change: 0 additions & 1 deletion nltk/corpus/reader/knbc.py
@@ -38,7 +38,6 @@ class KNBCorpusReader(SyntaxCorpusReader):
tags = (surface, reading, lemma, pos1, posid1, pos2, posid2, pos3, posid3, others ...)
Usage example
-------------
>>> from nltk.corpus.util import LazyCorpusLoader
>>> knbc = LazyCorpusLoader(
6 changes: 3 additions & 3 deletions nltk/corpus/reader/nombank.py
@@ -70,7 +70,7 @@ def __init__(
def instances(self, baseform=None):
"""
:return: a corpus view that acts as a list of
    ``NombankInstance`` objects, one for each noun in the corpus.
"""
kwargs = {}
if baseform is not None:
@@ -84,7 +84,7 @@ def instances(self, baseform=None):
def lines(self):
"""
:return: a corpus view that acts as a list of strings, one for
    each line in the predicate-argument annotation file.
"""
return StreamBackedCorpusView(
self.abspath(self._nomfile),
@@ -138,7 +138,7 @@ def rolesets(self, baseform=None):
def nouns(self):
"""
:return: a corpus view that acts as a list of all noun lemmas
    in this corpus (from the nombank.1.0.words file).
"""
return StreamBackedCorpusView(
self.abspath(self._nounsfile),
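Typical access goes through the bundled loader (assumes the NomBank data is installed)::

    from nltk.corpus import nombank

    inst = nombank.instances()[0]   # a NombankInstance
    print(inst.fileid, inst.predicate, inst.roleset)
    print(nombank.nouns()[:5])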
7 changes: 5 additions & 2 deletions nltk/corpus/reader/opinion_lexicon.py
@@ -8,9 +8,11 @@
"""
CorpusReader for the Opinion Lexicon.
-- Opinion Lexicon information -
+Opinion Lexicon information
+===========================

Authors: Minqing Hu and Bing Liu, 2004.
-Department of Computer Sicence
+Department of Computer Science
University of Illinois at Chicago
Contact: Bing Liu, liub@cs.uic.edu
@@ -19,6 +21,7 @@
Distributed with permission.
Related papers:
- Minqing Hu and Bing Liu. "Mining and summarizing customer reviews".
Proceedings of the ACM SIGKDD International Conference on Knowledge Discovery
& Data Mining (KDD-04), Aug 22-25, 2004, Seattle, Washington, USA.
4 changes: 2 additions & 2 deletions nltk/corpus/reader/panlex_lite.py
@@ -108,14 +108,14 @@ def meanings(self, expr_uid, expr_tt):
def translations(self, from_uid, from_tt, to_uid):
"""
Return a list of translations for an expression into a single language
    variety.
:param from_uid: the source expression's language variety, as a
seven-character uniform identifier.
:param from_tt: the source expression's text.
:param to_uid: the target language variety, as a seven-character
uniform identifier.
-:return a list of translation tuples. The first element is the expression
+:return: a list of translation tuples. The first element is the expression
text and the second element is the translation quality.
:rtype: list(tuple)
"""
19 changes: 10 additions & 9 deletions nltk/corpus/reader/plaintext.py
@@ -177,15 +177,16 @@ class EuroparlCorpusReader(PlaintextCorpusReader):
for regular plaintext documents. Chapters are separated using blank
lines. Everything is inherited from ``PlaintextCorpusReader`` except
that:
- Since the corpus is pre-processed and pre-tokenized, the
  word tokenizer should just split the line at whitespaces.
- For the same reason, the sentence tokenizer should just
  split the paragraph at line breaks.
- There is a new 'chapters()' method that returns chapters
  instead of paragraphs.
- The 'paras()' method inherited from PlaintextCorpusReader is
  made non-functional to remove any confusion between chapters
  and paragraphs for Europarl.
"""

def _read_word_block(self, stream):
6 changes: 3 additions & 3 deletions nltk/corpus/reader/propbank.py
@@ -70,7 +70,7 @@ def __init__(
def instances(self, baseform=None):
"""
:return: a corpus view that acts as a list of
    ``PropBankInstance`` objects, one for each verb in the corpus.
"""
kwargs = {}
if baseform is not None:
@@ -84,7 +84,7 @@ def instances(self, baseform=None):
def lines(self):
"""
:return: a corpus view that acts as a list of strings, one for
    each line in the predicate-argument annotation file.
"""
return StreamBackedCorpusView(
self.abspath(self._propfile),
@@ -134,7 +134,7 @@ def rolesets(self, baseform=None):
def verbs(self):
"""
:return: a corpus view that acts as a list of all verb lemmas
    in this corpus (from the verbs.txt file).
"""
return StreamBackedCorpusView(
self.abspath(self._verbsfile),
