From 4a130f1cdf642327378a4387b319cd2df6f79287 Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Tue, 19 Oct 2021 14:44:07 +0200 Subject: [PATCH] Removed hundreds of formatting warnings for nltk.org (#2859) * Removed 500+ warnings when building website documentation * Improved formatting for website news titles * Update some of the IBM documentation to include description lists --- nltk/chunk/regexp.py | 6 +- nltk/classify/senna.py | 15 +- nltk/cluster/__init__.py | 10 +- nltk/cluster/util.py | 3 +- nltk/corpus/reader/api.py | 3 +- nltk/corpus/reader/bracket_parse.py | 16 +- nltk/corpus/reader/cmudict.py | 4 +- nltk/corpus/reader/framenet.py | 21 ++- nltk/corpus/reader/ieer.py | 2 +- nltk/corpus/reader/knbc.py | 1 - nltk/corpus/reader/nombank.py | 6 +- nltk/corpus/reader/opinion_lexicon.py | 7 +- nltk/corpus/reader/panlex_lite.py | 4 +- nltk/corpus/reader/plaintext.py | 19 +-- nltk/corpus/reader/propbank.py | 6 +- nltk/corpus/reader/reviews.py | 30 ++-- nltk/corpus/reader/timit.py | 12 +- nltk/corpus/reader/twitter.py | 10 +- nltk/corpus/reader/verbnet.py | 2 +- nltk/corpus/reader/wordlist.py | 1 + nltk/corpus/reader/wordnet.py | 55 +++---- nltk/corpus/util.py | 4 +- nltk/draw/util.py | 13 +- nltk/featstruct.py | 9 +- nltk/grammar.py | 14 +- nltk/inference/api.py | 4 +- nltk/inference/nonmonotonic.py | 10 +- nltk/inference/prover9.py | 4 +- nltk/inference/resolution.py | 16 +- nltk/internals.py | 9 +- nltk/lm/api.py | 10 +- nltk/lm/preprocessing.py | 5 +- nltk/lm/vocabulary.py | 1 + nltk/metrics/agreement.py | 4 +- nltk/metrics/association.py | 22 +-- nltk/metrics/distance.py | 44 +++--- nltk/metrics/paice.py | 6 +- nltk/metrics/segmentation.py | 6 +- nltk/misc/minimalset.py | 2 +- nltk/parse/api.py | 2 +- nltk/parse/bllip.py | 24 +-- nltk/parse/chart.py | 2 +- nltk/parse/corenlp.py | 3 - nltk/parse/dependencygraph.py | 11 +- nltk/parse/featurechart.py | 6 +- nltk/parse/malt.py | 18 +-- nltk/parse/nonprojectivedependencyparser.py | 31 ++-- nltk/parse/projectivedependencyparser.py | 2 +- nltk/parse/transitionparser.py | 20 ++- nltk/parse/util.py | 12 +- nltk/probability.py | 6 +- nltk/sem/boxer.py | 35 +++-- nltk/sem/chat80.py | 6 +- nltk/sem/drt.py | 2 +- nltk/sem/glue.py | 2 +- nltk/sem/logic.py | 7 +- nltk/sem/util.py | 2 +- nltk/sentiment/sentiment_analyzer.py | 1 + nltk/sentiment/util.py | 8 +- nltk/stem/porter.py | 51 ++++--- nltk/stem/snowball.py | 16 +- nltk/stem/util.py | 3 +- nltk/tag/brill_trainer.py | 11 +- nltk/tag/crf.py | 87 +++++------ nltk/tag/perceptron.py | 2 +- nltk/tag/senna.py | 45 +++--- nltk/tag/tnt.py | 8 +- nltk/tbl/feature.py | 19 ++- nltk/tbl/rule.py | 12 +- nltk/tbl/template.py | 32 ++-- nltk/test/unit/test_distance.py | 16 +- nltk/test/unit/test_stem.py | 9 +- nltk/text.py | 3 +- nltk/tokenize/legality_principle.py | 13 +- nltk/tokenize/sonority_sequencing.py | 1 + nltk/tokenize/treebank.py | 41 ++--- nltk/translate/bleu_score.py | 108 +++++++------- nltk/translate/ibm1.py | 43 +++--- nltk/translate/ibm2.py | 38 ++--- nltk/translate/ibm3.py | 56 +++---- nltk/translate/ibm4.py | 95 ++++++------ nltk/translate/ibm5.py | 86 ++++++----- nltk/translate/stack_decoder.py | 4 +- nltk/twitter/common.py | 30 ++-- nltk/twitter/twitter_demo.py | 2 +- nltk/twitter/twitterclient.py | 32 ++-- nltk/twitter/util.py | 15 +- nltk/util.py | 5 +- web/news.rst | 156 ++++++++++---------- 89 files changed, 879 insertions(+), 776 deletions(-) diff --git a/nltk/chunk/regexp.py b/nltk/chunk/regexp.py index e193cc754f..56cc284ef6 100644 --- 
a/nltk/chunk/regexp.py +++ b/nltk/chunk/regexp.py @@ -78,12 +78,14 @@ def __init__(self, chunk_struct, debug_level=1): :param debug_level: The level of debugging which should be applied to transformations on the ``ChunkString``. The valid levels are: + - 0: no checks - 1: full check on to_chunkstruct - 2: full check on to_chunkstruct and cursory check after - each transformation. + each transformation. - 3: full check on to_chunkstruct and full check after - each transformation. + each transformation. + We recommend you use at least level 1. You should probably use level 3 if you use any non-standard subclasses of ``RegexpChunkRule``. diff --git a/nltk/classify/senna.py b/nltk/classify/senna.py index b7dec6a5d6..0233846ac0 100644 --- a/nltk/classify/senna.py +++ b/nltk/classify/senna.py @@ -21,19 +21,20 @@ misalignment errors. The input is: + - path to the directory that contains SENNA executables. If the path is incorrect, - Senna will automatically search for executable file specified in SENNA environment variable + Senna will automatically search for executable file specified in SENNA environment variable - List of the operations needed to be performed. - (optionally) the encoding of the input data (default:utf-8) Note: Unit tests for this module can be found in test/unit/test_senna.py - >>> from nltk.classify import Senna - >>> pipeline = Senna('/usr/share/senna-v3.0', ['pos', 'chk', 'ner']) - >>> sent = 'Dusseldorf is an international business center'.split() - >>> [(token['word'], token['chk'], token['ner'], token['pos']) for token in pipeline.tag(sent)] # doctest: +SKIP - [('Dusseldorf', 'B-NP', 'B-LOC', 'NNP'), ('is', 'B-VP', 'O', 'VBZ'), ('an', 'B-NP', 'O', 'DT'), - ('international', 'I-NP', 'O', 'JJ'), ('business', 'I-NP', 'O', 'NN'), ('center', 'I-NP', 'O', 'NN')] +>>> from nltk.classify import Senna +>>> pipeline = Senna('/usr/share/senna-v3.0', ['pos', 'chk', 'ner']) +>>> sent = 'Dusseldorf is an international business center'.split() +>>> [(token['word'], token['chk'], token['ner'], token['pos']) for token in pipeline.tag(sent)] # doctest: +SKIP +[('Dusseldorf', 'B-NP', 'B-LOC', 'NNP'), ('is', 'B-VP', 'O', 'VBZ'), ('an', 'B-NP', 'O', 'DT'), +('international', 'I-NP', 'O', 'JJ'), ('business', 'I-NP', 'O', 'NN'), ('center', 'I-NP', 'O', 'NN')] """ from os import environ, path, sep diff --git a/nltk/cluster/__init__.py b/nltk/cluster/__init__.py index 6e7ccb1548..b864a4f461 100644 --- a/nltk/cluster/__init__.py +++ b/nltk/cluster/__init__.py @@ -47,10 +47,11 @@ not significantly increase. They all extend the ClusterI interface which defines common operations -available with each clusterer. These operations include. - - cluster: clusters a sequence of vectors - - classify: assign a vector to a cluster - - classification_probdist: give the probability distribution over cluster memberships +available with each clusterer. These operations include: + +- cluster: clusters a sequence of vectors +- classify: assign a vector to a cluster +- classification_probdist: give the probability distribution over cluster memberships The current existing classifiers also extend cluster.VectorSpace, an abstract class which allows for singular value decomposition (SVD) and vector @@ -61,6 +62,7 @@ hypersphere. 
Usage example (see also demo()):: + from nltk import cluster from nltk.cluster import euclidean_distance from numpy import array diff --git a/nltk/cluster/util.py b/nltk/cluster/util.py index e36bd30548..827a6c7bb3 100644 --- a/nltk/cluster/util.py +++ b/nltk/cluster/util.py @@ -125,7 +125,7 @@ def euclidean_distance(u, v): def cosine_distance(u, v): """ Returns 1 minus the cosine of the angle between vectors v and u. This is - equal to 1 - (u.v / |u||v|). + equal to ``1 - (u.v / |u||v|)``. """ return 1 - (numpy.dot(u, v) / (sqrt(numpy.dot(u, u)) * sqrt(numpy.dot(v, v)))) @@ -221,6 +221,7 @@ def groups(self, n): def show(self, leaf_labels=[]): """ Print the dendrogram in ASCII art to standard out. + :param leaf_labels: an optional list of strings to use for labeling the leaves :type leaf_labels: list diff --git a/nltk/corpus/reader/api.py b/nltk/corpus/reader/api.py index 7c908d85e1..76c836806d 100644 --- a/nltk/corpus/reader/api.py +++ b/nltk/corpus/reader/api.py @@ -52,6 +52,7 @@ def __init__(self, root, fileids, encoding="utf8", tagset=None): :param encoding: The default unicode encoding for the files that make up the corpus. The value of ``encoding`` can be any of the following: + - A string: ``encoding`` is the encoding name for all files. - A dictionary: ``encoding[file_id]`` is the encoding name for the file whose identifier is ``file_id``. If @@ -67,7 +68,7 @@ def __init__(self, root, fileids, encoding="utf8", tagset=None): processed using non-unicode byte strings. :param tagset: The name of the tagset used by this corpus, to be used for normalizing or converting the POS tags returned by the - tagged_...() methods. + ``tagged_...()`` methods. """ # Convert the root to a path pointer, if necessary. if isinstance(root, str) and not isinstance(root, PathPointer): diff --git a/nltk/corpus/reader/bracket_parse.py b/nltk/corpus/reader/bracket_parse.py index 40a1d6e8a5..92602e4a8f 100644 --- a/nltk/corpus/reader/bracket_parse.py +++ b/nltk/corpus/reader/bracket_parse.py @@ -46,12 +46,12 @@ def __init__( :param comment_char: The character which can appear at the start of a line to indicate that the rest of the line is a comment. :param detect_blocks: The method that is used to find blocks - in the corpus; can be 'unindented_paren' (every unindented - parenthesis starts a new parse) or 'sexpr' (brackets are - matched). + in the corpus; can be 'unindented_paren' (every unindented + parenthesis starts a new parse) or 'sexpr' (brackets are + matched). :param tagset: The name of the tagset used by this corpus, to be used - for normalizing or converting the POS tags returned by the - tagged_...() methods. + for normalizing or converting the POS tags returned by the + ``tagged_...()`` methods. """ # FIXME: Why is it inheritting from SyntaxCorpusReader but initializing # from CorpusReader? @@ -167,10 +167,10 @@ def parsed_paras(self, fileids=None, categories=None): class AlpinoCorpusReader(BracketParseCorpusReader): """ Reader for the Alpino Dutch Treebank. - This corpus has a lexical breakdown structure embedded, as read by _parse + This corpus has a lexical breakdown structure embedded, as read by `_parse` Unfortunately this puts punctuation and some other words out of the sentence - order in the xml element tree. This is no good for tag_ and word_ - _tag and _word will be overridden to use a non-default new parameter 'ordered' + order in the xml element tree. 
This is no good for `tag_` and `word_` + `_tag` and `_word` will be overridden to use a non-default new parameter 'ordered' to the overridden _normalize function. The _parse function can then remain untouched. """ diff --git a/nltk/corpus/reader/cmudict.py b/nltk/corpus/reader/cmudict.py index 50417c4ee9..91b55c378b 100644 --- a/nltk/corpus/reader/cmudict.py +++ b/nltk/corpus/reader/cmudict.py @@ -54,7 +54,7 @@ class CMUDictCorpusReader(CorpusReader): def entries(self): """ :return: the cmudict lexicon as a list of entries - containing (word, transcriptions) tuples. + containing (word, transcriptions) tuples. """ return concat( [ @@ -72,7 +72,7 @@ def words(self): def dict(self): """ :return: the cmudict lexicon as a dictionary, whose keys are - lowercase words and whose values are lists of pronunciations. + lowercase words and whose values are lists of pronunciations. """ return dict(Index(self.entries())) diff --git a/nltk/corpus/reader/framenet.py b/nltk/corpus/reader/framenet.py index e722588c30..f664dd3b38 100644 --- a/nltk/corpus/reader/framenet.py +++ b/nltk/corpus/reader/framenet.py @@ -772,7 +772,7 @@ class AttrDict(dict): """A class that wraps a dict and allows accessing the keys of the dict as if they were attributes. Taken from here: - https://stackoverflow.com/a/14620633/8879 + https://stackoverflow.com/a/14620633/8879 >>> foo = {'a':1, 'b':2, 'c':3} >>> bar = AttrDict(foo) @@ -1350,8 +1350,7 @@ def doc(self, fn_docid): - 'frameID' : (only if status is 'MANUAL') - 'frameName': (only if status is 'MANUAL') - 'layer' : a list of labels for the layer - - Each item in the layer is a dict containing the - following keys: + - Each item in the layer is a dict containing the following keys: - '_type': 'layer' - 'rank' - 'name' @@ -1533,6 +1532,7 @@ def frame(self, fn_fid_or_fname, ignorekeys=[]): - 'FE' : a dict containing the Frame Elements that are part of this frame The keys in this dict are the names of the FEs (e.g. 'Body_system') and the values are dicts containing the following keys + - 'definition' : The definition of the FE - 'name' : The name of the FE e.g. 'Body_system' - 'ID' : The id number @@ -1706,19 +1706,24 @@ def lu(self, fn_luid, ignorekeys=[], luName=None, frameID=None, frameName=None): - 'lexemes' : a list of dicts describing the lemma of this LU. Each dict in the list contains these keys: + - 'POS' : part of speech e.g. 'N' - 'name' : either single-lexeme e.g. 'merger' or multi-lexeme e.g. 'a little' - 'order': the order of the lexeme in the lemma (starting from 1) - 'headword': a boolean ('true' or 'false') - 'breakBefore': Can this lexeme be separated from the previous lexeme? - Consider: "take over.v" as in: + Consider: "take over.v" as in:: + Germany took over the Netherlands in 2 days. Germany took the Netherlands over in 2 days. + In this case, 'breakBefore' would be "true" for the lexeme - "over". Contrast this with "take after.v" as in: + "over". Contrast this with "take after.v" as in:: + Mary takes after her grandmother. *Mary takes her grandmother after. 
+ In this case, 'breakBefore' would be "false" for the lexeme "after" - 'lemmaID' : Can be used to connect lemmas in different LUs @@ -2518,11 +2523,11 @@ def frame_relation_types(self): def frame_relations(self, frame=None, frame2=None, type=None): """ :param frame: (optional) frame object, name, or ID; only relations involving - this frame will be returned + this frame will be returned :param frame2: (optional; 'frame' must be a different frame) only show relations - between the two specified frames, in either direction + between the two specified frames, in either direction :param type: (optional) frame relation type (name or object); show only relations - of this type + of this type :type frame: int or str or AttrDict :return: A list of all of the frame relations in framenet :rtype: list(dict) diff --git a/nltk/corpus/reader/ieer.py b/nltk/corpus/reader/ieer.py index f916e4fb33..8eee4cd8a7 100644 --- a/nltk/corpus/reader/ieer.py +++ b/nltk/corpus/reader/ieer.py @@ -14,7 +14,7 @@ This corpus contains the NEWSWIRE development test data for the NIST 1999 IE-ER Evaluation. The files were taken from the -subdirectory: /ie_er_99/english/devtest/newswire/*.ref.nwt +subdirectory: ``/ie_er_99/english/devtest/newswire/*.ref.nwt`` and filenames were shortened. The corpus contains the following files: APW_19980314, APW_19980424, diff --git a/nltk/corpus/reader/knbc.py b/nltk/corpus/reader/knbc.py index 6e5edde180..b898fc24ad 100644 --- a/nltk/corpus/reader/knbc.py +++ b/nltk/corpus/reader/knbc.py @@ -38,7 +38,6 @@ class KNBCorpusReader(SyntaxCorpusReader): tags = (surface, reading, lemma, pos1, posid1, pos2, posid2, pos3, posid3, others ...) Usage example - ------------- >>> from nltk.corpus.util import LazyCorpusLoader >>> knbc = LazyCorpusLoader( diff --git a/nltk/corpus/reader/nombank.py b/nltk/corpus/reader/nombank.py index 511e770bbd..85c94d88e1 100644 --- a/nltk/corpus/reader/nombank.py +++ b/nltk/corpus/reader/nombank.py @@ -70,7 +70,7 @@ def __init__( def instances(self, baseform=None): """ :return: a corpus view that acts as a list of - ``NombankInstance`` objects, one for each noun in the corpus. + ``NombankInstance`` objects, one for each noun in the corpus. """ kwargs = {} if baseform is not None: @@ -84,7 +84,7 @@ def instances(self, baseform=None): def lines(self): """ :return: a corpus view that acts as a list of strings, one for - each line in the predicate-argument annotation file. + each line in the predicate-argument annotation file. """ return StreamBackedCorpusView( self.abspath(self._nomfile), @@ -138,7 +138,7 @@ def rolesets(self, baseform=None): def nouns(self): """ :return: a corpus view that acts as a list of all noun lemmas - in this corpus (from the nombank.1.0.words file). + in this corpus (from the nombank.1.0.words file). """ return StreamBackedCorpusView( self.abspath(self._nounsfile), diff --git a/nltk/corpus/reader/opinion_lexicon.py b/nltk/corpus/reader/opinion_lexicon.py index 1d3075f48a..aed7f4c2dc 100644 --- a/nltk/corpus/reader/opinion_lexicon.py +++ b/nltk/corpus/reader/opinion_lexicon.py @@ -8,9 +8,11 @@ """ CorpusReader for the Opinion Lexicon. -- Opinion Lexicon information - +Opinion Lexicon information +=========================== + Authors: Minqing Hu and Bing Liu, 2004. - Department of Computer Sicence + Department of Computer Science University of Illinois at Chicago Contact: Bing Liu, liub@cs.uic.edu @@ -19,6 +21,7 @@ Distributed with permission. Related papers: + - Minqing Hu and Bing Liu. "Mining and summarizing customer reviews". 
Proceedings of the ACM SIGKDD International Conference on Knowledge Discovery & Data Mining (KDD-04), Aug 22-25, 2004, Seattle, Washington, USA. diff --git a/nltk/corpus/reader/panlex_lite.py b/nltk/corpus/reader/panlex_lite.py index fb47e6c8ce..9d8c823acd 100644 --- a/nltk/corpus/reader/panlex_lite.py +++ b/nltk/corpus/reader/panlex_lite.py @@ -108,14 +108,14 @@ def meanings(self, expr_uid, expr_tt): def translations(self, from_uid, from_tt, to_uid): """ Return a list of translations for an expression into a single language - variety. + variety. :param from_uid: the source expression's language variety, as a seven-character uniform identifier. :param from_tt: the source expression's text. :param to_uid: the target language variety, as a seven-character uniform identifier. - :return a list of translation tuples. The first element is the expression + :return: a list of translation tuples. The first element is the expression text and the second element is the translation quality. :rtype: list(tuple) """ diff --git a/nltk/corpus/reader/plaintext.py b/nltk/corpus/reader/plaintext.py index ddf01182c1..37ceb7f747 100644 --- a/nltk/corpus/reader/plaintext.py +++ b/nltk/corpus/reader/plaintext.py @@ -177,15 +177,16 @@ class EuroparlCorpusReader(PlaintextCorpusReader): for regular plaintext documents. Chapters are separated using blank lines. Everything is inherited from ``PlaintextCorpusReader`` except that: - - Since the corpus is pre-processed and pre-tokenized, the - word tokenizer should just split the line at whitespaces. - - For the same reason, the sentence tokenizer should just - split the paragraph at line breaks. - - There is a new 'chapters()' method that returns chapters instead - instead of paragraphs. - - The 'paras()' method inherited from PlaintextCorpusReader is - made non-functional to remove any confusion between chapters - and paragraphs for Europarl. + + - Since the corpus is pre-processed and pre-tokenized, the + word tokenizer should just split the line at whitespaces. + - For the same reason, the sentence tokenizer should just + split the paragraph at line breaks. + - There is a new 'chapters()' method that returns chapters instead + instead of paragraphs. + - The 'paras()' method inherited from PlaintextCorpusReader is + made non-functional to remove any confusion between chapters + and paragraphs for Europarl. """ def _read_word_block(self, stream): diff --git a/nltk/corpus/reader/propbank.py b/nltk/corpus/reader/propbank.py index 57dd83de38..66779a1f39 100644 --- a/nltk/corpus/reader/propbank.py +++ b/nltk/corpus/reader/propbank.py @@ -70,7 +70,7 @@ def __init__( def instances(self, baseform=None): """ :return: a corpus view that acts as a list of - ``PropBankInstance`` objects, one for each noun in the corpus. + ``PropBankInstance`` objects, one for each noun in the corpus. """ kwargs = {} if baseform is not None: @@ -84,7 +84,7 @@ def instances(self, baseform=None): def lines(self): """ :return: a corpus view that acts as a list of strings, one for - each line in the predicate-argument annotation file. + each line in the predicate-argument annotation file. """ return StreamBackedCorpusView( self.abspath(self._propfile), @@ -134,7 +134,7 @@ def rolesets(self, baseform=None): def verbs(self): """ :return: a corpus view that acts as a list of all verb lemmas - in this corpus (from the verbs.txt file). + in this corpus (from the verbs.txt file). 
""" return StreamBackedCorpusView( self.abspath(self._verbsfile), diff --git a/nltk/corpus/reader/reviews.py b/nltk/corpus/reader/reviews.py index 5c0427668b..c8f979b069 100644 --- a/nltk/corpus/reader/reviews.py +++ b/nltk/corpus/reader/reviews.py @@ -8,9 +8,11 @@ """ CorpusReader for reviews corpora (syntax based on Customer Review Corpus). -- Customer Review Corpus information - +Customer Review Corpus information +================================== + Annotated by: Minqing Hu and Bing Liu, 2004. - Department of Computer Sicence + Department of Computer Science University of Illinois at Chicago Contact: Bing Liu, liub@cs.uic.edu @@ -38,18 +40,18 @@ Symbols used in the annotated reviews: - [t] : the title of the review: Each [t] tag starts a review. - xxxx[+|-n]: xxxx is a product feature. - [+n]: Positive opinion, n is the opinion strength: 3 strongest, and 1 weakest. - Note that the strength is quite subjective. - You may want ignore it, but only considering + and - - [-n]: Negative opinion - ## : start of each sentence. Each line is a sentence. - [u] : feature not appeared in the sentence. - [p] : feature not appeared in the sentence. Pronoun resolution is needed. - [s] : suggestion or recommendation. - [cc]: comparison with a competing product from a different brand. - [cs]: comparison with a competing product from the same brand. + :[t]: the title of the review: Each [t] tag starts a review. + :xxxx[+|-n]: xxxx is a product feature. + :[+n]: Positive opinion, n is the opinion strength: 3 strongest, and 1 weakest. + Note that the strength is quite subjective. + You may want ignore it, but only considering + and - + :[-n]: Negative opinion + :##: start of each sentence. Each line is a sentence. + :[u]: feature not appeared in the sentence. + :[p]: feature not appeared in the sentence. Pronoun resolution is needed. + :[s]: suggestion or recommendation. + :[cc]: comparison with a competing product from a different brand. + :[cs]: comparison with a competing product from the same brand. Note: Some of the files (e.g. "ipod.txt", "Canon PowerShot SD500.txt") do not provide separation between different reviews. This is due to the fact that diff --git a/nltk/corpus/reader/timit.py b/nltk/corpus/reader/timit.py index e134cef4e4..cf258ba447 100644 --- a/nltk/corpus/reader/timit.py +++ b/nltk/corpus/reader/timit.py @@ -20,7 +20,7 @@ are spoken by all speakers.) - total 160 recording of sentences (10 recordings per speaker) - audio format: NIST Sphere, single channel, 16kHz sampling, - 16 bit sample, PCM encoding + 16 bit sample, PCM encoding Module contents @@ -195,9 +195,9 @@ def utteranceids( ): """ :return: A list of the utterance identifiers for all - utterances in this corpus, or for the given speaker, dialect - region, gender, sentence type, or sentence number, if - specified. + utterances in this corpus, or for the given speaker, dialect + region, gender, sentence type, or sentence number, if + specified. """ if isinstance(dialect, str): dialect = [dialect] @@ -226,7 +226,7 @@ def utteranceids( def transcription_dict(self): """ :return: A dictionary giving the 'standard' transcription for - each word. + each word. """ _transcriptions = {} with self.open("timitdic.txt") as fp: @@ -251,7 +251,7 @@ def utterance(self, spkrid, sentid): def spkrutteranceids(self, speaker): """ :return: A list of all utterances associated with a given - speaker. + speaker. 
""" return [ utterance diff --git a/nltk/corpus/reader/twitter.py b/nltk/corpus/reader/twitter.py index 9cf3043a91..3a57e2643e 100644 --- a/nltk/corpus/reader/twitter.py +++ b/nltk/corpus/reader/twitter.py @@ -59,14 +59,10 @@ def __init__( self, root, fileids=None, word_tokenizer=TweetTokenizer(), encoding="utf8" ): """ - :param root: The root directory for this corpus. - :param fileids: A list or regexp specifying the fileids in this corpus. - :param word_tokenizer: Tokenizer for breaking the text of Tweets into - smaller units, including but not limited to words. - + smaller units, including but not limited to words. """ CorpusReader.__init__(self, root, fileids, encoding) @@ -86,7 +82,7 @@ def docs(self, fileids=None): `_ :return: the given file(s) as a list of dictionaries deserialised - from JSON. + from JSON. :rtype: list(dict) """ return concat( @@ -118,7 +114,7 @@ def strings(self, fileids=None): def tokenized(self, fileids=None): """ :return: the given file(s) as a list of the text content of Tweets as - as a list of words, screenanames, hashtags, URLs and punctuation symbols. + as a list of words, screenanames, hashtags, URLs and punctuation symbols. :rtype: list(list(str)) """ diff --git a/nltk/corpus/reader/verbnet.py b/nltk/corpus/reader/verbnet.py index 33d1839844..0ff022ddd6 100644 --- a/nltk/corpus/reader/verbnet.py +++ b/nltk/corpus/reader/verbnet.py @@ -441,7 +441,7 @@ def pprint(self, vnclass): the given VerbNet class. :param vnclass: A VerbNet class identifier; or an ElementTree - containing the xml contents of a VerbNet class. + containing the xml contents of a VerbNet class. """ if isinstance(vnclass, str): vnclass = self.vnclass(vnclass) diff --git a/nltk/corpus/reader/wordlist.py b/nltk/corpus/reader/wordlist.py index f7fba76706..661a094cfa 100644 --- a/nltk/corpus/reader/wordlist.py +++ b/nltk/corpus/reader/wordlist.py @@ -146,6 +146,7 @@ class MWAPPDBCorpusReader(WordListCorpusReader): This class is used to read the list of word pairs from the subset of lexical pairs of The Paraphrase Database (PPDB) XXXL used in the Monolingual Word Alignment (MWA) algorithm described in Sultan et al. (2014a, 2014b, 2015): + - http://acl2014.org/acl2014/Q14/pdf/Q14-1017 - https://www.aclweb.org/anthology/S14-2039 - https://www.aclweb.org/anthology/S15-2027 diff --git a/nltk/corpus/reader/wordnet.py b/nltk/corpus/reader/wordnet.py index be12d2852f..580ea3a853 100644 --- a/nltk/corpus/reader/wordnet.py +++ b/nltk/corpus/reader/wordnet.py @@ -520,7 +520,7 @@ def root_hypernyms(self): def max_depth(self): """ :return: The length of the longest hypernym path from this - synset to the root. + synset to the root. """ if "_max_depth" not in self.__dict__: @@ -534,7 +534,7 @@ def max_depth(self): def min_depth(self): """ :return: The length of the shortest hypernym path from this - synset to the root. + synset to the root. """ if "_min_depth" not in self.__dict__: @@ -1393,6 +1393,7 @@ def synset_from_pos_and_offset(self, pos, offset): attributes ADJ, ADJ_SAT, ADV, NOUN or VERB ('a', 's', 'r', 'n', or 'v'). - offset: The byte offset of this synset in the WordNet dict file for this pos. + >>> from nltk.corpus import wordnet as wn >>> print(wn.synset_from_pos_and_offset('n', 1740)) Synset('entity.n.01') @@ -1554,25 +1555,29 @@ def synset_from_sense_key(self, sense_key): obtained from lemma.key() From https://wordnet.princeton.edu/documentation/senseidx5wn: - A sense_key is represented as: + A sense_key is represented as:: + lemma % lex_sense (e.g. 
'dog%1:18:01::') - where lex_sense is encoded as: + + where lex_sense is encoded as:: + ss_type:lex_filenum:lex_id:head_word:head_id - lemma: ASCII text of word/collocation, in lower case - ss_type: synset type for the sense (1 digit int) - The synset type is encoded as follows: - 1 NOUN - 2 VERB - 3 ADJECTIVE - 4 ADVERB - 5 ADJECTIVE SATELLITE - lex_filenum: name of lexicographer file containing the synset for the sense (2 digit int) - lex_id: when paired with lemma, uniquely identifies a sense in the lexicographer file (2 digit int) - head_word: lemma of the first word in satellite's head synset - Only used if sense is in an adjective satellite synset - head_id: uniquely identifies sense in a lexicographer file when paired with head_word - Only used if head_word is present (2 digit int) + :lemma: ASCII text of word/collocation, in lower case + :ss_type: synset type for the sense (1 digit int) + The synset type is encoded as follows:: + + 1 NOUN + 2 VERB + 3 ADJECTIVE + 4 ADVERB + 5 ADJECTIVE SATELLITE + :lex_filenum: name of lexicographer file containing the synset for the sense (2 digit int) + :lex_id: when paired with lemma, uniquely identifies a sense in the lexicographer file (2 digit int) + :head_word: lemma of the first word in satellite's head synset + Only used if sense is in an adjective satellite synset + :head_id: uniquely identifies sense in a lexicographer file when paired with head_word + Only used if head_word is present (2 digit int) >>> import nltk >>> from nltk.corpus import wordnet as wn @@ -1964,13 +1969,13 @@ def ic(self, corpus, weight_senses_equally=False, smoothing=1.0): :type corpus: CorpusReader :param corpus: The corpus from which we create an information - content dictionary. + content dictionary. :type weight_senses_equally: bool :param weight_senses_equally: If this is True, gives all - possible senses equal weight rather than dividing by the - number of possible senses. (If a word has 3 synses, each - sense gets 0.3333 per appearance when this is False, 1.0 when - it is true.) + possible senses equal weight rather than dividing by the + number of possible senses. (If a word has 3 synses, each + sense gets 0.3333 per appearance when this is False, 1.0 when + it is true.) :param smoothing: How much do we smooth synset counts (default is 1.0) :type smoothing: float :return: An information content dictionary @@ -2024,8 +2029,8 @@ def custom_lemmas(self, tab_file, lang): documentation on the Multilingual WordNet tab file format. :param tab_file: Tab file as a file or file-like object - :type lang str - :param lang ISO 639-3 code of the language of the tab file + :type: lang str + :param: lang ISO 639-3 code of the language of the tab file """ if len(lang) != 3: raise ValueError("lang should be a (3 character) ISO 639-3 code") diff --git a/nltk/corpus/util.py b/nltk/corpus/util.py index 890ac8e905..287ec05d56 100644 --- a/nltk/corpus/util.py +++ b/nltk/corpus/util.py @@ -44,8 +44,8 @@ class LazyCorpusLoader: :type reader: nltk.corpus.reader.api.CorpusReader :param nltk_data_subdir: The subdirectory where the corpus is stored. :type nltk_data_subdir: str - :param *args: Any other non-keywords arguments that `reader_cls` might need. - :param *kargs: Any other keywords arguments that `reader_cls` might need. + :param `*args`: Any other non-keywords arguments that `reader_cls` might need. + :param `**kwargs`: Any other keywords arguments that `reader_cls` might need. 
""" def __init__(self, name, reader_cls, *args, **kwargs): diff --git a/nltk/draw/util.py b/nltk/draw/util.py index 579adfe6f9..8d26cad59b 100644 --- a/nltk/draw/util.py +++ b/nltk/draw/util.py @@ -117,6 +117,7 @@ class CanvasWidget(metaclass=ABCMeta): - ``__init__``: Builds a new canvas widget. It must perform the following three tasks (in order): + - Create any new graphical elements. - Call ``_add_child_widget`` on each child widget. - Call the ``CanvasWidget`` constructor. @@ -2171,11 +2172,13 @@ class ColorizedList: """ An abstract base class for displaying a colorized list of items. Subclasses should define: - - ``_init_colortags``, which sets up Text color tags that - will be used by the list. - - ``_item_repr``, which returns a list of (text,colortag) - tuples that make up the colorized representation of the - item. + + - ``_init_colortags``, which sets up Text color tags that + will be used by the list. + - ``_item_repr``, which returns a list of (text,colortag) + tuples that make up the colorized representation of the + item. + :note: Typically, you will want to register a callback for ``'select'`` that calls ``mark`` on the given item. """ diff --git a/nltk/featstruct.py b/nltk/featstruct.py index 633b91e558..b0719c35ae 100644 --- a/nltk/featstruct.py +++ b/nltk/featstruct.py @@ -160,10 +160,11 @@ def __new__(cls, features=None, **morefeatures): :param features: The initial feature values for this feature structure: - - FeatStruct(string) -> FeatStructReader().read(string) - - FeatStruct(mapping) -> FeatDict(mapping) - - FeatStruct(sequence) -> FeatList(sequence) - - FeatStruct() -> FeatDict() + + - FeatStruct(string) -> FeatStructReader().read(string) + - FeatStruct(mapping) -> FeatDict(mapping) + - FeatStruct(sequence) -> FeatList(sequence) + - FeatStruct() -> FeatDict() :param morefeatures: If ``features`` is a mapping or None, then ``morefeatures`` provides additional features for the ``FeatDict`` constructor. diff --git a/nltk/grammar.py b/nltk/grammar.py index 64a2c8dbe3..49c38300ac 100644 --- a/nltk/grammar.py +++ b/nltk/grammar.py @@ -454,7 +454,7 @@ class CFG: def __init__(self, start, productions, calculate_leftcorners=True): """ Create a new context-free grammar, from the given start state - and set of ``Production``s. + and set of ``Production`` instances. :param start: The start symbol :type start: Nonterminal @@ -737,6 +737,7 @@ def is_chomsky_normal_form(self): def chomsky_normal_form(self, new_token_padding="@$@", flexible=False): """ Returns a new Grammar that is in chomsky normal + :param: new_token_padding Customise new rule formation during binarisation """ @@ -794,11 +795,12 @@ def binarize(cls, grammar, padding="@$@"): Convert all non-binary rules into binary by introducing new tokens. Example:: - Original: - A => B C D - After Conversion: - A => B A@$@B - A@$@B => C D + + Original: + A => B C D + After Conversion: + A => B A@$@B + A@$@B => C D """ result = [] diff --git a/nltk/inference/api.py b/nltk/inference/api.py index e80d629b24..93c513f224 100644 --- a/nltk/inference/api.py +++ b/nltk/inference/api.py @@ -90,7 +90,7 @@ def retract_assumptions(self, retracted, debug=False): Retract assumptions from the assumption list. :param debug: If True, give warning when ``retracted`` is not present on - assumptions list. + assumptions list. :type debug: bool :param retracted: assumptions to be retracted :type retracted: list(sem.Expression) @@ -218,7 +218,7 @@ def retract_assumptions(self, retracted, debug=False): Retract assumptions from the assumption list. 
:param debug: If True, give warning when ``retracted`` is not present on - assumptions list. + assumptions list. :type debug: bool :param retracted: assumptions to be retracted :type retracted: list(sem.Expression) diff --git a/nltk/inference/nonmonotonic.py b/nltk/inference/nonmonotonic.py index eeb6ce5323..ff904b0c97 100644 --- a/nltk/inference/nonmonotonic.py +++ b/nltk/inference/nonmonotonic.py @@ -67,10 +67,12 @@ def goal(self): def replace_quants(self, ex, domain): """ Apply the closed domain assumption to the expression - - Domain = union([e.free()|e.constants() for e in all_expressions]) - - translate "exists x.P" to "(z=d1 | z=d2 | ... ) & P.replace(x,z)" OR - "P.replace(x, d1) | P.replace(x, d2) | ..." - - translate "all x.P" to "P.replace(x, d1) & P.replace(x, d2) & ..." + + - Domain = union([e.free()|e.constants() for e in all_expressions]) + - translate "exists x.P" to "(z=d1 | z=d2 | ... ) & P.replace(x,z)" OR + "P.replace(x, d1) | P.replace(x, d2) | ..." + - translate "all x.P" to "P.replace(x, d1) & P.replace(x, d2) & ..." + :param ex: ``Expression`` :param domain: set of {Variable}s :return: ``Expression`` diff --git a/nltk/inference/prover9.py b/nltk/inference/prover9.py index 8c428e5b2f..83b0483ebb 100644 --- a/nltk/inference/prover9.py +++ b/nltk/inference/prover9.py @@ -137,8 +137,8 @@ def config_prover9(self, binary_location, verbose=False): def prover9_input(self, goal, assumptions): """ :return: The input string that should be provided to the - prover9 binary. This string is formed based on the goal, - assumptions, and timeout value of this object. + prover9 binary. This string is formed based on the goal, + assumptions, and timeout value of this object. """ s = "" diff --git a/nltk/inference/resolution.py b/nltk/inference/resolution.py index deb7cc7e57..98d01b585d 100755 --- a/nltk/inference/resolution.py +++ b/nltk/inference/resolution.py @@ -184,17 +184,17 @@ def unify(self, other, bindings=None, used=None, skipped=None, debug=False): :param other: ``Clause`` with which to unify :param bindings: ``BindingDict`` containing bindings that should be used - during the unification + during the unification :param used: tuple of two lists of atoms. The first lists the - atoms from 'self' that were successfully unified with atoms from - 'other'. The second lists the atoms from 'other' that were successfully - unified with atoms from 'self'. + atoms from 'self' that were successfully unified with atoms from + 'other'. The second lists the atoms from 'other' that were successfully + unified with atoms from 'self'. :param skipped: tuple of two ``Clause`` objects. The first is a list of all - the atoms from the 'self' Clause that have not been unified with - anything on the path. The second is same thing for the 'other' Clause. + the atoms from the 'self' Clause that have not been unified with + anything on the path. The second is same thing for the 'other' Clause. :param debug: bool indicating whether debug statements should print :return: list containing all the resulting ``Clause`` objects that could be - obtained by unification + obtained by unification """ if bindings is None: bindings = BindingDict() @@ -325,7 +325,7 @@ def substitute_bindings(self, bindings): Replace every binding :param bindings: A list of tuples mapping Variable Expressions to the - Expressions to which they are bound + Expressions to which they are bound. 
:return: ``Clause`` """ return Clause([atom.substitute_bindings(bindings) for atom in self]) diff --git a/nltk/internals.py b/nltk/internals.py index 97fd029029..b3fd05c9e6 100644 --- a/nltk/internals.py +++ b/nltk/internals.py @@ -214,6 +214,7 @@ def read_str(s, start_position): escape sequence) is passed into the ``eval``. :Example: + >>> from nltk.internals import read_str >>> read_str('"Hello", World!', 0) ('Hello', 7) @@ -271,6 +272,7 @@ def read_int(s, start_position): match in ``s`` at ``start_position``. :Example: + >>> from nltk.internals import read_int >>> read_int('42 is the answer', 0) (42, 2) @@ -308,6 +310,7 @@ def read_number(s, start_position): match in ``s`` at ``start_position``. :Example: + >>> from nltk.internals import read_number >>> read_number('Pi is 3.14159', 6) (3.14159, 13) @@ -330,9 +333,9 @@ def read_number(s, start_position): def overridden(method): """ :return: True if ``method`` overrides some method with the same - name in a base class. This is typically used when defining - abstract base classes or interfaces, to allow subclasses to define - either of two related methods: + name in a base class. This is typically used when defining + abstract base classes or interfaces, to allow subclasses to define + either of two related methods: >>> class EaterI: ... '''Subclass must define eat() or batch_eat().''' diff --git a/nltk/lm/api.py b/nltk/lm/api.py index a97ced38f5..8a89ad190a 100644 --- a/nltk/lm/api.py +++ b/nltk/lm/api.py @@ -80,16 +80,15 @@ def __init__(self, order, vocabulary=None, counter=None): """Creates new LanguageModel. :param vocabulary: If provided, this vocabulary will be used instead - of creating a new one when training. + of creating a new one when training. :type vocabulary: `nltk.lm.Vocabulary` or None :param counter: If provided, use this object to count ngrams. :type vocabulary: `nltk.lm.NgramCounter` or None :param ngrams_fn: If given, defines how sentences in training text are turned to ngram - sequences. + sequences. :type ngrams_fn: function or None :param pad_fn: If given, defines how sentences in training text are padded. :type pad_fn: function or None - """ self.order = order self.vocab = Vocabulary() if vocabulary is None else vocabulary @@ -129,10 +128,9 @@ def unmasked_score(self, word, context=None): :param str word: Word for which we want the score :param tuple(str) context: Context the word is in. - If `None`, compute unigram score. + If `None`, compute unigram score. :param context: tuple(str) or None :rtype: float - """ raise NotImplementedError() @@ -180,7 +178,7 @@ def generate(self, num_words=1, text_seed=None, random_seed=None): :param int num_words: How many words to generate. By default 1. :param text_seed: Generation can be conditioned on preceding context. :param random_seed: A random seed or an instance of `random.Random`. If provided, - makes the random sampling part of generation reproducible. + makes the random sampling part of generation reproducible. :return: One (str) word or a list of words generated from model. Examples: diff --git a/nltk/lm/preprocessing.py b/nltk/lm/preprocessing.py index c65fb76fa9..4f3c0fcf7d 100644 --- a/nltk/lm/preprocessing.py +++ b/nltk/lm/preprocessing.py @@ -35,12 +35,13 @@ def padded_everygram_pipeline(order, text): """Default preprocessing for a sequence of sentences. 
Creates two iterators: + - sentences padded and turned into sequences of `nltk.util.everygrams` - sentences padded as above and chained together for a flat stream of words :param order: Largest ngram length produced by `everygrams`. - :param text: Text to iterate over. Expected to be an iterable of sentences: - Iterable[Iterable[str]] + :param text: Text to iterate over. Expected to be an iterable of sentences. + :type text: Iterable[Iterable[str]] :return: iterator over text as ngrams, iterator over text as vocabulary data """ padding_fn = partial(pad_both_ends, n=order) diff --git a/nltk/lm/vocabulary.py b/nltk/lm/vocabulary.py index b50c5dbcc5..d122281bed 100644 --- a/nltk/lm/vocabulary.py +++ b/nltk/lm/vocabulary.py @@ -38,6 +38,7 @@ class Vocabulary: """Stores language model vocabulary. Satisfies two common language modeling requirements for a vocabulary: + - When checking membership and calculating its size, filters items by comparing their counts to a cutoff value. - Adds a special "unknown" token which unseen words are mapped to. diff --git a/nltk/metrics/agreement.py b/nltk/metrics/agreement.py index 875e5b509e..6c2c3f2e06 100644 --- a/nltk/metrics/agreement.py +++ b/nltk/metrics/agreement.py @@ -98,11 +98,11 @@ def __init__(self, data=None, distance=binary_distance): The data argument can be None (to create an empty annotation task) or a sequence of 3-tuples, each representing a coder's labeling of an item: - (coder,item,label) + ``(coder,item,label)`` The distance argument is a function taking two arguments (labels) and producing a numerical distance. The distance from a label to itself should be zero: - distance(l,l) = 0 + ``distance(l,l) = 0`` """ self.distance = distance self.I = set() diff --git a/nltk/metrics/association.py b/nltk/metrics/association.py index 6ad60fdd1e..5461ef88f3 100644 --- a/nltk/metrics/association.py +++ b/nltk/metrics/association.py @@ -173,10 +173,10 @@ class BigramAssocMeasures(NgramAssocMeasures): suffix refers to the appearance of the word in question, while x indicates the appearance of any word. Thus, for example: - n_ii counts (w1, w2), i.e. the bigram being scored - n_ix counts (w1, *) - n_xi counts (*, w2) - n_xx counts (*, *), i.e. any bigram + - n_ii counts ``(w1, w2)``, i.e. the bigram being scored + - n_ix counts ``(w1, *)`` + - n_xi counts ``(*, w2)`` + - n_xx counts ``(*, *)``, i.e. any bigram This may be shown with respect to a contingency table:: @@ -264,9 +264,10 @@ class TrigramAssocMeasures(NgramAssocMeasures): the occurrences of particular events in a corpus. The letter i in the suffix refers to the appearance of the word in question, while x indicates the appearance of any word. Thus, for example: - n_iii counts (w1, w2, w3), i.e. the trigram being scored - n_ixx counts (w1, *, *) - n_xxx counts (*, *, *), i.e. any trigram + + - n_iii counts ``(w1, w2, w3)``, i.e. the trigram being scored + - n_ixx counts ``(w1, *, *)`` + - n_xxx counts ``(*, *, *)``, i.e. any trigram """ _n = 3 @@ -324,9 +325,10 @@ class QuadgramAssocMeasures(NgramAssocMeasures): the occurrences of particular events in a corpus. The letter i in the suffix refers to the appearance of the word in question, while x indicates the appearance of any word. Thus, for example: - n_iiii counts (w1, w2, w3, w4), i.e. the quadgram being scored - n_ixxi counts (w1, *, *, w4) - n_xxxx counts (*, *, *, *), i.e. any quadgram + + - n_iiii counts ``(w1, w2, w3, w4)``, i.e. the quadgram being scored + - n_ixxi counts ``(w1, *, *, w4)`` + - n_xxxx counts ``(*, *, *, *)``, i.e. 
any quadgram """ _n = 4 diff --git a/nltk/metrics/distance.py b/nltk/metrics/distance.py index f89b4809d3..1f3211bd03 100644 --- a/nltk/metrics/distance.py +++ b/nltk/metrics/distance.py @@ -83,7 +83,7 @@ def edit_distance(s1, s2, substitution_cost=1, transpositions=False): :type s2: str :type substitution_cost: int :type transpositions: bool - :rtype int + :rtype: int """ # set up a 2-D array len1 = len(s1) @@ -155,9 +155,11 @@ def edit_distance_align(s1, s2, substitution_cost=1): In case of multiple valid minimum-distance alignments, the backtrace has the following operation precedence: + 1. Skip s1 character 2. Skip s2 character 3. Substitute s1 and s2 characters + The backtrace is carried out in reverse string order. This function does not support transposition. @@ -166,7 +168,7 @@ def edit_distance_align(s1, s2, substitution_cost=1): :type s1: str :type s2: str :type substitution_cost: int - :rtype List[Tuple(int, int)] + :rtype: List[Tuple(int, int)] """ # set up a 2-D array len1 = len(s1) @@ -299,13 +301,12 @@ def jaro_similarity(s1, s2): required to change one word into another. The Jaro similarity formula from https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance : - jaro_sim = 0 if m = 0 else 1/3 * (m/|s_1| + m/s_2 + (m-t)/m) - - where: - - |s_i| is the length of string s_i - - m is the no. of matching characters - - t is the half no. of possible transpositions. + ``jaro_sim = 0 if m = 0 else 1/3 * (m/|s_1| + m/s_2 + (m-t)/m)`` + where + - `|s_i|` is the length of string `s_i` + - `m` is the no. of matching characters + - `t` is the half no. of possible transpositions. """ # First, store the length of the strings # because they will be re-used several times. @@ -357,20 +358,21 @@ def jaro_winkler_similarity(s1, s2, p=0.1, max_l=4): Decision Rules in the Fellegi-Sunter Model of Record Linkage. Proceedings of the Section on Survey Research Methods. American Statistical Association: 354-359. + such that: jaro_winkler_sim = jaro_sim + ( l * p * (1 - jaro_sim) ) where, - - jaro_sim is the output from the Jaro Similarity, + - jaro_sim is the output from the Jaro Similarity, see jaro_similarity() - - l is the length of common prefix at the start of the string - - this implementation provides an upperbound for the l value - to keep the prefixes.A common value of this upperbound is 4. - - p is the constant scaling factor to overweigh common prefixes. - The Jaro-Winkler similarity will fall within the [0, 1] bound, - given that max(p)<=0.25 , default is p=0.1 in Winkler (1990) + - l is the length of common prefix at the start of the string + - this implementation provides an upperbound for the l value + to keep the prefixes.A common value of this upperbound is 4. + - p is the constant scaling factor to overweigh common prefixes. + The Jaro-Winkler similarity will fall within the [0, 1] bound, + given that max(p)<=0.25 , default is p=0.1 in Winkler (1990) Test using outputs from https://www.census.gov/srd/papers/pdf/rr93-8.pdf @@ -383,8 +385,9 @@ def jaro_winkler_similarity(s1, s2, p=0.1, max_l=4): >>> winkler_scores = [1.000, 0.967, 0.947, 0.944, 0.911, 0.893, 0.858, 0.853, 0.000] >>> jaro_scores = [1.000, 0.933, 0.933, 0.889, 0.889, 0.867, 0.822, 0.790, 0.000] - # One way to match the values on the Winkler's paper is to provide a different - # p scaling factor for different pairs of strings, e.g. + One way to match the values on the Winkler's paper is to provide a different + p scaling factor for different pairs of strings, e.g. 
+ >>> p_factors = [0.1, 0.125, 0.20, 0.125, 0.20, 0.20, 0.20, 0.15, 0.1] >>> for (s1, s2), jscore, wscore, p in zip(winkler_examples, jaro_scores, winkler_scores, p_factors): @@ -412,8 +415,9 @@ def jaro_winkler_similarity(s1, s2, p=0.1, max_l=4): ... 0.961, 0.921, 0.933, 0.880, 0.858, 0.805, 0.933, 0.000, 0.947, 0.967, 0.943, ... 0.913, 0.922, 0.922, 0.900, 0.867, 0.000] - # One way to match the values on the Winkler's paper is to provide a different - # p scaling factor for different pairs of strings, e.g. + One way to match the values on the Winkler's paper is to provide a different + p scaling factor for different pairs of strings, e.g. + >>> p_factors = [0.1, 0.1, 0.1, 0.1, 0.125, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.20, ... 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1] @@ -432,8 +436,6 @@ def jaro_winkler_similarity(s1, s2, p=0.1, max_l=4): >>> round(jaro_winkler_similarity('TANYA', 'TONYA', p=0.1, max_l=100), 3) 0.88 - - """ # To ensure that the output of the Jaro-Winkler's similarity # falls between [0,1], the product of l * p needs to be diff --git a/nltk/metrics/paice.py b/nltk/metrics/paice.py index 6068c05137..70c05d0c0c 100644 --- a/nltk/metrics/paice.py +++ b/nltk/metrics/paice.py @@ -28,7 +28,7 @@ def get_words_from_dictionary(lemmas): Get original set of words used for analysis. :param lemmas: A dictionary where keys are lemmas and values are sets - or lists of words corresponding to that lemma. + or lists of words corresponding to that lemma. :type lemmas: dict(str): list(str) :return: Set of words that exist as values in the dictionary :rtype: set(str) @@ -219,9 +219,9 @@ class Paice: def __init__(self, lemmas, stems): """ :param lemmas: A dictionary where keys are lemmas and values are sets - or lists of words corresponding to that lemma. + or lists of words corresponding to that lemma. :param stems: A dictionary where keys are stems and values are sets - or lists of words corresponding to that stem. + or lists of words corresponding to that stem. :type lemmas: dict(str): list(str) :type stems: dict(str): set(str) """ diff --git a/nltk/metrics/segmentation.py b/nltk/metrics/segmentation.py index cd00849dba..ce6253f79b 100644 --- a/nltk/metrics/segmentation.py +++ b/nltk/metrics/segmentation.py @@ -15,7 +15,7 @@ Pevzner, L., and Hearst, M., A Critique and Improvement of an Evaluation Metric for Text Segmentation, -Computational Linguistics 28, 19-36 + Computational Linguistics 28, 19-36 2. Generalized Hamming Distance @@ -156,8 +156,8 @@ def ghd(ref, hyp, ins_cost=2.0, del_cost=2.0, shift_cost_coeff=1.0, boundary="1" :param del_cost: deletion cost :type del_cost: float :param shift_cost_coeff: constant used to compute the cost of a shift. 
- shift cost = shift_cost_coeff * |i - j| where i and j are - the positions indicating the shift + ``shift cost = shift_cost_coeff * |i - j|`` where ``i`` and ``j`` + are the positions indicating the shift :type shift_cost_coeff: float :param boundary: boundary value :type boundary: str or int or bool diff --git a/nltk/misc/minimalset.py b/nltk/misc/minimalset.py index 2d5c18182f..0953418add 100644 --- a/nltk/misc/minimalset.py +++ b/nltk/misc/minimalset.py @@ -63,7 +63,7 @@ def contexts(self, minimum=2): :param minimum: the minimum number of distinct target forms :type minimum: int - :rtype list + :rtype: list """ return [c for c in self._contexts if len(self._seen[c]) >= minimum] diff --git a/nltk/parse/api.py b/nltk/parse/api.py index 659dc7952d..60a1df2f9c 100644 --- a/nltk/parse/api.py +++ b/nltk/parse/api.py @@ -37,7 +37,7 @@ def grammar(self): def parse(self, sent, *args, **kwargs): """ :return: An iterator that generates parse trees for the sentence. - When possible this list is sorted from most likely to least likely. + When possible this list is sorted from most likely to least likely. :param sent: The sentence to be parsed :type sent: list(str) diff --git a/nltk/parse/bllip.py b/nltk/parse/bllip.py index ffc497fa32..e897513530 100644 --- a/nltk/parse/bllip.py +++ b/nltk/parse/bllip.py @@ -142,14 +142,14 @@ def __init__( :type reranker_weights: str :param parser_options: optional dictionary of parser options, see - ``bllipparser.RerankingParser.RerankingParser.load_parser_options()`` - for more information. + ``bllipparser.RerankingParser.RerankingParser.load_parser_options()`` + for more information. :type parser_options: dict(str) :param reranker_options: optional - dictionary of reranker options, see - ``bllipparser.RerankingParser.RerankingParser.load_reranker_model()`` - for more information. + dictionary of reranker options, see + ``bllipparser.RerankingParser.RerankingParser.load_reranker_model()`` + for more information. :type reranker_options: dict(str) """ _ensure_bllip_import_or_error() @@ -173,7 +173,7 @@ def parse(self, sentence): instance's tagger. :return: An iterator that generates parse trees for the sentence - from most likely to least likely. + from most likely to least likely. :param sentence: The sentence to be parsed :type sentence: list(str) @@ -194,7 +194,7 @@ def tagged_parse(self, word_and_tag_pairs): to leave a token's tag unconstrained. :return: An iterator that generates parse trees for the sentence - from most likely to least likely. + from most likely to least likely. :param sentence: Input sentence to parse as (word, tag) pairs :type sentence: list(tuple(str, str)) @@ -224,17 +224,17 @@ def from_unified_model_dir( for more information about unified model directories. :return: A ``BllipParser`` object using the parser and reranker - models in the model directory. + models in the model directory. :param model_dir: Path to the unified model directory. :type model_dir: str :param parser_options: optional dictionary of parser options, see - ``bllipparser.RerankingParser.RerankingParser.load_parser_options()`` - for more information. + ``bllipparser.RerankingParser.RerankingParser.load_parser_options()`` + for more information. :type parser_options: dict(str) :param reranker_options: optional dictionary of reranker options, see - ``bllipparser.RerankingParser.RerankingParser.load_reranker_model()`` - for more information. + ``bllipparser.RerankingParser.RerankingParser.load_reranker_model()`` + for more information. 
:type reranker_options: dict(str) :rtype: BllipParser """ diff --git a/nltk/parse/chart.py b/nltk/parse/chart.py index 873d74f9e7..934df155f8 100644 --- a/nltk/parse/chart.py +++ b/nltk/parse/chart.py @@ -965,7 +965,7 @@ class AbstractChartRule(ChartRuleI): - A default implementation for ``apply``. - A default implementation for ``apply_everywhere``, - (Currently, this implementation assumes that ``NUM_EDGES``<=3.) + (Currently, this implementation assumes that ``NUM_EDGES <= 3``.) - A default implementation for ``__str__``, which returns a name based on the rule's class name. """ diff --git a/nltk/parse/corenlp.py b/nltk/parse/corenlp.py index 77c553c32d..15e5e3fa09 100644 --- a/nltk/parse/corenlp.py +++ b/nltk/parse/corenlp.py @@ -515,7 +515,6 @@ class CoreNLPParser(GenericCoreNLPParser): Mary walks . Special cases - ------------- >>> next( ... parser.raw_parse( @@ -533,7 +532,6 @@ class CoreNLPParser(GenericCoreNLPParser): ... ) ... ).height() 9 - """ _OUTPUT_FORMAT = "penn" @@ -648,7 +646,6 @@ class CoreNLPDependencyParser(GenericCoreNLPParser): . . 2 punct Special cases - ------------- Non-breaking space inside of a token. diff --git a/nltk/parse/dependencygraph.py b/nltk/parse/dependencygraph.py index dd61a346f7..d34639be7e 100755 --- a/nltk/parse/dependencygraph.py +++ b/nltk/parse/dependencygraph.py @@ -51,11 +51,10 @@ def __init__( zpar). :param str cell_separator: the cell separator. If not provided, cells - are split by whitespace. + are split by whitespace. :param str top_relation_label: the label by which the top relation is - identified, for examlple, `ROOT`, `null` or `TOP`. - + identified, for examlple, `ROOT`, `null` or `TOP`. """ self.nodes = defaultdict( lambda: { @@ -212,11 +211,11 @@ def load( """ :param filename: a name of a file in Malt-TAB format :param zero_based: nodes in the input file are numbered starting from 0 - rather than 1 (as produced by, e.g., zpar) + rather than 1 (as produced by, e.g., zpar) :param str cell_separator: the cell separator. If not provided, cells - are split by whitespace. + are split by whitespace. :param str top_relation_label: the label by which the top relation is - identified, for examlple, `ROOT`, `null` or `TOP`. + identified, for examlple, `ROOT`, `null` or `TOP`. :return: a list of DependencyGraphs diff --git a/nltk/parse/featurechart.py b/nltk/parse/featurechart.py index 9ef4f3558b..72174378fb 100644 --- a/nltk/parse/featurechart.py +++ b/nltk/parse/featurechart.py @@ -309,7 +309,7 @@ def apply(self, chart, grammar, left_edge, right_edge): class FeatureSingleEdgeFundamentalRule(SingleEdgeFundamentalRule): """ A specialized version of the completer / single edge fundamental rule - that operates on nonterminals whose symbols are ``FeatStructNonterminal``s. + that operates on nonterminals whose symbols are ``FeatStructNonterminal``. Rather than simply comparing the nonterminals for equality, they are unified. """ @@ -347,7 +347,7 @@ def apply(self, chart, grammar): class FeatureTopDownPredictRule(CachedTopDownPredictRule): r""" A specialized version of the (cached) top down predict rule that operates - on nonterminals whose symbols are ``FeatStructNonterminal``s. Rather + on nonterminals whose symbols are ``FeatStructNonterminal``. Rather than simply comparing the nonterminals for equality, they are unified. @@ -530,7 +530,7 @@ class InstantiateVarsChart(FeatureChart): start with '@', by replacing them with unique new variables. 
In particular, whenever a complete edge is added to the chart, any variables in the edge's ``lhs`` whose names start with '@' will be - replaced by unique new ``Variable``s. + replaced by unique new ``Variable``. """ def __init__(self, tokens): diff --git a/nltk/parse/malt.py b/nltk/parse/malt.py index 0368595fa7..768f4846bf 100644 --- a/nltk/parse/malt.py +++ b/nltk/parse/malt.py @@ -122,20 +122,20 @@ def __init__( An interface for parsing with the Malt Parser. :param parser_dirname: The path to the maltparser directory that - contains the maltparser-1.x.jar + contains the maltparser-1.x.jar :type parser_dirname: str :param model_filename: The name of the pre-trained model with .mco file - extension. If provided, training will not be required. - (see http://www.maltparser.org/mco/mco.html and - see http://www.patful.com/chalk/node/185) + extension. If provided, training will not be required. + (see http://www.maltparser.org/mco/mco.html and + see http://www.patful.com/chalk/node/185) :type model_filename: str :param tagger: The tagger used to POS tag the raw string before - formatting to CONLL format. It should behave like `nltk.pos_tag` + formatting to CONLL format. It should behave like `nltk.pos_tag` :type tagger: function :param additional_java_args: This is the additional Java arguments that - one can use when calling Maltparser, usually this is the heapsize - limits, e.g. `additional_java_args=['-Xmx1024m']` - (see https://goo.gl/mpDBvQ) + one can use when calling Maltparser, usually this is the heapsize + limits, e.g. `additional_java_args=['-Xmx1024m']` + (see https://goo.gl/mpDBvQ) :type additional_java_args: list """ @@ -162,7 +162,7 @@ def parse_tagged_sents(self, sentences, verbose=False, top_relation_label="null" :param sentences: Input sentences to parse :type sentence: list(list(tuple(str, str))) :return: iter(iter(``DependencyGraph``)) the dependency graph - representation of each sentence + representation of each sentence """ if not self._trained: raise Exception("Parser has not been trained. Call train() first.") diff --git a/nltk/parse/nonprojectivedependencyparser.py b/nltk/parse/nonprojectivedependencyparser.py index 4a29f038a0..ed4cc41d18 100644 --- a/nltk/parse/nonprojectivedependencyparser.py +++ b/nltk/parse/nonprojectivedependencyparser.py @@ -39,9 +39,9 @@ def train(self, graphs): """ :type graphs: list(DependencyGraph) :param graphs: A list of dependency graphs to train the scorer. - Typically the edges present in the graphs can be used as - positive training examples, and the edges not present as negative - examples. + Typically the edges present in the graphs can be used as + positive training examples, and the edges not present as negative + examples. """ raise NotImplementedError() @@ -49,21 +49,23 @@ def score(self, graph): """ :type graph: DependencyGraph :param graph: A dependency graph whose set of edges need to be - scored. + scored. :rtype: A three-dimensional list of numbers. :return: The score is returned in a multidimensional(3) list, such - that the outer-dimension refers to the head, and the - inner-dimension refers to the dependencies. For instance, - scores[0][1] would reference the list of scores corresponding to - arcs from node 0 to node 1. The node's 'address' field can be used - to determine its number identification. + that the outer-dimension refers to the head, and the + inner-dimension refers to the dependencies. For instance, + scores[0][1] would reference the list of scores corresponding to + arcs from node 0 to node 1. 
The node's 'address' field can be used + to determine its number identification. For further illustration, a score list corresponding to Fig.2 of - Keith Hall's 'K-best Spanning Tree Parsing' paper: + Keith Hall's 'K-best Spanning Tree Parsing' paper:: + scores = [[[], [5], [1], [1]], [[], [], [11], [4]], [[], [10], [], [5]], [[], [8], [8], []]] + When used in conjunction with a MaxEntClassifier, each score would correspond to the confidence of a particular edge being classified with the positive training examples. @@ -228,7 +230,6 @@ class ProbabilisticNonprojectiveParser: 1 Rule based example - ------------------ >>> from nltk.grammar import DependencyGrammar @@ -349,7 +350,7 @@ def compute_original_indexes(self, new_indexes): :type new_indexes: A list of integers. :param new_indexes: A list of node addresses to check for - subsumed nodes. + subsumed nodes. """ swapped = True while swapped: @@ -374,10 +375,10 @@ def compute_max_subtract_score(self, column_index, cycle_indexes): :type column_index: integer. :param column_index: A index representing the column of incoming arcs - to a particular node being updated + to a particular node being updated :type cycle_indexes: A list of integers. :param cycle_indexes: Only arcs from cycle nodes are considered. This - is a list of such nodes addresses. + is a list of such nodes addresses. """ max_score = -100000 for row_index in cycle_indexes: @@ -393,7 +394,7 @@ def best_incoming_arc(self, node_index): :type node_index: integer. :param node_index: The address of the 'destination' node, - the node that is arced to. + the node that is arced to. """ originals = self.compute_original_indexes([node_index]) logger.debug("originals: %s", originals) diff --git a/nltk/parse/projectivedependencyparser.py b/nltk/parse/projectivedependencyparser.py index c14ee62ff2..aa32c19871 100644 --- a/nltk/parse/projectivedependencyparser.py +++ b/nltk/parse/projectivedependencyparser.py @@ -295,7 +295,7 @@ class ProbabilisticProjectiveDependencyParser: to the one utilized by the rule-based projective parser. 
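Relating back to the ``DependencyScorerI.score()`` layout documented in the ``nonprojectivedependencyparser`` hunk above, a minimal hand-rolled scorer might look like the following sketch. It hard-codes the example score table and assumes only the ``DependencyScorerI`` and ``ProbabilisticNonprojectiveParser`` classes from that module; it mirrors the demo-scorer doctest rather than defining any new NLTK API::

    from nltk.parse.nonprojectivedependencyparser import (
        DependencyScorerI,
        ProbabilisticNonprojectiveParser,
    )

    class FixedScorer(DependencyScorerI):
        def train(self, graphs):
            pass  # nothing to learn: the score table below is fixed

        def score(self, graph):
            # scores[head][dep] holds the candidate scores for an arc
            # from node `head` to node `dep`
            return [[[], [5], [1], [1]],
                    [[], [], [11], [4]],
                    [[], [10], [], [5]],
                    [[], [8], [8], []]]

    parser = ProbabilisticNonprojectiveParser()
    parser.train([], FixedScorer())
    parses = parser.parse(["v1", "v2", "v3"], [None, None, None])
    print(len(list(parses)))   # a single best non-projective analysis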
Usage example - ------------- + >>> from nltk.parse.dependencygraph import conll_data2 >>> graphs = [ diff --git a/nltk/parse/transitionparser.py b/nltk/parse/transitionparser.py index ea041ec755..0a212dce27 100644 --- a/nltk/parse/transitionparser.py +++ b/nltk/parse/transitionparser.py @@ -206,8 +206,9 @@ def __init__(self, alg_option): def left_arc(self, conf, relation): """ Note that the algorithm for left-arc is quite similar except for precondition for both arc-standard and arc-eager - :param configuration: is the current configuration - :return : A new configuration or -1 if the pre-condition is not satisfied + + :param configuration: is the current configuration + :return: A new configuration or -1 if the pre-condition is not satisfied """ if (len(conf.buffer) <= 0) or (len(conf.stack) <= 0): return -1 @@ -233,8 +234,9 @@ def left_arc(self, conf, relation): def right_arc(self, conf, relation): """ Note that the algorithm for right-arc is DIFFERENT for arc-standard and arc-eager - :param configuration: is the current configuration - :return : A new configuration or -1 if the pre-condition is not satisfied + + :param configuration: is the current configuration + :return: A new configuration or -1 if the pre-condition is not satisfied """ if (len(conf.buffer) <= 0) or (len(conf.stack) <= 0): return -1 @@ -252,8 +254,9 @@ def right_arc(self, conf, relation): def reduce(self, conf): """ Note that the algorithm for reduce is only available for arc-eager - :param configuration: is the current configuration - :return : A new configuration or -1 if the pre-condition is not satisfied + + :param configuration: is the current configuration + :return: A new configuration or -1 if the pre-condition is not satisfied """ if self._algo != TransitionParser.ARC_EAGER: @@ -274,8 +277,9 @@ def reduce(self, conf): def shift(self, conf): """ Note that the algorithm for shift is the SAME for arc-standard and arc-eager - :param configuration: is the current configuration - :return : A new configuration or -1 if the pre-condition is not satisfied + + :param configuration: is the current configuration + :return: A new configuration or -1 if the pre-condition is not satisfied """ if len(conf.buffer) <= 0: return -1 diff --git a/nltk/parse/util.py b/nltk/parse/util.py index 6159ad68d1..b2714e0eca 100644 --- a/nltk/parse/util.py +++ b/nltk/parse/util.py @@ -160,8 +160,10 @@ def __init__(self, grammar, suite, accept=None, reject=None): def run(self, show_trees=False): """ Sentences in the test suite are divided into two classes: - - grammatical (``accept``) and - - ungrammatical (``reject``). + + - grammatical (``accept``) and + - ungrammatical (``reject``). + If a sentence should parse according to the grammar, the value of ``trees`` will be a non-empty list. If a sentence should be rejected according to the grammar, then the value of ``trees`` will be None. @@ -195,8 +197,10 @@ def extract_test_sentences(string, comment_chars="#%;", encoding=None): """ Parses a string with one test sentence per line. Lines can optionally begin with: - - a bool, saying if the sentence is grammatical or not, or - - an int, giving the number of parse trees is should have, + + - a bool, saying if the sentence is grammatical or not, or + - an int, giving the number of parse trees is should have, + The result information is followed by a colon, and then the sentence. Empty lines and lines beginning with a comment char are ignored. 
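To make the test-suite line format described above concrete, here is a small sketch. It assumes only the ``extract_test_sentences`` signature shown in the hunk above and makes no claim about the exact shape of the returned entries::

    from nltk.parse.util import extract_test_sentences

    # One test sentence per line: an optional bool (grammatical or not) or an
    # int (expected number of parses), then a colon, then the sentence itself.
    suite = (
        "True: the dog barks\n"
        "False: dog the barks\n"
        "2: they saw her duck\n"
        "# comment lines and empty lines are ignored\n"
    )

    for entry in extract_test_sentences(suite):
        # each entry pairs a tokenized sentence with its expected result
        print(entry)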
diff --git a/nltk/probability.py b/nltk/probability.py index cd235fdb28..4e7cbdd236 100755 --- a/nltk/probability.py +++ b/nltk/probability.py @@ -252,7 +252,7 @@ def plot( displaying the most frequent sample first. If an integer parameter is supplied, stop after this many samples have been plotted. For a cumulative plot, specify cumulative=True. Additional - **kwargs are passed to matplotlib's plot function. + ``**kwargs`` are passed to matplotlib's plot function. (Requires Matplotlib to be installed.) :param title: The title for the graph. @@ -1932,8 +1932,8 @@ def plot( ): """ Plot the given samples from the conditional frequency distribution. - For a cumulative plot, specify cumulative=True. Additional *args and - **kwargs are passed to matplotlib's plot function. + For a cumulative plot, specify cumulative=True. Additional ``*args`` and + ``**kwargs`` are passed to matplotlib's plot function. (Requires Matplotlib to be installed.) :param samples: The samples to plot diff --git a/nltk/sem/boxer.py b/nltk/sem/boxer.py index c5b6ded6a0..c824cece31 100644 --- a/nltk/sem/boxer.py +++ b/nltk/sem/boxer.py @@ -13,16 +13,19 @@ This interface relies on the latest version of the development (subversion) version of C&C and Boxer. -Usage: - Set the environment variable CANDC to the bin directory of your CandC installation. - The models directory should be in the CandC root directory. - For example: - /path/to/candc/ - bin/ - candc - boxer - models/ - boxer/ +Usage +===== + +Set the environment variable CANDC to the bin directory of your CandC installation. +The models directory should be in the CandC root directory. +For example:: + + /path/to/candc/ + bin/ + candc + boxer + models/ + boxer/ """ import operator @@ -69,14 +72,14 @@ def __init__( ): """ :param boxer_drs_interpreter: A class that converts from the - ``AbstractBoxerDrs`` object hierarchy to a different object. The - default is ``NltkDrtBoxerDrsInterpreter``, which converts to the NLTK - DRT hierarchy. + ``AbstractBoxerDrs`` object hierarchy to a different object. The + default is ``NltkDrtBoxerDrsInterpreter``, which converts to the NLTK + DRT hierarchy. :param elimeq: When set to true, Boxer removes all equalities from the - DRSs and discourse referents standing in the equality relation are - unified, but only if this can be done in a meaning-preserving manner. + DRSs and discourse referents standing in the equality relation are + unified, but only if this can be done in a meaning-preserving manner. :param resolve: When set to true, Boxer will resolve all anaphoric DRSs and perform merge-reduction. - Resolution follows Van der Sandt's theory of binding and accommodation. + Resolution follows Van der Sandt's theory of binding and accommodation. 
""" if boxer_drs_interpreter is None: boxer_drs_interpreter = NltkDrtBoxerDrsInterpreter() diff --git a/nltk/sem/chat80.py b/nltk/sem/chat80.py index 042fc8fa7d..63704d70bd 100644 --- a/nltk/sem/chat80.py +++ b/nltk/sem/chat80.py @@ -255,12 +255,12 @@ def __init__(self, prefLabel, arity, altLabels=[], closures=[], extension=set()) :type prefLabel: str :param arity: the arity of the concept :type arity: int - @keyword altLabels: other (related) labels + :param altLabels: other (related) labels :type altLabels: list - @keyword closures: closure properties of the extension \ + :param closures: closure properties of the extension (list items can be ``symmetric``, ``reflexive``, ``transitive``) :type closures: list - @keyword extension: the extensional value of the concept + :param extension: the extensional value of the concept :type extension: set """ self.prefLabel = prefLabel diff --git a/nltk/sem/drt.py b/nltk/sem/drt.py index be236f02d3..23c43c6f44 100644 --- a/nltk/sem/drt.py +++ b/nltk/sem/drt.py @@ -304,7 +304,7 @@ class DRS(DrtExpression, Expression): def __init__(self, refs, conds, consequent=None): """ :param refs: list of ``DrtIndividualVariableExpression`` for the - discourse referents + discourse referents :param conds: list of ``Expression`` for the conditions """ self.refs = refs diff --git a/nltk/sem/glue.py b/nltk/sem/glue.py index fa2990ef56..5b3c945571 100644 --- a/nltk/sem/glue.py +++ b/nltk/sem/glue.py @@ -414,7 +414,7 @@ def get_glueformulas_from_semtype_entry( def get_meaning_formula(self, generic, word): """ :param generic: A meaning formula string containing the - parameter "" + parameter "" :param word: The actual word to be replace "" """ word = word.replace(".", "") diff --git a/nltk/sem/logic.py b/nltk/sem/logic.py index b5cf21530f..1bb1ee8de9 100644 --- a/nltk/sem/logic.py +++ b/nltk/sem/logic.py @@ -99,8 +99,9 @@ class LogicParser: def __init__(self, type_check=False): """ - :param type_check: bool should type checking be performed? - to their types. + :param type_check: should type checking be performed + to their types? + :type type_check: bool """ assert isinstance(type_check, bool) @@ -139,7 +140,7 @@ def parse(self, data, signature=None): :param data: str for the input to be parsed :param signature: ``dict`` that maps variable names to type - strings + strings :returns: a parsed Expression """ data = data.rstrip() diff --git a/nltk/sem/util.py b/nltk/sem/util.py index 1429e657d6..7cc77fd9ad 100644 --- a/nltk/sem/util.py +++ b/nltk/sem/util.py @@ -31,7 +31,7 @@ def parse_sents(inputs, grammar, trace=0): :param grammar: ``FeatureGrammar`` or name of feature-based grammar :type grammar: nltk.grammar.FeatureGrammar :rtype: list(nltk.tree.Tree) or dict(list(str)): list(Tree) - :return: a mapping from input sentences to a list of ``Tree``s + :return: a mapping from input sentences to a list of ``Tree`` instances. """ # put imports here to avoid circult dependencies from nltk.grammar import FeatureGrammar diff --git a/nltk/sentiment/sentiment_analyzer.py b/nltk/sentiment/sentiment_analyzer.py index e46a2e9f40..2253cdc990 100644 --- a/nltk/sentiment/sentiment_analyzer.py +++ b/nltk/sentiment/sentiment_analyzer.py @@ -37,6 +37,7 @@ def __init__(self, classifier=None): def all_words(self, documents, labeled=None): """ Return all words/tokens from the documents (with duplicates). + :param documents: a list of (words, label) tuples. :param labeled: if `True`, assume that each document is represented by a (words, label) tuple: (list(str), str). 
If `False`, each document is diff --git a/nltk/sentiment/util.py b/nltk/sentiment/util.py index e949ff0a6b..fa804ffb1a 100644 --- a/nltk/sentiment/util.py +++ b/nltk/sentiment/util.py @@ -464,8 +464,9 @@ def demo_tweets(trainer, n_instances=None, output=None): Train and test Naive Bayes classifier on 10000 tweets, tokenized using TweetTokenizer. Features are composed of: - - 1000 most frequent unigrams - - 100 top bigrams (using BigramAssocMeasures.pmi) + + - 1000 most frequent unigrams + - 100 top bigrams (using BigramAssocMeasures.pmi) :param trainer: `train` method of a classifier. :param n_instances: the number of total tweets that have to be used for @@ -554,7 +555,8 @@ def demo_movie_reviews(trainer, n_instances=None, output=None): The corpus has been preprocessed using the default sentence tokenizer and WordPunctTokenizer. Features are composed of: - - most frequent unigrams + + - most frequent unigrams :param trainer: `train` method of a classifier. :param n_instances: the number of total reviews that have to be used for diff --git a/nltk/stem/porter.py b/nltk/stem/porter.py index 807cdac933..d7ea6108cc 100644 --- a/nltk/stem/porter.py +++ b/nltk/stem/porter.py @@ -43,30 +43,33 @@ class PorterStemmer(StemmerI): passing the appropriate constant to the class constructor's `mode` attribute: - PorterStemmer.ORIGINAL_ALGORITHM - - Implementation that is faithful to the original paper. - - Note that Martin Porter has deprecated this version of the - algorithm. Martin distributes implementations of the Porter - Stemmer in many languages, hosted at: - - https://www.tartarus.org/~martin/PorterStemmer/ - - and all of these implementations include his extensions. He - strongly recommends against using the original, published - version of the algorithm; only use this mode if you clearly - understand why you are choosing to do so. - - PorterStemmer.MARTIN_EXTENSIONS - - Implementation that only uses the modifications to the - algorithm that are included in the implementations on Martin - Porter's website. He has declared Porter frozen, so the - behaviour of those implementations should never change. - - PorterStemmer.NLTK_EXTENSIONS (default) - - Implementation that includes further improvements devised by - NLTK contributors or taken from other modified implementations - found on the web. + - PorterStemmer.ORIGINAL_ALGORITHM + + An implementation that is faithful to the original paper. + + Note that Martin Porter has deprecated this version of the + algorithm. Martin distributes implementations of the Porter + Stemmer in many languages, hosted at: + + https://www.tartarus.org/~martin/PorterStemmer/ + + and all of these implementations include his extensions. He + strongly recommends against using the original, published + version of the algorithm; only use this mode if you clearly + understand why you are choosing to do so. + + - PorterStemmer.MARTIN_EXTENSIONS + + An implementation that only uses the modifications to the + algorithm that are included in the implementations on Martin + Porter's website. He has declared Porter frozen, so the + behaviour of those implementations should never change. + + - PorterStemmer.NLTK_EXTENSIONS (default) + + An implementation that includes further improvements devised by + NLTK contributors or taken from other modified implementations + found on the web. For the best stemming, you should use the default NLTK_EXTENSIONS version. 
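A short, hedged sketch of switching between the three modes listed above; it assumes only the ``mode`` constructor argument and the class constants documented in this hunk, and does not assert any particular stems::

    from nltk.stem.porter import PorterStemmer

    word = "multiply"
    for mode in (PorterStemmer.NLTK_EXTENSIONS,
                 PorterStemmer.MARTIN_EXTENSIONS,
                 PorterStemmer.ORIGINAL_ALGORITHM):
        stemmer = PorterStemmer(mode=mode)
        # the three modes can produce different stems for some words
        print(mode, stemmer.stem(word))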
However, if you need to get the same results as either the diff --git a/nltk/stem/snowball.py b/nltk/stem/snowball.py index cb17df9875..2fd0e3a4c4 100644 --- a/nltk/stem/snowball.py +++ b/nltk/stem/snowball.py @@ -309,10 +309,15 @@ class ArabicStemmer(_StandardStemmer): """ https://github.com/snowballstem/snowball/blob/master/algorithms/arabic/stem_Unicode.sbl (Original Algorithm) The Snowball Arabic light Stemmer - Algorithm : Assem Chelli - Abdelkrim Aries - Lakhdar Benzahia - Nltk Version Author : Lakhdar Benzahia + Algorithm: + + - Assem Chelli + - Abdelkrim Aries + - Lakhdar Benzahia + + NLTK Version Author: + + - Lakhdar Benzahia """ # Normalize_pre stes @@ -807,7 +812,8 @@ def __Prefix_Step4_Verb(self, token): def stem(self, word): """ - Stem an Arabic word and return the stemmed form. + Stem an Arabic word and return the stemmed form. + :param word: string :return: string """ diff --git a/nltk/stem/util.py b/nltk/stem/util.py index 250e5ff088..a3dbe97efc 100644 --- a/nltk/stem/util.py +++ b/nltk/stem/util.py @@ -15,7 +15,8 @@ def suffix_replace(original, old, new): def prefix_replace(original, old, new): """ - Replaces the old prefix of the original string by a new suffix + Replaces the old prefix of the original string by a new suffix + :param original: string :param old: string :param new: string diff --git a/nltk/tag/brill_trainer.py b/nltk/tag/brill_trainer.py index e82be79f93..3e18a22b17 100644 --- a/nltk/tag/brill_trainer.py +++ b/nltk/tag/brill_trainer.py @@ -98,12 +98,12 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None): *min_score*, and each of which has accuracy not lower than *min_acc*. - #imports + >>> # Relevant imports >>> from nltk.tbl.template import Template >>> from nltk.tag.brill import Pos, Word >>> from nltk.tag import untag, RegexpTagger, BrillTaggerTrainer - #some data + >>> # Load some data >>> from nltk.corpus import treebank >>> training_data = treebank.tagged_sents()[:100] >>> baseline_data = treebank.tagged_sents()[100:200] @@ -127,11 +127,11 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None): >>> baseline.evaluate(gold_data) #doctest: +ELLIPSIS 0.2450142... - #templates + >>> # Set up templates >>> Template._cleartemplates() #clear any templates created in earlier tests >>> templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))] - #construct a BrillTaggerTrainer + >>> # Construct a BrillTaggerTrainer >>> tt = BrillTaggerTrainer(baseline, templates, trace=3) >>> tagger1 = tt.train(training_data, max_rules=10) @@ -187,7 +187,7 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None): >>> [test_stats[stat] for stat in ['initialerrors', 'finalerrors', 'rulescores']] [1855, 1376, [100, 85, 67, 58, 27, 36, 27, 16, 31, 32]] - # a high-accuracy tagger + >>> # A high-accuracy tagger >>> tagger2 = tt.train(training_data, max_rules=10, min_acc=0.99) TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: 0.99) Finding initial useful rules... 
@@ -233,7 +233,6 @@ def train(self, train_sents, max_rules=200, min_score=2, min_acc=None): :type min_acc: float or None :return: the learned tagger :rtype: BrillTagger - """ # FIXME: several tests are a bit too dependent on tracing format # FIXME: tests in trainer.fast and trainer.brillorig are exact duplicates diff --git a/nltk/tag/crf.py b/nltk/tag/crf.py index 02eadb0b1d..cdcb4623be 100644 --- a/nltk/tag/crf.py +++ b/nltk/tag/crf.py @@ -43,39 +43,38 @@ class CRFTagger(TaggerI): >>> ct.set_model_file('model.crf.tagger') >>> ct.evaluate(gold_sentences) 1.0 - """ def __init__(self, feature_func=None, verbose=False, training_opt={}): """ Initialize the CRFSuite tagger + :param feature_func: The function that extracts features for each token of a sentence. This function should take - 2 parameters: tokens and index which extract features at index position from tokens list. See the build in - _get_features function for more detail. + 2 parameters: tokens and index which extract features at index position from tokens list. See the build in + _get_features function for more detail. :param verbose: output the debugging messages during training. :type verbose: boolean :param training_opt: python-crfsuite training options - :type training_opt : dictionary + :type training_opt: dictionary Set of possible training options (using LBFGS training algorithm). - 'feature.minfreq' : The minimum frequency of features. - 'feature.possible_states' : Force to generate possible state features. - 'feature.possible_transitions' : Force to generate possible transition features. - 'c1' : Coefficient for L1 regularization. - 'c2' : Coefficient for L2 regularization. - 'max_iterations' : The maximum number of iterations for L-BFGS optimization. - 'num_memories' : The number of limited memories for approximating the inverse hessian matrix. - 'epsilon' : Epsilon for testing the convergence of the objective. - 'period' : The duration of iterations to test the stopping criterion. - 'delta' : The threshold for the stopping criterion; an L-BFGS iteration stops when the - improvement of the log likelihood over the last ${period} iterations is no greater than this threshold. - 'linesearch' : The line search algorithm used in L-BFGS updates: - { 'MoreThuente': More and Thuente's method, - 'Backtracking': Backtracking method with regular Wolfe condition, - 'StrongBacktracking': Backtracking method with strong Wolfe condition - } - 'max_linesearch' : The maximum number of trials for the line search algorithm. - + :'feature.minfreq': The minimum frequency of features. + :'feature.possible_states': Force to generate possible state features. + :'feature.possible_transitions': Force to generate possible transition features. + :'c1': Coefficient for L1 regularization. + :'c2': Coefficient for L2 regularization. + :'max_iterations': The maximum number of iterations for L-BFGS optimization. + :'num_memories': The number of limited memories for approximating the inverse hessian matrix. + :'epsilon': Epsilon for testing the convergence of the objective. + :'period': The duration of iterations to test the stopping criterion. + :'delta': The threshold for the stopping criterion; an L-BFGS iteration stops when the + improvement of the log likelihood over the last ${period} iterations is no greater than this threshold. 
+ :'linesearch': The line search algorithm used in L-BFGS updates: + + - 'MoreThuente': More and Thuente's method, + - 'Backtracking': Backtracking method with regular Wolfe condition, + - 'StrongBacktracking': Backtracking method with strong Wolfe condition + :'max_linesearch': The maximum number of trials for the line search algorithm. """ self._model_file = "" @@ -97,16 +96,16 @@ def set_model_file(self, model_file): def _get_features(self, tokens, idx): """ Extract basic features about this word including - - Current Word - - Is Capitalized ? - - Has Punctuation ? - - Has Number ? - - Suffixes up to length 3 - Note that : we might include feature over previous word, next word etc. + - Current word + - is it capitalized? + - Does it have punctuation? + - Does it have a number? + - Suffixes up to length 3 - :return : a list which contains the features - :rtype : list(str) + Note that : we might include feature over previous word, next word etc. + :return: a list which contains the features + :rtype: list(str) """ token = tokens[idx] @@ -143,12 +142,14 @@ def _get_features(self, tokens, idx): def tag_sents(self, sents): """ Tag a list of sentences. NB before using this function, user should specify the mode_file either by - - Train a new model using ``train'' function - - Use the pre-trained model which is set via ``set_model_file'' function - :params sentences : list of sentences needed to tag. - :type sentences : list(list(str)) - :return : list of tagged sentences. - :rtype : list (list (tuple(str,str))) + + - Train a new model using ``train`` function + - Use the pre-trained model which is set via ``set_model_file`` function + + :params sentences: list of sentences needed to tag. + :type sentences: list(list(str)) + :return: list of tagged sentences. + :rtype: list(list(tuple(str,str))) """ if self._model_file == "": raise Exception( @@ -193,12 +194,14 @@ def train(self, train_data, model_file): def tag(self, tokens): """ Tag a sentence using Python CRFSuite Tagger. NB before using this function, user should specify the mode_file either by - - Train a new model using ``train'' function - - Use the pre-trained model which is set via ``set_model_file'' function - :params tokens : list of tokens needed to tag. - :type tokens : list(str) - :return : list of tagged tokens. - :rtype : list (tuple(str,str)) + + - Train a new model using ``train`` function + - Use the pre-trained model which is set via ``set_model_file`` function + + :params tokens: list of tokens needed to tag. + :type tokens: list(str) + :return: list of tagged tokens. + :rtype: list(tuple(str,str)) """ return self.tag_sents([tokens])[0] diff --git a/nltk/tag/perceptron.py b/nltk/tag/perceptron.py index 6e12f3eb6e..a18c0c2069 100644 --- a/nltk/tag/perceptron.py +++ b/nltk/tag/perceptron.py @@ -125,7 +125,7 @@ class PerceptronTagger(TaggerI): """ Greedy Averaged Perceptron tagger, as implemented by Matthew Honnibal. See more implementation details here: - https://explosion.ai/blog/part-of-speech-pos-tagger-in-python + https://explosion.ai/blog/part-of-speech-pos-tagger-in-python >>> from nltk.tag.perceptron import PerceptronTagger diff --git a/nltk/tag/senna.py b/nltk/tag/senna.py index edba2c3530..cd6c0b9508 100644 --- a/nltk/tag/senna.py +++ b/nltk/tag/senna.py @@ -9,33 +9,34 @@ Senna POS tagger, NER Tagger, Chunk Tagger The input is: + - path to the directory that contains SENNA executables. 
If the path is incorrect, - SennaTagger will automatically search for executable file specified in SENNA environment variable + SennaTagger will automatically search for executable file specified in SENNA environment variable - (optionally) the encoding of the input data (default:utf-8) Note: Unit tests for this module can be found in test/unit/test_senna.py - >>> from nltk.tag import SennaTagger - >>> tagger = SennaTagger('/usr/share/senna-v3.0') - >>> tagger.tag('What is the airspeed of an unladen swallow ?'.split()) # doctest: +SKIP - [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), - ('of', 'IN'), ('an', 'DT'), ('unladen', 'NN'), ('swallow', 'NN'), ('?', '.')] - - >>> from nltk.tag import SennaChunkTagger - >>> chktagger = SennaChunkTagger('/usr/share/senna-v3.0') - >>> chktagger.tag('What is the airspeed of an unladen swallow ?'.split()) # doctest: +SKIP - [('What', 'B-NP'), ('is', 'B-VP'), ('the', 'B-NP'), ('airspeed', 'I-NP'), - ('of', 'B-PP'), ('an', 'B-NP'), ('unladen', 'I-NP'), ('swallow', 'I-NP'), - ('?', 'O')] - - >>> from nltk.tag import SennaNERTagger - >>> nertagger = SennaNERTagger('/usr/share/senna-v3.0') - >>> nertagger.tag('Shakespeare theatre was in London .'.split()) # doctest: +SKIP - [('Shakespeare', 'B-PER'), ('theatre', 'O'), ('was', 'O'), ('in', 'O'), - ('London', 'B-LOC'), ('.', 'O')] - >>> nertagger.tag('UN headquarters are in NY , USA .'.split()) # doctest: +SKIP - [('UN', 'B-ORG'), ('headquarters', 'O'), ('are', 'O'), ('in', 'O'), - ('NY', 'B-LOC'), (',', 'O'), ('USA', 'B-LOC'), ('.', 'O')] +>>> from nltk.tag import SennaTagger +>>> tagger = SennaTagger('/usr/share/senna-v3.0') +>>> tagger.tag('What is the airspeed of an unladen swallow ?'.split()) # doctest: +SKIP +[('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), +('of', 'IN'), ('an', 'DT'), ('unladen', 'NN'), ('swallow', 'NN'), ('?', '.')] + +>>> from nltk.tag import SennaChunkTagger +>>> chktagger = SennaChunkTagger('/usr/share/senna-v3.0') +>>> chktagger.tag('What is the airspeed of an unladen swallow ?'.split()) # doctest: +SKIP +[('What', 'B-NP'), ('is', 'B-VP'), ('the', 'B-NP'), ('airspeed', 'I-NP'), +('of', 'B-PP'), ('an', 'B-NP'), ('unladen', 'I-NP'), ('swallow', 'I-NP'), +('?', 'O')] + +>>> from nltk.tag import SennaNERTagger +>>> nertagger = SennaNERTagger('/usr/share/senna-v3.0') +>>> nertagger.tag('Shakespeare theatre was in London .'.split()) # doctest: +SKIP +[('Shakespeare', 'B-PER'), ('theatre', 'O'), ('was', 'O'), ('in', 'O'), +('London', 'B-LOC'), ('.', 'O')] +>>> nertagger.tag('UN headquarters are in NY , USA .'.split()) # doctest: +SKIP +[('UN', 'B-ORG'), ('headquarters', 'O'), ('are', 'O'), ('in', 'O'), +('NY', 'B-LOC'), (',', 'O'), ('USA', 'B-LOC'), ('.', 'O')] """ from nltk.classify import Senna diff --git a/nltk/tag/tnt.py b/nltk/tag/tnt.py index bfcfd87e58..9174e498c7 100755 --- a/nltk/tag/tnt.py +++ b/nltk/tag/tnt.py @@ -88,13 +88,13 @@ def __init__(self, unk=None, Trained=False, N=1000, C=False): before being used to tag input. 
:param unk: instance of a POS tagger, conforms to TaggerI - :type unk:(TaggerI) + :type unk: TaggerI :param Trained: Indication that the POS tagger is trained or not - :type Trained: boolean + :type Trained: bool :param N: Beam search degree (see above) - :type N:(int) + :type N: int :param C: Capitalization flag - :type C: boolean + :type C: bool Initializer, creates frequency distributions to be used for tagging diff --git a/nltk/tbl/feature.py b/nltk/tbl/feature.py index 1d4f619cf7..8fe9617483 100644 --- a/nltk/tbl/feature.py +++ b/nltk/tbl/feature.py @@ -36,23 +36,23 @@ def __init__(self, positions, end=None): """ Construct a Feature which may apply at C{positions}. - #For instance, importing some concrete subclasses (Feature is abstract) + >>> # For instance, importing some concrete subclasses (Feature is abstract) >>> from nltk.tag.brill import Word, Pos - #Feature Word, applying at one of [-2, -1] + >>> # Feature Word, applying at one of [-2, -1] >>> Word([-2,-1]) Word([-2, -1]) - #Positions need not be contiguous + >>> # Positions need not be contiguous >>> Word([-2,-1, 1]) Word([-2, -1, 1]) - #Contiguous ranges can alternatively be specified giving the - #two endpoints (inclusive) + >>> # Contiguous ranges can alternatively be specified giving the + >>> # two endpoints (inclusive) >>> Pos(-3, -1) Pos([-3, -2, -1]) - #In two-arg form, start <= end is enforced + >>> # In two-arg form, start <= end is enforced >>> Pos(2, 1) Traceback (most recent call last): File "", line 1, in @@ -71,7 +71,6 @@ def __init__(self, positions, end=None): :param start: start of range where this feature should apply :type end: int :param end: end of range (NOTE: inclusive!) where this feature should apply - """ self.positions = None # to avoid warnings if end is None: @@ -113,10 +112,12 @@ def expand(cls, starts, winlens, excludezero=False): target feature at [0]) For instance, importing a concrete subclass (Feature is abstract) + >>> from nltk.tag.brill import Word First argument gives the possible start positions, second the possible window lengths + >>> Word.expand([-3,-2,-1], [1]) [Word([-3]), Word([-2]), Word([-1])] @@ -129,7 +130,8 @@ def expand(cls, starts, winlens, excludezero=False): >>> Word.expand([-2,-1], [1]) [Word([-2]), Word([-1])] - a third optional argument excludes all Features whose positions contain zero + A third optional argument excludes all Features whose positions contain zero + >>> Word.expand([-2,-1,0], [1,2], excludezero=False) [Word([-2]), Word([-1]), Word([0]), Word([-2, -1]), Word([-1, 0])] @@ -137,6 +139,7 @@ def expand(cls, starts, winlens, excludezero=False): [Word([-2]), Word([-1]), Word([-2, -1])] All window lengths must be positive + >>> Word.expand([-2,-1], [0]) Traceback (most recent call last): File "", line 1, in diff --git a/nltk/tbl/rule.py b/nltk/tbl/rule.py index c39df01073..ee899859c2 100644 --- a/nltk/tbl/rule.py +++ b/nltk/tbl/rule.py @@ -106,9 +106,9 @@ class Rule(TagRule): - The M{n}th token is tagged with the Rule's original tag; and - For each (Feature(positions), M{value}) tuple: + - The value of Feature of at least one token in {n+p for p in positions} is M{value}. - """ json_tag = "nltk.tbl.Rule" @@ -119,15 +119,15 @@ def __init__(self, templateid, original_tag, replacement_tag, conditions): C{original_tag} to C{replacement_tag} if all of the properties specified in C{conditions} hold. 
- @type templateid: string - @param templateid: the template id (a zero-padded string, '001' etc, - so it will sort nicely) + :param templateid: the template id (a zero-padded string, '001' etc, + so it will sort nicely) + :type templateid: string - @type conditions: C{iterable} of C{Feature} - @param conditions: A list of Feature(positions), + :param conditions: A list of Feature(positions), each of which specifies that the property (computed by Feature.extract_property()) of at least one token in M{n} + p in positions is C{value}. + :type conditions: C{iterable} of C{Feature} """ TagRule.__init__(self, original_tag, replacement_tag) diff --git a/nltk/tbl/template.py b/nltk/tbl/template.py index cfa9d9ac94..d039c7a058 100644 --- a/nltk/tbl/template.py +++ b/nltk/tbl/template.py @@ -25,19 +25,19 @@ class BrillTemplateI(metaclass=ABCMeta): def applicable_rules(self, tokens, i, correctTag): """ Return a list of the transformational rules that would correct - the *i*th subtoken's tag in the given token. In particular, + the ``i``-th subtoken's tag in the given token. In particular, return a list of zero or more rules that would change - *tokens*[i][1] to *correctTag*, if applied to *token*[i]. + ``tokens[i][1]`` to ``correctTag``, if applied to ``token[i]``. - If the *i*th token already has the correct tag (i.e., if - tagged_tokens[i][1] == correctTag), then + If the ``i``-th token already has the correct tag (i.e., if + ``tagged_tokens[i][1] == correctTag``), then ``applicable_rules()`` should return the empty list. :param tokens: The tagged tokens being tagged. :type tokens: list(tuple) :param i: The index of the token whose tag should be corrected. :type i: int - :param correctTag: The correct tag for the *i*th token. + :param correctTag: The correct tag for the ``i``-th token. :type correctTag: any :rtype: list(BrillRule) """ @@ -90,32 +90,38 @@ def __init__(self, *features): In new code, that would be better written Template(Feature(start1, end1), Feature(start2, end2), ...) 
- #For instance, importing some features + For instance, importing some features + >>> from nltk.tbl.template import Template >>> from nltk.tag.brill import Word, Pos - #create some features + Create some features >>> wfeat1, wfeat2, pfeat = (Word([-1]), Word([1,2]), Pos([-2,-1])) - #Create a single-feature template + Create a single-feature template + >>> Template(wfeat1) Template(Word([-1])) - #or a two-feature one + Or a two-feature one + >>> Template(wfeat1, wfeat2) Template(Word([-1]),Word([1, 2])) - #or a three-feature one with two different feature types + Or a three-feature one with two different feature types + >>> Template(wfeat1, wfeat2, pfeat) Template(Word([-1]),Word([1, 2]),Pos([-2, -1])) - #deprecated api: Feature subclass, followed by list of (start,end) pairs - #(permits only a single Feature) + deprecated api: Feature subclass, followed by list of (start,end) pairs + (permits only a single Feature) + >>> Template(Word, (-2,-1), (0,0)) Template(Word([-2, -1]),Word([0])) - #incorrect specification raises TypeError + Incorrect specification raises TypeError + >>> Template(Word, (-2,-1), Pos, (0,0)) Traceback (most recent call last): File "", line 1, in diff --git a/nltk/test/unit/test_distance.py b/nltk/test/unit/test_distance.py index bea1b542c2..23a5b9c4fc 100644 --- a/nltk/test/unit/test_distance.py +++ b/nltk/test/unit/test_distance.py @@ -99,16 +99,16 @@ class TestEditDistance: def test_with_transpositions( self, left: str, right: str, substitution_cost: int, expecteds: Tuple[int, int] ): - """Test `edit_distance` between two strings, given some `substitution_cost`, + """ + Test `edit_distance` between two strings, given some `substitution_cost`, and whether transpositions are allowed. - Args: - left (str): First input string to `edit_distance`. - right (str): Second input string to `edit_distance`. - substitution_cost (int): The cost of a substitution action in `edit_distance`. - expecteds (Tuple[int, int]): A tuple of expected outputs, such that `expecteds[0]` is - the expected output with `transpositions=True`, and `expecteds[1]` is - the expected output with `transpositions=False`. + :param str left: First input string to `edit_distance`. + :param str right: Second input string to `edit_distance`. + :param int substitution_cost: The cost of a substitution action in `edit_distance`. + :param Tuple[int, int] expecteds: A tuple of expected outputs, such that `expecteds[0]` is + the expected output with `transpositions=True`, and `expecteds[1]` is + the expected output with `transpositions=False`. 
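For reference, the behaviour exercised by this test can be reproduced directly with ``edit_distance``; a small illustrative snippet (the values in the comments are the distances expected for this particular pair of strings)::

    from nltk.metrics.distance import edit_distance

    # transposing adjacent characters counts as a single edit only when
    # transpositions=True; otherwise "ab" -> "ba" needs two substitutions
    print(edit_distance("ab", "ba", transpositions=True))    # 1
    print(edit_distance("ab", "ba", transpositions=False))   # 2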
""" # Test the input strings in both orderings for s1, s2 in ((left, right), (right, left)): diff --git a/nltk/test/unit/test_stem.py b/nltk/test/unit/test_stem.py index 95bed812d1..fefadbcd9e 100644 --- a/nltk/test/unit/test_stem.py +++ b/nltk/test/unit/test_stem.py @@ -88,12 +88,11 @@ def _test_against_expected_output(self, stemmer_mode, expected_stems): def test_vocabulary_martin_mode(self): """Tests all words from the test vocabulary provided by M Porter - The sample vocabulary and output were sourced from: - https://tartarus.org/martin/PorterStemmer/voc.txt - https://tartarus.org/martin/PorterStemmer/output.txt + The sample vocabulary and output were sourced from + https://tartarus.org/martin/PorterStemmer/voc.txt and + https://tartarus.org/martin/PorterStemmer/output.txt and are linked to from the Porter Stemmer algorithm's homepage - at - https://tartarus.org/martin/PorterStemmer/ + at https://tartarus.org/martin/PorterStemmer/ """ with closing( data.find("stemmers/porter_test/porter_martin_output.txt").open( diff --git a/nltk/text.py b/nltk/text.py index dd734aa1be..3e3388e532 100644 --- a/nltk/text.py +++ b/nltk/text.py @@ -569,9 +569,8 @@ def generate(self, length=100, text_seed=None, random_seed=42): :type text_seed: list(str) :param random_seed: A random seed or an instance of `random.Random`. If provided, - makes the random sampling part of generation reproducible. (default=42) + makes the random sampling part of generation reproducible. (default=42) :type random_seed: int - """ # Create the model when using it the first time. self._tokenized_sents = [ diff --git a/nltk/tokenize/legality_principle.py b/nltk/tokenize/legality_principle.py index 0f2e0704eb..d377ab0f0a 100644 --- a/nltk/tokenize/legality_principle.py +++ b/nltk/tokenize/legality_principle.py @@ -10,14 +10,14 @@ The Legality Principle is a language agnostic principle maintaining that syllable onsets and codas (the beginning and ends of syllables not including the vowel) are only legal if they are found as word onsets or codas in the language. The English -word ``admit'' must then be syllabified as ``ad-mit'' since ``dm'' is not found +word ''admit'' must then be syllabified as ''ad-mit'' since ''dm'' is not found word-initially in the English language (Bartlett et al.). This principle was first proposed -in Daniel Kahn's 1976 dissertation, ``Syllable-based generalizations in English phonology''. +in Daniel Kahn's 1976 dissertation, ''Syllable-based generalizations in English phonology''. -Kahn further argues that there is a ``strong tendency to syllabify in such a way that +Kahn further argues that there is a ''strong tendency to syllabify in such a way that initial clusters are of maximal length, consistent with the general constraints on word-initial consonant clusters.'' Consequently, in addition to being legal onsets, -the longest legal onset is preferable---``Onset Maximization''. +the longest legal onset is preferable---''Onset Maximization''. The default implementation assumes an English vowel set, but the `vowels` attribute can be set to IPA or any other alphabet's vowel set for the use-case. @@ -29,10 +29,11 @@ is a good benchmark for English accuracy if utilizing IPA (pg. 311). References: + - Otto Jespersen. 1904. Lehrbuch der Phonetik. Leipzig, Teubner. Chapter 13, Silbe, pp. 185-203. -- Theo Vennemann, ``On the Theory of Syllabic Phonology,'' 1972, p. 11. -- Daniel Kahn, ``Syllable-based generalizations in English phonology'', (PhD diss., MIT, 1976). 
+- Theo Vennemann, ''On the Theory of Syllabic Phonology,'' 1972, p. 11. +- Daniel Kahn, ''Syllable-based generalizations in English phonology'', (PhD diss., MIT, 1976). - Elisabeth Selkirk. 1984. On the major class features and syllable theory. In Aronoff & Oehrle (eds.) Language Sound Structure: Studies in Phonology. Cambridge, MIT Press. pp. 107-136. diff --git a/nltk/tokenize/sonority_sequencing.py b/nltk/tokenize/sonority_sequencing.py index 7d9925a6d2..30e6f10f09 100644 --- a/nltk/tokenize/sonority_sequencing.py +++ b/nltk/tokenize/sonority_sequencing.py @@ -22,6 +22,7 @@ one level, they should be given separately to the `vowels` class attribute. References: + - Otto Jespersen. 1904. Lehrbuch der Phonetik. Leipzig, Teubner. Chapter 13, Silbe, pp. 185-203. - Elisabeth Selkirk. 1984. On the major class features and syllable theory. diff --git a/nltk/tokenize/treebank.py b/nltk/tokenize/treebank.py index d443cdef9f..f5b90f5a0a 100644 --- a/nltk/tokenize/treebank.py +++ b/nltk/tokenize/treebank.py @@ -36,16 +36,16 @@ class TreebankWordTokenizer(TokenizerI): - split off commas and single quotes, when followed by whitespace - separate periods that appear at the end of line - >>> from nltk.tokenize import TreebankWordTokenizer - >>> s = '''Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks.''' - >>> TreebankWordTokenizer().tokenize(s) - ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks', '.'] - >>> s = "They'll save and invest more." - >>> TreebankWordTokenizer().tokenize(s) - ['They', "'ll", 'save', 'and', 'invest', 'more', '.'] - >>> s = "hi, my name can't hello," - >>> TreebankWordTokenizer().tokenize(s) - ['hi', ',', 'my', 'name', 'ca', "n't", 'hello', ','] + >>> from nltk.tokenize import TreebankWordTokenizer + >>> s = '''Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks.''' + >>> TreebankWordTokenizer().tokenize(s) + ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks', '.'] + >>> s = "They'll save and invest more." + >>> TreebankWordTokenizer().tokenize(s) + ['They', "'ll", 'save', 'and', 'invest', 'more', '.'] + >>> s = "hi, my name can't hello," + >>> TreebankWordTokenizer().tokenize(s) + ['hi', ',', 'my', 'name', 'ca', "n't", 'hello', ','] """ # starting quotes @@ -196,22 +196,23 @@ class TreebankWordDetokenizer(TokenizerI): the Treebank tokenizer's regexes. Note: - - There're additional assumption mades when undoing the padding of [;@#$%&] + + - There're additional assumption mades when undoing the padding of `[;@#$%&]` punctuation symbols that isn't presupposed in the TreebankTokenizer. - There're additional regexes added in reversing the parentheses tokenization, - - the r'([\]\)\}\>])\s([:;,.])' removes the additional right padding added - to the closing parentheses precedding [:;,.]. + - the `r'([\]\)\}\>])\s([:;,.])'` removes the additional right padding added + to the closing parentheses precedding `[:;,.]`. - It's not possible to return the original whitespaces as they were because there wasn't explicit records of where '\n', '\t' or '\s' were removed at the text.split() operation. - >>> from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer - >>> s = '''Good muffins cost $3.88\nin New York. 
Please buy me\ntwo of them.\nThanks.''' - >>> d = TreebankWordDetokenizer() - >>> t = TreebankWordTokenizer() - >>> toks = t.tokenize(s) - >>> d.detokenize(toks) - 'Good muffins cost $3.88 in New York. Please buy me two of them. Thanks.' + >>> from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer + >>> s = '''Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks.''' + >>> d = TreebankWordDetokenizer() + >>> t = TreebankWordTokenizer() + >>> toks = t.tokenize(s) + >>> d.detokenize(toks) + 'Good muffins cost $3.88 in New York. Please buy me two of them. Thanks.' The MXPOST parentheses substitution can be undone using the `convert_parentheses` parameter: diff --git a/nltk/translate/bleu_score.py b/nltk/translate/bleu_score.py index b0d50e57d9..36fb0704fd 100644 --- a/nltk/translate/bleu_score.py +++ b/nltk/translate/bleu_score.py @@ -362,74 +362,74 @@ def brevity_penalty(closest_ref_len, hyp_len): An example from the paper. There are three references with length 12, 15 and 17. And a concise hypothesis of the length 12. The brevity penalty is 1. - >>> reference1 = list('aaaaaaaaaaaa') # i.e. ['a'] * 12 - >>> reference2 = list('aaaaaaaaaaaaaaa') # i.e. ['a'] * 15 - >>> reference3 = list('aaaaaaaaaaaaaaaaa') # i.e. ['a'] * 17 - >>> hypothesis = list('aaaaaaaaaaaa') # i.e. ['a'] * 12 - >>> references = [reference1, reference2, reference3] - >>> hyp_len = len(hypothesis) - >>> closest_ref_len = closest_ref_length(references, hyp_len) - >>> brevity_penalty(closest_ref_len, hyp_len) - 1.0 + >>> reference1 = list('aaaaaaaaaaaa') # i.e. ['a'] * 12 + >>> reference2 = list('aaaaaaaaaaaaaaa') # i.e. ['a'] * 15 + >>> reference3 = list('aaaaaaaaaaaaaaaaa') # i.e. ['a'] * 17 + >>> hypothesis = list('aaaaaaaaaaaa') # i.e. ['a'] * 12 + >>> references = [reference1, reference2, reference3] + >>> hyp_len = len(hypothesis) + >>> closest_ref_len = closest_ref_length(references, hyp_len) + >>> brevity_penalty(closest_ref_len, hyp_len) + 1.0 In case a hypothesis translation is shorter than the references, penalty is applied. - >>> references = [['a'] * 28, ['a'] * 28] - >>> hypothesis = ['a'] * 12 - >>> hyp_len = len(hypothesis) - >>> closest_ref_len = closest_ref_length(references, hyp_len) - >>> brevity_penalty(closest_ref_len, hyp_len) - 0.2635971381157267 + >>> references = [['a'] * 28, ['a'] * 28] + >>> hypothesis = ['a'] * 12 + >>> hyp_len = len(hypothesis) + >>> closest_ref_len = closest_ref_length(references, hyp_len) + >>> brevity_penalty(closest_ref_len, hyp_len) + 0.2635971381157267 The length of the closest reference is used to compute the penalty. If the length of a hypothesis is 12, and the reference lengths are 13 and 2, the penalty is applied because the hypothesis length (12) is less then the closest reference length (13). - >>> references = [['a'] * 13, ['a'] * 2] - >>> hypothesis = ['a'] * 12 - >>> hyp_len = len(hypothesis) - >>> closest_ref_len = closest_ref_length(references, hyp_len) - >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS - 0.9200... + >>> references = [['a'] * 13, ['a'] * 2] + >>> hypothesis = ['a'] * 12 + >>> hyp_len = len(hypothesis) + >>> closest_ref_len = closest_ref_length(references, hyp_len) + >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS + 0.9200... The brevity penalty doesn't depend on reference order. More importantly, when two reference sentences are at the same distance, the shortest reference sentence length is used. 
- >>> references = [['a'] * 13, ['a'] * 11] - >>> hypothesis = ['a'] * 12 - >>> hyp_len = len(hypothesis) - >>> closest_ref_len = closest_ref_length(references, hyp_len) - >>> bp1 = brevity_penalty(closest_ref_len, hyp_len) - >>> hyp_len = len(hypothesis) - >>> closest_ref_len = closest_ref_length(reversed(references), hyp_len) - >>> bp2 = brevity_penalty(closest_ref_len, hyp_len) - >>> bp1 == bp2 == 1 - True + >>> references = [['a'] * 13, ['a'] * 11] + >>> hypothesis = ['a'] * 12 + >>> hyp_len = len(hypothesis) + >>> closest_ref_len = closest_ref_length(references, hyp_len) + >>> bp1 = brevity_penalty(closest_ref_len, hyp_len) + >>> hyp_len = len(hypothesis) + >>> closest_ref_len = closest_ref_length(reversed(references), hyp_len) + >>> bp2 = brevity_penalty(closest_ref_len, hyp_len) + >>> bp1 == bp2 == 1 + True A test example from mteval-v13a.pl (starting from the line 705): - >>> references = [['a'] * 11, ['a'] * 8] - >>> hypothesis = ['a'] * 7 - >>> hyp_len = len(hypothesis) - >>> closest_ref_len = closest_ref_length(references, hyp_len) - >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS - 0.8668... - - >>> references = [['a'] * 11, ['a'] * 8, ['a'] * 6, ['a'] * 7] - >>> hypothesis = ['a'] * 7 - >>> hyp_len = len(hypothesis) - >>> closest_ref_len = closest_ref_length(references, hyp_len) - >>> brevity_penalty(closest_ref_len, hyp_len) - 1.0 + >>> references = [['a'] * 11, ['a'] * 8] + >>> hypothesis = ['a'] * 7 + >>> hyp_len = len(hypothesis) + >>> closest_ref_len = closest_ref_length(references, hyp_len) + >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS + 0.8668... + + >>> references = [['a'] * 11, ['a'] * 8, ['a'] * 6, ['a'] * 7] + >>> hypothesis = ['a'] * 7 + >>> hyp_len = len(hypothesis) + >>> closest_ref_len = closest_ref_length(references, hyp_len) + >>> brevity_penalty(closest_ref_len, hyp_len) + 1.0 :param hyp_len: The length of the hypothesis for a single sentence OR the - sum of all the hypotheses' lengths for a corpus + sum of all the hypotheses' lengths for a corpus :type hyp_len: int :param closest_ref_len: The length of the closest reference for a single - hypothesis OR the sum of all the closest references for every hypotheses. + hypothesis OR the sum of all the closest references for every hypotheses. :type closest_ref_len: int :return: BLEU's brevity penalty. :rtype: float @@ -552,14 +552,18 @@ def method3(self, p_n, *args, **kwargs): The smoothing is computed by taking 1 / ( 2^k ), instead of 0, for each precision score whose matching n-gram count is null. k is 1 for the first 'n' value for which the n-gram match count is null/ + For example, if the text contains: - - one 2-gram match - - and (consequently) two 1-gram matches + + - one 2-gram match + - and (consequently) two 1-gram matches + the n-gram count for each individual precision score would be: - - n=1 => prec_count = 2 (two unigrams) - - n=2 => prec_count = 1 (one bigram) - - n=3 => prec_count = 1/2 (no trigram, taking 'smoothed' value of 1 / ( 2^k ), with k=1) - - n=4 => prec_count = 1/4 (no fourgram, taking 'smoothed' value of 1 / ( 2^k ), with k=2) + + - n=1 => prec_count = 2 (two unigrams) + - n=2 => prec_count = 1 (one bigram) + - n=3 => prec_count = 1/2 (no trigram, taking 'smoothed' value of 1 / ( 2^k ), with k=1) + - n=4 => prec_count = 1/4 (no fourgram, taking 'smoothed' value of 1 / ( 2^k ), with k=2) """ incvnt = 1 # From the mteval-v13a.pl, it's referred to as k. 
for i, p_i in enumerate(p_n): diff --git a/nltk/translate/ibm1.py b/nltk/translate/ibm1.py index c11a5ae6cc..f91543d865 100644 --- a/nltk/translate/ibm1.py +++ b/nltk/translate/ibm1.py @@ -17,42 +17,45 @@ In IBM Model 1, word order is ignored for simplicity. As long as the word alignments are equivalent, it doesn't matter where the word occurs in the source or target sentence. Thus, the following three alignments -are equally likely. +are equally likely:: -Source: je mange du jambon -Target: i eat some ham -Alignment: (0,0) (1,1) (2,2) (3,3) + Source: je mange du jambon + Target: i eat some ham + Alignment: (0,0) (1,1) (2,2) (3,3) -Source: je mange du jambon -Target: some ham eat i -Alignment: (0,2) (1,3) (2,1) (3,1) + Source: je mange du jambon + Target: some ham eat i + Alignment: (0,2) (1,3) (2,1) (3,1) -Source: du jambon je mange -Target: eat i some ham -Alignment: (0,3) (1,2) (2,0) (3,1) + Source: du jambon je mange + Target: eat i some ham + Alignment: (0,3) (1,2) (2,0) (3,1) Note that an alignment is represented here as (word_index_in_target, word_index_in_source). The EM algorithm used in Model 1 is: -E step - In the training data, count how many times a source language + +:E step: In the training data, count how many times a source language word is translated into a target language word, weighted by the prior probability of the translation. -M step - Estimate the new probability of translation based on the +:M step: Estimate the new probability of translation based on the counts from the Expectation step. +Notations +--------- -Notations: -i: Position in the source sentence - Valid values are 0 (for NULL), 1, 2, ..., length of source sentence -j: Position in the target sentence - Valid values are 1, 2, ..., length of target sentence -s: A word in the source language -t: A word in the target language +:i: Position in the source sentence + Valid values are 0 (for NULL), 1, 2, ..., length of source sentence +:j: Position in the target sentence + Valid values are 1, 2, ..., length of target sentence +:s: A word in the source language +:t: A word in the target language +References +---------- -References: Philipp Koehn. 2010. Statistical Machine Translation. Cambridge University Press, New York. diff --git a/nltk/translate/ibm2.py b/nltk/translate/ibm2.py index ad54c41ace..44cccea5be 100644 --- a/nltk/translate/ibm2.py +++ b/nltk/translate/ibm2.py @@ -13,29 +13,33 @@ a source word position, given its aligned target word's position. The EM algorithm used in Model 2 is: -E step - In the training data, collect counts, weighted by prior + +:E step: In the training data, collect counts, weighted by prior probabilities. 
- (a) count how many times a source language word is translated - into a target language word - (b) count how many times a particular position in the source - sentence is aligned to a particular position in the target - sentence -M step - Estimate new probabilities based on the counts from the E step + - (a) count how many times a source language word is translated + into a target language word + - (b) count how many times a particular position in the source + sentence is aligned to a particular position in the target + sentence + +:M step: Estimate new probabilities based on the counts from the E step +Notations +--------- -Notations: -i: Position in the source sentence - Valid values are 0 (for NULL), 1, 2, ..., length of source sentence -j: Position in the target sentence - Valid values are 1, 2, ..., length of target sentence -l: Number of words in the source sentence, excluding NULL -m: Number of words in the target sentence -s: A word in the source language -t: A word in the target language +:i: Position in the source sentence + Valid values are 0 (for NULL), 1, 2, ..., length of source sentence +:j: Position in the target sentence + Valid values are 1, 2, ..., length of target sentence +:l: Number of words in the source sentence, excluding NULL +:m: Number of words in the target sentence +:s: A word in the source language +:t: A word in the target language +References +---------- -References: Philipp Koehn. 2010. Statistical Machine Translation. Cambridge University Press, New York. diff --git a/nltk/translate/ibm3.py b/nltk/translate/ibm3.py index 2b04597818..1456288fb8 100644 --- a/nltk/translate/ibm3.py +++ b/nltk/translate/ibm3.py @@ -28,41 +28,45 @@ target word that is produced by NULL. The EM algorithm used in Model 3 is: -E step - In the training data, collect counts, weighted by prior + +:E step: In the training data, collect counts, weighted by prior probabilities. - (a) count how many times a source language word is translated - into a target language word - (b) count how many times a particular position in the target - sentence is aligned to a particular position in the source - sentence - (c) count how many times a source word is aligned to phi number - of target words - (d) count how many times NULL is aligned to a target word -M step - Estimate new probabilities based on the counts from the E step + - (a) count how many times a source language word is translated + into a target language word + - (b) count how many times a particular position in the target + sentence is aligned to a particular position in the source + sentence + - (c) count how many times a source word is aligned to phi number + of target words + - (d) count how many times NULL is aligned to a target word + +:M step: Estimate new probabilities based on the counts from the E step Because there are too many possible alignments, only the most probable ones are considered. First, the best alignment is determined using prior probabilities. Then, a hill climbing approach is used to find other good candidates. 
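A brief, hedged training sketch for the model described above, in the style of the doctests used elsewhere in ``nltk.translate``; the corpus is a toy example and no particular learned probabilities or alignments are asserted::

    from nltk.translate import AlignedSent, IBMModel3

    bitext = [
        AlignedSent(["klein", "ist", "das", "haus"], ["the", "house", "is", "small"]),
        AlignedSent(["das", "haus", "ist", "ja", "gross"], ["the", "house", "is", "big"]),
        AlignedSent(["das", "buch", "ist", "ja", "klein"], ["the", "book", "is", "small"]),
        AlignedSent(["das", "haus"], ["the", "house"]),
        AlignedSent(["das", "buch"], ["the", "book"]),
        AlignedSent(["ein", "buch"], ["a", "book"]),
    ]
    ibm3 = IBMModel3(bitext, 5)   # 5 EM iterations

    # after training, lexical translation probabilities are available on the
    # model, and the best alignment found is stored on each sentence pair
    print(round(ibm3.translation_table["buch"]["book"], 3))
    print(bitext[2].alignment)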
+Notations +--------- + +:i: Position in the source sentence + Valid values are 0 (for NULL), 1, 2, ..., length of source sentence +:j: Position in the target sentence + Valid values are 1, 2, ..., length of target sentence +:l: Number of words in the source sentence, excluding NULL +:m: Number of words in the target sentence +:s: A word in the source language +:t: A word in the target language +:phi: Fertility, the number of target words produced by a source word +:p1: Probability that a target word produced by a source word is + accompanied by another target word that is aligned to NULL +:p0: 1 - p1 + +References +---------- -Notations: -i: Position in the source sentence - Valid values are 0 (for NULL), 1, 2, ..., length of source sentence -j: Position in the target sentence - Valid values are 1, 2, ..., length of target sentence -l: Number of words in the source sentence, excluding NULL -m: Number of words in the target sentence -s: A word in the source language -t: A word in the target language -phi: Fertility, the number of target words produced by a source word -p1: Probability that a target word produced by a source word is - accompanied by another target word that is aligned to NULL -p0: 1 - p1 - - -References: Philipp Koehn. 2010. Statistical Machine Translation. Cambridge University Press, New York. diff --git a/nltk/translate/ibm4.py b/nltk/translate/ibm4.py index f65aa27313..0a0f5ba44c 100644 --- a/nltk/translate/ibm4.py +++ b/nltk/translate/ibm4.py @@ -21,21 +21,23 @@ information theoretic approach to group words into 50 classes for each vocabulary. -Terminology: -Cept: +Terminology +----------- + +:Cept: A source word with non-zero fertility i.e. aligned to one or more target words. -Tablet: +:Tablet: The set of target word(s) aligned to a cept. -Head of cept: +:Head of cept: The first word of the tablet of that cept. -Center of cept: +:Center of cept: The average position of the words in that cept's tablet. If the value is not an integer, the ceiling is taken. For example, for a tablet with words in positions 2, 5, 6 in the target sentence, the center of the corresponding cept is ceil((2 + 5 + 6) / 3) = 5 -Displacement: +:Displacement: For a head word, defined as (position of head word - position of previous cept's center). Can be positive or negative. For a non-head word, defined as (position of non-head word - @@ -45,52 +47,57 @@ In contrast to Model 3 which reorders words in a tablet independently of other words, Model 4 distinguishes between three cases. -(1) Words generated by NULL are distributed uniformly. -(2) For a head word t, its position is modeled by the probability - d_head(displacement | word_class_s(s),word_class_t(t)), - where s is the previous cept, and word_class_s and word_class_t maps - s and t to a source and target language word class respectively. -(3) For a non-head word t, its position is modeled by the probability - d_non_head(displacement | word_class_t(t)) + +1. Words generated by NULL are distributed uniformly. +2. For a head word t, its position is modeled by the probability + d_head(displacement | word_class_s(s),word_class_t(t)), + where s is the previous cept, and word_class_s and word_class_t maps + s and t to a source and target language word class respectively. +3. 
For a non-head word t, its position is modeled by the probability + d_non_head(displacement | word_class_t(t)) The EM algorithm used in Model 4 is: -E step - In the training data, collect counts, weighted by prior + +:E step: In the training data, collect counts, weighted by prior probabilities. - (a) count how many times a source language word is translated - into a target language word - (b) for a particular word class, count how many times a head - word is located at a particular displacement from the - previous cept's center - (c) for a particular word class, count how many times a - non-head word is located at a particular displacement from - the previous target word - (d) count how many times a source word is aligned to phi number - of target words - (e) count how many times NULL is aligned to a target word - -M step - Estimate new probabilities based on the counts from the E step -Like Model 3, there are too many possible alignments to consider. Thus, -a hill climbing approach is used to sample good candidates. + - (a) count how many times a source language word is translated + into a target language word + - (b) for a particular word class, count how many times a head + word is located at a particular displacement from the + previous cept's center + - (c) for a particular word class, count how many times a + non-head word is located at a particular displacement from + the previous target word + - (d) count how many times a source word is aligned to phi number + of target words + - (e) count how many times NULL is aligned to a target word +:M step: Estimate new probabilities based on the counts from the E step -Notations: -i: Position in the source sentence - Valid values are 0 (for NULL), 1, 2, ..., length of source sentence -j: Position in the target sentence - Valid values are 1, 2, ..., length of target sentence -l: Number of words in the source sentence, excluding NULL -m: Number of words in the target sentence -s: A word in the source language -t: A word in the target language -phi: Fertility, the number of target words produced by a source word -p1: Probability that a target word produced by a source word is - accompanied by another target word that is aligned to NULL -p0: 1 - p1 -dj: Displacement, Δj +Like Model 3, there are too many possible alignments to consider. Thus, +a hill climbing approach is used to sample good candidates. +Notations +--------- + +:i: Position in the source sentence + Valid values are 0 (for NULL), 1, 2, ..., length of source sentence +:j: Position in the target sentence + Valid values are 1, 2, ..., length of target sentence +:l: Number of words in the source sentence, excluding NULL +:m: Number of words in the target sentence +:s: A word in the source language +:t: A word in the target language +:phi: Fertility, the number of target words produced by a source word +:p1: Probability that a target word produced by a source word is + accompanied by another target word that is aligned to NULL +:p0: 1 - p1 +:dj: Displacement, Δj + +References +---------- -References: Philipp Koehn. 2010. Statistical Machine Translation. Cambridge University Press, New York. diff --git a/nltk/translate/ibm5.py b/nltk/translate/ibm5.py index bdf80360b2..986d429500 100644 --- a/nltk/translate/ibm5.py +++ b/nltk/translate/ibm5.py @@ -28,8 +28,10 @@ during translation. It introduces the vacancy function v(j), the number of vacancies up to, and including, position j in the target sentence. 
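As a quick check of the Model 4 terminology above, the centre of a cept and the head/non-head displacements can be computed directly (illustrative helper only, not part of NLTK's API; the previous cept's centre is an assumed value)::

    from math import ceil

    def cept_center(tablet_positions):
        # ceiling of the average target position of the cept's tablet
        return ceil(sum(tablet_positions) / len(tablet_positions))

    tablet = [2, 5, 6]            # target positions aligned to one cept
    previous_center = 3           # centre of the previous cept's tablet (assumed)
    print(cept_center(tablet))          # 5, matching the example above
    print(tablet[0] - previous_center)  # head displacement: 2 - 3 = -1
    print(tablet[1] - tablet[0])        # non-head displacement: 5 - 2 = 3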
-Terminology: -Maximum vacancy: +Terminology +----------- + +:Maximum vacancy: The number of valid slots that a word can be placed in. This is not necessarily the same as the number of vacant slots. For example, if a tablet contains more than one word, the head word @@ -38,7 +40,7 @@ has to take into account the length of the tablet. Non-head words cannot be placed before the head word, so vacancies to the left of the head word are ignored. -Vacancy difference: +:Vacancy difference: For a head word: (v(j) - v(center of previous cept)) Can be positive or negative. For a non-head word: (v(j) - v(position of previously placed word)) @@ -46,50 +48,55 @@ appear to the right of the previous word. Positioning of target words fall under three cases: -(1) Words generated by NULL are distributed uniformly -(2) For a head word t, its position is modeled by the probability - v_head(dv | max_v,word_class_t(t)) -(3) For a non-head word t, its position is modeled by the probability - v_non_head(dv | max_v,word_class_t(t)) + +1. Words generated by NULL are distributed uniformly +2. For a head word t, its position is modeled by the probability + v_head(dv | max_v,word_class_t(t)) +3. For a non-head word t, its position is modeled by the probability + v_non_head(dv | max_v,word_class_t(t)) + dv and max_v are defined differently for head and non-head words. The EM algorithm used in Model 5 is: -E step - In the training data, collect counts, weighted by prior + +:E step: In the training data, collect counts, weighted by prior probabilities. - (a) count how many times a source language word is translated - into a target language word - (b) for a particular word class and maximum vacancy, count how - many times a head word and the previous cept's center have - a particular difference in number of vacancies - (b) for a particular word class and maximum vacancy, count how - many times a non-head word and the previous target word - have a particular difference in number of vacancies - (d) count how many times a source word is aligned to phi number - of target words - (e) count how many times NULL is aligned to a target word - -M step - Estimate new probabilities based on the counts from the E step + + - (a) count how many times a source language word is translated + into a target language word + - (b) for a particular word class and maximum vacancy, count how + many times a head word and the previous cept's center have + a particular difference in number of vacancies + - (c) for a particular word class and maximum vacancy, count how + many times a non-head word and the previous target word + have a particular difference in number of vacancies + - (d) count how many times a source word is aligned to phi number + of target words + - (e) count how many times NULL is aligned to a target word + +:M step: Estimate new probabilities based on the counts from the E step Like Model 4, there are too many possible alignments to consider. Thus, a hill climbing approach is used to sample good candidates. In addition, pruning is used to weed out unlikely alignments based on Model 4 scores.
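The vacancy bookkeeping described above can likewise be sketched in a few lines: v(j) simply counts the unoccupied target positions up to and including j (names and the 1-based convention are ours, not NLTK's; the occupied positions are assumed for the example)::

    def vacancies_up_to(j, occupied):
        """Number of vacant target positions in 1..j ('occupied' is a set)."""
        return sum(1 for pos in range(1, j + 1) if pos not in occupied)

    occupied = {2, 4}   # positions already filled by previously placed words
    v = {j: vacancies_up_to(j, occupied) for j in range(1, 7)}
    print(v[5])          # vacancies up to position 5 -> 3
    print(v[5] - v[3])   # a head word's vacancy difference, with the previous
                         # cept's centre assumed to sit at position 3 -> 1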
- -Notations: -i: Position in the source sentence - Valid values are 0 (for NULL), 1, 2, ..., length of source sentence -j: Position in the target sentence - Valid values are 1, 2, ..., length of target sentence -l: Number of words in the source sentence, excluding NULL -m: Number of words in the target sentence -s: A word in the source language -t: A word in the target language -phi: Fertility, the number of target words produced by a source word -p1: Probability that a target word produced by a source word is - accompanied by another target word that is aligned to NULL -p0: 1 - p1 -max_v: Maximum vacancy -dv: Vacancy difference, Δv +Notations +--------- + +:i: Position in the source sentence + Valid values are 0 (for NULL), 1, 2, ..., length of source sentence +:j: Position in the target sentence + Valid values are 1, 2, ..., length of target sentence +:l: Number of words in the source sentence, excluding NULL +:m: Number of words in the target sentence +:s: A word in the source language +:t: A word in the target language +:phi: Fertility, the number of target words produced by a source word +:p1: Probability that a target word produced by a source word is + accompanied by another target word that is aligned to NULL +:p0: 1 - p1 +:max_v: Maximum vacancy +:dv: Vacancy difference, Δv The definition of v_head here differs from GIZA++, section 4.7 of [Brown et al., 1993], and [Koehn, 2010]. In the latter cases, v_head is @@ -99,8 +106,9 @@ v(center of previous cept) to obtain dv: v_head(v(j) - v(center of previous cept) | max_v,word_class(t)). +References +---------- -References: Philipp Koehn. 2010. Statistical Machine Translation. Cambridge University Press, New York. diff --git a/nltk/translate/stack_decoder.py b/nltk/translate/stack_decoder.py index 786c1d7b08..9508cbb07b 100644 --- a/nltk/translate/stack_decoder.py +++ b/nltk/translate/stack_decoder.py @@ -233,8 +233,8 @@ def compute_future_scores(self, src_sentence): :type src_sentence: tuple(str) :return: Scores of subsequences referenced by their start and - end positions. For example, result[2][5] is the score of the - subsequence covering positions 2, 3, and 4. + end positions. For example, result[2][5] is the score of the + subsequence covering positions 2, 3, and 4. :rtype: dict(int: (dict(int): float)) """ scores = defaultdict(lambda: defaultdict(lambda: float("-inf"))) diff --git a/nltk/twitter/common.py b/nltk/twitter/common.py index 58b339e31e..658a51f1f4 100644 --- a/nltk/twitter/common.py +++ b/nltk/twitter/common.py @@ -7,7 +7,7 @@ # For license information, see LICENSE.TXT """ -Utility functions for the :module:`twitterclient` module which do not require +Utility functions for the `twitterclient` module which do not require the `twython` library to have been installed. """ import csv @@ -168,28 +168,28 @@ def json2csv_entities( :param tweets_file: the file-like object containing full Tweets :param str outfile: The path of the text file where results should be\ - written + written :param list main_fields: The list of fields to be extracted from the main\ - object, usually the tweet. Useful examples: 'id_str' for the tweetID. See\ - for a full list of fields. - e. 
g.: ['id_str'], ['id', 'text', 'favorite_count', 'retweet_count'] - If `entity_type` is expressed with hierarchy, then it is the list of\ - fields of the object that corresponds to the key of the entity_type,\ - (e.g., for entity_type='user.urls', the fields in the main_fields list\ - belong to the user object; for entity_type='place.bounding_box', the\ - files in the main_field list belong to the place object of the tweet). + object, usually the tweet. Useful examples: 'id_str' for the tweetID. See\ + for a full list of fields. + e. g.: ['id_str'], ['id', 'text', 'favorite_count', 'retweet_count'] + If `entity_type` is expressed with hierarchy, then it is the list of\ + fields of the object that corresponds to the key of the entity_type,\ + (e.g., for entity_type='user.urls', the fields in the main_fields list\ + belong to the user object; for entity_type='place.bounding_box', the\ + fields in the main_fields list belong to the place object of the tweet). :param list entity_type: The name of the entity: 'hashtags', 'media',\ - 'urls' and 'user_mentions' for the tweet object. For a user object,\ - this needs to be expressed with a hierarchy: `'user.urls'`. For the\ - bounding box of the Tweet location, use `'place.bounding_box'`. + 'urls' and 'user_mentions' for the tweet object. For a user object,\ + this needs to be expressed with a hierarchy: `'user.urls'`. For the\ + bounding box of the Tweet location, use `'place.bounding_box'`. :param list entity_fields: The list of fields to be extracted from the\ - entity. E.g. `['text']` (of the Tweet) + entity. E.g. `['text']` (of the Tweet) :param error: Behaviour for encoding errors, see\ - https://docs.python.org/3/library/codecs.html#codec-base-classes + https://docs.python.org/3/library/codecs.html#codec-base-classes :param gzip_compress: if `True`, output files are compressed with gzip """ diff --git a/nltk/twitter/twitter_demo.py b/nltk/twitter/twitter_demo.py index 5d24c7b4e1..441ad356f0 100644 --- a/nltk/twitter/twitter_demo.py +++ b/nltk/twitter/twitter_demo.py @@ -214,7 +214,7 @@ def limit_by_time_demo(keywords="nltk"): @verbose def corpusreader_demo(): """ - Use :module:`TwitterCorpusReader` tp read a file of tweets, and print out + Use `TwitterCorpusReader` to read a file of tweets, and print out * some full tweets in JSON format; * some raw strings from the tweets (i.e., the value of the `text` field); and diff --git a/nltk/twitter/twitterclient.py b/nltk/twitter/twitterclient.py index 621e568ad5..5f3ad70b73 100644 --- a/nltk/twitter/twitterclient.py +++ b/nltk/twitter/twitterclient.py @@ -126,6 +126,14 @@ class Query(Twython): """ def __init__(self, app_key, app_secret, oauth_token, oauth_token_secret): + """ + :param app_key: (optional) Your application's key + :param app_secret: (optional) Your application's secret key + :param oauth_token: (optional) When using **OAuth 1**, combined with + oauth_token_secret to make authenticated calls + :param oauth_token_secret: (optional) When using **OAuth 1** combined + with oauth_token to make authenticated calls + """ self.handler = None self.do_continue = True Twython.__init__(self, app_key, app_secret, oauth_token, oauth_token_secret) @@ -329,28 +337,28 @@ def tweets( :param str keywords: Keywords to use for searching or filtering :param list follow: UserIDs to use for filtering Tweets from the public stream :param bool to_screen: If `True`, display the tweet texts on the screen,\ - otherwise print to a file + otherwise print to a file :param bool stream: If `True`, use the live public
stream,\ - otherwise search past public Tweets + otherwise search past public Tweets :param int limit: The number of data items to process in the current\ - round of processing. + round of processing. :param tuple date_limit: The date at which to stop collecting\ - new data. This should be entered as a tuple which can serve as the\ - argument to `datetime.datetime`.\ - E.g. `date_limit=(2015, 4, 1, 12, 40)` for 12:30 pm on April 1 2015. - Note that, in the case of streaming, this is the maximum date, i.e.\ - a date in the future; if not, it is the minimum date, i.e. a date\ - in the past + new data. This should be entered as a tuple which can serve as the\ + argument to `datetime.datetime`.\ + E.g. `date_limit=(2015, 4, 1, 12, 40)` for 12:40 pm on April 1 2015. + Note that, in the case of streaming, this is the maximum date, i.e.\ + a date in the future; if not, it is the minimum date, i.e. a date\ + in the past :param str lang: language :param bool repeat: A flag to determine whether multiple files should\ - be written. If `True`, the length of each file will be set by the\ - value of `limit`. Use only if `to_screen` is `False`. See also - :py:func:`handle`. + be written. If `True`, the length of each file will be set by the\ + value of `limit`. Use only if `to_screen` is `False`. See also + :py:func:`handle`. :param gzip_compress: if `True`, output files are compressed with gzip. """ diff --git a/nltk/twitter/util.py b/nltk/twitter/util.py index 26bef4d4c4..29740b1ed7 100644 --- a/nltk/twitter/util.py +++ b/nltk/twitter/util.py @@ -7,7 +7,7 @@ # For license information, see LICENSE.TXT """ -Authentication utilities to accompany :module:`twitterclient`. +Authentication utilities to accompany `twitterclient`. """ import os @@ -46,25 +46,22 @@ def load_creds(self, creds_file=None, subdir=None, verbose=False): """ Read OAuth credentials from a text file. - :: - File format for OAuth 1 - ======================= + File format for OAuth 1:: + app_key=YOUR_APP_KEY app_secret=YOUR_APP_SECRET oauth_token=OAUTH_TOKEN oauth_token_secret=OAUTH_TOKEN_SECRET - :: - File format for OAuth 2 - ======================= + File format for OAuth 2:: app_key=YOUR_APP_KEY app_secret=YOUR_APP_SECRET access_token=ACCESS_TOKEN - :param str file_name: File containing credentials. ``None`` (default) reads\ - data from `TWITTER/'credentials.txt'` + :param str creds_file: File containing credentials. ``None`` (default) reads + data from `TWITTER/'credentials.txt'` """ if creds_file is not None: self.creds_file = creds_file diff --git a/nltk/util.py b/nltk/util.py index c06f6b1cac..69d427b406 100644 --- a/nltk/util.py +++ b/nltk/util.py @@ -264,8 +264,8 @@ def edges2dot(edges, shapes=None, attr=None): :param edges: the set (or list) of edges of a directed graph. :return dot_string: a representation of 'edges' as a string in the DOT - graph language, which can be converted to an image by the 'dot' program - from the Graphviz package, or nltk.parse.dependencygraph.dot2img(dot_string). + graph language, which can be converted to an image by the 'dot' program + from the Graphviz package, or nltk.parse.dependencygraph.dot2img(dot_string). :param shapes: dictionary of strings that trigger a specified shape.
:param attr: dictionary with global graph attributes @@ -279,7 +279,6 @@ def edges2dot(edges, shapes=None, attr=None): "B" -> "C"; "C" -> "B"; } - """ if not shapes: shapes = dict() diff --git a/web/news.rst b/web/news.rst index c0b65aee3d..5acc8d8d43 100644 --- a/web/news.rst +++ b/web/news.rst @@ -121,7 +121,7 @@ NLTK 3.2.1 released: April 2016 Support for CCG semantics, Stanford segmenter, VADER lexicon; Fixes to BLEU score calculation, CHILDES corpus reader. -NLTK 3.2 released : March 2016 +NLTK 3.2 released: March 2016 Fixes for Python 3.5, code cleanups now Python 2.6 is no longer supported, support for PanLex, support for third party download locations for NLTK data, new support for RIBES score, BLEU @@ -134,7 +134,7 @@ NLTK 3.2 released : March 2016 2015 ---- -NLTK 3.1 released : October 2015 +NLTK 3.1 released: October 2015 Add support for Python 3.5, drop support for Python 2.6, sentiment analysis package and several corpora, improved POS tagger, Twitter package, @@ -145,41 +145,41 @@ NLTK 3.1 released : October 2015 Multext East Corpus and MTECorpusReader, minor bugfixes and enhancements -NLTK 3.0.5 released : September 2015 +NLTK 3.0.5 released: September 2015 New Twitter package; updates to IBM models 1-3, new models 4 and 5, minor bugfixes and enhancements -NLTK 3.0.4 released : July 2015 +NLTK 3.0.4 released: July 2015 Minor bugfixes and enhancements. -NLTK 3.0.3 released : June 2015 +NLTK 3.0.3 released: June 2015 PanLex Swadesh Corpus, tgrep tree search, minor bugfixes. -NLTK 3.0.2 released : March 2015 +NLTK 3.0.2 released: March 2015 Senna, BLLIP, python-crfsuite interfaces, transition-based dependency parsers, dependency graph visualization, NKJP corpus reader, minor bugfixes and clean-ups. -NLTK 3.0.1 released : January 2015 +NLTK 3.0.1 released: January 2015 Minor packaging update. 2014 ---- -NLTK 3.0.0 released : September 2014 +NLTK 3.0.0 released: September 2014 Minor bugfixes. -NLTK 3.0.0b2 released : August 2014 +NLTK 3.0.0b2 released: August 2014 Minor bugfixes and clean-ups. -NLTK Book Updates : July 2014 - The NLTK book is being updated for Python 3 and NLTK 3 `here `_. - The original Python 2 edition is still available `here `_. +NLTK Book Updates: July 2014 + The NLTK book is being updated for Python 3 and NLTK 3 `here `__. + The original Python 2 edition is still available `here `__. -NLTK 3.0.0b1 released : July 2014 +NLTK 3.0.0b1 released: July 2014 FrameNet, SentiWordNet, universal tagset, misc efficiency improvements and bugfixes Several API changes, see https://github.com/nltk/nltk/wiki/Porting-your-code-to-NLTK-3.0 -NLTK 3.0a4 released : June 2014 +NLTK 3.0a4 released: June 2014 FrameNet, universal tagset, misc efficiency improvements and bugfixes Several API changes, see https://github.com/nltk/nltk/wiki/Porting-your-code-to-NLTK-3.0 For full details see: @@ -189,223 +189,223 @@ NLTK 3.0a4 released : June 2014 2013 ---- -NLTK Book Updates : October 2013 +NLTK Book Updates: October 2013 We are updating the NLTK book for Python 3 and NLTK 3; please see https://www.nltk.org/book/ -NLTK 3.0a2 released : July 2013 +NLTK 3.0a2 released: July 2013 Misc efficiency improvements and bugfixes; for details see https://github.com/nltk/nltk/blob/develop/ChangeLog http://nltk.org/nltk3-alpha/ -NLTK 3.0a1 released : February 2013 +NLTK 3.0a1 released: February 2013 This version adds support for NLTK's graphical user interfaces. 
http://nltk.org/nltk3-alpha/ -NLTK 3.0a0 released : January 2013 +NLTK 3.0a0 released: January 2013 The first alpha release of NLTK 3.0 is now available for testing. This version of NLTK works with Python 2.6, 2.7, and Python 3. http://nltk.org/nltk3-alpha/ 2012 ---- -Python Grant : November 2012 +Python Grant: November 2012 The Python Software Foundation is sponsoring Mikhail Korobov's work on porting NLTK to Python 3. https://pyfound.blogspot.hu/2012/11/grants-to-assist-kivy-nltk-in-porting.html -NLTK 2.0.4 released : November 2012 +NLTK 2.0.4 released: November 2012 Minor fix to remove numpy dependency. -NLTK 2.0.3 released : September 2012 +NLTK 2.0.3 released: September 2012 This release contains minor improvements and bugfixes. This is the final release compatible with Python 2.5. -NLTK 2.0.2 released : July 2012 +NLTK 2.0.2 released: July 2012 This release contains minor improvements and bugfixes. -NLTK 2.0.1 released : May 2012 +NLTK 2.0.1 released: May 2012 The final release of NLTK 2. -NLTK 2.0.1rc4 released : February 2012 +NLTK 2.0.1rc4 released: February 2012 The fourth release candidate for NLTK 2. -NLTK 2.0.1rc3 released : January 2012 +NLTK 2.0.1rc3 released: January 2012 The third release candidate for NLTK 2. 2011 ---- -NLTK 2.0.1rc2 released : December 2011 +NLTK 2.0.1rc2 released: December 2011 The second release candidate for NLTK 2. For full details see the ChangeLog. -NLTK development moved to GitHub : October 2011 +NLTK development moved to GitHub: October 2011 The development site for NLTK has moved from GoogleCode to GitHub: https://github.com/nltk -NLTK 2.0.1rc1 released : April 2011 +NLTK 2.0.1rc1 released: April 2011 The first release candidate for NLTK 2. For full details see the ChangeLog. 2010 ---- -Python Text Processing with NLTK 2.0 Cookbook : December 2010 +Python Text Processing with NLTK 2.0 Cookbook: December 2010 Jacob Perkins has written a 250-page cookbook full of great recipes for text processing using Python and NLTK, published by Packt Publishing. Some of the royalties are being donated to the NLTK project. -Japanese translation of NLTK book : November 2010 +Japanese translation of NLTK book: November 2010 Masato Hagiwara has translated the NLTK book into Japanese, along with an extra chapter on particular issues with Japanese language process. See https://www.oreilly.co.jp/books/9784873114705/. -NLTK 2.0b9 released : July 2010 +NLTK 2.0b9 released: July 2010 The last beta release before 2.0 final. For full details see the ChangeLog. -NLTK in Ubuntu 10.4 (Lucid Lynx) : February 2010 +NLTK in Ubuntu 10.4 (Lucid Lynx): February 2010 NLTK is now in the latest LTS version of Ubuntu, thanks to the efforts of Robin Munn. See https://packages.ubuntu.com/lucid/python/python-nltk -NLTK 2.0b? released : June 2009 - February 2010 +NLTK 2.0b? released: June 2009 - February 2010 Bugfix releases in preparation for 2.0 final. For full details see the ChangeLog. 2009 ---- -NLTK Book in second printing : December 2009 +NLTK Book in second printing: December 2009 The second print run of Natural Language Processing with Python will go on sale in January. We've taken the opportunity to make about 40 minor corrections. The online version has been updated. -NLTK Book published : June 2009 +NLTK Book published: June 2009 Natural Language Processing with Python, by Steven Bird, Ewan Klein and Edward Loper, has been published by O'Reilly Media Inc. It can be purchased in hardcopy, ebook, PDF or for online access, at https://oreilly.com/catalog/9780596516499/. 
For information about sellers and prices, see https://isbndb.com/d/book/natural_language_processing_with_python/prices.html. -Version 0.9.9 released : May 2009 +Version 0.9.9 released: May 2009 This version finalizes NLTK's API ahead of the 2.0 release and the publication of the NLTK book. There have been dozens of minor enhancements and bugfixes. Many names of the form nltk.foo.Bar are now available as nltk.Bar. There is expanded functionality in the decision tree, collocations, and Toolbox modules. A new translation toy nltk.misc.babelfish has been added. A new module nltk.help gives access to tagset documentation. Fixed imports so NLTK will build and install without Tkinter (for running on servers). New data includes a maximum entropy chunker model and updated grammars. NLTK Contrib includes updates to the coreference package (Joseph Frazee) and the ISRI Arabic stemmer (Hosam Algasaier). The book has undergone substantial editorial corrections ahead of final publication. For full details see the ChangeLog. -Version 0.9.8 released : February 2009 +Version 0.9.8 released: February 2009 This version contains a new off-the-shelf tokenizer, POS tagger, and named-entity tagger. A new metrics package includes inter-annotator agreement scores and various distance and word association measures (Tom Lippincott and Joel Nothman). There's a new collocations package (Joel Nothman). There are many improvements to the WordNet package and browser (Steven Bethard, Jordan Boyd-Graber, Paul Bone), and to the semantics and inference packages (Dan Garrette). The NLTK corpus collection now includes the PE08 Parser Evaluation data, and the CoNLL 2007 Basque and Catalan Dependency Treebanks. We have added an interface for dependency treebanks. Many chapters of the book have been revised in response to feedback from readers. For full details see the ChangeLog. NB some method names have been changed for consistency and simplicity. Use of old names will generate deprecation warnings that indicate the correct name to use. 2008 ---- -Version 0.9.7 released : December 2008 +Version 0.9.7 released: December 2008 This version contains fixes to the corpus downloader (see instructions) enabling NLTK corpora to be released independently of the software, and to be stored in compressed format. There are improvements in the grammars, chart parsers, probability distributions, sentence segmenter, text classifiers and RTE classifier. There are many further improvements to the book. For full details see the ChangeLog. -Version 0.9.6 released : December 2008 +Version 0.9.6 released: December 2008 This version has an incremental corpus downloader (see instructions) enabling NLTK corpora to be released independently of the software. A new WordNet interface has been developed by Steven Bethard (details). NLTK now has support for dependency parsing, developed by Jason Narad (sponsored by Google Summer of Code). There are many enhancements to the semantics and inference packages, contributed by Dan Garrette. The frequency distribution classes have new support for tabulation and plotting. The Brown Corpus reader has human readable category labels instead of letters. A new Swadesh Corpus containing comparative wordlists has been added. NLTK-Contrib includes a TIGERSearch implementation for searching treebanks (Torsten Marek). Most chapters of the book have been substantially revised. 
-The NLTK Project has moved : November 2008 +The NLTK Project has moved: November 2008 The NLTK project has moved to Google Sites, Google Code and Google Groups. Content for users and the nltk.org domain is hosted on Google Sites. The home of NLTK development is now Google Code. All discussion lists are at Google Groups. Our old site at nltk.sourceforge.net will continue to be available while we complete this transition. Old releases are still available via our SourceForge release page. We're grateful to SourceForge for hosting our project since its inception in 2001. -Version 0.9.5 released : August 2008 +Version 0.9.5 released: August 2008 This version contains several low-level changes to facilitate installation, plus updates to several NLTK-Contrib projects. A new text module gives easy access to text corpora for newcomers to NLP. For full details see the ChangeLog. -Version 0.9.4 released : August 2008 +Version 0.9.4 released: August 2008 This version contains a substantially expanded semantics package contributed by Dan Garrette, improvements to the chunk, tag, wordnet, tree and feature-structure modules, Mallet interface, ngram language modeling, new GUI tools (WordNet? browser, chunking, POS-concordance). The data distribution includes the new NPS Chat Corpus. NLTK-Contrib includes the following new packages (still undergoing active development) NLG package (Petro Verkhogliad), dependency parsers (Jason Narad), coreference (Joseph Frazee), CCG parser (Graeme Gange), and a first order resolution theorem prover (Dan Garrette). For full details see the ChangeLog. -NLTK presented at ACL conference : June 2008 +NLTK presented at ACL conference: June 2008 A paper on teaching courses using NLTK will be presented at the ACL conference: Multidisciplinary Instruction with the Natural Language Toolkit -Version 0.9.3 released : June 2008 +Version 0.9.3 released: June 2008 This version contains an improved WordNet? similarity module using pre-built information content files (included in the corpus distribution), new/improved interfaces to Weka, MEGAM and Prover9/Mace4 toolkits, improved Unicode support for corpus readers, a BNC corpus reader, and a rewrite of the Punkt sentence segmenter contributed by Joel Nothman. NLTK-Contrib includes an implementation of incremental algorithm for generating referring expression contributed by Margaret Mitchell. For full details see the ChangeLog. -NLTK presented at LinuxFest Northwest : April 2008 +NLTK presented at LinuxFest Northwest: April 2008 Sean Boisen presented NLTK at LinuxFest Northwest, which took place in Bellingham, Washington. His presentation slides are available at: https://semanticbible.com/other/talks/2008/nltk/main.html -NLTK in Google Summer of Code : April 2008 +NLTK in Google Summer of Code: April 2008 Google Summer of Code will sponsor two NLTK projects. Jason Narad won funding for a project on dependency parsers in NLTK (mentored by Sebastian Riedel and Jason Baldridge). Petro Verkhogliad won funding for a project on natural language generation in NLTK (mentored by Robert Dale and Edward Loper). -Python Software Foundation adopts NLTK for Google Summer of Code application : March 2008 +Python Software Foundation adopts NLTK for Google Summer of Code application: March 2008 The Python Software Foundation has listed NLTK projects for sponsorship from the 2008 Google Summer of Code program. For details please see https://wiki.python.org/moin/SummerOfCode. 
-Version 0.9.2 released : March 2008 +Version 0.9.2 released: March 2008 This version contains a new inference module linked to the Prover9/Mace4 theorem-prover and model checker (Dan Garrette, Ewan Klein). It also includes the VerbNet? and PropBank? corpora along with corpus readers. A bug in the Reuters corpus reader has been fixed. NLTK-Contrib includes new work on the WordNet? browser (Jussi Salmela). For full details see the ChangeLog -Youtube video about NLTK : January 2008 +Youtube video about NLTK: January 2008 The video from of the NLTK talk at the Bay Area Python Interest Group last July has been posted at https://www.youtube.com/watch?v=keXW_5-llD0 (1h15m) -Version 0.9.1 released : January 2008 +Version 0.9.1 released: January 2008 This version contains new support for accessing text categorization corpora, along with several corpora categorized for topic, genre, question type, or sentiment. It includes several new corpora: Question classification data (Li & Roth), Reuters 21578 Corpus, Movie Reviews corpus (Pang & Lee), Recognising Textual Entailment (RTE) Challenges. NLTK-Contrib includes expanded support for semantics (Dan Garrette), readability scoring (Thomas Jakobsen, Thomas Skardal), and SIL Toolbox (Greg Aumann). The book contains many improvements in early chapters in response to reader feedback. For full details see the ChangeLog. 2007 ---- -NLTK-Lite 0.9 released : October 2007 +NLTK-Lite 0.9 released: October 2007 This version is substantially revised and expanded from version 0.8. The entire toolkit can be accessed via a single import statement "import nltk", and there is a more convenient naming scheme. Calling deprecated functions generates messages that help programmers update their code. The corpus, tagger, and classifier modules have been redesigned. All functionality of the old NLTK 1.4.3 is now covered by NLTK-Lite 0.9. The book has been revised and expanded. A new data package incorporates the existing corpus collection and contains new sections for pre-specified grammars and pre-computed models. Several new corpora have been added, including treebanks for Portuguese, Spanish, Catalan and Dutch. A Macintosh distribution is provided. For full details see the ChangeLog. -NLTK-Lite 0.9b2 released : September 2007 +NLTK-Lite 0.9b2 released: September 2007 This version is substantially revised and expanded from version 0.8. The entire toolkit can be accessed via a single import statement "import nltk", and many common NLP functions accessed directly, e.g. nltk.PorterStemmer?, nltk.ShiftReduceParser?. The corpus, tagger, and classifier modules have been redesigned. The book has been revised and expanded, and the chapters have been reordered. NLTK has a new data package incorporating the existing corpus collection and adding new sections for pre-specified grammars and pre-computed models. The Floresta Portuguese Treebank has been added. Release 0.9b2 fixes several minor problems with 0.9b1 and removes the numpy dependency. It includes a new corpus and corpus reader for Brazilian Portuguese news text (MacMorphy?) and an improved corpus reader for the Sinica Treebank, and a trained model for Portuguese sentence segmentation. -NLTK-Lite 0.9b1 released : August 2007 +NLTK-Lite 0.9b1 released: August 2007 This version is substantially revised and expanded from version 0.8. The entire toolkit can be accessed via a single import statement "import nltk", and many common NLP functions accessed directly, e.g. nltk.PorterStemmer?, nltk.ShiftReduceParser?. 
The corpus, tagger, and classifier modules have been redesigned. The book has been revised and expanded, and the chapters have been reordered. NLTK has a new data package incorporating the existing corpus collection and adding new sections for pre-specified grammars and pre-computed models. The Floresta Portuguese Treebank has been added. For full details see the ChangeLog?. -NLTK talks in São Paulo : August 2007 +NLTK talks in São Paulo: August 2007 Steven Bird will present NLTK in a series of talks at the First Brazilian School on Computational Linguistics, at the University of São Paulo in the first week of September. -NLTK talk in Bay Area : July 2007 +NLTK talk in Bay Area: July 2007 Steven Bird, Ewan Klein, and Edward Loper will present NLTK at the Bay Area Python Interest Group, at Google on Thursday 12 July. -NLTK-Lite 0.8 released : July 2007 +NLTK-Lite 0.8 released: July 2007 This version is substantially revised and expanded from version 0.7. The code now includes improved interfaces to corpora, chunkers, grammars, frequency distributions, full integration with WordNet? 3.0 and WordNet? similarity measures. The book contains substantial revision of Part I (tokenization, tagging, chunking) and Part II (grammars and parsing). NLTK has several new corpora including the Switchboard Telephone Speech Corpus transcript sample (Talkbank Project), CMU Problem Reports Corpus sample, CONLL2002 POS+NER data, Patient Information Leaflet corpus sample, Indian POS-Tagged data (Bangla, Hindi, Marathi, Telugu), Shakespeare XML corpus sample, and the Universal Declaration of Human Rights corpus with text samples in 300+ languages. -NLTK features in Language Documentation and Conservation article : July 2007 +NLTK features in Language Documentation and Conservation article: July 2007 An article Managing Fieldwork Data with Toolbox and the Natural Language Toolkit by Stuart Robinson, Greg Aumann, and Steven Bird appears in the inaugural issue of ''Language Documentation and Conservation''. It discusses several small Python programs for manipulating field data. -NLTK features in ACM Crossroads article : May 2007 +NLTK features in ACM Crossroads article: May 2007 An article Getting Started on Natural Language Processing with Python by Nitin Madnani will appear in ''ACM Crossroads'', the ACM Student Journal. It discusses NLTK in detail, and provides several helpful examples including an entertaining free word association program. -NLTK-Lite 0.7.5 released : May 2007 +NLTK-Lite 0.7.5 released: May 2007 This version contains improved interfaces for WordNet 3.0 and WordNet-Similarity, the Lancaster Stemmer (contributed by Steven Tomcavage), and several new corpora including the Switchboard Telephone Speech Corpus transcript sample (Talkbank Project), CMU Problem Reports Corpus sample, CONLL2002 POS+NER data, Patient Information Leaflet corpus sample and WordNet 3.0 data files. With this distribution WordNet no longer needs to be separately installed. -NLTK-Lite 0.7.4 released : May 2007 +NLTK-Lite 0.7.4 released: May 2007 This release contains new corpora and corpus readers for Indian POS-Tagged data (Bangla, Hindi, Marathi, Telugu), and the Sinica Treebank, and substantial revision of Part II of the book on structured programming, grammars and parsing. 
-NLTK-Lite 0.7.3 released : April 2007 +NLTK-Lite 0.7.3 released: April 2007 This release contains improved chunker and PCFG interfaces, the Shakespeare XML corpus sample and corpus reader, improved tutorials and improved formatting of code samples, and categorization of problem sets by difficulty. -NLTK-Lite 0.7.2 released : March 2007 +NLTK-Lite 0.7.2 released: March 2007 This release contains new text classifiers (Cosine, NaiveBayes?, Spearman), contributed by Sam Huston, simple feature detectors, the UDHR corpus with text samples in 300+ languages and a corpus interface; improved tutorials (340 pages in total); additions to contrib area including Kimmo finite-state morphology system, Lambek calculus system, and a demonstration of text classifiers for language identification. -NLTK-Lite 0.7.1 released : January 2007 +NLTK-Lite 0.7.1 released: January 2007 This release contains bugfixes in the WordNet? and HMM modules. 2006 ---- -NLTK-Lite 0.7 released : December 2006 +NLTK-Lite 0.7 released: December 2006 This release contains: new semantic interpretation package (Ewan Klein), new support for SIL Toolbox format (Greg Aumann), new chunking package including cascaded chunking (Steven Bird), new interface to WordNet? 2.1 and Wordnet similarity measures (David Ormiston Smith), new support for Penn Treebank format (Yoav Goldberg), bringing the codebase to 48,000 lines; substantial new chapters on semantic interpretation and chunking, and substantial revisions to several other chapters, bringing the textbook documentation to 280 pages; -NLTK-Lite 0.7b1 released : December 2006 +NLTK-Lite 0.7b1 released: December 2006 This release contains: new semantic interpretation package (Ewan Klein), new support for SIL Toolbox format (Greg Aumann), new chunking package including cascaded chunking, wordnet package updated for version 2.1 of Wordnet, and prototype wordnet similarity measures (David Ormiston Smith), bringing the codebase to 48,000 lines; substantial new chapters on semantic interpretation and chunking, and substantial revisions to several other chapters, bringing the textbook documentation to 270 pages; -NLTK-Lite 0.6.6 released : October 2006 +NLTK-Lite 0.6.6 released: October 2006 This release contains bugfixes, improvements to Shoebox file format support, and expanded tutorial discussions of programming and feature-based grammars. -NLTK-Lite 0.6.5 released : July 2006 +NLTK-Lite 0.6.5 released: July 2006 This release contains improvements to Shoebox file format support (by Stuart Robinson and Greg Aumann); an implementation of hole semantics (by Peter Wang); improvements to lambda calculus and semantic interpretation modules (by Ewan Klein); a new corpus (Sinica Treebank sample); and expanded tutorial discussions of trees, feature-based grammar, unification, PCFGs, and more exercises. -NLTK-Lite passes 10k download milestone : May 2006 +NLTK-Lite passes 10k download milestone: May 2006 We have now had 10,000 downloads of NLTK-Lite in the nine months since it was first released. -NLTK-Lite 0.6.4 released : April 2006 +NLTK-Lite 0.6.4 released: April 2006 This release contains new corpora (Senseval 2, TIMIT sample), a clusterer, cascaded chunker, and several substantially revised tutorials. 2005 ---- -NLTK 1.4 no longer supported : December 2005 +NLTK 1.4 no longer supported: December 2005 The main development has switched to NLTK-Lite. The latest version of NLTK can still be downloaded; see the installation page for instructions. 
-NLTK-Lite 0.6 released : November 2005 +NLTK-Lite 0.6 released: November 2005 contains bug-fixes, PDF versions of tutorials, expanded fieldwork tutorial, PCFG grammar induction (by Nathan Bodenstab), and prototype concordance and paradigm display tools (by Peter Spiller and Will Hardy). -NLTK-Lite 0.5 released : September 2005 +NLTK-Lite 0.5 released: September 2005 contains bug-fixes, improved tutorials, more project suggestions, and a pronunciation dictionary. -NLTK-Lite 0.4 released : September 2005 +NLTK-Lite 0.4 released: September 2005 contains bug-fixes, improved tutorials, more project suggestions, and probabilistic parsers. -NLTK-Lite 0.3 released : August 2005 +NLTK-Lite 0.3 released: August 2005 contains bug-fixes, documentation clean-up, project suggestions, and the chart parser demos including one for Earley parsing by Jean Mark Gawron. -NLTK-Lite 0.2 released : July 2005 +NLTK-Lite 0.2 released: July 2005 contains bug-fixes, documentation clean-up, and some translations of tutorials into Brazilian Portuguese by Tiago Tresoldi. -NLTK-Lite 0.1 released : July 2005 +NLTK-Lite 0.1 released: July 2005 substantially simplified and streamlined version of NLTK has been released -Brazilian Portuguese Translation : April 2005 +Brazilian Portuguese Translation: April 2005 top-level pages of this website have been translated into Brazilian Portuguese by Tiago Tresoldi; translations of the tutorials are in preparation http://hermes.sourceforge.net/nltk-br/ -1.4.3 Release : February 2005 +1.4.3 Release: February 2005 NLTK 1.4.3 has been released; this is the first version which is compatible with Python 2.4.