diff --git a/AUTHORS.md b/AUTHORS.md index 5b7546d43c..3e4078df72 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -280,6 +280,7 @@ - Hiroki Teranishi - Ruben Cartuyvels - Dalton Pearson +- Abdul Rafey Khan - Robby Horvath - Gavish Poddar - Saibo Geng diff --git a/nltk/parse/corenlp.py b/nltk/parse/corenlp.py index 15e5e3fa09..2ebf362cef 100644 --- a/nltk/parse/corenlp.py +++ b/nltk/parse/corenlp.py @@ -326,7 +326,7 @@ def tokenize(self, text, properties=None): for token in sentence["tokens"]: yield token["originalText"] or token["word"] - def tag_sents(self, sentences): + def tag_sents(self, sentences, properties=None): """ Tag multiple sentences. @@ -339,9 +339,11 @@ def tag_sents(self, sentences): """ # Converting list(list(str)) -> list(str) sentences = (" ".join(words) for words in sentences) - return [sentences[0] for sentences in self.raw_tag_sents(sentences)] + if properties is None: + properties = {"tokenize.whitespace": "true"} + return [sentences[0] for sentences in self.raw_tag_sents(sentences, properties)] - def tag(self, sentence): + def tag(self, sentence, properties=None): """ Tag a list of tokens. @@ -360,9 +362,9 @@ def tag(self, sentence): ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')] """ - return self.tag_sents([sentence])[0] + return self.tag_sents([sentence], properties)[0] - def raw_tag_sents(self, sentences): + def raw_tag_sents(self, sentences, properties=None): """ Tag multiple sentences. @@ -377,8 +379,13 @@ def raw_tag_sents(self, sentences): "annotators": "tokenize,ssplit,", } + default_properties.update(properties or {}) + # Supports only 'pos' or 'ner' tags. - assert self.tagtype in ["pos", "ner"] + assert self.tagtype in [ + "pos", + "ner", + ], "CoreNLP tagger supports only 'pos' or 'ner' tags." default_properties["annotators"] += self.tagtype for sentence in sentences: tagged_data = self.api_call(sentence, properties=default_properties)