From bbbb697122a373a522dfbc0ff9742b2db9356bc5 Mon Sep 17 00:00:00 2001 From: BatMrE <48859022+BatMrE@users.noreply.github.com> Date: Sun, 29 Aug 2021 17:52:12 +0530 Subject: [PATCH] Allowed properties overloading for CoreNLPParser tag, Authors.md --- AUTHORS.md | 1 + nltk/parse/corenlp.py | 19 +++++++++++++------ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/AUTHORS.md b/AUTHORS.md index 10d1a34667..fd55ce0bda 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -280,6 +280,7 @@ - Hiroki Teranishi - Ruben Cartuyvels - Dalton Pearson +- Abdul Rafey Khan ## Others whose work we've taken and included in NLTK, but who didn't directly contribute it: diff --git a/nltk/parse/corenlp.py b/nltk/parse/corenlp.py index fa791398a1..2b2b54903f 100644 --- a/nltk/parse/corenlp.py +++ b/nltk/parse/corenlp.py @@ -326,7 +326,7 @@ def tokenize(self, text, properties=None): for token in sentence["tokens"]: yield token["originalText"] or token["word"] - def tag_sents(self, sentences): + def tag_sents(self, sentences, properties=None): """ Tag multiple sentences. @@ -339,9 +339,11 @@ def tag_sents(self, sentences): """ # Converting list(list(str)) -> list(str) sentences = (" ".join(words) for words in sentences) - return [sentences[0] for sentences in self.raw_tag_sents(sentences)] + if properties is None: + properties = {"tokenize.whitespace": "true"} + return [sentences[0] for sentences in self.raw_tag_sents(sentences, properties)] - def tag(self, sentence): + def tag(self, sentence, properties=None): """ Tag a list of tokens. @@ -360,9 +362,9 @@ def tag(self, sentence): ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')] """ - return self.tag_sents([sentence])[0] + return self.tag_sents([sentence], properties)[0] - def raw_tag_sents(self, sentences): + def raw_tag_sents(self, sentences, properties=None): """ Tag multiple sentences. @@ -377,8 +379,13 @@ def raw_tag_sents(self, sentences): "annotators": "tokenize,ssplit,", } + default_properties.update(properties or {}) + # Supports only 'pos' or 'ner' tags. - assert self.tagtype in ["pos", "ner"] + assert self.tagtype in [ + "pos", + "ner", + ], "CoreNLP tagger supports only 'pos' or 'ner' tags." default_properties["annotators"] += self.tagtype for sentence in sentences: tagged_data = self.api_call(sentence, properties=default_properties)