diff --git a/AUTHORS b/AUTHORS index 52d0ee8e528..7454bc78067 100644 --- a/AUTHORS +++ b/AUTHORS @@ -96,5 +96,3 @@ authors and projects: * sphinx.util.jsdump uses the basestring encoding from simplejson, written by Bob Ippolito, released under the MIT license -* sphinx.util.stemmer was written by Vivake Gupta, placed in the - Public Domain diff --git a/sphinx/search/en.py b/sphinx/search/en.py index 53cd917dc7f..19bd9f019fb 100644 --- a/sphinx/search/en.py +++ b/sphinx/search/en.py @@ -2,8 +2,9 @@ from typing import Dict +import snowballstemmer + from sphinx.search import SearchLanguage -from sphinx.util.stemmer import get_stemmer english_stopwords = set(""" a and are as at @@ -211,7 +212,7 @@ class SearchEnglish(SearchLanguage): stopwords = english_stopwords def init(self, options: Dict) -> None: - self.stemmer = get_stemmer() + self.stemmer = snowballstemmer.stemmer('porter') def stem(self, word: str) -> str: - return self.stemmer.stem(word.lower()) + return self.stemmer.stemWord(word.lower()) diff --git a/sphinx/search/zh.py b/sphinx/search/zh.py index 700c2683f72..86f612d5db1 100644 --- a/sphinx/search/zh.py +++ b/sphinx/search/zh.py @@ -4,8 +4,9 @@ import re from typing import Dict, List +import snowballstemmer + from sphinx.search import SearchLanguage -from sphinx.util.stemmer import get_stemmer try: import jieba @@ -230,7 +231,7 @@ def init(self, options: Dict) -> None: if dict_path and os.path.isfile(dict_path): jieba.load_userdict(dict_path) - self.stemmer = get_stemmer() + self.stemmer = snowballstemmer.stemmer('english') def split(self, input: str) -> List[str]: chinese: List[str] = [] @@ -252,8 +253,8 @@ def stem(self, word: str) -> str: should_not_be_stemmed = ( word in self.latin_terms and len(word) >= 3 and - len(self.stemmer.stem(word.lower())) < 3 + len(self.stemmer.stemWord(word.lower())) < 3 ) if should_not_be_stemmed: return word.lower() - return self.stemmer.stem(word.lower()) + return self.stemmer.stemWord(word.lower()) diff --git a/sphinx/util/stemmer/__init__.py b/sphinx/util/stemmer/__init__.py index ff6c365c74d..6d27592d872 100644 --- a/sphinx/util/stemmer/__init__.py +++ b/sphinx/util/stemmer/__init__.py @@ -1,37 +1,62 @@ """Word stemming utilities for Sphinx.""" -from sphinx.util.stemmer.porter import PorterStemmer +import warnings -try: - from Stemmer import Stemmer as _PyStemmer - PYSTEMMER = True -except ImportError: - PYSTEMMER = False +import snowballstemmer + +from sphinx.deprecation import RemovedInSphinx70Warning + + +class PorterStemmer: + def __init__(self): + warnings.warn(f"{self.__class__.__name__} is deprecated, use " + "snowballstemmer.stemmer('porter') instead.", + RemovedInSphinx70Warning, stacklevel=2) + self.stemmer = snowballstemmer.stemmer('porter') + + def stem(self, p: str, i: int, j: int) -> str: + warnings.warn(f"{self.__class__.__name__}.stem() is deprecated, use " + "snowballstemmer.stemmer('porter').stemWord() instead.", + RemovedInSphinx70Warning, stacklevel=2) + return self.stemmer.stemWord(p) class BaseStemmer: + def __init__(self): + warnings.warn(f"{self.__class__.__name__} is deprecated, use " + "snowballstemmer.stemmer('porter') instead.", + RemovedInSphinx70Warning, stacklevel=3) + def stem(self, word: str) -> str: - raise NotImplementedError() + raise NotImplementedError class PyStemmer(BaseStemmer): - def __init__(self) -> None: - self.stemmer = _PyStemmer('porter') + def __init__(self): # NoQA + super().__init__() + self.stemmer = snowballstemmer.stemmer('porter') def stem(self, word: str) -> str: + warnings.warn(f"{self.__class__.__name__}.stem() is deprecated, use " + "snowballstemmer.stemmer('porter').stemWord() instead.", + RemovedInSphinx70Warning, stacklevel=2) return self.stemmer.stemWord(word) -class StandardStemmer(PorterStemmer, BaseStemmer): - """All those porter stemmer implementations look hideous; - make at least the stem method nicer. - """ - def stem(self, word: str) -> str: # type: ignore - return super().stem(word, 0, len(word) - 1) +class StandardStemmer(BaseStemmer): + def __init__(self): # NoQA + super().__init__() + self.stemmer = snowballstemmer.stemmer('porter') + + def stem(self, word: str) -> str: + warnings.warn(f"{self.__class__.__name__}.stem() is deprecated, use " + "snowballstemmer.stemmer('porter').stemWord() instead.", + RemovedInSphinx70Warning, stacklevel=2) + return self.stemmer.stemWord(word) def get_stemmer() -> BaseStemmer: - if PYSTEMMER: - return PyStemmer() - else: - return StandardStemmer() + warnings.warn("get_stemmer() is deprecated, use " + "snowballstemmer.stemmer('porter') instead.", + RemovedInSphinx70Warning, stacklevel=2) + return PyStemmer() diff --git a/sphinx/util/stemmer/porter.py b/sphinx/util/stemmer/porter.py deleted file mode 100644 index c4f89eb9539..00000000000 --- a/sphinx/util/stemmer/porter.py +++ /dev/null @@ -1,406 +0,0 @@ -"""Porter Stemming Algorithm - -This is the Porter stemming algorithm, ported to Python from the -version coded up in ANSI C by the author. It may be be regarded -as canonical, in that it follows the algorithm presented in - -Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, -no. 3, pp 130-137, - -only differing from it at the points made --DEPARTURE-- below. - -See also https://tartarus.org/martin/PorterStemmer/ - -The algorithm as described in the paper could be exactly replicated -by adjusting the points of DEPARTURE, but this is barely necessary, -because (a) the points of DEPARTURE are definitely improvements, and -(b) no encoding of the Porter stemmer I have seen is anything like -as exact as this version, even with the points of DEPARTURE! - -Release 1: January 2001 - -:author: Vivake Gupta . -:license: Public Domain ("can be used free of charge for any purpose"). -""" - - -class PorterStemmer: - - def __init__(self) -> None: - """The main part of the stemming algorithm starts here. - b is a buffer holding a word to be stemmed. The letters are in b[k0], - b[k0+1] ... ending at b[k]. In fact k0 = 0 in this demo program. k is - readjusted downwards as the stemming progresses. Zero termination is - not in fact used in the algorithm. - - Note that only lower case sequences are stemmed. Forcing to lower case - should be done before stem(...) is called. - """ - - self.b = "" # buffer for word to be stemmed - self.k = 0 - self.k0 = 0 - self.j = 0 # j is a general offset into the string - - def cons(self, i: int) -> int: - """cons(i) is TRUE <=> b[i] is a consonant.""" - if self.b[i] == 'a' or self.b[i] == 'e' or self.b[i] == 'i' \ - or self.b[i] == 'o' or self.b[i] == 'u': - return 0 - if self.b[i] == 'y': - if i == self.k0: - return 1 - else: - return (not self.cons(i - 1)) - return 1 - - def m(self) -> int: - """m() measures the number of consonant sequences between k0 and j. - if c is a consonant sequence and v a vowel sequence, and <..> - indicates arbitrary presence, - - gives 0 - vc gives 1 - vcvc gives 2 - vcvcvc gives 3 - .... - """ - n = 0 - i = self.k0 - while 1: - if i > self.j: - return n - if not self.cons(i): - break - i = i + 1 - i = i + 1 - while 1: - while 1: - if i > self.j: - return n - if self.cons(i): - break - i = i + 1 - i = i + 1 - n = n + 1 - while 1: - if i > self.j: - return n - if not self.cons(i): - break - i = i + 1 - i = i + 1 - - def vowelinstem(self) -> int: - """vowelinstem() is TRUE <=> k0,...j contains a vowel""" - for i in range(self.k0, self.j + 1): - if not self.cons(i): - return 1 - return 0 - - def doublec(self, j: int) -> int: - """doublec(j) is TRUE <=> j,(j-1) contain a double consonant.""" - if j < (self.k0 + 1): - return 0 - if (self.b[j] != self.b[j - 1]): - return 0 - return self.cons(j) - - def cvc(self, i: int) -> int: - """cvc(i) is TRUE <=> i-2,i-1,i has the form - consonant - vowel - consonant - and also if the second c is not w,x or y. this is used when trying to - restore an e at the end of a short e.g. - - cav(e), lov(e), hop(e), crim(e), but - snow, box, tray. - """ - if i < (self.k0 + 2) or not self.cons(i) or self.cons(i - 1) \ - or not self.cons(i - 2): - return 0 - ch = self.b[i] - if ch in ('w', 'x', 'y'): - return 0 - return 1 - - def ends(self, s: str) -> int: - """ends(s) is TRUE <=> k0,...k ends with the string s.""" - length = len(s) - if s[length - 1] != self.b[self.k]: # tiny speed-up - return 0 - if length > (self.k - self.k0 + 1): - return 0 - if self.b[self.k - length + 1:self.k + 1] != s: - return 0 - self.j = self.k - length - return 1 - - def setto(self, s: str) -> None: - """setto(s) sets (j+1),...k to the characters in the string s, - readjusting k.""" - length = len(s) - self.b = self.b[:self.j + 1] + s + self.b[self.j + length + 1:] - self.k = self.j + length - - def r(self, s: str) -> None: - """r(s) is used further down.""" - if self.m() > 0: - self.setto(s) - - def step1ab(self) -> None: - """step1ab() gets rid of plurals and -ed or -ing. e.g. - - caresses -> caress - ponies -> poni - ties -> ti - caress -> caress - cats -> cat - - feed -> feed - agreed -> agree - disabled -> disable - - matting -> mat - mating -> mate - meeting -> meet - milling -> mill - messing -> mess - - meetings -> meet - """ - if self.b[self.k] == 's': - if self.ends("sses"): - self.k = self.k - 2 - elif self.ends("ies"): - self.setto("i") - elif self.b[self.k - 1] != 's': - self.k = self.k - 1 - if self.ends("eed"): - if self.m() > 0: - self.k = self.k - 1 - elif (self.ends("ed") or self.ends("ing")) and self.vowelinstem(): - self.k = self.j - if self.ends("at"): - self.setto("ate") - elif self.ends("bl"): - self.setto("ble") - elif self.ends("iz"): - self.setto("ize") - elif self.doublec(self.k): - self.k = self.k - 1 - ch = self.b[self.k] - if ch in ('l', 's', 'z'): - self.k = self.k + 1 - elif (self.m() == 1 and self.cvc(self.k)): - self.setto("e") - - def step1c(self) -> None: - """step1c() turns terminal y to i when there is another vowel in - the stem.""" - if (self.ends("y") and self.vowelinstem()): - self.b = self.b[:self.k] + 'i' + self.b[self.k + 1:] - - def step2(self) -> None: - """step2() maps double suffices to single ones. - so -ization ( = -ize plus -ation) maps to -ize etc. note that the - string before the suffix must give m() > 0. - """ - if self.b[self.k - 1] == 'a': - if self.ends("ational"): - self.r("ate") - elif self.ends("tional"): - self.r("tion") - elif self.b[self.k - 1] == 'c': - if self.ends("enci"): - self.r("ence") - elif self.ends("anci"): - self.r("ance") - elif self.b[self.k - 1] == 'e': - if self.ends("izer"): - self.r("ize") - elif self.b[self.k - 1] == 'l': - if self.ends("bli"): - self.r("ble") # --DEPARTURE-- - # To match the published algorithm, replace this phrase with - # if self.ends("abli"): self.r("able") - elif self.ends("alli"): - self.r("al") - elif self.ends("entli"): - self.r("ent") - elif self.ends("eli"): - self.r("e") - elif self.ends("ousli"): - self.r("ous") - elif self.b[self.k - 1] == 'o': - if self.ends("ization"): - self.r("ize") - elif self.ends("ation"): - self.r("ate") - elif self.ends("ator"): - self.r("ate") - elif self.b[self.k - 1] == 's': - if self.ends("alism"): - self.r("al") - elif self.ends("iveness"): - self.r("ive") - elif self.ends("fulness"): - self.r("ful") - elif self.ends("ousness"): - self.r("ous") - elif self.b[self.k - 1] == 't': - if self.ends("aliti"): - self.r("al") - elif self.ends("iviti"): - self.r("ive") - elif self.ends("biliti"): - self.r("ble") - elif self.b[self.k - 1] == 'g': # --DEPARTURE-- - if self.ends("logi"): - self.r("log") - # To match the published algorithm, delete this phrase - - def step3(self) -> None: - """step3() dels with -ic-, -full, -ness etc. similar strategy - to step2.""" - if self.b[self.k] == 'e': - if self.ends("icate"): - self.r("ic") - elif self.ends("ative"): - self.r("") - elif self.ends("alize"): - self.r("al") - elif self.b[self.k] == 'i': - if self.ends("iciti"): - self.r("ic") - elif self.b[self.k] == 'l': - if self.ends("ical"): - self.r("ic") - elif self.ends("ful"): - self.r("") - elif self.b[self.k] == 's': - if self.ends("ness"): - self.r("") - - def step4(self) -> None: - """step4() takes off -ant, -ence etc., in context vcvc.""" - if self.b[self.k - 1] == 'a': - if self.ends("al"): - pass - else: - return - elif self.b[self.k - 1] == 'c': - if self.ends("ance"): - pass - elif self.ends("ence"): - pass - else: - return - elif self.b[self.k - 1] == 'e': - if self.ends("er"): - pass - else: - return - elif self.b[self.k - 1] == 'i': - if self.ends("ic"): - pass - else: - return - elif self.b[self.k - 1] == 'l': - if self.ends("able"): - pass - elif self.ends("ible"): - pass - else: - return - elif self.b[self.k - 1] == 'n': - if self.ends("ant"): - pass - elif self.ends("ement"): - pass - elif self.ends("ment"): - pass - elif self.ends("ent"): - pass - else: - return - elif self.b[self.k - 1] == 'o': - if self.ends("ion") and (self.b[self.j] == 's' or - self.b[self.j] == 't'): - pass - elif self.ends("ou"): - pass - # takes care of -ous - else: - return - elif self.b[self.k - 1] == 's': - if self.ends("ism"): - pass - else: - return - elif self.b[self.k - 1] == 't': - if self.ends("ate"): - pass - elif self.ends("iti"): - pass - else: - return - elif self.b[self.k - 1] == 'u': - if self.ends("ous"): - pass - else: - return - elif self.b[self.k - 1] == 'v': - if self.ends("ive"): - pass - else: - return - elif self.b[self.k - 1] == 'z': - if self.ends("ize"): - pass - else: - return - else: - return - if self.m() > 1: - self.k = self.j - - def step5(self) -> None: - """step5() removes a final -e if m() > 1, and changes -ll to -l if - m() > 1. - """ - self.j = self.k - if self.b[self.k] == 'e': - a = self.m() - if a > 1 or (a == 1 and not self.cvc(self.k - 1)): - self.k = self.k - 1 - if self.b[self.k] == 'l' and self.doublec(self.k) and self.m() > 1: - self.k = self.k - 1 - - def stem(self, p: str, i: int, j: int) -> str: - """In stem(p,i,j), p is a char pointer, and the string to be stemmed - is from p[i] to p[j] inclusive. Typically i is zero and j is the - offset to the last character of a string, (p[j+1] == '\0'). The - stemmer adjusts the characters p[i] ... p[j] and returns the new - end-point of the string, k. Stemming never increases word length, so - i <= k <= j. To turn the stemmer into a module, declare 'stem' as - extern, and delete the remainder of this file. - """ - # copy the parameters into statics - self.b = p - self.k = j - self.k0 = i - if self.k <= self.k0 + 1: - return self.b # --DEPARTURE-- - - # With this line, strings of length 1 or 2 don't go through the - # stemming process, although no mention is made of this in the - # published algorithm. Remove the line to match the published - # algorithm. - - self.step1ab() - self.step1c() - self.step2() - self.step3() - self.step4() - self.step5() - return self.b[self.k0:self.k + 1]