diff --git a/AUTHORS b/AUTHORS
index 52d0ee8e528..7454bc78067 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -96,5 +96,3 @@ authors and projects:
 * sphinx.util.jsdump uses the basestring encoding from simplejson,
   written by Bob Ippolito, released under the MIT license
-* sphinx.util.stemmer was written by Vivake Gupta, placed in the
-  Public Domain
diff --git a/sphinx/search/zh.py b/sphinx/search/zh.py
index 700c2683f72..86f612d5db1 100644
--- a/sphinx/search/zh.py
+++ b/sphinx/search/zh.py
@@ -4,8 +4,9 @@
 import re
 from typing import Dict, List
 
+import snowballstemmer
+
 from sphinx.search import SearchLanguage
-from sphinx.util.stemmer import get_stemmer
 
 try:
     import jieba
@@ -230,7 +231,7 @@ def init(self, options: Dict) -> None:
         if dict_path and os.path.isfile(dict_path):
             jieba.load_userdict(dict_path)
 
-        self.stemmer = get_stemmer()
+        self.stemmer = snowballstemmer.stemmer('english')
 
     def split(self, input: str) -> List[str]:
         chinese: List[str] = []
@@ -252,8 +253,8 @@ def stem(self, word: str) -> str:
         should_not_be_stemmed = (
             word in self.latin_terms and
             len(word) >= 3 and
-            len(self.stemmer.stem(word.lower())) < 3
+            len(self.stemmer.stemWord(word.lower())) < 3
         )
         if should_not_be_stemmed:
             return word.lower()
-        return self.stemmer.stem(word.lower())
+        return self.stemmer.stemWord(word.lower())
diff --git a/sphinx/util/stemmer/__init__.py b/sphinx/util/stemmer/__init__.py
index 330b62d8ee9..2e2c41ac701 100644
--- a/sphinx/util/stemmer/__init__.py
+++ b/sphinx/util/stemmer/__init__.py
@@ -1,16 +1,67 @@
 """Word stemming utilities for Sphinx."""
 
+import warnings
+
 import snowballstemmer
 
-from sphinx.util.stemmer.porter import PorterStemmer
+from sphinx.deprecation import RemovedInSphinx70Warning
+
+_ENGLISH_STEMMER_TYPE = type(snowballstemmer.stemmer('english'))
+
+
+class PorterStemmer(_ENGLISH_STEMMER_TYPE):
+    def __init__(self):
+        warnings.warn(f"{self.__class__.__name__} is deprecated, use "
+                      "snowballstemmer.stemmer('english') instead.",
+                      RemovedInSphinx70Warning, stacklevel=2)
+        super().__init__()
+
+    def stem(self, p: str, i: int, j: int) -> str:
+        warnings.warn(f"{self.__class__.__name__}.stem() is deprecated, use "
+                      "snowballstemmer.stemmer('english').stemWord() instead.",
+                      RemovedInSphinx70Warning, stacklevel=2)
+        return self.stemWord(p)
 
 
 class BaseStemmer:
+    def __init__(self):
+        warnings.warn(f"{self.__class__.__name__} is deprecated.",
+                      RemovedInSphinx70Warning, stacklevel=2)
+        super().__init__()
+
     def stem(self, word: str) -> str:
         raise NotImplementedError
 
 
-def get_stemmer() -> BaseStemmer:
-    stemmer = snowballstemmer.stemmer('english')
+class PyStemmer(BaseStemmer, _ENGLISH_STEMMER_TYPE):
+    def __init__(self):
+        warnings.warn(f"{self.__class__.__name__} is deprecated, use "
+                      "snowballstemmer.stemmer('english') instead.",
+                      RemovedInSphinx70Warning, stacklevel=2)
+        super().__init__()
+
+    def stem(self, word: str) -> str:
+        warnings.warn(f"{self.__class__.__name__}.stem() is deprecated, use "
+                      "snowballstemmer.stemmer('english').stemWord() instead.",
+                      RemovedInSphinx70Warning, stacklevel=2)
+        return self.stemWord(word)
+
+
+class StandardStemmer(BaseStemmer, PorterStemmer):
+    def __init__(self):
+        warnings.warn(f"{self.__class__.__name__} is deprecated, use "
+                      "snowballstemmer.stemmer('english') instead.",
+                      RemovedInSphinx70Warning, stacklevel=2)
+        super().__init__()
+
+    def stem(self, word: str) -> str:
+        warnings.warn(f"{self.__class__.__name__}.stem() is deprecated, use "
+                      "snowballstemmer.stemmer('english').stemWord() instead.",
+                      RemovedInSphinx70Warning, stacklevel=2)
+        return self.stemWord(word)
+
+
+def get_stemmer() -> BaseStemmer:
+    warnings.warn("get_stemmer() is deprecated, use "
+                  "snowballstemmer.stemmer('english') instead.",
+                  RemovedInSphinx70Warning, stacklevel=2)
+    return PyStemmer()
diff --git a/sphinx/util/stemmer/porter.py b/sphinx/util/stemmer/porter.py
deleted file mode 100644
index ac1e80e064e..00000000000
--- a/sphinx/util/stemmer/porter.py
+++ /dev/null
@@ -1,386 +0,0 @@
-"""Porter Stemming Algorithm
-
-This is the Porter stemming algorithm, ported to Python from the
-version coded up in ANSI C by the author. It may be be regarded
-as canonical, in that it follows the algorithm presented in
-
-Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
-no. 3, pp 130-137,
-
-only differing from it at the points made --DEPARTURE-- below.
-
-See also https://tartarus.org/martin/PorterStemmer/
-
-The algorithm as described in the paper could be exactly replicated
-by adjusting the points of DEPARTURE, but this is barely necessary,
-because (a) the points of DEPARTURE are definitely improvements, and
-(b) no encoding of the Porter stemmer I have seen is anything like
-as exact as this version, even with the points of DEPARTURE!
-
-Release 1: January 2001
-
-:author: Vivake Gupta.
-:license: Public Domain ("can be used free of charge for any purpose").
-"""
-
-__all__ = ("PorterStemmer",)
-
-
-def is_consonant(word: str, i: int) -> bool:
-    """is_consonant(word, i) is True <=> word[i] is a consonant."""
-    char = word[i]
-    if char in {'a', 'e', 'i', 'o', 'u'}:
-        return False
-    if char == 'y':
-        if word[i] == word[0]:
-            return True
-        return not is_consonant(word, i - 1)
-    return True
-
-
-def measure_consonant_sequences(word: str, end: int) -> int:
-    """Measures the number of consonant sequences in word[:end].
-    if c is a consonant sequence and v a vowel sequence, and <..>
-    indicates arbitrary presence,
-
-       <c><v>       gives 0
-       <c>vc<v>     gives 1
-       <c>vcvc<v>   gives 2
-       <c>vcvcvc<v> gives 3
-       ....
-    """
-    i = 0
-    while (i <= end) and is_consonant(word, i):
-        i += 1
-    n = 0
-    while True:
-        while (i <= end) and not is_consonant(word, i):
-            i += 1
-
-        if i > end:
-            return n
-        n += 1
-
-        while (i <= end) and is_consonant(word, i):
-            i += 1
-
-
-def vowel_in_stem(word: str, end: int) -> bool:
-    """vowel_in_stem() is True <=> word[:end] contains a vowel"""
-    for i in range(end + 1):
-        if not is_consonant(word, i):
-            return True
-    return False
-
-
-def double_consonant(word: str) -> bool:
-    """True <=> word[-2:] contains a double consonant."""
-    return (
-        len(word) >= 2
-        and word[-1] == word[-2]
-        and is_consonant(word, -1)
-    )
-
-
-def consonant_vowel_consonant(word: str) -> bool:
-    """consonant_vowel_consonant(word) is TRUE <=> word[-3:] has the form
-    consonant - vowel - consonant
-    and also if the second c is not w,x or y. this is used when trying to
-    restore an e at the end of a short word, e.g.
-
-       cav(e), lov(e), hop(e), crim(e), but
-       snow, box, tray.
-    """
-    return (
-        len(word) >= 3
-        and is_consonant(word, -1)
-        and not is_consonant(word, -2)
-        and is_consonant(word, -3)
-        and word[-1] not in {'w', 'x', 'y'}
-    )
-
-
-class PorterStemmer:
-
-    def __init__(self) -> None:
-        """The main part of the stemming algorithm starts here.
-
-        Note that only lower case sequences are stemmed. Forcing to lower case
-        should be done before stem(...) is called.
-        """
-
-        self.word: str = ""  # buffer for word to be stemmed
-        self.j: int = 0  # j is a general offset into the string
-
-    def ends(self, string: str) -> bool:
-        """True <=> b[:k+1] ends with the given string."""
-        if self.word.endswith(string) and string:
-            self.j = len(self.word.removesuffix(string)) - 1
-            return True
-        return False
-
-    def replace(self, ends_with: str, replace: str) -> bool:
-        if self.word.endswith(ends_with) and ends_with:
-            self.j = len(self.word.removesuffix(ends_with)) - 1
-            if measure_consonant_sequences(self.word, self.j) > 0:
-                self.word = self.word[:self.j + 1] + replace
-            return True
-        return False
-
-    def step1ab(self) -> None:
-        """step1ab() gets rid of plurals and -ed or -ing. e.g.
-
-           caresses  ->  caress
-           ponies    ->  poni
-           ties      ->  ti
-           caress    ->  caress
-           cats      ->  cat
-
-           feed      ->  feed
-           agreed    ->  agree
-           disabled  ->  disable
-
-           matting   ->  mat
-           mating    ->  mate
-           meeting   ->  meet
-           milling   ->  mill
-           messing   ->  mess
-
-           meetings  ->  meet
-        """
-        if self.word[-1] == 's':
-            if self.ends("sses"):
-                self.word = self.word[:-2]
-            elif self.ends("ies"):
-                self.word = self.word[:-3] + 'i'
-            elif self.word[-2] != 's':
-                self.word = self.word[:-1]
-        if self.ends("eed"):
-            end = len(self.word) - 1 - 3
-            if measure_consonant_sequences(self.word, end) > 0:
-                self.word = self.word[:-1]
-        elif (self.ends("ed") or self.ends("ing")) and vowel_in_stem(self.word, self.j):
-            self.word = self.word[:self.j + 1]
-            if self.ends("at"):
-                self.word = self.word[:-2] + 'ate'
-            elif self.ends("bl"):
-                self.word = self.word[:-2] + 'ble'
-            elif self.ends("iz"):
-                self.word = self.word[:-2] + 'ize'
-            elif double_consonant(self.word):
-                if self.word[-2] not in {'l', 's', 'z'}:
-                    self.word = self.word[:-1]
-            elif measure_consonant_sequences(self.word, self.j) == 1 and consonant_vowel_consonant(self.word):
-                self.word = self.word[:self.j + 1] + "e"
-
-    def step1c(self) -> None:
-        """step1c() turns terminal y to i when there is another vowel in
-        the stem."""
-        if self.ends("y") and vowel_in_stem(self.word, len(self.word) - 2):
-            self.word = self.word[:-1] + 'i'
-
-    def step2(self) -> None:
-        """step2() maps double suffices to single ones.
-        so -ization ( = -ize plus -ation) maps to -ize etc. note that the
-        string before the suffix must give measure_consonant_sequences(self.word, self.j) > 0.
-        """
-        char = self.word[-2]
-        if char == 'a':
-            if self.replace("ational", "ate"):
-                pass
-            elif self.replace("tional", "tion"):
-                pass
-        elif char == 'c':
-            if self.replace("enci", "ence"):
-                pass
-            elif self.replace("anci", "ance"):
-                pass
-        elif char == 'e':
-            if self.replace("izer", "ize"):
-                pass
-        elif char == 'l':
-            if self.replace("bli", "ble"):  # --DEPARTURE--
-                pass
-            # To match the published algorithm, replace this phrase with
-            #   if self.replace("abli", "able"):
-            #       pass
-            elif self.replace("alli", "al"):
-                pass
-            elif self.replace("entli", "ent"):
-                pass
-            elif self.replace("eli", "e"):
-                pass
-            elif self.replace("ousli", "ous"):
-                pass
-        elif char == 'o':
-            if self.replace("ization", "ize"):
-                pass
-            elif self.replace("ation", "ate"):
-                pass
-            elif self.replace("ator", "ate"):
-                pass
-        elif char == 's':
-            if self.replace("alism", "al"):
-                pass
-            elif self.replace("iveness", "ive"):
-                pass
-            elif self.replace("fulness", "ful"):
-                pass
-            elif self.replace("ousness", "ous"):
-                pass
-        elif char == 't':
-            if self.replace("aliti", "al"):
-                pass
-            elif self.replace("iviti", "ive"):
-                pass
-            elif self.replace("biliti", "ble"):
-                pass
-        # To match the published algorithm, delete this phrase
-        elif char == 'g':  # --DEPARTURE--
-            if self.replace("logi", "log"):
-                pass
-
-    def step3(self) -> None:
-        """step3() dels with -ic-, -full, -ness etc. similar strategy
-        to step2."""
-        char = self.word[-1]
-        if char == 'e':
-            if self.replace("icate", "ic"):
-                pass
-            elif self.replace("ative", ""):
-                pass
-            elif self.replace("alize", "al"):
-                pass
-        elif char == 'i':
-            if self.replace("iciti", "ic"):
-                pass
-        elif char == 'l':
-            if self.replace("ical", "ic"):
-                pass
-            elif self.replace("ful", ""):
-                pass
-        elif char == 's':
-            if self.replace("ness", ""):
-                pass
-
-    def step4(self) -> None:
-        """step4() takes off -ant, -ence etc., in context <c>vcvc<v>."""
-        char = self.word[-2]
-        if char == 'a':
-            if self.ends("al"):
-                pass
-            else:
-                return
-        elif char == 'c':
-            if self.ends("ance"):
-                pass
-            elif self.ends("ence"):
-                pass
-            else:
-                return
-        elif char == 'e':
-            if self.ends("er"):
-                pass
-            else:
-                return
-        elif char == 'i':
-            if self.ends("ic"):
-                pass
-            else:
-                return
-        elif char == 'l':
-            if self.ends("able"):
-                pass
-            elif self.ends("ible"):
-                pass
-            else:
-                return
-        elif char == 'n':
-            if self.ends("ant"):
-                pass
-            elif self.ends("ement"):
-                pass
-            elif self.ends("ment"):
-                pass
-            elif self.ends("ent"):
-                pass
-            else:
-                return
-        elif char == 'o':
-            if self.ends("ion") and (self.word[self.j] in {'s', 't'}):
-                pass
-            elif self.ends("ou"):
-                pass
-            # takes care of -ous
-            else:
-                return
-        elif char == 's':
-            if self.ends("ism"):
-                pass
-            else:
-                return
-        elif char == 't':
-            if self.ends("ate"):
-                pass
-            elif self.ends("iti"):
-                pass
-            else:
-                return
-        elif char == 'u':
-            if self.ends("ous"):
-                pass
-            else:
-                return
-        elif char == 'v':
-            if self.ends("ive"):
-                pass
-            else:
-                return
-        elif char == 'z':
-            if self.ends("ize"):
-                pass
-            else:
-                return
-        else:
-            return
-        if measure_consonant_sequences(self.word, self.j) > 1:
-            self.word = self.word[:self.j + 1]
-
-    def step5(self) -> None:
-        """step5() removes a final -e if measure_consonant_sequences(self.word, self.j) > 1, and changes -ll to -l if
-        measure_consonant_sequences(self.word, self.j) > 1.
-        """
-        if self.word[-1] == 'e':
-            a = measure_consonant_sequences(self.word, len(self.word) - 1)
-            if a > 1 or (a == 1 and not consonant_vowel_consonant(self.word[:-1])):
-                self.word = self.word[:-1]
-        if self.word[-1] == 'l' and double_consonant(self.word) and measure_consonant_sequences(self.word, len(self.word) - 1) > 1:
-            self.word = self.word[:-1]
-
-    def stem(self, word: str) -> str:
-        """The string to be stemmed is ``word``.
-        The stemmer returns the stemmed string.
-        """
-
-        # With this line, strings of length 1 or 2 don't go through the
-        # stemming process, although no mention is made of this in the
-        # published algorithm. Remove the line to match the published
-        # algorithm.
-        if len(word) <= 2:
-            return word  # --DEPARTURE--
-
-        # copy the parameters into statics
-        self.word = word
-
-        self.step1ab()
-        self.step1c()
-        self.step2()
-        self.step3()
-        self.step4()
-        self.step5()
-        return self.word
-
-
-if __name__ == '__main__':
-    stemmer = PorterStemmer()
-    stemmer.stem("agreed")
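
Note on the replacement API: after this patch, every call site goes through the third-party snowballstemmer package instead of the deleted pure-Python PorterStemmer. A minimal usage sketch of that API follows; the sample words are hypothetical and not part of this patch:

    import snowballstemmer

    # snowballstemmer.stemmer('english') returns an object whose stemWord()
    # method replaces the old get_stemmer().stem() call used by sphinx.search.zh.
    stemmer = snowballstemmer.stemmer('english')
    for word in ('meetings', 'agreed', 'ponies'):  # hypothetical sample input
        print(word, '->', stemmer.stemWord(word.lower()))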