Skip to content

Commit

Permalink
Deprecate sphinx.util.stemmer in favour of snowballstemmer
Browse files (browse the repository at this point in the history)
The PorterStemmer, BaseStemmer, PyStemmer, and StandardStemmer classes and the get_stemmer() function are deprecated.
The sphinx.util.stemmer.porter module is removed.
  • Loading branch information
AA-Turner committed May 23, 2022
1 parent 0f5c22c commit 6f6e28a
Show file tree
Hide file tree
Showing 5 changed files with 53 additions and 434 deletions.
2 changes: 0 additions & 2 deletions AUTHORS
Expand Up @@ -96,5 +96,3 @@ authors and projects:

* sphinx.util.jsdump uses the basestring encoding from simplejson,
written by Bob Ippolito, released under the MIT license
* sphinx.util.stemmer was written by Vivake Gupta, placed in the
Public Domain
7 changes: 4 additions & 3 deletions sphinx/search/en.py
Expand Up @@ -2,8 +2,9 @@

from typing import Dict

import snowballstemmer

from sphinx.search import SearchLanguage
from sphinx.util.stemmer import get_stemmer

english_stopwords = set("""
a and are as at
Expand Down Expand Up @@ -211,7 +212,7 @@ class SearchEnglish(SearchLanguage):
stopwords = english_stopwords

def init(self, options: Dict) -> None:
self.stemmer = get_stemmer()
self.stemmer = snowballstemmer.stemmer('english')

def stem(self, word: str) -> str:
return self.stemmer.stem(word.lower())
return self.stemmer.stemWord(word.lower())
9 changes: 5 additions & 4 deletions sphinx/search/zh.py
Expand Up @@ -4,8 +4,9 @@
import re
from typing import Dict, List

import snowballstemmer

from sphinx.search import SearchLanguage
from sphinx.util.stemmer import get_stemmer

try:
import jieba
Expand Down Expand Up @@ -230,7 +231,7 @@ def init(self, options: Dict) -> None:
if dict_path and os.path.isfile(dict_path):
jieba.load_userdict(dict_path)

self.stemmer = get_stemmer()
self.stemmer = snowballstemmer.stemmer('english')

def split(self, input: str) -> List[str]:
chinese: List[str] = []
Expand All @@ -252,8 +253,8 @@ def stem(self, word: str) -> str:
should_not_be_stemmed = (
word in self.latin_terms and
len(word) >= 3 and
len(self.stemmer.stem(word.lower())) < 3
len(self.stemmer.stemWord(word.lower())) < 3
)
if should_not_be_stemmed:
return word.lower()
return self.stemmer.stem(word.lower())
return self.stemmer.stemWord(word.lower())
63 changes: 44 additions & 19 deletions sphinx/util/stemmer/__init__.py
@@ -1,37 +1,62 @@
"""Word stemming utilities for Sphinx."""

from sphinx.util.stemmer.porter import PorterStemmer
import warnings

try:
from Stemmer import Stemmer as _PyStemmer
PYSTEMMER = True
except ImportError:
PYSTEMMER = False
import snowballstemmer

from sphinx.deprecation import RemovedInSphinx70Warning


class PorterStemmer:
def __init__(self):
warnings.warn(f"{self.__class__.__name__} is deprecated, use "
"snowballstemmer.stemmer('english') instead.",
RemovedInSphinx70Warning, stacklevel=2)
self.stemmer = snowballstemmer.stemmer('english')

def stem(self, p: str, i: int, j: int) -> str:
warnings.warn(f"{self.__class__.__name__}.stem() is deprecated, use "
"snowballstemmer.stemmer('english').stemWord() instead.",
RemovedInSphinx70Warning, stacklevel=2)
return self.stemmer.stemWord(p)


class BaseStemmer:
def __init__(self):
warnings.warn(f"{self.__class__.__name__} is deprecated, use "
"snowballstemmer.stemmer('english') instead.",
RemovedInSphinx70Warning, stacklevel=3)

def stem(self, word: str) -> str:
raise NotImplementedError()
raise NotImplementedError


class PyStemmer(BaseStemmer):
def __init__(self) -> None:
self.stemmer = _PyStemmer('porter')
def __init__(self): # NoQA
super().__init__()
self.stemmer = snowballstemmer.stemmer('english')

def stem(self, word: str) -> str:
warnings.warn(f"{self.__class__.__name__}.stem() is deprecated, use "
"snowballstemmer.stemmer('english').stemWord() instead.",
RemovedInSphinx70Warning, stacklevel=2)
return self.stemmer.stemWord(word)


class StandardStemmer(PorterStemmer, BaseStemmer):
"""All those porter stemmer implementations look hideous;
make at least the stem method nicer.
"""
def stem(self, word: str) -> str: # type: ignore
return super().stem(word, 0, len(word) - 1)
class StandardStemmer(BaseStemmer):
def __init__(self): # NoQA
super().__init__()
self.stemmer = snowballstemmer.stemmer('english')

def stem(self, word: str) -> str:
warnings.warn(f"{self.__class__.__name__}.stem() is deprecated, use "
"snowballstemmer.stemmer('english').stemWord() instead.",
RemovedInSphinx70Warning, stacklevel=2)
return self.stemmer.stemWord(word)


def get_stemmer() -> BaseStemmer:
if PYSTEMMER:
return PyStemmer()
else:
return StandardStemmer()
warnings.warn("get_stemmer() is deprecated, use "
"snowballstemmer.stemmer('english') instead.",
RemovedInSphinx70Warning, stacklevel=2)
return PyStemmer()

0 comments on commit 6f6e28a

Please sign in to comment.