Skip to content

Commit

Permalink
Deprecate sphinx.util.stemmer in favour of snowballstemmer
Browse files Browse the repository at this point in the history
PorterStemmer, BaseStemmer, PyStemmer, StandardStemmer, and get_stemmer are deprecated
sphinx.util.stemmer.porter is removed
  • Loading branch information
AA-Turner committed May 22, 2022
1 parent 391473a commit 3e062bd
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 395 deletions.
2 changes: 0 additions & 2 deletions AUTHORS
Expand Up @@ -96,5 +96,3 @@ authors and projects:

* sphinx.util.jsdump uses the basestring encoding from simplejson,
written by Bob Ippolito, released under the MIT license
* sphinx.util.stemmer was written by Vivake Gupta, placed in the
Public Domain
9 changes: 5 additions & 4 deletions sphinx/search/zh.py
Expand Up @@ -4,8 +4,9 @@
import re
from typing import Dict, List

import snowballstemmer

from sphinx.search import SearchLanguage
from sphinx.util.stemmer import get_stemmer

try:
import jieba
Expand Down Expand Up @@ -230,7 +231,7 @@ def init(self, options: Dict) -> None:
if dict_path and os.path.isfile(dict_path):
jieba.load_userdict(dict_path)

self.stemmer = get_stemmer()
self.stemmer = snowballstemmer.stemmer('english')

def split(self, input: str) -> List[str]:
chinese: List[str] = []
Expand All @@ -252,8 +253,8 @@ def stem(self, word: str) -> str:
should_not_be_stemmed = (
word in self.latin_terms and
len(word) >= 3 and
len(self.stemmer.stem(word.lower())) < 3
len(self.stemmer.stemWord(word.lower())) < 3
)
if should_not_be_stemmed:
return word.lower()
return self.stemmer.stem(word.lower())
return self.stemmer.stemWord(word.lower())
57 changes: 54 additions & 3 deletions sphinx/util/stemmer/__init__.py
@@ -1,16 +1,67 @@
"""Word stemming utilities for Sphinx."""
import warnings

import snowballstemmer

from sphinx.util.stemmer.porter import PorterStemmer
from sphinx.deprecation import RemovedInSphinx70Warning

# Concrete class of the object returned by ``snowballstemmer.stemmer('english')``.
# It is looked up from a throwaway instance at import time so that the
# deprecated classes in this module can use it as a base class.
_ENGLISH_STEMMER_TYPE = type(snowballstemmer.stemmer('english'))


class PorterStemmer(_ENGLISH_STEMMER_TYPE):
    """Deprecated compatibility shim around the snowballstemmer English stemmer.

    Instantiating the class or calling :meth:`stem` emits a
    ``RemovedInSphinx70Warning``; new code should call
    ``snowballstemmer.stemmer('english').stemWord()`` directly.
    """

    def __init__(self) -> None:
        # Warn first so the deprecation is reported even if the base
        # initialiser fails; stacklevel=2 attributes it to the instantiating
        # code rather than to this method.
        warnings.warn(f"{self.__class__.__name__} is deprecated, use "
                      "snowballstemmer.stemmer('english') instead.",
                      RemovedInSphinx70Warning, stacklevel=2)
        super().__init__()

    def stem(self, p: str, i: int, j: int) -> str:
        """Return the stem of the part of *p* running from index *i* to *j*.

        Both bounds are inclusive, matching the three-argument calling
        convention this signature is kept for.  The usual call
        ``stem(word, 0, len(word) - 1)`` therefore stems the whole word.

        :param p: the string containing the word to stem
        :param i: index of the first character to stem
        :param j: index of the last character to stem (inclusive)
        :returns: the stemmed form of ``p[i:j + 1]``
        """
        warnings.warn(f"{self.__class__.__name__}.stem() is deprecated, use "
                      "snowballstemmer.stemmer('english').stemWord() instead.",
                      RemovedInSphinx70Warning, stacklevel=2)
        # Honour the requested window instead of silently ignoring i and j.
        return self.stemWord(p[i:j + 1])


class BaseStemmer:
    """Deprecated base class that declares the ``stem()`` interface."""

    def __init__(self) -> None:
        # The message names the concrete class being instantiated, so
        # subclasses report their own name.
        cls_name = self.__class__.__name__
        warnings.warn(f"{cls_name} is deprecated.",
                      category=RemovedInSphinx70Warning,
                      stacklevel=2)
        super().__init__()

    def stem(self, word: str) -> str:
        """Stem *word*; subclasses must override this method.

        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError()


def get_stemmer() -> BaseStemmer:
stemmer = snowballstemmer.stemmer('english')
class PyStemmer(BaseStemmer, _ENGLISH_STEMMER_TYPE):
    """Deprecated stemmer exposing ``stem()`` on top of ``stemWord()``.

    Use ``snowballstemmer.stemmer('english')`` directly instead.
    """

    def __init__(self) -> None:
        cls_name = self.__class__.__name__
        message = (f"{cls_name} is deprecated, use "
                   "snowballstemmer.stemmer('english') instead.")
        # The warning is issued before the base initialisers run.
        warnings.warn(message, category=RemovedInSphinx70Warning, stacklevel=2)
        super().__init__()

    def stem(self, word: str) -> str:
        """Warn about the deprecation, then return ``stemWord(word)``."""
        cls_name = self.__class__.__name__
        message = (f"{cls_name}.stem() is deprecated, use "
                   "snowballstemmer.stemmer('english').stemWord() instead.")
        warnings.warn(message, category=RemovedInSphinx70Warning, stacklevel=2)
        stemmed = self.stemWord(word)
        return stemmed


class StandardStemmer(BaseStemmer, PorterStemmer):
    """Deprecated stemmer combining ``BaseStemmer`` and ``PorterStemmer``.

    Use ``snowballstemmer.stemmer('english')`` directly instead.
    """

    def __init__(self) -> None:
        cls_name = self.__class__.__name__
        message = (f"{cls_name} is deprecated, use "
                   "snowballstemmer.stemmer('english') instead.")
        # The warning is issued before the base initialisers run.
        warnings.warn(message, category=RemovedInSphinx70Warning, stacklevel=2)
        super().__init__()

    def stem(self, word: str) -> str:
        """Warn about the deprecation, then return ``stemWord(word)``.

        Takes a single *word* argument, unlike the three-argument
        ``PorterStemmer.stem()`` it overrides.
        """
        cls_name = self.__class__.__name__
        message = (f"{cls_name}.stem() is deprecated, use "
                   "snowballstemmer.stemmer('english').stemWord() instead.")
        warnings.warn(message, category=RemovedInSphinx70Warning, stacklevel=2)
        stemmed = self.stemWord(word)
        return stemmed


def get_stemmer() -> BaseStemmer:
    """Emit a deprecation warning and return a new ``PyStemmer`` instance.

    Deprecated: call ``snowballstemmer.stemmer('english')`` instead.
    """
    message = ("get_stemmer() is deprecated, use "
               "snowballstemmer.stemmer('english') instead.")
    # stacklevel=2 attributes the warning to the code calling get_stemmer().
    warnings.warn(message, category=RemovedInSphinx70Warning, stacklevel=2)
    stemmer = PyStemmer()
    return stemmer

0 comments on commit 3e062bd

Please sign in to comment.