Skip to content

Commit

Permalink
Simplify Sphinx's Stemmer (#10467)
Browse files: browse the repository at this point in the history
  • Loading branch information
AA-Turner committed Jun 16, 2022
1 parent 956cddb commit 881f66c
Show file tree
Hide file tree
Showing 7 changed files with 61 additions and 434 deletions.
2 changes: 0 additions & 2 deletions AUTHORS
Expand Up @@ -96,5 +96,3 @@ authors and projects:

* sphinx.util.jsdump uses the basestring encoding from simplejson,
written by Bob Ippolito, released under the MIT license
* sphinx.util.stemmer was written by Vivake Gupta, placed in the
Public Domain
3 changes: 3 additions & 0 deletions CHANGES
Expand Up @@ -10,6 +10,9 @@ Incompatible changes
Deprecated
----------

* #10467: Deprecated ``sphinx.util.stemmer`` in favour of ``snowballstemmer``.
Patch by Adam Turner.

Features added
--------------

Expand Down
5 changes: 5 additions & 0 deletions doc/extdev/deprecated.rst
Expand Up @@ -22,6 +22,11 @@ The following is a list of deprecated interfaces.
- (will be) Removed
- Alternatives

* - ``sphinx.util.stemmer``
- 5.1
- 7.0
- ``snowballstemmer``

* - ``sphinx.util.jsdump``
- 5.0
- 7.0
Expand Down
7 changes: 4 additions & 3 deletions sphinx/search/en.py
Expand Up @@ -2,8 +2,9 @@

from typing import Dict

import snowballstemmer

from sphinx.search import SearchLanguage
from sphinx.util.stemmer import get_stemmer

english_stopwords = set("""
a and are as at
Expand Down Expand Up @@ -211,7 +212,7 @@ class SearchEnglish(SearchLanguage):
stopwords = english_stopwords

def init(self, options: Dict) -> None:
self.stemmer = get_stemmer()
self.stemmer = snowballstemmer.stemmer('porter')

def stem(self, word: str) -> str:
return self.stemmer.stem(word.lower())
return self.stemmer.stemWord(word.lower())
9 changes: 5 additions & 4 deletions sphinx/search/zh.py
Expand Up @@ -4,8 +4,9 @@
import re
from typing import Dict, List

import snowballstemmer

from sphinx.search import SearchLanguage
from sphinx.util.stemmer import get_stemmer

try:
import jieba
Expand Down Expand Up @@ -230,7 +231,7 @@ def init(self, options: Dict) -> None:
if dict_path and os.path.isfile(dict_path):
jieba.load_userdict(dict_path)

self.stemmer = get_stemmer()
self.stemmer = snowballstemmer.stemmer('english')

def split(self, input: str) -> List[str]:
chinese: List[str] = []
Expand All @@ -252,8 +253,8 @@ def stem(self, word: str) -> str:
should_not_be_stemmed = (
word in self.latin_terms and
len(word) >= 3 and
len(self.stemmer.stem(word.lower())) < 3
len(self.stemmer.stemWord(word.lower())) < 3
)
if should_not_be_stemmed:
return word.lower()
return self.stemmer.stem(word.lower())
return self.stemmer.stemWord(word.lower())
63 changes: 44 additions & 19 deletions sphinx/util/stemmer/__init__.py
@@ -1,37 +1,62 @@
"""Word stemming utilities for Sphinx."""

from sphinx.util.stemmer.porter import PorterStemmer
import warnings

try:
from Stemmer import Stemmer as _PyStemmer
PYSTEMMER = True
except ImportError:
PYSTEMMER = False
import snowballstemmer

from sphinx.deprecation import RemovedInSphinx70Warning


class PorterStemmer:
def __init__(self):
warnings.warn(f"{self.__class__.__name__} is deprecated, use "
"snowballstemmer.stemmer('porter') instead.",
RemovedInSphinx70Warning, stacklevel=2)
self.stemmer = snowballstemmer.stemmer('porter')

def stem(self, p: str, i: int, j: int) -> str:
warnings.warn(f"{self.__class__.__name__}.stem() is deprecated, use "
"snowballstemmer.stemmer('porter').stemWord() instead.",
RemovedInSphinx70Warning, stacklevel=2)
return self.stemmer.stemWord(p)


class BaseStemmer:
def __init__(self):
warnings.warn(f"{self.__class__.__name__} is deprecated, use "
"snowballstemmer.stemmer('porter') instead.",
RemovedInSphinx70Warning, stacklevel=3)

def stem(self, word: str) -> str:
raise NotImplementedError()
raise NotImplementedError


class PyStemmer(BaseStemmer):
def __init__(self) -> None:
self.stemmer = _PyStemmer('porter')
def __init__(self): # NoQA
super().__init__()
self.stemmer = snowballstemmer.stemmer('porter')

def stem(self, word: str) -> str:
warnings.warn(f"{self.__class__.__name__}.stem() is deprecated, use "
"snowballstemmer.stemmer('porter').stemWord() instead.",
RemovedInSphinx70Warning, stacklevel=2)
return self.stemmer.stemWord(word)


class StandardStemmer(PorterStemmer, BaseStemmer):
"""All those porter stemmer implementations look hideous;
make at least the stem method nicer.
"""
def stem(self, word: str) -> str: # type: ignore
return super().stem(word, 0, len(word) - 1)
class StandardStemmer(BaseStemmer):
def __init__(self): # NoQA
super().__init__()
self.stemmer = snowballstemmer.stemmer('porter')

def stem(self, word: str) -> str:
warnings.warn(f"{self.__class__.__name__}.stem() is deprecated, use "
"snowballstemmer.stemmer('porter').stemWord() instead.",
RemovedInSphinx70Warning, stacklevel=2)
return self.stemmer.stemWord(word)


def get_stemmer() -> BaseStemmer:
if PYSTEMMER:
return PyStemmer()
else:
return StandardStemmer()
warnings.warn("get_stemmer() is deprecated, use "
"snowballstemmer.stemmer('porter') instead.",
RemovedInSphinx70Warning, stacklevel=2)
return PyStemmer()

0 comments on commit 881f66c

Please sign in to comment.