Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Simplify Sphinx's Stemmer #10467

Merged
merged 37 commits into `5.x` from `stemmer` on Jun 16, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
f9bc6a0
Inline the adjusted `.stem()` method into `PorterStemmer`
AA-Turner Apr 27, 2022
904f332
Add tests
AA-Turner Apr 27, 2022
f73d39b
Simplify the docstring in `PorterStemmer.stem()`
AA-Turner Apr 27, 2022
0a75457
Make the short-string check more explicit in `PorterStemmer.stem()`
AA-Turner Apr 27, 2022
adb1bdb
Add annotations to instance variables in `PorterStemmer`
AA-Turner Apr 27, 2022
1caf546
`PorterStemmer.k0` is always 0
AA-Turner Apr 27, 2022
879e2ff
`PorterStemmer.cons` -> `PorterStemmer.is_consonant`
AA-Turner Apr 27, 2022
af29f11
`PorterStemmer.doublec` -> `PorterStemmer.double_consonant`
AA-Turner Apr 27, 2022
851bbac
Inplace arithmetic
AA-Turner Apr 27, 2022
4ebf4fe
`PorterStemmer.m` -> `PorterStemmer.measure_consonant_sequences`
AA-Turner Apr 27, 2022
7a31c2a
`PorterStemmer.vowelinstem` -> `PorterStemmer.vowel_in_stem`
AA-Turner Apr 27, 2022
50063b3
`PorterStemmer.cvc` -> `PorterStemmer.consonant_vowel_consonant`
AA-Turner Apr 27, 2022
6150784
Simplify `PorterStemmer.ends`
AA-Turner Apr 27, 2022
188b5f5
Add tests
AA-Turner Apr 27, 2022
a31c6ae
`PorterStemmer.setto` -> `PorterStemmer.set_to`
AA-Turner Apr 28, 2022
5ae826c
Store locals
AA-Turner Apr 28, 2022
36ad9a7
Calculate self.k based on len(self.b)
AA-Turner Apr 28, 2022
380d50e
Inline self.k
AA-Turner Apr 28, 2022
e6f3dab
Simplify
AA-Turner Apr 28, 2022
a77533b
Simplify
AA-Turner Apr 28, 2022
1334e9c
Simplify
AA-Turner Apr 28, 2022
703417a
Remove set_to
AA-Turner Apr 28, 2022
227a71f
Use PorterStemmer for static methods
AA-Turner Apr 28, 2022
a3c4a29
Replace .r with .replace
AA-Turner Apr 28, 2022
ac493e4
Replace .r with .replace
AA-Turner Apr 28, 2022
af3ca7a
Remove more `.j`s
AA-Turner Apr 28, 2022
4e07483
Methods -> functions
AA-Turner Apr 28, 2022
523918a
snowball
AA-Turner Apr 30, 2022
496352e
set
AA-Turner May 22, 2022
e5a753d
self.b -> self.word
AA-Turner May 22, 2022
161e63d
s -> string
AA-Turner May 22, 2022
409733a
Simplify self.ends and self.replace
AA-Turner May 22, 2022
b983eb5
Revert "Add tests"
AA-Turner May 22, 2022
0f5c22c
Revert experimentation
AA-Turner May 23, 2022
b5d50fe
Deprecate sphinx.util.stemmer in favour of snowballstemmer
AA-Turner May 22, 2022
23dd223
Merge branch '5.x' into stemmer
AA-Turner Jun 16, 2022
7c53a6b
Update docs
AA-Turner Jun 16, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 0 additions & 2 deletions AUTHORS
Expand Up @@ -96,5 +96,3 @@ authors and projects:

* sphinx.util.jsdump uses the basestring encoding from simplejson,
written by Bob Ippolito, released under the MIT license
* sphinx.util.stemmer was written by Vivake Gupta, placed in the
Public Domain
3 changes: 3 additions & 0 deletions CHANGES
Expand Up @@ -10,6 +10,9 @@ Incompatible changes
Deprecated
----------

* #10467: Deprecated ``sphinx.util.stemmer`` in favour of ``snowballstemmer``.
Patch by Adam Turner.

Features added
--------------

Expand Down
5 changes: 5 additions & 0 deletions doc/extdev/deprecated.rst
Expand Up @@ -22,6 +22,11 @@ The following is a list of deprecated interfaces.
- (will be) Removed
- Alternatives

* - ``sphinx.util.stemmer``
- 5.1
- 7.0
- ``snowballstemmer``

* - ``sphinx.util.jsdump``
- 5.0
- 7.0
Expand Down
7 changes: 4 additions & 3 deletions sphinx/search/en.py
Expand Up @@ -2,8 +2,9 @@

from typing import Dict

import snowballstemmer

from sphinx.search import SearchLanguage
from sphinx.util.stemmer import get_stemmer

english_stopwords = set("""
a and are as at
Expand Down Expand Up @@ -211,7 +212,7 @@ class SearchEnglish(SearchLanguage):
stopwords = english_stopwords

def init(self, options: Dict) -> None:
self.stemmer = get_stemmer()
self.stemmer = snowballstemmer.stemmer('porter')

def stem(self, word: str) -> str:
return self.stemmer.stem(word.lower())
return self.stemmer.stemWord(word.lower())
9 changes: 5 additions & 4 deletions sphinx/search/zh.py
Expand Up @@ -4,8 +4,9 @@
import re
from typing import Dict, List

import snowballstemmer

from sphinx.search import SearchLanguage
from sphinx.util.stemmer import get_stemmer

try:
import jieba
Expand Down Expand Up @@ -230,7 +231,7 @@ def init(self, options: Dict) -> None:
if dict_path and os.path.isfile(dict_path):
jieba.load_userdict(dict_path)

self.stemmer = get_stemmer()
self.stemmer = snowballstemmer.stemmer('english')

def split(self, input: str) -> List[str]:
chinese: List[str] = []
Expand All @@ -252,8 +253,8 @@ def stem(self, word: str) -> str:
should_not_be_stemmed = (
word in self.latin_terms and
len(word) >= 3 and
len(self.stemmer.stem(word.lower())) < 3
len(self.stemmer.stemWord(word.lower())) < 3
)
if should_not_be_stemmed:
return word.lower()
return self.stemmer.stem(word.lower())
return self.stemmer.stemWord(word.lower())
63 changes: 44 additions & 19 deletions sphinx/util/stemmer/__init__.py
@@ -1,37 +1,62 @@
"""Word stemming utilities for Sphinx."""

from sphinx.util.stemmer.porter import PorterStemmer
import warnings

try:
from Stemmer import Stemmer as _PyStemmer
PYSTEMMER = True
except ImportError:
PYSTEMMER = False
import snowballstemmer

from sphinx.deprecation import RemovedInSphinx70Warning


class PorterStemmer:
def __init__(self):
warnings.warn(f"{self.__class__.__name__} is deprecated, use "
"snowballstemmer.stemmer('porter') instead.",
RemovedInSphinx70Warning, stacklevel=2)
self.stemmer = snowballstemmer.stemmer('porter')

def stem(self, p: str, i: int, j: int) -> str:
warnings.warn(f"{self.__class__.__name__}.stem() is deprecated, use "
"snowballstemmer.stemmer('porter').stemWord() instead.",
RemovedInSphinx70Warning, stacklevel=2)
return self.stemmer.stemWord(p)


class BaseStemmer:
def __init__(self):
warnings.warn(f"{self.__class__.__name__} is deprecated, use "
"snowballstemmer.stemmer('porter') instead.",
RemovedInSphinx70Warning, stacklevel=3)

def stem(self, word: str) -> str:
raise NotImplementedError()
raise NotImplementedError


class PyStemmer(BaseStemmer):
def __init__(self) -> None:
self.stemmer = _PyStemmer('porter')
def __init__(self): # NoQA
super().__init__()
self.stemmer = snowballstemmer.stemmer('porter')

def stem(self, word: str) -> str:
warnings.warn(f"{self.__class__.__name__}.stem() is deprecated, use "
"snowballstemmer.stemmer('porter').stemWord() instead.",
RemovedInSphinx70Warning, stacklevel=2)
return self.stemmer.stemWord(word)


class StandardStemmer(PorterStemmer, BaseStemmer):
"""All those porter stemmer implementations look hideous;
make at least the stem method nicer.
"""
def stem(self, word: str) -> str: # type: ignore
return super().stem(word, 0, len(word) - 1)
class StandardStemmer(BaseStemmer):
def __init__(self): # NoQA
super().__init__()
self.stemmer = snowballstemmer.stemmer('porter')

def stem(self, word: str) -> str:
warnings.warn(f"{self.__class__.__name__}.stem() is deprecated, use "
"snowballstemmer.stemmer('porter').stemWord() instead.",
RemovedInSphinx70Warning, stacklevel=2)
return self.stemmer.stemWord(word)


def get_stemmer() -> BaseStemmer:
if PYSTEMMER:
return PyStemmer()
else:
return StandardStemmer()
warnings.warn("get_stemmer() is deprecated, use "
"snowballstemmer.stemmer('porter') instead.",
RemovedInSphinx70Warning, stacklevel=2)
return PyStemmer()