
Commit

Include stop word lists from Lucene and the Snowball project (#1666)
adamreichold committed Nov 9, 2022
1 parent 3e9c806 commit a4b759d
Showing 6 changed files with 2,243 additions and 6 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
@@ -48,7 +48,7 @@ jobs:
strategy:
matrix:
features: [
{ label: "all", flags: "mmap,brotli-compression,lz4-compression,snappy-compression,zstd-compression,failpoints" },
{ label: "all", flags: "mmap,stopwords,brotli-compression,lz4-compression,snappy-compression,zstd-compression,failpoints" },
{ label: "quickwit", flags: "mmap,quickwit,failpoints" }
]

3 changes: 2 additions & 1 deletion Cargo.toml
@@ -91,8 +91,9 @@ debug-assertions = true
overflow-checks = true

[features]
default = ["mmap", "lz4-compression" ]
default = ["mmap", "stopwords", "lz4-compression"]
mmap = ["fs2", "tempfile", "memmap2"]
stopwords = []

brotli-compression = ["brotli"]
lz4-compression = ["lz4_flex"]
2 changes: 1 addition & 1 deletion src/fieldnorm/writer.rs
@@ -9,7 +9,7 @@ use crate::DocId;
/// The `FieldNormsWriter` is in charge of tracking the fieldnorm byte
/// of each document for each field with field norms.
///
/// `FieldNormsWriter` stores a Vec<u8> for each tracked field, using a
/// `FieldNormsWriter` stores a `Vec<u8>` for each tracked field, using a
/// byte per document per field.
pub struct FieldNormsWriter {
fieldnorms_buffers: Vec<Option<Vec<u8>>>,
42 changes: 42 additions & 0 deletions src/tokenizer/stop_word_filter/gen_stopwords.py
@@ -0,0 +1,42 @@
import requests

LANGUAGES = [
    "danish",
    "dutch",
    "finnish",
    "french",
    "german",
    "italian",
    "norwegian",
    "portuguese",
    "russian",
    "spanish",
    "swedish",
]

with requests.Session() as sess, open("stopwords.rs", "w") as mod:
    mod.write("/*\n")
    mod.write(
        "These stop word lists are from the Snowball project (https://snowballstem.org/)\nwhich carries the following copyright and license:\n\n"
    )

    resp = sess.get(
        "https://raw.githubusercontent.com/snowballstem/snowball/master/COPYING"
    )
    resp.raise_for_status()
    mod.write(resp.text)
    mod.write("*/\n\n")

    for lang in LANGUAGES:
        resp = sess.get(f"https://snowballstem.org/algorithms/{lang}/stop.txt")
        resp.raise_for_status()

        mod.write(f"pub const {lang.upper()}: &[&str] = &[\n")

        for line in resp.text.splitlines():
            line, _, _ = line.partition("|")

            for word in line.split():
                mod.write(f' "{word}",\n')

        mod.write("];\n\n")
src/tokenizer/stop_word_filter/mod.rs
@@ -10,6 +10,10 @@
//! assert_eq!(stream.next().unwrap().text, "crafty");
//! assert!(stream.next().is_none());
//! ```
#[cfg(feature = "stopwords")]
#[rustfmt::skip]
mod stopwords;

use std::sync::Arc;

use rustc_hash::FxHashSet;
@@ -31,14 +35,87 @@ impl StopWordFilter {
}
}

fn english() -> StopWordFilter {
let words: [&'static str; 33] = [
fn from_word_list(words: &[&str]) -> Self {
Self::remove(words.iter().map(|&word| word.to_owned()))
}

#[cfg(feature = "stopwords")]
/// Create a `StopWordFilter` for the Danish language
pub fn danish() -> Self {
Self::from_word_list(stopwords::DANISH)
}

#[cfg(feature = "stopwords")]
/// Create a `StopWordFilter` for the Dutch language
pub fn dutch() -> Self {
Self::from_word_list(stopwords::DUTCH)
}

/// Create a `StopWordFilter` for the English language
pub fn english() -> Self {
// This is the same list of words used by the Apache-licensed Lucene project,
// cf. https://github.com/apache/lucene/blob/d5d6dc079395c47cd6d12dcce3bcfdd2c7d9dc63/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java#L46
const WORDS: &[&str] = &[
"a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into",
"is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then",
"there", "these", "they", "this", "to", "was", "will", "with",
];

StopWordFilter::remove(words.iter().map(|&s| s.to_string()))
Self::from_word_list(WORDS)
}

#[cfg(feature = "stopwords")]
/// Create a `StopWordFilter` for the Finnish language
pub fn finnish() -> Self {
Self::from_word_list(stopwords::FINNISH)
}

#[cfg(feature = "stopwords")]
/// Create a `StopWordFilter` for the French language
pub fn french() -> Self {
Self::from_word_list(stopwords::FRENCH)
}

#[cfg(feature = "stopwords")]
/// Create a `StopWordFilter` for the German language
pub fn german() -> Self {
Self::from_word_list(stopwords::GERMAN)
}

#[cfg(feature = "stopwords")]
/// Create a `StopWordFilter` for the Italian language
pub fn italian() -> Self {
Self::from_word_list(stopwords::ITALIAN)
}

#[cfg(feature = "stopwords")]
/// Create a `StopWordFilter` for the Norwegian language
pub fn norwegian() -> Self {
Self::from_word_list(stopwords::NORWEGIAN)
}

#[cfg(feature = "stopwords")]
/// Create a `StopWordFilter` for the Portuguese language
pub fn portuguese() -> Self {
Self::from_word_list(stopwords::PORTUGUESE)
}

#[cfg(feature = "stopwords")]
/// Create a `StopWordFilter` for the Russian language
pub fn russian() -> Self {
Self::from_word_list(stopwords::RUSSIAN)
}

#[cfg(feature = "stopwords")]
/// Create a `StopWordFilter` for the Spanish language
pub fn spanish() -> Self {
Self::from_word_list(stopwords::SPANISH)
}

#[cfg(feature = "stopwords")]
/// Create a `StopWordFilter` for the Swedish language
pub fn swedish() -> Self {
Self::from_word_list(stopwords::SWEDISH)
}
}

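For context, a minimal usage sketch of the new constructors (illustrative only, not part of this commit): it assumes the tokenizer API as it stood at the time of this change (TextAnalyzer::from, SimpleTokenizer, LowerCaser, and the tokenizer manager's register method), and both the helper name and the "text_fr" tokenizer name are placeholders. Every constructor except english() is gated behind the stopwords feature, which this commit enables by default.

use tantivy::Index;
use tantivy::tokenizer::{LowerCaser, SimpleTokenizer, StopWordFilter, TextAnalyzer};

// Hypothetical helper: build an analyzer that lowercases tokens and then strips
// French stop words, and register it so schema fields can reference it by name.
fn register_french_analyzer(index: &Index) {
    let analyzer = TextAnalyzer::from(SimpleTokenizer)
        .filter(LowerCaser)
        .filter(StopWordFilter::french()); // needs the "stopwords" feature

    // "text_fr" is a placeholder name; a text field opts into it through its
    // indexing options in the schema.
    index.tokenizers().register("text_fr", analyzer);
}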
