diff --git a/.gitignore b/.gitignore index 6c7ae3b1..49df1a44 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +/pkg/ /target/ **/*.rs.bk @@ -31,6 +32,7 @@ out/ *.bak *.tmp *.class +*.html .buildpath .classpath .vscode/* @@ -49,4 +51,4 @@ $RECYCLE.BIN/ Desktop.ini ehthumbs.db -src/main.rs \ No newline at end of file +src/main.rs diff --git a/Cargo.lock b/Cargo.lock index 01c53f27..e723df28 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -44,6 +44,12 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "bumpalo" +version = "3.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4a45a46ab1f2412e53d3a0ade76ffad2025804294569aae387231a0cd6e0899" + [[package]] name = "byteorder" version = "1.4.3" @@ -310,7 +316,7 @@ checksum = "efaa7b300f3b5fe8eb6bf21ce3895e1751d9665086af2d64b42f19701015ff4f" [[package]] name = "lingua" -version = "1.3.3" +version = "1.4.0-SNAPSHOT" dependencies = [ "cld2", "float-cmp", @@ -405,6 +411,7 @@ dependencies = [ "strum_macros", "tempfile", "titlecase", + "wasm-bindgen", "whatlang", "zip", ] @@ -934,6 +941,15 @@ dependencies = [ "include_dir", ] +[[package]] +name = "log" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" +dependencies = [ + "cfg-if", +] + [[package]] name = "maplit" version = "1.0.2" @@ -1416,6 +1432,62 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a1ca13c08c41c9c3e04224ed9ff80461d97e121589ff27c753a16cb10830ae0f" +[[package]] +name = "wasm-bindgen" +version = "0.2.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25f1af7423d8588a3d840681122e72e6a24ddbcb3f0ec385cac0d12d24256c06" +dependencies = [ + "cfg-if", + "serde", + "serde_json", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b21c0df030f5a177f3cba22e9bc4322695ec43e7257d865302900290bcdedca" +dependencies = [ + "bumpalo", + "lazy_static 1.4.0", + "log", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f4203d69e40a52ee523b2529a773d5ffc1dc0071801c87b3d270b471b80ed01" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa8a30d46208db204854cadbb5d4baf5fcf8071ba5bf48190c3e59937962ebc" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d958d035c4438e28c70e4321a2911302f10135ce78a9c7834c0cab4123d06a2" + [[package]] name = "whatlang" version = "0.13.0" diff --git a/Cargo.toml b/Cargo.toml index d89ad681..04c185b6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ members = ["language-models/*"] [package] name = "lingua" -version = "1.3.3" +version = "1.4.0-SNAPSHOT" authors = ["Peter M. Stahl "] description = """ An accurate natural language detection library, suitable for long and short text alike @@ -36,6 +36,9 @@ keywords = [ "nlp" ] +[lib] +crate-type = ["cdylib", "rlib"] + [dependencies] fraction = "0.10.0" include_dir = "0.7.2" @@ -125,6 +128,9 @@ lingua-xhosa-language-model = { path = "language-models/xh", version = "=1.0.1", lingua-yoruba-language-model = { path = "language-models/yo", version = "=1.0.1", optional = true } lingua-zulu-language-model = { path = "language-models/zu", version = "=1.0.1", optional = true } +[target.'cfg(target_family = "wasm")'.dependencies] +wasm-bindgen = { version = "0.2", features = ["serde-serialize"] } + [dev-dependencies] cld2 = "1.0.2" float-cmp = "0.9.0" @@ -136,8 +142,9 @@ titlecase = "1.1.0" whatlang = "0.13.0" [features] -default = [ - "parallelism", +default = ["parallelism", "all-languages"] +parallelism = ["rayon"] +all-languages = [ "afrikaans", "albanian", "arabic", "armenian", "azerbaijani", "basque", "belarusian", "bengali", "bokmal", "bosnian", "bulgarian", "catalan", "chinese", "croatian", "czech", "danish", "dutch", "english", "esperanto", @@ -151,7 +158,6 @@ default = [ "tsonga", "tswana", "turkish", "ukrainian", "urdu", "vietnamese", "welsh", "xhosa", "yoruba", "zulu" ] -parallelism = ["rayon"] afrikaans = ["lingua-afrikaans-language-model"] albanian = ["lingua-albanian-language-model"] arabic = ["lingua-arabic-language-model"] diff --git a/src/builder.rs b/src/builder.rs index b4fb8547..14223f5b 100644 --- a/src/builder.rs +++ b/src/builder.rs @@ -19,7 +19,11 @@ use crate::isocode::{IsoCode639_1, IsoCode639_3}; use crate::language::Language; use std::collections::HashSet; -const MISSING_LANGUAGE_MESSAGE: &str = "LanguageDetector needs at least 2 languages to choose from"; +pub(crate) const MISSING_LANGUAGE_MESSAGE: &str = + "LanguageDetector needs at least 2 languages to choose from"; + +pub(crate) const MINIMUM_RELATIVE_DISTANCE_MESSAGE: &str = + "Minimum relative distance must lie in between 0.0 and 0.99"; /// This struct configures and creates an instance of [LanguageDetector]. pub struct LanguageDetectorBuilder { @@ -141,7 +145,7 @@ impl LanguageDetectorBuilder { /// ⚠ Panics if `distance` is smaller than 0.0 or greater than 0.99. pub fn with_minimum_relative_distance(&mut self, distance: f64) -> &mut Self { if !(0.0..=0.99).contains(&distance) { - panic!("minimum relative distance must lie in between 0.0 and 0.99"); + panic!("{}", MINIMUM_RELATIVE_DISTANCE_MESSAGE); } self.minimum_relative_distance = distance; self @@ -303,13 +307,13 @@ mod tests { } #[test] - #[should_panic(expected = "minimum relative distance must lie in between 0.0 and 0.99")] + #[should_panic(expected = "Minimum relative distance must lie in between 0.0 and 0.99")] fn assert_detector_cannot_be_built_from_too_small_minimum_relative_distance() { LanguageDetectorBuilder::from_all_languages().with_minimum_relative_distance(-2.3); } #[test] - #[should_panic(expected = "minimum relative distance must lie in between 0.0 and 0.99")] + #[should_panic(expected = "Minimum relative distance must lie in between 0.0 and 0.99")] fn assert_detector_cannot_be_built_from_too_large_minimum_relative_distance() { LanguageDetectorBuilder::from_all_languages().with_minimum_relative_distance(1.7); } diff --git a/src/lib.rs b/src/lib.rs index 62e0c62b..bf94b8a4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -281,6 +281,9 @@ mod model; mod ngram; mod writer; +#[cfg(target_family = "wasm")] +mod wasm; + pub use builder::LanguageDetectorBuilder; pub use detector::LanguageDetector; pub use isocode::{IsoCode639_1, IsoCode639_3}; diff --git a/src/wasm.rs b/src/wasm.rs new file mode 100644 index 00000000..2d3a555d --- /dev/null +++ b/src/wasm.rs @@ -0,0 +1,268 @@ +/* + * Copyright © 2020-today Peter M. Stahl pemistahl@gmail.com + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#![allow(non_snake_case)] + +use crate::builder::{MINIMUM_RELATIVE_DISTANCE_MESSAGE, MISSING_LANGUAGE_MESSAGE}; +use crate::{IsoCode639_1, IsoCode639_3, Language, LanguageDetector as Detector}; +use itertools::Itertools; +use serde::Serialize; +use std::collections::HashSet; +use std::str::FromStr; +use wasm_bindgen::prelude::*; + +#[wasm_bindgen] +pub struct LanguageDetectorBuilder { + languages: HashSet, + minimum_relative_distance: f64, + is_every_language_model_preloaded: bool, +} + +#[wasm_bindgen] +pub struct LanguageDetector { + detector: Detector, +} + +#[derive(Serialize)] +pub struct ConfidenceValue { + language: String, + confidence: f64, +} + +#[wasm_bindgen] +impl LanguageDetectorBuilder { + /// Creates and returns an instance of `LanguageDetectorBuilder` with all built-in languages. + pub fn fromAllLanguages() -> Self { + Self::from(Language::all()) + } + + /// Creates and returns an instance of `LanguageDetectorBuilder` + /// with all built-in spoken languages. + pub fn fromAllSpokenLanguages() -> Self { + Self::from(Language::all_spoken_ones()) + } + + /// Creates and returns an instance of `LanguageDetectorBuilder` + /// with all built-in languages supporting the Arabic script. + pub fn fromAllLanguagesWithArabicScript() -> Self { + Self::from(Language::all_with_arabic_script()) + } + + /// Creates and returns an instance of `LanguageDetectorBuilder` + /// with all built-in languages supporting the Cyrillic script. + pub fn fromAllLanguagesWithCyrillicScript() -> Self { + Self::from(Language::all_with_cyrillic_script()) + } + + /// Creates and returns an instance of `LanguageDetectorBuilder` + /// with all built-in languages supporting the Devanagari script. + pub fn fromAllLanguagesWithDevanagariScript() -> Self { + Self::from(Language::all_with_devanagari_script()) + } + + /// Creates and returns an instance of `LanguageDetectorBuilder` + /// with all built-in languages supporting the Latin script. + pub fn fromAllLanguagesWithLatinScript() -> Self { + Self::from(Language::all_with_latin_script()) + } + + /// Creates and returns an instance of `LanguageDetectorBuilder` + /// with all built-in languages except those specified in `languages`. + /// + /// ⚠ Throws an error if less than two `languages` are used to build + /// the `LanguageDetector`. + pub fn fromAllLanguagesWithout( + languages: Box<[JsValue]>, + ) -> Result { + let mut languages_to_load = Language::all(); + let languages_to_filter_out = languages + .iter() + .filter_map(|it| it.as_string()) + .filter_map(|it| Language::from_str(&it).ok()) + .collect_vec(); + languages_to_load.retain(|it| !languages_to_filter_out.contains(it)); + + if languages_to_load.len() < 2 { + return Err(JsValue::from(MISSING_LANGUAGE_MESSAGE)); + } + + Ok(Self::from(languages_to_load)) + } + + /// Creates and returns an instance of `LanguageDetectorBuilder` + /// with the specified `languages`. + /// + /// ⚠ Throws an error if less than two `languages` are specified. + pub fn fromLanguages(languages: Box<[JsValue]>) -> Result { + let selected_languages = languages + .iter() + .filter_map(|it| it.as_string()) + .filter_map(|it| Language::from_str(&it).ok()) + .collect::>(); + + if selected_languages.len() < 2 { + return Err(JsValue::from(MISSING_LANGUAGE_MESSAGE)); + } + + Ok(Self::from(selected_languages)) + } + + /// Creates and returns an instance of `LanguageDetectorBuilder` + /// with the languages specified by the respective ISO 639-1 codes. + /// + /// ⚠ Throws an error if less than two `iso_codes` are specified. + pub fn fromISOCodes6391(isoCodes: Box<[JsValue]>) -> Result { + let selected_iso_codes = isoCodes + .iter() + .filter_map(|it| it.as_string()) + .filter_map(|it| IsoCode639_1::from_str(&it).ok()) + .collect_vec(); + + if selected_iso_codes.len() < 2 { + return Err(JsValue::from(MISSING_LANGUAGE_MESSAGE)); + } + + let selected_languages = selected_iso_codes + .iter() + .map(Language::from_iso_code_639_1) + .collect::>(); + + Ok(Self::from(selected_languages)) + } + + /// Creates and returns an instance of `LanguageDetectorBuilder` + /// with the languages specified by the respective ISO 639-3 codes. + /// + /// ⚠ Throws an error if less than two `iso_codes` are specified. + pub fn fromISOCodes6393(isoCodes: Box<[JsValue]>) -> Result { + let selected_iso_codes = isoCodes + .iter() + .filter_map(|it| it.as_string()) + .filter_map(|it| IsoCode639_3::from_str(&it).ok()) + .collect_vec(); + + if selected_iso_codes.len() < 2 { + return Err(JsValue::from(MISSING_LANGUAGE_MESSAGE)); + } + + let selected_languages = selected_iso_codes + .iter() + .map(Language::from_iso_code_639_3) + .collect::>(); + + Ok(Self::from(selected_languages)) + } + + /// Sets the desired value for the minimum relative distance measure. + /// + /// By default, *Lingua* returns the most likely language for a given + /// input text. However, there are certain words that are spelled the + /// same in more than one language. The word *prologue*, for instance, + /// is both a valid English and French word. Lingua would output either + /// English or French which might be wrong in the given context. + /// For cases like that, it is possible to specify a minimum relative + /// distance that the logarithmized and summed up probabilities for + /// each possible language have to satisfy. + /// + /// Be aware that the distance between the language probabilities is + /// dependent on the length of the input text. The longer the input + /// text, the larger the distance between the languages. So if you + /// want to classify very short text phrases, do not set the minimum + /// relative distance too high. Otherwise you will get most results + /// returned as `undefined` which is the return value for cases + /// where language detection is not reliably possible. + /// + /// ⚠ Throws an error if `distance` is smaller than 0.0 or greater than 0.99. + pub fn setMinimumRelativeDistance(&mut self, distance: f64) -> Result<(), JsValue> { + if !(0.0..=0.99).contains(&distance) { + return Err(JsValue::from(MINIMUM_RELATIVE_DISTANCE_MESSAGE)); + } + self.minimum_relative_distance = distance; + Ok(()) + } + + /// Configures `LanguageDetectorBuilder` to preload all language models when creating + /// the instance of [LanguageDetector]. + /// + /// By default, *Lingua* uses lazy-loading to load only those language models + /// on demand which are considered relevant by the rule-based filter engine. + /// For web services, for instance, it is rather beneficial to preload all language + /// models into memory to avoid unexpected latency while waiting for the + /// service response. This method allows to switch between these two loading modes. + pub fn enablePreloadingLanguageModels(&mut self) { + self.is_every_language_model_preloaded = true; + } + + /// Creates and returns the configured instance of [LanguageDetector]. + pub fn build(&mut self) -> LanguageDetector { + LanguageDetector { + detector: Detector::from( + self.languages.clone(), + self.minimum_relative_distance, + self.is_every_language_model_preloaded, + ), + } + } + + fn from(languages: HashSet) -> Self { + Self { + languages, + minimum_relative_distance: 0.0, + is_every_language_model_preloaded: false, + } + } +} + +#[wasm_bindgen] +impl LanguageDetector { + /// Detects the language of given input text. + /// If the language cannot be reliably detected, `undefined` is returned. + pub fn detectLanguageOf(&self, text: &str) -> Option { + match self.detector.detect_language_of(text) { + Some(language) => Some(language.to_string()), + None => None, + } + } + + /// Computes confidence values for each language considered possible for the given input text. + /// + /// An object of all possible languages is returned, sorted by their confidence value in + /// descending order. The values that this method computes are part of a **relative** + /// confidence metric, not of an absolute one. Each value is a number between 0.0 and 1.0. + /// The most likely language is always returned with value 1.0. All other languages get values + /// assigned which are lower than 1.0, denoting how less likely those languages are in + /// comparison to the most likely language. + /// + /// The object returned by this method does not necessarily contain all languages which the + /// calling instance of `LanguageDetector` was built from. If the rule-based engine decides + /// that a specific language is truly impossible, then it will not be part of the returned + /// object. Likewise, if no ngram probabilities can be found within the detector's languages + /// for the given input text, the returned object will be empty. The confidence value for + /// each language not being part of the returned object is assumed to be 0.0. + pub fn computeLanguageConfidenceValues(&self, text: &str) -> JsValue { + let confidence_values = self + .detector + .compute_language_confidence_values(text) + .iter() + .map(|(language, confidence)| ConfidenceValue { + language: language.to_string(), + confidence: *confidence, + }) + .collect_vec(); + + JsValue::from_serde(&confidence_values).unwrap() + } +}