diff --git a/CHANGELOG.md b/CHANGELOG.md index fcf1e2c..c58872f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,6 @@ +### v0.16.0 - 2022-05-07 +* [breaking] Add Armenian script (`Script::Armenian`) and language (`Lang::Hye`) + ### v0.15.0 - 2022-05-01 * Update enum-map dependency to version 2 * Optimize alphabet method for Cyrillic: almost 2x improved performance for Cyrillic languages and 7% for the average `detect()` benchmark. @@ -49,35 +52,35 @@ ### v0.11.0 - 2020-11-03 * [breaking] - rename code for Arabic: `Arb` -> `Ara` -#### v0.10.0 - 2020-09-04 +### v0.10.0 - 2020-09-04 * Support Catalan -#### v0.9.0 - 2020-06-26 +### v0.9.0 - 2020-06-26 * Support Slovak -#### v0.8.0 - 2020-05-08 +### v0.8.0 - 2020-05-08 * Support Latin -#### v0.7.4 - 2020-04-26 (yanked version) +### v0.7.4 - 2020-04-26 (yanked version) * Support Latin -#### v0.7.2 - 2019-10-19 +### v0.7.2 - 2019-10-19 * (fix) respect japanese whitelisting when mandarin characters are given (#44) -#### v0.7.1 - 2019-05-06 +### v0.7.1 - 2019-05-06 * Update dependency hashbrown 0.1.8 -> 0.3.0 (10% faster) -#### v0.7.0 - 2019-03-03 +### v0.7.0 - 2019-03-03 * Support Afrikaans language (afr) * Get rid of build dependencies: installation is much faster now -#### v0.6.0 - 2018-11-09 +### v0.6.0 - 2018-11-09 * Use hashbrown instead of fnv (detect() is 30% faster) * Use array on stack instead of vector for detect_script (1-2% faster) * Use build.rs to generate `lang.rs` file * Add property based testing -#### v0.5.0 - 2017-08-06 +### v0.5.0 - 2017-08-06 * (breaking) Rename `Lang::to_code(&self)` to `Lang::code(&self)` * (fix) Fix bug with zero division in confidence calculation * (fix) Confidence can not exceed 1.0 @@ -87,49 +90,49 @@ * Implement trait `Dislpay` for `Script` * Implement `Display` trait for `Lang` -#### v0.4.1 - 2017-07-31 +### v0.4.1 - 2017-07-31 * Calculate confidence in the range from 0 to 1 for Info -#### v0.4.0 - 2017-07-30 +### v0.4.0 - 2017-07-30 * Calculate is_reliable bool for `Info` struct. * Breaking changes for `Info`. Make fields private. Now one should use methods. * Remove support of Latin version of Serbo-Croatian, because it conflicts a lot with modern Croatian. -#### v0.3.3 - 2017-07-26 +### v0.3.3 - 2017-07-26 * Replace HashMap with FnvHashMap (~ 33% faster) -#### v0.3.2 - 2017-06-04 +### v0.3.2 - 2017-06-04 * Small performance improvement: preallocate memory for counter_hash in trigrams.rs (~ 2-3% faster) -#### v0.3.1 - 2017-02-10 +### v0.3.1 - 2017-02-10 * Fix build * Add link to doc at crates.io -#### v0.3.0 - 2017-02-10 +### v0.3.0 - 2017-02-10 * Support New 14 languages * (breaking) New API -#### v0.2.1 - 2017-02-07 +### v0.2.1 - 2017-02-07 * Support 10 new languages * Optimize trigram algorithms -#### v0.2.0 - 2017-02-06 +### v0.2.0 - 2017-02-06 * Optimize script detection * Accept text, blacklist and whitelist as references * 10 new languages * Fix: always guarantee same result on same input data (fix sorting issue) -#### v0.1.4 - 2017-02-04 +### v0.1.4 - 2017-02-04 * Support whitelist and blacklist -#### v0.1.3 - 2017-02-03 +### v0.1.3 - 2017-02-03 * Support more than 50 languages -#### v0.1.2 - 2017-01-29 +### v0.1.2 - 2017-01-29 * Support about 20 languages -#### v0.1.1 - 2016-12-25 +### v0.1.1 - 2016-12-25 * Tiny improvements -#### v0.1.0 - 2016-12-25 +### v0.1.0 - 2016-12-25 * First public release diff --git a/Cargo.toml b/Cargo.toml index 3102ed6..9cbab62 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,9 +1,9 @@ [package] name = "whatlang" -version = "0.15.0" +version = "0.16.0" authors = ["Sergey Potapov "] edition = "2018" -description = "Natural language detection library. Identifies language of a given text." +description = "Fast and lightweight language identification library for Rust." keywords = ["language", "nlp", "lang", "whatlang", "text"] license = "MIT" repository = "https://github.com/greyblake/whatlang-rs" diff --git a/README.md b/README.md index 32281fb..1eba8ff 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ ## Features -* Supports [68 languages](https://github.com/greyblake/whatlang-rs/blob/master/SUPPORTED_LANGUAGES.md) +* Supports [69 languages](https://github.com/greyblake/whatlang-rs/blob/master/SUPPORTED_LANGUAGES.md) * 100% written in Rust * Lightweight, fast and simple * Recognizes not only a language, but also a script (Latin, Cyrillic, etc) diff --git a/SUPPORTED_LANGUAGES.md b/SUPPORTED_LANGUAGES.md index be8d0ad..0d5b019 100644 --- a/SUPPORTED_LANGUAGES.md +++ b/SUPPORTED_LANGUAGES.md @@ -76,3 +76,4 @@ and [documentation](https://docs.rs/whatlang/). | Slovak | slk | `Lang::Slk` | | Catalan | cat | `Lang::Cat` | | Tagalog | tgl | `Lang::Tgl` | +| Armenian | hye | `Lang::Hye` | diff --git a/misc/lang.rs.erb b/misc/lang.rs.erb index f4505dd..c418c82 100644 --- a/misc/lang.rs.erb +++ b/misc/lang.rs.erb @@ -5,7 +5,7 @@ use std::fmt; use std::str::FromStr; -use crate::error::Error; +use crate::error::ParseError; #[cfg(feature = "enum-map")] use enum_map::Enum; @@ -122,10 +122,10 @@ impl fmt::Display for Lang { } impl FromStr for Lang { - type Err = Error; + type Err = ParseError; fn from_str(s: &str) -> Result { - Lang::from_code(s).ok_or_else(|| Error::ParseLang(s.to_string())) + Lang::from_code(s).ok_or_else(|| ParseError::Lang(s.to_string())) } } @@ -181,7 +181,7 @@ mod tests { assert!( matches!( result, - Err(Error::ParseLang(_)) + Err(ParseError::Lang(_)) ) ); } diff --git a/misc/supported_languages.csv b/misc/supported_languages.csv index 4198720..d3a748e 100644 --- a/misc/supported_languages.csv +++ b/misc/supported_languages.csv @@ -67,3 +67,4 @@ lat,Latin,Lingua Latina,0 slk,Slovak,Slovenčina,5 cat,Catalan,Català,10 tgl,Tagalog,Tagalog, +hye,Armenian,Հայերեն,7 diff --git a/src/lang.rs b/src/lang.rs index 2090e52..9fe3cc6 100644 --- a/src/lang.rs +++ b/src/lang.rs @@ -217,9 +217,12 @@ pub enum Lang { /// Tagalog (Tagalog) Tgl = 67, + + /// Հայերեն (Armenian) + Hye = 68, } -const VALUES: [Lang; 68] = [ +const VALUES: [Lang; 69] = [ Lang::Epo, Lang::Eng, Lang::Rus, @@ -288,6 +291,7 @@ const VALUES: [Lang; 68] = [ Lang::Slk, Lang::Cat, Lang::Tgl, + Lang::Hye, ]; fn lang_from_code>(code: S) -> Option { @@ -360,6 +364,7 @@ fn lang_from_code>(code: S) -> Option { "slk" => Some(Lang::Slk), "cat" => Some(Lang::Cat), "tgl" => Some(Lang::Tgl), + "hye" => Some(Lang::Hye), _ => None, } } @@ -434,6 +439,7 @@ fn lang_to_code(lang: Lang) -> &'static str { Lang::Slk => "slk", Lang::Cat => "cat", Lang::Tgl => "tgl", + Lang::Hye => "hye", } } @@ -507,6 +513,7 @@ fn lang_to_name(lang: Lang) -> &'static str { Lang::Slk => "Slovenčina", Lang::Cat => "Català", Lang::Tgl => "Tagalog", + Lang::Hye => "Հայերեն", } } @@ -580,6 +587,7 @@ fn lang_to_eng_name(lang: Lang) -> &'static str { Lang::Slk => "Slovak", Lang::Cat => "Catalan", Lang::Tgl => "Tagalog", + Lang::Hye => "Armenian", } } @@ -689,7 +697,7 @@ mod tests { #[test] fn test_all() { - assert_eq!(Lang::all().len(), 68); + assert_eq!(Lang::all().len(), 69); let all = Lang::all(); assert!(all.contains(&Lang::Ukr)); assert!(all.contains(&Lang::Swe)); diff --git a/src/scripts/chars.rs b/src/scripts/chars.rs index 72712ca..3b1e2b8 100644 --- a/src/scripts/chars.rs +++ b/src/scripts/chars.rs @@ -162,6 +162,14 @@ pub(crate) fn is_khmer(ch: char) -> bool { matches!(ch, '\u{1780}'..='\u{17FF}' | '\u{19E0}'..='\u{19FF}') } +// See: +// * https://en.wikipedia.org/wiki/Armenian_alphabet +// * https://www.unicode.org/charts/PDF/U0530.pdf +// * https://www.unicode.org/charts/PDF/UFB00.pdf +pub(crate) fn is_armenian(ch: char) -> bool { + matches!(ch, '\u{0530}'..='\u{058F}' | '\u{FB13}'..='\u{FB17}') +} + #[cfg(test)] mod tests { use super::*; @@ -279,4 +287,14 @@ mod tests { assert_eq!(is_oriya('୷'), true); assert_eq!(is_oriya('౿'), false); } + + #[test] + fn test_is_armenian() { + assert_eq!(is_armenian('რ'), false); // Georgian + assert_eq!(is_armenian('Ш'), false); // Cyrillic + assert_eq!(is_armenian('ա'), true); + assert_eq!(is_armenian('Ա'), true); + assert_eq!(is_armenian('Փ'), true); + assert_eq!(is_armenian('և'), true); + } } diff --git a/src/scripts/detect.rs b/src/scripts/detect.rs index d6c2d3f..702f03a 100644 --- a/src/scripts/detect.rs +++ b/src/scripts/detect.rs @@ -54,7 +54,7 @@ impl RawScriptInfo { } pub fn raw_detect_script(text: &str) -> RawScriptInfo { - let mut script_counters: [ScriptCounter; 24] = [ + let mut script_counters: [ScriptCounter; 25] = [ (Script::Latin, chars::is_latin, 0), (Script::Cyrillic, chars::is_cyrillic, 0), (Script::Arabic, chars::is_arabic, 0), @@ -79,6 +79,7 @@ pub fn raw_detect_script(text: &str) -> RawScriptInfo { (Script::Myanmar, chars::is_myanmar, 0), (Script::Sinhala, chars::is_sinhala, 0), (Script::Khmer, chars::is_khmer, 0), + (Script::Armenian, chars::is_armenian, 0), ]; for ch in text.chars() { diff --git a/src/scripts/grouping.rs b/src/scripts/grouping.rs index 46cce75..1f996ba 100644 --- a/src/scripts/grouping.rs +++ b/src/scripts/grouping.rs @@ -61,6 +61,7 @@ impl Script { Script::Sinhala => One(Lang::Sin), Script::Khmer => One(Lang::Khm), Script::Ethiopic => One(Lang::Amh), + Script::Armenian => One(Lang::Hye), Script::Katakana | Script::Hiragana => One(Lang::Jpn), } } diff --git a/src/scripts/lang_mapping.rs b/src/scripts/lang_mapping.rs index 5f6d2fe..80ea329 100644 --- a/src/scripts/lang_mapping.rs +++ b/src/scripts/lang_mapping.rs @@ -75,6 +75,7 @@ pub fn script_langs(script: Script) -> &'static [Lang] { Script::Sinhala => &[Lang::Sin], Script::Khmer => &[Lang::Khm], Script::Ethiopic => &[Lang::Amh], + Script::Armenian => &[Lang::Hye], Script::Katakana | Script::Hiragana => &[Lang::Jpn], } } diff --git a/src/scripts/script.rs b/src/scripts/script.rs index 84e2ab3..2fe3fca 100644 --- a/src/scripts/script.rs +++ b/src/scripts/script.rs @@ -14,6 +14,7 @@ use enum_map::Enum; pub enum Script { // Keep this in alphabetic order (for C bindings) Arabic, + Armenian, Bengali, Cyrillic, Devanagari, @@ -40,8 +41,9 @@ pub enum Script { } // Array of all existing Script values. -const VALUES: [Script; 24] = [ +const VALUES: [Script; 25] = [ Script::Arabic, + Script::Armenian, Script::Bengali, Script::Cyrillic, Script::Devanagari, @@ -107,6 +109,7 @@ impl Script { Script::Myanmar => "Myanmar", Script::Sinhala => "Sinhala", Script::Khmer => "Khmer", + Script::Armenian => "Armenian", } } @@ -150,6 +153,7 @@ impl FromStr for Script { "myanmar" => Ok(Script::Myanmar), "sinhala" => Ok(Script::Sinhala), "khmer" => Ok(Script::Khmer), + "armenian" => Ok(Script::Armenian), _ => Err(ParseError::Script(s.to_string())), } } @@ -161,7 +165,7 @@ mod tests { #[test] fn test_all() { - assert_eq!(Script::all().len(), 24); + assert_eq!(Script::all().len(), 25); let all = Script::all(); assert!(all.contains(&Script::Cyrillic)); assert!(all.contains(&Script::Arabic)); diff --git a/tests/examples.json b/tests/examples.json index 8883b6b..47c7892 100644 --- a/tests/examples.json +++ b/tests/examples.json @@ -66,5 +66,6 @@ "lat": "Credo ego vos, iudices, mirari, quid sit, quod, cum tot summi oratores hominesque nobilissimi sedeant, ego potissimum surrexerim, is, qui neque aetate neque ingenio neque auctoritate sim cum his, qui sedeant, comparandus. Omnes hi, quos videtis adesse in hac causa, iniuriam novo scelere conflatam putant oportere defendi, defendere ipsi propter iniquitatem temporum non audent. Ita fit, ut adsint propterea, quod officium sequuntur, taceant autem idcirco, quia periculum vitant.", "slk": "Kodifikačné príručky určujú, ktoré slová sa v slovenčine považujú za spisovné. Ide o 4 zákonom predpísané knihy.", "cat": "Aquest és l’honor més gran que he rebut a la meva vida. La pau ha estat sempre la meva més gran preocupació. Ja en la meva infantesa vaig aprendre a estimar-la. La meva mare – una dona excepcional, genial - , quan jo era noi, ja em parlava de la pau, perquè en aquells temps també hi havia moltes guerres. A més, sóc català. Catalunya va tenir el primer Parlament democràtic molt abans que Anglaterra. I fou al meu país on hi hagué les primeres nacions unides. En aquell temps – segle onzè – van reunir-se a Toluges – avui França – per parlar de la pau, perquè els catalans d’aquell temps ja estaven contra, CONTRA la guerra. Per això les Nacions Unides, que treballen únicament per l’ideal de la pau, estan en el meu cor, perquè tot allò referent a la pau hi va directament. (...) Fa molts anys que no toco el violoncel en públic, però crec que he de fer-ho en aquesta ocasió. Vaig a tocar una melodia del folklore català: El cant dels ocells. Els ocells, quan són al cel, van cantant: 'Peace, Peace, Peace' (pau, pau, pau) i és una melodia que Bach, Beethoven i tots els grans haurien admirat i estimat. I, a més, neix de l’ànima del meu poble, Catalunya.", - "tgl": "Sapagkat ang pagkilala sa katutubong karangalan at sa pantay at di-maikakait na mga karapatan ng lahat ng nabibilang sa angkan ng tao ay siyang saligan ng kalayaan, katarungan at kapayapaan sa daigdig. Sapagkat ang pagwawalang-bahala at paglalapastangan sa mga karapatan ng tao ay nagbunga ng mga gawang di-makatao na humamak sa budhi ng sangkatauhan, at ang pagdatal ng isang daigdig na ang mga tao ay magtatamasa ng kalayaan sa pagsasalita at ng kaligtasan sa pangamba at pagdaralita ay ipinahayag na pinakamataas na mithiin ng mga karaniwang tao. Sapagkat mahalaga, kung ang tao ay di-pipiliting manghawakan bilang huling magagawa, sa paghihimagsik laban sa paniniil at pang-aapi, na ang mga karapatan ng tao'y mapangalagaan sa pamamagitan ng paghahari ng batas. Sapagkat mahalagang itaguyod ang pagpapaunlad ng mabuting pagsasamahan ng mga bansa. Sapagkat ang mga mamamayan ng Mga Bansang Nagkakaisa ay nagpatibay sa Karta ng kanilang pananalig sa mga Saligang karapatan ng tao, sa karangalan at " + "tgl": "Sapagkat ang pagkilala sa katutubong karangalan at sa pantay at di-maikakait na mga karapatan ng lahat ng nabibilang sa angkan ng tao ay siyang saligan ng kalayaan, katarungan at kapayapaan sa daigdig. Sapagkat ang pagwawalang-bahala at paglalapastangan sa mga karapatan ng tao ay nagbunga ng mga gawang di-makatao na humamak sa budhi ng sangkatauhan, at ang pagdatal ng isang daigdig na ang mga tao ay magtatamasa ng kalayaan sa pagsasalita at ng kaligtasan sa pangamba at pagdaralita ay ipinahayag na pinakamataas na mithiin ng mga karaniwang tao. Sapagkat mahalaga, kung ang tao ay di-pipiliting manghawakan bilang huling magagawa, sa paghihimagsik laban sa paniniil at pang-aapi, na ang mga karapatan ng tao'y mapangalagaan sa pamamagitan ng paghahari ng batas. Sapagkat mahalagang itaguyod ang pagpapaunlad ng mabuting pagsasamahan ng mga bansa. Sapagkat ang mga mamamayan ng Mga Bansang Nagkakaisa ay nagpatibay sa Karta ng kanilang pananalig sa mga Saligang karapatan ng tao, sa karangalan at ", + "hye": "Հայոց լեզվով ստեղծվել" }