From 72b6fc8382ec79ce2ade6da21d97a711df313ad9 Mon Sep 17 00:00:00 2001 From: Sergey Potapov Date: Sun, 24 Apr 2022 19:29:36 +0200 Subject: [PATCH] Use optimized alphabet score calc for Cyrillic #111 --- src/alphabets/common.rs | 106 +++++++++++++++++++++++++++ src/alphabets/cyrillic.rs | 146 ++++++-------------------------------- src/alphabets/latin.rs | 97 ++----------------------- src/alphabets/mod.rs | 1 + 4 files changed, 133 insertions(+), 217 deletions(-) create mode 100644 src/alphabets/common.rs diff --git a/src/alphabets/common.rs b/src/alphabets/common.rs new file mode 100644 index 0000000..69dc56b --- /dev/null +++ b/src/alphabets/common.rs @@ -0,0 +1,106 @@ +use std::cmp::Reverse; +use std::collections::HashMap; + +use once_cell::sync::Lazy; + +use super::RawOutcome; +use crate::core::{FilterList, LowercaseText}; +use crate::utils::is_stop_char; +use crate::{Lang, Script}; + +/// Inverted map binding a character to a set of languages. +pub fn build_inverted_map(alphabets: &[(Lang, &str)]) -> (Vec, Vec>) { + let mut map = HashMap::new(); + + for (lang, alphabet) in alphabets { + for c in alphabet.chars() { + let entry = map.entry(c).or_insert_with(Vec::new); + entry.push(*lang); + } + } + + let mut char_lang: Vec<_> = map.into_iter().collect(); + + char_lang.sort_unstable_by_key(|(c, _)| *c); + + let mut chars = Vec::with_capacity(char_lang.len()); + let mut langs = Vec::with_capacity(char_lang.len()); + for (ch, languages) in char_lang { + chars.push(ch); + langs.push(languages); + } + + (chars, langs) +} + +pub fn generic_alphabet_calculate_scores( + script: Script, + lang_map: &Lazy<(Vec, Vec>)>, + text: &LowercaseText, + filter_list: &FilterList, +) -> RawOutcome { + let (chars, langs) = &**lang_map; + let script_langs = script.langs(); + + // score of each character. + let mut char_scores = vec![0; chars.len()]; + let mut max_raw_score = 0; + // iterate over the text and scores characters. + for ch in text.chars() { + if is_stop_char(ch) { + continue; + } + + max_raw_score += 1; + + if let Ok(position) = chars.binary_search(&ch) { + // add 2 and remove max_raw_score at the end, + // to keep the score interval of -max_raw_score..max_raw_score + char_scores[position] += 2; + } + } + + // score of each lang. + let mut lang_scores = vec![0; Lang::all().len()]; + let mut common_score: usize = 0; + // iterate over scored characters to compute language's scores. + for (position, char_score) in char_scores.into_iter().enumerate() { + if char_score > 0 { + let languages = &langs[position]; + // if current character is common to all Languages, increment a common score + // instead of iterating over all Languages scores. + if languages.len() == script_langs.len() { + common_score += char_score; + } else { + for &lang in languages { + lang_scores[lang as usize] += char_score; + } + } + } + } + + // remap languages with theirs scores. + let mut raw_scores: Vec<(Lang, usize)> = script_langs + .iter() + .filter(|&&l| filter_list.is_allowed(l)) + .map(|&l| { + let score = (lang_scores[l as usize] + common_score).saturating_sub(max_raw_score); + (l, score) + }) + .collect(); + + raw_scores.sort_unstable_by_key(|(_, score)| Reverse(*score)); + + let mut normalized_scores = vec![]; + + for &(lang, raw_score) in raw_scores.iter() { + let normalized_score = raw_score as f64 / max_raw_score as f64; + normalized_scores.push((lang, normalized_score)); + } + + RawOutcome { + count: max_raw_score, + raw_scores, + scores: normalized_scores, + } +} diff --git a/src/alphabets/cyrillic.rs b/src/alphabets/cyrillic.rs index c7b4bc9..87fd272 100644 --- a/src/alphabets/cyrillic.rs +++ b/src/alphabets/cyrillic.rs @@ -1,8 +1,8 @@ -use std::cmp::Reverse; - +use super::common::{build_inverted_map, generic_alphabet_calculate_scores}; use super::RawOutcome; use crate::core::{FilterList, LowercaseText}; use crate::{Lang, Script}; +use once_cell::sync::Lazy; const BUL: &str = "абвгдежзийклмнопрстуфхцчшщъьюя"; const RUS: &str = "абвгдежзийклмнопрстуфхцчшщъыьэюяё"; @@ -11,142 +11,36 @@ const BEL: &str = "абвгдежзйклмнопрстуфхцчшыьэюяё const SRP: &str = "абвгдежзиклмнопрстуфхцчшђјљњћџ"; const MKD: &str = "абвгдежзиклмнопрстуфхцчшѓѕјљњќџ"; -const ALL: &str = "абвгдежзийклмнопрстуфхцчшщъыьэюяёєіїґўђјљњћџѓѕќ"; - -pub fn alphabet_calculate_scores(text: &LowercaseText, filter_list: &FilterList) -> RawOutcome { - let mut raw_scores: Vec<(Lang, i32)> = Script::Cyrillic - .langs() - .iter() - .filter(|&&l| filter_list.is_allowed(l)) - .map(|&l| (l, 0i32)) - .collect(); - - let max_raw_score = text.chars().filter(|&ch| is_relevant(ch)).count(); - - for (lang, score) in &mut raw_scores { - let alphabet = get_lang_chars(*lang); +const CYRILLIC_ALPHABETS: &[(Lang, &str)] = &[ + (Lang::Bul, BUL), + (Lang::Rus, RUS), + (Lang::Ukr, UKR), + (Lang::Bel, BEL), + (Lang::Srp, SRP), + (Lang::Mkd, MKD), +]; - for ch in text.chars() { - if !is_relevant(ch) { - continue; - } else if alphabet.contains(&ch) { - *score += 1; - } else { - *score -= 1; - } - } - } - - raw_scores.sort_unstable_by_key(|(_, score)| Reverse(*score)); - - let raw_scores: Vec<(Lang, usize)> = raw_scores - .into_iter() - .map(|(l, s)| { - let score = if s < 0 { 0usize } else { s as usize }; - (l, score) - }) - .collect(); - - let mut normalized_scores = vec![]; - - for &(lang, raw_score) in &raw_scores { - // avoid devision by zero - let normalized_score = if raw_score == 0 { - 0.0 - } else { - raw_score as f64 / max_raw_score as f64 - }; - normalized_scores.push((lang, normalized_score)); - } +/// Inverted map binding a character to a set of languages. +static CYRILLIC_ALPHABET_LANG_MAP: Lazy<(Vec, Vec>)> = + Lazy::new(|| build_inverted_map(CYRILLIC_ALPHABETS)); - RawOutcome { - count: max_raw_score, - raw_scores, - scores: normalized_scores, - } -} - -fn is_relevant(ch: char) -> bool { - ALL.chars().any(|c| c == ch) -} - -fn get_lang_chars(lang: Lang) -> Vec { - let alphabet = match lang { - Lang::Bul => BUL, - Lang::Rus => RUS, - Lang::Ukr => UKR, - Lang::Bel => BEL, - Lang::Srp => SRP, - Lang::Mkd => MKD, - - _ => panic!("No alphabet for {}", lang), - }; - alphabet.chars().collect() +pub fn alphabet_calculate_scores(text: &LowercaseText, filter_list: &FilterList) -> RawOutcome { + generic_alphabet_calculate_scores( + Script::Cyrillic, + &CYRILLIC_ALPHABET_LANG_MAP, + text, + filter_list, + ) } #[cfg(test)] mod tests { use super::*; - const CYRILLIC_LANGS: [Lang; 6] = [ - Lang::Rus, - Lang::Ukr, - Lang::Srp, - Lang::Bel, - Lang::Mkd, - Lang::Bul, - ]; - fn fetch(lang: &Lang, scores: &[(Lang, T)]) -> T { scores.iter().find(|(l, _)| l == lang).unwrap().1 } - #[test] - fn test_when_latin_is_given() { - let text = LowercaseText::new("Foobar, hoh"); - let RawOutcome { - count, - raw_scores, - scores, - } = alphabet_calculate_scores(&text, &FilterList::default()); - - assert_eq!(count, 0); - assert_eq!(raw_scores.len(), CYRILLIC_LANGS.len()); - assert_eq!(scores.len(), CYRILLIC_LANGS.len()); - - for lang in &CYRILLIC_LANGS { - let raw_score = fetch(lang, &raw_scores); - assert_eq!(raw_score, 0); - } - - for lang in &CYRILLIC_LANGS { - let score = fetch(lang, &scores); - assert_eq!(score, 0.0); - } - } - - #[test] - fn test_when_common_cyrllic_is_given() { - let text = LowercaseText::new("абвг ww"); - let RawOutcome { - count, - raw_scores, - scores, - } = alphabet_calculate_scores(&text, &FilterList::default()); - - assert_eq!(count, 4); - - for lang in &CYRILLIC_LANGS { - let raw_score = fetch(lang, &raw_scores); - assert_eq!(raw_score, 4); - } - - for lang in &CYRILLIC_LANGS { - let score = fetch(lang, &scores); - assert_eq!(score, 1.0); - } - } - #[test] fn test_when_ukrainian_specific_chars_given() { let text = LowercaseText::new("Дуже цікаво"); diff --git a/src/alphabets/latin.rs b/src/alphabets/latin.rs index 1c8c924..a4772b5 100644 --- a/src/alphabets/latin.rs +++ b/src/alphabets/latin.rs @@ -1,11 +1,8 @@ -use std::cmp::Reverse; -use std::collections::HashMap; - use once_cell::sync::Lazy; +use super::common::{build_inverted_map, generic_alphabet_calculate_scores}; use super::RawOutcome; use crate::core::{FilterList, LowercaseText}; -use crate::utils::is_stop_char; use crate::{Lang, Script}; const AFR: &str = "abcdefghijklmnopqrstuvwxyzáèéêëíîïóôúû"; @@ -86,100 +83,18 @@ const LATIN_ALPHABETS: &[(Lang, &str)] = &[ ]; /// Inverted map binding a character to a set of languages. -pub static ALPHABET_LANG_MAP: Lazy<(Vec, Vec>)> = Lazy::new(|| { - let mut map = HashMap::new(); - - for (lang, alphabet) in LATIN_ALPHABETS { - for c in alphabet.chars() { - let entry = map.entry(c).or_insert_with(Vec::new); - entry.push(*lang); - } - } - - let mut char_lang: Vec<_> = map.into_iter().collect(); - - char_lang.sort_unstable_by_key(|(c, _)| *c); - - let mut chars = Vec::with_capacity(char_lang.len()); - let mut langs = Vec::with_capacity(char_lang.len()); - for (ch, languages) in char_lang { - chars.push(ch); - langs.push(languages); - } - - (chars, langs) -}); +pub static ALPHABET_LANG_MAP: Lazy<(Vec, Vec>)> = + Lazy::new(|| build_inverted_map(LATIN_ALPHABETS)); pub fn alphabet_calculate_scores(text: &LowercaseText, filter_list: &FilterList) -> RawOutcome { - let (chars, langs) = &*ALPHABET_LANG_MAP; - - // score of each character. - let mut char_scores = vec![0; chars.len()]; - let mut max_raw_score = 0; - // iterate over the text and scores characters. - for ch in text.chars() { - if is_stop_char(ch) { - continue; - } - - max_raw_score += 1; - - if let Ok(position) = chars.binary_search(&ch) { - // add 2 and remove max_raw_score at the end, - // to keep the score interval of -max_raw_score..max_raw_score - char_scores[position] += 2; - } - } - - // score of each lang. - let mut lang_scores = vec![0; Lang::all().len()]; - let mut common_score: usize = 0; - // iterate over scored characters to compute language's scores. - for (position, char_score) in char_scores.into_iter().enumerate() { - if char_score > 0 { - let languages = &langs[position]; - // if current character is common to all Languages, increment a common score - // instead of iterating over all Languages scores. - if languages.len() == LATIN_ALPHABETS.len() { - common_score += char_score; - } else { - for &lang in languages { - lang_scores[lang as usize] += char_score; - } - } - } - } - - // remap languages with theirs scores. - let mut raw_scores: Vec<(Lang, usize)> = Script::Latin - .langs() - .iter() - .filter(|&&l| filter_list.is_allowed(l)) - .map(|&l| { - let score = (lang_scores[l as usize] + common_score).saturating_sub(max_raw_score); - (l, score) - }) - .collect(); - - raw_scores.sort_unstable_by_key(|(_, score)| Reverse(*score)); - - let mut normalized_scores = vec![]; - - for &(lang, raw_score) in raw_scores.iter() { - let normalized_score = raw_score as f64 / max_raw_score as f64; - normalized_scores.push((lang, normalized_score)); - } - - RawOutcome { - count: max_raw_score, - raw_scores, - scores: normalized_scores, - } + generic_alphabet_calculate_scores(Script::Latin, &ALPHABET_LANG_MAP, text, filter_list) } #[cfg(test)] mod tests { use super::*; + use crate::utils::is_stop_char; + use crate::Script; // Old naive implementation, that is not very effective but easy to understand fn naive_alphabet_calculate_scores( diff --git a/src/alphabets/mod.rs b/src/alphabets/mod.rs index 48133af..cd79006 100644 --- a/src/alphabets/mod.rs +++ b/src/alphabets/mod.rs @@ -1,3 +1,4 @@ +pub(crate) mod common; pub(crate) mod cyrillic; pub(crate) mod detection; pub(crate) mod latin;