Skip to content

Commit

Permalink
Use optimized alphabet score calc for Cyrillic #111
Browse files Browse the repository at this point in the history
  • Loading branch information
greyblake committed Apr 24, 2022
1 parent 5f1ebeb commit 72b6fc8
Show file tree
Hide file tree
Showing 4 changed files with 133 additions and 217 deletions.
106 changes: 106 additions & 0 deletions src/alphabets/common.rs
@@ -0,0 +1,106 @@
use std::cmp::Reverse;
use std::collections::HashMap;

use once_cell::sync::Lazy;

use super::RawOutcome;
use crate::core::{FilterList, LowercaseText};
use crate::utils::is_stop_char;
use crate::{Lang, Script};

/// Builds an inverted map binding each character to the languages that use it.
///
/// `alphabets` pairs every language with the characters of its alphabet.
///
/// Returns two parallel vectors: the distinct characters sorted in ascending
/// order (so they can be looked up with `binary_search`), and, at the same
/// index, the languages whose alphabet contains that character. A language is
/// listed at most once per character, even if an alphabet string repeats it.
pub fn build_inverted_map(alphabets: &[(Lang, &str)]) -> (Vec<char>, Vec<Vec<Lang>>) {
    let mut map: HashMap<char, Vec<Lang>> = HashMap::new();

    for &(lang, alphabet) in alphabets {
        for c in alphabet.chars() {
            let entry = map.entry(c).or_default();
            // A repeated character must not register the language twice:
            // that would double-count the language when scoring and distort
            // any check based on the number of languages per character.
            if !entry.contains(&lang) {
                entry.push(lang);
            }
        }
    }

    let mut char_lang: Vec<(char, Vec<Lang>)> = map.into_iter().collect();

    // HashMap iteration order is arbitrary; sort so the result is deterministic
    // and the character vector can be binary-searched.
    char_lang.sort_unstable_by_key(|(c, _)| *c);

    char_lang.into_iter().unzip()
}

/// Scores the languages of `script` by how well their alphabets cover `text`.
///
/// Every character of `text` that is not a stop character (per `is_stop_char`)
/// counts once. A language gains one point for each counted character that
/// belongs to its alphabet and loses one point for each that does not; totals
/// below zero are clamped to zero.
///
/// * `script` - supplies the candidate languages via `Script::langs`.
/// * `lang_map` - inverted map: sorted characters and, at the same index, the
///   languages using that character.
/// * `text` - the text to score.
/// * `filter_list` - only languages it allows appear in the result.
///
/// Returns a `RawOutcome` where `count` is the number of counted characters,
/// `raw_scores` holds the clamped score per language in descending order, and
/// `scores` holds the same entries divided by `count`. When `count` is zero
/// every normalized score is `0.0` (never `NaN`).
pub fn generic_alphabet_calculate_scores(
    script: Script,
    lang_map: &Lazy<(Vec<char>, Vec<Vec<Lang>>)>,
    text: &LowercaseText,
    filter_list: &FilterList,
) -> RawOutcome {
    let (chars, langs) = &**lang_map;
    let script_langs = script.langs();

    // Score of each known character, indexed like `chars`.
    let mut char_scores = vec![0usize; chars.len()];
    let mut max_raw_score = 0usize;
    // Iterate over the text and score its characters.
    for ch in text.chars() {
        if is_stop_char(ch) {
            continue;
        }

        max_raw_score += 1;

        if let Ok(position) = chars.binary_search(&ch) {
            // Add 2 per hit and subtract `max_raw_score` at the end: a hit then
            // nets +1 and a miss nets -1, without needing signed arithmetic.
            char_scores[position] += 2;
        }
    }

    // Score of each language, indexed by the language's discriminant.
    let mut lang_scores = vec![0usize; Lang::all().len()];
    let mut common_score: usize = 0;
    // Fold the per-character scores into per-language scores.
    for (position, char_score) in char_scores.into_iter().enumerate() {
        if char_score == 0 {
            continue;
        }

        let languages = &langs[position];
        // A character used by every language of the script goes into a shared
        // bucket instead of touching each language's score individually.
        if languages.len() == script_langs.len() {
            common_score += char_score;
        } else {
            for &lang in languages {
                lang_scores[lang as usize] += char_score;
            }
        }
    }

    // Pair each allowed language with its final (clamped) score.
    let mut raw_scores: Vec<(Lang, usize)> = script_langs
        .iter()
        .filter(|&&l| filter_list.is_allowed(l))
        .map(|&l| {
            let score = (lang_scores[l as usize] + common_score).saturating_sub(max_raw_score);
            (l, score)
        })
        .collect();

    raw_scores.sort_unstable_by_key(|(_, score)| Reverse(*score));

    let scores = raw_scores
        .iter()
        .map(|&(lang, raw_score)| {
            // Without this guard a text with no countable characters would
            // produce 0.0 / 0.0 = NaN for every language.
            let normalized_score = if max_raw_score == 0 {
                0.0
            } else {
                raw_score as f64 / max_raw_score as f64
            };
            (lang, normalized_score)
        })
        .collect();

    RawOutcome {
        count: max_raw_score,
        raw_scores,
        scores,
    }
}
146 changes: 20 additions & 126 deletions src/alphabets/cyrillic.rs
@@ -1,8 +1,8 @@
use std::cmp::Reverse;

use super::common::{build_inverted_map, generic_alphabet_calculate_scores};
use super::RawOutcome;
use crate::core::{FilterList, LowercaseText};
use crate::{Lang, Script};
use once_cell::sync::Lazy;

const BUL: &str = "абвгдежзийклмнопрстуфхцчшщъьюя";
const RUS: &str = "абвгдежзийклмнопрстуфхцчшщъыьэюяё";
Expand All @@ -11,142 +11,36 @@ const BEL: &str = "абвгдежзйклмнопрстуфхцчшыьэюяё
const SRP: &str = "абвгдежзиклмнопрстуфхцчшђјљњћџ";
const MKD: &str = "абвгдежзиклмнопрстуфхцчшѓѕјљњќџ";

const ALL: &str = "абвгдежзийклмнопрстуфхцчшщъыьэюяёєіїґўђјљњћџѓѕќ";

pub fn alphabet_calculate_scores(text: &LowercaseText, filter_list: &FilterList) -> RawOutcome {
let mut raw_scores: Vec<(Lang, i32)> = Script::Cyrillic
.langs()
.iter()
.filter(|&&l| filter_list.is_allowed(l))
.map(|&l| (l, 0i32))
.collect();

let max_raw_score = text.chars().filter(|&ch| is_relevant(ch)).count();

for (lang, score) in &mut raw_scores {
let alphabet = get_lang_chars(*lang);
/// Alphabets of the Cyrillic-script languages, each paired with its `Lang`.
///
/// This table is the input from which `CYRILLIC_ALPHABET_LANG_MAP` is built.
const CYRILLIC_ALPHABETS: &[(Lang, &str)] = &[
(Lang::Bul, BUL),
(Lang::Rus, RUS),
(Lang::Ukr, UKR),
(Lang::Bel, BEL),
(Lang::Srp, SRP),
(Lang::Mkd, MKD),
];

for ch in text.chars() {
if !is_relevant(ch) {
continue;
} else if alphabet.contains(&ch) {
*score += 1;
} else {
*score -= 1;
}
}
}

raw_scores.sort_unstable_by_key(|(_, score)| Reverse(*score));

let raw_scores: Vec<(Lang, usize)> = raw_scores
.into_iter()
.map(|(l, s)| {
let score = if s < 0 { 0usize } else { s as usize };
(l, score)
})
.collect();

let mut normalized_scores = vec![];

for &(lang, raw_score) in &raw_scores {
// avoid division by zero
let normalized_score = if raw_score == 0 {
0.0
} else {
raw_score as f64 / max_raw_score as f64
};
normalized_scores.push((lang, normalized_score));
}
/// Inverted map binding a character to a set of languages.
///
/// Produced by `build_inverted_map` from `CYRILLIC_ALPHABETS` inside a `Lazy`
/// initializer. The first vector holds the characters, the second holds, at the
/// same index, the languages whose alphabet contains that character.
static CYRILLIC_ALPHABET_LANG_MAP: Lazy<(Vec<char>, Vec<Vec<Lang>>)> =
Lazy::new(|| build_inverted_map(CYRILLIC_ALPHABETS));

RawOutcome {
count: max_raw_score,
raw_scores,
scores: normalized_scores,
}
}

/// Tells whether `ch` is one of the characters listed in `ALL`.
fn is_relevant(ch: char) -> bool {
    ALL.contains(ch)
}

fn get_lang_chars(lang: Lang) -> Vec<char> {
let alphabet = match lang {
Lang::Bul => BUL,
Lang::Rus => RUS,
Lang::Ukr => UKR,
Lang::Bel => BEL,
Lang::Srp => SRP,
Lang::Mkd => MKD,

_ => panic!("No alphabet for {}", lang),
};
alphabet.chars().collect()
/// Calculates alphabet-based scores of the Cyrillic-script languages for `text`.
///
/// Delegates to `generic_alphabet_calculate_scores`, passing `Script::Cyrillic`
/// and the `CYRILLIC_ALPHABET_LANG_MAP` inverted map; `filter_list` is forwarded
/// unchanged.
pub fn alphabet_calculate_scores(text: &LowercaseText, filter_list: &FilterList) -> RawOutcome {
generic_alphabet_calculate_scores(
Script::Cyrillic,
&CYRILLIC_ALPHABET_LANG_MAP,
text,
filter_list,
)
}

#[cfg(test)]
mod tests {
use super::*;

// Languages for which the tests below expect an entry in the outcome.
const CYRILLIC_LANGS: [Lang; 6] = [
Lang::Rus,
Lang::Ukr,
Lang::Srp,
Lang::Bel,
Lang::Mkd,
Lang::Bul,
];

/// Returns the value paired with `lang` in `scores`.
///
/// Panics if `scores` has no entry for `lang`.
fn fetch<T: Copy>(lang: &Lang, scores: &[(Lang, T)]) -> T {
    let index = scores
        .iter()
        .position(|(candidate, _)| candidate == lang)
        .unwrap();
    scores[index].1
}

#[test]
fn test_when_latin_is_given() {
    // Input without a single Cyrillic letter.
    let input = LowercaseText::new("Foobar, hoh");
    let outcome = alphabet_calculate_scores(&input, &FilterList::default());

    assert_eq!(outcome.count, 0);
    assert_eq!(outcome.raw_scores.len(), CYRILLIC_LANGS.len());
    assert_eq!(outcome.scores.len(), CYRILLIC_LANGS.len());

    // Every language must be present with a zero raw score...
    CYRILLIC_LANGS
        .iter()
        .for_each(|lang| assert_eq!(fetch(lang, &outcome.raw_scores), 0));

    // ...and a zero normalized score.
    CYRILLIC_LANGS
        .iter()
        .for_each(|lang| assert_eq!(fetch(lang, &outcome.scores), 0.0));
}

#[test]
fn test_when_common_cyrllic_is_given() {
    // Four Cyrillic letters followed by Latin ones.
    let input = LowercaseText::new("абвг ww");
    let outcome = alphabet_calculate_scores(&input, &FilterList::default());

    assert_eq!(outcome.count, 4);

    // Each language is expected to reach the full raw score...
    CYRILLIC_LANGS
        .iter()
        .for_each(|lang| assert_eq!(fetch(lang, &outcome.raw_scores), 4));

    // ...and therefore the maximal normalized score.
    CYRILLIC_LANGS
        .iter()
        .for_each(|lang| assert_eq!(fetch(lang, &outcome.scores), 1.0));
}

#[test]
fn test_when_ukrainian_specific_chars_given() {
let text = LowercaseText::new("Дуже цікаво");
Expand Down
97 changes: 6 additions & 91 deletions src/alphabets/latin.rs
@@ -1,11 +1,8 @@
use std::cmp::Reverse;
use std::collections::HashMap;

use once_cell::sync::Lazy;

use super::common::{build_inverted_map, generic_alphabet_calculate_scores};
use super::RawOutcome;
use crate::core::{FilterList, LowercaseText};
use crate::utils::is_stop_char;
use crate::{Lang, Script};

const AFR: &str = "abcdefghijklmnopqrstuvwxyzáèéêëíîïóôúû";
Expand Down Expand Up @@ -86,100 +83,18 @@ const LATIN_ALPHABETS: &[(Lang, &str)] = &[
];

/// Inverted map binding a character to a set of languages.
///
/// The first vector holds the distinct characters of `LATIN_ALPHABETS` in
/// ascending order; the second holds, at the same index, the languages whose
/// alphabet contains that character.
pub static ALPHABET_LANG_MAP: Lazy<(Vec<char>, Vec<Vec<Lang>>)> = Lazy::new(|| {
let mut map = HashMap::new();

// Collect, for every character, the languages whose alphabet lists it.
for (lang, alphabet) in LATIN_ALPHABETS {
for c in alphabet.chars() {
let entry = map.entry(c).or_insert_with(Vec::new);
entry.push(*lang);
}
}

let mut char_lang: Vec<_> = map.into_iter().collect();

// Order by character so the character vector is sorted.
char_lang.sort_unstable_by_key(|(c, _)| *c);

// Split the sorted pairs into two parallel vectors.
let mut chars = Vec::with_capacity(char_lang.len());
let mut langs = Vec::with_capacity(char_lang.len());
for (ch, languages) in char_lang {
chars.push(ch);
langs.push(languages);
}

(chars, langs)
});
/// Inverted map for the Latin script, produced by `build_inverted_map` from
/// `LATIN_ALPHABETS` inside a `Lazy` initializer.
pub static ALPHABET_LANG_MAP: Lazy<(Vec<char>, Vec<Vec<Lang>>)> =
Lazy::new(|| build_inverted_map(LATIN_ALPHABETS));

pub fn alphabet_calculate_scores(text: &LowercaseText, filter_list: &FilterList) -> RawOutcome {
let (chars, langs) = &*ALPHABET_LANG_MAP;

// score of each character.
let mut char_scores = vec![0; chars.len()];
let mut max_raw_score = 0;
// iterate over the text and scores characters.
for ch in text.chars() {
if is_stop_char(ch) {
continue;
}

max_raw_score += 1;

if let Ok(position) = chars.binary_search(&ch) {
// add 2 and remove max_raw_score at the end,
// to keep the score interval of -max_raw_score..max_raw_score
char_scores[position] += 2;
}
}

// score of each lang.
let mut lang_scores = vec![0; Lang::all().len()];
let mut common_score: usize = 0;
// iterate over scored characters to compute language's scores.
for (position, char_score) in char_scores.into_iter().enumerate() {
if char_score > 0 {
let languages = &langs[position];
// if current character is common to all Languages, increment a common score
// instead of iterating over all Languages scores.
if languages.len() == LATIN_ALPHABETS.len() {
common_score += char_score;
} else {
for &lang in languages {
lang_scores[lang as usize] += char_score;
}
}
}
}

// remap languages with theirs scores.
let mut raw_scores: Vec<(Lang, usize)> = Script::Latin
.langs()
.iter()
.filter(|&&l| filter_list.is_allowed(l))
.map(|&l| {
let score = (lang_scores[l as usize] + common_score).saturating_sub(max_raw_score);
(l, score)
})
.collect();

raw_scores.sort_unstable_by_key(|(_, score)| Reverse(*score));

let mut normalized_scores = vec![];

for &(lang, raw_score) in raw_scores.iter() {
let normalized_score = raw_score as f64 / max_raw_score as f64;
normalized_scores.push((lang, normalized_score));
}

RawOutcome {
count: max_raw_score,
raw_scores,
scores: normalized_scores,
}
generic_alphabet_calculate_scores(Script::Latin, &ALPHABET_LANG_MAP, text, filter_list)
}

#[cfg(test)]
mod tests {
use super::*;
use crate::utils::is_stop_char;
use crate::Script;

// Old naive implementation, that is not very effective but easy to understand
fn naive_alphabet_calculate_scores(
Expand Down

0 comments on commit 72b6fc8

Please sign in to comment.