Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Using generic function to compute alphabet score for Cyrillic and Latin #115

Closed
wants to merge 11 commits into from
110 changes: 15 additions & 95 deletions src/alphabets/cyrillic.rs
@@ -1,108 +1,26 @@
use std::cmp::Reverse;

use super::RawOutcome;
use crate::alphabets::generic;
use crate::core::{FilterList, LowercaseText};
use crate::{Lang, Script};

const BUL: &str = "абвгдежзийклмнопрстуфхцчшщъьюя";
const RUS: &str = "абвгдежзийклмнопрстуфхцчшщъыьэюяё";
const UKR: &str = "абвгдежзийклмнопрстуфхцчшщьюяєіїґ";
const BEL: &str = "абвгдежзйклмнопрстуфхцчшыьэюяёіў";
const SRP: &str = "абвгдежзиклмнопрстуфхцчшђјљњћџ";
const MKD: &str = "абвгдежзиклмнопрстуфхцчшѓѕјљњќџ";

const ALL: &str = "абвгдежзийклмнопрстуфхцчшщъыьэюяёєіїґўђјљњћџѓѕќ";
use crate::Script;

/// Scores every allowed Cyrillic-script language against `text` by alphabet membership.
///
/// Only characters accepted by `is_relevant` take part. For each language, a relevant
/// character adds one point when it is in that language's alphabet and subtracts one
/// point otherwise. Negative totals are clamped to zero, and the normalized score is the
/// clamped total divided by the number of relevant characters.
///
/// The returned `RawOutcome` holds the relevant-character count, the clamped raw scores
/// (sorted by descending signed score) and the normalized scores in the same order.
pub fn alphabet_calculate_scores(text: &LowercaseText, filter_list: &FilterList) -> RawOutcome {
    // Number of characters that can contribute to a score at all.
    let max_raw_score = text.chars().filter(|ch| is_relevant(*ch)).count();

    // Signed tally per allowed language: +1 for a hit, -1 for a miss.
    let mut signed_scores: Vec<(Lang, i32)> = Script::Cyrillic
        .langs()
        .iter()
        .copied()
        .filter(|&lang| filter_list.is_allowed(lang))
        .map(|lang| {
            let alphabet = get_lang_chars(lang);
            let tally = text
                .chars()
                .filter(|ch| is_relevant(*ch))
                .fold(0i32, |acc, ch| {
                    if alphabet.contains(&ch) {
                        acc + 1
                    } else {
                        acc - 1
                    }
                });
            (lang, tally)
        })
        .collect();

    // Best score first; ordering is decided on the signed values, before clamping.
    signed_scores.sort_unstable_by_key(|(_, tally)| Reverse(*tally));

    // Clamp negative totals to zero while converting to unsigned.
    let raw_scores: Vec<(Lang, usize)> = signed_scores
        .into_iter()
        .map(|(lang, tally)| (lang, tally.max(0) as usize))
        .collect();

    let scores = raw_scores
        .iter()
        .map(|&(lang, raw)| {
            // Avoid division by zero: a zero raw score always normalizes to 0.0.
            let normalized = match raw {
                0 => 0.0,
                _ => raw as f64 / max_raw_score as f64,
            };
            (lang, normalized)
        })
        .collect();

    RawOutcome {
        count: max_raw_score,
        raw_scores,
        scores,
    }
}

/// Returns `true` when `ch` is one of the characters listed in `ALL`.
fn is_relevant(ch: char) -> bool {
    ALL.contains(ch)
}

/// Returns the characters of the alphabet constant associated with `lang`.
///
/// # Panics
///
/// Panics with "No alphabet for {lang}" for any language other than the six matched below.
fn get_lang_chars(lang: Lang) -> Vec<char> {
let alphabet = match lang {
Lang::Bul => BUL,
Lang::Rus => RUS,
Lang::Ukr => UKR,
Lang::Bel => BEL,
Lang::Srp => SRP,
Lang::Mkd => MKD,

_ => panic!("No alphabet for {}", lang),
};
alphabet.chars().collect()
// NOTE(review): the two lines below use `text` and `filter_list`, which are not
// parameters of this function. They look like lines added by the diff (presumably the
// new body of `alphabet_calculate_scores`) that ended up interleaved here when the
// diff view was flattened — confirm against the actual change set.
let all_langs = Script::Cyrillic.langs();
generic::alphabet_calculate_scores_generic(text, filter_list, all_langs)
}

#[cfg(test)]
mod tests {
use super::*;

const CYRILLIC_LANGS: [Lang; 6] = [
Lang::Rus,
Lang::Ukr,
Lang::Srp,
Lang::Bel,
Lang::Mkd,
Lang::Bul,
];
use crate::Lang;

/// Test helper: returns the value paired with `lang` in `scores`.
///
/// Panics (via `unwrap`) when `lang` has no entry in `scores`.
fn fetch<T: Copy>(lang: &Lang, scores: &[(Lang, T)]) -> T {
    let &(_, value) = scores.iter().find(|entry| entry.0 == *lang).unwrap();
    value
}

#[test]
fn test_when_latin_is_given() {
let cyrillic_langs: &[Lang] = Script::Cyrillic.langs();

let text = LowercaseText::new("Foobar, hoh");
let RawOutcome {
count,
Expand All @@ -111,22 +29,24 @@ mod tests {
} = alphabet_calculate_scores(&text, &FilterList::default());

assert_eq!(count, 0);
assert_eq!(raw_scores.len(), CYRILLIC_LANGS.len());
assert_eq!(scores.len(), CYRILLIC_LANGS.len());
assert_eq!(raw_scores.len(), cyrillic_langs.len());
assert_eq!(scores.len(), cyrillic_langs.len());

for lang in &CYRILLIC_LANGS {
for lang in cyrillic_langs {
let raw_score = fetch(lang, &raw_scores);
assert_eq!(raw_score, 0);
}

for lang in &CYRILLIC_LANGS {
for lang in cyrillic_langs {
let score = fetch(lang, &scores);
assert_eq!(score, 0.0);
}
}

#[test]
fn test_when_common_cyrllic_is_given() {
let cyrillic_langs: &[Lang] = Script::Cyrillic.langs();

let text = LowercaseText::new("абвг ww");
let RawOutcome {
count,
Expand All @@ -136,12 +56,12 @@ mod tests {

assert_eq!(count, 4);

for lang in &CYRILLIC_LANGS {
for lang in cyrillic_langs {
let raw_score = fetch(lang, &raw_scores);
assert_eq!(raw_score, 4);
}

for lang in &CYRILLIC_LANGS {
for lang in cyrillic_langs {
let score = fetch(lang, &scores);
assert_eq!(score, 1.0);
}
Expand Down