/
cyrillic.rs
61 lines (51 loc) · 2 KB
/
cyrillic.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
use super::common::{build_inverted_map, generic_alphabet_calculate_scores};
use super::RawOutcome;
use crate::core::{FilterList, LowercaseText};
use crate::{Lang, Script};
use once_cell::sync::Lazy;
// Per-language character sets for the Cyrillic script, written in lowercase.
// Each constant is paired with its `Lang` variant in `CYRILLIC_ALPHABETS`.

/// Characters attributed to `Lang::Bul`.
const BUL: &str = "абвгдежзийклмнопрстуфхцчшщъьюя";
/// Characters attributed to `Lang::Rus`.
const RUS: &str = "абвгдежзийклмнопрстуфхцчшщъыьэюяё";
/// Characters attributed to `Lang::Ukr`.
const UKR: &str = "абвгдежзийклмнопрстуфхцчшщьюяєіїґ";
/// Characters attributed to `Lang::Bel`.
const BEL: &str = "абвгдежзйклмнопрстуфхцчшыьэюяёіў";
/// Characters attributed to `Lang::Srp`.
const SRP: &str = "абвгдежзиклмнопрстуфхцчшђјљњћџ";
/// Characters attributed to `Lang::Mkd`.
const MKD: &str = "абвгдежзиклмнопрстуфхцчшѓѕјљњќџ";
/// Table pairing each Cyrillic-script language handled by this module with its
/// character set.
///
/// This is the input handed to `build_inverted_map` when
/// `CYRILLIC_ALPHABET_LANG_MAP` is initialised.
const CYRILLIC_ALPHABETS: &[(Lang, &str)] = &[
(Lang::Bul, BUL),
(Lang::Rus, RUS),
(Lang::Ukr, UKR),
(Lang::Bel, BEL),
(Lang::Srp, SRP),
(Lang::Mkd, MKD),
];
/// Inverted map binding a character to the set of languages that list it.
///
/// Built from `CYRILLIC_ALPHABETS` by `build_inverted_map` the first time the
/// static is dereferenced; later accesses reuse the same value.
static CYRILLIC_ALPHABET_LANG_MAP: Lazy<(Vec<char>, Vec<Vec<Lang>>)> = Lazy::new(|| {
    let alphabets: &[(Lang, &str)] = CYRILLIC_ALPHABETS;
    build_inverted_map(alphabets)
});
/// Scores `text` against the Cyrillic alphabets.
///
/// Thin wrapper that delegates to `generic_alphabet_calculate_scores`, supplying
/// `Script::Cyrillic` and the inverted map `CYRILLIC_ALPHABET_LANG_MAP`.
/// `text` and `filter_list` are forwarded unchanged, and the helper's
/// `RawOutcome` is returned as-is.
pub fn alphabet_calculate_scores(text: &LowercaseText, filter_list: &FilterList) -> RawOutcome {
    let script = Script::Cyrillic;
    let lang_map = &CYRILLIC_ALPHABET_LANG_MAP;
    generic_alphabet_calculate_scores(script, lang_map, text, filter_list)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Returns the value stored next to `lang` in `scores`.
    ///
    /// Panics if `lang` has no entry.
    fn fetch<T: Copy>(lang: &Lang, scores: &[(Lang, T)]) -> T {
        scores
            .iter()
            .find_map(|(candidate, value)| {
                if candidate == lang {
                    Some(*value)
                } else {
                    None
                }
            })
            .unwrap()
    }

    /// Text containing `і`, which `UKR` lists and `RUS` does not, must score
    /// higher for Ukrainian than for Russian.
    #[test]
    fn test_when_ukrainian_specific_chars_given() {
        let text = LowercaseText::new("Дуже цікаво");
        let filter = FilterList::default();
        let outcome = alphabet_calculate_scores(&text, &filter);

        assert_eq!(outcome.count, 10);

        assert_eq!(fetch(&Lang::Ukr, &outcome.raw_scores), 10);
        assert_eq!(fetch(&Lang::Rus, &outcome.raw_scores), 8);

        assert_eq!(fetch(&Lang::Ukr, &outcome.scores), 1.0);
        assert_eq!(fetch(&Lang::Rus, &outcome.scores), 0.8);
    }
}