Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize alphabet cyrillic #116

Merged
merged 6 commits into from May 1, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
10 changes: 10 additions & 0 deletions Makefile
@@ -0,0 +1,10 @@
# None of these targets produce a file of the same name, so declare them phony:
# make will then always run them, even if a file or directory called `doc`,
# `test`, `bench`, ... happens to exist in the project root.
.PHONY: watch watch-doc doc bench test

# Re-run the test suite on every source change (uses the `cargo watch` subcommand).
watch:
	cargo watch -x test

# Rebuild the documentation (including private items) on every source change.
watch-doc:
	cargo watch -s 'cargo doc --no-deps --all-features --document-private-items'

# Build the documentation (including private items) and open it in a browser.
doc:
	cargo doc --no-deps --all-features --document-private-items --open

# Run the performance benchmarks.
bench:
	cargo bench --all-features

# Run the test suite with all features enabled.
test:
	cargo test --all-features
11 changes: 5 additions & 6 deletions README.md
Expand Up @@ -91,13 +91,12 @@ This function is a hyperbola and it looks like the following one:

For more details, please check a blog article [Introduction to Rust Whatlang Library and Natural Language Identification Algorithms](https://www.greyblake.com/blog/2017-07-30-introduction-to-rust-whatlang-library-and-natural-language-identification-algorithms/).

## Running benchmarks
## Make tasks

This is mostly useful to test performance optimizations.

```
cargo bench
```
* `make bench` - run performance benchmarks
* `make doc` - generate and open the documentation
* `make test` - run tests
* `make watch` - watch changes and run tests

## Comparison with alternatives

Expand Down
33 changes: 30 additions & 3 deletions benches/example.rs
@@ -1,10 +1,11 @@
#[macro_use]
extern crate bencher;
extern crate serde_json;
extern crate whatlang;

use bencher::Bencher;
use std::collections::HashMap;
use whatlang::dev::{
alphabet_cyrillic_calculate_scores, alphabet_latin_calculate_scores, FilterList, LowercaseText,
};
use whatlang::{detect, detect_script};

fn bench_detect(bench: &mut Bencher) {
Expand All @@ -29,5 +30,31 @@ fn bench_detect_script(bench: &mut Bencher) {
})
}

benchmark_group!(benches, bench_detect, bench_detect_script);
/// Benchmarks the Latin alphabet scoring on a fixed mixed German/Esperanto sample.
///
/// The text is lowercased and the filter is built once, outside the measured
/// closure, so only the scoring call itself is timed.
fn bench_alphabet_latin_calculate_scores(bench: &mut Bencher) {
    let text = "Ich sehe auf die Uhr. Es ist kurz vor Mittag, und da heute Sonnabend ist, mache ich Schluß. Por ke lingvo internacia povu bone kaj regule progresadi kaj por ke ĝi havu plenan certecon, ke ĝi neniam disfalos kaj ia facilanima paŝo de ĝiaj amikoj estontaj ne detruos la laborojn de ĝiaj amikoj estintaj, - estas plej necesa antaŭ ĉio unu kondiĉo: la ezistado de klare difinita, neniam tuŝebla kaj neniam ŝangebla Fundamento de la lingvo.";
    let lowercase_text = LowercaseText::new(text);
    let filter = FilterList::All;

    // Return the outcome instead of discarding it with `;`: the value handed
    // back to `Bencher::iter` is kept observable by the harness, so the
    // optimizer cannot remove the call that is being measured.
    bench.iter(|| alphabet_latin_calculate_scores(&lowercase_text, &filter))
}

/// Benchmarks the Cyrillic alphabet scoring on a fixed Ukrainian sample that
/// also contains a few Latin words.
///
/// The text is lowercased and the filter is built once, outside the measured
/// closure, so only the scoring call itself is timed.
fn bench_alphabet_cyrillic_calculate_scores(bench: &mut Bencher) {
    let text = "Творець есперанто Людвік Заменгоф назвав свою мову просто Lingvo internacia «міжнародна мова». Оскільки на той час у Європі популярною була інша штучна мова — волапюк, прихильники есперанто часто казали «мова доктора Есперанто». Згодом це формулювання скоротилося до «мова Есперанто», а врешті-решт залишилося одне лише слово «Esperanto», яке есперантською пишуть з великої літери, аби його можна було відрізнити від слова «людина, яка сподівається»";
    let lowercase_text = LowercaseText::new(text);
    let filter = FilterList::All;

    // Return the outcome instead of discarding it with `;`: the value handed
    // back to `Bencher::iter` is kept observable by the harness, so the
    // optimizer cannot remove the call that is being measured.
    bench.iter(|| alphabet_cyrillic_calculate_scores(&lowercase_text, &filter))
}

// Collect every benchmark function of this file into the `benches` group and
// pass that group to `benchmark_main!` (presumably the macro that generates
// the benchmark binary's entry point — see the `bencher` crate docs).
benchmark_group!(
benches,
bench_detect,
bench_detect_script,
bench_alphabet_latin_calculate_scores,
bench_alphabet_cyrillic_calculate_scores,
);
benchmark_main!(benches);
107 changes: 107 additions & 0 deletions src/alphabets/common.rs
@@ -0,0 +1,107 @@
//! It's a hard-core optimized implementation of a relatively simple algorithm.
//! The explanation of the algorithm can be found in the parent module [crate::alphabets].

use super::RawOutcome;
use crate::core::{FilterList, LowercaseText};
use crate::utils::is_stop_char;
use crate::{Lang, Script};
use once_cell::sync::Lazy;
use std::cmp::Reverse;
use std::collections::HashMap;

/// Builds an inverted index from characters to the languages that use them.
///
/// `alphabets` pairs each language with the string of letters in its alphabet.
///
/// Returns two parallel vectors of equal length:
/// * the distinct characters found in any alphabet, sorted in ascending order;
/// * for the character at the same index, the languages whose alphabet
///   contains it, in the order those languages appear in `alphabets`.
pub fn build_inverted_map(alphabets: &[(Lang, &str)]) -> (Vec<char>, Vec<Vec<Lang>>) {
    let mut langs_by_char: HashMap<char, Vec<Lang>> = HashMap::new();
    for &(lang, alphabet) in alphabets {
        for letter in alphabet.chars() {
            langs_by_char.entry(letter).or_default().push(lang);
        }
    }

    // Keys of the map are unique, so an unstable sort gives a deterministic order.
    let mut pairs: Vec<(char, Vec<Lang>)> = langs_by_char.into_iter().collect();
    pairs.sort_unstable_by_key(|&(letter, _)| letter);

    // Split the sorted pairs into the two parallel vectors.
    pairs.into_iter().unzip()
}

/// Scores every language of `script` by how well `text` fits its alphabet.
///
/// * `script` - the script whose languages are scored.
/// * `lang_map` - inverted map as produced by `build_inverted_map`: sorted
///   characters plus, at the same index, the languages using each character.
/// * `text` - the lowercased text to analyse.
/// * `filter_list` - only languages allowed by this filter appear in the result.
///
/// Every character that is not a stop character counts once towards
/// `count`. A language's raw score is `2 * (counted characters found in its
/// alphabet) - count`, clamped at zero. The returned `raw_scores` are sorted by
/// descending score and `scores` holds the same entries divided by `count`.
/// When a raw score is zero its normalized score is `0.0`; in particular a text
/// without any counted character yields `0.0` for every language rather than
/// the `NaN` that `0.0 / 0.0` would produce.
pub fn generic_alphabet_calculate_scores(
    script: Script,
    lang_map: &Lazy<(Vec<char>, Vec<Vec<Lang>>)>,
    text: &LowercaseText,
    filter_list: &FilterList,
) -> RawOutcome {
    let (chars, langs) = &**lang_map;
    let script_langs = script.langs();

    // Score of each known character, indexed like `chars`.
    let mut char_scores = vec![0usize; chars.len()];
    let mut max_raw_score = 0usize;
    for ch in text.chars().filter(|&ch| !is_stop_char(ch)) {
        max_raw_score += 1;

        if let Ok(position) = chars.binary_search(&ch) {
            // Add 2 here and subtract `max_raw_score` at the end: a hit is worth
            // +1 and a miss -1 without having to visit every language per char.
            char_scores[position] += 2;
        }
    }

    // Score of each language, indexed by the language's discriminant.
    let mut lang_scores = vec![0usize; Lang::all().len()];
    let mut common_score = 0usize;
    for (languages, char_score) in langs.iter().zip(char_scores) {
        if char_score == 0 {
            continue;
        }
        if languages.len() == script_langs.len() {
            // The character belongs to every language of the script: account
            // for it once instead of updating each language separately.
            common_score += char_score;
        } else {
            for &lang in languages {
                lang_scores[lang as usize] += char_score;
            }
        }
    }

    // Attach the final score to each allowed language of the script.
    let mut raw_scores: Vec<(Lang, usize)> = script_langs
        .iter()
        .filter(|&&l| filter_list.is_allowed(l))
        .map(|&l| {
            let score = (lang_scores[l as usize] + common_score).saturating_sub(max_raw_score);
            (l, score)
        })
        .collect();

    raw_scores.sort_unstable_by_key(|&(_, score)| Reverse(score));

    let scores = raw_scores
        .iter()
        .map(|&(lang, raw_score)| {
            // Avoid division by zero: a positive raw score implies that at
            // least one character was counted, so only the zero case needs care.
            let normalized_score = if raw_score == 0 {
                0.0
            } else {
                raw_score as f64 / max_raw_score as f64
            };
            (lang, normalized_score)
        })
        .collect();

    RawOutcome {
        count: max_raw_score,
        raw_scores,
        scores,
    }
}
146 changes: 20 additions & 126 deletions src/alphabets/cyrillic.rs
@@ -1,8 +1,8 @@
use std::cmp::Reverse;

use super::common::{build_inverted_map, generic_alphabet_calculate_scores};
use super::RawOutcome;
use crate::core::{FilterList, LowercaseText};
use crate::{Lang, Script};
use once_cell::sync::Lazy;

const BUL: &str = "абвгдежзийклмнопрстуфхцчшщъьюя";
const RUS: &str = "абвгдежзийклмнопрстуфхцчшщъыьэюяё";
Expand All @@ -11,142 +11,36 @@ const BEL: &str = "абвгдежзйклмнопрстуфхцчшыьэюяё
const SRP: &str = "абвгдежзиклмнопрстуфхцчшђјљњћџ";
const MKD: &str = "абвгдежзиклмнопрстуфхцчшѓѕјљњќџ";

const ALL: &str = "абвгдежзийклмнопрстуфхцчшщъыьэюяёєіїґўђјљњћџѓѕќ";

pub fn alphabet_calculate_scores(text: &LowercaseText, filter_list: &FilterList) -> RawOutcome {
let mut raw_scores: Vec<(Lang, i32)> = Script::Cyrillic
.langs()
.iter()
.filter(|&&l| filter_list.is_allowed(l))
.map(|&l| (l, 0i32))
.collect();

let max_raw_score = text.chars().filter(|&ch| is_relevant(ch)).count();

for (lang, score) in &mut raw_scores {
let alphabet = get_lang_chars(*lang);
/// Cyrillic-script languages paired with the string of letters of their
/// alphabet; this is the input from which the inverted character map is built.
const CYRILLIC_ALPHABETS: &[(Lang, &str)] = &[
(Lang::Bul, BUL),
(Lang::Rus, RUS),
(Lang::Ukr, UKR),
(Lang::Bel, BEL),
(Lang::Srp, SRP),
(Lang::Mkd, MKD),
];

for ch in text.chars() {
if !is_relevant(ch) {
continue;
} else if alphabet.contains(&ch) {
*score += 1;
} else {
*score -= 1;
}
}
}

raw_scores.sort_unstable_by_key(|(_, score)| Reverse(*score));

let raw_scores: Vec<(Lang, usize)> = raw_scores
.into_iter()
.map(|(l, s)| {
let score = if s < 0 { 0usize } else { s as usize };
(l, score)
})
.collect();

let mut normalized_scores = vec![];

for &(lang, raw_score) in &raw_scores {
// avoid devision by zero
let normalized_score = if raw_score == 0 {
0.0
} else {
raw_score as f64 / max_raw_score as f64
};
normalized_scores.push((lang, normalized_score));
}
/// Inverted map binding each Cyrillic character to the set of languages whose
/// alphabet contains it. Built from `CYRILLIC_ALPHABETS` through a `Lazy`
/// initializer, so the map is constructed once and then reused.
static CYRILLIC_ALPHABET_LANG_MAP: Lazy<(Vec<char>, Vec<Vec<Lang>>)> =
Lazy::new(|| build_inverted_map(CYRILLIC_ALPHABETS));

RawOutcome {
count: max_raw_score,
raw_scores,
scores: normalized_scores,
}
}

/// Tells whether `ch` is one of the characters listed in `ALL`.
fn is_relevant(ch: char) -> bool {
    ALL.contains(ch)
}

fn get_lang_chars(lang: Lang) -> Vec<char> {
let alphabet = match lang {
Lang::Bul => BUL,
Lang::Rus => RUS,
Lang::Ukr => UKR,
Lang::Bel => BEL,
Lang::Srp => SRP,
Lang::Mkd => MKD,

_ => panic!("No alphabet for {}", lang),
};
alphabet.chars().collect()
/// Calculates alphabet-based scores of `text` for the Cyrillic script.
///
/// Thin wrapper: delegates to `generic_alphabet_calculate_scores` with
/// `Script::Cyrillic` and the Cyrillic inverted map, forwarding `text` and
/// `filter_list` unchanged and returning its `RawOutcome`.
pub fn alphabet_calculate_scores(text: &LowercaseText, filter_list: &FilterList) -> RawOutcome {
generic_alphabet_calculate_scores(
Script::Cyrillic,
&CYRILLIC_ALPHABET_LANG_MAP,
text,
filter_list,
)
}

#[cfg(test)]
mod tests {
use super::*;

// The six languages the tests below expect an entry for in every outcome.
const CYRILLIC_LANGS: [Lang; 6] = [
Lang::Rus,
Lang::Ukr,
Lang::Srp,
Lang::Bel,
Lang::Mkd,
Lang::Bul,
];

/// Returns the value stored next to `lang` in `scores`.
///
/// Panics when `scores` has no entry for `lang`.
fn fetch<T: Copy>(lang: &Lang, scores: &[(Lang, T)]) -> T {
    let &(_, value) = scores.iter().find(|entry| entry.0 == *lang).unwrap();
    value
}

// Latin-only input: the test expects a zero character count, one entry per
// Cyrillic language in both score lists, and zero raw and normalized scores.
#[test]
fn test_when_latin_is_given() {
let text = LowercaseText::new("Foobar, hoh");
let RawOutcome {
count,
raw_scores,
scores,
} = alphabet_calculate_scores(&text, &FilterList::default());

// No character of the input is expected to be counted.
assert_eq!(count, 0);
assert_eq!(raw_scores.len(), CYRILLIC_LANGS.len());
assert_eq!(scores.len(), CYRILLIC_LANGS.len());

// Every language must be present with a raw score of zero...
for lang in &CYRILLIC_LANGS {
let raw_score = fetch(lang, &raw_scores);
assert_eq!(raw_score, 0);
}

// ...and with a normalized score of exactly 0.0.
for lang in &CYRILLIC_LANGS {
let score = fetch(lang, &scores);
assert_eq!(score, 0.0);
}
}

// Four Cyrillic letters (presumably common to all six alphabets, as the test
// name suggests) followed by Latin "ww": the test expects a count of 4 and the
// maximum raw and normalized score for every language.
#[test]
fn test_when_common_cyrllic_is_given() {
let text = LowercaseText::new("абвг ww");
let RawOutcome {
count,
raw_scores,
scores,
} = alphabet_calculate_scores(&text, &FilterList::default());

// Only the four Cyrillic letters are expected to be counted.
assert_eq!(count, 4);

// Each language is expected to match all four counted letters.
for lang in &CYRILLIC_LANGS {
let raw_score = fetch(lang, &raw_scores);
assert_eq!(raw_score, 4);
}

// 4 matches out of 4 counted characters normalizes to 1.0.
for lang in &CYRILLIC_LANGS {
let score = fetch(lang, &scores);
assert_eq!(score, 1.0);
}
}

#[test]
fn test_when_ukrainian_specific_chars_given() {
let text = LowercaseText::new("Дуже цікаво");
Expand Down