Skip to content

Commit

Permalink
Merge pull request #116 from greyblake/optimize-alphabet-cyrillic
Browse files Browse the repository at this point in the history
Optimize alphabet cyrillic
  • Loading branch information
greyblake committed May 1, 2022
2 parents 879b654 + 23d3908 commit 8a23a98
Show file tree
Hide file tree
Showing 11 changed files with 527 additions and 533 deletions.
10 changes: 10 additions & 0 deletions Makefile
@@ -0,0 +1,10 @@
# Every target below is a command alias, not a file to build. Declaring them
# phony keeps them working even if a file or directory with the same name
# (e.g. `doc` or `test`) ever appears in the repository root.
.PHONY: watch watch-doc doc bench test

# Re-run the test suite whenever a source file changes.
watch:
	cargo watch -x test

# Rebuild the documentation (including private items) on every change.
watch-doc:
	cargo watch -s 'cargo doc --no-deps --all-features --document-private-items'

# Build the documentation (including private items) and open it.
doc:
	cargo doc --no-deps --all-features --document-private-items --open

# Run the performance benchmarks.
bench:
	cargo bench --all-features

# Run the test suite with all features enabled.
test:
	cargo test --all-features
11 changes: 5 additions & 6 deletions README.md
Expand Up @@ -91,13 +91,12 @@ This function is a hyperbola and it looks like the following one:

For more details, please check a blog article [Introduction to Rust Whatlang Library and Natural Language Identification Algorithms](https://www.greyblake.com/blog/2017-07-30-introduction-to-rust-whatlang-library-and-natural-language-identification-algorithms/).

## Running benchmarks
## Make tasks

This is mostly useful to test performance optimizations.

```
cargo bench
```
* `make bench` - run performance benchmarks
* `make doc` - generate and open doc
* `make test` - run tests
* `make watch` - watch changes and run tests

## Comparison with alternatives

Expand Down
33 changes: 30 additions & 3 deletions benches/example.rs
@@ -1,10 +1,11 @@
#[macro_use]
extern crate bencher;
extern crate serde_json;
extern crate whatlang;

use bencher::Bencher;
use std::collections::HashMap;
use whatlang::dev::{
alphabet_cyrillic_calculate_scores, alphabet_latin_calculate_scores, FilterList, LowercaseText,
};
use whatlang::{detect, detect_script};

fn bench_detect(bench: &mut Bencher) {
Expand All @@ -29,5 +30,31 @@ fn bench_detect_script(bench: &mut Bencher) {
})
}

benchmark_group!(benches, bench_detect, bench_detect_script);
/// Benchmarks `alphabet_latin_calculate_scores` on a fixed Latin-script sample text.
///
/// The text is lowercased once up front and every language is allowed
/// (`FilterList::All`), so only the scoring itself is measured.
fn bench_alphabet_latin_calculate_scores(bench: &mut Bencher) {
    let text = "Ich sehe auf die Uhr. Es ist kurz vor Mittag, und da heute Sonnabend ist, mache ich Schluß. Por ke lingvo internacia povu bone kaj regule progresadi kaj por ke ĝi havu plenan certecon, ke ĝi neniam disfalos kaj ia facilanima paŝo de ĝiaj amikoj estontaj ne detruos la laborojn de ĝiaj amikoj estintaj, - estas plej necesa antaŭ ĉio unu kondiĉo: la ezistado de klare difinita, neniam tuŝebla kaj neniam ŝangebla Fundamento de la lingvo.";
    let lowercase_text = LowercaseText::new(text);
    let filter = FilterList::All;

    // Return the outcome from the closure instead of discarding it: the value
    // handed back to `Bencher::iter` is kept observable, so the optimizer
    // cannot treat the measured call as dead code.
    bench.iter(|| alphabet_latin_calculate_scores(&lowercase_text, &filter))
}

/// Benchmarks `alphabet_cyrillic_calculate_scores` on a fixed Cyrillic-script sample text.
///
/// The text is lowercased once up front and every language is allowed
/// (`FilterList::All`), so only the scoring itself is measured.
fn bench_alphabet_cyrillic_calculate_scores(bench: &mut Bencher) {
    let text = "Творець есперанто Людвік Заменгоф назвав свою мову просто Lingvo internacia «міжнародна мова». Оскільки на той час у Європі популярною була інша штучна мова — волапюк, прихильники есперанто часто казали «мова доктора Есперанто». Згодом це формулювання скоротилося до «мова Есперанто», а врешті-решт залишилося одне лише слово «Esperanto», яке есперантською пишуть з великої літери, аби його можна було відрізнити від слова «людина, яка сподівається»";
    let lowercase_text = LowercaseText::new(text);
    let filter = FilterList::All;

    // Return the outcome from the closure instead of discarding it: the value
    // handed back to `Bencher::iter` is kept observable, so the optimizer
    // cannot treat the measured call as dead code.
    bench.iter(|| alphabet_cyrillic_calculate_scores(&lowercase_text, &filter))
}

benchmark_group!(
benches,
bench_detect,
bench_detect_script,
bench_alphabet_latin_calculate_scores,
bench_alphabet_cyrillic_calculate_scores,
);
benchmark_main!(benches);
107 changes: 107 additions & 0 deletions src/alphabets/common.rs
@@ -0,0 +1,107 @@
//! A heavily optimized implementation of a relatively simple algorithm.
//! The algorithm itself is explained in the parent module [crate::alphabets].

use super::RawOutcome;
use crate::core::{FilterList, LowercaseText};
use crate::utils::is_stop_char;
use crate::{Lang, Script};
use once_cell::sync::Lazy;
use std::cmp::Reverse;
use std::collections::HashMap;

/// Builds an inverted index binding each character to the languages that use it.
///
/// The result is a pair of parallel vectors:
/// * the distinct characters found across all `alphabets`, sorted ascending
///   (which makes the vector suitable for `binary_search`);
/// * for the character at the same index, the languages whose alphabet
///   contains it, in the order those languages appear in `alphabets`.
pub fn build_inverted_map(alphabets: &[(Lang, &str)]) -> (Vec<char>, Vec<Vec<Lang>>) {
    let mut langs_by_char: HashMap<char, Vec<Lang>> = HashMap::new();

    for &(lang, alphabet) in alphabets {
        for letter in alphabet.chars() {
            langs_by_char.entry(letter).or_default().push(lang);
        }
    }

    // Hash map iteration order is arbitrary, so order the entries by character
    // before splitting them into the two parallel vectors.
    let mut entries: Vec<(char, Vec<Lang>)> = langs_by_char.into_iter().collect();
    entries.sort_unstable_by_key(|entry| entry.0);

    entries.into_iter().unzip()
}

/// Scores the languages of `script` by how well their alphabets cover `text`.
///
/// * `script` - its `langs()` are the candidate languages.
/// * `lang_map` - inverted map as produced by [`build_inverted_map`]: sorted
///   characters plus, at the same index, the languages using each character.
/// * `text` - the text to score; characters for which `is_stop_char` returns
///   `true` are ignored.
/// * `filter_list` - only languages it allows appear in the result.
///
/// For every counted character a language gains one point if the character is
/// in its alphabet and loses one point otherwise; negative totals are clamped
/// to zero.
///
/// The returned [`RawOutcome`] holds:
/// * `count` - the number of counted (non-stop) characters;
/// * `raw_scores` - the per-language totals, sorted in descending order;
/// * `scores` - the same totals divided by `count`, or `0.0` for every
///   language when nothing was counted.
pub fn generic_alphabet_calculate_scores(
    script: Script,
    lang_map: &Lazy<(Vec<char>, Vec<Vec<Lang>>)>,
    text: &LowercaseText,
    filter_list: &FilterList,
) -> RawOutcome {
    let (chars, langs) = &**lang_map;
    let script_langs = script.langs();

    // Score of each known character, indexed like `chars`.
    let mut char_scores = vec![0; chars.len()];
    let mut max_raw_score = 0;
    // Iterate over the text and score its characters.
    for ch in text.chars() {
        if is_stop_char(ch) {
            continue;
        }

        max_raw_score += 1;

        if let Ok(position) = chars.binary_search(&ch) {
            // Add 2 here and subtract `max_raw_score` at the end: a matching
            // character then nets +1 and a non-matching one nets -1, without
            // having to visit every language for every character.
            char_scores[position] += 2;
        }
    }

    // Score of each language, indexed by the language's discriminant.
    let mut lang_scores = vec![0; Lang::all().len()];
    let mut common_score: usize = 0;
    // Fold the per-character scores into per-language scores.
    for (position, char_score) in char_scores.into_iter().enumerate() {
        if char_score > 0 {
            let languages = &langs[position];
            // A character used by as many languages as the script has is
            // common to all of them: accumulate it once in `common_score`
            // instead of updating every language's score.
            if languages.len() == script_langs.len() {
                common_score += char_score;
            } else {
                for &lang in languages {
                    lang_scores[lang as usize] += char_score;
                }
            }
        }
    }

    // Pair each allowed language with its final score.
    let mut raw_scores: Vec<(Lang, usize)> = script_langs
        .iter()
        .filter(|&&l| filter_list.is_allowed(l))
        .map(|&l| {
            // `saturating_sub` clamps languages with more misses than hits to 0.
            let score = (lang_scores[l as usize] + common_score).saturating_sub(max_raw_score);
            (l, score)
        })
        .collect();

    raw_scores.sort_unstable_by_key(|(_, score)| Reverse(*score));

    let mut normalized_scores = vec![];

    for &(lang, raw_score) in raw_scores.iter() {
        // Avoid division by zero: when no character was counted (empty or
        // stop-char-only text) `0 / 0` would produce NaN, so report 0.0.
        let normalized_score = if max_raw_score == 0 {
            0.0
        } else {
            raw_score as f64 / max_raw_score as f64
        };
        normalized_scores.push((lang, normalized_score));
    }

    RawOutcome {
        count: max_raw_score,
        raw_scores,
        scores: normalized_scores,
    }
}
146 changes: 20 additions & 126 deletions src/alphabets/cyrillic.rs
@@ -1,8 +1,8 @@
use std::cmp::Reverse;

use super::common::{build_inverted_map, generic_alphabet_calculate_scores};
use super::RawOutcome;
use crate::core::{FilterList, LowercaseText};
use crate::{Lang, Script};
use once_cell::sync::Lazy;

const BUL: &str = "абвгдежзийклмнопрстуфхцчшщъьюя";
const RUS: &str = "абвгдежзийклмнопрстуфхцчшщъыьэюяё";
Expand All @@ -11,142 +11,36 @@ const BEL: &str = "абвгдежзйклмнопрстуфхцчшыьэюяё
const SRP: &str = "абвгдежзиклмнопрстуфхцчшђјљњћџ";
const MKD: &str = "абвгдежзиклмнопрстуфхцчшѓѕјљњќџ";

const ALL: &str = "абвгдежзийклмнопрстуфхцчшщъыьэюяёєіїґўђјљњћџѓѕќ";

pub fn alphabet_calculate_scores(text: &LowercaseText, filter_list: &FilterList) -> RawOutcome {
let mut raw_scores: Vec<(Lang, i32)> = Script::Cyrillic
.langs()
.iter()
.filter(|&&l| filter_list.is_allowed(l))
.map(|&l| (l, 0i32))
.collect();

let max_raw_score = text.chars().filter(|&ch| is_relevant(ch)).count();

for (lang, score) in &mut raw_scores {
let alphabet = get_lang_chars(*lang);
const CYRILLIC_ALPHABETS: &[(Lang, &str)] = &[
(Lang::Bul, BUL),
(Lang::Rus, RUS),
(Lang::Ukr, UKR),
(Lang::Bel, BEL),
(Lang::Srp, SRP),
(Lang::Mkd, MKD),
];

for ch in text.chars() {
if !is_relevant(ch) {
continue;
} else if alphabet.contains(&ch) {
*score += 1;
} else {
*score -= 1;
}
}
}

raw_scores.sort_unstable_by_key(|(_, score)| Reverse(*score));

let raw_scores: Vec<(Lang, usize)> = raw_scores
.into_iter()
.map(|(l, s)| {
let score = if s < 0 { 0usize } else { s as usize };
(l, score)
})
.collect();

let mut normalized_scores = vec![];

for &(lang, raw_score) in &raw_scores {
// avoid division by zero
let normalized_score = if raw_score == 0 {
0.0
} else {
raw_score as f64 / max_raw_score as f64
};
normalized_scores.push((lang, normalized_score));
}
/// Inverted map binding a character to a set of languages.
static CYRILLIC_ALPHABET_LANG_MAP: Lazy<(Vec<char>, Vec<Vec<Lang>>)> =
Lazy::new(|| build_inverted_map(CYRILLIC_ALPHABETS));

RawOutcome {
count: max_raw_score,
raw_scores,
scores: normalized_scores,
}
}

fn is_relevant(ch: char) -> bool {
ALL.chars().any(|c| c == ch)
}

fn get_lang_chars(lang: Lang) -> Vec<char> {
let alphabet = match lang {
Lang::Bul => BUL,
Lang::Rus => RUS,
Lang::Ukr => UKR,
Lang::Bel => BEL,
Lang::Srp => SRP,
Lang::Mkd => MKD,

_ => panic!("No alphabet for {}", lang),
};
alphabet.chars().collect()
/// Calculates alphabet-based scores of `text` for the Cyrillic script.
///
/// Delegates to `generic_alphabet_calculate_scores`, passing `Script::Cyrillic`
/// and the lazily built `CYRILLIC_ALPHABET_LANG_MAP`; `text` and `filter_list`
/// are forwarded unchanged and the resulting `RawOutcome` is returned as is.
pub fn alphabet_calculate_scores(text: &LowercaseText, filter_list: &FilterList) -> RawOutcome {
generic_alphabet_calculate_scores(
Script::Cyrillic,
&CYRILLIC_ALPHABET_LANG_MAP,
text,
filter_list,
)
}

#[cfg(test)]
mod tests {
use super::*;

const CYRILLIC_LANGS: [Lang; 6] = [
Lang::Rus,
Lang::Ukr,
Lang::Srp,
Lang::Bel,
Lang::Mkd,
Lang::Bul,
];

fn fetch<T: Copy>(lang: &Lang, scores: &[(Lang, T)]) -> T {
scores.iter().find(|(l, _)| l == lang).unwrap().1
}

#[test]
fn test_when_latin_is_given() {
let text = LowercaseText::new("Foobar, hoh");
let RawOutcome {
count,
raw_scores,
scores,
} = alphabet_calculate_scores(&text, &FilterList::default());

assert_eq!(count, 0);
assert_eq!(raw_scores.len(), CYRILLIC_LANGS.len());
assert_eq!(scores.len(), CYRILLIC_LANGS.len());

for lang in &CYRILLIC_LANGS {
let raw_score = fetch(lang, &raw_scores);
assert_eq!(raw_score, 0);
}

for lang in &CYRILLIC_LANGS {
let score = fetch(lang, &scores);
assert_eq!(score, 0.0);
}
}

#[test]
fn test_when_common_cyrllic_is_given() {
let text = LowercaseText::new("абвг ww");
let RawOutcome {
count,
raw_scores,
scores,
} = alphabet_calculate_scores(&text, &FilterList::default());

assert_eq!(count, 4);

for lang in &CYRILLIC_LANGS {
let raw_score = fetch(lang, &raw_scores);
assert_eq!(raw_score, 4);
}

for lang in &CYRILLIC_LANGS {
let score = fetch(lang, &scores);
assert_eq!(score, 1.0);
}
}

#[test]
fn test_when_ukrainian_specific_chars_given() {
let text = LowercaseText::new("Дуже цікаво");
Expand Down

0 comments on commit 8a23a98

Please sign in to comment.