Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #116 from greyblake/optimize-alphabet-cyrillic
Optimize alphabet cyrillic
- Loading branch information
Showing
11 changed files
with
527 additions
and
533 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
watch: | ||
cargo watch -x test | ||
watch-doc: | ||
cargo watch -s 'cargo doc --no-deps --all-features --document-private-items' | ||
doc: | ||
cargo doc --no-deps --all-features --document-private-items --open | ||
bench: | ||
cargo bench --all-features | ||
test: | ||
cargo test --all-features |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
//! It's a hard-core optimized implementation of a relatively simple algorithm. | ||
//! The explanation of the algorithm can be found in the parent module [crate::alphabets]. | ||
|
||
use super::RawOutcome; | ||
use crate::core::{FilterList, LowercaseText}; | ||
use crate::utils::is_stop_char; | ||
use crate::{Lang, Script}; | ||
use once_cell::sync::Lazy; | ||
use std::cmp::Reverse; | ||
use std::collections::HashMap; | ||
|
||
/// Inverted map binding a character to a set of languages. | ||
pub fn build_inverted_map(alphabets: &[(Lang, &str)]) -> (Vec<char>, Vec<Vec<Lang>>) { | ||
let mut map = HashMap::new(); | ||
|
||
for (lang, alphabet) in alphabets { | ||
for c in alphabet.chars() { | ||
let entry = map.entry(c).or_insert_with(Vec::new); | ||
entry.push(*lang); | ||
} | ||
} | ||
|
||
let mut char_lang: Vec<_> = map.into_iter().collect(); | ||
|
||
char_lang.sort_unstable_by_key(|(c, _)| *c); | ||
|
||
let mut chars = Vec::with_capacity(char_lang.len()); | ||
let mut langs = Vec::with_capacity(char_lang.len()); | ||
for (ch, languages) in char_lang { | ||
chars.push(ch); | ||
langs.push(languages); | ||
} | ||
|
||
(chars, langs) | ||
} | ||
|
||
pub fn generic_alphabet_calculate_scores( | ||
script: Script, | ||
lang_map: &Lazy<(Vec<char>, Vec<Vec<Lang>>)>, | ||
text: &LowercaseText, | ||
filter_list: &FilterList, | ||
) -> RawOutcome { | ||
let (chars, langs) = &**lang_map; | ||
let script_langs = script.langs(); | ||
|
||
// score of each character. | ||
let mut char_scores = vec![0; chars.len()]; | ||
let mut max_raw_score = 0; | ||
// iterate over the text and scores characters. | ||
for ch in text.chars() { | ||
if is_stop_char(ch) { | ||
continue; | ||
} | ||
|
||
max_raw_score += 1; | ||
|
||
if let Ok(position) = chars.binary_search(&ch) { | ||
// add 2 and remove max_raw_score at the end, | ||
// to keep the score interval of -max_raw_score..max_raw_score | ||
char_scores[position] += 2; | ||
} | ||
} | ||
|
||
// score of each lang. | ||
let mut lang_scores = vec![0; Lang::all().len()]; | ||
let mut common_score: usize = 0; | ||
// iterate over scored characters to compute language's scores. | ||
for (position, char_score) in char_scores.into_iter().enumerate() { | ||
if char_score > 0 { | ||
let languages = &langs[position]; | ||
// if current character is common to all Languages, increment a common score | ||
// instead of iterating over all Languages scores. | ||
if languages.len() == script_langs.len() { | ||
common_score += char_score; | ||
} else { | ||
for &lang in languages { | ||
lang_scores[lang as usize] += char_score; | ||
} | ||
} | ||
} | ||
} | ||
|
||
// remap languages with theirs scores. | ||
let mut raw_scores: Vec<(Lang, usize)> = script_langs | ||
.iter() | ||
.filter(|&&l| filter_list.is_allowed(l)) | ||
.map(|&l| { | ||
let score = (lang_scores[l as usize] + common_score).saturating_sub(max_raw_score); | ||
(l, score) | ||
}) | ||
.collect(); | ||
|
||
raw_scores.sort_unstable_by_key(|(_, score)| Reverse(*score)); | ||
|
||
let mut normalized_scores = vec![]; | ||
|
||
for &(lang, raw_score) in raw_scores.iter() { | ||
let normalized_score = raw_score as f64 / max_raw_score as f64; | ||
normalized_scores.push((lang, normalized_score)); | ||
} | ||
|
||
RawOutcome { | ||
count: max_raw_score, | ||
raw_scores, | ||
scores: normalized_scores, | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.