Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Use optimized alphabet score calc for Cyrillic #111
- Loading branch information
Showing
4 changed files
with
135 additions
and
219 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
use std::cmp::Reverse; | ||
use std::collections::HashMap; | ||
|
||
use once_cell::sync::Lazy; | ||
|
||
use super::RawOutcome; | ||
use crate::core::{FilterList, LowercaseText}; | ||
use crate::utils::is_stop_char; | ||
use crate::{Lang, Script}; | ||
|
||
/// Inverted map binding a character to a set of languages. | ||
pub fn build_inverted_map(alphabets: &[(Lang, &str)]) -> (Vec<char>, Vec<Vec<Lang>>) { | ||
let mut map = HashMap::new(); | ||
|
||
for (lang, alphabet) in alphabets { | ||
for c in alphabet.chars() { | ||
let entry = map.entry(c).or_insert_with(Vec::new); | ||
entry.push(*lang); | ||
} | ||
} | ||
|
||
let mut char_lang: Vec<_> = map.into_iter().collect(); | ||
|
||
char_lang.sort_unstable_by_key(|(c, _)| *c); | ||
|
||
let mut chars = Vec::with_capacity(char_lang.len()); | ||
let mut langs = Vec::with_capacity(char_lang.len()); | ||
for (ch, languages) in char_lang { | ||
chars.push(ch); | ||
langs.push(languages); | ||
} | ||
|
||
(chars, langs) | ||
} | ||
|
||
pub fn generic_alphabet_calculate_scores( | ||
script: Script, | ||
lang_map: &Lazy<(Vec<char>, Vec<Vec<Lang>>)>, | ||
text: &LowercaseText, | ||
filter_list: &FilterList, | ||
) -> RawOutcome { | ||
let (chars, langs) = &**lang_map; | ||
let script_langs = script.langs(); | ||
|
||
// score of each character. | ||
let mut char_scores = vec![0; chars.len()]; | ||
let mut max_raw_score = 0; | ||
// iterate over the text and scores characters. | ||
for ch in text.chars() { | ||
if is_stop_char(ch) { | ||
continue; | ||
} | ||
|
||
max_raw_score += 1; | ||
|
||
if let Ok(position) = chars.binary_search(&ch) { | ||
// add 2 and remove max_raw_score at the end, | ||
// to keep the score interval of -max_raw_score..max_raw_score | ||
char_scores[position] += 2; | ||
} | ||
} | ||
|
||
// score of each lang. | ||
let mut lang_scores = vec![0; Lang::all().len()]; | ||
let mut common_score: usize = 0; | ||
// iterate over scored characters to compute language's scores. | ||
for (position, char_score) in char_scores.into_iter().enumerate() { | ||
if char_score > 0 { | ||
let languages = &langs[position]; | ||
// if current character is common to all Languages, increment a common score | ||
// instead of iterating over all Languages scores. | ||
if languages.len() == script_langs.len() { | ||
common_score += char_score; | ||
} else { | ||
for &lang in languages { | ||
lang_scores[lang as usize] += char_score; | ||
} | ||
} | ||
} | ||
} | ||
|
||
// remap languages with theirs scores. | ||
let mut raw_scores: Vec<(Lang, usize)> = script_langs | ||
.iter() | ||
.filter(|&&l| filter_list.is_allowed(l)) | ||
.map(|&l| { | ||
let score = (lang_scores[l as usize] + common_score).saturating_sub(max_raw_score); | ||
(l, score) | ||
}) | ||
.collect(); | ||
|
||
raw_scores.sort_unstable_by_key(|(_, score)| Reverse(*score)); | ||
|
||
let mut normalized_scores = vec![]; | ||
|
||
for &(lang, raw_score) in raw_scores.iter() { | ||
let normalized_score = raw_score as f64 / max_raw_score as f64; | ||
normalized_scores.push((lang, normalized_score)); | ||
} | ||
|
||
RawOutcome { | ||
count: max_raw_score, | ||
raw_scores, | ||
scores: normalized_scores, | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.