Skip to content

Commit

Permalink
More emoji
Browse files Browse the repository at this point in the history
  • Loading branch information
kornelski committed Aug 21, 2022
1 parent e452c05 commit e43f75d
Show file tree
Hide file tree
Showing 6 changed files with 35 additions and 6 deletions.
5 changes: 3 additions & 2 deletions Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "deunicode"
version = "1.3.1"
version = "1.3.2"
authors = ["Kornel Lesinski <kornel@geekhood.net>", "Amit Chowdhury <amitc97@gmail.com>"]
description = "Convert Unicode strings to pure ASCII by intelligently transliterating them. Supports Emoji and Chinese."
documentation = "https://docs.rs/deunicode"
Expand All @@ -9,7 +9,7 @@ homepage = "https://lib.rs/crates/deunicode"
repository = "https://github.com/kornelski/deunicode/"
readme = "README.md"
include = ["src/*", "Cargo.toml", "README.md"]
edition = "2018"
edition = "2021"

keywords = [
"unidecode",
Expand All @@ -26,6 +26,7 @@ alloc = []

[badges]
travis-ci = { repository = "kornelski/deunicode" }
maintenance = { status = "actively-developed" }

[package.metadata.docs.rs]
targets = ["x86_64-unknown-linux-gnu"]
1 change: 1 addition & 0 deletions README.md
Expand Up @@ -49,6 +49,7 @@ Unicode data
------------
* [`Text::Unidecode`](http://search.cpan.org/~sburke/Text-Unidecode-1.30/lib/Text/Unidecode.pm) by Sean M. Burke
* [Unicodey](https://unicodey.com) by Cal Henderson
* [gh emoji](https://lib.rs/gh-emoji)
* [any_ascii](https://anyascii.com/)

For a detailed explanation on the rationale behind the original
Expand Down
3 changes: 2 additions & 1 deletion scripts/Cargo.toml
Expand Up @@ -2,7 +2,7 @@
name = "unidecode-compress"
version = "0.1.0"
authors = ["Kornel <kornel@geekhood.net>"]
edition = "2018"
edition = "2021"

[[bin]]
name = "compress"
Expand All @@ -13,3 +13,4 @@ serde = "1.0.104"
serde_json = "1.0.48"
serde_derive = "1.0.104"
any_ascii = "0.2.0"
emojis = "0.4.0"
30 changes: 28 additions & 2 deletions scripts/compress.rs
Expand Up @@ -82,21 +82,47 @@ fn main() {
if ch != "[?] " && ch != "[?]" {ch} else {UNKNOWN_CHAR}
}).collect();

if all_codepoints.len() < 140000 { all_codepoints.resize(140000, UNKNOWN_CHAR); }

all_codepoints['术' as usize] = "Shu ";
all_codepoints['价' as usize] = "Jia ";
all_codepoints['旅' as usize] = "Lv ";
all_codepoints['什' as usize] = "Shen ";
all_codepoints['么' as usize] = "Me ";
all_codepoints['❗' as usize] = "!";
all_codepoints['❕' as usize] = "!";
all_codepoints['❓' as usize] = "?";
all_codepoints['❔' as usize] = "?";
all_codepoints['➕' as usize] = "+";
all_codepoints['➖' as usize] = "-";
all_codepoints['➗' as usize] = "/";
all_codepoints['🟰' as usize] = "=";
all_codepoints['💲' as usize] = "$";
all_codepoints['💵' as usize] = "$";
all_codepoints['🌟' as usize] = "*";
all_codepoints['⭐' as usize] = "*";

for &(ch, ref name) in gemoji.iter().chain(emoji1.iter()).chain(emoji2.iter()) {
while all_codepoints.len() <= ch {
all_codepoints.push(UNKNOWN_CHAR);
if all_codepoints.len() <= ch {
all_codepoints.resize(ch as usize+1, UNKNOWN_CHAR);
}
if "" == all_codepoints[ch] || "[?]" == all_codepoints[ch] || UNKNOWN_CHAR == all_codepoints[ch] || name.len() < all_codepoints[ch].len() {
all_codepoints[ch] = name;
}
}

for (mut name, ch) in emojis::iter().filter(|e| e.as_str().chars().count() == 1)
.filter_map(|e| Some((e.shortcode().unwrap_or(e.name()), e.as_str().chars().next()? as usize))) {
if all_codepoints.len() <= ch {
all_codepoints.resize(ch as usize+1, UNKNOWN_CHAR);
}
if "" == all_codepoints[ch] || "[?]" == all_codepoints[ch] || UNKNOWN_CHAR == all_codepoints[ch] {
let new_name = format!("{} ", name.trim().replace('_', " ")).into_boxed_str();
name = Box::leak(new_name);
all_codepoints[ch] = name;
}
}

for (i, ch) in all_codepoints.iter_mut().enumerate().skip(255) {
if *ch == UNKNOWN_CHAR {
let any = std::char::from_u32(i as u32)
Expand Down
2 changes: 1 addition & 1 deletion src/mapping.txt

Large diffs are not rendered by default.

Binary file modified src/pointers.bin
Binary file not shown.

0 comments on commit e43f75d

Please sign in to comment.