Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Treat emoji presentation sequences as fullwidth #35

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
13 changes: 13 additions & 0 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,11 @@ on:
branches: [ "master" ]

env:
CARGO_INCREMENTAL: 0
CARGO_TERM_COLOR: always
RUST_BACKTRACE: 1
RUSTFLAGS: -D warnings
RUSTDOCFLAGS: -D warnings

jobs:
build:
Expand All @@ -18,10 +22,19 @@ jobs:
run: cargo build --verbose
- name: Run tests
run: cargo test --verbose
- name: Build docs
run: cargo doc
- name: Check formatting
run: cargo fmt --check
- name: Check clippy
run: cargo clippy --lib --tests
regen:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v5
with:
python-version: '3.12'
- name: Regen
run: cd scripts && python3 unicode.py
- name: Diff
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ Cargo.lock
scripts/tmp
scripts/*.txt
scripts/*.rs
bench_data/*
10 changes: 6 additions & 4 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,23 @@

name = "unicode-width"
version = "0.1.11"
authors = ["kwantam <kwantam@gmail.com>", "Manish Goregaokar <manishsmail@gmail.com>"]

authors = [
"kwantam <kwantam@gmail.com>",
"Manish Goregaokar <manishsmail@gmail.com>",
]
homepage = "https://github.com/unicode-rs/unicode-width"
repository = "https://github.com/unicode-rs/unicode-width"
documentation = "https://unicode-rs.github.io/unicode-width"
license = "MIT/Apache-2.0"
keywords = ["text", "width", "unicode"]
readme = "README.md"
edition = "2021"
description = """
Determine displayed width of `char` and `str` types
according to Unicode Standard Annex #11 rules.
"""

exclude = [ "target/*", "Cargo.lock" ]
exclude = ["target/*", "Cargo.lock"]

[dependencies]
std = { version = "1.0", package = "rustc-std-workspace-std", optional = true }
Expand All @@ -27,7 +30,6 @@ unicode-normalization = "0.1.23"

[features]
default = []
bench = []
rustc-dep-of-std = ['std', 'core', 'compiler_builtins']

# Legacy, now a no-op
Expand Down
113 changes: 113 additions & 0 deletions benches/benches.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
#![feature(test)]

extern crate test;

use std::iter;

use test::Bencher;

use unicode_width::{UnicodeWidthChar, UnicodeWidthStr};

/// Measures `UnicodeWidthChar::width`, invoked through the fully qualified
/// trait path, over each character of a 4096-character string of `'a'`.
#[bench]
fn cargo(b: &mut Bencher) {
    let text: String = iter::repeat('a').take(4096).collect();

    b.iter(|| {
        // `black_box` keeps the optimizer from discarding the width lookup.
        text.chars().for_each(|ch| {
            test::black_box(UnicodeWidthChar::width(ch));
        });
    });
}

/// Measures `char::width` invoked with method-call syntax over each character
/// of a 4096-character string of `'a'`.
///
/// NOTE(review): with `UnicodeWidthChar` imported, the method call presumably
/// resolves to the same trait method exercised by `cargo` — confirm whether a
/// separate "stdlib" baseline is still meaningful.
#[bench]
fn stdlib(b: &mut Bencher) {
    let text: String = iter::repeat('a').take(4096).collect();

    b.iter(|| {
        // `black_box` keeps the optimizer from discarding the width lookup.
        text.chars().for_each(|ch| {
            test::black_box(ch.width());
        });
    });
}

/// Measures `simple_width_if` over each character of a 4096-character string
/// of `'a'`.
#[bench]
fn simple_if(b: &mut Bencher) {
    let text: String = iter::repeat('a').take(4096).collect();

    b.iter(|| {
        // `black_box` keeps the optimizer from discarding the computed width.
        text.chars().for_each(|ch| {
            test::black_box(simple_width_if(ch));
        });
    });
}

/// Measures `simple_width_match` over each character of a 4096-character
/// string of `'a'`.
#[bench]
fn simple_match(b: &mut Bencher) {
    let text: String = iter::repeat('a').take(4096).collect();

    b.iter(|| {
        // `black_box` keeps the optimizer from discarding the computed width.
        text.chars().for_each(|ch| {
            test::black_box(simple_width_match(ch));
        });
    });
}

/// Width lookup with an `if`-based fast path for code points below 127.
///
/// Returns `Some(0)` for NUL, `None` for code points 1 through 31,
/// `Some(1)` for code points 32 through 126, and defers to
/// `UnicodeWidthChar::width` for everything from 127 upwards.
#[inline]
fn simple_width_if(c: char) -> Option<usize> {
    let code = c as u32;

    // Anything outside the fast-path range goes to the full lookup.
    if code >= 127 {
        return UnicodeWidthChar::width(c);
    }

    if code > 31 {
        Some(1)
    } else if code == 0 {
        Some(0)
    } else {
        None
    }
}

/// Width lookup with a `match`-based fast path for code points below 0x7f.
///
/// Returns `Some(0)` for NUL, `None` for code points 0x01 through 0x1f,
/// `Some(1)` for code points 0x20 through 0x7e, and defers to
/// `UnicodeWidthChar::width` for everything from 0x7f upwards.
#[inline]
fn simple_width_match(c: char) -> Option<usize> {
    match c as u32 {
        0 => Some(0),
        0x01..=0x1f => None,
        0x20..=0x7e => Some(1),
        _ => UnicodeWidthChar::width(c),
    }
}

/// Measures `UnicodeWidthStr::width` over the contents of `bench_data/enwik8`.
///
/// If the file cannot be read as a UTF-8 string, an empty string is measured
/// instead, so the benchmark still runs without the data set present.
#[bench]
fn enwik8(b: &mut Bencher) {
    // Data set: download and unzip `enwik8` from https://data.deepai.org/enwik8.zip
    // into the `bench_data` directory.
    let corpus = std::fs::read_to_string("bench_data/enwik8").unwrap_or_default();
    let text: &str = &corpus;

    b.iter(|| {
        let width = UnicodeWidthStr::width(text);
        test::black_box(width)
    });
}

/// Measures `UnicodeWidthStr::width` over the contents of
/// `bench_data/jawiki-20240201-pages-articles-multistream-index.txt`.
///
/// If the file cannot be read as a UTF-8 string, an empty string is measured
/// instead, so the benchmark still runs without the data set present.
#[bench]
fn jawiki(b: &mut Bencher) {
    // Data set: download and extract
    // `jawiki-20240201-pages-articles-multistream-index.txt` from
    // https://dumps.wikimedia.org/jawiki/20240201/jawiki-20240201-pages-articles-multistream-index.txt.bz2
    // into the `bench_data` directory.
    let corpus =
        std::fs::read_to_string("bench_data/jawiki-20240201-pages-articles-multistream-index.txt")
            .unwrap_or_default();
    let text: &str = &corpus;

    b.iter(|| {
        let width = UnicodeWidthStr::width(text);
        test::black_box(width)
    });
}

/// Measures `UnicodeWidthStr::width` over the contents of
/// `bench_data/emoji-style.txt`.
///
/// If the file cannot be read as a UTF-8 string, an empty string is measured
/// instead, so the benchmark still runs without the data set present.
#[bench]
fn emoji(b: &mut Bencher) {
    // Data set: download emoji-style.txt from
    // https://www.unicode.org/emoji/charts/emoji-style.txt into the
    // `bench_data` directory.
    let corpus = std::fs::read_to_string("bench_data/emoji-style.txt").unwrap_or_default();
    let text: &str = &corpus;

    b.iter(|| {
        let width = UnicodeWidthStr::width(text);
        test::black_box(width)
    });
}