diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 89c5f57..686b58f 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -7,7 +7,11 @@ on: branches: [ "master" ] env: + CARGO_INCREMENTAL: 0 CARGO_TERM_COLOR: always + RUST_BACKTRACE: 1 + RUSTFLAGS: -D warnings + RUSTDOCFLAGS: -D warnings jobs: build: @@ -18,6 +22,12 @@ jobs: run: cargo build --verbose - name: Run tests run: cargo test --verbose + - name: Build docs + run: cargo doc + - name: Check formatting + run: cargo fmt --check + - name: Check clippy + run: cargo clippy --lib --tests regen: runs-on: ubuntu-latest steps: diff --git a/.gitignore b/.gitignore index 2d7d550..12e0bd1 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ Cargo.lock scripts/tmp scripts/*.txt scripts/*.rs +bench_data/* diff --git a/Cargo.toml b/Cargo.toml index bd8da9c..b922c4c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,10 @@ name = "unicode-width" version = "0.1.11" -authors = ["kwantam ", "Manish Goregaokar "] +authors = [ + "kwantam ", + "Manish Goregaokar ", +] homepage = "https://github.com/unicode-rs/unicode-width" repository = "https://github.com/unicode-rs/unicode-width" @@ -14,8 +17,9 @@ description = """ Determine displayed width of `char` and `str` types according to Unicode Standard Annex #11 rules. """ +edition = "2021" -exclude = [ "target/*", "Cargo.lock" ] +exclude = ["target/*", "Cargo.lock"] [dependencies] std = { version = "1.0", package = "rustc-std-workspace-std", optional = true } @@ -27,7 +31,6 @@ unicode-normalization = "0.1.23" [features] default = [] -bench = [] rustc-dep-of-std = ['std', 'core', 'compiler_builtins'] # Legacy, now a no-op diff --git a/benches/benches.rs b/benches/benches.rs new file mode 100644 index 0000000..646d0e1 --- /dev/null +++ b/benches/benches.rs @@ -0,0 +1,106 @@ +// Copyright 2012-2015 The Rust Project Developers. 
See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. +#![feature(test)] + +extern crate test; + +use std::iter; + +use test::Bencher; + +use unicode_width::{UnicodeWidthChar, UnicodeWidthStr}; + +#[bench] +fn cargo(b: &mut Bencher) { + let string = iter::repeat('a').take(4096).collect::<String>(); + + b.iter(|| { + for c in string.chars() { + test::black_box(UnicodeWidthChar::width(c)); + } + }); +} + +#[bench] +#[allow(deprecated)] +fn stdlib(b: &mut Bencher) { + let string = iter::repeat('a').take(4096).collect::<String>(); + + b.iter(|| { + for c in string.chars() { + test::black_box(c.width()); + } + }); +} + +#[bench] +fn simple_if(b: &mut Bencher) { + let string = iter::repeat('a').take(4096).collect::<String>(); + + b.iter(|| { + for c in string.chars() { + test::black_box(simple_width_if(c)); + } + }); +} + +#[bench] +fn simple_match(b: &mut Bencher) { + let string = iter::repeat('a').take(4096).collect::<String>(); + + b.iter(|| { + for c in string.chars() { + test::black_box(simple_width_match(c)); + } + }); +} + +#[inline] +fn simple_width_if(c: char) -> Option<usize> { + let cu = c as u32; + if cu < 127 { + if cu > 31 { + Some(1) + } else if cu == 0 { + Some(0) + } else { + None + } + } else { + UnicodeWidthChar::width(c) + } +} + +#[inline] +fn simple_width_match(c: char) -> Option<usize> { + match c as u32 { + cu if cu == 0 => Some(0), + cu if cu < 0x20 => None, + cu if cu < 0x7f => Some(1), + _ => UnicodeWidthChar::width(c), + } +} + +#[bench] +fn enwik8(b: &mut Bencher) { + // To benchmark, download & unzip `enwik8` from https://data.deepai.org/enwik8.zip + let data_path = "bench_data/enwik8"; + let string = std::fs::read_to_string(data_path).unwrap_or_default(); + b.iter(|| test::black_box(UnicodeWidthStr::width(string.as_str()))); +} + 
+#[bench] +fn jawiki(b: &mut Bencher) { + // To benchmark, download & extract `jawiki-20220501-pages-articles-multistream-index.txt` from + // https://dumps.wikimedia.org/jawiki/20220501/jawiki-20220501-pages-articles-multistream-index.txt.bz2 + let data_path = "bench_data/jawiki-20220501-pages-articles-multistream-index.txt"; + let string = std::fs::read_to_string(data_path).unwrap_or_default(); + b.iter(|| test::black_box(UnicodeWidthStr::width(string.as_str()))); +} diff --git a/src/lib.rs b/src/lib.rs index 2f22613..8fe79e6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -47,24 +47,13 @@ html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png", html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png" )] -#![cfg_attr(feature = "bench", feature(test))] #![no_std] -#[cfg(test)] -#[macro_use] -extern crate std; - -#[cfg(feature = "bench")] -extern crate test; - use tables::charwidth as cw; pub use tables::UNICODE_VERSION; mod tables; -#[cfg(test)] -mod tests; - /// Methods for determining displayed width of Unicode characters. pub trait UnicodeWidthChar { /// Returns the character's displayed width in columns, or `None` if the diff --git a/src/tests.rs b/tests/tests.rs similarity index 62% rename from src/tests.rs rename to tests/tests.rs index 9e3805b..c02be4f 100644 --- a/src/tests.rs +++ b/tests/tests.rs @@ -8,112 +8,10 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. 
-#[cfg(feature = "bench")] -use super::{UnicodeWidthChar, UnicodeWidthStr}; -#[cfg(feature = "bench")] -use std::iter; -#[cfg(feature = "bench")] -use test::Bencher; +use unicode_width::{UnicodeWidthChar, UnicodeWidthStr}; -use std::prelude::v1::*; - -#[cfg(feature = "bench")] -#[bench] -fn cargo(b: &mut Bencher) { - let string = iter::repeat('a').take(4096).collect::<String>(); - - b.iter(|| { - for c in string.chars() { - test::black_box(UnicodeWidthChar::width(c)); - } - }); -} - -#[cfg(feature = "bench")] -#[bench] -#[allow(deprecated)] -fn stdlib(b: &mut Bencher) { - let string = iter::repeat('a').take(4096).collect::<String>(); - - b.iter(|| { - for c in string.chars() { - test::black_box(c.width()); - } - }); -} - -#[cfg(feature = "bench")] -#[bench] -fn simple_if(b: &mut Bencher) { - let string = iter::repeat('a').take(4096).collect::<String>(); - - b.iter(|| { - for c in string.chars() { - test::black_box(simple_width_if(c)); - } - }); -} - -#[cfg(feature = "bench")] -#[bench] -fn simple_match(b: &mut Bencher) { - let string = iter::repeat('a').take(4096).collect::<String>(); - - b.iter(|| { - for c in string.chars() { - test::black_box(simple_width_match(c)); - } - }); -} - -#[cfg(feature = "bench")] -#[inline] -fn simple_width_if(c: char) -> Option<usize> { - let cu = c as u32; - if cu < 127 { - if cu > 31 { - Some(1) - } else if cu == 0 { - Some(0) - } else { - None - } - } else { - UnicodeWidthChar::width(c) - } -} - -#[cfg(feature = "bench")] -#[inline] -fn simple_width_match(c: char) -> Option<usize> { - match c as u32 { - cu if cu == 0 => Some(0), - cu if cu < 0x20 => None, - cu if cu < 0x7f => Some(1), - _ => UnicodeWidthChar::width(c), - } -} -#[cfg(feature = "bench")] -#[bench] -fn enwik8(b: &mut Bencher) { - // To benchmark, download & unzip `enwik8` from https://data.deepai.org/enwik8.zip - let data_path = "bench_data/enwik8"; - let string = std::fs::read_to_string(data_path).unwrap_or_default(); - b.iter(|| test::black_box(UnicodeWidthStr::width(string.as_str()))); -} -#[cfg(feature = 
"bench")] -#[bench] -fn jawiki(b: &mut Bencher) { - // To benchmark, download & extract `jawiki-20220501-pages-articles-multistream-index.txt` from - // https://dumps.wikimedia.org/jawiki/20220501/jawiki-20220501-pages-articles-multistream-index.txt.bz2 - let data_path = "bench_data/jawiki-20220501-pages-articles-multistream-index.txt"; - let string = std::fs::read_to_string(data_path).unwrap_or_default(); - b.iter(|| test::black_box(UnicodeWidthStr::width(string.as_str()))); -} #[test] fn test_str() { - use super::UnicodeWidthStr; - assert_eq!(UnicodeWidthStr::width("ｈｅｌｌｏ"), 10); assert_eq!("ｈｅｌｌｏ".width_cjk(), 10); assert_eq!(UnicodeWidthStr::width("\0\0\0\x01\x01"), 0); @@ -130,8 +28,6 @@ fn test_str() { #[test] fn test_emoji() { // Example from the README. - use super::UnicodeWidthStr; - assert_eq!(UnicodeWidthStr::width("👩"), 2); // Woman assert_eq!(UnicodeWidthStr::width("🔬"), 2); // Microscope assert_eq!(UnicodeWidthStr::width("👩‍🔬"), 4); // Woman scientist @@ -139,8 +35,6 @@ fn test_emoji() { #[test] fn test_char() { - use super::UnicodeWidthChar; - assert_eq!(UnicodeWidthChar::width('ｈ'), Some(2)); assert_eq!('ｈ'.width_cjk(), Some(2)); assert_eq!(UnicodeWidthChar::width('\x00'), Some(0)); @@ -153,8 +47,6 @@ fn test_char() { #[test] fn test_char2() { - use super::UnicodeWidthChar; - assert_eq!(UnicodeWidthChar::width('\x00'), Some(0)); assert_eq!('\x00'.width_cjk(), Some(0)); @@ -182,15 +74,11 @@ fn test_char2() { #[test] fn unicode_12() { - use super::UnicodeWidthChar; - assert_eq!(UnicodeWidthChar::width('\u{1F971}'), Some(2)); } #[test] fn test_default_ignorable() { - use super::UnicodeWidthChar; - assert_eq!(UnicodeWidthChar::width('\u{E0000}'), Some(0)); assert_eq!(UnicodeWidthChar::width('\u{1160}'), Some(0)); @@ -200,8 +88,6 @@ fn test_default_ignorable() { #[test] fn test_jamo() { - use super::UnicodeWidthChar; - assert_eq!(UnicodeWidthChar::width('\u{1100}'), Some(2)); assert_eq!(UnicodeWidthChar::width('\u{A97C}'), Some(2)); // Special case: 
U+115F HANGUL CHOSEONG FILLER @@ -214,8 +100,6 @@ fn test_jamo() { #[test] fn test_prepended_concatenation_marks() { - use super::UnicodeWidthChar; - assert_eq!(UnicodeWidthChar::width('\u{0600}'), Some(1)); assert_eq!(UnicodeWidthChar::width('\u{070F}'), Some(1)); assert_eq!(UnicodeWidthChar::width('\u{08E2}'), Some(1)); @@ -224,8 +108,6 @@ fn test_prepended_concatenation_marks() { #[test] fn test_interlinear_annotation_chars() { - use super::UnicodeWidthChar; - assert_eq!(UnicodeWidthChar::width('\u{FFF9}'), Some(1)); assert_eq!(UnicodeWidthChar::width('\u{FFFA}'), Some(1)); assert_eq!(UnicodeWidthChar::width('\u{FFFB}'), Some(1)); @@ -233,8 +115,6 @@ fn test_interlinear_annotation_chars() { #[test] fn test_hieroglyph_format_controls() { - use super::UnicodeWidthChar; - assert_eq!(UnicodeWidthChar::width('\u{13430}'), Some(1)); assert_eq!(UnicodeWidthChar::width('\u{13436}'), Some(1)); assert_eq!(UnicodeWidthChar::width('\u{1343C}'), Some(1)); @@ -242,8 +122,6 @@ fn test_hieroglyph_format_controls() { #[test] fn test_marks() { - use super::UnicodeWidthChar; - // Nonspacing marks have 0 width assert_eq!(UnicodeWidthChar::width('\u{0301}'), Some(0)); // Enclosing marks have 0 width @@ -256,8 +134,6 @@ fn test_marks() { #[test] fn test_canonical_equivalence() { - use super::{UnicodeWidthChar, UnicodeWidthStr}; - for c in '\0'..='\u{10FFFF}' { let mut nfd = String::new(); unicode_normalization::char::decompose_canonical(c, |d| nfd.push(d));