From 8c0c8a10db7ff1eb8c0248f2bb1e6796474d3ae2 Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Tue, 23 Apr 2024 10:44:16 -0400 Subject: [PATCH 1/3] Document width rules --- .github/workflows/rust.yml | 2 +- Cargo.toml | 1 - README.md | 23 +++++------- src/lib.rs | 77 ++++++++++++++++++++++++++------------ 4 files changed, 64 insertions(+), 39 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 686b58f..082364c 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -1,4 +1,4 @@ -name: Rust +name: build on: push: diff --git a/Cargo.toml b/Cargo.toml index b922c4c..1f0e0a9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,7 +9,6 @@ authors = [ homepage = "https://github.com/unicode-rs/unicode-width" repository = "https://github.com/unicode-rs/unicode-width" -documentation = "https://unicode-rs.github.io/unicode-width" license = "MIT/Apache-2.0" keywords = ["text", "width", "unicode"] readme = "README.md" diff --git a/README.md b/README.md index 033cc99..e49eaab 100644 --- a/README.md +++ b/README.md @@ -1,17 +1,17 @@ -# unicode-width +# `unicode-width` -Determine displayed width of `char` and `str` types according to -[Unicode Standard Annex #11][UAX11] rules. +[![Build status](https://github.com/unicode-rs/unicode-width/actions/workflows/rust.yml/badge.svg)](https://github.com/unicode-rs/unicode-width/actions/workflows/rust.yml) +[![crates.io version](https://img.shields.io/crates/v/unicode-width)](https://crates.io/crates/unicode-width) +[![Docs status](https://img.shields.io/docsrs/unicode-width)](https://docs.rs/unicode-width/) -[UAX11]: http://www.unicode.org/reports/tr11/ +Determine displayed width of `char` and `str` types according to [Unicode Standard Annex #11][UAX11], +other portions of the Unicode standard, and common implementations of POSIX [`wcwidth()`](https://pubs.opengroup.org/onlinepubs/9699919799/). 
-[![Build Status](https://travis-ci.org/unicode-rs/unicode-width.svg)](https://travis-ci.org/unicode-rs/unicode-width) +This crate is `#![no_std]`. -[Documentation](https://unicode-rs.github.io/unicode-width/unicode_width/index.html) +[UAX11]: http://www.unicode.org/reports/tr11/ ```rust -extern crate unicode_width; - use unicode_width::UnicodeWidthStr; fn main() { @@ -45,11 +45,6 @@ and nonstandard [Korean jamo](https://unicode.org/glossary/#jamo) sequences may be rendered with a different width than what this crate says. (This is not an exhaustive list.) -## features - -unicode-width does not depend on libstd, so it can be used in crates -with the `#![no_std]` attribute. - ## crates.io You can use this package in your project by adding the following @@ -57,5 +52,5 @@ to your `Cargo.toml`: ```toml [dependencies] -unicode-width = "0.1.7" +unicode-width = "0.1.11" ``` diff --git a/src/lib.rs b/src/lib.rs index 8fe79e6..2f3596b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -9,40 +9,71 @@ // except according to those terms. //! Determine displayed width of `char` and `str` types according to -//! [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/) -//! rules. +//! [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/), +//! other portions of the Unicode standard, and common implementations of +//! POSIX [`wcwidth()`](https://pubs.opengroup.org/onlinepubs/9699919799/). +//! See the [Rules for determining width](#rules-for-determining-width) section +//! for the exact rules. //! -//! ```rust -//! extern crate unicode_width; +//! This crate is `#![no_std]`. //! +//! ```rust //! use unicode_width::UnicodeWidthStr; //! -//! fn main() { -//! let teststr = "Hello, world!"; -//! let width = UnicodeWidthStr::width(teststr); -//! println!("{}", teststr); -//! println!("The above string is {} columns wide.", width); -//! let width = teststr.width_cjk(); -//! println!("The above string is {} columns wide (CJK).", width); -//! } +//! 
let teststr = "Hello, world!"; +//! let width = UnicodeWidthStr::width(teststr); +//! println!("{}", teststr); +//! println!("The above string is {} columns wide.", width); +//! let width = teststr.width_cjk(); +//! println!("The above string is {} columns wide (CJK).", width); //! ``` //! -//! # features +//! # Rules for determining width //! -//! unicode-width does not depend on `std`, so it can be used in crates -//! with the `#![no_std]` attribute. +//! This crate currently uses the following rules to determine the width of a +//! character or string, in order of decreasing precedence. These may be tweaked in the future. //! -//! # crates.io +//! 1. [`'\u{00AD}'` SOFT HYPHEN](https://util.unicode.org/UnicodeJsps/character.jsp?a=00AD) has width 1. +//! 2. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2. +//! 3. The following have width 0: +//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BDefault_Ignorable_Code_Point%7D) +//! with the [`Default_Ignorable_Code_Point`](https://www.unicode.org/versions/Unicode15.0.0/ch05.pdf#G40095) property. +//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Extend%7D) +//! with the [`Grapheme_Extend`] property. +//! - The following 8 characters, all of which have NFD decompositions consisting of two [`Grapheme_Extend`] characters: +//! - [`'\u{0CC0}'` KANNADA VOWEL SIGN II](https://util.unicode.org/UnicodeJsps/character.jsp?a=0CC0), +//! - [`'\u{0CC7}'` KANNADA VOWEL SIGN EE](https://util.unicode.org/UnicodeJsps/character.jsp?a=0CC7), +//! - [`'\u{0CC8}'` KANNADA VOWEL SIGN AI](https://util.unicode.org/UnicodeJsps/character.jsp?a=0CC8), +//! - [`'\u{0CCA}'` KANNADA VOWEL SIGN O](https://util.unicode.org/UnicodeJsps/character.jsp?a=0CCA), +//! - [`'\u{0CCB}'` KANNADA VOWEL SIGN OO](https://util.unicode.org/UnicodeJsps/character.jsp?a=0CCB), +//! 
- [`'\u{1B3B}'` BALINESE VOWEL SIGN RA REPA TEDUNG](https://util.unicode.org/UnicodeJsps/character.jsp?a=1B3B), +//! - [`'\u{1B3D}'` BALINESE VOWEL SIGN LA LENGA TEDUNG](https://util.unicode.org/UnicodeJsps/character.jsp?a=1B3D), and +//! - [`'\u{1B43}'` BALINESE VOWEL SIGN PEPET TEDUNG](https://util.unicode.org/UnicodeJsps/character.jsp?a=1B43). +//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BHangul_Syllable_Type%3DV%7D%5Cp%7BHangul_Syllable_Type%3DT%7D) +//! with a [`Hangul_Syllable_Type`](https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G45593) +//! of `Vowel_Jamo` (`V`) or `Trailing_Jamo` (`T`). +//! - [`'\0'` NUL](https://util.unicode.org/UnicodeJsps/character.jsp?a=0000). +//! 4. The [control characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BCc%7D) +//! have no defined width, and are ignored when determining the width of a string. +//! 5. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D) +//! with an [`East_Asian_Width`] of [`Fullwidth` (`F`)](https://www.unicode.org/reports/tr11/#ED2) +//! or [`Wide` (`W`)](https://www.unicode.org/reports/tr11/#ED4) have width 2. +//! 6. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DA%7D) +//! with an [`East_Asian_Width`] of [`Ambiguous` (`A`)](https://www.unicode.org/reports/tr11/#ED6) +//! have width 2 in an East Asian context, and width 1 otherwise. +//! 7. All other characters have width 1. //! -//! You can use this package in your project by adding the following -//! to your `Cargo.toml`: +//! [`East_Asian_Width`]: https://www.unicode.org/reports/tr11/#ED1 +//! [`Grapheme_Extend`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G52443 + //! -//! ```toml -//! [dependencies] -//! unicode-width = "0.1.5" -//! ``` +//! ## Canonical equivalence +//! +//! 
The non-CJK width methods guarantee that canonically equivalent strings are assigned the same width. +//! However, this guarantee does not currently hold for the CJK width variants. -#![deny(missing_docs, unsafe_code)] +#![forbid(unsafe_code)] +#![deny(missing_docs)] #![doc( html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png", html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png" From f70250216033e8f797a4d133e797824b64fe5bc4 Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Tue, 23 Apr 2024 11:15:12 -0400 Subject: [PATCH 2/3] Cargo.toml: add categories --- Cargo.toml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index 1f0e0a9..50c82f9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,6 +10,12 @@ authors = [ homepage = "https://github.com/unicode-rs/unicode-width" repository = "https://github.com/unicode-rs/unicode-width" license = "MIT/Apache-2.0" +categories = [ + "command-line-interface", + "internationalization", + "no-std::no-alloc", + "text-processing", +] keywords = ["text", "width", "unicode"] readme = "README.md" description = """ From 1e623c58b2dd7dbd8795bdf682ff1372fc0a8e44 Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Tue, 23 Apr 2024 11:16:28 -0400 Subject: [PATCH 3/3] Cargo.toml: make license SPDX-compliant --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 50c82f9..fccb1bc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,7 +9,7 @@ authors = [ homepage = "https://github.com/unicode-rs/unicode-width" repository = "https://github.com/unicode-rs/unicode-width" -license = "MIT/Apache-2.0" +license = "MIT OR Apache-2.0" categories = [ "command-line-interface", "internationalization",