Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

use unicode-width instead of len() or grapheme cluster #7. #71

Merged
merged 5 commits into from Jun 28, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 3 additions & 0 deletions Cargo.toml
Expand Up @@ -13,5 +13,8 @@ getopts-like option parsing.
"""
categories = ["command-line-interface"]

[dependencies]
unicode-width = "0.1.5"

[dev-dependencies]
log = "0.4"
248 changes: 116 additions & 132 deletions src/lib.rs
Expand Up @@ -104,22 +104,23 @@
reason = "use the crates.io `getopts` library instead"))]

#[cfg(test)] #[macro_use] extern crate log;
extern crate unicode_width;


use self::Name::*;
use self::HasArg::*;
use self::Occur::*;
use self::Fail::*;
use self::Optval::*;
use self::SplitWithinState::*;
use self::Whitespace::*;
use self::LengthLimit::*;

use std::error::Error;
use std::ffi::OsStr;
use std::fmt;
use std::iter::{repeat, IntoIterator};
use std::result;

use unicode_width::UnicodeWidthStr;

/// A description of the options that a program can handle.
pub struct Options {
grps: Vec<OptGroup>,
Expand Down Expand Up @@ -479,7 +480,7 @@ impl Options {
let mut row = " ".to_string();

// short option
match short_name.len() {
match short_name.width() {
0 => {
if any_short {
row.push_str(" ");
Expand All @@ -488,26 +489,23 @@ impl Options {
1 => {
row.push('-');
row.push_str(&short_name);
if long_name.len() > 0 {
if long_name.width() > 0 {
row.push_str(", ");
} else {
// Only a single space here, so that any
// argument is printed in the correct spot.
row.push(' ');
}
}
// FIXME: refer issue #7.
_ => panic!("the short name should only be 1 ascii char long"),
}

// long option
match long_name.len() {
match long_name.width() {
0 => {}
_ => {
if self.long_only {
row.push('-');
} else {
row.push_str("--");
}
row.push_str(if self.long_only { "-" } else { "--" });
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

row.push_str(&long_name);
row.push(' ');
}
Expand All @@ -524,9 +522,7 @@ impl Options {
}
}

// FIXME: #5516 should be graphemes not codepoints
// here we just need to indent the start of the description
let rowlen = row.chars().count();
let rowlen = row.width();
if rowlen < 24 {
for _ in 0 .. 24 - rowlen {
row.push(' ');
Expand All @@ -535,25 +531,7 @@ impl Options {
row.push_str(&desc_sep)
}

// Normalize desc to contain words separated by one space character
let mut desc_normalized_whitespace = String::new();
for word in desc.split(|c: char| c.is_whitespace())
.filter(|s| !s.is_empty()) {
desc_normalized_whitespace.push_str(word);
desc_normalized_whitespace.push(' ');
}

// FIXME: #5516 should be graphemes not codepoints
let mut desc_rows = Vec::new();
each_split_within(&desc_normalized_whitespace,
54,
|substr| {
desc_rows.push(substr.to_string());
true
});

// FIXME: #5516 should be graphemes not codepoints
// wrapped description
let desc_rows = each_split_within(&desc, 54);
row.push_str(&desc_rows.join(&desc_sep));

row
Expand Down Expand Up @@ -923,118 +901,59 @@ fn format_option(opt: &OptGroup) -> String {
line
}

#[derive(Clone, Copy)]
enum SplitWithinState {
A, // leading whitespace, initial state
B, // words
C, // internal and trailing whitespace
}

#[derive(Clone, Copy)]
enum Whitespace {
Ws, // current char is whitespace
Cr // current char is not whitespace
}

#[derive(Clone, Copy)]
enum LengthLimit {
UnderLim, // current char makes current substring still fit in limit
OverLim // current char makes current substring no longer fit in limit
}


/// Splits a string into substrings with possibly internal whitespace,
/// each of them at most `lim` bytes long, if possible. The substrings
/// have leading and trailing whitespace removed, and are only cut at
/// whitespace boundaries.
///
/// Note: Function was moved here from `std::str` because this module is the only place that
/// uses it, and because it was too specific for a general string function.
fn each_split_within<'a, F>(ss: &'a str, lim: usize, mut it: F)
-> bool where F: FnMut(&'a str) -> bool {
// Just for fun, let's write this as a state machine:

let mut slice_start = 0;
let mut last_start = 0;
let mut last_end = 0;
let mut state = A;
let mut fake_i = ss.len();
let mut lim = lim;

let mut cont = true;

// if the limit is larger than the string, lower it to save cycles
if lim >= fake_i {
lim = fake_i;
}

let mut machine = |cont: &mut bool, state: &mut SplitWithinState, (i, c): (usize, char)| {
let whitespace = if c.is_whitespace() { Ws } else { Cr };
let limit = if (i - slice_start + 1) <= lim { UnderLim } else { OverLim };

*state = match (*state, whitespace, limit) {
(A, Ws, _) => { A }
(A, Cr, _) => { slice_start = i; last_start = i; B }

(B, Cr, UnderLim) => { B }
(B, Cr, OverLim) if (i - last_start + 1) > lim => {
// A single word has gone over the limit. In this
// case we just accept that the word will be too long.
B
}
(B, Cr, OverLim) => {
*cont = it(&ss[slice_start..last_end]);
slice_start = last_start;
B
}
(B, Ws, UnderLim) => {
last_end = i;
C
}
(B, Ws, OverLim) => {
last_end = i;
*cont = it(&ss[slice_start..last_end]);
A
}

(C, Cr, UnderLim) => {
last_start = i;
B
fn each_split_within(desc: &str, lim: usize) -> Vec<String> {
let mut rows = Vec::new();
for line in desc.trim().lines() {
let line_chars = line.chars().chain(Some(' '));
let words = line_chars.fold( (Vec::new(), 0, 0), |(mut words, a, z), c | {
let idx = z + c.len_utf8(); // Get the current byte offset

// If the char is whitespace, advance the word start and maybe push a word
if c.is_whitespace() {
if a != z {
words.push(&line[a..z]);
}
(words, idx, idx)
}
(C, Cr, OverLim) => {
*cont = it(&ss[slice_start..last_end]);
slice_start = i;
last_start = i;
last_end = i;
B
// If the char is not whitespace, continue, retaining the current
else {
(words, a, idx)
}
(C, Ws, OverLim) => {
*cont = it(&ss[slice_start..last_end]);
A
}).0;

let mut row = String::new();
for word in words.iter() {
let sep = if row.len() > 0 { Some(" ") } else { None };
let width = row.width()
+ word.width()
+ sep.map(UnicodeWidthStr::width).unwrap_or(0);

if width <= lim {
if let Some(sep) = sep { row.push_str(sep) }
row.push_str(word);
continue
}
(C, Ws, UnderLim) => {
C
if row.len() > 0 {
rows.push(row.clone());
row.clear();
}
};

*cont
};

ss.char_indices().all(|x| machine(&mut cont, &mut state, x));

// Let the automaton 'run out' by supplying trailing whitespace
while cont && match state { B | C => true, A => false } {
machine(&mut cont, &mut state, (fake_i, ' '));
fake_i += 1;
row.push_str(word);
}
if row.len() > 0 {
rows.push(row);
}
}
return cont;
rows
}

#[test]
fn test_split_within() {
fn t(s: &str, i: usize, u: &[String]) {
let mut v = Vec::new();
each_split_within(s, i, |s| { v.push(s.to_string()); true });
let v = each_split_within(&(s.to_string()), i);
assert!(v.iter().zip(u.iter()).all(|(a,b)| a == b));
}
t("", 0, &[]);
Expand All @@ -1046,7 +965,9 @@ fn test_split_within() {
"Little lamb".to_string()
]);
t("\nMary had a little lamb\nLittle lamb\n", ::std::usize::MAX,
&["Mary had a little lamb\nLittle lamb".to_string()]);
&["Mary had a little lamb".to_string(),
"Little lamb".to_string()
]);
}

#[cfg(test)]
Expand Down Expand Up @@ -1827,6 +1748,69 @@ Options:
assert!(usage == expected)
}

#[test]
fn test_usage_description_newline_handling() {
let mut opts = Options::new();
opts.optflag("k", "k\u{2013}w\u{2013}",
"The word kiwi is normally spelled with two i's");
opts.optflag("a", "apple",
"This description forces a new line.\n Here is a premature\n\
newline");

let expected =
"Usage: fruits

Options:
-k, --k–w– The word kiwi is normally spelled with two i's
-a, --apple This description forces a new line.
Here is a premature
newline
";

let usage = opts.usage("Usage: fruits");

debug!("expected: <<{}>>", expected);
debug!("generated: <<{}>>", usage);
assert!(usage == expected)
}

#[test]
fn test_usage_multiwidth() {
let mut opts = Options::new();
opts.optflag("a", "apple", "apple description");
opts.optflag("b", "banana\u{00AB}", "banana description");
opts.optflag("c", "brûlée", "brûlée quite long description");
opts.optflag("k", "kiwi\u{20AC}", "kiwi description");
opts.optflag("o", "orange\u{2039}", "orange description");
opts.optflag("r", "raspberry-but-making-this-option-way-too-long",
"raspberry description is also quite long indeed longer than \
every other piece of text we might encounter here and thus will \
be automatically broken up"
);

let expected =
"Usage: fruits

Options:
-a, --apple apple description
-b, --banana« banana description
-c, --brûlée brûlée quite long description
-k, --kiwi€ kiwi description
-o, --orange‹ orange description
-r, --raspberry-but-making-this-option-way-too-long
raspberry description is also quite long indeed longer
than every other piece of text we might encounter here
and thus will be automatically broken up
";

let usage = opts.usage("Usage: fruits");

debug!("expected: <<{}>>", expected);
debug!("generated: <<{}>>", usage);
assert!(usage == expected)
}


#[test]
fn test_usage_short_only() {
let mut opts = Options::new();
Expand Down