Merge pull request #71 from prataprc/issue_7

use unicode-width instead of len() or grapheme cluster #7.
rust-lang · Jun 28, 2018 · 4976a82 · 4976a82
2 parents 08af2e5 + 27df9b2
commit 4976a82
Show file tree

Hide file tree

Showing 2 changed files with 119 additions and 132 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -13,5 +13,8 @@ getopts-like option parsing.
 """
 categories = ["command-line-interface"]
 
+[dependencies]
+unicode-width = "0.1.5"
+
 [dev-dependencies]
 log = "0.4"
diff --git a/src/lib.rs b/src/lib.rs
@@ -104,22 +104,23 @@
                      reason = "use the crates.io `getopts` library instead"))]
 
 #[cfg(test)] #[macro_use] extern crate log;
+extern crate unicode_width;
+
 
 use self::Name::*;
 use self::HasArg::*;
 use self::Occur::*;
 use self::Fail::*;
 use self::Optval::*;
-use self::SplitWithinState::*;
-use self::Whitespace::*;
-use self::LengthLimit::*;
 
 use std::error::Error;
 use std::ffi::OsStr;
 use std::fmt;
 use std::iter::{repeat, IntoIterator};
 use std::result;
 
+use unicode_width::UnicodeWidthStr;
+
 /// A description of the options that a program can handle.
 pub struct Options {
     grps: Vec<OptGroup>,
@@ -479,7 +480,7 @@ impl Options {
             let mut row = "    ".to_string();
 
             // short option
-            match short_name.len() {
+            match short_name.width() {
                 0 => {
                     if any_short {
                         row.push_str("    ");
@@ -488,26 +489,23 @@ impl Options {
                 1 => {
                     row.push('-');
                     row.push_str(&short_name);
-                    if long_name.len() > 0 {
+                    if long_name.width() > 0 {
                         row.push_str(", ");
                     } else {
                         // Only a single space here, so that any
                         // argument is printed in the correct spot.
                         row.push(' ');
                     }
                 }
+                // FIXME: refer issue #7.
                 _ => panic!("the short name should only be 1 ascii char long"),
             }
 
             // long option
-            match long_name.len() {
+            match long_name.width() {
                 0 => {}
                 _ => {
-                    if self.long_only {
-                        row.push('-');
-                    } else {
-                        row.push_str("--");
-                    }
+                    row.push_str(if self.long_only { "-" } else { "--" });
                     row.push_str(&long_name);
                     row.push(' ');
                 }
@@ -524,9 +522,7 @@ impl Options {
                 }
             }
 
-            // FIXME: #5516 should be graphemes not codepoints
-            // here we just need to indent the start of the description
-            let rowlen = row.chars().count();
+            let rowlen = row.width();
             if rowlen < 24 {
                 for _ in 0 .. 24 - rowlen {
                     row.push(' ');
@@ -535,25 +531,7 @@ impl Options {
                 row.push_str(&desc_sep)
             }
 
-            // Normalize desc to contain words separated by one space character
-            let mut desc_normalized_whitespace = String::new();
-            for word in desc.split(|c: char| c.is_whitespace())
-                            .filter(|s| !s.is_empty()) {
-                desc_normalized_whitespace.push_str(word);
-                desc_normalized_whitespace.push(' ');
-            }
-
-            // FIXME: #5516 should be graphemes not codepoints
-            let mut desc_rows = Vec::new();
-            each_split_within(&desc_normalized_whitespace,
-                              54,
-                              |substr| {
-                desc_rows.push(substr.to_string());
-                true
-            });
-
-            // FIXME: #5516 should be graphemes not codepoints
-            // wrapped description
+            let desc_rows = each_split_within(&desc, 54);
             row.push_str(&desc_rows.join(&desc_sep));
 
             row
@@ -923,118 +901,59 @@ fn format_option(opt: &OptGroup) -> String {
     line
 }
 
-#[derive(Clone, Copy)]
-enum SplitWithinState {
-    A,  // leading whitespace, initial state
-    B,  // words
-    C,  // internal and trailing whitespace
-}
-
-#[derive(Clone, Copy)]
-enum Whitespace {
-    Ws, // current char is whitespace
-    Cr  // current char is not whitespace
-}
-
-#[derive(Clone, Copy)]
-enum LengthLimit {
-    UnderLim, // current char makes current substring still fit in limit
-    OverLim   // current char makes current substring no longer fit in limit
-}
-
-
 /// Splits a string into substrings with possibly internal whitespace,
 /// each of them at most `lim` bytes long, if possible. The substrings
 /// have leading and trailing whitespace removed, and are only cut at
 /// whitespace boundaries.
-///
-/// Note: Function was moved here from `std::str` because this module is the only place that
-/// uses it, and because it was too specific for a general string function.
-fn each_split_within<'a, F>(ss: &'a str, lim: usize, mut it: F)
-                            -> bool where F: FnMut(&'a str) -> bool {
-    // Just for fun, let's write this as a state machine:
-
-    let mut slice_start = 0;
-    let mut last_start = 0;
-    let mut last_end = 0;
-    let mut state = A;
-    let mut fake_i = ss.len();
-    let mut lim = lim;
-
-    let mut cont = true;
-
-    // if the limit is larger than the string, lower it to save cycles
-    if lim >= fake_i {
-        lim = fake_i;
-    }
-
-    let mut machine = |cont: &mut bool, state: &mut SplitWithinState, (i, c): (usize, char)| {
-        let whitespace = if c.is_whitespace() { Ws }       else { Cr };
-        let limit      = if (i - slice_start + 1) <= lim  { UnderLim } else { OverLim };
-
-        *state = match (*state, whitespace, limit) {
-            (A, Ws, _)        => { A }
-            (A, Cr, _)        => { slice_start = i; last_start = i; B }
-
-            (B, Cr, UnderLim) => { B }
-            (B, Cr, OverLim)  if (i - last_start + 1) > lim => {
-                // A single word has gone over the limit.  In this
-                // case we just accept that the word will be too long.
-                B
-            }
-            (B, Cr, OverLim)  => {
-                *cont = it(&ss[slice_start..last_end]);
-                slice_start = last_start;
-                B
-            }
-            (B, Ws, UnderLim) => {
-                last_end = i;
-                C
-            }
-            (B, Ws, OverLim)  => {
-                last_end = i;
-                *cont = it(&ss[slice_start..last_end]);
-                A
-            }
-
-            (C, Cr, UnderLim) => {
-                last_start = i;
-                B
+fn each_split_within(desc: &str, lim: usize) -> Vec<String> {
+    let mut rows = Vec::new();
+    for line in desc.trim().lines() {
+        let line_chars = line.chars().chain(Some(' '));
+        let words = line_chars.fold( (Vec::new(), 0, 0), |(mut words, a, z), c | {
+            let idx = z + c.len_utf8(); // Get the current byte offset
+
+            // If the char is whitespace, advance the word start and maybe push a word
+            if c.is_whitespace() {
+                if a != z {
+                    words.push(&line[a..z]);
+                }
+                (words, idx, idx)
             }
-            (C, Cr, OverLim)  => {
-                *cont = it(&ss[slice_start..last_end]);
-                slice_start = i;
-                last_start = i;
-                last_end = i;
-                B
+            // If the char is not whitespace, continue, retaining the current
+            else {
+                (words, a, idx)
             }
-            (C, Ws, OverLim)  => {
-                *cont = it(&ss[slice_start..last_end]);
-                A
+        }).0;
+
+        let mut row = String::new();
+        for word in words.iter() {
+            let sep = if row.len() > 0 { Some(" ") } else { None };
+            let width = row.width()
+                + word.width()
+                + sep.map(UnicodeWidthStr::width).unwrap_or(0);
+
+            if width <= lim {
+                if let Some(sep) = sep { row.push_str(sep) }
+                row.push_str(word);
+                continue
             }
-            (C, Ws, UnderLim) => {
-                C
+            if row.len() > 0 {
+                rows.push(row.clone());
+                row.clear();
             }
-        };
-
-        *cont
-    };
-
-    ss.char_indices().all(|x| machine(&mut cont, &mut state, x));
-
-    // Let the automaton 'run out' by supplying trailing whitespace
-    while cont && match state { B | C => true, A => false } {
-        machine(&mut cont, &mut state, (fake_i, ' '));
-        fake_i += 1;
+            row.push_str(word);
+        }
+        if row.len() > 0 {
+            rows.push(row);
+        }
     }
-    return cont;
+    rows
 }
 
 #[test]
 fn test_split_within() {
     fn t(s: &str, i: usize, u: &[String]) {
-        let mut v = Vec::new();
-        each_split_within(s, i, |s| { v.push(s.to_string()); true });
+        let v = each_split_within(&(s.to_string()), i);
         assert!(v.iter().zip(u.iter()).all(|(a,b)| a == b));
     }
     t("", 0, &[]);
@@ -1046,7 +965,9 @@ fn test_split_within() {
         "Little lamb".to_string()
     ]);
     t("\nMary had a little lamb\nLittle lamb\n", ::std::usize::MAX,
-        &["Mary had a little lamb\nLittle lamb".to_string()]);
+        &["Mary had a little lamb".to_string(),
+        "Little lamb".to_string()
+    ]);
 }
 
 #[cfg(test)]
@@ -1827,6 +1748,69 @@ Options:
         assert!(usage == expected)
     }
 
+    #[test]
+    fn test_usage_description_newline_handling() {
+        let mut opts = Options::new();
+        opts.optflag("k", "k\u{2013}w\u{2013}",
+            "The word kiwi is normally spelled with two i's");
+        opts.optflag("a", "apple",
+            "This description forces a new line.\n Here is a premature\n\
+            newline");
+
+        let expected =
+"Usage: fruits
+
+Options:
+    -k, --k–w–          The word kiwi is normally spelled with two i's
+    -a, --apple         This description forces a new line.
+                        Here is a premature
+                        newline
+";
+
+        let usage = opts.usage("Usage: fruits");
+
+        debug!("expected: <<{}>>", expected);
+        debug!("generated: <<{}>>", usage);
+        assert!(usage == expected)
+    }
+
+    #[test]
+    fn test_usage_multiwidth() {
+        let mut opts = Options::new();
+        opts.optflag("a", "apple",        "apple description");
+        opts.optflag("b", "banana\u{00AB}", "banana description");
+        opts.optflag("c", "brûlée", "brûlée quite long description");
+        opts.optflag("k", "kiwi\u{20AC}",   "kiwi description");
+        opts.optflag("o", "orange\u{2039}", "orange description");
+        opts.optflag("r", "raspberry-but-making-this-option-way-too-long",
+            "raspberry description is also quite long indeed longer than \
+            every other piece of text we might encounter here and thus will \
+            be automatically broken up"
+        );
+
+        let expected =
+"Usage: fruits
+
+Options:
+    -a, --apple         apple description
+    -b, --banana«       banana description
+    -c, --brûlée        brûlée quite long description
+    -k, --kiwi€         kiwi description
+    -o, --orange‹       orange description
+    -r, --raspberry-but-making-this-option-way-too-long 
+                        raspberry description is also quite long indeed longer
+                        than every other piece of text we might encounter here
+                        and thus will be automatically broken up
+";
+
+        let usage = opts.usage("Usage: fruits");
+
+        debug!("expected: <<{}>>", expected);
+        debug!("generated: <<{}>>", usage);
+        assert!(usage == expected)
+    }
+
+
     #[test]
     fn test_usage_short_only() {
         let mut opts = Options::new();