From dd7634c8c244593193af435f138381a993bc7bb8 Mon Sep 17 00:00:00 2001
From: prataprc <prataprc@gmail.com>
Date: Wed, 23 May 2018 11:49:36 +0530
Subject: [PATCH 1/5] use unicode-width instead of len() or grapheme cluster
 #7.

---
 Cargo.toml |   1 +
 src/lib.rs | 248 ++++++++++++++++++++++++-----------------------------
 2 files changed, 113 insertions(+), 136 deletions(-)
diff --git a/Cargo.toml b/Cargo.toml
index 6aff44d3..a97ae3e5 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -15,3 +15,4 @@ categories = ["command-line-interface"]
 
 [dev-dependencies]
 log = "0.4"
+unicode-width = "0.1.5"
diff --git a/src/lib.rs b/src/lib.rs
index 0a81c40e..7b2f1ea7 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -92,6 +92,7 @@
 //! }
 //! ```
 
+#![feature(rustc_private)]
 #![doc(html_logo_url = "https://www.rust-lang.org/logos/rust-logo-128x128-blk-v2.png",
        html_favicon_url = "https://www.rust-lang.org/favicon.ico",
        html_root_url = "https://doc.rust-lang.org/getopts/")]
@@ -105,14 +106,12 @@
 
 #[cfg(test)] #[macro_use] extern crate log;
 
+
 use self::Name::*;
 use self::HasArg::*;
 use self::Occur::*;
 use self::Fail::*;
 use self::Optval::*;
-use self::SplitWithinState::*;
-use self::Whitespace::*;
-use self::LengthLimit::*;
 
 use std::error::Error;
 use std::ffi::OsStr;
@@ -120,6 +119,9 @@ use std::fmt;
 use std::iter::{repeat, IntoIterator};
 use std::result;
 
+extern crate unicode_width;
+use unicode_width::UnicodeWidthStr;
+
 /// A description of the options that a program can handle.
 pub struct Options {
     grps: Vec<OptGroup>,
@@ -479,7 +481,7 @@ impl Options {
             let mut row = "    ".to_string();
 
             // short option
-            match short_name.len() {
+            match short_name.width() {
                 0 => {
                     if any_short {
                         row.push_str("    ");
@@ -488,7 +490,7 @@ impl Options {
                 1 => {
                     row.push('-');
                     row.push_str(&short_name);
-                    if long_name.len() > 0 {
+                    if long_name.width() > 0 {
                         row.push_str(", ");
                     } else {
                         // Only a single space here, so that any
@@ -496,18 +498,15 @@ impl Options {
                         row.push(' ');
                     }
                 }
+                // FIXME: refer issue #7.
                 _ => panic!("the short name should only be 1 ascii char long"),
             }
 
             // long option
-            match long_name.len() {
+            match long_name.width() {
                 0 => {}
                 _ => {
-                    if self.long_only {
-                        row.push('-');
-                    } else {
-                        row.push_str("--");
-                    }
+                    row.push_str(if self.long_only { "-" } else { "--" });
                     row.push_str(&long_name);
                     row.push(' ');
                 }
@@ -524,36 +523,14 @@ impl Options {
                 }
             }
 
-            // FIXME: #5516 should be graphemes not codepoints
-            // here we just need to indent the start of the description
-            let rowlen = row.chars().count();
+            let rowlen = row.width();
             if rowlen < 24 {
-                for _ in 0 .. 24 - rowlen {
-                    row.push(' ');
-                }
+                (0 .. 24 - rowlen).for_each(|_| row.push(' '));
             } else {
                 row.push_str(&desc_sep)
             }
 
-            // Normalize desc to contain words separated by one space character
-            let mut desc_normalized_whitespace = String::new();
-            for word in desc.split(|c: char| c.is_whitespace())
-                            .filter(|s| !s.is_empty()) {
-                desc_normalized_whitespace.push_str(word);
-                desc_normalized_whitespace.push(' ');
-            }
-
-            // FIXME: #5516 should be graphemes not codepoints
-            let mut desc_rows = Vec::new();
-            each_split_within(&desc_normalized_whitespace,
-                              54,
-                              |substr| {
-                desc_rows.push(substr.to_string());
-                true
-            });
-
-            // FIXME: #5516 should be graphemes not codepoints
-            // wrapped description
+            let desc_rows = each_split_within(&desc, 54);
             row.push_str(&desc_rows.join(&desc_sep));
 
             row
@@ -923,118 +900,52 @@ fn format_option(opt: &OptGroup) -> String {
     line
 }
 
-#[derive(Clone, Copy)]
-enum SplitWithinState {
-    A,  // leading whitespace, initial state
-    B,  // words
-    C,  // internal and trailing whitespace
-}
-
-#[derive(Clone, Copy)]
-enum Whitespace {
-    Ws, // current char is whitespace
-    Cr  // current char is not whitespace
-}
-
-#[derive(Clone, Copy)]
-enum LengthLimit {
-    UnderLim, // current char makes current substring still fit in limit
-    OverLim   // current char makes current substring no longer fit in limit
-}
-
-
 /// Splits a string into substrings with possibly internal whitespace,
 /// each of them at most `lim` bytes long, if possible. The substrings
 /// have leading and trailing whitespace removed, and are only cut at
 /// whitespace boundaries.
 ///
-/// Note: Function was moved here from `std::str` because this module is the only place that
-/// uses it, and because it was too specific for a general string function.
-fn each_split_within<'a, F>(ss: &'a str, lim: usize, mut it: F)
-                            -> bool where F: FnMut(&'a str) -> bool {
-    // Just for fun, let's write this as a state machine:
-
-    let mut slice_start = 0;
-    let mut last_start = 0;
-    let mut last_end = 0;
-    let mut state = A;
-    let mut fake_i = ss.len();
-    let mut lim = lim;
-
-    let mut cont = true;
-
-    // if the limit is larger than the string, lower it to save cycles
-    if lim >= fake_i {
-        lim = fake_i;
-    }
-
-    let mut machine = |cont: &mut bool, state: &mut SplitWithinState, (i, c): (usize, char)| {
-        let whitespace = if c.is_whitespace() { Ws }       else { Cr };
-        let limit      = if (i - slice_start + 1) <= lim  { UnderLim } else { OverLim };
-
-        *state = match (*state, whitespace, limit) {
-            (A, Ws, _)        => { A }
-            (A, Cr, _)        => { slice_start = i; last_start = i; B }
-
-            (B, Cr, UnderLim) => { B }
-            (B, Cr, OverLim)  if (i - last_start + 1) > lim => {
-                // A single word has gone over the limit.  In this
-                // case we just accept that the word will be too long.
-                B
-            }
-            (B, Cr, OverLim)  => {
-                *cont = it(&ss[slice_start..last_end]);
-                slice_start = last_start;
-                B
-            }
-            (B, Ws, UnderLim) => {
-                last_end = i;
-                C
+/// Note: Function was moved here from `std::str` because this module
+/// is the only place that uses it, and because it was too specific for
+/// a general string function.
+fn each_split_within(desc: &String, lim: usize) -> Vec<String> {
+    let mut rows = Vec::new();
+    for line in desc.trim().lines() {
+        let mut words = Vec::new();
+        let mut word = String::new();
+        for ch in line.chars() {
+            if !ch.is_whitespace() {
+                word.push(ch);
+                continue
             }
-            (B, Ws, OverLim)  => {
-                last_end = i;
-                *cont = it(&ss[slice_start..last_end]);
-                A
-            }
-
-            (C, Cr, UnderLim) => {
-                last_start = i;
-                B
-            }
-            (C, Cr, OverLim)  => {
-                *cont = it(&ss[slice_start..last_end]);
-                slice_start = i;
-                last_start = i;
-                last_end = i;
-                B
-            }
-            (C, Ws, OverLim)  => {
-                *cont = it(&ss[slice_start..last_end]);
-                A
-            }
-            (C, Ws, UnderLim) => {
-                C
+            words.push(word);
+            word = String::new();
+        }
+        words.push(word);
+
+        let mut row = String::new();
+        for word in words.iter().filter(|word| word.len() > 0) {
+            let mut width = row.width() + word.width();
+            if row.len() > 0 { width += " ".width(); }
+            if width <= lim {
+                if row.len() > 0 { row.push_str(" ") }
+                row.push_str(word);
+                continue
             }
-        };
-
-        *cont
-    };
-
-    ss.char_indices().all(|x| machine(&mut cont, &mut state, x));
-
-    // Let the automaton 'run out' by supplying trailing whitespace
-    while cont && match state { B | C => true, A => false } {
-        machine(&mut cont, &mut state, (fake_i, ' '));
-        fake_i += 1;
+            rows.push(row.trim().to_string());
+            row = String::new();
+            row.push_str(word);
+        }
+        rows.push(row.trim().to_string());
     }
-    return cont;
+    rows.iter().filter(|row| row.len() > 0 )
+        .map(|row| row.to_string()).collect()
 }
 
 #[test]
 fn test_split_within() {
     fn t(s: &str, i: usize, u: &[String]) {
-        let mut v = Vec::new();
-        each_split_within(s, i, |s| { v.push(s.to_string()); true });
+        let v = each_split_within(&(s.to_string()), i);
         assert!(v.iter().zip(u.iter()).all(|(a,b)| a == b));
     }
     t("", 0, &[]);
@@ -1046,7 +957,9 @@ fn test_split_within() {
         "Little lamb".to_string()
     ]);
     t("\nMary had a little lamb\nLittle lamb\n", ::std::usize::MAX,
-        &["Mary had a little lamb\nLittle lamb".to_string()]);
+        &["Mary had a little lamb".to_string(),
+        "Little lamb".to_string()
+    ]);
 }
 
 #[cfg(test)]
@@ -1827,6 +1740,69 @@ Options:
         assert!(usage == expected)
     }
 
+    #[test]
+    fn test_usage_description_newline_handling() {
+        let mut opts = Options::new();
+        opts.optflag("k", "k\u{2013}w\u{2013}",
+            "The word kiwi is normally spelled with two i's");
+        opts.optflag("a", "apple",
+            "This description forces a new line.\n Here is a premature\n\
+            newline");
+
+        let expected =
+"Usage: fruits
+
+Options:
+    -k, --k–w–          The word kiwi is normally spelled with two i's
+    -a, --apple         This description forces a new line.
+                        Here is a premature
+                        newline
+";
+
+        let usage = opts.usage("Usage: fruits");
+
+        debug!("expected: <<{}>>", expected);
+        debug!("generated: <<{}>>", usage);
+        assert!(usage == expected)
+    }
+
+    #[test]
+    fn test_usage_multiwidth() {
+        let mut opts = Options::new();
+        opts.optflag("a", "apple",        "apple description");
+        opts.optflag("b", "banana\u{00AB}", "banana description");
+        opts.optflag("c", "brûlée", "brûlée quite long description");
+        opts.optflag("k", "kiwi\u{20AC}",   "kiwi description");
+        opts.optflag("o", "orange\u{2039}", "orange description");
+        opts.optflag("r", "raspberry-but-making-this-option-way-too-long",
+            "raspberry description is also quite long indeed longer than \
+            every other piece of text we might encounter here and thus will \
+            be automatically broken up"
+        );
+
+        let expected =
+"Usage: fruits
+
+Options:
+    -a, --apple         apple description
+    -b, --banana«       banana description
+    -c, --brûlée        brûlée quite long description
+    -k, --kiwi€         kiwi description
+    -o, --orange‹       orange description
+    -r, --raspberry-but-making-this-option-way-too-long 
+                        raspberry description is also quite long indeed longer
+                        than every other piece of text we might encounter here
+                        and thus will be automatically broken up
+";
+
+        let usage = opts.usage("Usage: fruits");
+
+        debug!("expected: <<{}>>", expected);
+        debug!("generated: <<{}>>", usage);
+        assert!(usage == expected)
+    }
+
+
     #[test]
     fn test_usage_short_only() {
         let mut opts = Options::new();

From 9973a6c1ea648caefa61a677cf66a88399a40840 Mon Sep 17 00:00:00 2001
From: prataprc <prataprc@gmail.com>
Date: Wed, 23 May 2018 20:18:12 +0530
Subject: [PATCH 2/5] move unicode-width from [dev-dependencies] to
 [dependencies] issue #7.

---
 Cargo.toml | 4 +++-
 src/lib.rs | 1 -
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index a97ae3e5..04acee15 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,6 +13,8 @@ getopts-like option parsing.
 """
 categories = ["command-line-interface"]
 
+[dependencies]
+unicode-width = "0.1.5"
+
 [dev-dependencies]
 log = "0.4"
-unicode-width = "0.1.5"
diff --git a/src/lib.rs b/src/lib.rs
index 7b2f1ea7..b82ef749 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -92,7 +92,6 @@
 //! }
 //! ```
 
-#![feature(rustc_private)]
 #![doc(html_logo_url = "https://www.rust-lang.org/logos/rust-logo-128x128-blk-v2.png",
        html_favicon_url = "https://www.rust-lang.org/favicon.ico",
        html_root_url = "https://doc.rust-lang.org/getopts/")]

From 7e37bd708e927d2739ab2ecc5e5ee2f24de19b74 Mon Sep 17 00:00:00 2001
From: prataprc <prataprc@gmail.com>
Date: Wed, 23 May 2018 21:30:16 +0530
Subject: [PATCH 3/5] for_each is not implemented until 1.21, issue #7.

---
 src/lib.rs | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/lib.rs b/src/lib.rs
index b82ef749..d8b28580 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -524,7 +524,9 @@ impl Options {
 
             let rowlen = row.width();
             if rowlen < 24 {
-                (0 .. 24 - rowlen).for_each(|_| row.push(' '));
+                for _ in 0 .. 24 - rowlen {
+                    row.push(' ');
+                }
             } else {
                 row.push_str(&desc_sep)
             }

From 547e4d54bd8a91087cbeeeee0cd6148d59c5fefe Mon Sep 17 00:00:00 2001
From: prataprc <prataprc@gmail.com>
Date: Mon, 28 May 2018 16:47:39 +0530
Subject: [PATCH 4/5] optimize parsing words in each_split_within(), issue #7.

---
 src/lib.rs | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index d8b28580..1f747184 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -104,6 +104,7 @@
                      reason = "use the crates.io `getopts` library instead"))]
 
 #[cfg(test)] #[macro_use] extern crate log;
+extern crate unicode_width;
 
 
 use self::Name::*;
@@ -118,7 +119,6 @@ use std::fmt;
 use std::iter::{repeat, IntoIterator};
 use std::result;
 
-extern crate unicode_width;
 use unicode_width::UnicodeWidthStr;
 
 /// A description of the options that a program can handle.
@@ -905,24 +905,25 @@ fn format_option(opt: &OptGroup) -> String {
 /// each of them at most `lim` bytes long, if possible. The substrings
 /// have leading and trailing whitespace removed, and are only cut at
 /// whitespace boundaries.
-///
-/// Note: Function was moved here from `std::str` because this module
-/// is the only place that uses it, and because it was too specific for
-/// a general string function.
-fn each_split_within(desc: &String, lim: usize) -> Vec<String> {
+fn each_split_within(desc: &str, lim: usize) -> Vec<String> {
     let mut rows = Vec::new();
     for line in desc.trim().lines() {
-        let mut words = Vec::new();
-        let mut word = String::new();
-        for ch in line.chars() {
-            if !ch.is_whitespace() {
-                word.push(ch);
-                continue
+        let line_chars = line.chars().chain(Some(' '));
+        let words = line_chars.fold( (Vec::new(), 0, 0), |(mut words, a, z), c | {
+            let idx = z + c.len_utf8(); // Get the current byte offset
+
+            // If the char is whitespace, advance the word start and maybe push a word
+            if c.is_whitespace() {
+                if a != z {
+                    words.push(&line[a..z]);
+                }
+                (words, idx, idx)
             }
-            words.push(word);
-            word = String::new();
-        }
-        words.push(word);
+            // If the char is not whitespace, continue, retaining the current
+            else {
+                (words, a, idx)
+            }
+        }).0;
 
         let mut row = String::new();
         for word in words.iter().filter(|word| word.len() > 0) {

From 27df9b2cd6ee27b366e315df7a908e8d33089e30 Mon Sep 17 00:00:00 2001
From: prataprc <prataprc@gmail.com>
Date: Sun, 3 Jun 2018 13:41:33 +0530
Subject: [PATCH 5/5] more optimizations for issue #7.

---
 src/lib.rs | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 1f747184..743bc11f 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -926,22 +926,28 @@ fn each_split_within(desc: &str, lim: usize) -> Vec<String> {
         }).0;
 
         let mut row = String::new();
-        for word in words.iter().filter(|word| word.len() > 0) {
-            let mut width = row.width() + word.width();
-            if row.len() > 0 { width += " ".width(); }
+        for word in words.iter() {
+            let sep = if row.len() > 0 { Some(" ") } else { None };
+            let width = row.width()
+                + word.width()
+                + sep.map(UnicodeWidthStr::width).unwrap_or(0);
+
             if width <= lim {
-                if row.len() > 0 { row.push_str(" ") }
+                if let Some(sep) = sep { row.push_str(sep) }
                 row.push_str(word);
                 continue
             }
-            rows.push(row.trim().to_string());
-            row = String::new();
+            if row.len() > 0 {
+                rows.push(row.clone());
+                row.clear();
+            }
             row.push_str(word);
         }
-        rows.push(row.trim().to_string());
+        if row.len() > 0 {
+            rows.push(row);
+        }
     }
-    rows.iter().filter(|row| row.len() > 0 )
-        .map(|row| row.to_string()).collect()
+    rows
 }
 
 #[test]