From afbee531e009e197c6492e7241448a292792d446 Mon Sep 17 00:00:00 2001
From: iBug <git@ibugone.com>
Date: Thu, 5 Sep 2019 18:48:09 +0800
Subject: [PATCH 01/14] Make number_of_words respect CJK characters

CJK languages generally consider every single character a word.
This commit changed Liquid filter "number_of_words"
so it counts CJK characters correctly.
---
 lib/jekyll/filters.rb | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/jekyll/filters.rb b/lib/jekyll/filters.rb
index e52eb7d5d0d..faf36e17337 100644
--- a/lib/jekyll/filters.rb
+++ b/lib/jekyll/filters.rb
@@ -122,7 +122,8 @@ def normalize_whitespace(input)
     #
     # Returns the Integer word count.
     def number_of_words(input)
-      input.split.length
+      CJK_REGEX = /\p{Han}|\p{Katakana}|\p{Hiragana}|\p{Hangul}/
+      input.scan(CJK_REGEX).length + input.gsub(CJK_REGEX, ' ').split.length
     end
 
     # Join an array of things into a string by separating with commas and the

From 3cc01b1e5f9a669f25d07851a53b50278fe35684 Mon Sep 17 00:00:00 2001
From: iBug <git@ibugone.com>
Date: Thu, 5 Sep 2019 21:05:55 +0800
Subject: [PATCH 02/14] Maybe use lowercase for that variable?

---
 lib/jekyll/filters.rb | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/jekyll/filters.rb b/lib/jekyll/filters.rb
index faf36e17337..aca19a87d16 100644
--- a/lib/jekyll/filters.rb
+++ b/lib/jekyll/filters.rb
@@ -122,8 +122,8 @@ def normalize_whitespace(input)
     #
     # Returns the Integer word count.
     def number_of_words(input)
-      CJK_REGEX = /\p{Han}|\p{Katakana}|\p{Hiragana}|\p{Hangul}/
-      input.scan(CJK_REGEX).length + input.gsub(CJK_REGEX, ' ').split.length
+      cjk_regex = /\p{Han}|\p{Katakana}|\p{Hiragana}|\p{Hangul}/
+      input.scan(cjk_regex).length + input.gsub(cjk_regex, ' ').split.length
     end
 
     # Join an array of things into a string by separating with commas and the

From 1d372034178dbf5a4f80a5db680c8a6082078c98 Mon Sep 17 00:00:00 2001
From: iBug <git@ibugone.com>
Date: Thu, 5 Sep 2019 21:13:44 +0800
Subject: [PATCH 03/14] RuboCop is hard to satisfy...

---
 lib/jekyll/filters.rb | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/jekyll/filters.rb b/lib/jekyll/filters.rb
index aca19a87d16..477578da735 100644
--- a/lib/jekyll/filters.rb
+++ b/lib/jekyll/filters.rb
@@ -122,8 +122,8 @@ def normalize_whitespace(input)
     #
     # Returns the Integer word count.
     def number_of_words(input)
-      cjk_regex = /\p{Han}|\p{Katakana}|\p{Hiragana}|\p{Hangul}/
-      input.scan(cjk_regex).length + input.gsub(cjk_regex, ' ').split.length
+      cjk_regex = %r(\p{Han}|\p{Katakana}|\p{Hiragana}|\p{Hangul})
+      input.scan(cjk_regex).length + input.gsub(cjk_regex, " ").split.length
     end
 
     # Join an array of things into a string by separating with commas and the

From 6a33254a3253b1424f7f0dc63e42c251616455f0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?iBug=20=E2=99=A6?= <git@ibugone.com>
Date: Thu, 5 Sep 2019 22:30:38 +0800
Subject: [PATCH 04/14] Use exclamation marks for %r!regex! to make RuboCop
 happy

Co-Authored-By: Matt Rogers <mattr-@github.com>
---
 lib/jekyll/filters.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/jekyll/filters.rb b/lib/jekyll/filters.rb
index 477578da735..da6c19cde10 100644
--- a/lib/jekyll/filters.rb
+++ b/lib/jekyll/filters.rb
@@ -122,7 +122,7 @@ def normalize_whitespace(input)
     #
     # Returns the Integer word count.
     def number_of_words(input)
-      cjk_regex = %r(\p{Han}|\p{Katakana}|\p{Hiragana}|\p{Hangul})
+      cjk_regex = %r!\p{Han}|\p{Katakana}|\p{Hiragana}|\p{Hangul}!
       input.scan(cjk_regex).length + input.gsub(cjk_regex, " ").split.length
     end
 

From 1a6af4293027716fc38e2f01023abb8498bbec6b Mon Sep 17 00:00:00 2001
From: iBug <git@ibugone.com>
Date: Thu, 5 Sep 2019 22:32:23 +0800
Subject: [PATCH 05/14] Update documentation to reflect changes

---
 docs/_data/jekyll_filters.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/docs/_data/jekyll_filters.yml b/docs/_data/jekyll_filters.yml
index 27089e738f2..90b485074fb 100644
--- a/docs/_data/jekyll_filters.yml
+++ b/docs/_data/jekyll_filters.yml
@@ -166,7 +166,9 @@
 #
 
 - name: Number of Words
-  description: Count the number of words in some text.
+  description: >-
+    Count the number of words in some text.
+    CJK characters are counted as one word per character instead of whitespace-separated.
   examples:
     - input: '{{ page.content | number_of_words }}'
       output: 1337

From 360ecdabf82e61e3dbaa77316598ade412316ffb Mon Sep 17 00:00:00 2001
From: iBug <git@ibugone.com>
Date: Sat, 7 Mar 2020 17:11:33 +0800
Subject: [PATCH 06/14] Add test for patched number_of_words filter

---
 test/test_filters.rb | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/test/test_filters.rb b/test/test_filters.rb
index 19f841eed25..f0ec6c9a1cc 100644
--- a/test/test_filters.rb
+++ b/test/test_filters.rb
@@ -1273,5 +1273,20 @@ def to_liquid
         end
       end
     end
+
+    context "number_of_words filter" do
+      should "return the number of words for Latin-only text" do
+        assert_equal 2, @filter.number_of_words("hello world!")
+      end
+
+      should "return the number of characters for CJK-only text" do
+        assert_equal 6, @filter.number_of_words("你好，世界！")
+      end
+
+      should "process Latin and CJK independently" do
+        # Intentional: No space between Latin and CJK
+        assert_equal 6, @filter.number_of_words("你好hello世界world")
+      end
+    end
   end
 end

From 28da2dc37b22104e112c79c450782125e0762cef Mon Sep 17 00:00:00 2001
From: iBug <git@ibugone.com>
Date: Fri, 22 May 2020 01:40:54 +0800
Subject: [PATCH 07/14] Fix according to ashmaroli's suggestions

- Split with CJK characters only when present
- Avoid bloating from String#gsub by using String#split with regex directly
---
 lib/jekyll/filters.rb | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/lib/jekyll/filters.rb b/lib/jekyll/filters.rb
index da6c19cde10..ab362a13b96 100644
--- a/lib/jekyll/filters.rb
+++ b/lib/jekyll/filters.rb
@@ -123,7 +123,10 @@ def normalize_whitespace(input)
     # Returns the Integer word count.
     def number_of_words(input)
       cjk_regex = %r!\p{Han}|\p{Katakana}|\p{Hiragana}|\p{Hangul}!
-      input.scan(cjk_regex).length + input.gsub(cjk_regex, " ").split.length
+      cjk_count = input.scan(cjk_regex).length
+      return input.split.length if cjk_count == 0
+      cjk_and_ws_regex = Regexp.union cjk_regex, %r!\s+!
+      cjk_count + input.split(cjk_and_ws_regex).length
     end
 
     # Join an array of things into a string by separating with commas and the

From 9c468612f6018cf04aba72d36507df87bcb62ee6 Mon Sep 17 00:00:00 2001
From: iBug <git@ibugone.com>
Date: Fri, 22 May 2020 01:55:28 +0800
Subject: [PATCH 08/14] Using single-character regexes without repetition is
 erroneous

---
 lib/jekyll/filters.rb | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/jekyll/filters.rb b/lib/jekyll/filters.rb
index ab362a13b96..bfdfa6a3348 100644
--- a/lib/jekyll/filters.rb
+++ b/lib/jekyll/filters.rb
@@ -122,11 +122,11 @@ def normalize_whitespace(input)
     #
     # Returns the Integer word count.
     def number_of_words(input)
-      cjk_regex = %r!\p{Han}|\p{Katakana}|\p{Hiragana}|\p{Hangul}!
-      cjk_count = input.scan(cjk_regex).length
+      cjk_regex = '\p{Han}|\p{Katakana}|\p{Hiragana}|\p{Hangul}'
+      cjk_count = input.scan(Regexp.new cjk_regex).length
       return input.split.length if cjk_count == 0
-      cjk_and_ws_regex = Regexp.union cjk_regex, %r!\s+!
-      cjk_count + input.split(cjk_and_ws_regex).length
+      cjk_and_ws_regex = "(#{cjk_regex}|\\s)+"
+      cjk_count + input.split(Regexp.new cjk_and_ws_regex).length
     end
 
     # Join an array of things into a string by separating with commas and the

From d0ba9794247eb9ebaed2a4730ea58b1ce0ae7bd2 Mon Sep 17 00:00:00 2001
From: iBug <git@ibugone.com>
Date: Fri, 22 May 2020 02:04:59 +0800
Subject: [PATCH 09/14] Bundle exec rake test passing

---
 lib/jekyll/filters.rb | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/jekyll/filters.rb b/lib/jekyll/filters.rb
index bfdfa6a3348..f09d74f73b9 100644
--- a/lib/jekyll/filters.rb
+++ b/lib/jekyll/filters.rb
@@ -122,11 +122,11 @@ def normalize_whitespace(input)
     #
     # Returns the Integer word count.
     def number_of_words(input)
-      cjk_regex = '\p{Han}|\p{Katakana}|\p{Hiragana}|\p{Hangul}'
-      cjk_count = input.scan(Regexp.new cjk_regex).length
+      cjk_chars = '\p{Han}\p{Katakana}\p{Hiragana}\p{Hangul}'
+      cjk_count = input.scan(Regexp.new "[#{cjk_chars}]").length
       return input.split.length if cjk_count == 0
-      cjk_and_ws_regex = "(#{cjk_regex}|\\s)+"
-      cjk_count + input.split(Regexp.new cjk_and_ws_regex).length
+      word_regex = "[^#{cjk_chars}\\s]+"
+      cjk_count + input.scan(Regexp.new word_regex).length
     end
 
     # Join an array of things into a string by separating with commas and the

From 21bf5a5dbd64fba88de9dce6c8b720f81fd97872 Mon Sep 17 00:00:00 2001
From: iBug <git@ibugone.com>
Date: Fri, 22 May 2020 15:40:41 +0800
Subject: [PATCH 10/14] Make CJK processing optional, adapt tests

---
 lib/jekyll/filters.rb | 16 +++++++++-------
 test/test_filters.rb  | 10 +++++++---
 2 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/lib/jekyll/filters.rb b/lib/jekyll/filters.rb
index 82f432b1fd3..4025d70fff9 100644
--- a/lib/jekyll/filters.rb
+++ b/lib/jekyll/filters.rb
@@ -121,13 +121,15 @@ def normalize_whitespace(input)
     # input - The String on which to operate.
     #
     # Returns the Integer word count.
-    def number_of_words(input)
-      cjk_chars = '\p{Han}\p{Katakana}\p{Hiragana}\p{Hangul}'
-      cjk_count = input.scan(Regexp.new("[#{cjk_chars}]")).length
-      return input.split.length if cjk_count.zero?
-
-      word_regex = "[^#{cjk_chars}\\s]+"
-      cjk_count + input.scan(Regexp.new(word_regex)).length
+    def number_of_words(input, mode = "default")
+      if mode == "cjk"
+        cjk_chars = '\p{Han}\p{Katakana}\p{Hiragana}\p{Hangul}'
+        cjk_count = input.scan(Regexp.new("[#{cjk_chars}]")).length
+        word_regex = "[^#{cjk_chars}\\s]+"
+        cjk_count + input.scan(Regexp.new(word_regex)).length
+      else
+        input.split.length
+      end
     end
 
     # Join an array of things into a string by separating with commas and the
diff --git a/test/test_filters.rb b/test/test_filters.rb
index 3054125ec8b..ac1aa37cadc 100644
--- a/test/test_filters.rb
+++ b/test/test_filters.rb
@@ -1516,16 +1516,20 @@ def to_liquid
 
     context "number_of_words filter" do
       should "return the number of words for Latin-only text" do
-        assert_equal 2, @filter.number_of_words("hello world!")
+        assert_equal 2, @filter.number_of_words("hello world!", "cjk")
       end
 
       should "return the number of characters for CJK-only text" do
-        assert_equal 6, @filter.number_of_words("你好，世界！")
+        assert_equal 6, @filter.number_of_words("你好，世界！", "cjk")
       end
 
       should "process Latin and CJK independently" do
         # Intentional: No space between Latin and CJK
-        assert_equal 6, @filter.number_of_words("你好hello世界world")
+        assert_equal 6, @filter.number_of_words("你好hello世界world", "cjk")
+      end
+
+      should "maintain original behavior unless specified" do
+        assert_equal 1, @filter.number_of_words("你好hello世界world")
       end
     end
   end

From 36fe3da60c8b7324cc00e257ca940f941eb62269 Mon Sep 17 00:00:00 2001
From: iBug <git@ibugone.com>
Date: Fri, 22 May 2020 20:18:07 +0800
Subject: [PATCH 11/14] Support auto mode, add Japanese and Korean test cases

---
 lib/jekyll/filters.rb | 13 ++++++++-----
 test/test_filters.rb  |  5 ++++-
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/lib/jekyll/filters.rb b/lib/jekyll/filters.rb
index 4025d70fff9..82239995ad7 100644
--- a/lib/jekyll/filters.rb
+++ b/lib/jekyll/filters.rb
@@ -122,11 +122,14 @@ def normalize_whitespace(input)
     #
     # Returns the Integer word count.
     def number_of_words(input, mode = "default")
-      if mode == "cjk"
-        cjk_chars = '\p{Han}\p{Katakana}\p{Hiragana}\p{Hangul}'
-        cjk_count = input.scan(Regexp.new("[#{cjk_chars}]")).length
-        word_regex = "[^#{cjk_chars}\\s]+"
-        cjk_count + input.scan(Regexp.new(word_regex)).length
+      cjk_regex = %r![\p{Han}\p{Katakana}\p{Hiragana}\p{Hangul}]!
+      word_regex = %r![^\p{Han}\p{Katakana}\p{Hiragana}\p{Hangul}\s]+!
+      case mode
+      when "cjk"
+        input.scan(cjk_regex).length + input.scan(word_regex).length
+      when "auto"
+        cjk_count = input.scan(cjk_regex).length
+        cjk_count.zero? ? input.split.length : cjk_count + input.scan(word_regex).length
       else
         input.split.length
       end
diff --git a/test/test_filters.rb b/test/test_filters.rb
index ac1aa37cadc..c39024acfc2 100644
--- a/test/test_filters.rb
+++ b/test/test_filters.rb
@@ -1516,15 +1516,18 @@ def to_liquid
 
     context "number_of_words filter" do
       should "return the number of words for Latin-only text" do
+        assert_equal 2, @filter.number_of_words("hello world!", "auto")
         assert_equal 2, @filter.number_of_words("hello world!", "cjk")
       end
 
       should "return the number of characters for CJK-only text" do
-        assert_equal 6, @filter.number_of_words("你好，世界！", "cjk")
+        assert_equal 17, @filter.number_of_words("こんにちは、世界！안녕하세요 세상!", "auto")
+        assert_equal 17, @filter.number_of_words("こんにちは、世界！안녕하세요 세상!", "cjk")
       end
 
       should "process Latin and CJK independently" do
         # Intentional: No space between Latin and CJK
+        assert_equal 6, @filter.number_of_words("你好hello世界world", "auto")
         assert_equal 6, @filter.number_of_words("你好hello世界world", "cjk")
       end
 

From 758949bb7611a4f2bed53e3bbb11fffc547c75bf Mon Sep 17 00:00:00 2001
From: iBug <git@ibugone.com>
Date: Fri, 22 May 2020 21:36:34 +0800
Subject: [PATCH 12/14] Update documentation to reflect #7813

---
 docs/_data/jekyll_filters.yml | 10 +++++++---
 test/test_filters.rb          |  4 ++--
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/docs/_data/jekyll_filters.yml b/docs/_data/jekyll_filters.yml
index c1ad14db196..1b0e1c6efd5 100644
--- a/docs/_data/jekyll_filters.yml
+++ b/docs/_data/jekyll_filters.yml
@@ -202,10 +202,14 @@
 - name: Number of Words
   description: >-
     Count the number of words in some text.
-    CJK characters are counted as one word per character instead of whitespace-separated.
+    An optional argument controlling the handling of CJK characters
+    may be specified as "auto" (auto-detect) or "cjk" (treat input as containing CJK),
+    where CJK characters are counted as one word per character instead of whitespace-separated.
   examples:
-    - input: '{{ page.content | number_of_words }}'
-      output: 1337
+    - input: '{{ "Hello world!" | number_of_words }}'
+      output: 2
+    - input: '{{ "你好，世界！" | number_of_words: "auto" }}'
+      output: 6
 
 #
 
diff --git a/test/test_filters.rb b/test/test_filters.rb
index c39024acfc2..f491a8d3f0d 100644
--- a/test/test_filters.rb
+++ b/test/test_filters.rb
@@ -1516,8 +1516,8 @@ def to_liquid
 
     context "number_of_words filter" do
       should "return the number of words for Latin-only text" do
-        assert_equal 2, @filter.number_of_words("hello world!", "auto")
-        assert_equal 2, @filter.number_of_words("hello world!", "cjk")
+        assert_equal 5, @filter.number_of_words("hello world and taoky strong!", "auto")
+        assert_equal 5, @filter.number_of_words("hello world and taoky strong!", "cjk")
       end
 
       should "return the number of characters for CJK-only text" do

From 676dd86e7ddf0e600aa5979f2d0e0bccc0cbc578 Mon Sep 17 00:00:00 2001
From: Ashwin Maroli <ashmaroli@users.noreply.github.com>
Date: Fri, 22 May 2020 19:54:43 +0530
Subject: [PATCH 13/14] Improve documentation of the change

---
 docs/_data/jekyll_filters.yml | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/docs/_data/jekyll_filters.yml b/docs/_data/jekyll_filters.yml
index 1b0e1c6efd5..f4727ed37ac 100644
--- a/docs/_data/jekyll_filters.yml
+++ b/docs/_data/jekyll_filters.yml
@@ -201,14 +201,23 @@
 
 - name: Number of Words
   description: >-
-    Count the number of words in some text.
-    An optional argument controlling the handling of CJK characters
-    may be specified as "auto" (auto-detect) or "cjk" (treat input as containing CJK),
-    where CJK characters are counted as one word per character instead of whitespace-separated.
+    Count the number of words in some text.<br/>
+    From <span class="version-badge">v4.1.0</span>, this filter takes an optional
+    argument to control the handling of Chinese-Japanese-Korean (CJK) characters
+    in the <code>input</code> string.<br/>
+    Passing <code>'cjk'</code> as the argument will count every CJK character
+    detected as one word irrespective of being separated by whitespace.<br/>
+    Passing <code>'auto'</code> (auto-detect) works similar to <code>'cjk'</code>
+    but is more performant if the filter is used on a variable string that may
+    or may not contain CJK chars.
   examples:
     - input: '{{ "Hello world!" | number_of_words }}'
       output: 2
-    - input: '{{ "你好，世界！" | number_of_words: "auto" }}'
+    - input: '{{ "你好hello世界world" | number_of_words }}'
+      output: 1
+    - input: '{{ "你好hello世界world" | number_of_words: "cjk" }}'
+      output: 6
+    - input: '{{ "你好hello世界world" | number_of_words: "auto" }}'
       output: 6
 
 #

From 25d13885c7bc0e04e1beeeef9b051e8cbb9bf1f9 Mon Sep 17 00:00:00 2001
From: Ashwin Maroli <ashmaroli@users.noreply.github.com>
Date: Fri, 22 May 2020 20:10:13 +0530
Subject: [PATCH 14/14] Reduce duplication in filter

---
 lib/jekyll/filters.rb | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/lib/jekyll/filters.rb b/lib/jekyll/filters.rb
index 82239995ad7..5f05029a428 100644
--- a/lib/jekyll/filters.rb
+++ b/lib/jekyll/filters.rb
@@ -121,9 +121,11 @@ def normalize_whitespace(input)
     # input - The String on which to operate.
     #
     # Returns the Integer word count.
-    def number_of_words(input, mode = "default")
-      cjk_regex = %r![\p{Han}\p{Katakana}\p{Hiragana}\p{Hangul}]!
-      word_regex = %r![^\p{Han}\p{Katakana}\p{Hiragana}\p{Hangul}\s]+!
+    def number_of_words(input, mode = nil)
+      cjk_charset = '\p{Han}\p{Katakana}\p{Hiragana}\p{Hangul}'
+      cjk_regex = %r![#{cjk_charset}]!o
+      word_regex = %r![^#{cjk_charset}\s]+!o
+
       case mode
       when "cjk"
         input.scan(cjk_regex).length + input.scan(word_regex).length