jekyll · jekyllbot · May 22, 2020 · Sep 5, 2019 · Sep 5, 2019 · Sep 5, 2019
diff --git a/docs/_data/jekyll_filters.yml b/docs/_data/jekyll_filters.yml
@@ -200,10 +200,25 @@
 #
 
 - name: Number of Words
-  description: Count the number of words in some text.
+  description: >-
+    Count the number of words in some text.<br/>
+    From <span class="version-badge">v4.1.0</span>, this filter takes an optional
+    argument to control the handling of Chinese-Japanese-Korean (CJK) characters
+    in the <code>input</code> string.<br/>
+    Passing <code>'cjk'</code> as the argument will count every CJK character
+    detected as one word irrespective of being separated by whitespace.<br/>
+    Passing <code>'auto'</code> (auto-detect) works similarly to <code>'cjk'</code>
+    but is more performant if the filter is used on a variable string that may
+    or may not contain CJK characters.
   examples:
-    - input: '{{ page.content | number_of_words }}'
-      output: 1337
+    - input: '{{ "Hello world!" | number_of_words }}'
+      output: 2
+    - input: '{{ "你好hello世界world" | number_of_words }}'
+      output: 1
+    - input: '{{ "你好hello世界world" | number_of_words: "cjk" }}'
+      output: 6
+    - input: '{{ "你好hello世界world" | number_of_words: "auto" }}'
+      output: 6
 
 #
 

diff --git a/lib/jekyll/filters.rb b/lib/jekyll/filters.rb
@@ -121,8 +121,20 @@ def normalize_whitespace(input)
     # input - The String on which to operate.
     #
     # Returns the Integer word count.
-    def number_of_words(input)
-      input.split.length
+    def number_of_words(input, mode = nil) # mode: nil, "cjk" or "auto"
+      cjk_charset = '\p{Han}\p{Katakana}\p{Hiragana}\p{Hangul}' # Unicode script classes
+      cjk_regex = %r![#{cjk_charset}]!o
+      word_regex = %r![^#{cjk_charset}\s]+!o
+
+      case mode
+      when "cjk" # each CJK char is a word; each other non-space run is a word
+        input.scan(cjk_regex).length + input.scan(word_regex).length
+      when "auto" # like "cjk", but plain split when no CJK char is present
+        cjk_count = input.scan(cjk_regex).length
+        cjk_count.zero? ? input.split.length : cjk_count + input.scan(word_regex).length
+      else # nil or unrecognized mode: whitespace-separated tokens
+        input.split.length
+      end
     end
 
     # Join an array of things into a string by separating with commas and the

diff --git a/test/test_filters.rb b/test/test_filters.rb
@@ -1513,5 +1513,27 @@ def to_liquid
         end
       end
     end
+
+    context "number_of_words filter" do
+      should "return the number of words for Latin-only text" do # no CJK: 5 space-separated tokens
+        assert_equal 5, @filter.number_of_words("hello world and taoky strong!", "auto")
+        assert_equal 5, @filter.number_of_words("hello world and taoky strong!", "cjk")
+      end
+
+      should "return the number of characters for CJK-only text" do # 14 CJK + 3 punctuation runs
+        assert_equal 17, @filter.number_of_words("こんにちは、世界！안녕하세요 세상!", "auto")
+        assert_equal 17, @filter.number_of_words("こんにちは、世界！안녕하세요 세상!", "cjk")
+      end
+
+      should "process Latin and CJK independently" do
+        # Intentional: No space between Latin and CJK (4 CJK chars + 2 Latin runs = 6)
+        assert_equal 6, @filter.number_of_words("你好hello世界world", "auto")
+        assert_equal 6, @filter.number_of_words("你好hello世界world", "cjk")
+      end
+
+      should "maintain original behavior unless specified" do # no mode: whitespace split only
+        assert_equal 1, @filter.number_of_words("你好hello世界world")
+      end
+    end
   end
 end