diff --git a/docs/_data/jekyll_filters.yml b/docs/_data/jekyll_filters.yml
index 5e036feb320..f4727ed37ac 100644
--- a/docs/_data/jekyll_filters.yml
+++ b/docs/_data/jekyll_filters.yml
@@ -200,10 +200,25 @@
#
- name: Number of Words
- description: Count the number of words in some text.
+ description: >-
+ Count the number of words in some text.
+ From v4.1.0, this filter takes an optional
+ argument to control the handling of Chinese-Japanese-Korean (CJK) characters
+ in the input string.
+ Passing 'cjk' as the argument will count every CJK character
+ detected as one word irrespective of being separated by whitespace.
+ Passing 'auto' (auto-detect) works similarly to 'cjk'
+ but is more performant if the filter is used on a variable string that may
+ or may not contain CJK chars.
examples:
- - input: '{{ page.content | number_of_words }}'
- output: 1337
+ - input: '{{ "Hello world!" | number_of_words }}'
+ output: 2
+ - input: '{{ "你好hello世界world" | number_of_words }}'
+ output: 1
+ - input: '{{ "你好hello世界world" | number_of_words: "cjk" }}'
+ output: 6
+ - input: '{{ "你好hello世界world" | number_of_words: "auto" }}'
+ output: 6
#
diff --git a/lib/jekyll/filters.rb b/lib/jekyll/filters.rb
index a3c6312c485..5f05029a428 100644
--- a/lib/jekyll/filters.rb
+++ b/lib/jekyll/filters.rb
@@ -121,8 +121,20 @@ def normalize_whitespace(input)
# input - The String on which to operate.
#
# Returns the Integer word count.
- def number_of_words(input)
- input.split.length
+ def number_of_words(input, mode = nil) # mode: nil (default), "cjk" or "auto"; any other value behaves like nil
+ cjk_charset = '\p{Han}\p{Katakana}\p{Hiragana}\p{Hangul}' # Unicode script properties treated as CJK
+ cjk_regex = %r![#{cjk_charset}]!o # matches a single CJK character
+ word_regex = %r![^#{cjk_charset}\s]+!o # matches a run of characters that are neither CJK nor whitespace
+ # The `o` flag on both regexes makes Ruby interpolate cjk_charset only once.
+ case mode
+ when "cjk"
+ input.scan(cjk_regex).length + input.scan(word_regex).length # every CJK character is one word, plus one word per non-CJK run
+ when "auto"
+ cjk_count = input.scan(cjk_regex).length
+ cjk_count.zero? ? input.split.length : cjk_count + input.scan(word_regex).length # no CJK found: fall back to the plain whitespace split
+ else
+ input.split.length # original behavior: count whitespace-separated tokens
+ end
end
# Join an array of things into a string by separating with commas and the
diff --git a/test/test_filters.rb b/test/test_filters.rb
index 423d472ed70..f491a8d3f0d 100644
--- a/test/test_filters.rb
+++ b/test/test_filters.rb
@@ -1513,5 +1513,27 @@ def to_liquid
end
end
end
+
+ context "number_of_words filter" do
+ should "return the number of words for Latin-only text" do
+ assert_equal 5, @filter.number_of_words("hello world and taoky strong!", "auto") # no CJK characters, so "auto" takes the whitespace-split path
+ assert_equal 5, @filter.number_of_words("hello world and taoky strong!", "cjk") # five non-CJK runs; "strong!" counts as a single word
+ end
+
+ should "return the number of characters for CJK-only text" do
+ assert_equal 17, @filter.number_of_words("こんにちは、世界!안녕하세요 세상!", "auto") # 14 CJK characters + 3 punctuation runs, each counted as a word
+ assert_equal 17, @filter.number_of_words("こんにちは、世界!안녕하세요 세상!", "cjk") # same total: both modes count identically once CJK is present
+ end
+
+ should "process Latin and CJK independently" do
+ # Intentional: No space between Latin and CJK
+ assert_equal 6, @filter.number_of_words("你好hello世界world", "auto") # 4 CJK characters + 2 Latin runs ("hello", "world")
+ assert_equal 6, @filter.number_of_words("你好hello世界world", "cjk")
+ end
+
+ should "maintain original behavior unless specified" do
+ assert_equal 1, @filter.number_of_words("你好hello世界world") # no mode given: the string has no whitespace, so it is one token
+ end
+ end
end
end