diff --git a/docs/_data/jekyll_filters.yml b/docs/_data/jekyll_filters.yml index 5e036feb320..f4727ed37ac 100644 --- a/docs/_data/jekyll_filters.yml +++ b/docs/_data/jekyll_filters.yml @@ -200,10 +200,25 @@ # - name: Number of Words - description: Count the number of words in some text. + description: >- + Count the number of words in some text.
+ From v4.1.0, this filter takes an optional + argument to control the handling of Chinese-Japanese-Korean (CJK) characters + in the input string.
+ Passing 'cjk' as the argument will count every CJK character + detected as one word irrespective of being separated by whitespace.
# Count the number of words in the input string.
#
# input - The String on which to operate. Any other value is converted with
#         `to_s`, so `nil` counts as zero words instead of raising.
# mode  - (optional) String selecting how CJK text is counted:
#         "cjk"  - every Han, Katakana, Hiragana or Hangul character counts
#                  as one word; every other run of non-whitespace characters
#                  counts as one word.
#         "auto" - same result as "cjk" when the input contains at least one
#                  CJK character, otherwise a plain whitespace split.
#         Any other value (including nil) - plain whitespace split.
#
# Returns the Integer word count.
def number_of_words(input, mode = nil)
  text = input.to_s
  cjk_charset = '\p{Han}\p{Katakana}\p{Hiragana}\p{Hangul}'
  cjk_regex = %r![#{cjk_charset}]!o
  # A token is either a single CJK character or a maximal run of characters
  # that are neither CJK nor whitespace. The two alternatives are disjoint,
  # so one pass yields the same total as counting each kind separately.
  token_regex = %r![#{cjk_charset}]|[^#{cjk_charset}\s]+!o

  case mode
  when "cjk"
    text.scan(token_regex).length
  when "auto"
    # Only presence matters here, so stop at the first CJK character rather
    # than collecting all of them before deciding how to count.
    cjk_regex.match?(text) ? text.scan(token_regex).length : text.split.length
  else
    text.split.length
  end
end