Skip to content

Commit

Permalink
Make number_of_words respect CJK characters (#7813)
Browse files Browse the repository at this point in the history
Merge pull request 7813
  • Loading branch information
iBug committed May 22, 2020
1 parent 2e80c55 commit 13b7291
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 5 deletions.
21 changes: 18 additions & 3 deletions docs/_data/jekyll_filters.yml
Expand Up @@ -200,10 +200,25 @@
#

- name: Number of Words
description: Count the number of words in some text.
description: >-
Count the number of words in some text.<br/>
From <span class="version-badge">v4.1.0</span>, this filter takes an optional
argument to control the handling of Chinese-Japanese-Korean (CJK) characters
in the <code>input</code> string.<br/>
Passing <code>'cjk'</code> as the argument will count every CJK character
detected as one word irrespective of being separated by whitespace.<br/>
Passing <code>'auto'</code> (auto-detect) works similarly to <code>'cjk'</code>
but is more performant if the filter is used on a variable string that may
or may not contain CJK chars.
examples:
- input: '{{ page.content | number_of_words }}'
output: 1337
- input: '{{ "Hello world!" | number_of_words }}'
output: 2
- input: '{{ "你好hello世界world" | number_of_words }}'
output: 1
- input: '{{ "你好hello世界world" | number_of_words: "cjk" }}'
output: 6
- input: '{{ "你好hello世界world" | number_of_words: "auto" }}'
output: 6

#

Expand Down
16 changes: 14 additions & 2 deletions lib/jekyll/filters.rb
Expand Up @@ -121,8 +121,20 @@ def normalize_whitespace(input)
# input - The String on which to operate.
#
# Returns the Integer word count.
def number_of_words(input)
input.split.length
# Count the number of words in the input.
#
# input - The String on which to operate. Any other object, including nil,
#         is converted with `to_s` first, so nil counts as zero words
#         instead of raising NoMethodError.
# mode  - Optional String selecting how CJK characters are counted
#         (default: nil):
#         "cjk"  - every Han, Katakana, Hiragana or Hangul character counts
#                  as one word, and every run of characters that are
#                  neither CJK nor whitespace counts as one word.
#         "auto" - same result as "cjk" when at least one CJK character is
#                  present; otherwise a plain whitespace split is used.
#         Any other value counts whitespace-separated tokens only.
#
# Returns the Integer word count.
def number_of_words(input, mode = nil)
  # Coerce once so every branch below can rely on String methods.
  text = input.to_s

  cjk_charset = '\p{Han}\p{Katakana}\p{Hiragana}\p{Hangul}'
  # The `o` flag interpolates the charset only once per regex literal.
  cjk_regex = %r![#{cjk_charset}]!o
  word_regex = %r![^#{cjk_charset}\s]+!o

  case mode
  when "cjk"
    text.scan(cjk_regex).length + text.scan(word_regex).length
  when "auto"
    cjk_count = text.scan(cjk_regex).length
    # Without any CJK character the cheaper whitespace split is sufficient.
    cjk_count.zero? ? text.split.length : cjk_count + text.scan(word_regex).length
  else
    text.split.length
  end
end

# Join an array of things into a string by separating with commas and the
Expand Down
22 changes: 22 additions & 0 deletions test/test_filters.rb
Expand Up @@ -1513,5 +1513,27 @@ def to_liquid
end
end
end

# Exercises Filters#number_of_words in its default mode and in both
# CJK-aware modes ("auto" and "cjk").
context "number_of_words filter" do
  should "return the number of words for Latin-only text" do
    latin_text = "hello world and taoky strong!"
    %w(auto cjk).each do |mode|
      assert_equal 5, @filter.number_of_words(latin_text, mode)
    end
  end

  should "return the number of characters for CJK-only text" do
    cjk_text = "こんにちは、世界!안녕하세요 세상!"
    %w(auto cjk).each do |mode|
      assert_equal 17, @filter.number_of_words(cjk_text, mode)
    end
  end

  should "process Latin and CJK independently" do
    # Intentional: No space between Latin and CJK
    mixed_text = "你好hello世界world"
    %w(auto cjk).each do |mode|
      assert_equal 6, @filter.number_of_words(mixed_text, mode)
    end
  end

  should "maintain original behavior unless specified" do
    # Without a mode argument the whole unspaced string is a single token.
    word_count = @filter.number_of_words("你好hello世界world")
    assert_equal 1, word_count
  end
end
end
end

0 comments on commit 13b7291

Please sign in to comment.