From afbee531e009e197c6492e7241448a292792d446 Mon Sep 17 00:00:00 2001 From: iBug Date: Thu, 5 Sep 2019 18:48:09 +0800 Subject: [PATCH 01/14] Make number_of_words respect CJK characters CJK languages generally consider every single character a word. This commit changed Liquid filter "number_of_words" so it counts CJK characters correctly. --- lib/jekyll/filters.rb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/jekyll/filters.rb b/lib/jekyll/filters.rb index e52eb7d5d0d..faf36e17337 100644 --- a/lib/jekyll/filters.rb +++ b/lib/jekyll/filters.rb @@ -122,7 +122,8 @@ def normalize_whitespace(input) # # Returns the Integer word count. def number_of_words(input) - input.split.length + CJK_REGEX = /\p{Han}|\p{Katakana}|\p{Hiragana}|\p{Hangul}/ + input.scan(CJK_REGEX).length + input.gsub(CJK_REGEX, ' ').split.length end # Join an array of things into a string by separating with commas and the From 3cc01b1e5f9a669f25d07851a53b50278fe35684 Mon Sep 17 00:00:00 2001 From: iBug Date: Thu, 5 Sep 2019 21:05:55 +0800 Subject: [PATCH 02/14] Maybe use lowercase for that variable? --- lib/jekyll/filters.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/jekyll/filters.rb b/lib/jekyll/filters.rb index faf36e17337..aca19a87d16 100644 --- a/lib/jekyll/filters.rb +++ b/lib/jekyll/filters.rb @@ -122,8 +122,8 @@ def normalize_whitespace(input) # # Returns the Integer word count. def number_of_words(input) - CJK_REGEX = /\p{Han}|\p{Katakana}|\p{Hiragana}|\p{Hangul}/ - input.scan(CJK_REGEX).length + input.gsub(CJK_REGEX, ' ').split.length + cjk_regex = /\p{Han}|\p{Katakana}|\p{Hiragana}|\p{Hangul}/ + input.scan(cjk_regex).length + input.gsub(cjk_regex, ' ').split.length end # Join an array of things into a string by separating with commas and the From 1d372034178dbf5a4f80a5db680c8a6082078c98 Mon Sep 17 00:00:00 2001 From: iBug Date: Thu, 5 Sep 2019 21:13:44 +0800 Subject: [PATCH 03/14] RuboCop is hard to satisfy... 
--- lib/jekyll/filters.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/jekyll/filters.rb b/lib/jekyll/filters.rb index aca19a87d16..477578da735 100644 --- a/lib/jekyll/filters.rb +++ b/lib/jekyll/filters.rb @@ -122,8 +122,8 @@ def normalize_whitespace(input) # # Returns the Integer word count. def number_of_words(input) - cjk_regex = /\p{Han}|\p{Katakana}|\p{Hiragana}|\p{Hangul}/ - input.scan(cjk_regex).length + input.gsub(cjk_regex, ' ').split.length + cjk_regex = %r(\p{Han}|\p{Katakana}|\p{Hiragana}|\p{Hangul}) + input.scan(cjk_regex).length + input.gsub(cjk_regex, " ").split.length end # Join an array of things into a string by separating with commas and the From 6a33254a3253b1424f7f0dc63e42c251616455f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?iBug=20=E2=99=A6?= Date: Thu, 5 Sep 2019 22:30:38 +0800 Subject: [PATCH 04/14] Use exclamation marks for %r!regex! to make RuboCop happy Co-Authored-By: Matt Rogers --- lib/jekyll/filters.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/jekyll/filters.rb b/lib/jekyll/filters.rb index 477578da735..da6c19cde10 100644 --- a/lib/jekyll/filters.rb +++ b/lib/jekyll/filters.rb @@ -122,7 +122,7 @@ def normalize_whitespace(input) # # Returns the Integer word count. def number_of_words(input) - cjk_regex = %r(\p{Han}|\p{Katakana}|\p{Hiragana}|\p{Hangul}) + cjk_regex = %r!\p{Han}|\p{Katakana}|\p{Hiragana}|\p{Hangul}! 
input.scan(cjk_regex).length + input.gsub(cjk_regex, " ").split.length end From 1a6af4293027716fc38e2f01023abb8498bbec6b Mon Sep 17 00:00:00 2001 From: iBug Date: Thu, 5 Sep 2019 22:32:23 +0800 Subject: [PATCH 05/14] Update documentation to reflect changes --- docs/_data/jekyll_filters.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/_data/jekyll_filters.yml b/docs/_data/jekyll_filters.yml index 27089e738f2..90b485074fb 100644 --- a/docs/_data/jekyll_filters.yml +++ b/docs/_data/jekyll_filters.yml @@ -166,7 +166,9 @@ # - name: Number of Words - description: Count the number of words in some text. + description: >- + Count the number of words in some text. + CJK characters are counted as one word per character instead of whitespace-separated. examples: - input: '{{ page.content | number_of_words }}' output: 1337 From 360ecdabf82e61e3dbaa77316598ade412316ffb Mon Sep 17 00:00:00 2001 From: iBug Date: Sat, 7 Mar 2020 17:11:33 +0800 Subject: [PATCH 06/14] Add test for patched number_of_words filter --- test/test_filters.rb | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/test/test_filters.rb b/test/test_filters.rb index 19f841eed25..f0ec6c9a1cc 100644 --- a/test/test_filters.rb +++ b/test/test_filters.rb @@ -1273,5 +1273,20 @@ def to_liquid end end end + + context "number_of_words filter" do + should "return the number of words for Latin-only text" do + assert_equal 2, @filter.number_of_words("hello world!") + end + + should "return the number of characters for CJK-only text" do + assert_equal 6, @filter.number_of_words("你好,世界!") + end + + should "process Latin and CJK independently" do + # Intentional: No space between Latin and CJK + assert_equal 6, @filter.number_of_words("你好hello世界world") + end + end end end From 28da2dc37b22104e112c79c450782125e0762cef Mon Sep 17 00:00:00 2001 From: iBug Date: Fri, 22 May 2020 01:40:54 +0800 Subject: [PATCH 07/14] Fix according to ashmaroli's suggestions - Split with CJK characters 
only when present - Avoid bloating from String#gsub by using String#split with regex directly --- lib/jekyll/filters.rb | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/jekyll/filters.rb b/lib/jekyll/filters.rb index da6c19cde10..ab362a13b96 100644 --- a/lib/jekyll/filters.rb +++ b/lib/jekyll/filters.rb @@ -123,7 +123,10 @@ def normalize_whitespace(input) # Returns the Integer word count. def number_of_words(input) cjk_regex = %r!\p{Han}|\p{Katakana}|\p{Hiragana}|\p{Hangul}! - input.scan(cjk_regex).length + input.gsub(cjk_regex, " ").split.length + cjk_count = input.scan(cjk_regex).length + return input.split.length if cjk_count == 0 + cjk_and_ws_regex = Regexp.union cjk_regex, %r!\s+! + cjk_count + input.split(cjk_and_ws_regex).length end # Join an array of things into a string by separating with commas and the From 9c468612f6018cf04aba72d36507df87bcb62ee6 Mon Sep 17 00:00:00 2001 From: iBug Date: Fri, 22 May 2020 01:55:28 +0800 Subject: [PATCH 08/14] Using single-character regexes without repetition is erroneous --- lib/jekyll/filters.rb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/jekyll/filters.rb b/lib/jekyll/filters.rb index ab362a13b96..bfdfa6a3348 100644 --- a/lib/jekyll/filters.rb +++ b/lib/jekyll/filters.rb @@ -122,11 +122,11 @@ def normalize_whitespace(input) # # Returns the Integer word count. def number_of_words(input) - cjk_regex = %r!\p{Han}|\p{Katakana}|\p{Hiragana}|\p{Hangul}! - cjk_count = input.scan(cjk_regex).length + cjk_regex = '\p{Han}|\p{Katakana}|\p{Hiragana}|\p{Hangul}' + cjk_count = input.scan(Regexp.new cjk_regex).length return input.split.length if cjk_count == 0 - cjk_and_ws_regex = Regexp.union cjk_regex, %r!\s+! 
- cjk_count + input.split(cjk_and_ws_regex).length + cjk_and_ws_regex = "(#{cjk_regex}|\\s)+" + cjk_count + input.split(Regexp.new cjk_and_ws_regex).length end # Join an array of things into a string by separating with commas and the From d0ba9794247eb9ebaed2a4730ea58b1ce0ae7bd2 Mon Sep 17 00:00:00 2001 From: iBug Date: Fri, 22 May 2020 02:04:59 +0800 Subject: [PATCH 09/14] Bundle exec rake test passing --- lib/jekyll/filters.rb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/jekyll/filters.rb b/lib/jekyll/filters.rb index bfdfa6a3348..f09d74f73b9 100644 --- a/lib/jekyll/filters.rb +++ b/lib/jekyll/filters.rb @@ -122,11 +122,11 @@ def normalize_whitespace(input) # # Returns the Integer word count. def number_of_words(input) - cjk_regex = '\p{Han}|\p{Katakana}|\p{Hiragana}|\p{Hangul}' - cjk_count = input.scan(Regexp.new cjk_regex).length + cjk_chars = '\p{Han}\p{Katakana}\p{Hiragana}\p{Hangul}' + cjk_count = input.scan(Regexp.new "[#{cjk_chars}]").length return input.split.length if cjk_count == 0 - cjk_and_ws_regex = "(#{cjk_regex}|\\s)+" - cjk_count + input.split(Regexp.new cjk_and_ws_regex).length + word_regex = "[^#{cjk_chars}\\s]+" + cjk_count + input.scan(Regexp.new word_regex).length end # Join an array of things into a string by separating with commas and the From 21bf5a5dbd64fba88de9dce6c8b720f81fd97872 Mon Sep 17 00:00:00 2001 From: iBug Date: Fri, 22 May 2020 15:40:41 +0800 Subject: [PATCH 10/14] Make CJK processing optional, adapt tests --- lib/jekyll/filters.rb | 16 +++++++++------- test/test_filters.rb | 10 +++++++--- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/lib/jekyll/filters.rb b/lib/jekyll/filters.rb index 82f432b1fd3..4025d70fff9 100644 --- a/lib/jekyll/filters.rb +++ b/lib/jekyll/filters.rb @@ -121,13 +121,15 @@ def normalize_whitespace(input) # input - The String on which to operate. # # Returns the Integer word count. 
- def number_of_words(input) - cjk_chars = '\p{Han}\p{Katakana}\p{Hiragana}\p{Hangul}' - cjk_count = input.scan(Regexp.new("[#{cjk_chars}]")).length - return input.split.length if cjk_count.zero? - - word_regex = "[^#{cjk_chars}\\s]+" - cjk_count + input.scan(Regexp.new(word_regex)).length + def number_of_words(input, mode = "default") + if mode == "cjk" + cjk_chars = '\p{Han}\p{Katakana}\p{Hiragana}\p{Hangul}' + cjk_count = input.scan(Regexp.new("[#{cjk_chars}]")).length + word_regex = "[^#{cjk_chars}\\s]+" + cjk_count + input.scan(Regexp.new(word_regex)).length + else + input.split.length + end end # Join an array of things into a string by separating with commas and the diff --git a/test/test_filters.rb b/test/test_filters.rb index 3054125ec8b..ac1aa37cadc 100644 --- a/test/test_filters.rb +++ b/test/test_filters.rb @@ -1516,16 +1516,20 @@ def to_liquid context "number_of_words filter" do should "return the number of words for Latin-only text" do - assert_equal 2, @filter.number_of_words("hello world!") + assert_equal 2, @filter.number_of_words("hello world!", "cjk") end should "return the number of characters for CJK-only text" do - assert_equal 6, @filter.number_of_words("你好,世界!") + assert_equal 6, @filter.number_of_words("你好,世界!", "cjk") end should "process Latin and CJK independently" do # Intentional: No space between Latin and CJK - assert_equal 6, @filter.number_of_words("你好hello世界world") + assert_equal 6, @filter.number_of_words("你好hello世界world", "cjk") + end + + should "maintain original behavior unless specified" do + assert_equal 1, @filter.number_of_words("你好hello世界world") end end end From 36fe3da60c8b7324cc00e257ca940f941eb62269 Mon Sep 17 00:00:00 2001 From: iBug Date: Fri, 22 May 2020 20:18:07 +0800 Subject: [PATCH 11/14] Support auto mode, add Japanese and Korean test cases --- lib/jekyll/filters.rb | 13 ++++++++----- test/test_filters.rb | 5 ++++- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/lib/jekyll/filters.rb 
b/lib/jekyll/filters.rb index 4025d70fff9..82239995ad7 100644 --- a/lib/jekyll/filters.rb +++ b/lib/jekyll/filters.rb @@ -122,11 +122,14 @@ def normalize_whitespace(input) # # Returns the Integer word count. def number_of_words(input, mode = "default") - if mode == "cjk" - cjk_chars = '\p{Han}\p{Katakana}\p{Hiragana}\p{Hangul}' - cjk_count = input.scan(Regexp.new("[#{cjk_chars}]")).length - word_regex = "[^#{cjk_chars}\\s]+" - cjk_count + input.scan(Regexp.new(word_regex)).length + cjk_regex = %r![\p{Han}\p{Katakana}\p{Hiragana}\p{Hangul}]! + word_regex = %r![^\p{Han}\p{Katakana}\p{Hiragana}\p{Hangul}\s]+! + case mode + when "cjk" + input.scan(cjk_regex).length + input.scan(word_regex).length + when "auto" + cjk_count = input.scan(cjk_regex).length + cjk_count.zero? ? input.split.length : cjk_count + input.scan(word_regex).length else input.split.length end diff --git a/test/test_filters.rb b/test/test_filters.rb index ac1aa37cadc..c39024acfc2 100644 --- a/test/test_filters.rb +++ b/test/test_filters.rb @@ -1516,15 +1516,18 @@ def to_liquid context "number_of_words filter" do should "return the number of words for Latin-only text" do + assert_equal 2, @filter.number_of_words("hello world!", "auto") assert_equal 2, @filter.number_of_words("hello world!", "cjk") end should "return the number of characters for CJK-only text" do - assert_equal 6, @filter.number_of_words("你好,世界!", "cjk") + assert_equal 17, @filter.number_of_words("こんにちは、世界!안녕하세요 세상!", "auto") + assert_equal 17, @filter.number_of_words("こんにちは、世界!안녕하세요 세상!", "cjk") end should "process Latin and CJK independently" do # Intentional: No space between Latin and CJK + assert_equal 6, @filter.number_of_words("你好hello世界world", "auto") assert_equal 6, @filter.number_of_words("你好hello世界world", "cjk") end From 758949bb7611a4f2bed53e3bbb11fffc547c75bf Mon Sep 17 00:00:00 2001 From: iBug Date: Fri, 22 May 2020 21:36:34 +0800 Subject: [PATCH 12/14] Update documentation to reflect #7813 --- 
docs/_data/jekyll_filters.yml | 10 +++++++--- test/test_filters.rb | 4 ++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/docs/_data/jekyll_filters.yml b/docs/_data/jekyll_filters.yml index c1ad14db196..1b0e1c6efd5 100644 --- a/docs/_data/jekyll_filters.yml +++ b/docs/_data/jekyll_filters.yml @@ -202,10 +202,14 @@ - name: Number of Words description: >- Count the number of words in some text. - CJK characters are counted as one word per character instead of whitespace-separated. + An optional argument controlling the handling of CJK characters + may be specified as "auto" (auto-detect) or "cjk" (treat input as containing CJK), + where CJK characters are counted as one word per character instead of whitespace-separated. examples: - - input: '{{ page.content | number_of_words }}' - output: 1337 + - input: '{{ "Hello world!" | number_of_words }}' + output: 2 + - input: '{{ "你好,世界!" | number_of_words: "auto" }}' + output: 6 # diff --git a/test/test_filters.rb b/test/test_filters.rb index c39024acfc2..f491a8d3f0d 100644 --- a/test/test_filters.rb +++ b/test/test_filters.rb @@ -1516,8 +1516,8 @@ def to_liquid context "number_of_words filter" do should "return the number of words for Latin-only text" do - assert_equal 2, @filter.number_of_words("hello world!", "auto") - assert_equal 2, @filter.number_of_words("hello world!", "cjk") + assert_equal 5, @filter.number_of_words("hello world and taoky strong!", "auto") + assert_equal 5, @filter.number_of_words("hello world and taoky strong!", "cjk") end should "return the number of characters for CJK-only text" do From 676dd86e7ddf0e600aa5979f2d0e0bccc0cbc578 Mon Sep 17 00:00:00 2001 From: Ashwin Maroli Date: Fri, 22 May 2020 19:54:43 +0530 Subject: [PATCH 13/14] Improve documentation of the change --- docs/_data/jekyll_filters.yml | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/docs/_data/jekyll_filters.yml b/docs/_data/jekyll_filters.yml index 1b0e1c6efd5..f4727ed37ac 
100644 --- a/docs/_data/jekyll_filters.yml +++ b/docs/_data/jekyll_filters.yml @@ -201,14 +201,23 @@ - name: Number of Words description: >- - Count the number of words in some text. - An optional argument controlling the handling of CJK characters - may be specified as "auto" (auto-detect) or "cjk" (treat input as containing CJK), - where CJK characters are counted as one word per character instead of whitespace-separated. + Count the number of words in some text.
+ From v4.1.0, this filter takes an optional + argument to control the handling of Chinese-Japanese-Korean (CJK) characters + in the input string.
+ Passing 'cjk' as the argument will count every CJK character + detected as one word irrespective of being separated by whitespace.
+ Passing 'auto' (auto-detect) works similarly to 'cjk' + but is more performant if the filter is used on a variable string that may + or may not contain CJK chars. examples: - input: '{{ "Hello world!" | number_of_words }}' output: 2 - - input: '{{ "你好,世界!" | number_of_words: "auto" }}' + - input: '{{ "你好hello世界world" | number_of_words }}' + output: 1 + - input: '{{ "你好hello世界world" | number_of_words: "cjk" }}' + output: 6 + - input: '{{ "你好hello世界world" | number_of_words: "auto" }}' output: 6 # From 25d13885c7bc0e04e1beeeef9b051e8cbb9bf1f9 Mon Sep 17 00:00:00 2001 From: Ashwin Maroli Date: Fri, 22 May 2020 20:10:13 +0530 Subject: [PATCH 14/14] Reduce duplication in filter --- lib/jekyll/filters.rb | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lib/jekyll/filters.rb b/lib/jekyll/filters.rb index 82239995ad7..5f05029a428 100644 --- a/lib/jekyll/filters.rb +++ b/lib/jekyll/filters.rb @@ -121,9 +121,11 @@ def normalize_whitespace(input) # input - The String on which to operate. # # Returns the Integer word count. - def number_of_words(input, mode = "default") - cjk_regex = %r![\p{Han}\p{Katakana}\p{Hiragana}\p{Hangul}]! - word_regex = %r![^\p{Han}\p{Katakana}\p{Hiragana}\p{Hangul}\s]+! + def number_of_words(input, mode = nil) + cjk_charset = '\p{Han}\p{Katakana}\p{Hiragana}\p{Hangul}' + cjk_regex = %r![#{cjk_charset}]!o + word_regex = %r![^#{cjk_charset}\s]+!o + case mode when "cjk" input.scan(cjk_regex).length + input.scan(word_regex).length