From 2fff371df54fdf4064a36eca58cd549d9f07ff38 Mon Sep 17 00:00:00 2001 From: Adrien Rey-Jarthon Date: Tue, 14 Feb 2023 00:00:35 +0100 Subject: [PATCH] Use ruby unicode normalize to avoid libidn C problems and heavy legacy ruby code --- benchmark/unicode_normalize.rb | 33 ++++++ lib/addressable/idna/native.rb | 2 +- lib/addressable/idna/pure.rb | 183 +-------------------------------- lib/addressable/uri.rb | 2 +- spec/addressable/idna_spec.rb | 7 ++ spec/addressable/uri_spec.rb | 8 ++ 6 files changed, 54 insertions(+), 181 deletions(-) create mode 100644 benchmark/unicode_normalize.rb diff --git a/benchmark/unicode_normalize.rb b/benchmark/unicode_normalize.rb new file mode 100644 index 00000000..cbcc8aee --- /dev/null +++ b/benchmark/unicode_normalize.rb @@ -0,0 +1,33 @@ +#!/usr/bin/env ruby + +require 'benchmark' +require 'addressable/idna/pure.rb' +require 'idn' + +value = "fiᆵリ宠퐱卄.com" +expected = "fiᆵリ宠퐱卄.com" +N = 100_000 + +expected === value.unicode_normalize(:nfkc) or fail "ruby normalize does not match" +expected === IDN::Stringprep.nfkc_normalize(value) or fail "libidn normalize does not match" +expected === Addressable::IDNA.unicode_normalize_kc(value) or fail "addressable normalize does not match" + +Benchmark.bmbm do |x| + x.report("pure") { N.times { Addressable::IDNA.unicode_normalize_kc(value) } } + x.report("libidn") { N.times { IDN::Stringprep.nfkc_normalize(value) } } + x.report("ruby") { N.times { value.unicode_normalize(:nfkc) } } +end + +# February 14th 2023, before replacing the legacy pure normalize code: + +# > ruby benchmark/unicode_normalize.rb +# Rehearsal ------------------------------------------ +# pure 1.335230 0.000315 1.335545 ( 1.335657) +# libidn 0.058568 0.000000 0.058568 ( 0.058570) +# ruby 0.326008 0.000014 0.326022 ( 0.326026) +# --------------------------------- total: 1.720135sec + +# user system total real +# pure 1.325948 0.000000 1.325948 ( 1.326054) +# libidn 0.058067 0.000000 0.058067 ( 0.058069) +# ruby 0.325062 
0.000000 0.325062 ( 0.325115) \ No newline at end of file diff --git a/lib/addressable/idna/native.rb b/lib/addressable/idna/native.rb index 302e1b0c..dbc55d06 100644 --- a/lib/addressable/idna/native.rb +++ b/lib/addressable/idna/native.rb @@ -30,7 +30,7 @@ def self.punycode_decode(value) end def self.unicode_normalize_kc(value) - IDN::Stringprep.nfkc_normalize(value.to_s) + value.to_s.unicode_normalize(:nfkc) end def self.to_ascii(value) diff --git a/lib/addressable/idna/pure.rb b/lib/addressable/idna/pure.rb index a7c796e3..549b07e7 100644 --- a/lib/addressable/idna/pure.rb +++ b/lib/addressable/idna/pure.rb @@ -67,6 +67,8 @@ module IDNA def self.to_ascii(input) input = input.to_s unless input.is_a?(String) input = input.dup + input.force_encoding(Encoding::UTF_8) + input = unicode_normalize_kc(input.dup) if input.respond_to?(:force_encoding) input.force_encoding(Encoding::ASCII_8BIT) end @@ -77,7 +79,7 @@ def self.to_ascii(input) part.force_encoding(Encoding::ASCII_8BIT) end if part =~ UTF8_REGEX && part =~ UTF8_REGEX_MULTIBYTE - ACE_PREFIX + punycode_encode(unicode_normalize_kc(part)) + ACE_PREFIX + punycode_encode(part) else part end @@ -114,11 +116,7 @@ def self.to_unicode(input) # Unicode normalization form KC. 
def self.unicode_normalize_kc(input) - input = input.to_s unless input.is_a?(String) - unpacked = input.unpack("U*") - unpacked = - unicode_compose(unicode_sort_canonical(unicode_decompose(unpacked))) - return unpacked.pack("U*") + input.to_s.unicode_normalize(:nfkc) end ## @@ -136,164 +134,6 @@ def self.unicode_downcase(input) end private_class_method :unicode_downcase - def self.unicode_compose(unpacked) - unpacked_result = [] - length = unpacked.length - - return unpacked if length == 0 - - starter = unpacked[0] - starter_cc = lookup_unicode_combining_class(starter) - starter_cc = 256 if starter_cc != 0 - for i in 1...length - ch = unpacked[i] - - if (starter_cc == 0 && - (composite = unicode_compose_pair(starter, ch)) != nil) - starter = composite - else - unpacked_result << starter - starter = ch - end - end - unpacked_result << starter - return unpacked_result - end - private_class_method :unicode_compose - - def self.unicode_compose_pair(ch_one, ch_two) - if ch_one >= HANGUL_LBASE && ch_one < HANGUL_LBASE + HANGUL_LCOUNT && - ch_two >= HANGUL_VBASE && ch_two < HANGUL_VBASE + HANGUL_VCOUNT - # Hangul L + V - return HANGUL_SBASE + ( - (ch_one - HANGUL_LBASE) * HANGUL_VCOUNT + (ch_two - HANGUL_VBASE) - ) * HANGUL_TCOUNT - elsif ch_one >= HANGUL_SBASE && - ch_one < HANGUL_SBASE + HANGUL_SCOUNT && - (ch_one - HANGUL_SBASE) % HANGUL_TCOUNT == 0 && - ch_two >= HANGUL_TBASE && ch_two < HANGUL_TBASE + HANGUL_TCOUNT - # Hangul LV + T - return ch_one + (ch_two - HANGUL_TBASE) - end - - p = [] - - ucs4_to_utf8(ch_one, p) - ucs4_to_utf8(ch_two, p) - - return lookup_unicode_composition(p) - end - private_class_method :unicode_compose_pair - - def self.ucs4_to_utf8(char, buffer) - if char < 128 - buffer << char - elsif char < 2048 - buffer << (char >> 6 | 192) - buffer << (char & 63 | 128) - elsif char < 0x10000 - buffer << (char >> 12 | 224) - buffer << (char >> 6 & 63 | 128) - buffer << (char & 63 | 128) - elsif char < 0x200000 - buffer << (char >> 18 | 240) - buffer << 
(char >> 12 & 63 | 128) - buffer << (char >> 6 & 63 | 128) - buffer << (char & 63 | 128) - elsif char < 0x4000000 - buffer << (char >> 24 | 248) - buffer << (char >> 18 & 63 | 128) - buffer << (char >> 12 & 63 | 128) - buffer << (char >> 6 & 63 | 128) - buffer << (char & 63 | 128) - elsif char < 0x80000000 - buffer << (char >> 30 | 252) - buffer << (char >> 24 & 63 | 128) - buffer << (char >> 18 & 63 | 128) - buffer << (char >> 12 & 63 | 128) - buffer << (char >> 6 & 63 | 128) - buffer << (char & 63 | 128) - end - end - private_class_method :ucs4_to_utf8 - - def self.unicode_sort_canonical(unpacked) - unpacked = unpacked.dup - i = 1 - length = unpacked.length - - return unpacked if length < 2 - - while i < length - last = unpacked[i-1] - ch = unpacked[i] - last_cc = lookup_unicode_combining_class(last) - cc = lookup_unicode_combining_class(ch) - if cc != 0 && last_cc != 0 && last_cc > cc - unpacked[i] = last - unpacked[i-1] = ch - i -= 1 if i > 1 - else - i += 1 - end - end - return unpacked - end - private_class_method :unicode_sort_canonical - - def self.unicode_decompose(unpacked) - unpacked_result = [] - for cp in unpacked - if cp >= HANGUL_SBASE && cp < HANGUL_SBASE + HANGUL_SCOUNT - l, v, t = unicode_decompose_hangul(cp) - unpacked_result << l - unpacked_result << v if v - unpacked_result << t if t - else - dc = lookup_unicode_compatibility(cp) - unless dc - unpacked_result << cp - else - unpacked_result.concat(unicode_decompose(dc.unpack("U*"))) - end - end - end - return unpacked_result - end - private_class_method :unicode_decompose - - def self.unicode_decompose_hangul(codepoint) - sindex = codepoint - HANGUL_SBASE; - if sindex < 0 || sindex >= HANGUL_SCOUNT - l = codepoint - v = t = nil - return l, v, t - end - l = HANGUL_LBASE + sindex / HANGUL_NCOUNT - v = HANGUL_VBASE + (sindex % HANGUL_NCOUNT) / HANGUL_TCOUNT - t = HANGUL_TBASE + sindex % HANGUL_TCOUNT - if t == HANGUL_TBASE - t = nil - end - return l, v, t - end - private_class_method 
:unicode_decompose_hangul - - def self.lookup_unicode_combining_class(codepoint) - codepoint_data = UNICODE_DATA[codepoint] - (codepoint_data ? - (codepoint_data[UNICODE_DATA_COMBINING_CLASS] || 0) : - 0) - end - private_class_method :lookup_unicode_combining_class - - def self.lookup_unicode_compatibility(codepoint) - codepoint_data = UNICODE_DATA[codepoint] - (codepoint_data ? - codepoint_data[UNICODE_DATA_COMPATIBILITY] : nil) - end - private_class_method :lookup_unicode_compatibility - def self.lookup_unicode_lowercase(codepoint) codepoint_data = UNICODE_DATA[codepoint] (codepoint_data ? @@ -302,21 +142,6 @@ def self.lookup_unicode_lowercase(codepoint) end private_class_method :lookup_unicode_lowercase - def self.lookup_unicode_composition(unpacked) - return COMPOSITION_TABLE[unpacked] - end - private_class_method :lookup_unicode_composition - - HANGUL_SBASE = 0xac00 - HANGUL_LBASE = 0x1100 - HANGUL_LCOUNT = 19 - HANGUL_VBASE = 0x1161 - HANGUL_VCOUNT = 21 - HANGUL_TBASE = 0x11a7 - HANGUL_TCOUNT = 28 - HANGUL_NCOUNT = HANGUL_VCOUNT * HANGUL_TCOUNT # 588 - HANGUL_SCOUNT = HANGUL_LCOUNT * HANGUL_NCOUNT # 11172 - UNICODE_DATA_COMBINING_CLASS = 0 UNICODE_DATA_EXCLUSION = 1 UNICODE_DATA_CANONICAL = 2 diff --git a/lib/addressable/uri.rb b/lib/addressable/uri.rb index 5df0ae32..23613417 100644 --- a/lib/addressable/uri.rb +++ b/lib/addressable/uri.rb @@ -53,7 +53,7 @@ module CharacterClasses PCHAR = (UNRESERVED + SUB_DELIMS + "\\:\\@").freeze SCHEME = (ALPHA + DIGIT + "\\-\\+\\.").freeze HOST = (UNRESERVED + SUB_DELIMS + "\\[\\:\\]").freeze - AUTHORITY = (PCHAR + "\\[\\:\\]").freeze + AUTHORITY = (PCHAR + "\\[\\]").freeze PATH = (PCHAR + "\\/").freeze QUERY = (PCHAR + "\\/\\?").freeze FRAGMENT = (PCHAR + "\\/\\?").freeze diff --git a/spec/addressable/idna_spec.rb b/spec/addressable/idna_spec.rb index b1509d22..144f5f28 100644 --- a/spec/addressable/idna_spec.rb +++ b/spec/addressable/idna_spec.rb @@ -38,6 +38,12 @@ )).to eq("www.xn--8ws00zhy3a.com") end + it "also 
accepts unicode strings encoded as ascii-8bit" do + expect(Addressable::IDNA.to_ascii( + "www.詹姆斯.com".b + )).to eq("www.xn--8ws00zhy3a.com") + end + it "should convert 'www.Iñtërnâtiônàlizætiøn.com' correctly" do "www.Iñtërnâtiônàlizætiøn.com" expect(Addressable::IDNA.to_ascii( @@ -253,6 +259,7 @@ it "should normalize 'string' correctly" do expect(Addressable::IDNA.unicode_normalize_kc(:'string')).to eq("string") expect(Addressable::IDNA.unicode_normalize_kc("string")).to eq("string") + expect(Addressable::IDNA.unicode_normalize_kc("str\x00ing")).to eq("str\x00ing") end end diff --git a/spec/addressable/uri_spec.rb b/spec/addressable/uri_spec.rb index e6b1f683..6ab1b94b 100644 --- a/spec/addressable/uri_spec.rb +++ b/spec/addressable/uri_spec.rb @@ -5953,6 +5953,14 @@ def to_str end end +describe Addressable::URI, "when normalizing a path with an null byte" do + it "should result in correct percent encoded sequence" do + expect(Addressable::URI.parse("/path%00segment/").normalize.path).to eq( + "/path%00segment/" + ) + end +end + describe Addressable::URI, "when normalizing a partially encoded string" do it "should result in correct percent encoded sequence" do expect(Addressable::URI.normalize_component(