From 1998e06c81d813eac30409497b04fc6d6f19d231 Mon Sep 17 00:00:00 2001 From: Adrien Rey-Jarthon Date: Tue, 14 Feb 2023 00:00:35 +0100 Subject: [PATCH] Use ruby unicode normalize to avoid libidn C problems and heavy legacy ruby code --- benchmark/unicode_normalize.rb | 34 ++++++ lib/addressable/idna/native.rb | 4 - lib/addressable/idna/pure.rb | 186 +----------------------------- lib/addressable/template.rb | 12 +- lib/addressable/uri.rb | 9 +- spec/addressable/idna_spec.rb | 11 +- spec/addressable/template_spec.rb | 24 ++++ spec/addressable/uri_spec.rb | 20 ++++ 8 files changed, 97 insertions(+), 203 deletions(-) create mode 100644 benchmark/unicode_normalize.rb diff --git a/benchmark/unicode_normalize.rb b/benchmark/unicode_normalize.rb new file mode 100644 index 00000000..760f249f --- /dev/null +++ b/benchmark/unicode_normalize.rb @@ -0,0 +1,34 @@ +# /usr/bin/env ruby +# frozen_string_literal: true. + +require "benchmark" +require "addressable/idna/pure.rb" +require "idn" + +value = "fiᆵリ宠퐱卄.com" +expected = "fiᆵリ宠퐱卄.com" +N = 100_000 + +fail "ruby does not match" unless expected == value.unicode_normalize(:nfkc) +fail "libidn does not match" unless expected == IDN::Stringprep.nfkc_normalize(value) +fail "addressable does not match" unless expected == Addressable::IDNA.unicode_normalize_kc(value) + +Benchmark.bmbm do |x| + x.report("pure") { N.times { Addressable::IDNA.unicode_normalize_kc(value) } } + x.report("libidn") { N.times { IDN::Stringprep.nfkc_normalize(value) } } + x.report("ruby") { N.times { value.unicode_normalize(:nfkc) } } +end + +# February 14th 2023, before replacing the legacy pure normalize code: + +# > ruby benchmark/unicode_normalize.rb +# Rehearsal ------------------------------------------ +# pure 1.335230 0.000315 1.335545 ( 1.335657) +# libidn 0.058568 0.000000 0.058568 ( 0.058570) +# ruby 0.326008 0.000014 0.326022 ( 0.326026) +# --------------------------------- total: 1.720135sec + +# user system total real +# pure 1.325948 0.000000 1.325948 ( 1.326054) +# libidn 0.058067 0.000000 0.058067 ( 0.058069) +# ruby 0.325062 0.000000 0.325062 ( 0.325115) diff --git a/lib/addressable/idna/native.rb b/lib/addressable/idna/native.rb index 302e1b0c..b225e1c3 100644 --- a/lib/addressable/idna/native.rb +++ b/lib/addressable/idna/native.rb @@ -29,10 +29,6 @@ def self.punycode_decode(value) IDN::Punycode.decode(value.to_s) end - def self.unicode_normalize_kc(value) - IDN::Stringprep.nfkc_normalize(value.to_s) - end - def self.to_ascii(value) value.to_s.split('.', -1).map do |segment| if segment.size > 0 && segment.size < 64 diff --git a/lib/addressable/idna/pure.rb b/lib/addressable/idna/pure.rb index a7c796e3..ae09ec66 100644 --- a/lib/addressable/idna/pure.rb +++ b/lib/addressable/idna/pure.rb @@ -66,7 +66,7 @@ module IDNA # domain name as described in RFC 3490. def self.to_ascii(input) input = input.to_s unless input.is_a?(String) - input = input.dup + input = input.dup.force_encoding(Encoding::UTF_8).unicode_normalize(:nfkc) if input.respond_to?(:force_encoding) input.force_encoding(Encoding::ASCII_8BIT) end @@ -77,7 +77,7 @@ def self.to_ascii(input) part.force_encoding(Encoding::ASCII_8BIT) end if part =~ UTF8_REGEX && part =~ UTF8_REGEX_MULTIBYTE - ACE_PREFIX + punycode_encode(unicode_normalize_kc(part)) + ACE_PREFIX + punycode_encode(part) else part end @@ -112,15 +112,6 @@ def self.to_unicode(input) output end - # Unicode normalization form KC. - def self.unicode_normalize_kc(input) - input = input.to_s unless input.is_a?(String) - unpacked = input.unpack("U*") - unpacked = - unicode_compose(unicode_sort_canonical(unicode_decompose(unpacked))) - return unpacked.pack("U*") - end - ## # Unicode aware downcase method. # @@ -136,164 +127,6 @@ def self.unicode_downcase(input) end private_class_method :unicode_downcase - def self.unicode_compose(unpacked) - unpacked_result = [] - length = unpacked.length - - return unpacked if length == 0 - - starter = unpacked[0] - starter_cc = lookup_unicode_combining_class(starter) - starter_cc = 256 if starter_cc != 0 - for i in 1...length - ch = unpacked[i] - - if (starter_cc == 0 && - (composite = unicode_compose_pair(starter, ch)) != nil) - starter = composite - else - unpacked_result << starter - starter = ch - end - end - unpacked_result << starter - return unpacked_result - end - private_class_method :unicode_compose - - def self.unicode_compose_pair(ch_one, ch_two) - if ch_one >= HANGUL_LBASE && ch_one < HANGUL_LBASE + HANGUL_LCOUNT && - ch_two >= HANGUL_VBASE && ch_two < HANGUL_VBASE + HANGUL_VCOUNT - # Hangul L + V - return HANGUL_SBASE + ( - (ch_one - HANGUL_LBASE) * HANGUL_VCOUNT + (ch_two - HANGUL_VBASE) - ) * HANGUL_TCOUNT - elsif ch_one >= HANGUL_SBASE && - ch_one < HANGUL_SBASE + HANGUL_SCOUNT && - (ch_one - HANGUL_SBASE) % HANGUL_TCOUNT == 0 && - ch_two >= HANGUL_TBASE && ch_two < HANGUL_TBASE + HANGUL_TCOUNT - # Hangul LV + T - return ch_one + (ch_two - HANGUL_TBASE) - end - - p = [] - - ucs4_to_utf8(ch_one, p) - ucs4_to_utf8(ch_two, p) - - return lookup_unicode_composition(p) - end - private_class_method :unicode_compose_pair - - def self.ucs4_to_utf8(char, buffer) - if char < 128 - buffer << char - elsif char < 2048 - buffer << (char >> 6 | 192) - buffer << (char & 63 | 128) - elsif char < 0x10000 - buffer << (char >> 12 | 224) - buffer << (char >> 6 & 63 | 128) - buffer << (char & 63 | 128) - elsif char < 0x200000 - buffer << (char >> 18 | 240) - buffer << (char >> 12 & 63 | 128) - buffer << (char >> 6 & 63 | 128) - buffer << (char & 63 | 128) - elsif char < 0x4000000 - buffer << (char >> 24 | 248) - buffer << (char >> 18 & 63 | 128) - buffer << (char >> 12 & 63 | 128) - buffer << (char >> 6 & 63 | 128) - buffer << (char & 63 | 128) - elsif char < 0x80000000 - buffer << (char >> 30 | 252) - buffer << (char >> 24 & 63 | 128) - buffer << (char >> 18 & 63 | 128) - buffer << (char >> 12 & 63 | 128) - buffer << (char >> 6 & 63 | 128) - buffer << (char & 63 | 128) - end - end - private_class_method :ucs4_to_utf8 - - def self.unicode_sort_canonical(unpacked) - unpacked = unpacked.dup - i = 1 - length = unpacked.length - - return unpacked if length < 2 - - while i < length - last = unpacked[i-1] - ch = unpacked[i] - last_cc = lookup_unicode_combining_class(last) - cc = lookup_unicode_combining_class(ch) - if cc != 0 && last_cc != 0 && last_cc > cc - unpacked[i] = last - unpacked[i-1] = ch - i -= 1 if i > 1 - else - i += 1 - end - end - return unpacked - end - private_class_method :unicode_sort_canonical - - def self.unicode_decompose(unpacked) - unpacked_result = [] - for cp in unpacked - if cp >= HANGUL_SBASE && cp < HANGUL_SBASE + HANGUL_SCOUNT - l, v, t = unicode_decompose_hangul(cp) - unpacked_result << l - unpacked_result << v if v - unpacked_result << t if t - else - dc = lookup_unicode_compatibility(cp) - unless dc - unpacked_result << cp - else - unpacked_result.concat(unicode_decompose(dc.unpack("U*"))) - end - end - end - return unpacked_result - end - private_class_method :unicode_decompose - - def self.unicode_decompose_hangul(codepoint) - sindex = codepoint - HANGUL_SBASE; - if sindex < 0 || sindex >= HANGUL_SCOUNT - l = codepoint - v = t = nil - return l, v, t - end - l = HANGUL_LBASE + sindex / HANGUL_NCOUNT - v = HANGUL_VBASE + (sindex % HANGUL_NCOUNT) / HANGUL_TCOUNT - t = HANGUL_TBASE + sindex % HANGUL_TCOUNT - if t == HANGUL_TBASE - t = nil - end - return l, v, t - end - private_class_method :unicode_decompose_hangul - - def self.lookup_unicode_combining_class(codepoint) - codepoint_data = UNICODE_DATA[codepoint] - (codepoint_data ? - (codepoint_data[UNICODE_DATA_COMBINING_CLASS] || 0) : - 0) - end - private_class_method :lookup_unicode_combining_class - - def self.lookup_unicode_compatibility(codepoint) - codepoint_data = UNICODE_DATA[codepoint] - (codepoint_data ? - codepoint_data[UNICODE_DATA_COMPATIBILITY] : nil) - end - private_class_method :lookup_unicode_compatibility - def self.lookup_unicode_lowercase(codepoint) codepoint_data = UNICODE_DATA[codepoint] (codepoint_data ? @@ -302,21 +135,6 @@ def self.lookup_unicode_lowercase(codepoint) end private_class_method :lookup_unicode_lowercase - def self.lookup_unicode_composition(unpacked) - return COMPOSITION_TABLE[unpacked] - end - private_class_method :lookup_unicode_composition - - HANGUL_SBASE = 0xac00 - HANGUL_LBASE = 0x1100 - HANGUL_LCOUNT = 19 - HANGUL_VBASE = 0x1161 - HANGUL_VCOUNT = 21 - HANGUL_TBASE = 0x11a7 - HANGUL_TCOUNT = 28 - HANGUL_NCOUNT = HANGUL_VCOUNT * HANGUL_TCOUNT # 588 - HANGUL_SCOUNT = HANGUL_LCOUNT * HANGUL_NCOUNT # 11172 - UNICODE_DATA_COMBINING_CLASS = 0 UNICODE_DATA_EXCLUSION = 1 UNICODE_DATA_CANONICAL = 2 diff --git a/lib/addressable/template.rb b/lib/addressable/template.rb index 9e8174bf..42cbf7cc 100644 --- a/lib/addressable/template.rb +++ b/lib/addressable/template.rb @@ -892,7 +892,7 @@ def join_values(operator, return_value) # operator. # # @param [Hash, Array, String] value - # Normalizes keys and values with IDNA#unicode_normalize_kc + # Normalizes unicode keys and values with String#unicode_normalize (NFC) # # @return [Hash, Array, String] The normalized values def normalize_value(value) @@ -902,15 +902,17 @@ def normalize_value(value) # Handle unicode normalization if value.kind_of?(Array) - value.map! { |val| Addressable::IDNA.unicode_normalize_kc(val) } + value.map! { |val| normalize_value(val) } elsif value.kind_of?(Hash) value = value.inject({}) { |acc, (k, v)| - acc[Addressable::IDNA.unicode_normalize_kc(k)] = - Addressable::IDNA.unicode_normalize_kc(v) + acc[normalize_value(k)] = normalize_value(v) acc } else - value = Addressable::IDNA.unicode_normalize_kc(value) + if value.encoding != Encoding::UTF_8 + value = value.dup.force_encoding(Encoding::UTF_8) + end + value = value.unicode_normalize(:nfc) end value end diff --git a/lib/addressable/uri.rb b/lib/addressable/uri.rb index 5df0ae32..50ccdaf5 100644 --- a/lib/addressable/uri.rb +++ b/lib/addressable/uri.rb @@ -53,7 +53,7 @@ module CharacterClasses PCHAR = (UNRESERVED + SUB_DELIMS + "\\:\\@").freeze SCHEME = (ALPHA + DIGIT + "\\-\\+\\.").freeze HOST = (UNRESERVED + SUB_DELIMS + "\\[\\:\\]").freeze - AUTHORITY = (PCHAR + "\\[\\:\\]").freeze + AUTHORITY = (PCHAR + "\\[\\]").freeze PATH = (PCHAR + "\\/").freeze QUERY = (PCHAR + "\\/\\?").freeze FRAGMENT = (PCHAR + "\\/\\?").freeze @@ -481,7 +481,7 @@ def self.unencode(uri, return_type=String, leave_encoded='') leave_encoded.include?(c) ? sequence : c end - result.force_encoding("utf-8") + result.force_encoding(Encoding::UTF_8) if return_type == String return result elsif return_type == ::Addressable::URI @@ -579,7 +579,7 @@ def self.normalize_component(component, character_class= unencoded = self.unencode_component(component, String, leave_encoded) begin encoded = self.encode_component( - Addressable::IDNA.unicode_normalize_kc(unencoded), + unencoded.unicode_normalize(:nfc), character_class, leave_encoded ) @@ -687,8 +687,7 @@ def self.normalized_encode(uri, return_type=String) components.each do |key, value| if value != nil begin - components[key] = - Addressable::IDNA.unicode_normalize_kc(value.to_str) + components[key] = value.to_str.unicode_normalize(:nfc) rescue ArgumentError # Likely a malformed UTF-8 character, skip unicode normalization components[key] = value.to_str diff --git a/spec/addressable/idna_spec.rb b/spec/addressable/idna_spec.rb index b1509d22..428c9ec8 100644 --- a/spec/addressable/idna_spec.rb +++ b/spec/addressable/idna_spec.rb @@ -38,6 +38,12 @@ )).to eq("www.xn--8ws00zhy3a.com") end + it "also accepts unicode strings encoded as ascii-8bit" do + expect(Addressable::IDNA.to_ascii( + "www.詹姆斯.com".b + )).to eq("www.xn--8ws00zhy3a.com") + end + it "should convert 'www.Iñtërnâtiônàlizætiøn.com' correctly" do "www.Iñtërnâtiônàlizætiøn.com" expect(Addressable::IDNA.to_ascii( @@ -249,11 +255,6 @@ "example..host" )).to eq("example..host") end - - it "should normalize 'string' correctly" do - expect(Addressable::IDNA.unicode_normalize_kc(:'string')).to eq("string") - expect(Addressable::IDNA.unicode_normalize_kc("string")).to eq("string") - end end describe Addressable::IDNA, "when using the pure-Ruby implementation" do diff --git a/spec/addressable/template_spec.rb b/spec/addressable/template_spec.rb index f7b0994c..4b54ea96 100644 --- a/spec/addressable/template_spec.rb +++ b/spec/addressable/template_spec.rb @@ -1021,6 +1021,19 @@ def self.match(name) ) end + it "normalizes as unicode even with wrong encoding specified" do + template = subject.partial_expand("query" => "Cafe\u0301".b) + expect(template.pattern).to eq( + "http://example.com/{resource}/Caf%C3%A9/" + ) + end + + it "raises on invalid unicode input" do + expect { + subject.partial_expand("query" => "M\xE9thode".b) + }.to raise_error(ArgumentError, "invalid byte sequence in UTF-8") + end + it "does not normalize unicode when byte semantics requested" do template = subject.partial_expand({"query" => "Cafe\u0301"}, nil, false) expect(template.pattern).to eq( @@ -1081,6 +1094,17 @@ def self.match(name) expect(uri).to eq("http://example.com/search/Caf%C3%A9/") end + it "normalizes as unicode even with wrong encoding specified" do + uri = subject.expand("query" => "Cafe\u0301".b).to_str + expect(uri).to eq("http://example.com/search/Caf%C3%A9/") + end + + it "raises on invalid unicode input" do + expect { + subject.expand("query" => "M\xE9thode".b).to_str + }.to raise_error(ArgumentError, "invalid byte sequence in UTF-8") + end + it "does not normalize unicode when byte semantics requested" do uri = subject.expand({ "query" => "Cafe\u0301" }, nil, false).to_str expect(uri).to eq("http://example.com/search/Cafe%CC%81/") diff --git a/spec/addressable/uri_spec.rb b/spec/addressable/uri_spec.rb index e6b1f683..c54fc3fb 100644 --- a/spec/addressable/uri_spec.rb +++ b/spec/addressable/uri_spec.rb @@ -5953,6 +5953,26 @@ def to_str end end +describe Addressable::URI, "when normalizing a path with special unicode" do + it "does not stop at or ignore null bytes" do + expect(Addressable::URI.parse("/path%00segment/").normalize.path).to eq( + "/path%00segment/" + ) + end + + it "does apply NFC unicode normalization" do + expect(Addressable::URI.parse("/%E2%84%A6").normalize.path).to eq( + "/%CE%A9" + ) + end + + it "does not apply NFKC unicode normalization" do + expect(Addressable::URI.parse("/%C2%AF%C2%A0").normalize.path).to eq( + "/%C2%AF%C2%A0" + ) + end +end + describe Addressable::URI, "when normalizing a partially encoded string" do it "should result in correct percent encoded sequence" do expect(Addressable::URI.normalize_component(