Use ruby unicode normalize to avoid libidn C problems and heavy legacy ruby code
sporkmonger · Feb 13, 2023 · 2fff371 · 2fff371
1 parent 1fdd676
commit 2fff371
Show file tree

Hide file tree

Showing 6 changed files with 54 additions and 181 deletions.
diff --git a/benchmark/unicode_normalize.rb b/benchmark/unicode_normalize.rb
@@ -0,0 +1,33 @@
+#!/usr/bin/env ruby
+# Benchmarks three NFKC normalizers: Addressable's pure-Ruby IDNA, libidn, and String#unicode_normalize.
+require 'benchmark'
+require 'addressable/idna/pure.rb'
+require 'idn'
+
+value = "ﬁﾯリ宠퐱卄.com"
+expected = "fiᆵリ宠퐱卄.com"
+N = 100_000
+# Sanity check: abort unless every implementation produces the expected NFKC form.
+expected === value.unicode_normalize(:nfkc) or fail "ruby normalize does not match"
+expected === IDN::Stringprep.nfkc_normalize(value) or fail "libidn normalize does not match"
+expected === Addressable::IDNA.unicode_normalize_kc(value) or fail "addressable normalize does not match"
+# bmbm performs a rehearsal pass before the measured pass.
+Benchmark.bmbm do |x|
+  x.report("pure") { N.times { Addressable::IDNA.unicode_normalize_kc(value) } }
+  x.report("libidn") { N.times { IDN::Stringprep.nfkc_normalize(value) } }
+  x.report("ruby") { N.times { value.unicode_normalize(:nfkc) } }
+end
+
+# February 14th 2023, before replacing the legacy pure normalize code:
+
+# > ruby benchmark/unicode_normalize.rb
+# Rehearsal ------------------------------------------
+# pure     1.335230   0.000315   1.335545 (  1.335657)
+# libidn   0.058568   0.000000   0.058568 (  0.058570)
+# ruby     0.326008   0.000014   0.326022 (  0.326026)
+# --------------------------------- total: 1.720135sec
+
+#              user     system      total        real
+# pure     1.325948   0.000000   1.325948 (  1.326054)
+# libidn   0.058067   0.000000   0.058067 (  0.058069)
+# ruby     0.325062   0.000000   0.325062 (  0.325115)
diff --git a/lib/addressable/idna/native.rb b/lib/addressable/idna/native.rb
@@ -30,7 +30,7 @@ def self.punycode_decode(value)
      end
 
     def self.unicode_normalize_kc(value)
-      IDN::Stringprep.nfkc_normalize(value.to_s)
+      value.to_s.unicode_normalize(:nfkc)
     end
 
     def self.to_ascii(value)

diff --git a/lib/addressable/idna/pure.rb b/lib/addressable/idna/pure.rb
@@ -67,6 +67,8 @@ module IDNA
     def self.to_ascii(input)
       input = input.to_s unless input.is_a?(String)
       input = input.dup
+      input.force_encoding(Encoding::UTF_8)
+      input = unicode_normalize_kc(input.dup)
       if input.respond_to?(:force_encoding)
         input.force_encoding(Encoding::ASCII_8BIT)
       end
@@ -77,7 +79,7 @@ def self.to_ascii(input)
             part.force_encoding(Encoding::ASCII_8BIT)
           end
           if part =~ UTF8_REGEX && part =~ UTF8_REGEX_MULTIBYTE
-            ACE_PREFIX + punycode_encode(unicode_normalize_kc(part))
+            ACE_PREFIX + punycode_encode(part)
           else
             part
           end
@@ -114,11 +116,7 @@ def self.to_unicode(input)
 
     # Unicode normalization form KC.
     def self.unicode_normalize_kc(input)
-      input = input.to_s unless input.is_a?(String)
-      unpacked = input.unpack("U*")
-      unpacked =
-        unicode_compose(unicode_sort_canonical(unicode_decompose(unpacked)))
-      return unpacked.pack("U*")
+      input.to_s.unicode_normalize(:nfkc)
     end
 
     ##
@@ -136,164 +134,6 @@ def self.unicode_downcase(input)
     end
     private_class_method :unicode_downcase
 
-    def self.unicode_compose(unpacked)
-      unpacked_result = []
-      length = unpacked.length
-
-      return unpacked if length == 0
-
-      starter = unpacked[0]
-      starter_cc = lookup_unicode_combining_class(starter)
-      starter_cc = 256 if starter_cc != 0
-      for i in 1...length
-        ch = unpacked[i]
-
-        if (starter_cc == 0 &&
-            (composite = unicode_compose_pair(starter, ch)) != nil)
-          starter = composite
-        else
-          unpacked_result << starter
-          starter = ch
-        end
-      end
-      unpacked_result << starter
-      return unpacked_result
-    end
-    private_class_method :unicode_compose
-
-    def self.unicode_compose_pair(ch_one, ch_two)
-      if ch_one >= HANGUL_LBASE && ch_one < HANGUL_LBASE + HANGUL_LCOUNT &&
-          ch_two >= HANGUL_VBASE && ch_two < HANGUL_VBASE + HANGUL_VCOUNT
-        # Hangul L + V
-        return HANGUL_SBASE + (
-          (ch_one - HANGUL_LBASE) * HANGUL_VCOUNT + (ch_two - HANGUL_VBASE)
-        ) * HANGUL_TCOUNT
-      elsif ch_one >= HANGUL_SBASE &&
-          ch_one < HANGUL_SBASE + HANGUL_SCOUNT &&
-          (ch_one - HANGUL_SBASE) % HANGUL_TCOUNT == 0 &&
-          ch_two >= HANGUL_TBASE && ch_two < HANGUL_TBASE + HANGUL_TCOUNT
-           # Hangul LV + T
-        return ch_one + (ch_two - HANGUL_TBASE)
-      end
-
-      p = []
-
-      ucs4_to_utf8(ch_one, p)
-      ucs4_to_utf8(ch_two, p)
-
-      return lookup_unicode_composition(p)
-    end
-    private_class_method :unicode_compose_pair
-
-    def self.ucs4_to_utf8(char, buffer)
-      if char < 128
-        buffer << char
-      elsif char < 2048
-        buffer << (char >> 6 | 192)
-        buffer << (char & 63 | 128)
-      elsif char < 0x10000
-        buffer << (char >> 12 | 224)
-        buffer << (char >> 6 & 63 | 128)
-        buffer << (char & 63 | 128)
-      elsif char < 0x200000
-        buffer << (char >> 18 | 240)
-        buffer << (char >> 12 & 63 | 128)
-        buffer << (char >> 6 & 63 | 128)
-        buffer << (char & 63 | 128)
-      elsif char < 0x4000000
-        buffer << (char >> 24 | 248)
-        buffer << (char >> 18 & 63 | 128)
-        buffer << (char >> 12 & 63 | 128)
-        buffer << (char >> 6 & 63 | 128)
-        buffer << (char & 63 | 128)
-      elsif char < 0x80000000
-        buffer << (char >> 30 | 252)
-        buffer << (char >> 24 & 63 | 128)
-        buffer << (char >> 18 & 63 | 128)
-        buffer << (char >> 12 & 63 | 128)
-        buffer << (char >> 6 & 63 | 128)
-        buffer << (char & 63 | 128)
-      end
-    end
-    private_class_method :ucs4_to_utf8
-
-    def self.unicode_sort_canonical(unpacked)
-      unpacked = unpacked.dup
-      i = 1
-      length = unpacked.length
-
-      return unpacked if length < 2
-
-      while i < length
-        last = unpacked[i-1]
-        ch = unpacked[i]
-        last_cc = lookup_unicode_combining_class(last)
-        cc = lookup_unicode_combining_class(ch)
-        if cc != 0 && last_cc != 0 && last_cc > cc
-          unpacked[i] = last
-          unpacked[i-1] = ch
-          i -= 1 if i > 1
-        else
-          i += 1
-        end
-      end
-      return unpacked
-    end
-    private_class_method :unicode_sort_canonical
-
-    def self.unicode_decompose(unpacked)
-      unpacked_result = []
-      for cp in unpacked
-        if cp >= HANGUL_SBASE && cp < HANGUL_SBASE + HANGUL_SCOUNT
-          l, v, t = unicode_decompose_hangul(cp)
-          unpacked_result << l
-          unpacked_result << v if v
-          unpacked_result << t if t
-        else
-          dc = lookup_unicode_compatibility(cp)
-          unless dc
-            unpacked_result << cp
-          else
-            unpacked_result.concat(unicode_decompose(dc.unpack("U*")))
-          end
-        end
-      end
-      return unpacked_result
-    end
-    private_class_method :unicode_decompose
-
-    def self.unicode_decompose_hangul(codepoint)
-      sindex = codepoint - HANGUL_SBASE;
-      if sindex < 0 || sindex >= HANGUL_SCOUNT
-        l = codepoint
-        v = t = nil
-        return l, v, t
-      end
-      l = HANGUL_LBASE + sindex / HANGUL_NCOUNT
-      v = HANGUL_VBASE + (sindex % HANGUL_NCOUNT) / HANGUL_TCOUNT
-      t = HANGUL_TBASE + sindex % HANGUL_TCOUNT
-      if t == HANGUL_TBASE
-        t = nil
-      end
-      return l, v, t
-    end
-    private_class_method :unicode_decompose_hangul
-
-    def self.lookup_unicode_combining_class(codepoint)
-      codepoint_data = UNICODE_DATA[codepoint]
-      (codepoint_data ?
-        (codepoint_data[UNICODE_DATA_COMBINING_CLASS] || 0) :
-        0)
-    end
-    private_class_method :lookup_unicode_combining_class
-
-    def self.lookup_unicode_compatibility(codepoint)
-      codepoint_data = UNICODE_DATA[codepoint]
-      (codepoint_data ?
-        codepoint_data[UNICODE_DATA_COMPATIBILITY] : nil)
-    end
-    private_class_method :lookup_unicode_compatibility
-
     def self.lookup_unicode_lowercase(codepoint)
       codepoint_data = UNICODE_DATA[codepoint]
       (codepoint_data ?
@@ -302,21 +142,6 @@ def self.lookup_unicode_lowercase(codepoint)
     end
     private_class_method :lookup_unicode_lowercase
 
-    def self.lookup_unicode_composition(unpacked)
-      return COMPOSITION_TABLE[unpacked]
-    end
-    private_class_method :lookup_unicode_composition
-
-    HANGUL_SBASE =  0xac00
-    HANGUL_LBASE =  0x1100
-    HANGUL_LCOUNT = 19
-    HANGUL_VBASE =  0x1161
-    HANGUL_VCOUNT = 21
-    HANGUL_TBASE =  0x11a7
-    HANGUL_TCOUNT = 28
-    HANGUL_NCOUNT = HANGUL_VCOUNT * HANGUL_TCOUNT # 588
-    HANGUL_SCOUNT = HANGUL_LCOUNT * HANGUL_NCOUNT # 11172
-
     UNICODE_DATA_COMBINING_CLASS = 0
     UNICODE_DATA_EXCLUSION = 1
     UNICODE_DATA_CANONICAL = 2

diff --git a/lib/addressable/uri.rb b/lib/addressable/uri.rb
@@ -53,7 +53,7 @@ module CharacterClasses
       PCHAR = (UNRESERVED + SUB_DELIMS + "\\:\\@").freeze
       SCHEME = (ALPHA + DIGIT + "\\-\\+\\.").freeze
       HOST = (UNRESERVED + SUB_DELIMS + "\\[\\:\\]").freeze
-      AUTHORITY = (PCHAR + "\\[\\:\\]").freeze
+      AUTHORITY = (PCHAR + "\\[\\]").freeze
       PATH = (PCHAR + "\\/").freeze
       QUERY = (PCHAR + "\\/\\?").freeze
       FRAGMENT = (PCHAR + "\\/\\?").freeze

diff --git a/spec/addressable/idna_spec.rb b/spec/addressable/idna_spec.rb
@@ -38,6 +38,12 @@
     )).to eq("www.xn--8ws00zhy3a.com")
   end
 
+  it "also accepts unicode strings encoded as ascii-8bit" do
+    expect(Addressable::IDNA.to_ascii(
+      "www.詹姆斯.com".b
+    )).to eq("www.xn--8ws00zhy3a.com")
+  end
+
   it "should convert 'www.Iñtërnâtiônàlizætiøn.com' correctly" do
     "www.Iñtërnâtiônàlizætiøn.com"
     expect(Addressable::IDNA.to_ascii(
@@ -253,6 +259,7 @@
   it "should normalize 'string' correctly" do
     expect(Addressable::IDNA.unicode_normalize_kc(:'string')).to eq("string")
     expect(Addressable::IDNA.unicode_normalize_kc("string")).to eq("string")
+    expect(Addressable::IDNA.unicode_normalize_kc("str\x00ing")).to eq("str\x00ing")
   end
 end
 

diff --git a/spec/addressable/uri_spec.rb b/spec/addressable/uri_spec.rb
@@ -5953,6 +5953,14 @@ def to_str
   end
 end
 
+describe Addressable::URI, "when normalizing a path with a null byte" do
+  it "should result in correct percent encoded sequence" do
+    expect(Addressable::URI.parse("/path%00segment/").normalize.path).to eq(
+      "/path%00segment/"
+    )
+  end
+end
+
 describe Addressable::URI, "when normalizing a partially encoded string" do
   it "should result in correct percent encoded sequence" do
     expect(Addressable::URI.normalize_component(