Skip to content

Commit

Permalink
Use ruby unicode normalize to avoid libidn C problems and heavy legac…
Browse files Browse the repository at this point in the history
…y ruby code
  • Loading branch information
jarthod committed Feb 13, 2023
1 parent 1fdd676 commit 2fff371
Show file tree
Hide file tree
Showing 6 changed files with 54 additions and 181 deletions.
33 changes: 33 additions & 0 deletions benchmark/unicode_normalize.rb
@@ -0,0 +1,33 @@
#!/usr/bin/env ruby

# Benchmarks three NFKC normalization implementations against each other:
# Addressable's pure-Ruby IDNA code, libidn (through the `idn` gem) and Ruby's
# built-in String#unicode_normalize.

require 'benchmark'
require 'addressable/idna/pure.rb'
require 'idn'

# NOTE(review): `value` and `expected` read identically here. The sanity checks
# and timings are only meaningful if `value` is a non-normalized form of
# `expected` — confirm these literals against the repository.
value = "fiᆵリ宠퐱卄.com"
expected = "fiᆵリ宠퐱卄.com"
N = 100_000

# Make sure every implementation produces the expected output before timing it.
raise "ruby normalize does not match" unless expected == value.unicode_normalize(:nfkc)
raise "libidn normalize does not match" unless expected == IDN::Stringprep.nfkc_normalize(value)
raise "addressable normalize does not match" unless expected == Addressable::IDNA.unicode_normalize_kc(value)

# bmbm runs a rehearsal pass first, then the measured pass.
Benchmark.bmbm do |x|
  x.report("pure") { N.times { Addressable::IDNA.unicode_normalize_kc(value) } }
  x.report("libidn") { N.times { IDN::Stringprep.nfkc_normalize(value) } }
  x.report("ruby") { N.times { value.unicode_normalize(:nfkc) } }
end

# February 14th 2023, before replacing the legacy pure normalize code:

# > ruby benchmark/unicode_normalize.rb
# Rehearsal ------------------------------------------
# pure 1.335230 0.000315 1.335545 ( 1.335657)
# libidn 0.058568 0.000000 0.058568 ( 0.058570)
# ruby 0.326008 0.000014 0.326022 ( 0.326026)
# --------------------------------- total: 1.720135sec

# user system total real
# pure 1.325948 0.000000 1.325948 ( 1.326054)
# libidn 0.058067 0.000000 0.058067 ( 0.058069)
# ruby 0.325062 0.000000 0.325062 ( 0.325115)
2 changes: 1 addition & 1 deletion lib/addressable/idna/native.rb
Expand Up @@ -30,7 +30,7 @@ def self.punycode_decode(value)
end

# Normalizes +value+ to Unicode Normalization Form KC using Ruby's built-in
# String#unicode_normalize.
#
# The libidn call (IDN::Stringprep.nfkc_normalize) that used to precede this
# line is dropped: its return value was discarded, so it only added a native
# call without affecting the result.
#
# @param value [#to_s] the text to normalize
# @return [String] the NFKC-normalized text
def self.unicode_normalize_kc(value)
  value.to_s.unicode_normalize(:nfkc)
end

def self.to_ascii(value)
Expand Down
183 changes: 4 additions & 179 deletions lib/addressable/idna/pure.rb
Expand Up @@ -67,6 +67,8 @@ module IDNA
def self.to_ascii(input)
input = input.to_s unless input.is_a?(String)
input = input.dup
input.force_encoding(Encoding::UTF_8)
input = unicode_normalize_kc(input.dup)
if input.respond_to?(:force_encoding)
input.force_encoding(Encoding::ASCII_8BIT)
end
Expand All @@ -77,7 +79,7 @@ def self.to_ascii(input)
part.force_encoding(Encoding::ASCII_8BIT)
end
if part =~ UTF8_REGEX && part =~ UTF8_REGEX_MULTIBYTE
ACE_PREFIX + punycode_encode(unicode_normalize_kc(part))
ACE_PREFIX + punycode_encode(part)
else
part
end
Expand Down Expand Up @@ -114,11 +116,7 @@ def self.to_unicode(input)

# Unicode normalization form KC.
#
# Delegates to Ruby's built-in String#unicode_normalize. The hand-written
# decompose / sort / compose pipeline that preceded this line returned early,
# which left the built-in call unreachable; only the built-in call is kept.
#
# String#unicode_normalize raises Encoding::CompatibilityError for strings in
# a non-Unicode encoding such as ASCII-8BIT, so binary input must be tagged
# as UTF-8 by the caller first.
#
# @param input [#to_s] the text to normalize
# @return [String] the NFKC-normalized text
def self.unicode_normalize_kc(input)
  input.to_s.unicode_normalize(:nfkc)
end

##
Expand All @@ -136,164 +134,6 @@ def self.unicode_downcase(input)
end
private_class_method :unicode_downcase

# Folds a list of code points left to right, merging each code point into the
# current starter whenever unicode_compose_pair yields a composite.
#
# NOTE(review): the combining class is looked up only for the first code point
# and is never refreshed when the starter changes, so composition is attempted
# either for the whole input or not at all. Preserved as-is.
#
# @param unpacked [Array<Integer>] code points
# @return [Array<Integer>] the composed code points (the same object that was
#   passed in when the input is empty)
def self.unicode_compose(unpacked)
  return unpacked if unpacked.empty?

  composed = []
  starter = unpacked.first
  starter_cc = lookup_unicode_combining_class(starter)
  starter_cc = 256 unless starter_cc == 0

  unpacked.drop(1).each do |codepoint|
    composite = starter_cc == 0 ? unicode_compose_pair(starter, codepoint) : nil
    if composite.nil?
      # No composition possible: flush the current starter and start over.
      composed << starter
      starter = codepoint
    else
      starter = composite
    end
  end

  composed << starter
  composed
end
private_class_method :unicode_compose

# Tries to compose two code points into one.
#
# Hangul pairs are composed arithmetically; any other pair is encoded to
# UTF-8 bytes and looked up through lookup_unicode_composition.
#
# NOTE(review): the trailing range starts at HANGUL_TBASE itself, so an LV
# syllable followed by HANGUL_TBASE returns the syllable unchanged.
#
# @param ch_one [Integer] first code point
# @param ch_two [Integer] second code point
# @return [Integer, nil] the composite, or whatever the composition table
#   yields for the pair (nil when absent)
def self.unicode_compose_pair(ch_one, ch_two)
  leading_range = HANGUL_LBASE...HANGUL_LBASE + HANGUL_LCOUNT
  vowel_range = HANGUL_VBASE...HANGUL_VBASE + HANGUL_VCOUNT
  if leading_range.cover?(ch_one) && vowel_range.cover?(ch_two)
    # Hangul L + V
    lv_index = (ch_one - HANGUL_LBASE) * HANGUL_VCOUNT + (ch_two - HANGUL_VBASE)
    return HANGUL_SBASE + lv_index * HANGUL_TCOUNT
  end

  syllable_range = HANGUL_SBASE...HANGUL_SBASE + HANGUL_SCOUNT
  trailing_range = HANGUL_TBASE...HANGUL_TBASE + HANGUL_TCOUNT
  if syllable_range.cover?(ch_one) &&
     ((ch_one - HANGUL_SBASE) % HANGUL_TCOUNT).zero? &&
     trailing_range.cover?(ch_two)
    # Hangul LV + T
    return ch_one + (ch_two - HANGUL_TBASE)
  end

  utf8_bytes = []
  ucs4_to_utf8(ch_one, utf8_bytes)
  ucs4_to_utf8(ch_two, utf8_bytes)
  lookup_unicode_composition(utf8_bytes)
end
private_class_method :unicode_compose_pair

# Appends the UTF-8 style byte sequence for +char+ to +buffer+.
#
# Values below 128 are appended as a single byte; larger values below
# 0x80000000 are written as a lead byte followed by one to five continuation
# bytes. Values of 0x80000000 or more append nothing.
#
# @param char [Integer] the code point to encode
# @param buffer [Array<Integer>] byte list that is appended to in place
# @return [Array<Integer>, nil] +buffer+, or nil when nothing was appended
def self.ucs4_to_utf8(char, buffer)
  return buffer << char if char < 128

  # [exclusive upper bound, continuation byte count, lead-byte marker]
  layout = [
    [0x800, 1, 192],
    [0x10000, 2, 224],
    [0x200000, 3, 240],
    [0x4000000, 4, 248],
    [0x80000000, 5, 252]
  ].find { |limit, _count, _marker| char < limit }
  return nil unless layout

  _limit, continuation_count, lead_marker = layout
  buffer << (char >> (6 * continuation_count) | lead_marker)
  (continuation_count - 1).downto(0) do |slot|
    buffer << (char >> (6 * slot) & 63 | 128)
  end
  buffer
end
private_class_method :ucs4_to_utf8

# Returns a copy of +unpacked+ in which adjacent code points with non-zero
# combining classes are swapped until each such pair is in ascending
# combining-class order. The input array is not modified.
#
# @param unpacked [Array<Integer>] code points
# @return [Array<Integer>] the reordered copy
def self.unicode_sort_canonical(unpacked)
  reordered = unpacked.dup
  return reordered if reordered.length < 2

  position = 1
  while position < reordered.length
    previous_cc = lookup_unicode_combining_class(reordered[position - 1])
    current_cc = lookup_unicode_combining_class(reordered[position])

    if previous_cc != 0 && current_cc != 0 && previous_cc > current_cc
      reordered[position - 1], reordered[position] =
        reordered[position], reordered[position - 1]
      # Step back so the swapped element is compared with its new neighbour.
      position -= 1 if position > 1
    else
      position += 1
    end
  end

  reordered
end
private_class_method :unicode_sort_canonical

# Expands every code point into its decomposition.
#
# Code points in the Hangul syllable range are split by
# unicode_decompose_hangul; any other code point with a compatibility mapping
# is replaced by the recursive decomposition of that mapping; everything else
# is kept as-is.
#
# @param unpacked [Array<Integer>] code points
# @return [Array<Integer>] a new array of decomposed code points
def self.unicode_decompose(unpacked)
  unpacked.flat_map do |codepoint|
    if codepoint >= HANGUL_SBASE && codepoint < HANGUL_SBASE + HANGUL_SCOUNT
      # Absent vowel / trailing parts come back as nil and are dropped.
      unicode_decompose_hangul(codepoint).compact
    elsif (mapping = lookup_unicode_compatibility(codepoint))
      unicode_decompose(mapping.unpack("U*"))
    else
      [codepoint]
    end
  end
end
private_class_method :unicode_decompose

# Splits a Hangul syllable code point into leading, vowel and trailing parts.
#
# @param codepoint [Integer] the code point to split
# @return [Array] a three-element array [l, v, t]; t is nil when the syllable
#   has no trailing part, and a code point outside the syllable range is
#   returned as [codepoint, nil, nil]
def self.unicode_decompose_hangul(codepoint)
  syllable_index = codepoint - HANGUL_SBASE
  return [codepoint, nil, nil] unless (0...HANGUL_SCOUNT).cover?(syllable_index)

  leading = HANGUL_LBASE + syllable_index / HANGUL_NCOUNT
  vowel = HANGUL_VBASE + (syllable_index % HANGUL_NCOUNT) / HANGUL_TCOUNT
  trailing_offset = syllable_index % HANGUL_TCOUNT
  # An offset of zero means the syllable has no trailing part.
  trailing = trailing_offset.zero? ? nil : HANGUL_TBASE + trailing_offset

  [leading, vowel, trailing]
end
private_class_method :unicode_decompose_hangul

# Looks up the combining class recorded for +codepoint+ in UNICODE_DATA.
#
# @param codepoint [Integer] the code point to look up
# @return [Integer] the stored combining class, or 0 when the code point has
#   no entry or the entry has no combining class
def self.lookup_unicode_combining_class(codepoint)
  entry = UNICODE_DATA[codepoint]
  return 0 unless entry

  entry[UNICODE_DATA_COMBINING_CLASS] || 0
end
private_class_method :lookup_unicode_combining_class

# Looks up the compatibility mapping recorded for +codepoint+ in UNICODE_DATA.
#
# @param codepoint [Integer] the code point to look up
# @return [Object, nil] the stored compatibility field, or nil when the code
#   point has no entry
def self.lookup_unicode_compatibility(codepoint)
  entry = UNICODE_DATA[codepoint]
  return nil unless entry

  entry[UNICODE_DATA_COMPATIBILITY]
end
private_class_method :lookup_unicode_compatibility

def self.lookup_unicode_lowercase(codepoint)
codepoint_data = UNICODE_DATA[codepoint]
(codepoint_data ?
Expand All @@ -302,21 +142,6 @@ def self.lookup_unicode_lowercase(codepoint)
end
private_class_method :lookup_unicode_lowercase

# Looks up +unpacked+ in COMPOSITION_TABLE.
#
# @param unpacked [Array<Integer>] the table key (unicode_compose_pair passes
#   the UTF-8 bytes of the two code points)
# @return [Object, nil] the table entry for the key, nil when absent
def self.lookup_unicode_composition(unpacked)
  COMPOSITION_TABLE[unpacked]
end
private_class_method :lookup_unicode_composition

# Bounds used by the Hangul composition / decomposition arithmetic.
# *BASE is the first code point of a range and *COUNT is its length.
# S = precomposed syllables, L = leading part, V = vowel part, T = trailing part.
HANGUL_SBASE = 0xac00
HANGUL_LBASE = 0x1100
HANGUL_LCOUNT = 19
HANGUL_VBASE = 0x1161
HANGUL_VCOUNT = 21
# A trailing offset of zero (HANGUL_TBASE itself) is treated as "no trailing part".
HANGUL_TBASE = 0x11a7
HANGUL_TCOUNT = 28
# Number of syllables per leading part (vowel count times trailing count).
HANGUL_NCOUNT = HANGUL_VCOUNT * HANGUL_TCOUNT # 588
# Total number of precomposed syllables.
HANGUL_SCOUNT = HANGUL_LCOUNT * HANGUL_NCOUNT # 11172

UNICODE_DATA_COMBINING_CLASS = 0
UNICODE_DATA_EXCLUSION = 1
UNICODE_DATA_CANONICAL = 2
Expand Down
2 changes: 1 addition & 1 deletion lib/addressable/uri.rb
Expand Up @@ -53,7 +53,7 @@ module CharacterClasses
PCHAR = (UNRESERVED + SUB_DELIMS + "\\:\\@").freeze
SCHEME = (ALPHA + DIGIT + "\\-\\+\\.").freeze
HOST = (UNRESERVED + SUB_DELIMS + "\\[\\:\\]").freeze
AUTHORITY = (PCHAR + "\\[\\:\\]").freeze
AUTHORITY = (PCHAR + "\\[\\]").freeze
PATH = (PCHAR + "\\/").freeze
QUERY = (PCHAR + "\\/\\?").freeze
FRAGMENT = (PCHAR + "\\/\\?").freeze
Expand Down
7 changes: 7 additions & 0 deletions spec/addressable/idna_spec.rb
Expand Up @@ -38,6 +38,12 @@
)).to eq("www.xn--8ws00zhy3a.com")
end

# The same hostname as the surrounding examples, but tagged as binary
# (ASCII-8BIT) through String#b.
it "also accepts unicode strings encoded as ascii-8bit" do
  binary_hostname = "www.詹姆斯.com".b
  expect(Addressable::IDNA.to_ascii(binary_hostname)).to eq("www.xn--8ws00zhy3a.com")
end

it "should convert 'www.Iñtërnâtiônàlizætiøn.com' correctly" do
"www.Iñtërnâtiônàlizætiøn.com"
expect(Addressable::IDNA.to_ascii(
Expand Down Expand Up @@ -253,6 +259,7 @@
# Covers three inputs that must come back unchanged: a Symbol, a plain String,
# and a String with an embedded NUL byte.
# NOTE(review): the NUL case presumably guards against C-string truncation in
# a native normalizer — confirm against the commit's motivation.
it "should normalize 'string' correctly" do
expect(Addressable::IDNA.unicode_normalize_kc(:'string')).to eq("string")
expect(Addressable::IDNA.unicode_normalize_kc("string")).to eq("string")
expect(Addressable::IDNA.unicode_normalize_kc("str\x00ing")).to eq("str\x00ing")
end
end

Expand Down
8 changes: 8 additions & 0 deletions spec/addressable/uri_spec.rb
Expand Up @@ -5953,6 +5953,14 @@ def to_str
end
end

# A percent-encoded NUL byte in the path must survive normalization unchanged.
describe Addressable::URI, "when normalizing a path with a null byte" do
  it "should result in correct percent encoded sequence" do
    expect(Addressable::URI.parse("/path%00segment/").normalize.path).to eq(
      "/path%00segment/"
    )
  end
end

describe Addressable::URI, "when normalizing a partially encoded string" do
it "should result in correct percent encoded sequence" do
expect(Addressable::URI.normalize_component(
Expand Down

0 comments on commit 2fff371

Please sign in to comment.