Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use ruby unicode normalize to avoid libidn C problems and heavy legacy ruby code #492

Merged
merged 1 commit into from Mar 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
34 changes: 34 additions & 0 deletions benchmark/unicode_normalize.rb
@@ -0,0 +1,34 @@
#!/usr/bin/env ruby
jarthod marked this conversation as resolved.
Show resolved Hide resolved
# frozen_string_literal: true

require "benchmark"
require "addressable/idna/pure.rb"
require "idn"

value = "fiᆵリ宠퐱卄.com"
expected = "fiᆵリ宠퐱卄.com"
N = 100_000

fail "ruby does not match" unless expected == value.unicode_normalize(:nfkc)
fail "libidn does not match" unless expected == IDN::Stringprep.nfkc_normalize(value)
jarthod marked this conversation as resolved.
Show resolved Hide resolved
fail "addressable does not match" unless expected == Addressable::IDNA.unicode_normalize_kc(value)
jarthod marked this conversation as resolved.
Show resolved Hide resolved

Benchmark.bmbm do |x|
x.report("pure") { N.times { Addressable::IDNA.unicode_normalize_kc(value) } }
x.report("libidn") { N.times { IDN::Stringprep.nfkc_normalize(value) } }
x.report("ruby") { N.times { value.unicode_normalize(:nfkc) } }
end

# February 14th 2023, before replacing the legacy pure normalize code:

# > ruby benchmark/unicode_normalize.rb
# Rehearsal ------------------------------------------
# pure 1.335230 0.000315 1.335545 ( 1.335657)
# libidn 0.058568 0.000000 0.058568 ( 0.058570)
# ruby 0.326008 0.000014 0.326022 ( 0.326026)
# --------------------------------- total: 1.720135sec

# user system total real
# pure 1.325948 0.000000 1.325948 ( 1.326054)
# libidn 0.058067 0.000000 0.058067 ( 0.058069)
# ruby 0.325062 0.000000 0.325062 ( 0.325115)
4 changes: 0 additions & 4 deletions lib/addressable/idna/native.rb
Expand Up @@ -29,10 +29,6 @@ def self.punycode_decode(value)
IDN::Punycode.decode(value.to_s)
end

# Normalizes +value+ to Unicode form KC through the libidn binding.
#
# @param [String, #to_s] value
#   The text to normalize; it is converted with #to_s before being handed
#   to IDN::Stringprep.nfkc_normalize.
#
# @return The result of IDN::Stringprep.nfkc_normalize.
def self.unicode_normalize_kc(value)
IDN::Stringprep.nfkc_normalize(value.to_s)
end
jarthod marked this conversation as resolved.
Show resolved Hide resolved

def self.to_ascii(value)
value.to_s.split('.', -1).map do |segment|
if segment.size > 0 && segment.size < 64
Expand Down
186 changes: 2 additions & 184 deletions lib/addressable/idna/pure.rb
Expand Up @@ -66,7 +66,7 @@ module IDNA
# domain name as described in RFC 3490.
def self.to_ascii(input)
input = input.to_s unless input.is_a?(String)
input = input.dup
input = input.dup.force_encoding(Encoding::UTF_8).unicode_normalize(:nfkc)
jarthod marked this conversation as resolved.
Show resolved Hide resolved
if input.respond_to?(:force_encoding)
input.force_encoding(Encoding::ASCII_8BIT)
end
Expand All @@ -77,7 +77,7 @@ def self.to_ascii(input)
part.force_encoding(Encoding::ASCII_8BIT)
end
if part =~ UTF8_REGEX && part =~ UTF8_REGEX_MULTIBYTE
ACE_PREFIX + punycode_encode(unicode_normalize_kc(part))
ACE_PREFIX + punycode_encode(part)
jarthod marked this conversation as resolved.
Show resolved Hide resolved
else
part
end
Expand Down Expand Up @@ -112,15 +112,6 @@ def self.to_unicode(input)
output
end

##
# Normalizes text to Unicode normalization form KC.
#
# The input is unpacked into codepoints, which are decomposed, canonically
# reordered and recomposed before being packed back into a string.
#
# @param [String, #to_s] input
#   The text to normalize; anything that is not a String is converted
#   with #to_s first.
#
# @return [String] The normalized text.
def self.unicode_normalize_kc(input)
  text = input.is_a?(String) ? input : input.to_s
  codepoints = text.unpack("U*")
  decomposed = unicode_decompose(codepoints)
  reordered = unicode_sort_canonical(decomposed)
  unicode_compose(reordered).pack("U*")
end

##
# Unicode aware downcase method.
#
Expand All @@ -136,164 +127,6 @@ def self.unicode_downcase(input)
end
private_class_method :unicode_downcase

##
# Merges adjacent codepoints into precomposed codepoints where a
# composition exists for the pair.
#
# @param [Array<Integer>] unpacked
#   The codepoints to compose, in order.
#
# @return [Array<Integer>]
#   A new array of composed codepoints. An empty input is returned as-is.
def self.unicode_compose(unpacked)
  return unpacked if unpacked.empty?

  pending, *followers = unpacked
  # NOTE(review): the combining class is looked up for the first codepoint
  # only; that single answer decides whether any pair in the whole input is
  # ever offered for composition.
  composable = lookup_unicode_combining_class(pending) == 0
  composed = []
  followers.each do |codepoint|
    merged = composable ? unicode_compose_pair(pending, codepoint) : nil
    if merged.nil?
      # Nothing to merge: flush the pending codepoint and start over.
      composed << pending
      pending = codepoint
    else
      pending = merged
    end
  end
  composed << pending
end
private_class_method :unicode_compose

##
# Composes two adjacent codepoints into a single codepoint when possible.
#
# Hangul is handled arithmetically: a leading jamo followed by a vowel jamo
# becomes an LV syllable, and an LV syllable followed by a trailing jamo
# becomes an LVT syllable. Every other pair is looked up in the composition
# table, keyed by the UTF-8 bytes of both codepoints.
#
# @param [Integer] ch_one
#   The first codepoint of the pair.
# @param [Integer] ch_two
#   The codepoint that follows it.
#
# @return [Integer, nil]
#   The composed codepoint, or whatever the composition table yields
#   (nil when the pair has no entry).
def self.unicode_compose_pair(ch_one, ch_two)
  l_index = ch_one - HANGUL_LBASE
  v_index = ch_two - HANGUL_VBASE
  if l_index >= 0 && l_index < HANGUL_LCOUNT &&
     v_index >= 0 && v_index < HANGUL_VCOUNT
    # Hangul L + V
    return HANGUL_SBASE +
           (l_index * HANGUL_VCOUNT + v_index) * HANGUL_TCOUNT
  end

  s_index = ch_one - HANGUL_SBASE
  t_index = ch_two - HANGUL_TBASE
  # T index 0 means "no trailing consonant", so HANGUL_TBASE itself must not
  # be treated as a trailing jamo; accepting it would return ch_one unchanged
  # and silently drop ch_two from the text.
  if s_index >= 0 && s_index < HANGUL_SCOUNT &&
     s_index % HANGUL_TCOUNT == 0 &&
     t_index > 0 && t_index < HANGUL_TCOUNT
    # Hangul LV + T
    return ch_one + t_index
  end

  utf8_bytes = []
  ucs4_to_utf8(ch_one, utf8_bytes)
  ucs4_to_utf8(ch_two, utf8_bytes)
  lookup_unicode_composition(utf8_bytes)
end
private_class_method :unicode_compose_pair

##
# Appends the UTF-8 style byte sequence for a codepoint to a buffer.
#
# Sequences of one to six bytes are produced, covering every value below
# 0x80000000. Larger values append nothing.
#
# @param [Integer] char
#   The codepoint to encode.
# @param [Array<Integer>] buffer
#   The collection the encoded bytes are pushed onto with #<<.
#
# @return [Array<Integer>, nil]
#   The buffer, or nil when +char+ is 0x80000000 or larger.
def self.ucs4_to_utf8(char, buffer)
  # Values below 0x80 are stored as-is in a single slot.
  return buffer << char if char < 0x80

  # Each row: exclusive upper bound, lead-byte marker, continuation bytes.
  layouts = [
    [0x800,      0xC0, 1],
    [0x10000,    0xE0, 2],
    [0x200000,   0xF0, 3],
    [0x4000000,  0xF8, 4],
    [0x80000000, 0xFC, 5]
  ]
  _, lead_marker, trailing = layouts.find { |limit, _, _| char < limit }
  return nil if trailing.nil?

  buffer << (char >> (6 * trailing) | lead_marker)
  # Continuation bytes carry six bits each, most significant group first.
  (trailing - 1).downto(0) do |position|
    buffer << (char >> (6 * position) & 0x3F | 0x80)
  end
  buffer
end
private_class_method :ucs4_to_utf8

##
# Reorders runs of combining marks by ascending combining class.
#
# Two neighbours are swapped only when both have a non-zero combining class
# and the left one has the larger class. After a swap the scan steps back
# one position so the moved codepoint is compared with its new neighbour.
#
# @param [Array<Integer>] unpacked
#   The codepoints to reorder. The array itself is not modified.
#
# @return [Array<Integer>] A reordered copy of the input.
def self.unicode_sort_canonical(unpacked)
  ordered = unpacked.dup
  cursor = 1
  until cursor >= ordered.length
    left = ordered[cursor - 1]
    right = ordered[cursor]
    left_cc = lookup_unicode_combining_class(left)
    right_cc = lookup_unicode_combining_class(right)
    misordered = !(right_cc == 0 || left_cc == 0 || left_cc <= right_cc)
    if misordered
      ordered[cursor - 1] = right
      ordered[cursor] = left
      # Step back to re-check the swapped mark, but never before index 1.
      cursor = [cursor - 1, 1].max
    else
      cursor += 1
    end
  end
  ordered
end
private_class_method :unicode_sort_canonical

##
# Recursively expands codepoints into their compatibility decomposition.
#
# Precomposed Hangul syllables are split into jamo arithmetically. Other
# codepoints are replaced by the decomposition of their compatibility
# mapping when one exists, and copied through unchanged otherwise.
#
# @param [Array<Integer>] unpacked
#   The codepoints to decompose.
#
# @return [Array<Integer>] A new array holding the decomposed codepoints.
def self.unicode_decompose(unpacked)
  unpacked.each_with_object([]) do |codepoint, decomposed|
    syllable_offset = codepoint - HANGUL_SBASE
    if syllable_offset >= 0 && syllable_offset < HANGUL_SCOUNT
      lead, vowel, trail = unicode_decompose_hangul(codepoint)
      decomposed << lead
      # The vowel and trailing parts are optional and only kept when present.
      [vowel, trail].each { |jamo| decomposed << jamo if jamo }
    elsif (mapping = lookup_unicode_compatibility(codepoint))
      # A mapping may itself contain decomposable codepoints.
      decomposed.concat(unicode_decompose(mapping.unpack("U*")))
    else
      decomposed << codepoint
    end
  end
end
private_class_method :unicode_decompose

##
# Splits a precomposed Hangul syllable into its jamo codepoints.
#
# @param [Integer] codepoint
#   The codepoint to split.
#
# @return [Array]
#   A three-element array of leading, vowel and trailing jamo. The trailing
#   slot is nil for syllables without a trailing consonant. A codepoint
#   outside the syllable block comes back as [codepoint, nil, nil].
def self.unicode_decompose_hangul(codepoint)
  syllable_index = codepoint - HANGUL_SBASE
  unless syllable_index >= 0 && syllable_index < HANGUL_SCOUNT
    return [codepoint, nil, nil]
  end

  lead_index, within_lead = syllable_index.divmod(HANGUL_NCOUNT)
  vowel_index = within_lead / HANGUL_TCOUNT
  trail_index = syllable_index % HANGUL_TCOUNT
  [
    HANGUL_LBASE + lead_index,
    HANGUL_VBASE + vowel_index,
    # Index 0 means the syllable has no trailing consonant.
    trail_index == 0 ? nil : HANGUL_TBASE + trail_index
  ]
end
private_class_method :unicode_decompose_hangul

##
# Looks up the combining class recorded for a codepoint.
#
# @param [Integer] codepoint
#   The codepoint used as the key into UNICODE_DATA.
#
# @return
#   The UNICODE_DATA_COMBINING_CLASS field of the entry, or 0 when the
#   codepoint has no entry or the field is unset.
def self.lookup_unicode_combining_class(codepoint)
  entry = UNICODE_DATA[codepoint]
  return 0 unless entry

  entry[UNICODE_DATA_COMBINING_CLASS] || 0
end
private_class_method :lookup_unicode_combining_class

##
# Looks up the compatibility mapping recorded for a codepoint.
#
# @param [Integer] codepoint
#   The codepoint used as the key into UNICODE_DATA.
#
# @return
#   The UNICODE_DATA_COMPATIBILITY field of the entry, or nil when the
#   codepoint has no entry.
def self.lookup_unicode_compatibility(codepoint)
  entry = UNICODE_DATA[codepoint]
  return nil unless entry

  entry[UNICODE_DATA_COMPATIBILITY]
end
private_class_method :lookup_unicode_compatibility

def self.lookup_unicode_lowercase(codepoint)
codepoint_data = UNICODE_DATA[codepoint]
(codepoint_data ?
Expand All @@ -302,21 +135,6 @@ def self.lookup_unicode_lowercase(codepoint)
end
private_class_method :lookup_unicode_lowercase

##
# Looks up a composition in COMPOSITION_TABLE.
#
# @param [Array<Integer>] unpacked
#   The table key. The caller in this file passes the concatenated UTF-8
#   bytes of the two codepoints being composed.
#
# @return The table entry for the key, or nil when there is none.
def self.lookup_unicode_composition(unpacked)
  COMPOSITION_TABLE[unpacked]
end
private_class_method :lookup_unicode_composition

# Constants for the arithmetic conversion between precomposed Hangul
# syllables and their leading (L), vowel (V) and trailing (T) jamo.

# First codepoint of the precomposed syllable range.
HANGUL_SBASE = 0xac00
# First leading jamo and the number of leading jamo.
HANGUL_LBASE = 0x1100
HANGUL_LCOUNT = 19
# First vowel jamo and the number of vowel jamo.
HANGUL_VBASE = 0x1161
HANGUL_VCOUNT = 21
# Base for trailing jamo; an offset of 0 from it means "no trailing jamo".
HANGUL_TBASE = 0x11a7
HANGUL_TCOUNT = 28
# Number of syllables that share one leading jamo.
HANGUL_NCOUNT = HANGUL_VCOUNT * HANGUL_TCOUNT # 588
# Total number of precomposed syllables.
HANGUL_SCOUNT = HANGUL_LCOUNT * HANGUL_NCOUNT # 11172

UNICODE_DATA_COMBINING_CLASS = 0
UNICODE_DATA_EXCLUSION = 1
UNICODE_DATA_CANONICAL = 2
Expand Down
12 changes: 7 additions & 5 deletions lib/addressable/template.rb
Expand Up @@ -892,7 +892,7 @@ def join_values(operator, return_value)
# operator.
#
# @param [Hash, Array, String] value
# Normalizes keys and values with IDNA#unicode_normalize_kc
# Normalizes unicode keys and values with String#unicode_normalize (NFC)
#
# @return [Hash, Array, String] The normalized values
def normalize_value(value)
Expand All @@ -902,15 +902,17 @@ def normalize_value(value)

# Handle unicode normalization
if value.kind_of?(Array)
value.map! { |val| Addressable::IDNA.unicode_normalize_kc(val) }
value.map! { |val| normalize_value(val) }
elsif value.kind_of?(Hash)
value = value.inject({}) { |acc, (k, v)|
acc[Addressable::IDNA.unicode_normalize_kc(k)] =
Addressable::IDNA.unicode_normalize_kc(v)
acc[normalize_value(k)] = normalize_value(v)
acc
}
else
value = Addressable::IDNA.unicode_normalize_kc(value)
if value.encoding != Encoding::UTF_8
value = value.dup.force_encoding(Encoding::UTF_8)
end
value = value.unicode_normalize(:nfc)
jarthod marked this conversation as resolved.
Show resolved Hide resolved
end
value
end
Expand Down
9 changes: 4 additions & 5 deletions lib/addressable/uri.rb
Expand Up @@ -53,7 +53,7 @@ module CharacterClasses
PCHAR = (UNRESERVED + SUB_DELIMS + "\\:\\@").freeze
SCHEME = (ALPHA + DIGIT + "\\-\\+\\.").freeze
HOST = (UNRESERVED + SUB_DELIMS + "\\[\\:\\]").freeze
AUTHORITY = (PCHAR + "\\[\\:\\]").freeze
AUTHORITY = (PCHAR + "\\[\\]").freeze
jarthod marked this conversation as resolved.
Show resolved Hide resolved
PATH = (PCHAR + "\\/").freeze
QUERY = (PCHAR + "\\/\\?").freeze
FRAGMENT = (PCHAR + "\\/\\?").freeze
Expand Down Expand Up @@ -481,7 +481,7 @@ def self.unencode(uri, return_type=String, leave_encoded='')
leave_encoded.include?(c) ? sequence : c
end

result.force_encoding("utf-8")
result.force_encoding(Encoding::UTF_8)
jarthod marked this conversation as resolved.
Show resolved Hide resolved
if return_type == String
return result
elsif return_type == ::Addressable::URI
Expand Down Expand Up @@ -579,7 +579,7 @@ def self.normalize_component(component, character_class=
unencoded = self.unencode_component(component, String, leave_encoded)
begin
encoded = self.encode_component(
Addressable::IDNA.unicode_normalize_kc(unencoded),
unencoded.unicode_normalize(:nfc),
character_class,
leave_encoded
)
Expand Down Expand Up @@ -687,8 +687,7 @@ def self.normalized_encode(uri, return_type=String)
components.each do |key, value|
if value != nil
begin
components[key] =
Addressable::IDNA.unicode_normalize_kc(value.to_str)
components[key] = value.to_str.unicode_normalize(:nfc)
jarthod marked this conversation as resolved.
Show resolved Hide resolved
rescue ArgumentError
# Likely a malformed UTF-8 character, skip unicode normalization
components[key] = value.to_str
Expand Down
11 changes: 6 additions & 5 deletions spec/addressable/idna_spec.rb
Expand Up @@ -38,6 +38,12 @@
)).to eq("www.xn--8ws00zhy3a.com")
end

it "also accepts unicode strings encoded as ascii-8bit" do
expect(Addressable::IDNA.to_ascii(
"www.詹姆斯.com".b
jarthod marked this conversation as resolved.
Show resolved Hide resolved
)).to eq("www.xn--8ws00zhy3a.com")
end

jarthod marked this conversation as resolved.
Show resolved Hide resolved
it "should convert 'www.Iñtërnâtiônàlizætiøn.com' correctly" do
"www.Iñtërnâtiônàlizætiøn.com"
expect(Addressable::IDNA.to_ascii(
Expand Down Expand Up @@ -249,11 +255,6 @@
"example..host"
)).to eq("example..host")
end

it "should normalize 'string' correctly" do
expect(Addressable::IDNA.unicode_normalize_kc(:'string')).to eq("string")
expect(Addressable::IDNA.unicode_normalize_kc("string")).to eq("string")
end
jarthod marked this conversation as resolved.
Show resolved Hide resolved
end

describe Addressable::IDNA, "when using the pure-Ruby implementation" do
Expand Down
24 changes: 24 additions & 0 deletions spec/addressable/template_spec.rb
Expand Up @@ -1021,6 +1021,19 @@ def self.match(name)
)
end

it "normalizes as unicode even with wrong encoding specified" do
template = subject.partial_expand("query" => "Cafe\u0301".b)
expect(template.pattern).to eq(
"http://example.com/{resource}/Caf%C3%A9/"
)
end

it "raises on invalid unicode input" do
expect {
subject.partial_expand("query" => "M\xE9thode".b)
}.to raise_error(ArgumentError, "invalid byte sequence in UTF-8")
end

it "does not normalize unicode when byte semantics requested" do
template = subject.partial_expand({"query" => "Cafe\u0301"}, nil, false)
expect(template.pattern).to eq(
Expand Down Expand Up @@ -1081,6 +1094,17 @@ def self.match(name)
expect(uri).to eq("http://example.com/search/Caf%C3%A9/")
end

it "normalizes as unicode even with wrong encoding specified" do
uri = subject.expand("query" => "Cafe\u0301".b).to_str
expect(uri).to eq("http://example.com/search/Caf%C3%A9/")
end

it "raises on invalid unicode input" do
expect {
subject.expand("query" => "M\xE9thode".b).to_str
}.to raise_error(ArgumentError, "invalid byte sequence in UTF-8")
end

it "does not normalize unicode when byte semantics requested" do
uri = subject.expand({ "query" => "Cafe\u0301" }, nil, false).to_str
expect(uri).to eq("http://example.com/search/Cafe%CC%81/")
Expand Down