Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

libidn2 support for IDNA2008+UTS#46 (using ffi) #496

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
Open
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Expand Up @@ -13,7 +13,7 @@ jobs:
fail-fast: false
matrix:
ruby: [2.7]
idna_mode: [native, pure]
idna_mode: [native2, native, pure]
jarthod marked this conversation as resolved.
Show resolved Hide resolved
os: [ubuntu-20.04]
env:
IDNA_MODE: ${{ matrix.idna_mode }}
Expand Down
4 changes: 3 additions & 1 deletion addressable.gemspec
Expand Up @@ -13,7 +13,7 @@ Gem::Specification.new do |s|
s.description = "Addressable is an alternative implementation to the URI implementation that is\npart of Ruby's standard library. It is flexible, offers heuristic parsing, and\nadditionally provides extensive support for IRIs and URI templates.\n".freeze
s.email = "bob@sporkmonger.com".freeze
s.extra_rdoc_files = ["README.md".freeze]
s.files = ["CHANGELOG.md".freeze, "Gemfile".freeze, "LICENSE.txt".freeze, "README.md".freeze, "Rakefile".freeze, "data/unicode.data".freeze, "lib/addressable".freeze, "lib/addressable.rb".freeze, "lib/addressable/idna".freeze, "lib/addressable/idna.rb".freeze, "lib/addressable/idna/native.rb".freeze, "lib/addressable/idna/pure.rb".freeze, "lib/addressable/template.rb".freeze, "lib/addressable/uri.rb".freeze, "lib/addressable/version.rb".freeze, "spec/addressable".freeze, "spec/addressable/idna_spec.rb".freeze, "spec/addressable/net_http_compat_spec.rb".freeze, "spec/addressable/security_spec.rb".freeze, "spec/addressable/template_spec.rb".freeze, "spec/addressable/uri_spec.rb".freeze, "spec/spec_helper.rb".freeze, "tasks/clobber.rake".freeze, "tasks/gem.rake".freeze, "tasks/git.rake".freeze, "tasks/metrics.rake".freeze, "tasks/profile.rake".freeze, "tasks/rspec.rake".freeze, "tasks/yard.rake".freeze]
s.files = ["CHANGELOG.md".freeze, "Gemfile".freeze, "LICENSE.txt".freeze, "README.md".freeze, "Rakefile".freeze, "data/unicode.data".freeze, "lib/addressable".freeze, "lib/addressable.rb".freeze, "lib/addressable/idna".freeze, "lib/addressable/idna.rb".freeze, "lib/addressable/idna/native.rb".freeze, "lib/addressable/idna/native2.rb".freeze, "lib/addressable/idna/pure.rb".freeze, "lib/addressable/template.rb".freeze, "lib/addressable/uri.rb".freeze, "lib/addressable/version.rb".freeze, "spec/addressable".freeze, "spec/addressable/idna_spec.rb".freeze, "spec/addressable/net_http_compat_spec.rb".freeze, "spec/addressable/security_spec.rb".freeze, "spec/addressable/template_spec.rb".freeze, "spec/addressable/uri_spec.rb".freeze, "spec/spec_helper.rb".freeze, "tasks/clobber.rake".freeze, "tasks/gem.rake".freeze, "tasks/git.rake".freeze, "tasks/metrics.rake".freeze, "tasks/profile.rake".freeze, "tasks/rspec.rake".freeze, "tasks/yard.rake".freeze]
jarthod marked this conversation as resolved.
Show resolved Hide resolved
s.homepage = "https://github.com/sporkmonger/addressable".freeze
s.licenses = ["Apache-2.0".freeze]
s.rdoc_options = ["--main".freeze, "README.md".freeze]
Expand All @@ -27,9 +27,11 @@ Gem::Specification.new do |s|

if s.respond_to? :add_runtime_dependency then
s.add_runtime_dependency(%q<public_suffix>.freeze, [">= 2.0.2", "< 6.0"])
s.add_runtime_dependency(%q<ffi>.freeze)
s.add_development_dependency(%q<bundler>.freeze, [">= 1.0", "< 3.0"])
else
s.add_dependency(%q<public_suffix>.freeze, [">= 2.0.2", "< 6.0"])
s.add_dependency(%q<ffi>.freeze)
jarthod marked this conversation as resolved.
Show resolved Hide resolved
s.add_dependency(%q<bundler>.freeze, [">= 1.0", "< 3.0"])
end
end
65 changes: 65 additions & 0 deletions benchmark/idna.rb
@@ -0,0 +1,65 @@
#!/usr/bin/env ruby
# frozen_string_literal: true

# Benchmarks an IDNA round trip (to_ascii followed by to_unicode) for each
# backend file: pure.rb (pure Ruby), native.rb (libidn) and native2.rb
# (libidn2). The backend files are loaded through relative paths, so the
# script is expected to be started from the repository root.

require "benchmark"

value = "fiᆵリ宠퐱卄.com"
expected = "xn--fi-w1k207vk59a3qk9w9r.com"
N = 100_000

Benchmark.bmbm do |x|
  x.report("pure") {
    load "lib/addressable/idna/pure.rb"
    # Fail fast if this backend does not produce the expected encoding.
    fail "pure ruby does not match" unless expected == Addressable::IDNA.to_ascii(value)
    N.times { Addressable::IDNA.to_unicode(Addressable::IDNA.to_ascii(value)) }
    # Drop the module so the next report loads its own backend from scratch.
    Addressable.send(:remove_const, :IDNA)
  }

  x.report("libidn") {
    load "lib/addressable/idna/native.rb"
    fail "libidn does not match" unless expected == Addressable::IDNA.to_ascii(value)
    N.times { Addressable::IDNA.to_unicode(Addressable::IDNA.to_ascii(value)) }
    Addressable.send(:remove_const, :IDNA)
  }

  x.report("libidn2") {
    load "lib/addressable/idna/native2.rb"
    fail "libidn2 does not match" unless expected == Addressable::IDNA.to_ascii(value)
    N.times { Addressable::IDNA.to_unicode(Addressable::IDNA.to_ascii(value)) }
    Addressable.send(:remove_const, :IDNA)
  }
end

# > ruby benchmark/idna.rb
# Rehearsal -------------------------------------------
# pure 5.914630 0.000000 5.914630 ( 5.915326)
# libidn 0.518971 0.003672 0.522643 ( 0.522676)
# libidn2 0.763936 0.000000 0.763936 ( 0.763983)
# ---------------------------------- total: 7.201209sec

# user system total real
# pure 6.042877 0.000000 6.042877 ( 6.043252)
# libidn 0.521668 0.000000 0.521668 ( 0.521704)
# libidn2 0.764782 0.000000 0.764782 ( 0.764863)

# Memory leak check: repeat the libidn2 round trip in batches and print the
# resident set size of this process after each batch. A leak in the FFI
# bindings would show up as a figure that keeps growing.
puts "\nMemory leak test for libidn2 (memory should stabilize quickly):"
load "lib/addressable/idna/native2.rb"
GC.disable # Only run GC when manually called
10.times do
  N.times { Addressable::IDNA.to_unicode(Addressable::IDNA.to_ascii(value)) }
  GC.start # Run a major GC
  # Ask ps for this process only. Grepping the full `ps ax` listing for a PID
  # prefix could also match another process whose PID starts with the same digits.
  rss_kb = `ps -o rss= -p #{Process.pid}`.to_i
  puts " Memory: #{rss_kb / 1024}MB" # show process memory
end
jarthod marked this conversation as resolved.
Show resolved Hide resolved

# Memory leak test for libidn2 (memory should stabilize quickly):
# Memory: 117MB
# Memory: 121MB
# Memory: 121MB
# Memory: 121MB
# Memory: 121MB
# Memory: 121MB
# Memory: 121MB
# Memory: 121MB
# Memory: 121MB
# Memory: 121MB
16 changes: 10 additions & 6 deletions lib/addressable/idna.rb
Expand Up @@ -16,11 +16,15 @@
# limitations under the License.
#++


begin
require "addressable/idna/native"
require "addressable/idna/native2"
rescue LoadError
# libidn or the idn gem was not available, fall back on a pure-Ruby
# implementation...
require "addressable/idna/pure"
end
# libidn2 or the ffi gem was not available, fall back on libidn1
begin
require "addressable/idna/native"
rescue LoadError
# libidn or the idn gem was not available, fall back on a pure-Ruby
# implementation...
require "addressable/idna/pure"
end
end
jarthod marked this conversation as resolved.
Show resolved Hide resolved
10 changes: 1 addition & 9 deletions lib/addressable/idna/native.rb
Expand Up @@ -16,19 +16,11 @@
# limitations under the License.
#++


# libidn1 implementing IDNA2003
require "idn"

module Addressable
module IDNA
def self.punycode_encode(value)
IDN::Punycode.encode(value.to_s)
end

def self.punycode_decode(value)
IDN::Punycode.decode(value.to_s)
end

jarthod marked this conversation as resolved.
Show resolved Hide resolved
def self.to_ascii(value)
value.to_s.split('.', -1).map do |segment|
if segment.size > 0 && segment.size < 64
Expand Down
57 changes: 57 additions & 0 deletions lib/addressable/idna/native2.rb
@@ -0,0 +1,57 @@
# frozen_string_literal: true

#--
# Copyright (C) Bob Aman
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#++

# libidn2 implementing IDNA2008+TR46
require "ffi"

module Addressable
module IDNA
extend FFI::Library

ffi_lib ["idn2", "libidn2.0", "libidn2.so.0"]

attach_function :idn2_to_ascii_8z, %i[string pointer int], :int
attach_function :idn2_to_unicode_8z8z, %i[string pointer int], :int
attach_function :idn2_strerror, [:int], :string
attach_function :idn2_free, [:pointer], :void

IDN2_TRANSITIONAL = 4
IDN2_NONTRANSITIONAL = 8

# Converts a hostname to its ASCII (punycode) form through libidn2.
#
# The conversion is attempted with the IDN2_NONTRANSITIONAL flag first and
# retried with IDN2_TRANSITIONAL when that attempt fails.
#
# @param value [#to_s] hostname to convert; coerced with +to_s+ like the
#   libidn1 backend does
# @return [String] the input itself when it is already ASCII-only, otherwise
#   the string produced by libidn2, tagged as UTF-8 like the result of
#   +to_unicode+
# @raise [RuntimeError] when libidn2 rejects the input with both flags
def self.to_ascii(value)
  value = value.to_s
  return value if value.ascii_only?

  pointer = FFI::MemoryPointer.new(:pointer)
  res = idn2_to_ascii_8z(value, pointer, IDN2_NONTRANSITIONAL)
  # Fallback to Transitional mode in case of disallowed character
  res = idn2_to_ascii_8z(value, pointer, IDN2_TRANSITIONAL) if res != 0
  raise "libidn2 failed to convert \"#{value}\" to ascii (#{idn2_strerror(res)})" if res != 0

  output = pointer.read_pointer
  begin
    # read_string copies the bytes into a Ruby string, so the C buffer can be
    # released right after, even if the copy itself raises.
    output.read_string.force_encoding("UTF-8")
  ensure
    idn2_free(output)
  end
end

def self.to_unicode(value)
pointer = FFI::MemoryPointer.new(:pointer)
res = idn2_to_unicode_8z8z(value, pointer, IDN2_NONTRANSITIONAL)
return value if res != 0
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I initially raised an exception here in case of invalid input, but it turns out the specs expect an invalid punycode hostname to simply be returned unchanged, so I did just that instead. It's hiding errors and silently returning the input string now; not very strict, I suppose, but more compatible with existing usage 🤷

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Something to change in a major version bump?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, it's more of a design choice. It's true that if it were my decision I would prefer the stricter version that raises an exception, and since I suggest shipping this in a major version, we could probably do it.

But on the other hand, I know the direction of the gem is to be "flexible, offers heuristic parsing", as opposed to the Ruby URI module, so I understand that accepting invalid input and keeping it unchanged without raising can be a feature and a design choice. So if you guys prefer to keep this flexibility, I totally understand.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this the test case we're talking about?

it "should return the identity conversion when punycode decode fails" do
expect(Addressable::IDNA.to_unicode("xn--zckp1cyg1.sblo.jp")).to eq(
"xn--zckp1cyg1.sblo.jp")
end

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This one and 2 others would be failing if I raise an error here:

  1) Addressable::IDNA when using the libidn2 native implementation (ffi) it should behave like converting from ASCII to unicode should convert 'AcinusFallumTrompetumNullunCreditumVisumEstAtCuadLongumEtCefallum.com' correctly
     Failure/Error: raise "libidn2 failed to convert \"#{value}\" to unicode (#{idn2_strerror(res)})" if res != 0
     
     RuntimeError:
       libidn2 failed to convert "AcinusFallumTrompetumNullunCreditumVisumEstAtCuadLongumEtCefallum.com" to unicode (domain label longer than 63 characters)
     Shared Example Group: "converting from ASCII to unicode" called from ./spec/addressable/idna_spec.rb:321
     # ./lib/addressable/idna/libidn2.rb:52:in `to_unicode'
     # ./lib/addressable/idna.rb:30:in `to_unicode'
     # ./spec/addressable/idna_spec.rb:160:in `block (2 levels) in <top (required)>'

  2) Addressable::IDNA when using the libidn2 native implementation (ffi) it should behave like converting from ASCII to unicode should return the identity conversion when punycode decode fails
     Failure/Error: raise "libidn2 failed to convert \"#{value}\" to unicode (#{idn2_strerror(res)})" if res != 0
     
     RuntimeError:
       libidn2 failed to convert "xn--zckp1cyg1.sblo.jp" to unicode (string contains invalid punycode data)
     Shared Example Group: "converting from ASCII to unicode" called from ./spec/addressable/idna_spec.rb:321
     # ./lib/addressable/idna/libidn2.rb:52:in `to_unicode'
     # ./lib/addressable/idna.rb:30:in `to_unicode'
     # ./spec/addressable/idna_spec.rb:164:in `block (2 levels) in <top (required)>'

  3) Addressable::IDNA when using the libidn2 native implementation (ffi) it should behave like converting from ASCII to unicode should return the identity conversion when the ACE prefix has no suffix
     Failure/Error: raise "libidn2 failed to convert \"#{value}\" to unicode (#{idn2_strerror(res)})" if res != 0
     
     RuntimeError:
       libidn2 failed to convert "xn--...-" to unicode (string contains invalid punycode data)
     Shared Example Group: "converting from ASCII to unicode" called from ./spec/addressable/idna_spec.rb:321
     # ./lib/addressable/idna/libidn2.rb:52:in `to_unicode'
     # ./lib/addressable/idna.rb:30:in `to_unicode'
     # ./spec/addressable/idna_spec.rb:169:in `block (2 levels) in <top (required)>'

The last two are invalid punycode and the first one is invalid DNS length (https://datatracker.ietf.org/doc/html/rfc1034#section-3.1).

libidn1 also raises in the first case, but we have this workaround to explicitly allow labels longer than 63 bytes:

    def self.to_ascii(value)
      value.to_s.split('.', -1).map do |segment|
        if segment.size > 0 && segment.size < 64
          IDN::Idna.toASCII(segment, IDN::Idna::ALLOW_UNASSIGNED)
        elsif segment.size >= 64
          segment
        else
          ''
        end
      end.join('.')
    end

Looks like this was made in c73810f to make it more consistent with pure. Didn't see any issue attached though.

So I suppose if we make libidn2 stricter, which means basically:

raise "libidn2 failed to convert \"#{value}\" to unicode (#{idn2_strerror(res)})" if res != 0

instead of

return value if res != 0

We would need to remove these workarounds and make all implementations reject these domains in the same way.
Which does sound like the way to go IMO but of course could break some use-cases for people who need to handle such "slightly" invalid domains.

Copy link
Contributor Author

@jarthod jarthod Apr 18, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Interesting suggestion. I do like the fact that, since this solution works at run time, we can use this class even if it's not defined yet. Unfortunately it doesn't work with classes, only modules: I get "wrong argument type Class (expected Module)" if I do error.extend(Addressable::URI::InvalidURIError). In your example Twingly::URL::Error is actually a module. And if we need to change Addressable::URI::InvalidURIError to be a module, this would complicate the rest :/ I couldn't find any way to change the ancestor chain by adding another class in the middle.

Looking again at uri.rb I see IDNA is used only twice (in normalized_host for to_ascii and in display_uri for to_unicode), so option 2 which is to re-wrap the error here doesn't sound too complicated either.

I just gave this option a try in 9eb3910 and I think I actually prefer this one. No hierarchy issue here: every module/class only deals with its own exceptions. The wrapped exceptions are properly identified and the cause attribute contains the previous exception (with specs for that just in case), which means the backtrace and history are complete for bug trackers. People doing rescue Addressable::URI::InvalidURIError are covered the same way.

I also added by the way specs for the case of invalid IDNA hostname at URI level this time (I couldn't find any at the moment). 3 of them for the Pure implementation have been marked pending because they are returning garbage at the moment (implementation makes up unicode characters from invalid input).

I also fixed the libidn1 exception handling, which was still letting IDN::Idna::IdnaError exceptions bubble up unhandled (I missed it earlier because there was no spec for this case); it now raises Addressable::IDNA::Error like the other backends (+spec).

Sorry for the long back and forth, let me know what you think about this one ;)

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh sorry for that, I did know about the module thing but forgot before I posted, oh well

Yes, I like 9eb3910 too :) Thanks for extending the spec coverage

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No problem ^^ I wasn't sure so I tried.

In order to validate this branch more while you fiddle with it, I just deployed this version to staging and then production on my service. Using libidn2 and strict_mode:

# Select libidn2 (not the default at the moment)
require "addressable/idna/libidn2"
Addressable::IDNA.backend = Addressable::IDNA::Libidn2
Addressable::IDNA.strict_mode = true

If I see any problem I'll report it here.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Necro-ing a little bit, but wanted to weigh in on the start of the conversation. I think it's important to offer a mechanism that's permissive in what it accepts. It's literally the reason I wrote Addressable in the first place, because the standard library doesn't take this approach to parsing and I couldn't parse URIs that were openable in a browser, leading to surprise from end users. There are often cases where failing with an exception will mean that there's no graceful way to get partial information. For instance, something might be very wrong with the encoding in the hostname, but if the library's user was only trying to retrieve the path value, the invalid URI exception is rather obstructive to that goal.

On the other hand, you're absolutely right that there are cases where the opposite is preferred and strict parsing is preferable. My view is these should simply be handled by different methods rather than changing the behavior for the whole library in a major version rev.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@sporkmonger thanks for your feedback.

I understand and agree. The consensus that @dentarg and I reached further down this discussion was to introduce the Addressable::IDNA.strict_mode = true option (defaulting to false) so that people can opt in to stricter parsing; otherwise it stays as lax as before.

Of course, if you prefer different methods instead of an option, we can probably do that instead. Though if there are different methods in the IDNA modules for both behaviors, we would also need to mirror that in the URI module, because that is where the methods people usually call live. I haven't checked the whole public API recently, but I'm concerned this may create a lot of new methods. I also thought about people using gems which depend on addressable: if we're using different methods, the end user won't be able to change the behavior. I can have a deeper look if you want me to 👍

result = pointer.read_pointer.read_string
idn2_free(pointer.read_pointer)
result.force_encoding('UTF-8')
end
end
end
41 changes: 36 additions & 5 deletions spec/addressable/idna_spec.rb
Expand Up @@ -258,14 +258,18 @@
end

describe Addressable::IDNA, "when using the pure-Ruby implementation" do
before do
before :all do
jarthod marked this conversation as resolved.
Show resolved Hide resolved
Addressable.send(:remove_const, :IDNA)
load "addressable/idna/pure.rb"
end

it_should_behave_like "converting from unicode to ASCII"
it_should_behave_like "converting from ASCII to unicode"

it "should implement IDNA2008 non transitional" do
expect(Addressable::IDNA.to_ascii("faß.de")).to eq("xn--fa-hia.de")
end

begin
require "fiber"

Expand All @@ -285,18 +289,45 @@
begin
require "idn"

describe Addressable::IDNA, "when using the native-code implementation" do
before do
describe Addressable::IDNA, "when using the libidn1 native implementation (idn gem)" do
before :all do
Addressable.send(:remove_const, :IDNA)
load "addressable/idna/native.rb"
end

it_should_behave_like "converting from unicode to ASCII"
it_should_behave_like "converting from ASCII to unicode"

it "should implement IDNA2003" do
expect(Addressable::IDNA.to_ascii("faß.de")).to eq("fass.de")
end
end
rescue LoadError => error
raise error if ENV["CI"] && TestHelper.native_supported?

# Cannot test the native implementation without libidn support.
warn('Could not load native IDN implementation.')
# Cannot test the native implementation without libidn installed.
warn('Could not load native libidn1 implementation.')
end

# Specs for the libidn2 (ffi) backend. The group is only defined when the
# backend file can be required; a LoadError is handled by the rescue below.
begin
require "addressable/idna/native2.rb"

describe Addressable::IDNA, "when using the libidn2 native implementation (ffi)" do
before :all do
# Replace whichever Addressable::IDNA is currently defined with the
# libidn2 implementation for every example in this group.
Addressable.send(:remove_const, :IDNA)
load "addressable/idna/native2.rb"
end

it_should_behave_like "converting from unicode to ASCII"
it_should_behave_like "converting from ASCII to unicode"

# Backend-specific expectation: "ß" is punycode-encoded rather than
# rewritten to "ss".
it "should implement IDNA2008 non transitional" do
expect(Addressable::IDNA.to_ascii("faß.de")).to eq("xn--fa-hia.de")
end
end
rescue LoadError => error
# Re-raise when the CI environment variable is set and
# TestHelper.native_supported? is truthy.
# NOTE(review): TestHelper is defined outside this view — confirm what
# native_supported? actually checks.
raise error if ENV["CI"] && TestHelper.native_supported?

# Cannot test the native implementation without libidn2 installed.
warn('Could not load native libidn2 implementation.')
end
5 changes: 4 additions & 1 deletion tasks/profile.rake
Expand Up @@ -41,13 +41,16 @@ namespace :profile do
if ENV["IDNA_MODE"] == "pure"
Addressable.send(:remove_const, :IDNA)
load "addressable/idna/pure.rb"
elsif ENV["IDNA_MODE"] == "native"
Addressable.send(:remove_const, :IDNA)
load "addressable/idna/native.rb"
end

start_at = Time.now.to_f
report = MemoryProfiler.report do
30_000.times do
Addressable::URI.parse(
"http://google.com/stuff/../?with_lots=of&params=asdff#!stuff"
"http://fiᆵリ宠퐱卄.com/stuff/../?with_lots=of&params=asdff#!stuff"
jarthod marked this conversation as resolved.
Show resolved Hide resolved
).normalize
end
end
Expand Down