From e3b04ebab73ad7e5c35bdf02d4ca84021be725e5 Mon Sep 17 00:00:00 2001 From: Adrien Rey-Jarthon Date: Sat, 2 Jul 2022 12:01:52 +0200 Subject: [PATCH] fix "invalid byte sequence in UTF-8" exception when unencoding URLs containing non UTF-8 characters --- lib/addressable/uri.rb | 12 +++--------- spec/addressable/uri_spec.rb | 5 +++++ 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/lib/addressable/uri.rb b/lib/addressable/uri.rb index 6e55cda9..3ded0e87 100644 --- a/lib/addressable/uri.rb +++ b/lib/addressable/uri.rb @@ -468,19 +468,13 @@ def self.unencode(uri, return_type=String, leave_encoded='') "Expected Class (String or Addressable::URI), " + "got #{return_type.inspect}" end - uri = uri.dup - # Seriously, only use UTF-8. I'm really not kidding! - uri.force_encoding("utf-8") - unless leave_encoded.empty? - leave_encoded = leave_encoded.dup.force_encoding("utf-8") - end - - result = uri.gsub(/%[0-9a-f]{2}/iu) do |sequence| + result = uri.gsub(/%[0-9a-f]{2}/i) do |sequence| c = sequence[1..3].to_i(16).chr - c.force_encoding("utf-8") + c.force_encoding(sequence.encoding) leave_encoded.include?(c) ? sequence : c end + result.force_encoding("utf-8") if return_type == String return result diff --git a/spec/addressable/uri_spec.rb b/spec/addressable/uri_spec.rb index 76edaad0..b8ca5213 100644 --- a/spec/addressable/uri_spec.rb +++ b/spec/addressable/uri_spec.rb @@ -5992,6 +5992,11 @@ def to_str expect(Addressable::URI.unencode_component("ski=%BA%DAɫ")).to eq("ski=\xBA\xDAɫ") end + it "should not fail with UTF-8 incompatible string" do + url = "/M%E9/\xE9?p=\xFC".b + expect(Addressable::URI.unencode_component(url)).to eq("/M\xE9/\xE9?p=\xFC") + end + it "should result in correct percent encoded sequence as a URI" do expect(Addressable::URI.unencode( "/path?g%C3%BCnther", ::Addressable::URI