Skip to content

Commit

Permalink
Do more conservative URL normalization (#758)
Browse files Browse the repository at this point in the history
  • Loading branch information
c960657 committed Oct 6, 2023
1 parent 65276d7 commit 8b802bf
Show file tree
Hide file tree
Showing 4 changed files with 118 additions and 2 deletions.
1 change: 1 addition & 0 deletions .rubocop.yml
@@ -1,6 +1,7 @@
inherit_from:
- .rubocop_todo.yml
- .rubocop/layout.yml
- .rubocop/metrics.yml
- .rubocop/style.yml

AllCops:
Expand Down
4 changes: 4 additions & 0 deletions .rubocop/metrics.yml
@@ -0,0 +1,4 @@
# Exempt spec files and gemspecs from the Metrics/BlockLength cop.
Metrics/BlockLength:
Exclude:
- 'spec/**/*.rb'
- '*.gemspec'
20 changes: 18 additions & 2 deletions lib/http/uri.rb
Expand Up @@ -37,15 +37,18 @@ class URI
# The "https" scheme string.
#
# @private
HTTPS_SCHEME = "https"

# Matches one or more consecutive characters outside the range 0x21-0x7E,
# i.e. anything that is not a printable, non-space ASCII character:
# control characters, space, DEL (0x7F) and all non-ASCII characters.
#
# @private
PERCENT_ENCODE = /[^\x21-\x7E]+/.freeze

# Default URI normalizer.
#
# Parses the given value with HTTP::URI.parse and builds a new HTTP::URI from
# its components:
#
# - scheme, authority and fragment come from the parsed URI's
#   normalized_scheme, normalized_authority and normalized_fragment;
# - an empty path becomes "/"; any other path is passed through
#   Addressable::URI.normalize_path and then percent_encode;
# - the query is passed through percent_encode (a nil query stays nil).
#
# Each hash key is listed exactly once. A duplicated key in a hash literal is
# silently overridden by its last occurrence (Ruby only emits a warning), so
# earlier duplicates of :path and :query would be dead code.
#
# @private
NORMALIZER = lambda do |uri|
  uri = HTTP::URI.parse uri

  HTTP::URI.new(
    :scheme => uri.normalized_scheme,
    :authority => uri.normalized_authority,
    :path => uri.path.empty? ? "/" : percent_encode(Addressable::URI.normalize_path(uri.path)),
    :query => percent_encode(uri.query),
    :fragment => uri.normalized_fragment
  )
end
Expand All @@ -71,6 +74,19 @@ def self.form_encode(form_values, sort = false)
Addressable::URI.form_encode(form_values, sort)
end

# Percent-encodes every run of characters matched by PERCENT_ENCODE.
#
# Each matched run is transcoded to UTF-8 and every resulting byte is written
# as an upper-case, two-digit "%XX" escape. Text the pattern does not match is
# returned unchanged.
#
# @param [String, nil] string raw string
#
# @return [String, nil] encoded value, or nil when the input is nil
#
# @private
def self.percent_encode(string)
  return if string.nil?

  string.gsub(PERCENT_ENCODE) do |unsafe_run|
    utf8_run = unsafe_run.encode(Encoding::UTF_8)
    utf8_run.each_byte.map { |byte| format("%%%02X", byte) }.join
  end
end

# Creates an HTTP::URI instance from the given options
#
# @param [Hash, Addressable::URI] options_or_uri
Expand Down
95 changes: 95 additions & 0 deletions spec/lib/http/uri/normalizer_spec.rb
@@ -0,0 +1,95 @@
# frozen_string_literal: true

# Specs for HTTP::URI::NORMALIZER, grouped by the URI component whose
# normalization is being checked (scheme, hostname, path, query).
RSpec.describe HTTP::URI::NORMALIZER do
  # Runs the normalizer under test on a raw URI string and returns the result.
  def normalize(raw_uri)
    HTTP::URI::NORMALIZER.call(raw_uri)
  end

  describe "scheme" do
    it "lower-cases scheme" do
      expect(normalize("HttP://example.com").scheme).to eq("http")
    end
  end

  describe "hostname" do
    it "lower-cases hostname" do
      expect(normalize("http://EXAMPLE.com").host).to eq("example.com")
    end

    it "decodes percent-encoded hostname" do
      expect(normalize("http://ex%61mple.com").host).to eq("example.com")
    end

    it "removes trailing period in hostname" do
      expect(normalize("http://example.com.").host).to eq("example.com")
    end

    it "IDN-encodes non-ASCII hostname" do
      expect(normalize("http://exämple.com").host).to eq("xn--exmple-cua.com")
    end
  end

  describe "path" do
    it "ensures path is not empty" do
      expect(normalize("http://example.com").path).to eq("/")
    end

    it "preserves double slashes in path" do
      expect(normalize("http://example.com//a///b").path).to eq("//a///b")
    end

    it "resolves single-dot segments in path" do
      expect(normalize("http://example.com/a/./b").path).to eq("/a/b")
    end

    it "resolves double-dot segments in path" do
      expect(normalize("http://example.com/a/b/../c").path).to eq("/a/c")
    end

    it "resolves leading double-dot segments in path" do
      expect(normalize("http://example.com/../a/b").path).to eq("/a/b")
    end

    it "percent-encodes control characters in path" do
      expect(normalize("http://example.com/\x00\x7F\n").path).to eq("/%00%7F%0A")
    end

    it "percent-encodes space in path" do
      expect(normalize("http://example.com/a b").path).to eq("/a%20b")
    end

    it "percent-encodes non-ASCII characters in path" do
      expect(normalize("http://example.com/キョ").path).to eq("/%E3%82%AD%E3%83%A7")
    end

    it "does not percent-encode non-special characters in path" do
      expect(normalize("http://example.com/~.-_!$&()*,;=:@{}").path).to eq("/~.-_!$&()*,;=:@{}")
    end

    it "preserves escape sequences in path" do
      expect(normalize("http://example.com/%41").path).to eq("/%41")
    end
  end

  describe "query" do
    it "allows no query" do
      expect(normalize("http://example.com").query).to be_nil
    end

    it "percent-encodes control characters in query" do
      expect(normalize("http://example.com/?\x00\x7F\n").query).to eq("%00%7F%0A")
    end

    it "percent-encodes space in query" do
      expect(normalize("http://example.com/?a b").query).to eq("a%20b")
    end

    it "percent-encodes non-ASCII characters in query" do
      expect(normalize("http://example.com?キョ").query).to eq("%E3%82%AD%E3%83%A7")
    end

    it "does not percent-encode non-special characters in query" do
      expect(normalize("http://example.com/?~.-_!$&()*,;=:@{}?").query).to eq("~.-_!$&()*,;=:@{}?")
    end

    it "preserves escape sequences in query" do
      expect(normalize("http://example.com/?%41").query).to eq("%41")
    end
  end
end

0 comments on commit 8b802bf

Please sign in to comment.