Skip to content

Commit

Permalink
Merge pull request #158 from dentarg/update-for-addressable-2.8.1
Browse files Browse the repository at this point in the history
Validate the normalized hostname
  • Loading branch information
Pontus4 committed Oct 7, 2022
2 parents 767807a + ae2d264 commit f95151c
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 9 deletions.
27 changes: 24 additions & 3 deletions lib/twingly/url.rb
Expand Up @@ -22,6 +22,8 @@ class URL
Addressable::URI::InvalidURIError,
PublicSuffix::DomainInvalid,
].freeze
# Separator between hostname labels; valid_hostname? splits the host on it.
DOT = "."
# A label is rejected by valid_label? when it starts or ends with this character.
HYPHEN = "-"
# Named code points: U+000D, U+000A and U+00A0 (no-break space).
# NOTE(review): presumably these feed WHITESPACE_CHARS, whose definition is
# only partly visible in this excerpt — confirm.
CARRIAGE_RETURN = "\u000D"
LINE_FEED = "\u000A"
NBSP = "\u00A0"
Expand All @@ -34,16 +36,20 @@ class URL
].join.freeze
# Matches a run of WHITESPACE_CHARS anchored at the very start (\A) or the
# very end (\z) of the string; strip_whitespace removes those runs via gsub.
LEADING_AND_TRAILING_WHITESPACE =
/\A[#{WHITESPACE_CHARS}]+|[#{WHITESPACE_CHARS}]+\z/.freeze
# Whole-string match of one or more ASCII letters, digits or hyphens; used by
# valid_label? to vet a single hostname label.
LETTERS_DIGITS_HYPHEN = /\A[a-zA-Z0-9-]+\z/.freeze

# Keep implementation-detail constants out of the class's public API.
# NOTE(review): CARRIAGE_RETURN and LINE_FEED are not in this list, so they
# remain publicly readable — confirm that is intended.
private_constant :ACCEPTED_SCHEMES
private_constant :CUSTOM_PSL
private_constant :STARTS_WITH_WWW
private_constant :ENDS_WITH_SLASH
private_constant :ERRORS_TO_EXTEND
private_constant :DOT
private_constant :HYPHEN
private_constant :NBSP
private_constant :SPACE
private_constant :WHITESPACE_CHARS
private_constant :LEADING_AND_TRAILING_WHITESPACE
private_constant :LETTERS_DIGITS_HYPHEN

class << self
def parse(potential_url)
Expand Down Expand Up @@ -91,10 +97,9 @@ def strip_whitespace(input)
input.gsub(LEADING_AND_TRAILING_WHITESPACE, "")
end

# Workaround for the following bug in addressable:
# https://github.com/sporkmonger/addressable/issues/224
def try_addressable_normalize(addressable_uri)
addressable_uri.normalize
ascii_host = addressable_uri.normalize.host
raise Twingly::URL::Error::ParseError unless valid_hostname?(ascii_host)
rescue ArgumentError => error
if error.message.include?("invalid byte sequence in UTF-8")
raise Twingly::URL::Error::ParseError
Expand All @@ -103,11 +108,27 @@ def try_addressable_normalize(addressable_uri)
raise
end

# Checks every label of +hostname+ except the last one.
#
# The hostname is split on DOT and each label before the final one is vetted
# with valid_label?. The final label (the TLD) is deliberately skipped because
# the public suffix list validates it. A hostname with no labels left to check
# (an empty string or a single label) therefore passes this check.
#
# @param hostname [String, nil] host taken from the normalized URI
# @return [Boolean] false when +hostname+ is nil or any checked label is
#   invalid, true otherwise
def valid_hostname?(hostname)
  # A missing host can never be valid; report that instead of letting
  # nil.split raise NoMethodError past the callers' error handling.
  return false if hostname.nil?

  # No need to check the TLD, the public suffix list does that
  labels = hostname.split(DOT)[0...-1]

  labels.all? { |label| valid_label?(label) }
end

# Tells whether a single hostname label is acceptable.
#
# A label is accepted when it neither starts nor ends with HYPHEN and matches
# LETTERS_DIGITS_HYPHEN in full.
#
# @param label [String] one dot-separated part of a hostname
# @return [Boolean]
def valid_label?(label)
  hyphen_at_edge = label.start_with?(HYPHEN) || label.end_with?(HYPHEN)

  !hyphen_at_edge && label.match?(LETTERS_DIGITS_HYPHEN)
end

# Restrict the helpers above to internal use.
# NOTE(review): these calls appear to sit inside the `class << self` block,
# which would make them apply to class-level methods (including `new`) —
# confirm, since the enclosing block is only partly visible here.
private :new
private :internal_parse
private :clean_input
private :strip_whitespace
private :try_addressable_normalize
private :valid_hostname?
private :valid_label?
end

def initialize(addressable_uri, public_suffix_domain)
Expand Down
7 changes: 1 addition & 6 deletions spec/lib/twingly/url_spec.rb
Expand Up @@ -39,6 +39,7 @@ def invalid_urls
"http://.gl/xxx",
"http://.twingly.com/",
"http://www.twingly.",
"http://www..twingly..com/",

# Test that we can handle upstream bug in Addressable, references:
# https://github.com/twingly/twingly-url/issues/62
Expand Down Expand Up @@ -564,12 +565,6 @@ def leading_and_trailing_whitespace
it { is_expected.to eq(expected) }
end

# Pins that a URL whose host contains empty labels (consecutive dots) comes
# back identical to the input; the subject is defined outside this excerpt.
context "oddly enough, does not alter URLs with consecutive dots" do
let(:url) { "http://www..twingly..com/" }

it { is_expected.to eq(url) }
end

context "does not add www. to blogspot URLs" do
let(:url) { "http://jlchen1026.blogspot.com/" }

Expand Down

0 comments on commit f95151c

Please sign in to comment.