Skip to content

Commit

Permalink
FIX: Try respecting charset in HTTP header of RSS feed
Browse files Browse the repository at this point in the history
  • Loading branch information
gschlager committed Aug 1, 2018
1 parent ff942ed commit 5d421fb
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 3 deletions.
16 changes: 13 additions & 3 deletions app/jobs/scheduled/poll_feed.rb
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,9 @@ def topics
private

def parsed_feed
raw_feed = fetch_rss
encoded_feed = Encodings.to_utf8(raw_feed)
raw_feed, encoding = fetch_rss
encoded_feed = Encodings.try_utf8(raw_feed, encoding) if encoding
encoded_feed = Encodings.to_utf8(raw_feed, encoding_hint: encoding) unless encoded_feed

return nil if encoded_feed.blank?

Expand All @@ -107,10 +108,19 @@ def fetch_rss
feed_final_url = final_destination.resolve
return nil unless final_destination.status == :resolved

Excon.new(feed_final_url.to_s).request(method: :get, expects: 200).body
response = Excon.new(feed_final_url.to_s).request(method: :get, expects: 200)
[response.body, detect_charset(response)]
rescue Excon::Error::HTTPStatus
nil
end

# Extracts the character set announced in the response's Content-Type header.
#
# Accepts bare and quoted parameter values (`charset=utf-8`, `charset="utf-8"`)
# as well as names containing underscores (e.g. `Shift_JIS`).
#
# @param response [#headers] HTTP response whose `headers` object responds to `[]`
# @return [Encoding, nil] the matching Ruby encoding, or nil when the header is
#   missing, carries no charset parameter, or names an encoding Ruby does not know
def detect_charset(response)
  content_type = response.headers['Content-Type'].to_s

  # An optional opening quote is skipped so that quoted values are recognised;
  # the capture stops at the first character that cannot be part of a name.
  charset = content_type[/charset\s*=\s*["']?([a-z0-9_\-]+)/i, 1]

  Encoding.find(charset) if charset
rescue ArgumentError
  # Encoding.find raises ArgumentError for unknown encoding names.
  nil
end
end

class FeedTopic
Expand Down
33 changes: 33 additions & 0 deletions spec/fixtures/feed/iso-8859-15-feed.rss
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
<?xml version="1.0"?>
<rss version="2.0"
xmlns:content="http://purl.org/rss/1.0/modules/content/"
xmlns:wfw="http://wellformedweb.org/CommentAPI/"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:atom="http://www.w3.org/2005/Atom"
xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
xmlns:discourse="http://discourse.org/rss/modules/discourse/"
>
<channel>
<title>Discourse</title>
<atom:link href="https://blog.discourse.org/feed/" rel="self" type="application/rss+xml" />
<link>https://blog.discourse.org</link>
<description>Official blog for the open source Discourse project</description>
<lastBuildDate>Thu, 14 Sep 2017 15:22:33 +0000</lastBuildDate>
<language>en-US</language>
<sy:updatePeriod>hourly</sy:updatePeriod>
<sy:updateFrequency>1</sy:updateFrequency>
<generator>https://wordpress.org/?v=4.8.1</generator>
<item>
<title>Poll Feed Spec Fixture</title>
<link>https://blog.discourse.org/2017/09/poll-feed-spec-fixture/</link>
<pubDate>Thu, 14 Sep 2017 15:22:33 +0000</pubDate>
<dc:creator><![CDATA[xrav3nz]]></dc:creator>
<discourse:username><![CDATA[xrav3nz]]></discourse:username>
<category><![CDATA[spec]]></category>
<guid isPermaLink="false">https://blog.discourse.org/?p=pollfeedspec</guid>
<description><![CDATA[Here are some random descriptions... [&#8230;]]]></description>
<content:encoded><![CDATA[<p>This is the body &amp; content. 100¤ </p>]]></content:encoded>
</item>
</channel>
</rss>
20 changes: 20 additions & 0 deletions spec/jobs/poll_feed_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,26 @@
expect { poller.poll_feed }.to change { Topic.count }.by(1)
expect(Topic.last.first_post.raw).to include('<p>This is the body &amp; content. </p>')
end

# Serves the ISO-8859-15 fixture with a matching charset in the Content-Type
# header and checks that the imported post contains the euro sign.
it 'respects the charset in the Content-Type header' do
  feed_body = file_from_fixtures('iso-8859-15-feed.rss', 'feed').read
  response_headers = { "Content-Type" => "application/rss+xml; charset=ISO-8859-15" }

  stub_request(:get, SiteSetting.feed_polling_url)
    .to_return(body: feed_body, headers: response_headers)

  expect { poller.poll_feed }.to change { Topic.count }.by(1)

  imported_raw = Topic.last.first_post.raw
  expect(imported_raw).to include('<p>This is the body &amp; content. 100€ </p>')
end

# Serves the regular fixture with a charset name that is not a real encoding
# and checks that a topic is still created from the feed.
it 'works when the charset in the Content-Type header is unknown' do
  feed_body = file_from_fixtures('feed.rss', 'feed').read
  response_headers = { "Content-Type" => "application/rss+xml; charset=foo" }

  stub_request(:get, SiteSetting.feed_polling_url)
    .to_return(body: feed_body, headers: response_headers)

  expect { poller.poll_feed }.to change { Topic.count }.by(1)

  imported_raw = Topic.last.first_post.raw
  expect(imported_raw).to include('<p>This is the body &amp; content. </p>')
end
end
end
end

0 comments on commit 5d421fb

Please sign in to comment.