Skip to content

Commit

Permalink
Pluggable ID generation
Browse files Browse the repository at this point in the history
Adds pluggable ID generation for Feed and Entry instances that are
missing IDs.

Also includes a test case that reproduces v0.2 behaviour.
  • Loading branch information
markpritchard committed Apr 26, 2024
1 parent 4c82095 commit a15cfee
Show file tree
Hide file tree
Showing 5 changed files with 145 additions and 49 deletions.
28 changes: 28 additions & 0 deletions feed-rs/fixture/rss2/rss_2.0_kdist.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0">
<channel>
<title>Latest Linux Kernel Versions</title>
<link>http://www.kernel.org</link>
<description>Latest Linux Kernel Versions</description>
<lastBuildDate>Fri, 08 May 2020 11:11:02 -0000</lastBuildDate>
<item>
<title>5.7-rc4: mainline</title>
<link>http://www.kernel.org/</link>
<description>
&lt;table&gt;
&lt;tr&gt;&lt;th align="right"&gt;Version:&lt;/th&gt;&lt;td&gt;&lt;strong&gt;5.7-rc4&lt;/strong&gt;
(mainline)&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;th align="right"&gt;Released:&lt;/th&gt;&lt;td&gt;2020-05-03&lt;/td&gt;&lt;/tr&gt;

&lt;tr&gt;&lt;th align="right"&gt;Source:&lt;/th&gt;&lt;td&gt;&lt;a
href="https://git.kernel.org/torvalds/t/linux-5.7-rc4.tar.gz"&gt;linux-5.7-rc4.tar.gz&lt;/a&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;th align="right"&gt;Patch:&lt;/th&gt;&lt;td&gt;&lt;a
href="https://git.kernel.org/torvalds/p/v5.7-rc4/v5.6"&gt;full&lt;/a&gt; (&lt;a
href="https://git.kernel.org/torvalds/p/v5.7-rc4/v5.7-rc3"&gt;incremental&lt;/a&gt;)&lt;/td&gt;&lt;/tr&gt;
&lt;/table&gt;
</description>
<pubDate>Sun, 03 May 2020 21:56:15 -0000</pubDate>
<guid isPermaLink="false">kernel.org,mainline,5.7-rc4,2020-05-03</guid>
</item>
</channel>
</rss>
16 changes: 0 additions & 16 deletions feed-rs/src/parser/fuzz.rs

This file was deleted.

85 changes: 55 additions & 30 deletions feed-rs/src/parser/mod.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
use chrono::{DateTime, Utc};
use std::error::Error;
use std::fmt;
use std::fmt::Debug;
use std::hash::Hasher;
use std::io::{BufRead, BufReader, Read};

use chrono::{DateTime, Utc};
use siphasher::sip128::{Hasher128, SipHasher};

use crate::model;
use crate::parser::util::TimestampParser;
use crate::parser::util::{IdGenerator, TimestampParser};
use crate::xml;
use crate::xml::NS;

Expand All @@ -21,7 +22,7 @@ pub(crate) mod itunes;
pub(crate) mod mediarss;
pub(crate) mod util;

pub type ParseFeedResult<T> = std::result::Result<T, ParseFeedError>;
pub type ParseFeedResult<T> = Result<T, ParseFeedError>;

/// An error returned when parsing a feed from a source fails
#[derive(Debug)]
Expand All @@ -30,11 +31,11 @@ pub enum ParseFeedError {
ParseError(ParseErrorKind),
// IO error
IoError(std::io::Error),
// Underlying issue with JSON (poorly formatted etc)
// Underlying issue with JSON (poorly formatted etc.)
JsonSerde(serde_json::error::Error),
// Unsupported version of the JSON feed
JsonUnsupportedVersion(String),
// Underlying issue with XML (poorly formatted etc)
// Underlying issue with XML (poorly formatted etc.)
XmlReader(xml::XmlError),
}

Expand Down Expand Up @@ -82,9 +83,9 @@ impl Error for ParseFeedError {
/// Underlying cause of the parse failure
#[derive(Debug)]
pub enum ParseErrorKind {
/// Could not find the expected root element (e.g. "channel" for RSS 2, a JSON node etc)
/// Could not find the expected root element (e.g. "channel" for RSS 2, a JSON node etc.)
NoFeedRoot,
/// The content type is unsupported and we cannot parse the value into a known representation
/// The content type is unsupported, and we cannot parse the value into a known representation
UnknownMimeType(String),
/// Required content within the source was not found e.g. the XML child text element for a "content" element
MissingContent(&'static str),
Expand All @@ -103,6 +104,7 @@ impl fmt::Display for ParseErrorKind {
/// Parser for various feed formats
///
/// Instances are created via [`Builder::build`].
pub struct Parser {
// Optional base URI; forwarded to the ID generator when a feed or entry has no ID
base_uri: Option<String>,
// Pluggable generator used to fill in missing feed/entry IDs after parsing
id_generator: Box<IdGenerator>,
// Pluggable timestamp parser (text in, `Option<DateTime<Utc>>` out)
timestamp_parser: Box<TimestampParser>,
}

Expand Down Expand Up @@ -154,7 +156,7 @@ impl Parser {

// Post processing as required
if let Ok(mut feed) = result {
assign_missing_ids(&mut feed, self.base_uri.as_deref());
assign_missing_ids(&self.id_generator, &mut feed, self.base_uri.as_deref());

Ok(feed)
} else {
Expand Down Expand Up @@ -222,6 +224,7 @@ pub fn parse_with_uri<R: Read>(source: R, uri: Option<&str>) -> ParseFeedResult<
/// Builder to create instances of [`Parser`]
pub struct Builder {
// Optional base URI handed to the parser being built
base_uri: Option<String>,
// Generator for feed/entry IDs that are missing from the source
id_generator: Box<IdGenerator>,
// Parser used to turn timestamp text into `DateTime<Utc>` values
timestamp_parser: Box<TimestampParser>,
}

Expand All @@ -241,10 +244,20 @@ impl Builder {
pub fn build(self) -> Parser {
Parser {
base_uri: self.base_uri,
timestamp_parser: Box::new(self.timestamp_parser),
id_generator: self.id_generator,
timestamp_parser: self.timestamp_parser,
}
}

/// Registers an ID generator
pub fn id_generator<F>(mut self, generator: F) -> Self
where
F: Fn(&[model::Link], &Option<model::Text>, Option<&str>) -> String + 'static,
{
self.id_generator = Box::new(generator);
self
}

/// Registers a custom timestamp parser
pub fn timestamp_parser<F>(mut self, ts_parser: F) -> Self
where
Expand All @@ -260,50 +273,62 @@ impl Default for Builder {
fn default() -> Self {
Builder {
base_uri: None,
id_generator: Box::new(generate_id),
timestamp_parser: Box::new(util::parse_timestamp_lenient),
}
}
}

// Assigns IDs to missing feed + entries as required
fn assign_missing_ids(feed: &mut model::Feed, uri: Option<&str>) {
fn assign_missing_ids(id_generator: &IdGenerator, feed: &mut model::Feed, uri: Option<&str>) {
if feed.id.is_empty() {
feed.id = create_id(&feed.links, &feed.title, uri);
feed.id = id_generator(&feed.links, &feed.title, uri);
}

for entry in feed.entries.iter_mut() {
if entry.id.is_empty() {
entry.id = create_id(&entry.links, &entry.title, uri);
entry.id = id_generator(&entry.links, &entry.title, uri);
}
}
}

const LINK_HASH_KEY1: u64 = 0x5d78_4074_2887_2d60;
const LINK_HASH_KEY2: u64 = 0x90ee_ca4c_90a5_e228;

// Creates a unique ID from the first link, or a UUID if no links are available
fn create_id(links: &[model::Link], title: &Option<model::Text>, uri: Option<&str>) -> String {
if let Some(link) = links.iter().next() {
// Generate a stable ID for this item based on the first link
let mut hasher = SipHasher::new_with_keys(LINK_HASH_KEY1, LINK_HASH_KEY2);
hasher.write(link.href.as_bytes());
if let Some(title) = title {
hasher.write(title.content.as_bytes());
}
let hash = hasher.finish128();
format!("{:x}{:x}", hash.h1, hash.h2)
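// Creates a unique ID by trying the following in order:
// 1) a hash of the first link and, when present, the title
// 2) a hash of the caller-supplied URI and the title (only when both are present)
// 3) a freshly generated UUID as the last resort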
pub fn generate_id(links: &[model::Link], title: &Option<model::Text>, uri: Option<&str>) -> String {
if let Some(link) = links.first() {
generate_id_from_link_and_title(link, title)
} else if let (Some(uri), Some(title)) = (uri, title) {
// if no links were provided by the feed use the optional URI passed by the caller
let mut hasher = SipHasher::new_with_keys(LINK_HASH_KEY1, LINK_HASH_KEY2);
hasher.write(uri.as_bytes());
hasher.write(title.content.as_bytes());
let hash = hasher.finish128();
format!("{:x}{:x}", hash.h1, hash.h2)
generate_id_from_uri_and_title(uri, title)
} else {
// Generate a UUID as last resort
util::uuid_gen()
}
}

/// Generates an ID by hashing the link's `href` followed by the title content (when a title is present).
///
/// The hash is keyed with fixed constants, and the two halves of the 128-bit result are
/// rendered as lowercase hex and concatenated.
pub fn generate_id_from_link_and_title(link: &model::Link, title: &Option<model::Text>) -> String {
    let mut sip = SipHasher::new_with_keys(LINK_HASH_KEY1, LINK_HASH_KEY2);

    // The link always contributes; the title only when the item has one
    sip.write(link.href.as_bytes());
    if let Some(text) = title.as_ref() {
        sip.write(text.content.as_bytes());
    }

    // NOTE(review): `{:x}` does not zero-pad, so the two halves are not fixed-width.
    // Left as-is on purpose: changing the format would change previously generated IDs.
    let digest = sip.finish128();
    format!("{:x}{:x}", digest.h1, digest.h2)
}

/// Generates an ID by hashing the URI followed by the title content.
///
/// Uses the same fixed hash keys and hex rendering as the link-based generator.
pub fn generate_id_from_uri_and_title(uri: &str, title: &model::Text) -> String {
    let mut sip = SipHasher::new_with_keys(LINK_HASH_KEY1, LINK_HASH_KEY2);

    // Feed the URI first, then the title, in that order
    for part in [uri.as_bytes(), title.content.as_bytes()] {
        sip.write(part);
    }

    // NOTE(review): `{:x}` does not zero-pad, so the two halves are not fixed-width.
    // Left as-is on purpose: changing the format would change previously generated IDs.
    let digest = sip.finish128();
    format!("{:x}{:x}", digest.h1, digest.h2)
}

#[cfg(test)]
mod fuzz;
mod tests;
55 changes: 55 additions & 0 deletions feed-rs/src/parser/tests.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
use crate::model::Link;
use crate::parser;
use crate::parser::{generate_id_from_link_and_title, util};
use crate::util::test;

// Regression test: the default ID generator must keep producing the same feed ID for this fixture
#[test]
fn id_generator_default() {
    let raw = test::fixture_as_raw("rss2/rss_2.0_kdist.xml");
    let parsed = parser::parse(raw.as_slice()).unwrap();

    // The fixture's channel carries no ID of its own, so this value comes from the generator
    assert_eq!("354331764be7571efc15c7a1bad13d54", parsed.id);
}

// Custom ID generator that reproduces the feed ID produced by v0.2
#[test]
fn id_generator_v0_2() {
    let raw = test::fixture_as_raw("rss2/rss_2.0_kdist.xml");

    // Register a generator that hashes a trimmed copy of the link together with the title
    let custom_parser = parser::Builder::new()
        .id_generator(|links, title, _uri| {
            // Prefer the first link that has no `rel` attribute
            match links.iter().find(|candidate| candidate.rel.is_none()) {
                Some(found) => {
                    // Work on a copy and drop a single trailing slash, if there is one
                    let mut trimmed = Link::new(found.href.clone(), None);
                    if trimmed.href.ends_with('/') {
                        trimmed.href.pop();
                    }

                    generate_id_from_link_and_title(&trimmed, title)
                }
                // No suitable link: fall back to a UUID
                None => util::uuid_gen(),
            }
        })
        .build();

    let feed = custom_parser.parse(raw.as_slice()).unwrap();
    assert_eq!("7edcf1fbe86570753646f6eb75db4d55", feed.id);
}

// Verifies failure uncovered by fuzzing is now fixed
#[test]
fn fuzz_parse() {
// Raw input captured from the fuzzer; not a well-formed feed
let data: Vec<u8> = vec![
0xdb, 0x3b, 0x3c, 0x66, 0x65, 0x65, 0x64, 0x3e, 0x00, 0xfe, 0xff, 0x00, 0x00, 0x00, 0x3c, 0x1b, 0x3b, 0x64, 0x22, 0x22, 0x0d, 0x78, 0x6d, 0x6c, 0x3a,
0x62, 0x61, 0x73, 0x65, 0x0d, 0x0d, 0x3d, 0x0a, 0x22, 0x0a, 0x0d, 0x0a, 0x0a, 0x0d, 0x66, 0x69, 0x6c, 0x65, 0x3a, 0xff, 0x3b, 0xbf, 0x5b, 0xbf, 0xbf,
0xbc, 0xff, 0xff, 0x0a, 0x53, 0x53, 0x2b, 0x78, 0x3b, 0x22, 0x3c, 0x64, 0x3e, 0x2b, 0x00, 0x00, 0x2b, 0x3c, 0xdb, 0x3b, 0x32, 0x65, 0x64, 0x22, 0x22,
0x0d, 0x78, 0x6d, 0x6c, 0x3a, 0x62, 0x61, 0x73, 0x65, 0x0d, 0x0d, 0x3d, 0x22, 0x75, 0x7c, 0x3f, 0x0a, 0x34, 0x0a, 0xff, 0x22, 0x34, 0x3a, 0xb5, 0x2f,
0x3c, 0x66, 0x65, 0x64, 0x3e, 0x2b, 0x3c, 0xdb, 0x3b, 0x32, 0x65, 0x0d, 0x78, 0x6d, 0x6c, 0x3a, 0x62, 0x61, 0x73, 0x65, 0x0d, 0x0d, 0x3d, 0x22, 0x2e,
0x2e, 0x3f, 0x0a, 0x3c, 0x3f, 0xff, 0x22, 0x34, 0x3a, 0xb5, 0x2f, 0x2f, 0xff, 0xff, 0xfe, 0x01, 0xdb, 0x3b, 0x3c, 0x66, 0x65,
];

// The input must be reported as an `Err` by the parser
let result = parser::parse(data.as_slice());
assert!(result.is_err());
}
10 changes: 7 additions & 3 deletions feed-rs/src/parser/util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@ use std::ops::Add;
use std::sync::OnceLock;
use std::time::Duration;

use crate::model;
use chrono::{DateTime, Utc};
use model::{Link, Text};
use regex::{Captures, Regex};
use url::Url;
use uuid::Uuid;

use crate::model::{Link, Text};
use crate::parser::{ParseFeedResult, Parser};
use crate::xml::Element;

Expand Down Expand Up @@ -75,8 +76,11 @@ mod fixes {
// but without the day of week (since it is superfluous and often in languages other than English)
static RFC1123_FORMAT_STR: &str = "%d %b %Y %H:%M:%S %z";

/// Generified timestamp parser
pub(crate) type TimestampParser = dyn Fn(&str) -> Option<DateTime<Utc>>;
/// Pluggable timestamp parser
pub(crate) type TimestampParser = dyn Fn(&str) -> Option<DateTime<Utc>> + 'static;

/// Pluggable ID (feed or entry) generator
///
/// Takes the links, the optional title and the optional URI, and returns the ID as a `String`.
pub(crate) type IdGenerator = dyn Fn(&[Link], &Option<Text>, Option<&str>) -> String;

/// Handles <content:encoded>
pub(crate) fn handle_encoded<R: BufRead>(element: Element<R>) -> ParseFeedResult<Option<Text>> {
Expand Down

0 comments on commit a15cfee

Please sign in to comment.