Skip to content

Commit

Permalink
Merge pull request #219 from feed-rs/pluggable-id-generation
Browse files Browse the repository at this point in the history
Pluggable ID generation
  • Loading branch information
markpritchard committed Apr 26, 2024
2 parents 4c82095 + a15cfee commit 9746f58
Show file tree
Hide file tree
Showing 5 changed files with 145 additions and 49 deletions.
28 changes: 28 additions & 0 deletions feed-rs/fixture/rss2/rss_2.0_kdist.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0">
<channel>
<title>Latest Linux Kernel Versions</title>
<link>http://www.kernel.org</link>
<description>Latest Linux Kernel Versions</description>
<lastBuildDate>Fri, 08 May 2020 11:11:02 -0000</lastBuildDate>
<item>
<title>5.7-rc4: mainline</title>
<link>http://www.kernel.org/</link>
<description>
&lt;table&gt;
&lt;tr&gt;&lt;th align="right"&gt;Version:&lt;/th&gt;&lt;td&gt;&lt;strong&gt;5.7-rc4&lt;/strong&gt;
(mainline)&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;th align="right"&gt;Released:&lt;/th&gt;&lt;td&gt;2020-05-03&lt;/td&gt;&lt;/tr&gt;

&lt;tr&gt;&lt;th align="right"&gt;Source:&lt;/th&gt;&lt;td&gt;&lt;a
href="https://git.kernel.org/torvalds/t/linux-5.7-rc4.tar.gz"&gt;linux-5.7-rc4.tar.gz&lt;/a&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;th align="right"&gt;Patch:&lt;/th&gt;&lt;td&gt;&lt;a
href="https://git.kernel.org/torvalds/p/v5.7-rc4/v5.6"&gt;full&lt;/a&gt; (&lt;a
href="https://git.kernel.org/torvalds/p/v5.7-rc4/v5.7-rc3"&gt;incremental&lt;/a&gt;)&lt;/td&gt;&lt;/tr&gt;
&lt;/table&gt;
</description>
<pubDate>Sun, 03 May 2020 21:56:15 -0000</pubDate>
<guid isPermaLink="false">kernel.org,mainline,5.7-rc4,2020-05-03</guid>
</item>
</channel>
</rss>
16 changes: 0 additions & 16 deletions feed-rs/src/parser/fuzz.rs

This file was deleted.

85 changes: 55 additions & 30 deletions feed-rs/src/parser/mod.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
use chrono::{DateTime, Utc};
use std::error::Error;
use std::fmt;
use std::fmt::Debug;
use std::hash::Hasher;
use std::io::{BufRead, BufReader, Read};

use chrono::{DateTime, Utc};
use siphasher::sip128::{Hasher128, SipHasher};

use crate::model;
use crate::parser::util::TimestampParser;
use crate::parser::util::{IdGenerator, TimestampParser};
use crate::xml;
use crate::xml::NS;

Expand All @@ -21,7 +22,7 @@ pub(crate) mod itunes;
pub(crate) mod mediarss;
pub(crate) mod util;

pub type ParseFeedResult<T> = std::result::Result<T, ParseFeedError>;
pub type ParseFeedResult<T> = Result<T, ParseFeedError>;

/// An error returned when parsing a feed from a source fails
#[derive(Debug)]
Expand All @@ -30,11 +31,11 @@ pub enum ParseFeedError {
ParseError(ParseErrorKind),
// IO error
IoError(std::io::Error),
// Underlying issue with JSON (poorly formatted etc)
// Underlying issue with JSON (poorly formatted etc.)
JsonSerde(serde_json::error::Error),
// Unsupported version of the JSON feed
JsonUnsupportedVersion(String),
// Underlying issue with XML (poorly formatted etc)
// Underlying issue with XML (poorly formatted etc.)
XmlReader(xml::XmlError),
}

Expand Down Expand Up @@ -82,9 +83,9 @@ impl Error for ParseFeedError {
/// Underlying cause of the parse failure
#[derive(Debug)]
pub enum ParseErrorKind {
/// Could not find the expected root element (e.g. "channel" for RSS 2, a JSON node etc)
/// Could not find the expected root element (e.g. "channel" for RSS 2, a JSON node etc.)
NoFeedRoot,
/// The content type is unsupported and we cannot parse the value into a known representation
/// The content type is unsupported, and we cannot parse the value into a known representation
UnknownMimeType(String),
/// Required content within the source was not found e.g. the XML child text element for a "content" element
MissingContent(&'static str),
Expand All @@ -103,6 +104,7 @@ impl fmt::Display for ParseErrorKind {
/// Parser for various feed formats
///
/// Instances are produced by `Builder::build`, which moves the builder's settings into the fields below.
pub struct Parser {
// Optional base URI; handed to the ID generator when IDs are assigned to a feed or entry whose ID is empty
base_uri: Option<String>,
// Callback that produces an ID for any feed or entry whose ID is still empty after parsing
id_generator: Box<IdGenerator>,
// Callback that turns a timestamp string into an `Option<DateTime<Utc>>`
timestamp_parser: Box<TimestampParser>,
}

Expand Down Expand Up @@ -154,7 +156,7 @@ impl Parser {

// Post processing as required
if let Ok(mut feed) = result {
assign_missing_ids(&mut feed, self.base_uri.as_deref());
assign_missing_ids(&self.id_generator, &mut feed, self.base_uri.as_deref());

Ok(feed)
} else {
Expand Down Expand Up @@ -222,6 +224,7 @@ pub fn parse_with_uri<R: Read>(source: R, uri: Option<&str>) -> ParseFeedResult<
/// Builder to create instances of `Parser`
///
/// Each field mirrors the field of the same name on `Parser`; `build` moves them across unchanged.
pub struct Builder {
// Optional base URI forwarded to the parser
base_uri: Option<String>,
// ID generator callback; the `Default` implementation installs `generate_id`
id_generator: Box<IdGenerator>,
// Timestamp parser callback; the `Default` implementation installs `util::parse_timestamp_lenient`
timestamp_parser: Box<TimestampParser>,
}

Expand All @@ -241,10 +244,20 @@ impl Builder {
pub fn build(self) -> Parser {
Parser {
base_uri: self.base_uri,
timestamp_parser: Box::new(self.timestamp_parser),
id_generator: self.id_generator,
timestamp_parser: self.timestamp_parser,
}
}

/// Registers an ID generator
pub fn id_generator<F>(mut self, generator: F) -> Self
where
F: Fn(&[model::Link], &Option<model::Text>, Option<&str>) -> String + 'static,
{
self.id_generator = Box::new(generator);
self
}

/// Registers a custom timestamp parser
pub fn timestamp_parser<F>(mut self, ts_parser: F) -> Self
where
Expand All @@ -260,50 +273,62 @@ impl Default for Builder {
fn default() -> Self {
Builder {
base_uri: None,
id_generator: Box::new(generate_id),
timestamp_parser: Box::new(util::parse_timestamp_lenient),
}
}
}

// Assigns IDs to missing feed + entries as required
fn assign_missing_ids(feed: &mut model::Feed, uri: Option<&str>) {
fn assign_missing_ids(id_generator: &IdGenerator, feed: &mut model::Feed, uri: Option<&str>) {
if feed.id.is_empty() {
feed.id = create_id(&feed.links, &feed.title, uri);
feed.id = id_generator(&feed.links, &feed.title, uri);
}

for entry in feed.entries.iter_mut() {
if entry.id.is_empty() {
entry.id = create_id(&entry.links, &entry.title, uri);
entry.id = id_generator(&entry.links, &entry.title, uri);
}
}
}

// Fixed SipHash keys used when hashing a link/URI and title into a generated ID.
// NOTE(review): the hash is keyed by these values, so changing either one would change every generated ID — confirm before editing.
const LINK_HASH_KEY1: u64 = 0x5d78_4074_2887_2d60;
const LINK_HASH_KEY2: u64 = 0x90ee_ca4c_90a5_e228;

// Creates a unique ID from the first link, or a UUID if no links are available
fn create_id(links: &[model::Link], title: &Option<model::Text>, uri: Option<&str>) -> String {
if let Some(link) = links.iter().next() {
// Generate a stable ID for this item based on the first link
let mut hasher = SipHasher::new_with_keys(LINK_HASH_KEY1, LINK_HASH_KEY2);
hasher.write(link.href.as_bytes());
if let Some(title) = title {
hasher.write(title.content.as_bytes());
}
let hash = hasher.finish128();
format!("{:x}{:x}", hash.h1, hash.h2)
// Creates a unique ID by trying the following in order:
// 1) the first link + optional title
// 2) the uri + title provided
// 3) a UUID
pub fn generate_id(links: &[model::Link], title: &Option<model::Text>, uri: Option<&str>) -> String {
if let Some(link) = links.first() {
generate_id_from_link_and_title(link, title)
} else if let (Some(uri), Some(title)) = (uri, title) {
// if no links were provided by the feed use the optional URI passed by the caller
let mut hasher = SipHasher::new_with_keys(LINK_HASH_KEY1, LINK_HASH_KEY2);
hasher.write(uri.as_bytes());
hasher.write(title.content.as_bytes());
let hash = hasher.finish128();
format!("{:x}{:x}", hash.h1, hash.h2)
generate_id_from_uri_and_title(uri, title)
} else {
// Generate a UUID as last resort
util::uuid_gen()
}
}

/// Generates an ID by hashing the link's `href` followed by the title content (when a title is present).
///
/// The bytes are fed to a SipHash instance keyed with `LINK_HASH_KEY1`/`LINK_HASH_KEY2`. The 128-bit
/// result is rendered as its two 64-bit parts in lowercase hexadecimal, concatenated without padding
/// or a separator, so the same link and title always yield the same ID.
pub fn generate_id_from_link_and_title(link: &model::Link, title: &Option<model::Text>) -> String {
    let mut sip = SipHasher::new_with_keys(LINK_HASH_KEY1, LINK_HASH_KEY2);

    // The href always contributes; the title only when the feed or entry supplied one
    sip.write(link.href.as_bytes());
    if let Some(text) = title.as_ref() {
        sip.write(text.content.as_bytes());
    }

    let digest = sip.finish128();
    let (first, second) = (digest.h1, digest.h2);
    format!("{first:x}{second:x}")
}

/// Generates an ID by hashing the URI followed by the title content.
///
/// Uses the same keyed SipHash and the same unpadded lowercase hexadecimal rendering as the
/// link-based generator, so the same URI and title always yield the same ID.
pub fn generate_id_from_uri_and_title(uri: &str, title: &model::Text) -> String {
    let mut sip = SipHasher::new_with_keys(LINK_HASH_KEY1, LINK_HASH_KEY2);

    // Feed the inputs in a fixed order: URI first, then the title text
    for chunk in [uri.as_bytes(), title.content.as_bytes()] {
        sip.write(chunk);
    }

    let digest = sip.finish128();
    format!("{:x}{:x}", digest.h1, digest.h2)
}

#[cfg(test)]
mod fuzz;
mod tests;
55 changes: 55 additions & 0 deletions feed-rs/src/parser/tests.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
use crate::model::Link;
use crate::parser;
use crate::parser::{generate_id_from_link_and_title, util};
use crate::util::test;

// Regression test for the default ID generator: the kernel.org fixture must keep producing
// the same feed ID when parsed with the stock parser
#[test]
fn id_generator_default() {
    let raw = test::fixture_as_raw("rss2/rss_2.0_kdist.xml");
    let parsed = parser::parse(raw.as_slice()).unwrap();

    assert_eq!("354331764be7571efc15c7a1bad13d54", parsed.id);
}

// Custom ID generator providing backward compatibility with v0.2, plugged in through the builder
#[test]
fn id_generator_v0_2() {
    let raw = test::fixture_as_raw("rss2/rss_2.0_kdist.xml");

    // Custom ID that trims URLs etc.
    let parsed = parser::Builder::new()
        .id_generator(|links, title, _uri| {
            // Prefer the first link that carries no `rel` value
            match links.iter().find(|candidate| candidate.rel.is_none()) {
                Some(found) => {
                    // Rebuild the link first, then drop a single trailing slash from its href
                    let mut trimmed = Link::new(found.href.clone(), None);
                    if trimmed.href.ends_with('/') {
                        trimmed.href.pop();
                    }

                    generate_id_from_link_and_title(&trimmed, title)
                }
                // No suitable link: fall back to a UUID
                None => util::uuid_gen(),
            }
        })
        .build()
        .parse(raw.as_slice())
        .unwrap();

    assert_eq!("7edcf1fbe86570753646f6eb75db4d55", parsed.id);
}

// Verifies that the input uncovered by fuzzing is now rejected with an error
#[test]
fn fuzz_parse() {
    // Raw bytes captured from the fuzzer; kept verbatim
    let fuzz_input: Vec<u8> = vec![
        0xdb, 0x3b, 0x3c, 0x66, 0x65, 0x65, 0x64, 0x3e, 0x00, 0xfe, 0xff, 0x00, 0x00, 0x00, 0x3c, 0x1b, 0x3b, 0x64, 0x22, 0x22, 0x0d, 0x78, 0x6d, 0x6c, 0x3a,
        0x62, 0x61, 0x73, 0x65, 0x0d, 0x0d, 0x3d, 0x0a, 0x22, 0x0a, 0x0d, 0x0a, 0x0a, 0x0d, 0x66, 0x69, 0x6c, 0x65, 0x3a, 0xff, 0x3b, 0xbf, 0x5b, 0xbf, 0xbf,
        0xbc, 0xff, 0xff, 0x0a, 0x53, 0x53, 0x2b, 0x78, 0x3b, 0x22, 0x3c, 0x64, 0x3e, 0x2b, 0x00, 0x00, 0x2b, 0x3c, 0xdb, 0x3b, 0x32, 0x65, 0x64, 0x22, 0x22,
        0x0d, 0x78, 0x6d, 0x6c, 0x3a, 0x62, 0x61, 0x73, 0x65, 0x0d, 0x0d, 0x3d, 0x22, 0x75, 0x7c, 0x3f, 0x0a, 0x34, 0x0a, 0xff, 0x22, 0x34, 0x3a, 0xb5, 0x2f,
        0x3c, 0x66, 0x65, 0x64, 0x3e, 0x2b, 0x3c, 0xdb, 0x3b, 0x32, 0x65, 0x0d, 0x78, 0x6d, 0x6c, 0x3a, 0x62, 0x61, 0x73, 0x65, 0x0d, 0x0d, 0x3d, 0x22, 0x2e,
        0x2e, 0x3f, 0x0a, 0x3c, 0x3f, 0xff, 0x22, 0x34, 0x3a, 0xb5, 0x2f, 0x2f, 0xff, 0xff, 0xfe, 0x01, 0xdb, 0x3b, 0x3c, 0x66, 0x65,
    ];

    let outcome = parser::parse(fuzz_input.as_slice());
    assert!(outcome.is_err());
}
10 changes: 7 additions & 3 deletions feed-rs/src/parser/util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@ use std::ops::Add;
use std::sync::OnceLock;
use std::time::Duration;

use crate::model;
use chrono::{DateTime, Utc};
use model::{Link, Text};
use regex::{Captures, Regex};
use url::Url;
use uuid::Uuid;

use crate::model::{Link, Text};
use crate::parser::{ParseFeedResult, Parser};
use crate::xml::Element;

Expand Down Expand Up @@ -75,8 +76,11 @@ mod fixes {
// but without the day of week (since it is superfluous and often in languages other than English)
static RFC1123_FORMAT_STR: &str = "%d %b %Y %H:%M:%S %z";

/// Generified timestamp parser
pub(crate) type TimestampParser = dyn Fn(&str) -> Option<DateTime<Utc>>;
/// Pluggable timestamp parser
pub(crate) type TimestampParser = dyn Fn(&str) -> Option<DateTime<Utc>> + 'static;

/// Pluggable ID (feed or entry) generator
///
/// Called with the links and optional title of the feed or entry, plus the optional URI supplied
/// by the caller; returns the ID to assign.
pub(crate) type IdGenerator = dyn Fn(&[Link], &Option<Text>, Option<&str>) -> String;

/// Handles <content:encoded>
pub(crate) fn handle_encoded<R: BufRead>(element: Element<R>) -> ParseFeedResult<Option<Text>> {
Expand Down

0 comments on commit 9746f58

Please sign in to comment.