Add html5gum as alternative link extractor (#480)
html5gum is an HTML parser that offers lower-level control over which tokens actually get created and tracked. As a result, the extractor does not allocate for tokens it doesn't care about. On some benchmarks this provides a substantial performance boost. The old parser, html5ever, is still available by setting the `LYCHEE_USE_HTML5EVER=1` env var.
untitaker committed Feb 7, 2022
1 parent 6bf8c1f commit 68d09f7
Showing 9 changed files with 326 additions and 53 deletions.
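The library-side change turns `Collector` into a builder. Below is a minimal sketch of selecting the parser from library code; the `main` scaffolding and the `_collector` binding are illustrative, not part of this commit:

use lychee_lib::Collector;

fn main() {
    // html5gum is now the default parser; the builder flag opts back into
    // html5ever, mirroring how lychee-bin honors LYCHEE_USE_HTML5EVER=1 below.
    let use_html5ever = std::env::var("LYCHEE_USE_HTML5EVER").map_or(false, |x| x == "1");
    let _collector = Collector::new(None) // no base URL or directory
        .skip_missing_inputs(false) // error on missing inputs (the default)
        .use_html5ever(use_html5ever);
    // Calling `.collect_links(inputs).await` on this collector yields the
    // link stream, as in examples/collect_links/collect_links.rs below.
}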
16 changes: 16 additions & 0 deletions Cargo.lock


17 changes: 7 additions & 10 deletions examples/collect_links/collect_links.rs
@@ -20,16 +20,13 @@ async fn main() -> Result<()> {
},
];

let links = Collector::new(
None, // base
false, // don't skip missing inputs
)
.collect_links(
inputs, // base url or directory
)
.await
.collect::<Result<Vec<_>>>()
.await?;
let links = Collector::new(None) // base
.skip_missing_inputs(false) // skip missing input files? (default: false)
.use_html5ever(false) // use html5ever for HTML parsing? (default: false)
.collect_links(inputs) // the inputs to extract links from
.await
.collect::<Result<Vec<_>>>()
.await?;

dbg!(links);

5 changes: 4 additions & 1 deletion lychee-bin/src/main.rs
@@ -200,7 +200,10 @@ fn run_main() -> Result<i32> {
/// Run lychee on the given inputs
async fn run(opts: &LycheeOptions) -> Result<i32> {
let inputs = opts.inputs();
let requests = Collector::new(opts.config.base.clone(), opts.config.skip_missing)
let requests = Collector::new(opts.config.base.clone())
.skip_missing_inputs(opts.config.skip_missing)
// File a bug if you rely on this envvar! It's going to go away eventually.
.use_html5ever(std::env::var("LYCHEE_USE_HTML5EVER").map_or(false, |x| x == "1"))
.collect_links(inputs)
.await;

3 changes: 2 additions & 1 deletion lychee-lib/Cargo.toml
@@ -20,7 +20,6 @@ version = "0.8.2"
check-if-email-exists = "0.8.26"
fast_chemail = "0.9.6"
glob = "0.3.0"
html5ever = "0.25.1"
http = "0.2.6"
hubcaps = "0.6.2"
linkify = "0.8.0"
@@ -50,6 +49,8 @@ once_cell = "1.9.0"
thiserror = "1.0.30"
futures = "0.3.19"
lazy_static = "1.4.0"
html5ever = "0.25.1"
html5gum = "0.4.0"

[dependencies.par-stream]
version = "0.10.0"
28 changes: 24 additions & 4 deletions lychee-lib/src/collector.rs
@@ -13,18 +13,34 @@ use par_stream::ParStreamExt;
pub struct Collector {
base: Option<Base>,
skip_missing_inputs: bool,
use_html5ever: bool,
}

impl Collector {
/// Create a new collector with an empty cache
#[must_use]
pub const fn new(base: Option<Base>, skip_missing_inputs: bool) -> Self {
pub const fn new(base: Option<Base>) -> Self {
Collector {
base,
skip_missing_inputs,
skip_missing_inputs: false,
use_html5ever: false,
}
}

/// Skip missing input files (default is to error if they don't exist)
#[must_use]
pub const fn skip_missing_inputs(mut self, yes: bool) -> Self {
self.skip_missing_inputs = yes;
self
}

/// Use `html5ever` to parse HTML instead of `html5gum`.
#[must_use]
pub const fn use_html5ever(mut self, yes: bool) -> Self {
self.use_html5ever = yes;
self
}

/// Fetch all unique links from inputs
/// All relative URLs get prefixed with `base` (if given).
/// (This can be a directory or a base URL)
@@ -47,7 +63,11 @@ impl Collector {
let base = base.clone();
async move {
let content = content?;
let uris: Vec<RawUri> = Extractor::extract(&content);
let uris: Vec<RawUri> = if self.use_html5ever {
Extractor::extract_html5ever(&content)
} else {
Extractor::extract(&content)
};
let requests = request::create(uris, &content, &base)?;
Result::Ok(stream::iter(requests.into_iter().map(Ok)))
}
@@ -74,7 +94,7 @@ mod test {

// Helper function to run the collector on the given inputs
async fn collect(inputs: Vec<Input>, base: Option<Base>) -> HashSet<Uri> {
let responses = Collector::new(base, false).collect_links(inputs).await;
let responses = Collector::new(base).collect_links(inputs).await;
responses.map(|r| r.unwrap().uri).collect().await
}

23 changes: 3 additions & 20 deletions lychee-lib/src/extract/html.rs
@@ -7,7 +7,7 @@ use html5ever::{
use super::plaintext::extract_plaintext;
use crate::types::raw_uri::RawUri;

#[derive(Clone)]
#[derive(Clone, Default)]
struct LinkExtractor {
links: Vec<RawUri>,
}
@@ -61,8 +61,8 @@ impl TokenSink for LinkExtractor {
}

impl LinkExtractor {
pub(crate) const fn new() -> Self {
Self { links: Vec::new() }
pub(crate) fn new() -> Self {
LinkExtractor::default()
}

/// Extract all semantically known links from a given html attribute.
@@ -125,20 +125,3 @@ pub(crate) fn extract_html(buf: &str) -> Vec<RawUri> {

tokenizer.sink.links
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn test_extract_link_at_end_of_line() {
let input = "https://www.apache.org/licenses/LICENSE-2.0\n";
let link = input.trim_end();

let uris: Vec<String> = extract_html(input)
.into_iter()
.map(|raw_uri| raw_uri.text)
.collect();
assert_eq!(vec![link.to_string()], uris);
}
}
207 changes: 207 additions & 0 deletions lychee-lib/src/extract/html5gum.rs
@@ -0,0 +1,207 @@
use html5gum::{Emitter, Error, Tokenizer};

use super::plaintext::extract_plaintext;
use crate::types::raw_uri::RawUri;

#[derive(Clone)]
struct LinkExtractor {
// note: what html5gum calls a tag, lychee calls an element
links: Vec<RawUri>,
current_string: Vec<u8>,
current_element_name: Vec<u8>,
current_element_is_closing: bool,
current_attribute_name: Vec<u8>,
current_attribute_value: Vec<u8>,
last_start_element: Vec<u8>,
}

/// this is the same as `std::str::from_utf8_unchecked`, but with extra debug assertions for ease
/// of debugging
unsafe fn from_utf8_unchecked(s: &[u8]) -> &str {
debug_assert!(std::str::from_utf8(s).is_ok());
std::str::from_utf8_unchecked(s)
}

impl LinkExtractor {
pub(crate) const fn new() -> Self {
LinkExtractor {
links: Vec::new(),
current_string: Vec::new(),
current_element_name: Vec::new(),
current_element_is_closing: false,
current_attribute_name: Vec::new(),
current_attribute_value: Vec::new(),
last_start_element: Vec::new(),
}
}

/// Extract all semantically known links from a given html attribute.
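/// For example, `("href", "a", "/about")` yields `Some(["/about"])`; for
/// combinations not listed below, `None` is returned and the caller falls
/// back to plaintext extraction of the attribute value.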
#[allow(clippy::unnested_or_patterns)]
pub(crate) fn extract_urls_from_elem_attr<'a>(
attr_name: &str,
elem_name: &str,
attr_value: &'a str,
) -> Option<impl Iterator<Item = &'a str>> {
// For a comprehensive list of elements that might contain URLs/URIs
// see https://www.w3.org/TR/REC-html40/index/attributes.html
// and https://html.spec.whatwg.org/multipage/indices.html#attributes-1
match (elem_name, attr_name) {
// Common element/attribute combinations for links
(_, "href" | "src" | "cite" | "usemap")
// Less common (but still valid!) combinations
| ("applet", "codebase")
| ("body", "background")
| ("button", "formaction")
| ("command", "icon")
| ("form", "action")
| ("frame", "longdesc")
| ("head", "profile")
| ("html", "manifest")
| ("iframe", "longdesc")
| ("img", "longdesc")
| ("input", "formaction")
| ("object", "classid")
| ("object", "codebase")
| ("object", "data")
| ("video", "poster") => {
Some(vec![attr_value].into_iter())
}
(_, "srcset") => {
let mut urls = Vec::new();
for image_candidate_string in attr_value.trim().split(',') {
for part in image_candidate_string.split_ascii_whitespace() {
if part.is_empty() {
continue;
}
urls.push(part);
break;
}
}
Some(urls.into_iter())
}
_ => None,
}
}

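// Character data between tags may contain bare URLs; once a run of
// characters ends, hand the accumulated text to the plaintext extractor.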
fn flush_current_characters(&mut self) {
// Safety: the tokenizer is fed a &str, so the accumulated bytes are valid UTF-8.
let raw = unsafe { from_utf8_unchecked(&self.current_string) };
self.links.extend(extract_plaintext(raw));
self.current_string.clear();
}

fn flush_old_attribute(&mut self) {
{
// Safety: the tokenizer is fed a &str, so these element/attribute bytes are valid UTF-8.
let name = unsafe { from_utf8_unchecked(&self.current_element_name) };
let attr = unsafe { from_utf8_unchecked(&self.current_attribute_name) };
let value = unsafe { from_utf8_unchecked(&self.current_attribute_value) };

let urls = LinkExtractor::extract_urls_from_elem_attr(attr, name, value);

let new_urls = match urls {
None => extract_plaintext(value),
Some(urls) => urls
.into_iter()
.map(|url| RawUri {
text: url.to_string(),
element: Some(name.to_string()),
attribute: Some(attr.to_string()),
})
.collect::<Vec<_>>(),
};

self.links.extend(new_urls);
}

self.current_attribute_name.clear();
self.current_attribute_value.clear();
}
}

impl Emitter for &mut LinkExtractor {
type Token = ();
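// With `()` as the token type, html5gum never materializes full token
// objects; the callbacks below copy out only the bytes the extractor needs.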

fn set_last_start_tag(&mut self, last_start_tag: Option<&[u8]>) {
self.last_start_element.clear();
self.last_start_element
.extend(last_start_tag.unwrap_or_default());
}

fn emit_eof(&mut self) {
self.flush_current_characters();
}
fn emit_error(&mut self, _: Error) {}
fn pop_token(&mut self) -> Option<()> {
None
}

fn emit_string(&mut self, c: &[u8]) {
self.current_string.extend(c);
}

fn init_start_tag(&mut self) {
self.flush_current_characters();
self.current_element_name.clear();
self.current_element_is_closing = false;
}

fn init_end_tag(&mut self) {
self.flush_current_characters();
self.current_element_name.clear();
self.current_element_is_closing = true;
}

fn init_comment(&mut self) {
self.flush_current_characters();
}

fn emit_current_tag(&mut self) {
self.flush_old_attribute();
}

fn emit_current_doctype(&mut self) {}
fn set_self_closing(&mut self) {
self.current_element_is_closing = true;
}
fn set_force_quirks(&mut self) {}

fn push_tag_name(&mut self, s: &[u8]) {
self.current_element_name.extend(s);
}

fn push_comment(&mut self, _: &[u8]) {}
fn push_doctype_name(&mut self, _: &[u8]) {}
fn init_doctype(&mut self) {
self.flush_current_characters();
}
fn init_attribute(&mut self) {
self.flush_old_attribute();
}
fn push_attribute_name(&mut self, s: &[u8]) {
self.current_attribute_name.extend(s);
}
fn push_attribute_value(&mut self, s: &[u8]) {
self.current_attribute_value.extend(s);
}

fn set_doctype_public_identifier(&mut self, _: &[u8]) {}
fn set_doctype_system_identifier(&mut self, _: &[u8]) {}
fn push_doctype_public_identifier(&mut self, _: &[u8]) {}
fn push_doctype_system_identifier(&mut self, _: &[u8]) {}
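// The tokenizer consults this when deciding how to leave raw-text elements
// such as <script>: an end tag is "appropriate" if it matches the most
// recent start tag.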
fn current_is_appropriate_end_tag_token(&mut self) -> bool {
self.current_element_is_closing
&& !self.current_element_name.is_empty()
&& self.current_element_name == self.last_start_element
}

fn emit_current_comment(&mut self) {}
}

/// Extract unparsed URL strings from an HTML string.
pub(crate) fn extract_html(buf: &str) -> Vec<RawUri> {
let mut extractor = LinkExtractor::new();
let mut tokenizer = Tokenizer::new_with_emitter(buf, &mut extractor).infallible();
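// The emitter never yields tokens (`pop_token` is always `None`), so a
// single `next()` call drives the tokenizer through the entire input.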
assert!(tokenizer.next().is_none());
extractor.links
}
