Add html5gum as alternative link extractor (#480)
html5gum is an HTML parser that offers lower-level control over which tokens actually get created and tracked. As a result, the extractor does not allocate for tokens it doesn't care about. On some benchmarks this provides a substantial performance boost. The old parser, html5ever, is still available by setting the `LYCHEE_USE_HTML5EVER=1` env var.
untitaker committed Feb 7, 2022
1 parent 6bf8c1f commit 68d09f7
Showing 9 changed files with 326 additions and 53 deletions.
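The library-side change turns `Collector` into a builder. Below is a minimal sketch of selecting the parser from library code; the `main` scaffolding and the `_collector` binding are illustrative, not part of this commit:

use lychee_lib::Collector;

fn main() {
    // html5gum is now the default parser; the builder flag opts back into
    // html5ever, mirroring how lychee-bin honors LYCHEE_USE_HTML5EVER=1 below.
    let use_html5ever = std::env::var("LYCHEE_USE_HTML5EVER").map_or(false, |x| x == "1");
    let _collector = Collector::new(None) // no base URL or directory
        .skip_missing_inputs(false) // error on missing inputs (the default)
        .use_html5ever(use_html5ever);
    // Calling `.collect_links(inputs).await` on this collector yields the
    // link stream, as in examples/collect_links/collect_links.rs below.
}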
16 changes: 16 additions & 0 deletions Cargo.lock


17 changes: 7 additions & 10 deletions examples/collect_links/collect_links.rs
@@ -20,16 +20,13 @@ async fn main() -> Result<()> {
},
];

let links = Collector::new(
None, // base
false, // don't skip missing inputs
)
.collect_links(
inputs, // base url or directory
)
.await
.collect::<Result<Vec<_>>>()
.await?;
let links = Collector::new(None) // base
.skip_missing_inputs(false) // skip missing input files? (default: false)
.use_html5ever(false) // use html5ever for HTML parsing? (default: false)
.collect_links(inputs) // the inputs to extract links from
.await
.collect::<Result<Vec<_>>>()
.await?;

dbg!(links);

5 changes: 4 additions & 1 deletion lychee-bin/src/main.rs
@@ -200,7 +200,10 @@ fn run_main() -> Result<i32> {
/// Run lychee on the given inputs
async fn run(opts: &LycheeOptions) -> Result<i32> {
let inputs = opts.inputs();
let requests = Collector::new(opts.config.base.clone(), opts.config.skip_missing)
let requests = Collector::new(opts.config.base.clone())
.skip_missing_inputs(opts.config.skip_missing)
// File a bug if you rely on this envvar! It's going to go away eventually.
.use_html5ever(std::env::var("LYCHEE_USE_HTML5EVER").map_or(false, |x| x == "1"))
.collect_links(inputs)
.await;

3 changes: 2 additions & 1 deletion lychee-lib/Cargo.toml
@@ -20,7 +20,6 @@ version = "0.8.2"
check-if-email-exists = "0.8.26"
fast_chemail = "0.9.6"
glob = "0.3.0"
html5ever = "0.25.1"
http = "0.2.6"
hubcaps = "0.6.2"
linkify = "0.8.0"
@@ -50,6 +49,8 @@ once_cell = "1.9.0"
thiserror = "1.0.30"
futures = "0.3.19"
lazy_static = "1.4.0"
html5ever = "0.25.1"
html5gum = "0.4.0"

[dependencies.par-stream]
version = "0.10.0"
28 changes: 24 additions & 4 deletions lychee-lib/src/collector.rs
@@ -13,18 +13,34 @@ use par_stream::ParStreamExt;
pub struct Collector {
base: Option<Base>,
skip_missing_inputs: bool,
use_html5ever: bool,
}

impl Collector {
/// Create a new collector with an empty cache
#[must_use]
pub const fn new(base: Option<Base>, skip_missing_inputs: bool) -> Self {
pub const fn new(base: Option<Base>) -> Self {
Collector {
base,
skip_missing_inputs,
skip_missing_inputs: false,
use_html5ever: false,
}
}

/// Skip missing input files (default is to error if they don't exist)
#[must_use]
pub const fn skip_missing_inputs(mut self, yes: bool) -> Self {
self.skip_missing_inputs = yes;
self
}

/// Use `html5ever` to parse HTML instead of `html5gum`.
#[must_use]
pub const fn use_html5ever(mut self, yes: bool) -> Self {
self.use_html5ever = yes;
self
}

/// Fetch all unique links from inputs
/// All relative URLs get prefixed with `base` (if given).
/// (This can be a directory or a base URL)
@@ -47,7 +63,11 @@ impl Collector {
let base = base.clone();
async move {
let content = content?;
let uris: Vec<RawUri> = Extractor::extract(&content);
let uris: Vec<RawUri> = if self.use_html5ever {
Extractor::extract_html5ever(&content)
} else {
Extractor::extract(&content)
};
let requests = request::create(uris, &content, &base)?;
Result::Ok(stream::iter(requests.into_iter().map(Ok)))
}
@@ -74,7 +94,7 @@ mod test {

// Helper function to run the collector on the given inputs
async fn collect(inputs: Vec<Input>, base: Option<Base>) -> HashSet<Uri> {
let responses = Collector::new(base, false).collect_links(inputs).await;
let responses = Collector::new(base).collect_links(inputs).await;
responses.map(|r| r.unwrap().uri).collect().await
}

23 changes: 3 additions & 20 deletions lychee-lib/src/extract/html.rs
@@ -7,7 +7,7 @@ use html5ever::{
use super::plaintext::extract_plaintext;
use crate::types::raw_uri::RawUri;

#[derive(Clone)]
#[derive(Clone, Default)]
struct LinkExtractor {
links: Vec<RawUri>,
}
@@ -61,8 +61,8 @@ impl TokenSink for LinkExtractor {
}

impl LinkExtractor {
pub(crate) const fn new() -> Self {
Self { links: Vec::new() }
pub(crate) fn new() -> Self {
LinkExtractor::default()
}

/// Extract all semantically known links from a given html attribute.
@@ -125,20 +125,3 @@ pub(crate) fn extract_html(buf: &str) -> Vec<RawUri> {

tokenizer.sink.links
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn test_extract_link_at_end_of_line() {
let input = "https://www.apache.org/licenses/LICENSE-2.0\n";
let link = input.trim_end();

let uris: Vec<String> = extract_html(input)
.into_iter()
.map(|raw_uri| raw_uri.text)
.collect();
assert_eq!(vec![link.to_string()], uris);
}
}
207 changes: 207 additions & 0 deletions lychee-lib/src/extract/html5gum.rs
@@ -0,0 +1,207 @@
use html5gum::{Emitter, Error, Tokenizer};

use super::plaintext::extract_plaintext;
use crate::types::raw_uri::RawUri;

#[derive(Clone)]
struct LinkExtractor {
// note: what html5gum calls a tag, lychee calls an element
links: Vec<RawUri>,
current_string: Vec<u8>,
current_element_name: Vec<u8>,
current_element_is_closing: bool,
current_attribute_name: Vec<u8>,
current_attribute_value: Vec<u8>,
last_start_element: Vec<u8>,
}

/// this is the same as `std::str::from_utf8_unchecked`, but with extra debug assertions for ease
/// of debugging
unsafe fn from_utf8_unchecked(s: &[u8]) -> &str {
debug_assert!(std::str::from_utf8(s).is_ok());
std::str::from_utf8_unchecked(s)
}

impl LinkExtractor {
pub(crate) const fn new() -> Self {
LinkExtractor {
links: Vec::new(),
current_string: Vec::new(),
current_element_name: Vec::new(),
current_element_is_closing: false,
current_attribute_name: Vec::new(),
current_attribute_value: Vec::new(),
last_start_element: Vec::new(),
}
}

/// Extract all semantically known links from a given html attribute.
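/// For example, `("href", "a", "/about")` yields `Some(["/about"])`; for
/// combinations not listed below, `None` is returned and the caller falls
/// back to plaintext extraction of the attribute value.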
#[allow(clippy::unnested_or_patterns)]
pub(crate) fn extract_urls_from_elem_attr<'a>(
attr_name: &str,
elem_name: &str,
attr_value: &'a str,
) -> Option<impl Iterator<Item = &'a str>> {
// For a comprehensive list of elements that might contain URLs/URIs
// see https://www.w3.org/TR/REC-html40/index/attributes.html
// and https://html.spec.whatwg.org/multipage/indices.html#attributes-1
match (elem_name, attr_name) {
// Common element/attribute combinations for links
(_, "href" | "src" | "cite" | "usemap")
// Less common (but still valid!) combinations
| ("applet", "codebase")
| ("body", "background")
| ("button", "formaction")
| ("command", "icon")
| ("form", "action")
| ("frame", "longdesc")
| ("head", "profile")
| ("html", "manifest")
| ("iframe", "longdesc")
| ("img", "longdesc")
| ("input", "formaction")
| ("object", "classid")
| ("object", "codebase")
| ("object", "data")
| ("video", "poster") => {
Some(vec![attr_value].into_iter())
}
(_, "srcset") => {
let mut urls = Vec::new();
for image_candidate_string in attr_value.trim().split(',') {
for part in image_candidate_string.split_ascii_whitespace() {
if part.is_empty() {
continue;
}
urls.push(part);
break;
}
}
Some(urls.into_iter())
}
_ => None,
}
}

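// Character data between tags may contain bare URLs; once a run of
// characters ends, hand the accumulated text to the plaintext extractor.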
fn flush_current_characters(&mut self) {
// Safety: the tokenizer is fed a &str, so the accumulated bytes are valid UTF-8.
let raw = unsafe { from_utf8_unchecked(&self.current_string) };
self.links.extend(extract_plaintext(raw));
self.current_string.clear();
}

fn flush_old_attribute(&mut self) {
{
// Safety: the tokenizer is fed a &str, so these element/attribute bytes are valid UTF-8.
let name = unsafe { from_utf8_unchecked(&self.current_element_name) };
let attr = unsafe { from_utf8_unchecked(&self.current_attribute_name) };
let value = unsafe { from_utf8_unchecked(&self.current_attribute_value) };

let urls = LinkExtractor::extract_urls_from_elem_attr(attr, name, value);

let new_urls = match urls {
None => extract_plaintext(value),
Some(urls) => urls
.into_iter()
.map(|url| RawUri {
text: url.to_string(),
element: Some(name.to_string()),
attribute: Some(attr.to_string()),
})
.collect::<Vec<_>>(),
};

self.links.extend(new_urls);
}

self.current_attribute_name.clear();
self.current_attribute_value.clear();
}
}

impl Emitter for &mut LinkExtractor {
type Token = ();
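// With `()` as the token type, html5gum never materializes full token
// objects; the callbacks below copy out only the bytes the extractor needs.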

fn set_last_start_tag(&mut self, last_start_tag: Option<&[u8]>) {
self.last_start_element.clear();
self.last_start_element
.extend(last_start_tag.unwrap_or_default());
}

fn emit_eof(&mut self) {
self.flush_current_characters();
}
fn emit_error(&mut self, _: Error) {}
fn pop_token(&mut self) -> Option<()> {
None
}

fn emit_string(&mut self, c: &[u8]) {
self.current_string.extend(c);
}

fn init_start_tag(&mut self) {
self.flush_current_characters();
self.current_element_name.clear();
self.current_element_is_closing = false;
}

fn init_end_tag(&mut self) {
self.flush_current_characters();
self.current_element_name.clear();
self.current_element_is_closing = true;
}

fn init_comment(&mut self) {
self.flush_current_characters();
}

fn emit_current_tag(&mut self) {
self.flush_old_attribute();
}

fn emit_current_doctype(&mut self) {}
fn set_self_closing(&mut self) {
self.current_element_is_closing = true;
}
fn set_force_quirks(&mut self) {}

fn push_tag_name(&mut self, s: &[u8]) {
self.current_element_name.extend(s);
}

fn push_comment(&mut self, _: &[u8]) {}
fn push_doctype_name(&mut self, _: &[u8]) {}
fn init_doctype(&mut self) {
self.flush_current_characters();
}
fn init_attribute(&mut self) {
self.flush_old_attribute();
}
fn push_attribute_name(&mut self, s: &[u8]) {
self.current_attribute_name.extend(s);
}
fn push_attribute_value(&mut self, s: &[u8]) {
self.current_attribute_value.extend(s);
}

fn set_doctype_public_identifier(&mut self, _: &[u8]) {}
fn set_doctype_system_identifier(&mut self, _: &[u8]) {}
fn push_doctype_public_identifier(&mut self, _: &[u8]) {}
fn push_doctype_system_identifier(&mut self, _: &[u8]) {}
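// The tokenizer consults this when deciding how to leave raw-text elements
// such as <script>: an end tag is "appropriate" if it matches the most
// recent start tag.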
fn current_is_appropriate_end_tag_token(&mut self) -> bool {
self.current_element_is_closing
&& !self.current_element_name.is_empty()
&& self.current_element_name == self.last_start_element
}

fn emit_current_comment(&mut self) {}
}

/// Extract unparsed URL strings from an HTML string.
pub(crate) fn extract_html(buf: &str) -> Vec<RawUri> {
let mut extractor = LinkExtractor::new();
let mut tokenizer = Tokenizer::new_with_emitter(buf, &mut extractor).infallible();
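// The emitter never yields tokens (`pop_token` is always `None`), so a
// single `next()` call drives the tokenizer through the entire input.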
assert!(tokenizer.next().is_none());
extractor.links
}
