Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add html5gum as alternative link extractor #480

Merged
merged 25 commits into from
Feb 7, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
16 changes: 16 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 7 additions & 10 deletions examples/collect_links/collect_links.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,13 @@ async fn main() -> Result<()> {
},
];

let links = Collector::new(
None, // base
false, // don't skip missing inputs
)
.collect_links(
inputs, // base url or directory
)
.await
.collect::<Result<Vec<_>>>()
.await?;
let links = Collector::new(None) // base
.skip_missing_inputs(false) // don't skip missing inputs? (default=false)
.use_html5ever(false) // use html5ever for parsing? (default=false)
.collect_links(inputs) // base url or directory
.await
.collect::<Result<Vec<_>>>()
.await?;

dbg!(links);

Expand Down
5 changes: 4 additions & 1 deletion lychee-bin/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,10 @@ fn run_main() -> Result<i32> {
/// Run lychee on the given inputs
async fn run(opts: &LycheeOptions) -> Result<i32> {
let inputs = opts.inputs();
let requests = Collector::new(opts.config.base.clone(), opts.config.skip_missing)
let requests = Collector::new(opts.config.base.clone())
.skip_missing_inputs(opts.config.skip_missing)
// File a bug if you rely on this envvar! It's going to go away eventually.
.use_html5ever(std::env::var("LYCHEE_USE_HTML5EVER").map_or(false, |x| x == "1"))
.collect_links(inputs)
.await;

Expand Down
3 changes: 2 additions & 1 deletion lychee-lib/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ version = "0.8.2"
check-if-email-exists = "0.8.26"
fast_chemail = "0.9.6"
glob = "0.3.0"
html5ever = "0.25.1"
http = "0.2.6"
hubcaps = "0.6.2"
linkify = "0.8.0"
Expand Down Expand Up @@ -50,6 +49,8 @@ once_cell = "1.9.0"
thiserror = "1.0.30"
futures = "0.3.19"
lazy_static = "1.4.0"
html5ever = "0.25.1"
html5gum = "0.4.0"

[dependencies.par-stream]
version = "0.10.0"
Expand Down
28 changes: 24 additions & 4 deletions lychee-lib/src/collector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,34 @@ use par_stream::ParStreamExt;
pub struct Collector {
base: Option<Base>,
skip_missing_inputs: bool,
use_html5ever: bool,
}

impl Collector {
/// Create a new collector with an empty cache
#[must_use]
pub const fn new(base: Option<Base>, skip_missing_inputs: bool) -> Self {
pub const fn new(base: Option<Base>) -> Self {
Collector {
base,
skip_missing_inputs,
skip_missing_inputs: false,
use_html5ever: false,
}
}

/// Skip missing input files (default is to error if they don't exist)
#[must_use]
pub const fn skip_missing_inputs(mut self, yes: bool) -> Self {
self.skip_missing_inputs = yes;
self
}

/// Use `html5ever` to parse HTML instead of `html5gum`.
#[must_use]
pub const fn use_html5ever(mut self, yes: bool) -> Self {
self.use_html5ever = yes;
self
}

/// Fetch all unique links from inputs
/// All relative URLs get prefixed with `base` (if given).
/// (This can be a directory or a base URL)
Expand All @@ -47,7 +63,11 @@ impl Collector {
let base = base.clone();
async move {
let content = content?;
let uris: Vec<RawUri> = Extractor::extract(&content);
let uris: Vec<RawUri> = if self.use_html5ever {
Extractor::extract_html5ever(&content)
} else {
Extractor::extract(&content)
};
let requests = request::create(uris, &content, &base)?;
Result::Ok(stream::iter(requests.into_iter().map(Ok)))
}
Expand All @@ -74,7 +94,7 @@ mod test {

// Helper function to run the collector on the given inputs
async fn collect(inputs: Vec<Input>, base: Option<Base>) -> HashSet<Uri> {
let responses = Collector::new(base, false).collect_links(inputs).await;
let responses = Collector::new(base).collect_links(inputs).await;
responses.map(|r| r.unwrap().uri).collect().await
}

Expand Down
23 changes: 3 additions & 20 deletions lychee-lib/src/extract/html.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use html5ever::{
use super::plaintext::extract_plaintext;
use crate::types::raw_uri::RawUri;

#[derive(Clone)]
#[derive(Clone, Default)]
struct LinkExtractor {
links: Vec<RawUri>,
}
Expand Down Expand Up @@ -61,8 +61,8 @@ impl TokenSink for LinkExtractor {
}

impl LinkExtractor {
pub(crate) const fn new() -> Self {
Self { links: Vec::new() }
pub(crate) fn new() -> Self {
LinkExtractor::default()
}

/// Extract all semantically known links from a given html attribute.
Expand Down Expand Up @@ -125,20 +125,3 @@ pub(crate) fn extract_html(buf: &str) -> Vec<RawUri> {

tokenizer.sink.links
}

#[cfg(test)]
mre marked this conversation as resolved.
Show resolved Hide resolved
mod tests {
use super::*;

#[test]
fn test_extract_link_at_end_of_line() {
let input = "https://www.apache.org/licenses/LICENSE-2.0\n";
let link = input.trim_end();

let uris: Vec<String> = extract_html(input)
.into_iter()
.map(|raw_uri| raw_uri.text)
.collect();
assert_eq!(vec![link.to_string()], uris);
}
}
207 changes: 207 additions & 0 deletions lychee-lib/src/extract/html5gum.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
use html5gum::{Emitter, Error, Tokenizer};

use super::plaintext::extract_plaintext;
use crate::types::raw_uri::RawUri;

/// Streaming sink fed by the `html5gum` tokenizer; accumulates every link
/// candidate seen while walking an HTML document.
#[derive(Clone)]
struct LinkExtractor {
    // note: what html5gum calls a tag, lychee calls an element
    /// All links extracted so far, from both attributes and plain text.
    links: Vec<RawUri>,
    /// Character data accumulated since the last flush; scanned for bare URLs.
    current_string: Vec<u8>,
    /// Name of the element currently being tokenized.
    current_element_name: Vec<u8>,
    /// Whether the current element is a closing (`</...>`) or self-closing tag.
    current_element_is_closing: bool,
    /// Name of the attribute currently being tokenized.
    current_attribute_name: Vec<u8>,
    /// Value of the attribute currently being tokenized.
    current_attribute_value: Vec<u8>,
    /// Name of the most recent start tag, as reported via `set_last_start_tag`.
    last_start_element: Vec<u8>,
}

/// This is the same as `std::str::from_utf8_unchecked`, but with extra debug assertions for ease
/// of debugging.
///
/// # Safety
///
/// `s` must contain valid UTF-8. Callers in this module only pass byte buffers
/// whose contents were copied from a `&str` fed to the tokenizer, so the
/// invariant holds by construction.
unsafe fn from_utf8_unchecked(s: &[u8]) -> &str {
    debug_assert!(std::str::from_utf8(s).is_ok());
    std::str::from_utf8_unchecked(s)
}

impl LinkExtractor {
pub(crate) const fn new() -> Self {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we could #[derive(Default)] for LinkExtractor and this code goes away?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did that now, and it makes new non-const as Default is not const

LinkExtractor {
links: Vec::new(),
current_string: Vec::new(),
current_element_name: Vec::new(),
current_element_is_closing: false,
current_attribute_name: Vec::new(),
current_attribute_value: Vec::new(),
last_start_element: Vec::new(),
}
}

/// Extract all semantically known links from a given html attribute.
#[allow(clippy::unnested_or_patterns)]
pub(crate) fn extract_urls_from_elem_attr<'a>(
attr_name: &str,
elem_name: &str,
attr_value: &'a str,
) -> Option<impl Iterator<Item = &'a str>> {
// For a comprehensive list of elements that might contain URLs/URIs
// see https://www.w3.org/TR/REC-html40/index/attributes.html
// and https://html.spec.whatwg.org/multipage/indices.html#attributes-1
match (elem_name, attr_name) {
// Common element/attribute combinations for links
(_, "href" | "src" | "cite" | "usemap")
// Less common (but still valid!) combinations
| ("applet", "codebase")
| ("body", "background")
| ("button", "formaction")
| ("command", "icon")
| ("form", "action")
| ("frame", "longdesc")
| ("head", "profile")
| ("html", "manifest")
| ("iframe", "longdesc")
| ("img", "longdesc")
| ("input", "formaction")
| ("object", "classid")
| ("object", "codebase")
| ("object", "data")
| ("video", "poster") => {
Some(vec![attr_value].into_iter())
}
(_, "srcset") => {
let mut urls = Vec::new();
for image_candidate_string in attr_value.trim().split(',') {
for part in image_candidate_string.split_ascii_whitespace() {
if part.is_empty() {
continue;
}
urls.push(part);
break;
}
}
Some(urls.into_iter())
}
_ => None,
}
}

fn flush_current_characters(&mut self) {
// safety: since we feed html5gum tokenizer with a &str, this must be a &str as well.
let raw = unsafe { from_utf8_unchecked(&self.current_string) };
self.links.extend(extract_plaintext(raw));
self.current_string.clear();
}

fn flush_old_attribute(&mut self) {
{
// safety: since we feed html5gum tokenizer with a &str, this must be a &str as well.
let name = unsafe { from_utf8_unchecked(&self.current_element_name) };
let attr = unsafe { from_utf8_unchecked(&self.current_attribute_name) };
let value = unsafe { from_utf8_unchecked(&self.current_attribute_value) };

let urls = LinkExtractor::extract_urls_from_elem_attr(attr, name, value);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

element is also called tag or name in this function. Maybe we should stick to one word?

let elem = unsafe { from_utf8_unchecked(&self.current_element_name) };

Copy link
Collaborator Author

@untitaker untitaker Feb 4, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it's html5gum (html spec) naming clashing with lychee naming

other candidates for rename are last_start_tag but they are not used by lychee

it's unclear where to draw the line to me because you clearly can't pick different method names

i chose to do just the rename you proposed (as that's the attribute we use for link extraction) but now the discrepancy is visible in the struct definition

best i can do is do a mass rename, accept that in things like set_last_start_tag the variable names are inconsistent and document that elem and tag are the same thing


let new_urls = match urls {
None => extract_plaintext(value),
Some(urls) => urls
.into_iter()
.map(|url| RawUri {
text: url.to_string(),
element: Some(name.to_string()),
attribute: Some(attr.to_string()),
})
.collect::<Vec<_>>(),
};

self.links.extend(new_urls);
}

self.current_attribute_name.clear();
self.current_attribute_value.clear();
}
}

impl Emitter for &mut LinkExtractor {
    // We never materialize tokens; links are harvested as a side effect of
    // the callbacks below.
    type Token = ();

    fn pop_token(&mut self) -> Option<()> {
        None
    }

    fn set_last_start_tag(&mut self, last_start_tag: Option<&[u8]>) {
        self.last_start_element.clear();
        if let Some(tag) = last_start_tag {
            self.last_start_element.extend(tag);
        }
    }

    fn emit_eof(&mut self) {
        self.flush_current_characters();
    }

    fn emit_string(&mut self, chunk: &[u8]) {
        self.current_string.extend(chunk);
    }

    // --- element (tag) lifecycle ---

    fn init_start_tag(&mut self) {
        self.flush_current_characters();
        self.current_element_name.clear();
        self.current_element_is_closing = false;
    }

    fn init_end_tag(&mut self) {
        self.flush_current_characters();
        self.current_element_name.clear();
        self.current_element_is_closing = true;
    }

    fn push_tag_name(&mut self, bytes: &[u8]) {
        self.current_element_name.extend(bytes);
    }

    fn set_self_closing(&mut self) {
        self.current_element_is_closing = true;
    }

    fn emit_current_tag(&mut self) {
        self.flush_old_attribute();
    }

    // --- attribute lifecycle ---

    fn init_attribute(&mut self) {
        self.flush_old_attribute();
    }

    fn push_attribute_name(&mut self, bytes: &[u8]) {
        self.current_attribute_name.extend(bytes);
    }

    fn push_attribute_value(&mut self, bytes: &[u8]) {
        self.current_attribute_value.extend(bytes);
    }

    // --- comments and doctypes: flush pending characters, ignore content ---

    fn init_comment(&mut self) {
        self.flush_current_characters();
    }

    fn init_doctype(&mut self) {
        self.flush_current_characters();
    }

    fn current_is_appropriate_end_tag_token(&mut self) -> bool {
        self.current_element_is_closing
            && !self.current_element_name.is_empty()
            && self.current_element_name == self.last_start_element
    }

    // The remaining callbacks carry no link information; ignore them.
    fn emit_error(&mut self, _: Error) {}
    fn emit_current_comment(&mut self) {}
    fn emit_current_doctype(&mut self) {}
    fn set_force_quirks(&mut self) {}
    fn push_comment(&mut self, _: &[u8]) {}
    fn push_doctype_name(&mut self, _: &[u8]) {}
    fn set_doctype_public_identifier(&mut self, _: &[u8]) {}
    fn set_doctype_system_identifier(&mut self, _: &[u8]) {}
    fn push_doctype_public_identifier(&mut self, _: &[u8]) {}
    fn push_doctype_system_identifier(&mut self, _: &[u8]) {}
}

/// Extract unparsed URL strings from an HTML string.
pub(crate) fn extract_html(buf: &str) -> Vec<RawUri> {
    let mut link_extractor = LinkExtractor::new();
    {
        // With `Token = ()` and `pop_token` always returning `None`, the
        // infallible tokenizer runs to completion without yielding any token.
        let mut tokenizer = Tokenizer::new_with_emitter(buf, &mut link_extractor).infallible();
        assert!(tokenizer.next().is_none());
    }
    link_extractor.links
}