-
-
Notifications
You must be signed in to change notification settings - Fork 107
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add html5gum as alternative link extractor #480
Changes from 23 commits
0491833
6c051a5
8087067
5438744
3a951bc
8bca503
f9403ce
de91aeb
68c67af
539406d
e6b3312
afd488b
e92721b
1b0f650
9b02f58
a8bb51b
ca359d8
283c020
ef3ecd6
57b7cdd
9aa724a
57b3057
aac8610
ccedcec
9ce7e8d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,206 @@ | ||
use html5gum::{Emitter, Error, Tokenizer}; | ||
|
||
use super::plaintext::extract_plaintext; | ||
use crate::types::raw_uri::RawUri; | ||
|
||
#[derive(Clone)] | ||
struct LinkExtractor { | ||
links: Vec<RawUri>, | ||
current_string: Vec<u8>, | ||
current_tag_name: Vec<u8>, | ||
current_tag_is_closing: bool, | ||
current_attribute_name: Vec<u8>, | ||
current_attribute_value: Vec<u8>, | ||
last_start_tag: Vec<u8>, | ||
} | ||
|
||
/// Converts `s` to a `&str` without validating it.
///
/// This is the same as [`std::str::from_utf8_unchecked`], but with an extra
/// debug assertion for ease of debugging.
///
/// # Safety
///
/// `s` must be valid UTF-8. Passing anything else is undefined behavior;
/// builds with debug assertions enabled panic instead.
unsafe fn from_utf8_unchecked(s: &[u8]) -> &str {
    debug_assert!(
        std::str::from_utf8(s).is_ok(),
        "from_utf8_unchecked called with invalid UTF-8"
    );
    // SAFETY: the caller guarantees that `s` is valid UTF-8 (see `# Safety`).
    unsafe { std::str::from_utf8_unchecked(s) }
}
|
||
impl LinkExtractor { | ||
pub(crate) const fn new() -> Self { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we could There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I did that now, and it makes |
||
LinkExtractor { | ||
links: Vec::new(), | ||
current_string: Vec::new(), | ||
current_tag_name: Vec::new(), | ||
current_tag_is_closing: false, | ||
current_attribute_name: Vec::new(), | ||
current_attribute_value: Vec::new(), | ||
last_start_tag: Vec::new(), | ||
} | ||
} | ||
|
||
/// Extract all semantically known links from a given html attribute.
///
/// Returns `Some` iterator over the URL strings found in `attr_value` when the
/// `elem_name` / `attr_name` combination is known to carry URLs, and `None`
/// for every other combination.
///
/// For `srcset`, the value is split on commas and the first
/// whitespace-separated token of each image candidate is yielded; candidates
/// without any token are skipped.
// The flat list of alternatives mirrors the attribute tables linked below;
// nesting the patterns as clippy suggests would make it harder to compare.
#[allow(clippy::unnested_or_patterns)]
pub(crate) fn extract_urls_from_elem_attr<'a>(
    attr_name: &str,
    elem_name: &str,
    attr_value: &'a str,
) -> Option<impl Iterator<Item = &'a str>> {
    // For a comprehensive list of elements that might contain URLs/URIs
    // see https://www.w3.org/TR/REC-html40/index/attributes.html
    // and https://html.spec.whatwg.org/multipage/indices.html#attributes-1
    match (elem_name, attr_name) {
        // Common element/attribute combinations for links
        (_, "href" | "src" | "cite" | "usemap")
        // Less common (but still valid!) combinations
        | ("applet", "codebase")
        | ("body", "background")
        | ("button", "formaction")
        | ("command", "icon")
        | ("form", "action")
        | ("frame", "longdesc")
        | ("head", "profile")
        | ("html", "manifest")
        | ("iframe", "longdesc")
        | ("img", "longdesc")
        | ("input", "formaction")
        | ("object", "classid")
        | ("object", "codebase")
        | ("object", "data")
        | ("video", "poster") => {
            // A `Vec` is used so both arms return the same iterator type.
            Some(vec![attr_value].into_iter())
        }
        (_, "srcset") => {
            // `split_ascii_whitespace` never yields empty items, so the first
            // item (if any) is the URL part of the image candidate.
            let urls: Vec<&'a str> = attr_value
                .trim()
                .split(',')
                .filter_map(|image_candidate| image_candidate.split_ascii_whitespace().next())
                .collect();
            Some(urls.into_iter())
        }
        _ => None,
    }
}
|
||
fn flush_current_characters(&mut self) { | ||
// safety: since we feed html5gum tokenizer with a &str, this must be a &str as well. | ||
let raw = unsafe { from_utf8_unchecked(&self.current_string) }; | ||
self.links.extend(extract_plaintext(raw)); | ||
self.current_string.clear(); | ||
} | ||
|
||
fn flush_old_attribute(&mut self) { | ||
{ | ||
// safety: since we feed html5gum tokenizer with a &str, this must be a &str as well. | ||
let name = unsafe { from_utf8_unchecked(&self.current_tag_name) }; | ||
let attr = unsafe { from_utf8_unchecked(&self.current_attribute_name) }; | ||
let value = unsafe { from_utf8_unchecked(&self.current_attribute_value) }; | ||
|
||
let urls = LinkExtractor::extract_urls_from_elem_attr(attr, name, value); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
let elem = unsafe { from_utf8_unchecked(&self.current_element_name) }; There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it's html5gum (html spec) naming clashing with lychee naming other candidates for rename are it's unclear where to draw the line to me because you clearly can't pick different method names
best i can do is do a mass rename, accept that in things like |
||
|
||
let new_urls = match urls { | ||
None => extract_plaintext(value), | ||
Some(urls) => urls | ||
.into_iter() | ||
.map(|url| RawUri { | ||
text: url.to_string(), | ||
element: Some(name.to_string()), | ||
attribute: Some(attr.to_string()), | ||
}) | ||
.collect::<Vec<_>>(), | ||
}; | ||
|
||
self.links.extend(new_urls); | ||
} | ||
|
||
self.current_attribute_name.clear(); | ||
self.current_attribute_value.clear(); | ||
} | ||
} | ||
|
||
impl Emitter for &mut LinkExtractor { | ||
type Token = (); | ||
|
||
fn set_last_start_tag(&mut self, last_start_tag: Option<&[u8]>) { | ||
self.last_start_tag.clear(); | ||
self.last_start_tag | ||
.extend(last_start_tag.unwrap_or_default()); | ||
} | ||
|
||
fn emit_eof(&mut self) { | ||
self.flush_current_characters(); | ||
} | ||
fn emit_error(&mut self, _: Error) {} | ||
mre marked this conversation as resolved.
Show resolved
Hide resolved
|
||
fn pop_token(&mut self) -> Option<()> { | ||
None | ||
} | ||
|
||
fn emit_string(&mut self, c: &[u8]) { | ||
self.current_string.extend(c); | ||
} | ||
|
||
fn init_start_tag(&mut self) { | ||
self.flush_current_characters(); | ||
self.current_tag_name.clear(); | ||
self.current_tag_is_closing = false; | ||
} | ||
|
||
fn init_end_tag(&mut self) { | ||
self.flush_current_characters(); | ||
self.current_tag_name.clear(); | ||
self.current_tag_is_closing = true; | ||
} | ||
|
||
fn init_comment(&mut self) { | ||
self.flush_current_characters(); | ||
} | ||
|
||
fn emit_current_tag(&mut self) { | ||
self.flush_old_attribute(); | ||
} | ||
|
||
fn emit_current_doctype(&mut self) {} | ||
fn set_self_closing(&mut self) { | ||
self.current_tag_is_closing = true; | ||
} | ||
fn set_force_quirks(&mut self) {} | ||
|
||
fn push_tag_name(&mut self, s: &[u8]) { | ||
self.current_tag_name.extend(s); | ||
} | ||
|
||
fn push_comment(&mut self, _: &[u8]) {} | ||
fn push_doctype_name(&mut self, _: &[u8]) {} | ||
fn init_doctype(&mut self) { | ||
self.flush_current_characters(); | ||
} | ||
fn init_attribute(&mut self) { | ||
self.flush_old_attribute(); | ||
} | ||
fn push_attribute_name(&mut self, s: &[u8]) { | ||
self.current_attribute_name.extend(s); | ||
} | ||
fn push_attribute_value(&mut self, s: &[u8]) { | ||
self.current_attribute_value.extend(s); | ||
} | ||
|
||
fn set_doctype_public_identifier(&mut self, _: &[u8]) {} | ||
fn set_doctype_system_identifier(&mut self, _: &[u8]) {} | ||
fn push_doctype_public_identifier(&mut self, _: &[u8]) {} | ||
fn push_doctype_system_identifier(&mut self, _: &[u8]) {} | ||
fn current_is_appropriate_end_tag_token(&mut self) -> bool { | ||
self.current_tag_is_closing | ||
&& !self.current_tag_name.is_empty() | ||
&& self.current_tag_name == self.last_start_tag | ||
} | ||
|
||
fn emit_current_comment(&mut self) {} | ||
} | ||
|
||
/// Extract unparsed URL strings from an HTML string. | ||
pub(crate) fn extract_html(buf: &str) -> Vec<RawUri> { | ||
let mut extractor = LinkExtractor::new(); | ||
let mut tokenizer = Tokenizer::new_with_emitter(buf, &mut extractor).infallible(); | ||
assert!(tokenizer.next().is_none()); | ||
extractor.links | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Came back for the final review and had a hard time remembering what `false` was for. An alternative would be to have `Extract::with_html5gum` and `Extract::with_html5ever`, which is more explicit without using a config struct. What do you think?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I did something like that now. It's a bit repetitive in both `Extractor` and its usages, but in return we have no breaking API change.