Skip to content

Commit

Permalink
Start working on checking domain names
Browse files Browse the repository at this point in the history
Came up in a couple of places: #41, #29, #38, #28. Hopefully we can fix
all of these with these changes.

Not done yet, still want to have domain checking for URLs with certain
schemes (https) but allow everything for others.

If we do that, we may be able to unify the email and plain domain
parsing with the scheme one too.
  • Loading branch information
robinst committed Jun 22, 2022
1 parent 9a6ce39 commit b1b6d5e
Show file tree
Hide file tree
Showing 7 changed files with 397 additions and 96 deletions.
80 changes: 80 additions & 0 deletions src/domains.rs
@@ -0,0 +1,80 @@
//! Domain name related scanning, used by both email and plain domain URL scanners.

/// Scans `s` from its start for something shaped like a domain name.
///
/// Accepted characters are ASCII letters, ASCII digits, any character at or
/// above U+0080, `-` and `.`; scanning stops at the first other character.
///
/// Returns `(end, last_dot)`:
/// * `end` is the byte offset just past the last letter, digit or non-ASCII
///   character seen, so trailing dots and hyphens are never included.
/// * `last_dot` is the byte offset of the last `.` that was followed by at
///   least one letter, digit or non-ASCII character.
///
/// `(None, None)` is returned when:
/// * a `-` appears where a label would start (start of input or right after
///   a `.`),
/// * a `.` appears at the start of input or right after another `.` or a `-`,
/// * no letter or non-ASCII character was seen and there is no `last_dot`
///   (e.g. empty input or a bare number),
/// * a letter or non-ASCII character was seen, there is a `last_dot`, and the
///   text after it is rejected by `valid_tld`.
pub(crate) fn find_domain_end(s: &str) -> (Option<usize>, Option<usize>) {
    // Offset just past the most recent character that may end the domain.
    let mut domain_end = None;
    // Most recent dot; only promoted to `confirmed_dot` once a label
    // character follows it, so a trailing dot is never reported.
    let mut pending_dot = None;
    let mut confirmed_dot = None;
    // True right after a letter/digit/non-ASCII character: a dot may follow.
    let mut may_close_label = false;
    // True once the current label has at least one letter/digit/non-ASCII
    // character: a hyphen may follow.
    let mut inside_label = false;
    // Stays true while no letter or non-ASCII character has been seen.
    let mut digits_only = true;

    for (offset, ch) in s.char_indices() {
        match ch {
            '.' => {
                // Label can't be empty, e.g. `.example.com` or `a..com`,
                // and can't end in a hyphen, e.g. `b-` in `a.b-.com`.
                if !may_close_label {
                    return (None, None);
                }
                may_close_label = false;
                inside_label = false;
                pending_dot = Some(offset);
            }
            '-' => {
                // Hyphen can't be at the start of a label, e.g. `-b` in `a.-b.com`.
                if !inside_label {
                    return (None, None);
                }
                // A dot directly after this hyphen would end the label with `-`.
                may_close_label = false;
            }
            _ if ch.is_ascii_alphanumeric() || ch >= '\u{80}' => {
                // Can start or end a domain label.
                may_close_label = true;
                inside_label = true;
                confirmed_dot = pending_dot;
                // Digits keep the flag as it is; anything else clears it.
                digits_only &= ch.is_ascii_digit();
                domain_end = Some(offset + ch.len_utf8());
            }
            _ => break,
        }
    }

    match confirmed_dot {
        // Only digits (or nothing) and no dot: not a domain.
        None if digits_only => (None, None),
        // Contains a letter/non-ASCII character, so the part after the last
        // dot has to pass the TLD check.
        Some(dot) if !digits_only && !valid_tld(&s[dot + 1..]) => (None, None),
        _ => (domain_end, confirmed_dot),
    }
}

/// Returns `true` when `tld` begins with at least two ASCII letters.
///
/// Only the first two characters are inspected; whatever follows them is
/// ignored.
fn valid_tld(tld: &str) -> bool {
    let mut leading = tld.chars();
    matches!(
        (leading.next(), leading.next()),
        (Some(first), Some(second))
            if first.is_ascii_alphabetic() && second.is_ascii_alphabetic()
    )
}
49 changes: 6 additions & 43 deletions src/email.rs
@@ -1,5 +1,6 @@
use std::ops::Range;

use crate::domains::find_domain_end;
use crate::scanner::Scanner;

/// Scan for email address starting from the trigger character "@".
Expand Down Expand Up @@ -40,6 +41,9 @@ impl EmailScanner {
break;
}
atom_boundary = true;
} else if c == '@' {
// In `@me@a.com`, we don't want to extract `me@a.com`.
return None;
} else {
break;
}
Expand All @@ -49,40 +53,8 @@ impl EmailScanner {

// See "Domain" in RFC 5321, plus extension of "sub-domain" in RFC 6531
fn find_end(&self, s: &str) -> Option<usize> {
let mut first_in_sub_domain = true;
let mut can_end_sub_domain = false;
let mut first_dot = None;
let mut end = None;

for (i, c) in s.char_indices() {
if first_in_sub_domain {
if Self::sub_domain_allowed(c) {
end = Some(i + c.len_utf8());
first_in_sub_domain = false;
can_end_sub_domain = true;
} else {
break;
}
} else if c == '.' {
if !can_end_sub_domain {
break;
}
first_in_sub_domain = true;
if first_dot.is_none() {
first_dot = Some(i);
}
} else if c == '-' {
can_end_sub_domain = false;
} else if Self::sub_domain_allowed(c) {
end = Some(i + c.len_utf8());
can_end_sub_domain = true;
} else {
break;
}
}

if let Some(end) = end {
if !self.domain_must_have_dot || first_dot.map(|d| d < end).unwrap_or(false) {
if let (Some(end), last_dot) = find_domain_end(s) {
if !self.domain_must_have_dot || last_dot.is_some() {
Some(end)
} else {
None
Expand Down Expand Up @@ -120,15 +92,6 @@ impl EmailScanner {
_ => c >= '\u{80}',
}
}

// See "sub-domain" in RFC 5321. Extension in RFC 6531 is simplified,
// this can also match invalid domains.
fn sub_domain_allowed(c: char) -> bool {
    // ASCII letters and digits, plus every character at or above U+0080.
    c.is_ascii_alphanumeric() || c >= '\u{80}'
}
}

/// Helper function to check if given string is considered an email address.
Expand Down
1 change: 1 addition & 0 deletions src/lib.rs
Expand Up @@ -120,6 +120,7 @@
#![deny(missing_docs)]
#![deny(missing_debug_implementations)]

mod domains;
mod email;
mod finder;
mod scanner;
Expand Down

0 comments on commit b1b6d5e

Please sign in to comment.