From 7f1bd6ce1c2fde599a757302a843a60e714c5f72 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Mon, 15 Jul 2019 21:19:21 +0200 Subject: [PATCH 1/2] percent-encoding: make sets be values of one type, instead of types that implement a trait Fix https://github.com/servo/rust-url/issues/388 --- Cargo.toml | 2 +- data-url/src/lib.rs | 6 +- percent_encoding/Cargo.toml | 3 +- percent_encoding/lib.rs | 348 ++++++++++++++++++------------------ src/form_urlencoded.rs | 6 +- src/host.rs | 4 +- src/lib.rs | 28 ++- src/parser.rs | 113 ++++++++---- tests/data.rs | 2 +- tests/unit.rs | 38 +--- 10 files changed, 277 insertions(+), 273 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 4c69f8c85..2c755ccc6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -39,7 +39,7 @@ bencher = "0.1" [dependencies] idna = { version = "0.2.0", path = "./idna" } matches = "0.1" -percent-encoding = { version = "1.0.0", path = "./percent_encoding" } +percent-encoding = { version = "2.0.0", path = "./percent_encoding" } serde = {version = "1.0", optional = true} [[bench]] diff --git a/data-url/src/lib.rs b/data-url/src/lib.rs index f6f023dae..bc0ee1961 100644 --- a/data-url/src/lib.rs +++ b/data-url/src/lib.rs @@ -103,7 +103,7 @@ impl<'a> FragmentIdentifier<'a> { match byte { // Ignore ASCII tabs or newlines like the URL parser would b'\t' | b'\n' | b'\r' => continue, - // Fragment encode set + // https://url.spec.whatwg.org/#fragment-percent-encode-set b'\0'...b' ' | b'"' | b'<' | b'>' | b'`' | b'\x7F'...b'\xFF' => { percent_encode(byte, &mut string) } @@ -182,10 +182,10 @@ fn parse_header(from_colon_to_comma: &str) -> (mime::Mime, bool) { // Ignore ASCII tabs or newlines like the URL parser would b'\t' | b'\n' | b'\r' => continue, - // C0 encode set + // https://url.spec.whatwg.org/#c0-control-percent-encode-set b'\0'...b'\x1F' | b'\x7F'...b'\xFF' => percent_encode(byte, &mut string), - // Bytes other than the C0 encode set that are percent-encoded + // Bytes other than the C0 percent-encode set that 
are percent-encoded // by the URL parser in the query state. // '#' is also in that list but cannot occur here // since it indicates the start of the URL’s fragment. diff --git a/percent_encoding/Cargo.toml b/percent_encoding/Cargo.toml index 4aad9c858..a737e333c 100644 --- a/percent_encoding/Cargo.toml +++ b/percent_encoding/Cargo.toml @@ -1,12 +1,11 @@ [package] name = "percent-encoding" -version = "1.0.2" +version = "2.0.0" authors = ["The rust-url developers"] description = "Percent encoding and decoding" repository = "https://github.com/servo/rust-url/" license = "MIT/Apache-2.0" [lib] -doctest = false test = false path = "lib.rs" diff --git a/percent_encoding/lib.rs b/percent_encoding/lib.rs index a5c2987a2..170674aa8 100644 --- a/percent_encoding/lib.rs +++ b/percent_encoding/lib.rs @@ -6,30 +6,35 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -//! URLs use special chacters to indicate the parts of the request. For example, a forward slash -//! indicates a path. In order for that character to exist outside of a path separator, that -//! character would need to be encoded. +//! URLs use special characters to indicate the parts of the request. +//! For example, a `?` question mark marks the end of a path and the start of a query string. +//! In order for that character to exist inside a path, it needs to be encoded differently. //! -//! Percent encoding replaces reserved characters with the `%` escape character followed by hexidecimal -//! ASCII representaton. For non-ASCII character that are percent encoded, a UTF-8 byte sequence -//! becomes percent encoded. A simple example can be seen when the space literal is replaced with -//! `%20`. +//! Percent encoding replaces reserved characters with the `%` escape character +//! followed by a byte value as two hexadecimal digits. +//! For example, an ASCII space is replaced with `%20`. //! -//! 
Percent encoding is further complicated by the fact that different parts of an URL have -//! different encoding requirements. In order to support the variety of encoding requirements, -//! `url::percent_encoding` includes different *encode sets*. -//! See [URL Standard](https://url.spec.whatwg.org/#percent-encoded-bytes) for details. +//! When encoding, the set of characters that can (and should, for readability) be left alone +//! depends on the context. +//! The `?` question mark mentioned above is not a separator when used literally +//! inside of a query string, and therefore does not need to be encoded. +//! The [`AsciiSet`] parameter of [`percent_encode`] and [`utf8_percent_encode`] +//! lets callers configure this. //! -//! This module provides some `*_ENCODE_SET` constants. -//! If a different set is required, it can be created with -//! the [`define_encode_set!`](../macro.define_encode_set!.html) macro. +//! This crate deliberately does not provide many different sets. +//! Users should consider in what context the encoded string will be used, +//! read relevant specifications, and define their own set. +//! This is done by using the `add` method of an existing set. //! //! # Examples //! //! ``` -//! use url::percent_encoding::{utf8_percent_encode, DEFAULT_ENCODE_SET}; +//! use percent_encoding::{utf8_percent_encode, AsciiSet, CONTROLS}; //! -//! assert_eq!(utf8_percent_encode("foo bar?", DEFAULT_ENCODE_SET).to_string(), "foo%20bar%3F"); +//! /// https://url.spec.whatwg.org/#fragment-percent-encode-set +//! const FRAGMENT: &AsciiSet = &CONTROLS.add(b' ').add(b'"').add(b'<').add(b'>').add(b'`'); +//! +//! assert_eq!(utf8_percent_encode("foo <bar>", FRAGMENT).to_string(), "foo%20%3Cbar%3E"); //! ``` use std::borrow::Cow; @@ -37,137 +42,127 @@ use std::fmt; use std::slice; use std::str; -/// Represents a set of characters / bytes that should be percent-encoded. -/// -/// See [encode sets specification](http://url.spec.whatwg.org/#simple-encode-set). 
-/// -/// Different characters need to be encoded in different parts of an URL. -/// For example, a literal `?` question mark in an URL’s path would indicate -/// the start of the query string. -/// A question mark meant to be part of the path therefore needs to be percent-encoded. -/// In the query string however, a question mark does not have any special meaning -/// and does not need to be percent-encoded. +/// Represents a set of characters or bytes in the ASCII range. /// -/// A few sets are defined in this module. -/// Use the [`define_encode_set!`](../macro.define_encode_set!.html) macro to define different ones. -pub trait EncodeSet: Clone { - /// Called with UTF-8 bytes rather than code points. - /// Should return true for all non-ASCII bytes. - fn contains(&self, byte: u8) -> bool; -} - -/// Define a new struct -/// that implements the [`EncodeSet`](percent_encoding/trait.EncodeSet.html) trait, -/// for use in [`percent_decode()`](percent_encoding/fn.percent_encode.html) -/// and related functions. +/// This is used in [`percent_encode`] and [`utf8_percent_encode`]. +/// This is similar to [percent-encode sets](https://url.spec.whatwg.org/#percent-encoded-bytes). /// -/// Parameters are characters to include in the set in addition to those of the base set. -/// See [encode sets specification](http://url.spec.whatwg.org/#simple-encode-set). +/// Use the `add` method of an existing set to define a new set. For example: /// -/// Example -/// ======= +/// ``` +/// use percent_encoding::{AsciiSet, CONTROLS}; /// -/// ```rust -/// #[macro_use] extern crate percent_encoding; -/// use percent_encoding::{utf8_percent_encode, SIMPLE_ENCODE_SET}; -/// define_encode_set! { -/// /// This encode set is used in the URL parser for query strings. 
-/// pub QUERY_ENCODE_SET = [SIMPLE_ENCODE_SET] | {' ', '"', '#', '<', '>'} -/// } -/// # fn main() { -/// assert_eq!(utf8_percent_encode("foo bar", QUERY_ENCODE_SET).collect::(), "foo%20bar"); -/// # } +/// /// https://url.spec.whatwg.org/#fragment-percent-encode-set +/// const FRAGMENT: &AsciiSet = &CONTROLS.add(b' ').add(b'"').add(b'<').add(b'>').add(b'`'); /// ``` -#[macro_export] -macro_rules! define_encode_set { - ($(#[$attr: meta])* pub $name: ident = [$base_set: expr] | {$($ch: pat),*}) => { - $(#[$attr])* - #[derive(Copy, Clone, Debug)] - #[allow(non_camel_case_types)] - pub struct $name; - - impl $crate::EncodeSet for $name { - #[inline] - fn contains(&self, byte: u8) -> bool { - match byte as char { - $( - $ch => true, - )* - _ => $base_set.contains(byte) - } - } - } - } +pub struct AsciiSet { + mask: [Chunk; ASCII_RANGE_LEN / BITS_PER_CHUNK], } -/// This encode set is used for the path of cannot-be-a-base URLs. -/// -/// All ASCII charcters less than hexidecimal 20 and greater than 7E are encoded. This includes -/// special charcters such as line feed, carriage return, NULL, etc. -#[derive(Copy, Clone, Debug)] -#[allow(non_camel_case_types)] -pub struct SIMPLE_ENCODE_SET; - -impl EncodeSet for SIMPLE_ENCODE_SET { - #[inline] - fn contains(&self, byte: u8) -> bool { - byte < 0x20 || byte > 0x7E +type Chunk = u32; + +const ASCII_RANGE_LEN: usize = 0x80; + +const BITS_PER_CHUNK: usize = 8 * std::mem::size_of::(); + +impl AsciiSet { + /// Called with UTF-8 bytes rather than code points. + /// Not used for non-ASCII bytes. + const fn contains(&self, byte: u8) -> bool { + let chunk = self.mask[byte as usize / BITS_PER_CHUNK]; + let mask = 1 << (byte as usize % BITS_PER_CHUNK); + (chunk & mask) != 0 } -} -define_encode_set! { - /// This encode set is used in the URL parser for query strings. 
- /// - /// Aside from special chacters defined in the [`SIMPLE_ENCODE_SET`](struct.SIMPLE_ENCODE_SET.html), - /// space, double quote ("), hash (#), and inequality qualifiers (<), (>) are encoded. - pub QUERY_ENCODE_SET = [SIMPLE_ENCODE_SET] | {' ', '"', '#', '<', '>'} -} + fn should_percent_encode(&self, byte: u8) -> bool { + !byte.is_ascii() || self.contains(byte) + } -define_encode_set! { - /// This encode set is used for path components. - /// - /// Aside from special chacters defined in the [`SIMPLE_ENCODE_SET`](struct.SIMPLE_ENCODE_SET.html), - /// space, double quote ("), hash (#), inequality qualifiers (<), (>), backtick (`), - /// question mark (?), and curly brackets ({), (}) are encoded. - pub DEFAULT_ENCODE_SET = [QUERY_ENCODE_SET] | {'`', '?', '{', '}'} + pub const fn add(&self, byte: u8) -> Self { + let mut mask = self.mask; + mask[byte as usize / BITS_PER_CHUNK] |= 1 << (byte as usize % BITS_PER_CHUNK); + AsciiSet { mask } + } } -define_encode_set! { - /// This encode set is used for on '/'-separated path segment - /// - /// Aside from special chacters defined in the [`SIMPLE_ENCODE_SET`](struct.SIMPLE_ENCODE_SET.html), - /// space, double quote ("), hash (#), inequality qualifiers (<), (>), backtick (`), - /// question mark (?), and curly brackets ({), (}), percent sign (%), forward slash (/) are - /// encoded. - /// - /// # Note - /// - /// For [special URLs](https://url.spec.whatwg.org/#is-special), the backslash (\) character should - /// additionally be escaped, but that is *not* included in this encode set. - pub PATH_SEGMENT_ENCODE_SET = [DEFAULT_ENCODE_SET] | {'%', '/'} -} +/// The set of 0x00 to 0x1F (C0 controls), and 0x7F (DEL). +/// +/// Note that this includes the newline and tab characters, but not the space 0x20. +/// +/// +pub const CONTROLS: &AsciiSet = &AsciiSet { + mask: [ + !0_u32, // C0: 0x00 to 0x1F (32 bits set) + 0, + 0, + 1 << (0x7F_u32 % 32), // DEL: 0x7F (one bit set) + ], +}; -define_encode_set! 
{ - /// This encode set is used for username and password. - /// - /// Aside from special chacters defined in the [`SIMPLE_ENCODE_SET`](struct.SIMPLE_ENCODE_SET.html), - /// space, double quote ("), hash (#), inequality qualifiers (<), (>), backtick (`), - /// question mark (?), and curly brackets ({), (}), forward slash (/), colon (:), semi-colon (;), - /// equality (=), at (@), backslash (\\), square brackets ([), (]), caret (\^), and pipe (|) are - /// encoded. - pub USERINFO_ENCODE_SET = [DEFAULT_ENCODE_SET] | { - '/', ':', ';', '=', '@', '[', '\\', ']', '^', '|' +macro_rules! static_assert { + ($( $bool: expr, )+) => { + fn _static_assert() { + $( + let _ = std::mem::transmute::<[u8; $bool as usize], u8>; + )+ + } } } -/// Return the percent-encoding of the given bytes. +static_assert! { + CONTROLS.contains(0x00), + CONTROLS.contains(0x1F), + !CONTROLS.contains(0x20), + !CONTROLS.contains(0x7E), + CONTROLS.contains(0x7F), +} + +/// Everything that is not an ASCII letter or digit. +/// +/// This is probably more eager than necessary in any context. +pub const NON_ALPHANUMERIC: &AsciiSet = &CONTROLS + .add(b' ') + .add(b'!') + .add(b'"') + .add(b'#') + .add(b'$') + .add(b'%') + .add(b'&') + .add(b'\'') + .add(b'(') + .add(b')') + .add(b'*') + .add(b'+') + .add(b',') + .add(b'-') + .add(b'.') + .add(b'/') + .add(b':') + .add(b';') + .add(b'<') + .add(b'=') + .add(b'>') + .add(b'?') + .add(b'@') + .add(b'[') + .add(b'\\') + .add(b']') + .add(b'^') + .add(b'_') + .add(b'`') + .add(b'{') + .add(b'|') + .add(b'}') + .add(b'~'); + +/// Return the percent-encoding of the given byte. /// -/// This is unconditional, unlike `percent_encode()` which uses an encode set. +/// This is unconditional, unlike `percent_encode()` which has an `AsciiSet` parameter. 
/// /// # Examples /// /// ``` -/// use url::percent_encoding::percent_encode_byte; +/// use percent_encoding::percent_encode_byte; /// /// assert_eq!("foo bar".bytes().map(percent_encode_byte).collect::(), /// "%66%6F%6F%20%62%61%72"); @@ -194,74 +189,69 @@ pub fn percent_encode_byte(byte: u8) -> &'static str { "[index..index + 3] } -/// Percent-encode the given bytes with the given encode set. +/// Percent-encode the given bytes with the given set. /// -/// The encode set define which bytes (in addition to non-ASCII and controls) -/// need to be percent-encoded. -/// The choice of this set depends on context. -/// For example, `?` needs to be encoded in an URL path but not in a query string. +/// Non-ASCII bytes and bytes in `ascii_set` are encoded. /// -/// The return value is an iterator of `&str` slices (so it has a `.collect::()` method) -/// that also implements `Display` and `Into>`. -/// The latter returns `Cow::Borrowed` when none of the bytes in `input` -/// are in the given encode set. +/// The return type: +/// +/// * Implements `Iterator` and therefore has a `.collect::()` method, +/// * Implements `Display` and therefore has a `.to_string()` method, +/// * Implements `Into>` borrowing `input` when none of its bytes are encoded. /// /// # Examples /// /// ``` -/// use url::percent_encoding::{percent_encode, DEFAULT_ENCODE_SET}; +/// use percent_encoding::{percent_encode, NON_ALPHANUMERIC}; /// -/// assert_eq!(percent_encode(b"foo bar?", DEFAULT_ENCODE_SET).to_string(), "foo%20bar%3F"); +/// assert_eq!(percent_encode(b"foo bar?", NON_ALPHANUMERIC).to_string(), "foo%20bar%3F"); /// ``` #[inline] -pub fn percent_encode(input: &[u8], encode_set: E) -> PercentEncode { +pub fn percent_encode<'a>(input: &'a [u8], ascii_set: &'static AsciiSet) -> PercentEncode<'a> { PercentEncode { bytes: input, - encode_set: encode_set, + ascii_set, } } /// Percent-encode the UTF-8 encoding of the given string. /// -/// See `percent_encode()` for how to use the return value. 
+/// See [`percent_encode`] regarding the return type. /// /// # Examples /// /// ``` -/// use url::percent_encoding::{utf8_percent_encode, DEFAULT_ENCODE_SET}; +/// use percent_encoding::{utf8_percent_encode, NON_ALPHANUMERIC}; /// -/// assert_eq!(utf8_percent_encode("foo bar?", DEFAULT_ENCODE_SET).to_string(), "foo%20bar%3F"); +/// assert_eq!(utf8_percent_encode("foo bar?", NON_ALPHANUMERIC).to_string(), "foo%20bar%3F"); /// ``` #[inline] -pub fn utf8_percent_encode(input: &str, encode_set: E) -> PercentEncode { - percent_encode(input.as_bytes(), encode_set) +pub fn utf8_percent_encode<'a>(input: &'a str, ascii_set: &'static AsciiSet) -> PercentEncode<'a> { + percent_encode(input.as_bytes(), ascii_set) } -/// The return type of `percent_encode()` and `utf8_percent_encode()`. -#[derive(Clone, Debug)] -pub struct PercentEncode<'a, E: EncodeSet> { +/// The return type of [`percent_encode`] and [`utf8_percent_encode`]. +#[derive(Clone)] +pub struct PercentEncode<'a> { bytes: &'a [u8], - encode_set: E, + ascii_set: &'static AsciiSet, } -impl<'a, E: EncodeSet> Iterator for PercentEncode<'a, E> { +impl<'a> Iterator for PercentEncode<'a> { type Item = &'a str; fn next(&mut self) -> Option<&'a str> { if let Some((&first_byte, remaining)) = self.bytes.split_first() { - if self.encode_set.contains(first_byte) { + if self.ascii_set.should_percent_encode(first_byte) { self.bytes = remaining; Some(percent_encode_byte(first_byte)) } else { - assert!(first_byte.is_ascii()); for (i, &byte) in remaining.iter().enumerate() { - if self.encode_set.contains(byte) { + if self.ascii_set.should_percent_encode(byte) { // 1 for first_byte + i for previous iterations of this loop let (unchanged_slice, remaining) = self.bytes.split_at(1 + i); self.bytes = remaining; return Some(unsafe { str::from_utf8_unchecked(unchanged_slice) }); - } else { - assert!(byte.is_ascii()); } } let unchanged_slice = self.bytes; @@ -282,7 +272,7 @@ impl<'a, E: EncodeSet> Iterator for PercentEncode<'a, E> { } } 
-impl<'a, E: EncodeSet> fmt::Display for PercentEncode<'a, E> { +impl<'a> fmt::Display for PercentEncode<'a> { fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result { for c in (*self).clone() { formatter.write_str(c)? @@ -291,8 +281,8 @@ impl<'a, E: EncodeSet> fmt::Display for PercentEncode<'a, E> { } } -impl<'a, E: EncodeSet> From> for Cow<'a, str> { - fn from(mut iter: PercentEncode<'a, E>) -> Self { +impl<'a> From> for Cow<'a, str> { + fn from(mut iter: PercentEncode<'a>) -> Self { match iter.next() { None => "".into(), Some(first) => match iter.next() { @@ -308,19 +298,33 @@ impl<'a, E: EncodeSet> From> for Cow<'a, str> { } } +/// Percent-decode the given string. +/// +/// +/// +/// See [`percent_decode`] regarding the return type. +#[inline] +pub fn percent_decode_str(input: &str) -> PercentDecode { + percent_decode(input.as_bytes()) +} + /// Percent-decode the given bytes. /// -/// The return value is an iterator of decoded `u8` bytes -/// that also implements `Into>` -/// (which returns `Cow::Borrowed` when `input` contains no percent-encoded sequence) -/// and has `decode_utf8()` and `decode_utf8_lossy()` methods. +/// +/// +/// Any sequence of `%` followed by two hexadecimal digits is decoded. +/// The return type: +/// +/// * Implements `Into>` borrowing `input` when it contains no percent-encoded sequence, +/// * Implements `Iterator` and therefore has a `.collect::>()` method, +/// * Has `decode_utf8()` and `decode_utf8_lossy()` methods. /// /// # Examples /// /// ``` -/// use url::percent_encoding::percent_decode; +/// use percent_encoding::percent_decode; /// -/// assert_eq!(percent_decode(b"foo%20bar%3F").decode_utf8().unwrap(), "foo bar?"); +/// assert_eq!(percent_decode(b"foo%20bar%3f").decode_utf8().unwrap(), "foo bar?"); /// ``` #[inline] pub fn percent_decode(input: &[u8]) -> PercentDecode { @@ -329,22 +333,18 @@ pub fn percent_decode(input: &[u8]) -> PercentDecode { } } -/// The return type of `percent_decode()`. 
+/// The return type of [`percent_decode`]. #[derive(Clone, Debug)] pub struct PercentDecode<'a> { bytes: slice::Iter<'a, u8>, } fn after_percent_sign(iter: &mut slice::Iter) -> Option { - let initial_iter = iter.clone(); - let h = iter.next().and_then(|&b| (b as char).to_digit(16)); - let l = iter.next().and_then(|&b| (b as char).to_digit(16)); - if let (Some(h), Some(l)) = (h, l) { - Some(h as u8 * 0x10 + l as u8) - } else { - *iter = initial_iter; - None - } + let mut cloned_iter = iter.clone(); + let h = char::from(*cloned_iter.next()?).to_digit(16)?; + let l = char::from(*cloned_iter.next()?).to_digit(16)?; + *iter = cloned_iter; + Some(h as u8 * 0x10 + l as u8) } impl<'a> Iterator for PercentDecode<'a> { @@ -377,7 +377,7 @@ impl<'a> From> for Cow<'a, [u8]> { impl<'a> PercentDecode<'a> { /// If the percent-decoding is different from the input, return it as a new bytes vector. - pub fn if_any(&self) -> Option> { + fn if_any(&self) -> Option> { let mut bytes_iter = self.bytes.clone(); while bytes_iter.any(|&b| b == b'%') { if let Some(decoded_byte) = after_percent_sign(&mut bytes_iter) { diff --git a/src/form_urlencoded.rs b/src/form_urlencoded.rs index 176ffb750..bdf1f9fbd 100644 --- a/src/form_urlencoded.rs +++ b/src/form_urlencoded.rs @@ -59,9 +59,9 @@ impl<'a> Iterator for Parse<'a> { fn decode(input: &[u8]) -> Cow { let replaced = replace_plus(input); - decode_utf8_lossy(match percent_decode(&replaced).if_any() { - Some(vec) => Cow::Owned(vec), - None => replaced, + decode_utf8_lossy(match percent_decode(&replaced).into() { + Cow::Owned(vec) => Cow::Owned(vec), + Cow::Borrowed(_) => replaced, }) } diff --git a/src/host.rs b/src/host.rs index 6aa820911..ea66139c3 100644 --- a/src/host.rs +++ b/src/host.rs @@ -8,7 +8,7 @@ use idna; use parser::{ParseError, ParseResult}; -use percent_encoding::{percent_decode, utf8_percent_encode, SIMPLE_ENCODE_SET}; +use percent_encoding::{percent_decode, utf8_percent_encode, CONTROLS}; use std::cmp; use std::fmt::{self, 
Formatter}; use std::net::{Ipv4Addr, Ipv6Addr}; @@ -207,7 +207,7 @@ impl Host { { return Err(ParseError::InvalidDomainCharacter); } - let s = utf8_percent_encode(input, SIMPLE_ENCODE_SET).to_string(); + let s = utf8_percent_encode(input, CONTROLS).to_string(); Ok(Host::Domain(s)) } } diff --git a/src/lib.rs b/src/lib.rs index 4cbac60ea..92777e592 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -110,17 +110,13 @@ assert_eq!(css_url.as_str(), "http://servo.github.io/rust-url/main.css"); #[macro_use] extern crate matches; extern crate idna; +extern crate percent_encoding; #[cfg(feature = "serde")] extern crate serde; -#[macro_use] -extern crate percent_encoding; use host::HostInternal; -use parser::{to_u32, Context, Parser, SchemeType}; -use percent_encoding::{ - percent_decode, percent_encode, utf8_percent_encode, PATH_SEGMENT_ENCODE_SET, - USERINFO_ENCODE_SET, -}; +use parser::{to_u32, Context, Parser, SchemeType, PATH_SEGMENT, USERINFO}; +use percent_encoding::{percent_decode, percent_encode, utf8_percent_encode}; use std::borrow::Borrow; use std::cmp; #[cfg(feature = "serde")] @@ -1229,8 +1225,11 @@ impl Url { if let Some(input) = query { self.query_start = Some(to_u32(self.serialization.len()).unwrap()); self.serialization.push('?'); + let scheme_type = SchemeType::from(self.scheme()); let scheme_end = self.scheme_end; - self.mutate(|parser| parser.parse_query(scheme_end, parser::Input::new(input))); + self.mutate(|parser| { + parser.parse_query(scheme_type, scheme_end, parser::Input::new(input)) + }); } self.restore_already_parsed_fragment(fragment); @@ -1729,7 +1728,7 @@ impl Url { self.serialization.truncate(self.username_end as usize); self.serialization.push(':'); self.serialization - .extend(utf8_percent_encode(password, USERINFO_ENCODE_SET)); + .extend(utf8_percent_encode(password, USERINFO)); self.serialization.push('@'); let old_host_start = self.host_start; @@ -1824,7 +1823,7 @@ impl Url { let after_username = self.slice(self.username_end..).to_owned(); 
self.serialization.truncate(username_start as usize); self.serialization - .extend(utf8_percent_encode(username, USERINFO_ENCODE_SET)); + .extend(utf8_percent_encode(username, USERINFO)); let mut removed_bytes = self.username_end; self.username_end = to_u32(self.serialization.len()).unwrap(); @@ -2307,7 +2306,7 @@ fn path_to_file_url_segments( serialization.push('/'); serialization.extend(percent_encode( component.as_os_str().as_bytes(), - PATH_SEGMENT_ENCODE_SET, + PATH_SEGMENT, )); } if empty { @@ -2355,7 +2354,7 @@ fn path_to_file_url_segments_windows( host_internal = host.into(); serialization.push('/'); let share = share.to_str().ok_or(())?; - serialization.extend(percent_encode(share.as_bytes(), PATH_SEGMENT_ENCODE_SET)); + serialization.extend(percent_encode(share.as_bytes(), PATH_SEGMENT)); } _ => return Err(()), }, @@ -2370,10 +2369,7 @@ fn path_to_file_url_segments_windows( // FIXME: somehow work with non-unicode? let component = component.as_os_str().to_str().ok_or(())?; serialization.push('/'); - serialization.extend(percent_encode( - component.as_bytes(), - PATH_SEGMENT_ENCODE_SET, - )); + serialization.extend(percent_encode(component.as_bytes(), PATH_SEGMENT)); } Ok((host_end, host_internal)) } diff --git a/src/parser.rs b/src/parser.rs index 7a6eaad4f..96906f94a 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -11,18 +11,38 @@ use std::fmt::{self, Formatter, Write}; use std::str; use host::{Host, HostInternal}; -use percent_encoding::{ - percent_encode, utf8_percent_encode, DEFAULT_ENCODE_SET, PATH_SEGMENT_ENCODE_SET, - QUERY_ENCODE_SET, SIMPLE_ENCODE_SET, USERINFO_ENCODE_SET, -}; +use percent_encoding::{percent_encode, utf8_percent_encode, AsciiSet, CONTROLS}; use query_encoding::EncodingOverride; use Url; -define_encode_set! { - // The backslash (\) character is treated as a path separator in special URLs - // so it needs to be additionally escaped in that case. 
- pub SPECIAL_PATH_SEGMENT_ENCODE_SET = [PATH_SEGMENT_ENCODE_SET] | {'\\'} -} +/// https://url.spec.whatwg.org/#fragment-percent-encode-set +const FRAGMENT: &AsciiSet = &CONTROLS.add(b' ').add(b'"').add(b'<').add(b'>').add(b'`'); + +/// https://url.spec.whatwg.org/#path-percent-encode-set +const PATH: &AsciiSet = &FRAGMENT.add(b'#').add(b'?').add(b'{').add(b'}'); + +/// https://url.spec.whatwg.org/#userinfo-percent-encode-set +pub(crate) const USERINFO: &AsciiSet = &PATH + .add(b'/') + .add(b':') + .add(b';') + .add(b'=') + .add(b'@') + .add(b'[') + .add(b'\\') + .add(b']') + .add(b'^') + .add(b'|'); + +pub(crate) const PATH_SEGMENT: &AsciiSet = &PATH.add(b'/').add(b'%'); + +// The backslash (\) character is treated as a path separator in special URLs +// so it needs to be additionally escaped in that case. +pub(crate) const SPECIAL_PATH_SEGMENT: &AsciiSet = &PATH_SEGMENT.add(b'\\'); + +// https://url.spec.whatwg.org/#query-state +const QUERY: &AsciiSet = &CONTROLS.add(b' ').add(b'"').add(b'#').add(b'<').add(b'>'); +const SPECIAL_QUERY: &AsciiSet = &QUERY.add(b'\''); pub type ParseResult = Result; @@ -327,7 +347,7 @@ impl<'a> Parser<'a> { } else { let scheme_type = SchemeType::from(base_url.scheme()); if scheme_type.is_file() { - self.parse_file(input, Some(base_url)) + self.parse_file(input, scheme_type, Some(base_url)) } else { self.parse_relative(input, scheme_type, base_url) } @@ -379,7 +399,7 @@ impl<'a> Parser<'a> { } }); self.serialization.clear(); - self.parse_file(input, base_file_url) + self.parse_file(input, scheme_type, base_file_url) } SchemeType::SpecialNotFile => { // special relative or authority state @@ -434,6 +454,7 @@ impl<'a> Parser<'a> { self.parse_cannot_be_a_base_path(input) }; self.with_query_and_fragment( + scheme_type, scheme_end, username_end, host_start, @@ -445,7 +466,12 @@ impl<'a> Parser<'a> { ) } - fn parse_file(mut self, input: Input, mut base_file_url: Option<&Url>) -> ParseResult { + fn parse_file( + mut self, + input: Input, + 
scheme_type: SchemeType, + mut base_file_url: Option<&Url>, + ) -> ParseResult { use SyntaxViolation::Backslash; // file state debug_assert!(self.serialization.is_empty()); @@ -491,7 +517,7 @@ impl<'a> Parser<'a> { }; self.serialization.push_str(before_query); let (query_start, fragment_start) = - self.parse_query_and_fragment(base_url.scheme_end, input)?; + self.parse_query_and_fragment(scheme_type, base_url.scheme_end, input)?; Ok(Url { serialization: self.serialization, query_start: query_start, @@ -503,7 +529,7 @@ impl<'a> Parser<'a> { let scheme_end = "file".len() as u32; let path_start = "file://".len() as u32; let (query_start, fragment_start) = - self.parse_query_and_fragment(scheme_end, input)?; + self.parse_query_and_fragment(scheme_type, scheme_end, input)?; Ok(Url { serialization: self.serialization, scheme_end: scheme_end, @@ -572,7 +598,7 @@ impl<'a> Parser<'a> { host = HostInternal::None; } let (query_start, fragment_start) = - self.parse_query_and_fragment(scheme_end, remaining)?; + self.parse_query_and_fragment(scheme_type, scheme_end, remaining)?; Ok(Url { serialization: self.serialization, scheme_end: scheme_end, @@ -604,7 +630,7 @@ impl<'a> Parser<'a> { input_after_first_char, ); let (query_start, fragment_start) = - self.parse_query_and_fragment(scheme_end, remaining)?; + self.parse_query_and_fragment(scheme_type, scheme_end, remaining)?; let path_start = path_start as u32; Ok(Url { serialization: self.serialization, @@ -638,6 +664,7 @@ impl<'a> Parser<'a> { input, ); self.with_query_and_fragment( + SchemeType::File, base_url.scheme_end, base_url.username_end, base_url.host_start, @@ -654,7 +681,7 @@ impl<'a> Parser<'a> { let remaining = self.parse_path(SchemeType::File, &mut false, path_start, input); let (query_start, fragment_start) = - self.parse_query_and_fragment(scheme_end, remaining)?; + self.parse_query_and_fragment(SchemeType::File, scheme_end, remaining)?; let path_start = path_start as u32; Ok(Url { serialization: 
self.serialization, @@ -704,7 +731,7 @@ impl<'a> Parser<'a> { }; self.serialization.push_str(before_query); let (query_start, fragment_start) = - self.parse_query_and_fragment(base_url.scheme_end, input)?; + self.parse_query_and_fragment(scheme_type, base_url.scheme_end, input)?; Ok(Url { serialization: self.serialization, query_start: query_start, @@ -740,6 +767,7 @@ impl<'a> Parser<'a> { input_after_first_char, ); self.with_query_and_fragment( + scheme_type, base_url.scheme_end, base_url.username_end, base_url.host_start, @@ -761,6 +789,7 @@ impl<'a> Parser<'a> { let remaining = self.parse_path(scheme_type, &mut true, base_url.path_start as usize, input); self.with_query_and_fragment( + scheme_type, base_url.scheme_end, base_url.username_end, base_url.host_start, @@ -792,6 +821,7 @@ impl<'a> Parser<'a> { let path_start = to_u32(self.serialization.len())?; let remaining = self.parse_path_start(scheme_type, &mut true, remaining); self.with_query_and_fragment( + scheme_type, scheme_end, username_end, host_start, @@ -854,7 +884,7 @@ impl<'a> Parser<'a> { } self.check_url_code_point(c, &input); self.serialization - .extend(utf8_percent_encode(utf8_c, USERINFO_ENCODE_SET)); + .extend(utf8_percent_encode(utf8_c, USERINFO)); } } let username_end = match username_end { @@ -1082,17 +1112,14 @@ impl<'a> Parser<'a> { self.check_url_code_point(c, &input); if self.context == Context::PathSegmentSetter { if scheme_type.is_special() { - self.serialization.extend(utf8_percent_encode( - utf8_c, - SPECIAL_PATH_SEGMENT_ENCODE_SET, - )); + self.serialization + .extend(utf8_percent_encode(utf8_c, SPECIAL_PATH_SEGMENT)); } else { self.serialization - .extend(utf8_percent_encode(utf8_c, PATH_SEGMENT_ENCODE_SET)); + .extend(utf8_percent_encode(utf8_c, PATH_SEGMENT)); } } else { - self.serialization - .extend(utf8_percent_encode(utf8_c, DEFAULT_ENCODE_SET)); + self.serialization.extend(utf8_percent_encode(utf8_c, PATH)); } } } @@ -1161,7 +1188,7 @@ impl<'a> Parser<'a> { Some((c, utf8_c)) 
=> { self.check_url_code_point(c, &input); self.serialization - .extend(utf8_percent_encode(utf8_c, SIMPLE_ENCODE_SET)); + .extend(utf8_percent_encode(utf8_c, CONTROLS)); } None => return input, } @@ -1170,6 +1197,7 @@ impl<'a> Parser<'a> { fn with_query_and_fragment( mut self, + scheme_type: SchemeType, scheme_end: u32, username_end: u32, host_start: u32, @@ -1179,7 +1207,8 @@ impl<'a> Parser<'a> { path_start: u32, remaining: Input, ) -> ParseResult { - let (query_start, fragment_start) = self.parse_query_and_fragment(scheme_end, remaining)?; + let (query_start, fragment_start) = + self.parse_query_and_fragment(scheme_type, scheme_end, remaining)?; Ok(Url { serialization: self.serialization, scheme_end: scheme_end, @@ -1197,6 +1226,7 @@ impl<'a> Parser<'a> { /// Return (query_start, fragment_start) fn parse_query_and_fragment( &mut self, + scheme_type: SchemeType, scheme_end: u32, mut input: Input, ) -> ParseResult<(Option, Option)> { @@ -1206,7 +1236,7 @@ impl<'a> Parser<'a> { Some('?') => { query_start = Some(to_u32(self.serialization.len())?); self.serialization.push('?'); - let remaining = self.parse_query(scheme_end, input); + let remaining = self.parse_query(scheme_type, scheme_end, input); if let Some(remaining) = remaining { input = remaining } else { @@ -1223,7 +1253,12 @@ impl<'a> Parser<'a> { Ok((query_start, Some(fragment_start))) } - pub fn parse_query<'i>(&mut self, scheme_end: u32, mut input: Input<'i>) -> Option> { + pub fn parse_query<'i>( + &mut self, + scheme_type: SchemeType, + scheme_end: u32, + mut input: Input<'i>, + ) -> Option> { let mut query = String::new(); // FIXME: use a streaming decoder instead let mut remaining = None; while let Some(c) = input.next() { @@ -1241,8 +1276,12 @@ impl<'a> Parser<'a> { _ => None, }; let query_bytes = ::query_encoding::encode(encoding, &query); - self.serialization - .extend(percent_encode(&query_bytes, QUERY_ENCODE_SET)); + let set = if scheme_type.is_special() { + SPECIAL_QUERY + } else { + QUERY + }; 
+ self.serialization.extend(percent_encode(&query_bytes, set)); remaining } @@ -1272,8 +1311,14 @@ impl<'a> Parser<'a> { self.log_violation(SyntaxViolation::NullInFragment) } else { self.check_url_code_point(c, &input); - self.serialization - .extend(utf8_percent_encode(utf8_c, SIMPLE_ENCODE_SET)); + self.serialization.extend(utf8_percent_encode( + utf8_c, + // FIXME: tests fail when we use the FRAGMENT set here + // as defined in the spec as of 2019-07-17, + // likely because tests are out of date. + // See https://github.com/servo/rust-url/issues/290 + CONTROLS, + )); } } } diff --git a/tests/data.rs b/tests/data.rs index 1981814e6..b462ec2fd 100644 --- a/tests/data.rs +++ b/tests/data.rs @@ -47,7 +47,7 @@ fn run_parsing(input: &str, base: &str, expected: Result let got = $got; assert!( expected == got, - "{:?} != {} {:?} for URL {:?}", + "\n{:?}\n!= {}\n{:?}\nfor URL {:?}\n", got, stringify!($expected), expected, diff --git a/tests/unit.rs b/tests/unit.rs index 9f3764911..d5e81986a 100644 --- a/tests/unit.rs +++ b/tests/unit.rs @@ -8,9 +8,8 @@ //! Unit tests -extern crate url; -#[macro_use] extern crate percent_encoding; +extern crate url; use std::borrow::Cow; use std::cell::{Cell, RefCell}; @@ -429,41 +428,6 @@ fn test_leading_dots() { assert_eq!(Url::parse("file://./foo").unwrap().domain(), Some(".")); } -// This is testing that the macro produces buildable code when invoked -// inside both a module and a function -#[test] -fn define_encode_set_scopes() { - use percent_encoding::{utf8_percent_encode, SIMPLE_ENCODE_SET}; - - define_encode_set! { - /// This encode set is used in the URL parser for query strings. - pub QUERY_ENCODE_SET = [SIMPLE_ENCODE_SET] | {' ', '"', '#', '<', '>'} - } - - assert_eq!( - utf8_percent_encode("foo bar", QUERY_ENCODE_SET).collect::(), - "foo%20bar" - ); - - mod m { - use percent_encoding::{utf8_percent_encode, SIMPLE_ENCODE_SET}; - - define_encode_set! { - /// This encode set is used in the URL parser for query strings. 
- pub QUERY_ENCODE_SET = [SIMPLE_ENCODE_SET] | {' ', '"', '#', '<', '>'} - } - - pub fn test() { - assert_eq!( - utf8_percent_encode("foo bar", QUERY_ENCODE_SET).collect::(), - "foo%20bar" - ); - } - } - - m::test(); -} - #[test] /// https://github.com/servo/rust-url/issues/302 fn test_origin_hash() { From a1fe49eeaa2fd4c762498e104039884e04740571 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Wed, 17 Jul 2019 17:37:33 +0200 Subject: [PATCH 2/2] Local variables in const fn require 1.33 --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index f3417d793..ccbeb75be 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,7 +3,7 @@ script: cargo test --all-features --all jobs: include: - - rust: 1.30.0 + - rust: 1.33.0 - rust: stable - rust: beta - rust: nightly