Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add try_parse_ascii to avoid string conversions when parsing from bytes #579

Merged
merged 2 commits into from Jan 8, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 2 additions & 2 deletions Cargo.toml
Expand Up @@ -135,11 +135,11 @@ version = "1.0"
[dev-dependencies.serde_test]
version = "1.0.56"

[dev-dependencies.wasm-bindgen-lib]
[target.'cfg(target_arch = "wasm32")'.dev-dependencies.wasm-bindgen-lib]
package = "wasm-bindgen"
version = "0.2"

[dev-dependencies.wasm-bindgen-test]
[target.'cfg(target_arch = "wasm32")'.dev-dependencies.wasm-bindgen-test]
version = "0.3"

[dev-dependencies.trybuild]
Expand Down
25 changes: 18 additions & 7 deletions src/error.rs
Expand Up @@ -28,6 +28,8 @@ pub(crate) enum ErrorKind {
len: usize,
index: usize,
},
/// The input was not a valid UTF8 string
InvalidUTF8,
/// Some other error occurred.
Other,
}
Expand All @@ -40,12 +42,18 @@ pub(crate) enum ErrorKind {
///
/// [`Uuid`]: ../struct.Uuid.html
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub struct InvalidUuid<'a>(pub(crate) &'a str);
pub struct InvalidUuid<'a>(pub(crate) &'a [u8]);

impl<'a> InvalidUuid<'a> {
/// Converts the lightweight error type into detailed diagnostics.
pub fn into_err(self) -> Error {
let (s, offset, simple) = match self.0.as_bytes() {
// Check whether or not the input was ever actually a valid UTF8 string
let input_str = match std::str::from_utf8(self.0) {
Ok(s) => s,
Err(_) => return Error(ErrorKind::InvalidUTF8),
};

let (uuid_str, offset, simple) = match input_str.as_bytes() {
[b'{', s @ .., b'}'] => (s, 1, false),
[b'u', b'r', b'n', b':', b'u', b'u', b'i', b'd', b':', s @ ..] => {
(s, "urn:uuid:".len(), false)
Expand All @@ -57,10 +65,10 @@ impl<'a> InvalidUuid<'a> {
let mut group_bounds = [0; 4];

// SAFETY: the byte array came from a valid utf8 string,
// and is aligned along char boundries.
let string = unsafe { std::str::from_utf8_unchecked(s) };
// and is aligned along char boundaries.
let uuid_str = unsafe { std::str::from_utf8_unchecked(uuid_str) };

for (index, character) in string.char_indices() {
for (index, character) in uuid_str.char_indices() {
let byte = character as u8;
if character as u32 - byte as u32 > 0 {
// Multibyte char
Expand All @@ -87,7 +95,9 @@ impl<'a> InvalidUuid<'a> {
// This means that we tried and failed to parse a simple uuid.
// Since we verified that all the characters are valid, this means
// that it MUST have an invalid length.
Error(ErrorKind::SimpleLength { len: s.len() })
Error(ErrorKind::SimpleLength {
len: input_str.len(),
})
} else if hyphen_count != 4 {
// We tried to parse a hyphenated variant, but there weren't
// 5 groups (4 hyphen splits).
Expand All @@ -110,7 +120,7 @@ impl<'a> InvalidUuid<'a> {
// The last group must be too long
Error(ErrorKind::GroupLength {
group: 4,
len: s.len() - BLOCK_STARTS[4],
len: input_str.len() - BLOCK_STARTS[4],
index: offset + BLOCK_STARTS[4] + 1,
})
}
Expand Down Expand Up @@ -146,6 +156,7 @@ impl fmt::Display for Error {
group, expected, len
)
}
ErrorKind::InvalidUTF8 => write!(f, "non-UTF8 input"),
ErrorKind::Other => write!(f, "failed to parse a UUID"),
}
}
Expand Down
8 changes: 4 additions & 4 deletions src/lib.rs
Expand Up @@ -491,8 +491,8 @@ impl Uuid {
/// If the version field doesn't contain a recognized version then `None`
/// is returned. If you're trying to read the version for a future extension
/// you can also use [`Uuid::get_version_num`] to unconditionally return a
/// number. Future extensions may start to return `Some` once they're standardized
/// and supported.
/// number. Future extensions may start to return `Some` once they're
/// standardized and supported.
///
/// # Examples
///
Expand Down Expand Up @@ -525,8 +525,8 @@ impl Uuid {

/// Returns the four field values of the UUID.
///
/// These values can be passed to the [`Uuid::from_fields`] method to get the
/// original `Uuid` back.
/// These values can be passed to the [`Uuid::from_fields`] method to get
/// the original `Uuid` back.
///
/// * The first field value represents the first group of (eight) hex
/// digits, taken as a big-endian `u32` value. For V1 UUIDs, this field
Expand Down
52 changes: 49 additions & 3 deletions src/parser.rs
Expand Up @@ -42,6 +42,9 @@ impl Uuid {
/// Any of the formats generated by this module (simple, hyphenated, urn,
/// Microsoft GUID) are supported by this parsing function.
///
/// Prefer [`try_parse`] unless you need detailed user-facing diagnostics.
/// This method will eventually be deprecated in favor of `try_parse`.
///
/// # Examples
///
/// Parse a hyphenated UUID:
Expand All @@ -56,8 +59,10 @@ impl Uuid {
/// # Ok(())
/// # }
/// ```
///
/// [`try_parse`]: #method.try_parse
pub fn parse_str(input: &str) -> Result<Uuid, Error> {
try_parse(input)
try_parse(input.as_bytes())
.map(Uuid::from_bytes)
.map_err(InvalidUuid::into_err)
}
Expand All @@ -70,6 +75,9 @@ impl Uuid {
/// fails, it won't generate very useful error messages. The `parse_str`
/// function will eventually be deprecated in favor of `try_parse`.
///
/// To parse a UUID from a byte stream instead of a UTF8 string, see
/// [`try_parse_ascii`].
///
/// # Examples
///
/// Parse a hyphenated UUID:
Expand All @@ -86,16 +94,46 @@ impl Uuid {
/// ```
///
/// [`parse_str`]: #method.parse_str
/// [`try_parse_ascii`]: #method.try_parse_ascii
pub const fn try_parse(input: &str) -> Result<Uuid, Error> {
// Delegate to the byte-slice parser, handing it the string's underlying
// bytes so both entry points share a single implementation.
Self::try_parse_ascii(input.as_bytes())
}

/// Parses a `Uuid` from a string of hexadecimal digits with optional
/// hyphens.
///
/// The input is expected to be a string of ASCII characters. This method
/// can be more convenient than [`try_parse`] if the UUID is being
/// parsed from a byte stream instead of from a UTF8 string.
///
/// # Examples
///
/// Parse a hyphenated UUID:
///
/// ```
/// # use uuid::{Uuid, Version, Variant};
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let uuid = Uuid::try_parse_ascii(b"550e8400-e29b-41d4-a716-446655440000")?;
///
/// assert_eq!(Some(Version::Random), uuid.get_version());
/// assert_eq!(Variant::RFC4122, uuid.get_variant());
/// # Ok(())
/// # }
/// ```
///
/// [`try_parse`]: #method.try_parse
pub const fn try_parse_ascii(input: &[u8]) -> Result<Uuid, Error> {
match try_parse(input) {
Ok(bytes) => Ok(Uuid::from_bytes(bytes)),
// If parsing fails then we don't know exactly what went wrong
// In this case, we just return a generic error
Err(_) => Err(Error(ErrorKind::Other)),
}
}
}

const fn try_parse(input: &str) -> Result<[u8; 16], InvalidUuid> {
let result = match (input.len(), input.as_bytes()) {
const fn try_parse(input: &[u8]) -> Result<[u8; 16], InvalidUuid> {
let result = match (input.len(), input) {
// Inputs of 32 bytes must be a non-hyphenated UUID
(32, s) => parse_simple(s),
// Hyphenated UUIDs may be wrapped in various ways:
Expand Down Expand Up @@ -486,4 +524,12 @@ mod tests {
let uuid_out = Uuid::parse_str(&orig_str).unwrap();
assert_eq!(uuid_orig, uuid_out);
}

#[test]
fn test_try_parse_ascii_non_utf8() {
    // An embedded NUL is not a hex digit, but it is still valid UTF-8, so
    // on its own this input only covers the "invalid character" path.
    assert!(Uuid::try_parse_ascii(
        b"67e55044-10b1-426f-9247-bb680e5\0e0c8"
    )
    .is_err());

    // The byte 0xFF can never occur in well-formed UTF-8, so this input is
    // genuinely non-UTF-8, which is what the test name promises to cover.
    assert!(Uuid::try_parse_ascii(
        b"67e55044-10b1-426f-9247-bb680e5\xffe0c8"
    )
    .is_err());
}
}