From beac70a80a1c65f39ebadfea2d17ed280d23e6a1 Mon Sep 17 00:00:00 2001 From: KodrAus Date: Mon, 15 Nov 2021 09:51:33 +1000 Subject: [PATCH 1/2] add some inline comments to the new parser impl --- shared/error.rs | 2 ++ shared/parser.rs | 39 +++++++++++++++++++++++++++++++++++++ src/fmt.rs | 50 +++++++++++++++++++++++++++++------------------- 3 files changed, 71 insertions(+), 20 deletions(-) diff --git a/shared/error.rs b/shared/error.rs index 030a313d..9c0bce74 100644 --- a/shared/error.rs +++ b/shared/error.rs @@ -51,9 +51,11 @@ impl<'a> InvalidUuid<'a> { let mut hyphen_count = 0; let mut group_bounds = [0; 4]; + // SAFETY: the byte array came from a valid utf8 string, // and is aligned along char boundries. let string = unsafe { std::str::from_utf8_unchecked(s) }; + for (index, character) in string.char_indices() { let byte = character as u8; if character as u32 - byte as u32 > 0 { diff --git a/shared/parser.rs b/shared/parser.rs index 336d9461..e46bd1e4 100644 --- a/shared/parser.rs +++ b/shared/parser.rs @@ -14,15 +14,22 @@ use crate::error::InvalidUuid; #[inline] pub const fn try_parse(input: &str) -> Result<[u8; 16], InvalidUuid> { let result = match (input.len(), input.as_bytes()) { + // Inputs of 32 bytes must be a non-hyphenated UUID (32, s) => parse_simple(s), + // Hyphenated UUIDs may be wrapped in various ways: + // - `{UUID}` for braced UUIDs + // - `urn:uuid:UUID` for URNs + // - `UUID` for a regular hyphenated UUID (36, s) | (38, [b'{', s @ .., b'}']) | ( 45, [b'u', b'r', b'n', b':', b'u', b'u', b'i', b'd', b':', s @ ..], ) => parse_hyphenated(s), + // Any other shaped input is immediately invalid _ => Err(()), }; + match result { Ok(b) => Ok(b), Err(()) => Err(InvalidUuid(input)), @@ -31,47 +38,73 @@ pub const fn try_parse(input: &str) -> Result<[u8; 16], InvalidUuid> { #[inline] const fn parse_simple(s: &[u8]) -> Result<[u8; 16], ()> { + // Should be optimized away as redundant if s.len() != 32 { return Err(()); } let mut buf: [u8; 16] = [0; 16]; let mut i = 0; + while i < 16 { + // Convert a two-char hex value (like `A8`) + // into a byte (like `10101000`) let h1 = HEX_TABLE[s[i * 2] as usize]; let h2 = HEX_TABLE[s[i * 2 + 1] as usize]; + + // We use `0xff` as a sentinel value to indicate + // an invalid hex character sequence (like the letter `G`) if h1 | h2 == 0xff { return Err(()); } + + // The upper nibble needs to be shifted into position + // to produce the final byte value buf[i] = SHL4_TABLE[h1 as usize] | h2; i += 1; } + Ok(buf) } #[inline] const fn parse_hyphenated(s: &[u8]) -> Result<[u8; 16], ()> { + // Should be optimized away as redundant if s.len() != 36 { return Err(()); } + // First, ensure the hyphens appear in the right places match [s[8], s[13], s[18], s[23]] { [b'-', b'-', b'-', b'-'] => {} _ => return Err(()), } + // We look at two hex-encoded values (4 chars) at a time because + // that's the size of the smallest group in a hyphenated UUID: + // + // uuid : 936da01f-9abd-4d9d-80c7-02af85c822a8 + // | | || || || || | | + // hyphens : | | 8| 13| 18| 23| | | + // positions: 0 4 9 14 19 24 28 32 let positions: [u8; 8] = [0, 4, 9, 14, 19, 24, 28, 32]; let mut buf: [u8; 16] = [0; 16]; let mut j = 0; + while j < 8 { let i = positions[j]; + + // The decoding here is the same as the simple case + // We're just dealing with two values instead of one let h1 = HEX_TABLE[s[i as usize] as usize]; let h2 = HEX_TABLE[s[(i + 1) as usize] as usize]; let h3 = HEX_TABLE[s[(i + 2) as usize] as usize]; let h4 = HEX_TABLE[s[(i + 3) as usize] as usize]; + if h1 | h2 | h3 | h4 == 0xff { return Err(()); } + buf[j * 2] = SHL4_TABLE[h1 as usize] | h2; buf[j * 2 + 1] = SHL4_TABLE[h3 as usize] | h4; j += 1; @@ -83,6 +116,7 @@ const fn parse_hyphenated(s: &[u8]) -> Result<[u8; 16], ()> { const HEX_TABLE: &[u8; 256] = &{ let mut buf = [0; 256]; let mut i: u8 = 0; + loop { buf[i as usize] = match i { b'0'..=b'9' => i - b'0', @@ -90,9 +124,11 @@ const HEX_TABLE: &[u8; 256] = &{ b'A'..=b'F' => i - b'A' + 10, _ => 0xff, }; + if i == 255 { break buf; } + i += 1 } }; @@ -100,11 +136,14 @@ const HEX_TABLE: &[u8; 256] = &{ const SHL4_TABLE: &[u8; 256] = &{ let mut buf = [0; 256]; let mut i: u8 = 0; + loop { buf[i as usize] = i.wrapping_shl(4); + if i == 255 { break buf; } + i += 1; } }; diff --git a/src/fmt.rs b/src/fmt.rs index c4e8589e..5bef01c7 100644 --- a/src/fmt.rs +++ b/src/fmt.rs @@ -12,7 +12,7 @@ //! Adapters for various formats for UUIDs use crate::{ - std::{borrow::Borrow, fmt, str}, + std::{borrow::Borrow, fmt, str, ptr}, Uuid, Variant, }; @@ -227,12 +227,14 @@ fn encode_simple<'b>( buffer: &'b mut [u8], upper: bool, ) -> &'b mut str { - const LEN: usize = 32; - let buf = &mut buffer[..LEN]; + let buf = &mut buffer[..Simple::LENGTH]; + let dst = buf.as_mut_ptr(); + + // SAFETY: `buf` is guaranteed to be at least `LEN` bytes + // SAFETY: The encoded buffer is ASCII encoded unsafe { - let dst = buf.as_mut_ptr(); - core::ptr::write(dst.cast(), format_simple(src, upper)); - core::str::from_utf8_unchecked_mut(buf) // SAFETY: ascii encoding + ptr::write(dst.cast(), format_simple(src, upper)); + str::from_utf8_unchecked_mut(buf) } } @@ -242,12 +244,14 @@ fn encode_hyphenated<'b>( buffer: &'b mut [u8], upper: bool, ) -> &'b mut str { - const LEN: usize = 36; - let buf = &mut buffer[..LEN]; + let buf = &mut buffer[..Hyphenated::LENGTH]; + let dst = buf.as_mut_ptr(); + + // SAFETY: `buf` is guaranteed to be at least `LEN` bytes + // SAFETY: The encoded buffer is ASCII encoded unsafe { - let dst = buf.as_mut_ptr(); - core::ptr::write(dst.cast(), format_hyphenated(src, upper)); - core::str::from_utf8_unchecked_mut(buf) // SAFETY: ascii encoding + ptr::write(dst.cast(), format_hyphenated(src, upper)); + str::from_utf8_unchecked_mut(buf) } } @@ -257,14 +261,17 @@ fn encode_braced<'b>( buffer: &'b mut [u8], upper: bool, ) -> &'b mut str { - const LEN: usize = 38; - let buf = &mut buffer[..LEN]; + let buf = &mut buffer[..Braced::LENGTH]; buf[0] = b'{'; - buf[LEN - 1] = b'}'; + buf[Braced::LENGTH - 1] = b'}'; + + // SAFETY: `buf` is guaranteed to be at least `LEN` bytes + // SAFETY: The encoded buffer is ASCII encoded unsafe { let dst = buf.as_mut_ptr().add(1); - core::ptr::write(dst.cast(), format_hyphenated(src, upper)); - core::str::from_utf8_unchecked_mut(buf) // SAFETY: ascii encoding + + ptr::write(dst.cast(), format_hyphenated(src, upper)); + str::from_utf8_unchecked_mut(buf) } } @@ -274,13 +281,16 @@ fn encode_urn<'b>( buffer: &'b mut [u8], upper: bool, ) -> &'b mut str { - const LEN: usize = 45; - let buf = &mut buffer[..LEN]; + let buf = &mut buffer[..Urn::LENGTH]; buf[..9].copy_from_slice(b"urn:uuid:"); + + // SAFETY: `buf` is guaranteed to be at least `LEN` bytes + // SAFETY: The encoded buffer is ASCII encoded unsafe { let dst = buf.as_mut_ptr().add(9); - core::ptr::write(dst.cast(), format_hyphenated(src, upper)); - core::str::from_utf8_unchecked_mut(buf) // SAFETY: ascii encoding + + ptr::write(dst.cast(), format_hyphenated(src, upper)); + str::from_utf8_unchecked_mut(buf) } } From 708315e597030bf072b3cb053e6ec30a4e5ed25f Mon Sep 17 00:00:00 2001 From: KodrAus Date: Tue, 16 Nov 2021 07:40:59 +1000 Subject: [PATCH 2/2] update comment on bounds checks --- shared/parser.rs | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/shared/parser.rs b/shared/parser.rs index e46bd1e4..0ddf9dab 100644 --- a/shared/parser.rs +++ b/shared/parser.rs @@ -38,7 +38,8 @@ pub const fn try_parse(input: &str) -> Result<[u8; 16], InvalidUuid> { #[inline] const fn parse_simple(s: &[u8]) -> Result<[u8; 16], ()> { - // Should be optimized away as redundant + // This length check here removes all other bounds + // checks in this function if s.len() != 32 { return Err(()); } @@ -69,24 +70,27 @@ const fn parse_simple(s: &[u8]) -> Result<[u8; 16], ()> { #[inline] const fn parse_hyphenated(s: &[u8]) -> Result<[u8; 16], ()> { - // Should be optimized away as redundant + // This length check here removes all other bounds + // checks in this function if s.len() != 36 { return Err(()); } - // First, ensure the hyphens appear in the right places - match [s[8], s[13], s[18], s[23]] { - [b'-', b'-', b'-', b'-'] => {} - _ => return Err(()), - } - // We look at two hex-encoded values (4 chars) at a time because - // that's the size of the smallest group in a hyphenated UUID: + // that's the size of the smallest group in a hyphenated UUID. + // The indexes we're interested in are: // // uuid : 936da01f-9abd-4d9d-80c7-02af85c822a8 // | | || || || || | | // hyphens : | | 8| 13| 18| 23| | | // positions: 0 4 9 14 19 24 28 32 + + // First, ensure the hyphens appear in the right places + match [s[8], s[13], s[18], s[23]] { + [b'-', b'-', b'-', b'-'] => {} + _ => return Err(()), + } + let positions: [u8; 8] = [0, 4, 9, 14, 19, 24, 28, 32]; let mut buf: [u8; 16] = [0; 16]; let mut j = 0;