Skip to content

Commit

Permalink
Reintroduce unsafe in encoding::string::merge
Browse files Browse the repository at this point in the history
3ab36fda9a1788e0 introduced a ~3% regression on some decoding
benchmarks. My best guess is that swapping the value temporarily with a
new string is not being optimized well. This commit reverts to using
String::as_mut_vec, but carefully clears the string on decode failure.
It also fixes up the associated regression test, which wasn't properly
covering the malformed UTF-8 case.

    // Before #194
    dataset/google_message3_2/decode
                            time:   [219.45 ms 219.48 ms 219.51 ms]
                            thrpt:  [458.64 MiB/s 458.72 MiB/s 458.78 MiB/s]

    // #194
    dataset/google_message3_2/decode
                            time:   [226.08 ms 226.11 ms 226.18 ms]
                            thrpt:  [445.12 MiB/s 445.25 MiB/s 445.32 MiB/s]

    // This commit
    dataset/google_message3_2/decode
                            time:   [219.05 ms 219.76 ms 221.78 ms]
                            thrpt:  [453.94 MiB/s 458.12 MiB/s 459.61 MiB/s]
  • Loading branch information
danburkert committed Jan 12, 2020
1 parent 51ed67d commit 420e032
Showing 1 changed file with 41 additions and 9 deletions.
50 changes: 41 additions & 9 deletions src/encoding.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
use std::cmp::min;
use std::convert::TryFrom;
use std::mem;
use std::str;
use std::u32;
use std::usize;

Expand Down Expand Up @@ -819,11 +820,41 @@ pub mod string {
where
B: Buf,
{
let mut value_bytes = mem::replace(value, String::new()).into_bytes();
bytes::merge(wire_type, &mut value_bytes, buf, ctx)?;
*value = String::from_utf8(value_bytes)
.map_err(|_| DecodeError::new("invalid string value: data is not UTF-8 encoded"))?;
Ok(())
// ## Unsafety
//
// `string::merge` reuses `bytes::merge`, with an additional check of utf-8
// well-formedness. If the utf-8 is not well-formed, or if any other error occurs, then the
// string is cleared, so as to avoid leaking a string field with invalid data.
//
// This implementation uses the unsafe `String::as_mut_vec` method instead of the safe
// alternative of temporarily swapping an empty `String` into the field, because it results
// in up to 10% better performance on the protobuf message decoding benchmarks.
//
// It's required when using `String::as_mut_vec` that invalid utf-8 data not be leaked into
// the backing `String`. To enforce this, even in the event of a panic in `bytes::merge` or
// in the buf implementation, a drop guard is used.
unsafe {
struct DropGuard<'a>(&'a mut Vec<u8>);
impl<'a> Drop for DropGuard<'a> {
#[inline]
fn drop(&mut self) {
self.0.clear();
}
}

let drop_guard = DropGuard(value.as_mut_vec());
bytes::merge(wire_type, drop_guard.0, buf, ctx)?;
match str::from_utf8(drop_guard.0) {
Ok(_) => {
// Success; do not clear the bytes.
mem::forget(drop_guard);
Ok(())
}
Err(_) => Err(DecodeError::new(
"invalid string value: data is not UTF-8 encoded",
)),
}
}
}

length_delimited!(String);
Expand Down Expand Up @@ -861,8 +892,8 @@ pub mod bytes {
//
// > Normally, an encoded message would never have more than one instance of a non-repeated
// > field. However, parsers are expected to handle the case in which they do. For numeric
// > types and strings, if the same field appears multiple times, the parser accepts the last
// > value it sees.
// > types and strings, if the same field appears multiple times, the parser accepts the
// > last value it sees.
//
// [1]: https://developers.google.com/protocol-buffers/docs/encoding#optional
value.clear();
Expand Down Expand Up @@ -1424,9 +1455,10 @@ mod test {
}

#[test]
fn string_merge_failure() {
fn string_merge_invalid_utf8() {
let mut s = String::new();
let mut buf = Cursor::new(b"\x80\x80");
let mut buf = Cursor::new(b"\x02\x80\x80");

let r = string::merge(
WireType::LengthDelimited,
&mut s,
Expand Down

0 comments on commit 420e032

Please sign in to comment.