replace the new file module with inherent methods on Hasher

New methods: `update_reader`, `update_mmap`, and `update_mmap_rayon`. These are more discoverable and more convenient to use. There are two problems I want to avoid by taking a `Path` instead of a `File`. First, exposing `Mmap` objects to the caller is fundamentally unsafe, and making `maybe_memmap_file` private avoids that issue. Second, taking a `File` raises questions about whether memory mapped reads should behave like regular file reads. (Should they respect the current seek position? Should they update the seek position?) Taking a `Path` from the caller and opening the `File` internally avoids these questions.
BLAKE3-team · Sep 16, 2023 · b6ee296 · b6ee296
1 parent e0bb915
commit b6ee296
Show file tree

Hide file tree

Showing 8 changed files with 378 additions and 224 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -59,6 +59,10 @@ jobs:
       run: cargo test --features=rayon,traits-preview,zeroize
       env:
         RAYON_NUM_THREADS: 1
+    # The mmap methods.
+    - run: cargo test --features=mmap
+    # All public features put together.
+    - run: cargo test --features=mmap,rayon,traits-preview,zeroize
     # no_std tests.
     - run: cargo test --no-default-features
 

diff --git a/Cargo.toml b/Cargo.toml
@@ -30,6 +30,8 @@ std = []
 # this feature is enabled, all other APIs remain single-threaded.
 rayon = ["dep:rayon", "std"]
 
+mmap = ["std", "dep:memmap2"]
+
 # Implement the zeroize::Zeroize trait for types in this crate.
 zeroize = ["dep:zeroize", "arrayvec/zeroize"]
 
@@ -81,11 +83,9 @@ no_avx2 = []
 no_avx512 = []
 no_neon = []
 
-file = ["memmap2", "rayon", "std"]
-
 [package.metadata.docs.rs]
-# Document Hasher::update_rayon on docs.rs.
-features = ["rayon", "zeroize"]
+# Document the rayon/mmap methods and the Zeroize impls on docs.rs.
+features = ["mmap", "rayon", "zeroize"]
 
 [dependencies]
 arrayref = "0.3.5"
@@ -98,12 +98,13 @@ zeroize = { version = "1", default-features = false, features = ["zeroize_derive
 memmap2 = { version = "0.7.1", optional = true }
 
 [dev-dependencies]
+hmac = "0.12.0"
 hex = "0.4.2"
 page_size = "0.6.0"
 rand = "0.8.0"
 rand_chacha = "0.3.0"
 reference_impl = { path = "./reference_impl" }
-hmac = "0.12.0"
+tempfile = "3.8.0"
 
 [build-dependencies]
 cc = "1.0.4"
diff --git a/b3sum/Cargo.toml b/b3sum/Cargo.toml
@@ -15,7 +15,7 @@ pure = ["blake3/pure"]
 
 [dependencies]
 anyhow = "1.0.25"
-blake3 = { version = "1", path = "..", features = ["file", "rayon"] }
+blake3 = { version = "1", path = "..", features = ["mmap", "rayon"] }
 clap = { version = "4.0.8", features = ["derive", "wrap_help"] }
 hex = "0.4.0"
 memmap2 = "0.7.0"

diff --git a/b3sum/src/main.rs b/b3sum/src/main.rs
@@ -163,73 +163,22 @@ impl Args {
     }
 }
 
-enum Input {
-    Mmap(io::Cursor<memmap2::Mmap>),
-    File(File),
-    Stdin,
-}
-
-impl Input {
-    // Open an input file, using mmap if appropriate. "-" means stdin. Note
-    // that this convention applies both to command line arguments, and to
-    // filepaths that appear in a checkfile.
-    fn open(path: &Path, args: &Args) -> Result<Self> {
-        if path == Path::new("-") {
-            if args.keyed() {
-                bail!("Cannot open `-` in keyed mode");
-            }
-            return Ok(Self::Stdin);
-        }
-        let file = File::open(path)?;
-        if !args.no_mmap() {
-            if let Some(mmap) = blake3::file::maybe_memmap_file(&file)? {
-                return Ok(Self::Mmap(io::Cursor::new(mmap)));
-            }
-        }
-        Ok(Self::File(file))
-    }
-
-    fn hash(&mut self, args: &Args) -> Result<blake3::OutputReader> {
-        let mut hasher = args.base_hasher.clone();
-        match self {
-            // The fast path: If we mmapped the file successfully, hash using
-            // multiple threads. This doesn't work on stdin, or on some files,
-            // and it can also be disabled with --no-mmap.
-            Self::Mmap(cursor) => {
-                hasher.update_rayon(cursor.get_ref());
-            }
-            // The slower paths, for stdin or files we didn't/couldn't mmap.
-            // This is currently all single-threaded. Doing multi-threaded
-            // hashing without memory mapping is tricky, since all your worker
-            // threads have to stop every time you refill the buffer, and that
-            // ends up being a lot of overhead. To solve that, we need a more
-            // complicated double-buffering strategy where a background thread
-            // fills one buffer while the worker threads are hashing the other
-            // one. We might implement that in the future, but since this is
-            // the slow path anyway, it's not high priority.
-            Self::File(file) => {
-                blake3::copy_wide(file, &mut hasher)?;
-            }
-            Self::Stdin => {
-                let stdin = io::stdin();
-                let lock = stdin.lock();
-                blake3::copy_wide(lock, &mut hasher)?;
-            }
-        }
-        let mut output_reader = hasher.finalize_xof();
-        output_reader.set_position(args.seek());
-        Ok(output_reader)
-    }
-}
-
-impl Read for Input {
-    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
-        match self {
-            Self::Mmap(cursor) => cursor.read(buf),
-            Self::File(file) => file.read(buf),
-            Self::Stdin => io::stdin().read(buf),
+fn hash_path(args: &Args, path: &Path) -> Result<blake3::OutputReader> {
+    let mut hasher = args.base_hasher.clone();
+    if path == Path::new("-") {
+        if args.keyed() {
+            bail!("Cannot open `-` in keyed mode");
         }
+        hasher.update_reader(io::stdin().lock())?;
+    } else if args.no_mmap() {
+        hasher.update_reader(File::open(path)?)?;
+    } else {
+        // The fast path: Try to mmap the file and hash it with multiple threads.
+        hasher.update_mmap_rayon(path)?;
     }
+    let mut output_reader = hasher.finalize_xof();
+    output_reader.set_position(args.seek());
+    Ok(output_reader)
 }
 
 fn write_hex_output(mut output: blake3::OutputReader, args: &Args) -> Result<()> {
@@ -425,8 +374,7 @@ fn parse_check_line(mut line: &str) -> Result<ParsedCheckLine> {
 }
 
 fn hash_one_input(path: &Path, args: &Args) -> Result<()> {
-    let mut input = Input::open(path, args)?;
-    let output = input.hash(args)?;
+    let output = hash_path(args, path)?;
     if args.raw() {
         write_raw_output(output, args)?;
         return Ok(());
@@ -470,15 +418,13 @@ fn check_one_line(line: &str, args: &Args) -> bool {
     } else {
         file_string
     };
-    let hash_result: Result<blake3::Hash> = Input::open(&file_path, args)
-        .and_then(|mut input| input.hash(args))
-        .map(|mut hash_output| {
+    let found_hash: blake3::Hash;
+    match hash_path(args, &file_path) {
+        Ok(mut output) => {
             let mut found_hash_bytes = [0; blake3::OUT_LEN];
-            hash_output.fill(&mut found_hash_bytes);
-            found_hash_bytes.into()
-        });
-    let found_hash: blake3::Hash = match hash_result {
-        Ok(hash) => hash,
+            output.fill(&mut found_hash_bytes);
+            found_hash = found_hash_bytes.into();
+        }
         Err(e) => {
             println!("{}: FAILED ({})", file_string, e);
             return false;
@@ -497,8 +443,18 @@ fn check_one_line(line: &str, args: &Args) -> bool {
 }
 
 fn check_one_checkfile(path: &Path, args: &Args, files_failed: &mut u64) -> Result<()> {
-    let checkfile_input = Input::open(path, args)?;
-    let mut bufreader = io::BufReader::new(checkfile_input);
+    let mut file;
+    let stdin;
+    let mut stdin_lock;
+    let mut bufreader: io::BufReader<&mut dyn Read>;
+    if path == Path::new("-") {
+        stdin = io::stdin();
+        stdin_lock = stdin.lock();
+        bufreader = io::BufReader::new(&mut stdin_lock);
+    } else {
+        file = File::open(path)?;
+        bufreader = io::BufReader::new(&mut file);
+    }
     let mut line = String::new();
     loop {
         line.clear();

diff --git a/src/file.rs b/src/file.rs
diff --git a/src/io.rs b/src/io.rs
@@ -0,0 +1,78 @@
+//! Helper functions for efficient IO.
+
+use std::{fs::File, io};
+
+#[cfg(feature = "std")]
+pub(crate) fn copy_wide(
+    mut reader: impl std::io::Read,
+    hasher: &mut crate::Hasher,
+) -> std::io::Result<u64> {
+    let mut buffer = [0; 65536];
+    let mut total = 0;
+    loop {
+        match reader.read(&mut buffer) {
+            Ok(0) => return Ok(total),
+            Ok(n) => {
+                hasher.update(&buffer[..n]);
+                total += n as u64;
+            }
+            Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
+            Err(e) => return Err(e),
+        }
+    }
+}
+
+// Mmap a file, if it looks like a good idea. Return None in cases where we know mmap will fail, or
+// if the file is short enough that mmapping isn't worth it. However, if we do try to mmap and it
+// fails, return the error.
+//
+// SAFETY: Mmaps are fundamentally unsafe, because you can call invariant-checking functions like
+// str::from_utf8 on them and then have them change out from under you. Letting a caller get their
+// hands on an mmap, or even on a &[u8] that's backed by an mmap, is unsound. However, because this
+// function is private, we can guarantee that all that will ever happen to this mmap is that we'll
+// hash its contents.
+//
+// PARANOIA: But a data race...is a data race...is a data race...right? Even if we know in our
+// heart of hearts that no platform in the "real world" is ever going to do anything other than
+// compute the "wrong answer" if we race on this mmap while we hash it, aren't we still supposed to
+// feel bad about doing this? Well, this is IO, and IO does tend to get special carve-outs in the
+// memory model. Consider a memory-mapped register that returns random 32-bit words. (This is
+// actually realistic if you have a hardware RNG.) It's probably sound to construct a *const i32
+// pointing to that register and do some raw pointer reads from it. Those reads should be volatile
+// if you don't want the compiler to coalesce them, but all the same the compiler isn't allowed to
+// just _go nuts_ and insert should-never-happen branches to delete your hard drive if two adjacent
+// reads happen to give different values. As far as I'm aware, there's no such thing as a read
+// that's allowed if it's volatile but prohibited if it's not (unlike atomics). As mentioned above,
+// it's not ok to construct a safe &i32 to the register if you're going to leak that reference to
+// unknown callers. But if you "know what you're doing," I don't think *const i32 and &i32 are
+// fundamentally different here. Feedback needed.
+#[cfg(feature = "mmap")]
+pub(crate) fn maybe_memmap_file(file: &File) -> io::Result<Option<memmap2::Mmap>> {
+    let metadata = file.metadata()?;
+    let file_size = metadata.len();
+    #[allow(clippy::if_same_then_else)]
+    if !metadata.is_file() {
+        // Not a real file.
+        Ok(None)
+    } else if file_size > isize::max_value() as u64 {
+        // Too long to safely map.
+        // https://github.com/danburkert/memmap-rs/issues/69
+        Ok(None)
+    } else if file_size == 0 {
+        // Mapping an empty file currently fails.
+        // https://github.com/danburkert/memmap-rs/issues/72
+        Ok(None)
+    } else if file_size < 16 * 1024 {
+        // Mapping small files is not worth it.
+        Ok(None)
+    } else {
+        // Explicitly set the length of the memory map, so that filesystem
+        // changes can't race to violate the invariants we just checked.
+        let map = unsafe {
+            memmap2::MmapOptions::new()
+                .len(file_size as usize)
+                .map(file)?
+        };
+        Ok(Some(map))
+    }
+}