Skip to content

Commit

Permalink
replace the new file module with inherent methods on Hasher
Browse files Browse the repository at this point in the history
New methods:
- update_reader
- update_mmap
- update_mmap_rayon

These are more discoverable and more convenient to use.

There are two problems I want to avoid by taking a `Path` instead of a
`File`. First, exposing `Mmap` objects to the caller is fundamentally
unsafe, and making `maybe_mmap_file` private avoids that issue. Second,
taking a `File` raises questions about whether memory mapped reads
should behave like regular file reads. (Should they respect the current
seek position? Should they update the seek position?) Taking a `Path`
from the caller and opening the `File` internally avoids these
questions.
  • Loading branch information
oconnor663 committed Sep 16, 2023
1 parent e0bb915 commit b6ee296
Show file tree
Hide file tree
Showing 8 changed files with 378 additions and 224 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,10 @@ jobs:
run: cargo test --features=rayon,traits-preview,zeroize
env:
RAYON_NUM_THREADS: 1
# The mmap methods.
- run: cargo test --features=mmap
# All public features put together.
- run: cargo test --features=mmap,rayon,traits-preview,zeroize
# no_std tests.
- run: cargo test --no-default-features

Expand Down
11 changes: 6 additions & 5 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ std = []
# this feature is enabled, all other APIs remain single-threaded.
rayon = ["dep:rayon", "std"]

mmap = ["std", "dep:memmap2"]

# Implement the zeroize::Zeroize trait for types in this crate.
zeroize = ["dep:zeroize", "arrayvec/zeroize"]

Expand Down Expand Up @@ -81,11 +83,9 @@ no_avx2 = []
no_avx512 = []
no_neon = []

file = ["memmap2", "rayon", "std"]

[package.metadata.docs.rs]
# Document Hasher::update_rayon on docs.rs.
features = ["rayon", "zeroize"]
# Document the rayon/mmap methods and the Zeroize impls on docs.rs.
features = ["mmap", "rayon", "zeroize"]

[dependencies]
arrayref = "0.3.5"
Expand All @@ -98,12 +98,13 @@ zeroize = { version = "1", default-features = false, features = ["zeroize_derive
memmap2 = { version = "0.7.1", optional = true }

[dev-dependencies]
hmac = "0.12.0"
hex = "0.4.2"
page_size = "0.6.0"
rand = "0.8.0"
rand_chacha = "0.3.0"
reference_impl = { path = "./reference_impl" }
hmac = "0.12.0"
tempfile = "3.8.0"

[build-dependencies]
cc = "1.0.4"
2 changes: 1 addition & 1 deletion b3sum/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ pure = ["blake3/pure"]

[dependencies]
anyhow = "1.0.25"
blake3 = { version = "1", path = "..", features = ["file", "rayon"] }
blake3 = { version = "1", path = "..", features = ["mmap", "rayon"] }
clap = { version = "4.0.8", features = ["derive", "wrap_help"] }
hex = "0.4.0"
memmap2 = "0.7.0"
Expand Down
110 changes: 33 additions & 77 deletions b3sum/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -163,73 +163,22 @@ impl Args {
}
}

enum Input {
Mmap(io::Cursor<memmap2::Mmap>),
File(File),
Stdin,
}

impl Input {
// Open an input file, using mmap if appropriate. "-" means stdin. Note
// that this convention applies both to command line arguments, and to
// filepaths that appear in a checkfile.
fn open(path: &Path, args: &Args) -> Result<Self> {
if path == Path::new("-") {
if args.keyed() {
bail!("Cannot open `-` in keyed mode");
}
return Ok(Self::Stdin);
}
let file = File::open(path)?;
if !args.no_mmap() {
if let Some(mmap) = blake3::file::maybe_memmap_file(&file)? {
return Ok(Self::Mmap(io::Cursor::new(mmap)));
}
}
Ok(Self::File(file))
}

fn hash(&mut self, args: &Args) -> Result<blake3::OutputReader> {
let mut hasher = args.base_hasher.clone();
match self {
// The fast path: If we mmapped the file successfully, hash using
// multiple threads. This doesn't work on stdin, or on some files,
// and it can also be disabled with --no-mmap.
Self::Mmap(cursor) => {
hasher.update_rayon(cursor.get_ref());
}
// The slower paths, for stdin or files we didn't/couldn't mmap.
// This is currently all single-threaded. Doing multi-threaded
// hashing without memory mapping is tricky, since all your worker
// threads have to stop every time you refill the buffer, and that
// ends up being a lot of overhead. To solve that, we need a more
// complicated double-buffering strategy where a background thread
// fills one buffer while the worker threads are hashing the other
// one. We might implement that in the future, but since this is
// the slow path anyway, it's not high priority.
Self::File(file) => {
blake3::copy_wide(file, &mut hasher)?;
}
Self::Stdin => {
let stdin = io::stdin();
let lock = stdin.lock();
blake3::copy_wide(lock, &mut hasher)?;
}
}
let mut output_reader = hasher.finalize_xof();
output_reader.set_position(args.seek());
Ok(output_reader)
}
}

impl Read for Input {
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
match self {
Self::Mmap(cursor) => cursor.read(buf),
Self::File(file) => file.read(buf),
Self::Stdin => io::stdin().read(buf),
fn hash_path(args: &Args, path: &Path) -> Result<blake3::OutputReader> {
let mut hasher = args.base_hasher.clone();
if path == Path::new("-") {
if args.keyed() {
bail!("Cannot open `-` in keyed mode");
}
hasher.update_reader(io::stdin().lock())?;
} else if args.no_mmap() {
hasher.update_reader(File::open(path)?)?;
} else {
// The fast path: Try to mmap the file and hash it with multiple threads.
hasher.update_mmap_rayon(path)?;
}
let mut output_reader = hasher.finalize_xof();
output_reader.set_position(args.seek());
Ok(output_reader)
}

fn write_hex_output(mut output: blake3::OutputReader, args: &Args) -> Result<()> {
Expand Down Expand Up @@ -425,8 +374,7 @@ fn parse_check_line(mut line: &str) -> Result<ParsedCheckLine> {
}

fn hash_one_input(path: &Path, args: &Args) -> Result<()> {
let mut input = Input::open(path, args)?;
let output = input.hash(args)?;
let output = hash_path(args, path)?;
if args.raw() {
write_raw_output(output, args)?;
return Ok(());
Expand Down Expand Up @@ -470,15 +418,13 @@ fn check_one_line(line: &str, args: &Args) -> bool {
} else {
file_string
};
let hash_result: Result<blake3::Hash> = Input::open(&file_path, args)
.and_then(|mut input| input.hash(args))
.map(|mut hash_output| {
let found_hash: blake3::Hash;
match hash_path(args, &file_path) {
Ok(mut output) => {
let mut found_hash_bytes = [0; blake3::OUT_LEN];
hash_output.fill(&mut found_hash_bytes);
found_hash_bytes.into()
});
let found_hash: blake3::Hash = match hash_result {
Ok(hash) => hash,
output.fill(&mut found_hash_bytes);
found_hash = found_hash_bytes.into();
}
Err(e) => {
println!("{}: FAILED ({})", file_string, e);
return false;
Expand All @@ -497,8 +443,18 @@ fn check_one_line(line: &str, args: &Args) -> bool {
}

fn check_one_checkfile(path: &Path, args: &Args, files_failed: &mut u64) -> Result<()> {
let checkfile_input = Input::open(path, args)?;
let mut bufreader = io::BufReader::new(checkfile_input);
let mut file;
let stdin;
let mut stdin_lock;
let mut bufreader: io::BufReader<&mut dyn Read>;
if path == Path::new("-") {
stdin = io::stdin();
stdin_lock = stdin.lock();
bufreader = io::BufReader::new(&mut stdin_lock);
} else {
file = File::open(path)?;
bufreader = io::BufReader::new(&mut file);
}
let mut line = String::new();
loop {
line.clear();
Expand Down
67 changes: 0 additions & 67 deletions src/file.rs

This file was deleted.

78 changes: 78 additions & 0 deletions src/io.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
//! Helper functions for efficient IO.

use std::{fs::File, io};

#[cfg(feature = "std")]
pub(crate) fn copy_wide(
mut reader: impl std::io::Read,
hasher: &mut crate::Hasher,
) -> std::io::Result<u64> {
let mut buffer = [0; 65536];
let mut total = 0;
loop {
match reader.read(&mut buffer) {
Ok(0) => return Ok(total),
Ok(n) => {
hasher.update(&buffer[..n]);
total += n as u64;
}
Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
Err(e) => return Err(e),
}
}
}

// Mmap a file, if it looks like a good idea. Return None in cases where we know mmap will fail, or
// if the file is short enough that mmapping isn't worth it. However, if we do try to mmap and it
// fails, return the error.
//
// SAFETY: Mmaps are fundamentally unsafe, because you can call invariant-checking functions like
// str::from_utf8 on them and then have them change out from under you. Letting a caller get their
// hands on an mmap, or even on a &[u8] that's backed by an mmap, is unsound. However, because this
// function is private, we can guarantee that all will ever happen to this mmap is that we'll hash
// its contents,
//
// PARANOIA: But a data race...is a data race...is a data race...right? Even if we know in our
// heart of hearts that no platform in the "real world" is ever going to do anything other than
// compute the "wrong answer" if we race on this mmap while we hash it, aren't we still supposed to
// feel bad about doing this? Well, this is IO, and IO does tend to get special carve-outs in the
// memory model. Consider a memory-mapped register that returns random 32-bit words. (This is
// actually realistic if you have a hardware RNG.) It's probably sound to construct a *const i32
// pointing to that register and do some raw pointer reads from it. Those reads should be volatile
// if you don't want the compiler to coalesce them, but all the same the compiler isn't allowed to
// just _go nuts_ and insert should-never-happen branches to delete your hard drive if two adjacent
// reads happen to give different values. As far as I'm aware, there's no such thing as a read
// that's allowed if it's volatile but prohibited if it's not (unlike atomics). As mentioned above,
// it's not ok to construct a safe &i32 to the register if you're going to leak that reference to
// unknown callers. But if you "know what you're doing," I don't think *const i32 and &i32 are
// fundamentally different here. Feedback needed.
#[cfg(feature = "mmap")]
pub(crate) fn maybe_memmap_file(file: &File) -> io::Result<Option<memmap2::Mmap>> {
let metadata = file.metadata()?;
let file_size = metadata.len();
#[allow(clippy::if_same_then_else)]
if !metadata.is_file() {
// Not a real file.
Ok(None)
} else if file_size > isize::max_value() as u64 {
// Too long to safely map.
// https://github.com/danburkert/memmap-rs/issues/69
Ok(None)
} else if file_size == 0 {
// Mapping an empty file currently fails.
// https://github.com/danburkert/memmap-rs/issues/72
Ok(None)
} else if file_size < 16 * 1024 {
// Mapping small files is not worth it.
Ok(None)
} else {
// Explicitly set the length of the memory map, so that filesystem
// changes can't race to violate the invariants we just checked.
let map = unsafe {
memmap2::MmapOptions::new()
.len(file_size as usize)
.map(file)?
};
Ok(Some(map))
}
}

0 comments on commit b6ee296

Please sign in to comment.