From 2a9c5433fefced16f6a29dfcb4dafada59bd058e Mon Sep 17 00:00:00 2001 From: Christian Ocker Date: Mon, 4 Oct 2021 14:08:13 +0200 Subject: [PATCH] Add support for readers that implement Seek (#218) `Archive::new` requires only `Read` for backward-compatibility, while `Archive::new_from_seek` can be used with readers that also implement `Seek`, to allow more efficient skipping over file contents. --- src/archive.rs | 48 ++++++++++++++++++--------------- src/entry.rs | 21 ++++++++------- src/lib.rs | 1 + src/skip.rs | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++ tests/all.rs | 42 +++++++++++++++++++++++++++++ 5 files changed, 153 insertions(+), 31 deletions(-) create mode 100644 src/skip.rs diff --git a/src/archive.rs b/src/archive.rs index 8c333322..79afa83d 100644 --- a/src/archive.rs +++ b/src/archive.rs @@ -1,5 +1,4 @@ use std::cell::{Cell, RefCell}; -use std::cmp; use std::fs; use std::io; use std::io::prelude::*; @@ -10,12 +9,13 @@ use crate::entry::{EntryFields, EntryIo}; use crate::error::TarError; use crate::other; use crate::pax::pax_extensions_size; +use crate::skip::{SeekingSkipRead, SkipRead}; use crate::{Entry, GnuExtSparseHeader, GnuSparseHeader, Header}; /// A top-level representation of an archive file. /// /// This archive can have an entry added to it and it can be iterated over. -pub struct Archive { +pub struct Archive { inner: ArchiveInner, } @@ -30,19 +30,19 @@ pub struct ArchiveInner { } /// An iterator over the entries of an archive. -pub struct Entries<'a, R: 'a + Read> { +pub struct Entries<'a, R: 'a + SkipRead> { fields: EntriesFields<'a>, _ignored: marker::PhantomData<&'a Archive>, } struct EntriesFields<'a> { - archive: &'a Archive, + archive: &'a Archive, next: u64, done: bool, raw: bool, } -impl Archive { +impl Archive { /// Create a new archive with the underlying object as the reader. 
pub fn new(obj: R) -> Archive { Archive { @@ -70,7 +70,7 @@ impl Archive { /// iterator returns), then the contents read for each entry may be /// corrupted. pub fn entries(&mut self) -> io::Result> { - let me: &mut Archive = self; + let me: &mut Archive = self; me._entries().map(|fields| Entries { fields: fields, _ignored: marker::PhantomData, @@ -97,7 +97,7 @@ impl Archive { /// ar.unpack("foo").unwrap(); /// ``` pub fn unpack>(&mut self, dst: P) -> io::Result<()> { - let me: &mut Archive = self; + let me: &mut Archive = self; me._unpack(dst.as_ref()) } @@ -143,7 +143,15 @@ impl Archive { } } -impl<'a> Archive { +impl Archive { + /// Create a new archive using a reader that also implements `Seek`, to + /// allow more efficient skipping over file contents. + pub fn new_from_seek(obj: R) -> Archive> { + Archive::new(SeekingSkipRead::new(obj)) + } +} + +impl<'a> Archive { fn _entries(&mut self) -> io::Result { if self.inner.pos.get() != 0 { return Err(other( @@ -191,21 +199,19 @@ impl<'a> Archive { Ok(()) } - fn skip(&self, mut amt: u64) -> io::Result<()> { - let mut buf = [0u8; 4096 * 8]; - while amt > 0 { - let n = cmp::min(amt, buf.len() as u64); - let n = (&self.inner).read(&mut buf[..n as usize])?; - if n == 0 { - return Err(other("unexpected EOF during skip")); - } - amt -= n as u64; + fn skip(&self, amt: u64) -> io::Result<()> { + let n = self.inner.obj.borrow_mut().skip(amt).map(|i| { + self.inner.pos.set(self.inner.pos.get() + i); + i + })?; + if n != amt { + return Err(other("unexpected EOF during skip")); } Ok(()) } } -impl<'a, R: Read> Entries<'a, R> { +impl<'a, R: SkipRead> Entries<'a, R> { /// Indicates whether this iterator will return raw entries or not. 
/// /// If the raw list of entries are returned, then no preprocessing happens @@ -221,7 +227,7 @@ impl<'a, R: Read> Entries<'a, R> { } } } -impl<'a, R: Read> Iterator for Entries<'a, R> { +impl<'a, R: SkipRead> Iterator for Entries<'a, R> { type Item = io::Result>; fn next(&mut self) -> Option>> { @@ -500,7 +506,7 @@ impl<'a> Iterator for EntriesFields<'a> { } } -impl<'a, R: ?Sized + Read> Read for &'a ArchiveInner { +impl<'a, R: ?Sized + SkipRead> Read for &'a ArchiveInner { fn read(&mut self, into: &mut [u8]) -> io::Result { self.obj.borrow_mut().read(into).map(|i| { self.pos.set(self.pos.get() + i as u64); @@ -513,7 +519,7 @@ impl<'a, R: ?Sized + Read> Read for &'a ArchiveInner { /// /// If the reader reaches its end before filling the buffer at all, returns `false`. /// Otherwise returns `true`. -fn try_read_all(r: &mut R, buf: &mut [u8]) -> io::Result { +fn try_read_all(r: &mut R, buf: &mut [u8]) -> io::Result { let mut read = 0; while read < buf.len() { match r.read(&mut buf[read..])? { diff --git a/src/entry.rs b/src/entry.rs index 9e4516e2..37cf9ff6 100644 --- a/src/entry.rs +++ b/src/entry.rs @@ -14,6 +14,7 @@ use crate::error::TarError; use crate::header::bytes2path; use crate::other; use crate::pax::pax_extensions; +use crate::skip::SkipRead; use crate::{Archive, Header, PaxExtensions}; /// A read-only view into an entry of an archive. @@ -21,7 +22,7 @@ use crate::{Archive, Header, PaxExtensions}; /// This structure is a window into a portion of a borrowed archive which can /// be inspected. It acts as a file handle by implementing the Reader trait. An /// entry cannot be rewritten once inserted into an archive. 
-pub struct Entry<'a, R: 'a + Read> { +pub struct Entry<'a, R: 'a + SkipRead> { fields: EntryFields<'a>, _ignored: marker::PhantomData<&'a Archive>, } @@ -45,7 +46,7 @@ pub struct EntryFields<'a> { pub enum EntryIo<'a> { Pad(io::Take), - Data(io::Take<&'a ArchiveInner>), + Data(io::Take<&'a ArchiveInner>), } /// When unpacking items the unpacked thing is returned to allow custom @@ -60,7 +61,7 @@ pub enum Unpacked { __Nonexhaustive, } -impl<'a, R: Read> Entry<'a, R> { +impl<'a, R: SkipRead> Entry<'a, R> { /// Returns the path name for this entry. /// /// This method may fail if the pathname is not valid Unicode and this is @@ -260,18 +261,18 @@ impl<'a, R: Read> Entry<'a, R> { } } -impl<'a, R: Read> Read for Entry<'a, R> { +impl<'a, R: SkipRead> Read for Entry<'a, R> { fn read(&mut self, into: &mut [u8]) -> io::Result { - self.fields.read(into) + Read::read(&mut self.fields, into) } } impl<'a> EntryFields<'a> { - pub fn from(entry: Entry) -> EntryFields { + pub fn from(entry: Entry) -> EntryFields { entry.fields } - pub fn into_entry(self) -> Entry<'a, R> { + pub fn into_entry(self) -> Entry<'a, R> { Entry { fields: self, _ignored: marker::PhantomData, @@ -810,7 +811,7 @@ impl<'a> EntryFields<'a> { impl<'a> Read for EntryFields<'a> { fn read(&mut self, into: &mut [u8]) -> io::Result { loop { - match self.data.get_mut(0).map(|io| io.read(into)) { + match self.data.get_mut(0).map(|io| Read::read(io, into)) { Some(Ok(0)) => { self.data.remove(0); } @@ -824,8 +825,8 @@ impl<'a> Read for EntryFields<'a> { impl<'a> Read for EntryIo<'a> { fn read(&mut self, into: &mut [u8]) -> io::Result { match *self { - EntryIo::Pad(ref mut io) => io.read(into), - EntryIo::Data(ref mut io) => io.read(into), + EntryIo::Pad(ref mut io) => Read::read(io, into), + EntryIo::Data(ref mut io) => Read::read(io, into), } } } diff --git a/src/lib.rs b/src/lib.rs index 52251cd2..e824b583 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -38,6 +38,7 @@ mod entry_type; mod error; mod header; mod pax; +mod 
skip; fn other(msg: &str) -> Error { Error::new(ErrorKind::Other, msg) diff --git a/src/skip.rs b/src/skip.rs new file mode 100644 index 00000000..6efffb45 --- /dev/null +++ b/src/skip.rs @@ -0,0 +1,72 @@ +use std::cmp; +use std::io; +use std::io::prelude::*; + +use crate::other; + +/// The `SkipRead` trait allows for reading bytes from a source, as well as +/// skipping over a number of bytes. +/// +/// This is used for backward compatibility with [`std::io::Read`]. Each reader +/// automatically implements `SkipRead`. Readers that additionally implement +/// [`std::io::Seek`] can use `SeekingSkipRead` to implement efficient skipping +/// based on [`std::io::Seek::seek`]. +pub trait SkipRead { + /// Pull some bytes from this source into the specified buffer, returning + /// how many bytes were read. + /// + /// Identical to [`std::io::Read::read`]. + fn read(&mut self, buf: &mut [u8]) -> io::Result; + + /// Skip a specified number of bytes from the source, returning how many + /// bytes were skipped. + fn skip(&mut self, amt: u64) -> io::Result; +} + +/// `SkipRead` is implemented for all readers by using [`std::io::Read::read`] +/// for both reading and skipping. +impl SkipRead for R { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + self.read(buf) + } + + fn skip(&mut self, amt: u64) -> io::Result { + let mut buf = [0u8; 4096 * 8]; + let mut skipped = 0; + while skipped < amt { + let n = cmp::min(amt - skipped, buf.len() as u64); + let n = self.read(&mut buf[..n as usize])?; + if n == 0 { + return Err(other("unexpected EOF during skip")); + } + skipped += n as u64; + } + Ok(skipped) + } +} + +/// Wrapper to implement `SkipRead` more efficiently for readers that also +/// implement [`std::io::Seek`]. Skipping is implemented using +/// [`std::io::Seek::seek`].
+pub struct SeekingSkipRead { + inner: R, +} + +impl SeekingSkipRead { + pub fn new(obj: R) -> SeekingSkipRead { + SeekingSkipRead { inner: obj } + } +} + +impl SkipRead for SeekingSkipRead { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + self.inner.read(buf) + } + + fn skip(&mut self, amount: u64) -> io::Result { + let old_pos = self.inner.stream_position()?; + self.inner + .seek(io::SeekFrom::Current(amount as i64)) + .map(|pos| pos - old_pos) + } +} diff --git a/tests/all.rs b/tests/all.rs index d29a5190..901188be 100644 --- a/tests/all.rs +++ b/tests/all.rs @@ -153,6 +153,48 @@ fn writing_files() { assert!(entries.next().is_none()); } +struct LoggingReader { + inner: R, + read_bytes: u64, +} + +impl LoggingReader { + fn new(reader: R) -> LoggingReader { + LoggingReader { + inner: reader, + read_bytes: 0, + } + } +} + +impl Read for LoggingReader { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + self.inner.read(buf).map(|i| { + self.read_bytes += i as u64; + i + }) + } +} + +impl Seek for LoggingReader { + fn seek(&mut self, pos: io::SeekFrom) -> io::Result { + self.inner.seek(pos) + } +} + +#[test] +fn new_from_seek() { + let mut reader = LoggingReader::new(Cursor::new(tar!("reading_files.tar"))); + let mut ar_reader = Archive::new(&mut reader); + for _ in t!(ar_reader.entries()) {} + assert!(reader.read_bytes == 2560); + + let mut seekable_reader = LoggingReader::new(Cursor::new(tar!("reading_files.tar"))); + let mut ar_seekable_reader = Archive::new_from_seek(&mut seekable_reader); + for _ in t!(ar_seekable_reader.entries()) {} + assert!(seekable_reader.read_bytes == 1536); +} + #[test] fn large_filename() { let mut ar = Builder::new(Vec::new());