backuptar: Fix sparse file handling
A recent OS change altered how sparse files are represented in backup
streams. This caused backuptar to no longer work with certain files. The
specific behavior that changed is as follows:
- Empty sparse files (size = 0) previously did not have any data or
  sparse block streams in the backup stream. Now, they have a data
  stream with size = 0 and no sparse block streams.
- Sparse files with a single allocated range (e.g. a normal file that
  has the sparse attribute set) previously would not show as sparse in
  the backup stream. Now, they will show as sparse.
The old backuptar behavior assumed that if the sparse flag was set on
the data stream, a set of sparse block streams would always follow.
These changes break that assumption, and so require special handling.

It is unsupported for a data stream marked sparse to contain file
content AND be followed by a series of sparse block streams. As far as
I can tell, this is not a valid case for backup streams.

This change also cleans up some code and error messages, and expands on
the test coverage for backuptar.

For more information on backup stream format see: https://docs.microsoft.com/en-us/openspecs/windows_protocols/ms-bkup/f67950c8-d583-469a-83dd-c4ff4cedf533
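
Illustration only, not part of this commit: a minimal sketch, assuming go-winio's
BackupFileReader/BackupStreamReader API, that prints the streams BackupRead produces
for a file so the representations described above can be observed directly. The
program and its dumpStreams helper are hypothetical.

// Hypothetical example (not part of this commit): dump the backup streams of a file.
package main

import (
	"fmt"
	"io"
	"os"

	"github.com/Microsoft/go-winio"
)

func dumpStreams(f *os.File) error {
	bf := winio.NewBackupFileReader(f, false) // false: skip security descriptor data
	defer bf.Close()
	br := winio.NewBackupStreamReader(bf)
	for {
		bhdr, err := br.Next()
		if err == io.EOF {
			return nil
		} else if err != nil {
			return err
		}
		switch bhdr.Id {
		case winio.BackupData:
			sparse := bhdr.Attributes&winio.StreamSparseAttributes != 0
			fmt.Printf("data stream: size=%d sparse=%v\n", bhdr.Size, sparse)
		case winio.BackupSparseBlock:
			// A sparse block with size = 0 marks the end of the sparse blocks.
			fmt.Printf("sparse block: offset=%d size=%d\n", bhdr.Offset, bhdr.Size)
		default:
			fmt.Printf("other stream: id=%d size=%d\n", bhdr.Id, bhdr.Size)
		}
	}
}

func main() {
	f, err := os.Open(os.Args[1])
	if err != nil {
		panic(err)
	}
	defer f.Close()
	if err := dumpStreams(f); err != nil {
		panic(err)
	}
}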

Signed-off-by: Kevin Parsons <kevpar@microsoft.com>
kevpar committed Sep 23, 2021
1 parent a0fd408 commit c5e1d6c
Showing 2 changed files with 234 additions and 72 deletions.
74 changes: 51 additions & 23 deletions backuptar/tar.go
@@ -5,7 +5,6 @@ package backuptar
import (
"archive/tar"
"encoding/base64"
"errors"
"fmt"
"io"
"io/ioutil"
@@ -42,19 +41,14 @@ const (
hdrCreationTime = "LIBARCHIVE.creationtime"
)

func writeZeroes(w io.Writer, count int64) error {
buf := make([]byte, 8192)
c := len(buf)
for i := int64(0); i < count; i += int64(c) {
if int64(c) > count-i {
c = int(count - i)
}
_, err := w.Write(buf[:c])
if err != nil {
return err
}
// zeroReader is an io.Reader that always returns 0s.
type zeroReader struct{}

func (zr zeroReader) Read(b []byte) (int, error) {
for i := range b {
b[i] = 0
}
return nil
return len(b), nil
}

func copySparse(t *tar.Writer, br *winio.BackupStreamReader) error {
@@ -71,16 +65,26 @@ func copySparse(t *tar.Writer, br *winio.BackupStreamReader) error {
return fmt.Errorf("unexpected stream %d", bhdr.Id)
}

// We can't seek backwards, since we have already written that data to the tar.Writer.
if bhdr.Offset < curOffset {
return fmt.Errorf("cannot seek back from %d to %d", curOffset, bhdr.Offset)
}
// archive/tar does not support writing sparse files
// so just write zeroes to catch up to the current offset.
err = writeZeroes(t, bhdr.Offset-curOffset)
if _, err := io.CopyN(t, zeroReader{}, bhdr.Offset-curOffset); err != nil {
return fmt.Errorf("seek to offset %d: %s", bhdr.Offset, err)
}
if bhdr.Size == 0 {
// A sparse block with size = 0 is used to mark the end of the sparse blocks.
break
}
n, err := io.Copy(t, br)
if err != nil {
return err
}
if n != bhdr.Size {
return fmt.Errorf("copied %d bytes instead of %d at offset %d", n, bhdr.Size, bhdr.Offset)
}
curOffset = bhdr.Offset + n
}
return nil
@@ -221,20 +225,44 @@ func WriteTarFileFromBackupStream(t *tar.Writer, r io.Reader, name string, size
}
}

// The logic for copying file contents is fairly complicated due to the need for handling sparse files,
// and the weird ways they are represented by BackupRead. A normal file will always either have a data stream
// with size and content, or no data stream at all (if empty). However, for a sparse file, the content can also
// be represented using a series of sparse block streams following the data stream. Additionally, the way sparse
// files are handled by BackupRead has changed in the OS recently. The specifics of the representation are described
// in the list at the bottom of this block comment.
//
// Sparse files can be represented in four different ways, based on the specifics of the file.
// - Size = 0:
// Previously: BackupRead yields no data stream and no sparse block streams.
// Recently: BackupRead yields a data stream with size = 0. There are no following sparse block streams.
// - Size > 0, no allocated ranges:
// BackupRead yields a data stream with size = 0. Following is a single sparse block stream with
// size = 0 and offset = <file size>.
// - Size > 0, one allocated range:
// BackupRead yields a data stream with size = <file size> containing the file contents. There are no
// sparse block streams. This is the case if you take a normal file with contents and simply set the
// sparse flag on it.
// - Size > 0, multiple allocated ranges:
// BackupRead yields a data stream with size = 0. Following are sparse block streams for each allocated
// range of the file containing the range contents. Finally there is a sparse block stream with
// size = 0 and offset = <file size>.

if dataHdr != nil {
// A data stream was found. Copy the data.
if (dataHdr.Attributes & winio.StreamSparseAttributes) == 0 {
// We assume that we will either have a data stream size > 0 XOR have sparse block streams.
if dataHdr.Size > 0 || (dataHdr.Attributes&winio.StreamSparseAttributes) == 0 {
if size != dataHdr.Size {
return fmt.Errorf("%s: mismatch between file size %d and header size %d", name, size, dataHdr.Size)
}
_, err = io.Copy(t, br)
if err != nil {
return err
if _, err = io.Copy(t, br); err != nil {
return fmt.Errorf("%s: copying contents from data stream: %s", name, err)
}
} else {
err = copySparse(t, br)
if err != nil {
return err
} else if size > 0 {
// As of a recent OS change, BackupRead now returns a data stream for empty sparse files.
// These files have no sparse block streams, so skip the copySparse call if file size = 0.
if err = copySparse(t, br); err != nil {
return fmt.Errorf("%s: copying contents from sparse block stream: %s", name, err)
}
}
}
@@ -279,7 +307,7 @@ func WriteTarFileFromBackupStream(t *tar.Writer, r io.Reader, name string, size
} else {
// Unsupported for now, since the size of the alternate stream is not present
// in the backup stream until after the data has been read.
return errors.New("tar of sparse alternate data streams is unsupported")
return fmt.Errorf("%s: tar of sparse alternate data streams is unsupported", name)
}
case winio.BackupEaData, winio.BackupLink, winio.BackupPropertyData, winio.BackupObjectId, winio.BackupTxfsData:
// ignore these streams
232 changes: 183 additions & 49 deletions backuptar/tar_test.go
@@ -5,13 +5,15 @@ package backuptar
import (
"archive/tar"
"bytes"
"io"
"io/ioutil"
"os"
"path/filepath"
"reflect"
"testing"

"github.com/Microsoft/go-winio"
"golang.org/x/sys/windows"
)

func ensurePresent(t *testing.T, m map[string]string, keys ...string) {
@@ -22,65 +24,197 @@ func ensurePresent(t *testing.T, m map[string]string, keys ...string) {
}
}

func TestRoundTrip(t *testing.T) {
f, err := ioutil.TempFile("", "tst")
if err != nil {
t.Fatal(err)
}
defer f.Close()
defer os.Remove(f.Name())

if _, err = f.Write([]byte("testing 1 2 3\n")); err != nil {
t.Fatal(err)
}

if _, err = f.Seek(0, 0); err != nil {
func setSparse(t *testing.T, f *os.File) {
const FSCTL_SET_SPARSE uint32 = 0x900c4
if err := windows.DeviceIoControl(windows.Handle(f.Fd()), FSCTL_SET_SPARSE, nil, 0, nil, 0, nil, nil); err != nil {
t.Fatal(err)
}
}

fi, err := f.Stat()
if err != nil {
t.Fatal(err)
// compareReaders validates that two readers contain the exact same data.
func compareReaders(t *testing.T, rActual io.Reader, rExpected io.Reader) {
const size = 8 * 1024
var bufExpected, bufActual [size]byte
var readCount int64
// Loop, first reading from rExpected, then reading the same amount from rActual.
// For each set of reads, compare the bytes to make sure they are identical.
// When we run out of data in rExpected, exit the loop.
for {
// Do a read from rExpected and see how many bytes we get.
nExpected, err := rExpected.Read(bufExpected[:])
if err == io.EOF && nExpected == 0 {
break
} else if err != nil && err != io.EOF {
t.Fatalf("Failed reading from rExpected at %d: %s", readCount, err)
}
// Do a ReadFull from rActual for the same number of bytes we got from rExpected.
if nActual, err := io.ReadFull(rActual, bufActual[:nExpected]); err != nil {
t.Fatalf("Only read %d bytes out of %d from rActual at %d: %s", nActual, nExpected, readCount, err)
}
readCount += int64(nExpected)
for i, bExpected := range bufExpected[:nExpected] {
if bExpected != bufActual[i] {
t.Fatalf("Mismatched bytes at %d. got 0x%x, expected 0x%x", i, bufActual[i], bExpected)
}
}
}

bi, err := winio.GetFileBasicInfo(f)
if err != nil {
t.Fatal(err)
}

br := winio.NewBackupFileReader(f, true)
defer br.Close()

var buf bytes.Buffer
tw := tar.NewWriter(&buf)

err = WriteTarFileFromBackupStream(tw, br, f.Name(), fi.Size(), bi)
if err != nil {
t.Fatal(err)
// Now we just need to make sure there isn't any further data in rActual.
var b [1]byte
if n, err := rActual.Read(b[:]); n != 0 || err != io.EOF {
t.Fatalf("rActual didn't return EOF at expected end. Read %d bytes with error %s", n, err)
}
}

tr := tar.NewReader(&buf)
hdr, err := tr.Next()
if err != nil {
t.Fatal(err)
func TestRoundTrip(t *testing.T) {
// Each test case is a name mapped to a function which must create a file and return its path.
// The test then round-trips that file through backuptar, and validates the output matches the input.
for name, setup := range map[string]func(*testing.T) string{
"normalFile": func(t *testing.T) string {
path := filepath.Join(t.TempDir(), "foo.txt")
if err := ioutil.WriteFile(path, []byte("testing 1 2 3\n"), 0644); err != nil {
t.Fatal(err)
}
return path
},
"normalFileEmpty": func(t *testing.T) string {
path := filepath.Join(t.TempDir(), "foo.txt")
f, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY, 0644)
if err != nil {
t.Fatal(err)
}
defer f.Close()
return path
},
"sparseFileEmpty": func(t *testing.T) string {
path := filepath.Join(t.TempDir(), "foo.txt")
f, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY, 0644)
if err != nil {
t.Fatal(err)
}
defer f.Close()
setSparse(t, f)
return path
},
"sparseFileWithNoAllocatedRanges": func(t *testing.T) string {
path := filepath.Join(t.TempDir(), "foo.txt")
f, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY, 0644)
if err != nil {
t.Fatal(err)
}
defer f.Close()
setSparse(t, f)
// Set file size without writing data to produce a file with size > 0
// but no allocated ranges.
if err := f.Truncate(1000000); err != nil {
t.Fatal(err)
}
return path
},
"sparseFileWithOneAllocatedRange": func(t *testing.T) string {
path := filepath.Join(t.TempDir(), "foo.txt")
f, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY, 0644)
if err != nil {
t.Fatal(err)
}
defer f.Close()
setSparse(t, f)
if _, err := f.WriteString("test sparse data"); err != nil {
t.Fatal(err)
}
return path
},
"sparseFileWithMultipleAllocatedRanges": func(t *testing.T) string {
path := filepath.Join(t.TempDir(), "foo.txt")
f, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY, 0644)
if err != nil {
t.Fatal(err)
}
defer f.Close()
setSparse(t, f)
if _, err = f.Write([]byte("testing 1 2 3\n")); err != nil {
t.Fatal(err)
}
// The documentation talks about FSCTL_SET_ZERO_DATA, but seeking also
// seems to create a hole.
if _, err = f.Seek(1000000, 0); err != nil {
t.Fatal(err)
}
if _, err = f.Write([]byte("more data later\n")); err != nil {
t.Fatal(err)
}
return path
},
} {
t.Run(name, func(t *testing.T) {
path := setup(t)
f, err := os.Open(path)
if err != nil {
t.Fatal(err)
}
defer f.Close()

fi, err := f.Stat()
if err != nil {
t.Fatal(err)
}
bi, err := winio.GetFileBasicInfo(f)
if err != nil {
t.Fatal(err)
}

br := winio.NewBackupFileReader(f, true)
defer br.Close()
var buf bytes.Buffer
tw := tar.NewWriter(&buf)
err = WriteTarFileFromBackupStream(tw, br, f.Name(), fi.Size(), bi)
if err != nil {
t.Fatal(err)
}
tr := tar.NewReader(&buf)
hdr, err := tr.Next()
if err != nil {
t.Fatal(err)
}

name, size, bi2, err := FileInfoFromHeader(hdr)
if err != nil {
t.Fatal(err)
}
if name != filepath.ToSlash(f.Name()) {
t.Errorf("got name %s, expected %s", name, filepath.ToSlash(f.Name()))
}
if size != fi.Size() {
t.Errorf("got size %d, expected %d", size, fi.Size())
}
if !reflect.DeepEqual(*bi2, *bi) {
t.Errorf("got %#v, expected %#v", *bi2, *bi)
}
ensurePresent(t, hdr.PAXRecords, "MSWINDOWS.fileattr", "MSWINDOWS.rawsd")
// Reset file position so we can compare file contents.
// The file contents of the actual file should match what we get from the tar.
if _, err := f.Seek(0, 0); err != nil {
t.Fatal(err)
}
compareReaders(t, tr, f)
})
}
}

name, size, bi2, err := FileInfoFromHeader(hdr)
func TestZeroReader(t *testing.T) {
const size = 512
var b [size]byte
var bExpected [size]byte
var r zeroReader
n, err := r.Read(b[:])
if err != nil {
t.Fatal(err)
}

if name != filepath.ToSlash(f.Name()) {
t.Errorf("got name %s, expected %s", name, filepath.ToSlash(f.Name()))
t.Fatalf("Unexpected read error: %s", err)
}

if size != fi.Size() {
t.Errorf("got size %d, expected %d", size, fi.Size())
if n != size {
t.Errorf("Wrong read size. got %d, expected %d", n, size)
}

if !reflect.DeepEqual(*bi2, *bi) {
t.Errorf("got %#v, expected %#v", *bi2, *bi)
for i := range b {
if b[i] != bExpected[i] {
t.Errorf("Wrong content at index %d. got %d, expected %d", i, b[i], bExpected[i])
}
}

ensurePresent(t, hdr.PAXRecords, "MSWINDOWS.fileattr", "MSWINDOWS.rawsd")
}
