From 33a480106a69cee4c91265bff814a01cc27458a5 Mon Sep 17 00:00:00 2001 From: Kevin Parsons Date: Thu, 16 Sep 2021 17:23:13 -0700 Subject: [PATCH] backuptar: Fix sparse file handling A recent OS change altered how sparse files are represented in backup streams. This caused backuptar to no longer work with certain files. The specific behavior that changed is as follows: - Empty sparse files (size = 0), previously did not have any data or sparse block streams in the backup stream. Now, they will have a data stream with size = 0, and no sparse block streams. - Sparse files with a single allocated range (e.g. a normal file that has the sparse attribute set) previously would not show as sparse in the backup stream. Now, they will show as sparse. The old backuptar behavior assumed that if the sparse flag was set on the data stream, then there would always be a set of sparse blocks following. These changes break this assumption, and so require special handling. It is unsupported to have a data stream, marked sparse, that contains file content AND a series of sparse block streams following. As far as I can tell this is not a valid case for backup streams. This change also cleans up some code and error messages, and expands on the test coverage for backuptar. For more information on backup stream format see: https://docs.microsoft.com/en-us/openspecs/windows_protocols/ms-bkup/f67950c8-d583-469a-83dd-c4ff4cedf533 Signed-off-by: Kevin Parsons --- backuptar/tar.go | 74 +++++++++----- backuptar/tar_test.go | 232 +++++++++++++++++++++++++++++++++--------- 2 files changed, 234 insertions(+), 72 deletions(-) diff --git a/backuptar/tar.go b/backuptar/tar.go index cb461ca3..689e4da6 100644 --- a/backuptar/tar.go +++ b/backuptar/tar.go @@ -5,7 +5,6 @@ package backuptar import ( "archive/tar" "encoding/base64" - "errors" "fmt" "io" "io/ioutil" @@ -42,19 +41,14 @@ const ( hdrCreationTime = "LIBARCHIVE.creationtime" ) -func writeZeroes(w io.Writer, count int64) error { - buf := make([]byte, 8192) - c := len(buf) - for i := int64(0); i < count; i += int64(c) { - if int64(c) > count-i { - c = int(count - i) - } - _, err := w.Write(buf[:c]) - if err != nil { - return err - } +// zeroReader is an io.Reader that always returns 0s. +type zeroReader struct{} + +func (zr zeroReader) Read(b []byte) (int, error) { + for i := range b { + b[i] = 0 } - return nil + return len(b), nil } func copySparse(t *tar.Writer, br *winio.BackupStreamReader) error { @@ -71,16 +65,26 @@ func copySparse(t *tar.Writer, br *winio.BackupStreamReader) error { return fmt.Errorf("unexpected stream %d", bhdr.Id) } + // We can't seek backwards, since we have already written that data to the tar.Writer. + if bhdr.Offset < curOffset { + return fmt.Errorf("cannot seek back from %d to %d", curOffset, bhdr.Offset) + } // archive/tar does not support writing sparse files // so just write zeroes to catch up to the current offset. - err = writeZeroes(t, bhdr.Offset-curOffset) + if _, err := io.CopyN(t, zeroReader{}, bhdr.Offset-curOffset); err != nil { + return fmt.Errorf("seek to offset %d: %s", bhdr.Offset, err) + } if bhdr.Size == 0 { + // A sparse block with size = 0 is used to mark the end of the sparse blocks. break } n, err := io.Copy(t, br) if err != nil { return err } + if n != bhdr.Size { + return fmt.Errorf("copied %d bytes instead of %d at offset %d", n, bhdr.Size, bhdr.Offset) + } curOffset = bhdr.Offset + n } return nil @@ -221,20 +225,44 @@ func WriteTarFileFromBackupStream(t *tar.Writer, r io.Reader, name string, size } } + // The logic for copying file contents is fairly complicated due to the need for handling sparse files, + // and the weird ways they are represented by BackupRead. A normal file will always either have a data stream + // with size and content, or no data stream at all (if empty). However, for a sparse file, the content can also + // be represented using a series of sparse block streams following the data stream. Additionally, the way sparse + // files are handled by BackupRead has changed in the OS recently. The specifics of the representation are described + // in the list at the bottom of this block comment. + // + // Sparse files can be represented in four different ways, based on the specifics of the file. + // - Size = 0: + // Previously: BackupRead yields no data stream and no sparse block streams. + // Recently: BackupRead yields a data stream with size = 0. There are no following sparse block streams. + // - Size > 0, no allocated ranges: + // BackupRead yields a data stream with size = 0. Following is a single sparse block stream with + // size = 0 and offset = . + // - Size > 0, one allocated range: + // BackupRead yields a data stream with size = containing the file contents. There are no + // sparse block streams. This is the case if you take a normal file with contents and simply set the + // sparse flag on it. + // - Size > 0, multiple allocated ranges: + // BackupRead yields a data stream with size = 0. Following are sparse block streams for each allocated + // range of the file containing the range contents. Finally there is a sparse block stream with + // size = 0 and offset = . + if dataHdr != nil { // A data stream was found. Copy the data. - if (dataHdr.Attributes & winio.StreamSparseAttributes) == 0 { + // We assume that we will either have a data stream size > 0 XOR have sparse block streams. + if dataHdr.Size > 0 || (dataHdr.Attributes&winio.StreamSparseAttributes) == 0 { if size != dataHdr.Size { return fmt.Errorf("%s: mismatch between file size %d and header size %d", name, size, dataHdr.Size) } - _, err = io.Copy(t, br) - if err != nil { - return err + if _, err = io.Copy(t, br); err != nil { + return fmt.Errorf("%s: copying contents from data stream: %s", name, err) } - } else { - err = copySparse(t, br) - if err != nil { - return err + } else if size > 0 { + // As of a recent OS change, BackupRead now returns a data stream for empty sparse files. + // These files have no sparse block streams, so skip the copySparse call if file size = 0. + if err = copySparse(t, br); err != nil { + return fmt.Errorf("%s: copying contents from sparse block stream: %s", name, err) } } } @@ -279,7 +307,7 @@ func WriteTarFileFromBackupStream(t *tar.Writer, r io.Reader, name string, size } else { // Unsupported for now, since the size of the alternate stream is not present // in the backup stream until after the data has been read. - return errors.New("tar of sparse alternate data streams is unsupported") + return fmt.Errorf("%s: tar of sparse alternate data streams is unsupported", name) } case winio.BackupEaData, winio.BackupLink, winio.BackupPropertyData, winio.BackupObjectId, winio.BackupTxfsData: // ignore these streams diff --git a/backuptar/tar_test.go b/backuptar/tar_test.go index 8be0bf34..cc2cbf09 100644 --- a/backuptar/tar_test.go +++ b/backuptar/tar_test.go @@ -5,6 +5,7 @@ package backuptar import ( "archive/tar" "bytes" + "io" "io/ioutil" "os" "path/filepath" @@ -12,6 +13,7 @@ import ( "testing" "github.com/Microsoft/go-winio" + "golang.org/x/sys/windows" ) func ensurePresent(t *testing.T, m map[string]string, keys ...string) { @@ -22,65 +24,197 @@ func ensurePresent(t *testing.T, m map[string]string, keys ...string) { } } -func TestRoundTrip(t *testing.T) { - f, err := ioutil.TempFile("", "tst") - if err != nil { - t.Fatal(err) - } - defer f.Close() - defer os.Remove(f.Name()) - - if _, err = f.Write([]byte("testing 1 2 3\n")); err != nil { - t.Fatal(err) - } - - if _, err = f.Seek(0, 0); err != nil { +func setSparse(t *testing.T, f *os.File) { + const FSCTL_SET_SPARSE uint32 = 0x900c4 + if err := windows.DeviceIoControl(windows.Handle(f.Fd()), FSCTL_SET_SPARSE, nil, 0, nil, 0, nil, nil); err != nil { t.Fatal(err) } +} - fi, err := f.Stat() - if err != nil { - t.Fatal(err) +// compareReaders validates that two readers contain the exact same data. +func compareReaders(t *testing.T, rActual io.Reader, rExpected io.Reader) { + const size = 8 * 1024 + var bufExpected, bufActual [size]byte + var readCount int64 + // Loop, first reading from rExpected, then reading the same amount from rActual. + // For each set of reads, compare the bytes to make sure they are identical. + // When we run out of data in rExpected, exit the loop. + for { + // Do a read from rExpected and see how many bytes we get. + nExpected, err := rExpected.Read(bufExpected[:]) + if err == io.EOF && nExpected == 0 { + break + } else if err != nil && err != io.EOF { + t.Fatalf("Failed reading from rExpected at %d: %s", readCount, err) + } + // Do a ReadFull from rActual for the same number of bytes we got from rExpected. + if nActual, err := io.ReadFull(rActual, bufActual[:nExpected]); err != nil { + t.Fatalf("Only read %d bytes out of %d from rActual at %d: %s", nActual, nExpected, readCount, err) + } + readCount += int64(nExpected) + for i, bExpected := range bufExpected[:nExpected] { + if bExpected != bufActual[i] { + t.Fatalf("Mismatched bytes at %d. got 0x%x, expected 0x%x", i, bufActual[i], bExpected) + } + } } - - bi, err := winio.GetFileBasicInfo(f) - if err != nil { - t.Fatal(err) - } - - br := winio.NewBackupFileReader(f, true) - defer br.Close() - - var buf bytes.Buffer - tw := tar.NewWriter(&buf) - - err = WriteTarFileFromBackupStream(tw, br, f.Name(), fi.Size(), bi) - if err != nil { - t.Fatal(err) + // Now we just need to make sure there isn't any further data in rActual. + var b [1]byte + if n, err := rActual.Read(b[:]); n != 0 || err != io.EOF { + t.Fatalf("rActual didn't return EOF at expected end. Read %d bytes with error %s", n, err) } +} - tr := tar.NewReader(&buf) - hdr, err := tr.Next() - if err != nil { - t.Fatal(err) +func TestRoundTrip(t *testing.T) { + // Each test case is a name mapped to a function which must create a file and return its path. + // The test then round-trips that file through backuptar, and validates the output matches the input. + for name, setup := range map[string]func(*testing.T) string{ + "normalFile": func(t *testing.T) string { + path := filepath.Join(t.TempDir(), "foo.txt") + if err := ioutil.WriteFile(path, []byte("testing 1 2 3\n"), 0644); err != nil { + t.Fatal(err) + } + return path + }, + "normalFileEmpty": func(t *testing.T) string { + path := filepath.Join(t.TempDir(), "foo.txt") + f, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + t.Fatal(err) + } + defer f.Close() + return path + }, + "sparseFileEmpty": func(t *testing.T) string { + path := filepath.Join(t.TempDir(), "foo.txt") + f, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + t.Fatal(err) + } + defer f.Close() + setSparse(t, f) + return path + }, + "sparseFileWithNoAllocatedRanges": func(t *testing.T) string { + path := filepath.Join(t.TempDir(), "foo.txt") + f, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + t.Fatal(err) + } + defer f.Close() + setSparse(t, f) + // Set file size without writing data to produce a file with size > 0 + // but no allocated ranges. + if err := f.Truncate(1000000); err != nil { + t.Fatal(err) + } + return path + }, + "sparseFileWithOneAllocatedRange": func(t *testing.T) string { + path := filepath.Join(t.TempDir(), "foo.txt") + f, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + t.Fatal(err) + } + defer f.Close() + setSparse(t, f) + if _, err := f.WriteString("test sparse data"); err != nil { + t.Fatal(err) + } + return path + }, + "sparseFileWithMultipleAllocatedRanges": func(t *testing.T) string { + path := filepath.Join(t.TempDir(), "foo.txt") + f, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + t.Fatal(err) + } + defer f.Close() + setSparse(t, f) + if _, err = f.Write([]byte("testing 1 2 3\n")); err != nil { + t.Fatal(err) + } + // The documentation talks about FSCTL_SET_ZERO_DATA, but seeking also + // seems to create a hole. + if _, err = f.Seek(1000000, 0); err != nil { + t.Fatal(err) + } + if _, err = f.Write([]byte("more data later\n")); err != nil { + t.Fatal(err) + } + return path + }, + } { + t.Run(name, func(t *testing.T) { + path := setup(t) + f, err := os.Open(path) + if err != nil { + t.Fatal(err) + } + defer f.Close() + + fi, err := f.Stat() + if err != nil { + t.Fatal(err) + } + bi, err := winio.GetFileBasicInfo(f) + if err != nil { + t.Fatal(err) + } + + br := winio.NewBackupFileReader(f, true) + defer br.Close() + var buf bytes.Buffer + tw := tar.NewWriter(&buf) + err = WriteTarFileFromBackupStream(tw, br, f.Name(), fi.Size(), bi) + if err != nil { + t.Fatal(err) + } + tr := tar.NewReader(&buf) + hdr, err := tr.Next() + if err != nil { + t.Fatal(err) + } + + name, size, bi2, err := FileInfoFromHeader(hdr) + if err != nil { + t.Fatal(err) + } + if name != filepath.ToSlash(f.Name()) { + t.Errorf("got name %s, expected %s", name, filepath.ToSlash(f.Name())) + } + if size != fi.Size() { + t.Errorf("got size %d, expected %d", size, fi.Size()) + } + if !reflect.DeepEqual(*bi2, *bi) { + t.Errorf("got %#v, expected %#v", *bi2, *bi) + } + ensurePresent(t, hdr.PAXRecords, "MSWINDOWS.fileattr", "MSWINDOWS.rawsd") + // Reset file position so we can compare file contents. + // The file contents of the actual file should match what we get from the tar. + if _, err := f.Seek(0, 0); err != nil { + t.Fatal(err) + } + compareReaders(t, tr, f) + }) } +} - name, size, bi2, err := FileInfoFromHeader(hdr) +func TestZeroReader(t *testing.T) { + const size = 512 + var b [size]byte + var bExpected [size]byte + var r zeroReader + n, err := r.Read(b[:]) if err != nil { - t.Fatal(err) - } - - if name != filepath.ToSlash(f.Name()) { - t.Errorf("got name %s, expected %s", name, filepath.ToSlash(f.Name())) + t.Fatalf("Unexpected read error: %s", err) } - - if size != fi.Size() { - t.Errorf("got size %d, expected %d", size, fi.Size()) + if n != size { + t.Errorf("Wrong read size. got %d, expected %d", n, size) } - - if !reflect.DeepEqual(*bi2, *bi) { - t.Errorf("got %#v, expected %#v", *bi2, *bi) + for i := range b { + if b[i] != bExpected[i] { + t.Errorf("Wrong content at index %d. got %d, expected %d", i, b[i], bExpected[i]) + } } - - ensurePresent(t, hdr.PAXRecords, "MSWINDOWS.fileattr", "MSWINDOWS.rawsd") }