Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

backuptar: Fix sparse file handling #221

Merged
merged 1 commit into from Oct 4, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
74 changes: 51 additions & 23 deletions backuptar/tar.go
Expand Up @@ -5,7 +5,6 @@ package backuptar
import (
"archive/tar"
"encoding/base64"
"errors"
"fmt"
"io"
"io/ioutil"
Expand Down Expand Up @@ -42,19 +41,14 @@ const (
hdrCreationTime = "LIBARCHIVE.creationtime"
)

func writeZeroes(w io.Writer, count int64) error {
buf := make([]byte, 8192)
c := len(buf)
for i := int64(0); i < count; i += int64(c) {
if int64(c) > count-i {
c = int(count - i)
}
_, err := w.Write(buf[:c])
if err != nil {
return err
}
// zeroReader is an io.Reader that always returns 0s.
type zeroReader struct{}

func (zr zeroReader) Read(b []byte) (int, error) {
for i := range b {
b[i] = 0
}
return nil
return len(b), nil
}

func copySparse(t *tar.Writer, br *winio.BackupStreamReader) error {
Expand All @@ -71,16 +65,26 @@ func copySparse(t *tar.Writer, br *winio.BackupStreamReader) error {
return fmt.Errorf("unexpected stream %d", bhdr.Id)
}

// We can't seek backwards, since we have already written that data to the tar.Writer.
if bhdr.Offset < curOffset {
return fmt.Errorf("cannot seek back from %d to %d", curOffset, bhdr.Offset)
}
// archive/tar does not support writing sparse files
// so just write zeroes to catch up to the current offset.
err = writeZeroes(t, bhdr.Offset-curOffset)
if _, err := io.CopyN(t, zeroReader{}, bhdr.Offset-curOffset); err != nil {
return fmt.Errorf("seek to offset %d: %s", bhdr.Offset, err)
}
if bhdr.Size == 0 {
// A sparse block with size = 0 is used to mark the end of the sparse blocks.
break
}
n, err := io.Copy(t, br)
if err != nil {
return err
}
if n != bhdr.Size {
return fmt.Errorf("copied %d bytes instead of %d at offset %d", n, bhdr.Size, bhdr.Offset)
}
curOffset = bhdr.Offset + n
}
return nil
Expand Down Expand Up @@ -221,20 +225,44 @@ func WriteTarFileFromBackupStream(t *tar.Writer, r io.Reader, name string, size
}
}

// The logic for copying file contents is fairly complicated due to the need for handling sparse files,
// and the weird ways they are represented by BackupRead. A normal file will always either have a data stream
// with size and content, or no data stream at all (if empty). However, for a sparse file, the content can also
// be represented using a series of sparse block streams following the data stream. Additionally, the way sparse
// files are handled by BackupRead has changed in the OS recently. The specifics of the representation are described
// in the list at the bottom of this block comment.
//
// Sparse files can be represented in four different ways, based on the specifics of the file.
// - Size = 0:
// Previously: BackupRead yields no data stream and no sparse block streams.
// Recently: BackupRead yields a data stream with size = 0. There are no following sparse block streams.
// - Size > 0, no allocated ranges:
// BackupRead yields a data stream with size = 0. Following is a single sparse block stream with
// size = 0 and offset = <file size>.
// - Size > 0, one allocated range:
// BackupRead yields a data stream with size = <file size> containing the file contents. There are no
// sparse block streams. This is the case if you take a normal file with contents and simply set the
// sparse flag on it.
// - Size > 0, multiple allocated ranges:
// BackupRead yields a data stream with size = 0. Following are sparse block streams for each allocated
// range of the file containing the range contents. Finally there is a sparse block stream with
// size = 0 and offset = <file size>.

if dataHdr != nil {
// A data stream was found. Copy the data.
if (dataHdr.Attributes & winio.StreamSparseAttributes) == 0 {
// We assume that we will either have a data stream size > 0 XOR have sparse block streams.
if dataHdr.Size > 0 || (dataHdr.Attributes&winio.StreamSparseAttributes) == 0 {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

check spacing with &

Copy link
Member Author

@kevpar kevpar Oct 4, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

gofmt enforces this style. I don't understand the exact rule it's using to format, but if I take away the dataHdr.Size > 0 || then it puts a space in around the &, otherwise not.

if size != dataHdr.Size {
return fmt.Errorf("%s: mismatch between file size %d and header size %d", name, size, dataHdr.Size)
}
_, err = io.Copy(t, br)
if err != nil {
return err
if _, err = io.Copy(t, br); err != nil {
return fmt.Errorf("%s: copying contents from data stream: %s", name, err)
}
} else {
err = copySparse(t, br)
if err != nil {
return err
} else if size > 0 {
// As of a recent OS change, BackupRead now returns a data stream for empty sparse files.
// These files have no sparse block streams, so skip the copySparse call if file size = 0.
if err = copySparse(t, br); err != nil {
return fmt.Errorf("%s: copying contents from sparse block stream: %s", name, err)
}
}
}
Expand Down Expand Up @@ -279,7 +307,7 @@ func WriteTarFileFromBackupStream(t *tar.Writer, r io.Reader, name string, size
} else {
// Unsupported for now, since the size of the alternate stream is not present
// in the backup stream until after the data has been read.
return errors.New("tar of sparse alternate data streams is unsupported")
return fmt.Errorf("%s: tar of sparse alternate data streams is unsupported", name)
}
case winio.BackupEaData, winio.BackupLink, winio.BackupPropertyData, winio.BackupObjectId, winio.BackupTxfsData:
// ignore these streams
Expand Down
232 changes: 183 additions & 49 deletions backuptar/tar_test.go
Expand Up @@ -5,13 +5,15 @@ package backuptar
import (
"archive/tar"
"bytes"
"io"
"io/ioutil"
"os"
"path/filepath"
"reflect"
"testing"

"github.com/Microsoft/go-winio"
"golang.org/x/sys/windows"
)

func ensurePresent(t *testing.T, m map[string]string, keys ...string) {
Expand All @@ -22,65 +24,197 @@ func ensurePresent(t *testing.T, m map[string]string, keys ...string) {
}
}

func TestRoundTrip(t *testing.T) {
f, err := ioutil.TempFile("", "tst")
if err != nil {
t.Fatal(err)
}
defer f.Close()
defer os.Remove(f.Name())

if _, err = f.Write([]byte("testing 1 2 3\n")); err != nil {
t.Fatal(err)
}

if _, err = f.Seek(0, 0); err != nil {
func setSparse(t *testing.T, f *os.File) {
const FSCTL_SET_SPARSE uint32 = 0x900c4
if err := windows.DeviceIoControl(windows.Handle(f.Fd()), FSCTL_SET_SPARSE, nil, 0, nil, 0, nil, nil); err != nil {
t.Fatal(err)
}
}

fi, err := f.Stat()
if err != nil {
t.Fatal(err)
// compareReaders validates that two readers contain the exact same data.
func compareReaders(t *testing.T, rActual io.Reader, rExpected io.Reader) {
const size = 8 * 1024
var bufExpected, bufActual [size]byte
var readCount int64
// Loop, first reading from rExpected, then reading the same amount from rActual.
// For each set of reads, compare the bytes to make sure they are identical.
// When we run out of data in rExpected, exit the loop.
for {
// Do a read from rExpected and see how many bytes we get.
nExpected, err := rExpected.Read(bufExpected[:])
if err == io.EOF && nExpected == 0 {
break
} else if err != nil && err != io.EOF {
t.Fatalf("Failed reading from rExpected at %d: %s", readCount, err)
}
// Do a ReadFull from rActual for the same number of bytes we got from rExpected.
if nActual, err := io.ReadFull(rActual, bufActual[:nExpected]); err != nil {
t.Fatalf("Only read %d bytes out of %d from rActual at %d: %s", nActual, nExpected, readCount, err)
}
readCount += int64(nExpected)
for i, bExpected := range bufExpected[:nExpected] {
if bExpected != bufActual[i] {
t.Fatalf("Mismatched bytes at %d. got 0x%x, expected 0x%x", i, bufActual[i], bExpected)
}
}
}

bi, err := winio.GetFileBasicInfo(f)
if err != nil {
t.Fatal(err)
}

br := winio.NewBackupFileReader(f, true)
defer br.Close()

var buf bytes.Buffer
tw := tar.NewWriter(&buf)

err = WriteTarFileFromBackupStream(tw, br, f.Name(), fi.Size(), bi)
if err != nil {
t.Fatal(err)
// Now we just need to make sure there isn't any further data in rActual.
var b [1]byte
if n, err := rActual.Read(b[:]); n != 0 || err != io.EOF {
t.Fatalf("rActual didn't return EOF at expected end. Read %d bytes with error %s", n, err)
}
}

tr := tar.NewReader(&buf)
hdr, err := tr.Next()
if err != nil {
t.Fatal(err)
func TestRoundTrip(t *testing.T) {
// Each test case is a name mapped to a function which must create a file and return its path.
// The test then round-trips that file through backuptar, and validates the output matches the input.
for name, setup := range map[string]func(*testing.T) string{
"normalFile": func(t *testing.T) string {
path := filepath.Join(t.TempDir(), "foo.txt")
if err := ioutil.WriteFile(path, []byte("testing 1 2 3\n"), 0644); err != nil {
t.Fatal(err)
}
return path
},
"normalFileEmpty": func(t *testing.T) string {
path := filepath.Join(t.TempDir(), "foo.txt")
f, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY, 0644)
if err != nil {
t.Fatal(err)
}
defer f.Close()
return path
},
"sparseFileEmpty": func(t *testing.T) string {
path := filepath.Join(t.TempDir(), "foo.txt")
f, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY, 0644)
if err != nil {
t.Fatal(err)
}
defer f.Close()
setSparse(t, f)
return path
},
"sparseFileWithNoAllocatedRanges": func(t *testing.T) string {
path := filepath.Join(t.TempDir(), "foo.txt")
f, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY, 0644)
if err != nil {
t.Fatal(err)
}
defer f.Close()
setSparse(t, f)
// Set file size without writing data to produce a file with size > 0
// but no allocated ranges.
if err := f.Truncate(1000000); err != nil {
t.Fatal(err)
}
return path
},
"sparseFileWithOneAllocatedRange": func(t *testing.T) string {
path := filepath.Join(t.TempDir(), "foo.txt")
f, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY, 0644)
if err != nil {
t.Fatal(err)
}
defer f.Close()
setSparse(t, f)
if _, err := f.WriteString("test sparse data"); err != nil {
t.Fatal(err)
}
return path
},
"sparseFileWithMultipleAllocatedRanges": func(t *testing.T) string {
path := filepath.Join(t.TempDir(), "foo.txt")
f, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY, 0644)
if err != nil {
t.Fatal(err)
}
defer f.Close()
setSparse(t, f)
if _, err = f.Write([]byte("testing 1 2 3\n")); err != nil {
t.Fatal(err)
}
// The documentation talks about FSCTL_SET_ZERO_DATA, but seeking also
// seems to create a hole.
if _, err = f.Seek(1000000, 0); err != nil {
t.Fatal(err)
}
if _, err = f.Write([]byte("more data later\n")); err != nil {
t.Fatal(err)
}
return path
},
} {
t.Run(name, func(t *testing.T) {
path := setup(t)
f, err := os.Open(path)
if err != nil {
t.Fatal(err)
}
defer f.Close()

fi, err := f.Stat()
if err != nil {
t.Fatal(err)
}
bi, err := winio.GetFileBasicInfo(f)
if err != nil {
t.Fatal(err)
}

br := winio.NewBackupFileReader(f, true)
defer br.Close()
var buf bytes.Buffer
tw := tar.NewWriter(&buf)
err = WriteTarFileFromBackupStream(tw, br, f.Name(), fi.Size(), bi)
if err != nil {
t.Fatal(err)
}
tr := tar.NewReader(&buf)
hdr, err := tr.Next()
if err != nil {
t.Fatal(err)
}

name, size, bi2, err := FileInfoFromHeader(hdr)
if err != nil {
t.Fatal(err)
}
if name != filepath.ToSlash(f.Name()) {
t.Errorf("got name %s, expected %s", name, filepath.ToSlash(f.Name()))
}
if size != fi.Size() {
t.Errorf("got size %d, expected %d", size, fi.Size())
}
if !reflect.DeepEqual(*bi2, *bi) {
t.Errorf("got %#v, expected %#v", *bi2, *bi)
}
ensurePresent(t, hdr.PAXRecords, "MSWINDOWS.fileattr", "MSWINDOWS.rawsd")
// Reset file position so we can compare file contents.
// The file contents of the actual file should match what we get from the tar.
if _, err := f.Seek(0, 0); err != nil {
t.Fatal(err)
}
compareReaders(t, tr, f)
})
}
}

name, size, bi2, err := FileInfoFromHeader(hdr)
func TestZeroReader(t *testing.T) {
const size = 512
var b [size]byte
var bExpected [size]byte
var r zeroReader
n, err := r.Read(b[:])
if err != nil {
t.Fatal(err)
}

if name != filepath.ToSlash(f.Name()) {
t.Errorf("got name %s, expected %s", name, filepath.ToSlash(f.Name()))
t.Fatalf("Unexpected read error: %s", err)
}

if size != fi.Size() {
t.Errorf("got size %d, expected %d", size, fi.Size())
if n != size {
t.Errorf("Wrong read size. got %d, expected %d", n, size)
}

if !reflect.DeepEqual(*bi2, *bi) {
t.Errorf("got %#v, expected %#v", *bi2, *bi)
for i := range b {
if b[i] != bExpected[i] {
t.Errorf("Wrong content at index %d. got %d, expected %d", i, b[i], bExpected[i])
}
}

ensurePresent(t, hdr.PAXRecords, "MSWINDOWS.fileattr", "MSWINDOWS.rawsd")
}