From 076236eb4d48c4284306108cf4b0a2b8084d13dc Mon Sep 17 00:00:00 2001 From: Anatoli Babenia Date: Tue, 16 Aug 2022 22:35:58 +0300 Subject: [PATCH 01/10] e2e: pull and export stdin and stdout --- .github/workflows/e2e.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml index 4bc153bfc..dffcc7a4e 100644 --- a/.github/workflows/e2e.yaml +++ b/.github/workflows/e2e.yaml @@ -82,3 +82,12 @@ jobs: ./app/crane pull --platform=linux/arm64 --format=oci $remote $distroless ./app/crane push $distroless $local diff <(./app/crane manifest --platform linux/arm64 $remote) <(./app/crane manifest $local) + + - name: crane pull image, and export it from stdin to filesystem tar to stdout + shell: bash + run: | + set -euxo pipefail + + ./app/crane pull ubuntu ubuntu.tar + ./app/crane export - - < ubuntu.tar > filesystem.tar + From b0c6d9ce918c817c8391e40c2d6041d67890b1fa Mon Sep 17 00:00:00 2001 From: Anatoli Babenia Date: Tue, 16 Aug 2022 22:40:02 +0300 Subject: [PATCH 02/10] List all resulting tar files --- .github/workflows/e2e.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml index dffcc7a4e..d6a377202 100644 --- a/.github/workflows/e2e.yaml +++ b/.github/workflows/e2e.yaml @@ -90,4 +90,5 @@ jobs: ./app/crane pull ubuntu ubuntu.tar ./app/crane export - - < ubuntu.tar > filesystem.tar + ls -la *.tar From a790e3b61a11f3a466cf690e9ff8d397ddd7fc84 Mon Sep 17 00:00:00 2001 From: Anatoli Babenia Date: Sun, 14 Aug 2022 12:28:44 +0300 Subject: [PATCH 03/10] tarball: Streaming image parser PoC TarBuffered scans stream (`io.Reader`) once for filename and saves unused sections in memory for later access. This should speedup parsing a bit, because right now tarball is scanned several times, and should save resources and speed for parsing well-formed images from network. See #1339. --- pkg/v1/tarball/image.go | 75 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 73 insertions(+), 2 deletions(-) diff --git a/pkg/v1/tarball/image.go b/pkg/v1/tarball/image.go index b2e44df75..c4e0e8535 100644 --- a/pkg/v1/tarball/image.go +++ b/pkg/v1/tarball/image.go @@ -26,8 +26,10 @@ import ( "path" "path/filepath" "sync" + "unsafe" "github.com/google/go-containerregistry/internal/gzip" + "github.com/google/go-containerregistry/pkg/logs" "github.com/google/go-containerregistry/pkg/name" v1 "github.com/google/go-containerregistry/pkg/v1" "github.com/google/go-containerregistry/pkg/v1/partial" @@ -36,6 +38,7 @@ import ( type image struct { opener Opener + tarbuf *TarBuffered // replaces Opener manifest *Manifest config []byte imgDescriptor *Descriptor @@ -88,8 +91,21 @@ func LoadManifest(opener Opener) (Manifest, error) { // Image exposes an image from the tarball at the provided path. func Image(opener Opener, tag *name.Tag) (v1.Image, error) { + f, err := opener() + if err != nil { + return nil, err + } + close := true + defer func() { + if close { + f.Close() + } + }() + tarbuf := NewTarBuffered(f) + img := &image{ opener: opener, + tarbuf: tarbuf, tag: tag, } if err := img.loadTarDescriptorAndConfig(); err != nil { @@ -170,11 +186,10 @@ func (i *image) areLayersCompressed() (bool, error) { } func (i *image) loadTarDescriptorAndConfig() error { - m, err := extractFileFromTar(i.opener, "manifest.json") + m, err := i.tarbuf.scanFile("manifest.json") if err != nil { return err } - defer m.Close() if err := json.NewDecoder(m).Decode(&i.manifest); err != nil { return err @@ -206,6 +221,62 @@ func (i *image) RawConfigFile() ([]byte, error) { return i.config, nil } +// TarBuffered stores parsed entries of tar file for subsequent access +type TarBuffered struct { + tf *tar.Reader + pos int + headers map[int]*tar.Header + content map[int][]byte + EOF bool +} + +func NewTarBuffered(f io.Reader) *TarBuffered { + // [ ] what happens if we don't return a pointer here? + // will reference to tf be duplicated in gc? + logs.Debug.Printf("tarbuf: start with %+v", f) + return &TarBuffered{ + tf: tar.NewReader(f), + headers: make(map[int]*tar.Header), + content: make(map[int][]byte)} +} + +func (tb *TarBuffered) scanFile(filePath string) (io.Reader, error) { + // scan records that are already read + for i := 0; i < tb.pos; i++ { + if tb.headers[i].Name == filePath { + logs.Debug.Printf("tarbuf: found %v at chunk %v", filePath, i) + return bytes.NewReader(tb.content[i]), nil + } + } + // if file is not found, continue parsing + for tb.EOF != true { + hdr, err := tb.tf.Next() + if errors.Is(err, io.EOF) { + tb.EOF = true + break + } + if err != nil { + return nil, err + } + + tb.headers[tb.pos] = hdr + logs.Debug.Printf("tarbuf: scan %v", hdr.Name) + tb.content[tb.pos], err = io.ReadAll(tb.tf) + if err != nil { + return nil, err + } + contentidx := tb.pos + tb.pos++ + logs.Debug.Printf("tarbuf: hdr.size %v", hdr.Size) + logs.Debug.Printf("tarbuf: len %v", unsafe.Sizeof(*tb)) + if hdr.Name == filePath { + return bytes.NewReader(tb.content[contentidx]), nil + } + // logs.Debug.Printf("tarbuf: scan %v", hdr) + } + return nil, fmt.Errorf("file %s not found in tar", filePath) +} + // tarFile represents a single file inside a tar. Closing it closes the tar itself. type tarFile struct { io.Reader From f62ab3ed01b424e4698e924c39832da5537da167 Mon Sep 17 00:00:00 2001 From: Anatoli Babenia Date: Sun, 14 Aug 2022 13:00:37 +0300 Subject: [PATCH 04/10] Presubmit requires comment --- pkg/v1/tarball/image.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/v1/tarball/image.go b/pkg/v1/tarball/image.go index c4e0e8535..3744b7b6e 100644 --- a/pkg/v1/tarball/image.go +++ b/pkg/v1/tarball/image.go @@ -230,6 +230,7 @@ type TarBuffered struct { EOF bool } +// NewTarBuffered prepares and returns struct for parsing tar stream func NewTarBuffered(f io.Reader) *TarBuffered { // [ ] what happens if we don't return a pointer here? // will reference to tf be duplicated in gc? From 69e7441c70a3ba1f8d7d24276f4cee004228c88b Mon Sep 17 00:00:00 2001 From: Anatoli Babenia Date: Sun, 14 Aug 2022 14:32:29 +0300 Subject: [PATCH 05/10] Style fix --- pkg/v1/tarball/image.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/v1/tarball/image.go b/pkg/v1/tarball/image.go index 3744b7b6e..ef70bfeb5 100644 --- a/pkg/v1/tarball/image.go +++ b/pkg/v1/tarball/image.go @@ -250,7 +250,7 @@ func (tb *TarBuffered) scanFile(filePath string) (io.Reader, error) { } } // if file is not found, continue parsing - for tb.EOF != true { + for !tb.EOF { hdr, err := tb.tf.Next() if errors.Is(err, io.EOF) { tb.EOF = true From abf6a21ebd97fc1994da2949ed1d46ae26775907 Mon Sep 17 00:00:00 2001 From: Anatoli Babenia Date: Sun, 14 Aug 2022 15:46:11 +0300 Subject: [PATCH 06/10] Reuse buffer for `loadTarDescriptorAndConfig()` --- pkg/v1/tarball/image.go | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/pkg/v1/tarball/image.go b/pkg/v1/tarball/image.go index ef70bfeb5..3ada8f654 100644 --- a/pkg/v1/tarball/image.go +++ b/pkg/v1/tarball/image.go @@ -204,11 +204,10 @@ func (i *image) loadTarDescriptorAndConfig() error { return err } - cfg, err := extractFileFromTar(i.opener, i.imgDescriptor.Config) + cfg, err := i.tarbuf.scanFile(i.imgDescriptor.Config) if err != nil { return err } - defer cfg.Close() i.config, err = ioutil.ReadAll(cfg) if err != nil { @@ -241,6 +240,10 @@ func NewTarBuffered(f io.Reader) *TarBuffered { content: make(map[int][]byte)} } +//func (tb *TarBuffered) append(header *tar.Header, bytes io.Reader) (, error) { +//} + +// TODO: track size of structure and remove content entries once they've been read func (tb *TarBuffered) scanFile(filePath string) (io.Reader, error) { // scan records that are already read for i := 0; i < tb.pos; i++ { @@ -261,19 +264,18 @@ func (tb *TarBuffered) scanFile(filePath string) (io.Reader, error) { } tb.headers[tb.pos] = hdr - logs.Debug.Printf("tarbuf: scan %v", hdr.Name) + logs.Debug.Printf("tarbuf: scan %v, %v", tb.pos, hdr.Name) tb.content[tb.pos], err = io.ReadAll(tb.tf) if err != nil { return nil, err } contentidx := tb.pos tb.pos++ - logs.Debug.Printf("tarbuf: hdr.size %v", hdr.Size) - logs.Debug.Printf("tarbuf: len %v", unsafe.Sizeof(*tb)) + logs.Debug.Printf("tarbuf: hdr.size %v, len %v", hdr.Size, unsafe.Sizeof(*tb)) if hdr.Name == filePath { + logs.Debug.Printf("tarbuf: found %v at pos %v", filePath, contentidx) return bytes.NewReader(tb.content[contentidx]), nil } - // logs.Debug.Printf("tarbuf: scan %v", hdr) } return nil, fmt.Errorf("file %s not found in tar", filePath) } From 34aefcedb53b2c4e499ec13da954bd27a63bd1fd Mon Sep 17 00:00:00 2001 From: Anatoli Babenia Date: Sun, 14 Aug 2022 15:50:56 +0300 Subject: [PATCH 07/10] Use scanFile() to detect if an image is compressed --- pkg/v1/tarball/image.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pkg/v1/tarball/image.go b/pkg/v1/tarball/image.go index 3ada8f654..70569dd1d 100644 --- a/pkg/v1/tarball/image.go +++ b/pkg/v1/tarball/image.go @@ -177,11 +177,10 @@ func (i *image) areLayersCompressed() (bool, error) { return false, errors.New("0 layers found in image") } layer := i.imgDescriptor.Layers[0] - blob, err := extractFileFromTar(i.opener, layer) + blob, err := i.tarbuf.scanFile(layer) if err != nil { return false, err } - defer blob.Close() return gzip.Is(blob) } From 0f6cc5e8bf2a18d05c482c007b61708e6d7a8dc5 Mon Sep 17 00:00:00 2001 From: Anatoli Babenia Date: Tue, 16 Aug 2022 18:29:44 +0300 Subject: [PATCH 08/10] Improve error messages, show memory used by layer --- pkg/v1/tarball/image.go | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pkg/v1/tarball/image.go b/pkg/v1/tarball/image.go index 70569dd1d..544d9f9a8 100644 --- a/pkg/v1/tarball/image.go +++ b/pkg/v1/tarball/image.go @@ -26,7 +26,6 @@ import ( "path" "path/filepath" "sync" - "unsafe" "github.com/google/go-containerregistry/internal/gzip" "github.com/google/go-containerregistry/pkg/logs" @@ -247,14 +246,16 @@ func (tb *TarBuffered) scanFile(filePath string) (io.Reader, error) { // scan records that are already read for i := 0; i < tb.pos; i++ { if tb.headers[i].Name == filePath { - logs.Debug.Printf("tarbuf: found %v at chunk %v", filePath, i) + logs.Debug.Printf("tarbuf: found in buffer %v %v", i, filePath) return bytes.NewReader(tb.content[i]), nil } } // if file is not found, continue parsing for !tb.EOF { + logs.Debug.Printf("tarbuf: scan entry %v", tb.pos) hdr, err := tb.tf.Next() if errors.Is(err, io.EOF) { + logs.Debug.Printf("tarbuf: EOF") tb.EOF = true break } @@ -263,16 +264,16 @@ func (tb *TarBuffered) scanFile(filePath string) (io.Reader, error) { } tb.headers[tb.pos] = hdr - logs.Debug.Printf("tarbuf: scan %v, %v", tb.pos, hdr.Name) + logs.Debug.Printf("tarbuf: %s", hdr.Name) tb.content[tb.pos], err = io.ReadAll(tb.tf) if err != nil { return nil, err } contentidx := tb.pos tb.pos++ - logs.Debug.Printf("tarbuf: hdr.size %v, len %v", hdr.Size, unsafe.Sizeof(*tb)) + logs.Debug.Printf("tarbuf: buffered %v", len(tb.content[contentidx])) if hdr.Name == filePath { - logs.Debug.Printf("tarbuf: found %v at pos %v", filePath, contentidx) + logs.Debug.Printf("tarbuf: found %v", filePath) return bytes.NewReader(tb.content[contentidx]), nil } } From 36af408eb343a7d4a1fc41711b115739e33c358b Mon Sep 17 00:00:00 2001 From: Anatoli Babenia Date: Tue, 16 Aug 2022 18:31:47 +0300 Subject: [PATCH 09/10] Attempt to process Stdin stream directly --- cmd/crane/cmd/export.go | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/cmd/crane/cmd/export.go b/cmd/crane/cmd/export.go index a6b8af100..4a43210d6 100644 --- a/cmd/crane/cmd/export.go +++ b/cmd/crane/cmd/export.go @@ -17,8 +17,6 @@ package cmd import ( "fmt" "io" - "io/ioutil" - "log" "os" "github.com/google/go-containerregistry/pkg/crane" @@ -55,18 +53,10 @@ func NewCmdExport(options *[]crane.Option) *cobra.Command { var img v1.Image if src == "-" { - tmpfile, err := ioutil.TempFile("", "crane") - if err != nil { - log.Fatal(err) - } - defer os.Remove(tmpfile.Name()) - - if _, err := io.Copy(tmpfile, os.Stdin); err != nil { - log.Fatal(err) + stdoutreader := func() (io.ReadCloser, error) { + return io.ReadCloser(os.Stdin), nil } - tmpfile.Close() - - img, err = tarball.ImageFromPath(tmpfile.Name(), nil) + img, err = tarball.Image(stdoutreader, nil) if err != nil { return fmt.Errorf("reading tarball from stdin: %w", err) } From 7f2ecfce5cac2b3627ea3b95be30bf31ba5f642d Mon Sep 17 00:00:00 2001 From: Anatoli Babenia Date: Wed, 17 Aug 2022 07:53:56 +0300 Subject: [PATCH 10/10] Log `img` type in `mutate.Extract()` Go subclassing with partials is rather hardcore here --- pkg/v1/mutate/mutate.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/v1/mutate/mutate.go b/pkg/v1/mutate/mutate.go index 6bae22493..e0f0355bd 100644 --- a/pkg/v1/mutate/mutate.go +++ b/pkg/v1/mutate/mutate.go @@ -27,6 +27,7 @@ import ( "time" "github.com/google/go-containerregistry/internal/gzip" + "github.com/google/go-containerregistry/pkg/logs" v1 "github.com/google/go-containerregistry/pkg/v1" "github.com/google/go-containerregistry/pkg/v1/empty" "github.com/google/go-containerregistry/pkg/v1/match" @@ -220,6 +221,7 @@ func CreatedAt(base v1.Image, created v1.Time) (v1.Image, error) { // If a caller doesn't read the full contents, they should Close it to free up // resources used during extraction. func Extract(img v1.Image) io.ReadCloser { + logs.Debug.Printf("mutate: extract %T", img) pr, pw := io.Pipe() go func() {