Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

tarball: Streaming image parser PoC #1429

Closed
wants to merge 10 commits into from
10 changes: 10 additions & 0 deletions .github/workflows/e2e.yaml
Expand Up @@ -82,3 +82,13 @@ jobs:
./app/crane pull --platform=linux/arm64 --format=oci $remote $distroless
./app/crane push $distroless $local
diff <(./app/crane manifest --platform linux/arm64 $remote) <(./app/crane manifest $local)

- name: crane pull image, and export it from stdin to filesystem tar to stdout
shell: bash
run: |
set -euxo pipefail

./app/crane pull ubuntu ubuntu.tar
./app/crane export - - < ubuntu.tar > filesystem.tar
ls -la *.tar

16 changes: 3 additions & 13 deletions cmd/crane/cmd/export.go
Expand Up @@ -17,8 +17,6 @@ package cmd
import (
"fmt"
"io"
"io/ioutil"
"log"
"os"

"github.com/google/go-containerregistry/pkg/crane"
Expand Down Expand Up @@ -55,18 +53,10 @@ func NewCmdExport(options *[]crane.Option) *cobra.Command {

var img v1.Image
if src == "-" {
tmpfile, err := ioutil.TempFile("", "crane")
if err != nil {
log.Fatal(err)
}
defer os.Remove(tmpfile.Name())

if _, err := io.Copy(tmpfile, os.Stdin); err != nil {
log.Fatal(err)
stdoutreader := func() (io.ReadCloser, error) {
return io.ReadCloser(os.Stdin), nil
}
tmpfile.Close()

img, err = tarball.ImageFromPath(tmpfile.Name(), nil)
img, err = tarball.Image(stdoutreader, nil)
if err != nil {
return fmt.Errorf("reading tarball from stdin: %w", err)
}
Expand Down
2 changes: 2 additions & 0 deletions pkg/v1/mutate/mutate.go
Expand Up @@ -27,6 +27,7 @@ import (
"time"

"github.com/google/go-containerregistry/internal/gzip"
"github.com/google/go-containerregistry/pkg/logs"
v1 "github.com/google/go-containerregistry/pkg/v1"
"github.com/google/go-containerregistry/pkg/v1/empty"
"github.com/google/go-containerregistry/pkg/v1/match"
Expand Down Expand Up @@ -220,6 +221,7 @@ func CreatedAt(base v1.Image, created v1.Time) (v1.Image, error) {
// If a caller doesn't read the full contents, they should Close it to free up
// resources used during extraction.
func Extract(img v1.Image) io.ReadCloser {
logs.Debug.Printf("mutate: extract %T", img)

Check failure

Code scanning / CodeQL

Clear-text logging of sensitive information

Sensitive data returned by [an access to password](1) is logged here. Sensitive data returned by [an access to Password](2) is logged here. Sensitive data returned by [an access to Password](3) is logged here.
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure how dumping type of v1.Image gives access to AuthConfig.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure how that could happen either, but I can tell you this debug logging will probably be very noisy and likely not very useful. If it's helpful while working on this change that's fine, but we should remove it before reviewing and merging.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you have any tools to replace Debug prints while figuring out how Go code works? I am doing "pen and paper" reconstruction of the calls, which is not very effective.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I typically just rely on logging and pen and paper debugging, but I've heard good things about delve for interactive debugging, which is also integrated into things like VSCode.

pr, pw := io.Pipe()

go func() {
Expand Down
86 changes: 80 additions & 6 deletions pkg/v1/tarball/image.go
Expand Up @@ -28,6 +28,7 @@ import (
"sync"

"github.com/google/go-containerregistry/internal/gzip"
"github.com/google/go-containerregistry/pkg/logs"
"github.com/google/go-containerregistry/pkg/name"
v1 "github.com/google/go-containerregistry/pkg/v1"
"github.com/google/go-containerregistry/pkg/v1/partial"
Expand All @@ -36,6 +37,7 @@ import (

type image struct {
opener Opener
tarbuf *TarBuffered // replaces Opener
manifest *Manifest
config []byte
imgDescriptor *Descriptor
Expand Down Expand Up @@ -88,8 +90,21 @@ func LoadManifest(opener Opener) (Manifest, error) {

// Image exposes an image from the tarball at the provided path.
func Image(opener Opener, tag *name.Tag) (v1.Image, error) {
f, err := opener()
if err != nil {
return nil, err
}
close := true
defer func() {
if close {
f.Close()
}
}()
tarbuf := NewTarBuffered(f)

img := &image{
opener: opener,
tarbuf: tarbuf,
tag: tag,
}
if err := img.loadTarDescriptorAndConfig(); err != nil {
Expand Down Expand Up @@ -161,20 +176,18 @@ func (i *image) areLayersCompressed() (bool, error) {
return false, errors.New("0 layers found in image")
}
layer := i.imgDescriptor.Layers[0]
blob, err := extractFileFromTar(i.opener, layer)
blob, err := i.tarbuf.scanFile(layer)
if err != nil {
return false, err
}
defer blob.Close()
return gzip.Is(blob)
}

func (i *image) loadTarDescriptorAndConfig() error {
m, err := extractFileFromTar(i.opener, "manifest.json")
m, err := i.tarbuf.scanFile("manifest.json")
if err != nil {
return err
}
defer m.Close()

if err := json.NewDecoder(m).Decode(&i.manifest); err != nil {
return err
Expand All @@ -189,11 +202,10 @@ func (i *image) loadTarDescriptorAndConfig() error {
return err
}

cfg, err := extractFileFromTar(i.opener, i.imgDescriptor.Config)
cfg, err := i.tarbuf.scanFile(i.imgDescriptor.Config)
if err != nil {
return err
}
defer cfg.Close()

i.config, err = ioutil.ReadAll(cfg)
if err != nil {
Expand All @@ -206,6 +218,68 @@ func (i *image) RawConfigFile() ([]byte, error) {
return i.config, nil
}

// TarBuffered stores parsed entries of tar file for subsequent access
type TarBuffered struct {
tf *tar.Reader
pos int
headers map[int]*tar.Header
content map[int][]byte
EOF bool
}

// NewTarBuffered prepares and returns struct for parsing tar stream
func NewTarBuffered(f io.Reader) *TarBuffered {
// [ ] what happens if we don't return a pointer here?
// will reference to tf be duplicated in gc?
logs.Debug.Printf("tarbuf: start with %+v", f)
return &TarBuffered{
tf: tar.NewReader(f),
headers: make(map[int]*tar.Header),
content: make(map[int][]byte)}
}

//func (tb *TarBuffered) append(header *tar.Header, bytes io.Reader) (, error) {
//}

// TODO: track size of structure and remove content entries once they've been read
func (tb *TarBuffered) scanFile(filePath string) (io.Reader, error) {
// scan records that are already read
for i := 0; i < tb.pos; i++ {
if tb.headers[i].Name == filePath {
logs.Debug.Printf("tarbuf: found in buffer %v %v", i, filePath)
return bytes.NewReader(tb.content[i]), nil
}
}
// if file is not found, continue parsing
for !tb.EOF {
logs.Debug.Printf("tarbuf: scan entry %v", tb.pos)
hdr, err := tb.tf.Next()
if errors.Is(err, io.EOF) {
logs.Debug.Printf("tarbuf: EOF")
tb.EOF = true
break
}
if err != nil {
return nil, err
}

tb.headers[tb.pos] = hdr
logs.Debug.Printf("tarbuf: %s", hdr.Name)
tb.content[tb.pos], err = io.ReadAll(tb.tf)
if err != nil {
return nil, err
}
contentidx := tb.pos
tb.pos++
logs.Debug.Printf("tarbuf: buffered %v", len(tb.content[contentidx]))
if hdr.Name == filePath {
logs.Debug.Printf("tarbuf: found %v", filePath)
return bytes.NewReader(tb.content[contentidx]), nil
}
}
return nil, fmt.Errorf("file %s not found in tar", filePath)
}

// tarFile represents a single file inside a tar. Closing it closes the tar itself.
type tarFile struct {
io.Reader
Expand Down