From 0a22e48f5e413b519abb3974224cc5302eafae35 Mon Sep 17 00:00:00 2001 From: Marcel Schramm Date: Tue, 15 Aug 2023 01:03:00 +0200 Subject: [PATCH] feat: support reading UTF16 files (#26) To allow this, we need to use a different scanner that ignores the respective BOM bytes possible to occur. This means \xFFFE and \xFEFF, since UTF16 can be either little endian or big endian. By default, Windows uses UTF16 little endian. Fixes #25 --- fixtures/utf16be_bom.env | Bin 0 -> 32 bytes fixtures/utf16le_bom.env | Bin 0 -> 32 bytes fixtures/{bom.env => utf8_bom.env} | 0 go.mod | 5 ++- go.sum | 2 + gotenv.go | 57 ++++++++++++++++++++--------- gotenv_test.go | 48 +++++++++++++++++++----- 7 files changed, 83 insertions(+), 29 deletions(-) create mode 100644 fixtures/utf16be_bom.env create mode 100644 fixtures/utf16le_bom.env rename fixtures/{bom.env => utf8_bom.env} (100%) diff --git a/fixtures/utf16be_bom.env b/fixtures/utf16be_bom.env new file mode 100644 index 0000000000000000000000000000000000000000..9d65b84d5ebcd17d2db7a0b3b3921484f9efae44 GIT binary patch literal 32 mcmezOpTUX2pTU>GmLZfOgu#tLm%)(1j6ne?>dL^&zy$z#RRvlA literal 0 HcmV?d00001 diff --git a/fixtures/utf16le_bom.env b/fixtures/utf16le_bom.env new file mode 100644 index 0000000000000000000000000000000000000000..d55ea15c08f16e41caef6cb8b089ccd2742b45e4 GIT binary patch literal 32 ncmezW&xygG!I#07A(SD6!Hq$e!H~g>L4m=C!IgoRfr|kEeF6n! 
literal 0 HcmV?d00001 diff --git a/fixtures/bom.env b/fixtures/utf8_bom.env similarity index 100% rename from fixtures/bom.env rename to fixtures/utf8_bom.env diff --git a/go.mod b/go.mod index 42fbf18..3b45da7 100644 --- a/go.mod +++ b/go.mod @@ -2,7 +2,10 @@ module github.com/subosito/gotenv go 1.18 -require github.com/stretchr/testify v1.7.5 +require ( + github.com/stretchr/testify v1.7.5 + golang.org/x/text v0.12.0 +) require ( github.com/davecgh/go-spew v1.1.1 // indirect diff --git a/go.sum b/go.sum index f59e5c0..01c94b0 100644 --- a/go.sum +++ b/go.sum @@ -8,6 +8,8 @@ github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSS github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.5 h1:s5PTfem8p8EbKQOctVV53k6jCJt3UX4IEJzwh+C324Q= github.com/stretchr/testify v1.7.5/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +golang.org/x/text v0.12.0 h1:k+n5B8goJNdU7hSvEtMUz3d1Q6D/XW4COJSJR6fN0mc= +golang.org/x/text v0.12.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/gotenv.go b/gotenv.go index eddad3a..7fe03ff 100644 --- a/gotenv.go +++ b/gotenv.go @@ -12,6 +12,9 @@ import ( "sort" "strconv" "strings" + + "golang.org/x/text/encoding/unicode" + "golang.org/x/text/transform" ) const ( @@ -20,11 +23,13 @@ const ( // Pattern for detecting valid variable within a value variablePattern = `(\\)?(\$)(\{?([A-Z0-9_]+)?\}?)` - - // Byte order mark character - bom = "\xef\xbb\xbf" ) +// Byte order mark character +var bomUTF8 = []byte("\xEF\xBB\xBF") +var bomUTF16LE = []byte("\xFF\xFE") +var bomUTF16BE = []byte("\xFE\xFF") + // Env holds key/value pair 
of valid environment variable type Env map[string]string @@ -48,12 +53,12 @@ func Must(fn func(filenames ...string) error, filenames ...string) { } // Apply is a function to load an io Reader then export the valid variables into environment variables if they do not exist. -func Apply(r io.Reader) error { +func Apply(r Reader) error { return parset(r, false) } // OverApply is a function to load an io Reader then export and override the valid variables into environment variables. -func OverApply(r io.Reader) error { +func OverApply(r Reader) error { return parset(r, true) } @@ -79,7 +84,7 @@ func loadenv(override bool, filenames ...string) error { } // parse and set :) -func parset(r io.Reader, override bool) error { +func parset(r Reader, override bool) error { env, err := strictParse(r, override) if err != nil { return err @@ -105,7 +110,7 @@ func setenv(key, val string, override bool) { // Parse is a function to parse line by line any io.Reader supplied and returns the valid Env key/value pair of valid variables. // It expands the value of a variable from the environment variable but does not set the value to the environment itself. // This function is skipping any invalid lines and only processing the valid one. -func Parse(r io.Reader) Env { +func Parse(r Reader) Env { env, _ := strictParse(r, false) return env } @@ -113,7 +118,7 @@ func Parse(r io.Reader) Env { // StrictParse is a function to parse line by line any io.Reader supplied and returns the valid Env key/value pair of valid variables. // It expands the value of a variable from the environment variable but does not set the value to the environment itself. // This function is returning an error if there are any invalid lines. 
-func StrictParse(r io.Reader) (Env, error) { +func StrictParse(r Reader) (Env, error) { return strictParse(r, false) } @@ -201,12 +206,34 @@ func splitLines(data []byte, atEOF bool) (advance int, token []byte, err error) return eol, data[:idx], nil } -func strictParse(r io.Reader, override bool) (Env, error) { +type Reader interface { + io.Reader + io.ReaderAt +} + +func strictParse(r Reader, override bool) (Env, error) { env := make(Env) - scanner := bufio.NewScanner(r) - scanner.Split(splitLines) - firstLine := true + // We choose a different scanner depending on file encoding. + var scanner *bufio.Scanner + + // There can be a maximum of 3 BOM bytes. + bomByteBuffer := make([]byte, 3) + if _, err := r.ReadAt(bomByteBuffer, 0); err != nil { + return env, err + } + + if bytes.HasPrefix(bomByteBuffer, bomUTF8) { + scanner = bufio.NewScanner(transform.NewReader(r, unicode.UTF8BOM.NewDecoder())) + } else if bytes.HasPrefix(bomByteBuffer, bomUTF16LE) { + scanner = bufio.NewScanner(transform.NewReader(r, unicode.UTF16(unicode.LittleEndian, unicode.ExpectBOM).NewDecoder())) + } else if bytes.HasPrefix(bomByteBuffer, bomUTF16BE) { + scanner = bufio.NewScanner(transform.NewReader(r, unicode.UTF16(unicode.BigEndian, unicode.ExpectBOM).NewDecoder())) + } else { + scanner = bufio.NewScanner(r) + } + + scanner.Split(splitLines) for scanner.Scan() { if err := scanner.Err(); err != nil { @@ -214,12 +241,6 @@ func strictParse(r io.Reader, override bool) (Env, error) { } line := strings.TrimSpace(scanner.Text()) - - if firstLine { - line = strings.TrimPrefix(line, bom) - firstLine = false - } - if line == "" || line[0] == '#' { continue } diff --git a/gotenv_test.go b/gotenv_test.go index b566b38..1a4d041 100644 --- a/gotenv_test.go +++ b/gotenv_test.go @@ -3,7 +3,6 @@ package gotenv_test import ( "bufio" "errors" - "io" "os" "strings" "testing" @@ -243,26 +242,34 @@ func TestStrictParse(t *testing.T) { } type failingReader struct { - io.Reader + gotenv.Reader } func (fr 
failingReader) Read(p []byte) (n int, err error) { return 0, errors.New("you shall not read") } +func (fr failingReader) ReadAt(p []byte, off int64) (n int, err error) { + return 0, errors.New("you shall not read") +} + func TestStrictParse_PassThroughErrors(t *testing.T) { _, err := gotenv.StrictParse(&failingReader{}) assert.Error(t, err) } type infiniteReader struct { - io.Reader + gotenv.Reader } func (er infiniteReader) Read(p []byte) (n int, err error) { return len(p), nil } +func (er infiniteReader) ReadAt(p []byte, off int64) (n int, err error) { + return len(p), nil +} + func TestStrictParse_NoTokenPassThroughErrors(t *testing.T) { _, err := gotenv.StrictParse(&infiniteReader{}) assert.Error(t, err) @@ -346,7 +353,7 @@ func TestLoad_nonExist(t *testing.T) { } func TestLoad_unicodeBOMFixture(t *testing.T) { - file := "fixtures/bom.env" + file := "fixtures/utf8_bom.env" f, err := os.Open(file) assert.Nil(t, err) @@ -364,13 +371,34 @@ func TestLoad_unicodeBOMFixture(t *testing.T) { } } -func TestLoad_unicodeBOM(t *testing.T) { - file := "fixtures/bom.env" +func TestLoad_BOM_UTF8(t *testing.T) { + defer os.Clearenv() - err := gotenv.Load(file) - assert.Nil(t, err) - assert.Equal(t, "UTF-8", os.Getenv("BOM")) - os.Clearenv() + file := "fixtures/utf8_bom.env" + + if err := gotenv.Load(file); assert.Nil(t, err) { + assert.Equal(t, "UTF-8", os.Getenv("BOM")) + } +} + +func TestLoad_BOM_UTF16_LE(t *testing.T) { + defer os.Clearenv() + + file := "fixtures/utf16le_bom.env" + + if err := gotenv.Load(file); assert.Nil(t, err) { + assert.Equal(t, "UTF-16 LE", os.Getenv("BOM")) + } +} + +func TestLoad_BOM_UTF16_BE(t *testing.T) { + defer os.Clearenv() + + file := "fixtures/utf16be_bom.env" + + if err := gotenv.Load(file); assert.Nil(t, err) { + assert.Equal(t, "UTF-16 BE", os.Getenv("BOM")) + } } func TestMust_Load(t *testing.T) {