From 690fbb5c2bf07677826470f3c2837c257980c1f7 Mon Sep 17 00:00:00 2001
From: braydonk
Date: Thu, 24 Nov 2022 19:17:42 +0000
Subject: [PATCH 1/5] scanner: Find quoted token from context

To resolve an issue with quoted map keys not parsing correctly, this PR
changes how the scanner progresses after scanning a quote token.

The old behaviour was:
1. The scanner reaches a :
2. Pull the characters out of the current scan context buffer and turn them into the map key token
3. Set the previous indent column to the beginning of the buffer

So the next time it gets to another key in the map, it checks whether
the indent is the same.

When scanning a quote:
1. Scan from the beginning to the end of the quote sequence
2. Add a token, send it back to the lexer, and reset the scan context

In this scenario, the scanner reaches the :, but nothing is in the scan
context buffer because it already added the quote token. The previous
indent column doesn't get reset, so when it gets to the next map key it
thinks the indent increased, which is invalid YAML.

The new behaviour is to not reset the context after scanning a quote;
when reaching a :, check the context tokens, and if the last one is a
quote, set the previous indent column to that token's column.
---
 scanner/scanner.go | 32 +++++++++++++++++++++++++++++---
 1 file changed, 29 insertions(+), 3 deletions(-)

diff --git a/scanner/scanner.go b/scanner/scanner.go
index 1e09190..4d00cf5 100644
--- a/scanner/scanner.go
+++ b/scanner/scanner.go
@@ -5,6 +5,7 @@ import (
 	"strings"
 
 	"github.com/goccy/go-yaml/token"
+
 	"golang.org/x/xerrors"
 )
@@ -729,11 +730,36 @@ func (s *Scanner) scan(ctx *Context) (pos int) {
 		nc := ctx.nextChar()
 		if s.startedFlowMapNum > 0 || nc == ' ' || s.isNewLineChar(nc) || ctx.isNextEOS() {
 			// mapping value
-			tk := s.bufferedToken(ctx)
-			if tk != nil {
+
+			// Check for a quote token.
+			if len(ctx.tokens) > 0 {
+				tk := ctx.tokens[len(ctx.tokens)-1]
+				if tk.Type == token.SingleQuoteType || tk.Type == token.DoubleQuoteType {
+					// Spaces after quote map keys are valid, so we still check the buffer and add
+					// the whitespace characters to the token and reset the buffer; we consider them
+					// part of the map key.
+					allWhitespace := true
+					bufContent := ctx.bufferedSrc()
+					for _, r := range bufContent {
+						allWhitespace = allWhitespace && r == ' '
+					}
+					if allWhitespace {
+						tk.Value += string(bufContent)
+						// Reset the buffer so the condition that checks the buffer isn't triggered.
+						ctx.resetBuffer()
+					}
+
+					// Set the previous indent column to the beginning of the quote token.
+					s.prevIndentColumn = tk.Position.Column
+				}
+			}
+
+			// If there's anything in the buffer, add it to the context as the map key.
+			if tk := s.bufferedToken(ctx); tk != nil {
 				s.prevIndentColumn = tk.Position.Column
 				ctx.addToken(tk)
 			}
+
 			ctx.addToken(token.MappingValue(s.pos()))
 			s.progressColumn(ctx, 1)
 			return
@@ -805,7 +831,7 @@ func (s *Scanner) scan(ctx *Context) (pos int) {
 			token, progress := s.scanQuote(ctx, c)
 			ctx.addToken(token)
 			pos += progress
-			return
+			continue
 		}
 	case '\r', '\n':
 		// There is no problem that we ignore CR which followed by LF and normalize it to LF, because of following YAML1.2 spec.

From b84d0bd67b58af692cf3bdf5b0024dd17ef5404e Mon Sep 17 00:00:00 2001
From: braydonk
Date: Thu, 24 Nov 2022 20:15:16 +0000
Subject: [PATCH 2/5] scanner: resolve off-by-one scanning single quote

When scanning an escaped single quote (`''`) there was an off-by-one
error: the local `idx` variable is increased, but the context's `idx`
is not.
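For illustration, here is a minimal sketch of an input that exercises
the escaped-quote path through the module's public `yaml.Unmarshal` API
(this snippet is illustrative only and is not code touched by this
patch):

    // Illustrative example only; 'it''s quoted' contains the escaped
    // single quote sequence ('') that reaches this code path.
    package main

    import (
        "fmt"

        "github.com/goccy/go-yaml"
    )

    func main() {
        var v map[string]string
        if err := yaml.Unmarshal([]byte("a: 'it''s quoted'"), &v); err != nil {
            panic(err)
        }
        fmt.Println(v["a"]) // it's quoted
    }

While scanning the `''` sequence in a value like this, the local index
is advanced past the second quote character, but the context is left
one column behind.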
This leads to the progress returned being off by one, so the scanner gets stuck in the `ctx.next()` loop forever, unable to progress. --- scanner/scanner.go | 1 + 1 file changed, 1 insertion(+) diff --git a/scanner/scanner.go b/scanner/scanner.go index 4d00cf5..de78734 100644 --- a/scanner/scanner.go +++ b/scanner/scanner.go @@ -249,6 +249,7 @@ func (s *Scanner) scanSingleQuote(ctx *Context) (tk *token.Token, pos int) { value = append(value, c) ctx.addOriginBuf(c) idx++ + s.progressColumn(ctx, 1) continue } s.progressColumn(ctx, 1) From 9de8125a94f9d45bfa7f2fd4a6fe6657898c42cd Mon Sep 17 00:00:00 2001 From: braydonk Date: Thu, 24 Nov 2022 22:03:08 +0000 Subject: [PATCH 3/5] scanner: apply column skip in quote scanning When we scan escape characters in quoted scalars, the local idx is shifted but not the underlying context. This adds a column skip to ensure the ctx progresses in sync with the local quote scanning. --- scanner/scanner.go | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/scanner/scanner.go b/scanner/scanner.go index de78734..1e8fe11 100644 --- a/scanner/scanner.go +++ b/scanner/scanner.go @@ -222,9 +222,11 @@ func (s *Scanner) scanSingleQuote(ctx *Context) (tk *token.Token, pos int) { value := []rune{} isFirstLineChar := false isNewLine := false + columnSkip := 0 for idx := startIndex; idx < size; idx++ { if !isNewLine { - s.progressColumn(ctx, 1) + s.progressColumn(ctx, 1+columnSkip) + columnSkip = 0 } else { isNewLine = false } @@ -249,7 +251,7 @@ func (s *Scanner) scanSingleQuote(ctx *Context) (tk *token.Token, pos int) { value = append(value, c) ctx.addOriginBuf(c) idx++ - s.progressColumn(ctx, 1) + columnSkip = 1 continue } s.progressColumn(ctx, 1) @@ -287,9 +289,11 @@ func (s *Scanner) scanDoubleQuote(ctx *Context) (tk *token.Token, pos int) { value := []rune{} isFirstLineChar := false isNewLine := false + columnSkip := 0 for idx := startIndex; idx < size; idx++ { if !isNewLine { - s.progressColumn(ctx, 1) + s.progressColumn(ctx, 1+columnSkip) + columnSkip = 0 } else { isNewLine = false } @@ -313,51 +317,61 @@ func (s *Scanner) scanDoubleQuote(ctx *Context) (tk *token.Token, pos int) { ctx.addOriginBuf(nextChar) value = append(value, '\b') idx++ + columnSkip = 1 continue case 'e': ctx.addOriginBuf(nextChar) value = append(value, '\x1B') idx++ + columnSkip = 1 continue case 'f': ctx.addOriginBuf(nextChar) value = append(value, '\f') idx++ + columnSkip = 1 continue case 'n': ctx.addOriginBuf(nextChar) value = append(value, '\n') idx++ + columnSkip = 1 continue case 'v': ctx.addOriginBuf(nextChar) value = append(value, '\v') idx++ + columnSkip = 1 continue case 'L': // LS (#x2028) ctx.addOriginBuf(nextChar) value = append(value, []rune{'\xE2', '\x80', '\xA8'}...) idx++ + columnSkip = 1 continue case 'N': // NEL (#x85) ctx.addOriginBuf(nextChar) value = append(value, []rune{'\xC2', '\x85'}...) idx++ + columnSkip = 1 continue case 'P': // PS (#x2029) ctx.addOriginBuf(nextChar) value = append(value, []rune{'\xE2', '\x80', '\xA9'}...) idx++ + columnSkip = 1 continue case '_': // #xA0 ctx.addOriginBuf(nextChar) value = append(value, []rune{'\xC2', '\xA0'}...) 
idx++ + columnSkip = 1 continue case '"': ctx.addOriginBuf(nextChar) value = append(value, nextChar) idx++ + columnSkip = 1 continue case 'x': if idx+3 >= size { @@ -368,6 +382,7 @@ func (s *Scanner) scanDoubleQuote(ctx *Context) (tk *token.Token, pos int) { codeNum := hexRunesToInt(src[idx+2 : idx+4]) value = append(value, rune(codeNum)) idx += 3 + columnSkip = 3 continue case 'u': if idx+5 >= size { @@ -378,6 +393,7 @@ func (s *Scanner) scanDoubleQuote(ctx *Context) (tk *token.Token, pos int) { codeNum := hexRunesToInt(src[idx+2 : idx+6]) value = append(value, rune(codeNum)) idx += 5 + columnSkip = 5 continue case 'U': if idx+9 >= size { @@ -388,10 +404,12 @@ func (s *Scanner) scanDoubleQuote(ctx *Context) (tk *token.Token, pos int) { codeNum := hexRunesToInt(src[idx+2 : idx+10]) value = append(value, rune(codeNum)) idx += 9 + columnSkip = 9 continue case '\\': ctx.addOriginBuf(nextChar) idx++ + columnSkip = 1 } } value = append(value, c) From 8834a3c2de30e7ae9bc91a4a92c9589180eb2005 Mon Sep 17 00:00:00 2001 From: braydonk Date: Fri, 25 Nov 2022 13:37:17 +0000 Subject: [PATCH 4/5] scanner: fix buf and obuf sync issue `scanQuote` will add to the origin buffer for the purpose of creating the quote token, but would never reset it. This caused a problem when nothing would reset the buffer, i.e. when a quoted value was followed by a newline and the next map key. --- scanner/scanner.go | 49 ++++++++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/scanner/scanner.go b/scanner/scanner.go index 1e8fe11..f78c203 100644 --- a/scanner/scanner.go +++ b/scanner/scanner.go @@ -111,6 +111,15 @@ func (s *Scanner) isNewLineChar(c rune) bool { return false } +func (s *Scanner) isWhitespaceBuffer(ctx *Context) bool { + allWhitespace := true + bufContent := ctx.bufferedSrc() + for _, r := range bufContent { + allWhitespace = allWhitespace && r == ' ' + } + return allWhitespace && len(bufContent) > 0 +} + func (s *Scanner) newLineCount(src []rune) int { size := len(src) cnt := 0 @@ -429,9 +438,16 @@ func (s *Scanner) scanDoubleQuote(ctx *Context) (tk *token.Token, pos int) { func (s *Scanner) scanQuote(ctx *Context, ch rune) (tk *token.Token, pos int) { if ch == '\'' { - return s.scanSingleQuote(ctx) + tk, pos = s.scanSingleQuote(ctx) + } else if ch == '"' { + tk, pos = s.scanDoubleQuote(ctx) + } else { + // TODO return an error object here when scan supports returning errors + return } - return s.scanDoubleQuote(ctx) + // The origin buffer needs to be reset so it's back in sync with the main buffer. + ctx.resetBuffer() + return } func (s *Scanner) isMergeKey(ctx *Context) bool { @@ -750,31 +766,22 @@ func (s *Scanner) scan(ctx *Context) (pos int) { if s.startedFlowMapNum > 0 || nc == ' ' || s.isNewLineChar(nc) || ctx.isNextEOS() { // mapping value - // Check for a quote token. - if len(ctx.tokens) > 0 { + // If there is no buffer or there is whitespace, check if the last token is quoted. + if len(ctx.tokens) > 0 && len(ctx.buf) > 0 && s.isWhitespaceBuffer(ctx) { + // Check for a quote token. tk := ctx.tokens[len(ctx.tokens)-1] if tk.Type == token.SingleQuoteType || tk.Type == token.DoubleQuoteType { - // Spaces after quote map keys are valid, so we still check the buffer and add - // the whitespace characters to the token and reset the buffer; we consider them - // part of the map key. 
- allWhitespace := true - bufContent := ctx.bufferedSrc() - for _, r := range bufContent { - allWhitespace = allWhitespace && r == ' ' - } - if allWhitespace { - tk.Value += string(bufContent) - // Reset the buffer so the condition that checks the buffer isn't triggered. - ctx.resetBuffer() - } + // Spaces after quote map keys are valid, add the whitespace characters to the token + // and reset the buffer; we consider them part of the map key. + tk.Value += string(ctx.bufferedSrc()) + ctx.resetBuffer() // Set the previous indent column to the beginning of the quote token. s.prevIndentColumn = tk.Position.Column } - } - - // If there's anything in the buffer, add it to the context as the map key. - if tk := s.bufferedToken(ctx); tk != nil { + } else if tk := s.bufferedToken(ctx); tk != nil { + // If there's anything other than whitespace in the buffer, add it to the + // context as the map key. s.prevIndentColumn = tk.Position.Column ctx.addToken(tk) } From d56f06e5fabbdc942df3f78031db4aed674eb2ac Mon Sep 17 00:00:00 2001 From: braydonk Date: Sun, 27 Nov 2022 14:43:49 +0000 Subject: [PATCH 5/5] add tests, fixed bad scanner logic I had adjusted the `:` scanning logic in a way that didn't cover all cases. --- decode_test.go | 23 +++++++++ lexer/lexer_test.go | 111 ++++++++++++++++++++++++++++++++++++++++++ parser/parser_test.go | 18 +++++++ scanner/scanner.go | 21 ++++---- 4 files changed, 163 insertions(+), 10 deletions(-) diff --git a/decode_test.go b/decode_test.go index fb13b9a..c929195 100644 --- a/decode_test.go +++ b/decode_test.go @@ -2144,6 +2144,29 @@ b: *a t.Fatal("failed to unmarshal") } }) + t.Run("quoted map keys", func(t *testing.T) { + t.Parallel() + yml := ` +a: + "b": 2 + 'c': true +` + var v struct { + A struct { + B int + C bool + } + } + if err := yaml.Unmarshal([]byte(yml), &v); err != nil { + t.Fatalf("failed to unmarshal %v", err) + } + if v.A.B != 2 { + t.Fatalf("expected a.b to equal 2 but was %d", v.A.B) + } + if !v.A.C { + t.Fatal("expected a.c to be true but was false") + } + }) } type unmarshalablePtrStringContainer struct { diff --git a/lexer/lexer_test.go b/lexer/lexer_test.go index bbe4e2f..03b741f 100644 --- a/lexer/lexer_test.go +++ b/lexer/lexer_test.go @@ -56,6 +56,10 @@ func TestTokenize(t *testing.T) { "a: 'Hello #comment'\n", "a: 100.5\n", "a: bogus\n", + "\"a\": double quoted map key", + "'a': single quoted map key", + "a: \"double quoted\"\nb: \"value map\"", + "a: 'single quoted'\nb: 'value map'", } for _, src := range sources { lexer.Tokenize(src).Dump() @@ -231,6 +235,60 @@ func TestSingleLineToken_ValueLineColumnPosition(t *testing.T) { 15: "]", }, }, + { + name: "double quote key", + src: `"a": b`, + expect: map[int]string{ + 1: "a", + 4: ":", + 6: "b", + }, + }, + { + name: "single quote key", + src: `'a': b`, + expect: map[int]string{ + 1: "a", + 4: ":", + 6: "b", + }, + }, + { + name: "double quote key and value", + src: `"a": "b"`, + expect: map[int]string{ + 1: "a", + 4: ":", + 6: "b", + }, + }, + { + name: "single quote key and value", + src: `'a': 'b'`, + expect: map[int]string{ + 1: "a", + 4: ":", + 6: "b", + }, + }, + { + name: "double quote key, single quote value", + src: `"a": 'b'`, + expect: map[int]string{ + 1: "a", + 4: ":", + 6: "b", + }, + }, + { + name: "single quote key, double quote value", + src: `'a': "b"`, + expect: map[int]string{ + 1: "a", + 4: ":", + 6: "b", + }, + }, } for _, tc := range tests { @@ -432,6 +490,59 @@ foo2: 'bar2'`, }, }, }, + { + name: "single and double quote map keys", + src: `"a": test +'b': 1 +c: 
true`, + expect: []testToken{ + { + line: 1, + column: 1, + value: "a", + }, + { + line: 1, + column: 4, + value: ":", + }, + { + line: 1, + column: 6, + value: "test", + }, + { + line: 2, + column: 1, + value: "b", + }, + { + line: 2, + column: 4, + value: ":", + }, + { + line: 2, + column: 6, + value: "1", + }, + { + line: 3, + column: 1, + value: "c", + }, + { + line: 3, + column: 2, + value: ":", + }, + { + line: 3, + column: 4, + value: "true", + }, + }, + }, } for _, tc := range tests { diff --git a/parser/parser_test.go b/parser/parser_test.go index 595e0a3..f284f45 100644 --- a/parser/parser_test.go +++ b/parser/parser_test.go @@ -80,6 +80,8 @@ func TestParser(t *testing.T) { ? !!str "implicit" : !!str "entry", ? !!null "" : !!null "", }`, + "\"a\": a\n\"b\": b", + "'a': a\n'b': b", } for _, src := range sources { if _, err := parser.Parse(lexer.Tokenize(src), 0); err != nil { @@ -562,6 +564,22 @@ b: c ` - key1: val key2: ( foo + bar ) +`, + }, + { + ` +"a": b +'c': d +"e": "f" +g: "h" +i: 'j' +`, + ` +"a": b +'c': d +"e": "f" +g: "h" +i: 'j' `, }, } diff --git a/scanner/scanner.go b/scanner/scanner.go index f78c203..08bdffe 100644 --- a/scanner/scanner.go +++ b/scanner/scanner.go @@ -766,22 +766,23 @@ func (s *Scanner) scan(ctx *Context) (pos int) { if s.startedFlowMapNum > 0 || nc == ' ' || s.isNewLineChar(nc) || ctx.isNextEOS() { // mapping value - // If there is no buffer or there is whitespace, check if the last token is quoted. - if len(ctx.tokens) > 0 && len(ctx.buf) > 0 && s.isWhitespaceBuffer(ctx) { - // Check for a quote token. + // If there's a token in the context, we need to check if it's a quote token. + if len(ctx.tokens) > 0 { tk := ctx.tokens[len(ctx.tokens)-1] if tk.Type == token.SingleQuoteType || tk.Type == token.DoubleQuoteType { - // Spaces after quote map keys are valid, add the whitespace characters to the token - // and reset the buffer; we consider them part of the map key. - tk.Value += string(ctx.bufferedSrc()) - ctx.resetBuffer() + if len(ctx.buf) > 0 && s.isWhitespaceBuffer(ctx) { + // Spaces after quote map keys are valid, add the whitespace characters + // to the token and reset the buffer; we consider them part of the map key. + tk.Value += string(ctx.bufferedSrc()) + ctx.resetBuffer() + } // Set the previous indent column to the beginning of the quote token. s.prevIndentColumn = tk.Position.Column } - } else if tk := s.bufferedToken(ctx); tk != nil { - // If there's anything other than whitespace in the buffer, add it to the - // context as the map key. + } + if tk := s.bufferedToken(ctx); tk != nil { + // If there's anything in the buffer at this point, we'll treat that as the map key. s.prevIndentColumn = tk.Position.Column ctx.addToken(tk) }