Skip to content

Commit

Permalink
Add support for escaped UTF-16 surrogate pairs
Browse files Browse the repository at this point in the history
I have followed the existing convention in the surrounding scanner code, where error handling is left as a TODO (the function simply returns early), but I can implement proper error reporting if you like.

References:

* https://russellcottrell.com/greek/utilities/SurrogatePairCalculator.htm
* https://mathiasbynens.be/notes/javascript-unicode
* https://en.wikipedia.org/wiki/UTF-16#U+D800_to_U+DFFF_(surrogates)
  • Loading branch information
steveh committed Apr 9, 2024
1 parent 4653a1b commit 80fe0ae
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 0 deletions.
12 changes: 12 additions & 0 deletions decode_test.go
Expand Up @@ -437,6 +437,18 @@ func TestDecoder(t *testing.T) {
`"1": "a\x2Fb\u002Fc\U0000002Fd"`,
map[interface{}]interface{}{"1": `a/b/c/d`},
},
{
`"\ud83e\udd23"`,
"🤣",
},
{
`"\uD83D\uDE00\uD83D\uDE01"`,
"😀😁",
},
{
`"\uD83D\uDE00a\uD83D\uDE01"`,
"😀a😁",
},
{
"'1': \"2\\n3\"",
map[interface{}]interface{}{"1": "2\n3"},
Expand Down
28 changes: 28 additions & 0 deletions scanner/scanner.go
Expand Up @@ -391,6 +391,34 @@ func (s *Scanner) scanDoubleQuote(ctx *Context) (tk *token.Token, pos int) {
return
}
codeNum := hexRunesToInt(src[idx+2 : idx+6])

// Handle surrogate pairs
if codeNum >= 0xD800 && codeNum <= 0xDBFF {
high := codeNum

if idx+11 >= size {
// TODO: need to return error
//err = xerrors.New("not enough characters for surrogate pair")
return
}

if src[idx+6] != '\\' || src[idx+7] != 'u' {
// TODO: need to return error
//err = xerrors.New("expected escape code after high surrogate")
return
}

low := hexRunesToInt(src[idx+8 : idx+12])
if low < 0xDC00 || low > 0xDFFF {
// TODO: need to return error
//err = xerrors.New("expected low surrogate after high surrogate")
return
}

codeNum = ((high - 0xD800) * 0x400) + (low - 0xDC00) + 0x10000
idx += 6
}

value = append(value, rune(codeNum))
idx += 5
continue
Expand Down

0 comments on commit 80fe0ae

Please sign in to comment.