Skip to content

Commit

Permalink
Merge pull request #28 from tdakkota/fix/string-benchmark
Browse files Browse the repository at this point in the history
feat: speed up escaped string decoding
  • Loading branch information
ernado committed Jan 18, 2022
2 parents 1941dd1 + 04caca1 commit 4f7480a
Show file tree
Hide file tree
Showing 6 changed files with 209 additions and 115 deletions.
14 changes: 0 additions & 14 deletions dec.go
Expand Up @@ -45,23 +45,9 @@ const (
Object
)

var hexDigits []byte
var types []Type

func init() {
hexDigits = make([]byte, 256)
for i := 0; i < len(hexDigits); i++ {
hexDigits[i] = 255
}
for i := '0'; i <= '9'; i++ {
hexDigits[i] = byte(i - '0')
}
for i := 'a'; i <= 'f'; i++ {
hexDigits[i] = byte((i - 'a') + 10)
}
for i := 'A'; i <= 'F'; i++ {
hexDigits[i] = byte((i - 'A') + 10)
}
types = make([]Type, 256)
for i := range types {
types[i] = Invalid
Expand Down
30 changes: 18 additions & 12 deletions dec_skip.go
Expand Up @@ -240,19 +240,26 @@ stateExp:

var (
	// escapedStrSet is indexed by the byte that follows a backslash inside a
	// string. The code that switches on this table treats a zero entry as an
	// invalid escape.
	//
	// NOTE(review): two sets of entries for the same indices appear below
	// (class markers, then decoded bytes). A Go array literal cannot repeat
	// an index, so presumably only one set belongs in the file; confirm.
	escapedStrSet = [256]byte{
		// Class markers: 1 for single-character escapes, 2 for 'u'.
		'"': 1, '\\': 1, '/': 1, 'b': 1, 'f': 1, 'n': 1, 'r': 1, 't': 1,
		'u': 2,
		// The byte each escape stands for; 'u' maps to itself and is used
		// as the marker for a four-hex-digit escape.
		'"': '"',
		'\\': '\\',
		'/': '/',
		'b': '\b',
		'f': '\f',
		'n': '\n',
		'r': '\r',
		't': '\t',
		'u': 'u',
	}
	// hexSet is indexed by a candidate hexadecimal digit. A zero entry means
	// the byte is not a hex digit.
	//
	// NOTE(review): as above, two sets of entries for the same indices are
	// listed (membership markers, then digit values); confirm which applies.
	hexSet = [256]byte{
		// Membership markers: every hex digit maps to 1.
		'0': 1, '1': 1, '2': 1, '3': 1,
		'4': 1, '5': 1, '6': 1, '7': 1,
		'8': 1, '9': 1,
		// Digit value plus one, so zero stays reserved for "not a hex
		// digit"; readers subtract one to recover the value.
		'0': 0x0 + 1, '1': 0x1 + 1, '2': 0x2 + 1, '3': 0x3 + 1,
		'4': 0x4 + 1, '5': 0x5 + 1, '6': 0x6 + 1, '7': 0x7 + 1,
		'8': 0x8 + 1, '9': 0x9 + 1,

		'A': 1, 'B': 1, 'C': 1, 'D': 1,
		'E': 1, 'F': 1,
		'A': 0xA + 1, 'B': 0xB + 1, 'C': 0xC + 1, 'D': 0xD + 1,
		'E': 0xE + 1, 'F': 0xF + 1,

		'a': 1, 'b': 1, 'c': 1, 'd': 1,
		'e': 1, 'f': 1,
		'a': 0xa + 1, 'b': 0xb + 1, 'c': 0xc + 1, 'd': 0xd + 1,
		'e': 0xe + 1, 'f': 0xf + 1,
	}
)

Expand Down Expand Up @@ -348,8 +355,7 @@ readTok:
return err
}
switch escapedStrSet[v] {
case 1:
case 2:
case 'u':
for i := 0; i < 4; i++ {
h, err := d.byte()
if err != nil {
Expand All @@ -359,7 +365,7 @@ readTok:
return badToken(h)
}
}
default:
case 0:
return badToken(v)
}
case c < ' ':
Expand Down
20 changes: 20 additions & 0 deletions dec_skip_cases_test.go
Expand Up @@ -140,11 +140,28 @@ var testStrings = append([]string{
"\"\\ueeee\"", // valid
"\"\\uFFFF\"", // valid
`"ab\n` + "\x00" + `"`, // invalid
`"\n0123456"`,
}, func() (r []string) {
// Generate tests for invalid space sequences.
for i := byte(0); i <= ' '; i++ {
r = append(r, `"`+string(i)+`"`)
}
// Generate tests to ensure unroll correctness.
for i := byte('0'); i <= '9'; i++ {
// Generates "0", "11", "222" ...
n := int(i - '0' + 1)
str := strings.Repeat(string(i), n)
r = append(r, `"`+str+`"`)
if n > 2 {
// Insert newline.
// Generates "22\n2", "333\n3" ...
str = str[:n-2] + `\n` + str[n-1:]
r = append(r, `"`+str+`"`)
// Generates "2\n22", "3\n333" ...
str = str[:1] + `\n` + str[2:]
r = append(r, `"`+str+`"`)
}
}
return r
}()...)

Expand Down Expand Up @@ -185,6 +202,9 @@ var testObjs = []string{
`{"foo":`, // invalid
`{"foo": "bar"`, // invalid
`{"foo": "bar`, // invalid
`{"foo": "bar",}`, // invalid
`{"foo": "bar", true}`, // invalid
"{\n\"foo\"\n: \n10e1 \n, \n\"bar\"\n: \ntrue\n}", // valid
}

var testArrs = []string{
Expand Down
7 changes: 4 additions & 3 deletions dec_skip_test.go
Expand Up @@ -27,9 +27,10 @@ func TestDecoder_SkipArrayNested(t *testing.T) {
a.NoError(err)
_, err = d.Elem()
a.NoError(err)
if s, _ := d.Str(); s != "stream" {
t.FailNow()
}

s, err := d.Str()
a.NoError(err)
a.Equal("stream", s)
return nil
})
}
Expand Down
194 changes: 121 additions & 73 deletions dec_str.go
Expand Up @@ -2,6 +2,7 @@ package jx

import (
"fmt"
"io"
"unicode/utf16"
"unicode/utf8"

Expand Down Expand Up @@ -33,13 +34,6 @@ func (v value) rune(r rune) value {
}
}

// byte returns a value whose buffer is v.buf with b appended and whose raw
// flag is carried over from v.
func (v value) byte(b byte) value {
	extended := append(v.buf, b)
	return value{
		raw: v.raw,
		buf: extended,
	}
}

// badTokenErr means that Token was unexpected while decoding.
type badTokenErr struct {
Token byte
Expand Down Expand Up @@ -124,22 +118,124 @@ func (d *Decoder) str(v value) (value, error) {
return d.strSlow(v)
}
readTok:
; // Bug in cover tool, see https://github.com/golang/go/issues/28319.
buf := d.buf[d.head:d.tail]
str := buf[:i]

switch {
case c == '"':
buf := d.buf[d.head:d.tail]
// End of string in fast path.
str := buf[:i]
// Skip string + last quote.
d.head += i + 1
if v.raw {
return value{buf: str}, nil
}
return value{buf: append(v.buf, str...)}, nil
case c == '\\':
return d.strSlow(v)
// Skip only string, keep quote in buffer.
d.head += i
// We need a copy anyway, because string is escaped.
return d.strSlow(value{buf: append(v.buf, str...)})
default:
return v, badToken(c)
}
}

// strSlow decodes the rest of a string, appending the decoded bytes to v.buf
// until the closing quote has been consumed.
//
// It scans d.buf[d.head:d.tail] for the first byte flagged in safeSet, then
// handles that byte: '"' ends the string, '\\' starts an escape that is
// decoded by escapedChar, and any other flagged byte is rejected with
// badToken. If the buffered window ends before a flagged byte is found, the
// scanned bytes are saved to v.buf and d.read is called for more input.
//
// An io.EOF from d.read is reported as io.ErrUnexpectedEOF, since the closing
// quote has not been seen yet; other read errors are returned unchanged.
// The value returned on success carries only buf (raw is not set).
func (d *Decoder) strSlow(v value) (value, error) {
	var (
		c byte // last byte examined by the scan
		i int  // number of unflagged bytes counted from d.head
	)
readStr:
	for {
		// Restart the count for the current window.
		i = 0
		buf := d.buf[d.head:d.tail]
		// Manually unrolled scan: check eight bytes per iteration while at
		// least eight remain in the window.
		for len(buf) >= 8 {
			c = buf[0]
			if safeSet[c] != 0 {
				goto readTok
			}
			i++

			c = buf[1]
			if safeSet[c] != 0 {
				goto readTok
			}
			i++

			c = buf[2]
			if safeSet[c] != 0 {
				goto readTok
			}
			i++

			c = buf[3]
			if safeSet[c] != 0 {
				goto readTok
			}
			i++

			c = buf[4]
			if safeSet[c] != 0 {
				goto readTok
			}
			i++

			c = buf[5]
			if safeSet[c] != 0 {
				goto readTok
			}
			i++

			c = buf[6]
			if safeSet[c] != 0 {
				goto readTok
			}
			i++

			c = buf[7]
			if safeSet[c] != 0 {
				goto readTok
			}
			i++

			buf = buf[8:]
		}
		// Fewer than eight bytes left: check them one at a time.
		for _, c = range buf {
			if safeSet[c] != 0 {
				goto readTok
			}
			i++
		}

		// No flagged byte in the window: keep what was scanned before asking
		// for more input.
		// NOTE(review): presumably d.read replaces the buffered window and
		// updates d.head/d.tail; confirm against its definition.
		v.buf = append(v.buf, d.buf[d.head:d.head+i]...)
		if err := d.read(); err != nil {
			if err == io.EOF {
				return value{}, io.ErrUnexpectedEOF
			}
			return value{}, err
		}
	}
readTok:
	buf := d.buf[d.head:d.tail]
	// Bytes scanned before the flagged byte c.
	str := buf[:i]
	// Consume the scanned bytes and the flagged byte itself.
	d.head += i + 1

	switch {
	case c == '"':
		// Closing quote: the string is complete.
		return value{buf: append(v.buf, str...)}, nil
	case c == '\\':
		v.buf = append(v.buf, str...)
		// The byte after the backslash selects the escape.
		c, err := d.byte()
		if err != nil {
			return value{}, errors.Wrap(err, "next")
		}
		v, err = d.escapedChar(v, c)
		if err != nil {
			return v, errors.Wrap(err, "escape")
		}
	default:
		// Flagged by safeSet but neither a quote nor a backslash.
		return v, badToken(c)
	}
	// Escape handled: resume scanning after it.
	goto readStr
}

// StrBytes returns string value as sub-slice of internal buffer.
Expand All @@ -162,36 +258,10 @@ func (d *Decoder) Str() (string, error) {
return string(s), nil
}

// strSlow decodes a string one byte at a time, accumulating the result in v.
//
// It returns v when the closing quote is read. A backslash causes the next
// byte to be read and passed to escapedChar. Any byte below ' ' is rejected
// with badToken; every other byte is appended to v unchanged.
func (d *Decoder) strSlow(v value) (value, error) {
	for {
		cur, err := d.byte()
		if err != nil {
			return value{}, errors.Wrap(err, "next")
		}

		if cur == '"' {
			// End of string.
			return v, nil
		}

		if cur == '\\' {
			esc, err := d.byte()
			if err != nil {
				return value{}, errors.Wrap(err, "next")
			}
			if v, err = d.escapedChar(v, esc); err != nil {
				return v, errors.Wrap(err, "escape")
			}
			continue
		}

		if cur < ' ' {
			return value{}, badToken(cur)
		}
		v = v.byte(cur)
	}
}

func (d *Decoder) escapedChar(v value, c byte) (value, error) {
switch c {
switch val := escapedStrSet[c]; val {
default:
v.buf = append(v.buf, val)
case 'u':
r1, err := d.readU4()
if err != nil {
Expand Down Expand Up @@ -226,45 +296,23 @@ func (d *Decoder) escapedChar(v value, c byte) (value, error) {
} else {
v = v.rune(r1)
}
case '"':
v = v.rune('"')
case '\\':
v = v.rune('\\')
case '/':
v = v.rune('/')
case 'b':
v = v.rune('\b')
case 'f':
v = v.rune('\f')
case 'n':
v = v.rune('\n')
case 'r':
v = v.rune('\r')
case 't':
v = v.rune('\t')
default:
case 0:
return v, errors.Wrap(badToken(c), "bad escape: %w")
}
return v, nil
}

func (d *Decoder) readU4() (rune, error) {
var v rune
for i := 0; i < 4; i++ {
c, err := d.byte()
if err != nil {
return 0, err
}
switch {
case c >= '0' && c <= '9':
v = v*16 + rune(c-'0')
case c >= 'a' && c <= 'f':
v = v*16 + rune(c-'a'+10)
case c >= 'A' && c <= 'F':
v = v*16 + rune(c-'A'+10)
default:
// readU4 reads exactly four bytes via readExact4 and interprets them as
// hexadecimal digits, most significant first, returning the combined value.
// A byte with a zero hexSet entry is reported with badToken.
func (d *Decoder) readU4() (v rune, _ error) {
	var digits [4]byte
	if err := d.readExact4(&digits); err != nil {
		return 0, err
	}
	for i := 0; i < len(digits); i++ {
		nibble := hexSet[digits[i]]
		if nibble == 0 {
			return 0, badToken(digits[i])
		}
		// Table entries are offset by one; undo that while accumulating.
		v = v<<4 | rune(nibble-1)
	}
	return v, nil
}
Expand Down

0 comments on commit 4f7480a

Please sign in to comment.