Skip to content

Commit

Permalink
utf8: use lookup table to validate ASCII (#654)
Browse files Browse the repository at this point in the history
  • Loading branch information
pelletier committed Nov 4, 2021
1 parent 3dbca20 commit 6617e7e
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 1 deletion.
19 changes: 19 additions & 0 deletions parser_test.go
@@ -1,6 +1,8 @@
package toml

import (
"strconv"
"strings"
"testing"

"github.com/pelletier/go-toml/v2/internal/ast"
Expand Down Expand Up @@ -371,6 +373,23 @@ func BenchmarkParseBasicStringWithUnicode(b *testing.B) {
})
}

// BenchmarkParseBasicStringsEasy measures parseBasicString on double-quoted
// inputs whose content is only the letter "A", at several short lengths. Each
// length runs as a sub-benchmark named after its size.
func BenchmarkParseBasicStringsEasy(b *testing.B) {
	p := &parser{}
	lengths := []int{1, 4, 8, 16, 21}

	for idx := range lengths {
		n := lengths[idx]
		b.Run(strconv.Itoa(n), func(b *testing.B) {
			// Build the quoted payload once, outside the measured loop.
			quoted := []byte(`"` + strings.Repeat("A", n) + `"`)

			b.ReportAllocs()
			b.SetBytes(int64(len(quoted)))

			for iter := 0; iter < b.N; iter++ {
				p.parseBasicString(quoted)
			}
		})
	}
}

func TestParser_AST_DateTimes(t *testing.T) {
examples := []struct {
desc string
Expand Down
39 changes: 38 additions & 1 deletion utf8.go
Expand Up @@ -140,8 +140,45 @@ func utf8ValidNext(p []byte) int {
return size
}

// invalidAsciiTable is indexed by byte value; an entry is true when that byte
// is flagged as invalid. Entries not listed keep the zero value (false): TAB
// (0x09), LF (0x0A), CR (0x0D), 0x20-0x7E, and every byte >= 0x80.
var invalidAsciiTable = [256]bool{
	// Control characters below TAB.
	0x00: true, 0x01: true, 0x02: true, 0x03: true, 0x04: true,
	0x05: true, 0x06: true, 0x07: true, 0x08: true,
	// 0x09 (TAB) and 0x0A (LF) are left unset.
	0x0B: true, 0x0C: true,
	// 0x0D (CR) is left unset.
	0x0E: true, 0x0F: true,
	0x10: true, 0x11: true, 0x12: true, 0x13: true,
	0x14: true, 0x15: true, 0x16: true, 0x17: true,
	0x18: true, 0x19: true, 0x1A: true, 0x1B: true,
	0x1C: true, 0x1D: true, 0x1E: true, 0x1F: true,
	// 0x20 - 0x7E (printable ASCII) are left unset.
	0x7F: true, // DEL
}

// invalidAscii reports whether b is flagged in invalidAsciiTable: the control
// characters 0x00-0x1F other than TAB (0x09), LF (0x0A) and CR (0x0D), plus
// DEL (0x7F). Every other byte, including values >= 0x80, yields false.
//
// A single table lookup replaces the earlier chain of range comparisons; the
// two forms give the same answer for all 256 byte values.
func invalidAscii(b byte) bool {
	return invalidAsciiTable[b]
}

// acceptRange gives the range of valid values for the second byte in a UTF-8
Expand Down

0 comments on commit 6617e7e

Please sign in to comment.