Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Avoid iterating over the source multiple times #936

Merged
merged 1 commit into from
Apr 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
60 changes: 54 additions & 6 deletions common/runes/buffer.go
Original file line number Diff line number Diff line change
Expand Up @@ -127,20 +127,48 @@ var nilBuffer = &emptyBuffer{}
// elements of the byte or uint16 array, and continue. The underlying storage is a rune array
// containing any Unicode character.
func NewBuffer(data string) Buffer {
	// Passing false tells newBuffer not to collect line offsets, so the
	// second result is simply discarded.
	b, _ := newBuffer(data, false)
	return b
}

// NewBufferAndLineOffsets returns an efficient implementation of Buffer for the given text based on
// the ranges of the encoded code points contained within, as well as returning the line offsets.
//
// Code points are represented as an array of byte, uint16, or rune. This approach ensures that
// each index represents a code point by itself without needing to use an array of rune. At first
// we assume all code points are less than or equal to '\u007f'. If this holds true, the
// underlying storage is a byte array containing only ASCII characters. If we encounter a code
// point above this range but less than or equal to '\uffff' we allocate a uint16 array, copy the
// elements of previous byte array to the uint16 array, and continue. If this holds true, the
// underlying storage is a uint16 array containing only Unicode characters in the Basic Multilingual
// Plane. If we encounter a code point above '\uffff' we allocate a rune array, copy the previous
// elements of the byte or uint16 array, and continue. The underlying storage is a rune array
// containing any Unicode character.
func NewBufferAndLineOffsets(data string) (Buffer, []int32) {
	// Passing true asks newBuffer to record line offsets while it decodes data.
	buf, offsets := newBuffer(data, true)
	return buf, offsets
}

func newBuffer(data string, lines bool) (Buffer, []int32) {
if len(data) == 0 {
return nilBuffer
return nilBuffer, []int32{0}
}
var (
idx = 0
buf8 = make([]byte, 0, len(data))
idx = 0
off int32 = 0
buf8 = make([]byte, 0, len(data))
buf16 []uint16
buf32 []rune
offs []int32
)
for idx < len(data) {
r, s := utf8.DecodeRuneInString(data[idx:])
idx += s
if lines && r == '\n' {
offs = append(offs, off+1)
}
if r < utf8.RuneSelf {
buf8 = append(buf8, byte(r))
off++
continue
}
if r <= 0xffff {
Expand All @@ -150,6 +178,7 @@ func NewBuffer(data string) Buffer {
}
buf8 = nil
buf16 = append(buf16, uint16(r))
off++
goto copy16
}
buf32 = make([]rune, len(buf8), len(data))
Expand All @@ -158,17 +187,25 @@ func NewBuffer(data string) Buffer {
}
buf8 = nil
buf32 = append(buf32, r)
off++
goto copy32
}
if lines {
offs = append(offs, off+1)
}
return &asciiBuffer{
arr: buf8,
}
}, offs
copy16:
for idx < len(data) {
r, s := utf8.DecodeRuneInString(data[idx:])
idx += s
if lines && r == '\n' {
offs = append(offs, off+1)
}
if r <= 0xffff {
buf16 = append(buf16, uint16(r))
off++
continue
}
buf32 = make([]rune, len(buf16), len(data))
Expand All @@ -177,18 +214,29 @@ copy16:
}
buf16 = nil
buf32 = append(buf32, r)
off++
goto copy32
}
if lines {
offs = append(offs, off+1)
}
return &basicBuffer{
arr: buf16,
}
}, offs
copy32:
for idx < len(data) {
r, s := utf8.DecodeRuneInString(data[idx:])
idx += s
if lines && r == '\n' {
offs = append(offs, off+1)
}
buf32 = append(buf32, r)
off++
}
if lines {
offs = append(offs, off+1)
}
return &supplementalBuffer{
arr: buf32,
}
}, offs
}
15 changes: 3 additions & 12 deletions common/source.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,6 @@
package common

import (
"strings"
"unicode/utf8"

"github.com/google/cel-go/common/runes"

exprpb "google.golang.org/genproto/googleapis/api/expr/v1alpha1"
Expand Down Expand Up @@ -80,17 +77,11 @@ func NewTextSource(text string) Source {
// NewStringSource creates a new Source from the given contents and description.
func NewStringSource(contents string, description string) Source {
// Compute line offsets up front as they are referred to frequently.
lines := strings.Split(contents, "\n")
offsets := make([]int32, len(lines))
var offset int32
for i, line := range lines {
offset = offset + int32(utf8.RuneCountInString(line)) + 1
offsets[int32(i)] = offset
}
buf, offs := runes.NewBufferAndLineOffsets(contents)
return &sourceImpl{
Buffer: runes.NewBuffer(contents),
Buffer: buf,
description: description,
lineOffsets: offsets,
lineOffsets: offs,
}
}

Expand Down