Skip to content

Commit

Permalink
Add new TokeniseOption EnsureLF (alecthomas#336)
Browse files Browse the repository at this point in the history
* Add new TokeniseOption EnsureLF

ref alecthomas#329

* Use efficient process suggested by @chmike
  • Loading branch information
satotake committed Mar 4, 2020
1 parent 3f25b5e commit e591c48
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 1 deletion.
7 changes: 6 additions & 1 deletion lexer.go
Expand Up @@ -6,7 +6,8 @@ import (

var (
defaultOptions = &TokeniseOptions{
State: "root",
State: "root",
EnsureLF: true,
}
)

Expand Down Expand Up @@ -80,6 +81,10 @@ type TokeniseOptions struct {
State string
// Nested tokenisation.
Nested bool

// If true, all EOLs are converted into LF
// by replacing CRLF and CR
EnsureLF bool
}

// A Lexer for tokenising source code.
Expand Down
22 changes: 22 additions & 0 deletions regexp.go
Expand Up @@ -410,6 +410,9 @@ func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator,
if options == nil {
options = defaultOptions
}
if options.EnsureLF {
text = ensureLF(text)
}
if !options.Nested && r.config.EnsureNL && !strings.HasSuffix(text, "\n") {
text += "\n"
}
Expand Down Expand Up @@ -437,3 +440,22 @@ func matchRules(text []rune, pos int, rules []*CompiledRule) (int, *CompiledRule
}
return 0, &CompiledRule{}, nil
}

// ensureLF normalises every line ending in text to a single LF: each
// CRLF pair and each lone CR becomes "\n". One pass, one allocation —
// equivalent to strings.NewReplacer("\r\n", "\n", "\r", "\n") but cheaper.
func ensureLF(text string) string {
	out := make([]byte, len(text))
	n := 0
	for i := 0; i < len(text); i++ {
		ch := text[i]
		switch {
		case ch != '\r':
			out[n] = ch
			n++
		case i+1 == len(text) || text[i+1] != '\n':
			// Lone CR: rewrite it as LF. A CR that is followed by LF is
			// dropped instead; the LF itself is copied on the next iteration.
			out[n] = '\n'
			n++
		}
	}
	return string(out[:n])
}
56 changes: 56 additions & 0 deletions regexp_test.go
Expand Up @@ -43,3 +43,59 @@ func TestMatchingAtStart(t *testing.T) {
[]Token{{Punctuation, "-"}, {NameEntity, "module"}, {Whitespace, " "}, {Operator, "->"}},
it.Tokens())
}

// TestEnsureLFOption verifies that Tokenise rewrites CR and CRLF line
// endings to LF when TokeniseOptions.EnsureLF is set, and leaves the
// original line endings untouched when it is not.
func TestEnsureLFOption(t *testing.T) {
	tests := []struct {
		config   *Config
		ensureLF bool
		expected []Token
	}{
		{
			config:   &Config{},
			ensureLF: true,
			expected: []Token{
				{Keyword, "hello"},
				{Whitespace, "\n"},
				{Keyword, "world"},
				{Whitespace, "\n"},
			},
		},
		{
			config:   nil,
			ensureLF: false,
			expected: []Token{
				{Keyword, "hello"},
				{Whitespace, "\r\n"},
				{Keyword, "world"},
				{Whitespace, "\r"},
			},
		},
	}
	for _, test := range tests {
		lexer := Coalesce(MustNewLexer(test.config, Rules{
			"root": {
				{`(\w+)(\r?\n|\r)`, ByGroups(Keyword, Whitespace), nil},
			},
		}))
		it, err := lexer.Tokenise(&TokeniseOptions{
			State:    "root",
			EnsureLF: test.ensureLF,
		}, "hello\r\nworld\r")
		assert.NoError(t, err)
		assert.Equal(t, test.expected, it.Tokens())
	}
}

// TestEnsureLFFunc exercises ensureLF directly over the interesting
// line-ending combinations: empty input, no EOL at all, lone CR, CRLF,
// each at the start/middle/end of the string, and runs of mixed endings.
func TestEnsureLFFunc(t *testing.T) {
	tests := []struct{ in, out string }{
		{in: "", out: ""},
		{in: "abc", out: "abc"},
		{in: "\r", out: "\n"},
		{in: "a\r", out: "a\n"},
		{in: "\rb", out: "\nb"},
		{in: "a\rb", out: "a\nb"},
		{in: "\r\n", out: "\n"},
		{in: "a\r\n", out: "a\n"},
		{in: "\r\nb", out: "\nb"},
		{in: "a\r\nb", out: "a\nb"},
		{in: "\r\r\r\n\r", out: "\n\n\n\n"},
	}
	for _, test := range tests {
		out := ensureLF(test.in)
		// assert.Equal's signature is (t, expected, actual); the arguments
		// were previously swapped, which inverts the expected/actual labels
		// in failure messages without affecting pass/fail for string equality.
		assert.Equal(t, test.out, out)
	}
}

0 comments on commit e591c48

Please sign in to comment.