diff --git a/lexer.go b/lexer.go
index a6ae84b2b..1269d338b 100644
--- a/lexer.go
+++ b/lexer.go
@@ -6,7 +6,8 @@ import (
 )
 
 var (
 	defaultOptions = &TokeniseOptions{
-		State: "root",
+		State:    "root",
+		EnsureLF: true,
 	}
 )
@@ -80,6 +81,10 @@ type TokeniseOptions struct {
 	State string
 	// Nested tokenisation.
 	Nested bool
+
+	// If true, all EOLs are converted into LF
+	// by replacing CRLF and CR
+	EnsureLF bool
 }
 
 // A Lexer for tokenising source code.
diff --git a/regexp.go b/regexp.go
index 7c2fb0bb8..d13d58d69 100644
--- a/regexp.go
+++ b/regexp.go
@@ -410,6 +410,9 @@ func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator,
 	if options == nil {
 		options = defaultOptions
 	}
+	if options.EnsureLF {
+		text = ensureLF(text)
+	}
 	if !options.Nested && r.config.EnsureNL && !strings.HasSuffix(text, "\n") {
 		text += "\n"
 	}
@@ -437,3 +440,22 @@ func matchRules(text []rune, pos int, rules []*CompiledRule) (int, *CompiledRule
 	}
 	return 0, &CompiledRule{}, nil
 }
+
+// ensureLF replaces CR and CRLF line endings with LF.
+// Same result as strings.ReplaceAll, but in a single pass.
+func ensureLF(text string) string {
+	buf := make([]byte, len(text))
+	var j int
+	for i := 0; i < len(text); i++ {
+		c := text[i]
+		if c == '\r' {
+			if i < len(text)-1 && text[i+1] == '\n' {
+				continue
+			}
+			c = '\n'
+		}
+		buf[j] = c
+		j++
+	}
+	return string(buf[:j])
+}
diff --git a/regexp_test.go b/regexp_test.go
index 0ac7715e3..a40f3e06a 100644
--- a/regexp_test.go
+++ b/regexp_test.go
@@ -43,3 +43,59 @@ func TestMatchingAtStart(t *testing.T) {
 		[]Token{{Punctuation, "-"}, {NameEntity, "module"}, {Whitespace, " "}, {Operator, "->"}},
 		it.Tokens())
 }
+
+func TestEnsureLFOption(t *testing.T) {
+	l := Coalesce(MustNewLexer(&Config{}, Rules{
+		"root": {
+			{`(\w+)(\r?\n|\r)`, ByGroups(Keyword, Whitespace), nil},
+		},
+	}))
+	it, err := l.Tokenise(&TokeniseOptions{
+		State:    "root",
+		EnsureLF: true,
+	}, "hello\r\nworld\r")
+	assert.NoError(t, err)
+	assert.Equal(t, []Token{
+		{Keyword, "hello"},
+		{Whitespace, "\n"},
+		{Keyword, "world"},
+		{Whitespace, "\n"},
+	}, it.Tokens())
+
+	l = Coalesce(MustNewLexer(nil, Rules{
+		"root": {
+			{`(\w+)(\r?\n|\r)`, ByGroups(Keyword, Whitespace), nil},
+		},
+	}))
+	it, err = l.Tokenise(&TokeniseOptions{
+		State:    "root",
+		EnsureLF: false,
+	}, "hello\r\nworld\r")
+	assert.NoError(t, err)
+	assert.Equal(t, []Token{
+		{Keyword, "hello"},
+		{Whitespace, "\r\n"},
+		{Keyword, "world"},
+		{Whitespace, "\r"},
+	}, it.Tokens())
+}
+
+func TestEnsureLFFunc(t *testing.T) {
+	tests := []struct{ in, out string }{
+		{in: "", out: ""},
+		{in: "abc", out: "abc"},
+		{in: "\r", out: "\n"},
+		{in: "a\r", out: "a\n"},
+		{in: "\rb", out: "\nb"},
+		{in: "a\rb", out: "a\nb"},
+		{in: "\r\n", out: "\n"},
+		{in: "a\r\n", out: "a\n"},
+		{in: "\r\nb", out: "\nb"},
+		{in: "a\r\nb", out: "a\nb"},
+		{in: "\r\r\r\n\r", out: "\n\n\n\n"},
+	}
+	for _, test := range tests {
+		out := ensureLF(test.in)
+		assert.Equal(t, test.out, out)
+	}
+}