From 8b64523e3bc8a00fd520c72c13fad53dff9f8216 Mon Sep 17 00:00:00 2001 From: SatowTakeshi Date: Sun, 23 Feb 2020 19:23:43 +0900 Subject: [PATCH 1/2] Add new TokeniseOption EnsureLF ref #329 --- lexer.go | 7 ++++++- regexp.go | 4 ++++ regexp_test.go | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 46 insertions(+), 1 deletion(-) diff --git a/lexer.go b/lexer.go index a6ae84b2b..1269d338b 100644 --- a/lexer.go +++ b/lexer.go @@ -6,7 +6,8 @@ import ( var ( defaultOptions = &TokeniseOptions{ - State: "root", + State: "root", + EnsureLF: true, } ) @@ -80,6 +81,10 @@ type TokeniseOptions struct { State string // Nested tokenisation. Nested bool + + // If true, all EOLs are converted into LF + // by replacing CRLF and CR + EnsureLF bool } // A Lexer for tokenising source code. diff --git a/regexp.go b/regexp.go index 7c2fb0bb8..f20f88c3b 100644 --- a/regexp.go +++ b/regexp.go @@ -410,6 +410,10 @@ func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, if options == nil { options = defaultOptions } + if options.EnsureLF { + text = strings.ReplaceAll(text, "\r\n", "\n") + text = strings.ReplaceAll(text, "\r", "\n") + } if !options.Nested && r.config.EnsureNL && !strings.HasSuffix(text, "\n") { text += "\n" } diff --git a/regexp_test.go b/regexp_test.go index 0ac7715e3..044ba8dd9 100644 --- a/regexp_test.go +++ b/regexp_test.go @@ -43,3 +43,39 @@ func TestMatchingAtStart(t *testing.T) { []Token{{Punctuation, "-"}, {NameEntity, "module"}, {Whitespace, " "}, {Operator, "->"}}, it.Tokens()) } + +func TestEnsureLF(t *testing.T) { + l := Coalesce(MustNewLexer(&Config{}, Rules{ + "root": { + {`(\w+)(\r?\n|\r)`, ByGroups(Keyword, Whitespace), nil}, + }, + })) + it, err := l.Tokenise(&TokeniseOptions{ + State: "root", + EnsureLF: true, + }, "hello\r\nworld\r") + assert.NoError(t, err) + assert.Equal(t, []Token{ + {Keyword, "hello"}, + {Whitespace, "\n"}, + {Keyword, "world"}, + {Whitespace, "\n"}, + }, it.Tokens()) + + l = Coalesce(MustNewLexer(nil, Rules{ + "root": { + {`(\w+)(\r?\n|\r)`, ByGroups(Keyword, Whitespace), nil}, + }, + })) + it, err = l.Tokenise(&TokeniseOptions{ + State: "root", + EnsureLF: false, + }, "hello\r\nworld\r") + assert.NoError(t, err) + assert.Equal(t, []Token{ + {Keyword, "hello"}, + {Whitespace, "\r\n"}, + {Keyword, "world"}, + {Whitespace, "\r"}, + }, it.Tokens()) +} From c829e8f0c2ca610060bd37cf4a6ea91cf1e341d2 Mon Sep 17 00:00:00 2001 From: SatowTakeshi Date: Tue, 3 Mar 2020 21:50:19 +0900 Subject: [PATCH 2/2] Use efficient process suggested by @chmike --- regexp.go | 22 ++++++++++++++++++++-- regexp_test.go | 22 +++++++++++++++++++++- 2 files changed, 41 insertions(+), 3 deletions(-) diff --git a/regexp.go b/regexp.go index f20f88c3b..d13d58d69 100644 --- a/regexp.go +++ b/regexp.go @@ -411,8 +411,7 @@ func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, options = defaultOptions } if options.EnsureLF { - text = strings.ReplaceAll(text, "\r\n", "\n") - text = strings.ReplaceAll(text, "\r", "\n") + text = ensureLF(text) } if !options.Nested && r.config.EnsureNL && !strings.HasSuffix(text, "\n") { text += "\n" @@ -441,3 +440,22 @@ func matchRules(text []rune, pos int, rules []*CompiledRule) (int, *CompiledRule } return 0, &CompiledRule{}, nil } + +// replace \r and \r\n with \n +// same as strings.ReplaceAll but more efficient +func ensureLF(text string) string { + buf := make([]byte, len(text)) + var j int + for i := 0; i < len(text); i++ { + c := text[i] + if c == '\r' { + if i < len(text)-1 && text[i+1] == '\n' { + continue + } + c = '\n' + } + buf[j] = c + j++ + } + return string(buf[:j]) +} diff --git a/regexp_test.go b/regexp_test.go index 044ba8dd9..a40f3e06a 100644 --- a/regexp_test.go +++ b/regexp_test.go @@ -44,7 +44,7 @@ func TestMatchingAtStart(t *testing.T) { it.Tokens()) } -func TestEnsureLF(t *testing.T) { +func TestEnsureLFOption(t *testing.T) { l := Coalesce(MustNewLexer(&Config{}, Rules{ "root": { {`(\w+)(\r?\n|\r)`, ByGroups(Keyword, Whitespace), nil}, @@ -79,3 +79,23 @@ func TestEnsureLF(t *testing.T) { {Whitespace, "\r"}, }, it.Tokens()) } + +func TestEnsureLFFunc(t *testing.T) { + tests := []struct{ in, out string }{ + {in: "", out: ""}, + {in: "abc", out: "abc"}, + {in: "\r", out: "\n"}, + {in: "a\r", out: "a\n"}, + {in: "\rb", out: "\nb"}, + {in: "a\rb", out: "a\nb"}, + {in: "\r\n", out: "\n"}, + {in: "a\r\n", out: "a\n"}, + {in: "\r\nb", out: "\nb"}, + {in: "a\r\nb", out: "a\nb"}, + {in: "\r\r\r\n\r", out: "\n\n\n\n"}, + } + for _, test := range tests { + out := ensureLF(test.in) + assert.Equal(t, out, test.out) + } +}