alecthomas · alecthomas · Mar 4, 2020 · Feb 23, 2020 · Mar 3, 2020 · alecthomas
diff --git a/lexer.go b/lexer.go
@@ -6,7 +6,8 @@ import (
 
 var (
 	defaultOptions = &TokeniseOptions{
-		State: "root",
+		State:    "root",
+		EnsureLF: true,
 	}
 )
 
@@ -80,6 +81,10 @@ type TokeniseOptions struct {
 	State string
 	// Nested tokenisation.
 	Nested bool
+
+	// If true, all EOLs are converted into LF
+	// by replacing CRLF and CR
+	EnsureLF bool
 }
 
 // A Lexer for tokenising source code.

diff --git a/regexp.go b/regexp.go
@@ -410,6 +410,9 @@ func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator,
 	if options == nil {
 		options = defaultOptions
 	}
+	if options.EnsureLF {
+		text = ensureLF(text)
+	}
 	if !options.Nested && r.config.EnsureNL && !strings.HasSuffix(text, "\n") {
 		text += "\n"
 	}
@@ -437,3 +440,22 @@ func matchRules(text []rune, pos int, rules []*CompiledRule) (int, *CompiledRule
 	}
 	return 0, &CompiledRule{}, nil
 }
+
+// ensureLF returns text with every "\r\n" and every lone "\r" replaced by "\n".
+// It makes a single pass over the bytes instead of chaining strings.ReplaceAll
+// calls, and returns text itself, without allocating, when it has no "\r".
+func ensureLF(text string) string {
+	if !strings.Contains(text, "\r") {
+		return text
+	}
+	buf := make([]byte, 0, len(text))
+	for i := 0; i < len(text); i++ {
+		switch c := text[i]; {
+		case c != '\r':
+			buf = append(buf, c)
+		case i+1 == len(text) || text[i+1] != '\n': // lone CR; a CR before LF is dropped
+			buf = append(buf, '\n')
+		}
+	}
+	return string(buf)
+}
diff --git a/regexp_test.go b/regexp_test.go
@@ -43,3 +43,59 @@ func TestMatchingAtStart(t *testing.T) {
 		[]Token{{Punctuation, "-"}, {NameEntity, "module"}, {Whitespace, " "}, {Operator, "->"}},
 		it.Tokens())
 }
+
+// TestEnsureLFOption checks that Tokenise rewrites CRLF and lone CR to LF when
+// TokeniseOptions.EnsureLF is true and passes them through when it is false.
+func TestEnsureLFOption(t *testing.T) {
+	const input = "hello\r\nworld\r"
+	// Build a fresh Rules value for every lexer instead of sharing one.
+	rules := func() Rules {
+		return Rules{
+			"root": {
+				{`(\w+)(\r?\n|\r)`, ByGroups(Keyword, Whitespace), nil},
+			},
+		}
+	}
+	cases := []struct {
+		config   *Config
+		ensureLF bool
+		want     []Token
+	}{
+		{&Config{}, true, []Token{
+			{Keyword, "hello"}, {Whitespace, "\n"},
+			{Keyword, "world"}, {Whitespace, "\n"},
+		}},
+		{nil, false, []Token{
+			{Keyword, "hello"}, {Whitespace, "\r\n"},
+			{Keyword, "world"}, {Whitespace, "\r"},
+		}},
+	}
+	for _, c := range cases {
+		lexer := Coalesce(MustNewLexer(c.config, rules()))
+		opts := &TokeniseOptions{State: "root", EnsureLF: c.ensureLF}
+		it, err := lexer.Tokenise(opts, input)
+		assert.NoError(t, err)
+		// Token values must match exactly, line-ending bytes included.
+		assert.Equal(t, c.want, it.Tokens())
+	}
+}
+
+// TestEnsureLFFunc table-tests ensureLF; assert.Equal takes the expected value first.
+func TestEnsureLFFunc(t *testing.T) {
+	tests := []struct{ in, out string }{
+		{in: "", out: ""},
+		{in: "abc", out: "abc"},
+		{in: "\r", out: "\n"},
+		{in: "a\r", out: "a\n"},
+		{in: "\rb", out: "\nb"},
+		{in: "a\rb", out: "a\nb"},
+		{in: "\r\n", out: "\n"},
+		{in: "a\r\n", out: "a\n"},
+		{in: "\r\nb", out: "\nb"},
+		{in: "a\r\nb", out: "a\nb"},
+		{in: "\r\r\r\n\r", out: "\n\n\n\n"},
+	}
+	for _, test := range tests {
+		assert.Equal(t, test.out, ensureLF(test.in), "ensureLF(%q)", test.in)
+	}
+}