Implement syntax highlighting for p4

Add "p4" syntax language (https://p4.org/)
aretext · Jan 9, 2024 · 94672f5 · 94672f5
1 parent 06f072b
commit 94672f5
Show file tree

Hide file tree

Showing 6 changed files with 426 additions and 85 deletions.
diff --git a/docs/config-reference.md b/docs/config-reference.md
@@ -23,23 +23,24 @@ Syntax Languages
 
 | Value        | Description                                                                              |
 |--------------|------------------------------------------------------------------------------------------|
-| plaintext    | Do not apply any syntax highlighting.                                                    |
-| json         | [JSON](https://www.json.org/json-en.html)                                                |
-| yaml         | [YAML](https://yaml.org/spec/)                                                           |
-| go           | [Go](https://golang.org/ref/spec)                                                        |
-| python       | [Python](https://docs.python.org/3/reference/)                                           |
-| rust         | [Rust](https://doc.rust-lang.org/stable/reference/)                                      |
-| c            | [C](http://www.gnu.org/software/gnu-c-manual/gnu-c-manual.html)                          |
 | bash         | [bash](https://www.gnu.org/software/bash/manual/bash.html)                               |
-| makefile     | [Makefile](https://www.gnu.org/software/make/manual/make.html)                           |
-| xml          | [xml](https://www.w3.org/TR/2006/REC-xml11-20060816/)                                    |
-| markdown     | [Markdown](https://commonmark.org/)                                                      |
+| c            | [C](http://www.gnu.org/software/gnu-c-manual/gnu-c-manual.html)                          |
 | criticmarkup | [CriticMarkup](https://github.com/CriticMarkup/CriticMarkup-toolkit)                     |
-| protobuf     | [Protocol Buffers Version 3](https://developers.google.com/protocol-buffers/docs/proto3) |
-| todotxt      | [todo.txt](https://github.com/todotxt/todo.txt)                                          |
 | gitcommit    | Format for editing a git commit                                                          |
 | gitrebase    | Format for git interactive rebase                                                        |
+| go           | [Go](https://golang.org/ref/spec)                                                        |
 | gotemplate   | [Go template](https://pkg.go.dev/text/template)                                          |
+| json         | [JSON](https://www.json.org/json-en.html)                                                |
+| makefile     | [Makefile](https://www.gnu.org/software/make/manual/make.html)                           |
+| markdown     | [Markdown](https://commonmark.org/)                                                      |
+| p4           | [p4](https://p4.org)                                                                     |
+| plaintext    | Do not apply any syntax highlighting.                                                    |
+| protobuf     | [Protocol Buffers Version 3](https://developers.google.com/protocol-buffers/docs/proto3) |
+| python       | [Python](https://docs.python.org/3/reference/)                                           |
+| rust         | [Rust](https://doc.rust-lang.org/stable/reference/)                                      |
+| todotxt      | [todo.txt](https://github.com/todotxt/todo.txt)                                          |
+| xml          | [xml](https://www.w3.org/TR/2006/REC-xml11-20060816/)                                    |
+| yaml         | [YAML](https://yaml.org/spec/)                                                           |
 
 Menu Command Object
 -------------------

diff --git a/syntax/languages/c.go b/syntax/languages/c.go
@@ -1,7 +1,6 @@
 package languages
 
 import (
-	"io"
 	"unicode"
 
 	"github.com/aretext/aretext/syntax/parser"
@@ -37,79 +36,11 @@ func cCommentParseFunc() parser.Func {
 }
 
 func cPreprocessorDirective() parser.Func {
-	// Consume leading '#' with optional whitespace after.
-	consumeStartOfDirective := func(iter parser.TrackingRuneIter, state parser.State) parser.Result {
-		var numConsumed uint64
-		var sawHashmark bool
-		for {
-			r, err := iter.NextRune()
-			if err == io.EOF {
-				break
-			} else if err != nil {
-				return parser.FailedResult
-			}
-
-			if r == '#' && !sawHashmark {
-				sawHashmark = true
-				numConsumed++
-			} else if sawHashmark && (r == ' ' || r == '\t') {
-				numConsumed++
-			} else {
-				break
-			}
-		}
-
-		if !sawHashmark {
-			return parser.FailedResult
-		}
-
-		return parser.Result{
-			NumConsumed: numConsumed,
-			NextState:   state,
-		}
+	directives := []string{
+		"include", "pragma", "ifndef", "define", "error", "undef",
+		"endif", "ifdef", "elif", "else", "if",
 	}
-
-	// Consume to the end of line or EOF, unless the line ends with a backslash.
-	consumeToEndOfDirective := func(iter parser.TrackingRuneIter, state parser.State) parser.Result {
-		var numConsumed uint64
-		var lastWasBackslash bool
-		for {
-			r, err := iter.NextRune()
-			if err == io.EOF {
-				break
-			} else if err != nil {
-				return parser.FailedResult
-			}
-
-			numConsumed++
-
-			if r == '\n' && !lastWasBackslash {
-				break
-			}
-			lastWasBackslash = (r == '\\')
-		}
-		return parser.Result{
-			NumConsumed: numConsumed,
-			NextState:   state,
-		}
-	}
-
-	return parser.Func(consumeStartOfDirective).
-		Then(consumeString("include").
-			Or(consumeString("pragma")).
-			Or(consumeString("ifndef")).
-			Or(consumeString("define")).
-			Or(consumeString("error")).
-			Or(consumeString("undef")).
-			Or(consumeString("endif")).
-			Or(consumeString("ifdef")).
-			Or(consumeString("elif")).
-			Or(consumeString("else")).
-			Or(consumeString("if"))).
-		ThenNot(consumeSingleRuneLike(func(r rune) bool {
-			return !unicode.IsSpace(r) // must be followed by space, newline, or EOF
-		})).
-		ThenMaybe(consumeToEndOfDirective).
+	return consumeCStylePreprocessorDirective(directives).
 		Map(recognizeToken(cTokenRolePreprocessorDirective))
 }
 

diff --git a/syntax/languages/helpers.go b/syntax/languages/helpers.go
@@ -4,6 +4,7 @@ import (
 	"io"
 	"sort"
 	"strings"
+	"unicode"
 	"unicode/utf8"
 
 	"github.com/aretext/aretext/syntax/parser"
@@ -367,3 +368,71 @@ func parseCStyleString(quoteRune rune, allowLineBreaks bool) parser.Func {
 	return consumeCStyleString(quoteRune, allowLineBreaks).
 		Map(recognizeToken(parser.TokenRoleString))
 }
+
+// consumeCStylePreprocessorDirective consumes a C-style preprocessor directive
+// such as "#include <stdio.h>": a '#', optional spaces or tabs, one of the
+// given directive names (which must not be followed by a non-space rune), and
+// then the remainder of the line, continuing past any newline that is
+// immediately preceded by a backslash.
+//
+// It only consumes runes and does not assign a token role. Each language
+// applies Map(recognizeToken(...)) with its own role to the returned func,
+// the same way parseCStyleString wraps consumeCStyleString.
+func consumeCStylePreprocessorDirective(directives []string) parser.Func {
+	// Consume leading '#' with optional whitespace after.
+	consumeStartOfDirective := func(iter parser.TrackingRuneIter, state parser.State) parser.Result {
+		var numConsumed uint64
+		var sawHashmark bool
+		for {
+			r, err := iter.NextRune()
+			if err == io.EOF {
+				break
+			} else if err != nil {
+				return parser.FailedResult
+			}
+
+			if r == '#' && !sawHashmark {
+				// Only the first rune can be the hashmark; a second '#' ends the scan.
+				sawHashmark = true
+				numConsumed++
+			} else if sawHashmark && (r == ' ' || r == '\t') {
+				numConsumed++
+			} else {
+				break
+			}
+		}
+
+		if !sawHashmark {
+			return parser.FailedResult
+		}
+
+		return parser.Result{
+			NumConsumed: numConsumed,
+			NextState:   state,
+		}
+	}
+
+	// Consume to the end of line or EOF, unless the line ends with a backslash.
+	consumeToEndOfDirective := func(iter parser.TrackingRuneIter, state parser.State) parser.Result {
+		var numConsumed uint64
+		var lastWasBackslash bool
+		for {
+			r, err := iter.NextRune()
+			if err == io.EOF {
+				break
+			} else if err != nil {
+				return parser.FailedResult
+			}
+
+			// The terminating line feed is counted as part of the directive.
+			numConsumed++
+
+			if r == '\n' && !lastWasBackslash {
+				break
+			}
+			lastWasBackslash = (r == '\\')
+		}
+		return parser.Result{
+			NumConsumed: numConsumed,
+			NextState:   state,
+		}
+	}
+
+	return parser.Func(consumeStartOfDirective).
+		Then(consumeLongestMatchingOption(directives)).
+		ThenNot(consumeSingleRuneLike(func(r rune) bool {
+			return !unicode.IsSpace(r) // must be followed by space, newline, or EOF
+		})).
+		ThenMaybe(consumeToEndOfDirective)
+}
diff --git a/syntax/languages/p4.go b/syntax/languages/p4.go
@@ -0,0 +1,163 @@
+package languages
+
+import "github.com/aretext/aretext/syntax/parser"
+
+const (
+	p4TokenRolePreprocessorDirective = parser.TokenRoleCustom1
+	p4TokenRoleAnnotation            = parser.TokenRoleCustom2
+)
+
+// P4ParseFunc returns a parse func for the P4-16 language.
+// The language spec is at https://p4.org/p4-spec/docs/P4-16-v1.0.0-spec.html
+// and the spec repository's p4.json provides syntax highlighting rules:
+// https://github.com/p4lang/p4-spec/blob/c84896fcd87f940983648b185ef9acf2b6f14838/p4-16/spec/p4.json
+func P4ParseFunc() parser.Func {
+	// Chain the remaining alternatives onto the comment parser with Or,
+	// keeping them in this order.
+	parseFunc := p4CommentParseFunc()
+	for _, alternative := range []parser.Func{
+		p4PreprocessorDirectiveParseFunc(),
+		p4AnnotationParseFunc(),
+		p4IdentifierOrKeywordParseFunc(),
+		p4OperatorParseFunc(),
+		p4StringParseFunc(),
+		p4NumberParseFunc(),
+	} {
+		parseFunc = parseFunc.Or(alternative)
+	}
+	return parseFunc
+}
+
+// p4CommentParseFunc recognizes "//" line comments and "/* ... */" block
+// comments, tagging either kind with parser.TokenRoleComment.
+func p4CommentParseFunc() parser.Func {
+	lineComment := consumeString("//").ThenMaybe(consumeToNextLineFeed)
+	blockComment := consumeString("/*").Then(consumeToString("*/"))
+	anyComment := lineComment.Or(blockComment)
+	return anyComment.Map(recognizeToken(parser.TokenRoleComment))
+}
+
+// p4PreprocessorDirectiveParseFunc recognizes C-style preprocessor directives
+// (a '#' followed by one of the directive names below) and tags them with
+// p4TokenRolePreprocessorDirective.
+//
+// "else" and "elif" are included because the P4-16 spec lists #else and
+// #elif among the preprocessor directives that compilers should support.
+func p4PreprocessorDirectiveParseFunc() parser.Func {
+	directives := []string{
+		"include", "if", "else", "elif", "endif", "ifdef",
+		"define", "ifndef", "undef", "line",
+	}
+	return consumeCStylePreprocessorDirective(directives).
+		Map(recognizeToken(p4TokenRolePreprocessorDirective))
+}
+
+// p4AnnotationParseFunc recognizes the annotations listed below, written as
+// '@' immediately followed by the annotation name, and tags them with
+// p4TokenRoleAnnotation.
+//
+// The name must end at an identifier boundary, so a longer annotation that
+// merely starts with a listed name (for example "@namespace", which starts
+// with "@name") is not partially recognized.
+func p4AnnotationParseFunc() parser.Func {
+	annotations := []string{
+		"atomic", "defaultonly", "deprecated", "name", "noSideEffects", "noWarn",
+		"optional", "priority", "pure", "tableonly", "hidden", "globalname",
+	}
+
+	// Same rune class that p4IdentifierOrKeywordParseFunc accepts after the
+	// first rune of an identifier.
+	isIdentifierRune := func(r rune) bool {
+		return (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z') ||
+			(r >= '0' && r <= '9') || r == '_' || r == '$'
+	}
+
+	return consumeString("@").
+		Then(consumeLongestMatchingOption(annotations)).
+		ThenNot(consumeSingleRuneLike(isIdentifierRune)).
+		Map(recognizeToken(p4TokenRoleAnnotation))
+}
+
+// p4IdentifierOrKeywordParseFunc consumes an identifier (an ASCII letter, '_'
+// or '$', optionally followed by more of those or ASCII digits) and hands the
+// consumed input to recognizeKeywordOrConsume together with the keyword list.
+func p4IdentifierOrKeywordParseFunc() parser.Func {
+	reservedWords := []string{
+		"abstract", "action", "apply", "control", "default", "else",
+		"extern", "exit", "false", "if",
+		"package", "parser", "return", "select", "state", "switch",
+		"table", "this", "transition", "true", "type", "typedef", "value_set", "verify",
+		"bool", "bit", "const", "enum", "entries", "error", "header", "header_union",
+		"in", "inout", "int", "list", "match_kind", "out", "string", "tuple", "struct", "varbit", "void",
+	}
+
+	// Runes allowed at the start of an identifier.
+	isLetterLike := func(r rune) bool {
+		switch {
+		case r >= 'a' && r <= 'z', r >= 'A' && r <= 'Z':
+			return true
+		default:
+			return r == '_' || r == '$'
+		}
+	}
+
+	// Runes allowed after the first rune of an identifier.
+	isLetterLikeOrDigit := func(r rune) bool {
+		if isLetterLike(r) {
+			return true
+		}
+		return r >= '0' && r <= '9'
+	}
+
+	firstRune := consumeSingleRuneLike(isLetterLike)
+	remainingRunes := consumeRunesLike(isLetterLikeOrDigit)
+	return firstRune.
+		ThenMaybe(remainingRunes).
+		MapWithInput(recognizeKeywordOrConsume(reservedWords))
+}
+
+// p4OperatorParseFunc recognizes P4 operators by passing the operator list to
+// consumeLongestMatchingOption and tagging the match with
+// parser.TokenRoleOperator.
+func p4OperatorParseFunc() parser.Func {
+	operators := []string{
+		"=", ">", "<", "!", "~", "?", ":",
+		"==", "<=", ">=", "!=", "&&", "||", "++",
+		"+", "-", "*", "/", "&", "|", "^", "%", "<<",
+		">>", "&&&", "..",
+	}
+	consumeOperator := consumeLongestMatchingOption(operators)
+	return consumeOperator.Map(recognizeToken(parser.TokenRoleOperator))
+}
+
+func p4StringParseFunc() parser.Func {
+	return parseCStyleString('"', false)
+}
+
+// p4NumberParseFunc recognizes P4 integer literals and tags them with
+// parser.TokenRoleNumber. A literal is one of "0x"/"0o"/"0b"/"0d" (either
+// case) followed by digits of that radix, or a plain decimal number; digits
+// may be interleaved with underscores. A width prefix (decimal digits
+// followed by 'w' or 's') is combined with the literal via MaybeBefore.
+func p4NumberParseFunc() parser.Func {
+	// NOTE: the number regex patterns in the spec's syntax highlighting definition (p4.json)
+	// differ from the spec itself (P4-16-spec.mdk). Follow the latter here.
+
+	// digitRun consumes a run of underscores and runes accepted by isDigit,
+	// stopping at the first other rune or at any iterator error.
+	// It fails unless the run contains at least one digit.
+	digitRun := func(isDigit func(r rune) bool) parser.Func {
+		return func(iter parser.TrackingRuneIter, state parser.State) parser.Result {
+			var digits, underscores uint64
+		scan:
+			for {
+				r, err := iter.NextRune()
+				switch {
+				case err != nil:
+					break scan
+				case r == '_':
+					underscores++
+				case isDigit(r):
+					digits++
+				default:
+					break scan
+				}
+			}
+
+			if digits == 0 {
+				return parser.FailedResult
+			}
+			return parser.Result{
+				NumConsumed: digits + underscores,
+				NextState:   state,
+			}
+		}
+	}
+
+	isBinaryDigit := func(r rune) bool { return r == '0' || r == '1' }
+	isOctalDigit := func(r rune) bool { return r >= '0' && r <= '7' }
+	isDecimalDigit := func(r rune) bool { return r >= '0' && r <= '9' }
+	isHexDigit := func(r rune) bool {
+		switch {
+		case r >= '0' && r <= '9', r >= 'a' && r <= 'f', r >= 'A' && r <= 'F':
+			return true
+		default:
+			return false
+		}
+	}
+
+	// radixLiteral matches "0", then one of the two radix marker runes,
+	// then a digit run for that radix.
+	radixLiteral := func(lower, upper rune, isDigit func(r rune) bool) parser.Func {
+		isMarker := func(r rune) bool { return r == lower || r == upper }
+		return consumeString("0").
+			Then(consumeSingleRuneLike(isMarker)).
+			Then(digitRun(isDigit))
+	}
+
+	hexLiteral := radixLiteral('x', 'X', isHexDigit)
+	octalLiteral := radixLiteral('o', 'O', isOctalDigit)
+	binaryLiteral := radixLiteral('b', 'B', isBinaryDigit)
+	prefixedDecimalLiteral := radixLiteral('d', 'D', isDecimalDigit)
+
+	// The first rune must be a digit, not an underscore.
+	plainDecimalLiteral := consumeSingleRuneLike(isDecimalDigit).
+		ThenMaybe(digitRun(isDecimalDigit))
+
+	isWidthMarker := func(r rune) bool { return r == 'w' || r == 's' }
+	widthPrefix := consumeRunesLike(isDecimalDigit).
+		Then(consumeSingleRuneLike(isWidthMarker))
+
+	anyLiteral := hexLiteral.
+		Or(octalLiteral).
+		Or(binaryLiteral).
+		Or(prefixedDecimalLiteral).
+		Or(plainDecimalLiteral)
+
+	return widthPrefix.
+		MaybeBefore(anyLiteral).
+		Map(recognizeToken(parser.TokenRoleNumber))
+}