Allow ATN serialization of values greater than 65535 (writeCompactUInt32)

Refactor the ATN serializer and deserializer to use ATNDataWriter and ATNDataReader. Remove excess data cloning in the deserializer. Fixes antlr#1863, fixes antlr#2732, fixes antlr#3338.
KvanTTT · Jan 17, 2022 · 48c67ed · 48c67ed
1 parent 828a688
commit 48c67ed
Show file tree

Hide file tree

Showing 9 changed files with 354 additions and 412 deletions.
diff --git a/runtime-testsuite/test/org/antlr/v4/test/runtime/BaseRuntimeTest.java b/runtime-testsuite/test/org/antlr/v4/test/runtime/BaseRuntimeTest.java
@@ -366,9 +366,10 @@ public static RuntimeTestDescriptor[] getRuntimeTestDescriptors(String group, St
 		}
 
 		if (group.equals("LexerExec")) {
-			descriptors.add(GeneratedLexerDescriptors.getLineSeparatorLfTest(targetName));
-			descriptors.add(GeneratedLexerDescriptors.getLineSeparatorCrLfTest(targetName));
+			descriptors.add(GeneratedLexerDescriptors.getLineSeparatorLfDescriptor(targetName));
+			descriptors.add(GeneratedLexerDescriptors.getLineSeparatorCrLfDescriptor(targetName));
 			descriptors.add(GeneratedLexerDescriptors.getLargeLexerDescriptor(targetName));
+			descriptors.add(GeneratedLexerDescriptors.getAtnStatesSizeMoreThan65535Descriptor(targetName));
 		}
 
 		return descriptors.toArray(new RuntimeTestDescriptor[0]);

diff --git a/runtime-testsuite/test/org/antlr/v4/test/runtime/GeneratedLexerDescriptors.java b/runtime-testsuite/test/org/antlr/v4/test/runtime/GeneratedLexerDescriptors.java
@@ -1,7 +1,9 @@
 package org.antlr.v4.test.runtime;
 
+import java.util.Collections;
+
 public class GeneratedLexerDescriptors {
-	static RuntimeTestDescriptor getLineSeparatorLfTest(String targetName) {
+	static RuntimeTestDescriptor getLineSeparatorLfDescriptor(String targetName) {
 		UniversalRuntimeTestDescriptor result = new UniversalRuntimeTestDescriptor();
 		result.name = "LineSeparatorLf";
 		result.targetName = targetName;
@@ -20,7 +22,7 @@ static RuntimeTestDescriptor getLineSeparatorLfTest(String targetName) {
 		return result;
 	}
 
-	static RuntimeTestDescriptor getLineSeparatorCrLfTest(String targetName) {
+	static RuntimeTestDescriptor getLineSeparatorCrLfDescriptor(String targetName) {
 		UniversalRuntimeTestDescriptor result = new UniversalRuntimeTestDescriptor();
 		result.name = "LineSeparatorCrLf";
 		result.targetName = targetName;
@@ -65,4 +67,50 @@ static RuntimeTestDescriptor getLargeLexerDescriptor(String targetName) {
 				"[@1,5:4='<EOF>',<-1>,1:5]\n";
 		return result;
 	}
+
+	/**
+	 * Builds the "AtnStatesSizeMoreThan65535" lexer test descriptor.
+	 *
+	 * <p>The generated grammar declares 1024 literal token rules named
+	 * {@code T_<i>} followed by 70 underscores, each matching its own name,
+	 * plus a whitespace-skipping rule. The input lists every token text on its
+	 * own line, and the expected output is the matching token dump, ending
+	 * with the EOF token.
+	 *
+	 * @param targetName name of the runtime target the descriptor is created for
+	 * @return the populated descriptor
+	 */
+	static RuntimeTestDescriptor getAtnStatesSizeMoreThan65535Descriptor(String targetName) {
+		final int tokensCount = 1024;
+		final String grammarName = "L";
+		final String suffix = String.join("", Collections.nCopies(70, "_"));
+
+		StringBuilder grammar = new StringBuilder();
+		grammar.append("lexer grammar " + grammarName + ";\n\n");
+		StringBuilder input = new StringBuilder();
+		StringBuilder output = new StringBuilder();
+
+		// Input offset at which the next token text begins.
+		int nextStart = 0;
+		for (int index = 0; index < tokensCount; index++) {
+			String tokenText = "T_" + index + suffix;
+			// The expected dump uses index + 1 both inside <...> and before ":0".
+			int ordinal = index + 1;
+			int lastChar = nextStart + tokenText.length() - 1;
+
+			grammar.append(tokenText + ": '" + tokenText + "';\n");
+			input.append(tokenText).append('\n');
+			output.append("[@" + index + "," + nextStart + ":" + lastChar
+					+ "='" + tokenText + "',<" + ordinal + ">," + ordinal + ":0]\n");
+
+			// Step over the last character and the '\n' that follows it.
+			nextStart = lastChar + 2;
+		}
+
+		grammar.append("\nWS: [ \\t\\r\\n]+ -> skip;\n");
+
+		// EOF sits right after the final newline; its stop offset is start - 1.
+		output.append("[@" + tokensCount + "," + nextStart + ":" + (nextStart - 1)
+				+ "='<EOF>',<-1>," + (tokensCount + 1) + ":0]\n");
+
+		UniversalRuntimeTestDescriptor descriptor = new UniversalRuntimeTestDescriptor();
+		descriptor.name = "AtnStatesSizeMoreThan65535";
+		descriptor.notes = "Regression for https://github.com/antlr/antlr4/issues/1863";
+		descriptor.targetName = targetName;
+		descriptor.testType = "Lexer";
+		descriptor.grammar = grammar.toString();
+		descriptor.grammarName = grammarName;
+		descriptor.input = input.toString();
+		descriptor.output = output.toString();
+		return descriptor;
+	}
 }
diff --git a/runtime/Java/src/org/antlr/v4/runtime/atn/ATNDataReader.java b/runtime/Java/src/org/antlr/v4/runtime/atn/ATNDataReader.java
@@ -0,0 +1,53 @@
+package org.antlr.v4.runtime.atn;
+
+import java.util.UUID;
+
+/**
+ * Sequential reader over the 16-bit words of a serialized ATN.
+ *
+ * <p>Multi-word values are stored least significant word first. Unless the
+ * caller opts out through {@link #readUInt16(boolean)}, every word has the
+ * {@link ATNDataWriter#OptimizeOffset} shift removed as it is read.
+ *
+ * <p>Reading past the end of the data fails with the
+ * {@link ArrayIndexOutOfBoundsException} raised by the array access.
+ */
+public class ATNDataReader {
+	private final char[] data;
+	private int p;
+
+	/**
+	 * @param data serialized ATN words; the array is referenced, not copied
+	 */
+	public ATNDataReader(char[] data) {
+		this.data = data;
+	}
+
+	/**
+	 * Reads a UUID stored as four 32-bit values: the low then high half of
+	 * the least significant 64 bits, followed by the low then high half of
+	 * the most significant 64 bits.
+	 *
+	 * @return the decoded UUID
+	 */
+	public UUID readUUID() {
+		long leastSigBits = readUInt64();
+		long mostSigBits = readUInt64();
+		return new UUID(mostSigBits, leastSigBits);
+	}
+
+	/**
+	 * Reads a 32-bit value stored as two 16-bit words, low word first.
+	 *
+	 * @return the value as an {@code int}; it is negative when bit 31 is set
+	 */
+	public int readUInt32() {
+		int low = readUInt16();
+		int high = readUInt16();
+		return low | (high << 16);
+	}
+
+	/**
+	 * Reads a value stored in the compact one-or-two-word form.
+	 *
+	 * <p>A first word below {@code 0x8000} is the value itself. Otherwise bit
+	 * 15 of the first word marks a continuation: its low 15 bits are value
+	 * bits 0..14 and the following word supplies the bits from 15 upwards.
+	 *
+	 * @return the decoded value
+	 */
+	public int readCompactUInt32() {
+		int value = readUInt16();
+		if (value >= 0 && value < 0b1000_0000_0000_0000) {
+			return value;
+		}
+		return (readUInt16() << 15) | (value & 0b0111_1111_1111_1111);
+	}
+
+	/**
+	 * Reads the next word and removes the serialization shift.
+	 *
+	 * @return the word in the range {@code 0..65535}
+	 */
+	public int readUInt16() {
+		return readUInt16(true);
+	}
+
+	/**
+	 * Reads the next word, optionally removing the serialization shift.
+	 *
+	 * @param normalize {@code true} to undo the shift, {@code false} to
+	 *                  return the stored word unchanged
+	 * @return the word in the range {@code 0..65535}
+	 */
+	public int readUInt16(boolean normalize) {
+		int result = data[p++];
+		if (!normalize) {
+			return result;
+		}
+		// Each char value in data is shifted by +2 at the entry to this method.
+		// This is an encoding optimization targeting the serialized values 0
+		// and -1 (serialized to 0xFFFF), each of which are very common in the
+		// serialized form of the ATN. In the modified UTF-8 that Java uses for
+		// compiled string literals, these two character values have multi-byte
+		// forms. By shifting each value by +2, they become characters 2 and 1
+		// prior to writing the string, each of which have single-byte
+		// representations. Since the shift occurs in the tool during ATN
+		// serialization, each target is responsible for adjusting the values
+		// during deserialization.
+		//
+		// As a special case, note that the first element of data is not
+		// adjusted because it contains the major version number of the
+		// serialized ATN, which was fixed at 3 at the time the value shifting
+		// was implemented.
+		//
+		// Subtracting modulo 2^16 maps stored 0 and 1 back to 65534 and 65535.
+		return (result - ATNDataWriter.OptimizeOffset) & 0xFFFF;
+	}
+
+	/** Reads a 64-bit value stored as two 32-bit values, low half first. */
+	private long readUInt64() {
+		// Mask the low half: widening a negative int to long sign-extends and
+		// would otherwise set every bit of the high half.
+		long low = readUInt32() & 0x00000000FFFFFFFFL;
+		long high = (long) readUInt32() << 32;
+		return low | high;
+	}
+}
diff --git a/runtime/Java/src/org/antlr/v4/runtime/atn/ATNDataWriter.java b/runtime/Java/src/org/antlr/v4/runtime/atn/ATNDataWriter.java
@@ -0,0 +1,53 @@
+package org.antlr.v4.runtime.atn;
+
+import org.antlr.v4.runtime.misc.IntegerList;
+
+import java.util.UUID;
+
+/**
+ * Appends ATN values to an {@link IntegerList} as 16-bit words.
+ *
+ * <p>Multi-word values are written least significant word first. Unless the
+ * caller opts out through {@link #writeUInt16(int, boolean)}, each word is
+ * stored shifted by {@link #OptimizeOffset} modulo 2^16.
+ */
+public class ATNDataWriter {
+	/** Amount added to every optimized word before it is stored. */
+	public static final int OptimizeOffset = 2;
+
+	private final IntegerList data;
+
+	/**
+	 * @param data list that receives the written words
+	 */
+	public ATNDataWriter(IntegerList data) {
+		this.data = data;
+	}
+
+	/**
+	 * Writes a UUID as four 32-bit values: the low then high half of the
+	 * least significant 64 bits, followed by the low then high half of the
+	 * most significant 64 bits.
+	 *
+	 * @param uuid the UUID to write
+	 */
+	public void writeUUID(UUID uuid) {
+		writeUInt64(uuid.getLeastSignificantBits());
+		writeUInt64(uuid.getMostSignificantBits());
+	}
+
+	/**
+	 * Writes a 32-bit value as two 16-bit words, low word first.
+	 *
+	 * @param value the value whose 32 bits are written
+	 */
+	public void writeUInt32(int value) {
+		writeUInt16(value & 0xFFFF);
+		writeUInt16(value >>> 16);
+	}
+
+	/**
+	 * Writes a value in the compact one-or-two-word form.
+	 *
+	 * <p>Values below {@code 0x8000} take one word. Larger values take two:
+	 * the first carries bits 0..14 with bit 15 set as a continuation flag,
+	 * the second carries the bits from 15 upwards.
+	 *
+	 * @param value the value to write
+	 * @throws UnsupportedOperationException if {@code value} is negative,
+	 *         since it is then passed unchanged to the range-checked
+	 *         single-word writer
+	 */
+	public void writeCompactUInt32(int value) {
+		if (value >= 0x8000) {
+			writeUInt16(0x8000 | (value & 0x7FFF));
+			writeUInt16(value >>> 15);
+			return;
+		}
+		writeUInt16(value);
+	}
+
+	/**
+	 * Writes one word with the {@link #OptimizeOffset} shift applied.
+	 *
+	 * @param value the word, in the range {@code 0..65535}
+	 * @throws UnsupportedOperationException if {@code value} is out of range
+	 */
+	public void writeUInt16(int value) {
+		writeUInt16(value, true);
+	}
+
+	/**
+	 * Writes one word, optionally applying the {@link #OptimizeOffset} shift.
+	 *
+	 * @param value    the word, in the range {@code 0..65535}
+	 * @param optimize {@code true} to store the shifted word, {@code false}
+	 *                 to store {@code value} unchanged
+	 * @throws UnsupportedOperationException if {@code value} is out of range
+	 */
+	public void writeUInt16(int value, boolean optimize) {
+		boolean inRange = value >= Character.MIN_VALUE && value <= Character.MAX_VALUE;
+		if (!inRange) {
+			throw new UnsupportedOperationException("Serialized ATN data element "+
+					data.size() + " element " + value + " out of range "+
+					(int)Character.MIN_VALUE + ".." + (int)Character.MAX_VALUE);
+		}
+		// The reason for the shift is explained in ATNDataReader.readUInt16.
+		// Callers pass optimize == false for the leading version number,
+		// which is stored unshifted.
+		int stored = optimize ? (value + OptimizeOffset) & 0xFFFF : value;
+		data.add(stored);
+	}
+
+	/** Writes a 64-bit value as two 32-bit values, low half first. */
+	private void writeUInt64(long value) {
+		writeUInt32((int) value);
+		writeUInt32((int) (value >> 32));
+	}
+}