Skip to content

Commit

Permalink
Allow ATN serialization of values more than 65535 (writeCompactUInt32)
Browse files Browse the repository at this point in the history
Refactor ATN serializer and deserializer, use ATNDataWriter, ATNDataReader

Remove excess data cloning in deserializer

fixes antlr#1863, fixes antlr#2732, fixes antlr#3338
  • Loading branch information
KvanTTT committed Jan 17, 2022
1 parent 828a688 commit 48c67ed
Show file tree
Hide file tree
Showing 9 changed files with 354 additions and 412 deletions.
Expand Up @@ -366,9 +366,10 @@ public static RuntimeTestDescriptor[] getRuntimeTestDescriptors(String group, St
}

if (group.equals("LexerExec")) {
descriptors.add(GeneratedLexerDescriptors.getLineSeparatorLfTest(targetName));
descriptors.add(GeneratedLexerDescriptors.getLineSeparatorCrLfTest(targetName));
descriptors.add(GeneratedLexerDescriptors.getLineSeparatorLfDescriptor(targetName));
descriptors.add(GeneratedLexerDescriptors.getLineSeparatorCrLfDescriptor(targetName));
descriptors.add(GeneratedLexerDescriptors.getLargeLexerDescriptor(targetName));
descriptors.add(GeneratedLexerDescriptors.getAtnStatesSizeMoreThan65535Descriptor(targetName));
}

return descriptors.toArray(new RuntimeTestDescriptor[0]);
Expand Down
@@ -1,7 +1,9 @@
package org.antlr.v4.test.runtime;

import java.util.Collections;

public class GeneratedLexerDescriptors {
static RuntimeTestDescriptor getLineSeparatorLfTest(String targetName) {
static RuntimeTestDescriptor getLineSeparatorLfDescriptor(String targetName) {
UniversalRuntimeTestDescriptor result = new UniversalRuntimeTestDescriptor();
result.name = "LineSeparatorLf";
result.targetName = targetName;
Expand All @@ -20,7 +22,7 @@ static RuntimeTestDescriptor getLineSeparatorLfTest(String targetName) {
return result;
}

static RuntimeTestDescriptor getLineSeparatorCrLfTest(String targetName) {
static RuntimeTestDescriptor getLineSeparatorCrLfDescriptor(String targetName) {
UniversalRuntimeTestDescriptor result = new UniversalRuntimeTestDescriptor();
result.name = "LineSeparatorCrLf";
result.targetName = targetName;
Expand Down Expand Up @@ -65,4 +67,50 @@ static RuntimeTestDescriptor getLargeLexerDescriptor(String targetName) {
"[@1,5:4='<EOF>',<-1>,1:5]\n";
return result;
}

/**
 * Builds the "AtnStatesSizeMoreThan65535" lexer test descriptor.
 *
 * <p>The generated grammar declares 1024 literal token rules whose names (and
 * matched text) are {@code T_<i>} followed by 70 underscores, plus a
 * whitespace-skipping rule. The input lists every literal on its own line and
 * the expected output is the matching token dump, terminated by EOF. Judging
 * by the descriptor name and notes, the grammar is meant to be large enough
 * to push ATN serialization past 16-bit values.
 *
 * @param targetName name of the runtime target the descriptor is created for
 * @return the populated descriptor
 */
static RuntimeTestDescriptor getAtnStatesSizeMoreThan65535Descriptor(String targetName) {
    UniversalRuntimeTestDescriptor descriptor = new UniversalRuntimeTestDescriptor();

    final String lexerName = "L";
    final int ruleCount = 1024;
    final String padding = String.join("", Collections.nCopies(70, "_"));

    StringBuilder grammarText = new StringBuilder();
    grammarText.append("lexer grammar ").append(lexerName).append(";\n");
    grammarText.append('\n');
    StringBuilder inputText = new StringBuilder();
    StringBuilder expected = new StringBuilder();

    // Character index in the input at which the next token begins.
    int cursor = 0;
    for (int index = 0; index < ruleCount; index++) {
        String tokenText = "T_" + index + padding;
        int ordinal = index + 1; // serves as both the token type and the 1-based line number
        int first = cursor;
        int last = first + tokenText.length() - 1;
        // Step over the last character of the token and the '\n' that follows it.
        cursor = last + 2;

        grammarText.append(tokenText).append(": '").append(tokenText).append("';\n");
        inputText.append(tokenText).append('\n');
        expected.append("[@").append(index)
            .append(',').append(first)
            .append(':').append(last)
            .append("='").append(tokenText)
            .append("',<").append(ordinal)
            .append(">,").append(ordinal)
            .append(":0]\n");
    }

    grammarText.append("\n");
    grammarText.append("WS: [ \\t\\r\\n]+ -> skip;\n");

    // EOF is reported as an empty token: its stop index is one less than its start index.
    expected.append("[@").append(ruleCount)
        .append(',').append(cursor)
        .append(':').append(cursor - 1)
        .append("='<EOF>',<-1>,").append(ruleCount + 1)
        .append(":0]\n");

    descriptor.name = "AtnStatesSizeMoreThan65535";
    descriptor.notes = "Regression for https://github.com/antlr/antlr4/issues/1863";
    descriptor.targetName = targetName;
    descriptor.testType = "Lexer";
    descriptor.grammar = grammarText.toString();
    descriptor.grammarName = lexerName;
    descriptor.input = inputText.toString();
    descriptor.output = expected.toString();
    return descriptor;
}
}
53 changes: 53 additions & 0 deletions runtime/Java/src/org/antlr/v4/runtime/atn/ATNDataReader.java
@@ -0,0 +1,53 @@
package org.antlr.v4.runtime.atn;

import java.util.UUID;

/**
 * Sequential reader over the {@code char[]} form of a serialized ATN.
 *
 * <p>Every read consumes one or more 16-bit elements starting at the current
 * position and advances that position. No bounds checking is performed beyond
 * what array indexing itself provides.
 */
public class ATNDataReader {
    private final char[] data;
    private int p;

    /**
     * @param data the serialized ATN; the array is used directly, not copied
     */
    public ATNDataReader(char[] data) {
        this.data = data;
    }

    /**
     * Reads a UUID stored as four 32-bit values: the least-significant 64 bits
     * first, then the most-significant 64 bits, each half low word first.
     *
     * @return the decoded UUID
     */
    public UUID readUUID() {
        long leastSigBits = readUInt64();
        long mostSigBits = readUInt64();
        return new UUID(mostSigBits, leastSigBits);
    }

    /**
     * Reads a 32-bit value stored as two 16-bit elements, low half first.
     *
     * @return the value as an {@code int}; values above {@code Integer.MAX_VALUE}
     *         come back negative
     */
    public int readUInt32() {
        int low = readUInt16();
        int high = readUInt16();
        return low | (high << 16);
    }

    /**
     * Reads a value written in the compact form: a single element when the
     * value is below 2^15, otherwise two elements where the first carries the
     * low 15 bits with its top bit set as a continuation flag and the second
     * carries the remaining high bits.
     *
     * @return the decoded value
     */
    public int readCompactUInt32() {
        int value = readUInt16();
        if (value >= 0 && value < 0b1000_0000_0000_0000) {
            return value;
        }
        return (readUInt16() << 15) | (value & 0b0111_1111_1111_1111);
    }

    /**
     * Reads one 16-bit element and undoes the serialization offset.
     *
     * @return the element value in the range 0..65535
     */
    public int readUInt16() {
        return readUInt16(true);
    }

    /**
     * Reads one 16-bit element.
     *
     * @param normalize {@code true} to undo the +2 offset applied during
     *                  serialization, {@code false} to return the raw element
     * @return the element value in the range 0..65535
     */
    public int readUInt16(boolean normalize) {
        int result = data[p++];
        // Each char value in data is shifted by +2 at the entry to this method.
        // This is an encoding optimization targeting the serialized values 0
        // and -1 (serialized to 0xFFFF), each of which are very common in the
        // serialized form of the ATN. In the modified UTF-8 that Java uses for
        // compiled string literals, these two character values have multi-byte
        // forms. By shifting each value by +2, they become characters 2 and 1
        // prior to writing the string, each of which have single-byte
        // representations. Since the shift occurs in the tool during ATN
        // serialization, each target is responsible for adjusting the values
        // during deserialization.
        //
        // As a special case, note that the first element of data is not
        // adjusted because it contains the major version number of the
        // serialized ATN, which was fixed at 3 at the time the value shifting
        // was implemented.
        if (!normalize) {
            return result;
        }
        // Raw 0 and 1 are the wrapped-around encodings of 65534 and 65535.
        return result > 1 ? result - ATNDataWriter.OptimizeOffset : result + 65534;
    }

    /**
     * Reads a 64-bit value stored as two 32-bit values, low word first.
     *
     * <p>The low word must be masked before it is widened: a plain
     * {@code (long)} cast sign-extends, so a low word with bit 31 set would
     * otherwise overwrite the high word with ones.
     */
    private long readUInt64() {
        long low = readUInt32() & 0x00000000FFFFFFFFL;
        long high = (long) readUInt32() << 32;
        return low | high;
    }
}
53 changes: 53 additions & 0 deletions runtime/Java/src/org/antlr/v4/runtime/atn/ATNDataWriter.java
@@ -0,0 +1,53 @@
package org.antlr.v4.runtime.atn;

import org.antlr.v4.runtime.misc.IntegerList;

import java.util.UUID;

/**
 * Appends ATN serialization primitives to an {@link IntegerList} as 16-bit
 * elements.
 */
public class ATNDataWriter {
    /**
     * Offset added (modulo 2^16) to every optimized element. The rationale is
     * explained in the comment inside {@code ATNDataReader.readUInt16(boolean)}.
     */
    public static final int OptimizeOffset = 2;

    /** Bit 15: the smallest value that needs the two-element compact form; also the continuation flag. */
    private static final int COMPACT_FLAG = 1 << 15;

    /** Selects the low 15 payload bits of a compact value. */
    private static final int COMPACT_LOW_MASK = COMPACT_FLAG - 1;

    private final IntegerList data;

    /**
     * @param data the list that receives every written element
     */
    public ATNDataWriter(IntegerList data) {
        this.data = data;
    }

    /**
     * Writes a UUID as four 32-bit values: the least-significant 64 bits first,
     * then the most-significant 64 bits, each half low word first.
     *
     * @param uuid the UUID to write
     */
    public void writeUUID(UUID uuid) {
        long[] halves = { uuid.getLeastSignificantBits(), uuid.getMostSignificantBits() };
        for (long half : halves) {
            writeUInt32((int) half);
            writeUInt32((int) (half >> 32));
        }
    }

    /**
     * Writes a 32-bit value as two 16-bit elements, low half first.
     *
     * @param value the value to write; all 32 bits are preserved
     */
    public void writeUInt32(int value) {
        int lowHalf = value & 0xFFFF;
        int highHalf = value >>> 16;
        writeUInt16(lowHalf);
        writeUInt16(highHalf);
    }

    /**
     * Writes a value in the compact form: one element when it is below 2^15,
     * otherwise two elements where the first holds the low 15 bits with the top
     * bit set and the second holds the remaining high bits.
     *
     * @param value the value to write
     * @throws UnsupportedOperationException if {@code value} is negative
     */
    public void writeCompactUInt32(int value) {
        if (value < COMPACT_FLAG) {
            writeUInt16(value);
            return;
        }
        writeUInt16(COMPACT_FLAG | (value & COMPACT_LOW_MASK));
        writeUInt16(value >>> 15);
    }

    /**
     * Writes one 16-bit element with the serialization offset applied.
     *
     * @param value the value to write, in the range 0..65535
     * @throws UnsupportedOperationException if {@code value} is out of range
     */
    public void writeUInt16(int value) {
        writeUInt16(value, true);
    }

    /**
     * Writes one 16-bit element.
     *
     * @param value    the value to write, in the range 0..65535
     * @param optimize {@code true} to store {@code (value + OptimizeOffset) & 0xFFFF},
     *                 {@code false} to store the value unchanged (the original
     *                 notes this is for the leading version number)
     * @throws UnsupportedOperationException if {@code value} is out of range
     */
    public void writeUInt16(int value, boolean optimize) {
        boolean fitsInChar = value >= Character.MIN_VALUE && value <= Character.MAX_VALUE;
        if (!fitsInChar) {
            throw new UnsupportedOperationException("Serialized ATN data element "+
                data.size() + " element " + value + " out of range "+
                (int)Character.MIN_VALUE + ".." + (int)Character.MAX_VALUE);
        }

        int stored = value;
        if (optimize) {
            // Wrap within 16 bits so 0xFFFF and 0xFFFE map to 1 and 0.
            stored = (value + OptimizeOffset) & 0xFFFF;
        }
        data.add(stored);
    }
}

0 comments on commit 48c67ed

Please sign in to comment.