From 48c67ed8b41ca9d856c0bfbc6d6965dc3ee83949 Mon Sep 17 00:00:00 2001 From: Ivan Kochurkin Date: Sat, 15 Jan 2022 21:11:22 +0300 Subject: [PATCH] Allow ATN serialization of values more than 65535 (writeCompactUInt32) Refactor ATN serializer and deserializer, use ATNDataWriter, ATNDataReader Remove excess data cloning in deserializer fixes #1863, fixes #2732, fixes #3338 --- .../v4/test/runtime/BaseRuntimeTest.java | 5 +- .../runtime/GeneratedLexerDescriptors.java | 52 ++- .../antlr/v4/runtime/atn/ATNDataReader.java | 53 +++ .../antlr/v4/runtime/atn/ATNDataWriter.java | 53 +++ .../antlr/v4/runtime/atn/ATNDeserializer.java | 218 ++++-------- .../antlr/v4/runtime/atn/ATNSerializer.java | 322 +++++++----------- .../antlr/v4/runtime/atn/ATNSimulator.java | 39 +-- .../v4/runtime/atn/UnicodeSerializeMode.java | 6 + .../antlr/v4/runtime/misc/IntegerList.java | 18 +- 9 files changed, 354 insertions(+), 412 deletions(-) create mode 100644 runtime/Java/src/org/antlr/v4/runtime/atn/ATNDataReader.java create mode 100644 runtime/Java/src/org/antlr/v4/runtime/atn/ATNDataWriter.java create mode 100644 runtime/Java/src/org/antlr/v4/runtime/atn/UnicodeSerializeMode.java diff --git a/runtime-testsuite/test/org/antlr/v4/test/runtime/BaseRuntimeTest.java b/runtime-testsuite/test/org/antlr/v4/test/runtime/BaseRuntimeTest.java index f0546226fd..fdd476a6f8 100644 --- a/runtime-testsuite/test/org/antlr/v4/test/runtime/BaseRuntimeTest.java +++ b/runtime-testsuite/test/org/antlr/v4/test/runtime/BaseRuntimeTest.java @@ -366,9 +366,10 @@ public static RuntimeTestDescriptor[] getRuntimeTestDescriptors(String group, St } if (group.equals("LexerExec")) { - descriptors.add(GeneratedLexerDescriptors.getLineSeparatorLfTest(targetName)); - descriptors.add(GeneratedLexerDescriptors.getLineSeparatorCrLfTest(targetName)); + descriptors.add(GeneratedLexerDescriptors.getLineSeparatorLfDescriptor(targetName)); + descriptors.add(GeneratedLexerDescriptors.getLineSeparatorCrLfDescriptor(targetName)); descriptors.add(GeneratedLexerDescriptors.getLargeLexerDescriptor(targetName)); + descriptors.add(GeneratedLexerDescriptors.getAtnStatesSizeMoreThan65535Descriptor(targetName)); } return descriptors.toArray(new RuntimeTestDescriptor[0]); diff --git a/runtime-testsuite/test/org/antlr/v4/test/runtime/GeneratedLexerDescriptors.java b/runtime-testsuite/test/org/antlr/v4/test/runtime/GeneratedLexerDescriptors.java index e0a1bb5ea3..a4327e5f30 100644 --- a/runtime-testsuite/test/org/antlr/v4/test/runtime/GeneratedLexerDescriptors.java +++ b/runtime-testsuite/test/org/antlr/v4/test/runtime/GeneratedLexerDescriptors.java @@ -1,7 +1,9 @@ package org.antlr.v4.test.runtime; +import java.util.Collections; + public class GeneratedLexerDescriptors { - static RuntimeTestDescriptor getLineSeparatorLfTest(String targetName) { + static RuntimeTestDescriptor getLineSeparatorLfDescriptor(String targetName) { UniversalRuntimeTestDescriptor result = new UniversalRuntimeTestDescriptor(); result.name = "LineSeparatorLf"; result.targetName = targetName; @@ -20,7 +22,7 @@ static RuntimeTestDescriptor getLineSeparatorLfTest(String targetName) { return result; } - static RuntimeTestDescriptor getLineSeparatorCrLfTest(String targetName) { + static RuntimeTestDescriptor getLineSeparatorCrLfDescriptor(String targetName) { UniversalRuntimeTestDescriptor result = new UniversalRuntimeTestDescriptor(); result.name = "LineSeparatorCrLf"; result.targetName = targetName; @@ -65,4 +67,50 @@ static RuntimeTestDescriptor getLargeLexerDescriptor(String targetName) { 
"[@1,5:4='',<-1>,1:5]\n"; return result; } + + static RuntimeTestDescriptor getAtnStatesSizeMoreThan65535Descriptor(String targetName) { + UniversalRuntimeTestDescriptor result = new UniversalRuntimeTestDescriptor(); + result.name = "AtnStatesSizeMoreThan65535"; + result.notes = "Regression for https://github.com/antlr/antlr4/issues/1863"; + result.targetName = targetName; + result.testType = "Lexer"; + + final int tokensCount = 1024; + final String suffix = String.join("", Collections.nCopies(70, "_")); + + String grammarName = "L"; + StringBuilder grammar = new StringBuilder(); + grammar.append("lexer grammar ").append(grammarName).append(";\n"); + grammar.append('\n'); + StringBuilder input = new StringBuilder(); + StringBuilder output = new StringBuilder(); + int startOffset; + int stopOffset = -2; + for (int i = 0; i < tokensCount; i++) { + String value = "T_" + i + suffix; + grammar.append(value).append(": '").append(value).append("';\n"); + input.append(value).append('\n'); + + startOffset = stopOffset + 2; + stopOffset += value.length() + 1; + + output.append("[@").append(i).append(',').append(startOffset).append(':').append(stopOffset) + .append("='").append(value).append("',<").append(i + 1).append(">,").append(i + 1) + .append(":0]\n"); + } + + grammar.append("\n"); + grammar.append("WS: [ \\t\\r\\n]+ -> skip;\n"); + + startOffset = stopOffset + 2; + stopOffset = startOffset - 1; + output.append("[@").append(tokensCount).append(',').append(startOffset).append(':').append(stopOffset) + .append("='',<-1>,").append(tokensCount + 1).append(":0]\n"); + + result.grammar = grammar.toString(); + result.grammarName = grammarName; + result.input = input.toString(); + result.output = output.toString(); + return result; + } } diff --git a/runtime/Java/src/org/antlr/v4/runtime/atn/ATNDataReader.java b/runtime/Java/src/org/antlr/v4/runtime/atn/ATNDataReader.java new file mode 100644 index 0000000000..9ffaeacef8 --- /dev/null +++ b/runtime/Java/src/org/antlr/v4/runtime/atn/ATNDataReader.java @@ -0,0 +1,53 @@ +package org.antlr.v4.runtime.atn; + +import java.util.UUID; + +public class ATNDataReader { + private final char[] data; + private int p; + + public ATNDataReader(char[] data) { + this.data = data; + } + + public UUID readUUID() { + long leastSigBits = ((long) readUInt32() & 0x00000000FFFFFFFFL) | ((long) readUInt32() << 32); + long mostSigBits = (long) readUInt32() | ((long) readUInt32() << 32); + return new UUID(mostSigBits, leastSigBits); + } + + public int readUInt32() { + return readUInt16() | (readUInt16() << 16); + } + + public int readCompactUInt32() { + int value = readUInt16(); + return value < 0b1000_0000_0000_0000 && value >= 0 + ? value + : (readUInt16() << 15) | (value & 0b0111_1111_1111_1111); + } + + public int readUInt16() { + return readUInt16(true); + } + + public int readUInt16(boolean normalize) { + int result = data[p++]; + // Each char value in data is shifted by +2 at the entry to this method. + // This is an encoding optimization targeting the serialized values 0 + // and -1 (serialized to 0xFFFF), each of which are very common in the + // serialized form of the ATN. In the modified UTF-8 that Java uses for + // compiled string literals, these two character values have multi-byte + // forms. By shifting each value by +2, they become characters 2 and 1 + // prior to writing the string, each of which have single-byte + // representations. 
Since the shift occurs in the tool during ATN + // serialization, each target is responsible for adjusting the values + // during deserialization. + // + // As a special case, note that the first element of data is not + // adjusted because it contains the major version number of the + // serialized ATN, which was fixed at 3 at the time the value shifting + // was implemented. + return normalize ? (result > 1 ? result - ATNDataWriter.OptimizeOffset : result + 65534) : result; + } +} diff --git a/runtime/Java/src/org/antlr/v4/runtime/atn/ATNDataWriter.java b/runtime/Java/src/org/antlr/v4/runtime/atn/ATNDataWriter.java new file mode 100644 index 0000000000..07d6f0340a --- /dev/null +++ b/runtime/Java/src/org/antlr/v4/runtime/atn/ATNDataWriter.java @@ -0,0 +1,53 @@ +package org.antlr.v4.runtime.atn; + +import org.antlr.v4.runtime.misc.IntegerList; + +import java.util.UUID; + +public class ATNDataWriter { + public static final int OptimizeOffset = 2; + + private final IntegerList data; + + public ATNDataWriter(IntegerList data) { + this.data = data; + } + + public void writeUUID(UUID uuid) { + long leastSignificantBits = uuid.getLeastSignificantBits(); + writeUInt32((int)leastSignificantBits); + writeUInt32((int)(leastSignificantBits >> 32)); + long mostSignificantBits = uuid.getMostSignificantBits(); + writeUInt32((int)mostSignificantBits); + writeUInt32((int)(mostSignificantBits >> 32)); + } + + public void writeUInt32(int value) { + writeUInt16((char)value); + writeUInt16((char)(value >> 16)); + } + + public void writeCompactUInt32(int value) { + if (value < 0b1000_0000_0000_0000) { + writeUInt16(value); + } else { + writeUInt16((value & 0b0111_1111_1111_1111) | (1 << 15)); + writeUInt16(value >>> 15); + } + } + + public void writeUInt16(int value) { + writeUInt16(value, true); + } + + public void writeUInt16(int value, boolean optimize) { + if (value < Character.MIN_VALUE || value > Character.MAX_VALUE) { + throw new UnsupportedOperationException("Serialized ATN data element "+ + data.size() + " element " + value + " out of range "+ + (int)Character.MIN_VALUE + ".." + (int)Character.MAX_VALUE); + } + // Note: This value shifting loop is documented in ATNDeserializer. + // don't adjust the first value since that's the version number + data.add(optimize ? (value + OptimizeOffset) & 0xFFFF : value); + } +} diff --git a/runtime/Java/src/org/antlr/v4/runtime/atn/ATNDeserializer.java b/runtime/Java/src/org/antlr/v4/runtime/atn/ATNDeserializer.java index 1cb6912592..19d9b38373 100644 --- a/runtime/Java/src/org/antlr/v4/runtime/atn/ATNDeserializer.java +++ b/runtime/Java/src/org/antlr/v4/runtime/atn/ATNDeserializer.java @@ -34,7 +34,7 @@ public class ATNDeserializer { */ private static final UUID BASE_SERIALIZED_UUID; /** - * This UUID indicates an extension of {@link BASE_SERIALIZED_UUID} for the + * This UUID indicates an extension of {@link org.antlr.v4.runtime.atn.ATNDeserializer#BASE_SERIALIZED_UUID} for the * addition of precedence predicates. 
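// Editor's sketch, not part of the patch: a standalone illustration of the
// 15-bit continuation scheme that writeCompactUInt32/readCompactUInt32 above
// implement. Values below 0x8000 occupy a single 16-bit word; larger values
// set the high bit of the first word and carry the remaining bits in a second
// word, which is what lets state numbers and other counts exceed 65535.
// Class and method names here are illustrative only.
class CompactUInt32Sketch {
	static int[] encode(int value) {
		if (value < 0x8000) {
			return new int[] { value };
		}
		return new int[] { (value & 0x7FFF) | 0x8000, value >>> 15 };
	}

	static int decode(int[] words) {
		int first = words[0];
		return (first & 0x8000) == 0 ? first : (words[1] << 15) | (first & 0x7FFF);
	}

	public static void main(String[] args) {
		// round-trip a few boundary values, including ones past 65535
		for (int value : new int[] { 0, 0x7FFF, 0x8000, 70_000, 0x10FFFF }) {
			System.out.println(value + " -> " + decode(encode(value)));
		}
	}
}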
*/ private static final UUID ADDED_PRECEDENCE_TRANSITIONS; @@ -69,7 +69,7 @@ public class ATNDeserializer { ADDED_LEXER_ACTIONS = UUID.fromString("AADB8D7E-AEEF-4415-AD2B-8204D6CF042E"); ADDED_UNICODE_SMP = UUID.fromString("59627784-3BE5-417A-B9EB-8131A7286089"); - SUPPORTED_UUIDS = new ArrayList(); + SUPPORTED_UUIDS = new ArrayList<>(); SUPPORTED_UUIDS.add(BASE_SERIALIZED_UUID); SUPPORTED_UUIDS.add(ADDED_PRECEDENCE_TRANSITIONS); SUPPORTED_UUIDS.add(ADDED_LEXER_ACTIONS); @@ -78,49 +78,6 @@ public class ATNDeserializer { SERIALIZED_UUID = ADDED_UNICODE_SMP; } - interface UnicodeDeserializer { - // Wrapper for readInt() or readInt32() - int readUnicode(char[] data, int p); - - // Work around Java not allowing mutation of captured variables - // by returning amount by which to increment p after each read - int size(); - } - - enum UnicodeDeserializingMode { - UNICODE_BMP, - UNICODE_SMP - } - - static UnicodeDeserializer getUnicodeDeserializer(UnicodeDeserializingMode mode) { - if (mode == UnicodeDeserializingMode.UNICODE_BMP) { - return new UnicodeDeserializer() { - @Override - public int readUnicode(char[] data, int p) { - return toInt(data[p]); - } - - @Override - public int size() { - return 1; - } - }; - } - else { - return new UnicodeDeserializer() { - @Override - public int readUnicode(char[] data, int p) { - return toInt32(data, p); - } - - @Override - public int size() { - return 2; - } - }; - } - } - private final ATNDeserializationOptions deserializationOptions; public ATNDeserializer() { @@ -157,38 +114,16 @@ static protected boolean isFeatureSupported(UUID feature, UUID actualUuid) { return SUPPORTED_UUIDS.indexOf(actualUuid) >= featureIndex; } - @SuppressWarnings("deprecation") public ATN deserialize(char[] data) { - data = data.clone(); - - // Each char value in data is shifted by +2 at the entry to this method. - // This is an encoding optimization targeting the serialized values 0 - // and -1 (serialized to 0xFFFF), each of which are very common in the - // serialized form of the ATN. In the modified UTF-8 that Java uses for - // compiled string literals, these two character values have multi-byte - // forms. By shifting each value by +2, they become characters 2 and 1 - // prior to writing the string, each of which have single-byte - // representations. Since the shift occurs in the tool during ATN - // serialization, each target is responsible for adjusting the values - // during deserialization. - // - // As a special case, note that the first element of data is not - // adjusted because it contains the major version number of the - // serialized ATN, which was fixed at 3 at the time the value shifting - // was implemented. 
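// Editor's sketch, not part of the patch: the +2 value shift documented in the
// removed comment above (and applied by the removed loop just below) now
// happens inside ATNDataWriter.writeUInt16 (optimize) and
// ATNDataReader.readUInt16 (normalize). Writing adds 2 modulo 0x10000, so the
// very common values 0 and 0xFFFF become the single-byte modified-UTF-8
// characters 2 and 1; reading reverses the shift. Names are illustrative only.
final class ValueShiftSketch {
	static char shift(int value) {           // writer side, optimize == true
		return (char) ((value + 2) & 0xFFFF);
	}

	static int unshift(char stored) {        // reader side, normalize == true
		return stored > 1 ? stored - 2 : stored + 0xFFFE;
	}

	public static void main(String[] args) {
		for (int value : new int[] { 0, 1, 2, 0x7FFF, 0xFFFE, 0xFFFF }) {
			System.out.println(value + " -> " + (int) shift(value) + " -> " + unshift(shift(value)));
		}
	}
}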
- for (int i = 1; i < data.length; i++) { - data[i] = (char)(data[i] - 2); - } + ATNDataReader reader = new ATNDataReader(data); - int p = 0; - int version = toInt(data[p++]); + int version = reader.readUInt16(false); if (version != SERIALIZED_VERSION) { String reason = String.format(Locale.getDefault(), "Could not deserialize ATN with version %d (expected %d).", version, SERIALIZED_VERSION); throw new UnsupportedOperationException(new InvalidClassException(ATN.class.getName(), reason)); } - UUID uuid = toUUID(data, p); - p += 8; + UUID uuid = reader.readUUID(); if (!SUPPORTED_UUIDS.contains(uuid)) { String reason = String.format(Locale.getDefault(), "Could not deserialize ATN with UUID %s (expected %s or a legacy UUID).", uuid, SERIALIZED_UUID); throw new UnsupportedOperationException(new InvalidClassException(ATN.class.getName(), reason)); @@ -197,37 +132,37 @@ public ATN deserialize(char[] data) { boolean supportsPrecedencePredicates = isFeatureSupported(ADDED_PRECEDENCE_TRANSITIONS, uuid); boolean supportsLexerActions = isFeatureSupported(ADDED_LEXER_ACTIONS, uuid); - ATNType grammarType = ATNType.values()[toInt(data[p++])]; - int maxTokenType = toInt(data[p++]); + ATNType grammarType = ATNType.values()[reader.readUInt16()]; + int maxTokenType = reader.readUInt16(); ATN atn = new ATN(grammarType, maxTokenType); // // STATES // - List> loopBackStateNumbers = new ArrayList>(); - List> endStateNumbers = new ArrayList>(); - int nstates = toInt(data[p++]); + List> loopBackStateNumbers = new ArrayList<>(); + List> endStateNumbers = new ArrayList<>(); + int nstates = reader.readCompactUInt32(); for (int i=0; i((LoopEndState)s, loopBackStateNumber)); + int loopBackStateNumber = reader.readCompactUInt32(); + loopBackStateNumbers.add(new Pair<>((LoopEndState) s, loopBackStateNumber)); } else if (s instanceof BlockStartState) { - int endStateNumber = toInt(data[p++]); - endStateNumbers.add(new Pair((BlockStartState)s, endStateNumber)); + int endStateNumber = reader.readCompactUInt32(); + endStateNumbers.add(new Pair<>((BlockStartState) s, endStateNumber)); } atn.addState(s); } @@ -241,16 +176,16 @@ else if (s instanceof BlockStartState) { pair.a.endState = (BlockEndState)atn.states.get(pair.b); } - int numNonGreedyStates = toInt(data[p++]); + int numNonGreedyStates = reader.readCompactUInt32(); for (int i = 0; i < numNonGreedyStates; i++) { - int stateNumber = toInt(data[p++]); + int stateNumber = reader.readCompactUInt32(); ((DecisionState)atn.states.get(stateNumber)).nonGreedy = true; } if (supportsPrecedencePredicates) { - int numPrecedenceStates = toInt(data[p++]); + int numPrecedenceStates = reader.readCompactUInt32(); for (int i = 0; i < numPrecedenceStates; i++) { - int stateNumber = toInt(data[p++]); + int stateNumber = reader.readCompactUInt32(); ((RuleStartState)atn.states.get(stateNumber)).isLeftRecursiveRule = true; } } @@ -258,18 +193,18 @@ else if (s instanceof BlockStartState) { // // RULES // - int nrules = toInt(data[p++]); + int nrules = reader.readCompactUInt32(); if ( atn.grammarType == ATNType.LEXER ) { atn.ruleToTokenType = new int[nrules]; } atn.ruleToStartState = new RuleStartState[nrules]; for (int i=0; i sets = new ArrayList(); + List sets = new ArrayList<>(); // First, read all sets with 16-bit Unicode code points <= U+FFFF. 
- p = deserializeSets(data, p, sets, getUnicodeDeserializer(UnicodeDeserializingMode.UNICODE_BMP)); + deserializeSets(reader, sets, UnicodeSerializeMode.UNICODE_BMP); // Next, if the ATN was serialized with the Unicode SMP feature, // deserialize sets with 32-bit arguments <= U+10FFFF. if (isFeatureSupported(ADDED_UNICODE_SMP, uuid)) { - p = deserializeSets(data, p, sets, getUnicodeDeserializer(UnicodeDeserializingMode.UNICODE_SMP)); + deserializeSets(reader, sets, UnicodeSerializeMode.UNICODE_SMP); } // // EDGES // - int nedges = toInt(data[p++]); + int nedges = reader.readCompactUInt32(); for (int i=0; i"+trg+ -// " "+Transition.serializationNames[ttype]+ -// " "+arg1+","+arg2+","+arg3); ATNState srcState = atn.states.get(src); srcState.addTransition(trans); - p += 6; } // edges for rule stop states can be derived, so they aren't serialized @@ -362,17 +292,14 @@ else if (s instanceof BlockStartState) { for (ATNState state : atn.states) { if (state instanceof BlockStartState) { + BlockStartState blockStartState = (BlockStartState) state; // we need to know the end state to set its start state - if (((BlockStartState)state).endState == null) { - throw new IllegalStateException(); - } - // block end states can only be associated to a single block start state - if (((BlockStartState)state).endState.startState != null) { + if (blockStartState.endState == null || blockStartState.endState.startState != null) { throw new IllegalStateException(); } - ((BlockStartState)state).endState.startState = (BlockStartState)state; + blockStartState.endState.startState = blockStartState; } if (state instanceof PlusLoopbackState) { @@ -398,9 +325,9 @@ else if (state instanceof StarLoopbackState) { // // DECISIONS // - int ndecisions = toInt(data[p++]); + int ndecisions = reader.readCompactUInt32(); for (int i=1; i<=ndecisions; i++) { - int s = toInt(data[p++]); + int s = reader.readCompactUInt32(); DecisionState decState = (DecisionState)atn.states.get(s); atn.decisionToState.add(decState); decState.decision = i-1; @@ -411,15 +338,15 @@ else if (state instanceof StarLoopbackState) { // if (atn.grammarType == ATNType.LEXER) { if (supportsLexerActions) { - atn.lexerActions = new LexerAction[toInt(data[p++])]; + atn.lexerActions = new LexerAction[reader.readCompactUInt32()]; for (int i = 0; i < atn.lexerActions.length; i++) { - LexerActionType actionType = LexerActionType.values()[toInt(data[p++])]; - int data1 = toInt(data[p++]); + LexerActionType actionType = LexerActionType.values()[reader.readUInt16()]; + int data1 = reader.readUInt16(); if (data1 == 0xFFFF) { data1 = -1; } - int data2 = toInt(data[p++]); + int data2 = reader.readUInt16(); if (data2 == 0xFFFF) { data2 = -1; } @@ -433,7 +360,7 @@ else if (state instanceof StarLoopbackState) { // for compatibility with older serialized ATNs, convert the old // serialized action index for action transitions to the new // form, which is the index of a LexerCustomAction - List legacyLexerActions = new ArrayList(); + List legacyLexerActions = new ArrayList<>(); for (ATNState state : atn.states) { for (int i = 0; i < state.getNumberOfTransitions(); i++) { Transition transition = state.transition(i); @@ -552,28 +479,30 @@ else if (state instanceof StarLoopbackState) { return atn; } - private int deserializeSets(char[] data, int p, List sets, UnicodeDeserializer unicodeDeserializer) { - int nsets = toInt(data[p++]); + private void deserializeSets(ATNDataReader reader, List sets, UnicodeSerializeMode mode) { + int nsets = reader.readCompactUInt32(); for (int i=0; i 
tokenNames; - - private interface CodePointSerializer { - void serializeCodePoint(IntegerList data, int cp); - } + public final ATN atn; + private final List tokenNames; public ATNSerializer(ATN atn) { assert atn.grammarType != null; this.atn = atn; + this.tokenNames = null; } public ATNSerializer(ATN atn, List tokenNames) { @@ -66,12 +63,14 @@ public ATNSerializer(ATN atn, List tokenNames) { */ public IntegerList serialize() { IntegerList data = new IntegerList(); - data.add(ATNDeserializer.SERIALIZED_VERSION); - serializeUUID(data, ATNDeserializer.SERIALIZED_UUID); + ATNDataWriter writer = new ATNDataWriter(data); + + writer.writeUInt16(ATNDeserializer.SERIALIZED_VERSION, false); + writer.writeUUID(ATNDeserializer.SERIALIZED_UUID); // convert grammar type to ATN const to avoid dependence on ANTLRParser - data.add(atn.grammarType.ordinal()); - data.add(atn.maxTokenType); + writer.writeUInt16(atn.grammarType.ordinal()); + writer.writeUInt16(atn.maxTokenType); int nedges = 0; // Note that we use a LinkedHashMap as a set to @@ -82,10 +81,10 @@ public IntegerList serialize() { // dump states, count edges and collect sets while doing so IntegerList nonGreedyStates = new IntegerList(); IntegerList precedenceStates = new IntegerList(); - data.add(atn.states.size()); + writer.writeCompactUInt32(atn.states.size()); for (ATNState s : atn.states) { if ( s==null ) { // might be optimized away - data.add(ATNState.INVALID_TYPE); + writer.writeCompactUInt32(ATNState.INVALID_TYPE); continue; } @@ -98,20 +97,15 @@ public IntegerList serialize() { precedenceStates.add(s.stateNumber); } - data.add(stateType); + writer.writeCompactUInt32(stateType); - if (s.ruleIndex == -1) { - data.add(Character.MAX_VALUE); - } - else { - data.add(s.ruleIndex); - } + writer.writeUInt16(s.ruleIndex == -1 ? Character.MAX_VALUE : s.ruleIndex); if ( s.getStateType() == ATNState.LOOP_END ) { - data.add(((LoopEndState)s).loopBackState.stateNumber); + writer.writeCompactUInt32(((LoopEndState)s).loopBackState.stateNumber); } else if ( s instanceof BlockStartState ) { - data.add(((BlockStartState)s).endState.stateNumber); + writer.writeCompactUInt32(((BlockStartState)s).endState.stateNumber); } if (s.getStateType() != ATNState.RULE_STOP) { @@ -130,67 +124,42 @@ else if ( s instanceof BlockStartState ) { } // non-greedy states - data.add(nonGreedyStates.size()); + writer.writeCompactUInt32(nonGreedyStates.size()); for (int i = 0; i < nonGreedyStates.size(); i++) { - data.add(nonGreedyStates.get(i)); + writer.writeCompactUInt32(nonGreedyStates.get(i)); } // precedence states - data.add(precedenceStates.size()); + writer.writeCompactUInt32(precedenceStates.size()); for (int i = 0; i < precedenceStates.size(); i++) { - data.add(precedenceStates.get(i)); + writer.writeCompactUInt32(precedenceStates.get(i)); } int nrules = atn.ruleToStartState.length; - data.add(nrules); + writer.writeCompactUInt32(nrules); for (int r=0; r0 ) { for (ATNState modeStartState : atn.modeToStartState) { - data.add(modeStartState.stateNumber); + writer.writeCompactUInt32(modeStartState.stateNumber); } } List bmpSets = new ArrayList<>(); List smpSets = new ArrayList<>(); for (IntervalSet set : sets.keySet()) { - if (!set.isNil() && set.getMaxElement() <= Character.MAX_VALUE) { - bmpSets.add(set); - } - else { - smpSets.add(set); - } + List localSets = !set.isNil() && set.getMaxElement() <= Character.MAX_VALUE ? 
bmpSets : smpSets; + localSets.add(set); } - serializeSets( - data, - bmpSets, - new CodePointSerializer() { - @Override - public void serializeCodePoint(IntegerList data, int cp) { - data.add(cp); - } - }); - serializeSets( - data, - smpSets, - new CodePointSerializer() { - @Override - public void serializeCodePoint(IntegerList data, int cp) { - serializeInt(data, cp); - } - }); + serializeSets(writer, bmpSets, UnicodeSerializeMode.UNICODE_BMP); + serializeSets(writer, smpSets, UnicodeSerializeMode.UNICODE_SMP); Map setIndices = new HashMap<>(); int setIndex = 0; for (IntervalSet bmpSet : bmpSets) { @@ -200,7 +169,7 @@ public void serializeCodePoint(IntegerList data, int cp) { setIndices.put(smpSet, setIndex++); } - data.add(nedges); + writer.writeCompactUInt32(nedges); for (ATNState s : atn.states) { if ( s==null ) { // might be optimized away @@ -227,7 +196,7 @@ public void serializeCodePoint(IntegerList data, int cp) { switch ( edgeType ) { case Transition.RULE : trg = ((RuleTransition)t).followState.stateNumber; - arg1 = ((RuleTransition)t).target.stateNumber; + arg1 = t.target.stateNumber; arg2 = ((RuleTransition)t).ruleIndex; arg3 = ((RuleTransition)t).precedence; break; @@ -269,8 +238,6 @@ public void serializeCodePoint(IntegerList data, int cp) { arg3 = at.isCtxDependent ? 1 : 0 ; break; case Transition.SET : - arg1 = setIndices.get(((SetTransition)t).set); - break; case Transition.NOT_SET : arg1 = setIndices.get(((SetTransition)t).set); break; @@ -278,73 +245,65 @@ public void serializeCodePoint(IntegerList data, int cp) { break; } - data.add(src); - data.add(trg); - data.add(edgeType); - data.add(arg1); - data.add(arg2); - data.add(arg3); + writer.writeCompactUInt32(src); + writer.writeCompactUInt32(trg); + writer.writeUInt16(edgeType); + writer.writeUInt16(arg1); + writer.writeUInt16(arg2); + writer.writeUInt16(arg3); } } int ndecisions = atn.decisionToState.size(); - data.add(ndecisions); + writer.writeCompactUInt32(ndecisions); for (DecisionState decStartState : atn.decisionToState) { - data.add(decStartState.stateNumber); + writer.writeCompactUInt32(decStartState.stateNumber); } // // LEXER ACTIONS // if (atn.grammarType == ATNType.LEXER) { - data.add(atn.lexerActions.length); + writer.writeCompactUInt32(atn.lexerActions.length); for (LexerAction action : atn.lexerActions) { - data.add(action.getActionType().ordinal()); + writer.writeUInt16(action.getActionType().ordinal()); switch (action.getActionType()) { case CHANNEL: int channel = ((LexerChannelAction)action).getChannel(); - data.add(channel != -1 ? channel : 0xFFFF); - data.add(0); + writer.writeUInt16(channel != -1 ? channel : 0xFFFF); + writer.writeUInt16(0); break; case CUSTOM: int ruleIndex = ((LexerCustomAction)action).getRuleIndex(); int actionIndex = ((LexerCustomAction)action).getActionIndex(); - data.add(ruleIndex != -1 ? ruleIndex : 0xFFFF); - data.add(actionIndex != -1 ? actionIndex : 0xFFFF); + writer.writeUInt16(ruleIndex != -1 ? ruleIndex : 0xFFFF); + writer.writeUInt16(actionIndex != -1 ? actionIndex : 0xFFFF); break; case MODE: int mode = ((LexerModeAction)action).getMode(); - data.add(mode != -1 ? mode : 0xFFFF); - data.add(0); + writer.writeUInt16(mode != -1 ? mode : 0xFFFF); + writer.writeUInt16(0); break; case MORE: - data.add(0); - data.add(0); - break; - case POP_MODE: - data.add(0); - data.add(0); + case SKIP: + writer.writeUInt16(0); + writer.writeUInt16(0); break; case PUSH_MODE: mode = ((LexerPushModeAction)action).getMode(); - data.add(mode != -1 ? 
mode : 0xFFFF); - data.add(0); - break; - - case SKIP: - data.add(0); - data.add(0); + writer.writeUInt16(mode != -1 ? mode : 0xFFFF); + writer.writeUInt16(0); break; case TYPE: int type = ((LexerTypeAction)action).getType(); - data.add(type != -1 ? type : 0xFFFF); - data.add(0); + writer.writeUInt16(type != -1 ? type : 0xFFFF); + writer.writeUInt16(0); break; default: @@ -354,102 +313,81 @@ public void serializeCodePoint(IntegerList data, int cp) { } } - // Note: This value shifting loop is documented in ATNDeserializer. - // don't adjust the first value since that's the version number - for (int i = 1; i < data.size(); i++) { - if (data.get(i) < Character.MIN_VALUE || data.get(i) > Character.MAX_VALUE) { - throw new UnsupportedOperationException("Serialized ATN data element "+ - data.get(i)+ - " element "+i+" out of range "+ - (int)Character.MIN_VALUE+ - ".."+ - (int)Character.MAX_VALUE); - } - - int value = (data.get(i) + 2) & 0xFFFF; - data.set(i, value); - } - return data; } - private static void serializeSets( - IntegerList data, - Collection sets, - CodePointSerializer codePointSerializer) - { + private static void serializeSets(ATNDataWriter writer, Collection sets, UnicodeSerializeMode mode) { int nSets = sets.size(); - data.add(nSets); + writer.writeCompactUInt32(nSets); for (IntervalSet set : sets) { boolean containsEof = set.contains(Token.EOF); + int size = set.getIntervals().size(); if (containsEof && set.getIntervals().get(0).b == Token.EOF) { - data.add(set.getIntervals().size() - 1); - } - else { - data.add(set.getIntervals().size()); + size--; } + writer.writeCompactUInt32(size); - data.add(containsEof ? 1 : 0); + writer.writeUInt16(containsEof ? 1 : 0); for (Interval I : set.getIntervals()) { + int firstValue; if (I.a == Token.EOF) { if (I.b == Token.EOF) { continue; } else { - codePointSerializer.serializeCodePoint(data, 0); + firstValue = 0; } } else { - codePointSerializer.serializeCodePoint(data, I.a); + firstValue = I.a; } - codePointSerializer.serializeCodePoint(data, I.b); + if (mode == UnicodeSerializeMode.UNICODE_BMP) { + writer.writeUInt16(firstValue); + writer.writeUInt16(I.b); + } else { + writer.writeUInt32(firstValue); + writer.writeUInt32(I.b); + } } } } public String decode(char[] data) { - data = data.clone(); - // don't adjust the first value since that's the version number - for (int i = 1; i < data.length; i++) { - data[i] = (char)(data[i] - 2); - } - + ATNDataReader dataReader = new ATNDataReader(data); StringBuilder buf = new StringBuilder(); - int p = 0; - int version = ATNDeserializer.toInt(data[p++]); + int version = dataReader.readUInt16(false); if (version != ATNDeserializer.SERIALIZED_VERSION) { String reason = String.format("Could not deserialize ATN with version %d (expected %d).", version, ATNDeserializer.SERIALIZED_VERSION); throw new UnsupportedOperationException(new InvalidClassException(ATN.class.getName(), reason)); } - UUID uuid = ATNDeserializer.toUUID(data, p); - p += 8; + UUID uuid = dataReader.readUUID(); if (!uuid.equals(ATNDeserializer.SERIALIZED_UUID)) { String reason = String.format(Locale.getDefault(), "Could not deserialize ATN with UUID %s (expected %s).", uuid, ATNDeserializer.SERIALIZED_UUID); throw new UnsupportedOperationException(new InvalidClassException(ATN.class.getName(), reason)); } - p++; // skip grammarType - int maxType = ATNDeserializer.toInt(data[p++]); + dataReader.readUInt16(); // skip grammarType + int maxType = dataReader.readUInt16(); buf.append("max type ").append(maxType).append("\n"); - int nstates = 
ATNDeserializer.toInt(data[p++]); + int nstates = dataReader.readCompactUInt32(); for (int i=0; i").append(trg) .append(" ").append(Transition.serializationNames.get(ttype)) .append(" ").append(arg1).append(",").append(arg2).append(",").append(arg3) .append("\n"); - p += 6; } - int ndecisions = ATNDeserializer.toInt(data[p++]); + int ndecisions = dataReader.readCompactUInt32(); for (int i=0; i tokenNames) { char[] data = Utils.toCharArray(serialized); return new ATNSerializer(atn, tokenNames).decode(data); } - - private void serializeUUID(IntegerList data, UUID uuid) { - serializeLong(data, uuid.getLeastSignificantBits()); - serializeLong(data, uuid.getMostSignificantBits()); - } - - private void serializeLong(IntegerList data, long value) { - serializeInt(data, (int)value); - serializeInt(data, (int)(value >> 32)); - } - - private void serializeInt(IntegerList data, int value) { - data.add((char)value); - data.add((char)(value >> 16)); - } } diff --git a/runtime/Java/src/org/antlr/v4/runtime/atn/ATNSimulator.java b/runtime/Java/src/org/antlr/v4/runtime/atn/ATNSimulator.java index 752a9a707d..ef32cb81b8 100644 --- a/runtime/Java/src/org/antlr/v4/runtime/atn/ATNSimulator.java +++ b/runtime/Java/src/org/antlr/v4/runtime/atn/ATNSimulator.java @@ -98,11 +98,8 @@ public PredictionContext getCachedContext(PredictionContext context) { if ( sharedContextCache==null ) return context; synchronized (sharedContextCache) { - IdentityHashMap visited = - new IdentityHashMap(); - return PredictionContext.getCachedContext(context, - sharedContextCache, - visited); + IdentityHashMap visited = new IdentityHashMap<>(); + return PredictionContext.getCachedContext(context, sharedContextCache, visited); } } @@ -130,38 +127,6 @@ public static void checkCondition(boolean condition, String message) { new ATNDeserializer().checkCondition(condition, message); } - /** - * @deprecated Use {@link ATNDeserializer#toInt} instead. - */ - @Deprecated - public static int toInt(char c) { - return ATNDeserializer.toInt(c); - } - - /** - * @deprecated Use {@link ATNDeserializer#toInt32} instead. - */ - @Deprecated - public static int toInt32(char[] data, int offset) { - return ATNDeserializer.toInt32(data, offset); - } - - /** - * @deprecated Use {@link ATNDeserializer#toLong} instead. - */ - @Deprecated - public static long toLong(char[] data, int offset) { - return ATNDeserializer.toLong(data, offset); - } - - /** - * @deprecated Use {@link ATNDeserializer#toUUID} instead. - */ - @Deprecated - public static UUID toUUID(char[] data, int offset) { - return ATNDeserializer.toUUID(data, offset); - } - /** * @deprecated Use {@link ATNDeserializer#edgeFactory} instead. 
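// Editor's note, not part of the patch: rule token types and lexer-action
// arguments in the serialized form store -1 as the unsigned 16-bit sentinel
// 0xFFFF (Character.MAX_VALUE) and map it back on read, as seen in the
// serializer and deserializer hunks earlier in this patch. A tiny sketch of
// that convention; names are illustrative only.
final class SentinelSketch {
	static int toUInt16(int value)  { return value != -1 ? value : 0xFFFF; }
	static int fromUInt16(int word) { return word != 0xFFFF ? word : -1; }
}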
*/ diff --git a/runtime/Java/src/org/antlr/v4/runtime/atn/UnicodeSerializeMode.java b/runtime/Java/src/org/antlr/v4/runtime/atn/UnicodeSerializeMode.java new file mode 100644 index 0000000000..26f22da6af --- /dev/null +++ b/runtime/Java/src/org/antlr/v4/runtime/atn/UnicodeSerializeMode.java @@ -0,0 +1,6 @@ +package org.antlr.v4.runtime.atn; + +enum UnicodeSerializeMode { + UNICODE_BMP, + UNICODE_SMP +} diff --git a/runtime/Java/src/org/antlr/v4/runtime/misc/IntegerList.java b/runtime/Java/src/org/antlr/v4/runtime/misc/IntegerList.java index d6af911835..0580b89b07 100644 --- a/runtime/Java/src/org/antlr/v4/runtime/misc/IntegerList.java +++ b/runtime/Java/src/org/antlr/v4/runtime/misc/IntegerList.java @@ -15,12 +15,11 @@ */ public class IntegerList { - private static int[] EMPTY_DATA = new int[0]; + private static final int[] EMPTY_DATA = new int[0]; private static final int INITIAL_SIZE = 4; private static final int MAX_ARRAY_SIZE = Integer.MAX_VALUE - 8; - private int[] _data; private int _size; @@ -34,12 +33,7 @@ public IntegerList(int capacity) { throw new IllegalArgumentException(); } - if (capacity == 0) { - _data = EMPTY_DATA; - } - else { - _data = new int[capacity]; - } + _data = capacity == 0 ? EMPTY_DATA : new int[capacity]; } public IntegerList(IntegerList list) { @@ -256,13 +250,7 @@ private void ensureCapacity(int capacity) { throw new OutOfMemoryError(); } - int newLength; - if (_data.length == 0) { - newLength = INITIAL_SIZE; - } - else { - newLength = _data.length; - } + int newLength = _data.length == 0 ? INITIAL_SIZE : _data.length; while (newLength < capacity) { newLength = newLength * 2;
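// Editor's sketch, not part of the patch: an end-to-end round trip through the
// new writer/reader pair, showing that values above the old 65535 limit
// survive serialization, which is the point of this change. Elsewhere in the
// patch the serialized IntegerList is turned into a char[] with
// Utils.toCharArray; a plain cast loop keeps this sketch self-contained.
// The class name is illustrative only.
import org.antlr.v4.runtime.atn.ATNDataReader;
import org.antlr.v4.runtime.atn.ATNDataWriter;
import org.antlr.v4.runtime.misc.IntegerList;

class CompactRoundTripSketch {
	public static void main(String[] args) {
		IntegerList data = new IntegerList();
		ATNDataWriter writer = new ATNDataWriter(data);
		writer.writeUInt16(3, false);       // unshifted, like the version number
		writer.writeCompactUInt32(70_000);  // e.g. a state count past 65535
		writer.writeUInt32(0x10FFFF);       // e.g. a supplementary code point

		// All stored values fit in 16 bits after the +2 shift, so a cast suffices.
		char[] chars = new char[data.size()];
		for (int i = 0; i < data.size(); i++) {
			chars[i] = (char) data.get(i);
		}

		ATNDataReader reader = new ATNDataReader(chars);
		System.out.println(reader.readUInt16(false));   // 3
		System.out.println(reader.readCompactUInt32()); // 70000
		System.out.println(reader.readUInt32());        // 1114111
	}
}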