From 374c361e3744dd2928db7f771bc7051e0125dcd4 Mon Sep 17 00:00:00 2001 From: Terence Parr Date: Sat, 26 Mar 2022 13:30:58 -0700 Subject: [PATCH] Use signed ints for ATN serialization not uint16, except for java (#3591) * refactor serialize so we don't need comments * more cleanup during refactor * store language in serializer obj * A lexer rule token type should never be -1 (EOF). 0 is fragment but then must be > 0. * Go uses int not uint16 for ATN now. java/go/python3 pass * remove checks for 0xFFFF in Go. * C++ uint16_t to int for ATN. * add mac php dir; fix type on accept() for generated code to be mixed. * Add test from @kvanTTT. This PR fixes https://github.com/antlr/antlr4/issues/3555 for non-Java targets. * cleanup and add big lexer from https://github.com/antlr/antlr4/pull/3546 * increase mvn mem size to 2G * increase mvn mem size to 8G * turn off the big ATN lexer test as we have memory issues during testing. * Fixes #3592 * Revert "C++ uint16_t to int for ATN." This reverts commit 4d2ebbf5671a5b373d2ca3b5a05464ccb8b71b52. # Conflicts: # runtime/Cpp/runtime/src/atn/ATNSerializer.cpp # runtime/Cpp/runtime/src/tree/xpath/XPathLexer.cpp * C++ uint16_t to int32_t for ATN. * rm unnecessary include file, updating project file. get rid of the 0xFFFF does in the C++ deserialization * rm refs to 0xFFFF in swift * javascript tests were running as Node...added to ignore list. * don't distinguish between 16 and 32 bit char sets in serialization; Python2/3 updated to work with this change. * update C++ to deserialize only 32-bit sets * 0xFFFF -> -1 for C++ target. * get other targets to use 32-bit sets in serialization. tests pass locally. * refactor to reduce code size * add comment * oops. comment out call to writeSerializedATNIntegerHistogram(). I wonder if this is why it ran out of memory during testing? * all but Java, Node, PHP, Go work now for the huge lexer file; I have set them to ignore. note that the swift target takes over a minute to lex it. 
I've turned off Node but it does not seem to terminate but it could terminate eventually. * all but Java, Node, PHP, Go work now for the huge lexer file; I have set them to ignore. note that the swift target takes over a minute to lex it. I've turned off Node but it does not seem to terminate but it could terminate eventually. * Turn off this big lexer because we get memory errors during continuous integration * Intermediate commit where I have shuffled around all of the -1 flipping and bumping by two. work still needs to be done because the token stream rewriter stuff fails. and I assume the other decoding for human readability testing doesn't work * convert decode to use int[]; remove dead code. don't use serializeAsChar stuff. more tests pass. * more tests passing. simplify. When copying atn, must run ATN through serializer to set some state flags. * 0xFFFD+ are not valid char * clean up. tests passing now * huge clean up. Got Java working with 32-bit ATNs! Still working on cleanup but I want to run the tests * Cleanup the hack I did earlier; everything still seems to work * Use linux DCO not our old contributors certificate of origin * remove bump-by-2 code * clean up per @kvanTTT. Can't test locally on this box. Will see what CI says. * tweak comment * Revert "Use linux DCO not our old contributors certificate of origin" This reverts commit b0f8551c9a674a0a1e045b9a710800df28e72c10. 
* see if C++ works in CI for huge ATN --- .circleci/scripts/run-tests-cpp.sh | 1 + .circleci/scripts/run-tests-dart.sh | 1 + .circleci/scripts/run-tests-dotnet.sh | 1 + .circleci/scripts/run-tests-go.sh | 1 + .circleci/scripts/run-tests-javascript.sh | 1 + .circleci/scripts/run-tests-php.sh | 1 + .circleci/scripts/run-tests-python2.sh | 1 + .circleci/scripts/run-tests-python3.sh | 1 + .circleci/scripts/run-tests-swift.sh | 1 + .github/scripts-macosx/run-tests-cpp.sh | 1 + .github/scripts-macosx/run-tests-dotnet.sh | 1 + .github/scripts-macosx/run-tests-swift.sh | 1 + .github/scripts-windows/run-tests-csharp.cmd | 1 + .github/scripts-windows/run-tests-dart.cmd | 1 + .github/scripts-windows/run-tests-go.cmd | 1 + .github/scripts-windows/run-tests-java.cmd | 1 + .../scripts-windows/run-tests-javascript.cmd | 1 + .github/scripts-windows/run-tests-php.cmd | 1 + .github/scripts-windows/run-tests-python2.cmd | 1 + .github/scripts-windows/run-tests-python3.cmd | 1 + runtime-testsuite/pom.xml | 2 + .../descriptors/LexerExec/TokenType0xFFFF.txt | 13 + .../descriptors/LexerExec/UnicodeCharSet.txt | 2 +- .../v4/test/runtime/BaseRuntimeTest.java | 5 +- .../test/runtime/BaseRuntimeTestSupport.java | 6 +- .../runtime/GeneratedLexerDescriptors.java | 66 +- .../java/TestInterpreterDataReader.java | 8 +- .../runtime/java/api/perf/TimeLexerSpeed.java | 2 +- .../v4/test/runtime/php/BasePHPTest.java | 2 +- runtime/CSharp/src/Atn/ATNDeserializer.cs | 23 +- runtime/Cpp/runtime/antlr4cpp-vs2013.vcxproj | 2 - .../runtime/antlr4cpp-vs2013.vcxproj.filters | 3 - runtime/Cpp/runtime/antlr4cpp-vs2015.vcxproj | 1 - .../runtime/antlr4cpp-vs2015.vcxproj.filters | 3 - runtime/Cpp/runtime/antlr4cpp-vs2017.vcxproj | 1 - .../runtime/antlr4cpp-vs2017.vcxproj.filters | 3 - runtime/Cpp/runtime/antlr4cpp-vs2019.vcxproj | 1 - .../runtime/antlr4cpp-vs2019.vcxproj.filters | 3 - runtime/Cpp/runtime/src/Parser.cpp | 4 +- runtime/Cpp/runtime/src/Recognizer.h | 2 +- runtime/Cpp/runtime/src/antlr4-runtime.h | 1 - 
.../Cpp/runtime/src/atn/ATNDeserializer.cpp | 49 +- runtime/Cpp/runtime/src/atn/ATNDeserializer.h | 2 +- runtime/Cpp/runtime/src/atn/ATNSerializer.cpp | 589 ------------------ runtime/Cpp/runtime/src/atn/ATNSerializer.h | 61 -- .../src/misc/InterpreterDataReader.cpp | 4 +- .../Cpp/runtime/src/tree/xpath/XPathLexer.cpp | 10 +- .../Cpp/runtime/src/tree/xpath/XPathLexer.h | 2 +- .../lib/src/atn/src/atn_deserializer.dart | 30 +- runtime/Go/antlr/atn_deserializer.go | 41 +- runtime/Go/antlr/testing_lexer_b_test.go | 4 +- runtime/Go/antlr/tokenstream_rewriter_test.go | 4 +- .../antlr/v4/runtime/atn/ATNDeserializer.java | 229 +++---- .../antlr/v4/runtime/atn/ATNSerializer.java | 571 +++++------------ .../antlr/v4/runtime/atn/ATNSimulator.java | 62 -- .../antlr/v4/runtime/misc/IntegerList.java | 5 +- .../runtime/misc/InterpreterDataReader.java | 46 +- .../src/antlr4/atn/ATNDeserializer.js | 30 +- .../Python2/src/antlr4/atn/ATNDeserializer.py | 21 +- .../Python3/src/antlr4/atn/ATNDeserializer.py | 26 +- .../Sources/Antlr4/atn/ATNDeserializer.swift | 31 +- .../org/antlr/v4/test/tool/ATNDescriber.java | 203 ++++++ .../v4/test/tool/TestATNDeserialization.java | 44 +- .../v4/test/tool/TestATNSerialization.java | 268 ++++---- ...rSupportTest.java => TestCharSupport.java} | 3 +- .../tool/TestUtils.java} | 7 +- .../v4/tool/templates/codegen/Cpp/Cpp.stg | 14 +- .../antlr/v4/tool/templates/codegen/Go/Go.stg | 10 +- .../v4/tool/templates/codegen/Java/Java.stg | 2 +- tool/src/org/antlr/v4/Tool.java | 4 +- tool/src/org/antlr/v4/codegen/Target.java | 4 +- .../antlr/v4/codegen/model/Recognizer.java | 10 +- .../antlr/v4/codegen/model/SerializedATN.java | 33 +- .../v4/codegen/model/SerializedJavaATN.java | 40 ++ tool/src/org/antlr/v4/tool/Grammar.java | 36 +- .../v4/tool/GrammarParserInterpreter.java | 7 +- 76 files changed, 984 insertions(+), 1691 deletions(-) create mode 100644 runtime-testsuite/resources/org/antlr/v4/test/runtime/descriptors/LexerExec/TokenType0xFFFF.txt delete mode 
100755 runtime/Cpp/runtime/src/atn/ATNSerializer.cpp delete mode 100755 runtime/Cpp/runtime/src/atn/ATNSerializer.h create mode 100644 tool-testsuite/test/org/antlr/v4/test/tool/ATNDescriber.java rename tool-testsuite/test/org/antlr/v4/test/tool/{CharSupportTest.java => TestCharSupport.java} (99%) rename tool-testsuite/test/org/antlr/v4/{misc/UtilsTest.java => test/tool/TestUtils.java} (97%) create mode 100644 tool/src/org/antlr/v4/codegen/model/SerializedJavaATN.java diff --git a/.circleci/scripts/run-tests-cpp.sh b/.circleci/scripts/run-tests-cpp.sh index 3d7e52835e..58b59ff283 100755 --- a/.circleci/scripts/run-tests-cpp.sh +++ b/.circleci/scripts/run-tests-cpp.sh @@ -3,5 +3,6 @@ set -euo pipefail pushd runtime-testsuite +export MAVEN_OPTS="-Xmx8g" mvn -Dparallel=classes -DthreadCount=4 -Dtest=cpp.** test popd diff --git a/.circleci/scripts/run-tests-dart.sh b/.circleci/scripts/run-tests-dart.sh index 4bd9f78ff0..2f9015df6c 100755 --- a/.circleci/scripts/run-tests-dart.sh +++ b/.circleci/scripts/run-tests-dart.sh @@ -6,5 +6,6 @@ dart --version pushd runtime-testsuite echo "running maven tests..." + export MAVEN_OPTS="-Xmx8g" mvn -Dparallel=classes -DthreadCount=4 -Dtest=dart.** test popd diff --git a/.circleci/scripts/run-tests-dotnet.sh b/.circleci/scripts/run-tests-dotnet.sh index cb237e6ab5..c0e6eeef6f 100755 --- a/.circleci/scripts/run-tests-dotnet.sh +++ b/.circleci/scripts/run-tests-dotnet.sh @@ -3,5 +3,6 @@ set -euo pipefail pushd runtime-testsuite +export MAVEN_OPTS="-Xmx8g" mvn -Dparallel=classes -DthreadCount=4 -Dtest=csharp.** test popd diff --git a/.circleci/scripts/run-tests-go.sh b/.circleci/scripts/run-tests-go.sh index d98b2b43fb..b5317f3e2a 100755 --- a/.circleci/scripts/run-tests-go.sh +++ b/.circleci/scripts/run-tests-go.sh @@ -6,5 +6,6 @@ go version pushd runtime-testsuite echo "running maven tests..." 
+ export MAVEN_OPTS="-Xmx8g" mvn -Dparallel=classes -DthreadCount=4 -Dtest=go.** test popd diff --git a/.circleci/scripts/run-tests-javascript.sh b/.circleci/scripts/run-tests-javascript.sh index 4923eb4b9f..b0b461fce0 100755 --- a/.circleci/scripts/run-tests-javascript.sh +++ b/.circleci/scripts/run-tests-javascript.sh @@ -15,6 +15,7 @@ popd pushd runtime-testsuite echo "running maven tests..." + export MAVEN_OPTS="-Xmx8g" mvn -Dtest=javascript.** test RESULT+=$? diff --git a/.circleci/scripts/run-tests-php.sh b/.circleci/scripts/run-tests-php.sh index 92e4722bf5..e46ebb3d66 100755 --- a/.circleci/scripts/run-tests-php.sh +++ b/.circleci/scripts/run-tests-php.sh @@ -7,5 +7,6 @@ php -v php_path=$(which php) pushd runtime-testsuite echo "running maven tests..." + export MAVEN_OPTS="-Xmx8g" mvn -DPHP_PATH="${php_path}" -Dparallel=classes -DthreadCount=4 -Dtest=php.** test popd diff --git a/.circleci/scripts/run-tests-python2.sh b/.circleci/scripts/run-tests-python2.sh index e76f862ad1..772009ac3d 100755 --- a/.circleci/scripts/run-tests-python2.sh +++ b/.circleci/scripts/run-tests-python2.sh @@ -17,5 +17,6 @@ python2 --version pushd runtime-testsuite echo "running maven tests..." + export MAVEN_OPTS="-Xmx8g" mvn -Dparallel=classes -DthreadCount=4 -Dtest=python2.** test popd \ No newline at end of file diff --git a/.circleci/scripts/run-tests-python3.sh b/.circleci/scripts/run-tests-python3.sh index c39b88b441..2be5773a1c 100755 --- a/.circleci/scripts/run-tests-python3.sh +++ b/.circleci/scripts/run-tests-python3.sh @@ -17,5 +17,6 @@ python3 --version pushd runtime-testsuite echo "running maven tests..." 
+ export MAVEN_OPTS="-Xmx8g" mvn -Dparallel=classes -DthreadCount=4 -Dtest=python3.** test popd diff --git a/.circleci/scripts/run-tests-swift.sh b/.circleci/scripts/run-tests-swift.sh index 8c774a8bfc..bc68f84735 100755 --- a/.circleci/scripts/run-tests-swift.sh +++ b/.circleci/scripts/run-tests-swift.sh @@ -17,5 +17,6 @@ set -euo pipefail pushd runtime-testsuite echo "running maven tests..." + export MAVEN_OPTS="-Xmx8g" mvn -Dparallel=classes -DthreadCount=4 -Dtest=swift.** test popd diff --git a/.github/scripts-macosx/run-tests-cpp.sh b/.github/scripts-macosx/run-tests-cpp.sh index 3d7e52835e..58b59ff283 100755 --- a/.github/scripts-macosx/run-tests-cpp.sh +++ b/.github/scripts-macosx/run-tests-cpp.sh @@ -3,5 +3,6 @@ set -euo pipefail pushd runtime-testsuite +export MAVEN_OPTS="-Xmx8g" mvn -Dparallel=classes -DthreadCount=4 -Dtest=cpp.** test popd diff --git a/.github/scripts-macosx/run-tests-dotnet.sh b/.github/scripts-macosx/run-tests-dotnet.sh index d943c6e952..32788e1c85 100755 --- a/.github/scripts-macosx/run-tests-dotnet.sh +++ b/.github/scripts-macosx/run-tests-dotnet.sh @@ -13,5 +13,6 @@ dotnet build -c Release -f netstandard2.0 runtime/CSharp/Antlr4.csproj # run tests pushd runtime-testsuite/ +export MAVEN_OPTS="-Xmx8g" mvn -Dparallel=classes -DthreadCount=4 -Dtest=csharp.** test popd diff --git a/.github/scripts-macosx/run-tests-swift.sh b/.github/scripts-macosx/run-tests-swift.sh index 4d4532d1b5..a6d6470179 100755 --- a/.github/scripts-macosx/run-tests-swift.sh +++ b/.github/scripts-macosx/run-tests-swift.sh @@ -38,6 +38,7 @@ swift build --version cd runtime-testsuite/ # mvn -e -Dparallel=classes -DthreadCount=4 -Dtest=swift.** test # I don't know swift enough to make it parallel. revert to single threaded +export MAVEN_OPTS="-Xmx8g" mvn -e -Dtest=swift.** test rc=$? 
cat target/surefire-reports/*.dumpstream || true diff --git a/.github/scripts-windows/run-tests-csharp.cmd b/.github/scripts-windows/run-tests-csharp.cmd index c5e1fb0a77..28fd0d3c99 100644 --- a/.github/scripts-windows/run-tests-csharp.cmd +++ b/.github/scripts-windows/run-tests-csharp.cmd @@ -1,5 +1,6 @@ dotnet build runtime/CSharp/src/Antlr4.csproj -c Release dotnet pack runtime/CSharp/src/Antlr4.csproj -c Release cd runtime-testsuite +set MAVEN_OPTS=-Xmx8g mvn -Dparallel=classes -DthreadCount=2 -Dtest=csharp.** test cd .. diff --git a/.github/scripts-windows/run-tests-dart.cmd b/.github/scripts-windows/run-tests-dart.cmd index 2fd5034ba7..74fa92b52d 100644 --- a/.github/scripts-windows/run-tests-dart.cmd +++ b/.github/scripts-windows/run-tests-dart.cmd @@ -1,5 +1,6 @@ C:\ProgramData\chocolatey\bin\choco.exe -y install dart-sdk cd runtime-testsuite +set MAVEN_OPTS=-Xmx8g mvn -Dtest=dart.** test -Dantlr-dart-dart="C:\tools\dart-sdk\bin\dart.exe" -Dantlr-dart-pub="C:\tools\dart-sdk\bin\pub.bat" -Dantlr-dart-dart2native="C:\tools\dart-sdk\bin\dart2native.bat" cd .. diff --git a/.github/scripts-windows/run-tests-go.cmd b/.github/scripts-windows/run-tests-go.cmd index 1c15fd1c95..d07d7e1ef2 100644 --- a/.github/scripts-windows/run-tests-go.cmd +++ b/.github/scripts-windows/run-tests-go.cmd @@ -1,3 +1,4 @@ cd runtime-testsuite +set MAVEN_OPTS=-Xmx8g mvn -Dparallel=classes -DthreadCount=2 -Dtest=go.** test cd .. diff --git a/.github/scripts-windows/run-tests-java.cmd b/.github/scripts-windows/run-tests-java.cmd index 55e9ea5621..87a72b3c72 100755 --- a/.github/scripts-windows/run-tests-java.cmd +++ b/.github/scripts-windows/run-tests-java.cmd @@ -1,3 +1,4 @@ cd runtime-testsuite +set MAVEN_OPTS=-Xmx8g mvn -Dparallel=classes -DthreadCount=2 -Dtest=java.** test cd .. 
diff --git a/.github/scripts-windows/run-tests-javascript.cmd b/.github/scripts-windows/run-tests-javascript.cmd index b8744e8980..81d1eacbde 100644 --- a/.github/scripts-windows/run-tests-javascript.cmd +++ b/.github/scripts-windows/run-tests-javascript.cmd @@ -1,3 +1,4 @@ cd runtime-testsuite +set MAVEN_OPTS=-Xmx8g mvn -Dparallel=classes -DthreadCount=2 -Dtest=javascript.** test cd .. diff --git a/.github/scripts-windows/run-tests-php.cmd b/.github/scripts-windows/run-tests-php.cmd index 155cc0472f..1dfe69cdb2 100644 --- a/.github/scripts-windows/run-tests-php.cmd +++ b/.github/scripts-windows/run-tests-php.cmd @@ -4,5 +4,6 @@ git clone https://github.com/antlr/antlr-php-runtime.git move antlr-php-runtime runtime\PHP cd runtime-testsuite +set MAVEN_OPTS=-Xmx8g mvn -Dparallel=classes -DthreadCount=2 -Dtest=php.** test -Dantlr-php-php="C:\tools\php81\php.exe" cd .. diff --git a/.github/scripts-windows/run-tests-python2.cmd b/.github/scripts-windows/run-tests-python2.cmd index 351355b03a..6f2defe23c 100644 --- a/.github/scripts-windows/run-tests-python2.cmd +++ b/.github/scripts-windows/run-tests-python2.cmd @@ -1,3 +1,4 @@ cd runtime-testsuite +set MAVEN_OPTS=-Xmx8g mvn -Dparallel=classes -DthreadCount=2 -Dantlr-python2-python="C:\Python27\python.exe" -Dtest=python2.** test cd .. diff --git a/.github/scripts-windows/run-tests-python3.cmd b/.github/scripts-windows/run-tests-python3.cmd index fe448be018..f08639c70d 100644 --- a/.github/scripts-windows/run-tests-python3.cmd +++ b/.github/scripts-windows/run-tests-python3.cmd @@ -1,3 +1,4 @@ cd runtime-testsuite +set MAVEN_OPTS=-Xmx8g mvn -Dparallel=classes -DthreadCount=2 -Dantlr-python3-python="C:\Python310\python.exe" -Dtest=python3.** test cd .. 
diff --git a/runtime-testsuite/pom.xml b/runtime-testsuite/pom.xml index 50d1bed541..bcba9ceae2 100644 --- a/runtime-testsuite/pom.xml +++ b/runtime-testsuite/pom.xml @@ -132,6 +132,8 @@ maven-compiler-plugin 8 + 9 + 9 diff --git a/runtime-testsuite/resources/org/antlr/v4/test/runtime/descriptors/LexerExec/TokenType0xFFFF.txt b/runtime-testsuite/resources/org/antlr/v4/test/runtime/descriptors/LexerExec/TokenType0xFFFF.txt new file mode 100644 index 0000000000..9018111fe7 --- /dev/null +++ b/runtime-testsuite/resources/org/antlr/v4/test/runtime/descriptors/LexerExec/TokenType0xFFFF.txt @@ -0,0 +1,13 @@ +[type] +Lexer + +[grammar] +lexer grammar L; +T_FFFF: 'FFFF' -> type(65535); + +[input] +FFFF + +[output] +[@0,0:3='FFFF',<65535>,1:0] +[@1,4:3='',<-1>,1:4] diff --git a/runtime-testsuite/resources/org/antlr/v4/test/runtime/descriptors/LexerExec/UnicodeCharSet.txt b/runtime-testsuite/resources/org/antlr/v4/test/runtime/descriptors/LexerExec/UnicodeCharSet.txt index e9918ad741..f697767d92 100644 --- a/runtime-testsuite/resources/org/antlr/v4/test/runtime/descriptors/LexerExec/UnicodeCharSet.txt +++ b/runtime-testsuite/resources/org/antlr/v4/test/runtime/descriptors/LexerExec/UnicodeCharSet.txt @@ -6,7 +6,7 @@ Lexer [grammar] lexer grammar L; -ID : ([A-Z_]|'Ā'..'￾') ([A-Z_0-9]|'Ā'..'￾')*; +ID : ([A-Z_]|'Ā'..'\uFFFC') ([A-Z_0-9]|'Ā'..'\uFFFC')*; // FFFD+ are not valid char [input] 均 diff --git a/runtime-testsuite/test/org/antlr/v4/test/runtime/BaseRuntimeTest.java b/runtime-testsuite/test/org/antlr/v4/test/runtime/BaseRuntimeTest.java index f0546226fd..fdd476a6f8 100644 --- a/runtime-testsuite/test/org/antlr/v4/test/runtime/BaseRuntimeTest.java +++ b/runtime-testsuite/test/org/antlr/v4/test/runtime/BaseRuntimeTest.java @@ -366,9 +366,10 @@ public static RuntimeTestDescriptor[] getRuntimeTestDescriptors(String group, St } if (group.equals("LexerExec")) { - descriptors.add(GeneratedLexerDescriptors.getLineSeparatorLfTest(targetName)); - 
descriptors.add(GeneratedLexerDescriptors.getLineSeparatorCrLfTest(targetName)); + descriptors.add(GeneratedLexerDescriptors.getLineSeparatorLfDescriptor(targetName)); + descriptors.add(GeneratedLexerDescriptors.getLineSeparatorCrLfDescriptor(targetName)); descriptors.add(GeneratedLexerDescriptors.getLargeLexerDescriptor(targetName)); + descriptors.add(GeneratedLexerDescriptors.getAtnStatesSizeMoreThan65535Descriptor(targetName)); } return descriptors.toArray(new RuntimeTestDescriptor[0]); diff --git a/runtime-testsuite/test/org/antlr/v4/test/runtime/BaseRuntimeTestSupport.java b/runtime-testsuite/test/org/antlr/v4/test/runtime/BaseRuntimeTestSupport.java index 5ab6cea706..578e0cf3e1 100644 --- a/runtime-testsuite/test/org/antlr/v4/test/runtime/BaseRuntimeTestSupport.java +++ b/runtime-testsuite/test/org/antlr/v4/test/runtime/BaseRuntimeTestSupport.java @@ -6,6 +6,7 @@ import org.antlr.v4.runtime.atn.ATN; import org.antlr.v4.runtime.atn.ATNDeserializer; import org.antlr.v4.runtime.atn.ATNSerializer; +import org.antlr.v4.runtime.misc.IntegerList; import org.antlr.v4.semantics.SemanticPipeline; import org.antlr.v4.tool.Grammar; import org.antlr.v4.tool.LexerGrammar; @@ -219,8 +220,9 @@ protected ATN createATN(Grammar g, boolean useSerializer) { ATN atn = g.atn; if ( useSerializer ) { - char[] serialized = ATNSerializer.getSerializedAsChars(atn, g.getLanguage()); - return new ATNDeserializer().deserialize(serialized); + // sets some flags in ATN + IntegerList serialized = ATNSerializer.getSerialized(atn); + return new ATNDeserializer().deserialize(serialized.toArray()); } return atn; diff --git a/runtime-testsuite/test/org/antlr/v4/test/runtime/GeneratedLexerDescriptors.java b/runtime-testsuite/test/org/antlr/v4/test/runtime/GeneratedLexerDescriptors.java index e0a1bb5ea3..b489e36142 100644 --- a/runtime-testsuite/test/org/antlr/v4/test/runtime/GeneratedLexerDescriptors.java +++ b/runtime-testsuite/test/org/antlr/v4/test/runtime/GeneratedLexerDescriptors.java @@ -1,7 
+1,9 @@ package org.antlr.v4.test.runtime; +import java.util.*; + public class GeneratedLexerDescriptors { - static RuntimeTestDescriptor getLineSeparatorLfTest(String targetName) { + static RuntimeTestDescriptor getLineSeparatorLfDescriptor(String targetName) { UniversalRuntimeTestDescriptor result = new UniversalRuntimeTestDescriptor(); result.name = "LineSeparatorLf"; result.targetName = targetName; @@ -20,7 +22,7 @@ static RuntimeTestDescriptor getLineSeparatorLfTest(String targetName) { return result; } - static RuntimeTestDescriptor getLineSeparatorCrLfTest(String targetName) { + static RuntimeTestDescriptor getLineSeparatorCrLfDescriptor(String targetName) { UniversalRuntimeTestDescriptor result = new UniversalRuntimeTestDescriptor(); result.name = "LineSeparatorCrLf"; result.targetName = targetName; @@ -65,4 +67,64 @@ static RuntimeTestDescriptor getLargeLexerDescriptor(String targetName) { "[@1,5:4='',<-1>,1:5]\n"; return result; } + + static RuntimeTestDescriptor getAtnStatesSizeMoreThan65535Descriptor(String targetName) { + UniversalRuntimeTestDescriptor result = new UniversalRuntimeTestDescriptor(); + result.name = "AtnStatesSizeMoreThan65535"; + result.notes = "Regression for https://github.com/antlr/antlr4/issues/1863"; + result.targetName = targetName; + result.testType = "Lexer"; + + // I tried playing around with different sizes and I think 1002 works for Go but 1003 does not; + // the executing lexer gets a token syntax error for T208 or something like that + final int tokensCount = 1024; + final String suffix = String.join("", Collections.nCopies(70, "_")); + + String grammarName = "L"; + StringBuilder grammar = new StringBuilder(); + grammar.append("lexer grammar ").append(grammarName).append(";\n"); + grammar.append('\n'); + StringBuilder input = new StringBuilder(); + StringBuilder output = new StringBuilder(); + int startOffset; + int stopOffset = -2; + for (int i = 0; i < tokensCount; i++) { + String ruleName = String.format("T_%06d", i); + 
String value = ruleName+suffix; + grammar.append(ruleName).append(": '").append(value).append("';\n"); + input.append(value).append('\n'); + + startOffset = stopOffset + 2; + stopOffset += value.length() + 1; + + output.append("[@").append(i).append(',').append(startOffset).append(':').append(stopOffset) + .append("='").append(value).append("',<").append(i + 1).append(">,").append(i + 1) + .append(":0]\n"); + } + + grammar.append("\n"); + grammar.append("WS: [ \\t\\r\\n]+ -> skip;\n"); + + startOffset = stopOffset + 2; + stopOffset = startOffset - 1; + output.append("[@").append(tokensCount).append(',').append(startOffset).append(':').append(stopOffset) + .append("='',<-1>,").append(tokensCount + 1).append(":0]\n"); + + result.grammar = grammar.toString(); + result.grammarName = grammarName; + result.input = input.toString(); + result.output = output.toString(); + + // We seem to get memory errors and so I am turning this off during CI + List all = Arrays.asList( +// "CSharp", "Python2", "Python3", "Cpp", "Go", "PHP", "Swift", "Java", "JavaScript", "Node", "Dart" + "CSharp", "Python2", "Python3", "Go", "PHP", "Swift", "JavaScript", "Node", "Dart" + ); + result.skipTargets.addAll(all); + +// result.skipTargets.add("Node"); // doesn't terminate +// result.skipTargets.add("PHP"); // "Allowed memory size of 134217728 bytes exhausted (tried to allocate 16384 bytes)..." 
+// result.skipTargets.add("Go"); // syntax error + return result; + } } diff --git a/runtime-testsuite/test/org/antlr/v4/test/runtime/java/TestInterpreterDataReader.java b/runtime-testsuite/test/org/antlr/v4/test/runtime/java/TestInterpreterDataReader.java index 65af466a0a..d11f6904a3 100644 --- a/runtime-testsuite/test/org/antlr/v4/test/runtime/java/TestInterpreterDataReader.java +++ b/runtime-testsuite/test/org/antlr/v4/test/runtime/java/TestInterpreterDataReader.java @@ -6,20 +6,18 @@ import org.antlr.v4.runtime.atn.ATN; import org.antlr.v4.runtime.atn.ATNDeserializer; import org.antlr.v4.runtime.atn.ATNSerializer; +import org.antlr.v4.runtime.misc.IntegerList; import org.antlr.v4.runtime.misc.InterpreterDataReader; -import org.antlr.v4.runtime.misc.Utils; import org.antlr.v4.tool.Grammar; import org.junit.Assert; import org.junit.Test; import java.io.IOException; import java.lang.reflect.Field; -import java.net.URL; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; -import java.util.Collection; import java.util.List; /** This file represents a simple sanity checks on the parsing of the .interp file @@ -76,8 +74,8 @@ public void testParseFile() throws IOException, NoSuchFieldException, IllegalAcc Assert.assertNull(channels); Assert.assertNull(modes); - char[] atnChars = ATNSerializer.getSerializedAsChars(atn, g.getLanguage()); - Assert.assertEquals(ATNDeserializer.SERIALIZED_VERSION, atnChars[0]); + IntegerList serialized = ATNSerializer.getSerialized(atn); + Assert.assertEquals(ATNDeserializer.SERIALIZED_VERSION, serialized.get(0)); } private List castList(Object obj, Class clazz) { diff --git a/runtime-testsuite/test/org/antlr/v4/test/runtime/java/api/perf/TimeLexerSpeed.java b/runtime-testsuite/test/org/antlr/v4/test/runtime/java/api/perf/TimeLexerSpeed.java index e129fb145a..e653f8b59f 100644 --- a/runtime-testsuite/test/org/antlr/v4/test/runtime/java/api/perf/TimeLexerSpeed.java +++ 
b/runtime-testsuite/test/org/antlr/v4/test/runtime/java/api/perf/TimeLexerSpeed.java @@ -55,7 +55,7 @@ * * Sample output on OS X with 4 GHz Intel Core i7 (us == microseconds, 1/1000 of a millisecond): * - Java VM args: -Xms2G -Xmx2G + Java VM args: -Xms2G -Xmx8g Warming up Java compiler.... load_legacy_java_ascii_file average time 53us size 58384b over 3500 loads of 29038 symbols from Parser.java load_legacy_java_ascii_file average time 27us size 15568b over 3500 loads of 7625 symbols from RuleContext.java diff --git a/runtime-testsuite/test/org/antlr/v4/test/runtime/php/BasePHPTest.java b/runtime-testsuite/test/org/antlr/v4/test/runtime/php/BasePHPTest.java index d906ff0cd0..ec6d4fda8e 100644 --- a/runtime-testsuite/test/org/antlr/v4/test/runtime/php/BasePHPTest.java +++ b/runtime-testsuite/test/org/antlr/v4/test/runtime/php/BasePHPTest.java @@ -244,7 +244,7 @@ private String locateTool(String tool) { return phpPath; } - String[] roots = {"/usr/local/bin/", "/opt/local/bin", "/usr/bin/"}; + String[] roots = {"/usr/local/bin/", "/opt/local/bin", "/opt/homebrew/bin/", "/usr/bin/"}; for (String root: roots) { if (new File(root + tool).exists()) { diff --git a/runtime/CSharp/src/Atn/ATNDeserializer.cs b/runtime/CSharp/src/Atn/ATNDeserializer.cs index 9cefab68de..7f47677062 100644 --- a/runtime/CSharp/src/Atn/ATNDeserializer.cs +++ b/runtime/CSharp/src/Atn/ATNDeserializer.cs @@ -45,8 +45,7 @@ public virtual ATN Deserialize(int[] data) ReadRules (atn); ReadModes (atn); IList sets = new List(); - ReadSets (atn, sets, ReadInt); - ReadSets (atn, sets, ReadInt32); + ReadSets (atn, sets); ReadEdges (atn, sets); ReadDecisions (atn); ReadLexerActions (atn); @@ -190,15 +189,7 @@ protected internal virtual void ReadLexerActions(ATN atn) { LexerActionType actionType = (LexerActionType)ReadInt(); int data1 = ReadInt(); - if (data1 == unchecked((int)(0xFFFF))) - { - data1 = -1; - } int data2 = ReadInt(); - if (data2 == unchecked((int)(0xFFFF))) - { - data2 = -1; - } ILexerAction 
lexerAction = LexerActionFactory(actionType, data1, data2); atn.lexerActions[i_10] = lexerAction; } @@ -309,7 +300,7 @@ protected internal virtual void ReadEdges(ATN atn, IList sets) } } - protected internal virtual void ReadSets(ATN atn, IList sets, System.Func readUnicode) + protected internal virtual void ReadSets(ATN atn, IList sets) { // // SETS @@ -327,7 +318,7 @@ protected internal virtual void ReadSets(ATN atn, IList sets, Syste } for (int j = 0; j < nintervals; j++) { - set.Add(readUnicode(), readUnicode()); + set.Add(ReadInt(), ReadInt()); } } } @@ -369,9 +360,6 @@ protected internal virtual void ReadRules(ATN atn) atn.ruleToStartState[i_5] = startState; if (atn.grammarType == ATNType.Lexer) { int tokenType = ReadInt (); - if (tokenType == unchecked((int)(0xFFFF))) { - tokenType = TokenConstants.EOF; - } atn.ruleToTokenType [i_5] = tokenType; } } @@ -967,11 +955,6 @@ protected internal int ReadInt() return data[p++]; } - protected internal int ReadInt32() - { - return (int)data[p++] | ((int)data[p++] << 16); - } - [return: NotNull] protected internal virtual Transition EdgeFactory(ATN atn, TransitionType type, int src, int trg, int arg1, int arg2, int arg3, IList sets) { diff --git a/runtime/Cpp/runtime/antlr4cpp-vs2013.vcxproj b/runtime/Cpp/runtime/antlr4cpp-vs2013.vcxproj index 15e3a3c975..83f76113ef 100644 --- a/runtime/Cpp/runtime/antlr4cpp-vs2013.vcxproj +++ b/runtime/Cpp/runtime/antlr4cpp-vs2013.vcxproj @@ -493,8 +493,6 @@ - - diff --git a/runtime/Cpp/runtime/antlr4cpp-vs2013.vcxproj.filters b/runtime/Cpp/runtime/antlr4cpp-vs2013.vcxproj.filters index 499a82ed4d..0105b80e74 100644 --- a/runtime/Cpp/runtime/antlr4cpp-vs2013.vcxproj.filters +++ b/runtime/Cpp/runtime/antlr4cpp-vs2013.vcxproj.filters @@ -285,9 +285,6 @@ Header Files\atn - - Header Files\atn - Header Files\atn diff --git a/runtime/Cpp/runtime/antlr4cpp-vs2015.vcxproj b/runtime/Cpp/runtime/antlr4cpp-vs2015.vcxproj index a90095d30e..8fb5cf9806 100644 --- 
a/runtime/Cpp/runtime/antlr4cpp-vs2015.vcxproj +++ b/runtime/Cpp/runtime/antlr4cpp-vs2015.vcxproj @@ -507,7 +507,6 @@ - diff --git a/runtime/Cpp/runtime/antlr4cpp-vs2015.vcxproj.filters b/runtime/Cpp/runtime/antlr4cpp-vs2015.vcxproj.filters index cc1986923d..8573ee8373 100644 --- a/runtime/Cpp/runtime/antlr4cpp-vs2015.vcxproj.filters +++ b/runtime/Cpp/runtime/antlr4cpp-vs2015.vcxproj.filters @@ -285,9 +285,6 @@ Header Files\atn - - Header Files\atn - Header Files\atn diff --git a/runtime/Cpp/runtime/antlr4cpp-vs2017.vcxproj b/runtime/Cpp/runtime/antlr4cpp-vs2017.vcxproj index eb96aa8dfb..8ad1d01b6f 100644 --- a/runtime/Cpp/runtime/antlr4cpp-vs2017.vcxproj +++ b/runtime/Cpp/runtime/antlr4cpp-vs2017.vcxproj @@ -507,7 +507,6 @@ - diff --git a/runtime/Cpp/runtime/antlr4cpp-vs2017.vcxproj.filters b/runtime/Cpp/runtime/antlr4cpp-vs2017.vcxproj.filters index cc1986923d..8573ee8373 100644 --- a/runtime/Cpp/runtime/antlr4cpp-vs2017.vcxproj.filters +++ b/runtime/Cpp/runtime/antlr4cpp-vs2017.vcxproj.filters @@ -285,9 +285,6 @@ Header Files\atn - - Header Files\atn - Header Files\atn diff --git a/runtime/Cpp/runtime/antlr4cpp-vs2019.vcxproj b/runtime/Cpp/runtime/antlr4cpp-vs2019.vcxproj index d07ad0b0e0..d5df910b8a 100644 --- a/runtime/Cpp/runtime/antlr4cpp-vs2019.vcxproj +++ b/runtime/Cpp/runtime/antlr4cpp-vs2019.vcxproj @@ -515,7 +515,6 @@ - diff --git a/runtime/Cpp/runtime/antlr4cpp-vs2019.vcxproj.filters b/runtime/Cpp/runtime/antlr4cpp-vs2019.vcxproj.filters index cc1986923d..8573ee8373 100644 --- a/runtime/Cpp/runtime/antlr4cpp-vs2019.vcxproj.filters +++ b/runtime/Cpp/runtime/antlr4cpp-vs2019.vcxproj.filters @@ -285,9 +285,6 @@ Header Files\atn - - Header Files\atn - Header Files\atn diff --git a/runtime/Cpp/runtime/src/Parser.cpp b/runtime/Cpp/runtime/src/Parser.cpp index b07ff4efa9..bcd36b44fb 100755 --- a/runtime/Cpp/runtime/src/Parser.cpp +++ b/runtime/Cpp/runtime/src/Parser.cpp @@ -38,7 +38,7 @@ struct BypassAltsAtnCache final { /// bypass alternatives. 
/// /// - std::map, std::unique_ptr> map; + std::map, std::unique_ptr> map; }; BypassAltsAtnCache* getBypassAltsAtnCache() { @@ -229,7 +229,7 @@ TokenFactory* Parser::getTokenFactory() { const atn::ATN& Parser::getATNWithBypassAlts() { - const std::vector &serializedAtn = getSerializedATN(); + const std::vector &serializedAtn = getSerializedATN(); if (serializedAtn.empty()) { throw UnsupportedOperationException("The current parser does not support an ATN with bypass alternatives."); } diff --git a/runtime/Cpp/runtime/src/Recognizer.h b/runtime/Cpp/runtime/src/Recognizer.h index 932e726675..28abfc8741 100755 --- a/runtime/Cpp/runtime/src/Recognizer.h +++ b/runtime/Cpp/runtime/src/Recognizer.h @@ -53,7 +53,7 @@ namespace antlr4 { /// For interpreters, we don't know their serialized ATN despite having /// created the interpreter from it. /// - virtual const std::vector& getSerializedATN() const { + virtual const std::vector& getSerializedATN() const { throw "there is no serialized ATN"; } diff --git a/runtime/Cpp/runtime/src/antlr4-runtime.h b/runtime/Cpp/runtime/src/antlr4-runtime.h index 70de934f50..193ea68964 100644 --- a/runtime/Cpp/runtime/src/antlr4-runtime.h +++ b/runtime/Cpp/runtime/src/antlr4-runtime.h @@ -57,7 +57,6 @@ #include "atn/ATNConfigSet.h" #include "atn/ATNDeserializationOptions.h" #include "atn/ATNDeserializer.h" -#include "atn/ATNSerializer.h" #include "atn/ATNSimulator.h" #include "atn/ATNState.h" #include "atn/ATNType.h" diff --git a/runtime/Cpp/runtime/src/atn/ATNDeserializer.cpp b/runtime/Cpp/runtime/src/atn/ATNDeserializer.cpp index 285167247b..3b5563c975 100755 --- a/runtime/Cpp/runtime/src/atn/ATNDeserializer.cpp +++ b/runtime/Cpp/runtime/src/atn/ATNDeserializer.cpp @@ -221,28 +221,14 @@ namespace { return s; } - uint32_t deserializeInt32(const std::vector& data, size_t offset) { - return static_cast(data[offset]) | (static_cast(data[offset + 1]) << 16); - } - - ssize_t readUnicodeInt(const std::vector& data, int& p) { + ssize_t 
readUnicodeInt32(const std::vector& data, int& p) { return static_cast(data[p++]); } - ssize_t readUnicodeInt32(const std::vector& data, int& p) { - auto result = deserializeInt32(data, p); - p += 2; - return static_cast(result); - } - - // We templatize this on the function type so the optimizer can inline - // the 16- or 32-bit readUnicodeInt/readUnicodeInt32 as needed. - template void deserializeSets( - const std::vector& data, + const std::vector& data, int& p, - std::vector& sets, - F readUnicode) { + std::vector& sets) { size_t nsets = data[p++]; sets.reserve(sets.size() + nsets); for (size_t i = 0; i < nsets; i++) { @@ -255,8 +241,8 @@ namespace { } for (size_t j = 0; j < nintervals; j++) { - auto a = readUnicode(data, p); - auto b = readUnicode(data, p); + auto a = readUnicodeInt32(data, p); + auto b = readUnicodeInt32(data, p); set.add(a, b); } sets.push_back(set); @@ -269,7 +255,7 @@ ATNDeserializer::ATNDeserializer() : ATNDeserializer(ATNDeserializationOptions:: ATNDeserializer::ATNDeserializer(ATNDeserializationOptions deserializationOptions) : _deserializationOptions(std::move(deserializationOptions)) {} -std::unique_ptr ATNDeserializer::deserialize(const std::vector& data) const { +std::unique_ptr ATNDeserializer::deserialize(const std::vector& data) const { int p = 0; int version = data[p++]; if (version != SERIALIZED_VERSION) { @@ -301,10 +287,6 @@ std::unique_ptr ATNDeserializer::deserialize(const std::vector& d } size_t ruleIndex = data[p++]; - if (ruleIndex == 0xFFFF) { - ruleIndex = INVALID_INDEX; - } - ATNState *s = stateFactory(stype, ruleIndex); if (stype == ATNStateType::LOOP_END) { // special case int loopBackStateNumber = data[p++]; @@ -352,10 +334,6 @@ std::unique_ptr ATNDeserializer::deserialize(const std::vector& d atn->ruleToStartState.push_back(startState); if (atn->grammarType == ATNType::LEXER) { size_t tokenType = data[p++]; - if (tokenType == 0xFFFF) { - tokenType = Token::EOF; - } - atn->ruleToTokenType.push_back(tokenType); } } 
@@ -387,12 +365,7 @@ std::unique_ptr ATNDeserializer::deserialize(const std::vector& d { std::vector sets; - // First, deserialize sets with 16-bit arguments <= U+FFFF. - deserializeSets(data, p, sets, readUnicodeInt); - - // Next, deserialize sets with 32-bit arguments <= U+10FFFF. - deserializeSets(data, p, sets, readUnicodeInt32); - + deserializeSets(data, p, sets); sets.shrink_to_fit(); // @@ -492,15 +465,7 @@ std::unique_ptr ATNDeserializer::deserialize(const std::vector& d for (size_t i = 0; i < atn->lexerActions.size(); i++) { LexerActionType actionType = static_cast(data[p++]); int data1 = data[p++]; - if (data1 == 0xFFFF) { - data1 = -1; - } - int data2 = data[p++]; - if (data2 == 0xFFFF) { - data2 = -1; - } - atn->lexerActions[i] = lexerActionFactory(actionType, data1, data2); } } diff --git a/runtime/Cpp/runtime/src/atn/ATNDeserializer.h b/runtime/Cpp/runtime/src/atn/ATNDeserializer.h index 35276ed782..2442d4b7bd 100755 --- a/runtime/Cpp/runtime/src/atn/ATNDeserializer.h +++ b/runtime/Cpp/runtime/src/atn/ATNDeserializer.h @@ -20,7 +20,7 @@ namespace atn { explicit ATNDeserializer(ATNDeserializationOptions deserializationOptions); - std::unique_ptr deserialize(const std::vector &input) const; + std::unique_ptr deserialize(const std::vector &input) const; void verifyATN(const ATN &atn) const; private: diff --git a/runtime/Cpp/runtime/src/atn/ATNSerializer.cpp b/runtime/Cpp/runtime/src/atn/ATNSerializer.cpp deleted file mode 100755 index dc62787b9c..0000000000 --- a/runtime/Cpp/runtime/src/atn/ATNSerializer.cpp +++ /dev/null @@ -1,589 +0,0 @@ -/* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved. - * Use of this file is governed by the BSD 3-clause license that - * can be found in the LICENSE.txt file in the project root. 
- */ - -#include "misc/IntervalSet.h" -#include "atn/ATNType.h" -#include "atn/ATNState.h" -#include "atn/BlockEndState.h" - -#include "atn/DecisionState.h" -#include "atn/RuleStartState.h" -#include "atn/LoopEndState.h" -#include "atn/BlockStartState.h" -#include "atn/Transition.h" -#include "atn/SetTransition.h" -#include "Token.h" -#include "misc/Interval.h" -#include "atn/ATN.h" - -#include "atn/RuleTransition.h" -#include "atn/PrecedencePredicateTransition.h" -#include "atn/PredicateTransition.h" -#include "atn/RangeTransition.h" -#include "atn/AtomTransition.h" -#include "atn/ActionTransition.h" -#include "atn/TransitionType.h" -#include "atn/ATNDeserializer.h" - -#include "atn/TokensStartState.h" -#include "Exceptions.h" -#include "support/CPPUtils.h" - -#include "atn/LexerChannelAction.h" -#include "atn/LexerCustomAction.h" -#include "atn/LexerModeAction.h" -#include "atn/LexerPushModeAction.h" -#include "atn/LexerTypeAction.h" - -#include "Exceptions.h" - -#include "atn/ATNSerializer.h" - -using namespace antlrcpp; -using namespace antlr4::atn; - -ATNSerializer::ATNSerializer(ATN *atn) { this->atn = atn; } - -ATNSerializer::ATNSerializer(ATN *atn, const std::vector &tokenNames) { - this->atn = atn; - _tokenNames = tokenNames; -} - -ATNSerializer::~ATNSerializer() { } - -std::vector ATNSerializer::serialize() { - std::vector data; - data.push_back(ATNDeserializer::SERIALIZED_VERSION); - - // convert grammar type to ATN const to avoid dependence on ANTLRParser - data.push_back(static_cast(atn->grammarType)); - data.push_back(atn->maxTokenType); - size_t nedges = 0; - - std::unordered_map setIndices; - std::vector sets; - - // dump states, count edges and collect sets while doing so - std::vector nonGreedyStates; - std::vector precedenceStates; - data.push_back(atn->states.size()); - for (ATNState *s : atn->states) { - if (s == nullptr) { // might be optimized away - data.push_back(0); - continue; - } - - size_t stateType = static_cast(s->getStateType()); - 
if (is(s) && (static_cast(s))->nonGreedy) { - nonGreedyStates.push_back(s->stateNumber); - } - - if (is(s) && (static_cast(s))->isLeftRecursiveRule) { - precedenceStates.push_back(s->stateNumber); - } - - data.push_back(stateType); - - if (s->ruleIndex == INVALID_INDEX) { - data.push_back(0xFFFF); - } - else { - data.push_back(s->ruleIndex); - } - - if (s->getStateType() == ATNStateType::LOOP_END) { - data.push_back((static_cast(s))->loopBackState->stateNumber); - } - else if (is(s)) { - data.push_back((static_cast(s))->endState->stateNumber); - } - - if (s->getStateType() != ATNStateType::RULE_STOP) { - // the deserializer can trivially derive these edges, so there's no need - // to serialize them - nedges += s->transitions.size(); - } - - for (size_t i = 0; i < s->transitions.size(); i++) { - const Transition *t = s->transitions[i].get(); - TransitionType edgeType = t->getTransitionType(); - if (edgeType == TransitionType::SET || edgeType == TransitionType::NOT_SET) { - const SetTransition *st = static_cast(t); - if (setIndices.find(st->set) == setIndices.end()) { - sets.push_back(st->set); - setIndices.insert({ st->set, (int)sets.size() - 1 }); - } - } - } - } - - // non-greedy states - data.push_back(nonGreedyStates.size()); - for (size_t i = 0; i < nonGreedyStates.size(); i++) { - data.push_back(nonGreedyStates.at(i)); - } - - // precedence states - data.push_back(precedenceStates.size()); - for (size_t i = 0; i < precedenceStates.size(); i++) { - data.push_back(precedenceStates.at(i)); - } - - size_t nrules = atn->ruleToStartState.size(); - data.push_back(nrules); - for (size_t r = 0; r < nrules; r++) { - ATNState *ruleStartState = atn->ruleToStartState[r]; - data.push_back(ruleStartState->stateNumber); - if (atn->grammarType == ATNType::LEXER) { - if (atn->ruleToTokenType[r] == Token::EOF) { - data.push_back(0xFFFF); - } - else { - data.push_back(atn->ruleToTokenType[r]); - } - } - } - - size_t nmodes = atn->modeToStartState.size(); - data.push_back(nmodes); 
- if (nmodes > 0) { - for (const auto &modeStartState : atn->modeToStartState) { - data.push_back(modeStartState->stateNumber); - } - } - - size_t nsets = sets.size(); - data.push_back(nsets); - for (const auto &set : sets) { - bool containsEof = set.contains(Token::EOF); - if (containsEof && set.getIntervals().at(0).b == -1) { - data.push_back(set.getIntervals().size() - 1); - } - else { - data.push_back(set.getIntervals().size()); - } - - data.push_back(containsEof ? 1 : 0); - for (const auto &interval : set.getIntervals()) { - if (interval.a == -1) { - if (interval.b == -1) { - continue; - } else { - data.push_back(0); - } - } - else { - data.push_back(interval.a); - } - - data.push_back(interval.b); - } - } - - data.push_back(nedges); - for (ATNState *s : atn->states) { - if (s == nullptr) { - // might be optimized away - continue; - } - - if (s->getStateType() == ATNStateType::RULE_STOP) { - continue; - } - - for (size_t i = 0; i < s->transitions.size(); i++) { - const Transition *t = s->transitions[i].get(); - - if (atn->states[t->target->stateNumber] == nullptr) { - throw IllegalStateException("Cannot serialize a transition to a removed state."); - } - - size_t src = s->stateNumber; - size_t trg = t->target->stateNumber; - TransitionType edgeType = t->getTransitionType(); - size_t arg1 = 0; - size_t arg2 = 0; - size_t arg3 = 0; - switch (edgeType) { - case TransitionType::RULE: - trg = (static_cast(t))->followState->stateNumber; - arg1 = (static_cast(t))->target->stateNumber; - arg2 = (static_cast(t))->ruleIndex; - arg3 = (static_cast(t))->precedence; - break; - case TransitionType::PRECEDENCE: - { - const PrecedencePredicateTransition *ppt = - static_cast(t); - arg1 = ppt->precedence; - } - break; - case TransitionType::PREDICATE: - { - const PredicateTransition *pt = static_cast(t); - arg1 = pt->ruleIndex; - arg2 = pt->predIndex; - arg3 = pt->isCtxDependent ? 
1 : 0; - } - break; - case TransitionType::RANGE: - arg1 = (static_cast(t))->from; - arg2 = (static_cast(t))->to; - if (arg1 == Token::EOF) { - arg1 = 0; - arg3 = 1; - } - - break; - case TransitionType::ATOM: - arg1 = (static_cast(t))->_label; - if (arg1 == Token::EOF) { - arg1 = 0; - arg3 = 1; - } - - break; - case TransitionType::ACTION: - { - const ActionTransition *at = static_cast(t); - arg1 = at->ruleIndex; - arg2 = at->actionIndex; - if (arg2 == INVALID_INDEX) { - arg2 = 0xFFFF; - } - - arg3 = at->isCtxDependent ? 1 : 0; - } - break; - case TransitionType::SET: - arg1 = setIndices[(static_cast(t))->set]; - break; - - case TransitionType::NOT_SET: - arg1 = setIndices[(static_cast(t))->set]; - break; - - default: - break; - } - - data.push_back(src); - data.push_back(trg); - data.push_back(static_cast(edgeType)); - data.push_back(arg1); - data.push_back(arg2); - data.push_back(arg3); - } - } - - size_t ndecisions = atn->decisionToState.size(); - data.push_back(ndecisions); - for (DecisionState *decStartState : atn->decisionToState) { - data.push_back(decStartState->stateNumber); - } - - // LEXER ACTIONS - if (atn->grammarType == ATNType::LEXER) { - data.push_back(atn->lexerActions.size()); - for (const auto &action : atn->lexerActions) { - data.push_back(static_cast(action->getActionType())); - switch (action->getActionType()) { - case LexerActionType::CHANNEL: - { - int channel = std::dynamic_pointer_cast(action)->getChannel(); - data.push_back(channel != -1 ? channel : 0xFFFF); - data.push_back(0); - break; - } - - case LexerActionType::CUSTOM: - { - size_t ruleIndex = std::dynamic_pointer_cast(action)->getRuleIndex(); - size_t actionIndex = std::dynamic_pointer_cast(action)->getActionIndex(); - data.push_back(ruleIndex != INVALID_INDEX ? ruleIndex : 0xFFFF); - data.push_back(actionIndex != INVALID_INDEX ? 
actionIndex : 0xFFFF); - break; - } - - case LexerActionType::MODE: - { - int mode = std::dynamic_pointer_cast(action)->getMode(); - data.push_back(mode != -1 ? mode : 0xFFFF); - data.push_back(0); - break; - } - - case LexerActionType::MORE: - data.push_back(0); - data.push_back(0); - break; - - case LexerActionType::POP_MODE: - data.push_back(0); - data.push_back(0); - break; - - case LexerActionType::PUSH_MODE: - { - int mode = std::dynamic_pointer_cast(action)->getMode(); - data.push_back(mode != -1 ? mode : 0xFFFF); - data.push_back(0); - break; - } - - case LexerActionType::SKIP: - data.push_back(0); - data.push_back(0); - break; - - case LexerActionType::TYPE: - { - int type = std::dynamic_pointer_cast(action)->getType(); - data.push_back(type != -1 ? type : 0xFFFF); - data.push_back(0); - break; - } - - default: - throw IllegalArgumentException("The specified lexer action type " + - std::to_string(static_cast(action->getActionType())) + - " is not valid."); - } - } - } - - for (size_t i = 0; i < data.size(); i++) { - if (data.at(i) > 0xFFFF) { - throw UnsupportedOperationException("Serialized ATN data element out of range."); - } - } - - return data; -} - -//------------------------------------------------------------------------------------------------------------ - -std::string ATNSerializer::decode(const std::wstring &inpdata) { - if (inpdata.size() < 10) - throw IllegalArgumentException("Not enough data to decode"); - - std::vector data(inpdata.size()); - - for (size_t i = 0; i < inpdata.size(); ++i) { - data[i] = (uint16_t)inpdata[i]; - } - - std::string buf; - size_t p = 0; - size_t version = data[p++]; - if (version != ATNDeserializer::SERIALIZED_VERSION) { - std::string reason = "Could not deserialize ATN with version " + std::to_string(version) + "(expected " + - std::to_string(ATNDeserializer::SERIALIZED_VERSION) + ")."; - throw UnsupportedOperationException("ATN Serializer" + reason); - } - - p++; // skip grammarType - size_t maxType = data[p++]; 
- buf.append("max type ").append(std::to_string(maxType)).append("\n"); - size_t nstates = data[p++]; - for (size_t i = 0; i < nstates; i++) { - ATNStateType stype = static_cast(data[p++]); - if (stype == ATNStateType::INVALID) { // ignore bad type of states - continue; - } - size_t ruleIndex = data[p++]; - if (ruleIndex == 0xFFFF) { - ruleIndex = INVALID_INDEX; - } - - std::string arg = ""; - if (stype == ATNStateType::LOOP_END) { - int loopBackStateNumber = data[p++]; - arg = std::string(" ") + std::to_string(loopBackStateNumber); - } - else if (stype == ATNStateType::PLUS_BLOCK_START || - stype == ATNStateType::STAR_BLOCK_START || - stype == ATNStateType::BLOCK_START) { - int endStateNumber = data[p++]; - arg = std::string(" ") + std::to_string(endStateNumber); - } - buf.append(std::to_string(i)) - .append(":") - .append(atnStateTypeName(stype)) - .append(" ") - .append(std::to_string(ruleIndex)) - .append(arg) - .append("\n"); - } - size_t numNonGreedyStates = data[p++]; - p += numNonGreedyStates; // Instead of that useless loop below. 
- /* - for (int i = 0; i < numNonGreedyStates; i++) { - int stateNumber = data[p++]; - } - */ - - size_t numPrecedenceStates = data[p++]; - p += numPrecedenceStates; - /* - for (int i = 0; i < numPrecedenceStates; i++) { - int stateNumber = data[p++]; - } - */ - - size_t nrules = data[p++]; - for (size_t i = 0; i < nrules; i++) { - size_t s = data[p++]; - if (atn->grammarType == ATNType::LEXER) { - size_t arg1 = data[p++]; - buf.append("rule ") - .append(std::to_string(i)) - .append(":") - .append(std::to_string(s)) - .append(" ") - .append(std::to_string(arg1)) - .append("\n"); - } - else { - buf.append("rule ") - .append(std::to_string(i)) - .append(":") - .append(std::to_string(s)) - .append("\n"); - } - } - size_t nmodes = data[p++]; - for (size_t i = 0; i < nmodes; i++) { - size_t s = data[p++]; - buf.append("mode ") - .append(std::to_string(i)) - .append(":") - .append(std::to_string(s)) - .append("\n"); - } - size_t nsets = data[p++]; - for (size_t i = 0; i < nsets; i++) { - size_t nintervals = data[p++]; - buf.append(std::to_string(i)).append(":"); - bool containsEof = data[p++] != 0; - if (containsEof) { - buf.append(getTokenName(Token::EOF)); - } - - for (size_t j = 0; j < nintervals; j++) { - if (containsEof || j > 0) { - buf.append(", "); - } - - buf.append(getTokenName(data[p])) - .append("..") - .append(getTokenName(data[p + 1])); - p += 2; - } - buf.append("\n"); - } - size_t nedges = data[p++]; - for (size_t i = 0; i < nedges; i++) { - size_t src = data[p]; - size_t trg = data[p + 1]; - TransitionType ttype = static_cast(data[p + 2]); - size_t arg1 = data[p + 3]; - size_t arg2 = data[p + 4]; - size_t arg3 = data[p + 5]; - buf.append(std::to_string(src)) - .append("->") - .append(std::to_string(trg)) - .append(" ") - .append(transitionTypeName(ttype)) - .append(" ") - .append(std::to_string(arg1)) - .append(",") - .append(std::to_string(arg2)) - .append(",") - .append(std::to_string(arg3)) - .append("\n"); - p += 6; - } - size_t ndecisions = 
data[p++]; - for (size_t i = 0; i < ndecisions; i++) { - size_t s = data[p++]; - buf += std::to_string(i) + ":" + std::to_string(s) + "\n"; - } - - if (atn->grammarType == ATNType::LEXER) { - //int lexerActionCount = data[p++]; - - //p += lexerActionCount * 3; // Instead of useless loop below. - /* - for (int i = 0; i < lexerActionCount; i++) { - LexerActionType actionType = (LexerActionType)data[p++]; - int data1 = data[p++]; - int data2 = data[p++]; - } - */ - } - - return buf; -} - -std::string ATNSerializer::getTokenName(size_t t) { - if (t == Token::EOF) { - return "EOF"; - } - - if (atn->grammarType == ATNType::LEXER && t <= 0x10FFFF) { - switch (t) { - case '\n': - return "'\\n'"; - case '\r': - return "'\\r'"; - case '\t': - return "'\\t'"; - case '\b': - return "'\\b'"; - case '\f': - return "'\\f'"; - case '\\': - return "'\\\\'"; - case '\'': - return "'\\''"; - default: - std::string s_hex = antlrcpp::toHexString((int)t); - if (s_hex >= "0" && s_hex <= "7F" && !iscntrl((int)t)) { - return "'" + std::to_string(t) + "'"; - } - - // turn on the bit above max "\u10FFFF" value so that we pad with zeros - // then only take last 6 digits - std::string hex = antlrcpp::toHexString((int)t | 0x1000000).substr(1, 6); - std::string unicodeStr = std::string("'\\u") + hex + std::string("'"); - return unicodeStr; - } - } - - if (_tokenNames.size() > 0 && t < _tokenNames.size()) { - return _tokenNames[t]; - } - - return std::to_string(t); -} - -std::wstring ATNSerializer::getSerializedAsString(ATN *atn) { - std::vector data = getSerialized(atn); - std::wstring result; - for (size_t entry : data) - result.push_back((wchar_t)entry); - - return result; -} - -std::vector ATNSerializer::getSerialized(ATN *atn) { - return ATNSerializer(atn).serialize(); -} - -std::string ATNSerializer::getDecoded(ATN *atn, std::vector &tokenNames) { - std::wstring serialized = getSerializedAsString(atn); - return ATNSerializer(atn, tokenNames).decode(serialized); -} diff --git 
a/runtime/Cpp/runtime/src/atn/ATNSerializer.h b/runtime/Cpp/runtime/src/atn/ATNSerializer.h deleted file mode 100755 index 8b77894fa6..0000000000 --- a/runtime/Cpp/runtime/src/atn/ATNSerializer.h +++ /dev/null @@ -1,61 +0,0 @@ -/* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved. - * Use of this file is governed by the BSD 3-clause license that - * can be found in the LICENSE.txt file in the project root. - */ - -#pragma once - -#include "antlr4-common.h" - -namespace antlr4 { -namespace atn { - - class ANTLR4CPP_PUBLIC ATNSerializer { - public: - ATN *atn; - - ATNSerializer(ATN *atn); - ATNSerializer(ATN *atn, const std::vector &tokenNames); - virtual ~ATNSerializer(); - - /// - /// Serialize state descriptors, edge descriptors, and decision->state map - /// into list of ints: - /// - /// grammar-type, (ANTLRParser.LEXER, ...) - /// max token type, - /// num states, - /// state-0-type ruleIndex, state-1-type ruleIndex, ... state-i-type - /// ruleIndex optional-arg ... - /// num rules, - /// rule-1-start-state rule-1-args, rule-2-start-state rule-2-args, ... - /// (args are token type,actionIndex in lexer else 0,0) - /// num modes, - /// mode-0-start-state, mode-1-start-state, ... (parser has 0 modes) - /// num sets - /// set-0-interval-count intervals, set-1-interval-count intervals, ... - /// num total edges, - /// src, trg, edge-type, edge arg1, optional edge arg2 (present always), - /// ... - /// num decisions, - /// decision-0-start-state, decision-1-start-state, ... - /// - /// Convenient to pack into unsigned shorts to make as Java string. - /// - virtual std::vector serialize(); - - virtual std::string decode(const std::wstring& data); - virtual std::string getTokenName(size_t t); - - /// Used by Java target to encode short/int array as chars in string. 
- static std::wstring getSerializedAsString(ATN *atn); - static std::vector getSerialized(ATN *atn); - - static std::string getDecoded(ATN *atn, std::vector &tokenNames); - - private: - std::vector _tokenNames; - }; - -} // namespace atn -} // namespace antlr4 diff --git a/runtime/Cpp/runtime/src/misc/InterpreterDataReader.cpp b/runtime/Cpp/runtime/src/misc/InterpreterDataReader.cpp index c77b8bca2b..0bcaf07b54 100755 --- a/runtime/Cpp/runtime/src/misc/InterpreterDataReader.cpp +++ b/runtime/Cpp/runtime/src/misc/InterpreterDataReader.cpp @@ -101,7 +101,7 @@ InterpreterData InterpreterDataReader::parseFile(std::string const& fileName) { }; } - std::vector serializedATN; + std::vector serializedATN; std::getline(input, line, '\n'); assert(line == "atn:"); @@ -115,7 +115,7 @@ InterpreterData InterpreterDataReader::parseFile(std::string const& fileName) { number = std::strtoul(&value[1], nullptr, 10); else number = std::strtoul(value.c_str(), nullptr, 10); - serializedATN.push_back(static_cast(number)); + serializedATN.push_back(static_cast(number)); } ATNDeserializer deserializer; diff --git a/runtime/Cpp/runtime/src/tree/xpath/XPathLexer.cpp b/runtime/Cpp/runtime/src/tree/xpath/XPathLexer.cpp index 5c2e8568a1..b648f6c085 100644 --- a/runtime/Cpp/runtime/src/tree/xpath/XPathLexer.cpp +++ b/runtime/Cpp/runtime/src/tree/xpath/XPathLexer.cpp @@ -33,7 +33,7 @@ struct XPathLexerStaticData final { const std::vector literalNames; const std::vector symbolicNames; const antlr4::dfa::Vocabulary vocabulary; - std::vector serializedATN; + std::vector serializedATN; std::unique_ptr atn; }; @@ -61,8 +61,8 @@ void xpathLexerInitialize() { "STRING" } ); - static const uint16_t serializedATNSegment0[] = { - 0x4, 0x0, 0x8, 0x32, 0x6, 0xffff, 0x2, 0x0, 0x7, 0x0, 0x2, 0x1, 0x7, + static const int32_t serializedATNSegment0[] = { + 0x4, 0x0, 0x8, 0x32, 0x6, -1, 0x2, 0x0, 0x7, 0x0, 0x2, 0x1, 0x7, 0x1, 0x2, 0x2, 0x7, 0x2, 0x2, 0x3, 0x7, 0x3, 0x2, 0x4, 0x7, 0x4, 0x2, 0x5, 0x7, 0x5, 0x2, 0x6, 
0x7, 0x6, 0x2, 0x7, 0x7, 0x7, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x1, 0x1, 0x1, 0x1, 0x2, 0x1, 0x2, @@ -75,7 +75,7 @@ void xpathLexerInitialize() { 0x0, 0x2, 0x5, 0x0, 0x30, 0x39, 0x5f, 0x5f, 0xb7, 0xb7, 0x300, 0x36f, 0x203f, 0x2040, 0xd, 0x0, 0x41, 0x5a, 0x61, 0x7a, 0xc0, 0xd6, 0xd8, 0xf6, 0xf8, 0x2ff, 0x370, 0x37d, 0x37f, 0x1fff, 0x200c, 0x200d, 0x2070, - 0x218f, 0x2c00, 0x2fef, 0x3001, 0xd7ff, 0xf900, 0xfdcf, 0xfdf0, 0xffff, + 0x218f, 0x2c00, 0x2fef, 0x3001, 0xd7ff, 0xf900, 0xfdcf, 0xfdf0, -1, 0x0, 0x32, 0x0, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x3, 0x1, 0x0, 0x0, 0x0, 0x0, 0x5, 0x1, 0x0, 0x0, 0x0, 0x0, 0x7, 0x1, 0x0, 0x0, 0x0, 0x0, 0x9, 0x1, 0x0, 0x0, 0x0, 0x0, 0xf, 0x1, 0x0, 0x0, 0x0, 0x1, @@ -151,7 +151,7 @@ const dfa::Vocabulary& XPathLexer::getVocabulary() const { return xpathLexerStaticData->vocabulary; } -const std::vector& XPathLexer::getSerializedATN() const { +const std::vector& XPathLexer::getSerializedATN() const { return xpathLexerStaticData->serializedATN; } diff --git a/runtime/Cpp/runtime/src/tree/xpath/XPathLexer.h b/runtime/Cpp/runtime/src/tree/xpath/XPathLexer.h index e35f15584d..bd6711077e 100644 --- a/runtime/Cpp/runtime/src/tree/xpath/XPathLexer.h +++ b/runtime/Cpp/runtime/src/tree/xpath/XPathLexer.h @@ -28,7 +28,7 @@ class XPathLexer : public antlr4::Lexer { virtual const antlr4::dfa::Vocabulary& getVocabulary() const override; - virtual const std::vector& getSerializedATN() const override; + virtual const std::vector& getSerializedATN() const override; virtual const antlr4::atn::ATN& getATN() const override; diff --git a/runtime/Dart/lib/src/atn/src/atn_deserializer.dart b/runtime/Dart/lib/src/atn/src/atn_deserializer.dart index a26abd1dd8..ee6928be10 100644 --- a/runtime/Dart/lib/src/atn/src/atn_deserializer.dart +++ b/runtime/Dart/lib/src/atn/src/atn_deserializer.dart @@ -86,10 +86,7 @@ class ATNDeserializer { readRules(atn); readModes(atn); final sets = []; - // First, deserialize sets with 16-bit arguments <= U+FFFF. 
- readSets(atn, sets, () => readInt()); - // Next, deserialize sets with 32-bit arguments <= U+10FFFF. - readSets(atn, sets, () => readInt32()); + readSets(atn, sets); readEdges(atn, sets); readDecisions(atn); readLexerActions(atn); @@ -130,9 +127,6 @@ class ATNDeserializer { } var ruleIndex = readInt(); - if (ruleIndex == 0xFFFF) { - ruleIndex = -1; - } final s = stateFactory(stype, ruleIndex); if (s is LoopEndState) { @@ -180,9 +174,6 @@ class ATNDeserializer { atn.ruleToStartState.add(startState); if (atn.grammarType == ATNType.LEXER) { var tokenType = readInt(); - if (tokenType == 0xFFFF) { - tokenType = Token.EOF; - } atn.ruleToTokenType.add(tokenType); } @@ -208,7 +199,7 @@ class ATNDeserializer { } } - void readSets(ATN atn, List sets, readUnicode) { + void readSets(ATN atn, List sets) { final nsets = readInt(); for (var i = 0; i < nsets; i++) { final nintervals = readInt(); @@ -221,8 +212,8 @@ class ATNDeserializer { } for (var j = 0; j < nintervals; j++) { - int a = readUnicode(); - int b = readUnicode(); + int a = readInt(); + int b = readInt(); set.addRange(a, b); } } @@ -321,14 +312,7 @@ class ATNDeserializer { atn.lexerActions = List.generate(readInt(), (index) { final actionType = LexerActionType.values[readInt()]; var data1 = readInt(); - if (data1 == 0xFFFF) { - data1 = -1; - } - var data2 = readInt(); - if (data2 == 0xFFFF) { - data2 = -1; - } final lexerAction = lexerActionFactory(actionType, data1, data2); return lexerAction; @@ -537,12 +521,6 @@ class ATNDeserializer { return data[pos++]; } - int readInt32() { - final low = readInt(); - final high = readInt(); - return low | (high << 16); - } - Transition edgeFactory( ATN atn, TransitionType type, diff --git a/runtime/Go/antlr/atn_deserializer.go b/runtime/Go/antlr/atn_deserializer.go index 1b042dcb10..aea9bbfa93 100644 --- a/runtime/Go/antlr/atn_deserializer.go +++ b/runtime/Go/antlr/atn_deserializer.go @@ -23,7 +23,7 @@ type blockStartStateIntPair struct { type ATNDeserializer struct { options 
*ATNDeserializationOptions - data []uint16 + data []int32 pos int } @@ -45,7 +45,7 @@ func stringInSlice(a string, list []string) int { return -1 } -func (a *ATNDeserializer) DeserializeFromUInt16(data []uint16) *ATN { +func (a *ATNDeserializer) Deserialize(data []int32) *ATN { a.data = data a.pos = 0 a.checkVersion() @@ -56,10 +56,7 @@ func (a *ATNDeserializer) DeserializeFromUInt16(data []uint16) *ATN { a.readRules(atn) a.readModes(atn) - // First, deserialize sets with 16-bit arguments <= U+FFFF. - sets := a.readSets(atn, nil, a.readInt) - // Next, deserialize sets with 32-bit arguments <= U+10FFFF. - sets = a.readSets(atn, sets, a.readInt32) + sets := a.readSets(atn, nil) a.readEdges(atn, sets) a.readDecisions(atn) @@ -113,10 +110,6 @@ func (a *ATNDeserializer) readStates(atn *ATN) { ruleIndex := a.readInt() - if ruleIndex == 0xFFFF { - ruleIndex = -1 - } - s := a.stateFactory(stype, ruleIndex) if stype == ATNStateLoopEnd { @@ -175,10 +168,6 @@ func (a *ATNDeserializer) readRules(atn *ATN) { if atn.grammarType == ATNTypeLexer { tokenType := a.readInt() - if tokenType == 0xFFFF { - tokenType = TokenEOF - } - atn.ruleToTokenType[i] = tokenType } } @@ -204,7 +193,7 @@ func (a *ATNDeserializer) readModes(atn *ATN) { } } -func (a *ATNDeserializer) readSets(atn *ATN, sets []*IntervalSet, readUnicode func() int) []*IntervalSet { +func (a *ATNDeserializer) readSets(atn *ATN, sets []*IntervalSet) []*IntervalSet { m := a.readInt() // Preallocate the needed capacity. 
@@ -227,8 +216,8 @@ func (a *ATNDeserializer) readSets(atn *ATN, sets []*IntervalSet, readUnicode fu } for j := 0; j < n; j++ { - i1 := readUnicode() - i2 := readUnicode() + i1 := a.readInt() + i2 := a.readInt() iset.addRange(i1, i2) } @@ -330,17 +319,7 @@ func (a *ATNDeserializer) readLexerActions(atn *ATN) { for i := range atn.lexerActions { actionType := a.readInt() data1 := a.readInt() - - if data1 == 0xFFFF { - data1 = -1 - } - data2 := a.readInt() - - if data2 == 0xFFFF { - data2 = -1 - } - atn.lexerActions[i] = a.lexerActionFactory(actionType, data1, data2) } } @@ -571,13 +550,7 @@ func (a *ATNDeserializer) readInt() int { a.pos++ - return int(v) -} - -func (a *ATNDeserializer) readInt32() int { - var low = a.readInt() - var high = a.readInt() - return low | (high << 16) + return int(v) // data is 32 bits but int is at least that big } func (a *ATNDeserializer) edgeFactory(atn *ATN, typeIndex, src, trg, arg1, arg2, arg3 int, sets []*IntervalSet) Transition { diff --git a/runtime/Go/antlr/testing_lexer_b_test.go b/runtime/Go/antlr/testing_lexer_b_test.go index b5691b574c..d07782b17f 100644 --- a/runtime/Go/antlr/testing_lexer_b_test.go +++ b/runtime/Go/antlr/testing_lexer_b_test.go @@ -20,7 +20,7 @@ MULT : '*'; WS : ' '+; */ -var lexerB_serializedLexerAtn = []uint16{ +var lexerB_serializedLexerAtn = []int32{ 4, 0, 7, 38, 6, 65535, 2, 0, 7, 0, 2, 1, 7, 1, 2, 2, 7, 2, 2, 3, 7, 3, 2, 4, 7, 4, 2, 5, 7, 5, 2, 6, 7, 6, 1, 0, 4, 0, 17, 8, 0, 11, 0, 12, 0, 18, 1, 1, 4, 1, 22, 8, 1, 11, 1, 12, 1, 23, 1, 2, 1, 2, 1, 3, 1, 3, 1, @@ -40,7 +40,7 @@ var lexerB_serializedLexerAtn = []uint16{ } var lexerB_lexerDeserializer = NewATNDeserializer(nil) -var lexerB_lexerAtn = lexerB_lexerDeserializer.DeserializeFromUInt16(lexerB_serializedLexerAtn) +var lexerB_lexerAtn = lexerB_lexerDeserializer.Deserialize(lexerB_serializedLexerAtn) var lexerB_lexerChannelNames = []string{ "DEFAULT_TOKEN_CHANNEL", "HIDDEN", diff --git a/runtime/Go/antlr/tokenstream_rewriter_test.go 
b/runtime/Go/antlr/tokenstream_rewriter_test.go index 852f50232c..b3647ed318 100644 --- a/runtime/Go/antlr/tokenstream_rewriter_test.go +++ b/runtime/Go/antlr/tokenstream_rewriter_test.go @@ -328,7 +328,7 @@ func TestLexerA(t *testing.T){ var _ = fmt.Printf var _ = unicode.IsLetter -var serializedLexerAtn = []uint16{ +var serializedLexerAtn = []int32{ 4, 0, 3, 13, 6, 65535, 2, 0, 7, 0, 2, 1, 7, 1, 2, 2, 7, 2, 1, 0, 1, 0, 1, 1, 1, 1, 1, 2, 1, 2, 0, 0, 3, 1, 1, 3, 2, 5, 3, 1, 0, 0, 0, 12, 0, 1, 1, 0, 0, 0, 0, 3, 1, 0, 0, 0, 0, 5, 1, 0, 0, 0, 1, 7, 1, 0, 0, 0, 3, 9, @@ -338,7 +338,7 @@ var serializedLexerAtn = []uint16{ } var lexerDeserializer = NewATNDeserializer(nil) -var lexerAtn = lexerDeserializer.DeserializeFromUInt16(serializedLexerAtn) +var lexerAtn = lexerDeserializer.Deserialize(serializedLexerAtn) var lexerChannelNames = []string{ "DEFAULT_TOKEN_CHANNEL", "HIDDEN", diff --git a/runtime/Java/src/org/antlr/v4/runtime/atn/ATNDeserializer.java b/runtime/Java/src/org/antlr/v4/runtime/atn/ATNDeserializer.java index 0bd643a131..0ec2a9359e 100644 --- a/runtime/Java/src/org/antlr/v4/runtime/atn/ATNDeserializer.java +++ b/runtime/Java/src/org/antlr/v4/runtime/atn/ATNDeserializer.java @@ -7,15 +7,20 @@ package org.antlr.v4.runtime.atn; import org.antlr.v4.runtime.Token; +import org.antlr.v4.runtime.misc.IntegerList; import org.antlr.v4.runtime.misc.IntervalSet; import org.antlr.v4.runtime.misc.Pair; import java.io.InvalidClassException; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.Locale; -/** +/** Deserialize ATNs for JavaTarget; it's complicated by the fact that java requires + * that we serialize the list of integers as 16 bit characters in a string. Other + * targets will have an array of ints generated and can simply decode the ints + * back into an ATN. 
* * @author Sam Harwell */ @@ -25,49 +30,6 @@ public class ATNDeserializer { SERIALIZED_VERSION = 4; } - interface UnicodeDeserializer { - // Wrapper for readInt() or readInt32() - int readUnicode(char[] data, int p); - - // Work around Java not allowing mutation of captured variables - // by returning amount by which to increment p after each read - int size(); - } - - enum UnicodeDeserializingMode { - UNICODE_BMP, - UNICODE_SMP - } - - static UnicodeDeserializer getUnicodeDeserializer(UnicodeDeserializingMode mode) { - if (mode == UnicodeDeserializingMode.UNICODE_BMP) { - return new UnicodeDeserializer() { - @Override - public int readUnicode(char[] data, int p) { - return toInt(data[p]); - } - - @Override - public int size() { - return 1; - } - }; - } - else { - return new UnicodeDeserializer() { - @Override - public int readUnicode(char[] data, int p) { - return toInt32(data, p); - } - - @Override - public int size() { - return 2; - } - }; - } - } - private final ATNDeserializationOptions deserializationOptions; public ATNDeserializer() { @@ -83,20 +45,19 @@ public ATNDeserializer(ATNDeserializationOptions deserializationOptions) { } public ATN deserialize(char[] data) { - data = data.clone(); - for (int i = 1; i < data.length; i++) { - data[i] = (char) (data[i] - 2); - } + return deserialize(decodeIntsEncodedAs16BitWords(data)); + } + public ATN deserialize(int[] data) { int p = 0; - int version = toInt(data[p++]); + int version = data[p++]; if (version != SERIALIZED_VERSION) { String reason = String.format(Locale.getDefault(), "Could not deserialize ATN with version %d (expected %d).", version, SERIALIZED_VERSION); throw new UnsupportedOperationException(new InvalidClassException(ATN.class.getName(), reason)); } - ATNType grammarType = ATNType.values()[toInt(data[p++])]; - int maxTokenType = toInt(data[p++]); + ATNType grammarType = ATNType.values()[data[p++]]; + int maxTokenType = data[p++]; ATN atn = new ATN(grammarType, maxTokenType); // @@ -104,27 +65,23 
@@ public ATN deserialize(char[] data) { // List> loopBackStateNumbers = new ArrayList>(); List> endStateNumbers = new ArrayList>(); - int nstates = toInt(data[p++]); + int nstates = data[p++]; for (int i=0; i((LoopEndState)s, loopBackStateNumber)); } else if (s instanceof BlockStartState) { - int endStateNumber = toInt(data[p++]); + int endStateNumber = data[p++]; endStateNumbers.add(new Pair((BlockStartState)s, endStateNumber)); } atn.addState(s); @@ -139,37 +96,33 @@ else if (s instanceof BlockStartState) { pair.a.endState = (BlockEndState)atn.states.get(pair.b); } - int numNonGreedyStates = toInt(data[p++]); + int numNonGreedyStates = data[p++]; for (int i = 0; i < numNonGreedyStates; i++) { - int stateNumber = toInt(data[p++]); + int stateNumber = data[p++]; ((DecisionState)atn.states.get(stateNumber)).nonGreedy = true; } - int numPrecedenceStates = toInt(data[p++]); + int numPrecedenceStates = data[p++]; for (int i = 0; i < numPrecedenceStates; i++) { - int stateNumber = toInt(data[p++]); + int stateNumber = data[p++]; ((RuleStartState)atn.states.get(stateNumber)).isLeftRecursiveRule = true; } // // RULES // - int nrules = toInt(data[p++]); + int nrules = data[p++]; if ( atn.grammarType == ATNType.LEXER ) { atn.ruleToTokenType = new int[nrules]; } atn.ruleToStartState = new RuleStartState[nrules]; for (int i=0; i sets = new ArrayList(); - - // First, read all sets with 16-bit Unicode code points <= U+FFFF. - p = deserializeSets(data, p, sets, getUnicodeDeserializer(UnicodeDeserializingMode.UNICODE_BMP)); - - // Next, deserialize sets with 32-bit arguments <= U+10FFFF. 
- p = deserializeSets(data, p, sets, getUnicodeDeserializer(UnicodeDeserializingMode.UNICODE_SMP)); + p = deserializeSets(data, p, sets); // // EDGES // - int nedges = toInt(data[p++]); + int nedges = data[p++]; for (int i=0; i"+trg+ @@ -285,9 +233,9 @@ else if (state instanceof StarLoopbackState) { // // DECISIONS // - int ndecisions = toInt(data[p++]); + int ndecisions = data[p++]; for (int i=1; i<=ndecisions; i++) { - int s = toInt(data[p++]); + int s = data[p++]; DecisionState decState = (DecisionState)atn.states.get(s); atn.decisionToState.add(decState); decState.decision = i-1; @@ -297,18 +245,11 @@ else if (state instanceof StarLoopbackState) { // LEXER ACTIONS // if (atn.grammarType == ATNType.LEXER) { - atn.lexerActions = new LexerAction[toInt(data[p++])]; + atn.lexerActions = new LexerAction[data[p++]]; for (int i = 0; i < atn.lexerActions.length; i++) { - LexerActionType actionType = LexerActionType.values()[toInt(data[p++])]; - int data1 = toInt(data[p++]); - if (data1 == 0xFFFF) { - data1 = -1; - } - - int data2 = toInt(data[p++]); - if (data2 == 0xFFFF) { - data2 = -1; - } + LexerActionType actionType = LexerActionType.values()[data[p++]]; + int data1 = data[p++]; + int data2 = data[p++]; LexerAction lexerAction = lexerActionFactory(actionType, data1, data2); @@ -415,24 +356,22 @@ else if (state instanceof StarLoopbackState) { return atn; } - private int deserializeSets(char[] data, int p, List sets, UnicodeDeserializer unicodeDeserializer) { - int nsets = toInt(data[p++]); + private int deserializeSets(int[] data, int p, List sets) { + int nsets = data[p++]; for (int i=0; i 0x7FFF + if ( v>=0x7FFF_FFFF ) { // too big to fit in 15 bits + 16 bits? 
(+1 would be 8000_0000 which is bad encoding) + throw new UnsupportedOperationException("Serialized ATN data element["+i+"] = "+v+" doesn't fit in 31 bits"); + } + v = v & 0x7FFF_FFFF; // strip high bit (sentinel) if set + data16.add((v >> 16) | 0x8000); // store high 15-bit word first and set high bit to say word follows + data16.add((v & 0xFFFF)); // then store lower 16-bit word + } + } + return data16; + } + + public static int[] decodeIntsEncodedAs16BitWords(char[] data16) { + return decodeIntsEncodedAs16BitWords(data16, false); + } + + /** Convert a list of chars (16 uint) that represent a serialized and compressed list of ints for an ATN. + * This method pairs with {@link #encodeIntsWith16BitWords(IntegerList)} above. Used only for Java Target. + */ + public static int[] decodeIntsEncodedAs16BitWords(char[] data16, boolean trimToSize) { + // will be strictly smaller but we waste bit of space to avoid copying during initialization of parsers + int[] data = new int[data16.length]; + int i = 0; + int i2 = 0; + while ( i < data16.length ) { + char v = data16[i++]; + if ( (v & 0x8000) == 0 ) { // hi bit not set? Implies 1-word value + data[i2++] = v; // 7 bit int + } + else { // hi bit set. Implies 2-word value + char vnext = data16[i++]; + if ( v==0xFFFF && vnext == 0xFFFF ) { // is it -1? 
+ data[i2++] = -1; + } + else { // 31-bit int + data[i2++] = (v & 0x7FFF) << 16 | (vnext & 0xFFFF); + } + } + } + if ( trimToSize ) { + return Arrays.copyOf(data, i2); + } + return data; + } } diff --git a/runtime/Java/src/org/antlr/v4/runtime/atn/ATNSerializer.java b/runtime/Java/src/org/antlr/v4/runtime/atn/ATNSerializer.java index e201ce117d..6a9515eb5d 100644 --- a/runtime/Java/src/org/antlr/v4/runtime/atn/ATNSerializer.java +++ b/runtime/Java/src/org/antlr/v4/runtime/atn/ATNSerializer.java @@ -10,38 +10,37 @@ import org.antlr.v4.runtime.misc.IntegerList; import org.antlr.v4.runtime.misc.Interval; import org.antlr.v4.runtime.misc.IntervalSet; -import org.antlr.v4.runtime.misc.Utils; -import java.io.InvalidClassException; -import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.LinkedHashMap; -import java.util.List; import java.util.Locale; import java.util.Map; +/** This class represents a target neutral serializer for ATNs. An ATN is converted to a list of integers + * that can be converted back to and ATN. We compute the list of integers and then generate an array + * into the target language for a particular lexer or parser. Java is a special case where we must + * generate strings instead of arrays, but that is handled outside of this class. + * See {@link ATNDeserializer#encodeIntsWith16BitWords(IntegerList)} and + * {@link org.antlr.v4.codegen.model.SerializedJavaATN}. + */ public class ATNSerializer { public ATN atn; - private List tokenNames; - private interface CodePointSerializer { - void serializeCodePoint(IntegerList data, int cp); - } + private final IntegerList data = new IntegerList(); + /** Note that we use a LinkedHashMap as a set to mainintain insertion order while deduplicating + entries with the same key. 
*/ + private final Map sets = new LinkedHashMap<>(); + private final IntegerList nonGreedyStates = new IntegerList(); + private final IntegerList precedenceStates = new IntegerList(); public ATNSerializer(ATN atn) { assert atn.grammarType != null; this.atn = atn; } - public ATNSerializer(ATN atn, List tokenNames) { - assert atn.grammarType != null; - this.atn = atn; - this.tokenNames = tokenNames; - } - /** Serialize state descriptors, edge descriptors, and decision→state map - * into list of ints: + * into list of ints. Likely out of date, but keeping as it could be helpful: * * SERIALIZED_VERSION * UUID (2 longs) @@ -65,141 +64,99 @@ public ATNSerializer(ATN atn, List tokenNames) { * * Convenient to pack into unsigned shorts to make as Java string. */ - public IntegerList serialize(String language) { - IntegerList data = new IntegerList(); + public IntegerList serialize() { + addPreamble(); + int nedges = addEdges(); + addNonGreedyStates(); + addPrecedenceStates(); + addRuleStatesAndLexerTokenTypes(); + addModeStartStates(); + Map setIndices = null; + setIndices = addSets(); + addEdges(nedges, setIndices); + addDecisionStartStates(); + addLexerActions(); + + return data; + } + + private void addPreamble() { data.add(ATNDeserializer.SERIALIZED_VERSION); // convert grammar type to ATN const to avoid dependence on ANTLRParser data.add(atn.grammarType.ordinal()); data.add(atn.maxTokenType); - int nedges = 0; - - // Note that we use a LinkedHashMap as a set to - // maintain insertion order while deduplicating - // entries with the same key. 
- Map sets = new LinkedHashMap<>(); - - // dump states, count edges and collect sets while doing so - IntegerList nonGreedyStates = new IntegerList(); - IntegerList precedenceStates = new IntegerList(); - data.add(atn.states.size()); - for (ATNState s : atn.states) { - if ( s==null ) { // might be optimized away - data.add(ATNState.INVALID_TYPE); - continue; - } - - int stateType = s.getStateType(); - if (s instanceof DecisionState && ((DecisionState)s).nonGreedy) { - nonGreedyStates.add(s.stateNumber); - } + } - if (s instanceof RuleStartState && ((RuleStartState)s).isLeftRecursiveRule) { - precedenceStates.add(s.stateNumber); - } + private void addLexerActions() { + if (atn.grammarType == ATNType.LEXER) { + data.add(atn.lexerActions.length); + for (LexerAction action : atn.lexerActions) { + data.add(action.getActionType().ordinal()); + switch (action.getActionType()) { + case CHANNEL: + int channel = ((LexerChannelAction)action).getChannel(); + data.add(channel); + data.add(0); + break; - data.add(stateType); + case CUSTOM: + int ruleIndex = ((LexerCustomAction)action).getRuleIndex(); + int actionIndex = ((LexerCustomAction)action).getActionIndex(); + data.add(ruleIndex); + data.add(actionIndex); + break; - if (s.ruleIndex == -1) { - data.add(Character.MAX_VALUE); - } - else { - data.add(s.ruleIndex); - } + case MODE: + int mode = ((LexerModeAction)action).getMode(); + data.add(mode); + data.add(0); + break; - if ( s.getStateType() == ATNState.LOOP_END ) { - data.add(((LoopEndState)s).loopBackState.stateNumber); - } - else if ( s instanceof BlockStartState ) { - data.add(((BlockStartState)s).endState.stateNumber); - } + case MORE: + data.add(0); + data.add(0); + break; - if (s.getStateType() != ATNState.RULE_STOP) { - // the deserializer can trivially derive these edges, so there's no need to serialize them - nedges += s.getNumberOfTransitions(); - } + case POP_MODE: + data.add(0); + data.add(0); + break; - for (int i=0; i0 ) { - for (ATNState modeStartState : 
atn.modeToStartState) { - data.add(modeStartState.stateNumber); - } - } - List bmpSets = new ArrayList<>(); - List smpSets = new ArrayList<>(); - for (IntervalSet set : sets.keySet()) { - if (!set.isNil() && set.getMaxElement() <= Character.MAX_VALUE) { - bmpSets.add(set); - } - else { - smpSets.add(set); - } - } - serializeSets( - data, - bmpSets, - new CodePointSerializer() { - @Override - public void serializeCodePoint(IntegerList data, int cp) { - data.add(cp); - } - }); - serializeSets( - data, - smpSets, - new CodePointSerializer() { - @Override - public void serializeCodePoint(IntegerList data, int cp) { - serializeInt(data, cp); - } - }); - Map setIndices = new HashMap<>(); - int setIndex = 0; - for (IntervalSet bmpSet : bmpSets) { - setIndices.put(bmpSet, setIndex++); - } - for (IntervalSet smpSet : smpSets) { - setIndices.put(smpSet, setIndex++); + private void addDecisionStartStates() { + int ndecisions = atn.decisionToState.size(); + data.add(ndecisions); + for (DecisionState decStartState : atn.decisionToState) { + data.add(decStartState.stateNumber); } + } + private void addEdges(int nedges, Map setIndices) { data.add(nedges); for (ATNState s : atn.states) { if ( s==null ) { @@ -248,7 +205,6 @@ public void serializeCodePoint(IntegerList data, int cp) { arg1 = 0; arg3 = 1; } - break; case Transition.ATOM : arg1 = ((AtomTransition)t).label; @@ -256,16 +212,11 @@ public void serializeCodePoint(IntegerList data, int cp) { arg1 = 0; arg3 = 1; } - break; case Transition.ACTION : ActionTransition at = (ActionTransition)t; arg1 = at.ruleIndex; arg2 = at.actionIndex; - if (arg2 == -1) { - arg2 = 0xFFFF; - } - arg3 = at.isCtxDependent ? 
1 : 0 ; break; case Transition.SET : @@ -286,93 +237,102 @@ public void serializeCodePoint(IntegerList data, int cp) { data.add(arg3); } } + } - int ndecisions = atn.decisionToState.size(); - data.add(ndecisions); - for (DecisionState decStartState : atn.decisionToState) { - data.add(decStartState.stateNumber); + private Map addSets() { + serializeSets(data, sets.keySet()); + Map setIndices = new HashMap<>(); + int setIndex = 0; + for (IntervalSet s : sets.keySet()) { + setIndices.put(s, setIndex++); } + return setIndices; + } - // - // LEXER ACTIONS - // - if (atn.grammarType == ATNType.LEXER) { - data.add(atn.lexerActions.length); - for (LexerAction action : atn.lexerActions) { - data.add(action.getActionType().ordinal()); - switch (action.getActionType()) { - case CHANNEL: - int channel = ((LexerChannelAction)action).getChannel(); - data.add(channel != -1 ? channel : 0xFFFF); - data.add(0); - break; + private void addModeStartStates() { + int nmodes = atn.modeToStartState.size(); + data.add(nmodes); + if ( nmodes>0 ) { + for (ATNState modeStartState : atn.modeToStartState) { + data.add(modeStartState.stateNumber); + } + } + } - case CUSTOM: - int ruleIndex = ((LexerCustomAction)action).getRuleIndex(); - int actionIndex = ((LexerCustomAction)action).getActionIndex(); - data.add(ruleIndex != -1 ? ruleIndex : 0xFFFF); - data.add(actionIndex != -1 ? actionIndex : 0xFFFF); - break; + private void addRuleStatesAndLexerTokenTypes() { + int nrules = atn.ruleToStartState.length; + data.add(nrules); + for (int r=0; r=0; // 0 implies fragment rule, other token types > 0 + data.add(atn.ruleToTokenType[r]); + } + } + } - case MODE: - int mode = ((LexerModeAction)action).getMode(); - data.add(mode != -1 ? 
mode : 0xFFFF); - data.add(0); - break; + private void addPrecedenceStates() { + data.add(precedenceStates.size()); + for (int i = 0; i < precedenceStates.size(); i++) { + data.add(precedenceStates.get(i)); + } + } - case MORE: - data.add(0); - data.add(0); - break; + private void addNonGreedyStates() { + data.add(nonGreedyStates.size()); + for (int i = 0; i < nonGreedyStates.size(); i++) { + data.add(nonGreedyStates.get(i)); + } + } - case POP_MODE: - data.add(0); - data.add(0); - break; + private int addEdges() { + int nedges = 0; + data.add(atn.states.size()); + for (ATNState s : atn.states) { + if ( s==null ) { // might be optimized away + data.add(ATNState.INVALID_TYPE); + continue; + } - case PUSH_MODE: - mode = ((LexerPushModeAction)action).getMode(); - data.add(mode != -1 ? mode : 0xFFFF); - data.add(0); - break; + int stateType = s.getStateType(); + if (s instanceof DecisionState && ((DecisionState)s).nonGreedy) { + nonGreedyStates.add(s.stateNumber); + } - case SKIP: - data.add(0); - data.add(0); - break; + if (s instanceof RuleStartState && ((RuleStartState)s).isLeftRecursiveRule) { + precedenceStates.add(s.stateNumber); + } - case TYPE: - int type = ((LexerTypeAction)action).getType(); - data.add(type != -1 ? 
type : 0xFFFF); - data.add(0); - break; + data.add(stateType); - default: - String message = String.format(Locale.getDefault(), "The specified lexer action type %s is not valid.", action.getActionType()); - throw new IllegalArgumentException(message); - } + data.add(s.ruleIndex); + + if ( s.getStateType() == ATNState.LOOP_END ) { + data.add(((LoopEndState)s).loopBackState.stateNumber); + } + else if ( s instanceof BlockStartState ) { + data.add(((BlockStartState)s).endState.stateNumber); } - } - boolean isJava = language.equals("Java"); - for (int i = 1; i < data.size(); i++) { - int value = data.get(i); - if (value < Character.MIN_VALUE || value > Character.MAX_VALUE) { - throw new UnsupportedOperationException("Serialized ATN data element " + - value + " element " + i + " out of range " + (int) Character.MIN_VALUE + ".." + (int) Character.MAX_VALUE); + if (s.getStateType() != ATNState.RULE_STOP) { + // the deserializer can trivially derive these edges, so there's no need to serialize them + nedges += s.getNumberOfTransitions(); } - data.set(i, isJava ? 
(value + 2) & 0xFFFF : value); + for (int i=0; i sets, - CodePointSerializer codePointSerializer) - { + private static void serializeSets(IntegerList data, Collection sets) { int nSets = sets.size(); data.add(nSets); @@ -392,209 +352,18 @@ private static void serializeSets( continue; } else { - codePointSerializer.serializeCodePoint(data, 0); + data.add(0); } } else { - codePointSerializer.serializeCodePoint(data, I.a); + data.add(I.a); } - - codePointSerializer.serializeCodePoint(data, I.b); + data.add(I.b); } } } - public String decode(char[] data) { - data = data.clone(); - // don't adjust the first value since that's the version number - for (int i = 1; i < data.length; i++) { - data[i] = (char)(data[i] - 2); - } - - StringBuilder buf = new StringBuilder(); - int p = 0; - int version = ATNDeserializer.toInt(data[p++]); - if (version != ATNDeserializer.SERIALIZED_VERSION) { - String reason = String.format("Could not deserialize ATN with version %d (expected %d).", version, ATNDeserializer.SERIALIZED_VERSION); - throw new UnsupportedOperationException(new InvalidClassException(ATN.class.getName(), reason)); - } - - p++; // skip grammarType - int maxType = ATNDeserializer.toInt(data[p++]); - buf.append("max type ").append(maxType).append("\n"); - int nstates = ATNDeserializer.toInt(data[p++]); - for (int i=0; i").append(trg) - .append(" ").append(Transition.serializationNames.get(ttype)) - .append(" ").append(arg1).append(",").append(arg2).append(",").append(arg3) - .append("\n"); - p += 6; - } - int ndecisions = ATNDeserializer.toInt(data[p++]); - for (int i=0; i0 ) { - buf.append(", "); - } - - int a = unicodeDeserializer.readUnicode(data, p); - p += unicodeDeserializer.size(); - int b = unicodeDeserializer.readUnicode(data, p); - p += unicodeDeserializer.size(); - buf.append(getTokenName(a)).append("..").append(getTokenName(b)); - } - buf.append("\n"); - } - return p; - } - - public String getTokenName(int t) { - if ( t==-1 ) return "EOF"; - - if ( 
atn.grammarType == ATNType.LEXER && - t >= Character.MIN_VALUE && t <= Character.MAX_VALUE ) - { - switch (t) { - case '\n': - return "'\\n'"; - case '\r': - return "'\\r'"; - case '\t': - return "'\\t'"; - case '\b': - return "'\\b'"; - case '\f': - return "'\\f'"; - case '\\': - return "'\\\\'"; - case '\'': - return "'\\''"; - default: - if ( Character.UnicodeBlock.of((char)t)==Character.UnicodeBlock.BASIC_LATIN && - !Character.isISOControl((char)t) ) { - return '\''+Character.toString((char)t)+'\''; - } - // turn on the bit above max "\uFFFF" value so that we pad with zeros - // then only take last 4 digits - String hex = Integer.toHexString(t|0x10000).toUpperCase().substring(1,5); - String unicodeStr = "'\\u"+hex+"'"; - return unicodeStr; - } - } - - if (tokenNames != null && t >= 0 && t < tokenNames.size()) { - return tokenNames.get(t); - } - - return String.valueOf(t); - } - - /** Used by Java target to encode short/int array as chars in string. */ - public static String getSerializedAsString(ATN atn, String language) { - return new String(getSerializedAsChars(atn, language)); - } - - public static IntegerList getSerialized(ATN atn, String language) { - return new ATNSerializer(atn).serialize(language); - } - - public static char[] getSerializedAsChars(ATN atn, String language) { - return Utils.toCharArray(getSerialized(atn, language)); - } - - private void serializeInt(IntegerList data, int value) { - data.add((char)value); - data.add((char)(value >> 16)); + public static IntegerList getSerialized(ATN atn) { + return new ATNSerializer(atn).serialize(); } } diff --git a/runtime/Java/src/org/antlr/v4/runtime/atn/ATNSimulator.java b/runtime/Java/src/org/antlr/v4/runtime/atn/ATNSimulator.java index f889a1d188..b7670abc77 100644 --- a/runtime/Java/src/org/antlr/v4/runtime/atn/ATNSimulator.java +++ b/runtime/Java/src/org/antlr/v4/runtime/atn/ATNSimulator.java @@ -85,66 +85,4 @@ public PredictionContext getCachedContext(PredictionContext context) { visited); } } - 
- /** - * @deprecated Use {@link ATNDeserializer#deserialize} instead. - */ - @Deprecated - public static ATN deserialize(char[] data) { - return new ATNDeserializer().deserialize(data); - } - - /** - * @deprecated Use {@link ATNDeserializer#checkCondition(boolean)} instead. - */ - @Deprecated - public static void checkCondition(boolean condition) { - new ATNDeserializer().checkCondition(condition); - } - - /** - * @deprecated Use {@link ATNDeserializer#checkCondition(boolean, String)} instead. - */ - @Deprecated - public static void checkCondition(boolean condition, String message) { - new ATNDeserializer().checkCondition(condition, message); - } - - /** - * @deprecated Use {@link ATNDeserializer#toInt} instead. - */ - @Deprecated - public static int toInt(char c) { - return ATNDeserializer.toInt(c); - } - - /** - * @deprecated Use {@link ATNDeserializer#toInt32} instead. - */ - @Deprecated - public static int toInt32(char[] data, int offset) { - return ATNDeserializer.toInt32(data, offset); - } - - /** - * @deprecated Use {@link ATNDeserializer#edgeFactory} instead. - */ - @Deprecated - - public static Transition edgeFactory(ATN atn, - int type, int src, int trg, - int arg1, int arg2, int arg3, - List sets) - { - return new ATNDeserializer().edgeFactory(atn, type, src, trg, arg1, arg2, arg3, sets); - } - - /** - * @deprecated Use {@link ATNDeserializer#stateFactory} instead. 
- */ - @Deprecated - public static ATNState stateFactory(int type, int ruleIndex) { - return new ATNDeserializer().stateFactory(type, ruleIndex); - } - } diff --git a/runtime/Java/src/org/antlr/v4/runtime/misc/IntegerList.java b/runtime/Java/src/org/antlr/v4/runtime/misc/IntegerList.java index 090670d5a3..d6c700d8f6 100644 --- a/runtime/Java/src/org/antlr/v4/runtime/misc/IntegerList.java +++ b/runtime/Java/src/org/antlr/v4/runtime/misc/IntegerList.java @@ -276,8 +276,9 @@ private void ensureCapacity(int capacity) { _data = Arrays.copyOf(_data, newLength); } - /** Convert the list to a UTF-16 encoded char array. If all values are less - * than the 0xFFFF 16-bit code point limit then this is just a char array + /** Convert the int list to a char array where values > 0x7FFFF take 2 bytes. TODO????? + * If all values are less + * than the 0x7FFF 16-bit code point limit (1 bit taken to indicatethen this is just a char array * of 16-bit char as usual. For values in the supplementary range, encode * them as two UTF-16 code units. */ diff --git a/runtime/Java/src/org/antlr/v4/runtime/misc/InterpreterDataReader.java b/runtime/Java/src/org/antlr/v4/runtime/misc/InterpreterDataReader.java index 953bc6b757..94980e1532 100644 --- a/runtime/Java/src/org/antlr/v4/runtime/misc/InterpreterDataReader.java +++ b/runtime/Java/src/org/antlr/v4/runtime/misc/InterpreterDataReader.java @@ -20,7 +20,7 @@ // A class to read plain text interpreter data produced by ANTLR. public class InterpreterDataReader { - + public static class InterpreterData { ATN atn; Vocabulary vocabulary; @@ -28,39 +28,39 @@ public static class InterpreterData { List channels; // Only valid for lexer grammars. List modes; // ditto }; - + /** * The structure of the data file is very simple. Everything is line based with empty lines * separating the different parts. For lexers the layout is: * token literal names: * ... - * + * * token symbolic names: * ... - * + * * rule names: * ... - * + * * channel names: * ... 
- * + * * mode names: * ... - * + * * atn: * enclosed in a pair of squared brackets. - * + * * Data for a parser does not contain channel and mode names. */ public static InterpreterData parseFile(String fileName) { InterpreterData result = new InterpreterData(); result.ruleNames = new ArrayList(); - + try (BufferedReader br = new BufferedReader(new FileReader(fileName))) { String line; List literalNames = new ArrayList(); List symbolicNames = new ArrayList(); - + line = br.readLine(); if ( !line.equals("token literal names:") ) throw new RuntimeException("Unexpected data entry"); @@ -69,7 +69,7 @@ public static InterpreterData parseFile(String fileName) { break; literalNames.add(line.equals("null") ? "" : line); } - + line = br.readLine(); if ( !line.equals("token symbolic names:") ) throw new RuntimeException("Unexpected data entry"); @@ -89,7 +89,7 @@ public static InterpreterData parseFile(String fileName) { break; result.ruleNames.add(line); } - + line = br.readLine(); if ( line.equals("channel names:") ) { // Additional lexer data. result.channels = new ArrayList(); @@ -114,19 +114,11 @@ public static InterpreterData parseFile(String fileName) { if ( !line.equals("atn:") ) throw new RuntimeException("Unexpected data entry"); line = br.readLine(); - String[] elements = line.split(","); - char[] serializedATN = new char[elements.length]; - - for (int i = 0; i < elements.length; ++i) { - int value; - String element = elements[i]; - if ( element.startsWith("[") ) - value = Integer.parseInt(element.substring(1).trim()); - else if ( element.endsWith("]") ) - value = Integer.parseInt(element.substring(0, element.length() - 1).trim()); - else - value = Integer.parseInt(element.trim()); - serializedATN[i] = (char)value; + String[] elements = line.substring(1,line.length()-1).split(","); + int[] serializedATN = new int[elements.length]; + + for (int i = 0; i < elements.length; ++i) { // ignore [...] 
on ends + serializedATN[i] = Integer.parseInt(elements[i].trim()); } ATNDeserializer deserializer = new ATNDeserializer(); @@ -135,8 +127,8 @@ else if ( element.endsWith("]") ) catch (java.io.IOException e) { // We just swallow the error and return empty objects instead. } - + return result; } - + } diff --git a/runtime/JavaScript/src/antlr4/atn/ATNDeserializer.js b/runtime/JavaScript/src/antlr4/atn/ATNDeserializer.js index 35897e650d..a6f8144dcd 100644 --- a/runtime/JavaScript/src/antlr4/atn/ATNDeserializer.js +++ b/runtime/JavaScript/src/antlr4/atn/ATNDeserializer.js @@ -82,10 +82,7 @@ class ATNDeserializer { this.readRules(atn); this.readModes(atn); const sets = []; - // First, deserialize sets with 16-bit arguments <= U+FFFF. - this.readSets(atn, sets, this.readInt.bind(this)); - // Next, deserialize sets with 32-bit arguments <= U+10FFFF. - this.readSets(atn, sets, this.readInt32.bind(this)); + this.readSets(atn, sets); this.readEdges(atn, sets); this.readDecisions(atn); this.readLexerActions(atn); @@ -125,9 +122,6 @@ class ATNDeserializer { continue; } let ruleIndex = this.readInt(); - if (ruleIndex === 0xFFFF) { - ruleIndex = -1; - } const s = this.stateFactory(stype, ruleIndex); if (stype === ATNState.LOOP_END) { // special case const loopBackStateNumber = this.readInt(); @@ -175,9 +169,6 @@ class ATNDeserializer { atn.ruleToStartState[i] = atn.states[s]; if ( atn.grammarType === ATNType.LEXER ) { let tokenType = this.readInt(); - if (tokenType === 0xFFFF) { - tokenType = Token.EOF; - } atn.ruleToTokenType[i] = tokenType; } } @@ -200,7 +191,7 @@ class ATNDeserializer { } } - readSets(atn, sets, readUnicode) { + readSets(atn, sets) { const m = this.readInt(); for (let i=0; i ATN { -// let data = str.utf16.map { element in Int(element) } var p = 0 let version = data[p] @@ -50,10 +49,6 @@ public class ATNDeserializer { var ruleIndex = data[p] p += 1 - if ruleIndex == UInt16.max { - ruleIndex = -1 - } - let s = try stateFactory(stype, ruleIndex)! 
if stype == ATNState.LOOP_END { // special case @@ -109,10 +104,6 @@ public class ATNDeserializer { if atn.grammarType == ATNType.lexer { var tokenType = data[p] p += 1 - if tokenType == UInt16.max { - tokenType = CommonToken.EOF - } - ruleToTokenType.append(tokenType) } } @@ -139,11 +130,7 @@ public class ATNDeserializer { // var sets = [IntervalSet]() - // First, deserialize sets with 16-bit arguments <= U+FFFF. - readSets(data, &p, &sets, readUnicodeInt) - - // Next, deserialize sets with 32-bit arguments <= U+10FFFF. - readSets(data, &p, &sets, readUnicodeInt32) + readSets(data, &p, &sets, readInt) // // EDGES @@ -194,16 +181,8 @@ public class ATNDeserializer { p += 1 var data1 = data[p] p += 1 - if data1 == UInt16.max { - data1 = -1 - } - var data2 = data[p] p += 1 - if data2 == UInt16.max { - data2 = -1 - } - let lexerAction = lexerActionFactory(actionType, data1, data2) lexerActions.append(lexerAction) } @@ -214,18 +193,12 @@ public class ATNDeserializer { return atn } - private func readUnicodeInt(_ data: [Int], _ p: inout Int) -> Int { + private func readInt(_ data: [Int], _ p: inout Int) -> Int { let result = data[p] p += 1 return result } - private func readUnicodeInt32(_ data: [Int], _ p: inout Int) -> Int { - let result = toInt32(data[p.. 
Int) { let nsets = data[p] p += 1 diff --git a/tool-testsuite/test/org/antlr/v4/test/tool/ATNDescriber.java b/tool-testsuite/test/org/antlr/v4/test/tool/ATNDescriber.java new file mode 100644 index 0000000000..dbc1a6bad2 --- /dev/null +++ b/tool-testsuite/test/org/antlr/v4/test/tool/ATNDescriber.java @@ -0,0 +1,203 @@ +package org.antlr.v4.test.tool; + +import org.antlr.v4.runtime.Token; +import org.antlr.v4.runtime.atn.*; + +import java.io.InvalidClassException; +import java.util.List; + +/** Make human readable set of ints from serialized ATN like this (for debugging / testing): + * + * max type 1 + * 0:TOKEN_START -1 + * 1:RULE_START 0 + * 2:RULE_STOP 0 + * 3:BASIC 0 + * 4:BASIC 0 + * rule 0:1 1 + * mode 0:0 + * 0:'a'..128169 + * 0->1 EPSILON 0,0,0 + * 1->3 EPSILON 0,0,0 + * 3->4 SET 0,0,0 + * 4->2 EPSILON 0,0,0 + * 0:0 + */ +public class ATNDescriber { + public ATN atn; + private List tokenNames; + + public ATNDescriber(ATN atn, List tokenNames) { + assert atn.grammarType != null; + this.atn = atn; + this.tokenNames = tokenNames; + } + + /** For testing really; gives a human readable version of the ATN */ + public String decode(int[] data) { + StringBuilder buf = new StringBuilder(); + int p = 0; + int version = data[p++]; + if (version != ATNDeserializer.SERIALIZED_VERSION) { + String reason = String.format("Could not deserialize ATN with version %d (expected %d).", version, ATNDeserializer.SERIALIZED_VERSION); + throw new UnsupportedOperationException(new InvalidClassException(ATN.class.getName(), reason)); + } + + p++; // skip grammarType + int maxType = data[p++]; + buf.append("max type ").append(maxType).append("\n"); + int nstates = data[p++]; + for (int i=0; i").append(trg) + .append(" ").append(Transition.serializationNames.get(ttype)) + .append(" ").append(arg1).append(",").append(arg2).append(",").append(arg3) + .append("\n"); + p += 6; + } + int ndecisions = data[p++]; + for (int i=0; i0 ) { + buf.append(", "); + } + + int a = data[p++]; + int b = 
data[p++]; + buf.append(getTokenName(a)).append("..").append(getTokenName(b)); + } + buf.append("\n"); + } + return p; + } + + public String getTokenName(int t) { + if ( t==-1 ) return "EOF"; + + if ( atn.grammarType == ATNType.LEXER && + t >= Character.MIN_VALUE && t <= Character.MAX_VALUE ) + { + switch (t) { + case '\n': + return "'\\n'"; + case '\r': + return "'\\r'"; + case '\t': + return "'\\t'"; + case '\b': + return "'\\b'"; + case '\f': + return "'\\f'"; + case '\\': + return "'\\\\'"; + case '\'': + return "'\\''"; + default: + if ( Character.UnicodeBlock.of((char)t)==Character.UnicodeBlock.BASIC_LATIN && + !Character.isISOControl((char)t) ) { + return '\''+Character.toString((char)t)+'\''; + } + // turn on the bit above max "\uFFFF" value so that we pad with zeros + // then only take last 4 digits + String hex = Integer.toHexString(t|0x10000).toUpperCase().substring(1,5); + String unicodeStr = "'\\u"+hex+"'"; + return unicodeStr; + } + } + + if (tokenNames != null && t >= 0 && t < tokenNames.size()) { + return tokenNames.get(t); + } + + return String.valueOf(t); + } + +} diff --git a/tool-testsuite/test/org/antlr/v4/test/tool/TestATNDeserialization.java b/tool-testsuite/test/org/antlr/v4/test/tool/TestATNDeserialization.java index 01301672fd..25d8127ad0 100644 --- a/tool-testsuite/test/org/antlr/v4/test/tool/TestATNDeserialization.java +++ b/tool-testsuite/test/org/antlr/v4/test/tool/TestATNDeserialization.java @@ -9,7 +9,7 @@ import org.antlr.v4.runtime.atn.ATN; import org.antlr.v4.runtime.atn.ATNDeserializer; import org.antlr.v4.runtime.atn.ATNSerializer; -import org.antlr.v4.runtime.misc.Utils; +import org.antlr.v4.runtime.misc.IntegerList; import org.antlr.v4.tool.Grammar; import org.antlr.v4.tool.LexerGrammar; import org.junit.Before; @@ -17,6 +17,9 @@ import java.util.Arrays; +import static org.antlr.v4.runtime.atn.ATNDeserializer.encodeIntsWith16BitWords; +import static org.antlr.v4.runtime.atn.ATNDeserializer.decodeIntsEncodedAs16BitWords; 
+import static org.junit.Assert.assertArrayEquals; import static org.junit.Assert.assertEquals; public class TestATNDeserialization extends BaseJavaToolTest { @@ -151,21 +154,40 @@ public void testSetUp() throws Exception { @Test public void test2ModesInLexer() throws Exception { LexerGrammar lg = new LexerGrammar( - "lexer grammar L;\n"+ - "A : 'a'\n ;\n" + - "mode M;\n" + - "B : 'b';\n" + - "mode M2;\n" + - "C : 'c';\n"); + "lexer grammar L;\n"+ + "A : 'a'\n ;\n" + + "mode M;\n" + + "B : 'b';\n" + + "mode M2;\n" + + "C : 'c';\n"); + checkDeserializationIsStable(lg); + } + + @Test public void testLastValidBMPCharInSet() throws Exception { + LexerGrammar lg = new LexerGrammar( + "lexer grammar L;\n" + + "ID : 'Ā'..'\\uFFFC'; // FFFD+ are not valid char\n"); checkDeserializationIsStable(lg); } protected void checkDeserializationIsStable(Grammar g) { ATN atn = createATN(g, false); - char[] data = Utils.toCharArray(ATNSerializer.getSerialized(atn, "Java")); - String atnData = TestATNSerialization.getDecoded(atn, Arrays.asList(g.getTokenNames())); - ATN atn2 = new ATNDeserializer().deserialize(data); - String atn2Data = TestATNSerialization.getDecoded(atn2, Arrays.asList(g.getTokenNames())); + IntegerList serialized = ATNSerializer.getSerialized(atn); + String atnData = new ATNDescriber(atn, Arrays.asList(g.getTokenNames())).decode(serialized.toArray()); + + IntegerList serialized16 = encodeIntsWith16BitWords(serialized); + int[] ints16 = serialized16.toArray(); + char[] chars = new char[ints16.length]; + for (int i = 0; i < ints16.length; i++) { + chars[i] = (char)ints16[i]; + } + int[] serialized32 = decodeIntsEncodedAs16BitWords(chars, true); + + assertArrayEquals(serialized.toArray(), serialized32); + + ATN atn2 = new ATNDeserializer().deserialize(serialized.toArray()); + IntegerList serialized1 = ATNSerializer.getSerialized(atn2); + String atn2Data = new ATNDescriber(atn2, Arrays.asList(g.getTokenNames())).decode(serialized1.toArray()); assertEquals(atnData, 
atn2Data); } diff --git a/tool-testsuite/test/org/antlr/v4/test/tool/TestATNSerialization.java b/tool-testsuite/test/org/antlr/v4/test/tool/TestATNSerialization.java index e962e6131e..7f993cf975 100644 --- a/tool-testsuite/test/org/antlr/v4/test/tool/TestATNSerialization.java +++ b/tool-testsuite/test/org/antlr/v4/test/tool/TestATNSerialization.java @@ -9,7 +9,6 @@ import org.antlr.v4.runtime.atn.ATN; import org.antlr.v4.runtime.atn.ATNSerializer; import org.antlr.v4.runtime.misc.IntegerList; -import org.antlr.v4.runtime.misc.Utils; import org.antlr.v4.tool.DOTGenerator; import org.antlr.v4.tool.Grammar; import org.antlr.v4.tool.LexerGrammar; @@ -17,8 +16,10 @@ import org.junit.Test; import java.util.Arrays; -import java.util.List; +import static org.antlr.v4.runtime.atn.ATNDeserializer.encodeIntsWith16BitWords; +import static org.antlr.v4.runtime.atn.ATNDeserializer.decodeIntsEncodedAs16BitWords; +import static org.junit.Assert.assertArrayEquals; import static org.junit.Assert.assertEquals; public class TestATNSerialization extends BaseJavaToolTest { @@ -45,31 +46,27 @@ public void testSetUp() throws Exception { "2->3 ATOM 1,0,0\n" + "3->4 ATOM 2,0,0\n" + "4->1 EPSILON 0,0,0\n"; - ATN atn = createATN(g, true); - String result = getDecoded(atn, Arrays.asList(g.getTokenNames())); - assertEquals(expecting, result); + checkResults(g, expecting); } @Test public void testEOF() throws Exception { Grammar g = new Grammar( - "parser grammar T;\n"+ - "a : A EOF ;"); + "parser grammar T;\n"+ + "a : A EOF ;"); String expecting = - "max type 1\n" + - "0:RULE_START 0\n" + - "1:RULE_STOP 0\n" + - "2:BASIC 0\n" + - "3:BASIC 0\n" + - "4:BASIC 0\n" + - "5:BASIC 0\n" + - "rule 0:0\n" + - "0->2 EPSILON 0,0,0\n" + - "2->3 ATOM 1,0,0\n" + - "3->4 ATOM 0,0,1\n" + - "4->1 EPSILON 0,0,0\n"; - ATN atn = createATN(g, true); - String result = getDecoded(atn, Arrays.asList(g.getTokenNames())); - assertEquals(expecting, result); + "max type 1\n" + + "0:RULE_START 0\n" + + "1:RULE_STOP 0\n" + + 
"2:BASIC 0\n" + + "3:BASIC 0\n" + + "4:BASIC 0\n" + + "5:BASIC 0\n" + + "rule 0:0\n" + + "0->2 EPSILON 0,0,0\n" + + "2->3 ATOM 1,0,0\n" + + "3->4 ATOM 0,0,1\n" + + "4->1 EPSILON 0,0,0\n"; + checkResults(g, expecting); } @Test public void testEOFInSet() throws Exception { @@ -88,9 +85,7 @@ public void testSetUp() throws Exception { "0->2 EPSILON 0,0,0\n" + "2->3 SET 0,0,0\n" + "3->1 EPSILON 0,0,0\n"; - ATN atn = createATN(g, true); - String result = getDecoded(atn, Arrays.asList(g.getTokenNames())); - assertEquals(expecting, result); + checkResults(g, expecting); } @Test public void testNot() throws Exception { @@ -111,8 +106,8 @@ public void testSetUp() throws Exception { "2->3 NOT_SET 0,0,0\n" + "3->1 EPSILON 0,0,0\n"; ATN atn = createATN(g, true); - DOTGenerator gen = new DOTGenerator(g); - String result = getDecoded(atn, Arrays.asList(g.getTokenNames())); + IntegerList serialized = ATNSerializer.getSerialized(atn); + String result = new ATNDescriber(atn, Arrays.asList(g.getTokenNames())).decode(serialized.toArray()); assertEquals(expecting, result); } @@ -132,9 +127,7 @@ public void testSetUp() throws Exception { "0->2 EPSILON 0,0,0\n" + "2->3 WILDCARD 0,0,0\n" + "3->1 EPSILON 0,0,0\n"; - ATN atn = createATN(g, true); - String result = getDecoded(atn, Arrays.asList(g.getTokenNames())); - assertEquals(expecting, result); + checkResults(g, expecting); } @Test public void testPEGAchillesHeel() throws Exception { @@ -160,9 +153,7 @@ public void testSetUp() throws Exception { "5->3 EPSILON 0,0,0\n" + "6->1 EPSILON 0,0,0\n" + "0:5\n"; - ATN atn = createATN(g, true); - String result = getDecoded(atn, Arrays.asList(g.getTokenNames())); - assertEquals(expecting, result); + checkResults(g, expecting); } @Test public void test3Alts() throws Exception { @@ -195,9 +186,7 @@ public void testSetUp() throws Exception { "8->5 EPSILON 0,0,0\n" + "9->1 EPSILON 0,0,0\n" + "0:8\n"; - ATN atn = createATN(g, true); - String result = getDecoded(atn, Arrays.asList(g.getTokenNames())); - 
assertEquals(expecting, result); + checkResults(g, expecting); } @Test public void testSimpleLoop() throws Exception { @@ -227,9 +216,7 @@ public void testSetUp() throws Exception { "7->8 ATOM 2,0,0\n" + "8->1 EPSILON 0,0,0\n" + "0:5\n"; - ATN atn = createATN(g, true); - String result = getDecoded(atn, Arrays.asList(g.getTokenNames())); - assertEquals(expecting, result); + checkResults(g, expecting); } @Test public void testRuleRef() throws Exception { @@ -256,9 +243,7 @@ public void testSetUp() throws Exception { "5->1 EPSILON 0,0,0\n" + "6->7 ATOM 1,0,0\n" + "7->3 EPSILON 0,0,0\n"; - ATN atn = createATN(g, true); - String result = getDecoded(atn, Arrays.asList(g.getTokenNames())); - assertEquals(expecting, result); + checkResults(g, expecting); } @Test public void testLexerTwoRules() throws Exception { @@ -290,7 +275,8 @@ public void testSetUp() throws Exception { "8->4 EPSILON 0,0,0\n" + "0:0\n"; ATN atn = createATN(lg, true); - String result = getDecoded(atn, Arrays.asList(lg.getTokenNames())); + IntegerList serialized = ATNSerializer.getSerialized(atn); + String result = new ATNDescriber(atn, Arrays.asList(lg.getTokenNames())).decode(serialized.toArray()); assertEquals(expecting, result); } @@ -314,7 +300,8 @@ public void testSetUp() throws Exception { "4->2 EPSILON 0,0,0\n" + "0:0\n"; ATN atn = createATN(lg, true); - String result = getDecoded(atn, Arrays.asList(lg.getTokenNames())); + IntegerList serialized = ATNSerializer.getSerialized(atn); + String result = new ATNDescriber(atn, Arrays.asList(lg.getTokenNames())).decode(serialized.toArray()); assertEquals(expecting, result); } @@ -338,42 +325,88 @@ public void testSetUp() throws Exception { "4->2 EPSILON 0,0,0\n" + "0:0\n"; ATN atn = createATN(lg, true); - String result = getDecoded(atn, Arrays.asList(lg.getTokenNames())); + IntegerList serialized = ATNSerializer.getSerialized(atn); + String result = new ATNDescriber(atn, Arrays.asList(lg.getTokenNames())).decode(serialized.toArray()); 
assertEquals(expecting, result); } - @Test public void testLexerUnicodeSMPSetSerializedAfterBMPSet() throws Exception { + @Test public void testLexerUnicodeSMPAndBMPSetSerialized() throws Exception { LexerGrammar lg = new LexerGrammar( - "lexer grammar L;\n"+ - "SMP : ('\\u{1F4A9}' | '\\u{1F4AF}') ;\n"+ - "BMP : ('a' | 'x') ;"); + "lexer grammar L;\n"+ + "SMP : ('\\u{1F4A9}' | '\\u{1F4AF}') ;\n"+ + "BMP : ('a' | 'x') ;"); String expecting = - "max type 2\n" + - "0:TOKEN_START -1\n" + - "1:RULE_START 0\n" + - "2:RULE_STOP 0\n" + - "3:RULE_START 1\n" + - "4:RULE_STOP 1\n" + - "5:BASIC 0\n" + - "6:BASIC 0\n" + - "7:BASIC 1\n" + - "8:BASIC 1\n" + - "rule 0:1 1\n" + - "rule 1:3 2\n" + - "mode 0:0\n" + - "0:'a'..'a', 'x'..'x'\n" + - "1:128169..128169, 128175..128175\n" + - "0->1 EPSILON 0,0,0\n" + - "0->3 EPSILON 0,0,0\n" + - "1->5 EPSILON 0,0,0\n" + - "3->7 EPSILON 0,0,0\n" + - "5->6 SET 1,0,0\n" + - "6->2 EPSILON 0,0,0\n" + - "7->8 SET 0,0,0\n" + - "8->4 EPSILON 0,0,0\n" + - "0:0\n"; + "max type 2\n" + + "0:TOKEN_START -1\n" + + "1:RULE_START 0\n" + + "2:RULE_STOP 0\n" + + "3:RULE_START 1\n" + + "4:RULE_STOP 1\n" + + "5:BASIC 0\n" + + "6:BASIC 0\n" + + "7:BASIC 1\n" + + "8:BASIC 1\n" + + "rule 0:1 1\n" + + "rule 1:3 2\n" + + "mode 0:0\n" + + "0:128169..128169, 128175..128175\n" + + "1:'a'..'a', 'x'..'x'\n" + + "0->1 EPSILON 0,0,0\n" + + "0->3 EPSILON 0,0,0\n" + + "1->5 EPSILON 0,0,0\n" + + "3->7 EPSILON 0,0,0\n" + + "5->6 SET 0,0,0\n" + + "6->2 EPSILON 0,0,0\n" + + "7->8 SET 1,0,0\n" + + "8->4 EPSILON 0,0,0\n" + + "0:0\n"; + ATN atn = createATN(lg, true); + IntegerList serialized = ATNSerializer.getSerialized(atn); + String result = new ATNDescriber(atn, Arrays.asList(lg.getTokenNames())).decode(serialized.toArray()); + assertEquals(expecting, result); + } + + @Test public void testLexerWith0xFFFCInSet() throws Exception { + LexerGrammar lg = new LexerGrammar( + "lexer grammar L;\n" + + "ID : ([A-Z_]|'Ā'..'\\uFFFC') ([A-Z_0-9]|'Ā'..'\\uFFFC')*; // FFFD+ are not valid 
char\n"); + String expecting = + "max type 1\n" + + "0:TOKEN_START -1\n" + + "1:RULE_START 0\n" + + "2:RULE_STOP 0\n" + + "3:BASIC 0\n" + + "4:BLOCK_START 0 5\n" + + "5:BLOCK_END 0\n" + + "6:BASIC 0\n" + + "7:STAR_BLOCK_START 0 8\n" + + "8:BLOCK_END 0\n" + + "9:STAR_LOOP_ENTRY 0\n" + + "10:LOOP_END 0 11\n" + + "11:STAR_LOOP_BACK 0\n" + + "rule 0:1 1\n" + + "mode 0:0\n" + + "0:'A'..'Z', '_'..'_', '\\u0100'..'\\uFFFC'\n" + + "1:'0'..'9', 'A'..'Z', '_'..'_', '\\u0100'..'\\uFFFC'\n" + + "0->1 EPSILON 0,0,0\n" + + "1->4 EPSILON 0,0,0\n" + + "3->5 SET 0,0,0\n" + + "4->3 EPSILON 0,0,0\n" + + "5->9 EPSILON 0,0,0\n" + + "6->8 SET 1,0,0\n" + + "7->6 EPSILON 0,0,0\n" + + "8->11 EPSILON 0,0,0\n" + + "9->7 EPSILON 0,0,0\n" + + "9->10 EPSILON 0,0,0\n" + + "10->2 EPSILON 0,0,0\n" + + "11->9 EPSILON 0,0,0\n" + + "0:0\n" + + "1:4\n" + + "2:7\n" + + "3:9\n"; ATN atn = createATN(lg, true); - String result = getDecoded(atn, Arrays.asList(lg.getTokenNames())); + IntegerList serialized = ATNSerializer.getSerialized(atn); + String result = new ATNDescriber(atn, Arrays.asList(lg.getTokenNames())).decode(serialized.toArray()); assertEquals(expecting, result); } @@ -397,7 +430,8 @@ public void testSetUp() throws Exception { "4->2 EPSILON 0,0,0\n" + "0:0\n"; ATN atn = createATN(lg, true); - String result = getDecoded(atn, Arrays.asList(lg.getTokenNames())); + IntegerList serialized = ATNSerializer.getSerialized(atn); + String result = new ATNDescriber(atn, Arrays.asList(lg.getTokenNames())).decode(serialized.toArray()); assertEquals(expecting, result); } @@ -420,7 +454,8 @@ public void testSetUp() throws Exception { "4->2 EPSILON 0,0,0\n" + "0:0\n"; ATN atn = createATN(lg, true); - String result = getDecoded(atn, Arrays.asList(lg.getTokenNames())); + IntegerList serialized = ATNSerializer.getSerialized(atn); + String result = new ATNDescriber(atn, Arrays.asList(lg.getTokenNames())).decode(serialized.toArray()); assertEquals(expecting, result); } @@ -445,7 +480,8 @@ public void testSetUp() 
throws Exception { "5->2 EPSILON 0,0,0\n" + "0:0\n"; ATN atn = createATN(lg, true); - String result = getDecoded(atn, Arrays.asList(lg.getTokenNames())); + IntegerList serialized = ATNSerializer.getSerialized(atn); + String result = new ATNDescriber(atn, Arrays.asList(lg.getTokenNames())).decode(serialized.toArray()); assertEquals(expecting, result); } @@ -474,7 +510,8 @@ public void testSetUp() throws Exception { "0:0\n" + "1:5\n"; ATN atn = createATN(lg, true); - String result = getDecoded(atn, Arrays.asList(lg.getTokenNames())); + IntegerList serialized = ATNSerializer.getSerialized(atn); + String result = new ATNDescriber(atn, Arrays.asList(lg.getTokenNames())).decode(serialized.toArray()); assertEquals(expecting, result); } @@ -505,7 +542,8 @@ public void testSetUp() throws Exception { "0:0\n" + "1:6\n"; ATN atn = createATN(lg, true); - String result = getDecoded(atn, Arrays.asList(lg.getTokenNames())); + IntegerList serialized = ATNSerializer.getSerialized(atn); + String result = new ATNDescriber(atn, Arrays.asList(lg.getTokenNames())).decode(serialized.toArray()); assertEquals(expecting, result); } @@ -552,7 +590,8 @@ public void testSetUp() throws Exception { "14->6 EPSILON 0,0,0\n" + "0:0\n"; ATN atn = createATN(lg, true); - String result = getDecoded(atn, Arrays.asList(lg.getTokenNames())); + IntegerList serialized = ATNSerializer.getSerialized(atn); + String result = new ATNDescriber(atn, Arrays.asList(lg.getTokenNames())).decode(serialized.toArray()); assertEquals(expecting, result); } @@ -575,9 +614,7 @@ public void testSetUp() throws Exception { "3->4 NOT_SET 0,0,0\n" + "4->2 EPSILON 0,0,0\n" + "0:0\n"; - ATN atn = createATN(lg, true); - String result = getDecoded(atn, Arrays.asList(lg.getTokenNames())); - assertEquals(expecting, result); + checkResults(lg, expecting); } @Test public void testLexerSetWithRange() throws Exception { @@ -600,7 +637,8 @@ public void testSetUp() throws Exception { "4->2 EPSILON 0,0,0\n" + "0:0\n"; ATN atn = createATN(lg, 
true); - String result = getDecoded(atn, Arrays.asList(lg.getTokenNames())); + IntegerList serialized = ATNSerializer.getSerialized(atn); + String result = new ATNDescriber(atn, Arrays.asList(lg.getTokenNames())).decode(serialized.toArray()); assertEquals(expecting, result); } @@ -624,7 +662,8 @@ public void testSetUp() throws Exception { "4->2 EPSILON 0,0,0\n" + "0:0\n"; ATN atn = createATN(lg, true); - String result = getDecoded(atn, Arrays.asList(lg.getTokenNames())); + IntegerList serialized = ATNSerializer.getSerialized(atn); + String result = new ATNDescriber(atn, Arrays.asList(lg.getTokenNames())).decode(serialized.toArray()); assertEquals(expecting, result); } @@ -648,7 +687,8 @@ public void testSetUp() throws Exception { "4->2 EPSILON 0,0,0\n" + "0:0\n"; ATN atn = createATN(lg, true); - String result = getDecoded(atn, Arrays.asList(lg.getTokenNames())); + IntegerList serialized = ATNSerializer.getSerialized(atn); + String result = new ATNDescriber(atn, Arrays.asList(lg.getTokenNames())).decode(serialized.toArray()); assertEquals(expecting, result); } @@ -672,7 +712,8 @@ public void testSetUp() throws Exception { "4->2 EPSILON 0,0,0\n" + "0:0\n"; ATN atn = createATN(lg, true); - String result = getDecoded(atn, Arrays.asList(lg.getTokenNames())); + IntegerList serialized = ATNSerializer.getSerialized(atn); + String result = new ATNDescriber(atn, Arrays.asList(lg.getTokenNames())).decode(serialized.toArray()); assertEquals(expecting, result); } @@ -696,7 +737,8 @@ public void testSetUp() throws Exception { "4->2 EPSILON 0,0,0\n" + "0:0\n"; ATN atn = createATN(lg, true); - String result = getDecoded(atn, Arrays.asList(lg.getTokenNames())); + IntegerList serialized = ATNSerializer.getSerialized(atn); + String result = new ATNDescriber(atn, Arrays.asList(lg.getTokenNames())).decode(serialized.toArray()); assertEquals(expecting, result); } @@ -720,7 +762,8 @@ public void testSetUp() throws Exception { "4->2 EPSILON 0,0,0\n" + "0:0\n"; ATN atn = createATN(lg, 
true); - String result = getDecoded(atn, Arrays.asList(lg.getTokenNames())); + IntegerList serialized = ATNSerializer.getSerialized(atn); + String result = new ATNDescriber(atn, Arrays.asList(lg.getTokenNames())).decode(serialized.toArray()); assertEquals(expecting, result); } @@ -744,7 +787,8 @@ public void testSetUp() throws Exception { "4->2 EPSILON 0,0,0\n" + "0:0\n"; ATN atn = createATN(lg, true); - String result = getDecoded(atn, Arrays.asList(lg.getTokenNames())); + IntegerList serialized = ATNSerializer.getSerialized(atn); + String result = new ATNDescriber(atn, Arrays.asList(lg.getTokenNames())).decode(serialized.toArray()); assertEquals(expecting, result); } @@ -768,7 +812,8 @@ public void testSetUp() throws Exception { "4->2 EPSILON 0,0,0\n" + "0:0\n"; ATN atn = createATN(lg, true); - String result = getDecoded(atn, Arrays.asList(lg.getTokenNames())); + IntegerList serialized = ATNSerializer.getSerialized(atn); + String result = new ATNDescriber(atn, Arrays.asList(lg.getTokenNames())).decode(serialized.toArray()); assertEquals(expecting, result); } @@ -792,7 +837,8 @@ public void testSetUp() throws Exception { "4->2 EPSILON 0,0,0\n" + "0:0\n"; ATN atn = createATN(lg, true); - String result = getDecoded(atn, Arrays.asList(lg.getTokenNames())); + IntegerList serialized = ATNSerializer.getSerialized(atn); + String result = new ATNDescriber(atn, Arrays.asList(lg.getTokenNames())).decode(serialized.toArray()); assertEquals(expecting, result); } @@ -816,7 +862,8 @@ public void testSetUp() throws Exception { "4->2 EPSILON 0,0,0\n" + "0:0\n"; ATN atn = createATN(lg, true); - String result = getDecoded(atn, Arrays.asList(lg.getTokenNames())); + IntegerList serialized = ATNSerializer.getSerialized(atn); + String result = new ATNDescriber(atn, Arrays.asList(lg.getTokenNames())).decode(serialized.toArray()); assertEquals(expecting, result); } @@ -840,7 +887,8 @@ public void testSetUp() throws Exception { "4->2 EPSILON 0,0,0\n" + "0:0\n"; ATN atn = createATN(lg, 
true); - String result = getDecoded(atn, Arrays.asList(lg.getTokenNames())); + IntegerList serialized = ATNSerializer.getSerialized(atn); + String result = new ATNDescriber(atn, Arrays.asList(lg.getTokenNames())).decode(serialized.toArray()); assertEquals(expecting, result); } @@ -903,7 +951,8 @@ public void testSetUp() throws Exception { "1:1\n" + "2:11\n"; ATN atn = createATN(lg, true); - String result = getDecoded(atn, Arrays.asList(lg.getTokenNames())); + IntegerList serialized = ATNSerializer.getSerialized(atn); + String result = new ATNDescriber(atn, Arrays.asList(lg.getTokenNames())).decode(serialized.toArray()); assertEquals(expecting, result); } @@ -930,7 +979,8 @@ public void testSetUp() throws Exception { "5->2 EPSILON 0,0,0\n" + "0:0\n"; ATN atn = createATN(lg, true); - String result = getDecoded(atn, Arrays.asList(lg.getTokenNames())); + IntegerList serialized = ATNSerializer.getSerialized(atn); + String result = new ATNDescriber(atn, Arrays.asList(lg.getTokenNames())).decode(serialized.toArray()); assertEquals(expecting, result); } @@ -987,7 +1037,8 @@ public void testSetUp() throws Exception { "0:0\n" + "1:1\n"; ATN atn = createATN(lg, true); - String result = getDecoded(atn, Arrays.asList(lg.getTokenNames())); + IntegerList serialized = ATNSerializer.getSerialized(atn); + String result = new ATNDescriber(atn, Arrays.asList(lg.getTokenNames())).decode(serialized.toArray()); assertEquals(expecting, result); } @@ -1037,14 +1088,23 @@ public void testSetUp() throws Exception { "0:0\n" + "1:1\n" + "2:2\n"; - ATN atn = createATN(lg, true); - String result = getDecoded(atn, Arrays.asList(lg.getTokenNames())); - assertEquals(expecting, result); + checkResults(lg, expecting); } - public static String getDecoded(ATN atn, List tokenNames) { - IntegerList serialized = ATNSerializer.getSerialized(atn, "Java"); - char[] data = Utils.toCharArray(serialized); - return new ATNSerializer(atn, tokenNames).decode(data); + private void checkResults(Grammar g, String 
expecting) { + ATN atn = createATN(g, true); + IntegerList serialized = ATNSerializer.getSerialized(atn); + String result = new ATNDescriber(atn, Arrays.asList(g.getTokenNames())).decode(serialized.toArray()); + assertEquals(expecting, result); + + IntegerList serialized16 = encodeIntsWith16BitWords(serialized); + int[] ints16 = serialized16.toArray(); + char[] chars = new char[ints16.length]; + for (int i = 0; i < ints16.length; i++) { + chars[i] = (char)ints16[i]; + } + int[] serialized32 = decodeIntsEncodedAs16BitWords(chars, true); + + assertArrayEquals(serialized.toArray(), serialized32); } } diff --git a/tool-testsuite/test/org/antlr/v4/test/tool/CharSupportTest.java b/tool-testsuite/test/org/antlr/v4/test/tool/TestCharSupport.java similarity index 99% rename from tool-testsuite/test/org/antlr/v4/test/tool/CharSupportTest.java rename to tool-testsuite/test/org/antlr/v4/test/tool/TestCharSupport.java index e9ceab2ae4..eb3ba3ad18 100644 --- a/tool-testsuite/test/org/antlr/v4/test/tool/CharSupportTest.java +++ b/tool-testsuite/test/org/antlr/v4/test/tool/TestCharSupport.java @@ -11,8 +11,7 @@ import org.junit.Assert; import org.junit.Test; -public class CharSupportTest { - +public class TestCharSupport { @Test public void testGetANTLRCharLiteralForChar() { Assert.assertEquals("''", diff --git a/tool-testsuite/test/org/antlr/v4/misc/UtilsTest.java b/tool-testsuite/test/org/antlr/v4/test/tool/TestUtils.java similarity index 97% rename from tool-testsuite/test/org/antlr/v4/misc/UtilsTest.java rename to tool-testsuite/test/org/antlr/v4/test/tool/TestUtils.java index 0c008224f8..c9ba151888 100644 --- a/tool-testsuite/test/org/antlr/v4/misc/UtilsTest.java +++ b/tool-testsuite/test/org/antlr/v4/test/tool/TestUtils.java @@ -1,15 +1,14 @@ -package org.antlr.v4.misc; +package org.antlr.v4.test.tool; import org.antlr.runtime.Token; -import org.antlr.v4.codegen.CodeGenerator; +import org.antlr.v4.misc.Utils; import org.antlr.v4.tool.ast.GrammarAST; import org.junit.Assert; 
import org.junit.Test; import java.util.ArrayList; -public class UtilsTest { - +public class TestUtils { @Test public void testStripFileExtension() { Assert.assertNull(Utils.stripFileExtension(null)); diff --git a/tool/resources/org/antlr/v4/tool/templates/codegen/Cpp/Cpp.stg b/tool/resources/org/antlr/v4/tool/templates/codegen/Cpp/Cpp.stg index 1bb19827bc..9ac3fea418 100644 --- a/tool/resources/org/antlr/v4/tool/templates/codegen/Cpp/Cpp.stg +++ b/tool/resources/org/antlr/v4/tool/templates/codegen/Cpp/Cpp.stg @@ -79,7 +79,7 @@ public: const antlr4::dfa::Vocabulary& getVocabulary() const override; - virtual const std::vector\& getSerializedATN() const override; + virtual const std::vector\& getSerializedATN() const override; virtual const antlr4::atn::ATN& getATN() const override; @@ -138,7 +138,7 @@ struct StaticData final { const std::vector\ literalNames; const std::vector\ symbolicNames; const antlr4::dfa::Vocabulary vocabulary; - std::vector\ serializedATN; + std::vector\ serializedATN; std::unique_ptr\ atn; }; @@ -199,7 +199,7 @@ const dfa::Vocabulary& ::getVocabulary() const { return LexerStaticData->vocabulary; } -const std::vector\& ::getSerializedATN() const { +const std::vector\& ::getSerializedATN() const { return LexerStaticData->serializedATN; } @@ -307,7 +307,7 @@ public: const antlr4::dfa::Vocabulary& getVocabulary() const override; - const std::vector\& getSerializedATN() const override; + const std::vector\& getSerializedATN() const override; @@ -356,7 +356,7 @@ struct StaticData final { const std::vector\ literalNames; const std::vector\ symbolicNames; const antlr4::dfa::Vocabulary vocabulary; - std::vector\ serializedATN; + std::vector\ serializedATN; std::unique_ptr\ atn; }; @@ -407,7 +407,7 @@ const dfa::Vocabulary& ::getVocabulary() const { return ParserStaticData->vocabulary; } -const std::vector\& ::getSerializedATN() const { +const std::vector\& ::getSerializedATN() const { return ParserStaticData->serializedATN; } @@ -438,7 +438,7 @@ 
SerializedATNHeader(model) ::= << >> SerializedATN(model) ::= << -static const uint16_t serializedATNSegment[] = { +static const int32_t serializedATNSegment[] = { }; separator=",", wrap> }; staticData->serializedATN.reserve(sizeof(serializedATNSegment) / sizeof(serializedATNSegment[0])); diff --git a/tool/resources/org/antlr/v4/tool/templates/codegen/Go/Go.stg b/tool/resources/org/antlr/v4/tool/templates/codegen/Go/Go.stg index f95fc8e34f..a4d4b52dce 100644 --- a/tool/resources/org/antlr/v4/tool/templates/codegen/Go/Go.stg +++ b/tool/resources/org/antlr/v4/tool/templates/codegen/Go/Go.stg @@ -151,7 +151,7 @@ type struct { var ParserStaticData struct { once sync.Once - serializedATN []uint16 + serializedATN []int32 literalNames []string symbolicNames []string ruleNames []string @@ -180,7 +180,7 @@ func ParserInit() { staticData.predictionContextCache = antlr.NewPredictionContextCache() staticData.serializedATN = deserializer := antlr.NewATNDeserializer(nil) - staticData.atn = deserializer.DeserializeFromUInt16(staticData.serializedATN) + staticData.atn = deserializer.Deserialize(staticData.serializedATN) atn := staticData.atn staticData.decisionToDFA = make([]*antlr.DFA, len(atn.DecisionToState)) decisionToDFA := staticData.decisionToDFA @@ -1424,7 +1424,7 @@ type struct { var LexerStaticData struct { once sync.Once - serializedATN []uint16 + serializedATN []int32 channelNames []string modeNames []string literalNames []string @@ -1461,7 +1461,7 @@ func LexerInit() { staticData.predictionContextCache = antlr.NewPredictionContextCache() staticData.serializedATN = deserializer := antlr.NewATNDeserializer(nil) - staticData.atn = deserializer.DeserializeFromUInt16(staticData.serializedATN) + staticData.atn = deserializer.Deserialize(staticData.serializedATN) atn := staticData.atn staticData.decisionToDFA = make([]*antlr.DFA, len(atn.DecisionToState)) decisionToDFA := staticData.decisionToDFA @@ -1541,7 +1541,7 @@ const = 1 >> SerializedATN(model) ::= << -[]uint16{ 
+[]int32{ , } >> diff --git a/tool/resources/org/antlr/v4/tool/templates/codegen/Java/Java.stg b/tool/resources/org/antlr/v4/tool/templates/codegen/Java/Java.stg index 47aafe4afa..5aad297348 100644 --- a/tool/resources/org/antlr/v4/tool/templates/codegen/Java/Java.stg +++ b/tool/resources/org/antlr/v4/tool/templates/codegen/Java/Java.stg @@ -964,7 +964,7 @@ public class extends { } >> -SerializedATN(model) ::= << +SerializedJavaATN(model) ::= << = diff --git a/tool/src/org/antlr/v4/Tool.java b/tool/src/org/antlr/v4/Tool.java index e55cfab8ba..0b13ea23c4 100644 --- a/tool/src/org/antlr/v4/Tool.java +++ b/tool/src/org/antlr/v4/Tool.java @@ -738,10 +738,10 @@ public static String generateInterpreterData(Grammar g) { } content.append("\n"); - IntegerList serializedATN = ATNSerializer.getSerialized(g.atn, g.getLanguage()); + IntegerList serializedATN = ATNSerializer.getSerialized(g.atn); // Uncomment if you'd like to write out histogram info on the numbers of // each integer value: - // Utils.writeSerializedATNIntegerHistogram(g.name+"-histo.csv", serializedATN); + //Utils.writeSerializedATNIntegerHistogram(g.name+"-histo.csv", serializedATN); content.append("atn:\n"); content.append(serializedATN.toString()); diff --git a/tool/src/org/antlr/v4/codegen/Target.java b/tool/src/org/antlr/v4/codegen/Target.java index 5fb03409c8..618d891a21 100644 --- a/tool/src/org/antlr/v4/codegen/Target.java +++ b/tool/src/org/antlr/v4/codegen/Target.java @@ -337,7 +337,7 @@ protected boolean shouldUseUnicodeEscapeForCodePointInDoubleQuotedString(int cod } /** Assume 16-bit char */ - public String encodeIntAsCharEscape(int v) { + public String encodeInt16AsCharEscape(int v) { if (v < Character.MIN_VALUE || v > Character.MAX_VALUE) { throw new IllegalArgumentException(String.format("Cannot encode the specified value: %d", v)); } @@ -505,7 +505,7 @@ public String getBaseVisitorFileName(boolean header) { * in a single segment (a declaration in target language) of the serialized ATN. 
* E.g., in C++, a small segment length results in multiple decls like: * - * static const uint16_t serializedATNSegment1[] = { + * static const int32_t serializedATNSegment1[] = { * 0x7, 0x12, 0x2, 0x13, 0x7, 0x13, 0x2, 0x14, 0x7, 0x14, 0x2, 0x15, 0x7, * 0x15, 0x2, 0x16, 0x7, 0x16, 0x2, 0x17, 0x7, 0x17, 0x2, 0x18, 0x7, * 0x18, 0x2, 0x19, 0x7, 0x19, 0x2, 0x1a, 0x7, 0x1a, 0x2, 0x1b, 0x7, diff --git a/tool/src/org/antlr/v4/codegen/model/Recognizer.java b/tool/src/org/antlr/v4/codegen/model/Recognizer.java index 5b63bed4b2..8e07c29d2f 100644 --- a/tool/src/org/antlr/v4/codegen/model/Recognizer.java +++ b/tool/src/org/antlr/v4/codegen/model/Recognizer.java @@ -9,6 +9,7 @@ import org.antlr.v4.codegen.OutputModelFactory; import org.antlr.v4.codegen.model.chunk.ActionChunk; import org.antlr.v4.codegen.model.chunk.ActionText; +import org.antlr.v4.codegen.target.JavaTarget; import org.antlr.v4.tool.Grammar; import org.antlr.v4.tool.Rule; @@ -49,6 +50,7 @@ public Recognizer(OutputModelFactory factory) { super(factory); Grammar g = factory.getGrammar(); + CodeGenerator gen = factory.getGenerator(); grammarFileName = new File(g.fileName).getName(); grammarName = g.name; name = g.getRecognizerName(); @@ -63,7 +65,12 @@ public Recognizer(OutputModelFactory factory) { ruleNames = g.rules.keySet(); rules = g.rules.values(); - atn = new SerializedATN(factory, g.atn); + if ( gen.getTarget() instanceof JavaTarget ) { + atn = new SerializedJavaATN(factory, g.atn); + } + else { + atn = new SerializedATN(factory, g.atn); + } if (g.getOptionString("superClass") != null) { superClass = new ActionText(null, g.getOptionString("superClass")); } @@ -71,7 +78,6 @@ public Recognizer(OutputModelFactory factory) { superClass = null; } - CodeGenerator gen = factory.getGenerator(); tokenNames = translateTokenStringsToTarget(g.getTokenDisplayNames(), gen); literalNames = translateTokenStringsToTarget(g.getTokenLiteralNames(), gen); symbolicNames = 
translateTokenStringsToTarget(g.getTokenSymbolicNames(), gen); diff --git a/tool/src/org/antlr/v4/codegen/model/SerializedATN.java b/tool/src/org/antlr/v4/codegen/model/SerializedATN.java index 366499bb84..ad01932ef3 100644 --- a/tool/src/org/antlr/v4/codegen/model/SerializedATN.java +++ b/tool/src/org/antlr/v4/codegen/model/SerializedATN.java @@ -12,32 +12,21 @@ import org.antlr.v4.runtime.atn.ATNSerializer; import org.antlr.v4.runtime.misc.IntegerList; +/** Represents a serialized ATN that is just a list of signed integers; works for all targets + * except for java, which requires a 16-bit char encoding. See {@link SerializedJavaATN}. + */ public class SerializedATN extends OutputModelObject { - public final String[] serialized; - public final String[][] segments; + public int[] serialized; - public SerializedATN(OutputModelFactory factory, ATN atn) { + public SerializedATN(OutputModelFactory factory) { super(factory); - Target target = factory.getGenerator().getTarget(); - IntegerList data = ATNSerializer.getSerialized(atn, target.getLanguage()); - int size = data.size(); - int segmentLimit = target.getSerializedATNSegmentLimit(); - segments = new String[(int)(((long)size + segmentLimit - 1) / segmentLimit)][]; - int segmentIndex = 0; - - for (int i = 0; i < size; i += segmentLimit) { - int segmentSize = Math.min(i + segmentLimit, size) - i; - String[] segment = new String[segmentSize]; - segments[segmentIndex++] = segment; - for (int j = 0; j < segmentSize; j++) { - segment[j] = target.encodeIntAsCharEscape(data.get(i + j)); - } - } - - serialized = segments[0]; } - public String[][] getSegments() { - return segments; + public SerializedATN(OutputModelFactory factory, ATN atn) { + super(factory); + IntegerList data = ATNSerializer.getSerialized(atn); + serialized = data.toArray(); } + + public Object getSerialized() { return serialized; } } diff --git a/tool/src/org/antlr/v4/codegen/model/SerializedJavaATN.java 
b/tool/src/org/antlr/v4/codegen/model/SerializedJavaATN.java new file mode 100644 index 0000000000..46d53e755e --- /dev/null +++ b/tool/src/org/antlr/v4/codegen/model/SerializedJavaATN.java @@ -0,0 +1,40 @@ +package org.antlr.v4.codegen.model; + +import org.antlr.v4.codegen.OutputModelFactory; +import org.antlr.v4.codegen.Target; +import org.antlr.v4.runtime.atn.ATN; +import org.antlr.v4.runtime.atn.ATNDeserializer; +import org.antlr.v4.runtime.atn.ATNSerializer; +import org.antlr.v4.runtime.misc.IntegerList; + +/** A serialized ATN for the java target, which requires we use strings and 16-bit unicode values */ +public class SerializedJavaATN extends SerializedATN { + private final String[] serializedAsString; + private final String[][] segments; + + public SerializedJavaATN(OutputModelFactory factory, ATN atn) { + super(factory); + IntegerList data = ATNSerializer.getSerialized(atn); + data = ATNDeserializer.encodeIntsWith16BitWords(data); + + int size = data.size(); + Target target = factory.getGenerator().getTarget(); + int segmentLimit = target.getSerializedATNSegmentLimit(); + segments = new String[(int)(((long)size + segmentLimit - 1) / segmentLimit)][]; + int segmentIndex = 0; + + for (int i = 0; i < size; i += segmentLimit) { + int segmentSize = Math.min(i + segmentLimit, size) - i; + String[] segment = new String[segmentSize]; + segments[segmentIndex++] = segment; + for (int j = 0; j < segmentSize; j++) { + segment[j] = target.encodeInt16AsCharEscape(data.get(i + j)); + } + } + + serializedAsString = segments[0]; // serializedAsString is valid if only one segment + } + + public Object getSerialized() { return serializedAsString; } + public String[][] getSegments() { return segments; } +} diff --git a/tool/src/org/antlr/v4/tool/Grammar.java b/tool/src/org/antlr/v4/tool/Grammar.java index a64a825a0a..1406d004b1 100644 --- a/tool/src/org/antlr/v4/tool/Grammar.java +++ b/tool/src/org/antlr/v4/tool/Grammar.java @@ -29,10 +29,7 @@ import 
org.antlr.v4.runtime.atn.ATNSerializer; import org.antlr.v4.runtime.atn.SemanticContext; import org.antlr.v4.runtime.dfa.DFA; -import org.antlr.v4.runtime.misc.IntSet; -import org.antlr.v4.runtime.misc.Interval; -import org.antlr.v4.runtime.misc.IntervalSet; -import org.antlr.v4.runtime.misc.Pair; +import org.antlr.v4.runtime.misc.*; import org.antlr.v4.tool.ast.ActionAST; import org.antlr.v4.tool.ast.GrammarAST; import org.antlr.v4.tool.ast.GrammarASTWithOptions; @@ -1322,13 +1319,22 @@ public LexerInterpreter createLexerInterpreter(CharStream input) { return implicitLexer.createLexerInterpreter(input); } - char[] serializedAtn = ATNSerializer.getSerializedAsChars(atn, getLanguage()); - ATN deserialized = new ATNDeserializer().deserialize(serializedAtn); List allChannels = new ArrayList(); allChannels.add("DEFAULT_TOKEN_CHANNEL"); allChannels.add("HIDDEN"); allChannels.addAll(channelValueToNameList); - return new LexerInterpreter(fileName, getVocabulary(), Arrays.asList(getRuleNames()), allChannels, ((LexerGrammar)this).modes.keySet(), deserialized, input); + + // must run ATN through serializer to set some state flags + IntegerList serialized = ATNSerializer.getSerialized(atn); + ATN deserializedATN = new ATNDeserializer().deserialize(serialized.toArray()); + return new LexerInterpreter( + fileName, + getVocabulary(), + Arrays.asList(getRuleNames()), + allChannels, + ((LexerGrammar)this).modes.keySet(), + deserializedATN, + input); } /** @since 4.5.1 */ @@ -1336,9 +1342,11 @@ public GrammarParserInterpreter createGrammarParserInterpreter(TokenStream token if (this.isLexer()) { throw new IllegalStateException("A parser interpreter can only be created for a parser or combined grammar."); } - char[] serializedAtn = ATNSerializer.getSerializedAsChars(atn, getLanguage()); - ATN deserialized = new ATNDeserializer().deserialize(serializedAtn); - return new GrammarParserInterpreter(this, deserialized, tokenStream); + // must run ATN through serializer to set some state 
flags + IntegerList serialized = ATNSerializer.getSerialized(atn); + ATN deserializedATN = new ATNDeserializer().deserialize(serialized.toArray()); + + return new GrammarParserInterpreter(this, deserializedATN, tokenStream); } public ParserInterpreter createParserInterpreter(TokenStream tokenStream) { @@ -1346,8 +1354,10 @@ public ParserInterpreter createParserInterpreter(TokenStream tokenStream) { throw new IllegalStateException("A parser interpreter can only be created for a parser or combined grammar."); } - char[] serializedAtn = ATNSerializer.getSerializedAsChars(atn, getLanguage()); - ATN deserialized = new ATNDeserializer().deserialize(serializedAtn); - return new ParserInterpreter(fileName, getVocabulary(), Arrays.asList(getRuleNames()), deserialized, tokenStream); + // must run ATN through serializer to set some state flags + IntegerList serialized = ATNSerializer.getSerialized(atn); + ATN deserializedATN = new ATNDeserializer().deserialize(serialized.toArray()); + + return new ParserInterpreter(fileName, getVocabulary(), Arrays.asList(getRuleNames()), deserializedATN, tokenStream); } } diff --git a/tool/src/org/antlr/v4/tool/GrammarParserInterpreter.java b/tool/src/org/antlr/v4/tool/GrammarParserInterpreter.java index effaac22e5..8c2ddecccc 100644 --- a/tool/src/org/antlr/v4/tool/GrammarParserInterpreter.java +++ b/tool/src/org/antlr/v4/tool/GrammarParserInterpreter.java @@ -24,6 +24,7 @@ import org.antlr.v4.runtime.atn.PredictionMode; import org.antlr.v4.runtime.atn.RuleStartState; import org.antlr.v4.runtime.atn.StarLoopEntryState; +import org.antlr.v4.runtime.misc.IntegerList; import org.antlr.v4.runtime.misc.Interval; import org.antlr.v4.runtime.tree.Trees; @@ -401,12 +402,12 @@ public static ParserInterpreter deriveTempParserInterpreter(Grammar g, Parser or } } else { // must've been a generated parser - char[] serializedAtn = ATNSerializer.getSerializedAsChars(originalParser.getATN(), g.getLanguage()); - ATN deserialized = new 
ATNDeserializer().deserialize(serializedAtn); +// IntegerList serialized = ATNSerializer.getSerialized(originalParser.getATN(), g.getLanguage()); +// ATN deserialized = new ATNDeserializer().deserialize(serialized.toArray()); parser = new ParserInterpreter(originalParser.getGrammarFileName(), originalParser.getVocabulary(), Arrays.asList(originalParser.getRuleNames()), - deserialized, + originalParser.getATN(), tokens); }