Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

READY - Stop DOS attacks by making the lexer stop early on evil input. #2892

Merged
merged 11 commits into from Jul 26, 2022
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/main/java/graphql/ParseAndValidate.java
Expand Up @@ -11,6 +11,8 @@
import java.util.List;
import java.util.function.Predicate;

import static java.util.Optional.ofNullable;

/**
* This class allows you to parse and validate a graphql query without executing it. It will tell you
* if it's syntactically valid and also semantically valid according to the graphql specification
Expand Down Expand Up @@ -58,6 +60,8 @@ public static ParseAndValidateResult parse(ExecutionInput executionInput) {
//
// we allow the caller to specify new parser options by context
ParserOptions parserOptions = executionInput.getGraphQLContext().get(ParserOptions.class);
// we use the query parser options by default if they are not specified
parserOptions = ofNullable(parserOptions).orElse(ParserOptions.getDefaultOperationParserOptions());
Parser parser = new Parser();
Document document = parser.parseDocument(executionInput.getQuery(), parserOptions);
return ParseAndValidateResult.newResult().document(document).variables(executionInput.getVariables()).build();
Expand Down
7 changes: 4 additions & 3 deletions src/main/java/graphql/parser/GraphqlAntlrToLanguage.java
Expand Up @@ -75,15 +75,16 @@
import static graphql.Assert.assertShouldNeverHappen;
import static graphql.collect.ImmutableKit.emptyList;
import static graphql.collect.ImmutableKit.map;
import static graphql.parser.Parser.CHANNEL_COMMENTS;
import static graphql.parser.Parser.CHANNEL_IGNORED_CHARS;
import static graphql.parser.StringValueParsing.parseSingleQuotedString;
import static graphql.parser.StringValueParsing.parseTripleQuotedString;
import static java.util.Optional.ofNullable;

@Internal
public class GraphqlAntlrToLanguage {

private static final List<Comment> NO_COMMENTS = ImmutableKit.emptyList();
private static final int CHANNEL_COMMENTS = 2;
private static final int CHANNEL_IGNORED_CHARS = 3;
private final CommonTokenStream tokens;
private final MultiSourceReader multiSourceReader;
private final ParserOptions parserOptions;
Expand All @@ -96,7 +97,7 @@ public GraphqlAntlrToLanguage(CommonTokenStream tokens, MultiSourceReader multiS
/**
 * Creates the ANTLR-to-graphql-java language converter.
 *
 * @param tokens            the ANTLR token stream produced by lexing the input
 * @param multiSourceReader the reader used to map token positions back to source locations
 * @param parserOptions     the parser options to use, or null to fall back to the JVM wide default options
 */
public GraphqlAntlrToLanguage(CommonTokenStream tokens, MultiSourceReader multiSourceReader, ParserOptions parserOptions) {
this.tokens = tokens;
this.multiSourceReader = multiSourceReader;
// a null argument means "use the JVM wide default parser options"
this.parserOptions = ofNullable(parserOptions).orElse(ParserOptions.getDefaultParserOptions());
}

public ParserOptions getParserOptions() {
Expand Down
44 changes: 34 additions & 10 deletions src/main/java/graphql/parser/Parser.java
@@ -1,5 +1,6 @@
package graphql.parser;

import graphql.Internal;
import graphql.PublicApi;
import graphql.language.Document;
import graphql.language.Node;
Expand All @@ -25,6 +26,8 @@
import java.io.Reader;
import java.io.UncheckedIOException;
import java.util.List;
import java.util.Optional;
import java.util.function.BiConsumer;
import java.util.function.BiFunction;

/**
Expand All @@ -46,6 +49,11 @@
@PublicApi
public class Parser {

@Internal
public static final int CHANNEL_COMMENTS = 2;
@Internal
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

lets call this also whitespace channel and not ignored ones to make it consistent with the options.

public static final int CHANNEL_IGNORED_CHARS = 3;

/**
* Parses a string input into a graphql AST {@link Document}
*
Expand Down Expand Up @@ -222,7 +230,16 @@ public void syntaxError(Recognizer<?, ?> recognizer, Object offendingSymbol, int
}
});

CommonTokenStream tokens = new CommonTokenStream(lexer);
// default in the parser options if they are not set
parserOptions = Optional.ofNullable(parserOptions).orElse(ParserOptions.getDefaultParserOptions());

// this lexer wrapper allows us to stop lexing when too many tokens are in place. This prevents DOS attacks.
int maxTokens = parserOptions.getMaxTokens();
int maxWhitespaceTokens = parserOptions.getMaxWhitespaceTokens();
BiConsumer<Integer, Token> onTooManyTokens = (maxTokenCount, token) -> throwCancelParseIfTooManyTokens(token, maxTokenCount, multiSourceReader);
SafeTokenSource safeTokenSource = new SafeTokenSource(lexer, maxTokens, maxWhitespaceTokens, onTooManyTokens);

CommonTokenStream tokens = new CommonTokenStream(safeTokenSource);

GraphqlParser parser = new GraphqlParser(tokens);
parser.removeErrorListeners();
Expand Down Expand Up @@ -295,21 +312,28 @@ public int getCharPositionInLine() {

count++;
if (count > maxTokens) {
String msg = String.format("More than %d parse tokens have been presented. To prevent Denial Of Service attacks, parsing has been cancelled.", maxTokens);
SourceLocation sourceLocation = null;
String offendingToken = null;
if (token != null) {
offendingToken = node.getText();
sourceLocation = AntlrHelper.createSourceLocation(multiSourceReader, token.getLine(), token.getCharPositionInLine());
}

throw new ParseCancelledException(msg, sourceLocation, offendingToken);
throwCancelParseIfTooManyTokens(token, maxTokens, multiSourceReader);
}
}
};
parser.addParseListener(listener);
}

/**
 * Builds and throws a {@link ParseCancelledException} describing which kind of token
 * (grammar, whitespace or comments) exceeded the limit, including the offending token
 * text and its source location when an offending token is available.
 *
 * @param token             the token that tripped the limit, possibly null
 * @param maxTokens         the limit that was exceeded
 * @param multiSourceReader used to resolve the token position into a {@link SourceLocation}
 *
 * @throws ParseCancelledException always
 */
private void throwCancelParseIfTooManyTokens(Token token, int maxTokens, MultiSourceReader multiSourceReader) throws ParseCancelledException {
String tokenType;
SourceLocation sourceLocation;
String offendingToken;
if (token == null) {
tokenType = "grammar";
sourceLocation = null;
offendingToken = null;
} else {
// classify the token by its lexer channel so the message names the right limit
int channel = token.getChannel();
if (channel == CHANNEL_IGNORED_CHARS) {
tokenType = "whitespace";
} else if (channel == CHANNEL_COMMENTS) {
tokenType = "comments";
} else {
tokenType = "grammar";
}
offendingToken = token.getText();
sourceLocation = AntlrHelper.createSourceLocation(multiSourceReader, token.getLine(), token.getCharPositionInLine());
}
String msg = String.format("More than %d %s tokens have been presented. To prevent Denial Of Service attacks, parsing has been cancelled.", maxTokens, tokenType);
throw new ParseCancelledException(msg, sourceLocation, offendingToken);
}

/**
* Allows you to override the ANTLR to AST code.
*
Expand Down
80 changes: 73 additions & 7 deletions src/main/java/graphql/parser/ParserOptions.java
Expand Up @@ -13,32 +13,49 @@
public class ParserOptions {

/**
* An graphql hacking vector is to send nonsensical queries that burn lots of parsing CPU time and burn
* memory representing a document that wont ever execute. To prevent this for most users, graphql-java
* A graphql hacking vector is to send nonsensical queries that burn lots of parsing CPU time and burn
* memory representing a document that won't ever execute. To prevent this for most users, graphql-java
* set this value to 15000. ANTLR parsing time is linear to the number of tokens presented. The more you
* allow the longer it takes.
*
* If you want to allow more, then {@link #setDefaultParserOptions(ParserOptions)} allows you to change this
* JVM wide.
*/
public static final int MAX_QUERY_TOKENS = 15000;
public static final int MAX_QUERY_TOKENS = 15_000;
/**
* Another graphql hacking vector is to send large amounts of whitespace in operations that burn lots of parsing CPU time and burn
* memory representing a document. Whitespace token processing in ANTLR is 2 orders of magnitude faster than grammar token processing
* however it still takes some time to happen.
*
* If you want to allow more, then {@link #setDefaultParserOptions(ParserOptions)} allows you to change this
* JVM wide.
*/
public static final int MAX_WHITESPACE_TOKENS = 200_000;

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should name this SDL parser options or so to make clear it will be used for Schema Parsing. See also my general review comment.

private static ParserOptions defaultJvmParserOptions = newParserOptions()
.captureIgnoredChars(false)
.captureSourceLocation(true)
.captureLineComments(true)
.maxTokens(MAX_QUERY_TOKENS) // to prevent a billion laughs style attacks, we set a default for graphql-java
.maxWhitespaceTokens(MAX_WHITESPACE_TOKENS)
.build();

private static ParserOptions defaultJvmOperationParserOptions = newParserOptions()
.captureIgnoredChars(false)
.captureSourceLocation(true)
.captureLineComments(false) // #comments are not useful in query parsing
.maxTokens(MAX_QUERY_TOKENS) // to prevent a billion laughs style attacks, we set a default for graphql-java
.maxWhitespaceTokens(MAX_WHITESPACE_TOKENS)
.build();

/**
* By default the Parser will not capture ignored characters. A static holds this default
* By default, the Parser will not capture ignored characters. A static holds this default
* value in a JVM wide basis options object.
*
* Significant memory savings can be made if we do NOT capture ignored characters,
* especially in SDL parsing.
*
* @return the static default value on whether to capture ignored chars
* @return the static default JVM value
*
* @see graphql.language.IgnoredChar
* @see graphql.language.SourceLocation
Expand All @@ -48,7 +65,20 @@ public static ParserOptions getDefaultParserOptions() {
}

/**
* By default the Parser will not capture ignored characters. A static holds this default
* By default, for operation parsing, the Parser will not capture ignored characters, and it will not capture line comments into AST
* elements . A static holds this default value for operation parsing in a JVM wide basis options object.
*
* @return the static default JVM value for query parsing
*
* @see graphql.language.IgnoredChar
* @see graphql.language.SourceLocation
*/
public static ParserOptions getDefaultOperationParserOptions() {
return defaultJvmOperationParserOptions;
}

/**
* By default, the Parser will not capture ignored characters. A static holds this default
* value in a JVM wide basis options object.
*
* Significant memory savings can be made if we do NOT capture ignored characters,
Expand All @@ -65,17 +95,35 @@ public static void setDefaultParserOptions(ParserOptions options) {
defaultJvmParserOptions = assertNotNull(options);
}

/**
 * By default, the Parser will not capture ignored characters or line comments. A static holds this default
 * value in a JVM wide basis options object for operation parsing.
 *
 * This static can be set to a different options object to change the JVM wide default behavior
 * for operation parsing, for example to restore the behavior of version 16.x or before.
 *
 * @param options - the new default JVM parser options for operation parsing
 *
 * @see graphql.language.IgnoredChar
 * @see graphql.language.SourceLocation
 */
public static void setDefaultOperationParserOptions(ParserOptions options) {
defaultJvmOperationParserOptions = assertNotNull(options);
}


private final boolean captureIgnoredChars;
private final boolean captureSourceLocation;
private final boolean captureLineComments;
private final int maxTokens;
private final int maxWhitespaceTokens;
private final ParsingListener parsingListener;

// copies the builder state into this immutable options object; instances are only
// created via the Builder so no further validation is needed here
private ParserOptions(Builder builder) {
this.captureIgnoredChars = builder.captureIgnoredChars;
this.captureSourceLocation = builder.captureSourceLocation;
this.captureLineComments = builder.captureLineComments;
this.maxTokens = builder.maxTokens;
this.maxWhitespaceTokens = builder.maxWhitespaceTokens;
this.parsingListener = builder.parsingListener;
}

Expand Down Expand Up @@ -117,7 +165,7 @@ public boolean isCaptureLineComments() {
}

/**
* A graphql hacking vector is to send nonsensical queries that burn lots of parsing CPU time and burn
* A graphql hacking vector is to send nonsensical queries that burn lots of parsing CPU time and burn
* memory representing a document that won't ever execute. To prevent this you can set a maximum number of parse
* tokens that will be accepted before an exception is thrown and the parsing is stopped.
*
Expand All @@ -127,6 +175,17 @@ public int getMaxTokens() {
return maxTokens;
}

/**
 * A graphql hacking vector is to send large amounts of whitespace that burn lots of parsing CPU time and burn
 * memory representing a document. To prevent this you can set a maximum number of whitespace parse
 * tokens that will be accepted before an exception is thrown and the parsing is stopped.
 *
 * @return the maximum number of raw whitespace tokens the parser will accept, after which an exception will be thrown.
 */
public int getMaxWhitespaceTokens() {
return maxWhitespaceTokens;
}

public ParsingListener getParsingListener() {
return parsingListener;
}
Expand All @@ -148,6 +207,7 @@ public static class Builder {
private boolean captureLineComments = true;
private int maxTokens = MAX_QUERY_TOKENS;
private ParsingListener parsingListener = ParsingListener.NOOP;
private int maxWhitespaceTokens = MAX_WHITESPACE_TOKENS;

Builder() {
}
Expand All @@ -157,6 +217,7 @@ public static class Builder {
this.captureSourceLocation = parserOptions.captureSourceLocation;
this.captureLineComments = parserOptions.captureLineComments;
this.maxTokens = parserOptions.maxTokens;
this.maxWhitespaceTokens = parserOptions.maxWhitespaceTokens;
this.parsingListener = parserOptions.parsingListener;
}

Expand All @@ -180,6 +241,11 @@ public Builder maxTokens(int maxTokens) {
return this;
}

/**
 * Sets the maximum number of raw whitespace tokens the parser will accept before parsing
 * is stopped, as a guard against whitespace based denial of service attacks.
 *
 * @param maxWhitespaceTokens the maximum number of whitespace tokens to accept
 *
 * @return this builder
 */
public Builder maxWhitespaceTokens(int maxWhitespaceTokens) {
this.maxWhitespaceTokens = maxWhitespaceTokens;
return this;
}

public Builder parsingListener(ParsingListener parsingListener) {
this.parsingListener = assertNotNull(parsingListener);
return this;
Expand Down
94 changes: 94 additions & 0 deletions src/main/java/graphql/parser/SafeTokenSource.java
@@ -0,0 +1,94 @@
package graphql.parser;

import graphql.Internal;
import org.antlr.v4.runtime.CharStream;
import org.antlr.v4.runtime.Token;
import org.antlr.v4.runtime.TokenFactory;
import org.antlr.v4.runtime.TokenSource;

import java.util.function.BiConsumer;

/**
 * This token source can wrap a lexer and if it asks for more than a maximum number of tokens
 * the user can take some action, typically throw an exception to stop lexing.
 *
 * It tracks the maximum number per token channel, so we have 3 at the moment, and they will all be tracked.
 *
 * This is used to protect us from evil input. The lexer will eagerly try to find all tokens
 * at times and certain inputs (directives butted together for example) will cause the lexer
 * to keep doing work even before the tokens are presented back to the parser,
 * and hence before the parser has a chance to stop work once too much has been done.
 */
@Internal
public class SafeTokenSource implements TokenSource {

    // we only have 3 channels - but they are 0, 2 and 3 so size for 5 as a safety margin.
    // if we ever add another channel beyond 5, nextToken() will throw
    // ArrayIndexOutOfBoundsException during tests, so future changes will be handled before release!
    private static final int CHANNEL_COUNT = 5;

    private final TokenSource lexer;
    private final int maxTokens;
    private final int maxWhitespaceTokens;
    private final BiConsumer<Integer, Token> whenMaxTokensExceeded;
    private final int[] channelCounts;

    /**
     * @param lexer                 the lexer to guard
     * @param maxTokens             the maximum number of grammar / comment tokens to accept
     * @param maxWhitespaceTokens   the maximum number of whitespace tokens to accept
     * @param whenMaxTokensExceeded callback invoked with the breached limit and the offending
     *                              token; it is expected to throw to stop further lexing
     */
    public SafeTokenSource(TokenSource lexer, int maxTokens, int maxWhitespaceTokens, BiConsumer<Integer, Token> whenMaxTokensExceeded) {
        this.lexer = lexer;
        this.maxTokens = maxTokens;
        this.maxWhitespaceTokens = maxWhitespaceTokens;
        this.whenMaxTokensExceeded = whenMaxTokensExceeded;
        // this could be a Map<Integer, Integer> however we want it to be as fast as possible -
        // a plain array indexed by channel is still faster than a map get/put
        this.channelCounts = new int[CHANNEL_COUNT];
    }


    @Override
    public Token nextToken() {
        Token token = lexer.nextToken();
        if (token != null) {
            int channel = token.getChannel();
            int currentCount = ++channelCounts[channel];
            if (channel == Parser.CHANNEL_IGNORED_CHARS) {
                // whitespace gets its own max count
                callbackIfMaxExceeded(maxWhitespaceTokens, currentCount, token);
            } else {
                callbackIfMaxExceeded(maxTokens, currentCount, token);
            }
        }
        return token;
    }

    // fires the callback once a channel count first exceeds its configured maximum
    private void callbackIfMaxExceeded(int maxCount, int currentCount, Token token) {
        if (currentCount > maxCount) {
            whenMaxTokensExceeded.accept(maxCount, token);
        }
    }

    @Override
    public int getLine() {
        return lexer.getLine();
    }

    @Override
    public int getCharPositionInLine() {
        return lexer.getCharPositionInLine();
    }

    @Override
    public CharStream getInputStream() {
        return lexer.getInputStream();
    }

    @Override
    public String getSourceName() {
        return lexer.getSourceName();
    }

    @Override
    public void setTokenFactory(TokenFactory<?> factory) {
        lexer.setTokenFactory(factory);
    }

    @Override
    public TokenFactory<?> getTokenFactory() {
        return lexer.getTokenFactory();
    }
}