From 41c996907eedfd41461ad01302cc298c0dd5fffc Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 17 Aug 2022 12:47:46 -0600 Subject: [PATCH 01/16] prototype ability for dialects to override prefix and infix parsing --- src/dialect/mod.rs | 16 ++++++++++++++++ src/parser.rs | 17 +++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs index 63821dd74..b96fd12db 100644 --- a/src/dialect/mod.rs +++ b/src/dialect/mod.rs @@ -22,6 +22,7 @@ mod redshift; mod snowflake; mod sqlite; +use crate::ast::Expr; use core::any::{Any, TypeId}; use core::fmt::Debug; use core::iter::Peekable; @@ -39,6 +40,8 @@ pub use self::redshift::RedshiftSqlDialect; pub use self::snowflake::SnowflakeDialect; pub use self::sqlite::SQLiteDialect; pub use crate::keywords; +use crate::parser::ParserError; +use crate::tokenizer::Token; /// `dialect_of!(parser is SQLiteDialect | GenericDialect)` evaluates /// to `true` if `parser.dialect` is one of the `Dialect`s specified. @@ -65,6 +68,19 @@ pub trait Dialect: Debug + Any { fn is_identifier_start(&self, ch: char) -> bool; /// Determine if a character is a valid unquoted identifier character fn is_identifier_part(&self, ch: char) -> bool; + /// Custom prefix parser + fn parse_prefix(&self, _tokens: &[Token]) -> Result, ParserError> { + Ok(None) + } + /// Custom infix parser + fn parse_infix( + &self, + _expr: Expr, + _precedence: u8, + _tokens: &[Token], + ) -> Result, ParserError> { + Ok(None) + } } impl dyn Dialect { diff --git a/src/parser.rs b/src/parser.rs index 3a90b3ccb..60f260f36 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -381,6 +381,14 @@ impl<'a> Parser<'a> { /// Parse an expression prefix pub fn parse_prefix(&mut self) -> Result { + // allow the dialect to override prefix parsing + if let Some((prefix_expr, num_tokens_parsed)) = + self.dialect.parse_prefix(&self.tokens[self.index..])? + { + self.index += num_tokens_parsed; + return Ok(prefix_expr); + } + // PostgreSQL allows any string literal to be preceded by a type name, indicating that the // string literal represents a literal of that type. Some examples: // @@ -1164,6 +1172,15 @@ impl<'a> Parser<'a> { /// Parse an operator following an expression pub fn parse_infix(&mut self, expr: Expr, precedence: u8) -> Result { + // allow the dialect to override infix parsing + if let Some((infix_expr, num_tokens_parsed)) = + self.dialect + .parse_infix(expr.clone(), precedence, &self.tokens[self.index..])? + { + self.index += num_tokens_parsed; + return Ok(infix_expr); + } + let tok = self.next_token(); let regular_binary_operator = match &tok { From af39f1f6063acaa8dc4495dd38a55ed1d5b4c833 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 17 Aug 2022 13:33:05 -0600 Subject: [PATCH 02/16] save progress --- src/dialect/mod.rs | 19 +++++++++---------- src/parser.rs | 13 ++++++------- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs index b96fd12db..dc60de7a8 100644 --- a/src/dialect/mod.rs +++ b/src/dialect/mod.rs @@ -40,7 +40,7 @@ pub use self::redshift::RedshiftSqlDialect; pub use self::snowflake::SnowflakeDialect; pub use self::sqlite::SQLiteDialect; pub use crate::keywords; -use crate::parser::ParserError; +use crate::parser::{Parser, ParserError}; use crate::tokenizer::Token; /// `dialect_of!(parser is SQLiteDialect | GenericDialect)` evaluates @@ -51,6 +51,10 @@ macro_rules! dialect_of { }; } +type PrefixParser = Box Result<(Expr, usize), ParserError>>; + +type InfixParser = Box Result<(Expr, usize), ParserError>>; + pub trait Dialect: Debug + Any { /// Determine if a character starts a quoted identifier. The default /// implementation, accepting "double quoted" ids is both ANSI-compliant @@ -69,17 +73,12 @@ pub trait Dialect: Debug + Any { /// Determine if a character is a valid unquoted identifier character fn is_identifier_part(&self, ch: char) -> bool; /// Custom prefix parser - fn parse_prefix(&self, _tokens: &[Token]) -> Result, ParserError> { - Ok(None) + fn prefix_parser(&self, _tokens: &[Token]) -> Option { + None } /// Custom infix parser - fn parse_infix( - &self, - _expr: Expr, - _precedence: u8, - _tokens: &[Token], - ) -> Result, ParserError> { - Ok(None) + fn infix_parser(&self, _tokens: &[Token], expr: &Expr, precendence: u8) -> Option { + None } } diff --git a/src/parser.rs b/src/parser.rs index 60f260f36..b4266df54 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -382,9 +382,9 @@ impl<'a> Parser<'a> { /// Parse an expression prefix pub fn parse_prefix(&mut self) -> Result { // allow the dialect to override prefix parsing - if let Some((prefix_expr, num_tokens_parsed)) = - self.dialect.parse_prefix(&self.tokens[self.index..])? - { + let remaining_tokens = &self.tokens[self.index..]; + if let Some(prefix_parser) = self.dialect.prefix_parser(remaining_tokens) { + let (prefix_expr, num_tokens_parsed) = prefix_parser(&mut self)?; self.index += num_tokens_parsed; return Ok(prefix_expr); } @@ -1173,10 +1173,9 @@ impl<'a> Parser<'a> { /// Parse an operator following an expression pub fn parse_infix(&mut self, expr: Expr, precedence: u8) -> Result { // allow the dialect to override infix parsing - if let Some((infix_expr, num_tokens_parsed)) = - self.dialect - .parse_infix(expr.clone(), precedence, &self.tokens[self.index..])? - { + if let Some(infix_parser) = self.dialect + .infix_parser(&self.tokens[self.index..], &expr, precedence) { + let (infix_expr, num_tokens_parsed) = infix_parser(&mut self, &expr, precedence)?; self.index += num_tokens_parsed; return Ok(infix_expr); } From cec31081c977b285facdfe902ef5284e986295a3 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 17 Aug 2022 13:39:17 -0600 Subject: [PATCH 03/16] it compiles --- src/dialect/mod.rs | 15 +++++++++++++-- src/parser.rs | 18 ++++++++++++++---- 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs index dc60de7a8..bc1b314e5 100644 --- a/src/dialect/mod.rs +++ b/src/dialect/mod.rs @@ -22,7 +22,7 @@ mod redshift; mod snowflake; mod sqlite; -use crate::ast::Expr; +use crate::ast::{Expr, Statement}; use core::any::{Any, TypeId}; use core::fmt::Debug; use core::iter::Peekable; @@ -55,6 +55,8 @@ type PrefixParser = Box Result<(Expr, usize), ParserError type InfixParser = Box Result<(Expr, usize), ParserError>>; +type StatementParser = Box Result<(Statement, usize), ParserError>>; + pub trait Dialect: Debug + Any { /// Determine if a character starts a quoted identifier. The default /// implementation, accepting "double quoted" ids is both ANSI-compliant @@ -77,7 +79,16 @@ pub trait Dialect: Debug + Any { None } /// Custom infix parser - fn infix_parser(&self, _tokens: &[Token], expr: &Expr, precendence: u8) -> Option { + fn infix_parser( + &self, + _tokens: &[Token], + _expr: &Expr, + _precendence: u8, + ) -> Option { + None + } + /// Custom statement parser + fn statement_parser(&self, _tokens: &[Token]) -> Option { None } } diff --git a/src/parser.rs b/src/parser.rs index b4266df54..bddbdbd40 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -152,6 +152,14 @@ impl<'a> Parser<'a> { /// Parse a single top-level statement (such as SELECT, INSERT, CREATE, etc.), /// stopping before the statement separator, if any. pub fn parse_statement(&mut self) -> Result { + // allow the dialect to override statement parsing + let remaining_tokens = &self.tokens[self.index..]; + if let Some(statement_parser) = self.dialect.statement_parser(remaining_tokens) { + let (statement, num_tokens_parsed) = statement_parser(self)?; + self.index += num_tokens_parsed; + return Ok(statement); + } + match self.next_token() { Token::Word(w) => match w.keyword { Keyword::KILL => Ok(self.parse_kill()?), @@ -384,7 +392,7 @@ impl<'a> Parser<'a> { // allow the dialect to override prefix parsing let remaining_tokens = &self.tokens[self.index..]; if let Some(prefix_parser) = self.dialect.prefix_parser(remaining_tokens) { - let (prefix_expr, num_tokens_parsed) = prefix_parser(&mut self)?; + let (prefix_expr, num_tokens_parsed) = prefix_parser(self)?; self.index += num_tokens_parsed; return Ok(prefix_expr); } @@ -1173,9 +1181,11 @@ impl<'a> Parser<'a> { /// Parse an operator following an expression pub fn parse_infix(&mut self, expr: Expr, precedence: u8) -> Result { // allow the dialect to override infix parsing - if let Some(infix_parser) = self.dialect - .infix_parser(&self.tokens[self.index..], &expr, precedence) { - let (infix_expr, num_tokens_parsed) = infix_parser(&mut self, &expr, precedence)?; + if let Some(infix_parser) = + self.dialect + .infix_parser(&self.tokens[self.index..], &expr, precedence) + { + let (infix_expr, num_tokens_parsed) = infix_parser(self, &expr, precedence)?; self.index += num_tokens_parsed; return Ok(infix_expr); } From eadd84615f6a96e5769a34e7f3507c5b28b74ce2 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 17 Aug 2022 13:48:46 -0600 Subject: [PATCH 04/16] it works --- src/dialect/mod.rs | 6 ++--- src/dialect/postgresql.rs | 45 +++++++++++++++++++++++++++++++++++++- src/parser.rs | 46 ++++----------------------------------- 3 files changed, 51 insertions(+), 46 deletions(-) diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs index bc1b314e5..920700912 100644 --- a/src/dialect/mod.rs +++ b/src/dialect/mod.rs @@ -51,11 +51,11 @@ macro_rules! dialect_of { }; } -type PrefixParser = Box Result<(Expr, usize), ParserError>>; +type PrefixParser = Box Result>; -type InfixParser = Box Result<(Expr, usize), ParserError>>; +type InfixParser = Box Result>; -type StatementParser = Box Result<(Statement, usize), ParserError>>; +type StatementParser = Box Result>; pub trait Dialect: Debug + Any { /// Determine if a character starts a quoted identifier. The default diff --git a/src/dialect/postgresql.rs b/src/dialect/postgresql.rs index 0c2eb99f0..fa357bd0e 100644 --- a/src/dialect/postgresql.rs +++ b/src/dialect/postgresql.rs @@ -10,7 +10,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -use crate::dialect::Dialect; +use crate::ast::{CommentObject, Statement}; +use crate::dialect::{Dialect, StatementParser}; +use crate::keywords::Keyword; +use crate::parser::{Parser, ParserError}; +use crate::tokenizer::Token; #[derive(Debug)] pub struct PostgreSqlDialect {} @@ -30,4 +34,43 @@ impl Dialect for PostgreSqlDialect { || ch == '$' || ch == '_' } + + fn statement_parser(&self, tokens: &[Token]) -> Option { + match &tokens[0] { + Token::Word(word) if word.keyword == Keyword::COMMENT => { + Some(Box::new(|parser| parse_comment(parser))) + } + _ => None, + } + } +} + +pub fn parse_comment(parser: &mut Parser) -> Result { + parser.expect_keyword(Keyword::COMMENT)?; + parser.expect_keyword(Keyword::ON)?; + let token = parser.next_token(); + + let (object_type, object_name) = match token { + Token::Word(w) if w.keyword == Keyword::COLUMN => { + let object_name = parser.parse_object_name()?; + (CommentObject::Column, object_name) + } + Token::Word(w) if w.keyword == Keyword::TABLE => { + let object_name = parser.parse_object_name()?; + (CommentObject::Table, object_name) + } + _ => parser.expected("comment object_type", token)?, + }; + + parser.expect_keyword(Keyword::IS)?; + let comment = if parser.parse_keyword(Keyword::NULL) { + None + } else { + Some(parser.parse_literal_string()?) + }; + Ok(Statement::Comment { + object_type, + object_name, + comment, + }) } diff --git a/src/parser.rs b/src/parser.rs index bddbdbd40..341888ce3 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -155,9 +155,7 @@ impl<'a> Parser<'a> { // allow the dialect to override statement parsing let remaining_tokens = &self.tokens[self.index..]; if let Some(statement_parser) = self.dialect.statement_parser(remaining_tokens) { - let (statement, num_tokens_parsed) = statement_parser(self)?; - self.index += num_tokens_parsed; - return Ok(statement); + return statement_parser(self); } match self.next_token() { @@ -207,9 +205,6 @@ impl<'a> Parser<'a> { self.prev_token(); Ok(self.parse_insert()?) } - Keyword::COMMENT if dialect_of!(self is PostgreSqlDialect) => { - Ok(self.parse_comment()?) - } _ => self.expected("an SQL statement", Token::Word(w)), }, Token::LParen => { @@ -392,9 +387,7 @@ impl<'a> Parser<'a> { // allow the dialect to override prefix parsing let remaining_tokens = &self.tokens[self.index..]; if let Some(prefix_parser) = self.dialect.prefix_parser(remaining_tokens) { - let (prefix_expr, num_tokens_parsed) = prefix_parser(self)?; - self.index += num_tokens_parsed; - return Ok(prefix_expr); + return prefix_parser(self); } // PostgreSQL allows any string literal to be preceded by a type name, indicating that the @@ -1185,9 +1178,7 @@ impl<'a> Parser<'a> { self.dialect .infix_parser(&self.tokens[self.index..], &expr, precedence) { - let (infix_expr, num_tokens_parsed) = infix_parser(self, &expr, precedence)?; - self.index += num_tokens_parsed; - return Ok(infix_expr); + return infix_parser(self, &expr, precedence); } let tok = self.next_token(); @@ -1630,7 +1621,7 @@ impl<'a> Parser<'a> { } /// Report unexpected token - fn expected(&self, expected: &str, found: Token) -> Result { + pub fn expected(&self, expected: &str, found: Token) -> Result { parser_err!(format!("Expected {}, found: {}", expected, found)) } @@ -4757,35 +4748,6 @@ impl<'a> Parser<'a> { }) } - pub fn parse_comment(&mut self) -> Result { - self.expect_keyword(Keyword::ON)?; - let token = self.next_token(); - - let (object_type, object_name) = match token { - Token::Word(w) if w.keyword == Keyword::COLUMN => { - let object_name = self.parse_object_name()?; - (CommentObject::Column, object_name) - } - Token::Word(w) if w.keyword == Keyword::TABLE => { - let object_name = self.parse_object_name()?; - (CommentObject::Table, object_name) - } - _ => self.expected("comment object_type", token)?, - }; - - self.expect_keyword(Keyword::IS)?; - let comment = if self.parse_keyword(Keyword::NULL) { - None - } else { - Some(self.parse_literal_string()?) - }; - Ok(Statement::Comment { - object_type, - object_name, - comment, - }) - } - pub fn parse_merge_clauses(&mut self) -> Result, ParserError> { let mut clauses: Vec = vec![]; loop { From b299687f2f2c803370612f57786ca11f1570621b Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 17 Aug 2022 13:52:30 -0600 Subject: [PATCH 05/16] clippy --- src/dialect/postgresql.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/dialect/postgresql.rs b/src/dialect/postgresql.rs index fa357bd0e..ca05e1b75 100644 --- a/src/dialect/postgresql.rs +++ b/src/dialect/postgresql.rs @@ -37,9 +37,7 @@ impl Dialect for PostgreSqlDialect { fn statement_parser(&self, tokens: &[Token]) -> Option { match &tokens[0] { - Token::Word(word) if word.keyword == Keyword::COMMENT => { - Some(Box::new(|parser| parse_comment(parser))) - } + Token::Word(word) if word.keyword == Keyword::COMMENT => Some(Box::new(parse_comment)), _ => None, } } From 14458867ffc311615c8d117be0a3ef11e0c409e3 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 17 Aug 2022 13:55:48 -0600 Subject: [PATCH 06/16] no-std support --- src/dialect/postgresql.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/dialect/postgresql.rs b/src/dialect/postgresql.rs index ca05e1b75..8096e4f26 100644 --- a/src/dialect/postgresql.rs +++ b/src/dialect/postgresql.rs @@ -10,6 +10,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +#[cfg(not(feature = "std"))] +use alloc::boxed::Box; + use crate::ast::{CommentObject, Statement}; use crate::dialect::{Dialect, StatementParser}; use crate::keywords::Keyword; From 3c6528463a59d66f1b3e78f363bee806c5af0cc3 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 17 Aug 2022 13:58:33 -0600 Subject: [PATCH 07/16] no-std support --- src/dialect/mod.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs index 920700912..8e1eda1a3 100644 --- a/src/dialect/mod.rs +++ b/src/dialect/mod.rs @@ -10,6 +10,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +#[cfg(not(feature = "std"))] +use alloc::boxed::Box; + mod ansi; mod bigquery; mod clickhouse; From 28568defc4992f17b2d157b5b634ecf7687c0f64 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 17 Aug 2022 14:21:05 -0600 Subject: [PATCH 08/16] pass Parser instead of tokens --- src/dialect/mod.rs | 7 +++---- src/dialect/postgresql.rs | 10 +++++----- src/parser.rs | 11 +++-------- 3 files changed, 11 insertions(+), 17 deletions(-) diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs index 8e1eda1a3..b6178148d 100644 --- a/src/dialect/mod.rs +++ b/src/dialect/mod.rs @@ -44,7 +44,6 @@ pub use self::snowflake::SnowflakeDialect; pub use self::sqlite::SQLiteDialect; pub use crate::keywords; use crate::parser::{Parser, ParserError}; -use crate::tokenizer::Token; /// `dialect_of!(parser is SQLiteDialect | GenericDialect)` evaluates /// to `true` if `parser.dialect` is one of the `Dialect`s specified. @@ -78,20 +77,20 @@ pub trait Dialect: Debug + Any { /// Determine if a character is a valid unquoted identifier character fn is_identifier_part(&self, ch: char) -> bool; /// Custom prefix parser - fn prefix_parser(&self, _tokens: &[Token]) -> Option { + fn prefix_parser(&self, _parser: &mut Parser) -> Option { None } /// Custom infix parser fn infix_parser( &self, - _tokens: &[Token], + _parser: &mut Parser, _expr: &Expr, _precendence: u8, ) -> Option { None } /// Custom statement parser - fn statement_parser(&self, _tokens: &[Token]) -> Option { + fn statement_parser(&self, _parser: &mut Parser) -> Option { None } } diff --git a/src/dialect/postgresql.rs b/src/dialect/postgresql.rs index 8096e4f26..a94537f37 100644 --- a/src/dialect/postgresql.rs +++ b/src/dialect/postgresql.rs @@ -38,16 +38,16 @@ impl Dialect for PostgreSqlDialect { || ch == '_' } - fn statement_parser(&self, tokens: &[Token]) -> Option { - match &tokens[0] { - Token::Word(word) if word.keyword == Keyword::COMMENT => Some(Box::new(parse_comment)), - _ => None, + fn statement_parser(&self, parser: &mut Parser) -> Option { + if parser.parse_keyword(Keyword::COMMENT) { + Some(Box::new(parse_comment)) + } else { + None } } } pub fn parse_comment(parser: &mut Parser) -> Result { - parser.expect_keyword(Keyword::COMMENT)?; parser.expect_keyword(Keyword::ON)?; let token = parser.next_token(); diff --git a/src/parser.rs b/src/parser.rs index 341888ce3..a7b0f5c96 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -153,8 +153,7 @@ impl<'a> Parser<'a> { /// stopping before the statement separator, if any. pub fn parse_statement(&mut self) -> Result { // allow the dialect to override statement parsing - let remaining_tokens = &self.tokens[self.index..]; - if let Some(statement_parser) = self.dialect.statement_parser(remaining_tokens) { + if let Some(statement_parser) = self.dialect.statement_parser(self) { return statement_parser(self); } @@ -385,8 +384,7 @@ impl<'a> Parser<'a> { /// Parse an expression prefix pub fn parse_prefix(&mut self) -> Result { // allow the dialect to override prefix parsing - let remaining_tokens = &self.tokens[self.index..]; - if let Some(prefix_parser) = self.dialect.prefix_parser(remaining_tokens) { + if let Some(prefix_parser) = self.dialect.prefix_parser(self) { return prefix_parser(self); } @@ -1174,10 +1172,7 @@ impl<'a> Parser<'a> { /// Parse an operator following an expression pub fn parse_infix(&mut self, expr: Expr, precedence: u8) -> Result { // allow the dialect to override infix parsing - if let Some(infix_parser) = - self.dialect - .infix_parser(&self.tokens[self.index..], &expr, precedence) - { + if let Some(infix_parser) = self.dialect.infix_parser(self, &expr, precedence) { return infix_parser(self, &expr, precedence); } From ab5e683dd60c3f94e6872a75ea5bbe47d11a1f47 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 17 Aug 2022 14:23:52 -0600 Subject: [PATCH 09/16] move a SQLite statement parser --- src/dialect/sqlite.rs | 15 ++++++++++++++- src/parser.rs | 4 ---- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/src/dialect/sqlite.rs b/src/dialect/sqlite.rs index 4ce2f834b..361b29e67 100644 --- a/src/dialect/sqlite.rs +++ b/src/dialect/sqlite.rs @@ -10,7 +10,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -use crate::dialect::Dialect; +use crate::dialect::{Dialect, StatementParser}; +use crate::keywords::Keyword; +use crate::parser::Parser; #[derive(Debug)] pub struct SQLiteDialect {} @@ -35,4 +37,15 @@ impl Dialect for SQLiteDialect { fn is_identifier_part(&self, ch: char) -> bool { self.is_identifier_start(ch) || ('0'..='9').contains(&ch) } + + fn statement_parser(&self, parser: &mut Parser) -> Option { + if parser.parse_keyword(Keyword::REPLACE) { + Some(Box::new(|parser| { + parser.prev_token(); + parser.parse_insert() + })) + } else { + None + } + } } diff --git a/src/parser.rs b/src/parser.rs index a7b0f5c96..a8e4511d2 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -200,10 +200,6 @@ impl<'a> Parser<'a> { Keyword::EXECUTE => Ok(self.parse_execute()?), Keyword::PREPARE => Ok(self.parse_prepare()?), Keyword::MERGE => Ok(self.parse_merge()?), - Keyword::REPLACE if dialect_of!(self is SQLiteDialect ) => { - self.prev_token(); - Ok(self.parse_insert()?) - } _ => self.expected("an SQL statement", Token::Word(w)), }, Token::LParen => { From f70ff479270580571b41b5d64fde1aee378ee9bd Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 17 Aug 2022 14:27:30 -0600 Subject: [PATCH 10/16] parse statement directly instead of returning a function --- src/dialect/mod.rs | 4 +--- src/dialect/postgresql.rs | 6 +++--- src/dialect/sqlite.rs | 16 +++++++++------- src/parser.rs | 4 ++-- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs index b6178148d..6cf2303f6 100644 --- a/src/dialect/mod.rs +++ b/src/dialect/mod.rs @@ -57,8 +57,6 @@ type PrefixParser = Box Result>; type InfixParser = Box Result>; -type StatementParser = Box Result>; - pub trait Dialect: Debug + Any { /// Determine if a character starts a quoted identifier. The default /// implementation, accepting "double quoted" ids is both ANSI-compliant @@ -90,7 +88,7 @@ pub trait Dialect: Debug + Any { None } /// Custom statement parser - fn statement_parser(&self, _parser: &mut Parser) -> Option { + fn parse_statement(&self, _parser: &mut Parser) -> Option> { None } } diff --git a/src/dialect/postgresql.rs b/src/dialect/postgresql.rs index a94537f37..b8d966ec8 100644 --- a/src/dialect/postgresql.rs +++ b/src/dialect/postgresql.rs @@ -14,7 +14,7 @@ use alloc::boxed::Box; use crate::ast::{CommentObject, Statement}; -use crate::dialect::{Dialect, StatementParser}; +use crate::dialect::Dialect; use crate::keywords::Keyword; use crate::parser::{Parser, ParserError}; use crate::tokenizer::Token; @@ -38,9 +38,9 @@ impl Dialect for PostgreSqlDialect { || ch == '_' } - fn statement_parser(&self, parser: &mut Parser) -> Option { + fn parse_statement(&self, parser: &mut Parser) -> Option> { if parser.parse_keyword(Keyword::COMMENT) { - Some(Box::new(parse_comment)) + Some(parse_comment(parser)) } else { None } diff --git a/src/dialect/sqlite.rs b/src/dialect/sqlite.rs index 361b29e67..3ed0b35e6 100644 --- a/src/dialect/sqlite.rs +++ b/src/dialect/sqlite.rs @@ -10,9 +10,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -use crate::dialect::{Dialect, StatementParser}; +#[cfg(not(feature = "std"))] +use alloc::boxed::Box; + +use crate::ast::Statement; +use crate::dialect::Dialect; use crate::keywords::Keyword; -use crate::parser::Parser; +use crate::parser::{Parser, ParserError}; #[derive(Debug)] pub struct SQLiteDialect {} @@ -38,12 +42,10 @@ impl Dialect for SQLiteDialect { self.is_identifier_start(ch) || ('0'..='9').contains(&ch) } - fn statement_parser(&self, parser: &mut Parser) -> Option { + fn parse_statement(&self, parser: &mut Parser) -> Option> { if parser.parse_keyword(Keyword::REPLACE) { - Some(Box::new(|parser| { - parser.prev_token(); - parser.parse_insert() - })) + parser.prev_token(); + Some(parser.parse_insert()) } else { None } diff --git a/src/parser.rs b/src/parser.rs index a8e4511d2..bcab35ae9 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -153,8 +153,8 @@ impl<'a> Parser<'a> { /// stopping before the statement separator, if any. pub fn parse_statement(&mut self) -> Result { // allow the dialect to override statement parsing - if let Some(statement_parser) = self.dialect.statement_parser(self) { - return statement_parser(self); + if let Some(statement) = self.dialect.parse_statement(self) { + return statement; } match self.next_token() { From f7b57fa06035df4abe631ec3e15861b0e86715d4 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 17 Aug 2022 14:28:53 -0600 Subject: [PATCH 11/16] parse prefix and infix directly instead of returning a function --- src/dialect/mod.rs | 10 +++------- src/parser.rs | 8 ++++---- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs index 6cf2303f6..8b0e4b346 100644 --- a/src/dialect/mod.rs +++ b/src/dialect/mod.rs @@ -53,10 +53,6 @@ macro_rules! dialect_of { }; } -type PrefixParser = Box Result>; - -type InfixParser = Box Result>; - pub trait Dialect: Debug + Any { /// Determine if a character starts a quoted identifier. The default /// implementation, accepting "double quoted" ids is both ANSI-compliant @@ -75,16 +71,16 @@ pub trait Dialect: Debug + Any { /// Determine if a character is a valid unquoted identifier character fn is_identifier_part(&self, ch: char) -> bool; /// Custom prefix parser - fn prefix_parser(&self, _parser: &mut Parser) -> Option { + fn parse_prefix(&self, _parser: &mut Parser) -> Option> { None } /// Custom infix parser - fn infix_parser( + fn parse_infix( &self, _parser: &mut Parser, _expr: &Expr, _precendence: u8, - ) -> Option { + ) -> Option> { None } /// Custom statement parser diff --git a/src/parser.rs b/src/parser.rs index bcab35ae9..a82c70400 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -380,8 +380,8 @@ impl<'a> Parser<'a> { /// Parse an expression prefix pub fn parse_prefix(&mut self) -> Result { // allow the dialect to override prefix parsing - if let Some(prefix_parser) = self.dialect.prefix_parser(self) { - return prefix_parser(self); + if let Some(prefix) = self.dialect.parse_prefix(self) { + return prefix; } // PostgreSQL allows any string literal to be preceded by a type name, indicating that the @@ -1168,8 +1168,8 @@ impl<'a> Parser<'a> { /// Parse an operator following an expression pub fn parse_infix(&mut self, expr: Expr, precedence: u8) -> Result { // allow the dialect to override infix parsing - if let Some(infix_parser) = self.dialect.infix_parser(self, &expr, precedence) { - return infix_parser(self, &expr, precedence); + if let Some(infix) = self.dialect.parse_infix(self, &expr, precedence) { + return infix; } let tok = self.next_token(); From 9dd2ea4d77b67670bcb9370463065824a2478d90 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 17 Aug 2022 14:34:18 -0600 Subject: [PATCH 12/16] revert no-std fix --- src/dialect/postgresql.rs | 3 --- src/dialect/sqlite.rs | 3 --- 2 files changed, 6 deletions(-) diff --git a/src/dialect/postgresql.rs b/src/dialect/postgresql.rs index b8d966ec8..04d64b9bf 100644 --- a/src/dialect/postgresql.rs +++ b/src/dialect/postgresql.rs @@ -10,9 +10,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#[cfg(not(feature = "std"))] -use alloc::boxed::Box; - use crate::ast::{CommentObject, Statement}; use crate::dialect::Dialect; use crate::keywords::Keyword; diff --git a/src/dialect/sqlite.rs b/src/dialect/sqlite.rs index 3ed0b35e6..64d7f62fd 100644 --- a/src/dialect/sqlite.rs +++ b/src/dialect/sqlite.rs @@ -10,9 +10,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#[cfg(not(feature = "std"))] -use alloc::boxed::Box; - use crate::ast::Statement; use crate::dialect::Dialect; use crate::keywords::Keyword; From c2e096ebf6749e9992894982682aacd1d60c91b4 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 17 Aug 2022 14:35:15 -0600 Subject: [PATCH 13/16] revert no-std fix --- src/dialect/mod.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs index 8b0e4b346..146ebf349 100644 --- a/src/dialect/mod.rs +++ b/src/dialect/mod.rs @@ -10,9 +10,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#[cfg(not(feature = "std"))] -use alloc::boxed::Box; - mod ansi; mod bigquery; mod clickhouse; From f518c30dee2b1545661cc64f77743e9cfdf88a64 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 17 Aug 2022 15:07:44 -0600 Subject: [PATCH 14/16] unit test for custom infix parser --- src/dialect/mod.rs | 14 +++++++++--- src/parser.rs | 53 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 3 deletions(-) diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs index 146ebf349..46e8dda2c 100644 --- a/src/dialect/mod.rs +++ b/src/dialect/mod.rs @@ -67,21 +67,29 @@ pub trait Dialect: Debug + Any { fn is_identifier_start(&self, ch: char) -> bool; /// Determine if a character is a valid unquoted identifier character fn is_identifier_part(&self, ch: char) -> bool; - /// Custom prefix parser + /// Dialect-specific prefix parser override fn parse_prefix(&self, _parser: &mut Parser) -> Option> { + // return None to fall back to the default behavior None } - /// Custom infix parser + /// Dialect-specific infix parser override fn parse_infix( &self, _parser: &mut Parser, _expr: &Expr, _precendence: u8, ) -> Option> { + // return None to fall back to the default behavior None } - /// Custom statement parser + /// Dialect-specific precedence override + fn get_next_precedence(&self, _parser: &Parser) -> Option> { + // return None to fall back to the default behavior + None + } + /// Dialect-specific statement parser override fn parse_statement(&self, _parser: &mut Parser) -> Option> { + // return None to fall back to the default behavior None } } diff --git a/src/parser.rs b/src/parser.rs index a82c70400..5f1d2f145 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -1485,6 +1485,11 @@ impl<'a> Parser<'a> { /// Get the precedence of the next token pub fn get_next_precedence(&self) -> Result { + // allow the dialect to override precedence logic + if let Some(precedence) = self.dialect.get_next_precedence(self) { + return precedence; + } + let token = self.peek_token(); debug!("get_next_precedence() {:?}", token); let token_0 = self.peek_nth_token(0); @@ -4905,4 +4910,52 @@ mod tests { assert_eq!(ast.to_string(), sql.to_string()); }); } + + #[test] + fn custom_infix_parser() -> Result<(), ParserError> { + #[derive(Debug)] + struct MyDialect {} + + impl Dialect for MyDialect { + fn is_identifier_start(&self, ch: char) -> bool { + // See https://www.postgresql.org/docs/11/sql-syntax-lexical.html#SQL-SYNTAX-IDENTIFIERS + // We don't yet support identifiers beginning with "letters with + // diacritical marks and non-Latin letters" + ('a'..='z').contains(&ch) || ('A'..='Z').contains(&ch) || ch == '_' + } + + fn is_identifier_part(&self, ch: char) -> bool { + ('a'..='z').contains(&ch) + || ('A'..='Z').contains(&ch) + || ('0'..='9').contains(&ch) + || ch == '$' + || ch == '_' + } + + fn parse_infix( + &self, + parser: &mut Parser, + expr: &Expr, + _precendence: u8, + ) -> Option> { + if parser.peek_token() == Token::Plus { + assert!(parser.consume_token(&Token::Plus)); + Some(Ok(Expr::BinaryOp { + left: Box::new(expr.clone()), + op: BinaryOperator::Multiply, // translate Plus to Multiply + right: Box::new(parser.parse_expr().unwrap()), + })) + } else { + None + } + } + } + + let dialect = MyDialect {}; + let sql = "SELECT 1 + 2"; + let ast = Parser::parse_sql(&dialect, sql)?; + let query = &ast[0]; + assert_eq!("SELECT 1 * 2", &format!("{}", query)); + Ok(()) + } } From 8640d1b0e30b6321f03f39852d269861a16c51c8 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 17 Aug 2022 15:16:57 -0600 Subject: [PATCH 15/16] more tests --- src/parser.rs | 94 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 83 insertions(+), 11 deletions(-) diff --git a/src/parser.rs b/src/parser.rs index 5f1d2f145..f4aa40942 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -4911,6 +4911,37 @@ mod tests { }); } + #[test] + fn custom_prefix_parser() -> Result<(), ParserError> { + #[derive(Debug)] + struct MyDialect {} + + impl Dialect for MyDialect { + fn is_identifier_start(&self, ch: char) -> bool { + is_identifier_start(ch) + } + + fn is_identifier_part(&self, ch: char) -> bool { + is_identifier_part(ch) + } + + fn parse_prefix(&self, parser: &mut Parser) -> Option> { + if parser.consume_token(&Token::Number("1".to_string(), false)) { + Some(Ok(Expr::Value(Value::Null))) + } else { + None + } + } + } + + let dialect = MyDialect {}; + let sql = "SELECT 1 + 2"; + let ast = Parser::parse_sql(&dialect, sql)?; + let query = &ast[0]; + assert_eq!("SELECT NULL + 2", &format!("{}", query)); + Ok(()) + } + #[test] fn custom_infix_parser() -> Result<(), ParserError> { #[derive(Debug)] @@ -4918,18 +4949,11 @@ mod tests { impl Dialect for MyDialect { fn is_identifier_start(&self, ch: char) -> bool { - // See https://www.postgresql.org/docs/11/sql-syntax-lexical.html#SQL-SYNTAX-IDENTIFIERS - // We don't yet support identifiers beginning with "letters with - // diacritical marks and non-Latin letters" - ('a'..='z').contains(&ch) || ('A'..='Z').contains(&ch) || ch == '_' + is_identifier_start(ch) } fn is_identifier_part(&self, ch: char) -> bool { - ('a'..='z').contains(&ch) - || ('A'..='Z').contains(&ch) - || ('0'..='9').contains(&ch) - || ch == '$' - || ch == '_' + is_identifier_part(ch) } fn parse_infix( @@ -4938,8 +4962,7 @@ mod tests { expr: &Expr, _precendence: u8, ) -> Option> { - if parser.peek_token() == Token::Plus { - assert!(parser.consume_token(&Token::Plus)); + if parser.consume_token(&Token::Plus) { Some(Ok(Expr::BinaryOp { left: Box::new(expr.clone()), op: BinaryOperator::Multiply, // translate Plus to Multiply @@ -4958,4 +4981,53 @@ mod tests { assert_eq!("SELECT 1 * 2", &format!("{}", query)); Ok(()) } + + #[test] + fn custom_statement_parser() -> Result<(), ParserError> { + #[derive(Debug)] + struct MyDialect {} + + impl Dialect for MyDialect { + fn is_identifier_start(&self, ch: char) -> bool { + is_identifier_start(ch) + } + + fn is_identifier_part(&self, ch: char) -> bool { + is_identifier_part(ch) + } + + fn parse_statement( + &self, + parser: &mut Parser, + ) -> Option> { + if parser.parse_keyword(Keyword::SELECT) { + for _ in 0..3 { + let _ = parser.next_token(); + } + Some(Ok(Statement::Commit { chain: false })) + } else { + None + } + } + } + + let dialect = MyDialect {}; + let sql = "SELECT 1 + 2"; + let ast = Parser::parse_sql(&dialect, sql)?; + let query = &ast[0]; + assert_eq!("COMMIT", &format!("{}", query)); + Ok(()) + } + + fn is_identifier_start(ch: char) -> bool { + ('a'..='z').contains(&ch) || ('A'..='Z').contains(&ch) || ch == '_' + } + + fn is_identifier_part(ch: char) -> bool { + ('a'..='z').contains(&ch) + || ('A'..='Z').contains(&ch) + || ('0'..='9').contains(&ch) + || ch == '$' + || ch == '_' + } } From ed1494002190c33dda5578ad2441ad2dacfdcb13 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 19 Aug 2022 05:40:04 -0600 Subject: [PATCH 16/16] move tests --- src/parser.rs | 120 -------------------------- tests/sqlparser_custom_dialect.rs | 138 ++++++++++++++++++++++++++++++ 2 files changed, 138 insertions(+), 120 deletions(-) create mode 100644 tests/sqlparser_custom_dialect.rs diff --git a/src/parser.rs b/src/parser.rs index f4aa40942..ca5c1254a 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -4910,124 +4910,4 @@ mod tests { assert_eq!(ast.to_string(), sql.to_string()); }); } - - #[test] - fn custom_prefix_parser() -> Result<(), ParserError> { - #[derive(Debug)] - struct MyDialect {} - - impl Dialect for MyDialect { - fn is_identifier_start(&self, ch: char) -> bool { - is_identifier_start(ch) - } - - fn is_identifier_part(&self, ch: char) -> bool { - is_identifier_part(ch) - } - - fn parse_prefix(&self, parser: &mut Parser) -> Option> { - if parser.consume_token(&Token::Number("1".to_string(), false)) { - Some(Ok(Expr::Value(Value::Null))) - } else { - None - } - } - } - - let dialect = MyDialect {}; - let sql = "SELECT 1 + 2"; - let ast = Parser::parse_sql(&dialect, sql)?; - let query = &ast[0]; - assert_eq!("SELECT NULL + 2", &format!("{}", query)); - Ok(()) - } - - #[test] - fn custom_infix_parser() -> Result<(), ParserError> { - #[derive(Debug)] - struct MyDialect {} - - impl Dialect for MyDialect { - fn is_identifier_start(&self, ch: char) -> bool { - is_identifier_start(ch) - } - - fn is_identifier_part(&self, ch: char) -> bool { - is_identifier_part(ch) - } - - fn parse_infix( - &self, - parser: &mut Parser, - expr: &Expr, - _precendence: u8, - ) -> Option> { - if parser.consume_token(&Token::Plus) { - Some(Ok(Expr::BinaryOp { - left: Box::new(expr.clone()), - op: BinaryOperator::Multiply, // translate Plus to Multiply - right: Box::new(parser.parse_expr().unwrap()), - })) - } else { - None - } - } - } - - let dialect = MyDialect {}; - let sql = "SELECT 1 + 2"; - let ast = Parser::parse_sql(&dialect, sql)?; - let query = &ast[0]; - assert_eq!("SELECT 1 * 2", &format!("{}", query)); - Ok(()) - } - - #[test] - fn custom_statement_parser() -> Result<(), ParserError> { - #[derive(Debug)] - struct MyDialect {} - - impl Dialect for MyDialect { - fn is_identifier_start(&self, ch: char) -> bool { - is_identifier_start(ch) - } - - fn is_identifier_part(&self, ch: char) -> bool { - is_identifier_part(ch) - } - - fn parse_statement( - &self, - parser: &mut Parser, - ) -> Option> { - if parser.parse_keyword(Keyword::SELECT) { - for _ in 0..3 { - let _ = parser.next_token(); - } - Some(Ok(Statement::Commit { chain: false })) - } else { - None - } - } - } - - let dialect = MyDialect {}; - let sql = "SELECT 1 + 2"; - let ast = Parser::parse_sql(&dialect, sql)?; - let query = &ast[0]; - assert_eq!("COMMIT", &format!("{}", query)); - Ok(()) - } - - fn is_identifier_start(ch: char) -> bool { - ('a'..='z').contains(&ch) || ('A'..='Z').contains(&ch) || ch == '_' - } - - fn is_identifier_part(ch: char) -> bool { - ('a'..='z').contains(&ch) - || ('A'..='Z').contains(&ch) - || ('0'..='9').contains(&ch) - || ch == '$' - || ch == '_' - } } diff --git a/tests/sqlparser_custom_dialect.rs b/tests/sqlparser_custom_dialect.rs new file mode 100644 index 000000000..c0fe4c1dd --- /dev/null +++ b/tests/sqlparser_custom_dialect.rs @@ -0,0 +1,138 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Test the ability for dialects to override parsing + +use sqlparser::{ + ast::{BinaryOperator, Expr, Statement, Value}, + dialect::Dialect, + keywords::Keyword, + parser::{Parser, ParserError}, + tokenizer::Token, +}; + +#[test] +fn custom_prefix_parser() -> Result<(), ParserError> { + #[derive(Debug)] + struct MyDialect {} + + impl Dialect for MyDialect { + fn is_identifier_start(&self, ch: char) -> bool { + is_identifier_start(ch) + } + + fn is_identifier_part(&self, ch: char) -> bool { + is_identifier_part(ch) + } + + fn parse_prefix(&self, parser: &mut Parser) -> Option> { + if parser.consume_token(&Token::Number("1".to_string(), false)) { + Some(Ok(Expr::Value(Value::Null))) + } else { + None + } + } + } + + let dialect = MyDialect {}; + let sql = "SELECT 1 + 2"; + let ast = Parser::parse_sql(&dialect, sql)?; + let query = &ast[0]; + assert_eq!("SELECT NULL + 2", &format!("{}", query)); + Ok(()) +} + +#[test] +fn custom_infix_parser() -> Result<(), ParserError> { + #[derive(Debug)] + struct MyDialect {} + + impl Dialect for MyDialect { + fn is_identifier_start(&self, ch: char) -> bool { + is_identifier_start(ch) + } + + fn is_identifier_part(&self, ch: char) -> bool { + is_identifier_part(ch) + } + + fn parse_infix( + &self, + parser: &mut Parser, + expr: &Expr, + _precendence: u8, + ) -> Option> { + if parser.consume_token(&Token::Plus) { + Some(Ok(Expr::BinaryOp { + left: Box::new(expr.clone()), + op: BinaryOperator::Multiply, // translate Plus to Multiply + right: Box::new(parser.parse_expr().unwrap()), + })) + } else { + None + } + } + } + + let dialect = MyDialect {}; + let sql = "SELECT 1 + 2"; + let ast = Parser::parse_sql(&dialect, sql)?; + let query = &ast[0]; + assert_eq!("SELECT 1 * 2", &format!("{}", query)); + Ok(()) +} + +#[test] +fn custom_statement_parser() -> Result<(), ParserError> { + #[derive(Debug)] + struct MyDialect {} + + impl Dialect for MyDialect { + fn is_identifier_start(&self, ch: char) -> bool { + is_identifier_start(ch) + } + + fn is_identifier_part(&self, ch: char) -> bool { + is_identifier_part(ch) + } + + fn parse_statement(&self, parser: &mut Parser) -> Option> { + if parser.parse_keyword(Keyword::SELECT) { + for _ in 0..3 { + let _ = parser.next_token(); + } + Some(Ok(Statement::Commit { chain: false })) + } else { + None + } + } + } + + let dialect = MyDialect {}; + let sql = "SELECT 1 + 2"; + let ast = Parser::parse_sql(&dialect, sql)?; + let query = &ast[0]; + assert_eq!("COMMIT", &format!("{}", query)); + Ok(()) +} + +fn is_identifier_start(ch: char) -> bool { + ('a'..='z').contains(&ch) || ('A'..='Z').contains(&ch) || ch == '_' +} + +fn is_identifier_part(ch: char) -> bool { + ('a'..='z').contains(&ch) + || ('A'..='Z').contains(&ch) + || ('0'..='9').contains(&ch) + || ch == '$' + || ch == '_' +}