From 6ad3b26a1dbbc9951a225dc0d519da3d4b9051f1 Mon Sep 17 00:00:00 2001 From: Dmitry Patsura Date: Wed, 18 May 2022 21:44:16 +0300 Subject: [PATCH 1/9] feat: Support escaped string literals (PostgreSQL) Signed-off-by: Dmitry Patsura --- src/ast/value.rs | 37 ++++++++++++++++++++ src/parser.rs | 7 ++++ src/tokenizer.rs | 70 +++++++++++++++++++++++++++++++++++++ tests/sqlparser_postgres.rs | 23 ++++++++++++ 4 files changed, 137 insertions(+) diff --git a/src/ast/value.rs b/src/ast/value.rs index 1401855f1..60acca7cf 100644 --- a/src/ast/value.rs +++ b/src/ast/value.rs @@ -30,6 +30,8 @@ pub enum Value { Number(BigDecimal, bool), /// 'string value' SingleQuotedString(String), + /// e'string value' + EscapedStringLiteral(String), /// N'string value' NationalStringLiteral(String), /// X'hex value' @@ -69,6 +71,7 @@ impl fmt::Display for Value { Value::Number(v, l) => write!(f, "{}{long}", v, long = if *l { "L" } else { "" }), Value::DoubleQuotedString(v) => write!(f, "\"{}\"", v), Value::SingleQuotedString(v) => write!(f, "'{}'", escape_single_quote_string(v)), + Value::EscapedStringLiteral(v) => write!(f, "E'{}'", escape_escaped_string(v)), Value::NationalStringLiteral(v) => write!(f, "N'{}'", v), Value::HexStringLiteral(v) => write!(f, "X'{}'", v), Value::Boolean(v) => write!(f, "{}", v), @@ -193,6 +196,40 @@ pub fn escape_single_quote_string(s: &str) -> EscapeSingleQuoteString<'_> { EscapeSingleQuoteString(s) } +pub struct EscapeEscapedStringLiteral<'a>(&'a str); + +impl<'a> fmt::Display for EscapeEscapedStringLiteral<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let mut is_escaped = true; + + for c in self.0.chars() { + match c { + '\'' => { + write!(f, "\'\'")?; + } + '\\' => { + if is_escaped { + write!(f, r#"\\"#)?; + } else { + is_escaped = true; + } + } + '\n' => { + write!(f, r#"\n"#)?; + } + _ => { + write!(f, "{}", c)?; + } + } + } + Ok(()) + } +} + +pub fn escape_escaped_string(s: &str) -> EscapeEscapedStringLiteral<'_> { + EscapeEscapedStringLiteral(s) +} + #[derive(Debug, Clone, PartialEq, Eq, Hash)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub enum TrimWhereField { diff --git a/src/parser.rs b/src/parser.rs index 5ee3d5cb6..80024fb47 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -496,6 +496,10 @@ impl<'a> Parser<'a> { expr: Box::new(self.parse_subexpr(Self::PLUS_MINUS_PREC)?), }) } + Token::EscapedStringLiteral(_) if dialect_of!(self is PostgreSqlDialect) => { + self.prev_token(); + Ok(Expr::Value(self.parse_value()?)) + } Token::Number(_, _) | Token::SingleQuotedString(_) | Token::NationalStringLiteral(_) @@ -901,6 +905,7 @@ impl<'a> Parser<'a> { None } Token::SingleQuotedString(_) + | Token::EscapedStringLiteral(_) | Token::NationalStringLiteral(_) | Token::HexStringLiteral(_) => Some(Box::new(self.parse_expr()?)), unexpected => { @@ -2565,6 +2570,7 @@ impl<'a> Parser<'a> { }, Token::SingleQuotedString(ref s) => Ok(Value::SingleQuotedString(s.to_string())), Token::NationalStringLiteral(ref s) => Ok(Value::NationalStringLiteral(s.to_string())), + Token::EscapedStringLiteral(ref s) => Ok(Value::EscapedStringLiteral(s.to_string())), Token::HexStringLiteral(ref s) => Ok(Value::HexStringLiteral(s.to_string())), Token::Placeholder(ref s) => Ok(Value::Placeholder(s.to_string())), unexpected => self.expected("a value", unexpected), @@ -2596,6 +2602,7 @@ impl<'a> Parser<'a> { match self.next_token() { Token::Word(Word { value, keyword, .. }) if keyword == Keyword::NoKeyword => Ok(value), Token::SingleQuotedString(s) => Ok(s), + Token::EscapedStringLiteral(s) if dialect_of!(self is PostgreSqlDialect) => Ok(s), unexpected => self.expected("literal string", unexpected), } } diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 3fb0f66b2..d624f09d3 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -51,6 +51,8 @@ pub enum Token { SingleQuotedString(String), /// "National" string literal: i.e: N'string' NationalStringLiteral(String), + /// "escaped" string literal, which are an extension to the SQL standard: i.e: e'first \n second' or E 'first \n second' + EscapedStringLiteral(String), /// Hexadecimal string literal: i.e.: X'deadbeef' HexStringLiteral(String), /// Comma @@ -160,6 +162,7 @@ impl fmt::Display for Token { Token::Char(ref c) => write!(f, "{}", c), Token::SingleQuotedString(ref s) => write!(f, "'{}'", s), Token::NationalStringLiteral(ref s) => write!(f, "N'{}'", s), + Token::EscapedStringLiteral(ref s) => write!(f, "E'{}'", s), Token::HexStringLiteral(ref s) => write!(f, "X'{}'", s), Token::Comma => f.write_str(","), Token::Whitespace(ws) => write!(f, "{}", ws), @@ -392,6 +395,21 @@ impl<'a> Tokenizer<'a> { } } } + // PostgreSQL accepts "escape" string constants, which are an extension to the SQL standard. + x @ 'e' | x @ 'E' => { + chars.next(); // consume, to check the next char + match chars.peek() { + Some('\'') => { + let s = self.tokenize_escaped_single_quoted_string(chars)?; + Ok(Some(Token::EscapedStringLiteral(s))) + } + _ => { + // regular identifier starting with an "X" + let s = self.tokenize_word(x, chars); + Ok(Some(Token::make_word(&s, None))) + } + } + } // The spec only allows an uppercase 'X' to introduce a hex // string, but PostgreSQL, at least, allows a lowercase 'x' too. x @ 'x' | x @ 'X' => { @@ -690,6 +708,58 @@ impl<'a> Tokenizer<'a> { s } + /// Read a single quoted string, starting with the opening quote. + fn tokenize_escaped_single_quoted_string( + &self, + chars: &mut Peekable>, + ) -> Result { + let mut s = String::new(); + chars.next(); // consume the opening quote + + // slash escaping is specific to MySQL dialect + let mut is_escaped = false; + while let Some(&ch) = chars.peek() { + println!("-> {}", ch); + match ch { + '\'' => { + chars.next(); // consume + if chars.peek().map(|c| *c == '\'').unwrap_or(false) { + s.push(ch); + chars.next(); + } else { + return Ok(s); + } + } + '\\' => { + if is_escaped { + s.push('\\'); + is_escaped = false; + } else { + is_escaped = true; + } + + chars.next(); + } + 'n' => { + if is_escaped { + s.push('\n'); + is_escaped = false; + } else { + s.push(ch); + } + + chars.next(); + } + _ => { + is_escaped = false; + chars.next(); // consume + s.push(ch); + } + } + } + self.tokenizer_error("Unterminated encoded string literal") + } + /// Read a single quoted string, starting with the opening quote. fn tokenize_single_quoted_string( &self, diff --git a/tests/sqlparser_postgres.rs b/tests/sqlparser_postgres.rs index 69b7fcfa4..6068688ab 100644 --- a/tests/sqlparser_postgres.rs +++ b/tests/sqlparser_postgres.rs @@ -1467,3 +1467,26 @@ fn pg_and_generic() -> TestedDialects { dialects: vec![Box::new(PostgreSqlDialect {}), Box::new(GenericDialect {})], } } + +#[test] +fn parse_escaped_literal_string() { + let sql = r#"SELECT E's1 \n s1', E's2 \\n s2', E's3 \\\n s3', E's4 \\\\n s4'"#; + let select = pg().verified_only_select(sql); + assert_eq!(4, select.projection.len()); + assert_eq!( + &Expr::Value(Value::EscapedStringLiteral("s1 \n s1".to_string())), + expr_from_projection(&select.projection[0]) + ); + assert_eq!( + &Expr::Value(Value::EscapedStringLiteral("s2 \\n s2".to_string())), + expr_from_projection(&select.projection[1]) + ); + assert_eq!( + &Expr::Value(Value::EscapedStringLiteral("s3 \\\n s3".to_string())), + expr_from_projection(&select.projection[2]) + ); + assert_eq!( + &Expr::Value(Value::EscapedStringLiteral("s4 \\\\n s4".to_string())), + expr_from_projection(&select.projection[3]) + ); +} From c71af02a8f4dd1af8443b50d3e019e97c7cc7ed8 Mon Sep 17 00:00:00 2001 From: Dmitry Patsura Date: Wed, 18 May 2022 21:48:49 +0300 Subject: [PATCH 2/9] lint --- src/tokenizer.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index d624f09d3..6952b6696 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -719,7 +719,6 @@ impl<'a> Tokenizer<'a> { // slash escaping is specific to MySQL dialect let mut is_escaped = false; while let Some(&ch) = chars.peek() { - println!("-> {}", ch); match ch { '\'' => { chars.next(); // consume From c06771742a382ac9cda9b939ba977202e1f12779 Mon Sep 17 00:00:00 2001 From: Dmitry Patsura Date: Fri, 20 May 2022 17:02:35 +0300 Subject: [PATCH 3/9] escape ', \r, \t --- src/ast/value.rs | 8 +++++++- src/tokenizer.rs | 33 +++++++++++++++++++++------------ tests/sqlparser_postgres.rs | 8 ++++++-- 3 files changed, 34 insertions(+), 15 deletions(-) diff --git a/src/ast/value.rs b/src/ast/value.rs index 60acca7cf..7a62204a1 100644 --- a/src/ast/value.rs +++ b/src/ast/value.rs @@ -205,7 +205,7 @@ impl<'a> fmt::Display for EscapeEscapedStringLiteral<'a> { for c in self.0.chars() { match c { '\'' => { - write!(f, "\'\'")?; + write!(f, r#"\'"#)?; } '\\' => { if is_escaped { @@ -217,6 +217,12 @@ impl<'a> fmt::Display for EscapeEscapedStringLiteral<'a> { '\n' => { write!(f, r#"\n"#)?; } + '\t' => { + write!(f, r#"\t"#)?; + } + '\r' => { + write!(f, r#"\r"#)?; + } _ => { write!(f, "{}", c)?; } diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 6952b6696..429471f0b 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -716,13 +716,29 @@ impl<'a> Tokenizer<'a> { let mut s = String::new(); chars.next(); // consume the opening quote - // slash escaping is specific to MySQL dialect + // slash escaping let mut is_escaped = false; while let Some(&ch) = chars.peek() { + macro_rules! escape_control_character { + ($ESCAPED:expr) => {{ + if is_escaped { + s.push($ESCAPED); + is_escaped = false; + } else { + s.push(ch); + } + + chars.next(); + }}; + } + match ch { '\'' => { chars.next(); // consume - if chars.peek().map(|c| *c == '\'').unwrap_or(false) { + if is_escaped { + s.push(ch); + is_escaped = false; + } else if chars.peek().map(|c| *c == '\'').unwrap_or(false) { s.push(ch); chars.next(); } else { @@ -739,16 +755,9 @@ impl<'a> Tokenizer<'a> { chars.next(); } - 'n' => { - if is_escaped { - s.push('\n'); - is_escaped = false; - } else { - s.push(ch); - } - - chars.next(); - } + 'r' => escape_control_character!('\r'), + 'n' => escape_control_character!('\n'), + 't' => escape_control_character!('\t'), _ => { is_escaped = false; chars.next(); // consume diff --git a/tests/sqlparser_postgres.rs b/tests/sqlparser_postgres.rs index 6068688ab..ee34f7a28 100644 --- a/tests/sqlparser_postgres.rs +++ b/tests/sqlparser_postgres.rs @@ -1470,9 +1470,9 @@ fn pg_and_generic() -> TestedDialects { #[test] fn parse_escaped_literal_string() { - let sql = r#"SELECT E's1 \n s1', E's2 \\n s2', E's3 \\\n s3', E's4 \\\\n s4'"#; + let sql = r#"SELECT E's1 \n s1', E's2 \\n s2', E's3 \\\n s3', E's4 \\\\n s4', E'\''"#; let select = pg().verified_only_select(sql); - assert_eq!(4, select.projection.len()); + assert_eq!(5, select.projection.len()); assert_eq!( &Expr::Value(Value::EscapedStringLiteral("s1 \n s1".to_string())), expr_from_projection(&select.projection[0]) @@ -1489,4 +1489,8 @@ fn parse_escaped_literal_string() { &Expr::Value(Value::EscapedStringLiteral("s4 \\\\n s4".to_string())), expr_from_projection(&select.projection[3]) ); + assert_eq!( + &Expr::Value(Value::EscapedStringLiteral("'".to_string())), + expr_from_projection(&select.projection[4]) + ); } From 85dfffe0c4e2f1207fb5b93dee54268aaf357872 Mon Sep 17 00:00:00 2001 From: Dmitry Patsura Date: Mon, 23 May 2022 13:47:11 +0300 Subject: [PATCH 4/9] Update src/ast/value.rs Co-authored-by: Andrew Lamb --- src/ast/value.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ast/value.rs b/src/ast/value.rs index 7a62204a1..85f6afe57 100644 --- a/src/ast/value.rs +++ b/src/ast/value.rs @@ -30,7 +30,8 @@ pub enum Value { Number(BigDecimal, bool), /// 'string value' SingleQuotedString(String), - /// e'string value' + /// e'string value' (postgres extension) + /// Date: Mon, 23 May 2022 13:47:16 +0300 Subject: [PATCH 5/9] Update src/tokenizer.rs Co-authored-by: Andrew Lamb --- src/tokenizer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 429471f0b..91cb16a80 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -404,7 +404,7 @@ impl<'a> Tokenizer<'a> { Ok(Some(Token::EscapedStringLiteral(s))) } _ => { - // regular identifier starting with an "X" + // regular identifier starting with an "E" or "e" let s = self.tokenize_word(x, chars); Ok(Some(Token::make_word(&s, None))) } From 46d94f6d1913289d2de306b9a36b63e028bc4255 Mon Sep 17 00:00:00 2001 From: Dmitry Patsura Date: Tue, 24 May 2022 17:38:01 +0300 Subject: [PATCH 6/9] test: two slashes --- tests/sqlparser_postgres.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/sqlparser_postgres.rs b/tests/sqlparser_postgres.rs index ee34f7a28..a44c1cad9 100644 --- a/tests/sqlparser_postgres.rs +++ b/tests/sqlparser_postgres.rs @@ -1470,9 +1470,10 @@ fn pg_and_generic() -> TestedDialects { #[test] fn parse_escaped_literal_string() { - let sql = r#"SELECT E's1 \n s1', E's2 \\n s2', E's3 \\\n s3', E's4 \\\\n s4', E'\''"#; + let sql = + r#"SELECT E's1 \n s1', E's2 \\n s2', E's3 \\\n s3', E's4 \\\\n s4', E'\'', E'foo \\'"#; let select = pg().verified_only_select(sql); - assert_eq!(5, select.projection.len()); + assert_eq!(6, select.projection.len()); assert_eq!( &Expr::Value(Value::EscapedStringLiteral("s1 \n s1".to_string())), expr_from_projection(&select.projection[0]) @@ -1493,4 +1494,8 @@ fn parse_escaped_literal_string() { &Expr::Value(Value::EscapedStringLiteral("'".to_string())), expr_from_projection(&select.projection[4]) ); + assert_eq!( + &Expr::Value(Value::EscapedStringLiteral("foo \\".to_string())), + expr_from_projection(&select.projection[5]) + ); } From d93ec4d065f8b36fc1497bf0a838761db4b03522 Mon Sep 17 00:00:00 2001 From: Dmitry Patsura Date: Tue, 24 May 2022 17:54:04 +0300 Subject: [PATCH 7/9] remove dead code --- src/ast/value.rs | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/ast/value.rs b/src/ast/value.rs index 85f6afe57..8337fbaa8 100644 --- a/src/ast/value.rs +++ b/src/ast/value.rs @@ -201,19 +201,13 @@ pub struct EscapeEscapedStringLiteral<'a>(&'a str); impl<'a> fmt::Display for EscapeEscapedStringLiteral<'a> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - let mut is_escaped = true; - for c in self.0.chars() { match c { '\'' => { write!(f, r#"\'"#)?; } '\\' => { - if is_escaped { - write!(f, r#"\\"#)?; - } else { - is_escaped = true; - } + write!(f, r#"\\"#)?; } '\n' => { write!(f, r#"\n"#)?; From f46b07e7ce383148c1fcb8755cd37965f0553708 Mon Sep 17 00:00:00 2001 From: Dmitry Patsura Date: Tue, 24 May 2022 17:59:19 +0300 Subject: [PATCH 8/9] test: parsing error --- tests/sqlparser_postgres.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/sqlparser_postgres.rs b/tests/sqlparser_postgres.rs index a44c1cad9..b09b2d79e 100644 --- a/tests/sqlparser_postgres.rs +++ b/tests/sqlparser_postgres.rs @@ -1498,4 +1498,10 @@ fn parse_escaped_literal_string() { &Expr::Value(Value::EscapedStringLiteral("foo \\".to_string())), expr_from_projection(&select.projection[5]) ); + + let sql = r#"SELECT E'\'"#; + assert_eq!( + pg().parse_sql_statements(sql).unwrap_err().to_string(), + "sql parser error: Unterminated encoded string literal at Line: 1, Column 8" + ); } From 5d454e46dc2663a4e4ff798b31df29f118edc03a Mon Sep 17 00:00:00 2001 From: Dmitry Patsura Date: Tue, 24 May 2022 19:04:10 +0300 Subject: [PATCH 9/9] support generic dialect too (for DF) --- src/parser.rs | 7 +++++-- tests/sqlparser_postgres.rs | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/parser.rs b/src/parser.rs index 80024fb47..2c3f4875d 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -496,7 +496,8 @@ impl<'a> Parser<'a> { expr: Box::new(self.parse_subexpr(Self::PLUS_MINUS_PREC)?), }) } - Token::EscapedStringLiteral(_) if dialect_of!(self is PostgreSqlDialect) => { + Token::EscapedStringLiteral(_) if dialect_of!(self is PostgreSqlDialect | GenericDialect) => + { self.prev_token(); Ok(Expr::Value(self.parse_value()?)) } @@ -2602,7 +2603,9 @@ impl<'a> Parser<'a> { match self.next_token() { Token::Word(Word { value, keyword, .. }) if keyword == Keyword::NoKeyword => Ok(value), Token::SingleQuotedString(s) => Ok(s), - Token::EscapedStringLiteral(s) if dialect_of!(self is PostgreSqlDialect) => Ok(s), + Token::EscapedStringLiteral(s) if dialect_of!(self is PostgreSqlDialect | GenericDialect) => { + Ok(s) + } unexpected => self.expected("literal string", unexpected), } } diff --git a/tests/sqlparser_postgres.rs b/tests/sqlparser_postgres.rs index b09b2d79e..00594ddb4 100644 --- a/tests/sqlparser_postgres.rs +++ b/tests/sqlparser_postgres.rs @@ -1472,7 +1472,7 @@ fn pg_and_generic() -> TestedDialects { fn parse_escaped_literal_string() { let sql = r#"SELECT E's1 \n s1', E's2 \\n s2', E's3 \\\n s3', E's4 \\\\n s4', E'\'', E'foo \\'"#; - let select = pg().verified_only_select(sql); + let select = pg_and_generic().verified_only_select(sql); assert_eq!(6, select.projection.len()); assert_eq!( &Expr::Value(Value::EscapedStringLiteral("s1 \n s1".to_string())), @@ -1501,7 +1501,10 @@ fn parse_escaped_literal_string() { let sql = r#"SELECT E'\'"#; assert_eq!( - pg().parse_sql_statements(sql).unwrap_err().to_string(), + pg_and_generic() + .parse_sql_statements(sql) + .unwrap_err() + .to_string(), "sql parser error: Unterminated encoded string literal at Line: 1, Column 8" ); }