diff --git a/datafusion/common/src/parsers.rs b/datafusion/common/src/parsers.rs index 7b78c92bcbd..94ad5d86be3 100644 --- a/datafusion/common/src/parsers.rs +++ b/datafusion/common/src/parsers.rs @@ -19,8 +19,8 @@ use crate::{DataFusionError, Result, ScalarValue}; use std::str::FromStr; -const SECONDS_PER_HOUR: f32 = 3_600_f32; -const MILLIS_PER_SECOND: f32 = 1_000_f32; +const SECONDS_PER_HOUR: f64 = 3_600_f64; +const NANOS_PER_SECOND: f64 = 1_000_000_000_f64; /// Parses a string with an interval like `'0.5 MONTH'` to an /// appropriately typed [`ScalarValue`] @@ -29,73 +29,74 @@ pub fn parse_interval(leading_field: &str, value: &str) -> Result { // INTERVAL '0.5 MONTH' = 15 days, INTERVAL '1.5 MONTH' = 1 month 15 days // INTERVAL '0.5 DAY' = 12 hours, INTERVAL '1.5 DAY' = 1 day 12 hours let align_interval_parts = - |month_part: f32, mut day_part: f32, mut milles_part: f32| -> (i32, i32, f32) { + |month_part: f64, mut day_part: f64, mut nanos_part: f64| -> (i64, i64, f64) { // Convert fractional month to days, It's not supported by Arrow types, but anyway - day_part += (month_part - (month_part as i32) as f32) * 30_f32; + day_part += (month_part - (month_part as i64) as f64) * 30_f64; // Convert fractional days to hours - milles_part += (day_part - ((day_part as i32) as f32)) - * 24_f32 + nanos_part += (day_part - ((day_part as i64) as f64)) + * 24_f64 * SECONDS_PER_HOUR - * MILLIS_PER_SECOND; + * NANOS_PER_SECOND; - (month_part as i32, day_part as i32, milles_part) + (month_part as i64, day_part as i64, nanos_part) }; - let calculate_from_part = - |interval_period_str: &str, interval_type: &str| -> Result<(i32, i32, f32)> { - // @todo It's better to use Decimal in order to protect rounding errors - // Wait https://github.com/apache/arrow/pull/9232 - let interval_period = match f32::from_str(interval_period_str) { - Ok(n) => n, - Err(_) => { - return Err(DataFusionError::NotImplemented(format!( - "Unsupported Interval Expression with value {:?}", - value - ))); - } - }; - - if interval_period > (i32::MAX as f32) { + let calculate_from_part = |interval_period_str: &str, + interval_type: &str| + -> Result<(i64, i64, f64)> { + // @todo It's better to use Decimal in order to protect rounding errors + // Wait https://github.com/apache/arrow/pull/9232 + let interval_period = match f64::from_str(interval_period_str) { + Ok(n) => n, + Err(_) => { return Err(DataFusionError::NotImplemented(format!( - "Interval field value out of range: {:?}", + "Unsupported Interval Expression with value {:?}", value ))); } + }; - match interval_type.to_lowercase().as_str() { - "century" | "centuries" => { - Ok(align_interval_parts(interval_period * 1200_f32, 0.0, 0.0)) - } - "decade" | "decades" => { - Ok(align_interval_parts(interval_period * 120_f32, 0.0, 0.0)) - } - "year" | "years" => { - Ok(align_interval_parts(interval_period * 12_f32, 0.0, 0.0)) - } - "month" | "months" => Ok(align_interval_parts(interval_period, 0.0, 0.0)), - "week" | "weeks" => { - Ok(align_interval_parts(0.0, interval_period * 7_f32, 0.0)) - } - "day" | "days" => Ok(align_interval_parts(0.0, interval_period, 0.0)), - "hour" | "hours" => { - Ok((0, 0, interval_period * SECONDS_PER_HOUR * MILLIS_PER_SECOND)) - } - "minute" | "minutes" => { - Ok((0, 0, interval_period * 60_f32 * MILLIS_PER_SECOND)) - } - "second" | "seconds" => Ok((0, 0, interval_period * MILLIS_PER_SECOND)), - "millisecond" | "milliseconds" => Ok((0, 0, interval_period)), - _ => Err(DataFusionError::NotImplemented(format!( - "Invalid input syntax for type interval: {:?}", - value - ))), + if interval_period > (i64::MAX as f64) { + return Err(DataFusionError::NotImplemented(format!( + "Interval field value out of range: {:?}", + value + ))); + } + + match interval_type.to_lowercase().as_str() { + "century" | "centuries" => { + Ok(align_interval_parts(interval_period * 1200_f64, 0.0, 0.0)) } - }; + "decade" | "decades" => { + Ok(align_interval_parts(interval_period * 120_f64, 0.0, 0.0)) + } + "year" | "years" => { + Ok(align_interval_parts(interval_period * 12_f64, 0.0, 0.0)) + } + "month" | "months" => Ok(align_interval_parts(interval_period, 0.0, 0.0)), + "week" | "weeks" => { + Ok(align_interval_parts(0.0, interval_period * 7_f64, 0.0)) + } + "day" | "days" => Ok(align_interval_parts(0.0, interval_period, 0.0)), + "hour" | "hours" => { + Ok((0, 0, interval_period * SECONDS_PER_HOUR * NANOS_PER_SECOND)) + } + "minute" | "minutes" => { + Ok((0, 0, interval_period * 60_f64 * NANOS_PER_SECOND)) + } + "second" | "seconds" => Ok((0, 0, interval_period * NANOS_PER_SECOND)), + "millisecond" | "milliseconds" => Ok((0, 0, interval_period * 1_000_000f64)), + _ => Err(DataFusionError::NotImplemented(format!( + "Invalid input syntax for type interval: {:?}", + value + ))), + } + }; let mut result_month: i64 = 0; let mut result_days: i64 = 0; - let mut result_millis: i64 = 0; + let mut result_nanos: i128 = 0; let mut parts = value.split_whitespace(); @@ -107,7 +108,7 @@ pub fn parse_interval(leading_field: &str, value: &str) -> Result { let unit = parts.next().unwrap_or(leading_field); - let (diff_month, diff_days, diff_millis) = + let (diff_month, diff_days, diff_nanos) = calculate_from_part(interval_period_str.unwrap(), unit)?; result_month += diff_month as i64; @@ -128,9 +129,9 @@ pub fn parse_interval(leading_field: &str, value: &str) -> Result { ))); } - result_millis += diff_millis as i64; + result_nanos += diff_nanos as i128; - if result_millis > (i32::MAX as i64) { + if result_nanos > (i64::MAX as i128) { return Err(DataFusionError::NotImplemented(format!( "Interval field value out of range: {:?}", value @@ -142,11 +143,13 @@ pub fn parse_interval(leading_field: &str, value: &str) -> Result { // 1 day is not 24 hours because timezones, 1 year != 365/364! 30 days != 1 month // The true way to store and calculate intervals is to store it as it defined // It's why we there are 3 different interval types in Arrow - if result_month != 0 && (result_days != 0 || result_millis != 0) { - let result: i128 = ((result_month as i128) << 96) - | ((result_days as i128) << 64) - // IntervalMonthDayNano uses nanos, but IntervalDayTime uses milles - | ((result_millis * 1_000_000_i64) as i128); + + // If have a unit smaller than milliseconds then must use IntervalMonthDayNano + if (result_nanos % 1_000_000 != 0) + || (result_month != 0 && (result_days != 0 || result_nanos != 0)) + { + let result: i128 = + ((result_month as i128) << 96) | ((result_days as i128) << 64) | result_nanos; return Ok(ScalarValue::IntervalMonthDayNano(Some(result))); } @@ -156,7 +159,8 @@ pub fn parse_interval(leading_field: &str, value: &str) -> Result { return Ok(ScalarValue::IntervalYearMonth(Some(result_month as i32))); } - let result: i64 = (result_days << 32) | result_millis; + // IntervalMonthDayNano uses nanos, but IntervalDayTime uses millis + let result: i64 = (result_days << 32) | (result_nanos as i64 / 1_000_000); Ok(ScalarValue::IntervalDayTime(Some(result))) } @@ -165,6 +169,8 @@ mod test { use super::*; use crate::assert_contains; + const MILLIS_PER_SECOND: f64 = 1_000_f64; + #[test] fn test_parse_ym() { assert_eq!( @@ -212,13 +218,20 @@ mod test { } #[test] - // https://github.com/apache/arrow-rs/pull/2235 - #[ignore] fn test_mdn() { - // these should be the same, I think assert_eq!( parse_interval("months", "1 year 25 millisecond").unwrap(), ScalarValue::new_interval_mdn(12, 0, 25 * 1_000_000) ); + + assert_eq!( + parse_interval("months", "1 year 1 day 0.000000001 seconds").unwrap(), + ScalarValue::new_interval_mdn(12, 1, 1) + ); + + assert_eq!( + parse_interval("months", "1 year 1 day 0.1 milliseconds").unwrap(), + ScalarValue::new_interval_mdn(12, 1, 1_00 * 1_000) + ); } } diff --git a/datafusion/core/tests/sql/expr.rs b/datafusion/core/tests/sql/expr.rs index 8251a554657..f82be2a5b95 100644 --- a/datafusion/core/tests/sql/expr.rs +++ b/datafusion/core/tests/sql/expr.rs @@ -917,11 +917,11 @@ async fn test_interval_expressions() -> Result<()> { ); test_expression!( "interval '0.499 day'", - "0 years 0 mons 0 days 11 hours 58 mins 33.596 secs" + "0 years 0 mons 0 days 11 hours 58 mins 33.600 secs" ); test_expression!( "interval '0.4999 day'", - "0 years 0 mons 0 days 11 hours 59 mins 51.364 secs" + "0 years 0 mons 0 days 11 hours 59 mins 51.360 secs" ); test_expression!( "interval '0.49999 day'", @@ -929,7 +929,7 @@ async fn test_interval_expressions() -> Result<()> { ); test_expression!( "interval '0.49999999999 day'", - "0 years 0 mons 0 days 12 hours 0 mins 0.00 secs" + "0 years 0 mons 0 days 11 hours 59 mins 59.999999136 secs" ); test_expression!( "interval '5 day'",