Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Parse nanoseconds for intervals #4186

Merged
merged 3 commits into from Nov 15, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
143 changes: 78 additions & 65 deletions datafusion/common/src/parsers.rs
Expand Up @@ -19,8 +19,8 @@
use crate::{DataFusionError, Result, ScalarValue};
use std::str::FromStr;

const SECONDS_PER_HOUR: f32 = 3_600_f32;
const MILLIS_PER_SECOND: f32 = 1_000_f32;
const SECONDS_PER_HOUR: f64 = 3_600_f64;
const NANOS_PER_SECOND: f64 = 1_000_000_000_f64;

/// Parses a string with an interval like `'0.5 MONTH'` to an
/// appropriately typed [`ScalarValue`]
Expand All @@ -29,73 +29,74 @@ pub fn parse_interval(leading_field: &str, value: &str) -> Result<ScalarValue> {
// INTERVAL '0.5 MONTH' = 15 days, INTERVAL '1.5 MONTH' = 1 month 15 days
// INTERVAL '0.5 DAY' = 12 hours, INTERVAL '1.5 DAY' = 1 day 12 hours
let align_interval_parts =
|month_part: f32, mut day_part: f32, mut milles_part: f32| -> (i32, i32, f32) {
|month_part: f64, mut day_part: f64, mut nanos_part: f64| -> (i64, i64, f64) {
// Convert fractional month to days, It's not supported by Arrow types, but anyway
day_part += (month_part - (month_part as i32) as f32) * 30_f32;
day_part += (month_part - (month_part as i64) as f64) * 30_f64;

// Convert fractional days to hours
milles_part += (day_part - ((day_part as i32) as f32))
* 24_f32
nanos_part += (day_part - ((day_part as i64) as f64))
* 24_f64
* SECONDS_PER_HOUR
* MILLIS_PER_SECOND;
* NANOS_PER_SECOND;

(month_part as i32, day_part as i32, milles_part)
(month_part as i64, day_part as i64, nanos_part)
Comment on lines +32 to +42
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

};

let calculate_from_part =
|interval_period_str: &str, interval_type: &str| -> Result<(i32, i32, f32)> {
// @todo It's better to use Decimal in order to protect rounding errors
// Wait https://github.com/apache/arrow/pull/9232
let interval_period = match f32::from_str(interval_period_str) {
Ok(n) => n,
Err(_) => {
return Err(DataFusionError::NotImplemented(format!(
"Unsupported Interval Expression with value {:?}",
value
)));
}
};

if interval_period > (i32::MAX as f32) {
let calculate_from_part = |interval_period_str: &str,
interval_type: &str|
-> Result<(i64, i64, f64)> {
// @todo It's better to use Decimal in order to protect rounding errors
// Wait https://github.com/apache/arrow/pull/9232
let interval_period = match f64::from_str(interval_period_str) {
Ok(n) => n,
Err(_) => {
return Err(DataFusionError::NotImplemented(format!(
"Interval field value out of range: {:?}",
"Unsupported Interval Expression with value {:?}",
value
)));
}
};

match interval_type.to_lowercase().as_str() {
"century" | "centuries" => {
Ok(align_interval_parts(interval_period * 1200_f32, 0.0, 0.0))
}
"decade" | "decades" => {
Ok(align_interval_parts(interval_period * 120_f32, 0.0, 0.0))
}
"year" | "years" => {
Ok(align_interval_parts(interval_period * 12_f32, 0.0, 0.0))
}
"month" | "months" => Ok(align_interval_parts(interval_period, 0.0, 0.0)),
"week" | "weeks" => {
Ok(align_interval_parts(0.0, interval_period * 7_f32, 0.0))
}
"day" | "days" => Ok(align_interval_parts(0.0, interval_period, 0.0)),
"hour" | "hours" => {
Ok((0, 0, interval_period * SECONDS_PER_HOUR * MILLIS_PER_SECOND))
}
"minute" | "minutes" => {
Ok((0, 0, interval_period * 60_f32 * MILLIS_PER_SECOND))
}
"second" | "seconds" => Ok((0, 0, interval_period * MILLIS_PER_SECOND)),
"millisecond" | "milliseconds" => Ok((0, 0, interval_period)),
_ => Err(DataFusionError::NotImplemented(format!(
"Invalid input syntax for type interval: {:?}",
value
))),
if interval_period > (i64::MAX as f64) {
return Err(DataFusionError::NotImplemented(format!(
"Interval field value out of range: {:?}",
value
)));
}

match interval_type.to_lowercase().as_str() {
"century" | "centuries" => {
Ok(align_interval_parts(interval_period * 1200_f64, 0.0, 0.0))
}
};
"decade" | "decades" => {
Ok(align_interval_parts(interval_period * 120_f64, 0.0, 0.0))
}
"year" | "years" => {
Ok(align_interval_parts(interval_period * 12_f64, 0.0, 0.0))
}
"month" | "months" => Ok(align_interval_parts(interval_period, 0.0, 0.0)),
"week" | "weeks" => {
Ok(align_interval_parts(0.0, interval_period * 7_f64, 0.0))
}
"day" | "days" => Ok(align_interval_parts(0.0, interval_period, 0.0)),
"hour" | "hours" => {
Ok((0, 0, interval_period * SECONDS_PER_HOUR * NANOS_PER_SECOND))
}
"minute" | "minutes" => {
Ok((0, 0, interval_period * 60_f64 * NANOS_PER_SECOND))
}
"second" | "seconds" => Ok((0, 0, interval_period * NANOS_PER_SECOND)),
"millisecond" | "milliseconds" => Ok((0, 0, interval_period * 1_000_000f64)),
_ => Err(DataFusionError::NotImplemented(format!(
"Invalid input syntax for type interval: {:?}",
value
))),
}
};

let mut result_month: i64 = 0;
let mut result_days: i64 = 0;
let mut result_millis: i64 = 0;
let mut result_nanos: i128 = 0;

let mut parts = value.split_whitespace();

Expand All @@ -107,7 +108,7 @@ pub fn parse_interval(leading_field: &str, value: &str) -> Result<ScalarValue> {

let unit = parts.next().unwrap_or(leading_field);

let (diff_month, diff_days, diff_millis) =
let (diff_month, diff_days, diff_nanos) =
calculate_from_part(interval_period_str.unwrap(), unit)?;

result_month += diff_month as i64;
Expand All @@ -128,9 +129,9 @@ pub fn parse_interval(leading_field: &str, value: &str) -> Result<ScalarValue> {
)));
}

result_millis += diff_millis as i64;
result_nanos += diff_nanos as i128;

if result_millis > (i32::MAX as i64) {
if result_nanos > (i64::MAX as i128) {
return Err(DataFusionError::NotImplemented(format!(
"Interval field value out of range: {:?}",
value
Expand All @@ -142,11 +143,13 @@ pub fn parse_interval(leading_field: &str, value: &str) -> Result<ScalarValue> {
// 1 day is not 24 hours because timezones, 1 year != 365/364! 30 days != 1 month
// The true way to store and calculate intervals is to store it as it defined
// It's why we there are 3 different interval types in Arrow
if result_month != 0 && (result_days != 0 || result_millis != 0) {
let result: i128 = ((result_month as i128) << 96)
| ((result_days as i128) << 64)
// IntervalMonthDayNano uses nanos, but IntervalDayTime uses milles
| ((result_millis * 1_000_000_i64) as i128);

// If have a unit smaller than milliseconds then must use IntervalMonthDayNano
if (result_nanos % 1_000_000 != 0)
|| (result_month != 0 && (result_days != 0 || result_nanos != 0))
{
let result: i128 =
((result_month as i128) << 96) | ((result_days as i128) << 64) | result_nanos;

return Ok(ScalarValue::IntervalMonthDayNano(Some(result)));
}
Expand All @@ -156,7 +159,8 @@ pub fn parse_interval(leading_field: &str, value: &str) -> Result<ScalarValue> {
return Ok(ScalarValue::IntervalYearMonth(Some(result_month as i32)));
}

let result: i64 = (result_days << 32) | result_millis;
// IntervalMonthDayNano uses nanos, but IntervalDayTime uses millis
let result: i64 = (result_days << 32) | (result_nanos as i64 / 1_000_000);
Ok(ScalarValue::IntervalDayTime(Some(result)))
}

Expand All @@ -165,6 +169,8 @@ mod test {
use super::*;
use crate::assert_contains;

const MILLIS_PER_SECOND: f64 = 1_000_f64;

#[test]
fn test_parse_ym() {
assert_eq!(
Expand Down Expand Up @@ -212,13 +218,20 @@ mod test {
}

#[test]
// https://github.com/apache/arrow-rs/pull/2235
#[ignore]
fn test_mdn() {
// these should be the same, I think
assert_eq!(
parse_interval("months", "1 year 25 millisecond").unwrap(),
ScalarValue::new_interval_mdn(12, 0, 25 * 1_000_000)
);

assert_eq!(
parse_interval("months", "1 year 1 day 0.000000001 seconds").unwrap(),
ScalarValue::new_interval_mdn(12, 1, 1)
);

assert_eq!(
parse_interval("months", "1 year 1 day 0.1 milliseconds").unwrap(),
ScalarValue::new_interval_mdn(12, 1, 1_00 * 1_000)
);
}
}
6 changes: 3 additions & 3 deletions datafusion/core/tests/sql/expr.rs
Expand Up @@ -917,19 +917,19 @@ async fn test_interval_expressions() -> Result<()> {
);
test_expression!(
"interval '0.499 day'",
"0 years 0 mons 0 days 11 hours 58 mins 33.596 secs"
"0 years 0 mons 0 days 11 hours 58 mins 33.600 secs"
);
test_expression!(
"interval '0.4999 day'",
"0 years 0 mons 0 days 11 hours 59 mins 51.364 secs"
"0 years 0 mons 0 days 11 hours 59 mins 51.360 secs"
);
test_expression!(
"interval '0.49999 day'",
"0 years 0 mons 0 days 11 hours 59 mins 59.136 secs"
);
test_expression!(
"interval '0.49999999999 day'",
"0 years 0 mons 0 days 12 hours 0 mins 0.00 secs"
"0 years 0 mons 0 days 11 hours 59 mins 59.999999136 secs"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great improvement! 👍

);
test_expression!(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i think we can add a test case for this (which will fail in the current version) and wait for the update to arrow 27 #4199

Suggested change
test_expression!(
test_expression!(
"interval '0.00000000001 day'",
"0 years 0 mons 0 days 0 hours 0 mins 0.000000864 secs"
);
test_expression!(

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i just found that the arrow-rs fix isn't included in 27.0.0
@tustvold do you have any suggestion for this? should we just merge this first and add some test cases after arrow 28.0.0 merged?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can also add test cases that are marked as #[should_fail]

"interval '5 day'",
Expand Down