From c6145116cb8693c3c7bc2b063b0f9422034504b0 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Tue, 28 Jun 2022 08:16:13 +0100 Subject: [PATCH] Set is_adjusted_to_utc if any timezone set (#1932) --- arrow/src/datatypes/datatype.rs | 57 +++++++++++++++++++++++++++++++++ parquet/src/arrow/schema.rs | 10 ++---- 2 files changed, 60 insertions(+), 7 deletions(-) diff --git a/arrow/src/datatypes/datatype.rs b/arrow/src/datatypes/datatype.rs index d0c9bb6920c..f3cb58d84e6 100644 --- a/arrow/src/datatypes/datatype.rs +++ b/arrow/src/datatypes/datatype.rs @@ -78,6 +78,63 @@ pub enum DataType { /// * As used in the Olson time zone database (the "tz database" or /// "tzdata"), such as "America/New_York" /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30 + /// + /// Timestamps with a non-empty timezone + /// ------------------------------------ + /// + /// If a Timestamp column has a non-empty timezone value, its epoch is + /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in the *UTC* timezone + /// (the Unix epoch), regardless of the Timestamp's own timezone. + /// + /// Therefore, timestamp values with a non-empty timezone correspond to + /// physical points in time together with some additional information about + /// how the data was obtained and/or how to display it (the timezone). + /// + /// For example, the timestamp value 0 with the timezone string "Europe/Paris" + /// corresponds to "January 1st 1970, 00h00" in the UTC timezone, but the + /// application may prefer to display it as "January 1st 1970, 01h00" in + /// the Europe/Paris timezone (which is the same physical point in time). + /// + /// One consequence is that timestamp values with a non-empty timezone + /// can be compared and ordered directly, since they all share the same + /// well-known point of reference (the Unix epoch). 
+ /// + /// Timestamps with an unset / empty timezone + /// ----------------------------------------- + /// + /// If a Timestamp column has no timezone value, its epoch is + /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in an *unknown* timezone. + /// + /// Therefore, timestamp values without a timezone cannot be meaningfully + /// interpreted as physical points in time, but only as calendar / clock + /// indications ("wall clock time") in an unspecified timezone. + /// + /// For example, the timestamp value 0 with an empty timezone string + /// corresponds to "January 1st 1970, 00h00" in an unknown timezone: there + /// is not enough information to interpret it as a well-defined physical + /// point in time. + /// + /// One consequence is that timestamp values without a timezone cannot + /// be reliably compared or ordered, since they may have different points of + /// reference. In particular, it is *not* possible to interpret an unset + /// or empty timezone as the same as "UTC". + /// + /// Conversion between timezones + /// ---------------------------- + /// + /// If a Timestamp column has a non-empty timezone, changing the timezone + /// to a different non-empty value is a metadata-only operation: + /// the timestamp values need not change as their point of reference remains + /// the same (the Unix epoch). + /// + /// However, if a Timestamp column has no timezone value, changing it to a + /// non-empty value requires thinking about the desired semantics. + /// One possibility is to assume that the original timestamp values are + /// relative to the epoch of the timezone being set; timestamp values should + /// then be adjusted to the Unix epoch (for example, changing the timezone from + /// empty to "Europe/Paris" would require converting the timestamp values + /// from "Europe/Paris" to "UTC", which seems counter-intuitive but is + /// nevertheless correct). 
Timestamp(TimeUnit, Option), /// A 32-bit date representing the elapsed time since UNIX epoch (1970-01-01) /// in days (32 bits). diff --git a/parquet/src/arrow/schema.rs b/parquet/src/arrow/schema.rs index 71ae11d089d..a65c7585327 100644 --- a/parquet/src/arrow/schema.rs +++ b/parquet/src/arrow/schema.rs @@ -301,14 +301,10 @@ fn arrow_to_parquet_type(field: &Field) -> Result { .build() } DataType::Timestamp(time_unit, tz) => { - let is_utc = tz - .as_ref() - .map(|tz| tz == "UTC" || tz == "+00:00" || tz == "-00:00") - .unwrap_or(false); - Type::primitive_type_builder(name, PhysicalType::INT64) .with_logical_type(Some(LogicalType::Timestamp { - is_adjusted_to_u_t_c: is_utc, + // If timezone set, values are normalized to UTC timezone + is_adjusted_to_u_t_c: matches!(tz, Some(z) if !z.as_str().is_empty()), unit: match time_unit { TimeUnit::Second => unreachable!(), TimeUnit::Millisecond => { @@ -1290,7 +1286,7 @@ mod tests { REQUIRED INT64 ts_micro_utc (TIMESTAMP(MICROS, true)); REQUIRED INT64 ts_millis_zero_offset (TIMESTAMP(MILLIS, true)); REQUIRED INT64 ts_millis_zero_negative_offset (TIMESTAMP(MILLIS, true)); - REQUIRED INT64 ts_micro_non_utc (TIMESTAMP(MICROS, false)); + REQUIRED INT64 ts_micro_non_utc (TIMESTAMP(MICROS, true)); REQUIRED GROUP struct { REQUIRED BOOLEAN bools; REQUIRED INT32 uint32 (INTEGER(32,false));