From d1b7e9321fb109be00e9cbea5eb32053381f5696 Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Mon, 21 Nov 2022 16:03:47 +0100 Subject: [PATCH] refactor: convert `Field::metadata` to `HashMap` Closes #2262. --- arrow-integration-test/src/field.rs | 8 ++-- arrow-ipc/src/convert.rs | 6 +-- arrow-schema/src/field.rs | 65 +++++++++++++++++++++-------- arrow-schema/src/schema.rs | 23 +++++----- parquet/src/arrow/schema/complex.rs | 4 +- 5 files changed, 67 insertions(+), 39 deletions(-) diff --git a/arrow-integration-test/src/field.rs b/arrow-integration-test/src/field.rs index 5b586355709..4bfbf8e9912 100644 --- a/arrow-integration-test/src/field.rs +++ b/arrow-integration-test/src/field.rs @@ -18,7 +18,7 @@ use crate::{data_type_from_json, data_type_to_json}; use arrow::datatypes::{DataType, Field}; use arrow::error::{ArrowError, Result}; -use std::collections::BTreeMap; +use std::collections::HashMap; /// Parse a `Field` definition from a JSON representation. pub fn field_from_json(json: &serde_json::Value) -> Result { @@ -53,7 +53,7 @@ pub fn field_from_json(json: &serde_json::Value) -> Result { // Referenced example file: testing/data/arrow-ipc-stream/integration/1.0.0-littleendian/generated_custom_metadata.json.gz let metadata = match map.get("metadata") { Some(&Value::Array(ref values)) => { - let mut res: BTreeMap = BTreeMap::default(); + let mut res: HashMap = HashMap::default(); for value in values { match value.as_object() { Some(map) => { @@ -92,7 +92,7 @@ pub fn field_from_json(json: &serde_json::Value) -> Result { // We also support map format, because Schema's metadata supports this. // See https://github.com/apache/arrow/pull/5907 Some(&Value::Object(ref values)) => { - let mut res: BTreeMap = BTreeMap::default(); + let mut res: HashMap = HashMap::default(); for (k, v) in values { if let Some(str_value) = v.as_str() { res.insert(k.clone(), str_value.to_string().clone()); @@ -110,7 +110,7 @@ pub fn field_from_json(json: &serde_json::Value) -> Result { "Field `metadata` is not json array".to_string(), )); } - _ => BTreeMap::default(), + _ => HashMap::default(), }; // if data_type is a struct or list, get its children diff --git a/arrow-ipc/src/convert.rs b/arrow-ipc/src/convert.rs index a9dda6f2a1f..e11d64a473d 100644 --- a/arrow-ipc/src/convert.rs +++ b/arrow-ipc/src/convert.rs @@ -21,7 +21,7 @@ use arrow_schema::*; use flatbuffers::{ FlatBufferBuilder, ForwardsUOffset, UnionWIPOffset, Vector, WIPOffset, }; -use std::collections::{BTreeMap, HashMap}; +use std::collections::HashMap; use crate::{size_prefixed_root_as_message, CONTINUATION_MARKER}; use DataType::*; @@ -86,7 +86,7 @@ impl<'a> From> for Field { ) }; - let mut metadata_map = BTreeMap::default(); + let mut metadata_map = HashMap::default(); if let Some(list) = field.custom_metadata() { for kv in list { if let (Some(k), Some(v)) = (kv.key(), kv.value()) { @@ -812,7 +812,7 @@ mod tests { .iter() .cloned() .collect(); - let field_md: BTreeMap = [("k".to_string(), "v".to_string())] + let field_md: HashMap = [("k".to_string(), "v".to_string())] .iter() .cloned() .collect(); diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index b1de65e557f..21c0a65f4fb 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -17,7 +17,7 @@ use crate::error::ArrowError; use std::cmp::Ordering; -use std::collections::BTreeMap; +use std::collections::HashMap; use std::hash::{Hash, Hasher}; use crate::datatype::DataType; @@ -37,9 +37,9 @@ pub struct Field { /// A map of key-value pairs containing additional custom meta data. #[cfg_attr( feature = "serde", - serde(skip_serializing_if = "BTreeMap::is_empty", default) + serde(skip_serializing_if = "HashMap::is_empty", default) )] - metadata: BTreeMap, + metadata: HashMap, } // Auto-derive `PartialEq` traits will pull `dict_id` and `dict_is_ordered` @@ -68,9 +68,33 @@ impl Ord for Field { fn cmp(&self, other: &Self) -> Ordering { self.name .cmp(other.name()) - .then(self.data_type.cmp(other.data_type())) - .then(self.nullable.cmp(&other.nullable)) - .then(self.metadata.cmp(&other.metadata)) + .then_with(|| self.data_type.cmp(other.data_type())) + .then_with(|| self.nullable.cmp(&other.nullable)) + .then_with(|| { + // ensure deterministic key order + let mut keys: Vec<&String> = + self.metadata.keys().chain(other.metadata.keys()).collect(); + keys.sort(); + for k in keys { + match (self.metadata.get(k), other.metadata.get(k)) { + (None, None) => {} + (Some(_), None) => { + return Ordering::Greater; + } + (None, Some(_)) => { + return Ordering::Less; + } + (Some(v1), Some(v2)) => match v1.cmp(v2) { + Ordering::Equal => {} + other => { + return other; + } + }, + } + } + + Ordering::Equal + }) } } @@ -79,7 +103,14 @@ impl Hash for Field { self.name.hash(state); self.data_type.hash(state); self.nullable.hash(state); - self.metadata.hash(state); + + // ensure deterministic key order + let mut keys: Vec<&String> = self.metadata.keys().collect(); + keys.sort(); + for k in keys { + k.hash(state); + self.metadata.get(k).expect("key valid").hash(state); + } } } @@ -92,7 +123,7 @@ impl Field { nullable, dict_id: 0, dict_is_ordered: false, - metadata: BTreeMap::default(), + metadata: HashMap::default(), } } @@ -110,29 +141,29 @@ impl Field { nullable, dict_id, dict_is_ordered, - metadata: BTreeMap::default(), + metadata: HashMap::default(), } } /// Sets the `Field`'s optional custom metadata. /// The metadata is set as `None` for empty map. #[inline] - pub fn set_metadata(&mut self, metadata: BTreeMap) { - self.metadata = BTreeMap::default(); + pub fn set_metadata(&mut self, metadata: HashMap) { + self.metadata = HashMap::default(); if !metadata.is_empty() { self.metadata = metadata; } } /// Sets the metadata of this `Field` to be `metadata` and returns self - pub fn with_metadata(mut self, metadata: BTreeMap) -> Self { + pub fn with_metadata(mut self, metadata: HashMap) -> Self { self.set_metadata(metadata); self } /// Returns the immutable reference to the `Field`'s optional custom metadata. #[inline] - pub const fn metadata(&self) -> &BTreeMap { + pub const fn metadata(&self) -> &HashMap { &self.metadata } @@ -548,7 +579,7 @@ mod test { #[test] fn test_contains_reflexivity() { let mut field = Field::new("field1", DataType::Float16, false); - field.set_metadata(BTreeMap::from([ + field.set_metadata(HashMap::from([ (String::from("k0"), String::from("v0")), (String::from("k1"), String::from("v1")), ])); @@ -560,14 +591,14 @@ mod test { let child_field = Field::new("child1", DataType::Float16, false); let mut field1 = Field::new("field1", DataType::Struct(vec![child_field]), false); - field1.set_metadata(BTreeMap::from([(String::from("k1"), String::from("v1"))])); + field1.set_metadata(HashMap::from([(String::from("k1"), String::from("v1"))])); let mut field2 = Field::new("field1", DataType::Struct(vec![]), true); - field2.set_metadata(BTreeMap::from([(String::from("k2"), String::from("v2"))])); + field2.set_metadata(HashMap::from([(String::from("k2"), String::from("v2"))])); field2.try_merge(&field1).unwrap(); let mut field3 = Field::new("field1", DataType::Struct(vec![]), false); - field3.set_metadata(BTreeMap::from([(String::from("k3"), String::from("v3"))])); + field3.set_metadata(HashMap::from([(String::from("k3"), String::from("v3"))])); field3.try_merge(&field2).unwrap(); assert!(field2.contains(&field1)); diff --git a/arrow-schema/src/schema.rs b/arrow-schema/src/schema.rs index 519a8e089ae..8ff40866d51 100644 --- a/arrow-schema/src/schema.rs +++ b/arrow-schema/src/schema.rs @@ -290,7 +290,6 @@ mod tests { use super::*; use crate::datatype::DataType; use crate::{TimeUnit, UnionMode}; - use std::collections::BTreeMap; #[test] #[cfg(feature = "serde")] @@ -523,7 +522,7 @@ mod tests { fn person_schema() -> Schema { let kv_array = [("k".to_string(), "v".to_string())]; - let field_metadata: BTreeMap = kv_array.iter().cloned().collect(); + let field_metadata: HashMap = kv_array.iter().cloned().collect(); let first_name = Field::new("first_name", DataType::Utf8, false).with_metadata(field_metadata); @@ -551,18 +550,16 @@ mod tests { #[test] fn test_try_merge_field_with_metadata() { // 1. Different values for the same key should cause error. - let metadata1: BTreeMap = - [("foo".to_string(), "bar".to_string())] - .iter() - .cloned() - .collect(); + let metadata1: HashMap = [("foo".to_string(), "bar".to_string())] + .iter() + .cloned() + .collect(); let f1 = Field::new("first_name", DataType::Utf8, false).with_metadata(metadata1); - let metadata2: BTreeMap = - [("foo".to_string(), "baz".to_string())] - .iter() - .cloned() - .collect(); + let metadata2: HashMap = [("foo".to_string(), "baz".to_string())] + .iter() + .cloned() + .collect(); let f2 = Field::new("first_name", DataType::Utf8, false).with_metadata(metadata2); assert!( @@ -572,7 +569,7 @@ mod tests { // 2. None + Some let mut f1 = Field::new("first_name", DataType::Utf8, false); - let metadata2: BTreeMap = + let metadata2: HashMap = [("missing".to_string(), "value".to_string())] .iter() .cloned() diff --git a/parquet/src/arrow/schema/complex.rs b/parquet/src/arrow/schema/complex.rs index 4ff9c7a3956..8cc23db517e 100644 --- a/parquet/src/arrow/schema/complex.rs +++ b/parquet/src/arrow/schema/complex.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use std::collections::BTreeMap; +use std::collections::{HashMap}; use crate::arrow::schema::primitive::convert_primitive; use crate::arrow::ProjectionMask; @@ -347,7 +347,7 @@ impl Visitor { let value_field = convert_field(map_value, &value, arrow_value); let field_metadata = match arrow_map { Some(field) => field.metadata().clone(), - _ => BTreeMap::default(), + _ => HashMap::default(), }; let map_field = Field::new(