Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: convert Field::metadata to HashMap #3148

Merged
merged 2 commits into from Nov 21, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
8 changes: 4 additions & 4 deletions arrow-integration-test/src/field.rs
Expand Up @@ -18,7 +18,7 @@
use crate::{data_type_from_json, data_type_to_json};
use arrow::datatypes::{DataType, Field};
use arrow::error::{ArrowError, Result};
use std::collections::BTreeMap;
use std::collections::HashMap;

/// Parse a `Field` definition from a JSON representation.
pub fn field_from_json(json: &serde_json::Value) -> Result<Field> {
Expand Down Expand Up @@ -53,7 +53,7 @@ pub fn field_from_json(json: &serde_json::Value) -> Result<Field> {
// Referenced example file: testing/data/arrow-ipc-stream/integration/1.0.0-littleendian/generated_custom_metadata.json.gz
let metadata = match map.get("metadata") {
Some(&Value::Array(ref values)) => {
let mut res: BTreeMap<String, String> = BTreeMap::default();
let mut res: HashMap<String, String> = HashMap::default();
for value in values {
match value.as_object() {
Some(map) => {
Expand Down Expand Up @@ -92,7 +92,7 @@ pub fn field_from_json(json: &serde_json::Value) -> Result<Field> {
// We also support map format, because Schema's metadata supports this.
// See https://github.com/apache/arrow/pull/5907
Some(&Value::Object(ref values)) => {
let mut res: BTreeMap<String, String> = BTreeMap::default();
let mut res: HashMap<String, String> = HashMap::default();
for (k, v) in values {
if let Some(str_value) = v.as_str() {
res.insert(k.clone(), str_value.to_string().clone());
Expand All @@ -110,7 +110,7 @@ pub fn field_from_json(json: &serde_json::Value) -> Result<Field> {
"Field `metadata` is not json array".to_string(),
));
}
_ => BTreeMap::default(),
_ => HashMap::default(),
};

// if data_type is a struct or list, get its children
Expand Down
6 changes: 3 additions & 3 deletions arrow-ipc/src/convert.rs
Expand Up @@ -21,7 +21,7 @@ use arrow_schema::*;
use flatbuffers::{
FlatBufferBuilder, ForwardsUOffset, UnionWIPOffset, Vector, WIPOffset,
};
use std::collections::{BTreeMap, HashMap};
use std::collections::HashMap;

use crate::{size_prefixed_root_as_message, CONTINUATION_MARKER};
use DataType::*;
Expand Down Expand Up @@ -86,7 +86,7 @@ impl<'a> From<crate::Field<'a>> for Field {
)
};

let mut metadata_map = BTreeMap::default();
let mut metadata_map = HashMap::default();
if let Some(list) = field.custom_metadata() {
for kv in list {
if let (Some(k), Some(v)) = (kv.key(), kv.value()) {
Expand Down Expand Up @@ -812,7 +812,7 @@ mod tests {
.iter()
.cloned()
.collect();
let field_md: BTreeMap<String, String> = [("k".to_string(), "v".to_string())]
let field_md: HashMap<String, String> = [("k".to_string(), "v".to_string())]
.iter()
.cloned()
.collect();
Expand Down
6 changes: 3 additions & 3 deletions arrow-schema/src/datatype.rs
Expand Up @@ -381,18 +381,18 @@ mod tests {
#[test]
#[cfg(feature = "serde")]
fn serde_struct_type() {
use std::collections::BTreeMap;
use std::collections::HashMap;

let kv_array = [("k".to_string(), "v".to_string())];
let field_metadata: BTreeMap<String, String> = kv_array.iter().cloned().collect();
let field_metadata: HashMap<String, String> = kv_array.iter().cloned().collect();

// Non-empty map: should be converted as JSON obj { ... }
let first_name =
Field::new("first_name", DataType::Utf8, false).with_metadata(field_metadata);

// Empty map: should be omitted.
let last_name = Field::new("last_name", DataType::Utf8, false)
.with_metadata(BTreeMap::default());
.with_metadata(HashMap::default());

let person = DataType::Struct(vec![
first_name,
Expand Down
85 changes: 68 additions & 17 deletions arrow-schema/src/field.rs
Expand Up @@ -17,7 +17,7 @@

use crate::error::ArrowError;
use std::cmp::Ordering;
use std::collections::BTreeMap;
use std::collections::HashMap;
use std::hash::{Hash, Hasher};

use crate::datatype::DataType;
Expand All @@ -37,9 +37,9 @@ pub struct Field {
/// A map of key-value pairs containing additional custom meta data.
#[cfg_attr(
feature = "serde",
serde(skip_serializing_if = "BTreeMap::is_empty", default)
serde(skip_serializing_if = "HashMap::is_empty", default)
)]
metadata: BTreeMap<String, String>,
metadata: HashMap<String, String>,
}

// Auto-derive `PartialEq` traits will pull `dict_id` and `dict_is_ordered`
Expand Down Expand Up @@ -68,9 +68,33 @@ impl Ord for Field {
fn cmp(&self, other: &Self) -> Ordering {
self.name
.cmp(other.name())
.then(self.data_type.cmp(other.data_type()))
.then(self.nullable.cmp(&other.nullable))
.then(self.metadata.cmp(&other.metadata))
.then_with(|| self.data_type.cmp(other.data_type()))
.then_with(|| self.nullable.cmp(&other.nullable))
.then_with(|| {
// ensure deterministic key order
let mut keys: Vec<&String> =
self.metadata.keys().chain(other.metadata.keys()).collect();
keys.sort();
for k in keys {
match (self.metadata.get(k), other.metadata.get(k)) {
(None, None) => {}
(Some(_), None) => {
return Ordering::Less;
}
(None, Some(_)) => {
return Ordering::Greater;
}
(Some(v1), Some(v2)) => match v1.cmp(v2) {
Ordering::Equal => {}
other => {
return other;
}
},
}
}

Ordering::Equal
})
}
}

Expand All @@ -79,7 +103,14 @@ impl Hash for Field {
self.name.hash(state);
self.data_type.hash(state);
self.nullable.hash(state);
self.metadata.hash(state);

// ensure deterministic key order
let mut keys: Vec<&String> = self.metadata.keys().collect();
keys.sort();
for k in keys {
k.hash(state);
self.metadata.get(k).expect("key valid").hash(state);
}
}
}

Expand All @@ -92,7 +123,7 @@ impl Field {
nullable,
dict_id: 0,
dict_is_ordered: false,
metadata: BTreeMap::default(),
metadata: HashMap::default(),
}
}

Expand All @@ -110,29 +141,29 @@ impl Field {
nullable,
dict_id,
dict_is_ordered,
metadata: BTreeMap::default(),
metadata: HashMap::default(),
}
}

/// Sets the `Field`'s optional custom metadata.
/// The metadata is set as `None` for empty map.
#[inline]
pub fn set_metadata(&mut self, metadata: BTreeMap<String, String>) {
self.metadata = BTreeMap::default();
pub fn set_metadata(&mut self, metadata: HashMap<String, String>) {
self.metadata = HashMap::default();
if !metadata.is_empty() {
self.metadata = metadata;
}
}

/// Sets the metadata of this `Field` to be `metadata` and returns self
pub fn with_metadata(mut self, metadata: BTreeMap<String, String>) -> Self {
pub fn with_metadata(mut self, metadata: HashMap<String, String>) -> Self {
self.set_metadata(metadata);
self
}

/// Returns the immutable reference to the `Field`'s optional custom metadata.
#[inline]
pub const fn metadata(&self) -> &BTreeMap<String, String> {
pub const fn metadata(&self) -> &HashMap<String, String> {
&self.metadata
}

Expand Down Expand Up @@ -545,10 +576,30 @@ mod test {
assert_ne!(get_field_hash(&dict1), get_field_hash(&dict2));
}

#[test]
fn test_field_comparison_metadata() {
let f1 = Field::new("x", DataType::Binary, false).with_metadata(HashMap::from([
(String::from("k1"), String::from("v1")),
(String::from("k2"), String::from("v2")),
]));
let f2 = Field::new("x", DataType::Binary, false).with_metadata(HashMap::from([
(String::from("k1"), String::from("v1")),
(String::from("k3"), String::from("v3")),
]));
let f3 = Field::new("x", DataType::Binary, false).with_metadata(HashMap::from([
(String::from("k1"), String::from("v1")),
(String::from("k3"), String::from("v4")),
]));

assert!(f1.cmp(&f2).is_lt());
assert!(f2.cmp(&f3).is_lt());
assert!(f1.cmp(&f3).is_lt());
}

#[test]
fn test_contains_reflexivity() {
let mut field = Field::new("field1", DataType::Float16, false);
field.set_metadata(BTreeMap::from([
field.set_metadata(HashMap::from([
(String::from("k0"), String::from("v0")),
(String::from("k1"), String::from("v1")),
]));
Expand All @@ -560,14 +611,14 @@ mod test {
let child_field = Field::new("child1", DataType::Float16, false);

let mut field1 = Field::new("field1", DataType::Struct(vec![child_field]), false);
field1.set_metadata(BTreeMap::from([(String::from("k1"), String::from("v1"))]));
field1.set_metadata(HashMap::from([(String::from("k1"), String::from("v1"))]));

let mut field2 = Field::new("field1", DataType::Struct(vec![]), true);
field2.set_metadata(BTreeMap::from([(String::from("k2"), String::from("v2"))]));
field2.set_metadata(HashMap::from([(String::from("k2"), String::from("v2"))]));
field2.try_merge(&field1).unwrap();

let mut field3 = Field::new("field1", DataType::Struct(vec![]), false);
field3.set_metadata(BTreeMap::from([(String::from("k3"), String::from("v3"))]));
field3.set_metadata(HashMap::from([(String::from("k3"), String::from("v3"))]));
field3.try_merge(&field2).unwrap();

assert!(field2.contains(&field1));
Expand Down
23 changes: 10 additions & 13 deletions arrow-schema/src/schema.rs
Expand Up @@ -290,7 +290,6 @@ mod tests {
use super::*;
use crate::datatype::DataType;
use crate::{TimeUnit, UnionMode};
use std::collections::BTreeMap;

#[test]
#[cfg(feature = "serde")]
Expand Down Expand Up @@ -523,7 +522,7 @@ mod tests {

fn person_schema() -> Schema {
let kv_array = [("k".to_string(), "v".to_string())];
let field_metadata: BTreeMap<String, String> = kv_array.iter().cloned().collect();
let field_metadata: HashMap<String, String> = kv_array.iter().cloned().collect();
let first_name =
Field::new("first_name", DataType::Utf8, false).with_metadata(field_metadata);

Expand Down Expand Up @@ -551,18 +550,16 @@ mod tests {
#[test]
fn test_try_merge_field_with_metadata() {
// 1. Different values for the same key should cause error.
let metadata1: BTreeMap<String, String> =
[("foo".to_string(), "bar".to_string())]
.iter()
.cloned()
.collect();
let metadata1: HashMap<String, String> = [("foo".to_string(), "bar".to_string())]
.iter()
.cloned()
.collect();
let f1 = Field::new("first_name", DataType::Utf8, false).with_metadata(metadata1);

let metadata2: BTreeMap<String, String> =
[("foo".to_string(), "baz".to_string())]
.iter()
.cloned()
.collect();
let metadata2: HashMap<String, String> = [("foo".to_string(), "baz".to_string())]
.iter()
.cloned()
.collect();
let f2 = Field::new("first_name", DataType::Utf8, false).with_metadata(metadata2);

assert!(
Expand All @@ -572,7 +569,7 @@ mod tests {

// 2. None + Some
let mut f1 = Field::new("first_name", DataType::Utf8, false);
let metadata2: BTreeMap<String, String> =
let metadata2: HashMap<String, String> =
[("missing".to_string(), "value".to_string())]
.iter()
.cloned()
Expand Down
4 changes: 2 additions & 2 deletions parquet/src/arrow/schema/complex.rs
Expand Up @@ -15,7 +15,7 @@
// specific language governing permissions and limitations
// under the License.

use std::collections::BTreeMap;
use std::collections::{HashMap};
crepererum marked this conversation as resolved.
Show resolved Hide resolved

use crate::arrow::schema::primitive::convert_primitive;
use crate::arrow::ProjectionMask;
Expand Down Expand Up @@ -347,7 +347,7 @@ impl Visitor {
let value_field = convert_field(map_value, &value, arrow_value);
let field_metadata = match arrow_map {
Some(field) => field.metadata().clone(),
_ => BTreeMap::default(),
_ => HashMap::default(),
};

let map_field = Field::new(
Expand Down