From 4588632a3882280c96e100a955282c3e681d4aa7 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Tue, 13 Sep 2022 18:50:45 +0100 Subject: [PATCH 1/2] Move JSON Test Format To integration-testing --- arrow/src/datatypes/datatype.rs | 344 ------- arrow/src/datatypes/field.rs | 277 ------ arrow/src/datatypes/mod.rs | 938 +----------------- arrow/src/datatypes/schema.rs | 81 -- integration-testing/src/lib.rs | 2 +- integration-testing/src/util/datatype.rs | 383 +++++++ integration-testing/src/util/field.rs | 569 +++++++++++ .../src/{util.rs => util/mod.rs} | 14 +- integration-testing/src/util/schema.rs | 716 +++++++++++++ 9 files changed, 1682 insertions(+), 1642 deletions(-) create mode 100644 integration-testing/src/util/datatype.rs create mode 100644 integration-testing/src/util/field.rs rename integration-testing/src/{util.rs => util/mod.rs} (99%) create mode 100644 integration-testing/src/util/schema.rs diff --git a/arrow/src/datatypes/datatype.rs b/arrow/src/datatypes/datatype.rs index b65bfd7725a..be8ef8061c0 100644 --- a/arrow/src/datatypes/datatype.rs +++ b/arrow/src/datatypes/datatype.rs @@ -1052,350 +1052,6 @@ pub(crate) fn validate_decimal256_precision_with_lt_bytes( } impl DataType { - /// Parse a data type from a JSON representation. - #[cfg(feature = "json")] - pub(crate) fn from(json: &serde_json::Value) -> Result { - use serde_json::Value; - let default_field = Field::new("", DataType::Boolean, true); - match *json { - Value::Object(ref map) => match map.get("name") { - Some(s) if s == "null" => Ok(DataType::Null), - Some(s) if s == "bool" => Ok(DataType::Boolean), - Some(s) if s == "binary" => Ok(DataType::Binary), - Some(s) if s == "largebinary" => Ok(DataType::LargeBinary), - Some(s) if s == "utf8" => Ok(DataType::Utf8), - Some(s) if s == "largeutf8" => Ok(DataType::LargeUtf8), - Some(s) if s == "fixedsizebinary" => { - // return a list with any type as its child isn't defined in the map - if let Some(Value::Number(size)) = map.get("byteWidth") { - Ok(DataType::FixedSizeBinary(size.as_i64().unwrap() as i32)) - } else { - Err(ArrowError::ParseError( - "Expecting a byteWidth for fixedsizebinary".to_string(), - )) - } - } - Some(s) if s == "decimal" => { - // return a list with any type as its child isn't defined in the map - let precision = match map.get("precision") { - Some(p) => Ok(p.as_u64().unwrap().try_into().unwrap()), - None => Err(ArrowError::ParseError( - "Expecting a precision for decimal".to_string(), - )), - }?; - let scale = match map.get("scale") { - Some(s) => Ok(s.as_u64().unwrap().try_into().unwrap()), - _ => Err(ArrowError::ParseError( - "Expecting a scale for decimal".to_string(), - )), - }?; - let bit_width: usize = match map.get("bitWidth") { - Some(b) => b.as_u64().unwrap() as usize, - _ => 128, // Default bit width - }; - - if bit_width == 128 { - Ok(DataType::Decimal128(precision, scale)) - } else if bit_width == 256 { - Ok(DataType::Decimal256(precision, scale)) - } else { - Err(ArrowError::ParseError( - "Decimal bit_width invalid".to_string(), - )) - } - } - Some(s) if s == "floatingpoint" => match map.get("precision") { - Some(p) if p == "HALF" => Ok(DataType::Float16), - Some(p) if p == "SINGLE" => Ok(DataType::Float32), - Some(p) if p == "DOUBLE" => Ok(DataType::Float64), - _ => Err(ArrowError::ParseError( - "floatingpoint precision missing or invalid".to_string(), - )), - }, - Some(s) if s == "timestamp" => { - let unit = match map.get("unit") { - Some(p) if p == "SECOND" => Ok(TimeUnit::Second), - Some(p) if p == "MILLISECOND" => Ok(TimeUnit::Millisecond), - Some(p) if p == "MICROSECOND" => Ok(TimeUnit::Microsecond), - Some(p) if p == "NANOSECOND" => Ok(TimeUnit::Nanosecond), - _ => Err(ArrowError::ParseError( - "timestamp unit missing or invalid".to_string(), - )), - }; - let tz = match map.get("timezone") { - None => Ok(None), - Some(serde_json::Value::String(tz)) => Ok(Some(tz.clone())), - _ => Err(ArrowError::ParseError( - "timezone must be a string".to_string(), - )), - }; - Ok(DataType::Timestamp(unit?, tz?)) - } - Some(s) if s == "date" => match map.get("unit") { - Some(p) if p == "DAY" => Ok(DataType::Date32), - Some(p) if p == "MILLISECOND" => Ok(DataType::Date64), - _ => Err(ArrowError::ParseError( - "date unit missing or invalid".to_string(), - )), - }, - Some(s) if s == "time" => { - let unit = match map.get("unit") { - Some(p) if p == "SECOND" => Ok(TimeUnit::Second), - Some(p) if p == "MILLISECOND" => Ok(TimeUnit::Millisecond), - Some(p) if p == "MICROSECOND" => Ok(TimeUnit::Microsecond), - Some(p) if p == "NANOSECOND" => Ok(TimeUnit::Nanosecond), - _ => Err(ArrowError::ParseError( - "time unit missing or invalid".to_string(), - )), - }; - match map.get("bitWidth") { - Some(p) if p == 32 => Ok(DataType::Time32(unit?)), - Some(p) if p == 64 => Ok(DataType::Time64(unit?)), - _ => Err(ArrowError::ParseError( - "time bitWidth missing or invalid".to_string(), - )), - } - } - Some(s) if s == "duration" => match map.get("unit") { - Some(p) if p == "SECOND" => Ok(DataType::Duration(TimeUnit::Second)), - Some(p) if p == "MILLISECOND" => { - Ok(DataType::Duration(TimeUnit::Millisecond)) - } - Some(p) if p == "MICROSECOND" => { - Ok(DataType::Duration(TimeUnit::Microsecond)) - } - Some(p) if p == "NANOSECOND" => { - Ok(DataType::Duration(TimeUnit::Nanosecond)) - } - _ => Err(ArrowError::ParseError( - "time unit missing or invalid".to_string(), - )), - }, - Some(s) if s == "interval" => match map.get("unit") { - Some(p) if p == "DAY_TIME" => { - Ok(DataType::Interval(IntervalUnit::DayTime)) - } - Some(p) if p == "YEAR_MONTH" => { - Ok(DataType::Interval(IntervalUnit::YearMonth)) - } - Some(p) if p == "MONTH_DAY_NANO" => { - Ok(DataType::Interval(IntervalUnit::MonthDayNano)) - } - _ => Err(ArrowError::ParseError( - "interval unit missing or invalid".to_string(), - )), - }, - Some(s) if s == "int" => match map.get("isSigned") { - Some(&Value::Bool(true)) => match map.get("bitWidth") { - Some(&Value::Number(ref n)) => match n.as_u64() { - Some(8) => Ok(DataType::Int8), - Some(16) => Ok(DataType::Int16), - Some(32) => Ok(DataType::Int32), - Some(64) => Ok(DataType::Int64), - _ => Err(ArrowError::ParseError( - "int bitWidth missing or invalid".to_string(), - )), - }, - _ => Err(ArrowError::ParseError( - "int bitWidth missing or invalid".to_string(), - )), - }, - Some(&Value::Bool(false)) => match map.get("bitWidth") { - Some(&Value::Number(ref n)) => match n.as_u64() { - Some(8) => Ok(DataType::UInt8), - Some(16) => Ok(DataType::UInt16), - Some(32) => Ok(DataType::UInt32), - Some(64) => Ok(DataType::UInt64), - _ => Err(ArrowError::ParseError( - "int bitWidth missing or invalid".to_string(), - )), - }, - _ => Err(ArrowError::ParseError( - "int bitWidth missing or invalid".to_string(), - )), - }, - _ => Err(ArrowError::ParseError( - "int signed missing or invalid".to_string(), - )), - }, - Some(s) if s == "list" => { - // return a list with any type as its child isn't defined in the map - Ok(DataType::List(Box::new(default_field))) - } - Some(s) if s == "largelist" => { - // return a largelist with any type as its child isn't defined in the map - Ok(DataType::LargeList(Box::new(default_field))) - } - Some(s) if s == "fixedsizelist" => { - // return a list with any type as its child isn't defined in the map - if let Some(Value::Number(size)) = map.get("listSize") { - Ok(DataType::FixedSizeList( - Box::new(default_field), - size.as_i64().unwrap() as i32, - )) - } else { - Err(ArrowError::ParseError( - "Expecting a listSize for fixedsizelist".to_string(), - )) - } - } - Some(s) if s == "struct" => { - // return an empty `struct` type as its children aren't defined in the map - Ok(DataType::Struct(vec![])) - } - Some(s) if s == "map" => { - if let Some(Value::Bool(keys_sorted)) = map.get("keysSorted") { - // Return a map with an empty type as its children aren't defined in the map - Ok(DataType::Map(Box::new(default_field), *keys_sorted)) - } else { - Err(ArrowError::ParseError( - "Expecting a keysSorted for map".to_string(), - )) - } - } - Some(s) if s == "union" => { - if let Some(Value::String(mode)) = map.get("mode") { - let union_mode = if mode == "SPARSE" { - UnionMode::Sparse - } else if mode == "DENSE" { - UnionMode::Dense - } else { - return Err(ArrowError::ParseError(format!( - "Unknown union mode {:?} for union", - mode - ))); - }; - if let Some(type_ids) = map.get("typeIds") { - let type_ids = type_ids - .as_array() - .unwrap() - .iter() - .map(|t| t.as_i64().unwrap() as i8) - .collect::>(); - - let default_fields = type_ids - .iter() - .map(|_| default_field.clone()) - .collect::>(); - - Ok(DataType::Union(default_fields, type_ids, union_mode)) - } else { - Err(ArrowError::ParseError( - "Expecting a typeIds for union ".to_string(), - )) - } - } else { - Err(ArrowError::ParseError( - "Expecting a mode for union".to_string(), - )) - } - } - Some(other) => Err(ArrowError::ParseError(format!( - "invalid or unsupported type name: {} in {:?}", - other, json - ))), - None => Err(ArrowError::ParseError("type name missing".to_string())), - }, - _ => Err(ArrowError::ParseError( - "invalid json value type".to_string(), - )), - } - } - - /// Generate a JSON representation of the data type. - #[cfg(feature = "json")] - pub fn to_json(&self) -> serde_json::Value { - use serde_json::json; - match self { - DataType::Null => json!({"name": "null"}), - DataType::Boolean => json!({"name": "bool"}), - DataType::Int8 => json!({"name": "int", "bitWidth": 8, "isSigned": true}), - DataType::Int16 => json!({"name": "int", "bitWidth": 16, "isSigned": true}), - DataType::Int32 => json!({"name": "int", "bitWidth": 32, "isSigned": true}), - DataType::Int64 => json!({"name": "int", "bitWidth": 64, "isSigned": true}), - DataType::UInt8 => json!({"name": "int", "bitWidth": 8, "isSigned": false}), - DataType::UInt16 => json!({"name": "int", "bitWidth": 16, "isSigned": false}), - DataType::UInt32 => json!({"name": "int", "bitWidth": 32, "isSigned": false}), - DataType::UInt64 => json!({"name": "int", "bitWidth": 64, "isSigned": false}), - DataType::Float16 => json!({"name": "floatingpoint", "precision": "HALF"}), - DataType::Float32 => json!({"name": "floatingpoint", "precision": "SINGLE"}), - DataType::Float64 => json!({"name": "floatingpoint", "precision": "DOUBLE"}), - DataType::Utf8 => json!({"name": "utf8"}), - DataType::LargeUtf8 => json!({"name": "largeutf8"}), - DataType::Binary => json!({"name": "binary"}), - DataType::LargeBinary => json!({"name": "largebinary"}), - DataType::FixedSizeBinary(byte_width) => { - json!({"name": "fixedsizebinary", "byteWidth": byte_width}) - } - DataType::Struct(_) => json!({"name": "struct"}), - DataType::Union(_, _, _) => json!({"name": "union"}), - DataType::List(_) => json!({ "name": "list"}), - DataType::LargeList(_) => json!({ "name": "largelist"}), - DataType::FixedSizeList(_, length) => { - json!({"name":"fixedsizelist", "listSize": length}) - } - DataType::Time32(unit) => { - json!({"name": "time", "bitWidth": 32, "unit": match unit { - TimeUnit::Second => "SECOND", - TimeUnit::Millisecond => "MILLISECOND", - TimeUnit::Microsecond => "MICROSECOND", - TimeUnit::Nanosecond => "NANOSECOND", - }}) - } - DataType::Time64(unit) => { - json!({"name": "time", "bitWidth": 64, "unit": match unit { - TimeUnit::Second => "SECOND", - TimeUnit::Millisecond => "MILLISECOND", - TimeUnit::Microsecond => "MICROSECOND", - TimeUnit::Nanosecond => "NANOSECOND", - }}) - } - DataType::Date32 => { - json!({"name": "date", "unit": "DAY"}) - } - DataType::Date64 => { - json!({"name": "date", "unit": "MILLISECOND"}) - } - DataType::Timestamp(unit, None) => { - json!({"name": "timestamp", "unit": match unit { - TimeUnit::Second => "SECOND", - TimeUnit::Millisecond => "MILLISECOND", - TimeUnit::Microsecond => "MICROSECOND", - TimeUnit::Nanosecond => "NANOSECOND", - }}) - } - DataType::Timestamp(unit, Some(tz)) => { - json!({"name": "timestamp", "unit": match unit { - TimeUnit::Second => "SECOND", - TimeUnit::Millisecond => "MILLISECOND", - TimeUnit::Microsecond => "MICROSECOND", - TimeUnit::Nanosecond => "NANOSECOND", - }, "timezone": tz}) - } - DataType::Interval(unit) => json!({"name": "interval", "unit": match unit { - IntervalUnit::YearMonth => "YEAR_MONTH", - IntervalUnit::DayTime => "DAY_TIME", - IntervalUnit::MonthDayNano => "MONTH_DAY_NANO", - }}), - DataType::Duration(unit) => json!({"name": "duration", "unit": match unit { - TimeUnit::Second => "SECOND", - TimeUnit::Millisecond => "MILLISECOND", - TimeUnit::Microsecond => "MICROSECOND", - TimeUnit::Nanosecond => "NANOSECOND", - }}), - DataType::Dictionary(_, _) => json!({ "name": "dictionary"}), - DataType::Decimal128(precision, scale) => { - json!({"name": "decimal", "precision": precision, "scale": scale, "bitWidth": 128}) - } - DataType::Decimal256(precision, scale) => { - json!({"name": "decimal", "precision": precision, "scale": scale, "bitWidth": 256}) - } - DataType::Map(_, keys_sorted) => { - json!({"name": "map", "keysSorted": keys_sorted}) - } - } - } - /// Returns true if this type is numeric: (UInt*, Int*, or Float*). pub fn is_numeric(t: &DataType) -> bool { use DataType::*; diff --git a/arrow/src/datatypes/field.rs b/arrow/src/datatypes/field.rs index ac966cafe34..03d07807743 100644 --- a/arrow/src/datatypes/field.rs +++ b/arrow/src/datatypes/field.rs @@ -250,283 +250,6 @@ impl Field { } } - /// Parse a `Field` definition from a JSON representation. - #[cfg(feature = "json")] - pub fn from(json: &serde_json::Value) -> Result { - use serde_json::Value; - match *json { - Value::Object(ref map) => { - let name = match map.get("name") { - Some(&Value::String(ref name)) => name.to_string(), - _ => { - return Err(ArrowError::ParseError( - "Field missing 'name' attribute".to_string(), - )); - } - }; - let nullable = match map.get("nullable") { - Some(&Value::Bool(b)) => b, - _ => { - return Err(ArrowError::ParseError( - "Field missing 'nullable' attribute".to_string(), - )); - } - }; - let data_type = match map.get("type") { - Some(t) => DataType::from(t)?, - _ => { - return Err(ArrowError::ParseError( - "Field missing 'type' attribute".to_string(), - )); - } - }; - - // Referenced example file: testing/data/arrow-ipc-stream/integration/1.0.0-littleendian/generated_custom_metadata.json.gz - let metadata = match map.get("metadata") { - Some(&Value::Array(ref values)) => { - let mut res: BTreeMap = BTreeMap::new(); - for value in values { - match value.as_object() { - Some(map) => { - if map.len() != 2 { - return Err(ArrowError::ParseError( - "Field 'metadata' must have exact two entries for each key-value map".to_string(), - )); - } - if let (Some(k), Some(v)) = - (map.get("key"), map.get("value")) - { - if let (Some(k_str), Some(v_str)) = - (k.as_str(), v.as_str()) - { - res.insert( - k_str.to_string().clone(), - v_str.to_string().clone(), - ); - } else { - return Err(ArrowError::ParseError("Field 'metadata' must have map value of string type".to_string())); - } - } else { - return Err(ArrowError::ParseError("Field 'metadata' lacks map keys named \"key\" or \"value\"".to_string())); - } - } - _ => { - return Err(ArrowError::ParseError( - "Field 'metadata' contains non-object key-value pair".to_string(), - )); - } - } - } - Some(res) - } - // We also support map format, because Schema's metadata supports this. - // See https://github.com/apache/arrow/pull/5907 - Some(&Value::Object(ref values)) => { - let mut res: BTreeMap = BTreeMap::new(); - for (k, v) in values { - if let Some(str_value) = v.as_str() { - res.insert(k.clone(), str_value.to_string().clone()); - } else { - return Err(ArrowError::ParseError( - format!("Field 'metadata' contains non-string value for key {}", k), - )); - } - } - Some(res) - } - Some(_) => { - return Err(ArrowError::ParseError( - "Field `metadata` is not json array".to_string(), - )); - } - _ => None, - }; - - // if data_type is a struct or list, get its children - let data_type = match data_type { - DataType::List(_) - | DataType::LargeList(_) - | DataType::FixedSizeList(_, _) => match map.get("children") { - Some(Value::Array(values)) => { - if values.len() != 1 { - return Err(ArrowError::ParseError( - "Field 'children' must have one element for a list data type".to_string(), - )); - } - match data_type { - DataType::List(_) => { - DataType::List(Box::new(Self::from(&values[0])?)) - } - DataType::LargeList(_) => { - DataType::LargeList(Box::new(Self::from(&values[0])?)) - } - DataType::FixedSizeList(_, int) => DataType::FixedSizeList( - Box::new(Self::from(&values[0])?), - int, - ), - _ => unreachable!( - "Data type should be a list, largelist or fixedsizelist" - ), - } - } - Some(_) => { - return Err(ArrowError::ParseError( - "Field 'children' must be an array".to_string(), - )) - } - None => { - return Err(ArrowError::ParseError( - "Field missing 'children' attribute".to_string(), - )); - } - }, - DataType::Struct(mut fields) => match map.get("children") { - Some(Value::Array(values)) => { - let struct_fields: Result> = - values.iter().map(Field::from).collect(); - fields.append(&mut struct_fields?); - DataType::Struct(fields) - } - Some(_) => { - return Err(ArrowError::ParseError( - "Field 'children' must be an array".to_string(), - )) - } - None => { - return Err(ArrowError::ParseError( - "Field missing 'children' attribute".to_string(), - )); - } - }, - DataType::Map(_, keys_sorted) => { - match map.get("children") { - Some(Value::Array(values)) if values.len() == 1 => { - let child = Self::from(&values[0])?; - // child must be a struct - match child.data_type() { - DataType::Struct(map_fields) if map_fields.len() == 2 => { - DataType::Map(Box::new(child), keys_sorted) - } - t => { - return Err(ArrowError::ParseError( - format!("Map children should be a struct with 2 fields, found {:?}", t) - )) - } - } - } - Some(_) => { - return Err(ArrowError::ParseError( - "Field 'children' must be an array with 1 element" - .to_string(), - )) - } - None => { - return Err(ArrowError::ParseError( - "Field missing 'children' attribute".to_string(), - )); - } - } - } - DataType::Union(_, type_ids, mode) => match map.get("children") { - Some(Value::Array(values)) => { - let union_fields: Vec = - values.iter().map(Field::from).collect::>()?; - DataType::Union(union_fields, type_ids, mode) - } - Some(_) => { - return Err(ArrowError::ParseError( - "Field 'children' must be an array".to_string(), - )) - } - None => { - return Err(ArrowError::ParseError( - "Field missing 'children' attribute".to_string(), - )); - } - }, - _ => data_type, - }; - - let mut dict_id = 0; - let mut dict_is_ordered = false; - - let data_type = match map.get("dictionary") { - Some(dictionary) => { - let index_type = match dictionary.get("indexType") { - Some(t) => DataType::from(t)?, - _ => { - return Err(ArrowError::ParseError( - "Field missing 'indexType' attribute".to_string(), - )); - } - }; - dict_id = match dictionary.get("id") { - Some(Value::Number(n)) => n.as_i64().unwrap(), - _ => { - return Err(ArrowError::ParseError( - "Field missing 'id' attribute".to_string(), - )); - } - }; - dict_is_ordered = match dictionary.get("isOrdered") { - Some(&Value::Bool(n)) => n, - _ => { - return Err(ArrowError::ParseError( - "Field missing 'isOrdered' attribute".to_string(), - )); - } - }; - DataType::Dictionary(Box::new(index_type), Box::new(data_type)) - } - _ => data_type, - }; - Ok(Field { - name, - data_type, - nullable, - dict_id, - dict_is_ordered, - metadata, - }) - } - _ => Err(ArrowError::ParseError( - "Invalid json value type for field".to_string(), - )), - } - } - - /// Generate a JSON representation of the `Field`. - #[cfg(feature = "json")] - pub fn to_json(&self) -> serde_json::Value { - let children: Vec = match self.data_type() { - DataType::Struct(fields) => fields.iter().map(|f| f.to_json()).collect(), - DataType::List(field) - | DataType::LargeList(field) - | DataType::FixedSizeList(field, _) - | DataType::Map(field, _) => vec![field.to_json()], - _ => vec![], - }; - match self.data_type() { - DataType::Dictionary(ref index_type, ref value_type) => serde_json::json!({ - "name": self.name, - "nullable": self.nullable, - "type": value_type.to_json(), - "children": children, - "dictionary": { - "id": self.dict_id, - "indexType": index_type.to_json(), - "isOrdered": self.dict_is_ordered - } - }), - _ => serde_json::json!({ - "name": self.name, - "nullable": self.nullable, - "type": self.data_type.to_json(), - "children": children - }), - } - } - /// Merge this field into self if it is compatible. /// /// Struct fields are merged recursively. diff --git a/arrow/src/datatypes/mod.rs b/arrow/src/datatypes/mod.rs index 38b6c7bf974..1586d563cd3 100644 --- a/arrow/src/datatypes/mod.rs +++ b/arrow/src/datatypes/mod.rs @@ -57,7 +57,7 @@ mod tests { #[cfg(feature = "json")] use serde_json::{ - Number, Value, + Number, Value::{Bool, Number as VNumber, String as VString}, }; @@ -174,942 +174,6 @@ mod tests { assert_eq!(person, deserialized); } - #[test] - #[cfg(feature = "json")] - fn struct_field_to_json() { - let f = Field::new( - "address", - DataType::Struct(vec![ - Field::new("street", DataType::Utf8, false), - Field::new("zip", DataType::UInt16, false), - ]), - false, - ); - let value: Value = serde_json::from_str( - r#"{ - "name": "address", - "nullable": false, - "type": { - "name": "struct" - }, - "children": [ - { - "name": "street", - "nullable": false, - "type": { - "name": "utf8" - }, - "children": [] - }, - { - "name": "zip", - "nullable": false, - "type": { - "name": "int", - "bitWidth": 16, - "isSigned": false - }, - "children": [] - } - ] - }"#, - ) - .unwrap(); - assert_eq!(value, f.to_json()); - } - - #[test] - #[cfg(feature = "json")] - fn map_field_to_json() { - let f = Field::new( - "my_map", - DataType::Map( - Box::new(Field::new( - "my_entries", - DataType::Struct(vec![ - Field::new("my_keys", DataType::Utf8, false), - Field::new("my_values", DataType::UInt16, true), - ]), - false, - )), - true, - ), - false, - ); - let value: Value = serde_json::from_str( - r#"{ - "name": "my_map", - "nullable": false, - "type": { - "name": "map", - "keysSorted": true - }, - "children": [ - { - "name": "my_entries", - "nullable": false, - "type": { - "name": "struct" - }, - "children": [ - { - "name": "my_keys", - "nullable": false, - "type": { - "name": "utf8" - }, - "children": [] - }, - { - "name": "my_values", - "nullable": true, - "type": { - "name": "int", - "bitWidth": 16, - "isSigned": false - }, - "children": [] - } - ] - } - ] - }"#, - ) - .unwrap(); - assert_eq!(value, f.to_json()); - } - - #[test] - #[cfg(feature = "json")] - fn primitive_field_to_json() { - let f = Field::new("first_name", DataType::Utf8, false); - let value: Value = serde_json::from_str( - r#"{ - "name": "first_name", - "nullable": false, - "type": { - "name": "utf8" - }, - "children": [] - }"#, - ) - .unwrap(); - assert_eq!(value, f.to_json()); - } - #[test] - #[cfg(feature = "json")] - fn parse_struct_from_json() { - let json = r#" - { - "name": "address", - "type": { - "name": "struct" - }, - "nullable": false, - "children": [ - { - "name": "street", - "type": { - "name": "utf8" - }, - "nullable": false, - "children": [] - }, - { - "name": "zip", - "type": { - "name": "int", - "isSigned": false, - "bitWidth": 16 - }, - "nullable": false, - "children": [] - } - ] - } - "#; - let value: Value = serde_json::from_str(json).unwrap(); - let dt = Field::from(&value).unwrap(); - - let expected = Field::new( - "address", - DataType::Struct(vec![ - Field::new("street", DataType::Utf8, false), - Field::new("zip", DataType::UInt16, false), - ]), - false, - ); - - assert_eq!(expected, dt); - } - - #[test] - #[cfg(feature = "json")] - fn parse_map_from_json() { - let json = r#" - { - "name": "my_map", - "nullable": false, - "type": { - "name": "map", - "keysSorted": true - }, - "children": [ - { - "name": "my_entries", - "nullable": false, - "type": { - "name": "struct" - }, - "children": [ - { - "name": "my_keys", - "nullable": false, - "type": { - "name": "utf8" - }, - "children": [] - }, - { - "name": "my_values", - "nullable": true, - "type": { - "name": "int", - "bitWidth": 16, - "isSigned": false - }, - "children": [] - } - ] - } - ] - } - "#; - let value: Value = serde_json::from_str(json).unwrap(); - let dt = Field::from(&value).unwrap(); - - let expected = Field::new( - "my_map", - DataType::Map( - Box::new(Field::new( - "my_entries", - DataType::Struct(vec![ - Field::new("my_keys", DataType::Utf8, false), - Field::new("my_values", DataType::UInt16, true), - ]), - false, - )), - true, - ), - false, - ); - - assert_eq!(expected, dt); - } - - #[test] - #[cfg(feature = "json")] - fn parse_union_from_json() { - let json = r#" - { - "name": "my_union", - "nullable": false, - "type": { - "name": "union", - "mode": "SPARSE", - "typeIds": [ - 5, - 7 - ] - }, - "children": [ - { - "name": "f1", - "type": { - "name": "int", - "isSigned": true, - "bitWidth": 32 - }, - "nullable": true, - "children": [] - }, - { - "name": "f2", - "type": { - "name": "utf8" - }, - "nullable": true, - "children": [] - } - ] - } - "#; - let value: Value = serde_json::from_str(json).unwrap(); - let dt = Field::from(&value).unwrap(); - - let expected = Field::new( - "my_union", - DataType::Union( - vec![ - Field::new("f1", DataType::Int32, true), - Field::new("f2", DataType::Utf8, true), - ], - vec![5, 7], - UnionMode::Sparse, - ), - false, - ); - - assert_eq!(expected, dt); - } - - #[test] - #[cfg(feature = "json")] - fn parse_utf8_from_json() { - let json = "{\"name\":\"utf8\"}"; - let value: Value = serde_json::from_str(json).unwrap(); - let dt = DataType::from(&value).unwrap(); - assert_eq!(DataType::Utf8, dt); - } - - #[test] - #[cfg(feature = "json")] - fn parse_int32_from_json() { - let json = "{\"name\": \"int\", \"isSigned\": true, \"bitWidth\": 32}"; - let value: Value = serde_json::from_str(json).unwrap(); - let dt = DataType::from(&value).unwrap(); - assert_eq!(DataType::Int32, dt); - } - - #[test] - #[cfg(feature = "json")] - fn schema_json() { - // Add some custom metadata - let metadata: HashMap = - [("Key".to_string(), "Value".to_string())] - .iter() - .cloned() - .collect(); - - let schema = Schema::new_with_metadata( - vec![ - Field::new("c1", DataType::Utf8, false), - Field::new("c2", DataType::Binary, false), - Field::new("c3", DataType::FixedSizeBinary(3), false), - Field::new("c4", DataType::Boolean, false), - Field::new("c5", DataType::Date32, false), - Field::new("c6", DataType::Date64, false), - Field::new("c7", DataType::Time32(TimeUnit::Second), false), - Field::new("c8", DataType::Time32(TimeUnit::Millisecond), false), - Field::new("c9", DataType::Time32(TimeUnit::Microsecond), false), - Field::new("c10", DataType::Time32(TimeUnit::Nanosecond), false), - Field::new("c11", DataType::Time64(TimeUnit::Second), false), - Field::new("c12", DataType::Time64(TimeUnit::Millisecond), false), - Field::new("c13", DataType::Time64(TimeUnit::Microsecond), false), - Field::new("c14", DataType::Time64(TimeUnit::Nanosecond), false), - Field::new("c15", DataType::Timestamp(TimeUnit::Second, None), false), - Field::new( - "c16", - DataType::Timestamp(TimeUnit::Millisecond, Some("UTC".to_string())), - false, - ), - Field::new( - "c17", - DataType::Timestamp( - TimeUnit::Microsecond, - Some("Africa/Johannesburg".to_string()), - ), - false, - ), - Field::new( - "c18", - DataType::Timestamp(TimeUnit::Nanosecond, None), - false, - ), - Field::new("c19", DataType::Interval(IntervalUnit::DayTime), false), - Field::new("c20", DataType::Interval(IntervalUnit::YearMonth), false), - Field::new("c21", DataType::Interval(IntervalUnit::MonthDayNano), false), - Field::new( - "c22", - DataType::List(Box::new(Field::new("item", DataType::Boolean, true))), - false, - ), - Field::new( - "c23", - DataType::FixedSizeList( - Box::new(Field::new("bools", DataType::Boolean, false)), - 5, - ), - false, - ), - Field::new( - "c24", - DataType::List(Box::new(Field::new( - "inner_list", - DataType::List(Box::new(Field::new( - "struct", - DataType::Struct(vec![]), - true, - ))), - false, - ))), - true, - ), - Field::new( - "c25", - DataType::Struct(vec![ - Field::new("a", DataType::Utf8, false), - Field::new("b", DataType::UInt16, false), - ]), - false, - ), - Field::new("c26", DataType::Interval(IntervalUnit::YearMonth), true), - Field::new("c27", DataType::Interval(IntervalUnit::DayTime), true), - Field::new("c28", DataType::Interval(IntervalUnit::MonthDayNano), true), - Field::new("c29", DataType::Duration(TimeUnit::Second), false), - Field::new("c30", DataType::Duration(TimeUnit::Millisecond), false), - Field::new("c31", DataType::Duration(TimeUnit::Microsecond), false), - Field::new("c32", DataType::Duration(TimeUnit::Nanosecond), false), - Field::new_dict( - "c33", - DataType::Dictionary( - Box::new(DataType::Int32), - Box::new(DataType::Utf8), - ), - true, - 123, - true, - ), - Field::new("c34", DataType::LargeBinary, true), - Field::new("c35", DataType::LargeUtf8, true), - Field::new( - "c36", - DataType::LargeList(Box::new(Field::new( - "inner_large_list", - DataType::LargeList(Box::new(Field::new( - "struct", - DataType::Struct(vec![]), - false, - ))), - true, - ))), - true, - ), - Field::new( - "c37", - DataType::Map( - Box::new(Field::new( - "my_entries", - DataType::Struct(vec![ - Field::new("my_keys", DataType::Utf8, false), - Field::new("my_values", DataType::UInt16, true), - ]), - false, - )), - true, - ), - false, - ), - ], - metadata, - ); - - let expected = schema.to_json(); - let json = r#"{ - "fields": [ - { - "name": "c1", - "nullable": false, - "type": { - "name": "utf8" - }, - "children": [] - }, - { - "name": "c2", - "nullable": false, - "type": { - "name": "binary" - }, - "children": [] - }, - { - "name": "c3", - "nullable": false, - "type": { - "name": "fixedsizebinary", - "byteWidth": 3 - }, - "children": [] - }, - { - "name": "c4", - "nullable": false, - "type": { - "name": "bool" - }, - "children": [] - }, - { - "name": "c5", - "nullable": false, - "type": { - "name": "date", - "unit": "DAY" - }, - "children": [] - }, - { - "name": "c6", - "nullable": false, - "type": { - "name": "date", - "unit": "MILLISECOND" - }, - "children": [] - }, - { - "name": "c7", - "nullable": false, - "type": { - "name": "time", - "bitWidth": 32, - "unit": "SECOND" - }, - "children": [] - }, - { - "name": "c8", - "nullable": false, - "type": { - "name": "time", - "bitWidth": 32, - "unit": "MILLISECOND" - }, - "children": [] - }, - { - "name": "c9", - "nullable": false, - "type": { - "name": "time", - "bitWidth": 32, - "unit": "MICROSECOND" - }, - "children": [] - }, - { - "name": "c10", - "nullable": false, - "type": { - "name": "time", - "bitWidth": 32, - "unit": "NANOSECOND" - }, - "children": [] - }, - { - "name": "c11", - "nullable": false, - "type": { - "name": "time", - "bitWidth": 64, - "unit": "SECOND" - }, - "children": [] - }, - { - "name": "c12", - "nullable": false, - "type": { - "name": "time", - "bitWidth": 64, - "unit": "MILLISECOND" - }, - "children": [] - }, - { - "name": "c13", - "nullable": false, - "type": { - "name": "time", - "bitWidth": 64, - "unit": "MICROSECOND" - }, - "children": [] - }, - { - "name": "c14", - "nullable": false, - "type": { - "name": "time", - "bitWidth": 64, - "unit": "NANOSECOND" - }, - "children": [] - }, - { - "name": "c15", - "nullable": false, - "type": { - "name": "timestamp", - "unit": "SECOND" - }, - "children": [] - }, - { - "name": "c16", - "nullable": false, - "type": { - "name": "timestamp", - "unit": "MILLISECOND", - "timezone": "UTC" - }, - "children": [] - }, - { - "name": "c17", - "nullable": false, - "type": { - "name": "timestamp", - "unit": "MICROSECOND", - "timezone": "Africa/Johannesburg" - }, - "children": [] - }, - { - "name": "c18", - "nullable": false, - "type": { - "name": "timestamp", - "unit": "NANOSECOND" - }, - "children": [] - }, - { - "name": "c19", - "nullable": false, - "type": { - "name": "interval", - "unit": "DAY_TIME" - }, - "children": [] - }, - { - "name": "c20", - "nullable": false, - "type": { - "name": "interval", - "unit": "YEAR_MONTH" - }, - "children": [] - }, - { - "name": "c21", - "nullable": false, - "type": { - "name": "interval", - "unit": "MONTH_DAY_NANO" - }, - "children": [] - }, - { - "name": "c22", - "nullable": false, - "type": { - "name": "list" - }, - "children": [ - { - "name": "item", - "nullable": true, - "type": { - "name": "bool" - }, - "children": [] - } - ] - }, - { - "name": "c23", - "nullable": false, - "type": { - "name": "fixedsizelist", - "listSize": 5 - }, - "children": [ - { - "name": "bools", - "nullable": false, - "type": { - "name": "bool" - }, - "children": [] - } - ] - }, - { - "name": "c24", - "nullable": true, - "type": { - "name": "list" - }, - "children": [ - { - "name": "inner_list", - "nullable": false, - "type": { - "name": "list" - }, - "children": [ - { - "name": "struct", - "nullable": true, - "type": { - "name": "struct" - }, - "children": [] - } - ] - } - ] - }, - { - "name": "c25", - "nullable": false, - "type": { - "name": "struct" - }, - "children": [ - { - "name": "a", - "nullable": false, - "type": { - "name": "utf8" - }, - "children": [] - }, - { - "name": "b", - "nullable": false, - "type": { - "name": "int", - "bitWidth": 16, - "isSigned": false - }, - "children": [] - } - ] - }, - { - "name": "c26", - "nullable": true, - "type": { - "name": "interval", - "unit": "YEAR_MONTH" - }, - "children": [] - }, - { - "name": "c27", - "nullable": true, - "type": { - "name": "interval", - "unit": "DAY_TIME" - }, - "children": [] - }, - { - "name": "c28", - "nullable": true, - "type": { - "name": "interval", - "unit": "MONTH_DAY_NANO" - }, - "children": [] - }, - { - "name": "c29", - "nullable": false, - "type": { - "name": "duration", - "unit": "SECOND" - }, - "children": [] - }, - { - "name": "c30", - "nullable": false, - "type": { - "name": "duration", - "unit": "MILLISECOND" - }, - "children": [] - }, - { - "name": "c31", - "nullable": false, - "type": { - "name": "duration", - "unit": "MICROSECOND" - }, - "children": [] - }, - { - "name": "c32", - "nullable": false, - "type": { - "name": "duration", - "unit": "NANOSECOND" - }, - "children": [] - }, - { - "name": "c33", - "nullable": true, - "children": [], - "type": { - "name": "utf8" - }, - "dictionary": { - "id": 123, - "indexType": { - "name": "int", - "bitWidth": 32, - "isSigned": true - }, - "isOrdered": true - } - }, - { - "name": "c34", - "nullable": true, - "type": { - "name": "largebinary" - }, - "children": [] - }, - { - "name": "c35", - "nullable": true, - "type": { - "name": "largeutf8" - }, - "children": [] - }, - { - "name": "c36", - "nullable": true, - "type": { - "name": "largelist" - }, - "children": [ - { - "name": "inner_large_list", - "nullable": true, - "type": { - "name": "largelist" - }, - "children": [ - { - "name": "struct", - "nullable": false, - "type": { - "name": "struct" - }, - "children": [] - } - ] - } - ] - }, - { - "name": "c37", - "nullable": false, - "type": { - "name": "map", - "keysSorted": true - }, - "children": [ - { - "name": "my_entries", - "nullable": false, - "type": { - "name": "struct" - }, - "children": [ - { - "name": "my_keys", - "nullable": false, - "type": { - "name": "utf8" - }, - "children": [] - }, - { - "name": "my_values", - "nullable": true, - "type": { - "name": "int", - "bitWidth": 16, - "isSigned": false - }, - "children": [] - } - ] - } - ] - } - ], - "metadata" : { - "Key": "Value" - } - }"#; - let value: Value = serde_json::from_str(json).unwrap(); - assert_eq!(expected, value); - - // convert back to a schema - let value: Value = serde_json::from_str(json).unwrap(); - let schema2 = Schema::from(&value).unwrap(); - - assert_eq!(schema, schema2); - - // Check that empty metadata produces empty value in JSON and can be parsed - let json = r#"{ - "fields": [ - { - "name": "c1", - "nullable": false, - "type": { - "name": "utf8" - }, - "children": [] - } - ], - "metadata": {} - }"#; - let value: Value = serde_json::from_str(json).unwrap(); - let schema = Schema::from(&value).unwrap(); - assert!(schema.metadata.is_empty()); - - // Check that metadata field is not required in the JSON. - let json = r#"{ - "fields": [ - { - "name": "c1", - "nullable": false, - "type": { - "name": "utf8" - }, - "children": [] - } - ] - }"#; - let value: Value = serde_json::from_str(json).unwrap(); - let schema = Schema::from(&value).unwrap(); - assert!(schema.metadata.is_empty()); - } - #[test] fn create_schema_string() { let schema = person_schema(); diff --git a/arrow/src/datatypes/schema.rs b/arrow/src/datatypes/schema.rs index efde4edefa6..b0eca611474 100644 --- a/arrow/src/datatypes/schema.rs +++ b/arrow/src/datatypes/schema.rs @@ -233,80 +233,6 @@ impl Schema { .find(|&(_, c)| c.name() == name) } - /// Generate a JSON representation of the `Schema`. - #[cfg(feature = "json")] - pub fn to_json(&self) -> serde_json::Value { - serde_json::json!({ - "fields": self.fields.iter().map(|field| field.to_json()).collect::>(), - "metadata": serde_json::to_value(&self.metadata).unwrap() - }) - } - - /// Parse a `Schema` definition from a JSON representation. - #[cfg(feature = "json")] - pub fn from(json: &serde_json::Value) -> Result { - use serde_json::Value; - match *json { - Value::Object(ref schema) => { - let fields = if let Some(Value::Array(fields)) = schema.get("fields") { - fields.iter().map(Field::from).collect::>()? - } else { - return Err(ArrowError::ParseError( - "Schema fields should be an array".to_string(), - )); - }; - - let metadata = if let Some(value) = schema.get("metadata") { - Self::from_metadata(value)? - } else { - HashMap::default() - }; - - Ok(Self { fields, metadata }) - } - _ => Err(ArrowError::ParseError( - "Invalid json value type for schema".to_string(), - )), - } - } - - /// Parse a `metadata` definition from a JSON representation. - /// The JSON can either be an Object or an Array of Objects. - #[cfg(feature = "json")] - fn from_metadata(json: &serde_json::Value) -> Result> { - use serde_json::Value; - match json { - Value::Array(_) => { - let mut hashmap = HashMap::new(); - let values: Vec = serde_json::from_value(json.clone()) - .map_err(|_| { - ArrowError::JsonError( - "Unable to parse object into key-value pair".to_string(), - ) - })?; - for meta in values { - hashmap.insert(meta.key.clone(), meta.value); - } - Ok(hashmap) - } - Value::Object(md) => md - .iter() - .map(|(k, v)| { - if let Value::String(v) = v { - Ok((k.to_string(), v.to_string())) - } else { - Err(ArrowError::ParseError( - "metadata `value` field must be a string".to_string(), - )) - } - }) - .collect::>(), - _ => Err(ArrowError::ParseError( - "`metadata` field must be an object".to_string(), - )), - } - } - /// Check to see if `self` is a superset of `other` schema. Here are the comparison rules: /// /// * `self` and `other` should contain the same number of fields @@ -355,13 +281,6 @@ impl Hash for Schema { } } -#[cfg(feature = "json")] -#[derive(serde::Deserialize)] -struct MetadataKeyValue { - key: String, - value: String, -} - #[cfg(test)] mod tests { use crate::datatypes::DataType; diff --git a/integration-testing/src/lib.rs b/integration-testing/src/lib.rs index ffe112af72c..2345f1967f2 100644 --- a/integration-testing/src/lib.rs +++ b/integration-testing/src/lib.rs @@ -50,7 +50,7 @@ pub fn read_json_file(json_name: &str) -> Result { let json_file = File::open(json_name)?; let reader = BufReader::new(json_file); let arrow_json: Value = serde_json::from_reader(reader).unwrap(); - let schema = Schema::from(&arrow_json["schema"])?; + let schema = schema_from_json(&arrow_json["schema"])?; // read dictionaries let mut dictionaries = HashMap::new(); if let Some(dicts) = arrow_json.get("dictionaries") { diff --git a/integration-testing/src/util/datatype.rs b/integration-testing/src/util/datatype.rs new file mode 100644 index 00000000000..dd0b95b0a83 --- /dev/null +++ b/integration-testing/src/util/datatype.rs @@ -0,0 +1,383 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::datatypes::{DataType, Field, IntervalUnit, TimeUnit, UnionMode}; +use arrow::error::{ArrowError, Result}; + +/// Parse a data type from a JSON representation. +pub fn data_type_from_json(json: &serde_json::Value) -> Result { + use serde_json::Value; + let default_field = Field::new("", DataType::Boolean, true); + match *json { + Value::Object(ref map) => match map.get("name") { + Some(s) if s == "null" => Ok(DataType::Null), + Some(s) if s == "bool" => Ok(DataType::Boolean), + Some(s) if s == "binary" => Ok(DataType::Binary), + Some(s) if s == "largebinary" => Ok(DataType::LargeBinary), + Some(s) if s == "utf8" => Ok(DataType::Utf8), + Some(s) if s == "largeutf8" => Ok(DataType::LargeUtf8), + Some(s) if s == "fixedsizebinary" => { + // return a list with any type as its child isn't defined in the map + if let Some(Value::Number(size)) = map.get("byteWidth") { + Ok(DataType::FixedSizeBinary(size.as_i64().unwrap() as i32)) + } else { + Err(ArrowError::ParseError( + "Expecting a byteWidth for fixedsizebinary".to_string(), + )) + } + } + Some(s) if s == "decimal" => { + // return a list with any type as its child isn't defined in the map + let precision = match map.get("precision") { + Some(p) => Ok(p.as_u64().unwrap().try_into().unwrap()), + None => Err(ArrowError::ParseError( + "Expecting a precision for decimal".to_string(), + )), + }?; + let scale = match map.get("scale") { + Some(s) => Ok(s.as_u64().unwrap().try_into().unwrap()), + _ => Err(ArrowError::ParseError( + "Expecting a scale for decimal".to_string(), + )), + }?; + let bit_width: usize = match map.get("bitWidth") { + Some(b) => b.as_u64().unwrap() as usize, + _ => 128, // Default bit width + }; + + if bit_width == 128 { + Ok(DataType::Decimal128(precision, scale)) + } else if bit_width == 256 { + Ok(DataType::Decimal256(precision, scale)) + } else { + Err(ArrowError::ParseError( + "Decimal bit_width invalid".to_string(), + )) + } + } + Some(s) if s == "floatingpoint" => match map.get("precision") { + Some(p) if p == "HALF" => Ok(DataType::Float16), + Some(p) if p == "SINGLE" => Ok(DataType::Float32), + Some(p) if p == "DOUBLE" => Ok(DataType::Float64), + _ => Err(ArrowError::ParseError( + "floatingpoint precision missing or invalid".to_string(), + )), + }, + Some(s) if s == "timestamp" => { + let unit = match map.get("unit") { + Some(p) if p == "SECOND" => Ok(TimeUnit::Second), + Some(p) if p == "MILLISECOND" => Ok(TimeUnit::Millisecond), + Some(p) if p == "MICROSECOND" => Ok(TimeUnit::Microsecond), + Some(p) if p == "NANOSECOND" => Ok(TimeUnit::Nanosecond), + _ => Err(ArrowError::ParseError( + "timestamp unit missing or invalid".to_string(), + )), + }; + let tz = match map.get("timezone") { + None => Ok(None), + Some(serde_json::Value::String(tz)) => Ok(Some(tz.clone())), + _ => Err(ArrowError::ParseError( + "timezone must be a string".to_string(), + )), + }; + Ok(DataType::Timestamp(unit?, tz?)) + } + Some(s) if s == "date" => match map.get("unit") { + Some(p) if p == "DAY" => Ok(DataType::Date32), + Some(p) if p == "MILLISECOND" => Ok(DataType::Date64), + _ => Err(ArrowError::ParseError( + "date unit missing or invalid".to_string(), + )), + }, + Some(s) if s == "time" => { + let unit = match map.get("unit") { + Some(p) if p == "SECOND" => Ok(TimeUnit::Second), + Some(p) if p == "MILLISECOND" => Ok(TimeUnit::Millisecond), + Some(p) if p == "MICROSECOND" => Ok(TimeUnit::Microsecond), + Some(p) if p == "NANOSECOND" => Ok(TimeUnit::Nanosecond), + _ => Err(ArrowError::ParseError( + "time unit missing or invalid".to_string(), + )), + }; + match map.get("bitWidth") { + Some(p) if p == 32 => Ok(DataType::Time32(unit?)), + Some(p) if p == 64 => Ok(DataType::Time64(unit?)), + _ => Err(ArrowError::ParseError( + "time bitWidth missing or invalid".to_string(), + )), + } + } + Some(s) if s == "duration" => match map.get("unit") { + Some(p) if p == "SECOND" => Ok(DataType::Duration(TimeUnit::Second)), + Some(p) if p == "MILLISECOND" => { + Ok(DataType::Duration(TimeUnit::Millisecond)) + } + Some(p) if p == "MICROSECOND" => { + Ok(DataType::Duration(TimeUnit::Microsecond)) + } + Some(p) if p == "NANOSECOND" => { + Ok(DataType::Duration(TimeUnit::Nanosecond)) + } + _ => Err(ArrowError::ParseError( + "time unit missing or invalid".to_string(), + )), + }, + Some(s) if s == "interval" => match map.get("unit") { + Some(p) if p == "DAY_TIME" => { + Ok(DataType::Interval(IntervalUnit::DayTime)) + } + Some(p) if p == "YEAR_MONTH" => { + Ok(DataType::Interval(IntervalUnit::YearMonth)) + } + Some(p) if p == "MONTH_DAY_NANO" => { + Ok(DataType::Interval(IntervalUnit::MonthDayNano)) + } + _ => Err(ArrowError::ParseError( + "interval unit missing or invalid".to_string(), + )), + }, + Some(s) if s == "int" => match map.get("isSigned") { + Some(&Value::Bool(true)) => match map.get("bitWidth") { + Some(&Value::Number(ref n)) => match n.as_u64() { + Some(8) => Ok(DataType::Int8), + Some(16) => Ok(DataType::Int16), + Some(32) => Ok(DataType::Int32), + Some(64) => Ok(DataType::Int64), + _ => Err(ArrowError::ParseError( + "int bitWidth missing or invalid".to_string(), + )), + }, + _ => Err(ArrowError::ParseError( + "int bitWidth missing or invalid".to_string(), + )), + }, + Some(&Value::Bool(false)) => match map.get("bitWidth") { + Some(&Value::Number(ref n)) => match n.as_u64() { + Some(8) => Ok(DataType::UInt8), + Some(16) => Ok(DataType::UInt16), + Some(32) => Ok(DataType::UInt32), + Some(64) => Ok(DataType::UInt64), + _ => Err(ArrowError::ParseError( + "int bitWidth missing or invalid".to_string(), + )), + }, + _ => Err(ArrowError::ParseError( + "int bitWidth missing or invalid".to_string(), + )), + }, + _ => Err(ArrowError::ParseError( + "int signed missing or invalid".to_string(), + )), + }, + Some(s) if s == "list" => { + // return a list with any type as its child isn't defined in the map + Ok(DataType::List(Box::new(default_field))) + } + Some(s) if s == "largelist" => { + // return a largelist with any type as its child isn't defined in the map + Ok(DataType::LargeList(Box::new(default_field))) + } + Some(s) if s == "fixedsizelist" => { + // return a list with any type as its child isn't defined in the map + if let Some(Value::Number(size)) = map.get("listSize") { + Ok(DataType::FixedSizeList( + Box::new(default_field), + size.as_i64().unwrap() as i32, + )) + } else { + Err(ArrowError::ParseError( + "Expecting a listSize for fixedsizelist".to_string(), + )) + } + } + Some(s) if s == "struct" => { + // return an empty `struct` type as its children aren't defined in the map + Ok(DataType::Struct(vec![])) + } + Some(s) if s == "map" => { + if let Some(Value::Bool(keys_sorted)) = map.get("keysSorted") { + // Return a map with an empty type as its children aren't defined in the map + Ok(DataType::Map(Box::new(default_field), *keys_sorted)) + } else { + Err(ArrowError::ParseError( + "Expecting a keysSorted for map".to_string(), + )) + } + } + Some(s) if s == "union" => { + if let Some(Value::String(mode)) = map.get("mode") { + let union_mode = if mode == "SPARSE" { + UnionMode::Sparse + } else if mode == "DENSE" { + UnionMode::Dense + } else { + return Err(ArrowError::ParseError(format!( + "Unknown union mode {:?} for union", + mode + ))); + }; + if let Some(type_ids) = map.get("typeIds") { + let type_ids = type_ids + .as_array() + .unwrap() + .iter() + .map(|t| t.as_i64().unwrap() as i8) + .collect::>(); + + let default_fields = type_ids + .iter() + .map(|_| default_field.clone()) + .collect::>(); + + Ok(DataType::Union(default_fields, type_ids, union_mode)) + } else { + Err(ArrowError::ParseError( + "Expecting a typeIds for union ".to_string(), + )) + } + } else { + Err(ArrowError::ParseError( + "Expecting a mode for union".to_string(), + )) + } + } + Some(other) => Err(ArrowError::ParseError(format!( + "invalid or unsupported type name: {} in {:?}", + other, json + ))), + None => Err(ArrowError::ParseError("type name missing".to_string())), + }, + _ => Err(ArrowError::ParseError( + "invalid json value type".to_string(), + )), + } +} + +/// Generate a JSON representation of the data type. +pub fn data_type_to_json(data_type: &DataType) -> serde_json::Value { + use serde_json::json; + match data_type { + DataType::Null => json!({"name": "null"}), + DataType::Boolean => json!({"name": "bool"}), + DataType::Int8 => json!({"name": "int", "bitWidth": 8, "isSigned": true}), + DataType::Int16 => json!({"name": "int", "bitWidth": 16, "isSigned": true}), + DataType::Int32 => json!({"name": "int", "bitWidth": 32, "isSigned": true}), + DataType::Int64 => json!({"name": "int", "bitWidth": 64, "isSigned": true}), + DataType::UInt8 => json!({"name": "int", "bitWidth": 8, "isSigned": false}), + DataType::UInt16 => json!({"name": "int", "bitWidth": 16, "isSigned": false}), + DataType::UInt32 => json!({"name": "int", "bitWidth": 32, "isSigned": false}), + DataType::UInt64 => json!({"name": "int", "bitWidth": 64, "isSigned": false}), + DataType::Float16 => json!({"name": "floatingpoint", "precision": "HALF"}), + DataType::Float32 => json!({"name": "floatingpoint", "precision": "SINGLE"}), + DataType::Float64 => json!({"name": "floatingpoint", "precision": "DOUBLE"}), + DataType::Utf8 => json!({"name": "utf8"}), + DataType::LargeUtf8 => json!({"name": "largeutf8"}), + DataType::Binary => json!({"name": "binary"}), + DataType::LargeBinary => json!({"name": "largebinary"}), + DataType::FixedSizeBinary(byte_width) => { + json!({"name": "fixedsizebinary", "byteWidth": byte_width}) + } + DataType::Struct(_) => json!({"name": "struct"}), + DataType::Union(_, _, _) => json!({"name": "union"}), + DataType::List(_) => json!({ "name": "list"}), + DataType::LargeList(_) => json!({ "name": "largelist"}), + DataType::FixedSizeList(_, length) => { + json!({"name":"fixedsizelist", "listSize": length}) + } + DataType::Time32(unit) => { + json!({"name": "time", "bitWidth": 32, "unit": match unit { + TimeUnit::Second => "SECOND", + TimeUnit::Millisecond => "MILLISECOND", + TimeUnit::Microsecond => "MICROSECOND", + TimeUnit::Nanosecond => "NANOSECOND", + }}) + } + DataType::Time64(unit) => { + json!({"name": "time", "bitWidth": 64, "unit": match unit { + TimeUnit::Second => "SECOND", + TimeUnit::Millisecond => "MILLISECOND", + TimeUnit::Microsecond => "MICROSECOND", + TimeUnit::Nanosecond => "NANOSECOND", + }}) + } + DataType::Date32 => { + json!({"name": "date", "unit": "DAY"}) + } + DataType::Date64 => { + json!({"name": "date", "unit": "MILLISECOND"}) + } + DataType::Timestamp(unit, None) => { + json!({"name": "timestamp", "unit": match unit { + TimeUnit::Second => "SECOND", + TimeUnit::Millisecond => "MILLISECOND", + TimeUnit::Microsecond => "MICROSECOND", + TimeUnit::Nanosecond => "NANOSECOND", + }}) + } + DataType::Timestamp(unit, Some(tz)) => { + json!({"name": "timestamp", "unit": match unit { + TimeUnit::Second => "SECOND", + TimeUnit::Millisecond => "MILLISECOND", + TimeUnit::Microsecond => "MICROSECOND", + TimeUnit::Nanosecond => "NANOSECOND", + }, "timezone": tz}) + } + DataType::Interval(unit) => json!({"name": "interval", "unit": match unit { + IntervalUnit::YearMonth => "YEAR_MONTH", + IntervalUnit::DayTime => "DAY_TIME", + IntervalUnit::MonthDayNano => "MONTH_DAY_NANO", + }}), + DataType::Duration(unit) => json!({"name": "duration", "unit": match unit { + TimeUnit::Second => "SECOND", + TimeUnit::Millisecond => "MILLISECOND", + TimeUnit::Microsecond => "MICROSECOND", + TimeUnit::Nanosecond => "NANOSECOND", + }}), + DataType::Dictionary(_, _) => json!({ "name": "dictionary"}), + DataType::Decimal128(precision, scale) => { + json!({"name": "decimal", "precision": precision, "scale": scale, "bitWidth": 128}) + } + DataType::Decimal256(precision, scale) => { + json!({"name": "decimal", "precision": precision, "scale": scale, "bitWidth": 256}) + } + DataType::Map(_, keys_sorted) => { + json!({"name": "map", "keysSorted": keys_sorted}) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::Value; + + #[test] + fn parse_utf8_from_json() { + let json = "{\"name\":\"utf8\"}"; + let value: Value = serde_json::from_str(json).unwrap(); + let dt = data_type_from_json(&value).unwrap(); + assert_eq!(DataType::Utf8, dt); + } + + #[test] + fn parse_int32_from_json() { + let json = "{\"name\": \"int\", \"isSigned\": true, \"bitWidth\": 32}"; + let value: Value = serde_json::from_str(json).unwrap(); + let dt = data_type_from_json(&value).unwrap(); + assert_eq!(DataType::Int32, dt); + } +} diff --git a/integration-testing/src/util/field.rs b/integration-testing/src/util/field.rs new file mode 100644 index 00000000000..d2bb3a49159 --- /dev/null +++ b/integration-testing/src/util/field.rs @@ -0,0 +1,569 @@ +use crate::util::datatype::{data_type_from_json, data_type_to_json}; +use arrow::datatypes::{DataType, Field}; +use arrow::error::{ArrowError, Result}; +use std::collections::BTreeMap; + +/// Parse a `Field` definition from a JSON representation. +pub fn field_from_json(json: &serde_json::Value) -> Result { + use serde_json::Value; + match *json { + Value::Object(ref map) => { + let name = match map.get("name") { + Some(&Value::String(ref name)) => name.to_string(), + _ => { + return Err(ArrowError::ParseError( + "Field missing 'name' attribute".to_string(), + )); + } + }; + let nullable = match map.get("nullable") { + Some(&Value::Bool(b)) => b, + _ => { + return Err(ArrowError::ParseError( + "Field missing 'nullable' attribute".to_string(), + )); + } + }; + let data_type = match map.get("type") { + Some(t) => data_type_from_json(t)?, + _ => { + return Err(ArrowError::ParseError( + "Field missing 'type' attribute".to_string(), + )); + } + }; + + // Referenced example file: testing/data/arrow-ipc-stream/integration/1.0.0-littleendian/generated_custom_metadata.json.gz + let metadata = match map.get("metadata") { + Some(&Value::Array(ref values)) => { + let mut res: BTreeMap = BTreeMap::new(); + for value in values { + match value.as_object() { + Some(map) => { + if map.len() != 2 { + return Err(ArrowError::ParseError( + "Field 'metadata' must have exact two entries for each key-value map".to_string(), + )); + } + if let (Some(k), Some(v)) = + (map.get("key"), map.get("value")) + { + if let (Some(k_str), Some(v_str)) = + (k.as_str(), v.as_str()) + { + res.insert( + k_str.to_string().clone(), + v_str.to_string().clone(), + ); + } else { + return Err(ArrowError::ParseError("Field 'metadata' must have map value of string type".to_string())); + } + } else { + return Err(ArrowError::ParseError("Field 'metadata' lacks map keys named \"key\" or \"value\"".to_string())); + } + } + _ => { + return Err(ArrowError::ParseError( + "Field 'metadata' contains non-object key-value pair" + .to_string(), + )); + } + } + } + Some(res) + } + // We also support map format, because Schema's metadata supports this. + // See https://github.com/apache/arrow/pull/5907 + Some(&Value::Object(ref values)) => { + let mut res: BTreeMap = BTreeMap::new(); + for (k, v) in values { + if let Some(str_value) = v.as_str() { + res.insert(k.clone(), str_value.to_string().clone()); + } else { + return Err(ArrowError::ParseError(format!( + "Field 'metadata' contains non-string value for key {}", + k + ))); + } + } + Some(res) + } + Some(_) => { + return Err(ArrowError::ParseError( + "Field `metadata` is not json array".to_string(), + )); + } + _ => None, + }; + + // if data_type is a struct or list, get its children + let data_type = match data_type { + DataType::List(_) + | DataType::LargeList(_) + | DataType::FixedSizeList(_, _) => match map.get("children") { + Some(Value::Array(values)) => { + if values.len() != 1 { + return Err(ArrowError::ParseError( + "Field 'children' must have one element for a list data type".to_string(), + )); + } + match data_type { + DataType::List(_) => { + DataType::List(Box::new(field_from_json(&values[0])?)) + } + DataType::LargeList(_) => DataType::LargeList(Box::new( + field_from_json(&values[0])?, + )), + DataType::FixedSizeList(_, int) => DataType::FixedSizeList( + Box::new(field_from_json(&values[0])?), + int, + ), + _ => unreachable!( + "Data type should be a list, largelist or fixedsizelist" + ), + } + } + Some(_) => { + return Err(ArrowError::ParseError( + "Field 'children' must be an array".to_string(), + )) + } + None => { + return Err(ArrowError::ParseError( + "Field missing 'children' attribute".to_string(), + )); + } + }, + DataType::Struct(mut fields) => match map.get("children") { + Some(Value::Array(values)) => { + let struct_fields: Result> = + values.iter().map(field_from_json).collect(); + fields.append(&mut struct_fields?); + DataType::Struct(fields) + } + Some(_) => { + return Err(ArrowError::ParseError( + "Field 'children' must be an array".to_string(), + )) + } + None => { + return Err(ArrowError::ParseError( + "Field missing 'children' attribute".to_string(), + )); + } + }, + DataType::Map(_, keys_sorted) => { + match map.get("children") { + Some(Value::Array(values)) if values.len() == 1 => { + let child = field_from_json(&values[0])?; + // child must be a struct + match child.data_type() { + DataType::Struct(map_fields) if map_fields.len() == 2 => { + DataType::Map(Box::new(child), keys_sorted) + } + t => { + return Err(ArrowError::ParseError( + format!("Map children should be a struct with 2 fields, found {:?}", t) + )) + } + } + } + Some(_) => { + return Err(ArrowError::ParseError( + "Field 'children' must be an array with 1 element" + .to_string(), + )) + } + None => { + return Err(ArrowError::ParseError( + "Field missing 'children' attribute".to_string(), + )); + } + } + } + DataType::Union(_, type_ids, mode) => match map.get("children") { + Some(Value::Array(values)) => { + let union_fields: Vec = + values.iter().map(field_from_json).collect::>()?; + DataType::Union(union_fields, type_ids, mode) + } + Some(_) => { + return Err(ArrowError::ParseError( + "Field 'children' must be an array".to_string(), + )) + } + None => { + return Err(ArrowError::ParseError( + "Field missing 'children' attribute".to_string(), + )); + } + }, + _ => data_type, + }; + + let mut dict_id = 0; + let mut dict_is_ordered = false; + + let data_type = match map.get("dictionary") { + Some(dictionary) => { + let index_type = match dictionary.get("indexType") { + Some(t) => data_type_from_json(t)?, + _ => { + return Err(ArrowError::ParseError( + "Field missing 'indexType' attribute".to_string(), + )); + } + }; + dict_id = match dictionary.get("id") { + Some(Value::Number(n)) => n.as_i64().unwrap(), + _ => { + return Err(ArrowError::ParseError( + "Field missing 'id' attribute".to_string(), + )); + } + }; + dict_is_ordered = match dictionary.get("isOrdered") { + Some(&Value::Bool(n)) => n, + _ => { + return Err(ArrowError::ParseError( + "Field missing 'isOrdered' attribute".to_string(), + )); + } + }; + DataType::Dictionary(Box::new(index_type), Box::new(data_type)) + } + _ => data_type, + }; + + let mut field = + Field::new_dict(&name, data_type, nullable, dict_id, dict_is_ordered); + field.set_metadata(metadata); + Ok(field) + } + _ => Err(ArrowError::ParseError( + "Invalid json value type for field".to_string(), + )), + } +} + +/// Generate a JSON representation of the `Field`. +pub fn field_to_json(field: &Field) -> serde_json::Value { + let children: Vec = match field.data_type() { + DataType::Struct(fields) => fields.iter().map(field_to_json).collect(), + DataType::List(field) + | DataType::LargeList(field) + | DataType::FixedSizeList(field, _) + | DataType::Map(field, _) => vec![field_to_json(field)], + _ => vec![], + }; + + match field.data_type() { + DataType::Dictionary(ref index_type, ref value_type) => serde_json::json!({ + "name": field.name(), + "nullable": field.is_nullable(), + "type": data_type_to_json(value_type), + "children": children, + "dictionary": { + "id": field.dict_id().unwrap(), + "indexType": data_type_to_json(index_type), + "isOrdered": field.dict_is_ordered().unwrap(), + } + }), + _ => serde_json::json!({ + "name": field.name(), + "nullable": field.is_nullable(), + "type": data_type_to_json(field.data_type()), + "children": children + }), + } +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow::datatypes::UnionMode; + use serde_json::Value; + + #[test] + fn struct_field_to_json() { + let f = Field::new( + "address", + DataType::Struct(vec![ + Field::new("street", DataType::Utf8, false), + Field::new("zip", DataType::UInt16, false), + ]), + false, + ); + let value: Value = serde_json::from_str( + r#"{ + "name": "address", + "nullable": false, + "type": { + "name": "struct" + }, + "children": [ + { + "name": "street", + "nullable": false, + "type": { + "name": "utf8" + }, + "children": [] + }, + { + "name": "zip", + "nullable": false, + "type": { + "name": "int", + "bitWidth": 16, + "isSigned": false + }, + "children": [] + } + ] + }"#, + ) + .unwrap(); + assert_eq!(value, field_to_json(&f)); + } + + #[test] + fn map_field_to_json() { + let f = Field::new( + "my_map", + DataType::Map( + Box::new(Field::new( + "my_entries", + DataType::Struct(vec![ + Field::new("my_keys", DataType::Utf8, false), + Field::new("my_values", DataType::UInt16, true), + ]), + false, + )), + true, + ), + false, + ); + let value: Value = serde_json::from_str( + r#"{ + "name": "my_map", + "nullable": false, + "type": { + "name": "map", + "keysSorted": true + }, + "children": [ + { + "name": "my_entries", + "nullable": false, + "type": { + "name": "struct" + }, + "children": [ + { + "name": "my_keys", + "nullable": false, + "type": { + "name": "utf8" + }, + "children": [] + }, + { + "name": "my_values", + "nullable": true, + "type": { + "name": "int", + "bitWidth": 16, + "isSigned": false + }, + "children": [] + } + ] + } + ] + }"#, + ) + .unwrap(); + assert_eq!(value, field_to_json(&f)); + } + + #[test] + fn primitive_field_to_json() { + let f = Field::new("first_name", DataType::Utf8, false); + let value: Value = serde_json::from_str( + r#"{ + "name": "first_name", + "nullable": false, + "type": { + "name": "utf8" + }, + "children": [] + }"#, + ) + .unwrap(); + assert_eq!(value, field_to_json(&f)); + } + #[test] + fn parse_struct_from_json() { + let json = r#" + { + "name": "address", + "type": { + "name": "struct" + }, + "nullable": false, + "children": [ + { + "name": "street", + "type": { + "name": "utf8" + }, + "nullable": false, + "children": [] + }, + { + "name": "zip", + "type": { + "name": "int", + "isSigned": false, + "bitWidth": 16 + }, + "nullable": false, + "children": [] + } + ] + } + "#; + let value: Value = serde_json::from_str(json).unwrap(); + let dt = field_from_json(&value).unwrap(); + + let expected = Field::new( + "address", + DataType::Struct(vec![ + Field::new("street", DataType::Utf8, false), + Field::new("zip", DataType::UInt16, false), + ]), + false, + ); + + assert_eq!(expected, dt); + } + + #[test] + fn parse_map_from_json() { + let json = r#" + { + "name": "my_map", + "nullable": false, + "type": { + "name": "map", + "keysSorted": true + }, + "children": [ + { + "name": "my_entries", + "nullable": false, + "type": { + "name": "struct" + }, + "children": [ + { + "name": "my_keys", + "nullable": false, + "type": { + "name": "utf8" + }, + "children": [] + }, + { + "name": "my_values", + "nullable": true, + "type": { + "name": "int", + "bitWidth": 16, + "isSigned": false + }, + "children": [] + } + ] + } + ] + } + "#; + let value: Value = serde_json::from_str(json).unwrap(); + let dt = field_from_json(&value).unwrap(); + + let expected = Field::new( + "my_map", + DataType::Map( + Box::new(Field::new( + "my_entries", + DataType::Struct(vec![ + Field::new("my_keys", DataType::Utf8, false), + Field::new("my_values", DataType::UInt16, true), + ]), + false, + )), + true, + ), + false, + ); + + assert_eq!(expected, dt); + } + + #[test] + fn parse_union_from_json() { + let json = r#" + { + "name": "my_union", + "nullable": false, + "type": { + "name": "union", + "mode": "SPARSE", + "typeIds": [ + 5, + 7 + ] + }, + "children": [ + { + "name": "f1", + "type": { + "name": "int", + "isSigned": true, + "bitWidth": 32 + }, + "nullable": true, + "children": [] + }, + { + "name": "f2", + "type": { + "name": "utf8" + }, + "nullable": true, + "children": [] + } + ] + } + "#; + let value: Value = serde_json::from_str(json).unwrap(); + let dt = field_from_json(&value).unwrap(); + + let expected = Field::new( + "my_union", + DataType::Union( + vec![ + Field::new("f1", DataType::Int32, true), + Field::new("f2", DataType::Utf8, true), + ], + vec![5, 7], + UnionMode::Sparse, + ), + false, + ); + + assert_eq!(expected, dt); + } +} diff --git a/integration-testing/src/util.rs b/integration-testing/src/util/mod.rs similarity index 99% rename from integration-testing/src/util.rs rename to integration-testing/src/util/mod.rs index e098c4e1491..9ecd301360f 100644 --- a/integration-testing/src/util.rs +++ b/integration-testing/src/util/mod.rs @@ -36,7 +36,17 @@ use arrow::record_batch::{RecordBatch, RecordBatchReader}; use arrow::util::bit_util; use arrow::util::decimal::Decimal256; +mod datatype; +mod field; +mod schema; + +use crate::util::datatype::data_type_to_json; +use crate::util::field::field_from_json; +pub use schema::*; + /// A struct that represents an Arrow file with a schema and record batches +/// +/// See #[derive(Deserialize, Serialize, Debug)] pub struct ArrowJson { pub schema: ArrowJsonSchema, @@ -90,7 +100,7 @@ impl From<&Field> for ArrowJsonField { Self { name: field.name().to_string(), - field_type: field.data_type().to_json(), + field_type: data_type_to_json(field.data_type()), nullable: field.is_nullable(), children: vec![], dictionary: None, // TODO: not enough info @@ -256,7 +266,7 @@ impl ArrowJsonField { fn to_arrow_field(&self) -> Result { // a bit regressive, but we have to convert the field to JSON in order to convert it let field = serde_json::to_value(self)?; - Field::from(&field) + field_from_json(&field) } } diff --git a/integration-testing/src/util/schema.rs b/integration-testing/src/util/schema.rs new file mode 100644 index 00000000000..d2ca876c2b7 --- /dev/null +++ b/integration-testing/src/util/schema.rs @@ -0,0 +1,716 @@ +use crate::util::field::{field_from_json, field_to_json}; +use arrow::datatypes::Schema; +use arrow::error::{ArrowError, Result}; +use std::collections::HashMap; + +/// Generate a JSON representation of the `Schema`. +pub fn schema_to_json(schema: &Schema) -> serde_json::Value { + serde_json::json!({ + "fields": schema.fields().iter().map(field_to_json).collect::>(), + "metadata": serde_json::to_value(schema.metadata()).unwrap() + }) +} + +/// Parse a `Schema` definition from a JSON representation. +pub fn schema_from_json(json: &serde_json::Value) -> Result { + use serde_json::Value; + match *json { + Value::Object(ref schema) => { + let fields = if let Some(Value::Array(fields)) = schema.get("fields") { + fields.iter().map(field_from_json).collect::>()? + } else { + return Err(ArrowError::ParseError( + "Schema fields should be an array".to_string(), + )); + }; + + let metadata = if let Some(value) = schema.get("metadata") { + from_metadata(value)? + } else { + HashMap::default() + }; + + Ok(Schema::new_with_metadata(fields, metadata)) + } + _ => Err(ArrowError::ParseError( + "Invalid json value type for schema".to_string(), + )), + } +} + +/// Parse a `metadata` definition from a JSON representation. +/// The JSON can either be an Object or an Array of Objects. +fn from_metadata(json: &serde_json::Value) -> Result> { + use serde_json::Value; + match json { + Value::Array(_) => { + let mut hashmap = HashMap::new(); + let values: Vec = serde_json::from_value(json.clone()) + .map_err(|_| { + ArrowError::JsonError( + "Unable to parse object into key-value pair".to_string(), + ) + })?; + for meta in values { + hashmap.insert(meta.key.clone(), meta.value); + } + Ok(hashmap) + } + Value::Object(md) => md + .iter() + .map(|(k, v)| { + if let Value::String(v) = v { + Ok((k.to_string(), v.to_string())) + } else { + Err(ArrowError::ParseError( + "metadata `value` field must be a string".to_string(), + )) + } + }) + .collect::>(), + _ => Err(ArrowError::ParseError( + "`metadata` field must be an object".to_string(), + )), + } +} + +#[derive(serde::Deserialize)] +struct MetadataKeyValue { + key: String, + value: String, +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow::datatypes::{DataType, Field, IntervalUnit, TimeUnit}; + use serde_json::Value; + + #[test] + fn schema_json() { + // Add some custom metadata + let metadata: HashMap = + [("Key".to_string(), "Value".to_string())] + .iter() + .cloned() + .collect(); + + let schema = Schema::new_with_metadata( + vec![ + Field::new("c1", DataType::Utf8, false), + Field::new("c2", DataType::Binary, false), + Field::new("c3", DataType::FixedSizeBinary(3), false), + Field::new("c4", DataType::Boolean, false), + Field::new("c5", DataType::Date32, false), + Field::new("c6", DataType::Date64, false), + Field::new("c7", DataType::Time32(TimeUnit::Second), false), + Field::new("c8", DataType::Time32(TimeUnit::Millisecond), false), + Field::new("c9", DataType::Time32(TimeUnit::Microsecond), false), + Field::new("c10", DataType::Time32(TimeUnit::Nanosecond), false), + Field::new("c11", DataType::Time64(TimeUnit::Second), false), + Field::new("c12", DataType::Time64(TimeUnit::Millisecond), false), + Field::new("c13", DataType::Time64(TimeUnit::Microsecond), false), + Field::new("c14", DataType::Time64(TimeUnit::Nanosecond), false), + Field::new("c15", DataType::Timestamp(TimeUnit::Second, None), false), + Field::new( + "c16", + DataType::Timestamp(TimeUnit::Millisecond, Some("UTC".to_string())), + false, + ), + Field::new( + "c17", + DataType::Timestamp( + TimeUnit::Microsecond, + Some("Africa/Johannesburg".to_string()), + ), + false, + ), + Field::new( + "c18", + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + ), + Field::new("c19", DataType::Interval(IntervalUnit::DayTime), false), + Field::new("c20", DataType::Interval(IntervalUnit::YearMonth), false), + Field::new("c21", DataType::Interval(IntervalUnit::MonthDayNano), false), + Field::new( + "c22", + DataType::List(Box::new(Field::new("item", DataType::Boolean, true))), + false, + ), + Field::new( + "c23", + DataType::FixedSizeList( + Box::new(Field::new("bools", DataType::Boolean, false)), + 5, + ), + false, + ), + Field::new( + "c24", + DataType::List(Box::new(Field::new( + "inner_list", + DataType::List(Box::new(Field::new( + "struct", + DataType::Struct(vec![]), + true, + ))), + false, + ))), + true, + ), + Field::new( + "c25", + DataType::Struct(vec![ + Field::new("a", DataType::Utf8, false), + Field::new("b", DataType::UInt16, false), + ]), + false, + ), + Field::new("c26", DataType::Interval(IntervalUnit::YearMonth), true), + Field::new("c27", DataType::Interval(IntervalUnit::DayTime), true), + Field::new("c28", DataType::Interval(IntervalUnit::MonthDayNano), true), + Field::new("c29", DataType::Duration(TimeUnit::Second), false), + Field::new("c30", DataType::Duration(TimeUnit::Millisecond), false), + Field::new("c31", DataType::Duration(TimeUnit::Microsecond), false), + Field::new("c32", DataType::Duration(TimeUnit::Nanosecond), false), + Field::new_dict( + "c33", + DataType::Dictionary( + Box::new(DataType::Int32), + Box::new(DataType::Utf8), + ), + true, + 123, + true, + ), + Field::new("c34", DataType::LargeBinary, true), + Field::new("c35", DataType::LargeUtf8, true), + Field::new( + "c36", + DataType::LargeList(Box::new(Field::new( + "inner_large_list", + DataType::LargeList(Box::new(Field::new( + "struct", + DataType::Struct(vec![]), + false, + ))), + true, + ))), + true, + ), + Field::new( + "c37", + DataType::Map( + Box::new(Field::new( + "my_entries", + DataType::Struct(vec![ + Field::new("my_keys", DataType::Utf8, false), + Field::new("my_values", DataType::UInt16, true), + ]), + false, + )), + true, + ), + false, + ), + ], + metadata, + ); + + let expected = schema_to_json(&schema); + let json = r#"{ + "fields": [ + { + "name": "c1", + "nullable": false, + "type": { + "name": "utf8" + }, + "children": [] + }, + { + "name": "c2", + "nullable": false, + "type": { + "name": "binary" + }, + "children": [] + }, + { + "name": "c3", + "nullable": false, + "type": { + "name": "fixedsizebinary", + "byteWidth": 3 + }, + "children": [] + }, + { + "name": "c4", + "nullable": false, + "type": { + "name": "bool" + }, + "children": [] + }, + { + "name": "c5", + "nullable": false, + "type": { + "name": "date", + "unit": "DAY" + }, + "children": [] + }, + { + "name": "c6", + "nullable": false, + "type": { + "name": "date", + "unit": "MILLISECOND" + }, + "children": [] + }, + { + "name": "c7", + "nullable": false, + "type": { + "name": "time", + "bitWidth": 32, + "unit": "SECOND" + }, + "children": [] + }, + { + "name": "c8", + "nullable": false, + "type": { + "name": "time", + "bitWidth": 32, + "unit": "MILLISECOND" + }, + "children": [] + }, + { + "name": "c9", + "nullable": false, + "type": { + "name": "time", + "bitWidth": 32, + "unit": "MICROSECOND" + }, + "children": [] + }, + { + "name": "c10", + "nullable": false, + "type": { + "name": "time", + "bitWidth": 32, + "unit": "NANOSECOND" + }, + "children": [] + }, + { + "name": "c11", + "nullable": false, + "type": { + "name": "time", + "bitWidth": 64, + "unit": "SECOND" + }, + "children": [] + }, + { + "name": "c12", + "nullable": false, + "type": { + "name": "time", + "bitWidth": 64, + "unit": "MILLISECOND" + }, + "children": [] + }, + { + "name": "c13", + "nullable": false, + "type": { + "name": "time", + "bitWidth": 64, + "unit": "MICROSECOND" + }, + "children": [] + }, + { + "name": "c14", + "nullable": false, + "type": { + "name": "time", + "bitWidth": 64, + "unit": "NANOSECOND" + }, + "children": [] + }, + { + "name": "c15", + "nullable": false, + "type": { + "name": "timestamp", + "unit": "SECOND" + }, + "children": [] + }, + { + "name": "c16", + "nullable": false, + "type": { + "name": "timestamp", + "unit": "MILLISECOND", + "timezone": "UTC" + }, + "children": [] + }, + { + "name": "c17", + "nullable": false, + "type": { + "name": "timestamp", + "unit": "MICROSECOND", + "timezone": "Africa/Johannesburg" + }, + "children": [] + }, + { + "name": "c18", + "nullable": false, + "type": { + "name": "timestamp", + "unit": "NANOSECOND" + }, + "children": [] + }, + { + "name": "c19", + "nullable": false, + "type": { + "name": "interval", + "unit": "DAY_TIME" + }, + "children": [] + }, + { + "name": "c20", + "nullable": false, + "type": { + "name": "interval", + "unit": "YEAR_MONTH" + }, + "children": [] + }, + { + "name": "c21", + "nullable": false, + "type": { + "name": "interval", + "unit": "MONTH_DAY_NANO" + }, + "children": [] + }, + { + "name": "c22", + "nullable": false, + "type": { + "name": "list" + }, + "children": [ + { + "name": "item", + "nullable": true, + "type": { + "name": "bool" + }, + "children": [] + } + ] + }, + { + "name": "c23", + "nullable": false, + "type": { + "name": "fixedsizelist", + "listSize": 5 + }, + "children": [ + { + "name": "bools", + "nullable": false, + "type": { + "name": "bool" + }, + "children": [] + } + ] + }, + { + "name": "c24", + "nullable": true, + "type": { + "name": "list" + }, + "children": [ + { + "name": "inner_list", + "nullable": false, + "type": { + "name": "list" + }, + "children": [ + { + "name": "struct", + "nullable": true, + "type": { + "name": "struct" + }, + "children": [] + } + ] + } + ] + }, + { + "name": "c25", + "nullable": false, + "type": { + "name": "struct" + }, + "children": [ + { + "name": "a", + "nullable": false, + "type": { + "name": "utf8" + }, + "children": [] + }, + { + "name": "b", + "nullable": false, + "type": { + "name": "int", + "bitWidth": 16, + "isSigned": false + }, + "children": [] + } + ] + }, + { + "name": "c26", + "nullable": true, + "type": { + "name": "interval", + "unit": "YEAR_MONTH" + }, + "children": [] + }, + { + "name": "c27", + "nullable": true, + "type": { + "name": "interval", + "unit": "DAY_TIME" + }, + "children": [] + }, + { + "name": "c28", + "nullable": true, + "type": { + "name": "interval", + "unit": "MONTH_DAY_NANO" + }, + "children": [] + }, + { + "name": "c29", + "nullable": false, + "type": { + "name": "duration", + "unit": "SECOND" + }, + "children": [] + }, + { + "name": "c30", + "nullable": false, + "type": { + "name": "duration", + "unit": "MILLISECOND" + }, + "children": [] + }, + { + "name": "c31", + "nullable": false, + "type": { + "name": "duration", + "unit": "MICROSECOND" + }, + "children": [] + }, + { + "name": "c32", + "nullable": false, + "type": { + "name": "duration", + "unit": "NANOSECOND" + }, + "children": [] + }, + { + "name": "c33", + "nullable": true, + "children": [], + "type": { + "name": "utf8" + }, + "dictionary": { + "id": 123, + "indexType": { + "name": "int", + "bitWidth": 32, + "isSigned": true + }, + "isOrdered": true + } + }, + { + "name": "c34", + "nullable": true, + "type": { + "name": "largebinary" + }, + "children": [] + }, + { + "name": "c35", + "nullable": true, + "type": { + "name": "largeutf8" + }, + "children": [] + }, + { + "name": "c36", + "nullable": true, + "type": { + "name": "largelist" + }, + "children": [ + { + "name": "inner_large_list", + "nullable": true, + "type": { + "name": "largelist" + }, + "children": [ + { + "name": "struct", + "nullable": false, + "type": { + "name": "struct" + }, + "children": [] + } + ] + } + ] + }, + { + "name": "c37", + "nullable": false, + "type": { + "name": "map", + "keysSorted": true + }, + "children": [ + { + "name": "my_entries", + "nullable": false, + "type": { + "name": "struct" + }, + "children": [ + { + "name": "my_keys", + "nullable": false, + "type": { + "name": "utf8" + }, + "children": [] + }, + { + "name": "my_values", + "nullable": true, + "type": { + "name": "int", + "bitWidth": 16, + "isSigned": false + }, + "children": [] + } + ] + } + ] + } + ], + "metadata" : { + "Key": "Value" + } + }"#; + let value: Value = serde_json::from_str(json).unwrap(); + assert_eq!(expected, value); + + // convert back to a schema + let value: Value = serde_json::from_str(json).unwrap(); + let schema2 = schema_from_json(&value).unwrap(); + + assert_eq!(schema, schema2); + + // Check that empty metadata produces empty value in JSON and can be parsed + let json = r#"{ + "fields": [ + { + "name": "c1", + "nullable": false, + "type": { + "name": "utf8" + }, + "children": [] + } + ], + "metadata": {} + }"#; + let value: Value = serde_json::from_str(json).unwrap(); + let schema = schema_from_json(&value).unwrap(); + assert!(schema.metadata.is_empty()); + + // Check that metadata field is not required in the JSON. + let json = r#"{ + "fields": [ + { + "name": "c1", + "nullable": false, + "type": { + "name": "utf8" + }, + "children": [] + } + ] + }"#; + let value: Value = serde_json::from_str(json).unwrap(); + let schema = schema_from_json(&value).unwrap(); + assert!(schema.metadata.is_empty()); + } +} From ddb91c5397934264cde8af33e4c21f08506b01eb Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Tue, 13 Sep 2022 20:07:19 +0100 Subject: [PATCH 2/2] Fix RAT --- integration-testing/src/util/field.rs | 17 +++++++++++++++++ integration-testing/src/util/schema.rs | 17 +++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/integration-testing/src/util/field.rs b/integration-testing/src/util/field.rs index d2bb3a49159..a2becc004d1 100644 --- a/integration-testing/src/util/field.rs +++ b/integration-testing/src/util/field.rs @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + use crate::util::datatype::{data_type_from_json, data_type_to_json}; use arrow::datatypes::{DataType, Field}; use arrow::error::{ArrowError, Result}; diff --git a/integration-testing/src/util/schema.rs b/integration-testing/src/util/schema.rs index d2ca876c2b7..7e3475e6f46 100644 --- a/integration-testing/src/util/schema.rs +++ b/integration-testing/src/util/schema.rs @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + use crate::util::field::{field_from_json, field_to_json}; use arrow::datatypes::Schema; use arrow::error::{ArrowError, Result};