diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml index 0256307b3b7..86a964e0239 100644 --- a/.github/workflows/arrow.yml +++ b/.github/workflows/arrow.yml @@ -158,8 +158,8 @@ jobs: - name: Build run: | cd arrow - cargo build --no-default-features --features=csv,ipc,simd,ffi --target wasm32-unknown-unknown - cargo build --no-default-features --features=csv,ipc,simd,ffi --target wasm32-wasi + cargo build --no-default-features --features=json,csv,ipc,simd,ffi --target wasm32-unknown-unknown + cargo build --no-default-features --features=json,csv,ipc,simd,ffi --target wasm32-wasi clippy: name: Clippy diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 7ef9935eabe..70b8767b568 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -44,9 +44,8 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -serde = { version = "1.0", default-features = false } -serde_derive = { version = "1.0", default-features = false } -serde_json = { version = "1.0", default-features = false, features = ["std"] } +serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } +serde_json = { version = "1.0", default-features = false, features = ["std"], optional = true } indexmap = { version = "1.9", default-features = false, features = ["std"] } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } num = { version = "0.4", default-features = false, features = ["std"] } @@ -69,10 +68,11 @@ bitflags = { version = "1.2.1", default-features = false } zstd = { version = "0.11.1", default-features = false, optional = true } [features] -default = ["csv", "ipc"] +default = ["csv", "ipc", "json"] ipc_compression = ["ipc", "zstd", "lz4"] csv = ["csv_crate"] ipc = ["flatbuffers"] +json = ["serde", "serde_json"] simd = ["packed_simd"] prettyprint = ["comfy-table"] # The test 
utils feature enables code used in benchmarks and tests but @@ -183,6 +183,7 @@ harness = false [[bench]] name = "json_reader" harness = false +required-features = ["json"] [[bench]] name = "equal" diff --git a/arrow/README.md b/arrow/README.md index d7501b2aea8..bf30f4599c5 100644 --- a/arrow/README.md +++ b/arrow/README.md @@ -42,6 +42,7 @@ However, for historical reasons, this crate uses versions with major numbers gre The `arrow` crate provides the following features which may be enabled in your `Cargo.toml`: - `csv` (default) - support for reading and writing Arrow arrays to/from csv files +- `json` (default) - support for reading and writing Arrow arrays to/from json files - `ipc` (default) - support for reading [Arrow IPC Format](https://arrow.apache.org/docs/format/Columnar.html#serialization-and-interprocess-communication-ipc), also used as the wire protocol in [arrow-flight](https://crates.io/crates/arrow-flight) - `ipc_compression` - Enables reading and writing compressed IPC streams (also enables `ipc`) - `prettyprint` - support for formatting record batches as textual columns diff --git a/arrow/src/array/mod.rs b/arrow/src/array/mod.rs index 3496ccd888b..6ad2c26fee5 100644 --- a/arrow/src/array/mod.rs +++ b/arrow/src/array/mod.rs @@ -190,11 +190,12 @@ use crate::datatypes::*; pub use self::array::Array; pub use self::array::ArrayAccessor; pub use self::array::ArrayRef; -pub(crate) use self::data::layout; pub use self::data::ArrayData; pub use self::data::ArrayDataBuilder; pub use self::data::ArrayDataRef; -pub(crate) use self::data::BufferSpec; + +#[cfg(feature = "ipc")] +pub(crate) use self::data::{layout, BufferSpec}; pub use self::array_binary::BinaryArray; pub use self::array_binary::LargeBinaryArray; diff --git a/arrow/src/datatypes/datatype.rs b/arrow/src/datatypes/datatype.rs index 51f70a6cba3..b65bfd7725a 100644 --- a/arrow/src/datatypes/datatype.rs +++ b/arrow/src/datatypes/datatype.rs @@ -19,9 +19,6 @@ use num::BigInt; use std::cmp::Ordering; 
use std::fmt; -use serde_derive::{Deserialize, Serialize}; -use serde_json::{json, Value, Value::String as VString}; - use crate::error::{ArrowError, Result}; use crate::util::decimal::singed_cmp_le_bytes; @@ -42,7 +39,8 @@ use super::Field; /// Nested types can themselves be nested within other arrays. /// For more information on these types please see /// [the physical memory layout of Apache Arrow](https://arrow.apache.org/docs/format/Columnar.html#physical-memory-layout). -#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub enum DataType { /// Null type Null, @@ -222,7 +220,8 @@ pub enum DataType { } /// An absolute length of time in seconds, milliseconds, microseconds or nanoseconds. -#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub enum TimeUnit { /// Time in seconds. Second, @@ -235,7 +234,8 @@ pub enum TimeUnit { } /// YEAR_MONTH, DAY_TIME, MONTH_DAY_NANO interval in SQL style. -#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub enum IntervalUnit { /// Indicates the number of elapsed whole months, stored as 4-byte integers. 
YearMonth, @@ -253,7 +253,8 @@ pub enum IntervalUnit { } // Sparse or Dense union layouts -#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub enum UnionMode { Sparse, Dense, @@ -1052,7 +1053,9 @@ pub(crate) fn validate_decimal256_precision_with_lt_bytes( impl DataType { /// Parse a data type from a JSON representation. - pub(crate) fn from(json: &Value) -> Result { + #[cfg(feature = "json")] + pub(crate) fn from(json: &serde_json::Value) -> Result { + use serde_json::Value; let default_field = Field::new("", DataType::Boolean, true); match *json { Value::Object(ref map) => match map.get("name") { @@ -1121,7 +1124,7 @@ impl DataType { }; let tz = match map.get("timezone") { None => Ok(None), - Some(VString(tz)) => Ok(Some(tz.clone())), + Some(serde_json::Value::String(tz)) => Ok(Some(tz.clone())), _ => Err(ArrowError::ParseError( "timezone must be a string".to_string(), )), @@ -1300,7 +1303,9 @@ impl DataType { } /// Generate a JSON representation of the data type. - pub fn to_json(&self) -> Value { + #[cfg(feature = "json")] + pub fn to_json(&self) -> serde_json::Value { + use serde_json::json; match self { DataType::Null => json!({"name": "null"}), DataType::Boolean => json!({"name": "bool"}), diff --git a/arrow/src/datatypes/field.rs b/arrow/src/datatypes/field.rs index f50ebadd5e7..ac966cafe34 100644 --- a/arrow/src/datatypes/field.rs +++ b/arrow/src/datatypes/field.rs @@ -15,22 +15,19 @@ // specific language governing permissions and limitations // under the License. 
+use crate::error::{ArrowError, Result}; use std::cmp::Ordering; use std::collections::BTreeMap; use std::hash::{Hash, Hasher}; -use serde_derive::{Deserialize, Serialize}; -use serde_json::{json, Value}; - -use crate::error::{ArrowError, Result}; - use super::DataType; /// Describes a single column in a [`Schema`](super::Schema). /// /// A [`Schema`](super::Schema) is an ordered collection of /// [`Field`] objects. -#[derive(Serialize, Deserialize, Debug, Clone)] +#[derive(Debug, Clone)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub struct Field { name: String, data_type: DataType, @@ -38,7 +35,7 @@ pub struct Field { dict_id: i64, dict_is_ordered: bool, /// A map of key-value pairs containing additional custom meta data. - #[serde(skip_serializing_if = "Option::is_none")] + #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))] metadata: Option>, } @@ -254,7 +251,9 @@ impl Field { } /// Parse a `Field` definition from a JSON representation. - pub fn from(json: &Value) -> Result { + #[cfg(feature = "json")] + pub fn from(json: &serde_json::Value) -> Result { + use serde_json::Value; match *json { Value::Object(ref map) => { let name = match map.get("name") { @@ -497,8 +496,9 @@ impl Field { } /// Generate a JSON representation of the `Field`. 
- pub fn to_json(&self) -> Value { - let children: Vec = match self.data_type() { + #[cfg(feature = "json")] + pub fn to_json(&self) -> serde_json::Value { + let children: Vec = match self.data_type() { DataType::Struct(fields) => fields.iter().map(|f| f.to_json()).collect(), DataType::List(field) | DataType::LargeList(field) @@ -507,7 +507,7 @@ impl Field { _ => vec![], }; match self.data_type() { - DataType::Dictionary(ref index_type, ref value_type) => json!({ + DataType::Dictionary(ref index_type, ref value_type) => serde_json::json!({ "name": self.name, "nullable": self.nullable, "type": value_type.to_json(), @@ -518,7 +518,7 @@ impl Field { "isOrdered": self.dict_is_ordered } }), - _ => json!({ + _ => serde_json::json!({ "name": self.name, "nullable": self.nullable, "type": self.data_type.to_json(), diff --git a/arrow/src/datatypes/mod.rs b/arrow/src/datatypes/mod.rs index 13c93c4ab05..38b6c7bf974 100644 --- a/arrow/src/datatypes/mod.rs +++ b/arrow/src/datatypes/mod.rs @@ -50,12 +50,15 @@ pub type SchemaRef = Arc; mod tests { use super::*; use crate::error::Result; + use std::collections::{BTreeMap, HashMap}; + + #[cfg(feature = "json")] use crate::json::JsonSerializable; - use serde_json::Value::{Bool, Number as VNumber, String as VString}; - use serde_json::{Number, Value}; - use std::{ - collections::{BTreeMap, HashMap}, - f32::NAN, + + #[cfg(feature = "json")] + use serde_json::{ + Number, Value, + Value::{Bool, Number as VNumber, String as VString}, }; #[test] @@ -107,6 +110,7 @@ mod tests { } #[test] + #[cfg(feature = "json")] fn create_struct_type() { let _person = DataType::Struct(vec![ Field::new("first_name", DataType::Utf8, false), @@ -123,6 +127,7 @@ mod tests { } #[test] + #[cfg(feature = "json")] fn serde_struct_type() { let kv_array = [("k".to_string(), "v".to_string())]; let field_metadata: BTreeMap = kv_array.iter().cloned().collect(); @@ -170,6 +175,7 @@ mod tests { } #[test] + #[cfg(feature = "json")] fn struct_field_to_json() { let f = 
Field::new( "address", @@ -213,6 +219,7 @@ mod tests { } #[test] + #[cfg(feature = "json")] fn map_field_to_json() { let f = Field::new( "my_map", @@ -273,6 +280,7 @@ mod tests { } #[test] + #[cfg(feature = "json")] fn primitive_field_to_json() { let f = Field::new("first_name", DataType::Utf8, false); let value: Value = serde_json::from_str( @@ -289,6 +297,7 @@ mod tests { assert_eq!(value, f.to_json()); } #[test] + #[cfg(feature = "json")] fn parse_struct_from_json() { let json = r#" { @@ -335,6 +344,7 @@ mod tests { } #[test] + #[cfg(feature = "json")] fn parse_map_from_json() { let json = r#" { @@ -398,6 +408,7 @@ mod tests { } #[test] + #[cfg(feature = "json")] fn parse_union_from_json() { let json = r#" { @@ -453,6 +464,7 @@ mod tests { } #[test] + #[cfg(feature = "json")] fn parse_utf8_from_json() { let json = "{\"name\":\"utf8\"}"; let value: Value = serde_json::from_str(json).unwrap(); @@ -461,6 +473,7 @@ mod tests { } #[test] + #[cfg(feature = "json")] fn parse_int32_from_json() { let json = "{\"name\": \"int\", \"isSigned\": true, \"bitWidth\": 32}"; let value: Value = serde_json::from_str(json).unwrap(); @@ -469,6 +482,7 @@ mod tests { } #[test] + #[cfg(feature = "json")] fn schema_json() { // Add some custom metadata let metadata: HashMap = @@ -1229,6 +1243,7 @@ mod tests { } #[test] + #[cfg(feature = "json")] fn test_arrow_native_type_to_json() { assert_eq!(Some(Bool(true)), true.into_json_value()); assert_eq!(Some(VNumber(Number::from(1))), 1i8.into_json_value()); @@ -1248,7 +1263,7 @@ mod tests { Some(VNumber(Number::from_f64(0.01f64).unwrap())), 0.01f64.into_json_value() ); - assert_eq!(None, NAN.into_json_value()); + assert_eq!(None, f32::NAN.into_json_value()); } fn person_schema() -> Schema { diff --git a/arrow/src/datatypes/schema.rs b/arrow/src/datatypes/schema.rs index d08042b51f1..efde4edefa6 100644 --- a/arrow/src/datatypes/schema.rs +++ b/arrow/src/datatypes/schema.rs @@ -16,13 +16,9 @@ // under the License. 
use std::collections::HashMap; -use std::default::Default; use std::fmt; use std::hash::Hash; -use serde_derive::{Deserialize, Serialize}; -use serde_json::{json, Value}; - use crate::error::{ArrowError, Result}; use super::Field; @@ -31,12 +27,15 @@ use super::Field; /// /// Note that this information is only part of the meta-data and not part of the physical /// memory layout. -#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone, PartialEq, Eq)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub struct Schema { pub fields: Vec, /// A map of key-value pairs containing additional meta data. - #[serde(skip_serializing_if = "HashMap::is_empty")] - #[serde(default)] + #[cfg_attr( + feature = "serde", + serde(skip_serializing_if = "HashMap::is_empty", default) + )] pub metadata: HashMap, } @@ -180,6 +179,7 @@ impl Schema { /// Returns a vector with references to all fields (including nested fields) #[inline] + #[cfg(feature = "ipc")] pub(crate) fn all_fields(&self) -> Vec<&Field> { self.fields.iter().flat_map(|f| f.fields()).collect() } @@ -234,15 +234,18 @@ impl Schema { } /// Generate a JSON representation of the `Schema`. - pub fn to_json(&self) -> Value { - json!({ - "fields": self.fields.iter().map(|field| field.to_json()).collect::>(), + #[cfg(feature = "json")] + pub fn to_json(&self) -> serde_json::Value { + serde_json::json!({ + "fields": self.fields.iter().map(|field| field.to_json()).collect::>(), "metadata": serde_json::to_value(&self.metadata).unwrap() }) } /// Parse a `Schema` definition from a JSON representation. - pub fn from(json: &Value) -> Result { + #[cfg(feature = "json")] + pub fn from(json: &serde_json::Value) -> Result { + use serde_json::Value; match *json { Value::Object(ref schema) => { let fields = if let Some(Value::Array(fields)) = schema.get("fields") { @@ -269,7 +272,9 @@ impl Schema { /// Parse a `metadata` definition from a JSON representation. 
/// The JSON can either be an Object or an Array of Objects. - fn from_metadata(json: &Value) -> Result> { + #[cfg(feature = "json")] + fn from_metadata(json: &serde_json::Value) -> Result> { + use serde_json::Value; match json { Value::Array(_) => { let mut hashmap = HashMap::new(); @@ -350,7 +355,8 @@ impl Hash for Schema { } } -#[derive(Deserialize)] +#[cfg(feature = "json")] +#[derive(serde::Deserialize)] struct MetadataKeyValue { key: String, value: String, @@ -363,6 +369,7 @@ mod tests { use super::*; #[test] + #[cfg(feature = "json")] fn test_ser_de_metadata() { // ser/de with empty metadata let schema = Schema::new(vec![ diff --git a/arrow/src/error.rs b/arrow/src/error.rs index ef7abbbddef..5d92fb93017 100644 --- a/arrow/src/error.rs +++ b/arrow/src/error.rs @@ -85,6 +85,7 @@ impl From<::std::string::FromUtf8Error> for ArrowError { } } +#[cfg(feature = "json")] impl From for ArrowError { fn from(error: serde_json::Error) -> Self { ArrowError::JsonError(error.to_string()) diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index 04f495dc081..d1fb0cae0da 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -264,6 +264,7 @@ pub mod ffi; pub mod ffi_stream; #[cfg(feature = "ipc")] pub mod ipc; +#[cfg(feature = "json")] pub mod json; #[cfg(feature = "pyarrow")] pub mod pyarrow; diff --git a/integration-testing/Cargo.toml b/integration-testing/Cargo.toml index 74a1ee6aa70..0a4bfa6d5b4 100644 --- a/integration-testing/Cargo.toml +++ b/integration-testing/Cargo.toml @@ -31,7 +31,7 @@ rust-version = "1.62" logging = ["tracing-subscriber"] [dependencies] -arrow = { path = "../arrow", default-features = false, features = [ "test_utils", "ipc_compression" ] } +arrow = { path = "../arrow", default-features = false, features = ["test_utils", "ipc", "ipc_compression", "json"] } arrow-flight = { path = "../arrow-flight", default-features = false } async-trait = { version = "0.1.41", default-features = false } clap = { version = "3", default-features = false, 
features = ["std", "derive"] } diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index d9b8495559c..eeb85fd0334 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -62,7 +62,7 @@ flate2 = { version = "1.0", default-features = false, features = ["rust_backend" lz4 = { version = "1.23", default-features = false } zstd = { version = "0.11", default-features = false } serde_json = { version = "1.0", features = ["std"], default-features = false } -arrow = { path = "../arrow", version = "21.0.0", default-features = false, features = ["ipc", "test_utils", "prettyprint"] } +arrow = { path = "../arrow", version = "21.0.0", default-features = false, features = ["ipc", "test_utils", "prettyprint", "json"] } [package.metadata.docs.rs] all-features = true