Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Utilise struct stats when available #656

Merged
merged 42 commits into from
Jul 6, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
e8be31b
Log stats
Tom-Newton Jun 19, 2022
89a25ae
Point unittest to problematic data
Tom-Newton Jun 19, 2022
1f2f023
Add minimal test data to reproduce
Tom-Newton Jun 20, 2022
c1c17b5
Update test data
Tom-Newton Jun 20, 2022
96b508b
Fix test data
Tom-Newton Jun 20, 2022
0c38cc8
Add test
Tom-Newton Jun 20, 2022
e37486f
Update rust logging
Tom-Newton Jun 20, 2022
a482b84
It actually compiles!
Tom-Newton Jun 22, 2022
7eb031c
I don't understand rust
Tom-Newton Jun 22, 2022
9af132a
Debug logging
Tom-Newton Jun 22, 2022
a5172e4
Fix sample data
Tom-Newton Jun 22, 2022
d5bf0da
Tidy
Tom-Newton Jun 22, 2022
514c7cd
Update rust/src/action.rs
Tom-Newton Jun 23, 2022
e3a3be1
Test data with more complex types
Tom-Newton Jun 23, 2022
1a78ccd
Still return json stats if there is an error parsing parquet stats
Tom-Newton Jun 23, 2022
5af9a3a
Better error message
Tom-Newton Jun 23, 2022
eba1a1d
Unittest covering more complex types
Tom-Newton Jun 23, 2022
94a007b
Support parsing structs
Tom-Newton Jun 24, 2022
46098e7
Compare struct comes out the same as json
Tom-Newton Jun 24, 2022
183d131
Correct timestamp formatting
Tom-Newton Jun 24, 2022
957c8b5
In progress better test and support for more columns
Tom-Newton Jun 26, 2022
c145585
All types except decimal work
Tom-Newton Jun 26, 2022
bee0d15
Test data with nested structs
Tom-Newton Jun 26, 2022
bc8128a
Update test
Tom-Newton Jun 26, 2022
c2887d9
All working except decimal
Tom-Newton Jun 26, 2022
f82b6c9
Working decimal conversion
Tom-Newton Jun 26, 2022
2d3865c
Update test data again
Tom-Newton Jun 26, 2022
8e5efac
Passing test
Tom-Newton Jun 26, 2022
5b006f1
Tidy
Tom-Newton Jun 26, 2022
33a02c5
Tidy
Tom-Newton Jun 26, 2022
9bea24a
Tidy
Tom-Newton Jun 26, 2022
352e3d5
Remove .crc files
Tom-Newton Jun 26, 2022
0441169
Merge remote-tracking branch 'upstream2/main' into tomnewton/utilise_…
Tom-Newton Jun 26, 2022
8d9ca72
Remove unneeded return statements
Tom-Newton Jun 26, 2022
9ff8219
Remove python test
Tom-Newton Jun 26, 2022
a1571e6
Use from and reference instead of clone
Tom-Newton Jun 27, 2022
5e845cc
Use into
Tom-Newton Jun 27, 2022
a2c5733
dereference timestamp
Tom-Newton Jun 27, 2022
70f8cee
use into
Tom-Newton Jun 27, 2022
da132ee
Use reference to field
Tom-Newton Jun 27, 2022
d0d8d5f
de-reference date
Tom-Newton Jun 27, 2022
270b027
Update rust/tests/read_delta_test.rs
Tom-Newton Jun 27, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 2 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions rust/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ chrono = "0"
uuid = { version = "1", features = ["serde", "v4"] }
lazy_static = "1"
percent-encoding = "2"
num-bigint = "0.4"
num-traits = "0.2.15"

# HTTP Client
reqwest = { version = "0.11", default-features = false, features = [
Expand Down
86 changes: 77 additions & 9 deletions rust/src/action.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,13 @@
#![allow(non_camel_case_types)]

use crate::{schema::*, DeltaTableMetaData};
use parquet::record::{ListAccessor, MapAccessor, RowAccessor};
use chrono::{SecondsFormat, TimeZone, Utc};
use num_bigint::BigInt;
use num_traits::cast::ToPrimitive;
use parquet::record::{Field, ListAccessor, MapAccessor, RowAccessor};
use percent_encoding::percent_decode;
use serde::{Deserialize, Serialize};
use serde_json::{Map, Value};
use serde_json::{json, Map, Value};
use std::borrow::Borrow;
use std::collections::HashMap;
use std::hash::{Hash, Hasher};
Expand Down Expand Up @@ -114,7 +117,7 @@ impl ColumnCountStat {
}

/// Statistics associated with Add actions contained in the Delta log.
#[derive(Serialize, Deserialize, Debug, Default)]
#[derive(Serialize, Deserialize, Debug, Default, PartialEq, Eq)]
#[serde(rename_all = "camelCase")]
pub struct Stats {
/// Number of records in the file associated with the log action.
Expand Down Expand Up @@ -288,19 +291,34 @@ impl Add {
decode_path(&self.path).map(|path| Self { path, ..self })
}

/// Returns the best statistics available for this action.
///
/// The parquet struct statistics (`stats_parsed`) are tried first. When they are
/// absent, or when reading them fails (the failure is logged), the JSON statistics
/// are used instead.
pub fn get_stats(&self) -> Result<Option<Stats>, serde_json::error::Error> {
    // Collapse "failed to parse" into "not available" after logging, so that both
    // cases share the single JSON fallback below.
    let from_struct = match self.get_stats_parsed() {
        Ok(maybe_stats) => maybe_stats,
        Err(e) => {
            log::error!(
                "Error when reading parquet stats {:?} {e}. Attempting to read json stats",
                self.stats_parsed
            );
            None
        }
    };

    if let Some(stats) = from_struct {
        return Ok(Some(stats));
    }
    self.get_json_stats()
}

/// Deserializes the JSON-encoded `stats` string of this action with `serde_json`.
/// Since stats are defined as optional in the protocol, this may be `Ok(None)`;
/// when the string is present but cannot be parsed, the `serde_json` error is returned.
pub fn get_stats(&self) -> Result<Option<Stats>, serde_json::error::Error> {
pub fn get_json_stats(&self) -> Result<Option<Stats>, serde_json::error::Error> {
self.stats
.as_ref()
.map_or(Ok(None), |s| serde_json::from_str(s))
}

/// Returns the composite HashMap representation of stats contained in the action if present.
/// Since stats are defined as optional in the protocol, this may be None.
pub fn get_stats_parsed(&self) -> Result<Option<StatsParsed>, parquet::errors::ParquetError> {
pub fn get_stats_parsed(&self) -> Result<Option<Stats>, parquet::errors::ParquetError> {
self.stats_parsed.as_ref().map_or(Ok(None), |record| {
let mut stats = StatsParsed::default();
let mut stats = Stats::default();

for (i, (name, _)) in record.get_column_iter().enumerate() {
match name.as_str() {
Expand All @@ -315,7 +333,7 @@ impl Add {
"minValues" => match record.get_group(i) {
Ok(row) => {
for (name, field) in row.get_column_iter() {
stats.min_values.insert(name.clone(), field.clone());
stats.min_values.insert(name.clone(), field.into());
}
}
_ => {
Expand All @@ -325,7 +343,7 @@ impl Add {
"maxValues" => match record.get_group(i) {
Ok(row) => {
for (name, field) in row.get_column_iter() {
stats.max_values.insert(name.clone(), field.clone());
stats.max_values.insert(name.clone(), field.into());
}
}
_ => {
Expand All @@ -337,7 +355,7 @@ impl Add {
for (i, (name, _)) in row.get_column_iter().enumerate() {
match row.get_long(i) {
Ok(v) => {
stats.null_count.insert(name.clone(), v);
stats.null_count.insert(name.clone(), ColumnCountStat::Value(v));
}
_ => {
log::error!("Expect type of stats_parsed.nullRecords value to be struct, got: {}", row);
Expand All @@ -364,6 +382,56 @@ impl Add {
}
}

impl From<&Field> for ColumnValueStat {
fn from(field: &Field) -> Self {
match field {
Field::Group(group) => ColumnValueStat::Column(HashMap::from_iter(
group
.get_column_iter()
.map(|(field_name, field)| (field_name.clone(), field.into())),
)),
_ => ColumnValueStat::Value(primitive_parquet_field_to_json_value(field)),
}
}
}

fn primitive_parquet_field_to_json_value(field: &Field) -> serde_json::Value {
match field {
Field::Null => serde_json::Value::Null,
Field::Bool(value) => json!(value),
Field::Byte(value) => json!(value),
Field::Short(value) => json!(value),
Field::Int(value) => json!(value),
Field::Long(value) => json!(value),
Field::Float(value) => json!(value),
Field::Double(value) => json!(value),
Field::Str(value) => json!(value),
Field::Decimal(decimal) => match BigInt::from_signed_bytes_be(decimal.data()).to_f64() {
Some(int) => json!(int / (10_i64.pow((decimal.scale()).try_into().unwrap()) as f64)),
_ => serde_json::Value::Null,
},
Comment on lines +409 to +412
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So on the decimal representation, it does seem to be the case that the Spark implementation writes them out as numbers in the JSON file. I can't find the implementation code, but it does seem like they have a special parser that handles decimals. We don't so I'm worried about having a lossy conversion of Decimal into float.

It's a complicated subject and sort of an edge case, so I'm fine with this for now, but we should do a follow-up to make sure we are using decimal statistics appropriately.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I was a bit concerned about this too. I'm tempted to just remove it since that is probably a safer option.

Field::TimestampMillis(timestamp) => {
serde_json::Value::String(convert_timestamp_millis_to_string(*timestamp))
}
Field::Date(date) => serde_json::Value::String(convert_date_to_string(*date)),
_ => {
log::warn!("Unexpected field type {:?}", field,);
serde_json::Value::Null
}
}
}

/// Formats a millisecond timestamp as an RFC 3339 UTC string with millisecond
/// precision and a `Z` suffix (for example `2022-06-26T14:02:01.678Z`).
fn convert_timestamp_millis_to_string(value: u64) -> String {
    // Split into whole seconds and the sub-second remainder expressed in nanoseconds.
    let whole_seconds = (value / 1000) as i64;
    let leftover_nanos = ((value % 1000) * 1000000) as u32;

    Utc.timestamp(whole_seconds, leftover_nanos)
        .to_rfc3339_opts(SecondsFormat::Millis, true)
}

/// Formats a day count as a `YYYY-MM-DD` UTC date string.
///
/// `value` is multiplied by the number of seconds in a day and interpreted as a
/// UTC timestamp; only the date part is rendered.
// NOTE(review): `Utc.timestamp` presumably panics for values outside chrono's
// supported range — confirm if untrusted day counts can reach this function.
fn convert_date_to_string(value: u32) -> String {
    const NUM_SECONDS_IN_DAY: i64 = 60 * 60 * 24;
    let dt = Utc
        .timestamp(i64::from(value) * NUM_SECONDS_IN_DAY, 0)
        .date();
    dt.format("%Y-%m-%d").to_string()
}

/// Describes the data format of files in the table.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub struct Format {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}
{"metaData":{"id":"9c4df48c-6085-4dcf-b73e-13147a5a405e","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"integer\",\"type\":\"integer\",\"nullable\":false,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.checkpoint.writeStatsAsJson":"false","delta.checkpoint.writeStatsAsStruct":"true"},"createdTime":1656252116073}}
{"commitInfo":{"timestamp":1656252116149,"userId":"6114986638742036","userName":"tomnewton@wayve.ai","operation":"CREATE OR REPLACE TABLE","operationParameters":{"isManaged":"false","description":null,"partitionBy":"[]","properties":"{\"delta.checkpoint.writeStatsAsJson\":\"false\",\"delta.checkpoint.writeStatsAsStruct\":\"true\"}"},"notebook":{"notebookId":"1829280694121074"},"clusterId":"0622-151429-s7rz8ws","isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{},"engineInfo":"Databricks-Runtime/10.4.x-scala2.12","txnId":"af1b716b-5ec8-41f6-9cc2-bb89e010f943"}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{"metaData":{"id":"9c4df48c-6085-4dcf-b73e-13147a5a405e","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"integer\",\"type\":\"integer\",\"nullable\":false,\"metadata\":{}},{\"name\":\"null\",\"type\":\"boolean\",\"nullable\":true,\"metadata\":{}},{\"name\":\"boolean\",\"type\":\"boolean\",\"nullable\":true,\"metadata\":{}},{\"name\":\"double\",\"type\":\"double\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal\",\"type\":\"decimal(8,5)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"string\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"binary\",\"type\":\"binary\",\"nullable\":true,\"metadata\":{}},{\"name\":\"date\",\"type\":\"date\",\"nullable\":true,\"metadata\":{}},{\"name\":\"timestamp\",\"type\":\"timestamp\",\"nullable\":true,\"metadata\":{}},{\"name\":\"struct\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"struct_element\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"map\",\"type\":{\"type\":\"map\",\"keyType\":\"string\",\"valueType\":\"string\",\"valueContainsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"array\",\"type\":{\"type\":\"array\",\"elementType\":\"string\",\"containsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"nested_struct\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"struct_element\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"nested_struct_element\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"struct_of_array_of_map\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"struct_element\",\"type\":{\"type\":\"array\",\"elementType\":{\"type\":\"map\",\"keyType\":\"string\",\"valueType\":\"string\",\"valueContainsNull\":true},\"containsNull\":true},\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"conf
iguration":{"delta.checkpoint.writeStatsAsJson":"false","delta.checkpoint.writeStatsAsStruct":"true"},"createdTime":1656252116073}}
{"add":{"path":"part-00000-51653f4d-b029-44bd-9fda-578e73518a26-c000.snappy.parquet","partitionValues":{},"size":5489,"modificationTime":1656252122000,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"integer\":0,\"double\":1.234,\"decimal\":-5.67800,\"string\":\"string\",\"date\":\"2022-06-26\",\"timestamp\":\"2022-06-26T14:02:01.678Z\",\"struct\":{\"struct_element\":\"struct_value\"},\"nested_struct\":{\"struct_element\":{\"nested_struct_element\":\"nested_struct_value\"}}},\"maxValues\":{\"integer\":0,\"double\":1.234,\"decimal\":-5.67800,\"string\":\"string\",\"date\":\"2022-06-26\",\"timestamp\":\"2022-06-26T14:02:01.678Z\",\"struct\":{\"struct_element\":\"struct_value\"},\"nested_struct\":{\"struct_element\":{\"nested_struct_element\":\"nested_struct_value\"}}},\"nullCount\":{\"integer\":0,\"null\":1,\"boolean\":0,\"double\":0,\"decimal\":0,\"string\":0,\"binary\":0,\"date\":0,\"timestamp\":0,\"struct\":{\"struct_element\":0},\"map\":0,\"array\":0,\"nested_struct\":{\"struct_element\":{\"nested_struct_element\":0}},\"struct_of_array_of_map\":{\"struct_element\":0}}}","tags":{"INSERTION_TIME":"1656252122000000","OPTIMIZE_TARGET_SIZE":"268435456"}}}
{"commitInfo":{"timestamp":1656252122901,"userId":"6114986638742036","userName":"tomnewton@wayve.ai","operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"notebook":{"notebookId":"1829280694121074"},"clusterId":"0622-151429-s7rz8ws","readVersion":0,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"5489"},"engineInfo":"Databricks-Runtime/10.4.x-scala2.12","txnId":"4398d2b8-9a41-46ea-b92c-38820595bfec"}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"add":{"path":"part-00000-a222e75a-a0b4-4e72-a776-2776ece95606-c000.snappy.parquet","partitionValues":{},"size":5489,"modificationTime":1656252124000,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"integer\":1,\"double\":1.234,\"decimal\":-5.67800,\"string\":\"string\",\"date\":\"2022-06-26\",\"timestamp\":\"2022-06-26T14:02:03.793Z\",\"struct\":{\"struct_element\":\"struct_value\"},\"nested_struct\":{\"struct_element\":{\"nested_struct_element\":\"nested_struct_value\"}}},\"maxValues\":{\"integer\":1,\"double\":1.234,\"decimal\":-5.67800,\"string\":\"string\",\"date\":\"2022-06-26\",\"timestamp\":\"2022-06-26T14:02:03.793Z\",\"struct\":{\"struct_element\":\"struct_value\"},\"nested_struct\":{\"struct_element\":{\"nested_struct_element\":\"nested_struct_value\"}}},\"nullCount\":{\"integer\":0,\"null\":1,\"boolean\":0,\"double\":0,\"decimal\":0,\"string\":0,\"binary\":0,\"date\":0,\"timestamp\":0,\"struct\":{\"struct_element\":0},\"map\":0,\"array\":0,\"nested_struct\":{\"struct_element\":{\"nested_struct_element\":0}},\"struct_of_array_of_map\":{\"struct_element\":0}}}","tags":{"INSERTION_TIME":"1656252124000000","OPTIMIZE_TARGET_SIZE":"268435456"}}}
{"commitInfo":{"timestamp":1656252124212,"userId":"6114986638742036","userName":"tomnewton@wayve.ai","operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"notebook":{"notebookId":"1829280694121074"},"clusterId":"0622-151429-s7rz8ws","readVersion":1,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"5489"},"engineInfo":"Databricks-Runtime/10.4.x-scala2.12","txnId":"26079ba4-cd65-4421-9bd3-ba0416dfb509"}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"add":{"path":"part-00000-b97608c1-8d8e-4369-b067-bd84435a1606-c000.snappy.parquet","partitionValues":{},"size":5489,"modificationTime":1656252125000,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"integer\":2,\"double\":1.234,\"decimal\":-5.67800,\"string\":\"string\",\"date\":\"2022-06-26\",\"timestamp\":\"2022-06-26T14:02:05.077Z\",\"struct\":{\"struct_element\":\"struct_value\"},\"nested_struct\":{\"struct_element\":{\"nested_struct_element\":\"nested_struct_value\"}}},\"maxValues\":{\"integer\":2,\"double\":1.234,\"decimal\":-5.67800,\"string\":\"string\",\"date\":\"2022-06-26\",\"timestamp\":\"2022-06-26T14:02:05.077Z\",\"struct\":{\"struct_element\":\"struct_value\"},\"nested_struct\":{\"struct_element\":{\"nested_struct_element\":\"nested_struct_value\"}}},\"nullCount\":{\"integer\":0,\"null\":1,\"boolean\":0,\"double\":0,\"decimal\":0,\"string\":0,\"binary\":0,\"date\":0,\"timestamp\":0,\"struct\":{\"struct_element\":0},\"map\":0,\"array\":0,\"nested_struct\":{\"struct_element\":{\"nested_struct_element\":0}},\"struct_of_array_of_map\":{\"struct_element\":0}}}","tags":{"INSERTION_TIME":"1656252125000000","OPTIMIZE_TARGET_SIZE":"268435456"}}}
{"commitInfo":{"timestamp":1656252125457,"userId":"6114986638742036","userName":"tomnewton@wayve.ai","operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"notebook":{"notebookId":"1829280694121074"},"clusterId":"0622-151429-s7rz8ws","readVersion":2,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"5489"},"engineInfo":"Databricks-Runtime/10.4.x-scala2.12","txnId":"c2a690f3-3bb7-4dc2-b00e-cd81382e8b21"}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"add":{"path":"part-00000-7c59d077-8928-4402-b5e6-7259f5440fd0-c000.snappy.parquet","partitionValues":{},"size":5489,"modificationTime":1656252126000,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"integer\":3,\"double\":1.234,\"decimal\":-5.67800,\"string\":\"string\",\"date\":\"2022-06-26\",\"timestamp\":\"2022-06-26T14:02:06.242Z\",\"struct\":{\"struct_element\":\"struct_value\"},\"nested_struct\":{\"struct_element\":{\"nested_struct_element\":\"nested_struct_value\"}}},\"maxValues\":{\"integer\":3,\"double\":1.234,\"decimal\":-5.67800,\"string\":\"string\",\"date\":\"2022-06-26\",\"timestamp\":\"2022-06-26T14:02:06.242Z\",\"struct\":{\"struct_element\":\"struct_value\"},\"nested_struct\":{\"struct_element\":{\"nested_struct_element\":\"nested_struct_value\"}}},\"nullCount\":{\"integer\":0,\"null\":1,\"boolean\":0,\"double\":0,\"decimal\":0,\"string\":0,\"binary\":0,\"date\":0,\"timestamp\":0,\"struct\":{\"struct_element\":0},\"map\":0,\"array\":0,\"nested_struct\":{\"struct_element\":{\"nested_struct_element\":0}},\"struct_of_array_of_map\":{\"struct_element\":0}}}","tags":{"INSERTION_TIME":"1656252126000000","OPTIMIZE_TARGET_SIZE":"268435456"}}}
{"commitInfo":{"timestamp":1656252126676,"userId":"6114986638742036","userName":"tomnewton@wayve.ai","operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"notebook":{"notebookId":"1829280694121074"},"clusterId":"0622-151429-s7rz8ws","readVersion":3,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"5489"},"engineInfo":"Databricks-Runtime/10.4.x-scala2.12","txnId":"4522c45a-ed48-4b7d-bfc6-1bad78981261"}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"add":{"path":"part-00000-7264b4fa-c3d7-4e25-956f-716358f594ff-c000.snappy.parquet","partitionValues":{},"size":5489,"modificationTime":1656252127000,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"integer\":4,\"double\":1.234,\"decimal\":-5.67800,\"string\":\"string\",\"date\":\"2022-06-26\",\"timestamp\":\"2022-06-26T14:02:07.511Z\",\"struct\":{\"struct_element\":\"struct_value\"},\"nested_struct\":{\"struct_element\":{\"nested_struct_element\":\"nested_struct_value\"}}},\"maxValues\":{\"integer\":4,\"double\":1.234,\"decimal\":-5.67800,\"string\":\"string\",\"date\":\"2022-06-26\",\"timestamp\":\"2022-06-26T14:02:07.511Z\",\"struct\":{\"struct_element\":\"struct_value\"},\"nested_struct\":{\"struct_element\":{\"nested_struct_element\":\"nested_struct_value\"}}},\"nullCount\":{\"integer\":0,\"null\":1,\"boolean\":0,\"double\":0,\"decimal\":0,\"string\":0,\"binary\":0,\"date\":0,\"timestamp\":0,\"struct\":{\"struct_element\":0},\"map\":0,\"array\":0,\"nested_struct\":{\"struct_element\":{\"nested_struct_element\":0}},\"struct_of_array_of_map\":{\"struct_element\":0}}}","tags":{"INSERTION_TIME":"1656252127000000","OPTIMIZE_TARGET_SIZE":"268435456"}}}
{"commitInfo":{"timestamp":1656252127961,"userId":"6114986638742036","userName":"tomnewton@wayve.ai","operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"notebook":{"notebookId":"1829280694121074"},"clusterId":"0622-151429-s7rz8ws","readVersion":4,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"5489"},"engineInfo":"Databricks-Runtime/10.4.x-scala2.12","txnId":"f60b53dc-cb04-4e5b-a541-037c9c2cee89"}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"add":{"path":"part-00000-1a04af5a-e0d5-497d-9496-7bcf9af3419f-c000.snappy.parquet","partitionValues":{},"size":5489,"modificationTime":1656252129000,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"integer\":5,\"double\":1.234,\"decimal\":-5.67800,\"string\":\"string\",\"date\":\"2022-06-26\",\"timestamp\":\"2022-06-26T14:02:08.788Z\",\"struct\":{\"struct_element\":\"struct_value\"},\"nested_struct\":{\"struct_element\":{\"nested_struct_element\":\"nested_struct_value\"}}},\"maxValues\":{\"integer\":5,\"double\":1.234,\"decimal\":-5.67800,\"string\":\"string\",\"date\":\"2022-06-26\",\"timestamp\":\"2022-06-26T14:02:08.788Z\",\"struct\":{\"struct_element\":\"struct_value\"},\"nested_struct\":{\"struct_element\":{\"nested_struct_element\":\"nested_struct_value\"}}},\"nullCount\":{\"integer\":0,\"null\":1,\"boolean\":0,\"double\":0,\"decimal\":0,\"string\":0,\"binary\":0,\"date\":0,\"timestamp\":0,\"struct\":{\"struct_element\":0},\"map\":0,\"array\":0,\"nested_struct\":{\"struct_element\":{\"nested_struct_element\":0}},\"struct_of_array_of_map\":{\"struct_element\":0}}}","tags":{"INSERTION_TIME":"1656252129000000","OPTIMIZE_TARGET_SIZE":"268435456"}}}
{"commitInfo":{"timestamp":1656252129258,"userId":"6114986638742036","userName":"tomnewton@wayve.ai","operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"notebook":{"notebookId":"1829280694121074"},"clusterId":"0622-151429-s7rz8ws","readVersion":5,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"5489"},"engineInfo":"Databricks-Runtime/10.4.x-scala2.12","txnId":"0bf1e2ce-ef4f-4235-aa4c-918507d5097b"}}