Skip to content

Commit

Permalink
Merge pull request #522 from splitgraph/delta-decimal-stats-patch
Browse files Browse the repository at this point in the history
Point delta-rs to our fork
  • Loading branch information
gruuya committed May 9, 2024
2 parents c29e2b3 + 34438a0 commit a313cf6
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 7 deletions.
19 changes: 14 additions & 5 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Expand Up @@ -93,7 +93,7 @@ datafusion-expr = { workspace = true }

datafusion-remote-tables = { path = "./datafusion_remote_tables", optional = true }

deltalake = { git = "https://github.com/delta-io/delta-rs", rev = "28ad3950d90573fa8ff413c336b657b8561e1d41", features = ["datafusion"] }
deltalake = { git = "https://github.com/splitgraph/delta-rs", branch = "convert-to-delta-stats", features = ["datafusion"] }

futures = "0.3"
hex = ">=0.4.0"
Expand Down
77 changes: 77 additions & 0 deletions tests/statements/convert.rs
Expand Up @@ -71,5 +71,82 @@ async fn test_convert_from_flat_parquet_table() -> Result<()> {
)
.await;

// Ensure partition/column stats are collected in add logs:
// https://github.com/delta-io/delta-rs/pull/2491
let mut table = context.try_get_delta_table("table_converted").await?;
table.load().await?;

// Convoluted way of sort-stable stats asserting
let state = table.snapshot()?;
let mut min_values = state
.min_values(&Column::from_name("column1"))
.expect("min values exist")
.as_any()
.downcast_ref::<Int64Array>()
.expect("Failed to downcast to Int64Array")
.values()
.to_vec();
min_values.sort();
assert_eq!(min_values, vec![1, 3, 5]);

let mut max_values = state
.max_values(&Column::from_name("column1"))
.expect("max values exist")
.as_any()
.downcast_ref::<Int64Array>()
.expect("Failed to downcast to Int64Array")
.values()
.to_vec();
max_values.sort();
assert_eq!(max_values, vec![2, 4, 6]);

let min_values = state
.min_values(&Column::from_name("column2"))
.expect("min values exist");
let min_values = min_values
.as_any()
.downcast_ref::<StringArray>()
.expect("Failed to downcast to StringArray")
.iter()
.flatten()
.sorted()
.collect::<Vec<&str>>();
assert_eq!(min_values, vec!["five", "four", "one"]);

let max_values = state
.max_values(&Column::from_name("column2"))
.expect("max values exist");
let max_values = max_values
.as_any()
.downcast_ref::<StringArray>()
.expect("Failed to downcast to StringArray")
.iter()
.flatten()
.sorted()
.collect::<Vec<&str>>();
assert_eq!(max_values, vec!["six", "three", "two"]);

assert_eq!(
table.statistics(),
Some(Statistics {
num_rows: Exact(6),
total_byte_size: Inexact(1985),
column_statistics: vec![
ColumnStatistics {
null_count: Exact(0),
max_value: Exact(ScalarValue::Int64(Some(6))),
min_value: Exact(ScalarValue::Int64(Some(1))),
distinct_count: Absent
},
ColumnStatistics {
null_count: Exact(0),
max_value: Absent,
min_value: Absent,
distinct_count: Absent
}
]
}),
);

Ok(())
}
7 changes: 6 additions & 1 deletion tests/statements/mod.rs
Expand Up @@ -2,12 +2,17 @@ use std::collections::HashMap;
use std::env;
use std::time::Duration;

use arrow::array::{Int64Array, StringArray};
use arrow::record_batch::RecordBatch;
use chrono::{TimeZone, Utc};
use datafusion::assert_batches_eq;
use datafusion::datasource::TableProvider;
use datafusion::physical_optimizer::pruning::PruningStatistics;
use datafusion_common::stats::Precision::{Absent, Exact, Inexact};
use datafusion_common::Column;
use datafusion_common::{assert_contains, Result};
use itertools::sorted;
use datafusion_common::{ColumnStatistics, ScalarValue, Statistics};
use itertools::{sorted, Itertools};

use seafowl::catalog::{DEFAULT_DB, DEFAULT_SCHEMA};
#[cfg(feature = "remote-tables")]
Expand Down

0 comments on commit a313cf6

Please sign in to comment.