Skip to content

Commit

Permalink
Work around LargeUtf8 sorting bug by casting to Utf8
Browse files Browse the repository at this point in the history
  • Loading branch information
jonmmease committed Mar 29, 2023
1 parent d2d13d9 commit 8be9a9e
Showing 1 changed file with 30 additions and 1 deletion.
31 changes: 30 additions & 1 deletion vegafusion-runtime/src/data/tasks.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use crate::task_graph::task::TaskCall;

use async_trait::async_trait;

use datafusion_expr::{lit, Expr};
use datafusion_expr::{expr, lit, Expr};
use std::collections::{HashMap, HashSet};

use std::sync::Arc;
Expand Down Expand Up @@ -142,6 +142,9 @@ impl TaskCall for DataUrlTask {
df
};

// Perform any up-front type conversions
let df = pre_process_column_types(df).await?;

// Process datetime columns
let df = process_datetimes(&parse, df, &config.tz_config).await?;

Expand Down Expand Up @@ -268,6 +271,32 @@ fn check_builtin_dataset(url: String) -> String {
}
}

/// Applies up-front column type conversions to `df`.
///
/// Every `LargeUtf8` column is cast to `Utf8` and aliased back to its original
/// name; all other columns are selected unchanged. This works around the
/// LargeUtf8 sorting bug tracked in https://github.com/apache/arrow-rs/issues/2654.
///
/// # Returns
/// The input dataframe itself when no column is `LargeUtf8`; otherwise the
/// result of selecting the converted column expressions from `df`.
///
/// # Errors
/// Propagates any error returned by `df.select`.
async fn pre_process_column_types(df: Arc<dyn DataFrame>) -> Result<Arc<dyn DataFrame>> {
    // Set when at least one column actually needs the cast.
    let mut needs_cast = false;

    // Build one projection expression per field, in schema order. The schema
    // temporary is dropped at the end of this statement, before any `.await`.
    let projections: Vec<Expr> = df
        .schema()
        .fields
        .iter()
        .map(|field| {
            let column = flat_col(field.name());
            match field.data_type() {
                DataType::LargeUtf8 => {
                    needs_cast = true;
                    // Cast to Utf8, keeping the original column name.
                    let cast = Expr::Cast(expr::Cast {
                        expr: Box::new(column),
                        data_type: DataType::Utf8,
                    });
                    cast.alias(field.name())
                }
                _ => column,
            }
        })
        .collect();

    // Nothing to convert: hand back the dataframe without an extra select.
    if !needs_cast {
        return Ok(df);
    }

    df.select(projections).await
}

async fn process_datetimes(
parse: &Option<Parse>,
sql_df: Arc<dyn DataFrame>,
Expand Down

0 comments on commit 8be9a9e

Please sign in to comment.