Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Don't try to infer nullability in CSV reader #2860

Merged
merged 8 commits on Oct 13, 2022
Merged
35 changes: 15 additions & 20 deletions arrow/src/csv/reader.rs
Expand Up @@ -200,8 +200,6 @@ fn infer_reader_schema_with_csv_options<R: Read>(
let header_length = headers.len();
// keep track of inferred field types
let mut column_types: Vec<HashSet<DataType>> = vec![HashSet::new(); header_length];
// keep track of columns with nulls
let mut nulls: Vec<bool> = vec![false; header_length];

let mut records_count = 0;
let mut fields = vec![];
Expand All @@ -214,12 +212,10 @@ fn infer_reader_schema_with_csv_options<R: Read>(
}
records_count += 1;

for i in 0..header_length {
for (i, column_type) in column_types.iter_mut().enumerate().take(header_length) {
Dandandan marked this conversation as resolved.
Show resolved Hide resolved
if let Some(string) = record.get(i) {
if string.is_empty() {
nulls[i] = true;
} else {
column_types[i]
if !string.is_empty() {
column_type
.insert(infer_field_schema(string, roptions.datetime_re.clone()));
}
}
Expand All @@ -229,29 +225,28 @@ fn infer_reader_schema_with_csv_options<R: Read>(
// build schema from inference results
for i in 0..header_length {
let possibilities = &column_types[i];
let has_nulls = nulls[i];
let field_name = &headers[i];

// determine data type based on possible types
// if there are incompatible types, use DataType::Utf8
match possibilities.len() {
1 => {
for dtype in possibilities.iter() {
fields.push(Field::new(field_name, dtype.clone(), has_nulls));
fields.push(Field::new(field_name, dtype.clone(), true));
}
}
2 => {
if possibilities.contains(&DataType::Int64)
&& possibilities.contains(&DataType::Float64)
{
// we have an integer and double, fall down to double
fields.push(Field::new(field_name, DataType::Float64, has_nulls));
fields.push(Field::new(field_name, DataType::Float64, true));
} else {
// default to Utf8 for conflicting datatypes (e.g bool and int)
fields.push(Field::new(field_name, DataType::Utf8, has_nulls));
fields.push(Field::new(field_name, DataType::Utf8, true));
}
}
_ => fields.push(Field::new(field_name, DataType::Utf8, has_nulls)),
_ => fields.push(Field::new(field_name, DataType::Utf8, true)),
}
}

Expand Down Expand Up @@ -1287,9 +1282,9 @@ mod tests {

let mut csv = builder.build(file).unwrap();
let expected_schema = Schema::new(vec![
Field::new("city", DataType::Utf8, false),
Field::new("lat", DataType::Float64, false),
Field::new("lng", DataType::Float64, false),
Field::new("city", DataType::Utf8, true),
Field::new("lat", DataType::Float64, true),
Field::new("lng", DataType::Float64, true),
]);
assert_eq!(Arc::new(expected_schema), csv.schema());
let batch = csv.next().unwrap().unwrap();
Expand Down Expand Up @@ -1514,10 +1509,10 @@ mod tests {
]
);

assert!(!schema.field(0).is_nullable());
assert!(schema.field(0).is_nullable());
assert!(schema.field(1).is_nullable());
assert!(schema.field(2).is_nullable());
assert!(!schema.field(3).is_nullable());
assert!(schema.field(3).is_nullable());
assert!(schema.field(4).is_nullable());
assert!(schema.field(5).is_nullable());

Expand Down Expand Up @@ -1798,10 +1793,10 @@ mod tests {
)?;

assert_eq!(schema.fields().len(), 4);
assert!(!schema.field(0).is_nullable());
assert!(schema.field(0).is_nullable());
assert!(schema.field(1).is_nullable());
assert!(!schema.field(2).is_nullable());
assert!(!schema.field(3).is_nullable());
assert!(schema.field(2).is_nullable());
assert!(schema.field(3).is_nullable());

assert_eq!(&DataType::Int64, schema.field(0).data_type());
assert_eq!(&DataType::Utf8, schema.field(1).data_type());
Expand Down