Skip to content

Commit

Permalink
Modify decimal regex to accept positive exponent specifier (#5649)
Browse files Browse the repository at this point in the history
* Modify decimal regex to accept positive exponent specifier

* Add test for positive exponent specifier

* nit
  • Loading branch information
jdcasale committed Apr 16, 2024
1 parent 0d031cc commit 34e2ac2
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 1 deletion.
23 changes: 22 additions & 1 deletion arrow-csv/src/reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ lazy_static! {
static ref REGEX_SET: RegexSet = RegexSet::new([
r"(?i)^(true)$|^(false)$(?-i)", //BOOLEAN
r"^-?(\d+)$", //INTEGER
r"^-?((\d*\.\d+|\d+\.\d*)([eE]-?\d+)?|\d+([eE]-?\d+))$", //DECIMAL
r"^-?((\d*\.\d+|\d+\.\d*)([eE][-+]?\d+)?|\d+([eE][-+]?\d+))$", //DECIMAL
r"^\d{4}-\d\d-\d\d$", //DATE32
r"^\d{4}-\d\d-\d\d[T ]\d\d:\d\d:\d\d(?:[^\d\.].*)?$", //Timestamp(Second)
r"^\d{4}-\d\d-\d\d[T ]\d\d:\d\d:\d\d\.\d{1,3}(?:[^\d].*)?$", //Timestamp(Millisecond)
Expand Down Expand Up @@ -1658,6 +1658,27 @@ mod tests {
assert_eq!(batch.schema().as_ref(), &expected_schema);
}

/// Infers a schema from the headerless, comma-delimited scientific-notation
/// fixture, reads the first batch with that schema, and checks that the first
/// column is typed as `Float64`.
#[test]
fn test_scientific_notation_with_inference() {
    let mut input = File::open("test/data/scientific_notation_test.csv").unwrap();
    let csv_format = Format::default().with_header(false).with_delimiter(b',');

    // The file handle was passed to `infer_schema`, so rewind it before it is
    // handed to the reader below.
    let inferred = csv_format.infer_schema(&mut input, None).unwrap().0;
    input.rewind().unwrap();

    let mut reader = ReaderBuilder::new(Arc::new(inferred))
        .with_format(csv_format)
        .with_batch_size(512)
        .with_projection(vec![0, 1])
        .build(input)
        .unwrap();

    let first_batch = reader.next().unwrap().unwrap();
    let batch_schema = first_batch.schema();

    assert_eq!(&DataType::Float64, batch_schema.field(0).data_type());
}

#[test]
fn test_parse_invalid_csv() {
let file = File::open("test/data/various_types_invalid.csv").unwrap();
Expand Down
19 changes: 19 additions & 0 deletions arrow-csv/test/data/scientific_notation_test.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
1.439e+04, positive_exponent
1.31e+04, positive_exponent
1.2711e+0, positive_exponent
1.44e+04, positive_exponent
2.22e+04, positive_exponent
1.149e+04, positive_exponent
2.139e+04, positive_exponent
7.322e+04, positive_exponent
1.531e+04, positive_exponent
2.206e-04, negative_exponent
1.517e-04, negative_exponent
2.332e-04, negative_exponent
2.19e-04, negative_exponent
2.087e-04, negative_exponent
12683.18, no_exponent
7134.6, no_exponent
8540.17, no_exponent
21462.36, no_exponent
1120.76, no_exponent

0 comments on commit 34e2ac2

Please sign in to comment.