Skip to content

Commit

Permalink
Allow specifying comment character for CSV reader (#5759)
Browse files Browse the repository at this point in the history
This patch adds reader support for a comment character when reading CSV
files. While comments, like almost everything else about the CSV format,
are not truly standardized, a common convention supported by many CSV
readers[^1][^2] is to ignore full lines starting with a comment
character (often `#`); inline or end-of-line comments are not supported.

Example:

    # This is a comment in a CSV file without header.
    1,2
    # Comment inside the data block.
    11,22

The implementation of this for Arrow is pretty straightforward, as all
we need to do is expose the existing `comment` option of `csv_core` used
to read CSV files.

Closes #5758.

[^1]: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
[^2]: https://stat.ethz.ch/R-manual/R-devel/library/utils/html/read.table.html
  • Loading branch information
bbannier committed May 13, 2024
1 parent cd39b8c commit 6ab67df
Showing 1 changed file with 51 additions and 0 deletions.
51 changes: 51 additions & 0 deletions arrow-csv/src/reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,7 @@ pub struct Format {
escape: Option<u8>,
quote: Option<u8>,
terminator: Option<u8>,
comment: Option<u8>,
null_regex: NullRegex,
truncated_rows: bool,
}
Expand Down Expand Up @@ -260,6 +261,11 @@ impl Format {
self
}

/// Specify the comment byte for this [`Format`]
///
/// Lines that start with `comment` are skipped in their entirety when
/// reading; a comment byte that appears later in a line is not treated as
/// the start of a comment.
///
/// The byte is stored as `Some(comment)` and is forwarded to the `comment`
/// option of the underlying CSV reader/parser builders.
pub fn with_comment(mut self, comment: u8) -> Self {
self.comment = Some(comment);
self
}

/// Provide a regex to match null values, defaults to `^$`
pub fn with_null_regex(mut self, null_regex: Regex) -> Self {
self.null_regex = NullRegex(Some(null_regex));
Expand Down Expand Up @@ -353,13 +359,17 @@ impl Format {
if let Some(t) = self.terminator {
builder.terminator(csv::Terminator::Any(t));
}
if let Some(comment) = self.comment {
builder.comment(Some(comment));
}
builder.from_reader(reader)
}

/// Build a [`csv_core::Reader`] for this [`Format`]
fn build_parser(&self) -> csv_core::Reader {
let mut builder = csv_core::ReaderBuilder::new();
builder.escape(self.escape);
builder.comment(self.comment);

if let Some(c) = self.delimiter {
builder.delimiter(c);
Expand Down Expand Up @@ -1109,6 +1119,11 @@ impl ReaderBuilder {
self
}

/// Specify the comment byte used when reading CSV data
///
/// Lines that start with `comment` are skipped in their entirety; a comment
/// byte that appears later in a line is not treated as the start of a
/// comment.
///
/// This sets the `comment` field of the builder's format to `Some(comment)`.
pub fn with_comment(mut self, comment: u8) -> Self {
self.format.comment = Some(comment);
self
}

/// Provide a regex to match null values, defaults to `^$`
pub fn with_null_regex(mut self, null_regex: Regex) -> Self {
self.format.null_regex = NullRegex(Some(null_regex));
Expand Down Expand Up @@ -2536,4 +2551,40 @@ mod tests {
assert_eq!(&t.get(), expected, "{values:?}")
}
}

#[test]
fn test_comment() {
    // Two non-nullable Int8 columns, no header row.
    let fields = vec![
        Field::new("a", DataType::Int8, false),
        Field::new("b", DataType::Int8, false),
    ];
    let schema = Arc::new(Schema::new(fields));

    // Comment lines appear both before and in between the data rows.
    let input = "# comment1 \n1,2\n#comment2\n11,22";
    let mut cursor = Cursor::new(input.as_bytes());

    let batches = ReaderBuilder::new(schema)
        .with_comment(b'#')
        .build(&mut cursor)
        .unwrap()
        .collect::<Result<Vec<_>, _>>()
        .unwrap();

    assert_eq!(batches.len(), 1);
    let batch = &batches[0];
    assert_eq!(batch.num_columns(), 2);

    // Only the two data rows must survive; both comment lines are dropped.
    let expected: [&[i8]; 2] = [&[1, 11], &[2, 22]];
    for (idx, want) in expected.iter().enumerate() {
        let column = batch
            .column(idx)
            .as_any()
            .downcast_ref::<Int8Array>()
            .unwrap();
        assert_eq!(&column.values()[..], *want);
    }
}
}

0 comments on commit 6ab67df

Please sign in to comment.