Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 72 additions & 4 deletions arrow-csv/src/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,8 @@ pub struct WriterBuilder {
quote: u8,
/// Optional escape character. Defaults to `b'\\'`
escape: u8,
/// Optional line terminator. Defaults to `LF` (`\n`)
terminator: Terminator,
/// Enable double quote escapes. Defaults to `true`
double_quote: bool,
/// Optional date format for date arrays
Expand All @@ -380,13 +382,23 @@ pub struct WriterBuilder {
quote_style: QuoteStyle,
}

/// The line terminator to use when writing CSV files.
///
/// This is a small plain value (at most a single byte of payload), so it is
/// `Copy` and supports equality comparison, which lets callers inspect and
/// compare a builder's configured terminator directly.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Terminator {
    /// Use CRLF (`\r\n`) as the line terminator
    CRLF,
    /// Use the specified byte character as the line terminator
    Any(u8),
}

impl Default for WriterBuilder {
fn default() -> Self {
WriterBuilder {
delimiter: b',',
has_header: true,
quote: b'"',
escape: b'\\',
terminator: Terminator::Any(b'\n'),
double_quote: true,
date_format: None,
datetime_format: None,
Expand Down Expand Up @@ -609,15 +621,33 @@ impl WriterBuilder {
self.quote_style
}

/// Set the CSV file's line terminator
pub fn with_line_terminator(mut self, terminator: Terminator) -> Self {
self.terminator = terminator;
self
}

/// Get the CSV file's line terminator, defaults to `LF` (`\n`)
pub fn line_terminator(&self) -> &Terminator {
&self.terminator
}

/// Create a new `Writer`
pub fn build<W: Write>(self, writer: W) -> Writer<W> {
let mut builder = csv::WriterBuilder::new();

let terminator = match self.terminator {
Terminator::CRLF => csv::Terminator::CRLF,
Terminator::Any(byte) => csv::Terminator::Any(byte),
};

let writer = builder
.delimiter(self.delimiter)
.quote(self.quote)
.quote_style(self.quote_style)
.double_quote(self.double_quote)
.escape(self.escape)
.terminator(terminator)
.from_writer(writer);
Writer {
writer,
Expand Down Expand Up @@ -1027,10 +1057,48 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555,23:46:03,foo
let mut buffer: Vec<u8> = vec![];
file.read_to_end(&mut buffer).unwrap();

assert_eq!(
"c1,c2\n00:02,46:17\n00:02,\n",
String::from_utf8(buffer).unwrap()
);
let output = String::from_utf8(buffer).unwrap();
assert_eq!(output, "c1,c2\n00:02,46:17\n00:02,\n");
}

#[test]
fn test_write_csv_with_lf_terminator() {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

these tests are quite repetitive (they are like 20 lines long but the only difference is 2 lines -- the terminator and the expected output)

Can you please refactor them into a common harness so that it is easier for (me, a human) to review them and ensure coverage is adequate?

Thank you

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am trying to clear up the PR review queue, so I took the liberty of pushing a commit that reduces the test replication and merged up from main

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you! Is there anything else for me to take care of?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nope -- this looks good

let output = write_batch_with_terminator(Terminator::Any(b'\n'));
assert_eq!(output, "c1,c2\nhello,1\nworld,2\n");
}

#[test]
fn test_write_csv_with_crlf_terminator() {
    // With CRLF selected, the header and both data rows end in `\r\n`.
    let expected = "c1,c2\r\nhello,1\r\nworld,2\r\n";
    let written = write_batch_with_terminator(Terminator::CRLF);
    assert_eq!(written, expected);
}

#[test]
fn test_write_csv_with_any_terminator() {
    // An arbitrary single byte (`|` here) replaces the newline after each record.
    let expected = "c1,c2|hello,1|world,2|";
    let written = write_batch_with_terminator(Terminator::Any(b'|'));
    assert_eq!(written, expected);
}

/// Test helper: writes a fixed two-row batch (`c1`: Utf8, `c2`: UInt32) as CSV
/// using the given line terminator and returns the output as a `String`.
fn write_batch_with_terminator(terminator: Terminator) -> String {
    let fields = vec![
        Field::new("c1", DataType::Utf8, false),
        Field::new("c2", DataType::UInt32, false),
    ];
    let schema = Arc::new(Schema::new(fields));

    let words = StringArray::from(vec!["hello", "world"]);
    let numbers = PrimitiveArray::<UInt32Type>::from(vec![1, 2]);
    let batch = RecordBatch::try_new(schema, vec![Arc::new(words), Arc::new(numbers)]).unwrap();

    let mut bytes = Vec::new();
    {
        // The writer holds a mutable borrow of `bytes`; the inner scope ends
        // that borrow (dropping the writer) before the buffer is read back.
        let mut csv_writer = WriterBuilder::new()
            .with_line_terminator(terminator)
            .build(&mut bytes);
        csv_writer.write(&batch).unwrap();
    }

    String::from_utf8(bytes).unwrap()
}

#[test]
Expand Down
Loading