Skip to content

Commit 9aca68b

Browse files
authored
Add quote style to csv writer (#9004)
# Which issue does this PR close? Following on from #8960, this PR exposes the quote style as part of the CSV writer options, which allows users to quote columns in the same way as Spark's `quoteAll` setting. <!-- We generally require a GitHub issue to be filed for all bug fixes and enhancements and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax. --> - Closes #9003. # Rationale for this change <!-- Why are you proposing this change? If this is already explained clearly in the issue then this section is not needed. Explaining clearly why changes are proposed helps reviewers understand your changes and offer better suggestions for fixes. --> # What changes are included in this PR? Expose `QuoteStyle` in the `WriterBuilder`. <!-- There is no need to duplicate the description in the issue here but it is sometimes worth providing a summary of the individual changes in this PR. --> # Are these changes tested? Yes, with doc examples and unit tests. <!-- We typically require tests for all PRs in order to: 1. Prevent the code from being accidentally broken by subsequent changes 2. Serve as another way to document the expected behavior of the code If tests are not included in your PR, please explain why (for example, are they covered by existing tests)? --> # Are there any user-facing changes? <!-- If there are user-facing changes then we may require documentation to be updated before approving the PR. If there are any breaking changes to public APIs, please call them out. -->
1 parent c133333 commit 9aca68b

File tree

2 files changed

+193
-0
lines changed

2 files changed

+193
-0
lines changed

arrow-csv/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ pub mod writer;
3030
pub use self::reader::Reader;
3131
pub use self::reader::ReaderBuilder;
3232
pub use self::reader::infer_schema_from_files;
33+
pub use self::writer::QuoteStyle;
3334
pub use self::writer::Writer;
3435
pub use self::writer::WriterBuilder;
3536
use arrow_schema::ArrowError;

arrow-csv/src/writer.rs

Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,58 @@
141141
//! "name,comment\nAlice ,Great job! \nBob,Well done\nCharlie,Excellent \n"
142142
//! );
143143
//! ```
144+
//!
145+
//! # Quoting Styles
146+
//!
147+
//! The writer can apply different quoting styles to fields; for example, [`QuoteStyle::Always`]
148+
//! behaves like Apache Spark's `quoteAll` CSV option. Use the [`QuoteStyle`] enum to control
149+
//! when fields are quoted.
150+
//!
151+
//! ## Example
152+
//!
153+
//! ```
154+
//! # use arrow_array::*;
155+
//! # use arrow_csv::{WriterBuilder, QuoteStyle};
156+
//! # use arrow_schema::*;
157+
//! # use std::sync::Arc;
158+
//!
159+
//! let schema = Schema::new(vec![
160+
//! Field::new("product", DataType::Utf8, false),
161+
//! Field::new("price", DataType::Float64, false),
162+
//! ]);
163+
//!
164+
//! let product = StringArray::from(vec!["apple", "banana,organic", "cherry"]);
165+
//! let price = Float64Array::from(vec![1.50, 2.25, 3.00]);
166+
//!
167+
//! let batch = RecordBatch::try_new(
168+
//! Arc::new(schema),
169+
//! vec![Arc::new(product), Arc::new(price)],
170+
//! )
171+
//! .unwrap();
172+
//!
173+
//! // Default behavior (QuoteStyle::Necessary)
174+
//! let mut output = Vec::new();
175+
//! WriterBuilder::new()
176+
//! .build(&mut output)
177+
//! .write(&batch)
178+
//! .unwrap();
179+
//! assert_eq!(
180+
//! String::from_utf8(output).unwrap(),
181+
//! "product,price\napple,1.5\n\"banana,organic\",2.25\ncherry,3.0\n"
182+
//! );
183+
//!
184+
//! // Quote all fields (Spark's quoteAll=true)
185+
//! let mut output = Vec::new();
186+
//! WriterBuilder::new()
187+
//! .with_quote_style(QuoteStyle::Always)
188+
//! .build(&mut output)
189+
//! .write(&batch)
190+
//! .unwrap();
191+
//! assert_eq!(
192+
//! String::from_utf8(output).unwrap(),
193+
//! "\"product\",\"price\"\n\"apple\",\"1.5\"\n\"banana,organic\",\"2.25\"\n\"cherry\",\"3.0\"\n"
194+
//! );
195+
//! ```
144196
145197
use arrow_array::*;
146198
use arrow_cast::display::*;
@@ -151,6 +203,22 @@ use std::io::Write;
151203
use crate::map_csv_error;
152204
const DEFAULT_NULL_VALUE: &str = "";
153205

206+
/// The quoting style to use when writing CSV files.
207+
///
208+
/// This type is re-exported from the `csv` crate and selects when fields are
209+
/// quoted. For example, `QuoteStyle::Always` quotes every field, which matches
210+
/// Apache Spark's `quoteAll=true` CSV option.
211+
///
212+
/// # Example
213+
///
214+
/// ```
215+
/// use arrow_csv::{WriterBuilder, QuoteStyle};
216+
///
217+
/// let builder = WriterBuilder::new()
218+
/// .with_quote_style(QuoteStyle::Always); // Equivalent to Spark's quoteAll=true
219+
/// ```
220+
pub use csv::QuoteStyle;
221+
154222
/// A CSV writer
155223
#[derive(Debug)]
156224
pub struct Writer<W: Write> {
@@ -324,6 +392,8 @@ pub struct WriterBuilder {
324392
ignore_leading_whitespace: bool,
325393
/// Whether to ignore trailing whitespace in string values. Defaults to `false`
326394
ignore_trailing_whitespace: bool,
395+
/// The quoting style to use. Defaults to `QuoteStyle::Necessary`
396+
quote_style: QuoteStyle,
327397
}
328398

329399
impl Default for WriterBuilder {
@@ -342,6 +412,7 @@ impl Default for WriterBuilder {
342412
null_value: None,
343413
ignore_leading_whitespace: false,
344414
ignore_trailing_whitespace: false,
415+
quote_style: QuoteStyle::default(),
345416
}
346417
}
347418
}
@@ -528,12 +599,38 @@ impl WriterBuilder {
528599
self.ignore_trailing_whitespace
529600
}
530601

602+
/// Set the quoting style for writing CSV files
603+
///
604+
/// # Example
605+
///
606+
/// ```
607+
/// use arrow_csv::{WriterBuilder, QuoteStyle};
608+
///
609+
/// // Quote all fields (equivalent to Spark's quoteAll=true)
610+
/// let builder = WriterBuilder::new()
611+
/// .with_quote_style(QuoteStyle::Always);
612+
///
613+
/// // Only quote when necessary (default)
614+
/// let builder = WriterBuilder::new()
615+
/// .with_quote_style(QuoteStyle::Necessary);
616+
/// ```
617+
pub fn with_quote_style(mut self, quote_style: QuoteStyle) -> Self {
618+
self.quote_style = quote_style;
619+
self
620+
}
621+
622+
/// Get the configured quoting style
623+
pub fn quote_style(&self) -> QuoteStyle {
624+
self.quote_style
625+
}
626+
531627
/// Create a new `Writer`
532628
pub fn build<W: Write>(self, writer: W) -> Writer<W> {
533629
let mut builder = csv::WriterBuilder::new();
534630
let writer = builder
535631
.delimiter(self.delimiter)
536632
.quote(self.quote)
633+
.quote_style(self.quote_style)
537634
.double_quote(self.double_quote)
538635
.escape(self.escape)
539636
.from_writer(writer);
@@ -1181,4 +1278,99 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555,23:46:03,foo
11811278
String::from_utf8(buf).unwrap()
11821279
);
11831280
}
1281+
1282+
fn write_quote_style(batch: &RecordBatch, quote_style: QuoteStyle) -> String {
1283+
let mut buf = Vec::new();
1284+
let mut writer = WriterBuilder::new()
1285+
.with_quote_style(quote_style)
1286+
.build(&mut buf);
1287+
writer.write(batch).unwrap();
1288+
drop(writer);
1289+
String::from_utf8(buf).unwrap()
1290+
}
1291+
1292+
fn write_quote_style_with_null(
1293+
batch: &RecordBatch,
1294+
quote_style: QuoteStyle,
1295+
null_value: &str,
1296+
) -> String {
1297+
let mut buf = Vec::new();
1298+
let mut writer = WriterBuilder::new()
1299+
.with_quote_style(quote_style)
1300+
.with_null(null_value.to_string())
1301+
.build(&mut buf);
1302+
writer.write(batch).unwrap();
1303+
drop(writer);
1304+
String::from_utf8(buf).unwrap()
1305+
}
1306+
1307+
#[test]
1308+
fn test_write_csv_quote_style() {
1309+
let schema = Schema::new(vec![
1310+
Field::new("text", DataType::Utf8, false),
1311+
Field::new("number", DataType::Int32, false),
1312+
Field::new("float", DataType::Float64, false),
1313+
]);
1314+
1315+
let text = StringArray::from(vec!["hello", "world", "comma,value", "quote\"test"]);
1316+
let number = Int32Array::from(vec![1, 2, 3, 4]);
1317+
let float = Float64Array::from(vec![1.1, 2.2, 3.3, 4.4]);
1318+
1319+
let batch = RecordBatch::try_new(
1320+
Arc::new(schema),
1321+
vec![Arc::new(text), Arc::new(number), Arc::new(float)],
1322+
)
1323+
.unwrap();
1324+
1325+
// Test with QuoteStyle::Necessary (default)
1326+
assert_eq!(
1327+
"text,number,float\nhello,1,1.1\nworld,2,2.2\n\"comma,value\",3,3.3\n\"quote\"\"test\",4,4.4\n",
1328+
write_quote_style(&batch, QuoteStyle::Necessary)
1329+
);
1330+
1331+
// Test with QuoteStyle::Always (equivalent to Spark's quoteAll=true)
1332+
assert_eq!(
1333+
"\"text\",\"number\",\"float\"\n\"hello\",\"1\",\"1.1\"\n\"world\",\"2\",\"2.2\"\n\"comma,value\",\"3\",\"3.3\"\n\"quote\"\"test\",\"4\",\"4.4\"\n",
1334+
write_quote_style(&batch, QuoteStyle::Always)
1335+
);
1336+
1337+
// Test with QuoteStyle::NonNumeric
1338+
assert_eq!(
1339+
"\"text\",\"number\",\"float\"\n\"hello\",1,1.1\n\"world\",2,2.2\n\"comma,value\",3,3.3\n\"quote\"\"test\",4,4.4\n",
1340+
write_quote_style(&batch, QuoteStyle::NonNumeric)
1341+
);
1342+
1343+
// Test with QuoteStyle::Never (warning: can produce invalid CSV)
1344+
// Note: This produces invalid CSV for fields with commas or quotes
1345+
assert_eq!(
1346+
"text,number,float\nhello,1,1.1\nworld,2,2.2\ncomma,value,3,3.3\nquote\"test,4,4.4\n",
1347+
write_quote_style(&batch, QuoteStyle::Never)
1348+
);
1349+
}
1350+
1351+
#[test]
1352+
fn test_write_csv_quote_style_with_nulls() {
1353+
let schema = Schema::new(vec![
1354+
Field::new("text", DataType::Utf8, true),
1355+
Field::new("number", DataType::Int32, true),
1356+
]);
1357+
1358+
let text = StringArray::from(vec![Some("hello"), None, Some("world")]);
1359+
let number = Int32Array::from(vec![Some(1), Some(2), None]);
1360+
1361+
let batch =
1362+
RecordBatch::try_new(Arc::new(schema), vec![Arc::new(text), Arc::new(number)]).unwrap();
1363+
1364+
// Test with QuoteStyle::Always
1365+
assert_eq!(
1366+
"\"text\",\"number\"\n\"hello\",\"1\"\n\"\",\"2\"\n\"world\",\"\"\n",
1367+
write_quote_style(&batch, QuoteStyle::Always)
1368+
);
1369+
1370+
// Test with QuoteStyle::Always and custom null value
1371+
assert_eq!(
1372+
"\"text\",\"number\"\n\"hello\",\"1\"\n\"NULL\",\"2\"\n\"world\",\"NULL\"\n",
1373+
write_quote_style_with_null(&batch, QuoteStyle::Always, "NULL")
1374+
);
1375+
}
11841376
}

0 commit comments

Comments
 (0)