Commit 4045fb5
csv: Add support for flexible column lengths (#5679)
* Add support for truncated rows. Similar to what is supported in the csv crate, as well as in pandas, arrow-cpp, and polars. A subset of CSV files treat missing columns at the end of a row as null (if the schema allows it). This commit adds optional support for treating such missing columns as null; the default behavior is still to treat an incorrect number of columns as an error.

* Add truncated rows to `RecordDecoder::new`. Instead of using a setter, truncated rows is passed into the `new` method, since `RecordDecoder` is not part of the public API.
1 parent f67a5ce commit 4045fb5
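For context, the new option is enabled through `ReaderBuilder`. A minimal sketch assembled from the tests in this diff; the input string and field names are illustrative:

```rust
use std::{io::Cursor, sync::Arc};

use arrow_csv::ReaderBuilder;
use arrow_schema::{DataType, Field, Schema};

fn main() {
    // Rows "4,5" and "6" are shorter than the 3-column schema; with
    // truncated rows enabled the missing columns decode as null.
    let data = "a,b,c\n1,2,3\n4,5\n6\n";
    let schema = Arc::new(Schema::new(vec![
        Field::new("a", DataType::Int32, true),
        Field::new("b", DataType::Int32, true),
        Field::new("c", DataType::Int32, true),
    ]));

    let reader = ReaderBuilder::new(schema)
        .with_header(true)
        .with_truncated_rows(true)
        .build(Cursor::new(data))
        .unwrap();

    for batch in reader {
        println!("{:?}", batch.unwrap());
    }
}
```

With `with_truncated_rows(false)` (the default), the same input fails with a `CsvError`, as the tests below assert.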

File tree

3 files changed: +198, -10

3 files changed

+198
-10
lines changed

arrow-csv/src/reader/mod.rs

Lines changed: 156 additions & 1 deletion
@@ -231,6 +231,7 @@ pub struct Format {
     quote: Option<u8>,
     terminator: Option<u8>,
     null_regex: NullRegex,
+    truncated_rows: bool,
 }
 
 impl Format {
@@ -265,6 +266,17 @@ impl Format {
         self
     }
 
+    /// Whether to allow truncated rows when parsing.
+    ///
+    /// By default this is set to `false` and will error if the CSV rows have different lengths.
+    /// When set to true then it will allow records with less than the expected number of columns
+    /// and fill the missing columns with nulls. If the record's schema is not nullable, then it
+    /// will still return an error.
+    pub fn with_truncated_rows(mut self, allow: bool) -> Self {
+        self.truncated_rows = allow;
+        self
+    }
+
     /// Infer schema of CSV records from the provided `reader`
     ///
     /// If `max_records` is `None`, all records will be read, otherwise up to `max_records`
@@ -329,6 +341,7 @@ impl Format {
     fn build_reader<R: Read>(&self, reader: R) -> csv::Reader<R> {
         let mut builder = csv::ReaderBuilder::new();
         builder.has_headers(self.header);
+        builder.flexible(self.truncated_rows);
 
         if let Some(c) = self.delimiter {
             builder.delimiter(c);
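Since `Format::with_truncated_rows` also switches the underlying `csv::ReaderBuilder` into flexible mode (the `builder.flexible(...)` call above), schema inference can tolerate ragged rows as well. A hedged sketch; `example.csv` is a hypothetical file:

```rust
use std::fs::File;

use arrow_csv::reader::Format;

fn main() {
    // Hypothetical ragged CSV file; flexible mode keeps inference from
    // erroring on rows with fewer columns than the header.
    let file = File::open("example.csv").unwrap();
    let format = Format::default()
        .with_header(true)
        .with_truncated_rows(true);
    let (schema, _records_read) = format.infer_schema(file, Some(100)).unwrap();
    println!("{schema:?}");
}
```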
@@ -1121,6 +1134,17 @@ impl ReaderBuilder {
         self
     }
 
+    /// Whether to allow truncated rows when parsing.
+    ///
+    /// By default this is set to `false` and will error if the CSV rows have different lengths.
+    /// When set to true then it will allow records with less than the expected number of columns
+    /// and fill the missing columns with nulls. If the record's schema is not nullable, then it
+    /// will still return an error.
+    pub fn with_truncated_rows(mut self, allow: bool) -> Self {
+        self.format.truncated_rows = allow;
+        self
+    }
+
     /// Create a new `Reader` from a non-buffered reader
     ///
     /// If `R: BufRead` consider using [`Self::build_buffered`] to avoid unnecessary additional
@@ -1140,7 +1164,11 @@ impl ReaderBuilder {
     /// Builds a decoder that can be used to decode CSV from an arbitrary byte stream
     pub fn build_decoder(self) -> Decoder {
         let delimiter = self.format.build_parser();
-        let record_decoder = RecordDecoder::new(delimiter, self.schema.fields().len());
+        let record_decoder = RecordDecoder::new(
+            delimiter,
+            self.schema.fields().len(),
+            self.format.truncated_rows,
+        );
 
         let header = self.format.header as usize;
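Because `RecordDecoder` is internal, the flag reaches it only through `build_decoder`. A sketch of push-based decoding with truncated rows enabled, using `Decoder::decode`/`Decoder::flush`; the chunk boundaries and the `consumed == 0` guard are illustrative assumptions about feeding partial input:

```rust
use std::sync::Arc;

use arrow_csv::ReaderBuilder;
use arrow_schema::{DataType, Field, Schema};

fn main() {
    let schema = Arc::new(Schema::new(vec![
        Field::new("a", DataType::Utf8, true),
        Field::new("b", DataType::Utf8, true),
    ]));

    // build_decoder forwards truncated_rows to the internal RecordDecoder.
    let mut decoder = ReaderBuilder::new(schema)
        .with_truncated_rows(true)
        .build_decoder();

    // Feed byte chunks as they arrive; the short row "x" is padded with null.
    for chunk in [&b"a,b\nx"[..], &b"\n"[..]] {
        let mut buf = chunk;
        while !buf.is_empty() {
            let consumed = decoder.decode(buf).unwrap();
            if consumed == 0 {
                break; // batch is full; flush before feeding more
            }
            buf = &buf[consumed..];
        }
    }
    if let Some(batch) = decoder.flush().unwrap() {
        assert_eq!(batch.num_rows(), 2);
    }
}
```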

@@ -2164,6 +2192,133 @@ mod tests {
         assert!(c.is_null(3));
     }
 
+    #[test]
+    fn test_truncated_rows() {
+        let data = "a,b,c\n1,2,3\n4,5\n\n6,7,8";
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int32, true),
+            Field::new("b", DataType::Int32, true),
+            Field::new("c", DataType::Int32, true),
+        ]));
+
+        let reader = ReaderBuilder::new(schema.clone())
+            .with_header(true)
+            .with_truncated_rows(true)
+            .build(Cursor::new(data))
+            .unwrap();
+
+        let batches = reader.collect::<Result<Vec<_>, _>>();
+        assert!(batches.is_ok());
+        let batch = batches.unwrap().into_iter().next().unwrap();
+        // Empty rows are skipped by the underlying csv parser
+        assert_eq!(batch.num_rows(), 3);
+
+        let reader = ReaderBuilder::new(schema.clone())
+            .with_header(true)
+            .with_truncated_rows(false)
+            .build(Cursor::new(data))
+            .unwrap();
+
+        let batches = reader.collect::<Result<Vec<_>, _>>();
+        assert!(match batches {
+            Err(ArrowError::CsvError(e)) => e.to_string().contains("incorrect number of fields"),
+            _ => false,
+        });
+    }
+
+    #[test]
+    fn test_truncated_rows_csv() {
+        let file = File::open("test/data/truncated_rows.csv").unwrap();
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("Name", DataType::Utf8, true),
+            Field::new("Age", DataType::UInt32, true),
+            Field::new("Occupation", DataType::Utf8, true),
+            Field::new("DOB", DataType::Date32, true),
+        ]));
+        let reader = ReaderBuilder::new(schema.clone())
+            .with_header(true)
+            .with_batch_size(24)
+            .with_truncated_rows(true);
+        let csv = reader.build(file).unwrap();
+        let batches = csv.collect::<Result<Vec<_>, _>>().unwrap();
+
+        assert_eq!(batches.len(), 1);
+        let batch = &batches[0];
+        assert_eq!(batch.num_rows(), 6);
+        assert_eq!(batch.num_columns(), 4);
+        let name = batch
+            .column(0)
+            .as_any()
+            .downcast_ref::<StringArray>()
+            .unwrap();
+        let age = batch
+            .column(1)
+            .as_any()
+            .downcast_ref::<UInt32Array>()
+            .unwrap();
+        let occupation = batch
+            .column(2)
+            .as_any()
+            .downcast_ref::<StringArray>()
+            .unwrap();
+        let dob = batch
+            .column(3)
+            .as_any()
+            .downcast_ref::<Date32Array>()
+            .unwrap();
+
+        assert_eq!(name.value(0), "A1");
+        assert_eq!(name.value(1), "B2");
+        assert!(name.is_null(2));
+        assert_eq!(name.value(3), "C3");
+        assert_eq!(name.value(4), "D4");
+        assert_eq!(name.value(5), "E5");
+
+        assert_eq!(age.value(0), 34);
+        assert_eq!(age.value(1), 29);
+        assert!(age.is_null(2));
+        assert_eq!(age.value(3), 45);
+        assert!(age.is_null(4));
+        assert_eq!(age.value(5), 31);
+
+        assert_eq!(occupation.value(0), "Engineer");
+        assert_eq!(occupation.value(1), "Doctor");
+        assert!(occupation.is_null(2));
+        assert_eq!(occupation.value(3), "Artist");
+        assert!(occupation.is_null(4));
+        assert!(occupation.is_null(5));
+
+        assert_eq!(dob.value(0), 5675);
+        assert!(dob.is_null(1));
+        assert!(dob.is_null(2));
+        assert_eq!(dob.value(3), -1858);
+        assert!(dob.is_null(4));
+        assert!(dob.is_null(5));
+    }
+
+    #[test]
+    fn test_truncated_rows_not_nullable_error() {
+        let data = "a,b,c\n1,2,3\n4,5";
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int32, false),
+            Field::new("b", DataType::Int32, false),
+            Field::new("c", DataType::Int32, false),
+        ]));
+
+        let reader = ReaderBuilder::new(schema.clone())
+            .with_header(true)
+            .with_truncated_rows(true)
+            .build(Cursor::new(data))
+            .unwrap();
+
+        let batches = reader.collect::<Result<Vec<_>, _>>();
+        assert!(match batches {
+            Err(ArrowError::InvalidArgumentError(e)) =>
+                e.to_string().contains("contains null values"),
+            _ => false,
+        });
+    }
+
     #[test]
     fn test_buffered() {
         let tests = [

arrow-csv/src/reader/records.rs

Lines changed: 34 additions & 9 deletions
@@ -56,10 +56,16 @@ pub struct RecordDecoder {
     ///
     /// We track this independently of Vec to avoid re-zeroing memory
     data_len: usize,
+
+    /// Whether rows with less than expected columns are considered valid
+    ///
+    /// Default value is false
+    /// When enabled fills in missing columns with null
+    truncated_rows: bool,
 }
 
 impl RecordDecoder {
-    pub fn new(delimiter: Reader, num_columns: usize) -> Self {
+    pub fn new(delimiter: Reader, num_columns: usize, truncated_rows: bool) -> Self {
         Self {
             delimiter,
             num_columns,
@@ -70,6 +76,7 @@ impl RecordDecoder {
             data_len: 0,
             data: vec![],
             num_rows: 0,
+            truncated_rows,
         }
     }
 
@@ -127,10 +134,19 @@ impl RecordDecoder {
                }
                ReadRecordResult::Record => {
                    if self.current_field != self.num_columns {
-                        return Err(ArrowError::CsvError(format!(
-                            "incorrect number of fields for line {}, expected {} got {}",
-                            self.line_number, self.num_columns, self.current_field
-                        )));
+                        if self.truncated_rows && self.current_field < self.num_columns {
+                            // If the number of fields is less than expected, pad with nulls
+                            let fill_count = self.num_columns - self.current_field;
+                            let fill_value = self.offsets[self.offsets_len - 1];
+                            self.offsets[self.offsets_len..self.offsets_len + fill_count]
+                                .fill(fill_value);
+                            self.offsets_len += fill_count;
+                        } else {
+                            return Err(ArrowError::CsvError(format!(
+                                "incorrect number of fields for line {}, expected {} got {}",
+                                self.line_number, self.num_columns, self.current_field
+                            )));
+                        }
                    }
                    read += 1;
                    self.current_field = 0;
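The padding works because `RecordDecoder` stores field boundaries as offsets into one flat data buffer: field `i` of a row spans `data[offsets[i]..offsets[i + 1]]`. Repeating the last offset therefore appends zero-length fields, which later read back as empty strings and are treated as null for nullable columns. A standalone sketch of the idea, using a plain `Vec` rather than the crate's internal buffers:

```rust
// Sketch only: field i of the current row spans data[offsets[i]..offsets[i + 1]].
fn pad_truncated_row(offsets: &mut Vec<usize>, num_columns: usize, fields_read: usize) {
    // Repeating the last offset appends zero-length fields; a zero-length
    // field reads back as an empty string, i.e. null for nullable columns.
    let fill_value = *offsets.last().unwrap();
    for _ in fields_read..num_columns {
        offsets.push(fill_value);
    }
}

fn main() {
    // "4,5" parsed against a 3-column schema: boundaries exist for two fields.
    let mut offsets = vec![0, 1, 2];
    pad_truncated_row(&mut offsets, 3, 2);
    assert_eq!(offsets, vec![0, 1, 2, 2]); // the third field is empty
}
```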
@@ -299,7 +315,7 @@ mod tests {
        .into_iter();
 
        let mut reader = BufReader::with_capacity(3, Cursor::new(csv.as_bytes()));
-        let mut decoder = RecordDecoder::new(Reader::new(), 3);
+        let mut decoder = RecordDecoder::new(Reader::new(), 3, false);
 
        loop {
            let to_read = 3;
@@ -333,15 +349,15 @@ mod tests {
    #[test]
    fn test_invalid_fields() {
        let csv = "a,b\nb,c\na\n";
-        let mut decoder = RecordDecoder::new(Reader::new(), 2);
+        let mut decoder = RecordDecoder::new(Reader::new(), 2, false);
        let err = decoder.decode(csv.as_bytes(), 4).unwrap_err().to_string();
 
        let expected = "Csv error: incorrect number of fields for line 3, expected 2 got 1";
 
        assert_eq!(err, expected);
 
        // Test with initial skip
-        let mut decoder = RecordDecoder::new(Reader::new(), 2);
+        let mut decoder = RecordDecoder::new(Reader::new(), 2, false);
        let (skipped, bytes) = decoder.decode(csv.as_bytes(), 1).unwrap();
        assert_eq!(skipped, 1);
        decoder.clear();
@@ -354,9 +370,18 @@ mod tests {
    #[test]
    fn test_skip_insufficient_rows() {
        let csv = "a\nv\n";
-        let mut decoder = RecordDecoder::new(Reader::new(), 1);
+        let mut decoder = RecordDecoder::new(Reader::new(), 1, false);
        let (read, bytes) = decoder.decode(csv.as_bytes(), 3).unwrap();
        assert_eq!(read, 2);
        assert_eq!(bytes, csv.len());
    }
+
+    #[test]
+    fn test_truncated_rows() {
+        let csv = "a,b\nv\n,1\n,2\n,3\n";
+        let mut decoder = RecordDecoder::new(Reader::new(), 2, true);
+        let (read, bytes) = decoder.decode(csv.as_bytes(), 5).unwrap();
+        assert_eq!(read, 5);
+        assert_eq!(bytes, csv.len());
+    }
 }
test/data/truncated_rows.csv

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+Name,Age,Occupation,DOB
+A1,34,Engineer,1985-07-16
+B2,29,Doctor
+,
+C3,45,Artist,1964-11-30
+
+D4
+E5,31,,
