@@ -231,6 +231,7 @@ pub struct Format {
     quote: Option<u8>,
     terminator: Option<u8>,
     null_regex: NullRegex,
+    truncated_rows: bool,
 }

 impl Format {
@@ -265,6 +266,17 @@ impl Format {
         self
     }

+    /// Whether to allow truncated rows when parsing.
+    ///
+    /// By default this is set to `false` and parsing will error if the CSV rows
+    /// have different lengths. When set to `true`, records with fewer than the
+    /// expected number of columns are accepted, and the missing columns are filled
+    /// with nulls. If the missing fields are not nullable, an error is still returned.
+    pub fn with_truncated_rows(mut self, allow: bool) -> Self {
+        self.truncated_rows = allow;
+        self
+    }
+
     /// Infer schema of CSV records from the provided `reader`
     ///
     /// If `max_records` is `None`, all records will be read, otherwise up to `max_records`
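
A minimal usage sketch of the new `Format` option (not part of the diff): it assumes `Format` is reachable at `arrow_csv::reader::Format` and that `infer_schema` has the signature used in this file; the data is illustrative.

use std::io::Cursor;
use arrow_csv::reader::Format;

fn main() {
    // The second row is truncated; with truncated rows allowed,
    // schema inference should still succeed and see three columns.
    let data = "a,b,c\n1,2,3\n4,5\n6,7,8";
    let format = Format::default()
        .with_header(true)
        .with_truncated_rows(true);
    let (schema, _) = format
        .infer_schema(Cursor::new(data.as_bytes()), None)
        .unwrap();
    assert_eq!(schema.fields().len(), 3);
}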
@@ -329,6 +341,7 @@ impl Format {
     fn build_reader<R: Read>(&self, reader: R) -> csv::Reader<R> {
         let mut builder = csv::ReaderBuilder::new();
         builder.has_headers(self.header);
+        builder.flexible(self.truncated_rows);

         if let Some(c) = self.delimiter {
             builder.delimiter(c);
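
Aside: `flexible` is the upstream `csv` crate's own switch for variable-length records. This standalone sketch, which only uses the public `csv` crate API, shows the behavior being enabled here.

use csv::ReaderBuilder;

fn main() {
    let data = "a,b,c\n1,2,3\n4,5\n";
    // Without .flexible(true), the two-field record would produce an
    // UnequalLengths error when iterating.
    let mut reader = ReaderBuilder::new()
        .has_headers(true)
        .flexible(true)
        .from_reader(data.as_bytes());
    for record in reader.records() {
        println!("{} fields", record.unwrap().len()); // 3, then 2
    }
}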
@@ -1121,6 +1134,17 @@ impl ReaderBuilder {
         self
     }

+    /// Whether to allow truncated rows when parsing.
+    ///
+    /// By default this is set to `false` and parsing will error if the CSV rows
+    /// have different lengths. When set to `true`, records with fewer than the
+    /// expected number of columns are accepted, and the missing columns are filled
+    /// with nulls. If the missing fields are not nullable, an error is still returned.
+    pub fn with_truncated_rows(mut self, allow: bool) -> Self {
+        self.format.truncated_rows = allow;
+        self
+    }
+
     /// Create a new `Reader` from a non-buffered reader
     ///
     /// If `R: BufRead` consider using [`Self::build_buffered`] to avoid unnecessary additional
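
An end-to-end sketch of the builder option, mirroring the tests added below; the data is illustrative.

use std::io::Cursor;
use std::sync::Arc;
use arrow_array::Array;
use arrow_csv::ReaderBuilder;
use arrow_schema::{DataType, Field, Schema};

fn main() {
    let data = "a,b,c\n1,2,3\n4,5\n";
    let schema = Arc::new(Schema::new(vec![
        Field::new("a", DataType::Int32, true),
        Field::new("b", DataType::Int32, true),
        Field::new("c", DataType::Int32, true),
    ]));
    let mut reader = ReaderBuilder::new(schema)
        .with_header(true)
        .with_truncated_rows(true)
        .build(Cursor::new(data))
        .unwrap();
    let batch = reader.next().unwrap().unwrap();
    // The second data row was truncated, so column "c" is null there.
    assert_eq!(batch.num_rows(), 2);
    assert!(batch.column(2).is_null(1));
}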
@@ -1140,7 +1164,11 @@ impl ReaderBuilder {
     /// Builds a decoder that can be used to decode CSV from an arbitrary byte stream
     pub fn build_decoder(self) -> Decoder {
         let delimiter = self.format.build_parser();
-        let record_decoder = RecordDecoder::new(delimiter, self.schema.fields().len());
+        let record_decoder = RecordDecoder::new(
+            delimiter,
+            self.schema.fields().len(),
+            self.format.truncated_rows,
+        );

         let header = self.format.header as usize;

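For context, the `Decoder` built here is driven by a feed-and-flush loop. A hedged sketch, assuming `Decoder::decode` returns the number of bytes consumed and `Decoder::flush` yields any completed `RecordBatch`, as in this crate's streaming readers; the chunk boundaries are illustrative.

use std::sync::Arc;
use arrow_csv::ReaderBuilder;
use arrow_schema::{DataType, Field, Schema};

fn main() {
    let schema = Arc::new(Schema::new(vec![
        Field::new("a", DataType::Int32, true),
        Field::new("b", DataType::Int32, true),
    ]));
    let mut decoder = ReaderBuilder::new(schema)
        .with_truncated_rows(true)
        .build_decoder();

    // Feed the byte stream in arbitrary chunks; decode reports how much it consumed.
    for chunk in ["1,2\n3", ",4\n5\n"] {
        let mut buf = chunk.as_bytes();
        while !buf.is_empty() {
            let consumed = decoder.decode(buf).unwrap();
            buf = &buf[consumed..];
        }
    }
    // Flush buffered rows into a final RecordBatch: (1,2), (3,4), (5, null).
    if let Some(batch) = decoder.flush().unwrap() {
        assert_eq!(batch.num_rows(), 3);
    }
}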
@@ -2164,6 +2192,133 @@ mod tests {
         assert!(c.is_null(3));
     }

+    #[test]
+    fn test_truncated_rows() {
+        let data = "a,b,c\n1,2,3\n4,5\n\n6,7,8";
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int32, true),
+            Field::new("b", DataType::Int32, true),
+            Field::new("c", DataType::Int32, true),
+        ]));
+
+        let reader = ReaderBuilder::new(schema.clone())
+            .with_header(true)
+            .with_truncated_rows(true)
+            .build(Cursor::new(data))
+            .unwrap();
+
+        let batches = reader.collect::<Result<Vec<_>, _>>();
+        assert!(batches.is_ok());
+        let batch = batches.unwrap().into_iter().next().unwrap();
+        // Empty rows are skipped by the underlying csv parser
+        assert_eq!(batch.num_rows(), 3);
+
+        let reader = ReaderBuilder::new(schema.clone())
+            .with_header(true)
+            .with_truncated_rows(false)
+            .build(Cursor::new(data))
+            .unwrap();
+
+        let batches = reader.collect::<Result<Vec<_>, _>>();
+        assert!(match batches {
+            Err(ArrowError::CsvError(e)) => e.to_string().contains("incorrect number of fields"),
+            _ => false,
+        });
+    }
+
+    #[test]
+    fn test_truncated_rows_csv() {
+        let file = File::open("test/data/truncated_rows.csv").unwrap();
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("Name", DataType::Utf8, true),
+            Field::new("Age", DataType::UInt32, true),
+            Field::new("Occupation", DataType::Utf8, true),
+            Field::new("DOB", DataType::Date32, true),
+        ]));
+        let reader = ReaderBuilder::new(schema.clone())
+            .with_header(true)
+            .with_batch_size(24)
+            .with_truncated_rows(true);
+        let csv = reader.build(file).unwrap();
+        let batches = csv.collect::<Result<Vec<_>, _>>().unwrap();
+
+        assert_eq!(batches.len(), 1);
+        let batch = &batches[0];
+        assert_eq!(batch.num_rows(), 6);
+        assert_eq!(batch.num_columns(), 4);
+        let name = batch
+            .column(0)
+            .as_any()
+            .downcast_ref::<StringArray>()
+            .unwrap();
+        let age = batch
+            .column(1)
+            .as_any()
+            .downcast_ref::<UInt32Array>()
+            .unwrap();
+        let occupation = batch
+            .column(2)
+            .as_any()
+            .downcast_ref::<StringArray>()
+            .unwrap();
+        let dob = batch
+            .column(3)
+            .as_any()
+            .downcast_ref::<Date32Array>()
+            .unwrap();
+
+        assert_eq!(name.value(0), "A1");
+        assert_eq!(name.value(1), "B2");
+        assert!(name.is_null(2));
+        assert_eq!(name.value(3), "C3");
+        assert_eq!(name.value(4), "D4");
+        assert_eq!(name.value(5), "E5");
+
+        assert_eq!(age.value(0), 34);
+        assert_eq!(age.value(1), 29);
+        assert!(age.is_null(2));
+        assert_eq!(age.value(3), 45);
+        assert!(age.is_null(4));
+        assert_eq!(age.value(5), 31);
+
+        assert_eq!(occupation.value(0), "Engineer");
+        assert_eq!(occupation.value(1), "Doctor");
+        assert!(occupation.is_null(2));
+        assert_eq!(occupation.value(3), "Artist");
+        assert!(occupation.is_null(4));
+        assert!(occupation.is_null(5));
+
+        assert_eq!(dob.value(0), 5675);
+        assert!(dob.is_null(1));
+        assert!(dob.is_null(2));
+        assert_eq!(dob.value(3), -1858);
+        assert!(dob.is_null(4));
+        assert!(dob.is_null(5));
+    }
+
+    #[test]
+    fn test_truncated_rows_not_nullable_error() {
+        let data = "a,b,c\n1,2,3\n4,5";
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int32, false),
+            Field::new("b", DataType::Int32, false),
+            Field::new("c", DataType::Int32, false),
+        ]));
+
+        let reader = ReaderBuilder::new(schema.clone())
+            .with_header(true)
+            .with_truncated_rows(true)
+            .build(Cursor::new(data))
+            .unwrap();
+
+        let batches = reader.collect::<Result<Vec<_>, _>>();
+        assert!(match batches {
+            Err(ArrowError::InvalidArgumentError(e)) =>
+                e.to_string().contains("contains null values"),
+            _ => false,
+        });
+    }
+
     #[test]
     fn test_buffered() {
         let tests = [