88
88
89
89
use crate :: codec:: { AvroField , AvroFieldBuilder } ;
90
90
use crate :: schema:: {
91
- compare_schemas, Fingerprint , FingerprintAlgorithm , Schema as AvroSchema , SchemaStore ,
92
- SINGLE_OBJECT_MAGIC ,
91
+ compare_schemas, generate_fingerprint , Fingerprint , FingerprintAlgorithm , Schema as AvroSchema ,
92
+ SchemaStore , SINGLE_OBJECT_MAGIC ,
93
93
} ;
94
94
use arrow_array:: { RecordBatch , RecordBatchReader } ;
95
95
use arrow_schema:: { ArrowError , SchemaRef } ;
@@ -140,13 +140,12 @@ pub struct Decoder {
140
140
active_decoder : RecordDecoder ,
141
141
active_fingerprint : Option < Fingerprint > ,
142
142
batch_size : usize ,
143
- decoded_rows : usize ,
143
+ remaining_capacity : usize ,
144
144
cache : IndexMap < Fingerprint , RecordDecoder > ,
145
145
max_cache_size : usize ,
146
146
reader_schema : Option < AvroSchema < ' static > > ,
147
- schema_store : Option < SchemaStore < ' static > > ,
147
+ writer_schema_store : Option < SchemaStore < ' static > > ,
148
148
utf8_view : bool ,
149
- static_store_mode : bool ,
150
149
strict_mode : bool ,
151
150
pending_schema : Option < ( Fingerprint , RecordDecoder ) > ,
152
151
}
@@ -168,34 +167,43 @@ impl Decoder {
168
167
///
169
168
/// Returns the number of bytes consumed.
170
169
pub fn decode ( & mut self , data : & [ u8 ] ) -> Result < usize , ArrowError > {
170
+ if self . active_fingerprint . is_none ( ) && self . writer_schema_store . is_some ( ) {
171
+ if !data. starts_with ( & SINGLE_OBJECT_MAGIC ) {
172
+ return Err ( ArrowError :: ParseError (
173
+ "Expected single‑object encoding fingerprint prefix for first message \
174
+ (writer_schema_store is set but active_fingerprint is None)"
175
+ . into ( ) ,
176
+ ) ) ;
177
+ }
178
+ }
171
179
let mut total_consumed = 0usize ;
172
- let hash_type = self . schema_store . as_ref ( ) . map_or (
180
+ let hash_type = self . writer_schema_store . as_ref ( ) . map_or (
173
181
FingerprintAlgorithm :: Rabin ,
174
182
SchemaStore :: fingerprint_algorithm,
175
183
) ;
176
- while total_consumed < data. len ( ) && self . decoded_rows < self . batch_size {
184
+ while total_consumed < data. len ( ) && self . remaining_capacity > 0 {
177
185
if let Some ( prefix_bytes) = self . handle_prefix ( & data[ total_consumed..] , hash_type) ? {
178
- // Schema change detected (> 0) or there were insufficient bytes to read the next prefix (= 0).
179
- // If the former, the batch must end because the next record has a different schema .
180
- // If the latter, batch ends because the caller needs to fetch more bytes .
186
+ // A batch is complete when its `remaining_capacity` is 0. It may be completed early if
187
+ // a schema change is detected or there are insufficient bytes to read the next prefix .
188
+ // A schema change requires a new batch .
181
189
total_consumed += prefix_bytes;
182
190
break ;
183
191
}
184
192
let n = self . active_decoder . decode ( & data[ total_consumed..] , 1 ) ?;
185
193
total_consumed += n;
186
- self . decoded_rows + = 1 ;
194
+ self . remaining_capacity - = 1 ;
187
195
}
188
196
Ok ( total_consumed)
189
197
}
190
198
191
199
/// Produce a `RecordBatch` if at least one row is fully decoded, returning
192
200
/// `Ok(None)` if no new rows are available.
193
201
pub fn flush ( & mut self ) -> Result < Option < RecordBatch > , ArrowError > {
194
- if self . decoded_rows == 0 {
202
+ if self . remaining_capacity == self . batch_size {
195
203
return Ok ( None ) ;
196
204
}
197
205
let batch = self . active_decoder . flush ( ) ?;
198
- self . decoded_rows = 0 ;
206
+ self . remaining_capacity = self . batch_size ;
199
207
// Apply a pending schema switch if one is staged
200
208
if let Some ( ( new_fingerprint, new_decoder) ) = self . pending_schema . take ( ) {
201
209
// Cache the old decoder before replacing it
@@ -210,7 +218,6 @@ impl Decoder {
210
218
self . active_decoder = new_decoder;
211
219
}
212
220
}
213
- self . evict_cache ( ) ;
214
221
Ok ( Some ( batch) )
215
222
}
216
223
@@ -220,10 +227,7 @@ impl Decoder {
220
227
buf : & [ u8 ] ,
221
228
hash_type : FingerprintAlgorithm ,
222
229
) -> Result < Option < usize > , ArrowError > {
223
- if self . schema_store . is_none ( )
224
- || self . static_store_mode
225
- || !buf. starts_with ( & SINGLE_OBJECT_MAGIC )
226
- {
230
+ if self . writer_schema_store . is_none ( ) || !buf. starts_with ( & SINGLE_OBJECT_MAGIC ) {
227
231
return Ok ( None ) ;
228
232
}
229
233
let full_len = prefix_len ( hash_type) ;
@@ -247,8 +251,8 @@ impl Decoder {
247
251
self . prepare_schema_switch ( new_fp) ?;
248
252
// If there are already decoded rows, we must flush them first.
249
253
// Forcing the batch to be full ensures `flush` is called next.
250
- if self . decoded_rows > 0 {
251
- self . decoded_rows = self . batch_size ;
254
+ if self . remaining_capacity < self . batch_size {
255
+ self . remaining_capacity = 0 ;
252
256
}
253
257
}
254
258
Ok ( Some ( full_len) )
@@ -260,7 +264,7 @@ impl Decoder {
260
264
} else {
261
265
// No cached decoder, create a new one
262
266
let store = self
263
- . schema_store
267
+ . writer_schema_store
264
268
. as_ref ( )
265
269
. ok_or_else ( || ArrowError :: ParseError ( "Schema store unavailable" . into ( ) ) ) ?;
266
270
let writer_schema = store. lookup ( & new_fingerprint) . ok_or_else ( || {
@@ -282,23 +286,14 @@ impl Decoder {
282
286
Ok ( ( ) )
283
287
}
284
288
285
- #[ inline]
286
- fn evict_cache ( & mut self ) {
287
- while self . cache . len ( ) > self . max_cache_size {
288
- if let Some ( lru_key) = self . cache . keys ( ) . next ( ) . cloned ( ) {
289
- self . cache . shift_remove ( & lru_key) ;
290
- }
291
- }
292
- }
293
-
294
289
/// Returns the number of rows that can be added to this decoder before it is full.
295
290
pub fn capacity ( & self ) -> usize {
296
- self . batch_size . saturating_sub ( self . decoded_rows )
291
+ self . remaining_capacity
297
292
}
298
293
299
294
/// Returns true if the decoder has reached its capacity for the current batch.
300
295
pub fn batch_is_full ( & self ) -> bool {
301
- self . capacity ( ) == 0
296
+ self . remaining_capacity == 0
302
297
}
303
298
}
304
299
@@ -312,7 +307,6 @@ pub struct ReaderBuilder {
312
307
reader_schema : Option < AvroSchema < ' static > > ,
313
308
writer_schema_store : Option < SchemaStore < ' static > > ,
314
309
active_fingerprint : Option < Fingerprint > ,
315
- static_store_mode : bool ,
316
310
decoder_cache_size : usize ,
317
311
}
318
312
@@ -325,7 +319,6 @@ impl Default for ReaderBuilder {
325
319
reader_schema : None ,
326
320
writer_schema_store : None ,
327
321
active_fingerprint : None ,
328
- static_store_mode : false ,
329
322
decoder_cache_size : 20 ,
330
323
}
331
324
}
@@ -367,20 +360,18 @@ impl ReaderBuilder {
367
360
active_decoder : RecordDecoder ,
368
361
active_fingerprint : Option < Fingerprint > ,
369
362
reader_schema : Option < AvroSchema < ' static > > ,
370
- schema_store : Option < SchemaStore < ' static > > ,
371
- static_store_mode : bool ,
363
+ writer_schema_store : Option < SchemaStore < ' static > > ,
372
364
) -> Decoder {
373
365
Decoder {
374
366
batch_size : self . batch_size ,
375
- decoded_rows : 0 ,
367
+ remaining_capacity : self . batch_size ,
376
368
active_fingerprint,
377
369
active_decoder,
378
370
cache : IndexMap :: new ( ) ,
379
371
max_cache_size : self . decoder_cache_size ,
380
372
reader_schema,
381
373
utf8_view : self . utf8_view ,
382
- schema_store,
383
- static_store_mode,
374
+ writer_schema_store,
384
375
strict_mode : self . strict_mode ,
385
376
pending_schema : None ,
386
377
}
@@ -397,39 +388,36 @@ impl ReaderBuilder {
397
388
} ) ?;
398
389
let record_decoder =
399
390
self . make_record_decoder ( & writer_schema, self . reader_schema . as_ref ( ) ) ?;
400
- Ok ( self . make_decoder_with_parts ( record_decoder, None , None , None , true ) )
391
+ Ok ( self . make_decoder_with_parts ( record_decoder, None , None , None ) )
401
392
}
402
393
None => {
403
394
let reader_schema = self . reader_schema . clone ( ) . ok_or_else ( || {
404
395
ArrowError :: ParseError ( "Reader schema required for raw Avro" . into ( ) )
405
396
} ) ?;
406
397
let ( init_fingerprint, initial_decoder) =
407
- match ( & self . writer_schema_store , self . active_fingerprint ) {
398
+ if let ( Some ( schema_store) , Some ( fingerprint) ) =
399
+ ( & self . writer_schema_store , self . active_fingerprint )
400
+ {
408
401
// An initial fingerprint is provided, use it to look up the first schema.
409
- ( Some ( schema_store) , Some ( fingerprint) ) => {
410
- let writer_schema =
411
- schema_store. lookup ( & fingerprint) . ok_or_else ( || {
412
- ArrowError :: ParseError (
413
- "Active fingerprint not found in schema store" . into ( ) ,
414
- )
415
- } ) ?;
416
- let decoder =
417
- self . make_record_decoder ( writer_schema, Some ( & reader_schema) ) ?;
418
- ( Some ( fingerprint) , decoder)
419
- }
402
+ let writer_schema = schema_store. lookup ( & fingerprint) . ok_or_else ( || {
403
+ ArrowError :: ParseError (
404
+ "Active fingerprint not found in schema store" . into ( ) ,
405
+ )
406
+ } ) ?;
407
+ let decoder =
408
+ self . make_record_decoder ( writer_schema, Some ( & reader_schema) ) ?;
409
+ ( Some ( fingerprint) , decoder)
410
+ } else {
420
411
// No initial fingerprint; the first record must contain one.
421
- // A temporary decoder is created from the reader schema.
422
- _ => {
423
- let decoder = self . make_record_decoder ( & reader_schema, None ) ?;
424
- ( None , decoder)
425
- }
412
+ // A decoder is created from the reader schema only.
413
+ let decoder = self . make_record_decoder ( & reader_schema, None ) ?;
414
+ ( None , decoder)
426
415
} ;
427
416
Ok ( self . make_decoder_with_parts (
428
417
initial_decoder,
429
418
init_fingerprint,
430
419
Some ( reader_schema) ,
431
420
self . writer_schema_store . clone ( ) ,
432
- self . static_store_mode ,
433
421
) )
434
422
}
435
423
}
@@ -493,18 +481,6 @@ impl ReaderBuilder {
493
481
self
494
482
}
495
483
496
- /// If `true`, all schemas must be pre-registered in the `SchemaStore`.
497
- ///
498
- /// When this mode is enabled, decoding will fail if a schema fingerprint is
499
- /// encountered that does not already exist in the store. This prevents the
500
- /// dynamic resolution of schemas and ensures that only known schemas are used.
501
- ///
502
- /// Defaults to `false`.
503
- pub fn with_static_store_mode ( mut self , enabled : bool ) -> Self {
504
- self . static_store_mode = enabled;
505
- self
506
- }
507
-
508
484
/// Set the maximum number of decoders to cache.
509
485
///
510
486
/// When dealing with Avro files that contain multiple schemas, we may need to switch
@@ -521,20 +497,13 @@ impl ReaderBuilder {
521
497
self . writer_schema_store . as_ref ( ) ,
522
498
self . reader_schema . as_ref ( ) ,
523
499
self . active_fingerprint . as_ref ( ) ,
524
- self . static_store_mode ,
525
500
) {
526
- ( Some ( _) , None , _, _ ) => Err ( ArrowError :: ParseError (
501
+ ( Some ( _) , None , _) => Err ( ArrowError :: ParseError (
527
502
"Reader schema must be set when writer schema store is provided" . into ( ) ,
528
503
) ) ,
529
- ( None , _, Some ( _) , _ ) => Err ( ArrowError :: ParseError (
504
+ ( None , _, Some ( _) ) => Err ( ArrowError :: ParseError (
530
505
"Active fingerprint requires a writer schema store" . into ( ) ,
531
506
) ) ,
532
- ( None , _, _, true ) => Err ( ArrowError :: ParseError (
533
- "static_store_mode=true requires a writer schema store" . into ( ) ,
534
- ) ) ,
535
- ( Some ( _) , _, None , true ) => Err ( ArrowError :: ParseError (
536
- "static_store_mode=true requires an active fingerprint" . into ( ) ,
537
- ) ) ,
538
507
_ => Ok ( ( ) ) ,
539
508
}
540
509
}
@@ -780,32 +749,6 @@ mod test {
780
749
assert_eq ! ( store. fingerprint_algorithm( ) , FingerprintAlgorithm :: Rabin ) ;
781
750
}
782
751
783
- #[ test]
784
- fn test_static_store_mode_ignores_subsequent_prefix ( ) {
785
- let ( store, fp_int, fp_long, schema_int, _schema_long) = make_two_schema_store ( ) ;
786
- let mut decoder = ReaderBuilder :: new ( )
787
- . with_batch_size ( 8 )
788
- . with_reader_schema ( schema_int. clone ( ) )
789
- . with_writer_schema_store ( store)
790
- . with_active_fingerprint ( fp_int)
791
- . with_static_store_mode ( true )
792
- . build_decoder ( )
793
- . expect ( "build decoder" ) ;
794
- let prefix = make_prefix ( fp_long) ;
795
- match decoder. decode ( & prefix) {
796
- Err ( ArrowError :: ParseError ( _) ) => {
797
- assert ! (
798
- decoder. pending_schema. is_none( ) ,
799
- "no schema switch should be staged"
800
- ) ;
801
- }
802
- Ok ( n) => {
803
- panic ! ( "decode unexpectedly succeeded (consumed {n} bytes) in static_store_mode" )
804
- }
805
- Err ( e) => panic ! ( "unexpected error kind: {e}" ) ,
806
- }
807
- }
808
-
809
752
#[ test]
810
753
fn test_unknown_fingerprint_is_error ( ) {
811
754
let ( mut store, fp_int, _fp_long, schema_int, _schema_long) = make_two_schema_store ( ) ;
@@ -833,6 +776,24 @@ mod test {
833
776
) ;
834
777
}
835
778
779
/// A decoder built with a writer schema store but no active fingerprint must reject
/// input that does not start with the single-object prefix.
#[test]
fn test_missing_initial_fingerprint_error() {
    let (store, _fp_int, _fp_long, schema_int, _schema_long) = make_two_schema_store();
    // Deliberately omit `with_active_fingerprint`.
    let builder = ReaderBuilder::new()
        .with_batch_size(8)
        .with_reader_schema(schema_int.clone())
        .with_writer_schema_store(store);
    let mut decoder = builder.build_decoder().expect("build decoder");

    // Two raw bytes with no fingerprint prefix in front of them.
    let raw_body = [0x02u8, 0x00u8];
    let msg = decoder
        .decode(&raw_body)
        .expect_err("decode should error")
        .to_string();
    assert!(
        msg.contains("Expected single‑object encoding fingerprint"),
        "unexpected error message: {msg}"
    );
}
796
+
836
797
#[ test]
837
798
fn test_utf8view_support ( ) {
838
799
let schema_json = r#"{
0 commit comments