Skip to content

Commit 938663e

Browse files
authored
Optimizes reading binary symbol and integer values (#396)
This PR: * Eliminates redundant boundary checking that happens when integer and symbol values are read. It does this by offering unchecked `UInt` decoding methods that read a single value from a provided slice. * Removes support for reading symbol IDs that require more than 8 bytes to encode. This limits the size of the symbol table to about 18 quintillion symbol values, which seems reasonable. * Adds `Into<Integer>` implementations for Rust's integer primitives. Together, these reduced the time needed to read every value in a 1.3GB Ion log file by about 6%. Fixes #395.
1 parent d7c4348 commit 938663e

File tree

6 files changed

+93
-28
lines changed

6 files changed

+93
-28
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -226,7 +226,8 @@ massif.out.*
226226
bb.out
227227
bb.out.*
228228

229-
229+
# macOS folder metadata
230+
**/.DS_Store
230231

231232
### VisualStudioCode ###
232233
.vscode/*

src/binary/non_blocking/binary_buffer.rs

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ impl<A: AsRef<[u8]>> BinaryBuffer<A> {
9393
/// If the buffer is not empty, returns `Some(_)` containing the next byte in the buffer.
9494
/// Otherwise, returns `None`.
9595
pub fn peek_next_byte(&self) -> Option<u8> {
96-
self.bytes().get(0).copied()
96+
self.data.as_ref().get(self.start).copied()
9797
}
9898

9999
/// If there are at least `n` bytes left in the buffer, returns `Some(_)` containing a slice
@@ -262,7 +262,7 @@ impl<A: AsRef<[u8]>> BinaryBuffer<A> {
262262
///
263263
/// See: https://amzn.github.io/ion-docs/docs/binary.html#uint-and-int-fields
264264
pub fn read_uint(&mut self, length: usize) -> IonResult<DecodedUInt> {
265-
if length <= mem::size_of::<usize>() {
265+
if length <= mem::size_of::<u64>() {
266266
return self.read_small_uint(length);
267267
}
268268

@@ -271,18 +271,13 @@ impl<A: AsRef<[u8]>> BinaryBuffer<A> {
271271
}
272272

273273
/// Reads the first `length` bytes from the buffer as a `UInt`. The caller must confirm that
274-
/// `length` is small enough to fit in a `usize`.
274+
/// `length` is small enough to fit in a `u64`.
275275
#[inline]
276276
fn read_small_uint(&mut self, length: usize) -> IonResult<DecodedUInt> {
277277
let uint_bytes = self
278278
.peek_n_bytes(length)
279279
.ok_or_else(|| incomplete_data_error_raw("a UInt", self.total_consumed()))?;
280-
let mut magnitude: u64 = 0;
281-
for &byte in uint_bytes {
282-
let byte = u64::from(byte);
283-
magnitude <<= 8;
284-
magnitude |= byte;
285-
}
280+
let magnitude = DecodedUInt::small_uint_from_slice(uint_bytes);
286281
self.consume(length);
287282
Ok(DecodedUInt::new(UInteger::U64(magnitude), length))
288283
}

src/binary/non_blocking/raw_binary_reader.rs

Lines changed: 23 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ use crate::result::{
99
decoding_error, decoding_error_raw, illegal_operation, illegal_operation_raw,
1010
incomplete_data_error,
1111
};
12-
use crate::types::integer::{IntAccess, UInteger};
12+
use crate::types::integer::IntAccess;
1313
use crate::types::SymbolId;
1414
use crate::{
1515
Decimal, Integer, IonResult, IonType, RawStreamItem, RawSymbolToken, StreamReader, Timestamp,
@@ -18,6 +18,7 @@ use bytes::{BigEndian, Buf, ByteOrder};
1818
use num_bigint::BigUint;
1919
use num_traits::Zero;
2020
use std::io::Read;
21+
use std::mem;
2122
use std::ops::Range;
2223

2324
/// Type, offset, and length information about the serialized value over which the
@@ -475,18 +476,13 @@ impl<A: AsRef<[u8]>> RawBinaryBufferReader<A> {
475476

476477
/// If the reader is currently positioned on a symbol value, parses that value into a `SymbolId`.
477478
pub fn read_symbol_id(&mut self) -> IonResult<SymbolId> {
478-
let (encoded_value, mut buffer) = self.value_and_buffer(IonType::Symbol)?;
479-
match buffer.read_uint(encoded_value.value_length())?.value() {
480-
UInteger::U64(symbol_id) => {
481-
// This will always succeed on 64-bit platforms where u64 and usize are the same.
482-
if let Ok(sid) = usize::try_from(*symbol_id) {
483-
Ok(sid)
484-
} else {
485-
decoding_error("found a u64 symbol ID that was too large to fit in a usize")
486-
}
487-
}
488-
UInteger::BigUInt(symbol_id) => Self::try_symbol_id_from_big_uint(symbol_id),
479+
let (_encoded_value, bytes) = self.value_and_bytes(IonType::Symbol)?;
480+
if bytes.len() > mem::size_of::<usize>() {
481+
return decoding_error("found a symbol Id that was too large to fit in a usize");
489482
}
483+
let magnitude = DecodedUInt::small_uint_from_slice(bytes);
484+
// This cast is safe because we've confirmed the value was small enough to fit in a usize.
485+
Ok(magnitude as usize)
490486
}
491487

492488
/// Tries to downgrade the provided BigUint to a SymbolId (usize).
@@ -628,6 +624,12 @@ impl<A: AsRef<[u8]>> StreamReader for RawBinaryBufferReader<A> {
628624
Box::new(self.annotations_iter())
629625
}
630626

627+
fn has_annotations(&self) -> bool {
628+
self.encoded_value()
629+
.map(|v| v.annotations_sequence_length > 0)
630+
.unwrap_or(false)
631+
}
632+
631633
fn field_name(&self) -> IonResult<Self::Symbol> {
632634
// If the reader is parked on a value...
633635
self.encoded_value()
@@ -685,9 +687,12 @@ impl<A: AsRef<[u8]>> StreamReader for RawBinaryBufferReader<A> {
685687
}
686688

687689
fn read_integer(&mut self) -> IonResult<Integer> {
688-
let (encoded_value, mut buffer) = self.value_and_buffer(IonType::Integer)?;
689-
let uint: DecodedUInt = buffer.read_uint(encoded_value.value_length())?;
690-
let value: Integer = uint.into();
690+
let (encoded_value, bytes) = self.value_and_bytes(IonType::Integer)?;
691+
let value: Integer = if bytes.len() <= mem::size_of::<u64>() {
692+
DecodedUInt::small_uint_from_slice(bytes).into()
693+
} else {
694+
DecodedUInt::big_uint_from_slice(bytes).into()
695+
};
691696

692697
use self::IonTypeCode::*;
693698
let value = match (encoded_value.header.ion_type_code, value) {
@@ -1195,7 +1200,7 @@ impl<'a, A: AsRef<[u8]>> TxReader<'a, A> {
11951200
// Read the length of the annotations sequence
11961201
let annotations_length = self.tx_buffer.read_var_uint()?;
11971202

1198-
// Validate that neither the annotations sequence is not empty.
1203+
// Validate that the annotations sequence is not empty.
11991204
if annotations_length.value() == 0 {
12001205
return decoding_error("found an annotations wrapper with no annotations");
12011206
}
@@ -1204,7 +1209,7 @@ impl<'a, A: AsRef<[u8]>> TxReader<'a, A> {
12041209
let expected_value_length = annotations_and_value_length
12051210
- annotations_length.size_in_bytes()
12061211
- annotations_length.value();
1207-
self.tx_buffer.total_consumed();
1212+
12081213
if expected_value_length == 0 {
12091214
return decoding_error("found an annotation wrapper with no value");
12101215
}
@@ -1749,7 +1754,7 @@ mod tests {
17491754
fn debug() -> IonResult<()> {
17501755
let data = &[
17511756
0xE0, 0x01, 0x00, 0xEA, // IVM
1752-
0xc3, 0xd2, 0x84, 0x11, // {'name': true}
1757+
0xc3, 0xd2, 0x84, 0x11, // ({'name': true})
17531758
]; // Empty string
17541759
let mut reader = RawBinaryBufferReader::new(data);
17551760
let item = reader.next()?;

src/binary/uint.rs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,24 @@ impl DecodedUInt {
2828
}
2929
}
3030

31+
/// Interprets all of the bytes in the provided slice as big-endian unsigned integer bytes.
32+
/// The caller must confirm that `uint_bytes` is no longer than 8 bytes long; otherwise,
33+
/// overflow may quietly occur.
34+
pub(crate) fn small_uint_from_slice(uint_bytes: &[u8]) -> u64 {
35+
let mut magnitude: u64 = 0;
36+
for &byte in uint_bytes {
37+
let byte = u64::from(byte);
38+
magnitude <<= 8;
39+
magnitude |= byte;
40+
}
41+
magnitude
42+
}
43+
44+
/// Interprets all of the bytes in the provided slice as big-endian unsigned integer bytes.
45+
pub(crate) fn big_uint_from_slice(uint_bytes: &[u8]) -> BigUint {
46+
BigUint::from_bytes_be(uint_bytes)
47+
}
48+
3149
/// Reads a UInt with `length` bytes from the provided data source.
3250
pub fn read<R: IonDataSource>(data_source: &mut R, length: usize) -> IonResult<DecodedUInt> {
3351
if length > MAX_UINT_SIZE_IN_BYTES {

src/types/integer.rs

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -316,6 +316,48 @@ impl Zero for Integer {
316316
}
317317
}
318318

319+
// Trivial conversion to Integer::I64 from integers that can safely be converted to an i64
320+
macro_rules! impl_integer_i64_from {
321+
($($t:ty),*) => ($(
322+
impl From<$t> for Integer {
323+
fn from(value: $t) -> Integer {
324+
let i64_value = i64::from(value);
325+
Integer::I64(i64_value)
326+
}
327+
}
328+
)*)
329+
}
330+
impl_integer_i64_from!(u8, u16, u32, i8, i16, i32, i64);
331+
332+
// Conversion to Integer from integer types that may or may not fit in an i64
333+
macro_rules! impl_integer_from {
334+
($($t:ty),*) => ($(
335+
impl From<$t> for Integer {
336+
fn from(value: $t) -> Integer {
337+
match i64::try_from(value) {
338+
Ok(i64_value) => Integer::I64(i64_value),
339+
Err(_) => Integer::BigInt(BigInt::from(value))
340+
}
341+
}
342+
}
343+
)*)
344+
}
345+
346+
impl_integer_from!(isize, usize, u64);
347+
348+
impl From<BigUint> for Integer {
349+
fn from(value: BigUint) -> Self {
350+
let big_int = BigInt::from(value);
351+
Integer::BigInt(big_int)
352+
}
353+
}
354+
355+
impl From<BigInt> for Integer {
356+
fn from(value: BigInt) -> Self {
357+
Integer::BigInt(value)
358+
}
359+
}
360+
319361
impl<T> IntAccess for T
320362
where
321363
T: Element,

tests/element_test_vectors.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -788,6 +788,10 @@ mod non_blocking_native_element_tests {
788788
"ion-tests/iontestdata/good/subfieldVarUInt15bit.ion",
789789
"ion-tests/iontestdata/good/subfieldVarUInt16bit.ion",
790790
"ion-tests/iontestdata/good/subfieldVarUInt32bit.ion",
791+
// This test requires the reader to be able to read symbols whose ID is encoded
792+
// with more than 8 bytes. Having a symbol table with more than 18 quintillion
793+
// symbols is not very practical.
794+
"ion-tests/iontestdata/good/typecodes/T7-large.10n",
791795
// ---
792796
// Requires importing shared symbol tables
793797
"ion-tests/iontestdata/good/item1.10n",

0 commit comments

Comments
 (0)