-
Notifications
You must be signed in to change notification settings - Fork 38
Optimizes reading binary symbol and integer values #396
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
0c866b8
cdf1dac
7f8ccae
d784c50
ceae311
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -93,7 +93,7 @@ impl<A: AsRef<[u8]>> BinaryBuffer<A> { | |
/// If the buffer is not empty, returns `Some(_)` containing the next byte in the buffer. | ||
/// Otherwise, returns `None`. | ||
pub fn peek_next_byte(&self) -> Option<u8> { | ||
self.bytes().get(0).copied() | ||
self.data.as_ref().get(self.start).copied() | ||
} | ||
|
||
/// If there are at least `n` bytes left in the buffer, returns `Some(_)` containing a slice | ||
|
@@ -262,7 +262,7 @@ impl<A: AsRef<[u8]>> BinaryBuffer<A> { | |
/// | ||
/// See: https://amzn.github.io/ion-docs/docs/binary.html#uint-and-int-fields | ||
pub fn read_uint(&mut self, length: usize) -> IonResult<DecodedUInt> { | ||
if length <= mem::size_of::<usize>() { | ||
if length <= mem::size_of::<u64>() { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🗺️ Here and below: the code was originally checking for In some cases (like reading a |
||
return self.read_small_uint(length); | ||
} | ||
|
||
|
@@ -271,18 +271,13 @@ impl<A: AsRef<[u8]>> BinaryBuffer<A> { | |
} | ||
|
||
/// Reads the first `length` bytes from the buffer as a `UInt`. The caller must confirm that | ||
/// `length` is small enough to fit in a `usize`. | ||
/// `length` is small enough to fit in a `u64`. | ||
#[inline] | ||
fn read_small_uint(&mut self, length: usize) -> IonResult<DecodedUInt> { | ||
let uint_bytes = self | ||
.peek_n_bytes(length) | ||
.ok_or_else(|| incomplete_data_error_raw("a UInt", self.total_consumed()))?; | ||
let mut magnitude: u64 = 0; | ||
for &byte in uint_bytes { | ||
let byte = u64::from(byte); | ||
magnitude <<= 8; | ||
magnitude |= byte; | ||
} | ||
let magnitude = DecodedUInt::small_uint_from_slice(uint_bytes); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🗺️ This bitshifting loop now lives in `DecodedUInt::small_uint_from_slice`. |
||
self.consume(length); | ||
Ok(DecodedUInt::new(UInteger::U64(magnitude), length)) | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,7 +9,7 @@ use crate::result::{ | |
decoding_error, decoding_error_raw, illegal_operation, illegal_operation_raw, | ||
incomplete_data_error, | ||
}; | ||
use crate::types::integer::{IntAccess, UInteger}; | ||
use crate::types::integer::IntAccess; | ||
use crate::types::SymbolId; | ||
use crate::{ | ||
Decimal, Integer, IonResult, IonType, RawStreamItem, RawSymbolToken, StreamReader, Timestamp, | ||
|
@@ -18,6 +18,7 @@ use bytes::{BigEndian, Buf, ByteOrder}; | |
use num_bigint::BigUint; | ||
use num_traits::Zero; | ||
use std::io::Read; | ||
use std::mem; | ||
use std::ops::Range; | ||
|
||
/// Type, offset, and length information about the serialized value over which the | ||
|
@@ -475,18 +476,13 @@ impl<A: AsRef<[u8]>> RawBinaryBufferReader<A> { | |
|
||
/// If the reader is currently positioned on a symbol value, parses that value into a `SymbolId`. | ||
pub fn read_symbol_id(&mut self) -> IonResult<SymbolId> { | ||
let (encoded_value, mut buffer) = self.value_and_buffer(IonType::Symbol)?; | ||
match buffer.read_uint(encoded_value.value_length())?.value() { | ||
Comment on lines -478 to -479
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🗺️ Previously, we constructed a
|
||
UInteger::U64(symbol_id) => { | ||
// This will always succeed on 64-bit platforms where u64 and usize are the same. | ||
if let Ok(sid) = usize::try_from(*symbol_id) { | ||
Ok(sid) | ||
} else { | ||
decoding_error("found a u64 symbol ID that was too large to fit in a usize") | ||
} | ||
} | ||
UInteger::BigUInt(symbol_id) => Self::try_symbol_id_from_big_uint(symbol_id), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🗺️ This code path (
Text view of its contents:
I have removed that code path altogether and added the test to the skip list. I don't believe this represents a substantive loss of functionality. |
||
let (_encoded_value, bytes) = self.value_and_bytes(IonType::Symbol)?; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🗺️ We no longer construct a `BinaryBuffer` here. |
||
if bytes.len() > mem::size_of::<usize>() { | ||
return decoding_error("found a symbol Id that was too large to fit in a usize"); | ||
} | ||
let magnitude = DecodedUInt::small_uint_from_slice(bytes); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🗺️ We can read from |
||
// This cast is safe because we've confirmed the value was small enough to fit in a usize. | ||
Ok(magnitude as usize) | ||
} | ||
|
||
/// Tries to downgrade the provided BigUint to a SymbolId (usize). | ||
|
@@ -628,6 +624,12 @@ impl<A: AsRef<[u8]>> StreamReader for RawBinaryBufferReader<A> { | |
Box::new(self.annotations_iter()) | ||
} | ||
|
||
fn has_annotations(&self) -> bool { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🗺️ This method is incidental to this changeset, but is also an important optimization. Readers checking to see if the current value has an annotation can simply see if the encoding had an annotations sequence; this is much faster than the default implementation that constructs a boxed iterator and reports whether it is empty. |
||
self.encoded_value() | ||
.map(|v| v.annotations_sequence_length > 0) | ||
.unwrap_or(false) | ||
} | ||
|
||
fn field_name(&self) -> IonResult<Self::Symbol> { | ||
// If the reader is parked on a value... | ||
self.encoded_value() | ||
|
@@ -685,9 +687,12 @@ impl<A: AsRef<[u8]>> StreamReader for RawBinaryBufferReader<A> { | |
} | ||
|
||
fn read_integer(&mut self) -> IonResult<Integer> { | ||
let (encoded_value, mut buffer) = self.value_and_buffer(IonType::Integer)?; | ||
let uint: DecodedUInt = buffer.read_uint(encoded_value.value_length())?; | ||
let value: Integer = uint.into(); | ||
let (encoded_value, bytes) = self.value_and_bytes(IonType::Integer)?; | ||
let value: Integer = if bytes.len() <= mem::size_of::<u64>() { | ||
DecodedUInt::small_uint_from_slice(bytes).into() | ||
} else { | ||
DecodedUInt::big_uint_from_slice(bytes).into() | ||
}; | ||
Comment on lines +690 to +695
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🗺️ This optimization is similar to the
We also get a minor boost from the fact that |
||
|
||
use self::IonTypeCode::*; | ||
let value = match (encoded_value.header.ion_type_code, value) { | ||
|
@@ -1195,7 +1200,7 @@ impl<'a, A: AsRef<[u8]>> TxReader<'a, A> { | |
// Read the length of the annotations sequence | ||
let annotations_length = self.tx_buffer.read_var_uint()?; | ||
|
||
// Validate that neither the annotations sequence is not empty. | ||
// Validate that the annotations sequence is not empty. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🗺️ Fixing a typo. |
||
if annotations_length.value() == 0 { | ||
return decoding_error("found an annotations wrapper with no annotations"); | ||
} | ||
|
@@ -1204,7 +1209,7 @@ impl<'a, A: AsRef<[u8]>> TxReader<'a, A> { | |
let expected_value_length = annotations_and_value_length | ||
- annotations_length.size_in_bytes() | ||
- annotations_length.value(); | ||
self.tx_buffer.total_consumed(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🗺️ Removing a stray accessor function that had no impact on the method logic (and was optimized out of the assembly anyway). |
||
|
||
if expected_value_length == 0 { | ||
return decoding_error("found an annotation wrapper with no value"); | ||
} | ||
|
@@ -1749,7 +1754,7 @@ mod tests { | |
fn debug() -> IonResult<()> { | ||
let data = &[ | ||
0xE0, 0x01, 0x00, 0xEA, // IVM | ||
0xc3, 0xd2, 0x84, 0x11, // {'name': true} | ||
0xc3, 0xd2, 0x84, 0x11, // ({'name': true}) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🗺️ Correcting the text Ion in the comment for this unit test. |
||
]; // Empty string | ||
let mut reader = RawBinaryBufferReader::new(data); | ||
let item = reader.next()?; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -28,6 +28,24 @@ impl DecodedUInt { | |
} | ||
} | ||
|
||
/// Interprets all of the bytes in the provided slice as big-endian unsigned integer bytes. | ||
/// The caller must confirm that `uint_bytes` is no longer than 8 bytes long; otherwise, | ||
/// overflow may quietly occur. | ||
pub(crate) fn small_uint_from_slice(uint_bytes: &[u8]) -> u64 { | ||
let mut magnitude: u64 = 0; | ||
for &byte in uint_bytes { | ||
let byte = u64::from(byte); | ||
magnitude <<= 8; | ||
magnitude |= byte; | ||
} | ||
magnitude | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🗺️ Our new helper methods that operate directly on the provided byte slice. |
||
|
||
/// Interprets all of the bytes in the provided slice as big-endian unsigned integer bytes. | ||
pub(crate) fn big_uint_from_slice(uint_bytes: &[u8]) -> BigUint { | ||
BigUint::from_bytes_be(uint_bytes) | ||
} | ||
|
||
/// Reads a UInt with `length` bytes from the provided data source. | ||
pub fn read<R: IonDataSource>(data_source: &mut R, length: usize) -> IonResult<DecodedUInt> { | ||
if length > MAX_UINT_SIZE_IN_BYTES { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🗺️ The original code created a byte slice (`&[u8]`) even though we only needed the first byte in the data. Now it just gets the first byte without creating an intermediate slice.