More robust leb128 parsing (automerge#515)

Before this change i64 decoding did not work for negative numbers (not a real problem because it is only used for the timestamp of a change), and both u64 and i64 would allow overlong LEB encodings.
datalayer-externals · Jan 31, 2023 · a6959e7 · a6959e7
1 parent de5af2f
commit a6959e7
Show file tree

Hide file tree

Showing 2 changed files with 240 additions and 56 deletions.
diff --git a/rust/automerge/src/storage/parse.rs b/rust/automerge/src/storage/parse.rs
@@ -110,7 +110,7 @@ use crate::{ActorId, ChangeHash};
 const HASH_SIZE: usize = 32; // 256 bits = 32 bytes
 
 #[allow(unused_imports)]
-pub(crate) use self::leb128::{leb128_i32, leb128_i64, leb128_u32, leb128_u64, nonzero_leb128_u64};
+pub(crate) use self::leb128::{leb128_i64, leb128_u32, leb128_u64, nonzero_leb128_u64};
 
 pub(crate) type ParseResult<'a, O, E> = Result<(Input<'a>, O), ParseError<E>>;
 

diff --git a/rust/automerge/src/storage/parse/leb128.rs b/rust/automerge/src/storage/parse/leb128.rs
@@ -1,4 +1,3 @@
-use core::mem::size_of;
 use std::num::NonZeroU64;
 
 use super::{take1, Input, ParseError, ParseResult};
@@ -7,44 +6,83 @@ use super::{take1, Input, ParseError, ParseResult};
 pub(crate) enum Error {
     #[error("leb128 was too large for the destination type")]
     Leb128TooLarge,
+    #[error("leb128 was improperly encoded")]
+    Leb128Overlong,
     #[error("leb128 was zero when it was expected to be nonzero")]
     UnexpectedZero,
 }
 
-macro_rules! impl_leb {
-    ($parser_name: ident, $ty: ty) => {
-        #[allow(dead_code)]
-        pub(crate) fn $parser_name<'a, E>(input: Input<'a>) -> ParseResult<'a, $ty, E>
-        where
-            E: From<Error>,
-        {
-            let mut res = 0;
-            let mut shift = 0;
-
-            let mut input = input;
-            let mut pos = 0;
-            loop {
-                let (i, byte) = take1(input)?;
-                input = i;
-                if (byte & 0x80) == 0 {
-                    res |= (byte as $ty) << shift;
-                    return Ok((input, res));
-                } else if pos == leb128_size::<$ty>() - 1 {
-                    return Err(ParseError::Error(Error::Leb128TooLarge.into()));
-                } else {
-                    res |= ((byte & 0x7F) as $ty) << shift;
-                }
-                pos += 1;
-                shift += 7;
+pub(crate) fn leb128_u64<E>(input: Input<'_>) -> ParseResult<'_, u64, E>
+where
+    E: From<Error>,
+{
+    let mut res = 0;
+    let mut shift = 0;
+    let mut input = input;
+
+    loop {
+        let (i, byte) = take1(input)?;
+        input = i;
+        res |= ((byte & 0x7F) as u64) << shift;
+        shift += 7;
+
+        if (byte & 0x80) == 0 {
+            if shift > 64 && byte > 1 {
+                return Err(ParseError::Error(Error::Leb128TooLarge.into()));
+            } else if shift > 7 && byte == 0 {
+                return Err(ParseError::Error(Error::Leb128Overlong.into()));
+            }
+            return Ok((input, res));
+        } else if shift > 64 {
+            return Err(ParseError::Error(Error::Leb128TooLarge.into()));
+        }
+    }
+}
+
+pub(crate) fn leb128_i64<E>(input: Input<'_>) -> ParseResult<'_, i64, E>
+where
+    E: From<Error>,
+{
+    let mut res = 0;
+    let mut shift = 0;
+
+    let mut input = input;
+    let mut prev = 0;
+    loop {
+        let (i, byte) = take1(input)?;
+        input = i;
+        res |= ((byte & 0x7F) as i64) << shift;
+        shift += 7;
+
+        if (byte & 0x80) == 0 {
+            if shift > 64 && byte != 0 && byte != 0x7f {
+                // the 10th byte (if present) must contain only the sign-extended sign bit
+                return Err(ParseError::Error(Error::Leb128TooLarge.into()));
+            } else if shift > 7
+                && ((byte == 0 && prev & 0x40 == 0) || (byte == 0x7f && prev & 0x40 > 0))
+            {
+                // overlong if the sign bit of penultimate byte has been extended
+                return Err(ParseError::Error(Error::Leb128Overlong.into()));
+            } else if shift < 64 && byte & 0x40 > 0 {
+                // sign extend negative numbers
+                res |= -1 << shift;
             }
+            return Ok((input, res));
+        } else if shift > 64 {
+            return Err(ParseError::Error(Error::Leb128TooLarge.into()));
         }
-    };
+        prev = byte;
+    }
 }
 
-impl_leb!(leb128_u64, u64);
-impl_leb!(leb128_u32, u32);
-impl_leb!(leb128_i64, i64);
-impl_leb!(leb128_i32, i32);
+pub(crate) fn leb128_u32<E>(input: Input<'_>) -> ParseResult<'_, u32, E>
+where
+    E: From<Error>,
+{
+    let (i, num) = leb128_u64(input)?;
+    let result = u32::try_from(num).map_err(|_| ParseError::Error(Error::Leb128TooLarge.into()))?;
+    Ok((i, result))
+}
 
 /// Parse a LEB128 encoded u64 from the input, throwing an error if it is `0`
 pub(crate) fn nonzero_leb128_u64<E>(input: Input<'_>) -> ParseResult<'_, NonZeroU64, E>
@@ -57,38 +95,27 @@ where
     Ok((input, result))
 }
 
-/// Maximum LEB128-encoded size of an integer type
-const fn leb128_size<T>() -> usize {
-    let bits = size_of::<T>() * 8;
-    (bits + 6) / 7 // equivalent to ceil(bits/7) w/o floats
-}
-
 #[cfg(test)]
 mod tests {
     use super::super::Needed;
     use super::*;
-    use std::{convert::TryFrom, num::NonZeroUsize};
+    use std::num::NonZeroUsize;
 
     const NEED_ONE: Needed = Needed::Size(unsafe { NonZeroUsize::new_unchecked(1) });
 
     #[test]
-    fn leb_128_unsigned() {
+    fn leb_128_u64() {
         let one = &[0b00000001_u8];
         let one_two_nine = &[0b10000001, 0b00000001];
         let one_and_more = &[0b00000001, 0b00000011];
 
         let scenarios: Vec<(&'static [u8], ParseResult<'_, u64, Error>)> = vec![
             (one, Ok((Input::with_position(one, 1), 1))),
-            (&[0b10000001_u8], Err(ParseError::Incomplete(NEED_ONE))),
             (
                 one_two_nine,
                 Ok((Input::with_position(one_two_nine, 2), 129)),
             ),
             (one_and_more, Ok((Input::with_position(one_and_more, 1), 1))),
-            (
-                &[129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129],
-                Err(ParseError::Error(Error::Leb128TooLarge)),
-            ),
         ];
         for (index, (input, expected)) in scenarios.clone().into_iter().enumerate() {
             let result = leb128_u64(Input::new(input));
@@ -102,17 +129,174 @@ mod tests {
             }
         }
 
-        for (index, (input, expected)) in scenarios.into_iter().enumerate() {
-            let u32_expected = expected.map(|(i, e)| (i, u32::try_from(e).unwrap()));
-            let result = leb128_u32(Input::new(input));
-            if result != u32_expected {
-                panic!(
-                    "Scenario {} failed for u32: expected {:?} got {:?}",
-                    index + 1,
-                    u32_expected,
-                    result
-                );
+        let error_cases: Vec<(&'static str, &'static [u8], ParseError<_>)> = vec![
+            (
+                "too many bytes",
+                &[129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129],
+                ParseError::Error(Error::Leb128TooLarge),
+            ),
+            (
+                "too many bits",
+                &[129, 129, 129, 129, 129, 129, 129, 129, 129, 2],
+                ParseError::Error(Error::Leb128TooLarge),
+            ),
+            (
+                "overlong encoding",
+                &[129, 0],
+                ParseError::Error(Error::Leb128Overlong),
+            ),
+            ("missing data", &[255], ParseError::Incomplete(NEED_ONE)),
+        ];
+        error_cases.into_iter().for_each(|(desc, input, expected)| {
+            match leb128_u64::<Error>(Input::new(input)) {
+                Ok((_, x)) => panic!("leb128_u64 should fail with {}, got {}", desc, x),
+                Err(error) => {
+                    if error != expected {
+                        panic!("leb128_u64 should fail with {}, got {}", expected, error)
+                    }
+                }
             }
-        }
+        });
+
+        let success_cases: Vec<(&'static [u8], u64)> = vec![
+            (&[0], 0),
+            (&[0x7f], 127),
+            (&[0x80, 0x01], 128),
+            (&[0xff, 0x7f], 16383),
+            (
+                &[0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x1],
+                u64::MAX,
+            ),
+        ];
+        success_cases.into_iter().for_each(|(input, expected)| {
+            match leb128_u64::<Error>(Input::new(input)) {
+                Ok((_, x)) => {
+                    if x != expected {
+                        panic!("leb128_u64 should succeed with {}, got {}", expected, x)
+                    }
+                }
+                Err(error) => panic!("leb128_u64 should succeed with {}, got {}", expected, error),
+            }
+        });
+    }
+
+    #[test]
+    fn leb_128_u32() {
+        let error_cases: Vec<(&'static str, &'static [u8], ParseError<_>)> = vec![
+            (
+                "too many bytes",
+                &[129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129],
+                ParseError::Error(Error::Leb128TooLarge),
+            ),
+            (
+                "too many bits",
+                &[0xff, 0xff, 0xff, 0xff, 0x1f],
+                ParseError::Error(Error::Leb128TooLarge),
+            ),
+            (
+                "overlong encoding",
+                &[129, 0],
+                ParseError::Error(Error::Leb128Overlong),
+            ),
+            ("missing data", &[0xaa], ParseError::Incomplete(NEED_ONE)),
+        ];
+        error_cases.into_iter().for_each(|(desc, input, expected)| {
+            match leb128_u32::<Error>(Input::new(input)) {
+                Ok((_, x)) => panic!("leb128_u32 should fail with {}, got {}", desc, x),
+                Err(error) => {
+                    if error != expected {
+                        panic!("leb128_u32 should fail with {}, got {}", expected, error)
+                    }
+                }
+            }
+        });
+
+        let success_cases: Vec<(&'static [u8], u32)> = vec![
+            (&[0], 0),
+            (&[0x7f], 127),
+            (&[0x80, 0x01], 128),
+            (&[0xff, 0x7f], 16383),
+            (&[0xff, 0xff, 0xff, 0xff, 0x0f], u32::MAX),
+        ];
+        success_cases.into_iter().for_each(|(input, expected)| {
+            match leb128_u32::<Error>(Input::new(input)) {
+                Ok((_, x)) => {
+                    if x != expected {
+                        panic!("leb128_u32 should succeed with {}, got {}", expected, x)
+                    }
+                }
+                Err(error) => panic!("leb128_u64 should succeed with {}, got {}", expected, error),
+            }
+        });
+    }
+
+    #[test]
+    fn leb_128_i64() {
+        let error_cases: Vec<(&'static str, &'static [u8], ParseError<_>)> = vec![
+            (
+                "too many bytes",
+                &[129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129],
+                ParseError::Error(Error::Leb128TooLarge),
+            ),
+            (
+                "too many positive bits",
+                &[0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x01],
+                ParseError::Error(Error::Leb128TooLarge),
+            ),
+            (
+                "too many negative bits",
+                &[0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x7e],
+                ParseError::Error(Error::Leb128TooLarge),
+            ),
+            (
+                "overlong positive encoding",
+                &[0xbf, 0],
+                ParseError::Error(Error::Leb128Overlong),
+            ),
+            (
+                "overlong negative encoding",
+                &[0x81, 0xff, 0x7f],
+                ParseError::Error(Error::Leb128Overlong),
+            ),
+            ("missing data", &[0x90], ParseError::Incomplete(NEED_ONE)),
+        ];
+        error_cases.into_iter().for_each(|(desc, input, expected)| {
+            match leb128_i64::<Error>(Input::new(input)) {
+                Ok((_, x)) => panic!("leb128_i64 should fail with {}, got {}", desc, x),
+                Err(error) => {
+                    if error != expected {
+                        panic!("leb128_i64 should fail with {}, got {}", expected, error)
+                    }
+                }
+            }
+        });
+
+        let success_cases: Vec<(&'static [u8], i64)> = vec![
+            (&[0], 0),
+            (&[0x7f], -1),
+            (&[0x3f], 63),
+            (&[0x40], -64),
+            (&[0x80, 0x01], 128),
+            (&[0xff, 0x3f], 8191),
+            (&[0x80, 0x40], -8192),
+            (
+                &[0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x0],
+                i64::MAX,
+            ),
+            (
+                &[0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x7f],
+                i64::MIN,
+            ),
+        ];
+        success_cases.into_iter().for_each(|(input, expected)| {
+            match leb128_i64::<Error>(Input::new(input)) {
+                Ok((_, x)) => {
+                    if x != expected {
+                        panic!("leb128_i64 should succeed with {}, got {}", expected, x)
+                    }
+                }
+                Err(error) => panic!("leb128_u64 should succeed with {}, got {}", expected, error),
+            }
+        });
     }
 }