smheidrich · smheidrich · Jan 8, 2025 · Jan 8, 2025 · Jan 8, 2025
diff --git a/src/suitable_unbuffered_bytes_stream.rs b/src/suitable_unbuffered_bytes_stream.rs
@@ -25,7 +25,7 @@ impl SuitableUnbufferedBytesStream {
 impl Utf8CharSource for SuitableUnbufferedBytesStream {
     fn read_char(&mut self) -> io::Result<Option<char>> {
         let mut buf: [u8; 4] = [0; 4];
-        let n_bytes_read = self.inner.read(&mut buf[..1])?;
+        let mut n_bytes_read = self.inner.read(&mut buf[..1])?;
         if n_bytes_read < 1 {
             // EOF
             return Ok(None);
@@ -36,23 +36,35 @@ impl Utf8CharSource for SuitableUnbufferedBytesStream {
                 "broken stream: returns more bytes than requested",
             ));
         }
+        // try to see if we're at the start of a unicode char:
         let n_bytes_in_char = get_width(buf[0]);
         if n_bytes_in_char == 0 {
             return Err(io::Error::new(
                 io::ErrorKind::Other,
                 format!("invalid UTF-8 start byte: {:x}", buf[0]),
             ));
         }
-        let n_bytes_actual = {
-            if n_bytes_in_char > 1 {
-                // this should only return fewer bytes than requested if it's cut short by EOF
-                // => will evaluate to invalid UTF-8 at the end and return an error
-                self.inner.read(&mut buf[1..n_bytes_in_char])? + 1
-            } else {
-                1
-            }
-        };
-        Ok(std::str::from_utf8(&buf[..n_bytes_actual])
+        // if we're inside a unicode char, we try and read its remaining bytes;
+        // a single read may return fewer bytes than requested (e.g. at a chunk
+        // boundary), so keep asking until the char is complete:
+        while n_bytes_read < n_bytes_in_char {
+            let n_new = self.inner.read(&mut buf[n_bytes_read..n_bytes_in_char])?;
+            if n_new == 0 {
+                // a 0-byte read means EOF in the middle of a char: without this
+                // break the loop would spin forever; from_utf8 below rejects the
+                // truncated sequence instead
+                break;
+            }
+            if n_new > n_bytes_in_char - n_bytes_read {
+                // mirrors the "broken stream" error raised for the first byte and
+                // keeps the slice below within the 4-byte buffer
+                return Err(io::Error::new(
+                    io::ErrorKind::Other,
+                    "broken stream: returns more bytes than requested",
+                ));
+            }
+            n_bytes_read += n_new;
+        }
+        Ok(std::str::from_utf8(&buf[..n_bytes_read])
             .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("{}", e)))?
             .chars()
             .next())

diff --git a/tests/test_load_iterable.py b/tests/test_load_iterable.py
@@ -0,0 +1,26 @@
+"""
+Test compatibility with json-stream's support for giving iterables to `load()`.
+"""
+import json_stream
+import pytest
+
+
+@pytest.mark.parametrize("chunk_size", [1, 2, 3, 4, 10])
+def test_chunk_boundary_inside_utf8_char(chunk_size: int) -> None:
+    """
+    Check that a multi-byte UTF-8 char split across input chunks still parses.
+
+    Regression test for https://github.com/daggaz/json-stream/issues/59.
+    """
+    expected = "——"
+    # the dashes are multi-byte in UTF-8, so small chunks cut through a char
+    encoded = f'"{expected}"'.encode("utf-8")
+
+    # lazy generator: a chunk is only sliced off when `load()` asks for it
+    offsets = range(0, len(encoded), chunk_size)
+    chunks = (encoded[start : start + chunk_size] for start in offsets)
+
+    result = json_stream.load(chunks)
+
+    # the top-level JSON string must compare equal to the original text
+    assert result == expected