Skip to content

Commit 265b3c4

Browse files
Only read number of bytes indicated by string data item (#518)
The string parsing now no longer reads more bytes than indicated by the data item. For any incomplete UTF-8 code points an exception with a corresponding message is thrown.
1 parent 0f75408 commit 265b3c4

File tree

3 files changed

+72
-57
lines changed

3 files changed

+72
-57
lines changed

cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORParser.java

+49-53
Original file line numberDiff line numberDiff line change
@@ -2281,18 +2281,14 @@ protected void _finishToken() throws IOException
22812281
}
22822282
return;
22832283
}
2284-
// 29-Jan-2021, tatu: as per [dataformats-binary#238] must keep in mind that
2285-
// the longest individual unit is 4 bytes (surrogate pair) so we
2286-
// actually need len+3 bytes to avoid bounds checks
22872284
// 18-Jan-2024, tatu: For malicious input / Fuzzers, need to worry about overflow
22882285
// like Integer.MAX_VALUE
2289-
final int needed = Math.max(len, len + 3);
22902286
final int available = _inputEnd - _inputPtr;
22912287

2292-
if ((available >= needed)
2288+
if ((available >= len)
22932289
// if not, could we read? NOTE: we do not require it, just attempt to read
2294-
|| ((_inputBuffer.length >= needed)
2295-
&& _tryToLoadToHaveAtLeast(needed))) {
2290+
|| ((_inputBuffer.length >= len)
2291+
&& _tryToLoadToHaveAtLeast(len))) {
22962292
_finishShortText(len);
22972293
return;
22982294
}
@@ -2326,22 +2322,18 @@ protected String _finishTextToken(int ch) throws IOException
23262322
_finishChunkedText();
23272323
return _textBuffer.contentsAsString();
23282324
}
2329-
// 29-Jan-2021, tatu: as per [dataformats-binary#238] must keep in mind that
2330-
// the longest individual unit is 4 bytes (surrogate pair) so we
2331-
// actually need len+3 bytes to avoid bounds checks
23322325

23332326
// 19-Mar-2021, tatu: [dataformats-binary#259] shows the case where length
23342327
// we get is Integer.MAX_VALUE, leading to overflow. Could change values
23352328
// to longs but simpler to truncate "needed" (will never pass following test
23362329
// due to inputBuffer never being even close to that big).
23372330

2338-
final int needed = Math.max(len + 3, len);
23392331
final int available = _inputEnd - _inputPtr;
23402332

2341-
if ((available >= needed)
2333+
if ((available >= len)
23422334
// if not, could we read? NOTE: we do not require it, just attempt to read
2343-
|| ((_inputBuffer.length >= needed)
2344-
&& _tryToLoadToHaveAtLeast(needed))) {
2335+
|| ((_inputBuffer.length >= len)
2336+
&& _tryToLoadToHaveAtLeast(len))) {
23452337
return _finishShortText(len);
23462338
}
23472339
// If not enough space, need handling similar to chunked
@@ -2369,7 +2361,7 @@ private final String _finishShortText(int len) throws IOException
23692361
final byte[] inputBuf = _inputBuffer;
23702362

23712363
// Let's actually do a tight loop for ASCII first:
2372-
final int end = inPtr + len;
2364+
final int end = _inputPtr;
23732365

23742366
int i;
23752367
while ((i = inputBuf[inPtr]) >= 0) {
@@ -2386,44 +2378,50 @@ private final String _finishShortText(int len) throws IOException
23862378
final int[] codes = UTF8_UNIT_CODES;
23872379
do {
23882380
i = inputBuf[inPtr++] & 0xFF;
2389-
switch (codes[i]) {
2390-
case 0:
2391-
break;
2392-
case 1:
2393-
{
2394-
final int c2 = inputBuf[inPtr++];
2395-
if ((c2 & 0xC0) != 0x080) {
2396-
_reportInvalidOther(c2 & 0xFF, inPtr);
2397-
}
2398-
i = ((i & 0x1F) << 6) | (c2 & 0x3F);
2381+
int code = codes[i];
2382+
if (code != 0) {
2383+
// 05-Jul-2021, tatu: As per [dataformats-binary#289] need to
2384+
// be careful wrt end-of-buffer truncated codepoints
2385+
if ((inPtr + code) > end) {
2386+
final int firstCharOffset = len - (end - inPtr) - 1;
2387+
_reportTruncatedUTF8InString(len, firstCharOffset, i, code);
23992388
}
2400-
break;
2401-
case 2:
2402-
{
2403-
final int c2 = inputBuf[inPtr++];
2404-
if ((c2 & 0xC0) != 0x080) {
2405-
_reportInvalidOther(c2 & 0xFF, inPtr);
2389+
2390+
switch (code) {
2391+
case 1: {
2392+
final int c2 = inputBuf[inPtr++];
2393+
if ((c2 & 0xC0) != 0x080) {
2394+
_reportInvalidOther(c2 & 0xFF, inPtr);
2395+
}
2396+
i = ((i & 0x1F) << 6) | (c2 & 0x3F);
24062397
}
2407-
final int c3 = inputBuf[inPtr++];
2408-
if ((c3 & 0xC0) != 0x080) {
2409-
_reportInvalidOther(c3 & 0xFF, inPtr);
2398+
break;
2399+
case 2: {
2400+
final int c2 = inputBuf[inPtr++];
2401+
if ((c2 & 0xC0) != 0x080) {
2402+
_reportInvalidOther(c2 & 0xFF, inPtr);
2403+
}
2404+
final int c3 = inputBuf[inPtr++];
2405+
if ((c3 & 0xC0) != 0x080) {
2406+
_reportInvalidOther(c3 & 0xFF, inPtr);
2407+
}
2408+
i = ((i & 0x0F) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F);
24102409
}
2411-
i = ((i & 0x0F) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F);
2410+
break;
2411+
case 3:
2412+
// 30-Jan-2021, tatu: TODO - validate these too?
2413+
i = ((i & 0x07) << 18)
2414+
| ((inputBuf[inPtr++] & 0x3F) << 12)
2415+
| ((inputBuf[inPtr++] & 0x3F) << 6)
2416+
| (inputBuf[inPtr++] & 0x3F);
2417+
// note: this is the codepoint value; need to split, too
2418+
i -= 0x10000;
2419+
outBuf[outPtr++] = (char) (0xD800 | (i >> 10));
2420+
i = 0xDC00 | (i & 0x3FF);
2421+
break;
2422+
default: // invalid
2423+
_reportInvalidInitial(i);
24122424
}
2413-
break;
2414-
case 3:
2415-
// 30-Jan-2021, tatu: TODO - validate these too?
2416-
i = ((i & 0x07) << 18)
2417-
| ((inputBuf[inPtr++] & 0x3F) << 12)
2418-
| ((inputBuf[inPtr++] & 0x3F) << 6)
2419-
| (inputBuf[inPtr++] & 0x3F);
2420-
// note: this is the codepoint value; need to split, too
2421-
i -= 0x10000;
2422-
outBuf[outPtr++] = (char) (0xD800 | (i >> 10));
2423-
i = 0xDC00 | (i & 0x3FF);
2424-
break;
2425-
default: // invalid
2426-
_reportInvalidInitial(i);
24272425
}
24282426
outBuf[outPtr++] = (char) i;
24292427
} while (inPtr < end);
@@ -3850,18 +3848,16 @@ protected void _reportIncompleteBinaryRead(int expLen, int actLen) throws IOExce
38503848
expLen, actLen), _currToken);
38513849
}
38523850

3853-
// @since 2.13
3854-
/*
3851+
// @since 2.19
38553852
private String _reportTruncatedUTF8InString(int strLenBytes, int truncatedCharOffset,
38563853
int firstUTFByteValue, int bytesExpected)
38573854
throws IOException
38583855
{
38593856
throw _constructError(String.format(
3860-
"Truncated UTF-8 character in Chunked Unicode String value (%d bytes): "
3857+
"Truncated UTF-8 character in Unicode String value (%d bytes): "
38613858
+"byte 0x%02X at offset #%d indicated %d more bytes needed",
38623859
strLenBytes, firstUTFByteValue, truncatedCharOffset, bytesExpected));
38633860
}
3864-
*/
38653861

38663862
// @since 2.13
38673863
private String _reportTruncatedUTF8InName(int strLenBytes, int truncatedCharOffset,

cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/fuzz/CBORFuzz_35979_StringValueTest.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ public void testInvalidTextValueWithBrokenUTF8() throws Exception
2727
p.getText();
2828
fail("Should not pass");
2929
} catch (StreamReadException e) {
30-
verifyException(e, "Malformed UTF-8 character at the end of a");
30+
verifyException(e, "Truncated UTF-8 character in Unicode String value (256 bytes)");
3131
}
3232

3333
}

cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/parse/ParseInvalidUTF8String236Test.java

+22-3
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@
55
import com.fasterxml.jackson.dataformat.cbor.CBORParser;
66
import com.fasterxml.jackson.dataformat.cbor.CBORTestBase;
77

8+
import java.nio.charset.StandardCharsets;
9+
import java.util.Arrays;
10+
811
public class ParseInvalidUTF8String236Test extends CBORTestBase
912
{
1013
// [dataformats-binary#236]: Original version; broken UTF-8 all around.
@@ -24,8 +27,8 @@ public void testShortString236Original() throws Exception
2427
}
2528

2629
// Variant where the length would be valid, but the last byte is partial UTF-8
27-
// code point
28-
public void testShortString236EndsWithPartialUTF8() throws Exception
30+
// code point and no more bytes are available due to end-of-stream
31+
public void testShortString236EndsWithPartialUTF8AtEndOfStream() throws Exception
2932
{
3033
final byte[] input = {0x63, 0x41, 0x2d, (byte) 0xda};
3134
try (CBORParser p = cborParser(input)) {
@@ -34,7 +37,23 @@ public void testShortString236EndsWithPartialUTF8() throws Exception
3437
String str = p.getText();
3538
fail("Should have failed, did not, String = '"+str+"'");
3639
} catch (StreamReadException e) {
37-
verifyException(e, "Malformed UTF-8 character at the end of");
40+
verifyException(e, "Truncated UTF-8 character in Unicode String value (3 bytes)");
41+
}
42+
}
43+
}
44+
45+
// Variant where the length would be valid, but the last byte is partial UTF-8
46+
// code point and the subsequent byte would be a valid continuation byte, but belongs to next data item
47+
public void testShortString236EndsWithPartialUTF8() throws Exception
48+
{
49+
final byte[] input = {0x62, 0x33, (byte) 0xdb, (byte) 0xa0};
50+
try (CBORParser p = cborParser(input)) {
51+
assertToken(JsonToken.VALUE_STRING, p.nextToken());
52+
try {
53+
String str = p.getText();
54+
fail("Should have failed, did not, String = '"+str+"'");
55+
} catch (StreamReadException e) {
56+
verifyException(e, "Truncated UTF-8 character in Unicode String value (2 bytes)");
3857
}
3958
}
4059
}

0 commit comments

Comments
 (0)