@@ -2281,18 +2281,14 @@ protected void _finishToken() throws IOException
2281
2281
}
2282
2282
return ;
2283
2283
}
2284
- // 29-Jan-2021, tatu: as per [dataformats-binary#238] must keep in mind that
2285
- // the longest individual unit is 4 bytes (surrogate pair) so we
2286
- // actually need len+3 bytes to avoid bounds checks
2287
2284
// 18-Jan-2024, tatu: For malicious input / Fuzzers, need to worry about overflow
2288
2285
// like Integer.MAX_VALUE
2289
- final int needed = Math .max (len , len + 3 );
2290
2286
final int available = _inputEnd - _inputPtr ;
2291
2287
2292
- if ((available >= needed )
2288
+ if ((available >= len )
2293
2289
// if not, could we read? NOTE: we do not require it, just attempt to read
2294
- || ((_inputBuffer .length >= needed )
2295
- && _tryToLoadToHaveAtLeast (needed ))) {
2290
+ || ((_inputBuffer .length >= len )
2291
+ && _tryToLoadToHaveAtLeast (len ))) {
2296
2292
_finishShortText (len );
2297
2293
return ;
2298
2294
}
@@ -2326,22 +2322,18 @@ protected String _finishTextToken(int ch) throws IOException
2326
2322
_finishChunkedText ();
2327
2323
return _textBuffer .contentsAsString ();
2328
2324
}
2329
- // 29-Jan-2021, tatu: as per [dataformats-binary#238] must keep in mind that
2330
- // the longest individual unit is 4 bytes (surrogate pair) so we
2331
- // actually need len+3 bytes to avoid bounds checks
2332
2325
2333
2326
// 19-Mar-2021, tatu: [dataformats-binary#259] shows the case where length
2334
2327
// we get is Integer.MAX_VALUE, leading to overflow. Could change values
2335
2328
// to longs but simpler to truncate "needed" (will never pass following test
2336
2329
// due to inputBuffer never being even close to that big).
2337
2330
2338
- final int needed = Math .max (len + 3 , len );
2339
2331
final int available = _inputEnd - _inputPtr ;
2340
2332
2341
- if ((available >= needed )
2333
+ if ((available >= len )
2342
2334
// if not, could we read? NOTE: we do not require it, just attempt to read
2343
- || ((_inputBuffer .length >= needed )
2344
- && _tryToLoadToHaveAtLeast (needed ))) {
2335
+ || ((_inputBuffer .length >= len )
2336
+ && _tryToLoadToHaveAtLeast (len ))) {
2345
2337
return _finishShortText (len );
2346
2338
}
2347
2339
// If not enough space, need handling similar to chunked
@@ -2369,7 +2361,7 @@ private final String _finishShortText(int len) throws IOException
2369
2361
final byte [] inputBuf = _inputBuffer ;
2370
2362
2371
2363
// Let's actually do a tight loop for ASCII first:
2372
- final int end = inPtr + len ;
2364
+ final int end = _inputPtr ;
2373
2365
2374
2366
int i ;
2375
2367
while ((i = inputBuf [inPtr ]) >= 0 ) {
@@ -2386,44 +2378,50 @@ private final String _finishShortText(int len) throws IOException
2386
2378
final int [] codes = UTF8_UNIT_CODES ;
2387
2379
do {
2388
2380
i = inputBuf [inPtr ++] & 0xFF ;
2389
- switch (codes [i ]) {
2390
- case 0 :
2391
- break ;
2392
- case 1 :
2393
- {
2394
- final int c2 = inputBuf [inPtr ++];
2395
- if ((c2 & 0xC0 ) != 0x080 ) {
2396
- _reportInvalidOther (c2 & 0xFF , inPtr );
2397
- }
2398
- i = ((i & 0x1F ) << 6 ) | (c2 & 0x3F );
2381
+ int code = codes [i ];
2382
+ if (code != 0 ) {
2383
+ // 05-Jul-2021, tatu: As per [dataformats-binary#289] need to
2384
+ // be careful wrt end-of-buffer truncated codepoints
2385
+ if ((inPtr + code ) > end ) {
2386
+ final int firstCharOffset = len - (end - inPtr ) - 1 ;
2387
+ _reportTruncatedUTF8InString (len , firstCharOffset , i , code );
2399
2388
}
2400
- break ;
2401
- case 2 :
2402
- {
2403
- final int c2 = inputBuf [inPtr ++];
2404
- if ((c2 & 0xC0 ) != 0x080 ) {
2405
- _reportInvalidOther (c2 & 0xFF , inPtr );
2389
+
2390
+ switch (code ) {
2391
+ case 1 : {
2392
+ final int c2 = inputBuf [inPtr ++];
2393
+ if ((c2 & 0xC0 ) != 0x080 ) {
2394
+ _reportInvalidOther (c2 & 0xFF , inPtr );
2395
+ }
2396
+ i = ((i & 0x1F ) << 6 ) | (c2 & 0x3F );
2406
2397
}
2407
- final int c3 = inputBuf [inPtr ++];
2408
- if ((c3 & 0xC0 ) != 0x080 ) {
2409
- _reportInvalidOther (c3 & 0xFF , inPtr );
2398
+ break ;
2399
+ case 2 : {
2400
+ final int c2 = inputBuf [inPtr ++];
2401
+ if ((c2 & 0xC0 ) != 0x080 ) {
2402
+ _reportInvalidOther (c2 & 0xFF , inPtr );
2403
+ }
2404
+ final int c3 = inputBuf [inPtr ++];
2405
+ if ((c3 & 0xC0 ) != 0x080 ) {
2406
+ _reportInvalidOther (c3 & 0xFF , inPtr );
2407
+ }
2408
+ i = ((i & 0x0F ) << 12 ) | ((c2 & 0x3F ) << 6 ) | (c3 & 0x3F );
2410
2409
}
2411
- i = ((i & 0x0F ) << 12 ) | ((c2 & 0x3F ) << 6 ) | (c3 & 0x3F );
2410
+ break ;
2411
+ case 3 :
2412
+ // 30-Jan-2021, tatu: TODO - validate these too?
2413
+ i = ((i & 0x07 ) << 18 )
2414
+ | ((inputBuf [inPtr ++] & 0x3F ) << 12 )
2415
+ | ((inputBuf [inPtr ++] & 0x3F ) << 6 )
2416
+ | (inputBuf [inPtr ++] & 0x3F );
2417
+ // note: this is the codepoint value; need to split, too
2418
+ i -= 0x10000 ;
2419
+ outBuf [outPtr ++] = (char ) (0xD800 | (i >> 10 ));
2420
+ i = 0xDC00 | (i & 0x3FF );
2421
+ break ;
2422
+ default : // invalid
2423
+ _reportInvalidInitial (i );
2412
2424
}
2413
- break ;
2414
- case 3 :
2415
- // 30-Jan-2021, tatu: TODO - validate these too?
2416
- i = ((i & 0x07 ) << 18 )
2417
- | ((inputBuf [inPtr ++] & 0x3F ) << 12 )
2418
- | ((inputBuf [inPtr ++] & 0x3F ) << 6 )
2419
- | (inputBuf [inPtr ++] & 0x3F );
2420
- // note: this is the codepoint value; need to split, too
2421
- i -= 0x10000 ;
2422
- outBuf [outPtr ++] = (char ) (0xD800 | (i >> 10 ));
2423
- i = 0xDC00 | (i & 0x3FF );
2424
- break ;
2425
- default : // invalid
2426
- _reportInvalidInitial (i );
2427
2425
}
2428
2426
outBuf [outPtr ++] = (char ) i ;
2429
2427
} while (inPtr < end );
@@ -3850,18 +3848,16 @@ protected void _reportIncompleteBinaryRead(int expLen, int actLen) throws IOExce
3850
3848
expLen , actLen ), _currToken );
3851
3849
}
3852
3850
3853
- // @since 2.13
3854
- /*
3851
+ // @since 2.19
3855
3852
// Builds and throws the error for a multi-byte UTF-8 character cut short inside a String value.
// Never returns normally: the only statement in the body is a `throw`.
// NOTE(review): the String return type presumably exists so call sites can use it in
// expression/return position -- confirm against callers.
//
// strLenBytes: printed as the "(%d bytes)" length of the String value
// truncatedCharOffset: printed as "offset #%d"
// firstUTFByteValue: printed in hex as "byte 0x%02X"
// bytesExpected: printed as "%d more bytes needed"
private String _reportTruncatedUTF8InString (int strLenBytes , int truncatedCharOffset ,
3856
3853
int firstUTFByteValue , int bytesExpected )
3857
3854
throws IOException
3858
3855
{
3859
3856
// The exception itself is created by _constructError(); this method only supplies the message.
throw _constructError (String .format (
3860
- "Truncated UTF-8 character in Chunked Unicode String value (%d bytes): "
3857
+ "Truncated UTF-8 character in Unicode String value (%d bytes): "
3861
3858
+"byte 0x%02X at offset #%d indicated %d more bytes needed" ,
3862
3859
// Argument order follows the placeholders, not the parameter list: the byte value
// comes before the offset here, although the offset is declared first in the signature.
strLenBytes , firstUTFByteValue , truncatedCharOffset , bytesExpected ));
3863
3860
}
3864
- */
3865
3861
3866
3862
// @since 2.13
3867
3863
private String _reportTruncatedUTF8InName (int strLenBytes , int truncatedCharOffset ,
0 commit comments