@@ -125,13 +125,14 @@ Section: Creating a string
125
125
#[ stable( feature = "rust1" , since = "1.0.0" ) ]
126
126
pub struct Utf8Error {
127
127
valid_up_to : usize ,
128
+ invalid_length : Option < u8 > ,
128
129
}
129
130
130
131
impl Utf8Error {
131
132
/// Returns the index in the given string up to which valid UTF-8 was
132
133
/// verified.
133
134
///
134
- /// It is the maximum index such that `from_utf8(input[..index])`
135
+ /// It is the maximum index such that `from_utf8(& input[..index])`
135
136
/// would return `Ok(_)`.
136
137
///
137
138
/// # Examples
@@ -152,6 +153,21 @@ impl Utf8Error {
152
153
/// ```
153
154
#[ stable( feature = "utf8_error" , since = "1.5.0" ) ]
154
155
pub fn valid_up_to ( & self ) -> usize { self . valid_up_to }
156
+
157
+ /// Provides more information about the failure:
158
+ ///
159
+ /// * `None`: the end of the input was reached unexpectedly.
160
+ /// `self.valid_up_to()` is 1 to 3 bytes from the end of the input.
161
+ /// If a byte stream (such as a file or a network socket) is being decoded incrementally,
162
+ /// this could be a valid `char` whose UTF-8 byte sequence spans multiple chunks.
163
+ ///
164
+ /// * `Some(index)`: an unexpected byte was encountered.
165
+ /// The index provided is where decoding should resume
166
+ /// (after inserting a U+FFFD REPLACEMENT CHARACTER) in case of lossy decoding.
167
+ #[ unstable( feature = "utf8_error_resume_from" , reason ="new" , issue = "0" ) ]
168
+ pub fn resume_from ( & self ) -> Option < usize > {
169
+ self . invalid_length . map ( |l| self . valid_up_to + l as usize )
170
+ }
155
171
}
156
172
157
173
/// Converts a slice of bytes to a string slice.
@@ -300,7 +316,12 @@ pub unsafe fn from_utf8_unchecked(v: &[u8]) -> &str {
300
316
#[ stable( feature = "rust1" , since = "1.0.0" ) ]
301
317
impl fmt:: Display for Utf8Error {
302
318
fn fmt ( & self , f : & mut fmt:: Formatter ) -> fmt:: Result {
303
- write ! ( f, "invalid utf-8: invalid byte near index {}" , self . valid_up_to)
319
+ if let Some ( invalid_length) = self . invalid_length {
320
+ write ! ( f, "invalid utf-8 sequence of {} bytes from index {}" ,
321
+ invalid_length, self . valid_up_to)
322
+ } else {
323
+ write ! ( f, "incomplete utf-8 byte sequence from index {}" , self . valid_up_to)
324
+ }
304
325
}
305
326
}
306
327
@@ -1241,25 +1262,27 @@ fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
1241
1262
1242
1263
while index < len {
1243
1264
let old_offset = index;
1244
- macro_rules! err { ( ) => { {
1245
- return Err ( Utf8Error {
1246
- valid_up_to: old_offset
1247
- } )
1248
- } } }
1265
+ macro_rules! err {
1266
+ ( $invalid_length: expr) => {
1267
+ return Err ( Utf8Error {
1268
+ valid_up_to: old_offset,
1269
+ invalid_length: $invalid_length,
1270
+ } )
1271
+ }
1272
+ }
1249
1273
1250
1274
macro_rules! next { ( ) => { {
1251
1275
index += 1 ;
1252
1276
// we needed data, but there was none: error!
1253
1277
if index >= len {
1254
- err!( )
1278
+ err!( None )
1255
1279
}
1256
1280
v[ index]
1257
1281
} } }
1258
1282
1259
1283
let first = v[ index] ;
1260
1284
if first >= 128 {
1261
1285
let w = UTF8_CHAR_WIDTH [ first as usize ] ;
1262
- let second = next ! ( ) ;
1263
1286
// 2-byte encoding is for codepoints \u{0080} to \u{07ff}
1264
1287
// first C2 80 last DF BF
1265
1288
// 3-byte encoding is for codepoints \u{0800} to \u{ffff}
@@ -1279,25 +1302,36 @@ fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
1279
1302
// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
1280
1303
// %xF4 %x80-8F 2( UTF8-tail )
1281
1304
match w {
1282
- 2 => if second & !CONT_MASK != TAG_CONT_U8 { err ! ( ) } ,
1305
+ 2 => if next ! ( ) & !CONT_MASK != TAG_CONT_U8 {
1306
+ err ! ( Some ( 1 ) )
1307
+ } ,
1283
1308
3 => {
1284
- match ( first, second, next ! ( ) & !CONT_MASK ) {
1285
- ( 0xE0 , 0xA0 ... 0xBF , TAG_CONT_U8 ) |
1286
- ( 0xE1 ... 0xEC , 0x80 ... 0xBF , TAG_CONT_U8 ) |
1287
- ( 0xED , 0x80 ... 0x9F , TAG_CONT_U8 ) |
1288
- ( 0xEE ... 0xEF , 0x80 ... 0xBF , TAG_CONT_U8 ) => { }
1289
- _ => err ! ( )
1309
+ match ( first, next ! ( ) ) {
1310
+ ( 0xE0 , 0xA0 ... 0xBF ) |
1311
+ ( 0xE1 ... 0xEC , 0x80 ... 0xBF ) |
1312
+ ( 0xED , 0x80 ... 0x9F ) |
1313
+ ( 0xEE ... 0xEF , 0x80 ... 0xBF ) => { }
1314
+ _ => err ! ( Some ( 1 ) )
1315
+ }
1316
+ if next ! ( ) & !CONT_MASK != TAG_CONT_U8 {
1317
+ err ! ( Some ( 2 ) )
1290
1318
}
1291
1319
}
1292
1320
4 => {
1293
- match ( first, second, next ! ( ) & !CONT_MASK , next ! ( ) & !CONT_MASK ) {
1294
- ( 0xF0 , 0x90 ... 0xBF , TAG_CONT_U8 , TAG_CONT_U8 ) |
1295
- ( 0xF1 ... 0xF3 , 0x80 ... 0xBF , TAG_CONT_U8 , TAG_CONT_U8 ) |
1296
- ( 0xF4 , 0x80 ... 0x8F , TAG_CONT_U8 , TAG_CONT_U8 ) => { }
1297
- _ => err ! ( )
1321
+ match ( first, next ! ( ) ) {
1322
+ ( 0xF0 , 0x90 ... 0xBF ) |
1323
+ ( 0xF1 ... 0xF3 , 0x80 ... 0xBF ) |
1324
+ ( 0xF4 , 0x80 ... 0x8F ) => { }
1325
+ _ => err ! ( Some ( 1 ) )
1326
+ }
1327
+ if next ! ( ) & !CONT_MASK != TAG_CONT_U8 {
1328
+ err ! ( Some ( 2 ) )
1329
+ }
1330
+ if next ! ( ) & !CONT_MASK != TAG_CONT_U8 {
1331
+ err ! ( Some ( 3 ) )
1298
1332
}
1299
1333
}
1300
- _ => err ! ( )
1334
+ _ => err ! ( Some ( 1 ) )
1301
1335
}
1302
1336
index += 1 ;
1303
1337
} else {
0 commit comments