unsafe audit

JoshMcguigan · JoshMcguigan · commit 8dfa67982935 · 2020-06-19T22:22:31.000-07:00
diff --git a/form_urlencoded/src/lib.rs b/form_urlencoded/src/lib.rs
@@ -150,6 +150,10 @@ impl<'a> Iterator for ByteSerialize<'a> {
                 None => (self.bytes, &[][..]),
             };
             self.bytes = remaining;
+            // This unsafe is appropriate because we have already checked these
+            // bytes in byte_serialized_unchanged, which checks for a subset
+            // of UTF-8. So we know these bytes are valid UTF-8, and doing
+            // another UTF-8 check would be wasteful.
             Some(unsafe { str::from_utf8_unchecked(unchanged_slice) })
         } else {
             None
diff --git a/form_urlencoded/src/query_encoding.rs b/form_urlencoded/src/query_encoding.rs
@@ -18,18 +18,34 @@ pub(crate) fn encode<'a>(encoding_override: EncodingOverride, input: &'a str) ->
 }
 
 pub(crate) fn decode_utf8_lossy(input: Cow<[u8]>) -> Cow<str> {
+    // Note: This function is duplicated in `percent_encoding/lib.rs`.
     match input {
         Cow::Borrowed(bytes) => String::from_utf8_lossy(bytes),
         Cow::Owned(bytes) => {
-            let raw_utf8: *const [u8];
             match String::from_utf8_lossy(&bytes) {
-                Cow::Borrowed(utf8) => raw_utf8 = utf8.as_bytes(),
-                Cow::Owned(s) => return s.into(),
+                Cow::Borrowed(utf8) => {
+                    // If from_utf8_lossy returns a Cow::Borrowed, then we can
+                    // be sure our original bytes were valid UTF-8. This is because
+                    // if the bytes were invalid UTF-8 from_utf8_lossy would have
+                    // to allocate a new owned string to back the Cow so it could
+                    // replace invalid bytes with a placeholder.
+
+                    // First we do a debug_assert to confirm our description above.
+                    let raw_utf8: *const [u8];
+                    raw_utf8 = utf8.as_bytes();
+                    debug_assert!(raw_utf8 == &*bytes as *const [u8]);
+
+                    // Given we know the original input bytes are valid UTF-8,
+                    // and we have ownership of those bytes, we re-use them and
+                    // return a Cow::Owned here. Ideally we'd put our return statement
+                    // right below this line, but to support the old lexically scoped
+                    // borrow checker the return must be moved to outside the match
+                    // statement.
+                },
+                Cow::Owned(s) => return Cow::Owned(s),
             }
-            // from_utf8_lossy returned a borrow of `bytes` unchanged.
-            debug_assert!(raw_utf8 == &*bytes as *const [u8]);
-            // Reuse the existing `Vec` allocation.
-            unsafe { String::from_utf8_unchecked(bytes) }.into()
+
+            Cow::Owned(unsafe { String::from_utf8_unchecked(bytes) })
         }
     }
 }
diff --git a/idna/src/punycode.rs b/idna/src/punycode.rs
@@ -143,11 +143,10 @@ pub fn encode_str(input: &str) -> Option<String> {
 /// 63 encoded bytes, the DNS limit on domain name labels.
 pub fn encode(input: &[char]) -> Option<String> {
     // Handle "basic" (ASCII) code points. They are encoded as-is.
-    let output_bytes = input
+    let mut output : String = input
         .iter()
-        .filter_map(|&c| if c.is_ascii() { Some(c as u8) } else { None })
+        .filter(|&c| c.is_ascii())
         .collect();
-    let mut output = unsafe { String::from_utf8_unchecked(output_bytes) };
     let basic_length = output.len() as u32;
     if basic_length > 0 {
         output.push_str("-")
diff --git a/percent_encoding/lib.rs b/percent_encoding/lib.rs
@@ -252,6 +252,8 @@ impl<'a> Iterator for PercentEncode<'a> {
                 self.bytes = remaining;
                 Some(percent_encode_byte(first_byte))
             } else {
+                // The unsafe blocks here are appropriate because the bytes are
+                // confirmed as a subset of UTF-8 in should_percent_encode.
                 for (i, &byte) in remaining.iter().enumerate() {
                     if self.ascii_set.should_percent_encode(byte) {
                         // 1 for first_byte + i for previous iterations of this loop
@@ -425,18 +427,34 @@ impl<'a> PercentDecode<'a> {
 }
 
 fn decode_utf8_lossy(input: Cow<[u8]>) -> Cow<str> {
+    // Note: This function is duplicated in `form_urlencoded/src/query_encoding.rs`.
     match input {
         Cow::Borrowed(bytes) => String::from_utf8_lossy(bytes),
         Cow::Owned(bytes) => {
-            let raw_utf8: *const [u8];
             match String::from_utf8_lossy(&bytes) {
-                Cow::Borrowed(utf8) => raw_utf8 = utf8.as_bytes(),
-                Cow::Owned(s) => return s.into(),
+                Cow::Borrowed(utf8) => {
+                    // If from_utf8_lossy returns a Cow::Borrowed, then we can
+                    // be sure our original bytes were valid UTF-8. This is because
+                    // if the bytes were invalid UTF-8 from_utf8_lossy would have
+                    // to allocate a new owned string to back the Cow so it could
+                    // replace invalid bytes with a placeholder.
+
+                    // First we do a debug_assert to confirm our description above.
+                    let raw_utf8: *const [u8];
+                    raw_utf8 = utf8.as_bytes();
+                    debug_assert!(raw_utf8 == &*bytes as *const [u8]);
+
+                    // Given we know the original input bytes are valid UTF-8,
+                    // and we have ownership of those bytes, we re-use them and
+                    // return a Cow::Owned here. Ideally we'd put our return statement
+                    // right below this line, but to support the old lexically scoped
+                    // borrow checker the return must be moved to outside the match
+                    // statement.
+                },
+                Cow::Owned(s) => return Cow::Owned(s),
             }
-            // from_utf8_lossy returned a borrow of `bytes` unchanged.
-            debug_assert!(raw_utf8 == &*bytes as *const [u8]);
-            // Reuse the existing `Vec` allocation.
-            unsafe { String::from_utf8_unchecked(bytes) }.into()
+
+            Cow::Owned(unsafe { String::from_utf8_unchecked(bytes) })
         }
     }
 }