servo · valenting · May 27, 2016 · SimonSapin · May 28, 2016 · SimonSapin
diff --git a/Cargo.toml b/Cargo.toml
@@ -28,6 +28,7 @@ rustc-serialize = "0.3"
 [features]
 query_encoding = ["encoding"]
 heap_size = ["heapsize", "heapsize_plugin"]
+only_percent_decode_hostname_valid = []
 
 [dependencies]
 idna = { version = "0.1.0", path = "./idna" }

diff --git a/src/host.rs b/src/host.rs
@@ -12,7 +12,7 @@ use std::io;
 use std::net::{Ipv4Addr, Ipv6Addr, SocketAddr, SocketAddrV4, SocketAddrV6, ToSocketAddrs};
 use std::vec;
 use parser::{ParseResult, ParseError};
-use percent_encoding::percent_decode;
+use percent_encoding::percent_decode_hostname;
 use idna;
 
 #[derive(Copy, Clone, Debug, Eq, PartialEq)]
@@ -77,10 +77,10 @@ impl Host<String> {
             }
             return parse_ipv6addr(&input[1..input.len() - 1]).map(Host::Ipv6)
         }
-        let domain = percent_decode(input.as_bytes()).decode_utf8_lossy();
+        let domain = percent_decode_hostname(input.as_bytes()).decode_utf8_lossy();
         let domain = try!(idna::domain_to_ascii(&domain));
         if domain.find(|c| matches!(c,
-            '\0' | '\t' | '\n' | '\r' | ' ' | '#' | '%' | '/' | ':' | '?' | '@' | '[' | '\\' | ']'
+            '\0' | '\t' | '\n' | '\r' | ' ' | '#' | '/' | ':' | '?' | '@' | '[' | '\\' | ']'
         )).is_some() {
             return Err(ParseError::InvalidDomainCharacter)
         }

diff --git a/src/percent_encoding.rs b/src/percent_encoding.rs
@@ -110,6 +110,14 @@ define_encode_set! {
     }
 }
 
+define_encode_set! {
+    /// Bytes that hostname-only decoding (`only_hostname_valid`) leaves percent-encoded.
+    pub HOSTNAME_ENCODE_SET = [SIMPLE_ENCODE_SET] | {
+        ' ', '!', '"', '#', '$', '%', '&', '\'', '(', ')', '+', ',', '/', ':', ';',
+        '<', '=', '>', '?', '@', '[', '\\', ']', '^', '`', '{', '|', '}', '~'
+    }
+}
+
 /// Return the percent-encoding of the given bytes.
 ///
 /// This is unconditional, unlike `percent_encode()` which uses an encode set.
@@ -244,26 +252,44 @@ impl<'a, E: EncodeSet> From<PercentEncode<'a, E>> for Cow<'a, str> {
 #[inline]
 pub fn percent_decode<'a>(input: &'a [u8]) -> PercentDecode<'a> {
     PercentDecode {
-        bytes: input.iter()
+        bytes: input.iter(),
+        only_hostname_valid: false,
+    }
+}
+
+#[cfg(feature = "only_percent_decode_hostname_valid")] // feature on: restricted decoding
+pub fn percent_decode_hostname<'a>(input: &'a [u8]) -> PercentDecode<'a> {
+    PercentDecode {
+        bytes: input.iter(),
+        only_hostname_valid: true, // bytes in HOSTNAME_ENCODE_SET stay percent-encoded
     }
 }
 
+#[cfg(not(feature = "only_percent_decode_hostname_valid"))] // feature off: plain percent_decode()
+pub fn percent_decode_hostname<'a>(input: &'a [u8]) -> PercentDecode<'a> {
+    percent_decode(input)
+}
+
 /// The return type of `percent_decode()`.
 #[derive(Clone)]
 pub struct PercentDecode<'a> {
     bytes: slice::Iter<'a, u8>,
+    only_hostname_valid: bool, // when true, bytes in HOSTNAME_ENCODE_SET are never decoded
 }
 
-fn after_percent_sign(iter: &mut slice::Iter<u8>) -> Option<u8> {
+fn after_percent_sign(iter: &mut slice::Iter<u8>, only_hostname_valid: bool) -> Option<u8> {
     let initial_iter = iter.clone();
     let h = iter.next().and_then(|&b| (b as char).to_digit(16));
     let l = iter.next().and_then(|&b| (b as char).to_digit(16));
     if let (Some(h), Some(l)) = (h, l) {
-        Some(h as u8 * 0x10 + l as u8)
-    } else {
-        *iter = initial_iter;
-        None
+        let c = h as u8 * 0x10 + l as u8;
+        // In hostname mode, a byte that is in HOSTNAME_ENCODE_SET is not decoded.
+        if !only_hostname_valid || !HOSTNAME_ENCODE_SET.contains(c) {
+            return Some(c);
+        }
     }
+    *iter = initial_iter; // rewind so the caller re-reads the bytes after '%' unchanged
+    None
 }
 
 impl<'a> Iterator for PercentDecode<'a> {
@@ -272,7 +298,7 @@ impl<'a> Iterator for PercentDecode<'a> {
     fn next(&mut self) -> Option<u8> {
         self.bytes.next().map(|&byte| {
             if byte == b'%' {
-                after_percent_sign(&mut self.bytes).unwrap_or(byte)
+                after_percent_sign(&mut self.bytes, self.only_hostname_valid).unwrap_or(byte)
             } else {
                 byte
             }
@@ -299,13 +325,14 @@ impl<'a> PercentDecode<'a> {
     pub fn if_any(&self) -> Option<Vec<u8>> {
         let mut bytes_iter = self.bytes.clone();
         while bytes_iter.find(|&&b| b == b'%').is_some() {
-            if let Some(decoded_byte) = after_percent_sign(&mut bytes_iter) {
+            if let Some(decoded_byte) = after_percent_sign(&mut bytes_iter, self.only_hostname_valid) {
                 let initial_bytes = self.bytes.as_slice();
                 let unchanged_bytes_len = initial_bytes.len() - bytes_iter.len() - 3;
                 let mut decoded = initial_bytes[..unchanged_bytes_len].to_owned();
                 decoded.push(decoded_byte);
                 decoded.extend(PercentDecode {
-                    bytes: bytes_iter
+                    bytes: bytes_iter,
+                    only_hostname_valid: self.only_hostname_valid,
                 });
                 return Some(decoded)
             }

diff --git a/src/slicing.rs b/src/slicing.rs
@@ -8,6 +8,7 @@
 
 use std::ops::{Range, RangeFrom, RangeTo, RangeFull, Index};
 use Url;
+use std::mem;
 
 impl Index<RangeFull> for Url {
     type Output = str;
@@ -77,6 +78,7 @@ impl Index<Range<Position>> for Url {
 /// `BeforeScheme` and `AfterFragment` are always the start and end of the entire URL,
 /// so `&url[BeforeScheme..X]` is the same as `&url[..X]`
 /// and `&url[X..AfterFragment]` is the same as `&url[X..]`.
+#[repr(u32)]
 #[derive(Copy, Clone, Debug)]
 pub enum Position {
     BeforeScheme,
@@ -97,6 +99,14 @@ pub enum Position {
     AfterFragment
 }
 
+impl From<u32> for Position {
+    fn from(f: u32) -> Self {
+        // Panic rather than transmute an out-of-range value, which would be undefined behavior.
+        assert!(f <= Position::AfterFragment as u32, "invalid Position discriminant: {}", f);
+        unsafe { mem::transmute(f) } // SAFETY: range-checked; assumes contiguous discriminants
+    }
+}
+
 impl Url {
     #[inline]
     fn index(&self, position: Position) -> usize {

diff --git a/tests/unit.rs b/tests/unit.rs
@@ -268,3 +268,9 @@ fn issue_197() {
     assert_eq!(url, Url::parse("file:///").unwrap());
     url.path_segments_mut().unwrap().pop_if_empty();
 }
+
+#[test]
+#[cfg(feature = "only_percent_decode_hostname_valid")]
+fn test_percent_encoded_hostname() {
+    assert_eq!(Url::parse("http://example.com%0a%23.google.com/").unwrap().domain(), Some("example.com%0a%23.google.com"));
+}
diff --git a/tests/urltestdata.json b/tests/urltestdata.json
@@ -3514,11 +3514,6 @@
     "base": "http://other.com/",
     "failure": true
   },
-  {
-    "input": "http://hello%00",
-    "base": "http://other.com/",
-    "failure": true
-  },
   "Escaped numbers should be treated like IP addresses if they are.",
   {
     "input": "http://%30%78%63%30%2e%30%32%35%30.01",