Skip to content

Changes needed for gecko-integration #201

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ rustc-serialize = "0.3"
[features]
query_encoding = ["encoding"]
heap_size = ["heapsize", "heapsize_plugin"]
only_percent_decode_hostname_valid = []

[dependencies]
idna = { version = "0.1.0", path = "./idna" }
Expand Down
6 changes: 3 additions & 3 deletions src/host.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ use std::io;
use std::net::{Ipv4Addr, Ipv6Addr, SocketAddr, SocketAddrV4, SocketAddrV6, ToSocketAddrs};
use std::vec;
use parser::{ParseResult, ParseError};
use percent_encoding::percent_decode;
use percent_encoding::percent_decode_hostname;
use idna;

#[derive(Copy, Clone, Debug, Eq, PartialEq)]
Expand Down Expand Up @@ -77,10 +77,10 @@ impl Host<String> {
}
return parse_ipv6addr(&input[1..input.len() - 1]).map(Host::Ipv6)
}
let domain = percent_decode(input.as_bytes()).decode_utf8_lossy();
let domain = percent_decode_hostname(input.as_bytes()).decode_utf8_lossy();
let domain = try!(idna::domain_to_ascii(&domain));
if domain.find(|c| matches!(c,
'\0' | '\t' | '\n' | '\r' | ' ' | '#' | '%' | '/' | ':' | '?' | '@' | '[' | '\\' | ']'
'\0' | '\t' | '\n' | '\r' | ' ' | '#' | '/' | ':' | '?' | '@' | '[' | '\\' | ']'
)).is_some() {
return Err(ParseError::InvalidDomainCharacter)
}
Expand Down
45 changes: 36 additions & 9 deletions src/percent_encoding.rs
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,14 @@ define_encode_set! {
}
}

define_encode_set! {
/// This encode set is used to decide which characters are allowed in the hostname.
pub HOSTNAME_ENCODE_SET = [SIMPLE_ENCODE_SET] | {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Encode sets are normally used for percent encoding, not decoding. So I’d rather not expose this in the public API. In after_percent_sign you could "inline" !HOSTNAME_ENCODE_SET.contains(c) to !(matches(c, ' ' | '!' | '"' | /* ...*/ '~') || c < '\u{20}' || c > '\u{7E}'), or invert the logic and use matches!(c, 'a'...'z' | 'A'...'Z' | '0'...'9' | '.' | '-' | '_' | '*'). (I think that’s the complementary set, but I’m not sure * should be there.)

' ', '!', '"', '#', '$', '%', '&', '\'', '(', ')', '+', ',', '/', ':', ';',
'<', '=', '>', '?', '@', '[', '\\', ']', '^', '`', '{', '|', '}', '~'
}
}

/// Return the percent-encoding of the given bytes.
///
/// This is unconditional, unlike `percent_encode()` which uses an encode set.
Expand Down Expand Up @@ -244,26 +252,44 @@ impl<'a, E: EncodeSet> From<PercentEncode<'a, E>> for Cow<'a, str> {
#[inline]
pub fn percent_decode<'a>(input: &'a [u8]) -> PercentDecode<'a> {
PercentDecode {
bytes: input.iter()
bytes: input.iter(),
only_hostname_valid: false,
}
}

#[cfg(feature = "only_percent_decode_hostname_valid")]
pub fn percent_decode_hostname<'a>(input: &'a [u8]) -> PercentDecode<'a> {
PercentDecode {
bytes: input.iter(),
only_hostname_valid: true,
}
}

#[cfg(not(feature = "only_percent_decode_hostname_valid"))]
pub fn percent_decode_hostname<'a>(input: &'a [u8]) -> PercentDecode<'a> {
percent_decode(input)
}

/// The return type of `percent_decode()`.
#[derive(Clone)]
pub struct PercentDecode<'a> {
bytes: slice::Iter<'a, u8>,
only_hostname_valid: bool,
}

fn after_percent_sign(iter: &mut slice::Iter<u8>) -> Option<u8> {
fn after_percent_sign(iter: &mut slice::Iter<u8>, only_hostname_valid: bool) -> Option<u8> {
let initial_iter = iter.clone();
let h = iter.next().and_then(|&b| (b as char).to_digit(16));
let l = iter.next().and_then(|&b| (b as char).to_digit(16));
if let (Some(h), Some(l)) = (h, l) {
Some(h as u8 * 0x10 + l as u8)
} else {
*iter = initial_iter;
None
let c = h as u8 * 0x10 + l as u8;
if !only_hostname_valid ||
!HOSTNAME_ENCODE_SET.contains(c) {
return Some(c);
}
}
*iter = initial_iter;
return None
}

impl<'a> Iterator for PercentDecode<'a> {
Expand All @@ -272,7 +298,7 @@ impl<'a> Iterator for PercentDecode<'a> {
fn next(&mut self) -> Option<u8> {
self.bytes.next().map(|&byte| {
if byte == b'%' {
after_percent_sign(&mut self.bytes).unwrap_or(byte)
after_percent_sign(&mut self.bytes, self.only_hostname_valid).unwrap_or(byte)
} else {
byte
}
Expand All @@ -299,13 +325,14 @@ impl<'a> PercentDecode<'a> {
pub fn if_any(&self) -> Option<Vec<u8>> {
let mut bytes_iter = self.bytes.clone();
while bytes_iter.find(|&&b| b == b'%').is_some() {
if let Some(decoded_byte) = after_percent_sign(&mut bytes_iter) {
if let Some(decoded_byte) = after_percent_sign(&mut bytes_iter, self.only_hostname_valid) {
let initial_bytes = self.bytes.as_slice();
let unchanged_bytes_len = initial_bytes.len() - bytes_iter.len() - 3;
let mut decoded = initial_bytes[..unchanged_bytes_len].to_owned();
decoded.push(decoded_byte);
decoded.extend(PercentDecode {
bytes: bytes_iter
bytes: bytes_iter,
only_hostname_valid: self.only_hostname_valid,
});
return Some(decoded)
}
Expand Down
10 changes: 10 additions & 0 deletions src/slicing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

use std::ops::{Range, RangeFrom, RangeTo, RangeFull, Index};
use Url;
use std::mem;

impl Index<RangeFull> for Url {
type Output = str;
Expand Down Expand Up @@ -77,6 +78,7 @@ impl Index<Range<Position>> for Url {
/// `BeforeScheme` and `AfterFragment` are always the start and end of the entire URL,
/// so `&url[BeforeScheme..X]` is the same as `&url[..X]`
/// and `&url[X..AfterFragment]` is the same as `&url[X..]`.
#[repr(u32)]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why u32? u8 is more than large enough for 16 variants.

#[derive(Copy, Clone, Debug)]
pub enum Position {
BeforeScheme,
Expand All @@ -97,6 +99,14 @@ pub enum Position {
AfterFragment
}

impl From<u32> for Position {
fn from(f: u32) -> Self {
unsafe {
mem::transmute(f)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this transmute is unsound. This method itself is not unsafe, and there’s nothing stopping users from calling it with a value that doesn’t have a corresponding enum variant. This should return an option, and avoid transmute. That code would be tedious and error-prone to write by hand, but it can be generated with a macro:

macro_rules! c_like_enum {
    ( $type_name: ident: $( $variant: ident, )+ ) => {
        c_like_enum! {
            $type_name
            [ $($variant)* ]
            [ ]
            next = 0
        }
    };

    (
        $type_name: ident
        [ $next_variant: ident $( $variant: ident )* ]
        [ $($assigned_variant: ident = $value: expr),* ]
        next = $next_value: expr
    ) => {
        c_like_enum! {
            $type_name
            [ $($variant)* ]
            [ $($assigned_variant = $value,)* $next_variant = $next_value ]
            next = $next_value + 1
        }
    };

    (
        $type_name: ident
        [ ]
        [ $($variant: ident = $value: expr),+ ]
        next = $next_value: expr
    ) => {
        #[derive(Copy, Clone, Debug)]
        #[repr(u8)]
        pub enum $type_name {
            $($variant = $value,)+
        }

        impl $type_name {
            #[allow(non_upper_case_globals)]
            pub fn from_u8(value: u8) -> Option<Self> {
                $(
                    const $variant: u8 = $value;
                )+
                match value {
                    $(
                        $variant => Some($type_name::$variant),
                    )+
                    _ => None
                }
            }
        }
    };
}

c_like_enum! { Position:
    BeforeScheme,
    AfterScheme,
    BeforeUsername,
    AfterUsername,
    BeforePassword,
    AfterPassword,
    BeforeHost,
    AfterHost,
    BeforePort,
    AfterPort,
    BeforePath,
    AfterPath,
    BeforeQuery,
    AfterQuery,
    BeforeFragment,
    AfterFragment,
}

}
}
}

impl Url {
#[inline]
fn index(&self, position: Position) -> usize {
Expand Down
6 changes: 6 additions & 0 deletions tests/unit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -268,3 +268,9 @@ fn issue_197() {
assert_eq!(url, Url::parse("file:///").unwrap());
url.path_segments_mut().unwrap().pop_if_empty();
}

#[test]
#[cfg(feature = "only_percent_decode_hostname_valid")]
fn test_percent_encoded_hostname() {
assert_eq!(Url::parse("http://example.com%0a%23.google.com/").unwrap().domain(), Some("example.com%0a%23.google.com"));
}
5 changes: 0 additions & 5 deletions tests/urltestdata.json
Original file line number Diff line number Diff line change
Expand Up @@ -3514,11 +3514,6 @@
"base": "http://other.com/",
"failure": true
},
{
"input": "http://hello%00",
"base": "http://other.com/",
"failure": true
},
"Escaped numbers should be treated like IP addresses if they are.",
{
"input": "http://%30%78%63%30%2e%30%32%35%30.01",
Expand Down