-
Notifications
You must be signed in to change notification settings - Fork 42
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
DecodedURL #54
DecodedURL #54
Changes from 16 commits
98edfc6
dce971b
2162c67
d1bfc68
adc7370
fc28ee3
358402f
34d4212
161e93a
95ea9ea
976c083
02a841c
9a0438e
662ec79
3aa4b61
82deb29
0b311d7
30e19a6
696e8fd
67ab0ec
6a90f4a
ad63b9b
afc907b
9f3212b
ac3be79
dd8248c
6dd3272
e8616fa
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -202,13 +202,27 @@ def _make_quote_map(safe_chars): | |
_QUERY_DECODE_MAP = _make_decode_map(_QUERY_DELIMS) | ||
_FRAGMENT_QUOTE_MAP = _make_quote_map(_FRAGMENT_SAFE) | ||
_FRAGMENT_DECODE_MAP = _make_decode_map(_FRAGMENT_DELIMS) | ||
_UNRESERVED_QUOTE_MAP = _make_quote_map(_UNRESERVED_CHARS) | ||
_UNRESERVED_DECODE_MAP = dict([(k, v) for k, v in _HEX_CHAR_MAP.items() | ||
if v.decode('ascii', 'replace') | ||
in _UNRESERVED_CHARS]) | ||
|
||
_ROOT_PATHS = frozenset(((), (u'',))) | ||
|
||
|
||
def _encode_reserved(text, maximal=True): | ||
"""A very comprehensive percent encoding for encoding all | ||
delimeters. Used for arguments to DecodedURL, where a % means a | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: "delimiters" There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oooh, this kind of typo is so very unlike me, I assure you! ;) Thanks! |
||
percent sign, and not the character used by URLs for escaping | ||
bytes. | ||
""" | ||
if maximal: | ||
bytestr = normalize('NFC', text).encode('utf8') | ||
return u''.join([_UNRESERVED_QUOTE_MAP[b] for b in bytestr]) | ||
return u''.join([_UNRESERVED_QUOTE_MAP[t] if t in _UNRESERVED_CHARS | ||
else t for t in text]) | ||
|
||
|
||
def _encode_path_part(text, maximal=True): | ||
"Percent-encode a single segment of a URL path." | ||
if maximal: | ||
|
@@ -485,7 +499,8 @@ def _decode_fragment_part(text, normalize_case=False): | |
_decode_map=_FRAGMENT_DECODE_MAP) | ||
|
||
|
||
def _percent_decode(text, normalize_case=False, _decode_map=_HEX_CHAR_MAP): | ||
def _percent_decode(text, normalize_case=False, subencoding='utf-8', | ||
raise_subencoding_exc=False, _decode_map=_HEX_CHAR_MAP): | ||
"""Convert percent-encoded text characters to their normal, | ||
human-readable equivalents. | ||
|
||
|
@@ -547,9 +562,13 @@ def _percent_decode(text, normalize_case=False, _decode_map=_HEX_CHAR_MAP): | |
|
||
unquoted_bytes = b''.join(res) | ||
|
||
if subencoding is False: | ||
return unquoted_bytes | ||
try: | ||
return unquoted_bytes.decode("utf-8") | ||
return unquoted_bytes.decode(subencoding) | ||
except UnicodeDecodeError: | ||
if raise_subencoding_exc: | ||
raise | ||
return text | ||
|
||
|
||
|
@@ -1040,7 +1059,7 @@ def from_text(cls, text): | |
rooted, userinfo, uses_netloc) | ||
|
||
def normalize(self, scheme=True, host=True, path=True, query=True, | ||
fragment=True): | ||
fragment=True, userinfo=True): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The docstring should mention `userinfo`. There's a TODO for userinfo. Should that go? |
||
"""Return a new URL object with several standard normalizations | ||
applied: | ||
|
||
|
@@ -1091,6 +1110,11 @@ def normalize(self, scheme=True, host=True, path=True, query=True, | |
if fragment: | ||
kw['fragment'] = _decode_unreserved(self.fragment, | ||
normalize_case=True) | ||
if userinfo: | ||
kw['userinfo'] = u':'.join([_decode_unreserved(p, | ||
normalize_case=True) | ||
for p in self.userinfo.split(':', 1)]) | ||
|
||
return self.replace(**kw) | ||
|
||
def child(self, *segments): | ||
|
@@ -1449,3 +1473,272 @@ def remove(self, name): | |
""" | ||
return self.replace(query=((k, v) for (k, v) in self.query | ||
if k != name)) | ||
|
||
|
||
class DecodedURL(object): | ||
"""DecodedURL is a type meant to act as a higher-level interface to | ||
the URL. It is the `unicode` to URL's `bytes`. `DecodedURL` has | ||
almost exactly the same API as `URL`, but everything going in and | ||
out is in its maximally decoded state. All percent decoding is | ||
handled automatically. | ||
|
||
Where applicable, a UTF-8 encoding is presumed. Be advised that | ||
some interactions, can raise UnicodeEncodeErrors and | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. comma splice: "some interactions, can raise" should read "some interactions can raise". Also, maybe you want backticks around `UnicodeEncodeError` and `UnicodeDecodeError`. |
||
UnicodeDecodeErrors, just like when working with | ||
bytestrings. | ||
|
||
Examples of such interactions include handling query strings | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should this be part of the previous paragraph? |
||
encoding binary data, and paths containing segments with special | ||
characters encoded with codecs other than UTF-8. | ||
""" | ||
def __init__(self, url): | ||
self._url = url | ||
|
||
@classmethod | ||
def from_text(cls, text): | ||
_url = URL.from_text(text) | ||
return cls(_url) | ||
|
||
def to_text(self, *a, **kw): | ||
return self._url.to_text(*a, **kw) | ||
|
||
def to_uri(self, *a, **kw): | ||
return self._url.to_uri(*a, **kw) | ||
|
||
def to_iri(self, *a, **kw): | ||
return self._url.to_iri(*a, **kw) | ||
|
||
def click(self, href=u''): | ||
return type(self)(self._url.click(href=href)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not |
||
|
||
def sibling(self, segment): | ||
return type(self)(self._url.sibling(_encode_reserved(segment))) | ||
|
||
def child(self, *segments): | ||
if not segments: | ||
return self | ||
new_segs = [_encode_reserved(s) for s in segments] | ||
return type(self)(self._url.child(*new_segs)) | ||
|
||
def normalize(self, *a, **kw): | ||
return type(self)(self._url.normalize(*a, **kw)) | ||
|
||
@property | ||
def absolute(self): | ||
return self._url.absolute | ||
|
||
@property | ||
def scheme(self): | ||
return self._url.scheme | ||
|
||
@property | ||
def host(self): | ||
host = self._url.host | ||
try: | ||
host_bytes = host.encode("ascii") | ||
except UnicodeEncodeError: | ||
host_text = host | ||
else: | ||
try: | ||
host_text = host_bytes.decode("idna") | ||
except ValueError: | ||
# only reached on "narrow" (UCS-2) Python builds <3.4, see #7 | ||
# NOTE: not going to raise here, because there's no | ||
# ambiguity in the IDNA, and the host is still | ||
# technically usable | ||
host_text = host | ||
|
||
return host_text | ||
|
||
@property | ||
def port(self): | ||
return self._url.port | ||
|
||
@property | ||
def rooted(self): | ||
return self._url.rooted | ||
|
||
@property | ||
def path(self): | ||
return tuple([_percent_decode(p, raise_subencoding_exc=True) | ||
for p in self._url.path]) | ||
|
||
@property | ||
def query(self): | ||
return tuple([tuple(_percent_decode(x, raise_subencoding_exc=True) | ||
if x is not None else None | ||
for x in (k, v)) | ||
for k, v in self._url.query]) | ||
|
||
@property | ||
def fragment(self): | ||
return _percent_decode(self._url.fragment, raise_subencoding_exc=True) | ||
|
||
@property | ||
def userinfo(self): | ||
return tuple([_percent_decode(p, raise_subencoding_exc=True) | ||
for p in self._url.userinfo.split(':', 1)]) | ||
|
||
@property | ||
def user(self): | ||
return self.userinfo[0] | ||
|
||
@property | ||
def password(self): | ||
return self.userinfo[1] | ||
|
||
@property | ||
def uses_netloc(self): | ||
return self._url.uses_netloc | ||
|
||
def replace(self, scheme=_UNSET, host=_UNSET, path=_UNSET, query=_UNSET, | ||
fragment=_UNSET, port=_UNSET, rooted=_UNSET, userinfo=_UNSET, | ||
uses_netloc=_UNSET): | ||
"""This replace differs a little from URL.replace. For instance, it | ||
accepts userinfo as a tuple, not as a string. As with the rest | ||
of the methods on DecodedURL, if you pass a reserved | ||
character, it will be automatically encoded instead of an | ||
error being raised. | ||
""" | ||
if path is not _UNSET: | ||
path = [_encode_reserved(p) for p in path] | ||
if query is not _UNSET: | ||
query = [[_encode_reserved(x) | ||
if x is not None else None | ||
for x in (k, v)] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. See above about nullable query parameter keys. |
||
for k, v in iter_pairs(query)] | ||
if userinfo is not _UNSET: | ||
if not len(userinfo) == 2: | ||
raise ValueError('userinfo expected sequence of' | ||
' ["user", "password"], got %r' % userinfo) | ||
userinfo = u':'.join([_encode_reserved(p) for p in userinfo]) | ||
new_url = self._url.replace(scheme=scheme, | ||
host=host, | ||
path=path, | ||
query=query, | ||
fragment=fragment, | ||
port=port, | ||
rooted=rooted, | ||
userinfo=userinfo, | ||
uses_netloc=uses_netloc) | ||
return type(self)(url=new_url) | ||
|
||
def get(self, name): | ||
# TODO: another reason to do this in the __init__ | ||
return [v for (k, v) in self.query if name == k] | ||
|
||
def add(self, name, value=None): | ||
return self.replace(query=self.query + ((name, value),)) | ||
|
||
def set(self, name, value=None): | ||
query = self.query | ||
q = [(k, v) for (k, v) in query if k != name] | ||
idx = next((i for (i, (k, v)) in enumerate(query) if k == name), -1) | ||
q[idx:idx] = [(name, value)] | ||
return self.replace(query=q) | ||
|
||
def remove(self, name): | ||
return self.replace(query=((k, v) for (k, v) in self.query | ||
if k != name)) | ||
|
||
def __repr__(self): | ||
cn = self.__class__.__name__ | ||
return '%s(url=%r)' % (cn, self._url) | ||
|
||
def __str__(self): | ||
# TODO: the underlying URL's __str__ needs to change to make | ||
# this work as the URL | ||
return str(self._url) | ||
|
||
def __eq__(self, other): | ||
if not isinstance(other, self.__class__): | ||
return NotImplemented | ||
return self.normalize().to_uri() == other.normalize().to_uri() | ||
|
||
def __ne__(self, other): | ||
if not isinstance(other, self.__class__): | ||
return NotImplemented | ||
return not self.__eq__(other) | ||
|
||
def __hash__(self): | ||
return hash((self.__class__, self.scheme, self.userinfo, self.host, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. minor: Since equality is delegated upwards to e.g. def __hash__(self):
return hash(self.normalize().to_uri()) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's a good point. I may have subtly changed my mind midway through development, and now I lean more toward not delegating up to URL for equality. I'll get that fixed, thanks! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Whenever you're delegating There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @glyph why is that? (correctness shouldn't be affected since dictionaries compare by equality, and I'm not entirely sure what the performance problem you're trying to forestall) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I share @moshez's curiosity. @mahmoud - will you follow @alexwlchan's advice about delegating |
||
self.path, self.query, self.fragment, self.port, | ||
self.rooted, self.uses_netloc)) | ||
|
||
# # Begin Twisted Compat Code | ||
asURI = to_uri | ||
asIRI = to_iri | ||
|
||
@classmethod | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why is this its own There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Twisted-style single character argument name copied for consistency with URL. |
||
def fromText(cls, s): | ||
return cls.from_text(s) | ||
|
||
def asText(self, includeSecrets=False): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As above. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Twisted-style camelCase argument name. |
||
return self.to_text(with_password=includeSecrets) | ||
|
||
def __dir__(self): | ||
try: | ||
ret = object.__dir__(self) | ||
except AttributeError: | ||
# object.__dir__ == AttributeError # pdw for py2 | ||
ret = dir(self.__class__) + list(self.__dict__.keys()) | ||
ret = sorted(set(ret) - set(['fromText', 'asURI', 'asIRI', 'asText'])) | ||
return ret | ||
|
||
# # End Twisted Compat Code | ||
|
||
|
||
|
||
|
||
"""Probably turn the properties into normal attributes now that they | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looks like most of this is done. Please remove everything except what's left to do for posterity's sake. |
||
raise exceptions, or at least cachedproperties. | ||
|
||
* Decode | ||
* Percent | ||
* userinfo | ||
* user | ||
* path | ||
* query | ||
* fragment | ||
* Wrap in decoder | ||
* .get() | ||
* Wrap in encoder | ||
* replace | ||
* add() | ||
* set() | ||
* child (split and encode?) | ||
* sibling | ||
* remove() | ||
* Passthrough | ||
* __eq__ / __ne__ / __hash__ | ||
* absolute() | ||
* Return new DecodedURL with new ._url (the other kind of passthrough) | ||
* normalize() | ||
* click() | ||
* Strict passthrough (doesn't return a DecodedURL) | ||
* to_uri() | ||
* to_iri() | ||
|
||
# Factoring | ||
|
||
Should DecodedURL be a subclass of URL? Inheritance isn't cool | ||
anymore, so obviously not right? But seriously, it could be: | ||
|
||
* Every single method of URL is wrapped with almost an identical API, | ||
except for __init__ | ||
* A DecodedURL is as much a URL as both `bytes` and `unicode` are | ||
strings | ||
|
||
On the arguments against: | ||
|
||
* __init__ differs | ||
* No real benefit to calling super() | ||
* Only a few duplicate methods could be reused (Twisted compat, | ||
.get(), a couple others) | ||
|
||
# Remaining design questions | ||
|
||
* Should _encode_reserved(maximal=False) be used instead? | ||
* If yes, should the underlying URL have .to_iri() applied to it? | ||
|
||
""" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I want to say that we should just avoid exposing this entirely, but it probably needs to be exposed for type annotations.
However, as per #44, could we have a "decoded" property on `URL` that provides an interface to this, and an "encoded" property on `DecodedURL` that maps back to a `URL`?
(At this point I think I'm in favor of adding an `EncodedURL` alias for `URL`, then maybe adding a top-level entry point like `hyperlink.parse()` which takes a `decoded` kwarg flag which defaults to true, to make it easier to get started with `DecodedURL`, which is what I think we all want most of the time. That can definitely be deferred to a separate ticket though.)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, I am pretty much in favor of all those conveniences :) And I also agree that this probably needs to be exposed.