Skip to content

Commit c612f16

Browse files
committed
Upgrade to feedparser v5.0.1
1 parent ed80f84 commit c612f16

File tree

1 file changed

+29
-5
lines changed

1 file changed

+29
-5
lines changed

feedparser.py

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
1111
"""
1212

13-
__version__ = "5.0"
13+
__version__ = "5.0.1"
1414
__license__ = """Copyright (c) 2002-2008, Mark Pilgrim, All rights reserved.
1515
1616
Redistribution and use in source and binary forms, with or without modification,
@@ -747,7 +747,7 @@ def parse_declaration(self, i):
747747

748748
def mapContentType(self, contentType):
749749
contentType = contentType.lower()
750-
if contentType == 'text':
750+
if contentType == 'text' or contentType == 'plain':
751751
contentType = 'text/plain'
752752
elif contentType == 'html':
753753
contentType = 'text/html'
@@ -1971,6 +1971,14 @@ def output(self):
19711971
'''Return processed HTML as a single string'''
19721972
return ''.join([str(p) for p in self.pieces])
19731973

1974+
def parse_declaration(self, i):
    '''Parse a markup declaration at index i, tolerating malformed ones.

    The work is delegated to sgmllib.SGMLParser.parse_declaration and its
    result is returned unchanged.  If that call raises
    sgmllib.SGMLParseError, the text '&lt;' is passed to handle_data()
    and i+1 is returned so that parsing continues with the next character
    (presumably the one after the '<' that opened the declaration).
    '''
    try:
        next_index = sgmllib.SGMLParser.parse_declaration(self, i)
    except sgmllib.SGMLParseError:
        # escape the doctype declaration and continue parsing
        self.handle_data('&lt;')
        next_index = i + 1
    return next_index
1981+
19741982
class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
19751983
def __init__(self, baseuri, baselang, encoding, entities):
19761984
sgmllib.SGMLParser.__init__(self)
@@ -2476,9 +2484,10 @@ def _makeSafeAbsoluteURI(base, rel=None):
24762484
if not base:
24772485
return rel or u''
24782486
if not rel:
2479-
if base.strip().split(':', 1)[0] not in ACCEPTABLE_URI_SCHEMES:
2480-
return u''
2481-
return base
2487+
scheme = urlparse.urlparse(base)[0]
2488+
if not scheme or scheme in ACCEPTABLE_URI_SCHEMES:
2489+
return base
2490+
return u''
24822491
uri = _urljoin(base, rel)
24832492
if uri.strip().split(':', 1)[0] not in ACCEPTABLE_URI_SCHEMES:
24842493
return u''
@@ -2666,6 +2675,9 @@ def unknown_starttag(self, tag, attrs):
26662675
for key, value in self.normalize_attrs(attrs):
26672676
if key in acceptable_attributes:
26682677
key=keymap.get(key,key)
2678+
# make sure the uri uses an acceptable uri scheme
2679+
if key == u'href':
2680+
value = _makeSafeAbsoluteURI(value)
26692681
clean_attrs.append((key,value))
26702682
elif key=='style':
26712683
clean_value = self.sanitize_style(value)
@@ -2721,6 +2733,18 @@ def sanitize_style(self, style):
27212733

27222734
return ' '.join(clean)
27232735

2736+
def parse_comment(self, i, report=1):
    '''Parse the comment at index i and return the index to resume from.

    _BaseHTMLProcessor.parse_comment is tried first; a non-negative
    result is returned as-is.  A negative result may be a malicious
    attempt to circumvent sanitization, or a page-destroying unclosed
    comment, so self.rawdata is searched from i+4 onward for a looser
    terminator ('--', any run of non-'>' characters, then '>').  When one
    is found the index just past it is returned; otherwise the length of
    self.rawdata is returned.
    '''
    resume_at = _BaseHTMLProcessor.parse_comment(self, i, report)
    if resume_at < 0:
        loose_end = re.compile(r'--[^>]*>').search(self.rawdata, i + 4)
        if loose_end is None:
            # unclosed comment; deliberately fail to handle_data()
            resume_at = len(self.rawdata)
        else:
            resume_at = loose_end.end()
    return resume_at
2747+
27242748

27252749
def _sanitizeHTML(htmlSource, encoding, _type):
27262750
p = _HTMLSanitizer(encoding, _type)

0 commit comments

Comments
 (0)