|
10 | 10 | Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
|
11 | 11 | """
|
12 | 12 |
|
13 |
| -__version__ = "5.0" |
| 13 | +__version__ = "5.0.1" |
14 | 14 | __license__ = """Copyright (c) 2002-2008, Mark Pilgrim, All rights reserved.
|
15 | 15 |
|
16 | 16 | Redistribution and use in source and binary forms, with or without modification,
|
@@ -747,7 +747,7 @@ def parse_declaration(self, i):
|
747 | 747 |
|
748 | 748 | def mapContentType(self, contentType):
|
749 | 749 | contentType = contentType.lower()
|
750 |
| - if contentType == 'text': |
| 750 | + if contentType == 'text' or contentType == 'plain': |
751 | 751 | contentType = 'text/plain'
|
752 | 752 | elif contentType == 'html':
|
753 | 753 | contentType = 'text/html'
|
@@ -1971,6 +1971,14 @@ def output(self):
|
1971 | 1971 | '''Return processed HTML as a single string'''
|
1972 | 1972 | return ''.join([str(p) for p in self.pieces])
|
1973 | 1973 |
|
def parse_declaration(self, i):
    """Parse the markup declaration that starts at offset ``i``.

    Delegates to ``sgmllib.SGMLParser.parse_declaration``. When that
    raises ``sgmllib.SGMLParseError``, the error is swallowed: the
    character at ``i`` is emitted as escaped character data and parsing
    resumes one character further on.

    Returns the value of the base-class parser on success, or ``i+1``
    after a parse error.
    """
    try:
        return sgmllib.SGMLParser.parse_declaration(self, i)
    except sgmllib.SGMLParseError:
        # Escape the doctype declaration and continue parsing. The opening
        # bracket is emitted as the entity '&lt;' instead of a literal '<'
        # so that it cannot start a new tag in the processed output.
        # NOTE(review): assumes handle_data() stores text verbatim (no
        # escaping of its own) -- confirm against its definition.
        self.handle_data('&lt;')
        return i+1
| 1981 | + |
1974 | 1982 | class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
|
1975 | 1983 | def __init__(self, baseuri, baselang, encoding, entities):
|
1976 | 1984 | sgmllib.SGMLParser.__init__(self)
|
@@ -2476,9 +2484,10 @@ def _makeSafeAbsoluteURI(base, rel=None):
|
2476 | 2484 | if not base:
|
2477 | 2485 | return rel or u''
|
2478 | 2486 | if not rel:
|
2479 |
| - if base.strip().split(':', 1)[0] not in ACCEPTABLE_URI_SCHEMES: |
2480 |
| - return u'' |
2481 |
| - return base |
| 2487 | + scheme = urlparse.urlparse(base)[0] |
| 2488 | + if not scheme or scheme in ACCEPTABLE_URI_SCHEMES: |
| 2489 | + return base |
| 2490 | + return u'' |
2482 | 2491 | uri = _urljoin(base, rel)
|
2483 | 2492 | if uri.strip().split(':', 1)[0] not in ACCEPTABLE_URI_SCHEMES:
|
2484 | 2493 | return u''
|
@@ -2666,6 +2675,9 @@ def unknown_starttag(self, tag, attrs):
|
2666 | 2675 | for key, value in self.normalize_attrs(attrs):
|
2667 | 2676 | if key in acceptable_attributes:
|
2668 | 2677 | key=keymap.get(key,key)
|
| 2678 | + # make sure the uri uses an acceptable uri scheme |
| 2679 | + if key == u'href': |
| 2680 | + value = _makeSafeAbsoluteURI(value) |
2669 | 2681 | clean_attrs.append((key,value))
|
2670 | 2682 | elif key=='style':
|
2671 | 2683 | clean_value = self.sanitize_style(value)
|
@@ -2721,6 +2733,18 @@ def sanitize_style(self, style):
|
2721 | 2733 |
|
2722 | 2734 | return ' '.join(clean)
|
2723 | 2735 |
|
def parse_comment(self, i, report=1):
    """Parse the comment at offset ``i``, never leaving it unconsumed.

    The base-class result is returned unchanged when it is non-negative.
    A negative result may be a malicious attempt to circumvent
    sanitization or a page-destroying unclosed comment, so the comment is
    skipped instead: up to the end of the first ``--...>`` sequence found
    from ``i+4`` onwards, or to the end of the raw data if there is none.
    """
    end = _BaseHTMLProcessor.parse_comment(self, i, report)
    if end < 0:
        terminator = re.compile(r'--[^>]*>').search(self.rawdata, i+4)
        if terminator is None:
            # unclosed comment: swallow the rest of the input and
            # deliberately do not pass any of it to handle_data()
            end = len(self.rawdata)
        else:
            end = terminator.end()
    return end
| 2747 | + |
2724 | 2748 |
|
2725 | 2749 | def _sanitizeHTML(htmlSource, encoding, _type):
|
2726 | 2750 | p = _HTMLSanitizer(encoding, _type)
|
|
0 commit comments