11# -*- coding: utf-8 -*-
22import copy
3+ import sys
34import logging
45import re
5- import urllib
66
77from bidict import bidict
88import diff_match_patch as dmp_module
99from lxml .html import fromstring , tostring , fragment_fromstring
1010from lxml import etree
1111
# Python 2/3 compatibility shim: expose a unichr-like ``chr`` and the
# interpreter's text type under a single name.
if sys.version_info < (3,):
    chr = unichr  # noqa: F821 -- Python 2 builtin
    unicode_type = unicode  # noqa: F821 -- Python 2 builtin
else:
    unicode_type = str

# Single-character placeholder codes taken from the BMP Private Use Area
# (U+E000..U+F8FF), which Unicode reserves for private use and never assigns.
# The range deliberately stops at U+F8FF: U+F900..U+FFFF holds assigned
# characters (CJK compatibility ideographs, variation selectors, the BOM,
# fullwidth punctuation) and noncharacters, so codes from there could collide
# with characters that occur in real content.
# NOTE(review): this leaves 6400 codes; presumably consumers take codes with
# ``pop()`` from a copy of this list -- confirm that capacity is sufficient.
UNICODE_KEY = [chr(item) for item in range(0xE000, 0xF8FF + 1)]
# Module-level diff_match_patch instance, created once at import time.
DMP = dmp_module.diff_match_patch()
1621
@@ -45,31 +50,31 @@ def _map_tag(self, content):
4550 self .tag_map [self .code_key .pop ()] = tag
4651
4752 def _map_media_tag (self , element , raw_tag ):
48- url = urlencode (element .attrib )
49- if url in self .media_url .values ():
50- code = self .media_url .inv [url ]
53+ tag_key = gen_tag_key (element .attrib )
54+ if tag_key in self .media_url .values ():
55+ code = self .media_url .inv [tag_key ]
5156 self .tag_map [code ].append (raw_tag )
5257 return
5358 code = self .code_key .pop ()
5459 self .tag_map [code ] = [raw_tag ]
55- self .media_url [code ] = url
60+ self .media_url [code ] = tag_key
5661
5762 def _replace (self , new_content , old_content ):
5863 self ._map_tag (new_content )
59- for code , tag in self .tag_map .iteritems ():
64+ for code , tag in self .tag_map .items ():
6065 if not isinstance (tag , list ):
6166 tag = [tag ]
6267 for item in tag :
6368 new_content = new_content .replace (item , code )
64- for code , tag in self .tag_map .iteritems ():
69+ for code , tag in self .tag_map .items ():
6570 if not isinstance (tag , list ):
6671 tag = [tag ]
6772 for item in tag :
6873 old_content = old_content .replace (item , code )
6974 return to_unicode (new_content ), to_unicode (old_content )
7075
7176 def _recover (self , content ):
72- for code , tag in self .tag_map .iteritems ():
77+ for code , tag in self .tag_map .items ():
7378 if isinstance (tag , list ):
7479 tag = tag [0 ]
7580 content = content .replace (code , tag )
@@ -88,15 +93,16 @@ def _diff(self, old_content, new_content):
8893 for (op , data ) in diffs :
8994 text = self ._recover (data )
9095 if op == self .INSERT :
91- html .append ("<ins style=\" background:#e6ffe6;\" >{}</ins>" .format (text ))
96+ html .append (u "<ins style=\" background:#e6ffe6;\" >{}</ins>" .format (text ))
9297 elif op == self .DELETE :
93- html .append ("<del style=\" background:#ffe6e6;\" >{}</del>" .format (text ))
98+ html .append (u "<del style=\" background:#ffe6e6;\" >{}</del>" .format (text ))
9499 elif op == self .EQUAL :
95100 html .append (text )
96- return "" .join (html )
101+ return utf8 ( u "" .join (html ) )
97102
98103
99- _TO_UNICODE_TYPES = (unicode , type (None ))
# The interpreter's text type plus None.
# NOTE(review): presumably the pass-through types for to_unicode(); the check
# that uses this tuple is not visible in this hunk -- confirm.
_TO_UNICODE_TYPES = (unicode_type, type(None))
# Types that utf8() returns unchanged: byte strings and None.
_UTF8_TYPES = (bytes, type(None))
100106
101107
102108def to_unicode (value ):
@@ -109,21 +115,23 @@ def to_unicode(value):
109115 return value .decode ("utf-8" )
110116
111117
def utf8(value):
    """Return ``value`` as a UTF-8 byte string.

    Text (``unicode_type``) is encoded as UTF-8; values whose type is in
    ``_UTF8_TYPES`` (byte strings and ``None``) are returned unchanged.

    Raises:
        TypeError: if ``value`` is neither text, bytes, nor ``None``.
    """
    if isinstance(value, unicode_type):
        return value.encode("utf-8")
    if isinstance(value, _UTF8_TYPES):
        return value
    raise TypeError("Expected bytes, unicode, or None; got %r" % type(value))
124+
125+
def ensure_closed_tag(html):
    """Parse ``html`` with lxml and return it re-serialized as text.

    ``fromstring`` is tried first. If it raises ``etree.ParserError``, a
    warning is logged and the input is parsed with ``fragment_fromstring``
    wrapped in a ``div`` parent instead. The resulting element is serialized
    with UTF-8 encoding and passed through ``to_unicode``.
    """
    try:
        root = fromstring(html)
    except etree.ParserError as err:
        message = 'fromstring error: {}, use fragment_fromstring'.format(err)
        logging.warning(message)
        root = fragment_fromstring(html, create_parent='div')
    serialized = tostring(root, encoding='utf-8')
    return to_unicode(serialized)
119133
120134
def gen_tag_key(query):
    """Build a text key from a mapping of attribute names to values.

    Each item is rendered as ``name=value`` (both sides passed through
    ``to_unicode``) and the items are joined with ``&`` in the mapping's
    iteration order.

    The templates are explicit unicode literals: on Python 2 a byte-string
    ``"{}={}".format(...)`` tries to ASCII-encode unicode arguments and raises
    ``UnicodeEncodeError`` for non-ASCII attribute values.

    NOTE(review): names and values are not escaped, so a value containing
    ``&`` or ``=`` can yield the same key as a different attribute set.

    Args:
        query: mapping with an ``items()`` method (e.g. an element's
            ``attrib``).

    Returns:
        The joined key as a text string; an empty string for an empty mapping.
    """
    pairs = [
        u"{}={}".format(to_unicode(k), to_unicode(v))
        for k, v in query.items()
    ]
    return u"&".join(pairs)
0 commit comments