This repository was archived by the owner on Oct 12, 2017. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtidy.py
190 lines (159 loc) · 6.81 KB
/
tidy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
"""Functions to run cherrypy.response through Tidy or NSGML."""
import cgi
import os
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
import traceback
import cherrypy
def tidy(temp_dir, tidy_path, strict_xml=False, errors_to_ignore=None,
         indent=False, wrap=False, warnings=True):
    """Run cherrypy.response through Tidy.

    If either 'indent' or 'wrap' are specified, then response.body will be
    set to the output of tidy. Otherwise, only errors (including warnings,
    if warnings is True) will change the body.

    Note that we use the standalone Tidy tool rather than the python
    mxTidy module. This is because this module does not seem to be
    stable and it crashes on some HTML pages (which means that the
    server would also crash)

    Only responses whose Content-Type is 'text/html' are processed.

    Arguments:
        temp_dir: directory in which 'page.html', 'tidy.out' and
            'tidy.err' are written.
        tidy_path: path of the tidy executable.
        strict_xml: if true, pass '-xml' to tidy and, when tidy reports
            nothing, additionally parse the body with ElementTree.
        errors_to_ignore: optional iterable of substrings; report lines
            containing any of them are dropped.
        indent: if true, pass '-indent' to tidy.
        wrap: column passed to tidy as '-wrap N'. False (the default) or
            a value that cannot be converted to an int adds no option.
        warnings: if true, 'Warning' lines are reported as well as
            'Error' lines.

    Whenever the body is replaced, the Content-Length header is removed
    so that it gets recalculated.
    """
    # Imported locally so this function carries its own dependency.
    import subprocess

    response = cherrypy.response

    # The tidy tool, by its very nature, is not generator friendly,
    # so we just collapse the body and work with it.
    orig_body = response.collapse_body()

    fct = response.headers.get('Content-Type', '')
    if fct.split(';')[0].strip() != 'text/html':
        return

    encoding = ''
    i = fct.find('charset=')
    if i != -1:
        # Keep only the charset value, not any parameter following it.
        encoding = fct[i + 8:].split(';')[0].strip()

    page_file = os.path.join(temp_dir, 'page.html')
    out_file = os.path.join(temp_dir, 'tidy.out')
    err_file = os.path.join(temp_dir, 'tidy.err')

    with open(page_file, 'wb') as page:
        page.write(orig_body)

    # Build an argument list rather than a shell string, so that paths
    # containing spaces or shell metacharacters are passed through intact.
    command = [tidy_path]
    tidy_enc = encoding.replace('-', '')
    if tidy_enc:
        command.append('-' + tidy_enc)
    if strict_xml:
        command.append('-xml')
    if indent:
        command.append('-indent')

    wrap_args = []
    if wrap is not False:
        try:
            wrap_args = ['-wrap', '%d' % int(wrap)]
        except (TypeError, ValueError):
            # Not usable as a column number: leave wrapping alone.
            wrap_args = []
    command.extend(wrap_args)
    command.extend(['-f', err_file, '-o', out_file, page_file])

    result = subprocess.call(command)

    # Tidy's output replaces the body only if reformatting was requested
    # and tidy exited with status 0.
    use_output = bool(indent or wrap_args) and not result
    output = None
    if use_output:
        with open(out_file, 'rb') as tidied:
            output = tidied.read()

    with open(err_file, 'rb') as report:
        report_lines = report.read().splitlines()

    # Materialize once: the list is scanned for every report line.
    ignored = list(errors_to_ignore or [])
    new_errs = []
    for err in report_lines:
        relevant = (err.find('Error') != -1 or
                    (warnings and err.find('Warning') != -1))
        if not relevant:
            continue
        for err_ign in ignored:
            if err.find(err_ign) != -1:
                break
        else:
            new_errs.append(err)

    new_body = None
    if new_errs:
        # Join with newlines: wrong_content() escapes the header and turns
        # newlines into <br />, so literal '<br />' would show up as text.
        new_body = wrong_content('\n'.join(new_errs), orig_body)
    elif strict_xml:
        # The HTML is OK, but is it valid XML?
        try:
            from xml.etree.ElementTree import parse
        except ImportError:
            # Fall back to the stand-alone elementtree package.
            from elementtree.ElementTree import parse

        # Replace entity references the XML parser would not know about.
        xml_body = orig_body
        for tag in ('nbsp', 'quot'):
            xml_body = xml_body.replace('&' + tag + ';', tag.upper())
        if encoding:
            enctag = '<?xml version="1.0" encoding="%s"?>' % encoding
            xml_body = enctag + xml_body

        try:
            parse(StringIO(xml_body))
        except Exception:
            # Wrong XML: report the traceback above the numbered source.
            new_body = wrong_content(traceback.format_exc(), xml_body, "XML")

    if new_body is None and use_output:
        new_body = [output]

    if new_body is not None:
        response.body = new_body
        if "Content-Length" in response.headers:
            # Delete Content-Length header so finalize() recalcs it.
            del response.headers["Content-Length"]
def html_space(text):
    """Escape text, replacing space with nbsp and tab with 4 nbsp's.

    '&', '<' and '>' are escaped; quote characters are left untouched.
    """
    # cgi.escape is gone from recent Python versions; html.escape with
    # quote=False escapes exactly the same characters.
    try:
        from html import escape
    except ImportError:
        from cgi import escape
    escaped = escape(text, quote=False)
    # Expand tabs to four spaces first so they become four entities too.
    return escaped.replace('\t', ' ' * 4).replace(' ', '&nbsp;')
def html_break(text):
    """Escape text, replacing newline with HTML br element.

    '&', '<' and '>' are escaped; quote characters are left untouched.
    """
    # cgi.escape is gone from recent Python versions; html.escape with
    # quote=False escapes exactly the same characters.
    try:
        from html import escape
    except ImportError:
        from cgi import escape
    return escape(text, quote=False).replace('\n', '<br />')
def wrong_content(header, body, content_type="HTML"):
    """Return an HTML report: a 'Wrong <content_type>' heading, then body.

    The header is passed through html_break(). Each line of body is
    passed through html_space() and prefixed with its 1-based,
    zero-padded line number. All parts are joined with '<br />'.
    """
    heading = "Wrong %s:<br />%s<br />" % (content_type, html_break(header))
    numbered = [
        "%03d - %s" % (index + 1, html_space(text))
        for index, text in enumerate(body.splitlines())
    ]
    return "<br />".join([heading] + numbered)
def nsgmls(temp_dir, nsgmls_path, catalog_path, errors_to_ignore=None):
    """Run cherrypy.response through nsgmls.

    Only responses whose Content-Type is 'text/html' are processed. If
    nsgmls writes any message that is not filtered out, response.body is
    replaced by a report built with wrong_content() and the
    Content-Length header is removed so that it gets recalculated.

    Arguments:
        temp_dir: directory in which 'page.html' and 'nsgmls.err' are
            written.
        nsgmls_path: nsgmls command, placed first on the command line.
        catalog_path: value passed to nsgmls with '-c'.
        errors_to_ignore: optional iterable of substrings; messages
            containing any of them are dropped.
    """
    response = cherrypy.response

    # nsgmls reads the page from a file, so it is not generator friendly:
    # we just collapse the body and work with it.
    orig_body = response.collapse_body()

    fct = response.headers.get('Content-Type', '')
    if fct.split(';')[0].strip() != 'text/html':
        return

    # Remove bits of Javascript (nsgmls doesn't seem to handle
    # them correctly (for instance, if <a appears in your
    # Javascript code nsgmls complains about it)
    while True:
        start = orig_body.find('<script')
        if start == -1:
            break
        end = orig_body.find('</script>', start)
        if end == -1:
            break
        # 9 == len('</script>')
        orig_body = orig_body[:start] + orig_body[end + 9:]

    page_file = os.path.join(temp_dir, 'page.html')
    with open(page_file, 'wb') as page:
        page.write(orig_body)

    err_file = os.path.join(temp_dir, 'nsgmls.err')
    command = ('%s -c%s -f%s -s -E10 %s' %
               (nsgmls_path, catalog_path, err_file, page_file))
    # Backslashes anywhere in the command are turned into forward slashes.
    command = command.replace('\\', '/')
    # NOTE(review): the command still goes through the shell unquoted, so
    # paths containing spaces will be split. Kept as-is because
    # nsgmls_path is interpolated unquoted and callers may rely on
    # passing extra options through it -- confirm before switching to an
    # argument list.
    os.system(command)

    with open(err_file, 'rb') as report:
        errs = report.read()

    # Materialize once: the list is scanned for every message line.
    ignored = list(errors_to_ignore or [])
    new_errs = []
    for err in errs.splitlines():
        for err_ign in ignored:
            if err.find(err_ign) != -1:
                break
        else:
            new_errs.append(err)

    if new_errs:
        # Join with newlines: wrong_content() escapes the header and turns
        # newlines into <br />, so literal '<br />' would show up as text.
        response.body = wrong_content('\n'.join(new_errs), orig_body)
        if "Content-Length" in response.headers:
            # Delete Content-Length header so finalize() recalcs it.
            del response.headers["Content-Length"]
# Expose both checkers on cherrypy.tools. Each Tool is constructed with the
# 'before_finalize' hook point name and the corresponding checker function.
cherrypy.tools.tidy = cherrypy.Tool('before_finalize', tidy)
cherrypy.tools.nsgmls = cherrypy.Tool('before_finalize', nsgmls)