-
-
Notifications
You must be signed in to change notification settings - Fork 469
/
Copy pathCleaner.php
130 lines (112 loc) · 5.06 KB
/
Cleaner.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
<?php
declare(strict_types=1);
namespace PHPHtmlParser\Dom;
use PHPHtmlParser\Contracts\Dom\CleanerInterface;
use PHPHtmlParser\Exceptions\LogicalException;
use PHPHtmlParser\Options;
class Cleaner implements CleanerInterface
{
    /**
     * Cleans the html of any non-html information.
     *
     * When cleanup is enabled the input goes through these steps, in order:
     * gzip decoding (if the gzip magic bytes are present), conversion to the
     * working encoding, removal of whitespace between a quote and a closing
     * '>', line break replacement, and stripping of the doctype, comments and
     * CDATA sections. Script tags, style tags and smarty scripts are stripped
     * only when the matching option is enabled.
     *
     * @param string  $str            raw input, possibly gzip compressed
     * @param Options $options        switches controlling which steps run
     * @param string  $defaultCharset encoding used unless one is enforced by the options
     *
     * @return string the cleaned string, or $str untouched when cleanup is disabled
     *
     * @throws LogicalException when decoding, converting or a replacement fails
     */
    public function clean(string $str, Options $options, string $defaultCharset): string
    {
        if (!$options->isCleanupInput()) {
            // skip entire cleanup step
            return $str;
        }

        // check if the string is gzipped: compare only the three leading
        // bytes (magic number + deflate method) instead of searching the
        // whole input for them.
        if (\strncmp($str, "\x1f\x8b\x08", 3) === 0) {
            $decoded = \gzdecode($str);
            if ($decoded === false) {
                throw new LogicalException('gzdecode returned false. Error when trying to decode the string.');
            }
            $str = $decoded;
        }

        // we must handle character encoding
        $str = $this->setUpRegexEncoding($str, $options, $defaultCharset);

        // remove white space before closing tags
        $str = $this->replaceOrFail("'\s+>", "'>", $str, 'clean single quotes');
        $str = $this->replaceOrFail('"\s+>', '">', $str, 'clean double quotes');

        // clean out the \n\r
        // When line breaks must survive, they are swapped for the numeric
        // character reference of a line feed instead of a plain space.
        // NOTE(review): presumably a later stage turns '&#10;' back into a
        // real line break - confirm against the consumer of this output.
        $replace = $options->isPreserveLineBreaks() ? '&#10;' : ' ';
        // str_replace() always returns a string for a string subject, so no
        // failure check is needed here.
        $str = \str_replace(["\r\n", "\r", "\n"], $replace, $str);

        // strip the doctype
        $str = $this->replaceOrFail('<!doctype(.*?)>', '', $str, 'strip the doctype');

        // strip out comments
        $str = $this->replaceOrFail('<!--(.*?)-->', '', $str, 'strip comments');

        // strip out cdata
        $str = $this->replaceOrFail("<!\[CDATA\[(.*?)\]\]>", '', $str, 'strip out cdata');

        // strip out <script> tags
        // NOTE(review): in the first pattern `[^>|\s]*` also excludes
        // whitespace and '|', so an opening tag with space separated
        // attributes does not appear to match - confirm this is intended.
        // The same applies to the first <style> pattern below.
        if ($options->isRemoveScripts()) {
            $str = $this->replaceOrFail(
                "<\s*script[^>|\s]*[^/]>(.*?)<\s*/\s*script\s*>",
                '',
                $str,
                'remove scripts 1'
            );
            $str = $this->replaceOrFail(
                "<\s*script\s*>(.*?)<\s*/\s*script\s*>",
                '',
                $str,
                'remove scripts 2'
            );
        }

        // strip out <style> tags
        if ($options->isRemoveStyles()) {
            $str = $this->replaceOrFail(
                "<\s*style[^>|\s]*[^/]>(.*?)<\s*/\s*style\s*>",
                '',
                $str,
                'strip out style tags 1'
            );
            $str = $this->replaceOrFail(
                "<\s*style\s*>(.*?)<\s*/\s*style\s*>",
                '',
                $str,
                'strip out style tags 2'
            );
        }

        // strip smarty scripts
        if ($options->isRemoveSmartyScripts()) {
            $str = $this->replaceOrFail("(\{\w)(.*?)(\})", '', $str, 'remove smarty scripts');
        }

        return $str;
    }

    /**
     * Runs a case-insensitive multibyte regex replacement and throws when the
     * result is not a string.
     *
     * mb_eregi_replace() can return false (error) or null (subject not valid
     * for the current encoding); both are reported as a failure here so a
     * non-string never reaches the next cleaning step.
     *
     * @param string $pattern     mb_ereg pattern, passed through unchanged
     * @param string $replacement replacement text
     * @param string $subject     string to operate on
     * @param string $action      what was being attempted; completes the error message
     *
     * @throws LogicalException
     */
    private function replaceOrFail(string $pattern, string $replacement, string $subject, string $action): string
    {
        $result = \mb_eregi_replace($pattern, $replacement, $subject);
        if (!\is_string($result)) {
            throw new LogicalException(
                'mb_eregi_replace returned false instead of a string. Error when attempting to ' . $action . '.'
            );
        }

        return $result;
    }

    /**
     * Sets up the mb_regex_encoding and converts the text to that encoding.
     *
     * The encoding enforced through the options wins over $defaultCharset.
     * No source encoding is passed to mb_convert_encoding(), so the
     * conversion starts from mbstring's configured internal encoding.
     *
     * @throws LogicalException when the regex encoding cannot be set or the conversion fails
     */
    private function setUpRegexEncoding(string $str, Options $options, string $defaultCharset): string
    {
        // they may want to enforce a given encoding
        $encoding = $options->getEnforceEncoding() ?? $defaultCharset;

        if (!\mb_regex_encoding($encoding)) {
            throw new LogicalException('Character encoding was not able to be changed to ' . $encoding . '.');
        }

        $converted = \mb_convert_encoding($str, $encoding);
        if (!\is_string($converted)) {
            // Without this guard a false result would surface as a TypeError
            // from the declared return type.
            throw new LogicalException('mb_convert_encoding returned false instead of a string. Error when attempting to convert the input to ' . $encoding . '.');
        }

        return $converted;
    }
}