|
1 | 1 | <?php
|
2 |
| -// vim: set et sw=4 ts=4 sts=4 fdm=marker ff=unix fenc=utf8 nobomb: |
| 2 | +// vim: set et sw=4 ts=4 sts=4 ft=php fdm=marker ff=unix fenc=utf8 nobomb: |
3 | 3 | /**
|
4 | 4 | * PHP Readability
|
5 | 5 | *
|
6 | 6 | * Readability PHP 版本,详见
|
7 | 7 | * http://code.google.com/p/arc90labs-readability/
|
8 | 8 | *
|
9 | 9 | * ChangeLog:
|
| 10 | + * [+] 2014-02-08 Add lead image param and improved get title function. |
10 | 11 | * [+] 2013-12-04 Better error handling and junk tag removal.
|
11 | 12 | * [+] 2011-02-17 初始化版本
|
12 | 13 | *
|
|
19 | 20 | * @link http://tuxion.nl/
|
20 | 21 | */
|
21 | 22 |
|
22 |
| -define("READABILITY_VERSION", 0.13); |
| 23 | +define("READABILITY_VERSION", 0.21); |
23 | 24 |
|
24 | 25 | class Readability {
|
25 | 26 | // 保存判定结果的标记位名称
|
@@ -103,6 +104,10 @@ private function preparSource($string) {
|
103 | 104 | $string = preg_replace("/<br\/?>[ \r\n\s]*<br\/?>/i", "</p><p>", $string);
|
104 | 105 | $string = preg_replace("/<\/?font[^>]*>/i", "", $string);
|
105 | 106 |
|
| 107 | + // @see https://github.com/feelinglucky/php-readability/issues/7 |
| 108 | + // - from http://stackoverflow.com/questions/7130867/remove-script-tag-from-html-content |
| 109 | + $string = preg_replace("#<script(.*?)>(.*?)</script>#is", "", $string); |
| 110 | + |
106 | 111 | return trim($string);
|
107 | 112 | }
|
108 | 113 |
|
@@ -163,7 +168,7 @@ private function getTopBox() {
|
163 | 168 | if (preg_match("/(comment|meta|footer|footnote)/i", $className)) {
|
164 | 169 | $contentScore -= 50;
|
165 | 170 | } else if(preg_match(
|
166 |
| - "/((^|\\s)(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)(\\s|$))/i", |
| 171 | + "/((^|\\s)(section|post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)(\\s|$))/i", |
167 | 172 | $className)) {
|
168 | 173 | $contentScore += 25;
|
169 | 174 | }
|
@@ -191,23 +196,39 @@ private function getTopBox() {
|
191 | 196 | }
|
192 | 197 |
|
193 | 198 | $topBox = null;
|
194 |
| - |
| 199 | + |
195 | 200 | // Assignment from index for performance.
|
196 | 201 | // See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
|
197 | 202 | for ($i = 0, $len = sizeof($this->parentNodes); $i < $len; $i++) {
|
198 | 203 | $parentNode = $this->parentNodes[$i];
|
199 | 204 | $contentScore = intval($parentNode->getAttribute(Readability::ATTR_CONTENT_SCORE));
|
200 | 205 | $orgContentScore = intval($topBox ? $topBox->getAttribute(Readability::ATTR_CONTENT_SCORE) : 0);
|
201 | 206 |
|
202 |
| - if ($contentScore && $contentScore > $orgContentScore) { |
203 |
| - $topBox = $parentNode; |
| 207 | + // by raywill, 2016-9-2 |
| 208 | + // for case: <div><p>xxx</p></div><div><p>yyy</p></div> |
| 209 | + if ($parentNode && $topBox && $topBox->parentNode |
| 210 | + && $parentNode !== $topBox |
| 211 | + && $parentNode->parentNode === $topBox->parentNode |
| 212 | + && $this->scoreMatch($parentNode, $topBox)) { // trust same level |
| 213 | + |
| 214 | + $topScore = intval($topBox->getAttribute(Readability::ATTR_CONTENT_SCORE)); |
| 215 | + $topBox = $topBox->parentNode; |
| 216 | + $topBox->setAttribute(Readability::ATTR_CONTENT_SCORE, $topScore + $contentScore); |
| 217 | + } else if ($contentScore && $contentScore > $orgContentScore) { |
| 218 | + |
| 219 | + $topBox = $parentNode; |
204 | 220 | }
|
205 | 221 | }
|
206 |
| - |
| 222 | + |
207 | 223 | // 此时,$topBox 应为已经判定后的页面内容主元素
|
208 | 224 | return $topBox;
|
209 | 225 | }
|
210 | 226 |
|
/**
 * Check whether two candidate nodes both carry a positive content score.
 *
 * Each node's score is read from its Readability::ATTR_CONTENT_SCORE
 * attribute and converted with intval(), so a missing or non-numeric
 * attribute counts as 0.
 *
 * @param mixed $n1 First node; must provide getAttribute().
 * @param mixed $n2 Second node; must provide getAttribute().
 * @return bool True only when both scores are greater than zero.
 */
protected function scoreMatch($n1, $n2) {
    $scoreAttr = Readability::ATTR_CONTENT_SCORE;
    $firstScore = intval($n1->getAttribute($scoreAttr));
    $secondScore = intval($n2->getAttribute($scoreAttr));

    // The smaller of the two is positive exactly when both are positive.
    return min($firstScore, $secondScore) > 0;
}
211 | 232 |
|
212 | 233 | /**
|
213 | 234 | * 获取 HTML 页面标题
|
@@ -238,9 +259,19 @@ public function getTitle() {
|
/**
 * Find the source of the first sufficiently large image below a node.
 *
 * The node's <img> descendants are visited in document order. For each one
 * the "src" attribute is used, falling back to "data-src" when "src" is
 * empty. The image is probed with getimagesize(), and the first image that
 * is more than 150 pixels wide and more than 150 pixels high is returned.
 *
 * Images without any usable source, and sources getimagesize() cannot read,
 * are skipped instead of aborting the search.
 *
 * NOTE(review): the source string is handed to getimagesize() exactly as it
 * appears in the markup, so PHP will try to open whatever URL or path the
 * document names (and may emit a warning when it cannot). Confirm that
 * callers are comfortable with that for untrusted pages.
 *
 * @param mixed $node DOM node providing getElementsByTagName()
 *                    (presumably a DOMElement - verify against callers).
 * @return string|null Source of the first qualifying image, or null when
 *                     no image qualifies.
 */
public function getLeadImageUrl($node) {
    foreach ($node->getElementsByTagName("img") as $image) {
        // Prefer "src"; lazy-loading markup often keeps the real URL in "data-src".
        $source = $image->getAttribute("src");
        if (!$source) {
            $source = $image->getAttribute("data-src");
        }

        // getimagesize() rejects an empty path, so skip images with no source.
        if ($source === "") {
            continue;
        }

        // false means the source could not be read or is not a known image type.
        $size = getimagesize($source);
        if ($size === false) {
            continue;
        }

        list($width, $height) = $size;
        if ($width > 150 && $height > 150) {
            return $source;
        }
    }

    return null;
}
|
|
0 commit comments