Skip to content

Commit a4d4a1b

Browse files
author
明城
authored
Merge pull request #17 from feelinglucky/master
同步 master 的代码
2 parents 34f32ff + 0fcf424 commit a4d4a1b

File tree

3 files changed

+71
-13
lines changed

3 files changed

+71
-13
lines changed

LICENSE

+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
The MIT License (MIT)
2+
3+
Copyright (c) [year] [fullname]
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.
22+

README.md

+8-3
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,19 @@
11
# PHP Readability Library
22

3+
If you want to use an up-to-date version of this algorithm, check out this newer project:
4+
5+
https://github.com/andreskrey/readability.php
6+
7+
38
## Bring back the fun of reading
49

5-
The PHP port of [[http://code.google.com/p/arc90labs-readability/|Readability.js by Arc90]].
10+
The PHP port of [Readability.js by Arc90](http://code.google.com/p/arc90labs-readability/).
611

712

813
## Requirements
914

1015
* PHP Version >= 5
11-
* [[http://www.php.net/manual/en/book.dom.php|PHP has builded with DOM(Document Object Model)]]
16+
* [PHP built with the DOM (Document Object Model) extension](http://www.php.net/manual/en/book.dom.php)
1217

1318

1419
## Live demo
@@ -31,5 +36,5 @@ echo $ReadabilityData['content'];
3136

3237

3338

34-
PS: For Node.js port, You can [[https://github.com/arrix/node-readability/|check this]].
39+
PS: For a Node.js port, you can [check this](https://github.com/arrix/node-readability/).
3540

lib/Readability.inc.php

+41-10
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
<?php
2-
// vim: set et sw=4 ts=4 sts=4 fdm=marker ff=unix fenc=utf8 nobomb:
2+
// vim: set et sw=4 ts=4 sts=4 ft=php fdm=marker ff=unix fenc=utf8 nobomb:
33
/**
44
* PHP Readability
55
*
66
* Readability PHP 版本,详见
77
* http://code.google.com/p/arc90labs-readability/
88
*
99
* ChangeLog:
10+
* [+] 2014-02-08 Added lead image param and improved the get-title function.
1011
* [+] 2013-12-04 Better error handling and junk tag removal.
1112
* [+] 2011-02-17 初始化版本
1213
*
@@ -19,7 +20,7 @@
1920
* @link http://tuxion.nl/
2021
*/
2122

22-
define("READABILITY_VERSION", 0.13);
23+
define("READABILITY_VERSION", 0.21);
2324

2425
class Readability {
2526
// 保存判定结果的标记位名称
@@ -103,6 +104,10 @@ private function preparSource($string) {
103104
$string = preg_replace("/<br\/?>[ \r\n\s]*<br\/?>/i", "</p><p>", $string);
104105
$string = preg_replace("/<\/?font[^>]*>/i", "", $string);
105106

107+
// @see https://github.com/feelinglucky/php-readability/issues/7
108+
// - from http://stackoverflow.com/questions/7130867/remove-script-tag-from-html-content
109+
$string = preg_replace("#<script(.*?)>(.*?)</script>#is", "", $string);
110+
106111
return trim($string);
107112
}
108113

@@ -163,7 +168,7 @@ private function getTopBox() {
163168
if (preg_match("/(comment|meta|footer|footnote)/i", $className)) {
164169
$contentScore -= 50;
165170
} else if(preg_match(
166-
"/((^|\\s)(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)(\\s|$))/i",
171+
"/((^|\\s)(section|post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)(\\s|$))/i",
167172
$className)) {
168173
$contentScore += 25;
169174
}
@@ -191,23 +196,39 @@ private function getTopBox() {
191196
}
192197

193198
$topBox = null;
194-
199+
195200
// Assignment from index for performance.
196201
// See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
197202
for ($i = 0, $len = sizeof($this->parentNodes); $i < $len; $i++) {
198203
$parentNode = $this->parentNodes[$i];
199204
$contentScore = intval($parentNode->getAttribute(Readability::ATTR_CONTENT_SCORE));
200205
$orgContentScore = intval($topBox ? $topBox->getAttribute(Readability::ATTR_CONTENT_SCORE) : 0);
201206

202-
if ($contentScore && $contentScore > $orgContentScore) {
203-
$topBox = $parentNode;
207+
// by raywill, 2016-9-2
208+
// for case: <div><p>xxx</p></div><div><p>yyy</p></div>
209+
if ($parentNode && $topBox && $topBox->parentNode
210+
&& $parentNode !== $topBox
211+
&& $parentNode->parentNode === $topBox->parentNode
212+
&& $this->scoreMatch($parentNode, $topBox)) { // trust same level
213+
214+
$topScore = intval($topBox->getAttribute(Readability::ATTR_CONTENT_SCORE));
215+
$topBox = $topBox->parentNode;
216+
$topBox->setAttribute(Readability::ATTR_CONTENT_SCORE, $topScore + $contentScore);
217+
} else if ($contentScore && $contentScore > $orgContentScore) {
218+
219+
$topBox = $parentNode;
204220
}
205221
}
206-
222+
207223
// 此时,$topBox 应为已经判定后的页面内容主元素
208224
return $topBox;
209225
}
210226

227+
protected function scoreMatch($n1, $n2) {
228+
$n1Score = intval($n1->getAttribute(Readability::ATTR_CONTENT_SCORE));
229+
$n2Score = intval($n2->getAttribute(Readability::ATTR_CONTENT_SCORE));
230+
return ($n1Score > 0 && $n2Score > 0);
231+
}
211232

212233
/**
213234
* 获取 HTML 页面标题
@@ -238,9 +259,19 @@ public function getTitle() {
238259
public function getLeadImageUrl($node) {
239260
$images = $node->getElementsByTagName("img");
240261

241-
if ($images->length && $leadImage = $images->item(0)) {
242-
return $leadImage->getAttribute("src");
243-
}
262+
if ($images->length){
263+
$i = 0;
264+
while($leadImage = $images->item($i++)) {
265+
$imgsrc = $leadImage->getAttribute("src");
266+
$imgdatasrc = $leadImage->getAttribute("data-src");
267+
$imgsrclast = $imgsrc ? $imgsrc : $imgdatasrc;
268+
list($img['width'],$img['height'])=getimagesize($imgsrclast);
269+
if($img['width'] > 150 && $img['height'] >150){
270+
return $imgsrclast;
271+
}
272+
273+
}
274+
}
244275

245276
return null;
246277
}

0 commit comments

Comments
 (0)