Skip to content

Commit 1631a9e

Browse files
author
明城
authored
Merge pull request #14 from raywill/master
优化Readability,当文章有多个大段章节时能全部提取出来,而不是只提取最大的一段。
2 parents 06e073a + 80fe4b7 commit 1631a9e

File tree

1 file changed

+21
-5
lines changed

1 file changed

+21
-5
lines changed

lib/Readability.inc.php

+21-5
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,7 @@ private function getTopBox() {
168168
if (preg_match("/(comment|meta|footer|footnote)/i", $className)) {
169169
$contentScore -= 50;
170170
} else if(preg_match(
171-
"/((^|\\s)(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)(\\s|$))/i",
171+
"/((^|\\s)(section|post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)(\\s|$))/i",
172172
$className)) {
173173
$contentScore += 25;
174174
}
@@ -196,23 +196,39 @@ private function getTopBox() {
196196
}
197197

198198
$topBox = null;
199-
199+
200200
// Assignment from index for performance.
201201
// See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
202202
for ($i = 0, $len = sizeof($this->parentNodes); $i < $len; $i++) {
203203
$parentNode = $this->parentNodes[$i];
204204
$contentScore = intval($parentNode->getAttribute(Readability::ATTR_CONTENT_SCORE));
205205
$orgContentScore = intval($topBox ? $topBox->getAttribute(Readability::ATTR_CONTENT_SCORE) : 0);
206206

207-
if ($contentScore && $contentScore > $orgContentScore) {
208-
$topBox = $parentNode;
207+
// by raywill, 2016-9-2
208+
// for case: <div><p>xxx</p></div><div><p>yyy</p></div>
209+
if ($parentNode && $topBox && $topBox->parentNode
210+
&& $parentNode !== $topBox
211+
&& $parentNode->parentNode === $topBox->parentNode
212+
&& $this->scoreMatch($parentNode, $topBox)) { // trust same level
213+
214+
$topScore = intval($topBox->getAttribute(Readability::ATTR_CONTENT_SCORE));
215+
$topBox = $topBox->parentNode;
216+
$topBox->setAttribute(Readability::ATTR_CONTENT_SCORE, $topScore + $contentScore);
217+
} else if ($contentScore && $contentScore > $orgContentScore) {
218+
219+
$topBox = $parentNode;
209220
}
210221
}
211-
222+
212223
// 此时,$topBox 应为已经判定后的页面内容主元素
213224
return $topBox;
214225
}
215226

227+
/**
 * Report whether two nodes both carry a positive content score.
 *
 * Each node's score is read from its Readability::ATTR_CONTENT_SCORE
 * attribute and converted to an integer; a missing or non-numeric
 * attribute value therefore counts as 0.
 *
 * @param mixed $n1 First node; must provide getAttribute().
 * @param mixed $n2 Second node; must provide getAttribute().
 * @return bool True only when both integer scores are greater than zero.
 */
protected function scoreMatch($n1, $n2) {
    $attr = Readability::ATTR_CONTENT_SCORE;

    // Read both scores up front so each node is always queried exactly once.
    $first = (int) $n1->getAttribute($attr);
    $second = (int) $n2->getAttribute($attr);

    if ($first <= 0) {
        return false;
    }

    return $second > 0;
}
216232

217233
/**
218234
* 获取 HTML 页面标题

0 commit comments

Comments
 (0)