@@ -168,7 +168,7 @@ private function getTopBox() {
168
168
if (preg_match ("/(comment|meta|footer|footnote)/i " , $ className )) {
169
169
$ contentScore -= 50 ;
170
170
} else if (preg_match (
171
- "/((^| \\s)(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)( \\s|$))/i " ,
171
+ "/((^| \\s)(section| post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)( \\s|$))/i " ,
172
172
$ className )) {
173
173
$ contentScore += 25 ;
174
174
}
@@ -196,23 +196,39 @@ private function getTopBox() {
196
196
}
197
197
198
198
$ topBox = null ;
199
-
199
+
200
200
// Assignment from index for performance.
201
201
// See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
202
202
for ($ i = 0 , $ len = sizeof ($ this ->parentNodes ); $ i < $ len ; $ i ++) {
203
203
$ parentNode = $ this ->parentNodes [$ i ];
204
204
$ contentScore = intval ($ parentNode ->getAttribute (Readability::ATTR_CONTENT_SCORE ));
205
205
$ orgContentScore = intval ($ topBox ? $ topBox ->getAttribute (Readability::ATTR_CONTENT_SCORE ) : 0 );
206
206
207
- if ($ contentScore && $ contentScore > $ orgContentScore ) {
208
- $ topBox = $ parentNode ;
207
+ // by raywill, 2016-9-2
208
+ // for case: <div><p>xxx</p></div><div><p>yyy</p></div>
209
+ if ($ parentNode && $ topBox && $ topBox ->parentNode
210
+ && $ parentNode !== $ topBox
211
+ && $ parentNode ->parentNode === $ topBox ->parentNode
212
+ && $ this ->scoreMatch ($ parentNode , $ topBox )) { // trust same level
213
+
214
+ $ topScore = intval ($ topBox ->getAttribute (Readability::ATTR_CONTENT_SCORE ));
215
+ $ topBox = $ topBox ->parentNode ;
216
+ $ topBox ->setAttribute (Readability::ATTR_CONTENT_SCORE , $ topScore + $ contentScore );
217
+ } else if ($ contentScore && $ contentScore > $ orgContentScore ) {
218
+
219
+ $ topBox = $ parentNode ;
209
220
}
210
221
}
211
-
222
+
212
223
// 此时,$topBox 应为已经判定后的页面内容主元素
213
224
return $ topBox ;
214
225
}
215
226
227
/**
 * Tell whether two nodes both carry a positive content score.
 *
 * The Readability::ATTR_CONTENT_SCORE attribute of each node is read and
 * converted with intval(); the check passes only when both resulting
 * integers are strictly greater than zero. The scores are not compared
 * against each other.
 *
 * NOTE(review): assumes both arguments expose getAttribute()
 * (presumably DOMElement instances) -- confirm against callers.
 *
 * @param mixed $n1 First node to inspect.
 * @param mixed $n2 Second node to inspect.
 * @return bool True when both content scores are greater than zero.
 */
protected function scoreMatch($n1, $n2) {
    // Read both scores up front, exactly as many attribute lookups as before.
    $scores = array(
        intval($n1->getAttribute(Readability::ATTR_CONTENT_SCORE)),
        intval($n2->getAttribute(Readability::ATTR_CONTENT_SCORE)),
    );

    // The smaller score is positive only if both scores are positive.
    return min($scores) > 0;
}
216
232
217
233
/**
218
234
* 获取 HTML 页面标题
0 commit comments