diff --git a/LICENSE b/LICENSE
index 0cecd8527..37d7aa900 100644
--- a/LICENSE
+++ b/LICENSE
@@ -176,7 +176,7 @@ recommend that a file or class name and description of purpose be included on
the same "printed page" as the copyright notice for easier identification within
third-party archives.
- Copyright 2013 code4craft
+ Copyright 2025 code4craft
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
diff --git a/pom.xml b/pom.xml
index af04c6917..903ac48a9 100644
--- a/pom.xml
+++ b/pom.xml
@@ -12,7 +12,7 @@
2.2.1
us.codecraft
- 1.0.2
+ 1.0.3
pom
UTF-8
diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml
index f436bce26..bad11de43 100644
--- a/webmagic-core/pom.xml
+++ b/webmagic-core/pom.xml
@@ -8,7 +8,7 @@
us.codecraft
webmagic
- 1.0.2
+ 1.0.3
4.0.0
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java
index 167a5e1c6..94b00cc73 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java
@@ -36,7 +36,7 @@
*/
public class HttpClientGenerator {
- private transient Logger logger = LoggerFactory.getLogger(getClass());
+ private Logger logger = LoggerFactory.getLogger(getClass());
private PoolingHttpClientConnectionManager connectionManager;
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java
index 85ff5fa69..74ea718e5 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java
@@ -31,6 +31,11 @@ public Selectable smartContent() {
return select(smartContentSelector, getSourceTexts());
}
+    public Selectable smartContent(int threshold) {
+        // Same selection as smartContent(), but the selector is built with the caller's threshold.
+        return select(Selectors.smartContent(threshold), getSourceTexts());
+    }
+
@Override
public Selectable links() {
return selectElements(new LinksSelector());
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java
index 7cd68c1d6..3600896e2 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java
@@ -20,6 +20,10 @@ public static SmartContentSelector smartContent() {
return new SmartContentSelector();
}
+ public static SmartContentSelector smartContent(int threshold) { // factory overload: caller supplies the threshold
+ return new SmartContentSelector(threshold); // threshold is handed to the constructor unchanged
+ }
+
public static CssSelector $(String expr) {
return new CssSelector(expr);
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java
index ff8e26998..c8816510b 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java
@@ -16,9 +16,15 @@
@Experimental
public class SmartContentSelector implements Selector {
+ private int threshold = 86; // default value; it stays in effect when the no-arg constructor is used
+
public SmartContentSelector() {
}
+ public SmartContentSelector(int threshold) { // replaces the default threshold for this instance
+ this.threshold = threshold; // stored as given; no range check is performed here
+ }
+
@Override
public String select(String html) {
html = html.replaceAll("(?is)", "");
@@ -29,7 +35,6 @@ public String select(String html) {
html = html.replaceAll("(?is)<.*?>", "");
List lines;
int blocksWidth =3;
- int threshold =86;
int start;
int end;
StringBuilder text = new StringBuilder();
diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml
index 6265abae5..2b4a53460 100644
--- a/webmagic-coverage/pom.xml
+++ b/webmagic-coverage/pom.xml
@@ -10,7 +10,7 @@
us.codecraft
webmagic
- 1.0.2
+ 1.0.3
webmagic-coverage
diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml
index a1c26d212..93faa4aaf 100644
--- a/webmagic-extension/pom.xml
+++ b/webmagic-extension/pom.xml
@@ -8,7 +8,7 @@
us.codecraft
webmagic
- 1.0.2
+ 1.0.3
4.0.0
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java
index 31dfca75a..01f1af9a3 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java
@@ -6,6 +6,7 @@
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.selector.PlainText;
+import us.codecraft.webmagic.utils.HttpConstant;
import java.io.*;
@@ -96,7 +97,7 @@ public Page download(Request request, Task task) {
page.setRawText(content);
page.setUrl(new PlainText(request.getUrl()));
page.setRequest(request);
- page.setStatusCode(200);
+ page.setStatusCode(HttpConstant.StatusCode.CODE_200);
}
onSuccess(page, task);
} catch (Exception e) {
diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml
index 2c2b34ef6..50e79c73e 100644
--- a/webmagic-samples/pom.xml
+++ b/webmagic-samples/pom.xml
@@ -8,7 +8,7 @@
us.codecraft
webmagic
- 1.0.2
+ 1.0.3
4.0.0
diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml
index 37349a419..26d1989d6 100644
--- a/webmagic-saxon/pom.xml
+++ b/webmagic-saxon/pom.xml
@@ -8,7 +8,7 @@
us.codecraft
webmagic
- 1.0.2
+ 1.0.3
4.0.0
diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml
index 4b21d5e3c..62cea3e69 100644
--- a/webmagic-scripts/pom.xml
+++ b/webmagic-scripts/pom.xml
@@ -8,7 +8,7 @@
us.codecraft
webmagic
- 1.0.2
+ 1.0.3
4.0.0
diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml
index b47f84a31..16214c61a 100644
--- a/webmagic-selenium/pom.xml
+++ b/webmagic-selenium/pom.xml
@@ -8,7 +8,7 @@
us.codecraft
webmagic
- 1.0.2
+ 1.0.3
4.0.0
diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java
index 874f8aef7..f6d2574fb 100644
--- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java
+++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java
@@ -14,9 +14,10 @@
import us.codecraft.webmagic.downloader.AbstractDownloader;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText;
+import us.codecraft.webmagic.utils.HttpConstant;
import java.io.Closeable;
import java.io.IOException;
import java.util.Map;
/**
@@ -111,6 +112,7 @@ public Page download(Request request, Task task) {
page.setHtml(new Html(content, request.getUrl()));
page.setUrl(new PlainText(request.getUrl()));
page.setRequest(request);
+ page.setStatusCode(HttpConstant.StatusCode.CODE_200);
onSuccess(page, task);
} catch (Exception e) {
logger.warn("download page {} error", request.getUrl(), e);