diff --git a/LICENSE b/LICENSE index 0cecd8527..37d7aa900 100644 --- a/LICENSE +++ b/LICENSE @@ -176,7 +176,7 @@ recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright 2013 code4craft + Copyright 2025 code4craft Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/pom.xml b/pom.xml index af04c6917..903ac48a9 100644 --- a/pom.xml +++ b/pom.xml @@ -12,7 +12,7 @@ 2.2.1 us.codecraft - 1.0.2 + 1.0.3 pom UTF-8 diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index f436bce26..bad11de43 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.2 + 1.0.3 4.0.0 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index 167a5e1c6..94b00cc73 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -36,7 +36,7 @@ */ public class HttpClientGenerator { - private transient Logger logger = LoggerFactory.getLogger(getClass()); + private Logger logger = LoggerFactory.getLogger(getClass()); private PoolingHttpClientConnectionManager connectionManager; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java index 85ff5fa69..74ea718e5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java @@ -31,6 +31,11 @@ public Selectable smartContent() { return select(smartContentSelector, getSourceTexts()); } + public Selectable smartContent(int threshold) { + SmartContentSelector smartContentSelector = Selectors.smartContent(threshold); + return select(smartContentSelector, getSourceTexts()); + } + @Override public Selectable links() { return selectElements(new LinksSelector()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java index 7cd68c1d6..3600896e2 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java @@ -20,6 +20,10 @@ public static SmartContentSelector smartContent() { return new SmartContentSelector(); } + public static SmartContentSelector smartContent(int threshold) { + return new SmartContentSelector(threshold); + } + public static CssSelector $(String expr) { return new CssSelector(expr); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java index ff8e26998..c8816510b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java @@ -16,9 +16,15 @@ @Experimental public class SmartContentSelector implements Selector { + private int threshold = 86; + public SmartContentSelector() { } + public SmartContentSelector(int threshold) { + this.threshold = threshold; + } + @Override public String select(String html) { html = html.replaceAll("(?is)", ""); @@ -29,7 +35,6 @@ public String select(String html) { html = html.replaceAll("(?is)<.*?>", ""); List lines; int blocksWidth =3; - int threshold =86; int start; int end; StringBuilder text = new StringBuilder(); diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index 6265abae5..2b4a53460 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -10,7 +10,7 @@ us.codecraft webmagic - 1.0.2 + 1.0.3 webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index a1c26d212..93faa4aaf 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.2 + 1.0.3 4.0.0 diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java index 31dfca75a..01f1af9a3 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java @@ -6,6 +6,7 @@ import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.selector.PlainText; +import us.codecraft.webmagic.utils.HttpConstant; import java.io.*; @@ -96,7 +97,7 @@ public Page download(Request request, Task task) { page.setRawText(content); page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); - page.setStatusCode(200); + page.setStatusCode(HttpConstant.StatusCode.CODE_200); } onSuccess(page, task); } catch (Exception e) { diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 2c2b34ef6..50e79c73e 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.2 + 1.0.3 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 37349a419..26d1989d6 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.2 + 1.0.3 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 4b21d5e3c..62cea3e69 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.2 + 1.0.3 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index b47f84a31..16214c61a 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.2 + 1.0.3 4.0.0 diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java index 874f8aef7..f6d2574fb 100644 --- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java @@ -14,9 +14,11 @@ import us.codecraft.webmagic.downloader.AbstractDownloader; import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.PlainText; +import us.codecraft.webmagic.utils.HttpConstant; import java.io.Closeable; import java.io.IOException; +import java.net.http.HttpRequest; import java.util.Map; /** @@ -111,6 +113,7 @@ public Page download(Request request, Task task) { page.setHtml(new Html(content, request.getUrl())); page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); + page.setStatusCode(HttpConstant.StatusCode.CODE_200); onSuccess(page, task); } catch (Exception e) { logger.warn("download page {} error", request.getUrl(), e);