From 6ed83769e0a10fc6be02ba3b3371a88cf6be34ae Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Sat, 26 Oct 2024 01:37:09 +0800 Subject: [PATCH 1/5] Update for next development version --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index d0abd3568..9380a7eaa 100644 --- a/pom.xml +++ b/pom.xml @@ -12,7 +12,7 @@ 2.2.1 us.codecraft - 1.0.1 + 1.0.2-SNAPSHOT pom UTF-8 diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 52cd7ba2c..6e31559f2 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.1 + 1.0.2-SNAPSHOT 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index 98db3f826..93925ab3b 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -10,7 +10,7 @@ us.codecraft webmagic - 1.0.1 + 1.0.2-SNAPSHOT webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 1fe18e066..b986a8e63 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.1 + 1.0.2-SNAPSHOT 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 76105d330..a7d9b809d 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.1 + 1.0.2-SNAPSHOT 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index c206d21a2..52b60685c 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.1 + 1.0.2-SNAPSHOT 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 123ac6699..1d99229b0 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.1 
+ 1.0.2-SNAPSHOT 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index d09deef50..04be9c20c 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.1 + 1.0.2-SNAPSHOT 4.0.0 From 7e500d7b95972d062b4442f505c24dd06ca32f0c Mon Sep 17 00:00:00 2001 From: Bob Conan Date: Fri, 22 Nov 2024 20:24:58 -0600 Subject: [PATCH 2/5] Updated README.md, fix typo(s) (#1180) --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 89536c927..2af81cb22 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ WebMagic use slf4j with slf4j-log4j12 implementation. If you customized your slf ### First crawler: -Write a class implements PageProcessor. For example, I wrote a crawler of github repository infomation. +Write a class implements PageProcessor. For example, I wrote a crawler of github repository information. ```java public class GithubRepoPageProcessor implements PageProcessor { @@ -112,7 +112,7 @@ public class GithubRepo { Documents: [http://webmagic.io/docs/](http://webmagic.io/docs/) -The architecture of webmagic (refered to [Scrapy](http://scrapy.org/)) +The architecture of webmagic (referred to [Scrapy](http://scrapy.org/)) ![image](http://code4craft.github.io/images/posts/webmagic.png) From 0a9fe8d3e03e58c96e497efdc727ce6d09684229 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Wed, 27 Nov 2024 00:49:03 +0800 Subject: [PATCH 3/5] Add static methods to construct Page. 
--- .../main/java/us/codecraft/webmagic/Page.java | 37 +++++++++++++++++++ .../downloader/HttpClientDownloader.java | 5 +-- 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index e8c75ccf1..18486f7a9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -52,9 +52,44 @@ public class Page { private String charset; + /** + * Returns a {@link Page} with {@link #downloadSuccess} is {@code true}, + * and {@link #request} is specified. + * + * @param request the request. + * @since 1.0.2 + */ + public static Page ofSuccess(Request request) { + return new Page(request, true); + } + + /** + * Returns a {@link Page} with {@link #downloadSuccess} is {@code false}, + * and {@link #request} is specified. + * + * @param request the request. + * @since 1.0.2 + */ + public static Page ofFailure(Request request) { + return new Page(request, false); + } + public Page() { } + /** + * Constructs a {@link Page} with {@link #request} + * and {@link #downloadSuccess} specified. + * + * @param request the request. + * @param downloadSuccess the download success flag. + * @since 1.0.2 + */ + private Page(Request request, boolean downloadSuccess) { + this.request = request; + this.downloadSuccess = downloadSuccess; + } + /** * Returns a {@link Page} with {@link #downloadSuccess} is {@code false}. * @@ -73,7 +108,9 @@ public static Page fail() { * @param request the {@link Request}. * @return the page. * @since 0.10.0 + * @deprecated Use {@link #ofFailure(Request)} instead. 
*/ + @Deprecated(since = "1.0.2", forRemoval = true) public static Page fail(Request request){ Page page = new Page(); page.setRequest(request); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 39deecc73..789448f03 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -76,7 +76,7 @@ public Page download(Request request, Task task) { CloseableHttpClient httpClient = getHttpClient(task.getSite()); Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(request, task) : null; HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy); - Page page = Page.fail(request); + Page page = Page.ofFailure(request); try { httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext()); page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task); @@ -105,7 +105,7 @@ protected Page handleResponse(Request request, String charset, HttpResponse http HttpEntity entity = httpResponse.getEntity(); byte[] bytes = entity != null ? IOUtils.toByteArray(entity.getContent()) : new byte[0]; String contentType = entity != null && entity.getContentType() != null ? 
entity.getContentType().getValue() : null; - Page page = new Page(); + Page page = Page.ofSuccess(request); page.setBytes(bytes); if (!request.isBinaryContent()) { if (charset == null) { @@ -117,7 +117,6 @@ protected Page handleResponse(Request request, String charset, HttpResponse http page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); page.setStatusCode(httpResponse.getStatusLine().getStatusCode()); - page.setDownloadSuccess(true); if (responseHeader) { page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders())); } From c20edb824645806cd02367fd3b517efacb3e44cf Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Wed, 27 Nov 2024 03:31:30 +0800 Subject: [PATCH 4/5] Polish code. --- .../us/codecraft/webmagic/downloader/HttpClientDownloader.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 789448f03..6fdae38d8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -76,13 +76,14 @@ public Page download(Request request, Task task) { CloseableHttpClient httpClient = getHttpClient(task.getSite()); Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(request, task) : null; HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy); - Page page = Page.ofFailure(request); + Page page = null; try { httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext()); page = handleResponse(request, request.getCharset() != null ? 
request.getCharset() : task.getSite().getCharset(), httpResponse, task); onSuccess(page, task); return page; } catch (IOException e) { + page = Page.ofFailure(request); onError(page, task, e); return page; } finally { From bf1088bd67ade34b666860a7abc1c5c61886e36e Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Wed, 27 Nov 2024 04:16:05 +0800 Subject: [PATCH 5/5] Update versions for release --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index 9380a7eaa..af04c6917 100644 --- a/pom.xml +++ b/pom.xml @@ -12,7 +12,7 @@ 2.2.1 us.codecraft - 1.0.2-SNAPSHOT + 1.0.2 pom UTF-8 diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 6e31559f2..f436bce26 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.2-SNAPSHOT + 1.0.2 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index 93925ab3b..6265abae5 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -10,7 +10,7 @@ us.codecraft webmagic - 1.0.2-SNAPSHOT + 1.0.2 webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index b986a8e63..a1c26d212 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.2-SNAPSHOT + 1.0.2 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index a7d9b809d..2c2b34ef6 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.2-SNAPSHOT + 1.0.2 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 52b60685c..37349a419 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.2-SNAPSHOT + 1.0.2 
4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 1d99229b0..3093284c8 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.2-SNAPSHOT + 1.0.2 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 04be9c20c..b47f84a31 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.2-SNAPSHOT + 1.0.2 4.0.0