diff --git a/README.md b/README.md index 89536c927..2af81cb22 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ WebMagic use slf4j with slf4j-log4j12 implementation. If you customized your slf ### First crawler: -Write a class implements PageProcessor. For example, I wrote a crawler of github repository infomation. +Write a class implements PageProcessor. For example, I wrote a crawler of github repository information. ```java public class GithubRepoPageProcessor implements PageProcessor { @@ -112,7 +112,7 @@ public class GithubRepo { Documents: [http://webmagic.io/docs/](http://webmagic.io/docs/) -The architecture of webmagic (refered to [Scrapy](http://scrapy.org/)) +The architecture of webmagic (referred to [Scrapy](http://scrapy.org/)) ![image](http://code4craft.github.io/images/posts/webmagic.png) diff --git a/pom.xml b/pom.xml index d0abd3568..af04c6917 100644 --- a/pom.xml +++ b/pom.xml @@ -12,7 +12,7 @@ 2.2.1 us.codecraft - 1.0.1 + 1.0.2 pom UTF-8 diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 52cd7ba2c..f436bce26 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.1 + 1.0.2 4.0.0 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index e8c75ccf1..18486f7a9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -52,9 +52,44 @@ public class Page { private String charset; + /** + * Returns a {@link Page} with {@link #downloadSuccess} is {@code true}, + * and {@link #request} is specified. + * + * @param request the request. + * @since 1.0.2 + */ + public static Page ofSuccess(Request request) { + return new Page(request, true); + } + + /** + * Returns a {@link Page} with {@link #downloadSuccess} is {@code false}, + * and {@link #request} is specified. + * + * @param request the request. 
+ * @since 1.0.2 + */ + public static Page ofFailure(Request request) { + return new Page(request, false); + } + public Page() { } + /** + * Constructs a {@link Page} with {@link #request} + * and {@link #downloadSuccess} specified. + * + * @param request the request. + * @param downloadSuccess the download success flag. + * @since 1.0.2 + */ + private Page(Request request, boolean downloadSuccess) { + this.request = request; + this.downloadSuccess = downloadSuccess; + } + /** * Returns a {@link Page} with {@link #downloadSuccess} is {@code false}. * @@ -73,7 +108,9 @@ public static Page fail() { * @param request the {@link Request}. * @return the page. * @since 0.10.0 + * @deprecated Use {@link #ofFailure(Request)} instead. */ + @Deprecated(since = "1.0.2", forRemoval = true) public static Page fail(Request request){ Page page = new Page(); page.setRequest(request); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 39deecc73..6fdae38d8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -76,13 +76,14 @@ public Page download(Request request, Task task) { CloseableHttpClient httpClient = getHttpClient(task.getSite()); Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(request, task) : null; HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy); - Page page = Page.fail(request); + Page page = null; try { httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext()); page = handleResponse(request, request.getCharset() != null ? 
request.getCharset() : task.getSite().getCharset(), httpResponse, task); onSuccess(page, task); return page; } catch (IOException e) { + page = Page.ofFailure(request); onError(page, task, e); return page; } finally { @@ -105,7 +106,7 @@ protected Page handleResponse(Request request, String charset, HttpResponse http HttpEntity entity = httpResponse.getEntity(); byte[] bytes = entity != null ? IOUtils.toByteArray(entity.getContent()) : new byte[0]; String contentType = entity != null && entity.getContentType() != null ? entity.getContentType().getValue() : null; - Page page = new Page(); + Page page = Page.ofSuccess(request); page.setBytes(bytes); if (!request.isBinaryContent()) { if (charset == null) { @@ -117,7 +118,6 @@ protected Page handleResponse(Request request, String charset, HttpResponse http page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); page.setStatusCode(httpResponse.getStatusLine().getStatusCode()); - page.setDownloadSuccess(true); if (responseHeader) { page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders())); } diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index 98db3f826..6265abae5 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -10,7 +10,7 @@ us.codecraft webmagic - 1.0.1 + 1.0.2 webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 1fe18e066..a1c26d212 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.1 + 1.0.2 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 76105d330..2c2b34ef6 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.1 + 1.0.2 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index c206d21a2..37349a419 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.1 + 1.0.2 4.0.0 diff 
--git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 123ac6699..3093284c8 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.1 + 1.0.2 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index d09deef50..b47f84a31 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic - 1.0.1 + 1.0.2 4.0.0