Skip to content

Commit

Permalink
Merge branch 'release/1.0.3'
Browse files — browse the repository at this point in the history
  • Loading branch information
sutra committed Feb 10, 2025
2 parents 1cd199b + f4a8825 commit cfae008
Show file tree
Hide file tree
Showing 15 changed files with 30 additions and 12 deletions.
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ recommend that a file or class name and description of purpose be included on
the same "printed page" as the copyright notice for easier identification within
third-party archives.

Copyright 2013 code4craft
Copyright 2025 code4craft

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand Down
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
<version>2.2.1</version>
</parent>
<groupId>us.codecraft</groupId>
<version>1.0.2</version>
<version>1.0.3</version>
<packaging>pom</packaging>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
Expand Down
2 changes: 1 addition & 1 deletion webmagic-core/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic</artifactId>
<version>1.0.2</version>
<version>1.0.3</version>
</parent>
<modelVersion>4.0.0</modelVersion>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
*/
public class HttpClientGenerator {

private transient Logger logger = LoggerFactory.getLogger(getClass());
private Logger logger = LoggerFactory.getLogger(getClass());

private PoolingHttpClientConnectionManager connectionManager;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,11 @@ public Selectable smartContent() {
return select(smartContentSelector, getSourceTexts());
}

/**
 * Applies a smart-content selector built with the given threshold to this
 * object's source texts.
 *
 * @param threshold value passed to {@code Selectors.smartContent(int)} when
 *                  creating the selector
 * @return the result of selecting over {@code getSourceTexts()} with that selector
 */
public Selectable smartContent(int threshold) {
    // Build the selector inline; it is only needed for this single call.
    return select(Selectors.smartContent(threshold), getSourceTexts());
}

@Override
public Selectable links() {
return selectElements(new LinksSelector());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@ public static SmartContentSelector smartContent() {
return new SmartContentSelector();
}

/**
 * Creates a {@link SmartContentSelector} configured with the given threshold.
 *
 * @param threshold value forwarded unchanged to the selector's constructor
 * @return a new selector instance
 */
public static SmartContentSelector smartContent(int threshold) {
return new SmartContentSelector(threshold);
}

/**
 * Creates a {@link CssSelector} for the given expression.
 *
 * @param expr expression forwarded unchanged to the {@code CssSelector} constructor
 * @return a new selector instance
 */
public static CssSelector $(String expr) {
return new CssSelector(expr);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,15 @@
@Experimental
public class SmartContentSelector implements Selector {

private int threshold = 86;

/**
 * Creates a selector that keeps the field's initial threshold value.
 */
public SmartContentSelector() {
}

/**
 * Creates a selector whose threshold is set to the supplied value instead of
 * the field's initial value.
 *
 * @param threshold value stored in this selector's {@code threshold} field;
 *                  it is stored as given, with no range check
 */
public SmartContentSelector(int threshold) {
this.threshold = threshold;
}

@Override
public String select(String html) {
html = html.replaceAll("(?is)<!DOCTYPE.*?>", "");
Expand All @@ -29,7 +35,6 @@ public String select(String html) {
html = html.replaceAll("(?is)<.*?>", "");
List<String> lines;
int blocksWidth =3;
int threshold =86;
int start;
int end;
StringBuilder text = new StringBuilder();
Expand Down
2 changes: 1 addition & 1 deletion webmagic-coverage/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic</artifactId>
<version>1.0.2</version>
<version>1.0.3</version>
</parent>

<artifactId>webmagic-coverage</artifactId>
Expand Down
2 changes: 1 addition & 1 deletion webmagic-extension/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic</artifactId>
<version>1.0.2</version>
<version>1.0.3</version>
</parent>
<modelVersion>4.0.0</modelVersion>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.utils.HttpConstant;

import java.io.*;

Expand Down Expand Up @@ -96,7 +97,7 @@ public Page download(Request request, Task task) {
page.setRawText(content);
page.setUrl(new PlainText(request.getUrl()));
page.setRequest(request);
page.setStatusCode(200);
page.setStatusCode(HttpConstant.StatusCode.CODE_200);
}
onSuccess(page, task);
} catch (Exception e) {
Expand Down
2 changes: 1 addition & 1 deletion webmagic-samples/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic</artifactId>
<version>1.0.2</version>
<version>1.0.3</version>
</parent>
<modelVersion>4.0.0</modelVersion>

Expand Down
2 changes: 1 addition & 1 deletion webmagic-saxon/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic</artifactId>
<version>1.0.2</version>
<version>1.0.3</version>
</parent>
<modelVersion>4.0.0</modelVersion>

Expand Down
2 changes: 1 addition & 1 deletion webmagic-scripts/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic</artifactId>
<version>1.0.2</version>
<version>1.0.3</version>
</parent>
<modelVersion>4.0.0</modelVersion>

Expand Down
2 changes: 1 addition & 1 deletion webmagic-selenium/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic</artifactId>
<version>1.0.2</version>
<version>1.0.3</version>
</parent>
<modelVersion>4.0.0</modelVersion>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,11 @@
import us.codecraft.webmagic.downloader.AbstractDownloader;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.utils.HttpConstant;

import java.io.Closeable;
import java.io.IOException;
import java.net.http.HttpRequest;
import java.util.Map;

/**
Expand Down Expand Up @@ -111,6 +113,7 @@ public Page download(Request request, Task task) {
page.setHtml(new Html(content, request.getUrl()));
page.setUrl(new PlainText(request.getUrl()));
page.setRequest(request);
page.setStatusCode(HttpConstant.StatusCode.CODE_200);
onSuccess(page, task);
} catch (Exception e) {
logger.warn("download page {} error", request.getUrl(), e);
Expand Down

0 comments on commit cfae008

Please sign in to comment.