Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

解决中文路径乱码问题 #12

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<junit.version>4.11</junit.version>
<webmagic.version>0.6.1</webmagic.version>
<webmagic.version>0.7.3</webmagic.version>
<log4j.version>2.6.2</log4j.version>
<slf4j.version>1.7.21</slf4j.version>
<elasticsearch.version>5.0.1</elasticsearch.version>
Expand Down
Original file line number Diff line number Diff line change
@@ -1,17 +1,19 @@
package com.brianway.webporter.collector.zhihu;

import java.io.UnsupportedEncodingException;

import com.brianway.webporter.configure.BasicConfiguration;

public class ZhihuConfiguration extends BasicConfiguration {

public static final String SUBDIR_MEMBER = "member/";
public static final String SUBDIR_FOLLOWEE = "followee/";

public ZhihuConfiguration(String path) {
public ZhihuConfiguration(String path) throws UnsupportedEncodingException {
super(path);
}

public ZhihuConfiguration() {
public ZhihuConfiguration() throws Exception{

}

Expand All @@ -31,7 +33,7 @@ public String getFolloweeDataPath() {
return getFolloweePath() + site.getDomain() + "/";
}

public static void main(String[] args) {
public static void main(String[] args) throws Exception{
ZhihuConfiguration configuration = new ZhihuConfiguration();
System.out.println(configuration.getSite());
System.out.println(configuration.getBaseDir());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,19 @@
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.monitor.SpiderMonitor;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
import us.codecraft.webmagic.selector.Json;

import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;

import javax.management.MBeanServer;

/**
* Created by brian on 16/11/24.
*
Expand All @@ -21,7 +26,11 @@
*/
public class ZhihuFolloweePageProcessor implements PageProcessor {

private Site site = new ZhihuConfiguration().getSite();
private Site site;

/**
 * Creates the processor and takes its {@link Site} settings from a freshly
 * constructed {@link ZhihuConfiguration}.
 *
 * @throws Exception if the {@link ZhihuConfiguration} cannot be constructed
 */
public ZhihuFolloweePageProcessor() throws Exception {
    ZhihuConfiguration configuration = new ZhihuConfiguration();
    site = configuration.getSite();
}

public void process(Page page) {
Json json = page.getJson();
Expand All @@ -45,7 +54,7 @@ public Site getSite() {

public static String generateFolloweeUrl(String urlToken) {
final String URL_TEMPLATE = "https://www.zhihu.com/api/v4/members/%s/followees";
final String QUERY_PARAMS = "?include=data%5B*%5D.url_token&offset=0&per_page=30&limit=30";
final String QUERY_PARAMS = "?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=0&limit=20";

String encoded = StringHelper.urlEncode(urlToken);
return String.format(URL_TEMPLATE, encoded) + QUERY_PARAMS;
Expand All @@ -61,9 +70,11 @@ public static List<String> generateFolloweeUrls(List<String> urlTokens) {
* 下载关注列表的用户数据,用于提取 url_tokens
* @param args 无须其他参数
*/
public static void main(String[] args) {
public static void main(String[] args) throws Exception {
String pipelinePath = new ZhihuConfiguration().getFolloweePath();
int crawlSize = 100_0000;
System.out.println(pipelinePath);
int crawlSize = 100_0000;//下划线使数据更清晰
System.out.println(crawlSize);
Spider.create(new ZhihuFolloweePageProcessor())
.setScheduler(//new QueueScheduler()
new FileCacheQueueScheduler(pipelinePath)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,12 @@
*/
public class ZhihuMemberPageProcessor implements PageProcessor {

private Site site = new ZhihuConfiguration().getSite();
private Site site;

/**
 * Creates the processor and takes its {@link Site} settings from a freshly
 * constructed {@link ZhihuConfiguration}.
 *
 * @throws Exception if the {@link ZhihuConfiguration} cannot be constructed
 */
public ZhihuMemberPageProcessor() throws Exception {
    // The field has no initializer, so the looked-up Site must be assigned
    // here; discarding the result would leave site null.
    this.site = new ZhihuConfiguration().getSite();
}

public void process(Page page) {
page.putField(ZhihuPipeline.URL, page.getUrl());
page.putField(ZhihuPipeline.RESPONSE, page.getRawText());
Expand All @@ -39,8 +43,9 @@ private static String generateMemberUrl(String urlToken) {
/**
* 根据提取的 url_token 逐个下载用户的完整信息
* @param args 无须其他参数
* @throws Exception
*/
public static void main(String[] args) {
public static void main(String[] args) throws Exception {
ZhihuConfiguration configuration = new ZhihuConfiguration();
String pipelinePath = configuration.getMemberPath();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,16 @@ public class MemberURLTokenGenerator implements DataProcessor<File, String> {

public final static String URLTOKEN_FILENAME = "url_tokens";

private final static String DEFAULT_FOLDER = new ZhihuConfiguration().getFolloweeDataPath();
private final static String DEFAULT_PATH = new ZhihuConfiguration().getFolloweePath() + URLTOKEN_FILENAME;
private static String DEFAULT_FOLDER;
private static String DEFAULT_PATH;

private String folder;
private String path;

public MemberURLTokenGenerator() {
/**
 * Creates a generator that uses the default followee data folder and the
 * default url_tokens file path taken from {@link ZhihuConfiguration}.
 *
 * <p>The defaults are loaded while the arguments of the delegating
 * constructor call are evaluated, so they are already populated when the
 * two-argument constructor runs (statements cannot precede {@code this(...)}).
 *
 * @throws Exception if the {@link ZhihuConfiguration} cannot be constructed
 */
public MemberURLTokenGenerator() throws Exception {
    this(loadDefaultFolder(), loadDefaultPath());
}

/** Refreshes and returns DEFAULT_FOLDER from a new ZhihuConfiguration. */
private static String loadDefaultFolder() throws Exception {
    DEFAULT_FOLDER = new ZhihuConfiguration().getFolloweeDataPath();
    return DEFAULT_FOLDER;
}

/** Refreshes and returns DEFAULT_PATH (followee path + URLTOKEN_FILENAME). */
private static String loadDefaultPath() throws Exception {
    DEFAULT_PATH = new ZhihuConfiguration().getFolloweePath() + URLTOKEN_FILENAME;
    return DEFAULT_PATH;
}

/**
Expand Down Expand Up @@ -108,7 +110,7 @@ private Set<String> getURLTokens() {
return new HashSet<>(tokens);
}

public static void main(String[] args) {
public static void main(String[] args) throws Exception {
MemberURLTokenGenerator generator = new MemberURLTokenGenerator();
generator.generateURLTokens().stream()
.forEach(System.out::println);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ public static String readMember(File inItem) {
return followees.size() == 0 ? null : followees.get(0);
}

public static void main(String[] args) {
public static void main(String[] args) throws Exception {
ZhihuConfiguration configuration = new ZhihuConfiguration();
String folder = configuration.getMemberDataPath();
DataProcessor<File, Document> processor = new ZhihuMemberDataProcessor();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import java.util.concurrent.TimeUnit;

public class FolloweeUploader {
public static void upload() {
public static void upload() throws Exception {
String index = "zhihu";
String type = "followee";
ZhihuConfiguration configuration = new ZhihuConfiguration();
Expand All @@ -31,7 +31,7 @@ public static void upload() {
System.out.println(outPipeline.getBulkProcessor());
}

public static void main(String[] args) {
public static void main(String[] args) throws Exception {
FolloweeUploader.upload();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* step 3: 将用户数据导入 Elasticsearch
*/
public class MemberUploader {
public static void upload() {
public static void upload() throws Exception {
String index = "zhihu";
String type = "member";
ZhihuConfiguration configuration = new ZhihuConfiguration();
Expand All @@ -34,7 +34,7 @@ public static void upload() {
System.out.println(outPipeline.getBulkProcessor());
}

public static void main(String[] args) {
public static void main(String[] args) throws Exception {
MemberUploader.upload();
}
}
9 changes: 5 additions & 4 deletions webporter-collector-zhihu/src/main/resources/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@
"site": {
"domain": "www.zhihu.com",
"headers": {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36",
"authorization": "Your own authorization here."
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36",
"X-UDID": "APCvmFgTjA2PTpjFsTjc15kWKMlBMJJ7Vag=",
"Cookie": "_zap=d35756d6-8530-4512-934a-d4ac74000ee4; __utma=155987696.492403388.1517840990.1517840990.1517840990.1; __utmz=155987696.1517840990.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); d_c0=\"APCvmFgTjA2PTpjFsTjc15kWKMlBMJJ7Vag=|1525518611\"; __DAYU_PP=MMUf3RbMn3ee2mzUNvQJffffffff83b23b7b6aed; q_c1=25af6ae0238e4f39b484eaaf92d1d807|1527508465000|1515585280000; _xsrf=7add93e5-45c4-4733-abdb-d6774ad16ef3; capsion_ticket=\"2|1:0|10:1528022290|14:capsion_ticket|44:YjdjODlhMjRhMWI0NDQzNmI2MjM5YWI0OGFkY2VlZDE=|150deeeec740cbb8c8b21e0b4a41090dc5058800cf4dadf33ddadee2de1bfe55\"; z_c0=\"2|1:0|10:1528022298|4:z_c0|92:Mi4xSU00OUFBQUFBQUFBOEstWVdCT01EU1lBQUFCZ0FsVk5HaE1CWEFCSUg2VmF6RTlCZTQwQTJHc1V0ZTdKR3YzaGtR|2633e75b1b77854a586f118a03068256a1506fb3941446559dea06015d58885e\"; tgw_l7_route=7139e401481ef2f46ce98b22af4f4bed"
},
"retryTimes": 3,
"sleepTime": 500
"sleepTime": 10000
},
"base_dir": "/Users/brian/todo/zhihu-crawl/"
"base_dir": "F:/KingWang/crawler"
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
package com.brianway.webporter.collector;

import com.brianway.webporter.collector.zhihu.ZhihuConfiguration;

import java.io.UnsupportedEncodingException;

import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
Expand All @@ -22,7 +25,7 @@ public void init() {
}

@Test
public void testConfig() {
public void testConfig() throws Exception {
String memberPath = baseDir + ZhihuConfiguration.SUBDIR_MEMBER;
ZhihuConfiguration configuration = new ZhihuConfiguration();
Assert.assertEquals(baseDir, configuration.getBaseDir());
Expand All @@ -31,7 +34,7 @@ public void testConfig() {
}

@Test
public void testConfigByPath() {
public void testConfigByPath() throws UnsupportedEncodingException {
String followeePath = baseDir + ZhihuConfiguration.SUBDIR_FOLLOWEE;
String path = rootDir + "another-config.json";
ZhihuConfiguration configuration = new ZhihuConfiguration(path);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
package com.brianway.webporter.configure;

import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;

import com.brianway.webporter.util.FileHelper;

public abstract class AbstractConfiguration {
Expand All @@ -9,11 +12,12 @@ public abstract class AbstractConfiguration {

protected String config;

protected AbstractConfiguration() {
protected AbstractConfiguration() throws UnsupportedEncodingException {
this(DEFAULT_CONFIG_DIR + DEFAULT_CONFIG_FILE);
}

protected AbstractConfiguration(String path) {
/**
 * Loads the configuration text from the given path and resolves it.
 *
 * @param path location of the configuration file; it is URL-decoded as UTF-8
 *             before being read
 * @throws UnsupportedEncodingException declared by {@link URLDecoder#decode(String, String)}
 */
protected AbstractConfiguration(String path) throws UnsupportedEncodingException {
// Undo percent-encoding in the path (presumably paths obtained from
// Class.getResource(...).getPath() with non-ASCII characters -- confirm).
// NOTE(review): URLDecoder also converts '+' to a space, so a path that
// contains a literal '+' would be altered here -- confirm this is acceptable.
path = URLDecoder.decode(path,"UTF-8");
// Read the whole file content into the config field.
config = FileHelper.getRawText(path);
// Let the subclass interpret the loaded text. This runs before subclass
// constructors have finished, so resolve() must not rely on subclass state
// set up later.
resolve();
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package com.brianway.webporter.configure;

import java.io.UnsupportedEncodingException;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import us.codecraft.webmagic.Site;
Expand All @@ -9,11 +11,11 @@ public class BasicConfiguration extends AbstractConfiguration {

protected String baseDir;

public BasicConfiguration(String path) {
public BasicConfiguration(String path) throws UnsupportedEncodingException {
super(path);
}

public BasicConfiguration() {
public BasicConfiguration() throws Exception{
}

@Override
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package com.brianway.webporter.configure;

import java.io.UnsupportedEncodingException;

import com.alibaba.fastjson.JSON;
import us.codecraft.webmagic.Site;

Expand All @@ -11,19 +13,19 @@ public Site getSite() {
return site;
}

public SiteConfiguration(String path) {
public SiteConfiguration(String path) throws UnsupportedEncodingException {
super(path);
}

public SiteConfiguration() {
public SiteConfiguration() throws Exception{

}

/**
 * Parses the loaded {@code config} text as JSON into a {@link Site} and
 * stores the result in {@code site}.
 */
protected void resolve() {
    Site parsedSite = JSON.parseObject(config, Site.class);
    site = parsedSite;
}

public static void main(String[] args) {
public static void main(String[] args) throws Exception {
SiteConfiguration siteConfiguration = new SiteConfiguration();
Site site = siteConfiguration.getSite();
System.out.println(site);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
package com.brianway.webporter;

import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;

import org.junit.BeforeClass;

public class BaseTest {
protected static String rootDir;

@BeforeClass
public static void init() {
/**
 * Sets {@code rootDir} to the URL-decoded path of the classpath root resource.
 *
 * @throws UnsupportedEncodingException declared by {@link URLDecoder#decode(String, String)}
 */
public static void init() throws UnsupportedEncodingException {
// Path of the "/" resource as reported by the class loader's URL.
rootDir = BaseTest.class.getResource("/").getPath();
// Decode percent-escapes as UTF-8 so the value is a plain path string.
rootDir = URLDecoder.decode(rootDir,"UTF-8");
}

}
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
package com.brianway.webporter.configure;

import com.brianway.webporter.BaseTest;

import java.io.UnsupportedEncodingException;

import org.junit.Assert;
import org.junit.Test;
import us.codecraft.webmagic.Site;

public class BasicConfigurationTest extends BaseTest {
@Test
public void testGetConfiguration() {
public void testGetConfiguration() throws UnsupportedEncodingException {
String configPath = rootDir + "basic-config.json";
int retryTimes = 3;
String domain = "www.zhihu.com";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
public class SiteConfigurationTest {

@Test
public void testGetConfiguredSite() {
public void testGetConfiguredSite() throws Exception {
SiteConfiguration siteConfiguration = new SiteConfiguration();
Site site = siteConfiguration.getSite();
String key = "AAA";
Expand Down