Skip to content

Commit 3de9e91

Browse files
committed
数据库结构修改,增加待爬取url的优先度
1 parent 040644b commit 3de9e91

23 files changed

+465
-413
lines changed

.idea/libraries/Maven__com_google_guava_guava_15_0.xml

-13
This file was deleted.

.idea/workspace.xml

+279-343
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ZhihuSpider/sql/zhihu.sql

+24-23
Original file line numberDiff line numberDiff line change
@@ -15,35 +15,36 @@ CREATE TABLE `zhihu_user` (
1515
`modify_time` datetime DEFAULT NULL COMMENT '更新时间',
1616
PRIMARY KEY (`id`),
1717
UNIQUE KEY `character_url` (`character_url`)
18-
) ENGINE=InnoDB DEFAULT CHARSET=utf8
18+
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
1919

2020

2121
#创建知乎文章表#
2222
CREATE TABLE `zhihu_article` (
23-
`id` int(11) NOT NULL AUTO_INCREMENT COMMENT 'id',
24-
`character_url` varchar(50) DEFAULT NULL COMMENT '特征url',
25-
`article_url` varchar(200) DEFAULT NULL COMMENT '文章url',
26-
`article_title` varchar(200) DEFAULT NULL COMMENT '文章标题',
27-
`agrees` int(11) DEFAULT NULL COMMENT '赞同数',
28-
`comments` int(11) DEFAULT NULL COMMENT '评论数',
29-
`create_time` datetime DEFAULT NULL COMMENT '创建时间',
30-
`modify_time` datetime DEFAULT NULL COMMENT '更新时间',
23+
`id` INT(11) NOT NULL AUTO_INCREMENT COMMENT 'id',
24+
`character_url` VARCHAR(50) DEFAULT NULL COMMENT '特征url',
25+
`article_id` INT(11) DEFAULT NULL COMMENT '文章id',
26+
`article_url` VARCHAR(200) DEFAULT NULL COMMENT '文章url',
27+
`article_title` VARCHAR(200) DEFAULT NULL COMMENT '文章标题',
28+
`agrees` INT(11) DEFAULT NULL COMMENT '赞同数',
29+
`comments` INT(11) DEFAULT NULL COMMENT '评论数',
30+
`create_time` DATETIME DEFAULT NULL COMMENT '创建时间',
31+
`modify_time` DATETIME DEFAULT NULL COMMENT '更新时间',
3132
PRIMARY KEY (`id`),
32-
UNIQUE KEY `character_url` (`character_url`)
33-
) ENGINE=InnoDB DEFAULT CHARSET=utf8
33+
UNIQUE KEY `article_id` (`article_id`)
34+
) ENGINE=INNODB DEFAULT CHARSET=utf8;
3435

3536
#创建知乎回答表#
3637
CREATE TABLE `zhihu_answer` (
37-
`id` int(11) NOT NULL AUTO_INCREMENT COMMENT 'id',
38-
`character_url` varchar(50) DEFAULT NULL COMMENT '特征url',
39-
`answer_url` varchar(200) DEFAULT NULL COMMENT '回答url',
40-
`question_id` int(11) DEFAULT NULL COMMENT '问题id',
41-
`question_title` varchar(200) DEFAULT NULL COMMENT '问题标题',
42-
`answer_id` int(11) DEFAULT NULL COMMENT '回答id',
43-
`agrees` int(11) DEFAULT NULL COMMENT '赞同数',
44-
`comments` int(11) DEFAULT NULL COMMENT '评论数',
45-
`create_time` datetime DEFAULT NULL COMMENT '创建时间',
46-
`modify_time` datetime DEFAULT NULL COMMENT '更新时间',
38+
`id` INT(11) NOT NULL AUTO_INCREMENT COMMENT 'id',
39+
`character_url` VARCHAR(50) DEFAULT NULL COMMENT '特征url',
40+
`answer_url` VARCHAR(200) DEFAULT NULL COMMENT '回答url',
41+
`question_id` INT(11) DEFAULT NULL COMMENT '问题id',
42+
`question_title` VARCHAR(200) DEFAULT NULL COMMENT '问题标题',
43+
`answer_id` INT(11) DEFAULT NULL COMMENT '回答id',
44+
`agrees` INT(11) DEFAULT NULL COMMENT '赞同数',
45+
`comments` INT(11) DEFAULT NULL COMMENT '评论数',
46+
`create_time` DATETIME DEFAULT NULL COMMENT '创建时间',
47+
`modify_time` DATETIME DEFAULT NULL COMMENT '更新时间',
4748
PRIMARY KEY (`id`),
48-
UNIQUE KEY `character_url` (`character_url`)
49-
) ENGINE=InnoDB DEFAULT CHARSET=utf8
49+
UNIQUE KEY (`question_id`, `answer_id`)
50+
) ENGINE=INNODB DEFAULT CHARSET=utf8;

ZhihuSpider/src/main/java/cn/mccreefei/zhihu/ZhihuSpider.java

+13-9
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,9 @@
1010
import us.codecraft.webmagic.downloader.PhantomJSDownloader;
1111
import us.codecraft.webmagic.pipeline.Pipeline;
1212
import us.codecraft.webmagic.processor.PageProcessor;
13+
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
1314
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
15+
import us.codecraft.webmagic.scheduler.PriorityScheduler;
1416

1517
import java.util.MissingResourceException;
1618
import java.util.ResourceBundle;
@@ -47,20 +49,22 @@ public void crawl(int threadNum, String... baseUrl){
4749
Spider spider = Spider.create(pageProcessor).addPipeline(pipeline).addUrl(baseUrl)
4850
.setDownloader(simpleSeleniumDownloader)
4951
.thread(threadNum);
50-
try {
51-
String fileCachePath = resource.getString("FILE_CACHE_PATH");
52-
log.info("use FileCacheQueueScheduler and FILE_CACHE_PATH is : " + fileCachePath);
53-
spider.setScheduler(new FileCacheQueueScheduler(fileCachePath));
54-
}catch (MissingResourceException e){
55-
log.info("no FILE_CACHE_PATH founds, default use QueueSchedule");
56-
}
57-
spider.run();
52+
// try {
53+
// String fileCachePath = resource.getString("FILE_CACHE_PATH");
54+
// log.info("use FileCacheQueueScheduler and FILE_CACHE_PATH is : " + fileCachePath);
55+
// spider.setScheduler(new FileCacheQueueScheduler(fileCachePath));
56+
// }catch (MissingResourceException e){
57+
// log.info("no FILE_CACHE_PATH founds, default use QueueSchedule");
58+
// }
59+
// spider.run();
60+
spider.setScheduler(new PriorityScheduler()
61+
.setDuplicateRemover(new BloomFilterDuplicateRemover(10000000))).run();
5862
}
5963

6064
public static void main(String[] args) {
6165
ApplicationContext context = new ClassPathXmlApplicationContext("classpath:/spring/spring-dao.xml");
6266
ZhihuSpider zhihuSpider = context.getBean(ZhihuSpider.class);
63-
zhihuSpider.crawl(3, "https://www.zhihu.com/people/zhou-ruo-yu-99-95/following",
67+
zhihuSpider.crawl(5, "https://www.zhihu.com/people/zhou-ruo-yu-99-95/following",
6468
"https://www.zhihu.com/people/zhou-ruo-yu-99-95/answers/by_votes",
6569
"https://www.zhihu.com/people/zhou-ruo-yu-99-95/posts/posts_by_votes");
6670
}

ZhihuSpider/src/main/java/cn/mccreefei/zhihu/magic/ZhihuPageProcessor.java

+5-4
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import org.springframework.beans.factory.annotation.Autowired;
88
import org.springframework.stereotype.Component;
99
import us.codecraft.webmagic.Page;
10+
import us.codecraft.webmagic.Request;
1011
import us.codecraft.webmagic.Site;
1112
import us.codecraft.webmagic.processor.PageProcessor;
1213

@@ -87,17 +88,17 @@ protected void addTargetUrls(Page page){
8788
int maxPage = Integer.parseInt(pageList.get(pageList.size() - 1));
8889
for (int i = 2; i <= maxPage; i++) {
8990
String pageUrl = sourceUrl + "?page=" + i;
90-
page.addTargetRequest(pageUrl);
91+
page.addTargetRequest(new Request(pageUrl).setPriority(20));
9192
}
9293
}
9394
}
9495
List<String> urlList = page.getHtml().xpath("//div[@id='Profile-following']//div[@class='List-item]" +
9596
"//div[@class='ContentItem-head']//a[@class='UserLink-link]").links().all();
9697
if (urlList != null && urlList.size() > 0) {
9798
for (String url : urlList) {
98-
page.addTargetRequest(url + "/following");
99-
page.addTargetRequest(url + "/answers/by_votes");
100-
page.addTargetRequest(url + "/posts/posts_by_votes");
99+
page.addTargetRequest(new Request(url + "/following").setPriority(100));
100+
page.addTargetRequest(new Request(url + "/answers/by_votes").setPriority(100));
101+
page.addTargetRequest(new Request(url + "/posts/posts_by_votes").setPriority(100));
101102
}
102103
}
103104
}

ZhihuSpider/src/main/java/cn/mccreefei/zhihu/model/ZhihuArticle.java

+8
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
public class ZhihuArticle {
1313
private Integer id;
1414
private String characterUrl;
15+
private Integer articleId;
1516
private String articleUrl;
1617
private String articleTitle;
1718
private Integer agrees;
@@ -21,6 +22,7 @@ public class ZhihuArticle {
2122

2223
public static class ArticleBuilder {
2324
private String characterUrl;
25+
private Integer articleId;
2426
private String articleUrl;
2527
private String articleTitle;
2628
private Integer agrees;
@@ -33,6 +35,11 @@ public ArticleBuilder setCharacterUrl(String characterUrl) {
3335
return this;
3436
}
3537

38+
public ArticleBuilder setArticleId(Integer articleId) {
39+
this.articleId = articleId;
40+
return this;
41+
}
42+
3643
public ArticleBuilder setArticleUrl(String articleUrl) {
3744
this.articleUrl = articleUrl;
3845
return this;
@@ -66,6 +73,7 @@ public ArticleBuilder setModifyTime(Date modifyTime){
6673
public ZhihuArticle build(){
6774
ZhihuArticle article = new ZhihuArticle();
6875
article.setCharacterUrl(characterUrl);
76+
article.setArticleId(articleId);
6977
article.setArticleUrl(articleUrl);
7078
article.setArticleTitle(articleTitle);
7179
article.setAgrees(agrees);

ZhihuSpider/src/main/java/cn/mccreefei/zhihu/parse/ZhihuAnswerParser.java

+2-1
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import lombok.extern.slf4j.Slf4j;
66
import org.springframework.stereotype.Component;
77
import us.codecraft.webmagic.Page;
8+
import us.codecraft.webmagic.Request;
89
import us.codecraft.webmagic.selector.Html;
910

1011
import java.util.Date;
@@ -51,7 +52,7 @@ public void parseAnswerInfo(Page page) {
5152
try {
5253
int maxPage = Integer.parseInt(pageList.get(pageList.size() - 1));
5354
for (int j = 2; j <= maxPage; j++) {
54-
page.addTargetRequest(url + "?page=" + j);
55+
page.addTargetRequest(new Request(url + "?page=" + j).setPriority(50));
5556
}
5657
}catch (Exception e){
5758
log.warn("添加分页url失败!", e);

ZhihuSpider/src/main/java/cn/mccreefei/zhihu/parse/ZhihuArticleParser.java

+24-3
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import lombok.extern.slf4j.Slf4j;
66
import org.springframework.stereotype.Component;
77
import us.codecraft.webmagic.Page;
8+
import us.codecraft.webmagic.Request;
89
import us.codecraft.webmagic.selector.Html;
910

1011
import java.util.Date;
@@ -50,7 +51,7 @@ public void parseArticleInfo(Page page){
5051
try {
5152
int maxPage = Integer.parseInt(pageList.get(pageList.size() - 1));
5253
for (int i = 2; i <= maxPage; i++) {
53-
page.addTargetRequest(url + "?page=" + i);
54+
page.addTargetRequest(new Request(url + "?page=" + i).setPriority(50));
5455
}
5556
}catch (Exception e){
5657
log.warn("添加分页url失败!", e);
@@ -69,15 +70,16 @@ public void parseArticleInfo(Page page){
6970
break;
7071
}
7172

72-
String articleUrl = itemHtml.xpath("//h2[@class='ContentItem-title']").links().get();
73+
String articleUrl = formatArticleUrl(itemHtml.xpath("//h2[@class='ContentItem-title']").links().get());
74+
Integer articleId = getArticleId(articleUrl);
7375
String articleTitle = itemHtml.xpath("//h2[@class='ContentItem-title']//a/text()").get();
7476

7577
String commentsText = itemHtml.xpath("//div[@class='ContentItem-actions']" +
7678
"/button[@class='Button ContentItem-action Button--plain Button--withIcon Button--withLabel']/text()").get();
7779
Integer comments = getComments(commentsText);
7880

7981
ZhihuArticle article = new ZhihuArticle.ArticleBuilder().setCharacterUrl(characterUrl)
80-
.setArticleUrl(articleUrl).setArticleTitle(articleTitle).setAgrees(agrees)
82+
.setArticleUrl(articleUrl).setArticleTitle(articleTitle).setAgrees(agrees).setArticleId(articleId)
8183
.setComments(comments).setCreateTime(new Date()).setModifyTime(new Date()).build();
8284

8385
page.putField("article", article);
@@ -107,6 +109,25 @@ private Integer getComments(String commentsText){
107109
return result;
108110
}
109111

112+
private String formatArticleUrl(String url){
113+
int index = url.indexOf("//");
114+
if (index != -1){
115+
return url.replace("//", "https://");
116+
}
117+
return url;
118+
}
119+
120+
private Integer getArticleId(String articleUrl){
121+
int index = articleUrl.indexOf("/p/");
122+
Integer result = null;
123+
try {
124+
result = Integer.valueOf(articleUrl.substring(index + 3));
125+
}catch (Exception e){
126+
log.warn("parse article id failed! url is " + articleUrl, e);
127+
}
128+
return result;
129+
}
130+
110131
private String getCharacterUrl(String url){
111132
int startIndex = url.indexOf("people/") + 7;
112133
if (startIndex < 7){

ZhihuSpider/src/main/resources/db.properties

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#mysql host
22
jdbc.host = 127.0.0.1
33
#数据库名
4-
jdbc.database = zhihu
4+
jdbc.database = zhihu_test
55
#用户名
66
jdbc.username = root
77
#密码

ZhihuSpider/src/main/resources/mybatis/ZhihuAnswerMapper.xml

-9
Original file line numberDiff line numberDiff line change
@@ -65,18 +65,9 @@
6565

6666
ON DUPLICATE KEY UPDATE
6767
<trim suffixOverrides=",">
68-
<if test="answerUrl != null">
69-
answer_url = #{answerUrl},
70-
</if>
71-
<if test="questionId != null">
72-
question_id = #{questionId},
73-
</if>
7468
<if test="questionTitle != null">
7569
question_title = #{questionTitle},
7670
</if>
77-
<if test="answerId != null">
78-
answer_id = #{answerId},
79-
</if>
8071
<if test="agrees != null">
8172
agrees = #{agrees},
8273
</if>

ZhihuSpider/src/main/resources/mybatis/ZhihuArticleMapper.xml

+6-3
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@
77
<if test="characterUrl != null">
88
character_url,
99
</if>
10+
<if test="articleId != null">
11+
article_id,
12+
</if>
1013
<if test="articleTitle != null">
1114
article_title,
1215
</if>
@@ -31,6 +34,9 @@
3134
<if test="characterUrl != null">
3235
#{characterUrl},
3336
</if>
37+
<if test="articleId != null">
38+
#{articleId},
39+
</if>
3440
<if test="articleTitle != null">
3541
#{articleTitle},
3642
</if>
@@ -62,9 +68,6 @@
6268
<if test="comments != null">
6369
comments = #{comments},
6470
</if>
65-
<if test="articleUrl != null">
66-
article_url = #{articleUrl},
67-
</if>
6871
<if test="modifyTime != null">
6972
modify_time = #{modifyTime},
7073
</if>

ZhihuSpider/src/test/java/cn/mccreefei/zhihu/DaoTest.java

+7-1
Original file line numberDiff line numberDiff line change
@@ -64,11 +64,14 @@ public void testAddAnswer(){
6464
answer.setComments(1500);
6565
answer.setAgrees(2000);
6666
assertEquals(answerDao.addZhihuAnswer(answer), 2);
67+
68+
answer.setQuestionId(1500);
69+
assertEquals(answerDao.addZhihuAnswer(answer), 1);
6770
}
6871

6972
@Test
7073
public void testAddArticle(){
71-
ZhihuArticle article = new ZhihuArticle.ArticleBuilder().setCharacterUrl("MccreeFei")
74+
ZhihuArticle article = new ZhihuArticle.ArticleBuilder().setCharacterUrl("MccreeFei").setArticleId(1000)
7275
.setArticleTitle("how to be rich?").setArticleUrl("www.google.com").setAgrees(2000)
7376
.setComments(1500).setCreateTime(new Date()).build();
7477

@@ -82,5 +85,8 @@ public void testAddArticle(){
8285
article.setAgrees(1500);
8386
article.setComments(1000);
8487
assertEquals(articleDao.addZhihuArticle(article), 2);
88+
89+
article.setArticleId(1500);
90+
assertEquals(articleDao.addZhihuArticle(article), 1);
8591
}
8692
}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

ZhihuSpider/target/classes/mybatis/ZhihuAnswerMapper.xml

+23-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
33
<mapper namespace="cn.mccreefei.zhihu.dao.ZhihuAnswerDao">
44
<insert id="addZhihuAnswer" parameterType="cn.mccreefei.zhihu.model.ZhihuAnswer">
5-
insert ignore into zhihu_answer
5+
insert into zhihu_answer
66
<trim prefix="(" suffix=")" suffixOverrides=",">
77
<if test="characterUrl != null">
88
character_url,
@@ -28,6 +28,9 @@
2828
<if test="createTime != null">
2929
create_time,
3030
</if>
31+
<if test="modifyTime != null">
32+
modify_time,
33+
</if>
3134
</trim>
3235

3336
<trim prefix="values(" suffix=")" suffixOverrides=",">
@@ -55,6 +58,25 @@
5558
<if test="createTime != null">
5659
#{createTime},
5760
</if>
61+
<if test="modifyTime != null">
62+
#{modifyTime},
63+
</if>
64+
</trim>
65+
66+
ON DUPLICATE KEY UPDATE
67+
<trim suffixOverrides=",">
68+
<if test="questionTitle != null">
69+
question_title = #{questionTitle},
70+
</if>
71+
<if test="agrees != null">
72+
agrees = #{agrees},
73+
</if>
74+
<if test="comments != null">
75+
comments = #{comments},
76+
</if>
77+
<if test="modifyTime != null">
78+
modify_time = #{modifyTime},
79+
</if>
5880
</trim>
5981
</insert>
6082

0 commit comments

Comments
 (0)