forked from satrong/node-crawler
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrawler.js
327 lines (307 loc) · 11.3 KB
/
crawler.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
/// Dependencies
var fs = require('fs');
var request = require("request");
var cheerio = require("cheerio");
var mkdirp = require('mkdirp');
var iconv = require('iconv-lite');
var async = require('async');
var color = require('./color.js');
var path = require('path');
var URL = require('url');
/// Configuration file
var config = require('./config.js');
/// Start URL. For paginated targets this is a function that maps a page number
/// to a URL by substituting it for the first '%%' in config.url; otherwise it
/// is config.url itself.
var rooturl = config.isPagination
    ? function (i) { return config.url.replace('%%', i); }
    : config.url;

/// Site origin ("scheme://host[:port]") of the crawl target.
/// Taken from the parsed URL so that hosts without a dot (e.g. localhost) are
/// handled even when the path contains one; the original pattern match is kept
/// as a fallback for URLs the parser cannot split into protocol and host.
var parsedRootUrl = URL.parse(config.url);
var rootsite = (parsedRootUrl.protocol && parsedRootUrl.host)
    ? parsedRootUrl.protocol + '//' + parsedRootUrl.host
    : config.url.match(/[^\.]+[^/]+/)[0];

/// Host name of the target.
var hostname = URL.parse(rootsite).hostname;

console.log(color('blueBG', 2), '抓取对象:', rootsite);
/// Crawler constructor.
/// Stores the first (`from`) and last (`to`) page number on the instance,
/// using 1 whenever the corresponding config value is missing or falsy.
var Crawler = function () {
    var firstPage = config.from;
    this.from = firstPage ? firstPage : 1;

    var lastPage = config.to;
    this.to = lastPage ? lastPage : 1;
};
/// Entry point of a crawl.
///
/// Walks config.selector one level at a time:
///   - selector 0 is run against the start page(s) (every page from `from` to
///     `to` when config.isPagination is set, otherwise the single root URL);
///   - every following selector except the last is run against the pages
///     collected by the level before it;
///   - at the last selector the pages collected by the previous level are
///     handed to the handler named by config.type (`text` or `image`).
///
/// NOTE(review): selector expressions (`item.$`) are executed with eval(), so
/// config.js must only ever contain trusted content. `$`, `item`, `index` and
/// (while paginating) `i` are in scope for those expressions.
Crawler.prototype.crawl = function () {
    var that = this;
    var urlLevels = []; /// urlLevels[n]: absolute URLs harvested by selector n
    var lastLevel = config.selector.length - 1;
    var level = -1; /// position of the selector currently being processed

    /// Appends `attrName` of every matched element to `bucket` as an absolute URL.
    /// Elements without the attribute are skipped; relative values are resolved
    /// against the page they were found on.
    function collect($, matched, attrName, pageUrl, bucket) {
        matched.each(function () {
            var link = $(this).attr(attrName);
            if (link) {
                bucket.push(URL.resolve(pageUrl, link));
            }
        });
    }

    /// Start page URLs; used when the configuration has only one level, in
    /// which case the start pages are themselves the content pages.
    function startUrls() {
        if (!config.isPagination) {
            return [rooturl];
        }
        var pages = [];
        for (var page = that.from; page <= that.to; page++) {
            pages.push(rooturl(page));
        }
        return pages;
    }

    that.log('程序正在执行中...');

    /// The number of selectors determines how many levels the site has.
    async.eachSeries(config.selector, function (item, callback) {
        /// Levels are visited strictly in order, so a counter gives the index
        /// even if the same selector object appears twice in the config.
        var index = ++level;

        /// Last level: the URL hierarchy is complete, hand over to the handler.
        if (index === lastLevel) {
            callback(null);
            if (!config.type) {
                that.log('您没有配置参数type,参数值:text|image', 'redBG');
            } else if (typeof that[config.type] !== 'function') {
                that.log('参数type值无效,参数值:text|image', 'redBG');
            } else {
                that[config.type](index === 0 ? startUrls() : urlLevels[index - 1]);
            }
            return;
        }

        /// First level: harvest links from the start page(s).
        if (index === 0) {
            urlLevels[0] = [];

            if (!config.isPagination) {
                that.request(rooturl, function (status, $) {
                    if (status) {
                        collect($, eval(item.$), item.attr, rooturl, urlLevels[0]);
                    } else {
                        that.log(rooturl + '请求失败', 'red');
                    }
                    callback(null);
                });
                return;
            }

            /// Use the instance's page range so the constructor defaults apply.
            var i = that.from;
            async.whilst(function () {
                return i <= that.to;
            }, function (nextPage) {
                var pageUrl = rooturl(i);
                that.request(pageUrl, function (status, $) {
                    if (status) {
                        collect($, eval(item.$), item.attr, pageUrl, urlLevels[0]);
                        that.log('第' + i + '页分析完成');
                    } else {
                        that.log(pageUrl + '请求失败', 'red');
                    }
                    /// Random pause (under 2 s) before moving to the next page.
                    setTimeout(function () {
                        ++i;
                        nextPage(null);
                    }, Math.floor(Math.random() * 2000));
                });
            }, function (err) {
                if (err) {
                    that.log(err, 'red');
                } else {
                    var show_txt = '';
                    if (config.type === 'image') {
                        show_txt = '套图片';
                    } else if (config.type === 'text') {
                        show_txt = '篇文章';
                    }
                    that.log('分页处理完成,共收集到了' + urlLevels[0].length + show_txt, 'green');
                }
                callback(null);
            });
            return;
        }

        /// Intermediate level: apply this level's selector to every page found
        /// by the previous level.
        urlLevels[index] = [];
        async.eachSeries(urlLevels[index - 1], function (pageUrl, nextUrl) {
            that.request(pageUrl, function (status, $) {
                if (status) {
                    collect($, eval(item.$), item.attr, pageUrl, urlLevels[index]);
                } else {
                    that.log(pageUrl + '请求失败', 'red');
                }
                nextUrl(null);
            });
        }, function () {
            callback(null);
        });
    }, function (err) {
        if (err) {
            that.log(err, 'red');
        } else {
            that.log('层级地址完成', 'green');
        }
    });
};
/// Text handler: saves the text matched by the last selector of every page as
/// "<saveDir>/<hostname>/<page title>.txt".
///
/// urls: {Array} page URLs to fetch; a missing value is treated as an empty list.
///
/// Pages are fetched one at a time with a random pause (under 2 s) in between.
/// Existing files are never overwritten (write flag 'wx').
///
/// NOTE(review): the last selector expression is executed with eval(); `$` is
/// in scope for it. config.js must be trusted.
Crawler.prototype.text = function (urls) {
    var that = this;
    var pages = urls || [];
    var count = pages.length;
    var saveDir = path.join(config.saveDir, hostname);
    var last = config.selector[config.selector.length - 1];
    var i = 0;

    /// Continues with `next` after a random delay below two seconds.
    function pause(next) {
        setTimeout(next, Math.floor(Math.random() * 2000));
    }

    that.log('抓取文本中...');
    mkdirp(saveDir, function (err) {
        if (err) {
            that.log('创建目录失败', 'red');
            /// NOTE(review): exits with status 0 even though this is a failure;
            /// kept as-is because a parent process may depend on it.
            process.exit(0);
            return;
        }

        async.whilst(function () {
            return i < count;
        }, function (callback) {
            var uri = pages[i];
            var position = ++i; /// 1-based position used in the progress message

            that.request(uri, function (status, $) {
                if (!status) {
                    that.log('页面' + uri + '请求失败', 'redBG');
                    pause(callback);
                    return;
                }

                var title = that.title($("title").text());
                var filepath = path.join(saveDir, title + '.txt');
                var content = eval(last.$).text();

                fs.writeFile(filepath, content, { flag: 'wx' }, function (_err) {
                    if (!_err) {
                        that.log(position + '/' + count + ' 文件' + filepath + '保存成功', 'green');
                    } else if (_err.code === 'EEXIST') {
                        that.log('文件' + filepath + '已存在', 'yellow');
                    } else {
                        that.log('保存文件' + filepath + '失败', 'red');
                    }
                    pause(callback);
                });
            });
        }, function (err) {
            if (err) {
                that.log(err, "red");
            } else {
                that.log('执行完毕~', "green");
            }
        });
    });
};
/// Image handler: for every page, collects the image URLs matched by the last
/// selector and downloads them through dlImage(), one page at a time.
///
/// urls: {Array} page URLs to fetch; a missing value is treated as an empty list.
///
/// Terminates the process (status 0) once every page has been handled.
///
/// NOTE(review): the last selector expression is executed with eval(); `$` is
/// in scope for it. config.js must be trusted.
Crawler.prototype.image = function (urls) {
    var that = this;
    var pages = urls || [];
    var count = pages.length;
    var last = config.selector[config.selector.length - 1];
    var i = 0;

    that.log('抓取图片中...');
    async.whilst(function () {
        return i < count;
    }, function (callback) {
        var uri = pages[i];
        that.request(uri, function (status, $) {
            if (!status) {
                that.log('页面' + uri + '请求失败', 'redBG');
                ++i;
                callback();
                return;
            }

            /// The page title names the download folder; it is the same for
            /// every image on the page, so compute it once.
            var title = that.title($("title").text());
            var list = []; /// images found on this page: { url, title }

            eval(last.$).each(function () {
                var src = $(this).attr(last.attr);
                /// Skip elements without the attribute and make relative
                /// sources absolute so they can be requested.
                if (src) {
                    list.push({
                        url: URL.resolve(uri, src),
                        title: title
                    });
                }
            });

            that.log('第{0}套图片收集了{1}张图片'.format((i + 1) + '/' + count, list.length));
            that.dlImage(list, function () {
                ++i;
                callback();
            });
        });
    }, function (err) {
        if (err) that.log('imageError:' + err);
        process.exit(0);
    });
};
/// Downloads a list of images, one after another, into
/// "<saveDir>/<title>/<file name>".
///
/// list:     {Array} items of the form { url, title }
/// callback: {Function} called with no arguments once the whole list has been
///           processed (also when the list is empty or a directory could not
///           be created)
///
/// Files that already exist are left untouched. An item only counts as saved
/// once its file has been completely written; a failed download removes the
/// partial file and moves on to the next item.
Crawler.prototype.dlImage = function (list, callback) {
    var that = this;
    var count = list.length;
    var position = 0; /// 1-based position of the item being processed

    /// File name for an image URL: the trailing "name.ext" when the URL ends
    /// in one, otherwise the last segment of the URL path, otherwise a name
    /// built from the item's position.
    function fileNameOf(imageUrl, index) {
        var text = String(imageUrl || '');
        var direct = text.match(/[^\/]+\.\w{3,4}$/);
        if (direct) {
            return direct[0];
        }
        var fromPath = path.basename(URL.parse(text).pathname || '');
        return fromPath || ('image_' + index);
    }

    that.log('准备下载到本地中...');
    if (count < 1) {
        callback();
        return;
    }

    async.eachSeries(list, function (item, next) {
        var index = ++position;
        var filepath = path.join(config.saveDir, item.title);
        var savePath = path.join(filepath, fileNameOf(item.url, index));
        var label = index + '/' + count + ' :' + savePath;

        mkdirp(filepath, function (err) {
            if (err) {
                next(err);
                return;
            }

            var url = config.imageFn ? config.imageFn(item.url) : item.url;
            var download = null;
            var settled = false; /// guards against reporting one item twice

            /// 'wx' makes opening fail with EEXIST when the file is already
            /// there, so no separate existence check is needed.
            var out = fs.createWriteStream(savePath, { flags: 'wx' });

            /// Logs the outcome once and continues with the next item.
            function settle(message, tone, delayed) {
                if (settled) {
                    return;
                }
                settled = true;
                that.log(message, tone);
                if (delayed) {
                    /// Random pause (under 2 s) between downloads.
                    setTimeout(next, Math.floor(Math.random() * 2000));
                } else {
                    next();
                }
            }

            /// Aborts the transfer and removes the partial file so that a
            /// later run can fetch the image again.
            function fail() {
                if (settled) {
                    return;
                }
                if (download && typeof download.abort === 'function') {
                    download.abort();
                }
                out.destroy();
                fs.unlink(savePath, function () { /* best effort cleanup */ });
                settle(label + '保存失败', 'red', true);
            }

            out.on('error', function (streamErr) {
                if (streamErr && streamErr.code === 'EEXIST') {
                    settle(savePath + '已存在', 'yellow', false);
                } else {
                    fail();
                }
            });
            out.on('open', function () {
                /// Only hit the network once the target file could be created.
                download = request(url);
                download.on('error', fail);
                download.pipe(out);
            });
            out.on('finish', function () {
                settle(label + '保存成功', 'green', true);
            });
        });
    }, function (err) {
        if (err) {
            that.log(err, "red");
        } else {
            that.log(list[0].title + ' :下载完毕~', "greenBG");
        }
        callback();
    });
};
/// Fetches a page and parses it.
///
/// url:      {String} page address
/// callback: {Function} callback(ok, $) where `ok` is true only for an HTTP 200
///           response and `$` is the loaded cheerio document (null otherwise)
///
/// The body is requested as raw bytes and decoded with iconv-lite using
/// config.charset (default 'utf8'), instead of patching Node's own encodings
/// around every request.
Crawler.prototype.request = function (url, callback) {
    var that = this;
    var opts = {
        url: url,
        encoding: null /// null makes `request` deliver the body as a Buffer
    };
    if (config.headers) {
        opts.headers = config.headers;
    }

    that.log('发送' + url + ',等待响应中...', 'grey');
    request(opts, function (err, res, body) {
        var $ = null;
        if (err) {
            /// Transport failures (DNS, refused connection, ...) carry no status code.
            that.log(url + ' ' + (err.message || err), 'red');
        } else if (res.statusCode === 200) {
            that.log(res.statusCode, 'green');
            var charset = config.charset || 'utf8';
            if (!iconv.encodingExists(charset)) {
                that.log('不支持的编码' + charset + ',已改用utf8', 'red');
                charset = 'utf8';
            }
            $ = cheerio.load(iconv.decode(body, charset));
        } else {
            that.log(res.statusCode, 'red');
        }
        callback(!!$, $);
    });
};
/// Turns a page title into a name usable for a file or folder.
///
/// str: {String} raw title text; null/undefined is treated as ''
/// Returns the title with the characters \ / : * ? " < > | and line breaks
/// removed and surrounding whitespace trimmed. When the result has the shape
/// "<something>-<suffix without a dash>", the last dash and the suffix are
/// dropped (typically a trailing site name). Titles that contain a dash but do
/// not have that shape (e.g. "abc-" or "-abc") are returned unchanged instead
/// of raising an error.
Crawler.prototype.title = function (str) {
    var raw = (str === null || str === undefined) ? '' : String(str);
    var title = raw.replace(/[\\/:\*\?"<>\|\n\r]/g, '').trim();
    var parts = title.match(/(.+)\-[^\-]+$/);
    if (parts) {
        title = parts[1].trim();
    }
    return title;
};
/// Reports a message.
///
/// info: message to report; Error objects are reduced to their message so they
///       survive JSON serialisation
/// c:    colour name, forwarded to the parent process or to color()
///
/// In 'web' mode the message is sent to the parent process as a JSON string
/// { color, info }; when no IPC channel exists (process.send is missing) it is
/// printed to the console instead. In 'console' mode it is printed with the
/// prefix returned by color(c). Any other mode produces no output.
Crawler.prototype.log = function (info, c) {
    var mode = config.mode;
    if (mode !== 'web' && mode !== 'console') {
        return;
    }
    if (mode === 'web' && typeof process.send === 'function') {
        var payload = info instanceof Error ? info.message : info;
        process.send(JSON.stringify({ color: c, info: payload }));
        return;
    }
    console.log(color(c), info);
};
/// Positional template helper: 'a{0}b{1}'.format(x, y).
///
/// Every "{n}" placeholder (n = 0, 1, 2, ... without leading zeros) is replaced
/// by the n-th argument; null and undefined become ''. Placeholders without a
/// matching argument are left as they are.
///
/// All placeholders are substituted in a single pass through a replacer
/// function, so inserted values are taken literally: "$&"-style sequences are
/// not interpreted and a value that itself contains "{1}" is not expanded
/// again. Always returns a primitive string.
String.prototype.format = function () {
    var values = arguments;
    return String(this).replace(/\{(0|[1-9]\d*)\}/g, function (placeholder, digits) {
        var position = Number(digits);
        if (position >= values.length) {
            return placeholder;
        }
        var value = values[position];
        return (value === null || value === undefined) ? '' : String(value);
    });
};
/// Script entry: a crawl is started as soon as this file is loaded.
new Crawler().crawl();
/// The export below is disabled, so Crawler is not exposed to other modules.
//module.exports = Crawler;