-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathchinaz_top.py
77 lines (70 loc) · 1.8 KB
/
chinaz_top.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
Crawl the domain names listed in the Chinaz (站长之家) Chinese website ranking.
Start URL:      https://top.chinaz.com/all/index.html
Pagination URL: https://top.chinaz.com/all/index_2.html
By: Black_list
'''
import requests
import re
# Base URL of the ranking listing; paginated page URLs are built by appending
# 'index_<n>.html' to it (see fenye).
url = 'https://top.chinaz.com/all/'
# Domains that must not be collected. Each entry is compared against the last
# two dot-separated labels of a candidate, so an entry also covers every
# subdomain (e.g. 'gov.cn' filters 'www.beian.gov.cn').
_BLACKLIST = frozenset([
    '163.com',
    'qq.com',
    'gov.cn',
    'baidu.com',
    'sohu.com',
    'weibo.com',
    'douban.com',
    'iqiyi.com',
    'ifeng.com',
    'sogou.com',
    'youku.com',
    'so.com',
    'taobao.com',
    'apple.com',
    '58.com',
    'jd.com',
    'chinaz.com',
    '1688.com',
])


def black(url):
    """Return True when the domain name *url* is on the blacklist.

    Only the last two dot-separated labels are compared, so two-level public
    suffixes such as ``.com.cn`` are not handled: ``sina.com.cn`` is reduced
    to ``com.cn`` and therefore never matches a blacklist entry.

    Args:
        url: A bare domain name such as ``'www.baidu.com'`` (no scheme/path).

    Returns:
        True if the last two labels form a blacklisted domain, else False.
        Input with fewer than two labels (e.g. ``''`` or ``'localhost'``)
        returns False instead of raising IndexError.
    """
    # Domain names are case-insensitive; scraped text may carry whitespace.
    labels = url.strip().lower().split('.')
    if len(labels) < 2:
        return False
    return '.'.join(labels[-2:]) in _BLACKLIST
def pachong(url, retries=3, timeout=10):
    """Fetch one ranking page and append its non-blacklisted domains to a file.

    Every ``class="col-gray">...</span>`` capture in the page is treated as a
    domain name, except the first capture, which is discarded. Blacklisted
    domains (per ``black``) are reported and skipped; the rest are printed
    and appended, one per line, to ``top_url.txt`` in the working directory.

    Args:
        url: Address of the ranking page to download.
        retries: Maximum number of download attempts when the request itself
            fails. Bounded on purpose: the previous unbounded recursive retry
            could never terminate on a persistent error.
        timeout: Seconds to wait for the server before an attempt is
            considered failed.

    Returns:
        None. A page that cannot be downloaded, or that answers with a
        non-200 status, is reported on stdout and skipped.
    """
    url_regular = 'class="col-gray">(.*?)</span>'

    response = None
    for _ in range(max(1, retries)):
        try:
            response = requests.get(url, timeout=timeout)
            break
        except requests.RequestException:
            # Network-level failure only; parsing errors and Ctrl-C are no
            # longer swallowed and retried.
            continue

    if response is None or response.status_code != 200:
        print('链接访问错误!')
        return

    kept = []
    # Slicing (instead of `del matches[0]`) tolerates a page with no matches.
    for domain in re.findall(url_regular, response.text)[1:]:
        if '.' not in domain:
            # Not a domain name; black() needs at least two labels.
            continue
        if black(domain):
            print('黑名单域名已过滤')
        else:
            print(domain)
            kept.append(domain)

    if kept:
        # One open per page; the context manager closes the file.
        with open('top_url.txt', 'a+') as f:
            f.writelines(domain + '\n' for domain in kept)
def fenye(res_url, last_page=1920):
    """Crawl the first listing page and then every numbered follow-up page.

    The first page is ``res_url`` itself; follow-up pages are
    ``res_url + 'index_<n>.html'`` for n = 2 .. ``last_page`` inclusive. Each
    page is handed to ``pachong``. A completion message is printed at the end.

    Args:
        res_url: Base URL of the listing, ending with ``/``.
        last_page: Number of the last paginated page to fetch. The default
            reproduces the range the original hard-coded loop fetched.
            NOTE(review): the original loop stopped right before requesting
            index_1921.html; confirm whether that page should be included.

    Returns:
        None.
    """
    pachong(res_url)
    for page in range(2, last_page + 1):
        pachong(res_url + 'index_%d.html' % page)
    print('域名采集结束')
# Script entry: start crawling from the module-level base URL.
fenye(url)