forked from kezakez/python-web-crawler
crawl.py
import sys
import http.client  # Python 3 name for the old Python 2 httplib module
import re
url = "http://www.hackernews.net"
depth = 2
search = "python"
# get the parameters or use defaults
if len(sys.argv) > 1:
    url = sys.argv[1]
if len(sys.argv) > 2:
    depth = int(sys.argv[2])
if len(sys.argv) > 3:
    search = sys.argv[3]
processed = set()  # URLs already visited, so each page is crawled once
def searchURL(url, depth, search):
    # only follow plain http links, and visit each URL once
    if url.startswith("http://") and url not in processed:
        processed.add(url)
        url = url.replace("http://", "", 1)
        # split the url into host and path
        host = url
        path = "/"
        urlparts = url.split("/")
        if len(urlparts) > 1:
            host = urlparts[0]
            path = url.replace(host, "", 1)
        # make the request
        print("crawling host: " + host + " path: " + path)
        conn = http.client.HTTPConnection(host)
        conn.request("GET", path)
        res = conn.getresponse()
        # read the body (bytes in Python 3, so decode) and find the links
        contents = res.read().decode("utf-8", "ignore")
        m = re.findall('href="(.*?)"', contents)
        if search in contents:
            print("Found " + search + " at " + url)
        print(str(depth) + ": processing " + str(len(m)) + " links")
        for href in m:
            # turn site-relative urls into absolute ones
            if href.startswith("/"):
                href = "http://" + host + href
            # follow the links until the depth budget runs out
            if depth > 0:
                searchURL(href, depth - 1, search)
    else:
        print("skipping " + url)
searchURL(url, depth, search)
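# Example invocation (hypothetical target; assumes the script is saved as
# crawl.py and run under Python 3):
#
#   python crawl.py http://example.com 1 python
#
# This fetches http://example.com, follows each discovered link one level
# deep, and prints "Found python at ..." for every page whose HTML
# contains the search string.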