-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathspider.py
70 lines (61 loc) · 1.9 KB
/
spider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
from urllib.request import urlopen
from linkFinder import LinkFinder
from functions import *
class Spider:
    # Shared crawler state. These are *class* variables on purpose: every
    # method below is a @staticmethod, so all spider threads read and write
    # the same queue/crawled sets and file paths.
    projectName = ''
    baseURL = ''
    domainName = ''
    queueFile = ''
    crawledFile = ''
    queue = set()
    crawled = set()

    def __init__(self, projectName, baseURL, domainName):
        """Configure the shared class state, set up the project files,
        and immediately crawl the base URL as the seed page.

        projectName -- directory name holding queue.txt / crawled.txt
        baseURL     -- the seed URL where crawling starts
        domainName  -- only links containing this domain are queued
        """
        Spider.projectName = projectName
        Spider.baseURL = baseURL
        Spider.domainName = domainName
        Spider.queueFile = Spider.projectName + '/queue.txt'
        Spider.crawledFile = Spider.projectName + '/crawled.txt'
        self.boot()
        self.crawlPage('First Spider', Spider.baseURL)

    @staticmethod
    def boot():
        # Create the project directory and data files, then restore any
        # previous crawl state from disk so a restarted crawl resumes.
        createProjectDir(Spider.projectName)
        createDataFiles(Spider.projectName, Spider.baseURL)
        Spider.queue = fileToSet(Spider.queueFile)
        Spider.crawled = fileToSet(Spider.crawledFile)

    @staticmethod
    def crawlPage(threadName, pageURL):
        """Crawl one page: harvest its links, then move it queue -> crawled
        and persist both sets. Pages already crawled are skipped."""
        if pageURL in Spider.crawled:
            return  # already visited - nothing to do
        print(threadName + ' at page ' + pageURL)
        print('Pages in Queue: ' + str(len(Spider.queue)) + " | Crawled Pages: " + str(len(Spider.crawled)))
        Spider.addLinksToQueue(Spider.gatherLinks(pageURL))
        # discard(): no error when pageURL was never queued (e.g. the seed URL)
        Spider.queue.discard(pageURL)
        Spider.crawled.add(pageURL)
        Spider.updateFiles()

    @staticmethod
    def gatherLinks(pageURL):
        """Fetch pageURL and return the set of links found in its HTML.

        Returns an empty set for non-HTML responses and on any fetch/parse
        error - a bad page must not stop the crawl (best-effort).
        """
        try:
            response = urlopen(pageURL)
            contentType = response.getheader('Content-Type')
            # getheader() may return None; guard before the substring test
            # (the original code would raise TypeError on a missing header).
            if not contentType or 'text/html' not in contentType:
                # BUG FIX: the original left `finder` unbound here and then
                # hit `finder.returnLinks()` outside the try -> NameError.
                return set()
            htmlString = response.read().decode("utf-8")
            finder = LinkFinder(Spider.baseURL, pageURL)
            finder.feed(htmlString)
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # still propagate; real fetch/decode errors stay best-effort.
            print("Error: Cannot crawl the page at " + pageURL)
            return set()
        return finder.returnLinks()

    @staticmethod
    def addLinksToQueue(links):
        """Queue each link that belongs to the target domain and has not
        already been crawled (links already queued are deduped by the set)."""
        for link in links:
            if Spider.domainName not in link:
                continue  # off-domain link - ignore
            if link not in Spider.crawled:
                Spider.queue.add(link)

    @staticmethod
    def updateFiles():
        # Persist both sets so the crawl can resume after a restart.
        setToFile(Spider.queue, Spider.queueFile)
        setToFile(Spider.crawled, Spider.crawledFile)