crawler.py
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

from scraper import extract_data


def get_html_page(url):
    '''
    Receives a URL and returns its response.
    If the status code is not 200 (OK) or the page
    cannot be reached, returns None.
    '''
    try:
        html = requests.get(url)
        if html.status_code != 200:
            print('Error, cannot open page -> {}'.format(url))
            print('Status -> {}'.format(html.status_code))
            return None
        return html
    except Exception as e:
        print(e)
        return None


def crawl(pages, depth):
    '''
    Receives a list of pages and the depth of the crawl.
    '''
    for i in range(depth):
        print('Depth: {0}'.format(i))
        new_pages = set()
        # Visit every page collected so far
        for page in pages:
            page_data = get_html_page(page)
            if page_data:
                soup = BeautifulSoup(page_data.text, 'lxml')
                # Find all links on the page, skipping fragment (#) links
                for link in soup.find_all('a', href=True):
                    if '#' not in link['href']:
                        url = urljoin(page, link['href'])
                        if url.startswith('http'):
                            new_pages.add(url)
                # Extract page data
                title, description, text = extract_data(page_data)
                # Print results
                print('Url: {0}\nTitle: {1}\nDescription: {2}'.format(page, title, description))
                print('Text: {0}'.format(text))
                print(50 * '-')
        # The links found at this depth become the pages for the next depth
        pages = new_pages


if __name__ == '__main__':
    crawl(['https://www.python.org/'], 2)
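

# The crawler imports extract_data from a local scraper module that is not
# shown on this page. The following is a minimal sketch (an assumption, not
# the project's actual scraper code) of what such a function might look like,
# assuming it receives the requests.Response returned by get_html_page() and
# returns a (title, description, text) tuple as used in crawl() above.

def extract_data(page_data):
    '''Hypothetical sketch: pull the title, meta description and visible text.'''
    soup = BeautifulSoup(page_data.text, 'lxml')
    # <title> tag contents, if present
    title = soup.title.string if soup.title else ''
    # <meta name="description"> content attribute, if present
    meta = soup.find('meta', attrs={'name': 'description'})
    description = meta.get('content', '') if meta else ''
    # All visible text, whitespace-collapsed
    text = soup.get_text(separator=' ', strip=True)
    return title, description, text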