scraper.py
import logging
import re
import time

import requests
from bs4 import BeautifulSoup

from advert import HouseAdvert

logging.basicConfig(level=logging.DEBUG)

class AdBuilder:
    @staticmethod
    def build(entry):
        # Placeholder: building an advert from a raw entry is not implemented yet.
        pass

class Scraper:
    def __init__(self, session=None, retry_count=1, retry_delay=0, proxies=None):
        self.session = session or requests.Session()
        self.proxies = proxies
        self.retry_count = retry_count
        self.retry_delay = retry_delay

    def base_request(self, url):
        # GET the url, retrying on connection errors up to retry_count times
        # and sleeping retry_delay seconds between attempts.
        for attempt in range(self.retry_count + 1):
            try:
                response = self.session.get(url=url, proxies=self.proxies)
            except requests.exceptions.ConnectionError:
                if attempt == self.retry_count:
                    raise
                logging.debug(f"Connection failed, sleeping for {self.retry_delay} seconds")
                time.sleep(self.retry_delay)
                continue
            response.raise_for_status()
            return response

class ImmoScraper(Scraper):
    ADS_NO = 0       # running count of collected ad ids
    MAX_ADS = 10000  # stop collecting once this many ids have been seen

    def get_pages(self):
        # Yield search-result page URLs indefinitely; callers decide when to stop.
        i = 1
        while True:
            yield f"https://www.immobiliare.it/vendita-case/roma/?criterio=rilevanza&pag={i}"
            i += 1
    def get_urls_from_page(self, page):
        # Extract the numeric ad id from every listing title on a result page.
        # Ids are kept as one-element tuples, matching the original format.
        urls = []
        soup = BeautifulSoup(self.base_request(page).text, 'lxml')
        for title in soup.find_all('p', {'class': 'titolo'}):
            a = title.find('a')
            if not a:
                continue
            url = a.get('href')
            if url and 'immobiliare.it' in url:
                match = re.search(r'/(\d+)/', url)
                if match:
                    urls.append((match.group(1),))
        return urls
    def get_all_urls(self):
        # Walk the result pages, yielding one batch of ad ids per page,
        # until MAX_ADS ids have been collected.
        for page_url in self.get_pages():
            logging.debug(f"Now scraping page {page_url}")
            urls = self.get_urls_from_page(page_url)
            self.ADS_NO += len(urls)
            yield urls
            if self.ADS_NO >= self.MAX_ADS:
                logging.debug("URL collection completed, exiting.")
                break
            logging.debug("Page completed, now sleeping for 1sec")
            time.sleep(1)
    def get_ad(self, ad_id):
        # Fetch a single ad page and parse it into a HouseAdvert; returns None
        # for unreachable pages and auction listings.
        url = f"https://www.immobiliare.it/annunci/{ad_id}/"
        try:
            page = self.base_request(url).text
        except requests.exceptions.HTTPError:
            return None
        soup = BeautifulSoup(page, 'lxml')
        title = soup.find('h1', {'class': 'title-detail'}).text.strip()
        # Skip auction ("asta") listings.
        if 'asta' in title.lower():  # or title.lower().startswith('villa'):
            return None
        desc = None
        size = None
        level = 0
        bathrooms = None
        rooms = None
        price_text = soup.find('li', {'class': 'features__price'}).find('span').text.strip()
        try:
            price = int(''.join(re.findall(r'(\d+)', price_text)))
        except ValueError:
            price = -1  # "Prezzo su richiesta" (price on request)
        features = soup.find('ul', {'class': 'features__list'}).find_all('li')
        for f in features:
            if 'locali' in f.text:  # rooms
                rooms = f.find('span').text.strip()
            elif 'bagni' in f.text:  # bathrooms
                bathrooms = f.find('span').text.strip()
            elif 'piano' in f.text:  # floor
                level = f.find('abbr').text.strip()
            elif 'superficie' in f.text:  # surface area
                size = f.find('span').text.strip()
        if not size:
            # Fall back to the room-by-room table when the features list omits the size.
            try:
                size = int(float(soup.select(
                    'div.row.overflow-x-auto.box-consistenze table tfoot tr td div')[0].text.replace(',', '.')))
            except IndexError:
                size = -1
        try:
            desc = soup.find('div', {'class': 'description-text'}).text.strip()
        except AttributeError:
            pass
        ha = HouseAdvert(adid=ad_id, title=title, description=desc, price=price,
                         rooms=rooms, size=size, bathrooms=bathrooms, level=level)
        logging.debug(ha)
        return ha
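
# A minimal sketch of driving the scraper, assuming the module is run directly.
# The retry settings are illustrative; id batches arrive as one-element tuples,
# as returned by get_urls_from_page above.
if __name__ == "__main__":
    scraper = ImmoScraper(retry_count=3, retry_delay=5)
    for batch in scraper.get_all_urls():
        for (ad_id,) in batch:
            advert = scraper.get_ad(ad_id)
            if advert is not None:
                print(advert)
        break  # stop after the first results page for this example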