-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathScraper.py
83 lines (60 loc) · 3.03 KB
/
Scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import numpy as np
import time
from bs4 import BeautifulSoup as bs
import pandas as pd
from seleniumrequests import Firefox
class QS_SCAPER():
def __init__(self):
self.branch_list = ['art-design',
'business-management-studies',
'computer-science-information-systems',
'economics-econometrics'
'data-science',
'physics-astronomy',
'politics',
'social-sciences-management']
self.df = pd.DataFrame(columns=['UNIVERSITY', 'RANK','Country','City/State', *self.branch_list])
URL_main = "https://www.topuniversities.com/university-rankings/world-university-rankings/2024?&page="
self.all_uni, self.all_rank = self.get_uni(URL_main,pages=25)
self.parse_uni(self.all_uni,self.all_rank,'RANK')
self.df.set_index('UNIVERSITY',inplace=True)
URL_prefix =lambda branch: "https://www.topuniversities.com/university-rankings/university-subject-rankings/2023/{}?&page=".format(branch)
for branch in self.branch_list:
self.unilist_raw,self.rank_list = self.get_uni(URL_prefix(branch),pages=15)
self.parse_uni(self.unilist_raw,self.rank_list,branch)
def parse_uni(self,raw,ranks,branch):
for ind in range(len(raw)):
name_loc =raw[ind].text.replace("""QS Stars\n\n\nMore Details""","").strip().split("\n\n\n")
try:
pdseries = pd.DataFrame({'UNIVERSITY':name_loc[0],
branch:ranks[ind].text.strip("="),
'Country':name_loc[1].split(", ")[1],
'City/State':name_loc[1].split(", ")[0]},index=[name_loc[0]])
except IndexError:
continue
if branch !="RANK":
#print(ranks[ind].text)
try :
self.df.loc[name_loc[0]][branch]= ranks[ind].text
except KeyError:
pdseries = pdseries.drop('UNIVERSITY',axis=1)
self.df = pd.concat([self.df,pdseries],axis=0)
else:
self.df.loc[self.df.shape[0]] = pdseries.loc[name_loc[0]]
def get_uni(self,url_prefix,pages=10):
uni_list = []
uni_rank_list = []
driver = Firefox()
for n in range(pages):
time.sleep(0.5)
driver.get(url_prefix+str(n))
time.sleep(2)
soup = bs(driver.page_source, 'html.parser')
uni_details= soup.find_all('div',attrs={'class':'univ-details-right-pos'})
uni_rank= soup.find_all('div',attrs={'class':'_univ-rank mw-100 hide-this-in-mobile-indi'})
uni_list+=uni_details
uni_rank_list+=uni_rank
driver.close()
return uni_list ,uni_rank_list
scraper =QS_SCAPER()
scraper.df.to_excel("BIG_UNI_LIST.xlsx")