This repository has been archived by the owner on Jul 15, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 49
/
Copy pathscrape_crunchbase.py
52 lines (42 loc) · 1.72 KB
/
scrape_crunchbase.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import glob
import json
import os
import string
import sys
from multiprocessing import Pool

import requests
from bs4 import BeautifulSoup
# Crunchbase API key. While this is None, every company request is sent
# with the literal text "api_key=None" in its query string.
API_KEY = None  # your api key here
# URL template for a single company; filled with (company name, API key).
CRUNCHBASE_COMPANY_API = "http://api.crunchbase.com/v/1/company/%s.js?api_key=%s"

# Links to company pages on the listing start with this prefix.
_COMPANY_HREF_PREFIX = "/company/"
# max filename len is 255, and ".json" takes 5 of those characters
_MAX_COMPANY_NAME_LEN = 255 - len(".json")

company_urls = []

# iterate through all crunchbase companies by going a-z and "other"
# (string.ascii_lowercase is used instead of string.lowercase because the
# latter is locale-dependent on Python 2 and does not exist on Python 3)
for letter in list(string.ascii_lowercase) + ["other"]:
    listing = requests.get("http://www.crunchbase.com/companies?c=%s" % (letter))
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(listing.text, "html.parser")

    # A link is a company when its href begins with "/company/"; only the
    # part after that prefix (the company name) is kept.
    for link in soup.select("li > a"):
        # Anchors without an href attribute are simply not company links.
        href = link.get("href", "")
        if not href.startswith(_COMPANY_HREF_PREFIX):
            continue

        company_name = href[len(_COMPANY_HREF_PREFIX):]
        # Names that would not fit in a file name are reported and skipped.
        if len(company_name) < _MAX_COMPANY_NAME_LEN:
            company_urls.append(company_name)
        else:
            print("%s too long, skipping." % (company_name))

# write this list of companies to file, one name per line
with open("list_of_companies.txt", 'w') as company_list_file:
    company_list_file.write("\n".join(company_urls))
def download(company_url):
    """Fetch one company's record from the Crunchbase API and save it.

    The response body is written, UTF-8 encoded, to
    ``crunchbase_companies/<company_url>.json``.  This function does not
    create the ``crunchbase_companies`` directory; it must already exist.

    Args:
        company_url: company name as used in the API URL (the part of a
            listing link that follows ``/company/``).
    """
    r = requests.get(CRUNCHBASE_COMPANY_API % (company_url, API_KEY))
    # The payload is encoded bytes, so open the file in binary mode (text
    # mode rejects bytes on Python 3 and rewrites newlines on Windows), and
    # use a context manager so the handle is closed promptly in each worker.
    with open("crunchbase_companies/%s.json" % (company_url), 'wb') as out_file:
        out_file.write(r.text.encode("utf-8"))
# get all the already scraped companies so we don't rescrape them
# if force flag is used, rescrape all companies
if "--force" in sys.argv or "-f" in sys.argv:
    scraped_companies = []
else:
    # glob returns paths that still contain the directory part
    # ("crunchbase_companies/foo.json"), so reduce each one to its bare file
    # name and drop the ".json" extension; otherwise the names never match
    # the entries in company_urls and everything is downloaded again.
    scraped_companies = [
        os.path.splitext(os.path.basename(path))[0]
        for path in glob.glob("crunchbase_companies/*.json")
    ]

# download() writes into this directory, so make sure it exists first
if not os.path.isdir("crunchbase_companies"):
    os.makedirs("crunchbase_companies")

# only the companies that have not been saved yet are fetched
pending_companies = set(company_urls) - set(scraped_companies)

p = Pool(10)
r = p.map_async(download, pending_companies)
# block until every download has finished (re-raises a worker's exception)
r.get()
# release the worker processes once all work is done
p.close()
p.join()