Skip to content

Commit

Permalink
Changed parsing and support threading
Browse files Browse the repository at this point in the history
Using the JSON API instead of parsing HTML.
Content is now downloaded asynchronously using basic threading.
Fixed bug in detecting range of comic numbers.
Saving transcript instead of just the alt-text.
  • Loading branch information
rubenroy committed Sep 22, 2013
1 parent ca428fe commit 5f2bfa0
Showing 1 changed file with 65 additions and 30 deletions.
95 changes: 65 additions & 30 deletions xkcd_downloader.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,43 @@
import requests,os
from bs4 import BeautifulSoup
def download(page_url,folder):
print 'Opening ',page_url,'...'
soup = BeautifulSoup(requests.get(page_url).text)
tag = soup.find(id="comic").img
image_url = tag["src"]
filename = image_url.split('/')[-1]
f=open(os.getcwd()+"/"+folder+"/"+filename+".txt",'w')
f.write(tag["title"])
f.close()
r = requests.get(image_url, stream = True)
with open(os.getcwd()+"/"+folder+"/"+filename, 'wb') as f:
for chunk in r.iter_content(chunk_size=1024):
if chunk:
f.write(chunk)
f.flush()
import requests,os,json,threading
max_threads=70
active_threads=0
lock = threading.Lock()
print_lock=threading.Lock()
folder = "xkcd_downloaded"
class Download(threading.Thread):
page_url=""
def download(self,page_url):
page_dict = json.loads(requests.get(page_url).text)
image_url = page_dict["img"]
filename = image_url.split("/")[-1]
i = page_dict["num"]
img_path = os.getcwd()+"/"+folder+"/"+str(i)+". "+filename
t_path = img_path+".txt"
img_size = int(requests.head(image_url).headers["content-length"])
if os.path.exists(img_path)==False or img_size!=int(os.path.getsize(img_path)):
print_lock.acquire()
print "Downloading",i
print_lock.release()
r = requests.get(image_url,stream = True)
with open(img_path, 'wb') as f:
for chunk in r.iter_content(chunk_size=1024):
if chunk:
f.write(chunk)
f.flush()
t_size = len(page_dict["transcript"])
if os.path.exists(t_path)==False or t_size!=int(os.path.getsize(t_path)):
f = open(t_path,'w')
f.write(page_dict["transcript"].encode("UTF-8"))
f.close()
def __init__(self,page_url):
super(Download, self).__init__()
self.page_url = page_url
def run(self):
self.download(self.page_url)
lock.acquire()
global active_threads
active_threads -= 1
lock.release()
def get_range(s,latest):
r = []
for i in s.split(","):
Expand All @@ -26,17 +49,29 @@ def get_range(s,latest):
elif b > latest:
b = latest
r += range(a,b+1)
elif int(i) > latest:
elif int(i) <= latest:
r.append(int(i))
return r
folder = "xkcd_downloaded"
soup = BeautifulSoup(requests.get("http://xkcd.com/").text)
tag = soup.find(attrs={"rel": "prev"})
latest = int(tag["href"].split("/")[1])+1
print 'Latest comic number is',latest
range_input=raw_input('\nEnter comics to download (eg: "55,630,666-999,1024"): ')
range_list=get_range(range_input,latest)
if not os.path.exists(folder):
os.mkdir(folder)
for i in range_list:
download("http://xkcd.com/"+str(i)+"/",folder)
if 404 in r:
r.remove(404)
if 0 in r:
r.remove(0)
return list(set(r))
def main():
global active_threads
latest = json.loads(requests.get("http://xkcd.org/info.0.json").text)["num"]
print 'Latest comic number is',latest
range_input=raw_input('\nEnter comics to download (eg: "55,630,666-999,1024"): ')
range_list=get_range(range_input,latest)
if not os.path.exists(folder):
os.mkdir(folder)
for i in range_list:
while True:
lock.acquire()
if(active_threads<max_threads):
active_threads += 1
lock.release()
break
lock.release()
Download("http://xkcd.com/"+str(i)+"/info.0.json").start()
if __name__ == '__main__':
main()

0 comments on commit 5f2bfa0

Please sign in to comment.