You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Make sure you have the required libraries installed:
pip install requests tqdm img2pdf
import requests
import os
from concurrent import futures
from tqdm import tqdm
import time
import json
import img2pdf
import shutil
from datetime import datetime
# Account credentials handed to login() by main().
EMAIL = "add email here" # Replace with your email
PASSWORD = "add password here" # Replace with your password
# Root folder: main() creates it if missing, stores each book's temporary
# image folder inside it and writes the finished PDFs directly into it.
OUTPUT_DIR = "D:\\downloads"
# Sent as the `scale` query parameter of every page image request.
RESOLUTION = 0 # Highest resolution
# Passed to download() as the worker-thread limit of its thread pool.
MAX_THREADS = 50
def display_error(response, message):
    """Print a failure report for an HTTP response and abort the program.

    Args:
        response: Response object. It is printed as-is, then its ``text``
            attribute is printed on the next line.
        message: Human-readable description of what went wrong.

    Raises:
        SystemExit: Always, with exit status 1.
    """
    print(message)
    print(response)
    print(response.text)
    # ``exit`` is a helper that the ``site`` module installs for interactive
    # sessions; it does not exist under ``python -S`` and, called without an
    # argument, reports success (status 0). Raising SystemExit directly is
    # always available and lets the calling shell see the failure.
    raise SystemExit(1)
def get_book_infos(session, url):
    """Fetch the title, page image links and metadata of a book.

    The details page at ``url`` is downloaded and searched for the first
    ``"url":"..."`` entry; that protocol-relative address is then requested
    and its JSON ``data`` object is read.

    Args:
        session: Object with a ``get(url)`` method returning a response that
            exposes ``text`` and ``json()``.
        url: Address of the book's details page.

    Returns:
        A ``(title, links, metadata)`` tuple. ``title`` is the book title made
        safe for use as a file name (spaces become underscores, the characters
        ``<>:"/\\|?*`` are dropped, at most 150 characters are kept), ``links``
        is the flat list of page image URIs and ``metadata`` is the
        ``data['metadata']`` value of the JSON document.

    Raises:
        SystemExit: With status 1 when the details page does not contain the
            expected marker or when fewer than two page links are found.
    """
    page = session.get(url).text

    marker = '"url":"'
    if marker not in page:
        # Without this guard the split below fails with a bare IndexError.
        print("[-] Error while getting the book information URL")
        raise SystemExit(1)

    raw_infos_url = page.split(marker, 1)[1].split('"', 1)[0]
    # The address is embedded in JSON, where "&" is escaped as \u0026.
    infos_url = "https:" + raw_infos_url.replace("\\u0026", "&")

    data = session.get(infos_url).json()['data']

    title = data['brOptions']['bookTitle'].strip().replace(" ", "_")
    title = ''.join(c for c in title if c not in '<>:"/\\|?*')
    title = title[:150]
    metadata = data['metadata']

    # brOptions.data is a list of groups, each group being a list of pages.
    links = [page_info['uri'] for group in data['brOptions']['data'] for page_info in group]

    if len(links) <= 1:
        print("[-] Error while getting image links")
        raise SystemExit(1)

    print(f"[+] Found {len(links)} pages")
    return title, links, metadata
def login(email, password):
    """Open a session on archive.org and authenticate it.

    Args:
        email: Value sent as the ``username`` form field.
        password: Value sent as the ``password`` form field.

    Returns:
        The ``requests.Session`` used for the login when the response text
        contains ``"Successful login"``.

    Raises:
        SystemExit: With status 1 when the response text contains
            ``"bad_login"``; also raised by ``display_error`` for any other
            unexpected response.
    """
    login_url = "https://archive.org/account/login"
    session = requests.Session()
    # The login page is requested before posting the form; presumably this
    # collects the cookies the login endpoint expects (not verifiable here).
    session.get(login_url)
    response = session.post(login_url, data={"username": email, "password": password})

    if "bad_login" in response.text:
        print("[-] Invalid credentials!")
        # SystemExit(1) instead of the interactive ``exit()`` helper, so a
        # failed login is reported to the shell as a failure.
        raise SystemExit(1)
    if "Successful login" in response.text:
        print("[+] Successful login")
        return session
    display_error(response, "[-] Error while login:")
def loan(session, book_id, verbose=True):
    """Borrow a book for the account attached to ``session``.

    Three form posts are issued in order: ``grant_access`` (to the
    ``searchInside.php`` endpoint), ``browse_book`` and ``create_token``.

    Args:
        session: Object with a ``post(url, data=...)`` method returning a
            response exposing ``status_code``, ``text`` and ``json()``.
        book_id: Identifier of the book, sent as the ``identifier`` field.
        verbose: When true, print a confirmation after a successful loan.

    Returns:
        The same ``session`` object, either after a successful loan or when
        the service answers that the book is not available to borrow (treated
        here as "does not need to be borrowed").

    Raises:
        SystemExit: Through ``display_error`` when the service rejects the
            ``browse_book`` request for another reason or when the final
            response text does not contain ``"token"``.
    """
    loan_url = "https://archive.org/services/loans/loan/"
    not_borrowable = "This book is not available to borrow at this time. Please try again later."

    def form(action):
        # A fresh dict per request, so no request shares mutable state.
        return {"action": action, "identifier": book_id}

    # The answer to this first request is not inspected.
    session.post("https://archive.org/services/loans/loan/searchInside.php", data=form("grant_access"))

    response = session.post(loan_url, data=form("browse_book"))
    if response.status_code == 400:
        # A 400 body is not guaranteed to be a JSON object carrying an
        # "error" field; fall back to the generic report in that case rather
        # than dying with a traceback.
        try:
            error = response.json().get("error")
        except (ValueError, AttributeError):
            error = None
        if error == not_borrowable:
            print("This book doesn't need to be borrowed")
            return session
        display_error(response, "Something went wrong when trying to borrow the book.")

    response = session.post(loan_url, data=form("create_token"))
    if "token" not in response.text:
        display_error(response, "Something went wrong when trying to borrow the book, maybe you can't borrow this book.")
    if verbose:
        print("[+] Successful loan")
    return session
def return_loan(session, book_id):
    """Give a borrowed book back.

    Posts a ``return_loan`` action for ``book_id`` to the loan endpoint. A
    confirmation is printed when the response has status 200 and its JSON
    body has a truthy ``success`` entry; any other outcome is handed to
    ``display_error``.

    Args:
        session: Object with a ``post(url, data=...)`` method.
        book_id: Identifier of the book, sent as the ``identifier`` field.
    """
    form = dict(action="return_loan", identifier=book_id)
    response = session.post("https://archive.org/services/loans/loan/", data=form)

    # The JSON body is only parsed when the status code is 200.
    returned = response.status_code == 200 and response.json()["success"]
    if not returned:
        display_error(response, "Something went wrong when trying to return the book")
        return
    print("[+] Book returned")
def image_name(pages, page, directory):
    """Return the path of the JPEG file that stores one page.

    The page number is left-padded with zeros until it has as many characters
    as ``pages`` has, so file names sort in page order.

    Args:
        pages: Total number of pages (only its printed width is used).
        page: Number of the page being named.
        directory: Folder part of the path, joined with a ``/``.
    """
    width = len(str(pages))
    padded = str(page).rjust(width, "0")
    return f"{directory}/{padded}.jpg"
def download_one_image(session, link, i, directory, book_id, pages):
    """Download one page image and store it in ``directory``.

    The request is repeated, with a one-second pause between attempts, until
    the server answers with status 200. A 403 answer triggers a new ``loan``
    call for ``book_id`` before the next attempt.

    Args:
        session: Object with a ``get(url, headers=...)`` method.
        link: Address of the page image.
        i: Page index, used to build the file name.
        directory: Folder in which the image is written.
        book_id: Identifier of the book, needed to borrow it again.
        pages: Total number of pages, used to pad the file name.
    """
    # Sent with every image request; presumably they make the request look
    # like a browser image fetch coming from archive.org (not verifiable here).
    headers = {
        "Referer": "https://archive.org/",
        "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8",
        "Sec-Fetch-Site": "same-site",
        "Sec-Fetch-Mode": "no-cors",
        "Sec-Fetch-Dest": "image",
    }

    while True:
        try:
            response = session.get(link, headers=headers)
            if response.status_code == 200:
                break
            if response.status_code == 403:
                session = loan(session, book_id, verbose=False)
        except Exception:
            # Best effort: request or re-loan errors are retried after the
            # pause below. Only Exception is caught, so SystemExit (raised
            # when the loan is definitively refused) and KeyboardInterrupt
            # stop the worker instead of being retried forever.
            pass
        # Pause after every failed attempt, including unexpected status
        # codes, so a persistent error does not turn into a busy loop.
        time.sleep(1)

    image = image_name(pages, i, directory)
    with open(image, "wb") as f:
        f.write(response.content)
def download(session, n_threads, directory, links, scale, book_id):
    """Download every page image of a book using a thread pool.

    Args:
        session: Session object handed to each ``download_one_image`` call.
        n_threads: Maximum number of worker threads.
        directory: Folder in which the images are written.
        links: Page image addresses, in page order.
        scale: Value appended to every address as the ``scale`` parameter
            (together with ``rotate=0``).
        book_id: Identifier of the book, forwarded to the workers.

    Returns:
        The list of image file paths, in page order.
    """
    print("Downloading pages...")
    links = [f"{link}&rotate=0&scale={scale}" for link in links]
    pages = len(links)

    with futures.ThreadPoolExecutor(max_workers=n_threads) as executor:
        # enumerate gives each link its own position. Looking the position up
        # with links.index() is quadratic and returns the first match, so two
        # identical links would both be saved under the same page number.
        tasks = [
            executor.submit(
                download_one_image,
                session=session,
                link=link,
                i=i,
                directory=directory,
                book_id=book_id,
                pages=pages,
            )
            for i, link in enumerate(links)
        ]
        # Iterating only drives the progress bar; results are not inspected.
        for _ in tqdm(futures.as_completed(tasks), total=len(tasks)):
            pass

    return [image_name(pages, i, directory) for i in range(pages)]
def make_pdf(pdf, title, directory):
    """Write PDF bytes to ``directory`` without overwriting an existing file.

    The file is named ``<title>.pdf``; when that name is taken, ``(1)``,
    ``(2)``, ... is inserted before the extension until a free name is found.
    The chosen file name is printed.

    Args:
        pdf: Bytes of the PDF document.
        title: Base name of the file, without extension.
        directory: Folder in which the file is created.
    """
    filename = title + ".pdf"
    attempt = 0
    while os.path.isfile(os.path.join(directory, filename)):
        attempt += 1
        filename = f"{title}({attempt}).pdf"

    target = os.path.join(directory, filename)
    with open(target, "wb") as out:
        out.write(pdf)
    print(f"[+] PDF saved as \"{filename}\"")
def _book_id_from_url(url):
    """Return the identifier segment of a details URL, or "" when absent."""
    parts = list(filter(None, url.split("/")))
    if len(parts) < 4:
        return ""
    # Drop a query string or fragment glued to the identifier segment.
    return parts[3].split("?", 1)[0].split("#", 1)[0]


def _prompt_for_urls():
    """Ask the user for book URLs until they decline to add another one."""
    urls = []
    while True:
        url = input("Please paste the Archive.org book URL (must start with https://archive.org/details/): ").strip()
        if not url.startswith("https://archive.org/details/"):
            print("Invalid URL format. URL must start with 'https://archive.org/details/'")
            continue
        if not _book_id_from_url(url):
            # Reject the bare prefix here; otherwise the identifier lookup
            # fails later with an IndexError, after all URLs were entered.
            print("Invalid URL format. The URL does not contain a book identifier")
            continue
        urls.append(url)

        while True:
            choice = input("Do you want to add another book? (y/n): ").strip().lower()
            if choice in ('y', 'n'):
                break
            print("Please enter 'y' or 'n'")
        if choice == 'n':
            return urls


def _build_pdf_metadata(metadata, book_id):
    """Translate the book metadata into keyword arguments for img2pdf."""
    # Normalised copies are kept locally so the caller's dict is not mutated.
    fields = {}
    for key in ("title", "creator", "associated-names"):
        if key not in metadata:
            continue
        value = metadata[key]
        if isinstance(value, list):
            value = "; ".join(value)
        elif not isinstance(value, str):
            raise TypeError("unsupported metadata type")
        fields[key] = value

    pdfmeta = {}
    if 'title' in fields:
        pdfmeta['title'] = fields['title']
    authors = [fields[key] for key in ("creator", "associated-names") if key in fields]
    if authors:
        pdfmeta['author'] = "; ".join(authors)
    if 'date' in metadata:
        try:
            pdfmeta['creationdate'] = datetime.strptime(metadata['date'][0:4], '%Y')
        except (ValueError, TypeError):
            # The date is optional: a value that does not start with a
            # four-digit year (or is not a string) is simply left out.
            pass
    pdfmeta['keywords'] = [f"https://archive.org/details/{book_id}"]
    return pdfmeta


def _unique_directory(base):
    """Return ``base``, or ``base(1)``, ``base(2)``, ... if it already exists."""
    candidate = base
    counter = 1
    while os.path.isdir(candidate):
        candidate = f"{base}({counter})"
        counter += 1
    return candidate


def main():
    """Interactively download one or more borrowed books as PDF files.

    Logs in with ``EMAIL``/``PASSWORD``, asks for the book URLs, then for each
    book: borrows it, downloads every page image into a temporary folder under
    ``OUTPUT_DIR``, converts the images to a PDF saved in ``OUTPUT_DIR``,
    removes the temporary folder and returns the loan.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    session = login(EMAIL, PASSWORD)
    urls = _prompt_for_urls()
    print(f"\n{len(urls)} Book(s) to download")

    for url in urls:
        book_id = _book_id_from_url(url)
        print("="*40)
        print(f"Current book: https://archive.org/details/{book_id}")

        session = loan(session, book_id)
        title, links, metadata = get_book_infos(session, url)

        # Built before the download so that unsupported metadata is reported
        # immediately instead of after every page has been fetched.
        pdfmeta = _build_pdf_metadata(metadata, book_id)

        directory = _unique_directory(os.path.join(OUTPUT_DIR, title))
        os.makedirs(directory)

        images = download(session, MAX_THREADS, directory, links, RESOLUTION, book_id)

        # Create PDF with metadata
        pdf = img2pdf.convert(images, **pdfmeta)
        make_pdf(pdf, title, OUTPUT_DIR)

        try:
            shutil.rmtree(directory)
        except OSError as e:
            # Leftover images are not fatal; report and carry on.
            print(f"Error: {e.filename} - {e.strerror}.")

        return_loan(session, book_id)


if __name__ == "__main__":
    main()
The text was updated successfully, but these errors were encountered:
Make sure you have the required libraries installed:
pip install requests tqdm img2pdf
The text was updated successfully, but these errors were encountered: