An improvement to the code. Save as .py #137

fireheart2008 · 2025-01-27T12:22:10Z

Make sure you have the required libraries installed:
pip install requests tqdm img2pdf

import json
import os
import shutil
import sys
import time
from concurrent import futures
from datetime import datetime

import img2pdf
import requests
from tqdm import tqdm

# Archive.org account credentials; main() hands both to login().
EMAIL = "add email here"  # Replace with your email
PASSWORD = "add password here"        # Replace with your password
# Folder that receives the finished PDFs; main() also creates one temporary
# image sub-folder per book inside it.
OUTPUT_DIR = "D:\\downloads"
# Appended to every page-image link as the "scale" query parameter.
RESOLUTION = 0  # Highest resolution
# Number of worker threads main() requests for the page download pool.
MAX_THREADS = 50

def display_error(response, message):
    """Print a failure message plus the offending response, then abort.

    Args:
        response: Response object of the failed request. It is printed
            as-is, followed by its ``text`` attribute.
        message: Human-readable description of what went wrong.

    Raises:
        SystemExit: Always, with exit status 1 so that a shell or calling
            script can tell the run failed.
    """
    print(message)
    print(response)
    print(response.text)
    # sys.exit rather than the site-provided exit(): the latter is intended
    # for the interactive shell, is missing under ``python -S`` and reported
    # status 0 here even though this is an error path.
    sys.exit(1)

def get_book_infos(session, url):
    """Fetch the title, page-image links and metadata of a book.

    The page at ``url`` is searched for a ``"url":"..."`` entry; the
    address found there (prefixed with ``https:``) is requested and the
    ``data`` member of its JSON body supplies everything returned here.

    Args:
        session: Object whose ``get(url)`` returns a response exposing
            ``text`` and ``json()`` (the logged-in session in this script).
        url: Address of the book's details page.

    Returns:
        A ``(title, links, metadata)`` tuple. ``title`` is the book title
        with spaces turned into underscores, the characters ``<>:"/\\|?*``
        removed and the result cut to 150 characters; ``links`` is the list
        of every page's ``uri``; ``metadata`` is the ``metadata`` member of
        the JSON data, untouched.

    Raises:
        SystemExit: With status 1 when the details page carries no
            ``"url":"`` entry or when no page link is found.
    """
    marker = '"url":"'
    page_html = session.get(url).text
    if marker not in page_html:
        # Without the marker the split below would die with a bare
        # IndexError; report the problem like the empty-links case instead.
        print("[-] Error while getting image links")
        sys.exit(1)

    # The embedded address is JSON-escaped, hence the "\u0026" -> "&" fix-up.
    infos_url = "https:" + page_html.split(marker)[1].split('"')[0].replace("\\u0026", "&")
    data = session.get(infos_url).json()['data']

    title = data['brOptions']['bookTitle'].strip().replace(" ", "_")
    title = ''.join(c for c in title if c not in '<>:"/\\|?*')
    title = title[:150]
    metadata = data['metadata']
    links = [page['uri'] for item in data['brOptions']['data'] for page in item]

    # A book with exactly one page is valid; only an empty list is an error.
    if not links:
        print("[-] Error while getting image links")
        sys.exit(1)

    print(f"[+] Found {len(links)} pages")
    return title, links, metadata

def login(email, password):
    """Log in to archive.org and return the authenticated session.

    Args:
        email: Account e-mail, sent as the ``username`` form field.
        password: Account password, sent as the ``password`` form field.

    Returns:
        The ``requests.Session`` the login was performed with, returned when
        the response body contains ``Successful login``.

    Raises:
        SystemExit: With status 1 when the response body contains
            ``bad_login``; also raised through ``display_error`` when the
            response matches neither known outcome.
    """
    session = requests.Session()
    # The login page is fetched before posting the form -- presumably so the
    # session picks up the cookies the form expects (not verifiable here).
    session.get("https://archive.org/account/login")

    data = {"username": email, "password": password}
    response = session.post("https://archive.org/account/login", data=data)

    if "bad_login" in response.text:
        print("[-] Invalid credentials!")
        # Non-zero status: a failed login is an error, and sys.exit does not
        # depend on the interactive-only exit() helper.
        sys.exit(1)
    elif "Successful login" in response.text:
        print("[+] Successful login")
        return session
    else:
        display_error(response, "[-] Error while login:")

def loan(session, book_id, verbose=True):
    """Borrow a book for the given session.

    Three form posts are sent in order: ``grant_access`` to the
    search-inside endpoint, then ``browse_book`` and ``create_token`` to the
    loan endpoint.

    Args:
        session: Object whose ``post(url, data=...)`` returns a response
            exposing ``status_code``, ``text`` and ``json()``.
        book_id: Identifier of the book to borrow.
        verbose: When true, announce a successful loan on stdout.

    Returns:
        The same ``session`` once the loan succeeded, or once the service
        answered ``browse_book`` with status 400 and the "not available to
        borrow" error text. Every other failure is handed to
        ``display_error``.
    """
    loan_endpoint = "https://archive.org/services/loans/loan/"
    not_borrowable = "This book is not available to borrow at this time. Please try again later."

    def form(action):
        # Fresh payload per request; only the action differs between calls.
        return {"action": action, "identifier": book_id}

    # The answer to the grant_access request is not inspected.
    session.post("https://archive.org/services/loans/loan/searchInside.php", data=form("grant_access"))

    browse_reply = session.post(loan_endpoint, data=form("browse_book"))
    if browse_reply.status_code == 400:
        if browse_reply.json()["error"] != not_borrowable:
            display_error(browse_reply, "Something went wrong when trying to borrow the book.")
        else:
            print("This book doesn't need to be borrowed")
            return session

    token_reply = session.post(loan_endpoint, data=form("create_token"))
    if "token" not in token_reply.text:
        display_error(token_reply, "Something went wrong when trying to borrow the book, maybe you can't borrow this book.")
    else:
        if verbose:
            print("[+] Successful loan")
        return session

def return_loan(session, book_id):
    """Give a borrowed book back.

    Posts a ``return_loan`` action for ``book_id`` to the loan endpoint.
    The return counts as successful when the reply has status 200 and a
    truthy ``success`` member in its JSON body; anything else is passed to
    ``display_error``.

    Args:
        session: Object whose ``post(url, data=...)`` returns a response
            exposing ``status_code`` and ``json()``.
        book_id: Identifier of the book to return.
    """
    reply = session.post(
        "https://archive.org/services/loans/loan/",
        data={"action": "return_loan", "identifier": book_id},
    )
    # The JSON body is only parsed when the status is 200 (short-circuit).
    returned = reply.status_code == 200 and reply.json()["success"]
    if not returned:
        display_error(reply, "Something went wrong when trying to return the book")
        return
    print("[+] Book returned")

def image_name(pages, page, directory):
    """Return the ``.jpg`` path for one page inside ``directory``.

    The page number is left-padded with zeros to the digit count of
    ``pages``; a number that is already that wide (or wider) is left
    unchanged.
    """
    width = len(str(pages))
    padded = str(page).rjust(width, "0")
    return f"{directory}/{padded}.jpg"

def download_one_image(session, link, i, directory, book_id, pages):
    """Download one page image and write it into ``directory``.

    The request is repeated, one second apart, until it is answered with
    status 200. A 403 answer triggers a new ``loan`` for ``book_id`` before
    the next attempt. There is no upper bound on the number of attempts.

    Args:
        session: Session used for the request (and for re-borrowing).
        link: Full URL of the page image.
        i: Page index; together with ``pages`` it determines the file name.
        directory: Folder the image file is written to.
        book_id: Identifier of the book, needed to borrow it again.
        pages: Total page count, used for zero-padding the file name.
    """
    headers = {
        "Referer": "https://archive.org/",
        "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8",
        "Sec-Fetch-Site": "same-site",
        "Sec-Fetch-Mode": "no-cors",
        "Sec-Fetch-Dest": "image",
    }
    while True:
        try:
            response = session.get(link, headers=headers)
            if response.status_code == 200:
                break
            if response.status_code == 403:
                session = loan(session, book_id, verbose=False)
        except Exception:
            # Request or re-borrow failed: retry after the pause below.
            # Catching Exception instead of using a bare ``except`` lets
            # SystemExit and KeyboardInterrupt end the worker instead of
            # being retried forever.
            pass
        # Pause before every retry, including unexpected status codes, so a
        # failing server is not hit in a tight loop.
        time.sleep(1)

    image = image_name(pages, i, directory)
    with open(image, "wb") as f:
        f.write(response.content)

def download(session, n_threads, directory, links, scale, book_id):
    """Download every page image concurrently and return the file paths.

    Args:
        session: Session handed to each ``download_one_image`` call.
        n_threads: Maximum number of worker threads.
        directory: Folder the images are written to.
        links: Page-image URLs in page order.
        scale: Value of the ``scale`` query parameter appended to each link.
        book_id: Identifier of the book, forwarded to the workers.

    Returns:
        The image paths in page order, as produced by ``image_name``.

    Raises:
        Exception: Whatever a worker raised is re-raised here instead of
            being dropped, so a failed page is reported at its source.
    """
    print("Downloading pages...")
    links = [f"{link}&rotate=0&scale={scale}" for link in links]
    pages = len(links)

    with futures.ThreadPoolExecutor(max_workers=n_threads) as executor:
        # enumerate gives each link its own position; links.index() was
        # quadratic and mapped duplicate links onto the same page number.
        tasks = [
            executor.submit(
                download_one_image,
                session=session,
                link=link,
                i=i,
                directory=directory,
                book_id=book_id,
                pages=pages,
            )
            for i, link in enumerate(links)
        ]
        for task in tqdm(futures.as_completed(tasks), total=len(tasks)):
            # result() re-raises an exception from the worker, if any.
            task.result()

    return [image_name(pages, i, directory) for i in range(pages)]

def make_pdf(pdf, title, directory):
    """Write the PDF bytes to ``directory`` without overwriting a file.

    The preferred name is ``<title>.pdf``. If that file already exists,
    ``<title>(1).pdf``, ``<title>(2).pdf`` and so on are tried until an
    unused name is found. The chosen name is printed.

    Args:
        pdf: Bytes of the finished PDF.
        title: Base file name, without extension.
        directory: Folder the PDF is written to.
    """
    file = title + ".pdf"
    attempt = 0
    while os.path.isfile(os.path.join(directory, file)):
        attempt += 1
        file = f"{title}({attempt}).pdf"

    target = os.path.join(directory, file)
    with open(target, "wb") as out:
        out.write(pdf)
    print(f'[+] PDF saved as "{file}"')

def _extract_book_id(url):
    """Return the identifier segment of a details URL, or None if absent."""
    parts = [part for part in url.split("/") if part]
    if len(parts) < 4:
        return None
    # Drop a query string or fragment so it does not end up in the id.
    book_id = parts[3].split("?")[0].split("#")[0]
    return book_id or None


def _prompt_for_urls():
    """Ask the user for book URLs until they decline to add another one."""
    urls = []
    while True:
        url = input("Please paste the Archive.org book URL (must start with https://archive.org/details/): ").strip()
        if not url.startswith("https://archive.org/details/"):
            print("Invalid URL format. URL must start with 'https://archive.org/details/'")
            continue
        if _extract_book_id(url) is None:
            # Reject the URL now; otherwise it would only fail later, after
            # the download loop has started.
            print("Invalid URL format. The URL does not contain a book identifier")
            continue

        urls.append(url)

        while True:
            choice = input("Do you want to add another book? (y/n): ").lower()
            if choice in ['y', 'n']:
                break
            print("Please enter 'y' or 'n'")

        if choice == 'n':
            return urls


def _build_pdf_metadata(metadata, book_id):
    """Translate the book metadata into keyword arguments for img2pdf."""
    text_fields = {}
    for key in ("title", "creator", "associated-names"):
        if key not in metadata:
            continue
        value = metadata[key]
        if isinstance(value, list):
            value = "; ".join(value)
        elif not isinstance(value, str):
            raise TypeError("unsupported metadata type")
        text_fields[key] = value

    pdfmeta = {}
    if "title" in text_fields:
        pdfmeta["title"] = text_fields["title"]

    authors = [text_fields[key] for key in ("creator", "associated-names") if key in text_fields]
    if authors:
        pdfmeta["author"] = "; ".join(authors)

    if "date" in metadata:
        try:
            pdfmeta["creationdate"] = datetime.strptime(metadata["date"][0:4], "%Y")
        except (TypeError, ValueError):
            # The date is optional: a value that does not start with a
            # four-digit year is simply left out of the PDF.
            pass

    pdfmeta["keywords"] = [f"https://archive.org/details/{book_id}"]
    return pdfmeta


def main():
    """Log in, ask for book URLs and turn each book into a PDF.

    For every URL the book is borrowed, its pages are downloaded into a
    fresh sub-folder of ``OUTPUT_DIR``, the images are combined into a PDF
    saved in ``OUTPUT_DIR``, the image folder is removed and the loan is
    returned.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    session = login(EMAIL, PASSWORD)
    urls = _prompt_for_urls()

    print(f"\n{len(urls)} Book(s) to download")

    for url in urls:
        book_id = _extract_book_id(url)
        print("="*40)
        print(f"Current book: https://archive.org/details/{book_id}")
        session = loan(session, book_id)
        title, links, metadata = get_book_infos(session, url)

        # Pick an image folder that does not exist yet: title, title(1), ...
        base_directory = os.path.join(OUTPUT_DIR, title)
        directory = base_directory
        suffix = 1
        while os.path.isdir(directory):
            directory = f"{base_directory}({suffix})"
            suffix += 1
        os.makedirs(directory)

        images = download(session, MAX_THREADS, directory, links, RESOLUTION, book_id)

        pdf = img2pdf.convert(images, **_build_pdf_metadata(metadata, book_id))
        make_pdf(pdf, title, OUTPUT_DIR)

        # The page images are only an intermediate product; failing to
        # delete them is reported but does not stop the run.
        try:
            shutil.rmtree(directory)
        except OSError as e:
            print("Error: %s - %s." % (e.filename, e.strerror))

        return_loan(session, book_id)

# Start the interactive downloader only when this file is run as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

The text was updated successfully, but these errors were encountered:

fireheart2008 · 2025-01-27T12:35:54Z

Now it is a standalone script: there is no need to write any code in the terminal. Just paste the links.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

an improvement to the code. save as .py #137

an improvement to the code. save as .py #137

fireheart2008 commented Jan 27, 2025

fireheart2008 commented Jan 27, 2025

an improvement to the code. save as .py #137

an improvement to the code. save as .py #137

Comments

fireheart2008 commented Jan 27, 2025

fireheart2008 commented Jan 27, 2025