Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

an improvement to the code. save as .py #137

Open
fireheart2008 opened this issue Jan 27, 2025 · 1 comment
Open

an improvement to the code. save as .py #137

fireheart2008 opened this issue Jan 27, 2025 · 1 comment

Comments

@fireheart2008
Copy link

Make sure you have the required libraries installed:
pip install requests tqdm img2pdf

import json
import os
import shutil
import sys
import time
from concurrent import futures
from datetime import datetime

import img2pdf
import requests
from tqdm import tqdm

# Archive.org account credentials; main() passes them to login().
EMAIL = "add email here"  # Replace with your email
PASSWORD = "add password here"        # Replace with your password
# Folder that receives the finished PDFs; main() also creates the temporary
# per-book image folders inside it.
OUTPUT_DIR = "D:\\downloads"
# Value sent as the `scale` query parameter of every page image request.
RESOLUTION = 0  # Highest resolution
# Number of worker threads download() uses to fetch page images.
MAX_THREADS = 50

def display_error(response, message):
    """Print a diagnostic for a failed HTTP exchange and terminate the script.

    Args:
        response: Response object of the failed request; its repr and its
            ``text`` attribute are printed.
        message: Human-readable description of what went wrong.

    Raises:
        SystemExit: Always, with exit status 1.
    """
    print(message)
    print(response)
    print(response.text)
    # sys.exit() rather than the exit() helper: that helper is installed by
    # the site module for interactive use and is not guaranteed to exist.
    # The non-zero status lets shells and calling scripts detect the failure.
    sys.exit(1)

def get_book_infos(session, url):
    """Fetch the title, page image links and metadata of a book.

    The book page is downloaded and the first ``"url":"..."`` fragment found
    in it is taken as the (protocol-relative) address of a JSON document
    describing the book. That document is then fetched and parsed.

    Args:
        session: Object with a ``get(url)`` method returning responses that
            expose ``.text`` and ``.json()`` (a logged-in requests session).
        url: Address of the book page.

    Returns:
        A ``(title, links, metadata)`` tuple where ``title`` is the book
        title made safe for use as a file name, ``links`` is the list of
        page image URIs in reading order and ``metadata`` is the ``metadata``
        entry of the JSON document.

    Raises:
        SystemExit: With status 1 when the JSON address cannot be found in
            the page or when the book has no page images.
    """
    page_html = session.get(url).text
    try:
        raw_infos_url = page_html.split('"url":"')[1].split('"')[0]
    except IndexError:
        # The marker is absent (unexpected page, layout change, ...): report
        # it instead of crashing with a bare IndexError traceback.
        print("[-] Error while getting the book information URL")
        sys.exit(1)
    # The address is embedded in JSON, so '&' arrives escaped as \u0026.
    infos_url = "https:" + raw_infos_url.replace("\\u0026", "&")

    data = session.get(infos_url).json()['data']

    # Build a file-system friendly title: no spaces, none of the characters
    # in the exclusion list below, and at most 150 characters.
    title = data['brOptions']['bookTitle'].strip().replace(" ", "_")
    title = ''.join(c for c in title if c not in '<>:"/\\|?*')
    title = title[:150]

    metadata = data['metadata']
    # 'data' is a list of groups of pages; flatten it, keeping the order.
    links = [page['uri'] for item in data['brOptions']['data'] for page in item]

    # A single-page book is valid; only an empty list is an error.
    if not links:
        print("[-] Error while getting image links")
        sys.exit(1)

    print(f"[+] Found {len(links)} pages")
    return title, links, metadata

def login(email, password):
    """Open an archive.org session authenticated with the given credentials.

    Args:
        email: Account e-mail, sent as the ``username`` form field.
        password: Account password.

    Returns:
        The ``requests.Session`` that performed the successful login.

    The script is terminated when the credentials are rejected or when the
    reply is neither a recognised failure nor a recognised success.
    """
    login_url = "https://archive.org/account/login"
    credentials = {"username": email, "password": password}

    session = requests.Session()
    # The login page is requested before posting the form
    # (presumably to pick up cookies the form submission needs - unverified).
    session.get(login_url)
    response = session.post(login_url, data=credentials)
    body = response.text

    if "bad_login" in body:
        print("[-] Invalid credentials!")
        exit()
    elif "Successful login" not in body:
        display_error(response, "[-] Error while login:")
    else:
        print("[+] Successful login")
        return session

def loan(session, book_id, verbose=True):
    """Borrow a book and obtain an access token for it.

    Three form posts are made in order: ``grant_access`` (to the
    searchInside endpoint), then ``browse_book`` and ``create_token`` (to
    the loan endpoint).

    Args:
        session: Logged-in session exposing ``post(url, data=...)``.
        book_id: Identifier of the book to borrow.
        verbose: When true, print a confirmation after a successful loan.

    Returns:
        The same ``session`` object, either after a successful loan or when
        the ``browse_book`` step answers HTTP 400 with the specific
        "not available to borrow" message, which this script treats as
        "no loan needed".

    Any other failure is reported through ``display_error``.
    """
    loan_endpoint = "https://archive.org/services/loans/loan/"

    def form(action):
        # Every request carries the same identifier; only the action varies.
        return {"action": action, "identifier": book_id}

    # The reply to this first request is not inspected.
    session.post("https://archive.org/services/loans/loan/searchInside.php", data=form("grant_access"))

    browse_reply = session.post(loan_endpoint, data=form("browse_book"))
    if browse_reply.status_code == 400:
        not_borrowable = "This book is not available to borrow at this time. Please try again later."
        if browse_reply.json()["error"] != not_borrowable:
            display_error(browse_reply, "Something went wrong when trying to borrow the book.")
        else:
            print("This book doesn't need to be borrowed")
            return session

    token_reply = session.post(loan_endpoint, data=form("create_token"))
    if "token" not in token_reply.text:
        display_error(token_reply, "Something went wrong when trying to borrow the book, maybe you can't borrow this book.")
    else:
        if verbose:
            print("[+] Successful loan")
        return session

def return_loan(session, book_id):
    """Give a borrowed book back.

    Args:
        session: Logged-in session exposing ``post(url, data=...)``.
        book_id: Identifier of the book to return.

    Prints a confirmation when the service answers HTTP 200 with a truthy
    ``success`` field; otherwise the failure is reported through
    ``display_error``.
    """
    response = session.post(
        "https://archive.org/services/loans/loan/",
        data={"action": "return_loan", "identifier": book_id},
    )
    # The JSON body is only parsed when the status code is 200.
    returned = response.status_code == 200 and response.json()["success"]
    if not returned:
        display_error(response, "Something went wrong when trying to return the book")
    else:
        print("[+] Book returned")

def image_name(pages, page, directory):
    """Return the path of the JPEG file that stores one page.

    The page number is left-padded with zeros to the number of digits of
    ``pages`` so that the file names sort in page order.

    Args:
        pages: Total number of pages; only its digit count is used.
        page: Number of this page.
        directory: Folder holding the images; joined with a ``/``.
    """
    width = len(str(pages))
    padded = str(page).rjust(width, "0")
    return f"{directory}/{padded}.jpg"

def download_one_image(session, link, i, directory, book_id, pages):
    """Download one page image and store it in ``directory``.

    The request is repeated, one second apart, until the server answers
    HTTP 200. An HTTP 403 answer triggers a new ``loan`` call before the
    next attempt.

    Args:
        session: Logged-in session used for the request.
        link: Full URL of the page image.
        i: Page number, used to build the output file name.
        directory: Folder that receives the image file.
        book_id: Identifier of the book, needed to borrow it again.
        pages: Total number of pages, used to zero-pad the file name.
    """
    headers = {
        "Referer": "https://archive.org/",
        "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8",
        "Sec-Fetch-Site": "same-site",
        "Sec-Fetch-Mode": "no-cors",
        "Sec-Fetch-Dest": "image",
    }
    while True:
        try:
            response = session.get(link, headers=headers)
            if response.status_code == 200:
                break
            if response.status_code == 403:
                # Access was refused: borrow the book again before retrying.
                session = loan(session, book_id, verbose=False)
        except Exception:
            # Deliberate best-effort: any ordinary failure (network error,
            # malformed reply, ...) is retried. Unlike a bare ``except:``
            # this lets KeyboardInterrupt and SystemExit through, so Ctrl+C
            # and a fatal error raised by loan() really stop the worker.
            pass
        # Pause after every unsuccessful attempt, including unexpected
        # status codes, so the server is never hit in a tight loop.
        time.sleep(1)

    image = image_name(pages, i, directory)
    with open(image, "wb") as f:
        f.write(response.content)

def download(session, n_threads, directory, links, scale, book_id):
    """Download every page image of a book using a pool of threads.

    Args:
        session: Logged-in session shared by all workers.
        n_threads: Maximum number of worker threads.
        directory: Folder that receives the image files.
        links: Page image URIs in reading order.
        scale: Value of the ``scale`` query parameter added to every link.
        book_id: Identifier of the book (workers may need to borrow again).

    Returns:
        The list of image file paths, in page order.

    Raises:
        Whatever a worker raised: the first failure seen is re-raised here
        instead of being silently dropped.
    """
    print("Downloading pages...")
    links = [f"{link}&rotate=0&scale={scale}" for link in links]
    pages = len(links)

    with futures.ThreadPoolExecutor(max_workers=n_threads) as executor:
        # enumerate() gives each link its own position. Looking the position
        # up with links.index() was quadratic and returned the first match
        # for duplicated links, so those pages were never written.
        tasks = [
            executor.submit(download_one_image, session=session, link=link, i=i, directory=directory, book_id=book_id, pages=pages)
            for i, link in enumerate(links)
        ]
        for task in tqdm(futures.as_completed(tasks), total=len(tasks)):
            # result() re-raises any exception from the worker so that a
            # failed page is noticed now rather than when the PDF is built.
            task.result()

    return [image_name(pages, i, directory) for i in range(pages)]

def make_pdf(pdf, title, directory):
    """Write PDF bytes to ``directory`` without overwriting an existing file.

    The file is named ``<title>.pdf``; when that name is taken, ``(1)``,
    ``(2)``, ... is appended to the title until a free name is found.

    Args:
        pdf: Content of the PDF, as bytes.
        title: Base name of the file, without extension.
        directory: Folder that receives the file.
    """
    file = title + ".pdf"
    target = os.path.join(directory, file)
    attempt = 0
    while os.path.isfile(target):
        attempt += 1
        file = f"{title}({attempt}).pdf"
        target = os.path.join(directory, file)

    with open(target, "wb") as out:
        out.write(pdf)
    print(f'[+] PDF saved as "{file}"')

def _book_id_from_url(url):
    """Return the identifier of an archive.org details URL, or None if absent."""
    # A query string or fragment is not part of the identifier.
    path = url.split("?", 1)[0].split("#", 1)[0]
    # Non-empty parts: ["https:", "archive.org", "details", "<identifier>", ...]
    parts = [part for part in path.split("/") if part]
    return parts[3] if len(parts) > 3 else None


def _prompt_for_urls():
    """Ask the user for book URLs until they decline to add another one."""
    urls = []
    while True:
        url = input("Please paste the Archive.org book URL (must start with https://archive.org/details/): ").strip()
        if not url.startswith("https://archive.org/details/"):
            print("Invalid URL format. URL must start with 'https://archive.org/details/'")
            continue
        if _book_id_from_url(url) is None:
            # Without this check the bare prefix was accepted and crashed
            # later with an IndexError when the identifier was extracted.
            print("Invalid URL. The book identifier is missing after 'https://archive.org/details/'")
            continue

        urls.append(url)

        while True:
            choice = input("Do you want to add another book? (y/n): ").strip().lower()
            if choice in ['y', 'n']:
                break
            print("Please enter 'y' or 'n'")

        if choice == 'n':
            return urls


def _build_pdf_metadata(metadata, book_id):
    """Translate the book metadata into keyword arguments for img2pdf.convert."""
    text_fields = {}
    for key in ("title", "creator", "associated-names"):
        if key not in metadata:
            continue
        value = metadata[key]
        if isinstance(value, list):
            # Multi-valued fields are flattened into one string.
            value = "; ".join(value)
        elif not isinstance(value, str):
            raise TypeError("unsupported metadata type")
        text_fields[key] = value

    pdfmeta = {}
    if "title" in text_fields:
        pdfmeta['title'] = text_fields["title"]
    authors = [text_fields[key] for key in ("creator", "associated-names") if key in text_fields]
    if authors:
        pdfmeta['author'] = "; ".join(authors)

    if 'date' in metadata:
        try:
            # Only the leading four characters (the year) are used.
            pdfmeta['creationdate'] = datetime.strptime(metadata['date'][0:4], '%Y')
        except (TypeError, ValueError):
            # The date is optional: a value that does not start with a
            # four-digit year is simply left out of the PDF.
            pass

    pdfmeta['keywords'] = [f"https://archive.org/details/{book_id}"]
    return pdfmeta


def main():
    """Interactively download one or more archive.org books as PDF files.

    Logs in with EMAIL and PASSWORD, asks for the book URLs, then for each
    book: borrows it, downloads every page image into a temporary folder
    under OUTPUT_DIR, assembles the images into ``OUTPUT_DIR/<title>.pdf``,
    deletes the temporary folder and returns the loan.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    session = login(EMAIL, PASSWORD)
    urls = _prompt_for_urls()

    print(f"\n{len(urls)} Book(s) to download")

    for url in urls:
        book_id = _book_id_from_url(url)
        print("="*40)
        print(f"Current book: https://archive.org/details/{book_id}")
        session = loan(session, book_id)
        try:
            title, links, metadata = get_book_infos(session, url)

            # Pick a folder name that does not exist yet for the page images.
            base_directory = os.path.join(OUTPUT_DIR, title)
            directory = base_directory
            suffix = 1
            while os.path.isdir(directory):
                directory = f"{base_directory}({suffix})"
                suffix += 1
            os.makedirs(directory)

            images = download(session, MAX_THREADS, directory, links, RESOLUTION, book_id)

            # Create PDF with metadata
            pdf = img2pdf.convert(images, **_build_pdf_metadata(metadata, book_id))
            make_pdf(pdf, title, OUTPUT_DIR)

            try:
                shutil.rmtree(directory)
            except OSError as e:
                print(f"Error: {e.filename} - {e.strerror}.")
        finally:
            # Give the book back even when the download or the PDF creation
            # failed, so the loan is not left open.
            return_loan(session, book_id)

# Start the interactive downloader only when this file is run as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
@fireheart2008
Copy link
Author

Now it is a standalone script: there is no need to write any code in the terminal. Just paste the links.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant