Skip to content

Commit c6dc414

Browse files
committed
Initial commit
0 parents  commit c6dc414

File tree

6 files changed

+433
-0
lines changed

6 files changed

+433
-0
lines changed

.gitignore

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
/archive
2+
/node_modules

README.md

+77
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
# reference-archive
2+
3+
Extract reference URLs and DOIs from an
4+
[IEEE-style](https://ieeeauthorcenter.ieee.org/wp-content/uploads/IEEE-Reference-Guide.pdf)
5+
reference list and download the references to the filesystem.
6+
7+
## Features
8+
9+
- downloads webpages as a single file in [MHTML](https://www.rfc-editor.org/rfc/rfc2557) format
10+
- including client-rendered content
11+
- uses [puppeteer-extra-plugin-stealth](https://github.com/berstend/puppeteer-extra/tree/master/packages/puppeteer-extra-plugin-stealth) to evade anti-bot measures
12+
- downloads PDF files and other file types from URLs
13+
- downloads papers with a DOI from [sci-hub](https://sci-hub.st)
14+
- if you make use of this feature, consider [donating to sci-hub](https://sci-hub.st/donate)
15+
16+
## Usage
17+
18+
This software requires [node.js](https://nodejs.org), version 18 or newer.
19+
20+
Before first use, install the dependencies:
21+
22+
```shell
23+
npm install # or pnpm install, or yarn install, etc.
24+
```
25+
26+
### Use from the command line
27+
28+
- Put your references in a plaintext file, e.g. `references.txt`
29+
- Create a target directory, e.g. `./archive`
30+
- Run:
31+
```shell
32+
node index.js references.txt ./archive
33+
```
34+
35+
### Use as a library
36+
37+
```javascript
38+
import { extractAndSaveAllURLs } from "./lib.js";
39+
40+
await extractAndSaveAllURLs(
41+
'[1] First reference. [Online]. Available: https://jfhr.de/reference-archive/example.pdf (Accessed: 2023-07-23)\n' +
42+
'[2] Second reference. [Online]. Available: https://jfhr.de/reference-archive/example.html (Accessed: 2023-07-23)\n' +
43+
'[3] Third reference. [Online]. Available: https://jfhr.de/reference-archive/cr.html (Accessed: 2023-07-23)\n' +
44+
'[4] S. DeRisi, R. Kennison and N. Twyman, The What and Whys of DOIs. doi: 10.1371/journal.pbio.0000057\n' +
45+
'[5] N. Paskin, "Digital Object Identifier (DOI) System", Encyclopedia of Library and Information Sciences (3rd ed.)\n',
46+
'./archive/'
47+
);
48+
```
49+
50+
## Example
51+
52+
Say you have the following reference list:
53+
54+
```text
55+
[1] First reference. [Online]. Available: https://jfhr.de/reference-archive/example.pdf (Accessed: 2023-07-23)
56+
[2] Second reference. [Online]. Available: https://jfhr.de/reference-archive/example.html (Accessed: 2023-07-23)
57+
[3] Third reference. [Online]. Available: https://jfhr.de/reference-archive/cr.html (Accessed: 2023-07-23)
58+
[4] S. DeRisi, R. Kennison and N. Twyman, The What and Whys of DOIs. doi: 10.1371/journal.pbio.0000057
59+
[5] N. Paskin, "Digital Object Identifier (DOI) System", Encyclopedia of Library and Information Sciences (3rd ed.)
60+
```
61+
62+
`reference-archive` would download the following files to your filesystem:
63+
64+
```text
65+
1.pdf # PDF file from URL
66+
2.mhtml # Single file web page from URL
67+
3.mhtml # Single file web page from URL, including client-rendered content
68+
4.pdf # PDF file with DOI from sci-hub
69+
# no 5.* - reference 5 has no URL and no DOI
70+
```
71+
72+
## Test
73+
74+
To run tests:
75+
```shell
76+
node --test
77+
```

index.js

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
import * as fs from "fs";
2+
import { extractAndSaveAllURLs } from "./lib.js";
3+
4+
/**
 * CLI entry point: `node index.js FILE TARGET`.
 * Reads the reference list from FILE and downloads every referenced
 * URL/DOI into TARGET (created if it does not exist).
 * @return {Promise<void>}
 */
async function main() {
  const inputFile = process.argv[2];
  const targetDirectory = process.argv[3];
  if (!inputFile || !targetDirectory) {
    console.error('Syntax: node index.js FILE TARGET');
    process.exitCode = -1;
    return;
  }

  const input = await fs.promises.readFile(inputFile, 'utf-8');
  // Create the *target* directory if missing. (Bug fix: this previously
  // always created './archive', ignoring the TARGET argument.)
  await fs.promises.mkdir(targetDirectory, { recursive: true });

  await extractAndSaveAllURLs(input, targetDirectory);
}

// Don't leave the entry-point promise floating: report unexpected
// failures and exit non-zero instead of crashing on an unhandled rejection.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});

lib.js

+270
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,270 @@
1+
import * as fs from "fs";
2+
import * as stream from "stream";
3+
import * as http from "http";
4+
import * as https from "https";
5+
import * as path from "path";
6+
import mimeDB from "mime-db";
7+
import { chromium } from "playwright-extra";
8+
import StealthPlugin from "puppeteer-extra-plugin-stealth";
9+
10+
// Register the stealth plugin with Playwright's Chromium build so that
// pages opened by this module are less likely to be blocked by
// anti-bot / headless-browser detection.
chromium.use(StealthPlugin());
11+
12+
/**
13+
* @typedef {import('playwright').Browser} Browser
14+
*/
15+
16+
/**
17+
* @typedef {{id: number, url: string}|{id: number, doi: string}} ParsedReference
18+
*/
19+
20+
/**
 * Open `url` in a fresh browser page and capture it as a single-file
 * MHTML snapshot via the Chrome DevTools Protocol.
 * The page is always closed, even on failure.
 * @param browser {Browser}
 * @param url {string}
 * @return {Promise<string>} the MHTML document as a string
 */
async function getPageAsMHTML(browser, url) {
  const page = await browser.newPage();
  try {
    await page.goto(url);
    await page.waitForLoadState('networkidle');
    // Wait a little longer: some pages fade content in with an animation.
    await page.waitForTimeout(1000);
    const cdp = await page.context().newCDPSession(page);
    const snapshot = await cdp.send('Page.captureSnapshot', { format: 'mhtml' });
    return snapshot.data;
  } finally {
    await page.close();
  }
}
39+
40+
/**
 * Download the paper with the given DOI from sci-hub and save it as a PDF.
 * Renders the sci-hub result page to PDF after clicking its "save" button.
 * (Doc fix: this resolves with no value, not a string as previously documented.)
 * @param browser {Browser}
 * @param doi {string}
 * @param filename {string} target path for the PDF
 * @return {Promise<void>}
 * @throws {Error} on timeout (likely a sci-hub captcha), or any other
 *         navigation/render failure
 */
async function saveDOIAsPDFFromSciHub(browser, doi, filename) {
  const url = `https://sci-hub.st/${doi}`;
  const page = await browser.newPage();
  try {
    await page.goto(url);
    await page.waitForLoadState('domcontentloaded');
    await page.locator('button', { hasText: 'save' }).click();
    await page.pdf({ path: filename });
  } catch (e) {
    if (e.name === 'TimeoutError') {
      // Keep the original failure attached as `cause` for debugging.
      throw new Error(`TimeoutError while accessing sci-hub - this might be because of a captcha.
Alternatively, visit https://sci-hub.st/${doi} and download the file manually.`, { cause: e });
    }
    throw e;
  } finally {
    await page.close();
  }
}
64+
65+
/**
 * Capture the page at `url` as an MHTML snapshot and write it to `filename`.
 * @param browser {Browser}
 * @param url {string}
 * @param filename {string}
 * @return {Promise<void>}
 */
async function savePageAsMHTML(browser, url, filename) {
  const snapshot = await getPageAsMHTML(browser, url);
  await fs.promises.writeFile(filename, snapshot, 'utf-8');
}
75+
76+
/**
 * Stream the resource at `url` into a local file.
 * Supports both http: and https: URLs.
 * @param url {string}
 * @param filename {string}
 * @return {Promise<void>}
 * @throws {Error} on connection or stream failure
 */
async function downloadFile(url, filename) {
  const ws = fs.createWriteStream(filename);
  /** @type {import('http')|import('https')} */
  let httplib = https;
  if (url.startsWith('http:')) {
    httplib = http;
  }

  await /** @type {Promise<void>} */(new Promise((resolve, reject) => {
    const req = httplib.get(url, res => {
      stream.pipeline(res, ws, err => {
        if (err) {
          reject(err);
        } else {
          resolve();
        }
      });
    });
    // Bug fix: without this handler a connection-level failure (DNS error,
    // ECONNREFUSED, ...) never reaches the response callback and instead
    // emits an unhandled 'error' event, crashing the process.
    req.on('error', err => {
      ws.destroy();
      reject(err);
    });
  }));
}
101+
102+
/**
 * Persist a URL to disk, choosing the strategy from the target filename:
 * `.mhtml` targets are captured through the browser, everything else is
 * fetched directly over HTTP(S).
 * @param browser {Browser}
 * @param url {string}
 * @param filename {string}
 * @return {Promise<void>}
 */
async function savePage(browser, url, filename) {
  const wantsSnapshot = filename.endsWith('.mhtml');
  if (wantsSnapshot) {
    await savePageAsMHTML(browser, url, filename);
    return;
  }
  await downloadFile(url, filename);
}
115+
116+
/**
 * Make a HEAD request to a URL and return the media type from the
 * Content-Type response header (parameters such as `charset` are stripped).
 * (Doc fix: contrary to the previous comment, this does NOT return null on
 * failure — it throws.)
 * @param url {string}
 * @return {Promise<string>} e.g. 'text/html' or 'application/pdf'
 * @throws {Error} if the response has a non-ok status code or no
 *         Content-Type header; network failures reject via fetch itself
 */
async function getMimeTypeForURL(url) {
  const response = await fetch(url, {method: 'HEAD'});
  if (!response.ok) {
    throw new Error(`HEAD ${url} returned non-ok response status code: ${response.status}`);
  }
  const contentType = response.headers.get('content-type');
  if (contentType === null) {
    throw new Error(`HEAD ${url} returned response with status code ${response.status} and no Content-Type header`);
  }
  return contentType.split(';')[0];
}
134+
135+
/**
 * Build the target path for a downloaded reference: `<id>.<ext>` inside
 * `targetDirectory`. HTML pages get a `.mhtml` suffix (they are stored as
 * snapshots); other mime types use their first known extension from
 * mime-db; unknown types get no extension at all.
 * @param id {number}
 * @param mimeType {string}
 * @param targetDirectory {string}
 * @return {string}
 */
function getPageTargetFilename(id, mimeType, targetDirectory) {
  let suffix;
  if (mimeType === 'text/html') {
    suffix = '.mhtml';
  } else {
    const extension = mimeDB[mimeType]?.extensions?.[0];
    suffix = extension ? `.${extension}` : '';
  }
  return path.join(targetDirectory, `${id}${suffix}`);
}
151+
152+
/**
 * Check whether a string parses as a URL.
 * Equivalent to URL.canParse, which is only available in node >= 20.
 * @param url {string}
 * @return {boolean}
 */
function isValidURL(url) {
  try {
    // The URL constructor throws a TypeError on unparsable input.
    new URL(url);
  } catch {
    return false;
  }
  return true;
}
166+
167+
/**
 * Find references with http(s): URLs or DOIs in a text.
 * A reference line looks like `[<id>] ... <url-or-doi> ...`; the first URL
 * or DOI on the line wins, and lines with neither are skipped.
 * Yields objects with the reference ID and URL or DOI.
 * @param text {string}
 * @return {Generator<ParsedReference>}
 */
function *extractURLsFromText(text) {
  const re = /^\[(?<id>[0-9]+)].*?((?<url>https?:\/\/\S+)|(?<doi>10\.[0-9]+\/\S+))/gm;
  let match = null;
  while ((match = re.exec(text)) !== null) {
    // Always pass the radix, and prefer the non-coercing Number.isNaN.
    const id = Number.parseInt(match.groups.id, 10);
    if (!Number.isNaN(id)) {
      const { url, doi } = match.groups;
      if (isValidURL(url)) {
        yield {id, url};
      } else if (doi) {
        yield {id, doi};
      }
    }
  }
}
188+
189+
/**
 * Check whether `file` exists and is a regular file.
 * Any stat failure (e.g. ENOENT) is treated as "does not exist".
 * @param file {string}
 * @return {Promise<boolean>}
 */
async function fileExists(file) {
  try {
    return (await fs.promises.stat(file)).isFile();
  } catch {
    return false;
  }
}
203+
204+
/**
 * Download a single parsed reference into the target directory.
 * URL references are saved via `savePage` (skipped when the target file
 * already exists); DOI references are fetched from sci-hub as a PDF.
 * References with neither URL nor DOI are ignored.
 * @param reference {ParsedReference}
 * @param targetDirectory {string}
 * @param browser {Browser}
 * @return {Promise<void>}
 */
async function saveReference(reference, targetDirectory, browser) {
  const { id, url, doi } = reference;
  if (url) {
    const mimeType = await getMimeTypeForURL(url);
    const targetFile = getPageTargetFilename(id, mimeType, targetDirectory);
    const alreadyDownloaded = await fileExists(targetFile);
    if (alreadyDownloaded) {
      console.log(`[${id}]: ${targetFile} exists, skipping`);
      return;
    }
    await savePage(browser, url, targetFile);
    console.log(`[${id}]: ${targetFile} downloaded`);
    return;
  }
  if (doi) {
    const targetFile = getPageTargetFilename(id, 'application/pdf', targetDirectory);
    await saveDOIAsPDFFromSciHub(browser, doi, targetFile);
    console.log(`[${id}]: ${targetFile} downloaded`);
  }
}
227+
228+
/**
 * Extract all references from `input` and download each one using an
 * already-launched browser. A failure on one reference is logged and does
 * not abort the remaining downloads.
 * @param input {string}
 * @param targetDirectory {string}
 * @param browser {Browser}
 * @return {Promise<void>}
 */
async function extractAndSaveURLsWithBrowser(input, targetDirectory, browser) {
  const references = extractURLsFromText(input);
  for (const reference of references) {
    try {
      await saveReference(reference, targetDirectory, browser);
    } catch (e) {
      console.log(`[${reference.id}]: error: ${e}`);
    }
  }
}
243+
244+
/**
 * Extract IEEE-style references with URLs from the given string, and download the URL content
 * to a file in the specified directory, with the filename equal to the reference number and
 * the file extension matching the mime type of the file.
 *
 * @example
 * await extractAndSaveAllURLs(
 *     '[1] First reference. [Online]. Available: https://jfhr.de/reference-archive/example.html (Accessed: 2023-07-23)\n' +
 *     '[2] Second reference. [Online]. Available: https://jfhr.de/reference-archive/example.pdf (Accessed: 2023-07-23)',
 *     './archive/'
 * );
 * // Will create the following files:
 * // ./archive/1.mhtml
 * // ./archive/2.pdf
 *
 * @param input {string}
 * @param targetDirectory {string}
 * @return {Promise<void>}
 */
export async function extractAndSaveAllURLs(input, targetDirectory) {
  // Launch one headless browser and reuse it for every reference;
  // `.finally` guarantees it is closed even if a download throws.
  const browser = await chromium.launch();
  return extractAndSaveURLsWithBrowser(input, targetDirectory, browser)
      .finally(() => browser.close());
}

package.json

+18
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
{
2+
"name": "reference-archive",
3+
"version": "1.0.0",
4+
"description": "Extract reference URLs and DOIs from an IEEE-style reference list and download the references to the filesystem.",
5+
"main": "index.js",
6+
"scripts": {
7+
"test": "node --test"
8+
},
9+
"author": "jfhr <[email protected]>",
10+
"license": "AGPL-3.0-only",
11+
"type": "module",
12+
"dependencies": {
13+
"mime-db": "^1.52.0",
14+
"playwright": "^1.35.0",
15+
"playwright-extra": "^4.3.6",
16+
"puppeteer-extra-plugin-stealth": "^2.11.2"
17+
}
18+
}

0 commit comments

Comments
 (0)