Skip to content

Commit c6dc414

Browse files
committed
Initial commit
0 parents  commit c6dc414

File tree

6 files changed

+433
-0
lines changed

6 files changed

+433
-0
lines changed

.gitignore

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
/archive
2+
/node_modules

README.md

+77
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
# reference-archive
2+
3+
Extract reference URLs and DOIs from an
4+
[IEEE-style](https://ieeeauthorcenter.ieee.org/wp-content/uploads/IEEE-Reference-Guide.pdf)
5+
reference list and download the references to the filesystem.
6+
7+
## Features
8+
9+
- downloads webpages as a single file in [MHTML](https://www.rfc-editor.org/rfc/rfc2557) format
10+
- including client-rendered content
11+
- uses [puppeteer-extra-plugin-stealth](https://github.com/berstend/puppeteer-extra/tree/master/packages/puppeteer-extra-plugin-stealth) to evade anti-bot measures
12+
- downloads PDF files and other file types from URLs
13+
- downloads papers with a DOI from [sci-hub](https://sci-hub.st)
14+
- if you make use of this feature, consider [donating to sci-hub](https://sci-hub.st/donate)
15+
16+
## Usage
17+
18+
This software requires [node.js](https://nodejs.org), version 18 or newer.
19+
20+
Before first use, install the dependencies:
21+
22+
```shell
23+
npm install # or pnpm install, or yarn install, etc.
24+
```
25+
26+
### Use from the command line
27+
28+
- Put your references in a plaintext file, e.g. `references.txt`
29+
- Create a target directory, e.g. `./archive`
30+
- Run:
31+
```shell
32+
node index.js references.txt ./archive
33+
```
34+
35+
### Use as a library
36+
37+
```javascript
38+
import { extractAndSaveAllURLs } from "./lib.js";
39+
40+
await extractAndSaveAllURLs(
41+
'[1] First reference. [Online]. Available: https://jfhr.de/reference-archive/example.pdf (Accessed: 2023-07-23)\n' +
42+
'[2] Second reference. [Online]. Available: https://jfhr.de/reference-archive/example.html (Accessed: 2023-07-23)\n' +
43+
'[3] Third reference. [Online]. Available: https://jfhr.de/reference-archive/cr.html (Accessed: 2023-07-23)\n' +
44+
'[4] S. DeRisi, R. Kennison and N. Twyman, The What and Whys of DOIs. doi: 10.1371/journal.pbio.0000057\n' +
45+
'[5] N. Paskin, "Digital Object Identifier (DOI) System", Encyclopedia of Library and Information Sciences (3rd ed.)\n',
46+
'./archive/'
47+
);
48+
```
49+
50+
## Example
51+
52+
Say you have the following reference list:
53+
54+
```text
55+
[1] First reference. [Online]. Available: https://jfhr.de/reference-archive/example.pdf (Accessed: 2023-07-23)
56+
[2] Second reference. [Online]. Available: https://jfhr.de/reference-archive/example.html (Accessed: 2023-07-23)
57+
[3] Third reference. [Online]. Available: https://jfhr.de/reference-archive/cr.html (Accessed: 2023-07-23)
58+
[4] S. DeRisi, R. Kennison and N. Twyman, The What and Whys of DOIs. doi: 10.1371/journal.pbio.0000057
59+
[5] N. Paskin, "Digital Object Identifier (DOI) System", Encyclopedia of Library and Information Sciences (3rd ed.)
60+
```
61+
62+
`reference-archive` would download the following files to your filesystem:
63+
64+
```text
65+
1.pdf # PDF file from URL
66+
2.mhtml # Single file web page from URL
67+
3.mhtml # Single file web page from URL, including client-rendered content
68+
4.pdf # PDF file with DOI from sci-hub
69+
# no 5.* - reference 5 has no URL and no DOI
70+
```
71+
72+
## Test
73+
74+
To run tests:
75+
```shell
76+
node --test
77+
```

index.js

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
import * as fs from "fs";
2+
import { extractAndSaveAllURLs } from "./lib.js";
3+
4+
/**
 * CLI entry point: `node index.js FILE TARGET`.
 * Reads the reference list from FILE and downloads every referenced
 * URL/DOI into TARGET (created if it does not exist).
 * @return {Promise<void>}
 */
async function main() {
  const inputFile = process.argv[2];
  const targetDirectory = process.argv[3];
  if (!inputFile || !targetDirectory) {
    console.error('Syntax: node index.js FILE TARGET');
    process.exitCode = -1;
    return;
  }

  const input = await fs.promises.readFile(inputFile, 'utf-8');
  // Create the *target* directory if missing. (Bug fix: this previously
  // always created './archive', ignoring the TARGET argument.)
  await fs.promises.mkdir(targetDirectory, { recursive: true });

  await extractAndSaveAllURLs(input, targetDirectory);
}

// Don't leave the entry-point promise floating: report unexpected
// failures and exit non-zero instead of crashing on an unhandled rejection.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});

lib.js

+270
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,270 @@
1+
import * as fs from "fs";
2+
import * as stream from "stream";
3+
import * as http from "http";
4+
import * as https from "https";
5+
import * as path from "path";
6+
import mimeDB from "mime-db";
7+
import { chromium } from "playwright-extra";
8+
import StealthPlugin from "puppeteer-extra-plugin-stealth";
9+
10+
// Register the stealth plugin with Playwright's Chromium build so that
// pages opened by this module are less likely to be blocked by
// anti-bot / headless-browser detection.
chromium.use(StealthPlugin());
11+
12+
/**
13+
* @typedef {import('playwright').Browser} Browser
14+
*/
15+
16+
/**
17+
* @typedef {{id: number, url: string}|{id: number, doi: string}} ParsedReference
18+
*/
19+
20+
/**
 * Open `url` in a fresh browser page and capture it as a single-file
 * MHTML snapshot via the Chrome DevTools Protocol.
 * The page is always closed, even on failure.
 * @param browser {Browser}
 * @param url {string}
 * @return {Promise<string>} the MHTML document as a string
 */
async function getPageAsMHTML(browser, url) {
  const page = await browser.newPage();
  try {
    await page.goto(url);
    await page.waitForLoadState('networkidle');
    // Wait a little longer: some pages fade content in with an animation.
    await page.waitForTimeout(1000);
    const cdp = await page.context().newCDPSession(page);
    const snapshot = await cdp.send('Page.captureSnapshot', { format: 'mhtml' });
    return snapshot.data;
  } finally {
    await page.close();
  }
}
39+
40+
/**
 * Download the paper with the given DOI from sci-hub and save it as a PDF.
 * Renders the sci-hub result page to PDF after clicking its "save" button.
 * (Doc fix: this resolves with no value, not a string as previously documented.)
 * @param browser {Browser}
 * @param doi {string}
 * @param filename {string} target path for the PDF
 * @return {Promise<void>}
 * @throws {Error} on timeout (likely a sci-hub captcha), or any other
 *         navigation/render failure
 */
async function saveDOIAsPDFFromSciHub(browser, doi, filename) {
  const url = `https://sci-hub.st/${doi}`;
  const page = await browser.newPage();
  try {
    await page.goto(url);
    await page.waitForLoadState('domcontentloaded');
    await page.locator('button', { hasText: 'save' }).click();
    await page.pdf({ path: filename });
  } catch (e) {
    if (e.name === 'TimeoutError') {
      // Keep the original failure attached as `cause` for debugging.
      throw new Error(`TimeoutError while accessing sci-hub - this might be because of a captcha.
Alternatively, visit https://sci-hub.st/${doi} and download the file manually.`, { cause: e });
    }
    throw e;
  } finally {
    await page.close();
  }
}
64+
65+
/**
 * Capture the page at `url` as an MHTML snapshot and write it to `filename`.
 * @param browser {Browser}
 * @param url {string}
 * @param filename {string}
 * @return {Promise<void>}
 */
async function savePageAsMHTML(browser, url, filename) {
  const snapshot = await getPageAsMHTML(browser, url);
  await fs.promises.writeFile(filename, snapshot, 'utf-8');
}
75+
76+
/**
 * Stream the resource at `url` into a local file.
 * Supports both http: and https: URLs.
 * @param url {string}
 * @param filename {string}
 * @return {Promise<void>}
 * @throws {Error} on connection or stream failure
 */
async function downloadFile(url, filename) {
  const ws = fs.createWriteStream(filename);
  /** @type {import('http')|import('https')} */
  let httplib = https;
  if (url.startsWith('http:')) {
    httplib = http;
  }

  await /** @type {Promise<void>} */(new Promise((resolve, reject) => {
    const req = httplib.get(url, res => {
      stream.pipeline(res, ws, err => {
        if (err) {
          reject(err);
        } else {
          resolve();
        }
      });
    });
    // Bug fix: without this handler a connection-level failure (DNS error,
    // ECONNREFUSED, ...) never reaches the response callback and instead
    // emits an unhandled 'error' event, crashing the process.
    req.on('error', err => {
      ws.destroy();
      reject(err);
    });
  }));
}
101+
102+
/**
 * Persist a URL to disk, choosing the strategy from the target filename:
 * `.mhtml` targets are captured through the browser, everything else is
 * fetched directly over HTTP(S).
 * @param browser {Browser}
 * @param url {string}
 * @param filename {string}
 * @return {Promise<void>}
 */
async function savePage(browser, url, filename) {
  const wantsSnapshot = filename.endsWith('.mhtml');
  if (wantsSnapshot) {
    await savePageAsMHTML(browser, url, filename);
    return;
  }
  await downloadFile(url, filename);
}
115+
116+
/**
 * Make a HEAD request to a URL and return the media type from the
 * Content-Type response header (parameters such as `charset` are stripped).
 * (Doc fix: contrary to the previous comment, this does NOT return null on
 * failure — it throws.)
 * @param url {string}
 * @return {Promise<string>} e.g. 'text/html' or 'application/pdf'
 * @throws {Error} if the response has a non-ok status code or no
 *         Content-Type header; network failures reject via fetch itself
 */
async function getMimeTypeForURL(url) {
  const response = await fetch(url, {method: 'HEAD'});
  if (!response.ok) {
    throw new Error(`HEAD ${url} returned non-ok response status code: ${response.status}`);
  }
  const contentType = response.headers.get('content-type');
  if (contentType === null) {
    throw new Error(`HEAD ${url} returned response with status code ${response.status} and no Content-Type header`);
  }
  return contentType.split(';')[0];
}
134+
135+
/**
 * Build the target path for a downloaded reference: `<id>.<ext>` inside
 * `targetDirectory`. HTML pages get a `.mhtml` suffix (they are stored as
 * snapshots); other mime types use their first known extension from
 * mime-db; unknown types get no extension at all.
 * @param id {number}
 * @param mimeType {string}
 * @param targetDirectory {string}
 * @return {string}
 */
function getPageTargetFilename(id, mimeType, targetDirectory) {
  let suffix;
  if (mimeType === 'text/html') {
    suffix = '.mhtml';
  } else {
    const extension = mimeDB[mimeType]?.extensions?.[0];
    suffix = extension ? `.${extension}` : '';
  }
  return path.join(targetDirectory, `${id}${suffix}`);
}
151+
152+
/**
 * Check whether a string parses as a URL.
 * Equivalent to URL.canParse, which is only available in node >= 20.
 * @param url {string}
 * @return {boolean}
 */
function isValidURL(url) {
  try {
    // The URL constructor throws a TypeError on unparsable input.
    new URL(url);
  } catch {
    return false;
  }
  return true;
}
166+
167+
/**
 * Find references with http(s): URLs or DOIs in a text.
 * A reference line looks like `[<id>] ... <url-or-doi> ...`; the first URL
 * or DOI on the line wins, and lines with neither are skipped.
 * Yields objects with the reference ID and URL or DOI.
 * @param text {string}
 * @return {Generator<ParsedReference>}
 */
function *extractURLsFromText(text) {
  const re = /^\[(?<id>[0-9]+)].*?((?<url>https?:\/\/\S+)|(?<doi>10\.[0-9]+\/\S+))/gm;
  let match = null;
  while ((match = re.exec(text)) !== null) {
    // Always pass the radix, and prefer the non-coercing Number.isNaN.
    const id = Number.parseInt(match.groups.id, 10);
    if (!Number.isNaN(id)) {
      const { url, doi } = match.groups;
      if (isValidURL(url)) {
        yield {id, url};
      } else if (doi) {
        yield {id, doi};
      }
    }
  }
}
188+
189+
/**
 * Check whether `file` exists and is a regular file.
 * Any stat failure (e.g. ENOENT) is treated as "does not exist".
 * @param file {string}
 * @return {Promise<boolean>}
 */
async function fileExists(file) {
  try {
    return (await fs.promises.stat(file)).isFile();
  } catch {
    return false;
  }
}
203+
204+
/**
 * Download a single parsed reference into the target directory.
 * URL references are saved via `savePage` (skipped when the target file
 * already exists); DOI references are fetched from sci-hub as a PDF.
 * References with neither URL nor DOI are ignored.
 * @param reference {ParsedReference}
 * @param targetDirectory {string}
 * @param browser {Browser}
 * @return {Promise<void>}
 */
async function saveReference(reference, targetDirectory, browser) {
  const { id, url, doi } = reference;
  if (url) {
    const mimeType = await getMimeTypeForURL(url);
    const targetFile = getPageTargetFilename(id, mimeType, targetDirectory);
    const alreadyDownloaded = await fileExists(targetFile);
    if (alreadyDownloaded) {
      console.log(`[${id}]: ${targetFile} exists, skipping`);
      return;
    }
    await savePage(browser, url, targetFile);
    console.log(`[${id}]: ${targetFile} downloaded`);
    return;
  }
  if (doi) {
    const targetFile = getPageTargetFilename(id, 'application/pdf', targetDirectory);
    await saveDOIAsPDFFromSciHub(browser, doi, targetFile);
    console.log(`[${id}]: ${targetFile} downloaded`);
  }
}
227+
228+
/**
 * Extract all references from `input` and download each one using an
 * already-launched browser. A failure on one reference is logged and does
 * not abort the remaining downloads.
 * @param input {string}
 * @param targetDirectory {string}
 * @param browser {Browser}
 * @return {Promise<void>}
 */
async function extractAndSaveURLsWithBrowser(input, targetDirectory, browser) {
  const references = extractURLsFromText(input);
  for (const reference of references) {
    try {
      await saveReference(reference, targetDirectory, browser);
    } catch (e) {
      console.log(`[${reference.id}]: error: ${e}`);
    }
  }
}
243+
244+
/**
 * Extract IEEE-style references with URLs from the given string, and download the URL content
 * to a file in the specified directory, with the filename equal to the reference number and
 * the file extension matching the mime type of the file.
 *
 * @example
 * await extractAndSaveAllURLs(
 *     '[1] First reference. [Online]. Available: https://jfhr.de/reference-archive/example.html (Accessed: 2023-07-23)\n' +
 *     '[2] Second reference. [Online]. Available: https://jfhr.de/reference-archive/example.pdf (Accessed: 2023-07-23)',
 *     './archive/'
 * );
 * // Will create the following files:
 * // ./archive/1.mhtml
 * // ./archive/2.pdf
 *
 * @param input {string}
 * @param targetDirectory {string}
 * @return {Promise<void>}
 */
export async function extractAndSaveAllURLs(input, targetDirectory) {
  // Launch one headless browser and reuse it for every reference;
  // `.finally` guarantees it is closed even if a download throws.
  const browser = await chromium.launch();
  return extractAndSaveURLsWithBrowser(input, targetDirectory, browser)
      .finally(() => browser.close());
}

package.json

+18
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
{
2+
"name": "reference-archive",
3+
"version": "1.0.0",
4+
"description": "Extract reference URLs and DOIs from an IEEE-style reference list and download the references to the filesystem.",
5+
"main": "index.js",
6+
"scripts": {
7+
"test": "node --test"
8+
},
9+
"author": "jfhr <[email protected]>",
10+
"license": "AGPL-3.0-only",
11+
"type": "module",
12+
"dependencies": {
13+
"mime-db": "^1.52.0",
14+
"playwright": "^1.35.0",
15+
"playwright-extra": "^4.3.6",
16+
"puppeteer-extra-plugin-stealth": "^2.11.2"
17+
}
18+
}

0 commit comments

Comments
 (0)