The module is now pure ESM and cannot be loaded with require() from a CommonJS module. Read about ESM modules and how to migrate from CommonJS to ESM.
If you're using the request object to customize headers, query params, encoding, etc., it may require some changes (however, most of the options remain the same). Please follow this migration guide to got.
If you're using scrape.defaults, you now need to import them with:
import defaultOptions from 'website-scraper/defaultOptions';
If you're using scrape.plugins, you now need to import them with:
import * as plugins from 'website-scraper/plugins';
Create a plugin class which adds the saveResource action:
// before: the saver was supplied as a class through the resourceSaver option,
// exposing saveResource and errorCleanup methods
scrape({
resourceSaver: class MyResourceSaver {
saveResource (resource) {/* code to save file where you need */}
errorCleanup (err) {/* code to remove all previously saved files in case of error */}
}
})
// after
/**
 * Plugin that hooks a custom handler into the saveResource action.
 */
class CustomSaveResourcePlugin {
  /**
   * @param {Function} registerAction - invoked with the action name and its handler.
   */
  apply(registerAction) {
    const saveResource = ({ resource }) => {
      /* code to save file where you need */
    };
    registerAction('saveResource', saveResource);
  }
}
// Pass an instance of the plugin via the plugins option.
scrape({
plugins: [ new CustomSaveResourcePlugin() ]
})
Create a plugin class which adds the getReference action:
// before: the updateSources option was set to false
scrape({
updateSources: false
})
// after
/**
 * Plugin whose getReference handler always answers with a null reference.
 */
class MyGetReferencePlugin {
  /**
   * @param {Function} registerAction - invoked with the action name and its handler.
   */
  apply(registerAction) {
    const nullReference = () => {
      return { reference: null };
    };
    registerAction('getReference', nullReference);
  }
}
// Pass an instance of the plugin via the plugins option.
scrape({
plugins: [ new MyGetReferencePlugin() ]
})
Create a plugin class which adds the getReference action:
// before: the updateMissingSources option was set to true
scrape({
updateMissingSources: true
})
// after
/**
 * Plugin that supplies the reference for a link via the getReference action.
 *
 * The handler always returns an object of the shape `{ reference }`.
 */
class MyGetReferencePlugin {
  /**
   * @param {Function} registerAction - invoked with the action name and its handler.
   */
  apply(registerAction) {
    registerAction('getReference', ({ resource, parentResource, originalReference }) => {
      // No resource for this link: fall back to getAbsoluteUrl on the original reference.
      if (!resource) {
        return { reference: getAbsoluteUrl(parentResource, originalReference) };
      }
      // Wrap the relative path in the same { reference } shape as the branch
      // above, so both paths return a consistent result.
      return {
        reference: getRelativePath(parentResource.getFilename(), resource.getFilename()),
      };
    });
  }
}
// Pass an instance of the plugin via the plugins option.
scrape({
plugins: [ new MyGetReferencePlugin() ]
})
This applies to functions only; if you use the string byType or byStructure, you don't need to do anything.
Create a plugin class which adds the generateFilename action:
// before: filenames came from a function passed as the filenameGenerator option
scrape({
filenameGenerator: (resource, options, occupiedFileNames) => {
return crypto.randomBytes(20).toString('hex');
}
})
// after
/**
 * Plugin that answers the generateFilename action with a random name:
 * 20 random bytes rendered as a hex string.
 */
class MyGenerateFilenamePlugin {
  /**
   * @param {Function} registerAction - invoked with the action name and its handler.
   */
  apply(registerAction) {
    const randomFilename = ({ resource }) => {
      const filename = crypto.randomBytes(20).toString('hex');
      return { filename };
    };
    registerAction('generateFilename', randomFilename);
  }
}
// Pass an instance of the plugin via the plugins option.
scrape({
plugins: [ new MyGenerateFilenamePlugin() ]
})
Create a plugin class which adds the afterResponse action:
// before: responses were handled by a function passed as httpResponseHandler,
// which rejected on a 404 status and otherwise resolved with the response body
scrape({
httpResponseHandler: (response) => {
if (response.statusCode === 404) {
return Promise.reject(new Error('status is 404'));
} else {
return Promise.resolve(response.body);
}
}
})
// after
/**
 * Plugin whose afterResponse handler returns null for a 404 status and the
 * response body for every other status.
 */
class MyAfterResponsePlugin {
  /**
   * @param {Function} registerAction - invoked with the action name and its handler.
   */
  apply(registerAction) {
    const handleResponse = ({ response }) => {
      const isNotFound = response.statusCode === 404;
      return isNotFound ? null : response.body;
    };
    registerAction('afterResponse', handleResponse);
  }
}
// Pass an instance of the plugin via the plugins option.
scrape({
plugins: [ new MyAfterResponsePlugin() ]
})
This applies to functions only; if you use a static request object, you don't need to do anything.
Create a plugin class which adds the beforeRequest action:
// before: the request option was a function of the resource returning an object with qs
scrape({
request: resource => ({qs: {myParam: 123}})
})
// after
/**
 * Plugin whose beforeRequest handler returns request options carrying a
 * fixed query-string parameter.
 */
class MyBeforeRequestPlugin {
  /**
   * @param {Function} registerAction - invoked with the action name and its handler.
   */
  apply(registerAction) {
    const withQuery = ({ resource, requestOptions }) => {
      // The incoming requestOptions are not merged in; a fresh object is returned.
      const qs = { myParam: 123 };
      return { requestOptions: { qs } };
    };
    registerAction('beforeRequest', withQuery);
  }
}
// Pass an instance of the plugin via the plugins option.
scrape({
plugins: [ new MyBeforeRequestPlugin() ]
})
Create a plugin class which adds the onResourceSaved and onResourceError actions:
// before: logging callbacks were passed as the onResourceSaved and onResourceError options
scrape({
onResourceSaved: (resource) => {
console.log(`Resource ${resource} was saved to fs`);
},
onResourceError: (resource, err) => {
console.log(`Resource ${resource} was not saved because of ${err}`);
}
})
// after
/**
 * Plugin that logs a message for the onResourceSaved and onResourceError actions.
 */
class MyPlugin {
  /**
   * @param {Function} registerAction - invoked once per action with its name and handler.
   */
  apply(registerAction) {
    // Insertion order of the keys fixes the registration order: saved, then error.
    const handlers = {
      onResourceSaved: ({ resource }) => console.log(`Resource ${resource.url} saved!`),
      onResourceError: ({ resource, error }) => console.log(`Resource ${resource.url} has error ${error}`),
    };
    for (const [action, handler] of Object.entries(handlers)) {
      registerAction(action, handler);
    }
  }
}
// Pass an instance of the plugin via the plugins option.
scrape({
plugins: [ new MyPlugin() ]
})