generated from ddev/ddev-addon-template
-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathspidergram.config.yaml
49 lines (47 loc) · 1.92 KB
/
spidergram.config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
---
# The included Docker Compose file spins up a no-authentication instance
# of ArangoDB — it's simple to use for local crawling, but it's a good
# idea to set up a "real" server, even if it's running locally, once
# you're done kicking the tires.
arango:
  databaseName: db
  url: https://spidergram.ddev.site:8529
# These options control the behavior of the Spider when it's actually
# crawling pages and searching them for new links. In a simple config
# file, you can change settings. In a .js or .ts configuration script,
# you can pass in custom URL filtering and response handling functions
# for more control.
spider:
  # NOTE(review): presumably the MIME types the spider saves as downloads
  # and the User-Agent string it sends with requests; confirm against the
  # Spidergram documentation.
  downloadMimeTypes:
    - application/pdf
  userAgent: MyCustomSpider

  # Links will be labeled with these categories based on
  # the section of the page they're found in. A region is a
  # name plus a CSS selector defining the region. By default,
  # the 'regions' property is empty and saved links are unlabeled.
  # NOTE(review): the regions below are given as a plain list, so each
  # entry presumably serves as both the region name and its selector;
  # confirm that the list form is accepted.
  urlOptions:
    regions:
      - header
      - footer
      - body
# Spidergram uses a global URL normalizer to ensure that the same
# rules are applied consistently and pages aren't re-visited needlessly.
# The default URL normalizer is configurable, but a custom function can be
# passed in instead of this settings object when using a .js or .ts
# configuration script.
urlNormalizer:
  # The commented-out entries are further normalizer settings, shown with
  # example values; uncomment a line to set it.
  # forceProtocol: "https:"
  # forceLowercase: "host"
  # discardSubdomain: "ww*"
  # discardAnchor: true
  # discardAuth: true
  # discardIndex: "**/{index,default}.{htm,html,aspx,php}"
  # discardSearch: "!{page,p}"
  # sortSearchParams: true
  discardTrailingSlash: true  # false by default