This repository was archived by the owner on Mar 8, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdefault.yml
61 lines (55 loc) · 1.61 KB
/
default.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# Pulse data-mining configuration.
# NOTE(review): the nesting below was reconstructed from a copy whose
# indentation had been lost; confirm that `middlewares` and `html` belong
# directly under `pulse` in the schema the loader expects.
pulse:
  # MongoDB connection settings.
  mongo:
    database: scrapping
    uri: "mongodb://localhost:27017"
  # Middlewares to enable (work in progress).
  middlewares:
    - StoreRequest
    - StoreResponse
  # HTML extraction rules. Each rule names a target `collection`, matches
  # elements by `tag` or by CSS `selector`, and reads the `attr` and
  # `context-attr` attributes from each match.
  html:
    - collection: "images"
      tag: "img"
      attr: "src"
      context-attr: "alt"
    - collection: "images-test"
      selector: "img[data-src]"
      attr: "data-src"
      context-attr: "alt"
# Crawling configuration and behavior. Many of the options below come from
# colly; see https://godoc.org/github.com/gocolly/colly#Collector for more
# information.
# NOTE(review): the nesting below was reconstructed from a copy whose
# indentation had been lost; confirm it against the schema the loader expects.
crawler:
  allow-url-revisit: false
  # Empty list = no entries. To add domains, replace `[]` with a block list:
  #   allowed-domains:
  #     - www.google.com
  allowed-domains: []
  async: true
  detect-charset: false
  # Empty list = no entries. To add domains, replace `[]` with a block list:
  #   disallowed-domains:
  #     - www.google.com
  disallowed-domains: []
  ignore-robots-txt: false
  # Rate-limiting rule; the keys mirror the fields of colly's LimitRule.
  limit:
    delay: 0
    domain-regexp: ""
    domain-glob: "*"
    random-delay: 0
    parallelism: 0  # 0 = use the maximum number of OS CPUs
  max-body-size: 10485760  # 10 MiB (10 * 1024 * 1024 bytes)
  max-depth: 0
  max-url-visited: 50
  random-user-agents: true
  sleep-between-request: 0
  # This option configures colly's internal storage. By default, colly uses
  # in-memory storage. If you want to run multiple instances of pulse, you
  # may want them to share the same storage instance.
  # See http://go-colly.org/docs/best_practices/storage/ for more information.
  # storage:
  #   clear-on-start: true
  #   redis:
  #     address: "127.0.0.1:6379"
  #     password: ""
  #     db: 0
  #     prefix: ""
  #   mongodb:  # not supported yet
  #     database: "pulse"
  #     uri: "mongodb://127.0.0.1:27017"
  user-agents:
    - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.0129.115 Safari/537.36"