## ================== Crawler Configuration - Elasticsearch ====================
#
## Crawler configuration settings. One configuration file can be used to
## define one crawler/crawl job
#
## NOTE: Most Crawler configuration settings come with reasonable defaults.
## Before adjusting the configuration, make sure you understand what you
## are trying to accomplish and the consequences.
#
## ------------------------------- Crawler ------------------------------------
#
## The domain(s) that Crawler will crawl. This is an array. All domains in this
## array will be crawled concurrently by the Crawler with a shared output.
## They are separated to allow for domain-specific configurations.
#
#domains:
#  - url: http://localhost:8000       # The base URL for this domain
#    seed_urls:                       # The entry point(s) for crawl jobs
#      - http://localhost:8000/foo
#      - http://localhost:8000/bar
#    sitemap_urls:                    # The location(s) of sitemap files
#      - http://localhost:8000/sitemap.xml
#
#    # An array of crawl rules
#    # See docs/features/CRAWL_RULES.md for more details on this feature
#    crawl_rules:
#      - policy: deny                 # the policy for this rule, either: allow | deny
#        type: begins                 # the type of rule, any of: begins | ends | contains | regex
#        pattern: /blog               # the pattern string for the rule
#
#    # An array of content extraction rules
#    # See docs/features/EXTRACTION_RULES.md for more details on this feature
#    extraction_rulesets:
#      - url_filters:
#          - type: begins             # Filter type, can be: begins | ends | contains | regex
#            pattern: /blog           # The pattern for the filter
#        rules:
#          - action: extract          # Rule action, can be: extract | set
#            field_name: author       # The ES doc field to add the value to
#            selector: .author        # CSS or XPATH selector if source is `html`, regexp if source is `url`
#            join_as: array           # How to concatenate multiple values, can be: array | string
#            value: yes               # The value to use, only applicable if action is `set`
#            source: html             # The source to extract from, can be: html | url
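#
#          # An additional illustrative rule showing the `set` action, which
#          # assigns a fixed value rather than extracting one from the page.
#          # The field name, selector, and value below are placeholders:
#          - action: set
#            field_name: category
#            selector: .blog-post
#            value: blog
#            source: html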
#
## Where to send the results. Possible values are console, file, or elasticsearch
#output_sink: elasticsearch
#
## Elasticsearch index name to ingest crawl results into. Required if output_sink is elasticsearch
#output_index: my-index
#
## Local directory to output crawl results. Required if output_sink is file
#output_dir: output/local-site
#
## The maximum depth that Crawler will follow links to.
#max_crawl_depth: 2
#
## Whether or not the crawler should purge outdated documents after completing a crawl. Defaults to true
#purge_crawl_enabled: true
#
## Whether or not to include the full HTML in the crawl result. Enabling full HTML extraction can
## dramatically increase the index size if the site being crawled is large. Defaults to false.
#full_html_extraction_enabled: false
#
## Scheduling using cron expressions
#schedule:
#  pattern: "0 12 * * *" # every day at noon
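#
## A couple of additional illustrative patterns (standard five-field cron syntax):
##   "*/30 * * * *"   # every 30 minutes
##   "0 0 * * 0"      # every Sunday at midnight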
#
## Crawl result field size limits
#max_title_size: 1000
#max_body_size: 5_242_880 # 5 megabytes
#max_keywords_size: 512
#max_description_size: 512
#max_indexed_links_count: 10
#max_headings_count: 10
#
## ------------------------------- Crawler - Advanced --------------------------
#
## Proxy configurations.
#http_proxy_host: localhost
#http_proxy_port: 8888
#http_proxy_protocol: http
#http_proxy_username: ent-search
#http_proxy_password: changeme
#loopback_allowed: true
#ssl_verification_mode: none
#
## Authentication configurations.
## Only required if a site has some form of authentication.
#auth.domain: https://my-auth-domain.com
#auth.type: basic
#auth.username: user
#auth.password: pass
#
## Whether document metadata from certain content types will be indexed or not.
## This does not allow binary content to be indexed from these files, only metadata.
## See docs/features/BINARY_CONTENT_EXTRACTION.md for more details.
#binary_content_extraction_enabled: true
#binary_content_extraction_mime_types:
# - application/pdf
# - application/msword
# - application/vnd.openxmlformats-officedocument.wordprocessingml.document
# - application/vnd.ms-powerpoint
# - application/vnd.openxmlformats-officedocument.presentationml.presentation
#
## ------------------------------- Logging -------------------------------------
#
## The log level for system logs. Defaults to `info`
#log_level: info
#
## Whether or not event logging is enabled for output to the shell running Crawler.
## Event logs are incredibly noisy but highly granular, which can be useful
## for debugging failing crawls.
## Defaults to `false`
#event_logs: false
#
## ------------------------------- Elasticsearch -------------------------------
#
## Elasticsearch connection settings for this specific crawler/crawl job.
## See elasticsearch.yml.example for detailed configurations.
##
#elasticsearch:
#  host: http://localhost
#  port: 9200
#  username: elastic
#  password: changeme
#  api_key: 1234
#  pipeline: ent-search-generic-ingestion
#  pipeline_enabled: true
#  pipeline_params:
#    _reduce_whitespace: true
#    _run_ml_inference: true
#    _extract_binary_content: true
#  bulk_api:
#    max_items: 10
#    max_size_bytes: 1_048_576
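#
## ------------------------------- Example -------------------------------------
#
## An illustrative minimal configuration that combines only the settings shown
## above. The URLs, index name, and API key are placeholders; adjust them for
## your environment before uncommenting.
#
#domains:
#  - url: https://example.com
#    seed_urls:
#      - https://example.com/
#
#output_sink: elasticsearch
#output_index: example-index
#
#elasticsearch:
#  host: http://localhost
#  port: 9200
#  api_key: 1234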