## ================== Crawler Configuration - Elasticsearch ====================
#
## Crawler configuration settings. One configuration file can be used to
## define one crawler/crawl job
#
## NOTE: Most Crawler configuration settings come with reasonable defaults.
## Before adjusting the configuration, make sure you understand what you
## are trying to accomplish and the consequences.
#
## ------------------------------- Crawler ------------------------------------
#
## The domain(s) that Crawler will crawl. This is an array. All domains in this
## array will be crawled concurrently by the Crawler with a shared output.
## They are separated to allow for domain-specific configurations.
#
#domains:
#  - url: http://localhost:8000       # The base URL for this domain
#    seed_urls:                       # The entry point(s) for crawl jobs
#      - http://localhost:8000/foo
#      - http://localhost:8000/bar
#    sitemap_urls:                    # The location(s) of sitemap files
#      - http://localhost:8000/sitemap.xml
#
#    # An array of crawl rules
#    # See docs/features/CRAWL_RULES.md for more details on this feature
#    crawl_rules:
#      - policy: deny                 # the policy for this rule, either: allow | deny
#        type: begins                 # the type of rule, any of: begins | ends | contains | regex
#        pattern: /blog               # the pattern string for the rule
#
#    # An array of content extraction rules
#    # See docs/features/EXTRACTION_RULES.md for more details on this feature
#    extraction_rulesets:
#      - url_filters:
#          - type: begins             # Filter type, can be: begins | ends | contains | regex
#            pattern: /blog           # The pattern for the filter
#        rules:
#          - action: extract          # Rule action, can be: extract | set
#            field_name: author       # The ES doc field to add the value to
#            selector: .author        # CSS or XPATH selector if source is `html`, regexp if source is `url`
#            join_as: array           # How to concatenate multiple values, can be: array | string
#            value: yes               # The value to use, only applicable if action is `set`
#            source: html             # The source to extract from, can be: html | url
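#
#          # An additional illustrative rule showing the `set` action, which
#          # assigns a fixed value rather than extracting one from the page.
#          # The field name, selector, and value below are placeholders:
#          - action: set
#            field_name: category
#            selector: .blog-post
#            value: blog
#            source: html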
#
## Where to send the results. Possible values are console, file, or elasticsearch
#output_sink: elasticsearch
#
## Elasticsearch index name to ingest crawl results into. Required if output_sink is elasticsearch
#output_index: my-index
#
## Local directory to output crawl results. Required if output_sink is file
#output_dir: output/local-site
#
## The maximum depth that Crawler will follow links to.
#max_crawl_depth: 2
#
## Whether or not the crawler should purge outdated documents after completing a crawl. Defaults to true
#purge_crawl_enabled: true
#
## Whether or not to include the full HTML in the crawl result. Enabling full HTML extraction can
## dramatically increase the index size if the site being crawled is large. Defaults to false.
#full_html_extraction_enabled: false
#
## Scheduling using cron expressions
#schedule:
#  pattern: "0 12 * * *" # every day at noon
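#
## A couple of additional illustrative patterns (standard five-field cron syntax):
##   "*/30 * * * *"   # every 30 minutes
##   "0 0 * * 0"      # every Sunday at midnight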
#
## Crawl result field size limits
#max_title_size: 1000
#max_body_size: 5_242_880 # 5 megabytes
#max_keywords_size: 512
#max_description_size: 512
#max_indexed_links_count: 10
#max_headings_count: 10
#
## ------------------------------- Crawler - Advanced --------------------------
#
## Proxy configurations.
#http_proxy_host: localhost
#http_proxy_port: 8888
#http_proxy_protocol: http
#http_proxy_username: ent-search
#http_proxy_password: changeme
#loopback_allowed: true
#ssl_verification_mode: none
#
## Authentication configurations.
## Only required if a site has some form of authentication.
#auth.domain: https://my-auth-domain.com
#auth.type: basic
#auth.username: user
#auth.password: pass
#
## Whether document metadata from certain content types will be indexed or not.
## This does not allow binary content to be indexed from these files, only metadata.
## See docs/features/BINARY_CONTENT_EXTRACTION.md for more details.
#binary_content_extraction_enabled: true
#binary_content_extraction_mime_types:
# - application/pdf
# - application/msword
# - application/vnd.openxmlformats-officedocument.wordprocessingml.document
# - application/vnd.ms-powerpoint
# - application/vnd.openxmlformats-officedocument.presentationml.presentation
#
## ------------------------------- Logging -------------------------------------
#
## The log level for system logs. Defaults to `info`
#log_level: info
#
## Whether or not event logging is enabled for output to the shell running Crawler.
## Event logs are incredibly noisy but highly granular, which can be useful
## for debugging failing crawls.
## Defaults to `false`
#event_logs: false
#
## ------------------------------- Elasticsearch -------------------------------
#
## Elasticsearch connection settings for this specific crawler/crawl job.
## See elasticsearch.yml.example for detailed configurations.
##
#elasticsearch:
#  host: http://localhost
#  port: 9200
#  username: elastic
#  password: changeme
#  api_key: 1234
#  pipeline: ent-search-generic-ingestion
#  pipeline_enabled: true
#  pipeline_params:
#    _reduce_whitespace: true
#    _run_ml_inference: true
#    _extract_binary_content: true
#  bulk_api:
#    max_items: 10
#    max_size_bytes: 1_048_576
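#
## ------------------------------- Example -------------------------------------
#
## An illustrative minimal configuration that combines only the settings shown
## above. The URLs, index name, and API key are placeholders; adjust them for
## your environment before uncommenting.
#
#domains:
#  - url: https://example.com
#    seed_urls:
#      - https://example.com/
#
#output_sink: elasticsearch
#output_index: example-index
#
#elasticsearch:
#  host: http://localhost
#  port: 9200
#  api_key: 1234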