# es-conf.yaml

# configuration for Elasticsearch resources
  
config:
  # ES metricsConsumer
  # NOTE(review): presumably read by the MetricsConsumer class registered
  # under 'topology.metrics.consumer.register' - confirm against that class.

  # where to connect
  es.metrics.addresses: "localhost"
  es.metrics.settings:
    cluster.name: "elasticsearch"

  # name of the index used for the metrics
  es.metrics.index.name: "metrics"
  
  # ES indexer

  # where to connect
  es.indexer.addresses: "localhost"
  es.indexer.settings:
    cluster.name: "elasticsearch"

  # index receiving the documents
  es.indexer.index.name: "content"
  es.indexer.create: false
  # optional, disabled here
  # es.indexer.pipeline: "_PIPELINE_"

  # NOTE(review): these three look like bulk-request tuning (actions per
  # bulk, flush period, number of concurrent requests) - confirm against the
  # Elasticsearch BulkProcessor documentation.
  es.indexer.bulkActions: 100
  es.indexer.flushInterval: "1s"
  es.indexer.concurrentRequests: 2

  # ES spout and persistence bolt

  # where to connect
  es.status.addresses: "localhost"
  es.status.settings:
    cluster.name: "elasticsearch"

  # index holding the status of the URLs
  es.status.index.name: "status"

  # the routing is done on the value of 'partition.url.mode'
  es.status.routing: true
  # stores the value used for the routing as a separate field
  # needed by the spout implementations
  es.status.routing.fieldname: "key"

  # NOTE(review): these three look like bulk-request tuning (actions per
  # bulk, flush period, number of concurrent requests) - confirm against the
  # Elasticsearch BulkProcessor documentation.
  es.status.bulkActions: 500
  es.status.flushInterval: "1s"
  es.status.concurrentRequests: 5
  
  ################
  # spout config #
  ################

  # --- pacing of the queries sent to ES ---

  # Min time (in msecs) to allow between 2 successive queries to ES
  spout.min.delay.queries: 1000

  # NOTE(review): presumably the matching upper bound (also in msecs) on the
  # delay between 2 queries - confirm against the spout implementation
  spout.max.delay.queries: 10000

  # time in secs for which the URLs will be considered for fetching after an ack or fail
  spout.ttl.purgatory: 30

  # Delay since previous query date (in secs) after which the nextFetchDate value will be reset
  spout.reset.fetchdate.after: 300

  # --- grouping and sorting of the URLs ---

  # NOTE(review): presumably caps on the number of buckets and on the number
  # of URLs taken from each bucket - confirm against the spout implementation
  es.status.max.buckets: 1000
  es.status.max.urls.per.bucket: 5
  # field to group the URLs into buckets
  es.status.bucket.field: "key"
  # field to sort the URLs within a bucket
  es.status.bucket.sort.field: "nextFetchDate"
  # field to sort the buckets
  es.status.global.sort.field: "nextFetchDate"

  # --- settings specific to a spout implementation ---

  # optional, disabled here
  # es.status.recentDate.increase: 2
  # es.status.recentDate.min.gap: 2

  # AggregationSpout : sampling improves the performance on large crawls
  # es.status.sample: true

  # CollapsingSpout : limits the deep paging by resetting the start offset for the ES query
  es.status.max.start.offset: 10

  # Metrics consumers registered with the topology; one list entry per
  # consumer. Optional whitelist / blacklist filters are left commented out
  # below; remove the leading "# " to enable them.
  topology.metrics.consumer.register:
    - class: "com.digitalpebble.stormcrawler.elasticsearch.metrics.MetricsConsumer"
      parallelism.hint: 2
      # NOTE(review): looks like a date pattern handed to the consumer -
      # confirm how MetricsConsumer uses it
      argument: "yyyy-MM-dd"
      # whitelist:
      #   - "fetcher_counter"
      #   - "fetcher_average.bytes_fetched"
      # blacklist:
      #   - "__receive.*"