# topology/warc-crawler-index-elasticsearch/es-conf.yaml

# configuration for Elasticsearch resources
# cf. https://github.com/DigitalPebble/storm-crawler/tree/master/external/elasticsearch/archetype/target/classes/archetype-resources/es-conf.yaml
  
config:
  # ES indexer bolt
  # addresses can be specified as a full URL;
  # if not, we assume that the protocol is http and the port 9200
  es.indexer.addresses: "http://elasticsearch:9200"
  es.indexer.index.name: "content"
  # es.indexer.pipeline: "_PIPELINE_"
  es.indexer.create: false
  es.indexer.bulkActions: 200
  es.indexer.flushInterval: "2s"
  es.indexer.concurrentRequests: 1

  # ES metricsConsumer
  es.metrics.addresses: "http://elasticsearch:9200"
  es.metrics.index.name: "metrics"

  # ES spout and persistence bolt
  es.status.addresses: "http://elasticsearch:9200"
  es.status.index.name: "status"
  # es.status.user: "USERNAME"
  # es.status.password: "PASSWORD"
  # the routing is done on the value of 'partition.url.mode'
  es.status.routing: true
  # stores the value used for grouping the URLs as a separate field
  # needed by the spout implementations
  # also used for routing if the value above is set to true
  es.status.routing.fieldname: "metadata.hostname"
  es.status.bulkActions: 800
  es.status.flushInterval: "5s"
  es.status.concurrentRequests: 1

  # Note: the warc-crawler topology does not include a status spout reading
  #       from Elasticsearch. The spout configuration was not removed here
  #       anyway.
  ################
  # spout config #
  ################

  # positive or negative filters parsable by the Lucene Query Parser
  # es.status.filterQuery:
  #   - "-(key:stormcrawler.net)"
  #   - "-(key:digitalpebble.com)"

  # time in secs for which the URLs will be considered for fetching
  # after an ack or fail
  spout.ttl.purgatory: 30

  # Min time (in msecs) to allow between 2 successive queries to ES
  spout.min.delay.queries: 2000

  # Delay since previous query date (in secs) after which the nextFetchDate
  # value will be reset to the current time.
  # Setting this to -1 or a large value means that ES will cache the results
  # but also that fewer and fewer results might be returned.
  spout.reset.fetchdate.after: 120

  es.status.max.buckets: 50
  es.status.max.urls.per.bucket: 2
  # field to group the URLs into buckets
  es.status.bucket.field: "metadata.hostname"
  # fields to sort the URLs within a bucket
  es.status.bucket.sort.field:
    - "nextFetchDate"
    - "url"
  # field to sort the buckets
  es.status.global.sort.field: "nextFetchDate"

  # CollapsingSpout: limits the deep paging by resetting the start offset
  # for the ES query
  es.status.max.start.offset: 500

  # AggregationSpout: sampling improves the performance on large crawls
  es.status.sample: false

  # max allowed duration of a query in sec
  es.status.query.timeout: -1

  # AggregationSpout (expert): adds this value in mins to the latest date
  # returned in the results and uses it as nextFetchDate
  es.status.recentDate.increase: -1
  es.status.recentDate.min.gap: -1

  topology.metrics.consumer.register:
    - class: "com.digitalpebble.stormcrawler.elasticsearch.metrics.MetricsConsumer"
      parallelism.hint: 1
      # whitelist:
      #   - "fetcher_counter"
      #   - "fetcher_average.bytes_fetched"
      # blacklist:
      #   - "__receive.*"