-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathes-conf.yaml
89 lines (75 loc) · 3.23 KB
/
es-conf.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# configuration for Elasticsearch resources
# cf. https://github.com/DigitalPebble/storm-crawler/tree/master/external/elasticsearch/archetype/target/classes/archetype-resources/es-conf.yaml
config:
  # ES indexer bolt
  # addresses can be specified as a full URL;
  # if not, we assume that the protocol is http and the port 9200
  es.indexer.addresses: "http://elasticsearch:9200"
  es.indexer.index.name: "content"
  # es.indexer.pipeline: "_PIPELINE_"
  es.indexer.create: false
  es.indexer.bulkActions: 200
  es.indexer.flushInterval: "2s"
  es.indexer.concurrentRequests: 1

  # ES metricsConsumer
  es.metrics.addresses: "http://elasticsearch:9200"
  es.metrics.index.name: "metrics"

  # ES spout and persistence bolt
  es.status.addresses: "http://elasticsearch:9200"
  es.status.index.name: "status"
  # es.status.user: "USERNAME"
  # es.status.password: "PASSWORD"
  # the routing is done on the value of 'partition.url.mode'
  es.status.routing: true
  # stores the value used for grouping the URLs as a separate field
  # needed by the spout implementations
  # also used for routing if the value above is set to true
  es.status.routing.fieldname: "metadata.hostname"
  es.status.bulkActions: 800
  es.status.flushInterval: "5s"
  es.status.concurrentRequests: 1

  # Note: the warc-crawler topology does not include a status spout reading
  # from Elasticsearch. The spout configuration was nevertheless not removed
  # here.

  ################
  # spout config #
  ################

  # positive or negative filters parsable by the Lucene Query Parser
  # es.status.filterQuery:
  #   - "-(key:stormcrawler.net)"
  #   - "-(key:digitalpebble.com)"

  # time in secs for which the URLs will be considered for fetching after
  # an ack or fail
  spout.ttl.purgatory: 30

  # Min time (in msecs) to allow between 2 successive queries to ES
  spout.min.delay.queries: 2000

  # Delay since previous query date (in secs) after which the nextFetchDate
  # value will be reset to the current time.
  # Setting this to -1 or a large value means that the ES will cache the
  # results but also that fewer and fewer results might be returned.
  spout.reset.fetchdate.after: 120

  es.status.max.buckets: 50
  es.status.max.urls.per.bucket: 2
  # field to group the URLs into buckets
  es.status.bucket.field: "metadata.hostname"
  # fields to sort the URLs within a bucket
  es.status.bucket.sort.field:
    - "nextFetchDate"
    - "url"
  # field to sort the buckets
  es.status.global.sort.field: "nextFetchDate"

  # CollapsingSpout : limits the deep paging by resetting the start offset
  # for the ES query
  es.status.max.start.offset: 500

  # AggregationSpout : sampling improves the performance on large crawls
  es.status.sample: false

  # max allowed duration of a query in sec
  es.status.query.timeout: -1

  # AggregationSpout (expert): adds this value in mins to the latest date
  # returned in the results and uses it as nextFetchDate
  es.status.recentDate.increase: -1
  es.status.recentDate.min.gap: -1

  # each list entry is one consumer; 'parallelism.hint' belongs to the same
  # entry as 'class'
  topology.metrics.consumer.register:
    - class: "com.digitalpebble.stormcrawler.elasticsearch.metrics.MetricsConsumer"
      parallelism.hint: 1
      # whitelist:
      #   - "fetcher_counter"
      #   - "fetcher_average.bytes_fetched"
      # blacklist:
      #   - "__receive.*"