Skip to content

Commit 291013e

Browse files
committed
apache#620 fix java topologies
1 parent c581b91 commit 291013e

File tree

2 files changed

+127
-0
lines changed

2 files changed

+127
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to you under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package ${package};
18+
19+
import org.apache.storm.topology.TopologyBuilder;
20+
import org.apache.storm.tuple.Fields;
21+
import org.apache.stormcrawler.ConfigurableTopology;
22+
import org.apache.stormcrawler.Constants;
23+
import org.apache.stormcrawler.bolt.FetcherBolt;
24+
import org.apache.stormcrawler.bolt.JSoupParserBolt;
25+
import org.apache.stormcrawler.bolt.SiteMapParserBolt;
26+
import org.apache.stormcrawler.bolt.URLPartitionerBolt;
27+
import org.apache.stormcrawler.solr.bolt.DeletionBolt;
28+
import org.apache.stormcrawler.solr.bolt.IndexerBolt;
29+
import org.apache.stormcrawler.solr.metrics.MetricsConsumer;
30+
import org.apache.stormcrawler.solr.persistence.SolrSpout;
31+
import org.apache.stormcrawler.solr.persistence.StatusUpdaterBolt;
32+
33+
/** Dummy topology to play with the spouts and bolts on Solr */
34+
public class CrawlTopology extends ConfigurableTopology {
35+
36+
public static void main(String[] args) throws Exception {
37+
ConfigurableTopology.start(new CrawlTopology(), args);
38+
}
39+
40+
@Override
41+
protected int run(String[] args) {
42+
TopologyBuilder builder = new TopologyBuilder();
43+
44+
builder.setSpout("spout", new SolrSpout());
45+
46+
builder.setBolt("partitioner", new URLPartitionerBolt()).shuffleGrouping("spout");
47+
48+
builder.setBolt("fetch", new FetcherBolt())
49+
.fieldsGrouping("partitioner", new Fields("key"));
50+
51+
builder.setBolt("sitemap", new SiteMapParserBolt()).localOrShuffleGrouping("fetch");
52+
53+
builder.setBolt("parse", new JSoupParserBolt()).localOrShuffleGrouping("sitemap");
54+
55+
builder.setBolt("indexer", new IndexerBolt()).localOrShuffleGrouping("parse");
56+
57+
builder.setBolt("status", new StatusUpdaterBolt())
58+
.localOrShuffleGrouping("fetch", Constants.StatusStreamName)
59+
.localOrShuffleGrouping("sitemap", Constants.StatusStreamName)
60+
.localOrShuffleGrouping("parse", Constants.StatusStreamName)
61+
.localOrShuffleGrouping("indexer", Constants.StatusStreamName);
62+
63+
builder.setBolt("deleter", new DeletionBolt())
64+
.localOrShuffleGrouping("status", Constants.DELETION_STREAM_NAME);
65+
66+
conf.registerMetricsConsumer(MetricsConsumer.class);
67+
68+
return submit("crawl", conf, builder);
69+
}
70+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to you under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package ${package};
18+
19+
import org.apache.storm.topology.TopologyBuilder;
20+
import org.apache.storm.tuple.Fields;
21+
import org.apache.stormcrawler.ConfigurableTopology;
22+
import org.apache.stormcrawler.Constants;
23+
import org.apache.stormcrawler.solr.persistence.StatusUpdaterBolt;
24+
import org.apache.stormcrawler.spout.FileSpout;
25+
26+
/**
27+
* Topology which reads from a file containing seeds and distributes to SQS queues based on the IP /
28+
* hostname / domain of the URLs. Used in local mode to bootstrap a crawl.
29+
*/
30+
public class SeedInjector extends ConfigurableTopology {
31+
32+
public static void main(String[] args) throws Exception {
33+
ConfigurableTopology.start(new SeedInjector(), args);
34+
}
35+
36+
@Override
37+
public int run(String[] args) {
38+
39+
if (args.length == 0) {
40+
System.err.println("SeedInjector seed_dir file_filter");
41+
return -1;
42+
}
43+
44+
conf.setDebug(false);
45+
46+
TopologyBuilder builder = new TopologyBuilder();
47+
48+
builder.setSpout("spout", new FileSpout(args[0], args[1], true));
49+
50+
Fields key = new Fields("url");
51+
52+
builder.setBolt("enqueue", new StatusUpdaterBolt())
53+
.fieldsGrouping("spout", Constants.StatusStreamName, key);
54+
55+
return submit("SeedInjector", conf, builder);
56+
}
57+
}

0 commit comments

Comments
 (0)