Commit 92b2970

https://github.com/gbif/pipelines/issues/1078
Adding CLI components for incremental table build using Iceberg
1 parent ac238be commit 92b2970

5 files changed: +264 / -5 lines changed


Diff for: gbif/coordinator/tasks/src/main/java/org/gbif/pipelines/common/process/BeamParametersBuilder.java

+18-5
@@ -14,11 +14,7 @@
 import lombok.Builder;
 import lombok.NoArgsConstructor;
 import org.gbif.api.model.pipelines.InterpretationType.RecordType;
-import org.gbif.common.messaging.api.messages.PipelinesEventsInterpretedMessage;
-import org.gbif.common.messaging.api.messages.PipelinesEventsMessage;
-import org.gbif.common.messaging.api.messages.PipelinesInterpretationMessage;
-import org.gbif.common.messaging.api.messages.PipelinesInterpretedMessage;
-import org.gbif.common.messaging.api.messages.PipelinesVerbatimMessage;
+import org.gbif.common.messaging.api.messages.*;
 import org.gbif.pipelines.common.configs.AvroWriteConfiguration;
 import org.gbif.pipelines.common.configs.ElasticsearchConfiguration;
 import org.gbif.pipelines.common.configs.IndexConfiguration;

@@ -195,6 +191,23 @@ public static BeamParameters occurrenceHdfsView(
         .putCondition(config.recordType == RecordType.EVENT, "coreRecordType", "EVENT");
   }

+  public static BeamParameters occurrenceWarehouse(
+      HdfsViewConfiguration config, PipelinesHdfsViewMessage message) {
+
+    return BeamParameters.create()
+        .putRequireNonNull("datasetId", message.getDatasetUuid())
+        .put("attempt", message.getAttempt())
+        .put("runner", "SparkRunner")
+        .putRequireNonNull("metaFileName", config.metaFileName)
+        .putRequireNonNull("inputPath", config.stepConfig.repositoryPath)
+        .putRequireNonNull("targetPath", config.repositoryTargetPath)
+        .putRequireNonNull("hdfsSiteConfig", config.stepConfig.hdfsSiteConfig)
+        .putRequireNonNull("coreSiteConfig", config.stepConfig.coreSiteConfig)
+        .putRequireNonNull("properties", config.pipelinesConfig)
+        .put("experiments", "use_deprecated_read")
+        .putCondition(config.recordType == RecordType.EVENT, "coreRecordType", "EVENT");
+  }
+
   public static BeamParameters eventInterpretation(
       EventsInterpretationConfiguration config,
       PipelinesEventsMessage message,
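
For orientation, below is a minimal, self-contained sketch of how key/value parameters like the ones above are commonly rendered into the --key=value arguments a Beam pipeline expects. It deliberately does not use the internal BeamParameters class (putRequireNonNull/putCondition are pipelines-internal), and the map contents are hypothetical placeholders, not values from this commit.

import java.util.LinkedHashMap;
import java.util.Map;

public class WarehouseArgsSketch {
  public static void main(String[] args) {
    // Hypothetical values; the real ones come from HdfsViewConfiguration and the incoming message
    Map<String, String> params = new LinkedHashMap<>();
    params.put("datasetId", "7a25f7aa-0000-0000-0000-000000000000");
    params.put("attempt", "1");
    params.put("runner", "SparkRunner");
    params.put("experiments", "use_deprecated_read");

    // Render as --key=value arguments, the form accepted by Beam's PipelineOptionsFactory.fromArgs
    String[] beamArgs =
        params.entrySet().stream()
            .map(e -> "--" + e.getKey() + "=" + e.getValue())
            .toArray(String[]::new);

    for (String arg : beamArgs) {
      System.out.println(arg);
    }
  }
}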
Diff for: DataWarehouseCallback.java (new file)

+124

@@ -0,0 +1,124 @@
package org.gbif.pipelines.tasks.occurrences.warehouse;

import com.google.common.base.Strings;
import lombok.Builder;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.gbif.common.messaging.AbstractMessageCallback;
import org.gbif.common.messaging.api.MessagePublisher;
import org.gbif.common.messaging.api.messages.PipelinesHdfsViewMessage;
import org.gbif.common.messaging.api.messages.PipelinesInterpretationMessage;
import org.gbif.common.messaging.api.messages.PipelinesInterpretedMessage;
import org.gbif.pipelines.common.PipelinesVariables;
import org.gbif.pipelines.common.airflow.AppName;
import org.gbif.pipelines.common.process.AirflowSparkLauncher;
import org.gbif.pipelines.common.process.BeamParametersBuilder;
import org.gbif.pipelines.common.process.RecordCountReader;
import org.gbif.pipelines.common.process.SparkDynamicSettings;
import org.gbif.pipelines.tasks.PipelinesCallback;
import org.gbif.pipelines.tasks.StepHandler;
import org.gbif.pipelines.tasks.common.hdfs.CommonHdfsViewCallback;
import org.gbif.pipelines.tasks.common.hdfs.HdfsViewConfiguration;
import org.gbif.pipelines.tasks.verbatims.dwca.DwcaToAvroConfiguration;
import org.gbif.registry.ws.client.DatasetClient;
import org.gbif.registry.ws.client.pipelines.PipelinesHistoryClient;

/** Callback which is called when an instance of {@link PipelinesHdfsViewMessage} is received. */
@Slf4j
@Builder
public class DataWarehouseCallback extends AbstractMessageCallback<PipelinesHdfsViewMessage>
    implements StepHandler<PipelinesHdfsViewMessage, DataWarehouseFinishMessage> {

  protected final HdfsViewConfiguration config;
  private final MessagePublisher publisher;
  private final PipelinesHistoryClient historyClient;
  private final DatasetClient datasetClient;
  private final CommonHdfsViewCallback commonHdfsViewCallback;

  @Override
  public void handleMessage(PipelinesHdfsViewMessage message) {
    PipelinesCallback.<PipelinesHdfsViewMessage, DataWarehouseFinishMessage>builder()
        .historyClient(historyClient)
        .datasetClient(datasetClient)
        .config(config)
        .stepType(config.stepType)
        .publisher(publisher)
        .message(message)
        .handler(this)
        .build()
        .handleMessage();
  }

  @Override
  public String getRouting() {
    return new PipelinesInterpretedMessage().setRunner(config.processRunner).getRoutingKey();
  }

  /** Main message processing logic: submits a distributed Spark job that builds the warehouse table. */
  @Override
  public Runnable createRunnable(PipelinesHdfsViewMessage message) {
    return () -> {
      BeamParametersBuilder.BeamParameters beamParameters =
          BeamParametersBuilder.occurrenceWarehouse(config, message);
      runDistributed(message, beamParameters);
    };
  }

  @SneakyThrows
  private void runDistributed(
      PipelinesHdfsViewMessage message, BeamParametersBuilder.BeamParameters beamParameters) {

    long recordsNumber =
        RecordCountReader.builder()
            .stepConfig(config.stepConfig)
            .datasetKey(message.getDatasetUuid().toString())
            .attempt(message.getAttempt().toString())
            .metaFileName(new DwcaToAvroConfiguration().metaFileName)
            .metricName(PipelinesVariables.Metrics.ARCHIVE_TO_OCC_COUNT)
            .build()
            .get();

    log.info("Calculating job settings based on {} records", recordsNumber);
    boolean useMemoryExtraCoef =
        config.sparkConfig.extraCoefDatasetSet.contains(message.getDatasetUuid().toString());
    SparkDynamicSettings sparkDynamicSettings =
        SparkDynamicSettings.create(config.sparkConfig, recordsNumber, useMemoryExtraCoef);

    // App name
    String sparkAppName =
        AppName.get(config.stepType, message.getDatasetUuid(), message.getAttempt());

    // Submit
    AirflowSparkLauncher.builder()
        .airflowConfiguration(config.airflowConfig)
        .sparkStaticConfiguration(config.sparkConfig)
        .sparkDynamicSettings(sparkDynamicSettings)
        .beamParameters(beamParameters)
        .sparkAppName(sparkAppName)
        .build()
        .submitAwaitVoid();
  }

  @Override
  public DataWarehouseFinishMessage createOutgoingMessage(PipelinesHdfsViewMessage message) {
    return new DataWarehouseFinishMessage(
        message.getDatasetUuid(), message.getAttempt(), message.getPipelineSteps(), null, null);
  }

  /**
   * Only correct messages can be handled; for now, only messages with the same runner as the
   * runner in the service config {@link HdfsViewConfiguration#processRunner} are handled.
   */
  @Override
  public boolean isMessageCorrect(PipelinesHdfsViewMessage message) {
    if (Strings.isNullOrEmpty(message.getRunner())) {
      throw new IllegalArgumentException("Runner can't be null or empty " + message);
    }

    if (!config.processRunner.equals(message.getRunner())) {
      log.warn("Skipping, because runner is incorrect");
      return false;
    }
    return true;
  }
}
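
As a standalone illustration of the isMessageCorrect contract above, here is a sketch that assumes a simplified stand-in message class instead of PipelinesHdfsViewMessage and uses illustrative runner values; it is not the production class.

public class RunnerCheckSketch {

  // Stand-in for the real message type; only the runner field matters for this check
  static class Msg {
    final String runner;

    Msg(String runner) {
      this.runner = runner;
    }
  }

  // Mirrors the callback logic: reject blank runners, handle only a matching runner
  static boolean isMessageCorrect(Msg message, String configuredRunner) {
    if (message.runner == null || message.runner.isEmpty()) {
      throw new IllegalArgumentException("Runner can't be null or empty");
    }
    return configuredRunner.equals(message.runner);
  }

  public static void main(String[] args) {
    System.out.println(isMessageCorrect(new Msg("DISTRIBUTED"), "DISTRIBUTED")); // true -> handled
    System.out.println(isMessageCorrect(new Msg("STANDALONE"), "DISTRIBUTED")); // false -> skipped
  }
}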
Diff for: DataWarehouseCommand.java (new file)

+31

@@ -0,0 +1,31 @@
package org.gbif.pipelines.tasks.occurrences.warehouse;

import com.google.common.util.concurrent.Service;
import org.gbif.cli.Command;
import org.gbif.cli.service.ServiceCommand;
import org.gbif.pipelines.tasks.common.hdfs.HdfsViewConfiguration;
import org.kohsuke.MetaInfServices;

/**
 * Entry class for the CLI command that starts the data warehouse service. The service listens to
 * the {@link org.gbif.common.messaging.api.messages.PipelinesInterpretedMessage}.
 */
@MetaInfServices(Command.class)
public class DataWarehouseCommand extends ServiceCommand {

  private final HdfsViewConfiguration config = new HdfsViewConfiguration();

  public DataWarehouseCommand() {
    super("pipelines-warehouse");
  }

  @Override
  protected Service getService() {
    return new DataWarehouseService(config);
  }

  @Override
  protected Object getConfigurationObject() {
    return config;
  }
}
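
The @MetaInfServices(Command.class) annotation registers the command under META-INF/services at compile time so the CLI launcher can discover it by name. A self-contained sketch of that ServiceLoader pattern, using a hypothetical Command interface rather than the real org.gbif.cli.Command:

import java.util.ServiceLoader;

public class CommandLookupSketch {

  // Hypothetical stand-in for org.gbif.cli.Command, to keep the sketch self-contained
  public interface Command {
    String getName();

    void run();
  }

  public static void main(String[] args) {
    String wanted = "pipelines-warehouse"; // the name passed to super(...) in DataWarehouseCommand

    // ServiceLoader reads the META-INF/services entries that @MetaInfServices generates
    for (Command command : ServiceLoader.load(Command.class)) {
      if (wanted.equals(command.getName())) {
        command.run();
        return;
      }
    }
    System.err.println("Unknown command: " + wanted);
  }
}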
Diff for: DataWarehouseFinishMessage.java (new file)

+22

@@ -0,0 +1,22 @@
package org.gbif.pipelines.tasks.occurrences.warehouse;

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import java.util.Set;
import java.util.UUID;
import org.gbif.common.messaging.api.messages.PipelinesHdfsViewMessage;

public class DataWarehouseFinishMessage extends PipelinesHdfsViewMessage {

  public DataWarehouseFinishMessage() {}

  @JsonCreator
  public DataWarehouseFinishMessage(
      @JsonProperty("datasetUuid") UUID datasetUuid,
      @JsonProperty("attempt") int attempt,
      @JsonProperty("pipelineSteps") Set<String> pipelineSteps,
      @JsonProperty("runner") String runner,
      @JsonProperty("executionId") Long executionId) {
    super(datasetUuid, attempt, pipelineSteps, runner, executionId);
  }
}
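
A self-contained sketch of the @JsonCreator/@JsonProperty round trip this message relies on, using a simplified stand-in class (the real constructor also takes pipelineSteps, runner and executionId) and a hypothetical JSON payload:

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.util.UUID;

public class FinishMessageSketch {

  // Simplified stand-in for the message; Jackson binds JSON fields via the annotated constructor
  static class Msg {
    final UUID datasetUuid;
    final int attempt;

    @JsonCreator
    Msg(@JsonProperty("datasetUuid") UUID datasetUuid, @JsonProperty("attempt") int attempt) {
      this.datasetUuid = datasetUuid;
      this.attempt = attempt;
    }
  }

  public static void main(String[] args) throws Exception {
    // Hypothetical payload, not taken from the commit
    String json = "{\"datasetUuid\":\"7a25f7aa-0000-0000-0000-000000000000\",\"attempt\":1}";
    Msg msg = new ObjectMapper().readValue(json, Msg.class);
    System.out.println(msg.datasetUuid + " attempt " + msg.attempt);
  }
}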
Diff for: DataWarehouseService.java (new file)

+69

@@ -0,0 +1,69 @@
package org.gbif.pipelines.tasks.occurrences.warehouse;

import com.google.common.util.concurrent.AbstractIdleService;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import lombok.extern.slf4j.Slf4j;
import org.gbif.common.messaging.DefaultMessagePublisher;
import org.gbif.common.messaging.MessageListener;
import org.gbif.common.messaging.api.MessagePublisher;
import org.gbif.pipelines.common.configs.StepConfiguration;
import org.gbif.pipelines.tasks.ServiceFactory;
import org.gbif.pipelines.tasks.common.hdfs.CommonHdfsViewCallback;
import org.gbif.pipelines.tasks.common.hdfs.HdfsViewConfiguration;
import org.gbif.registry.ws.client.DatasetClient;
import org.gbif.registry.ws.client.pipelines.PipelinesHistoryClient;

/**
 * A service which listens to the {@link
 * org.gbif.common.messaging.api.messages.PipelinesInterpretedMessage}.
 */
@Slf4j
public class DataWarehouseService extends AbstractIdleService {

  private final HdfsViewConfiguration config;
  private MessageListener listener;
  private MessagePublisher publisher;
  private ExecutorService executor;

  public DataWarehouseService(HdfsViewConfiguration config) {
    this.config = config;
  }

  @Override
  protected void startUp() throws Exception {
    log.info("Started pipelines-warehouse service");
    // Prefetch is one, since this is a long-running process.
    StepConfiguration c = config.stepConfig;
    listener = new MessageListener(c.messaging.getConnectionParameters(), 1);
    publisher = new DefaultMessagePublisher(c.messaging.getConnectionParameters());
    executor =
        config.standaloneNumberThreads == null
            ? null
            : Executors.newFixedThreadPool(config.standaloneNumberThreads);

    PipelinesHistoryClient historyClient =
        ServiceFactory.createPipelinesHistoryClient(config.stepConfig);

    DatasetClient datasetClient = ServiceFactory.createDatasetClient(config.stepConfig);

    DataWarehouseCallback callback =
        DataWarehouseCallback.builder()
            .config(config)
            .publisher(publisher)
            .historyClient(historyClient)
            .datasetClient(datasetClient)
            .commonHdfsViewCallback(CommonHdfsViewCallback.create(config, executor))
            .build();

    listener.listen(c.queueName, callback.getRouting(), c.poolSize, callback);
  }

  @Override
  protected void shutDown() {
    listener.close();
    publisher.close();
    // The executor is only created for the standalone runner, so guard against null here
    if (executor != null) {
      executor.shutdown();
    }
    log.info("Stopping pipelines-warehouse service");
  }
}
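
A short sketch of the Guava AbstractIdleService lifecycle the service above builds on; in production these calls are presumably driven by the CLI framework's ServiceCommand, so this is only an illustration with a demo subclass.

import com.google.common.util.concurrent.AbstractIdleService;

public class ServiceLifecycleSketch {

  static class DemoService extends AbstractIdleService {
    @Override
    protected void startUp() {
      System.out.println("startUp(): open the message listener, publisher and thread pool");
    }

    @Override
    protected void shutDown() {
      System.out.println("shutDown(): close them again");
    }
  }

  public static void main(String[] args) {
    DemoService service = new DemoService();
    service.startAsync().awaitRunning(); // blocks until startUp() has completed
    // ... the real service would now consume PipelinesHdfsViewMessage from the queue ...
    service.stopAsync().awaitTerminated(); // blocks until shutDown() has completed
  }
}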
