forked from hazelcast/hazelcast-jet-code-samples
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathHadoopAvro.java
114 lines (96 loc) · 4.18 KB
/
HadoopAvro.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
/*
* Copyright (c) 2008-2019, Hazelcast, Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package avro;
import com.hazelcast.jet.Jet;
import com.hazelcast.jet.JetInstance;
import com.hazelcast.jet.hadoop.HdfsSinks;
import com.hazelcast.jet.hadoop.HdfsSources;
import com.hazelcast.jet.impl.util.Util;
import com.hazelcast.jet.pipeline.Pipeline;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.mapred.AvroInputFormat;
import org.apache.avro.mapred.AvroJob;
import org.apache.avro.mapred.AvroOutputFormat;
import org.apache.avro.mapred.AvroWrapper;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.JobConf;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.stream.IntStream;
/**
 * A sample which reads records from HDFS using the Apache Avro input format,
 * filters them, and writes the surviving records back to HDFS using the
 * Apache Avro output format.
 * <p>
 * The job generates its own input: 100 {@code User} records are written to a
 * local Avro file under {@code hdfs-avro-input} before the pipeline runs.
 */
public class HadoopAvro {

    private static final String MODULE_DIRECTORY = moduleDirectory();
    private static final String INPUT_PATH = MODULE_DIRECTORY + "/hdfs-avro-input";
    private static final String OUTPUT_PATH = MODULE_DIRECTORY + "/hdfs-avro-output";

    /**
     * Builds the Jet pipeline: read {@code (AvroWrapper<User>, NullWritable)}
     * pairs from HDFS, keep only "active" users, and write them back.
     *
     * @param jobConfig Hadoop job configuration carrying input/output paths,
     *                  formats and the Avro schemas (see {@link #createJobConfig()})
     * @return the assembled pipeline, ready to submit to a Jet instance
     */
    private static Pipeline buildPipeline(JobConf jobConfig) {
        Pipeline p = Pipeline.create();
        p.drawFrom(HdfsSources.<AvroWrapper<User>, NullWritable>hdfs(jobConfig))
         // Field index 3 of the generic User record is the boolean written as
         // (i % 2 == 0) in createAvroFile(), i.e. keep the even-numbered users.
         .filter(entry -> entry.getKey().datum().get(3).equals(Boolean.TRUE))
         // peek() logs each passing item using the supplied to-string function
         .peek(entry -> entry.getKey().datum().toString())
         .drainTo(HdfsSinks.hdfs(jobConfig));
        return p;
    }

    public static void main(String[] args) throws Exception {
        System.setProperty("hazelcast.logging.type", "log4j");
        new HadoopAvro().go();
    }

    /**
     * Creates the input Avro file, starts an embedded Jet instance, runs the
     * pipeline to completion, and always shuts all Jet instances down.
     */
    private void go() throws Exception {
        try {
            createAvroFile();
            JetInstance jet = Jet.newJetInstance();
            JobConf jobConfig = createJobConfig();
            jet.newJob(buildPipeline(jobConfig)).join();
        } finally {
            Jet.shutdownAll();
        }
    }

    /**
     * Prepares the Hadoop job configuration: Avro input/output formats, the
     * input/output paths, and the {@code User} schema for both directions.
     * Any previous output directory is deleted first so the job can rerun.
     *
     * @throws IOException if the old output directory cannot be deleted
     */
    private JobConf createJobConfig() throws IOException {
        Path inputPath = new Path(INPUT_PATH);
        Path outputPath = new Path(OUTPUT_PATH);
        FileSystem.get(new Configuration()).delete(outputPath, true);

        JobConf jobConfig = new JobConf();
        jobConfig.setInputFormat(AvroInputFormat.class);
        jobConfig.setOutputFormat(AvroOutputFormat.class);
        AvroOutputFormat.setOutputPath(jobConfig, outputPath);
        AvroInputFormat.addInputPath(jobConfig, inputPath);
        jobConfig.set(AvroJob.OUTPUT_SCHEMA, User.SCHEMA.toString());
        jobConfig.set(AvroJob.INPUT_SCHEMA, User.SCHEMA.toString());
        return jobConfig;
    }

    /**
     * Writes 100 generated {@code User} records ("name0".."name99", with every
     * even-indexed user flagged true) into {@code file.avro} under the input
     * directory, replacing any previous input.
     *
     * @throws IOException if the filesystem or the Avro writer fails
     */
    private void createAvroFile() throws IOException {
        Path inputPath = new Path(INPUT_PATH);
        FileSystem fs = FileSystem.get(new Configuration());
        fs.delete(inputPath, true);

        // try-with-resources guarantees the writer (and the HDFS output stream
        // it wraps) is closed even when create() or append() throws; the
        // original code leaked the writer on any failure before close().
        try (DataFileWriter<User> fileWriter =
                new DataFileWriter<>(new GenericDatumWriter<User>(User.SCHEMA))) {
            fileWriter.create(User.SCHEMA, fs.create(new Path(inputPath, "file.avro")));
            IntStream.range(0, 100)
                     .mapToObj(i -> new User("name" + i, "pass" + i, i, i % 2 == 0))
                     .forEach(user -> Util.uncheckRun(() -> fileWriter.append(user)));
        }
        fs.close();
    }

    /**
     * Resolves the module directory (two levels above the classpath root) so
     * the sample's input/output directories live next to the build output.
     */
    private static String moduleDirectory() {
        String resourcePath = HadoopAvro.class.getClassLoader().getResource("").getPath();
        return Paths.get(resourcePath).getParent().getParent().toString();
    }
}