forked from hazelcast/hazelcast-jet-code-samples
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathHadoopAvro.java
114 lines (96 loc) · 4.18 KB
/
HadoopAvro.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
/*
* Copyright (c) 2008-2019, Hazelcast, Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package avro;
import com.hazelcast.jet.Jet;
import com.hazelcast.jet.JetInstance;
import com.hazelcast.jet.hadoop.HdfsSinks;
import com.hazelcast.jet.hadoop.HdfsSources;
import com.hazelcast.jet.impl.util.Util;
import com.hazelcast.jet.pipeline.Pipeline;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.mapred.AvroInputFormat;
import org.apache.avro.mapred.AvroJob;
import org.apache.avro.mapred.AvroOutputFormat;
import org.apache.avro.mapred.AvroWrapper;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.JobConf;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.stream.IntStream;
/**
 * A sample which reads records from HDFS using the Apache Avro input format,
 * filters them, and writes the surviving records back to HDFS using the
 * Apache Avro output format.
 * <p>
 * The job generates its own input: 100 {@code User} records are written to a
 * local Avro file under {@code hdfs-avro-input} before the pipeline runs.
 */
public class HadoopAvro {

    private static final String MODULE_DIRECTORY = moduleDirectory();
    private static final String INPUT_PATH = MODULE_DIRECTORY + "/hdfs-avro-input";
    private static final String OUTPUT_PATH = MODULE_DIRECTORY + "/hdfs-avro-output";

    /**
     * Builds the Jet pipeline: read {@code (AvroWrapper<User>, NullWritable)}
     * pairs from HDFS, keep only "active" users, and write them back.
     *
     * @param jobConfig Hadoop job configuration carrying input/output paths,
     *                  formats and the Avro schemas (see {@link #createJobConfig()})
     * @return the assembled pipeline, ready to submit to a Jet instance
     */
    private static Pipeline buildPipeline(JobConf jobConfig) {
        Pipeline p = Pipeline.create();
        p.drawFrom(HdfsSources.<AvroWrapper<User>, NullWritable>hdfs(jobConfig))
         // Field index 3 of the generic User record is the boolean written as
         // (i % 2 == 0) in createAvroFile(), i.e. keep the even-numbered users.
         .filter(entry -> entry.getKey().datum().get(3).equals(Boolean.TRUE))
         // peek() logs each passing item using the supplied to-string function
         .peek(entry -> entry.getKey().datum().toString())
         .drainTo(HdfsSinks.hdfs(jobConfig));
        return p;
    }

    public static void main(String[] args) throws Exception {
        System.setProperty("hazelcast.logging.type", "log4j");
        new HadoopAvro().go();
    }

    /**
     * Creates the input Avro file, starts an embedded Jet instance, runs the
     * pipeline to completion, and always shuts all Jet instances down.
     */
    private void go() throws Exception {
        try {
            createAvroFile();
            JetInstance jet = Jet.newJetInstance();
            JobConf jobConfig = createJobConfig();
            jet.newJob(buildPipeline(jobConfig)).join();
        } finally {
            Jet.shutdownAll();
        }
    }

    /**
     * Prepares the Hadoop job configuration: Avro input/output formats, the
     * input/output paths, and the {@code User} schema for both directions.
     * Any previous output directory is deleted first so the job can rerun.
     *
     * @throws IOException if the old output directory cannot be deleted
     */
    private JobConf createJobConfig() throws IOException {
        Path inputPath = new Path(INPUT_PATH);
        Path outputPath = new Path(OUTPUT_PATH);
        FileSystem.get(new Configuration()).delete(outputPath, true);

        JobConf jobConfig = new JobConf();
        jobConfig.setInputFormat(AvroInputFormat.class);
        jobConfig.setOutputFormat(AvroOutputFormat.class);
        AvroOutputFormat.setOutputPath(jobConfig, outputPath);
        AvroInputFormat.addInputPath(jobConfig, inputPath);
        jobConfig.set(AvroJob.OUTPUT_SCHEMA, User.SCHEMA.toString());
        jobConfig.set(AvroJob.INPUT_SCHEMA, User.SCHEMA.toString());
        return jobConfig;
    }

    /**
     * Writes 100 generated {@code User} records ("name0".."name99", with every
     * even-indexed user flagged true) into {@code file.avro} under the input
     * directory, replacing any previous input.
     *
     * @throws IOException if the filesystem or the Avro writer fails
     */
    private void createAvroFile() throws IOException {
        Path inputPath = new Path(INPUT_PATH);
        FileSystem fs = FileSystem.get(new Configuration());
        fs.delete(inputPath, true);

        // try-with-resources guarantees the writer (and the HDFS output stream
        // it wraps) is closed even when create() or append() throws; the
        // original code leaked the writer on any failure before close().
        try (DataFileWriter<User> fileWriter =
                new DataFileWriter<>(new GenericDatumWriter<User>(User.SCHEMA))) {
            fileWriter.create(User.SCHEMA, fs.create(new Path(inputPath, "file.avro")));
            IntStream.range(0, 100)
                     .mapToObj(i -> new User("name" + i, "pass" + i, i, i % 2 == 0))
                     .forEach(user -> Util.uncheckRun(() -> fileWriter.append(user)));
        }
        fs.close();
    }

    /**
     * Resolves the module directory (two levels above the classpath root) so
     * the sample's input/output directories live next to the build output.
     */
    private static String moduleDirectory() {
        String resourcePath = HadoopAvro.class.getClassLoader().getResource("").getPath();
        return Paths.get(resourcePath).getParent().getParent().toString();
    }
}