Skip to content

Commit

Permalink
Added example config
Browse files Browse the repository at this point in the history
  • Loading branch information
nielsbasjes committed Mar 28, 2012
1 parent 87c7a9d commit 5258302
Show file tree
Hide file tree
Showing 6 changed files with 191 additions and 47 deletions.
10 changes: 10 additions & 0 deletions .checkstyle
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>

<!-- Checkstyle file-set configuration for this project. -->
<fileset-config file-format-version="1.2.0"
                simple-config="true"
                sync-formatter="false">
  <!-- Project-local rule set, loaded from dev-support/checkstyle.xml. -->
  <local-check-config name="Hadoop common checkstyle"
                      location="dev-support/checkstyle.xml"
                      type="project"
                      description="This set of rules was copied from the Hadoop common project.">
    <additional-data name="protect-config-file" value="true"/>
  </local-check-config>
  <!-- The only file set: enabled, and checked with the rule set declared above. -->
  <fileset name="all"
           enabled="true"
           check-config-name="Hadoop common checkstyle"
           local="true">
    <file-match-pattern match-pattern="." include-pattern="true"/>
  </fileset>
</fileset-config>
8 changes: 0 additions & 8 deletions .classpath

This file was deleted.

1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,4 @@
.settings
.svn
target
# One pattern per line: a space does not separate patterns in .gitignore.
.classpath
.project
193 changes: 157 additions & 36 deletions pom.xml
Original file line number Diff line number Diff line change
@@ -1,38 +1,159 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<groupId>nl.basjes.hadoop</groupId>
<artifactId>hadoop-splittablegzip</artifactId>
<version>1.0-0.23.0-cdh4b1</version>
<packaging>jar</packaging>

<name>Splittable gzip</name>
<url>http://niels.basjes.nl/gzip</url>

<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>

<dependencies>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.10</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>0.23.1</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>2.8.1</version>
</plugin>
</plugins>
</build>
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<!-- Maven coordinates of the jar built by this POM. -->
<groupId>nl.basjes.hadoop</groupId>
<artifactId>hadoop-splittablegzip</artifactId>
<!-- Do NOT use a '-' in the version number !! -->
<!-- NOTE(review): presumably because this version is also used for the RPM
     produced by the rpm-maven-plugin configured in the build section; confirm. -->
<version>1.0</version>
<packaging>jar</packaging>

<!-- Human-readable project name and project home page. -->
<name>Splittable Gzip codec</name>
<url>http://niels.basjes.nl/splittable-gzip</url>

<properties>
  <!-- Java language level this project is written for and compiled to. -->
  <compileSource>1.6</compileSource>
  <targetJdk>1.6</targetJdk>
  <!-- maven-compiler-plugin does not read compileSource/targetJdk; it reads
       the two user properties below. Forward the values so javac is really
       invoked with the levels declared above. -->
  <maven.compiler.source>${compileSource}</maven.compiler.source>
  <maven.compiler.target>${targetJdk}</maven.compiler.target>
  <!-- Encoding used when reading the source files. -->
  <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>

<dependencies>
  <!-- Unit-test framework: test scope keeps it off the compile/runtime
       classpath and out of the dependencies seen by users of this jar. -->
  <dependency>
    <groupId>junit</groupId>
    <artifactId>junit</artifactId>
    <version>4.10</version>
    <scope>test</scope>
  </dependency>

  <!-- Hadoop compression API that the codec is built against. -->
  <dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-common</artifactId>
    <version>0.23.1</version>
  </dependency>
</dependencies>

<build>
<plugins>
<plugin>
  <!-- groupId spelled out to match the other plugin declarations in this
       POM (it is also the value Maven assumes when it is omitted). -->
  <groupId>org.apache.maven.plugins</groupId>
  <artifactId>maven-javadoc-plugin</artifactId>
  <version>2.8.1</version>
  <executions>
    <!-- Runs the javadoc "jar" goal during the package phase. -->
    <execution>
      <id>javadoc-jar</id>
      <phase>package</phase>
      <goals>
        <goal>jar</goal>
      </goals>
    </execution>
  </executions>
</plugin>

<plugin>
  <groupId>org.apache.maven.plugins</groupId>
  <artifactId>maven-pmd-plugin</artifactId>
  <version>2.7.1</version>
  <!-- Plugin-wide settings shared by all goals. -->
  <configuration>
    <linkXref>true</linkXref>
    <!-- <rulesets> <ruleset> /pmd-ruleset.xml </ruleset> </rulesets> -->
  </configuration>
  <!-- executions must be a direct child of plugin. When it is nested inside
       configuration Maven treats it as an ordinary (unknown) parameter and
       the cpd-check goal is never bound to the package phase. -->
  <executions>
    <execution>
      <phase>package</phase>
      <goals>
        <!-- Fail the build if copy/paste detector fails -->
        <goal>cpd-check</goal>
      </goals>
      <configuration>
        <verbose>true</verbose>
      </configuration>
    </execution>
  </executions>
</plugin>

<plugin>
  <groupId>org.codehaus.mojo</groupId>
  <artifactId>findbugs-maven-plugin</artifactId>
  <version>2.0.1</version>
  <configuration>
    <targetJdk>1.6</targetJdk>
  </configuration>
  <executions>
    <execution>
      <phase>package</phase>
      <!-- An execution without goals binds nothing to the phase, so FindBugs
           never ran. NOTE(review): "check" (fail the build on findings) is
           chosen to mirror the cpd-check setup of the PMD plugin; use
           "findbugs" instead if only the report is wanted. -->
      <goals>
        <goal>check</goal>
      </goals>
    </execution>
  </executions>
</plugin>

<plugin>
  <groupId>org.codehaus.mojo</groupId>
  <artifactId>rpm-maven-plugin</artifactId>
  <version>2.0.1</version>
  <configuration>
    <!-- Package metadata. -->
    <copyright>Apache License v2.0</copyright>
    <packager>Niels Basjes</packager>
    <group>Development/Libraries</group>
    <needarch>noarch</needarch>
    <!-- The description text is kept flush-left on purpose: its whitespace is
         part of the element content. -->
    <description>
This codec offers a trade off between "spent resources" and "scalability"
when reading Gzipped input files by simply always starting at the beginning
of the file. So in general this "splittable" Gzip codec will WASTE CPU time and
FileSystem IO (HDFS) and probably other system resources (Network)
too to reduce the "wall clock" time in some real-life situations.</description>
    <release>1</release>
    <mappings>
      <!-- The codec jar goes into /usr/lib/hadoop. The build directory is
           referenced through its property instead of a hard-coded "target"
           so the mapping keeps working if the directory is relocated. -->
      <mapping>
        <directory>/usr/lib/hadoop</directory>
        <username>root</username>
        <groupname>root</groupname>
        <sources>
          <source>
            <location>${project.build.directory}/${project.build.finalName}.jar</location>
          </source>
        </sources>
      </mapping>

      <!-- Contents of the apidocs directory in the build directory. -->
      <mapping>
        <directory>/usr/share/doc/${project.build.finalName}/apidocs</directory>
        <username>root</username>
        <groupname>root</groupname>
        <sources>
          <source>
            <location>${project.build.directory}/apidocs</location>
          </source>
        </sources>
      </mapping>

      <!-- Example configuration shipped alongside the documentation. -->
      <mapping>
        <directory>/usr/share/doc/${project.build.finalName}/</directory>
        <username>root</username>
        <groupname>root</groupname>
        <sources>
          <source>
            <location>src/main/resources/mapred-site.xml.example</location>
          </source>
        </sources>
      </mapping>

    </mappings>

  </configuration>
  <executions>
    <!-- Runs the attached-rpm goal; no explicit phase is declared here. -->
    <execution>
      <goals>
        <goal>attached-rpm</goal>
      </goals>
    </execution>
  </executions>
</plugin>

</plugins>
</build>

<reporting>
  <plugins>
    <!-- Javadoc report; same plugin version as the one declared in the build section. -->
    <plugin>
      <groupId>org.apache.maven.plugins</groupId>
      <artifactId>maven-javadoc-plugin</artifactId>
      <version>2.8.1</version>
    </plugin>
  </plugins>
</reporting>
</project>
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@
* This can be done by changing the <b>io.compression.codecs</b> property to
* something like this:<br>
* <i>org.apache.hadoop.io.compress.DefaultCodec,
* org.apache.hadoop.io.compress.SkipSeekSplittableGzipCodec,
* nl.basjes.hadoop.io.compress.SplittableGzipCodec,
* org.apache.hadoop.io.compress.BZip2Codec</i><br>
* </li>
* <li>Set the split size to something that works in your situation. This can be
Expand Down Expand Up @@ -281,7 +281,7 @@ public SplittableGzipInputStream(final CompressionInputStream in,
this.in = (ThrottleableDecompressorStream) in;
} else {
this.in = null; // Permanently cripple this instance ('in' is final) .
throw new IOException("The SkipSeekSplittableGzipCodec relies on"
throw new IOException("The SplittableGzipCodec relies on"
+ " functionality in the ThrottleableDecompressorStream class.");
}

Expand Down Expand Up @@ -317,7 +317,7 @@ public SplittableGzipInputStream(final CompressionInputStream in,
// An EOF while seeking for the START of the split !?!?
throw new EOFException("Unexpected end of input stream when"
+ " seeking for the start of the split in"
+ " SkipSeekSplittableGzipCodec:"
+ " SplittableGzipCodec:"
+ " start=" + start + " adjustedStart=" + start + " position="
+ getPos());
}
Expand Down
20 changes: 20 additions & 0 deletions src/main/resources/mapred-site.xml.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<!-- Example settings for enabling nl.basjes.hadoop.io.compress.SplittableGzipCodec. -->
<configuration>

<!-- Codec list in which nl.basjes.hadoop.io.compress.SplittableGzipCodec takes
     the place of the regular GzipCodec: the regular GzipCodec must not be
     listed next to it. -->
<property>
<name>io.compression.codecs</name>
<value>org.apache.hadoop.io.compress.DefaultCodec,nl.basjes.hadoop.io.compress.SplittableGzipCodec,org.apache.hadoop.io.compress.BZip2Codec,org.apache.hadoop.io.compress.DeflateCodec,org.apache.hadoop.io.compress.SnappyCodec,org.apache.hadoop.io.compress.Lz4Codec</value>
<description>A list of the compression codec classes that can be used
for compression/decompression.</description>
</property>

<!-- Maximum size of an input split. The value below is only an example; set
     it to something that works in your situation.
     NOTE(review): the unit is presumably bytes; confirm against the Hadoop
     documentation for this property. -->
<property>
<name>mapreduce.input.fileinputformat.split.maxsize</name>
<value>100000</value>
</property>

</configuration>

0 comments on commit 5258302

Please sign in to comment.