Skip to content

Commit

Permalink
PIG-5255: Improvements to bloom join (satishsaley via rohini)
Browse files Browse the repository at this point in the history
git-svn-id: https://svn.apache.org/repos/asf/pig/trunk@1843689 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information
r0hini committed Oct 12, 2018
1 parent aca7d0d commit 1b866ad
Show file tree
Hide file tree
Showing 17 changed files with 672 additions and 16 deletions.
2 changes: 2 additions & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ PIG-5282: Upgade to Java 8 (satishsaley via rohini)

IMPROVEMENTS

PIG-5255: Improvements to bloom join (satishsaley via rohini)

PIG-5359: Reduce time spent in split serialization (satishsaley via rohini)

PIG-5357: BagFactory interface should support creating a distinct bag from a set (jtolar via rohini)
Expand Down
6 changes: 6 additions & 0 deletions build.xml
Original file line number Diff line number Diff line change
Expand Up @@ -399,6 +399,7 @@
<include name="joda-time-${joda-time.version}.jar"/>
<include name="automaton-${automaton.version}.jar"/>
<include name="jansi-${jansi.version}.jar"/>
<include name="RoaringBitmap-shaded-${roaring-bitmap-shaded.version}.jar"/>
</patternset>
</fileset>

Expand Down Expand Up @@ -741,6 +742,7 @@
<fileset dir="${ivy.lib.dir}" includes="parser-core-${basjes-httpdlog-pigloader.version}.jar"/>
<fileset dir="${ivy.lib.dir}" includes="ivy-*.jar"/>
<fileset dir="${ivy.lib.dir}" includes="commons-logging-*.jar"/>
<fileset dir="${ivy.lib.dir}" includes="RoaringBitmap-shaded-${roaring-bitmap-shaded.version}.jar"/>
</copy>
</target>

Expand Down Expand Up @@ -1716,6 +1718,10 @@
<ivy:retrieve settingsRef="${ant.project.name}.ivy.settings" log="${loglevel}"
pattern="${ivy.lib.dir.spark}/[artifact]-[revision](-[classifier]).[ext]" conf="spark${sparkversion},hbase${hbaseversion}"/>
<ivy:cachepath pathid="compile.classpath" conf="compile"/>
<exec dir="${basedir}/shade/roaringbitmap" executable="mvn">
<arg line="clean package -Droaring.bitmap.version=${roaring-bitmap-shaded.version}"/>
</exec>
<copy file="${basedir}/shade/roaringbitmap/target/RoaringBitmap-shaded-${roaring-bitmap-shaded.version}.jar" todir="${ivy.lib.dir}"/>
</target>

<target name="ivy-test" depends="ivy-resolve" description="Retrieve Ivy-managed artifacts for test configuration">
Expand Down
3 changes: 3 additions & 0 deletions ivy.xml
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,9 @@
<dependency org="org.jruby" name="jruby-complete" rev="${jruby.version}" conf="compile->master"/>
<dependency org="asm" name="asm" rev="${asm.version}" conf="compile->default"/>

<!-- Dependencies for bloom join -->
<dependency org="org.roaringbitmap" name="RoaringBitmap" rev="${roaring-bitmap-shaded.version}" conf="compile->master"/>

<!-- HBase dependency in format for releases higher or equal to 0.95 -->
<dependency org="org.apache.hbase" name="hbase-client" rev="${hbase1.version}" conf="hbase1->master">
<artifact name="hbase-client" type="jar"/>
Expand Down
3 changes: 2 additions & 1 deletion ivy/libraries.properties
Original file line number Diff line number Diff line change
Expand Up @@ -96,4 +96,5 @@ htrace.version=3.1.0-incubating
htrace4.version=4.0.1-incubating
commons-lang3.version=3.6
scala-xml.version=1.0.5
glassfish.el.version=3.0.1-b08
glassfish.el.version=3.0.1-b08
roaring-bitmap-shaded.version=0.7.14
80 changes: 80 additions & 0 deletions shade/roaringbitmap/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">

<modelVersion>4.0.0</modelVersion>

<parent>
<groupId>org.apache</groupId>
<artifactId>apache</artifactId>
<version>14</version>
</parent>

<groupId>org.apache.pig</groupId>
<artifactId>RoaringBitmap-shaded</artifactId>
<packaging>jar</packaging>
<version>${roaring.bitmap.version}</version>

<name>Pig RoaringBitmap</name>
<url>http://pig.apache.org</url>
<prerequisites>
<maven>3.0</maven>
</prerequisites>

<modules>
</modules>

<properties>
<maven.shade.plugin.version>2.4.3</maven.shade.plugin.version>
<roaring.bitmap.version>0.7.14</roaring.bitmap.version>
</properties>

<dependencies>
<dependency>
<groupId>org.roaringbitmap</groupId>
<artifactId>RoaringBitmap</artifactId>
<version>${roaring.bitmap.version}</version>
</dependency>
</dependencies>

<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>${maven.shade.plugin.version}</version>
<executions>
<execution>
<id>shade-asm</id>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<relocations>
<relocation>
<pattern>org.roaringbitmap</pattern>
<shadedPattern>org.apache.pig.org.roaringbitmap</shadedPattern>
</relocation>
</relocations>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>

</project>
Original file line number Diff line number Diff line change
Expand Up @@ -22,17 +22,16 @@
import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.util.bloom.BloomFilter;
import org.apache.hadoop.util.bloom.Key;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.POStatus;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.Result;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.Packager;
import org.apache.pig.builtin.BuildBloomBase;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.bloom.BloomFilter;

public class BloomPackager extends Packager {

Expand Down Expand Up @@ -103,10 +102,10 @@ private Result combineBloomFilters() throws IOException {
Iterator<Tuple> iter = bags[0].iterator();
Tuple tup = iter.next();
DataByteArray bloomBytes = (DataByteArray) tup.get(0);
BloomFilter bloomFilter = BuildBloomBase.bloomIn(bloomBytes);
BloomFilter bloomFilter = BloomFilter.bloomIn(bloomBytes);
while (iter.hasNext()) {
tup = iter.next();
bloomFilter.or(BuildBloomBase.bloomIn((DataByteArray) tup.get(0)));
bloomFilter.or(BloomFilter.bloomIn((DataByteArray) tup.get(0)));
}

Object partition = key;
Expand Down Expand Up @@ -160,4 +159,4 @@ private Result retrieveBloomFilter() throws IOException {
public boolean isBloomCreatedInMap() {
return bloomCreatedInMap;
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,20 +24,19 @@
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.bloom.BloomFilter;
import org.apache.hadoop.util.bloom.Key;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.HDataType;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.POStatus;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.Result;
import org.apache.pig.backend.hadoop.executionengine.tez.runtime.ObjectCache;
import org.apache.pig.backend.hadoop.executionengine.tez.runtime.TezInput;
import org.apache.pig.builtin.BuildBloomBase;
import org.apache.pig.classification.InterfaceAudience;
import org.apache.pig.classification.InterfaceStability;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.bloom.BloomFilter;
import org.apache.pig.impl.io.NullableTuple;
import org.apache.pig.impl.io.PigNullableWritable;
import org.apache.tez.runtime.api.LogicalInput;
Expand Down Expand Up @@ -106,7 +105,7 @@ public void attachInputs(Map<String, LogicalInput> inputs,
}
Tuple val = (Tuple) reader.getCurrentValue();
int index = (int) val.get(0);
bloomFilters[index] = BuildBloomBase.bloomIn((DataByteArray) val.get(1));
bloomFilters[index] = BloomFilter.bloomIn((DataByteArray) val.get(1));
}
ObjectCache.getInstance().cache(cacheKey, bloomFilters);
} catch (Exception e) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.bloom.BloomFilter;
import org.apache.hadoop.util.bloom.Key;
import org.apache.pig.PigConfiguration;
import org.apache.pig.backend.executionengine.ExecException;
Expand All @@ -37,6 +36,7 @@
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.bloom.BloomFilter;
import org.apache.pig.impl.io.NullableBytesWritable;
import org.apache.pig.impl.io.NullableIntWritable;
import org.apache.pig.impl.io.NullableTuple;
Expand Down
Loading

0 comments on commit 1b866ad

Please sign in to comment.