blob: 290bd0a32fb5069f4d6a430fc1bb21149d883a96 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pig.builtin;
import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.bloom.BloomFilter;
import org.apache.hadoop.util.bloom.Key;
import org.apache.pig.FilterFunc;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
/**
* Use a Bloom filter build previously by BuildBloom. You would first
* build a bloom filter in a group all job. For example:
* in a group all job. For example:
* define bb BuildBloom('jenkins', '100', '0.1');
* A = load 'foo' as (x, y);
* B = group A all;
* C = foreach B generate bb(A.x);
* store C into 'mybloom';
* The bloom filter can be on multiple keys by passing more than one field
* (or the entire bag) to BuildBloom.
* The resulting file can then be used in a Bloom filter as:
* define bloom Bloom(mybloom);
* A = load 'foo' as (x, y);
* B = load 'bar' as (z);
* C = filter B by bloom(z);
* D = join C by z, A by x;
* It uses {@link org.apache.hadoop.util.bloom.BloomFilter}.
*/
public class Bloom extends FilterFunc {
private String bloomFile;
public BloomFilter filter = null;
/**
* @param filename file containing the serialized Bloom filter
*/
public Bloom(String filename) {
bloomFile = filename;
}
@Override
public Boolean exec(Tuple input) throws IOException {
if (filter == null) {
init();
}
byte[] b;
if (input.size() == 1) b = DataType.toBytes(input.get(0));
else b = DataType.toBytes(input, DataType.TUPLE);
Key k = new Key(b);
return filter.membershipTest(k);
}
@Override
public List<String> getCacheFiles() {
List<String> list = new ArrayList<String>(1);
// We were passed the name of the file on HDFS. Append a
// name for the file on the task node.
try {
list.add(bloomFile + "#" + getFilenameFromPath(bloomFile));
} catch (IOException e) {
throw new RuntimeException(e);
}
return list;
}
private void init() throws IOException {
filter = new BloomFilter();
String dir = "./" + getFilenameFromPath(bloomFile);
String[] partFiles = new File(dir)
.list(new FilenameFilter() {
@Override
public boolean accept(File current, String name) {
return name.startsWith("part");
}
});
String dcFile = dir + "/" + partFiles[0];
DataInputStream dis = new DataInputStream(new FileInputStream(dcFile));
try {
filter.readFields(dis);
} finally {
dis.close();
}
}
/**
* For testing only, do not use directly.
*/
public void setFilter(DataByteArray dba) throws IOException {
DataInputStream dis = new DataInputStream(new
ByteArrayInputStream(dba.get()));
filter = new BloomFilter();
filter.readFields(dis);
}
private String getFilenameFromPath(String p) throws IOException {
Path path = new Path(p);
return path.toUri().getPath().replace("/", "_");
}
}