// blob: c5707fd06340b1c00179050e4131b77dc40362da [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package datafu.pig.hash;
import java.util.Random;
import datafu.pig.hash.Hasher;
import com.google.common.hash.HashFunction;
import org.apache.pig.impl.logicalLayer.schema.Schema;
/**
 * Computes a hash value of a string using a randomly generated seed and
 * outputs it in hex.
 *
 * <p>This class should only be used for hashing algorithms that accept a seed
 * (murmur3-32, murmur3-128 and sip24).
 *
 * <p>It allows you to generate a well-mixed sequence of values, unpredictable
 * for every run, without relying on the random number generator for each
 * record. The seed is generated by the front end (i.e. when you launch your
 * script) and so is identical for every task.
 *
 * @see datafu.pig.hash.Hasher
 */
public class HasherRand extends Hasher
{
  /**
   * Retained only for source compatibility with existing subclasses; this class
   * never assigns it, so it is always {@code null} unless a subclass sets it.
   * NOTE(review): it hides any same-named field inherited from {@link Hasher}
   * -- confirm whether it can be retired.
   */
  protected HashFunction hash_func;

  /** Name of the seeded hashing algorithm requested at construction time. */
  protected final String algorithm;

  /**
   * Set once the seeded hash function has been built through
   * {@code makeHashFunc}, so that it is built only once per instance.
   * The {@code hash_func} field declared above cannot serve as that marker
   * because nothing ever assigns it.
   */
  private boolean seededHashReady = false;

  /**
   * Creates a hasher that uses murmur3-32, a non-cryptographic-strength
   * hash function with good mixing.
   *
   * @throws IllegalArgumentException declared for compatibility with existing callers
   * @throws RuntimeException declared for compatibility with existing callers
   * @see #HasherRand(String alg)
   */
  public HasherRand() throws IllegalArgumentException, RuntimeException
  {
    this("murmur3-32");
  }

  /**
   * Creates a hasher for the named algorithm. The name is only recorded here;
   * it is validated later, in {@link #onReady(Schema, Schema)}.
   *
   * @param alg name of the hashing algorithm; must be one that accepts a seed
   *            ("murmur3-32", "murmur3-128" or "sip24")
   * @throws IllegalArgumentException declared for compatibility with existing callers
   * @throws RuntimeException declared for compatibility with existing callers
   * @see #HasherRand()
   */
  public HasherRand(String alg) throws IllegalArgumentException, RuntimeException
  {
    algorithm = alg;
  }

  /**
   * Hashes a single string. On the first invocation the seed stored under the
   * "rand_seed" instance property is read and the seeded hash function is
   * built; later invocations skip that step.
   *
   * @param val the single string to hash
   * @return val, hashed according to the algorithm specified at instantiation
   */
  @Override
  public String call(String val)
  {
    if (!seededHashReady) {
      String rand_seed = (String) getInstanceProperties().get("rand_seed");
      super.makeHashFunc(algorithm, rand_seed);
      // Only mark as ready after makeHashFunc returned normally, so a failed
      // attempt is retried (and fails loudly again) on the next record.
      seededHashReady = true;
    }
    return super.call(val);
  }

  /**
   * Generate a seed exactly once on the front end, so all workers get same value.
   * The seed is stored as a hex string under the "rand_seed" instance property:
   * 8 hex digits for the murmur3 variants, 32 hex digits (two 64-bit keys) for sip24.
   *
   * @param in_schema Input schema
   * @param out_schema Output schema
   * @throws IllegalArgumentException if the algorithm is not one of the seeded algorithms
   */
  @Override
  protected void onReady(Schema in_schema, Schema out_schema) {
    Random rg = getRandomGenerator();
    String rand_seed;
    // Constant-first comparisons: a null algorithm falls through to the
    // descriptive IllegalArgumentException instead of a NullPointerException.
    if ("murmur3-32".equals(algorithm) || "murmur3-128".equals(algorithm)) {
      // Both murmur3 variants are seeded with one 32-bit value.
      rand_seed = String.format("%08x", rg.nextInt());
    }
    else if ("sip24".equals(algorithm)) {
      // sip24 is seeded with two 64-bit values, drawn in this order.
      long rand_k0 = rg.nextLong();
      long rand_k1 = rg.nextLong();
      rand_seed = String.format("%016x%016x", rand_k0, rand_k1);
    }
    else {
      throw new IllegalArgumentException("No hash function found for algorithm "+algorithm+" with a seed. Allowed values include "+SEEDED_HASH_NAMES);
    }
    getInstanceProperties().put("rand_seed", rand_seed);
    super.onReady(in_schema, out_schema);
  }

  /**
   * Supplies the random generator used to draw the seed.
   * Exists so tests can inject constant seed.
   *
   * @return a new {@link Random}
   */
  protected Random getRandomGenerator() {
    return new Random();
  }
}