| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| package datafu.pig.hash; |
| |
| import java.util.Random; |
| |
| import datafu.pig.hash.Hasher; |
| import com.google.common.hash.HashFunction; |
| |
| import org.apache.pig.impl.logicalLayer.schema.Schema; |
| |
| /** |
| * |
| * Computes a hash value of a string using a randomly generated seed and |
| * outputs it in hex. |
| * |
| * This class should only be used for hashing algorithms that accept a seed |
| * (murmur3-32, murmur3-128 and sip24). |
| * |
| * It allows you to generate a well-mixed sequence of values, unpredictable |
| * for every run, without relying on the random number generator for each |
| * record. The seed is generated by the front end (i.e. when you launch your |
| * script) and so is identical for every task. |
| * |
| * @see datafu.pig.hash.Hasher |
| */ |
| public class HasherRand extends Hasher |
| { |
| protected HashFunction hash_func; |
| protected final String algorithm; |
| |
| /** |
| * Generates hash values according to murmur3-32, a non-cryptographic-strength |
| * hash function with good mixing. |
| * |
| * @throws IllegalArgumentException, RuntimeException |
| * @see #HasherRand(String alg) |
| */ |
| public HasherRand() throws IllegalArgumentException, RuntimeException |
| { |
| this("murmur3-32"); |
| } |
| |
| /** |
| * @param alg |
| * @throws IllegalArgumentException, RuntimeException |
| * @see #HasherRand() |
| */ |
| public HasherRand(String alg) throws IllegalArgumentException, RuntimeException |
| { |
| algorithm = alg; |
| } |
| |
| /** |
| * @param val the single string to hash |
| * @return val, hashed according to the algorithm specified at instantiation |
| */ |
| @Override |
| public String call(String val) |
| { |
| if (hash_func == null) { |
| // memoize the hash func |
| String rand_seed = (String)getInstanceProperties().get("rand_seed"); |
| super.makeHashFunc(algorithm, rand_seed); |
| } |
| return super.call(val); |
| } |
| |
| /** |
| * Generate a seed exactly once on the front end, so all workers get same value |
| |
| * @param in_schema Input schema |
| * @param out_schema Output schema |
| */ |
| @Override |
| protected void onReady(Schema in_schema, Schema out_schema) { |
| String rand_seed; |
| Random rg = getRandomGenerator(); |
| |
| if (algorithm.equals("murmur3-32")) { |
| int rand_int = rg.nextInt(); |
| rand_seed = String.format("%08x", rand_int); |
| } |
| else if (algorithm.equals("murmur3-128")){ |
| int rand_int = rg.nextInt(); |
| rand_seed = String.format("%08x", rand_int); |
| } |
| else if (algorithm.equals("sip24")) { |
| long rand_k0 = rg.nextLong(); |
| long rand_k1 = rg.nextLong(); |
| rand_seed = String.format("%016x%016x", rand_k0, rand_k1); |
| } |
| else { throw new IllegalArgumentException("No hash function found for algorithm "+algorithm+" with a seed. Allowed values include "+SEEDED_HASH_NAMES); } |
| |
| getInstanceProperties().put("rand_seed", rand_seed); |
| |
| super.onReady(in_schema, out_schema); |
| } |
| |
| // exists so tests can inject constant seed. |
| protected Random getRandomGenerator() { |
| return new Random(); |
| } |
| } |