blob: 702544ecbb5fa838ab9eb4430bd69a81b35e20c4 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.accumulo.core.client.sample;
import static com.google.common.base.Preconditions.checkArgument;
import static java.util.Objects.requireNonNull;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Set;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.sample.impl.DataoutputHasher;
import com.google.common.collect.ImmutableSet;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hasher;
import com.google.common.hash.Hashing;
import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
/**
* A base class that can be used to create Samplers based on hashing. This class offers consistent
* options for configuring the hash function. The subclass decides which parts of the key to hash.
*
* <p>
* This class supports two options passed into {@link #init(SamplerConfiguration)}. One option is
* {@code hasher} which specifies a hashing algorithm. Valid values for this option are {@code md5},
* {@code sha1}, and {@code murmur3_32}. If you are not sure, then choose {@code murmur3_32}.
*
* <p>
* The second option is {@code modulus} which can have any positive integer as a value.
*
* <p>
* Any data where {@code hash(data) % modulus == 0} will be selected for the sample.
*
* @since 1.8.0
*/
public abstract class AbstractHashSampler implements Sampler {

  // Option names understood by this base class; subclasses widen the set via isValidOption.
  private static final Set<String> VALID_OPTIONS = ImmutableSet.of("hasher", "modulus");

  // Both fields are assigned by init(SamplerConfiguration) and read by accept(Key).
  private HashFunction hashFunction;
  private int modulus;

  /**
   * Subclasses with options should override this method and return true if the option is valid for
   * the subclass or if {@code super.isValidOption(opt)} returns true.
   *
   * @param option
   *          the name of a configuration option
   * @return true if {@code option} is one of the options handled by this class
   */
  protected boolean isValidOption(String option) {
    return VALID_OPTIONS.contains(option);
  }

  /**
   * Configures the hash function and modulus from the given configuration. Subclasses with options
   * should override this method and call {@code super.init(config)}.
   *
   * <p>
   * All options are validated before any state is changed, so a failed call leaves a previously
   * configured sampler untouched.
   *
   * @param config
   *          configuration whose options must contain {@code hasher} and {@code modulus}
   * @throws NullPointerException
   *           if the {@code hasher} or {@code modulus} option is missing
   * @throws IllegalArgumentException
   *           if an option is not accepted by {@link #isValidOption(String)}, the hasher name is
   *           not recognized, or the modulus is not a positive integer (a non-numeric modulus
   *           surfaces as {@link NumberFormatException})
   */
  @SuppressFBWarnings(value = "UNSAFE_HASH_EQUALS",
      justification = "these hashes don't protect any secrets, just used for binning")
  @Override
  public void init(SamplerConfiguration config) {
    String hasherOpt = config.getOptions().get("hasher");
    String modulusOpt = config.getOptions().get("modulus");

    requireNonNull(hasherOpt, "Hasher not specified");
    requireNonNull(modulusOpt, "Modulus not specified");

    for (String option : config.getOptions().keySet()) {
      checkArgument(isValidOption(option), "Unknown option : %s", option);
    }

    // Resolve into locals first; the fields are only assigned once everything has been validated.
    HashFunction selectedFunction;
    switch (hasherOpt) {
      case "murmur3_32":
        selectedFunction = Hashing.murmur3_32();
        break;
      case "md5":
        @SuppressWarnings("deprecation")
        HashFunction deprecatedMd5 = Hashing.md5();
        selectedFunction = deprecatedMd5;
        break;
      case "sha1":
        @SuppressWarnings("deprecation")
        HashFunction deprecatedSha1 = Hashing.sha1();
        selectedFunction = deprecatedSha1;
        break;
      default:
        throw new IllegalArgumentException("Unknown hasher " + hasherOpt);
    }

    int parsedModulus = Integer.parseInt(modulusOpt);
    // A modulus of zero would make every accept(Key) call fail with a division by zero, so reject
    // anything that is not a positive integer up front.
    checkArgument(parsedModulus > 0, "Modulus must be a positive integer : %s", modulusOpt);

    hashFunction = selectedFunction;
    modulus = parsedModulus;
  }

  /**
   * Subclass must override this method and hash some portion of the key.
   *
   * @param hasher
   *          Data written to this will be used to compute the hash for the key.
   * @param k
   *          the key whose data should be written to {@code hasher}
   * @throws IOException
   *           if writing to {@code hasher} fails
   */
  protected abstract void hash(DataOutput hasher, Key k) throws IOException;

  /**
   * Decides whether the key belongs to the sample by hashing the portion of the key chosen by
   * {@link #hash(DataOutput, Key)} and testing {@code hash % modulus == 0}.
   *
   * @param k
   *          the key to test
   * @return true if the key's hash is evenly divisible by the configured modulus
   * @throws RuntimeException
   *           wrapping any {@link IOException} thrown by {@link #hash(DataOutput, Key)}
   */
  @Override
  public boolean accept(Key k) {
    Hasher hasher = hashFunction.newHasher();
    try {
      hash(new DataoutputHasher(hasher), k);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
    // The int hash may be negative; the remainder then is too, but comparing against zero is
    // unaffected by sign.
    return hasher.hash().asInt() % modulus == 0;
  }
}