blob: 376c250ecf4e545e7a60942692418a73d40fa6ab [file] [log] [blame]
package org.apache.cassandra.hadoop;
/*
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*/
import java.io.IOException;
import java.util.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.cassandra.dht.IPartitioner;
import org.apache.cassandra.io.compress.CompressionParameters;
import org.apache.cassandra.thrift.*;
import org.apache.cassandra.utils.FBUtilities;
import org.apache.cassandra.utils.Hex;
import org.apache.hadoop.conf.Configuration;
import org.apache.thrift.TBase;
import org.apache.thrift.TDeserializer;
import org.apache.thrift.TException;
import org.apache.thrift.TSerializer;
import org.apache.thrift.protocol.TBinaryProtocol;
import org.apache.thrift.transport.TTransport;
public class ConfigHelper
{
private static final String INPUT_PARTITIONER_CONFIG = "cassandra.input.partitioner.class";
private static final String OUTPUT_PARTITIONER_CONFIG = "cassandra.output.partitioner.class";
private static final String INPUT_KEYSPACE_CONFIG = "cassandra.input.keyspace";
private static final String OUTPUT_KEYSPACE_CONFIG = "cassandra.output.keyspace";
private static final String INPUT_KEYSPACE_USERNAME_CONFIG = "cassandra.input.keyspace.username";
private static final String INPUT_KEYSPACE_PASSWD_CONFIG = "cassandra.input.keyspace.passwd";
private static final String OUTPUT_KEYSPACE_USERNAME_CONFIG = "cassandra.output.keyspace.username";
private static final String OUTPUT_KEYSPACE_PASSWD_CONFIG = "cassandra.output.keyspace.passwd";
private static final String INPUT_COLUMNFAMILY_CONFIG = "cassandra.input.columnfamily";
private static final String OUTPUT_COLUMNFAMILY_CONFIG = "mapreduce.output.basename"; //this must == OutputFormat.BASE_OUTPUT_NAME
private static final String INPUT_PREDICATE_CONFIG = "cassandra.input.predicate";
private static final String INPUT_KEYRANGE_CONFIG = "cassandra.input.keyRange";
private static final String INPUT_SPLIT_SIZE_CONFIG = "cassandra.input.split.size";
private static final String INPUT_SPLIT_SIZE_IN_MB_CONFIG = "cassandra.input.split.size_mb";
private static final String INPUT_WIDEROWS_CONFIG = "cassandra.input.widerows";
private static final int DEFAULT_SPLIT_SIZE = 64 * 1024;
private static final String RANGE_BATCH_SIZE_CONFIG = "cassandra.range.batch.size";
private static final int DEFAULT_RANGE_BATCH_SIZE = 4096;
private static final String INPUT_THRIFT_PORT = "cassandra.input.thrift.port";
private static final String OUTPUT_THRIFT_PORT = "cassandra.output.thrift.port";
private static final String INPUT_INITIAL_THRIFT_ADDRESS = "cassandra.input.thrift.address";
private static final String OUTPUT_INITIAL_THRIFT_ADDRESS = "cassandra.output.thrift.address";
private static final String READ_CONSISTENCY_LEVEL = "cassandra.consistencylevel.read";
private static final String WRITE_CONSISTENCY_LEVEL = "cassandra.consistencylevel.write";
private static final String OUTPUT_COMPRESSION_CLASS = "cassandra.output.compression.class";
private static final String OUTPUT_COMPRESSION_CHUNK_LENGTH = "cassandra.output.compression.length";
private static final String OUTPUT_LOCAL_DC_ONLY = "cassandra.output.local.dc.only";
private static final String THRIFT_FRAMED_TRANSPORT_SIZE_IN_MB = "cassandra.thrift.framed.size_mb";
private static final Logger logger = LoggerFactory.getLogger(ConfigHelper.class);
/**
* Set the keyspace and column family for the input of this job.
*
* @param conf Job configuration you are about to run
* @param keyspace
* @param columnFamily
* @param widerows
*/
public static void setInputColumnFamily(Configuration conf, String keyspace, String columnFamily, boolean widerows)
{
if (keyspace == null)
throw new UnsupportedOperationException("keyspace may not be null");
if (columnFamily == null)
throw new UnsupportedOperationException("table may not be null");
conf.set(INPUT_KEYSPACE_CONFIG, keyspace);
conf.set(INPUT_COLUMNFAMILY_CONFIG, columnFamily);
conf.set(INPUT_WIDEROWS_CONFIG, String.valueOf(widerows));
}
/**
* Set the keyspace and column family for the input of this job.
*
* @param conf Job configuration you are about to run
* @param keyspace
* @param columnFamily
*/
public static void setInputColumnFamily(Configuration conf, String keyspace, String columnFamily)
{
setInputColumnFamily(conf, keyspace, columnFamily, false);
}
/**
* Set the keyspace for the output of this job.
*
* @param conf Job configuration you are about to run
* @param keyspace
*/
public static void setOutputKeyspace(Configuration conf, String keyspace)
{
if (keyspace == null)
throw new UnsupportedOperationException("keyspace may not be null");
conf.set(OUTPUT_KEYSPACE_CONFIG, keyspace);
}
/**
* Set the column family for the output of this job.
*
* @param conf Job configuration you are about to run
* @param columnFamily
*/
public static void setOutputColumnFamily(Configuration conf, String columnFamily)
{
conf.set(OUTPUT_COLUMNFAMILY_CONFIG, columnFamily);
}
/**
* Set the column family for the output of this job.
*
* @param conf Job configuration you are about to run
* @param keyspace
* @param columnFamily
*/
public static void setOutputColumnFamily(Configuration conf, String keyspace, String columnFamily)
{
setOutputKeyspace(conf, keyspace);
setOutputColumnFamily(conf, columnFamily);
}
/**
* The number of rows to request with each get range slices request.
* Too big and you can either get timeouts when it takes Cassandra too
* long to fetch all the data. Too small and the performance
* will be eaten up by the overhead of each request.
*
* @param conf Job configuration you are about to run
* @param batchsize Number of rows to request each time
*/
public static void setRangeBatchSize(Configuration conf, int batchsize)
{
conf.setInt(RANGE_BATCH_SIZE_CONFIG, batchsize);
}
/**
* The number of rows to request with each get range slices request.
* Too big and you can either get timeouts when it takes Cassandra too
* long to fetch all the data. Too small and the performance
* will be eaten up by the overhead of each request.
*
* @param conf Job configuration you are about to run
* @return Number of rows to request each time
*/
public static int getRangeBatchSize(Configuration conf)
{
return conf.getInt(RANGE_BATCH_SIZE_CONFIG, DEFAULT_RANGE_BATCH_SIZE);
}
/**
* Set the size of the input split.
* This affects the number of maps created, if the number is too small
* the overhead of each map will take up the bulk of the job time.
*
* @param conf Job configuration you are about to run
* @param splitsize Number of partitions in the input split
*/
public static void setInputSplitSize(Configuration conf, int splitsize)
{
conf.setInt(INPUT_SPLIT_SIZE_CONFIG, splitsize);
}
public static int getInputSplitSize(Configuration conf)
{
return conf.getInt(INPUT_SPLIT_SIZE_CONFIG, DEFAULT_SPLIT_SIZE);
}
/**
* Set the size of the input split. getInputSplitSize value is used if this is not set.
* This affects the number of maps created, if the number is too small
* the overhead of each map will take up the bulk of the job time.
*
* @param conf Job configuration you are about to run
* @param splitSizeMb Input split size in MB
*/
public static void setInputSplitSizeInMb(Configuration conf, int splitSizeMb)
{
conf.setInt(INPUT_SPLIT_SIZE_IN_MB_CONFIG, splitSizeMb);
}
/**
* cassandra.input.split.size will be used if the value is undefined or negative.
* @param conf Job configuration you are about to run
* @return split size in MB or -1 if it is undefined.
*/
public static int getInputSplitSizeInMb(Configuration conf)
{
return conf.getInt(INPUT_SPLIT_SIZE_IN_MB_CONFIG, -1);
}
/**
* Set the predicate that determines what columns will be selected from each row.
*
* @param conf Job configuration you are about to run
* @param predicate
*/
public static void setInputSlicePredicate(Configuration conf, SlicePredicate predicate)
{
conf.set(INPUT_PREDICATE_CONFIG, thriftToString(predicate));
}
public static SlicePredicate getInputSlicePredicate(Configuration conf)
{
String s = conf.get(INPUT_PREDICATE_CONFIG);
return s == null ? null : predicateFromString(s);
}
private static String thriftToString(TBase object)
{
assert object != null;
// this is so awful it's kind of cool!
TSerializer serializer = new TSerializer(new TBinaryProtocol.Factory());
try
{
return Hex.bytesToHex(serializer.serialize(object));
}
catch (TException e)
{
throw new RuntimeException(e);
}
}
private static SlicePredicate predicateFromString(String st)
{
assert st != null;
TDeserializer deserializer = new TDeserializer(new TBinaryProtocol.Factory());
SlicePredicate predicate = new SlicePredicate();
try
{
deserializer.deserialize(predicate, Hex.hexToBytes(st));
}
catch (TException e)
{
throw new RuntimeException(e);
}
return predicate;
}
/**
* Set the KeyRange to limit the rows.
* @param conf Job configuration you are about to run
*/
public static void setInputRange(Configuration conf, String startToken, String endToken)
{
KeyRange range = new KeyRange().setStart_token(startToken).setEnd_token(endToken);
conf.set(INPUT_KEYRANGE_CONFIG, thriftToString(range));
}
/**
* Set the KeyRange to limit the rows.
* @param conf Job configuration you are about to run
*/
public static void setInputRange(Configuration conf, String startToken, String endToken, List<IndexExpression> filter)
{
KeyRange range = new KeyRange().setStart_token(startToken).setEnd_token(endToken).setRow_filter(filter);
conf.set(INPUT_KEYRANGE_CONFIG, thriftToString(range));
}
/**
* Set the KeyRange to limit the rows.
* @param conf Job configuration you are about to run
*/
public static void setInputRange(Configuration conf, List<IndexExpression> filter)
{
KeyRange range = new KeyRange().setRow_filter(filter);
conf.set(INPUT_KEYRANGE_CONFIG, thriftToString(range));
}
/** may be null if unset */
public static KeyRange getInputKeyRange(Configuration conf)
{
String str = conf.get(INPUT_KEYRANGE_CONFIG);
return str == null ? null : keyRangeFromString(str);
}
private static KeyRange keyRangeFromString(String st)
{
assert st != null;
TDeserializer deserializer = new TDeserializer(new TBinaryProtocol.Factory());
KeyRange keyRange = new KeyRange();
try
{
deserializer.deserialize(keyRange, Hex.hexToBytes(st));
}
catch (TException e)
{
throw new RuntimeException(e);
}
return keyRange;
}
public static String getInputKeyspace(Configuration conf)
{
return conf.get(INPUT_KEYSPACE_CONFIG);
}
public static String getOutputKeyspace(Configuration conf)
{
return conf.get(OUTPUT_KEYSPACE_CONFIG);
}
public static void setInputKeyspaceUserNameAndPassword(Configuration conf, String username, String password)
{
setInputKeyspaceUserName(conf, username);
setInputKeyspacePassword(conf, password);
}
public static void setInputKeyspaceUserName(Configuration conf, String username)
{
conf.set(INPUT_KEYSPACE_USERNAME_CONFIG, username);
}
public static String getInputKeyspaceUserName(Configuration conf)
{
return conf.get(INPUT_KEYSPACE_USERNAME_CONFIG);
}
public static void setInputKeyspacePassword(Configuration conf, String password)
{
conf.set(INPUT_KEYSPACE_PASSWD_CONFIG, password);
}
public static String getInputKeyspacePassword(Configuration conf)
{
return conf.get(INPUT_KEYSPACE_PASSWD_CONFIG);
}
public static void setOutputKeyspaceUserNameAndPassword(Configuration conf, String username, String password)
{
setOutputKeyspaceUserName(conf, username);
setOutputKeyspacePassword(conf, password);
}
public static void setOutputKeyspaceUserName(Configuration conf, String username)
{
conf.set(OUTPUT_KEYSPACE_USERNAME_CONFIG, username);
}
public static String getOutputKeyspaceUserName(Configuration conf)
{
return conf.get(OUTPUT_KEYSPACE_USERNAME_CONFIG);
}
public static void setOutputKeyspacePassword(Configuration conf, String password)
{
conf.set(OUTPUT_KEYSPACE_PASSWD_CONFIG, password);
}
public static String getOutputKeyspacePassword(Configuration conf)
{
return conf.get(OUTPUT_KEYSPACE_PASSWD_CONFIG);
}
public static String getInputColumnFamily(Configuration conf)
{
return conf.get(INPUT_COLUMNFAMILY_CONFIG);
}
public static String getOutputColumnFamily(Configuration conf)
{
if (conf.get(OUTPUT_COLUMNFAMILY_CONFIG) != null)
return conf.get(OUTPUT_COLUMNFAMILY_CONFIG);
else
throw new UnsupportedOperationException("You must set the output column family using either setOutputColumnFamily or by adding a named output with MultipleOutputs");
}
public static boolean getInputIsWide(Configuration conf)
{
return Boolean.parseBoolean(conf.get(INPUT_WIDEROWS_CONFIG));
}
public static String getReadConsistencyLevel(Configuration conf)
{
return conf.get(READ_CONSISTENCY_LEVEL, "LOCAL_ONE");
}
public static void setReadConsistencyLevel(Configuration conf, String consistencyLevel)
{
conf.set(READ_CONSISTENCY_LEVEL, consistencyLevel);
}
public static String getWriteConsistencyLevel(Configuration conf)
{
return conf.get(WRITE_CONSISTENCY_LEVEL, "LOCAL_ONE");
}
public static void setWriteConsistencyLevel(Configuration conf, String consistencyLevel)
{
conf.set(WRITE_CONSISTENCY_LEVEL, consistencyLevel);
}
public static int getInputRpcPort(Configuration conf)
{
return Integer.parseInt(conf.get(INPUT_THRIFT_PORT, "9160"));
}
public static void setInputRpcPort(Configuration conf, String port)
{
conf.set(INPUT_THRIFT_PORT, port);
}
public static String getInputInitialAddress(Configuration conf)
{
return conf.get(INPUT_INITIAL_THRIFT_ADDRESS);
}
public static void setInputInitialAddress(Configuration conf, String address)
{
conf.set(INPUT_INITIAL_THRIFT_ADDRESS, address);
}
public static void setInputPartitioner(Configuration conf, String classname)
{
conf.set(INPUT_PARTITIONER_CONFIG, classname);
}
public static IPartitioner getInputPartitioner(Configuration conf)
{
return FBUtilities.newPartitioner(conf.get(INPUT_PARTITIONER_CONFIG));
}
public static int getOutputRpcPort(Configuration conf)
{
return Integer.parseInt(conf.get(OUTPUT_THRIFT_PORT, "9160"));
}
public static void setOutputRpcPort(Configuration conf, String port)
{
conf.set(OUTPUT_THRIFT_PORT, port);
}
public static String getOutputInitialAddress(Configuration conf)
{
return conf.get(OUTPUT_INITIAL_THRIFT_ADDRESS);
}
public static void setOutputInitialAddress(Configuration conf, String address)
{
conf.set(OUTPUT_INITIAL_THRIFT_ADDRESS, address);
}
public static void setOutputPartitioner(Configuration conf, String classname)
{
conf.set(OUTPUT_PARTITIONER_CONFIG, classname);
}
public static IPartitioner getOutputPartitioner(Configuration conf)
{
return FBUtilities.newPartitioner(conf.get(OUTPUT_PARTITIONER_CONFIG));
}
public static String getOutputCompressionClass(Configuration conf)
{
return conf.get(OUTPUT_COMPRESSION_CLASS);
}
public static String getOutputCompressionChunkLength(Configuration conf)
{
return conf.get(OUTPUT_COMPRESSION_CHUNK_LENGTH, String.valueOf(CompressionParameters.DEFAULT_CHUNK_LENGTH));
}
public static void setOutputCompressionClass(Configuration conf, String classname)
{
conf.set(OUTPUT_COMPRESSION_CLASS, classname);
}
public static void setOutputCompressionChunkLength(Configuration conf, String length)
{
conf.set(OUTPUT_COMPRESSION_CHUNK_LENGTH, length);
}
public static void setThriftFramedTransportSizeInMb(Configuration conf, int frameSizeInMB)
{
conf.setInt(THRIFT_FRAMED_TRANSPORT_SIZE_IN_MB, frameSizeInMB);
}
/**
* @param conf The configuration to use.
* @return Value (converts MBs to Bytes) set by {@link #setThriftFramedTransportSizeInMb(Configuration, int)} or default of 15MB
*/
public static int getThriftFramedTransportSize(Configuration conf)
{
return conf.getInt(THRIFT_FRAMED_TRANSPORT_SIZE_IN_MB, 15) * 1024 * 1024; // 15MB is default in Cassandra
}
public static CompressionParameters getOutputCompressionParamaters(Configuration conf)
{
if (getOutputCompressionClass(conf) == null)
return new CompressionParameters(null);
Map<String, String> options = new HashMap<String, String>(2);
options.put(CompressionParameters.SSTABLE_COMPRESSION, getOutputCompressionClass(conf));
options.put(CompressionParameters.CHUNK_LENGTH_KB, getOutputCompressionChunkLength(conf));
return CompressionParameters.create(options);
}
public static boolean getOutputLocalDCOnly(Configuration conf)
{
return Boolean.parseBoolean(conf.get(OUTPUT_LOCAL_DC_ONLY, "false"));
}
public static void setOutputLocalDCOnly(Configuration conf, boolean localDCOnly)
{
conf.set(OUTPUT_LOCAL_DC_ONLY, Boolean.toString(localDCOnly));
}
public static Cassandra.Client getClientFromInputAddressList(Configuration conf) throws IOException
{
return getClientFromAddressList(conf, ConfigHelper.getInputInitialAddress(conf).split(","), ConfigHelper.getInputRpcPort(conf));
}
public static Cassandra.Client getClientFromOutputAddressList(Configuration conf) throws IOException
{
return getClientFromAddressList(conf, ConfigHelper.getOutputInitialAddress(conf).split(","), ConfigHelper.getOutputRpcPort(conf));
}
private static Cassandra.Client getClientFromAddressList(Configuration conf, String[] addresses, int port) throws IOException
{
Cassandra.Client client = null;
List<IOException> exceptions = new ArrayList<IOException>();
for (String address : addresses)
{
try
{
client = createConnection(conf, address, port);
break;
}
catch (IOException ioe)
{
exceptions.add(ioe);
}
}
if (client == null)
{
logger.error("failed to connect to any initial addresses");
for (IOException ioe : exceptions)
{
logger.error("", ioe);
}
throw exceptions.get(exceptions.size() - 1);
}
return client;
}
@SuppressWarnings("resource")
public static Cassandra.Client createConnection(Configuration conf, String host, Integer port) throws IOException
{
try
{
TTransport transport = getClientTransportFactory(conf).openTransport(host, port);
return new Cassandra.Client(new TBinaryProtocol(transport, true, true));
}
catch (Exception e)
{
throw new IOException("Unable to connect to server " + host + ":" + port, e);
}
}
public static ITransportFactory getClientTransportFactory(Configuration conf)
{
String factoryClassName = conf.get(ITransportFactory.PROPERTY_KEY, TFramedTransportFactory.class.getName());
ITransportFactory factory = getClientTransportFactory(factoryClassName);
Map<String, String> options = getOptions(conf, factory.supportedOptions());
factory.setOptions(options);
return factory;
}
private static ITransportFactory getClientTransportFactory(String factoryClassName)
{
try
{
return (ITransportFactory) Class.forName(factoryClassName).newInstance();
}
catch (Exception e)
{
throw new RuntimeException("Failed to instantiate transport factory:" + factoryClassName, e);
}
}
private static Map<String, String> getOptions(Configuration conf, Set<String> supportedOptions)
{
Map<String, String> options = new HashMap<>();
for (String optionKey : supportedOptions)
{
String optionValue = conf.get(optionKey);
if (optionValue != null)
options.put(optionKey, optionValue);
}
return options;
}
}