// blob: fe0b2a9ef27373dec49e0df008b9c02edd613bdc [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#ifndef IMPALA_EXPERIMENT_DATAPROVIDER_H
#define IMPALA_EXPERIMENT_DATAPROVIDER_H
#include <math.h>

#include <iostream>
#include <limits>
#include <vector>

#include <boost/cstdint.hpp>
#include <boost/generator_iterator.hpp>
#include <boost/random/linear_congruential.hpp>
#include <boost/random/uniform_int.hpp>
#include <boost/random/uniform_real.hpp>
#include <boost/random/variate_generator.hpp>
#include <boost/scoped_ptr.hpp>

#include "runtime/mem-pool.h"
#include "runtime/string-value.h"
#include "runtime/types.h"
#include "util/runtime-profile.h"
/// This is a test utility class that can generate data that is similar to the tuple
/// data we use.
/// It accepts column descriptions and generates rows (in batches) through an
/// iterator-style interface: call Reset() once, then NextBatch() repeatedly.
///
/// See data-provider-test.cc for how to use this.
///
/// TODO: provide a way to have better control over the pool strings are allocated to
/// TODO: provide a way to control data skew. This is pretty easy with the boost rand
/// classes.
class DataProvider {
 public:
  /// Holds one column bound (a min or a max). The scalar alternatives share storage
  /// in the anonymous union, so only the member matching the column's type is
  /// meaningful. The string bound is stored outside the union.
  struct Value {
    union {
      bool b;
      int8_t int8;
      int16_t int16;
      int32_t int32;
      int64_t int64;
      float f;
      double d;
    };
    impala::StringValue s;
  };

  /// How the data should be generated.
  enum DataGen {
    UNIFORM_RANDOM,
    SEQUENTIAL,
  };

  /// Description of one generated column: its primitive type, its min/max bounds,
  /// the generation mode and a byte count.
  class ColDesc {
   public:
    /// Create a column desc with min/max range and the data gen type.
    /// Only the primary template is declared here; this header supplies explicit
    /// specializations for bool, int8_t, int16_t, int32_t, int64_t, float, double
    /// and impala::StringValue.
    template<typename T>
    static ColDesc Create(const T& min, const T& max, DataGen gen = UNIFORM_RANDOM);

   private:
    friend class DataProvider;

    /// Generates a column value between [min,max) for this column.
    /// d is a random value between [0,1] and i is the row index.
    template<typename T>
    T Generate(double d, int i) const;

    /// Records the column type and byte count. min/max are left for Create() to
    /// fill in. gen_type gets a defined default so that a ColDesc never carries an
    /// indeterminate generation mode before Create() assigns the requested one.
    ColDesc(impala::PrimitiveType type, int bytes)
      : type(type), gen_type(UNIFORM_RANDOM), bytes(bytes) {
    }

    /// Default generator - used for int and float types.
    /// - UNIFORM_RANDOM: maps d linearly onto the range, i.e. d * (max - min) + min.
    /// - SEQUENTIAL: cycles through the range by row index, i.e.
    ///   i % (max - min) + min, with the range width truncated to an integer.
    /// Returns 0 if gen_type holds a value outside the DataGen enumerators.
    template<typename T>
    T Generate(double d, int i, T min, T max) const {
      switch (gen_type) {
        case UNIFORM_RANDOM:
          return static_cast<T>(d * (max - min) + min);
        case SEQUENTIAL: {
          // The width is truncated toward zero, so min == max (or a floating-point
          // range narrower than 1) gives a zero span. Taking i % 0 is undefined
          // behaviour; the only sensible value for such a range is min itself.
          const int64_t span = static_cast<int64_t>(max - min);
          if (span == 0) return min;
          return static_cast<T>(i % span + min);
        }
      }
      return 0;
    }

    impala::PrimitiveType type;
    Value min, max;
    DataGen gen_type;
    int bytes;
  };

  /// Create a data provider object with a pool for allocating memory and a
  /// profile to collect metrics.
  DataProvider(impala::MemPool* pool, impala::RuntimeProfile* profile);

  /// Reset the generator with the column description.
  ///  - num_rows: total rows to generate
  ///  - batch_size: size of generated batches from NextBatch
  /// Data returned via previous NextBatch calls is no longer valid
  void Reset(int num_rows, int batch_size, const std::vector<ColDesc>& columns);

  /// Sets the seed to use for randomly generated data. The default generator will
  /// use seed(0)
  void SetSeed(int seed);

  /// The size of a row (tuple size)
  int row_size() const { return row_size_; }

  /// The total number of rows that will be generated
  int total_rows() const { return num_rows_; }

  /// Generates the next batch, returning a pointer to the start of the batch
  /// and the number of rows generated.
  /// Returns NULL/0 when the generator is done.
  void* NextBatch(int* rows_returned);

  /// Print the row data in csv format.
  void Print(std::ostream*, char* data, int num_rows) const;

 private:
  /// Pool and profile handed to the constructor.
  impala::MemPool* pool_;
  impala::RuntimeProfile* profile_;

  /// Total number of rows to generate (see total_rows()).
  int num_rows_;
  /// Presumably the batch size passed to Reset() - confirm against the .cc.
  int batch_size_;
  /// Presumably the number of rows handed out so far - confirm against the .cc.
  int rows_returned_;
  /// NOTE(review): scoped_ptr releases its pointee with `delete`. If the .cc
  /// allocates this buffer with `new char[...]`, it should be a
  /// boost::scoped_array<char> instead - confirm against the implementation.
  boost::scoped_ptr<char> data_;
  /// Size of one generated row (see row_size()).
  int row_size_;
  boost::minstd_rand rand_generator_;
  std::vector<ColDesc> cols_;
  impala::RuntimeProfile::Counter* bytes_generated_;
};
/// Builds the descriptor for a boolean column: type TYPE_BOOLEAN, byte count 1,
/// with the given bounds stored in the `b` member and `gen` as generation mode.
template<>
inline DataProvider::ColDesc DataProvider::ColDesc::Create<bool>(
    const bool& min, const bool &max, DataGen gen) {
  ColDesc desc(impala::TYPE_BOOLEAN, 1);
  desc.gen_type = gen;
  desc.max.b = max;
  desc.min.b = min;
  return desc;
}
/// Builds the descriptor for an int8_t column: type TYPE_TINYINT, byte count 1,
/// with the given bounds stored in the `int8` member and `gen` as generation mode.
template<>
inline DataProvider::ColDesc DataProvider::ColDesc::Create<int8_t>(
    const int8_t& min, const int8_t& max, DataGen gen) {
  ColDesc desc(impala::TYPE_TINYINT, 1);
  desc.gen_type = gen;
  desc.max.int8 = max;
  desc.min.int8 = min;
  return desc;
}
/// Builds the descriptor for an int16_t column: type TYPE_SMALLINT, byte count 2,
/// with the given bounds stored in the `int16` member and `gen` as generation mode.
template<>
inline DataProvider::ColDesc DataProvider::ColDesc::Create<int16_t>(
    const int16_t& min, const int16_t& max, DataGen gen) {
  ColDesc desc(impala::TYPE_SMALLINT, 2);
  desc.gen_type = gen;
  desc.max.int16 = max;
  desc.min.int16 = min;
  return desc;
}
/// Builds the descriptor for an int32_t column: type TYPE_INT, byte count 4,
/// with the given bounds stored in the `int32` member and `gen` as generation mode.
template<>
inline DataProvider::ColDesc DataProvider::ColDesc::Create<int32_t>(
    const int32_t& min, const int32_t& max, DataGen gen) {
  ColDesc desc(impala::TYPE_INT, 4);
  desc.gen_type = gen;
  desc.max.int32 = max;
  desc.min.int32 = min;
  return desc;
}
/// Builds the descriptor for an int64_t column: type TYPE_BIGINT, byte count 8,
/// with the given bounds stored in the `int64` member and `gen` as generation mode.
template<>
inline DataProvider::ColDesc DataProvider::ColDesc::Create<int64_t>(
    const int64_t& min, const int64_t& max, DataGen gen) {
  ColDesc desc(impala::TYPE_BIGINT, 8);
  desc.gen_type = gen;
  desc.max.int64 = max;
  desc.min.int64 = min;
  return desc;
}
/// Builds the descriptor for a float column: type TYPE_FLOAT, byte count 4,
/// with the given bounds stored in the `f` member and `gen` as generation mode.
template<>
inline DataProvider::ColDesc DataProvider::ColDesc::Create<float>(
    const float& min, const float& max, DataGen gen) {
  ColDesc desc(impala::TYPE_FLOAT, 4);
  desc.gen_type = gen;
  desc.max.f = max;
  desc.min.f = min;
  return desc;
}
/// Builds the descriptor for a double column: type TYPE_DOUBLE, byte count 8,
/// with the given bounds stored in the `d` member and `gen` as generation mode.
template<>
inline DataProvider::ColDesc DataProvider::ColDesc::Create<double>(
    const double& min, const double& max, DataGen gen) {
  ColDesc desc(impala::TYPE_DOUBLE, 8);
  desc.gen_type = gen;
  desc.max.d = max;
  desc.min.d = min;
  return desc;
}
/// Builds the descriptor for a string column: type TYPE_STRING, byte count 16,
/// with the given bounds copied into the `s` member and `gen` as generation mode.
/// NOTE(review): 16 is presumably sizeof(impala::StringValue) - confirm it still
/// matches the struct's actual size.
template<> inline
DataProvider::ColDesc DataProvider::ColDesc::Create<impala::StringValue>(
    const impala::StringValue& min, const impala::StringValue& max, DataGen gen) {
  ColDesc desc(impala::TYPE_STRING, 16);
  desc.gen_type = gen;
  desc.max.s = max;
  desc.min.s = min;
  return desc;
}
/// Generates a boolean value for this column.
///  - UNIFORM_RANDOM: scales d onto the range between min.b and max.b, rounds to
///    the nearest integer and offsets by min.b.
///  - SEQUENTIAL: alternates on the row index i (even rows false, odd rows true),
///    independent of min/max.
/// Returns false if gen_type holds a value outside the DataGen enumerators.
template<> inline bool DataProvider::ColDesc::Generate<bool>(double d, int i) const {
  switch (gen_type) {
    case UNIFORM_RANDOM:
      // The range width must be parenthesized before scaling. Written as
      // `d * max.b - min.b`, a [true, true] range with d <= 0.5 rounds to -1 and
      // the result becomes false, which lies outside the requested range.
      return static_cast<int>(round(d * (max.b - min.b))) + min.b;
    case SEQUENTIAL:
      return (i % 2) != 0;
  }
  return false;
}
/// int8_t generator: hands this column's int8 bounds to the shared four-argument
/// numeric generator.
template<> inline int8_t DataProvider::ColDesc::Generate<int8_t>(double d, int i) const {
  const int8_t lower = min.int8;
  const int8_t upper = max.int8;
  return Generate<int8_t>(d, i, lower, upper);
}
/// int16_t generator: hands this column's int16 bounds to the shared four-argument
/// numeric generator.
template<> inline int16_t DataProvider::ColDesc::Generate<int16_t>(double d, int i) const {
  const int16_t lower = min.int16;
  const int16_t upper = max.int16;
  return Generate<int16_t>(d, i, lower, upper);
}
/// int32_t generator: hands this column's int32 bounds to the shared four-argument
/// numeric generator.
template<> inline int32_t DataProvider::ColDesc::Generate<int32_t>(double d, int i) const {
  const int32_t lower = min.int32;
  const int32_t upper = max.int32;
  return Generate<int32_t>(d, i, lower, upper);
}
/// int64_t generator: hands this column's int64 bounds to the shared four-argument
/// numeric generator.
template<> inline int64_t DataProvider::ColDesc::Generate<int64_t>(double d, int i) const {
  const int64_t lower = min.int64;
  const int64_t upper = max.int64;
  return Generate<int64_t>(d, i, lower, upper);
}
/// float generator: hands this column's float bounds to the shared four-argument
/// numeric generator.
template<> inline float DataProvider::ColDesc::Generate<float>(double d, int i) const {
  const float lower = min.f;
  const float upper = max.f;
  return Generate<float>(d, i, lower, upper);
}
/// double generator: hands this column's double bounds to the shared four-argument
/// numeric generator.
template<> inline double DataProvider::ColDesc::Generate<double>(double d, int i) const {
  const double lower = min.d;
  const double upper = max.d;
  return Generate<double>(d, i, lower, upper);
}
#endif