// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
| |
| #ifndef IMPALA_EXPERIMENT_DATAPROVIDER_H |
| #define IMPALA_EXPERIMENT_DATAPROVIDER_H |
| |
#include <math.h>

#include <iostream>
#include <limits>
#include <vector>

#include <boost/cstdint.hpp>
#include <boost/generator_iterator.hpp>
#include <boost/random/linear_congruential.hpp>
#include <boost/random/uniform_int.hpp>
#include <boost/random/uniform_real.hpp>
#include <boost/random/variate_generator.hpp>
#include <boost/scoped_ptr.hpp>

#include "runtime/mem-pool.h"
#include "runtime/types.h"
#include "runtime/string-value.h"
#include "util/runtime-profile.h"
| |
| /// This is a test utility class that can generate data that is similar to the tuple |
| /// data we use. |
| /// It can accept columns descriptions and generates rows (in batches) with an iterator |
| /// interface. |
| // |
| /// See data-provider-test.cc on how to use this. |
| // |
| /// TODO: provide a way to have better control over the pool strings are allocated to |
| /// TODO: provide a way to control data skew. This is pretty easy with the boost rand |
| /// classes. |
| class DataProvider { |
| public: |
| struct Value { |
| union { |
| bool b; |
| int8_t int8; |
| int16_t int16; |
| int32_t int32; |
| int64_t int64; |
| float f; |
| double d; |
| }; |
| impala::StringValue s; |
| }; |
| |
| /// How the data should be generated. |
| enum DataGen { |
| UNIFORM_RANDOM, |
| SEQUENTIAL, |
| }; |
| |
| class ColDesc { |
| public: |
| /// Create a column desc with min/max range and the data gen type |
| template<typename T> |
| static ColDesc Create(const T& min, const T& max, DataGen gen = UNIFORM_RANDOM); |
| |
| private: |
| friend class DataProvider; |
| |
| /// Generates a column value between [min,max) for this column. |
| /// d is a random value between [0,1] and i is the row index. |
| template<typename T> |
| T Generate(double d, int i) const; |
| |
| |
| ColDesc(impala::PrimitiveType type, int bytes) { |
| this->type = type; |
| this->bytes = bytes; |
| } |
| |
| /// Default generator - used for int and float types |
| template<typename T> |
| T Generate(double d, int i, T min, T max) const { |
| switch (gen_type) { |
| case UNIFORM_RANDOM: |
| return (T)(d * (max - min) + min); |
| case SEQUENTIAL: |
| return (T)(i % (int64_t)(max - min) + min); |
| } |
| return 0; |
| } |
| |
| impala::PrimitiveType type; |
| Value min, max; |
| DataGen gen_type; |
| int bytes; |
| }; |
| |
| /// Create a data provider object with a pool for allocating memory and a |
| /// profile to collect metrics. |
| DataProvider(impala::MemPool* pool, impala::RuntimeProfile* profile); |
| |
| /// Reset the generator with the column description. |
| /// - num_rows: total rows to generate |
| /// - batch_size: size of generated batches from NextBatch |
| /// Data returned via previous NextBatch calls is no longer valid |
| void Reset(int num_rows, int batch_size, const std::vector<ColDesc>& columns); |
| |
| /// Sets the seed to use for randomly generated data. The default generator will |
| /// use seed(0) |
| void SetSeed(int seed); |
| |
| /// The size of a row (tuple size) |
| int row_size() const { return row_size_; } |
| |
| /// The total number of rows that will be generated |
| int total_rows() const { return num_rows_; } |
| |
| /// Generated the next batch, returning a pointer to the start of the batch |
| /// and the number of rows generated. |
| /// Returns NULL/0 when the generator is done. |
| void* NextBatch(int* rows_returned); |
| |
| /// Print the row data in csv format. |
| void Print(std::ostream*, char* data, int num_rows) const; |
| |
| private: |
| impala::MemPool* pool_; |
| impala::RuntimeProfile* profile_; |
| int num_rows_; |
| int batch_size_; |
| int rows_returned_; |
| boost::scoped_ptr<char> data_; |
| int row_size_; |
| boost::minstd_rand rand_generator_; |
| std::vector<ColDesc> cols_; |
| |
| impala::RuntimeProfile::Counter* bytes_generated_; |
| }; |
| |
| template<> |
| inline DataProvider::ColDesc DataProvider::ColDesc::Create<bool>( |
| const bool& min, const bool &max, DataGen gen) { |
| ColDesc c(impala::TYPE_BOOLEAN, 1); |
| c.min.b = min; |
| c.max.b = max; |
| c.gen_type = gen; |
| return c; |
| } |
| template<> |
| inline DataProvider::ColDesc DataProvider::ColDesc::Create<int8_t>( |
| const int8_t& min, const int8_t& max, DataGen gen) { |
| ColDesc c(impala::TYPE_TINYINT, 1); |
| c.min.int8 = min; |
| c.max.int8 = max; |
| c.gen_type = gen; |
| return c; |
| } |
| template<> |
| inline DataProvider::ColDesc DataProvider::ColDesc::Create<int16_t>( |
| const int16_t& min, const int16_t& max, DataGen gen) { |
| ColDesc c(impala::TYPE_SMALLINT, 2); |
| c.min.int16 = min; |
| c.max.int16 = max; |
| c.gen_type = gen; |
| return c; |
| } |
| template<> |
| inline DataProvider::ColDesc DataProvider::ColDesc::Create<int32_t>( |
| const int32_t& min, const int32_t& max, DataGen gen) { |
| ColDesc c(impala::TYPE_INT, 4); |
| c.min.int32 = min; |
| c.max.int32 = max; |
| c.gen_type = gen; |
| return c; |
| } |
| template<> |
| inline DataProvider::ColDesc DataProvider::ColDesc::Create<int64_t>( |
| const int64_t& min, const int64_t& max, DataGen gen) { |
| ColDesc c(impala::TYPE_BIGINT, 8); |
| c.min.int64 = min; |
| c.max.int64 = max; |
| c.gen_type = gen; |
| return c; |
| } |
| template<> |
| inline DataProvider::ColDesc DataProvider::ColDesc::Create<float>( |
| const float& min, const float& max, DataGen gen) { |
| ColDesc c(impala::TYPE_FLOAT, 4); |
| c.min.f = min; |
| c.max.f = max; |
| c.gen_type = gen; |
| return c; |
| } |
| template<> |
| inline DataProvider::ColDesc DataProvider::ColDesc::Create<double>( |
| const double& min, const double& max, DataGen gen) { |
| ColDesc c(impala::TYPE_DOUBLE, 8); |
| c.min.d = min; |
| c.max.d = max; |
| c.gen_type = gen; |
| return c; |
| } |
| template<> inline |
| DataProvider::ColDesc DataProvider::ColDesc::Create<impala::StringValue>( |
| const impala::StringValue& min, const impala::StringValue& max, DataGen gen) { |
| ColDesc c(impala::TYPE_STRING, 16); |
| c.min.s = min; |
| c.max.s = max; |
| c.gen_type = gen; |
| return c; |
| } |
| |
| |
| template<> inline bool DataProvider::ColDesc::Generate<bool>(double d, int i) const { |
| switch (gen_type) { |
| case UNIFORM_RANDOM: |
| return (int)(round(d * max.b - min.b)) + min.b; |
| case SEQUENTIAL: |
| return (i % 2) ? true : false; |
| } |
| return false; |
| } |
| template<> inline int8_t DataProvider::ColDesc::Generate<int8_t>(double d, int i) const { |
| return Generate<int8_t>(d, i, min.int8, max.int8); |
| } |
| template<> inline int16_t DataProvider::ColDesc::Generate<int16_t>(double d, int i) const { |
| return Generate<int16_t>(d, i, min.int16, max.int16); |
| } |
| template<> inline int32_t DataProvider::ColDesc::Generate<int32_t>(double d, int i) const { |
| return Generate<int32_t>(d, i, min.int32, max.int32); |
| } |
| template<> inline int64_t DataProvider::ColDesc::Generate<int64_t>(double d, int i) const { |
| return Generate<int64_t>(d, i, min.int64, max.int64); |
| } |
| template<> inline float DataProvider::ColDesc::Generate<float>(double d, int i) const { |
| return Generate<float>(d, i, min.f, max.f); |
| } |
| template<> inline double DataProvider::ColDesc::Generate<double>(double d, int i) const { |
| return Generate<double>(d, i, min.d, max.d); |
| } |
| |
| #endif |