blob: c0857e0b3a346a337fc67f835664870c38f3fb14 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include <iostream>
#include <boost/algorithm/string.hpp>
#include <boost/lexical_cast.hpp>
#include <boost/accumulators/accumulators.hpp>
#include <boost/accumulators/statistics/stats.hpp>
#include <boost/accumulators/statistics/variance.hpp>
#include "exprs/aggregate-functions.h"
#include "runtime/multi-precision.h"
#include "testutil/gtest-util.h"
#include "udf/udf.h"
#include "udf/uda-test-harness.h"
#include "util/decimal-util.h"
#include "common/names.h"
namespace tag = boost::accumulators::tag;
using boost::accumulators::accumulator_set;
using boost::accumulators::stats;
using boost::accumulators::variance;
using boost::algorithm::is_any_of;
using boost::algorithm::trim;
using namespace impala;
using namespace impala_udf;
template <int RANGE_START, int RANGE_END>
bool CheckAppxMedian(const IntVal& actual, const IntVal& expected) {
return actual.val >= RANGE_START && actual.val <= RANGE_END;
}
bool CheckHistogramDistribution(const StringVal& actual,
const StringVal& max_expected_stdev) {
string result(reinterpret_cast<char*>(actual.ptr), actual.len);
vector<string> str_vals;
split(str_vals, result, is_any_of(","));
accumulator_set<int, stats<tag::variance>> acc;
int prev_val = -1;
for (string& s: str_vals) {
trim(s);
int val = lexical_cast<int>(s);
if (prev_val != -1) acc(val - prev_val);
prev_val = val;
}
double actual_stdev = sqrt(variance(acc));
string expected_str(reinterpret_cast<char*>(max_expected_stdev.ptr),
max_expected_stdev.len);
return actual_stdev < lexical_cast<double>(expected_str);
}
// TODO: Add other datatypes
TEST(HistogramTest, TestInt) {
UdaTestHarness<StringVal, StringVal, IntVal> test_histogram(
AggregateFunctions::ReservoirSampleInit<IntVal>,
AggregateFunctions::ReservoirSampleUpdate<IntVal>,
AggregateFunctions::ReservoirSampleMerge<IntVal>,
AggregateFunctions::ReservoirSampleSerialize<IntVal>,
AggregateFunctions::HistogramFinalize<IntVal>);
UdaTestHarness<IntVal, StringVal, IntVal> test_median(
AggregateFunctions::ReservoirSampleInit<IntVal>,
AggregateFunctions::ReservoirSampleUpdate<IntVal>,
AggregateFunctions::ReservoirSampleMerge<IntVal>,
AggregateFunctions::ReservoirSampleSerialize<IntVal>,
AggregateFunctions::AppxMedianFinalize<IntVal>);
const int NUM_BUCKETS = 100;
const int INPUT_SIZE = NUM_BUCKETS * 1000;
// All input values are 1, result should be constant.
{
vector<IntVal> input(INPUT_SIZE, 1);
char expected[] = "1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, "
"1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, "
"1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, "
"1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1";
EXPECT_TRUE(test_histogram.Execute(input, StringVal(&expected[0])))
<< test_histogram.GetErrorMsg();
}
// Now check input values ranging from 0 to 100,000. Each bucket should have 1000
// values, i.e. bucket i should approximately contain values [100*i, 100*(i+1)]. We
// check the distribution of the deltas between histogram values is not too large.
// TODO: Add more deterministic test cases
{
vector<IntVal> input;
for (int i = 0; i < INPUT_SIZE; ++i) input.push_back(i);
test_histogram.SetResultComparator(CheckHistogramDistribution);
StringVal max_expected_stdev = StringVal("100.0");
EXPECT_TRUE(test_histogram.Execute(input, max_expected_stdev))
<< test_histogram.GetErrorMsg();
test_median.SetResultComparator(CheckAppxMedian<45000,55000>);
EXPECT_TRUE(test_median.Execute(input, IntVal()))
<< test_median.GetErrorMsg();
}
}
TEST(HistogramTest, TestDecimal) {
UdaTestHarness<StringVal, StringVal, DecimalVal> test(
AggregateFunctions::ReservoirSampleInit<DecimalVal>,
AggregateFunctions::ReservoirSampleUpdate<DecimalVal>,
AggregateFunctions::ReservoirSampleMerge<DecimalVal>,
AggregateFunctions::ReservoirSampleSerialize<DecimalVal>,
AggregateFunctions::HistogramFinalize<DecimalVal>);
const int NUM_BUCKETS = 100;
const int INPUT_SIZE = NUM_BUCKETS * 1000;
// All input values are x, result should be constant.
{
vector<DecimalVal> input;
int128_t val = DecimalUtil::MAX_UNSCALED_DECIMAL16;
stringstream ss;
for (int i = 0; i < INPUT_SIZE; ++i) input.push_back(DecimalVal(val));
for (int i = 0; i < NUM_BUCKETS; ++i) {
ss << val;
if (i < NUM_BUCKETS - 1) ss << ", ";
}
EXPECT_TRUE(test.Execute(input, StringVal(ss.str().c_str()))) << test.GetErrorMsg();
}
{
vector<DecimalVal> input;
for (int i = 0; i < INPUT_SIZE; ++i) input.push_back(DecimalVal(i));
test.SetResultComparator(CheckHistogramDistribution);
StringVal max_expected_stdev = StringVal("100.0");
EXPECT_TRUE(test.Execute(input, max_expected_stdev)) << test.GetErrorMsg();
}
}
TEST(HistogramTest, TestString) {
UdaTestHarness<StringVal, StringVal, StringVal> test(
AggregateFunctions::ReservoirSampleInit<StringVal>,
AggregateFunctions::ReservoirSampleUpdate<StringVal>,
AggregateFunctions::ReservoirSampleMerge<StringVal>,
AggregateFunctions::ReservoirSampleSerialize<StringVal>,
AggregateFunctions::HistogramFinalize<StringVal>);
const int NUM_BUCKETS = 100;
const int INPUT_SIZE = NUM_BUCKETS * 1000;
// All input values are x, result should be constant.
vector<StringVal> input;
for (int i = 0; i < INPUT_SIZE; ++i) input.push_back(StringVal("x"));
char expected[] = "x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, "
"x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, "
"x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, "
"x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x";
EXPECT_TRUE(test.Execute(input, StringVal(&expected[0]))) << test.GetErrorMsg();
}
IMPALA_TEST_MAIN();