blob: 17eaf0d29a12c0725d7daaefa4cd573490b14f63 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#ifndef KOLMOGOROV_SMIRNOV_IMPL_HPP_
#define KOLMOGOROV_SMIRNOV_IMPL_HPP_
namespace datasketches {
// type resolver
template<typename T, typename C, typename S, typename A>
kll_quantile_calculator<T, C, A> make_quantile_calculator(const kll_sketch<T, C, S, A>& sketch) {
return kll_quantile_calculator<T, C, A>(sketch);
}
template<typename Sketch>
double kolmogorov_smirnov::delta(const Sketch& sketch1, const Sketch& sketch2) {
using Comparator = typename Sketch::comparator;
auto calc1 = make_quantile_calculator(sketch1);
auto calc2 = make_quantile_calculator(sketch2);
auto it1 = calc1.begin();
auto it2 = calc2.begin();
const auto n1 = sketch1.get_n();
const auto n2 = sketch2.get_n();
double delta = 0;
while (it1 != calc1.end() && it2 != calc2.end()) {
const double norm_cum_wt1 = static_cast<double>((*it1).second) / n1;
const double norm_cum_wt2 = static_cast<double>((*it2).second) / n2;
delta = std::max(delta, std::abs(norm_cum_wt1 - norm_cum_wt2));
if (Comparator()((*it1).first, (*it2).first)) {
++it1;
} else if (Comparator()((*it2).first, (*it1).first)) {
++it2;
} else {
++it1;
++it2;
}
}
const double norm_cum_wt1 = it1 == calc1.end() ? 1 : static_cast<double>((*it1).second) / n1;
const double norm_cum_wt2 = it2 == calc2.end() ? 1 : static_cast<double>((*it2).second) / n2;
delta = std::max(delta, std::abs(norm_cum_wt1 - norm_cum_wt2));
return delta;
}
template<typename Sketch>
double kolmogorov_smirnov::threshold(const Sketch& sketch1, const Sketch& sketch2, double p) {
const double r1 = sketch1.get_num_retained();
const double r2 = sketch2.get_num_retained();
const double alpha_factor = sqrt(-0.5 * log(0.5 * p));
const double delta_area_threshold = alpha_factor * sqrt((r1 + r2) / (r1 * r2));
const double eps1 = sketch1.get_normalized_rank_error(false);
const double eps2 = sketch2.get_normalized_rank_error(false);
return delta_area_threshold + eps1 + eps2;
}
template<typename Sketch>
bool kolmogorov_smirnov::test(const Sketch& sketch1, const Sketch& sketch2, double p) {
return delta(sketch1, sketch2) > threshold(sketch1, sketch2, p);
}
} /* namespace datasketches */
#endif