| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| /* |
| * Licensed to Derrick R. Burns under one or more |
| * contributor license agreements. See the NOTICES file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
// T-Digest: Percentile and Quantile Estimation of Big Data
// A new data structure for accurate on-line accumulation of rank-based statistics
// such as quantiles and trimmed means.
// See the original paper "Computing extremely accurate quantiles using t-digest"
// by Ted Dunning and Otmar Ertl for more details:
// https://github.com/tdunning/t-digest/blob/07b8f2ca2be8d0a9f04df2feadad5ddc1bb73c88/docs/t-digest-paper/histo.pdf
// Reference implementation: https://github.com/derrickburns/tdigest
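//
// Illustrative usage sketch (for orientation only; the variable names below are
// hypothetical, not part of this header):
//
//   doris::TDigest digest(100);            // compression factor
//   for (float v : samples) {              // `samples`: any range of floats
//       digest.add(v);
//   }
//   float median = digest.quantile(0.5);   // approximate median
//   float p99    = digest.quantile(0.99);  // approximate 99th percentile
//   float frac   = digest.cdf(42.0f);      // approximate fraction of weight <= 42
//
//   // digests can be merged and round-tripped through a byte buffer:
//   digest.merge(&other_digest);           // `other_digest`: another TDigest
//   std::vector<uint8_t> buf(digest.serialized_size());
//   digest.serialize(buf.data());
//   doris::TDigest copy;
//   copy.unserialize(buf.data());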
| |
| #pragma once |
| |
| #include <pdqsort.h> |
| |
| #include <algorithm> |
| #include <cfloat> |
| #include <cmath> |
| #include <iostream> |
| #include <memory> |
| #include <queue> |
| #include <utility> |
| #include <vector> |
| |
| #include "common/factory_creator.h" |
| #include "common/logging.h" |
| #include "udf/udf.h" |
| #include "util/debug_util.h" |
| |
| namespace doris { |
| |
| using Value = float; |
| using Weight = float; |
| using Index = size_t; |
| |
| const size_t kHighWater = 40000; |
| |
| class Centroid { |
| public: |
| Centroid() : Centroid(0.0, 0.0) {} |
| |
| Centroid(Value mean, Weight weight) : _mean(mean), _weight(weight) {} |
| |
| Value mean() const noexcept { return _mean; } |
| |
| Weight weight() const noexcept { return _weight; } |
| |
| Value& mean() noexcept { return _mean; } |
| |
| Weight& weight() noexcept { return _weight; } |
| |
| void add(const Centroid& c) { |
| DCHECK_GT(c._weight, 0); |
| if (_weight != 0.0) { |
| _weight += c._weight; |
| _mean += c._weight * (c._mean - _mean) / _weight; |
| } else { |
| _weight = c._weight; |
| _mean = c._mean; |
| } |
| } |
| |
| private: |
| Value _mean = 0; |
| Weight _weight = 0; |
| }; |
| |
| struct CentroidList { |
| CentroidList(const std::vector<Centroid>& s) : iter(s.cbegin()), end(s.cend()) {} |
| std::vector<Centroid>::const_iterator iter; |
| std::vector<Centroid>::const_iterator end; |
| |
| bool advance() { return ++iter != end; } |
| }; |
| |
| class CentroidListComparator { |
| public: |
| CentroidListComparator() {} |
| |
| bool operator()(const CentroidList& left, const CentroidList& right) const { |
| return left.iter->mean() > right.iter->mean(); |
| } |
| }; |
| |
| using CentroidListQueue = |
| std::priority_queue<CentroidList, std::vector<CentroidList>, CentroidListComparator>; |
| |
| struct CentroidComparator { |
| bool operator()(const Centroid& a, const Centroid& b) const { return a.mean() < b.mean(); } |
| }; |
| |
| class TDigest { |
| ENABLE_FACTORY_CREATOR(TDigest); |
| |
| class TDigestComparator { |
| public: |
| TDigestComparator() {} |
| |
| bool operator()(const TDigest* left, const TDigest* right) const { |
| return left->totalSize() > right->totalSize(); |
| } |
| }; |
| using TDigestQueue = |
| std::priority_queue<const TDigest*, std::vector<const TDigest*>, TDigestComparator>; |
| |
| public: |
| TDigest() : TDigest(10000) {} |
| |
| explicit TDigest(Value compression) : TDigest(compression, 0) {} |
| |
| TDigest(Value compression, Index bufferSize) : TDigest(compression, bufferSize, 0) {} |
| |
| TDigest(Value compression, Index unmergedSize, Index mergedSize) |
| : _compression(compression), |
| _max_processed(processedSize(mergedSize, compression)), |
| _max_unprocessed(unprocessedSize(unmergedSize, compression)) { |
| _processed.reserve(_max_processed); |
| _unprocessed.reserve(_max_unprocessed + 1); |
| } |
| |
| TDigest(std::vector<Centroid>&& processed, std::vector<Centroid>&& unprocessed, |
| Value compression, Index unmergedSize, Index mergedSize) |
| : TDigest(compression, unmergedSize, mergedSize) { |
| _processed = std::move(processed); |
| _unprocessed = std::move(unprocessed); |
| |
| _processed_weight = weight(_processed); |
| _unprocessed_weight = weight(_unprocessed); |
| if (_processed.size() > 0) { |
| _min = std::min(_min, _processed[0].mean()); |
| _max = std::max(_max, (_processed.cend() - 1)->mean()); |
| } |
| updateCumulative(); |
| } |
| |
    static Weight weight(const std::vector<Centroid>& centroids) noexcept {
        Weight w = 0.0;
        for (const auto& centroid : centroids) {
            w += centroid.weight();
        }
        return w;
    }
| |
| TDigest& operator=(TDigest&& o) { |
| _compression = o._compression; |
| _max_processed = o._max_processed; |
| _max_unprocessed = o._max_unprocessed; |
| _processed_weight = o._processed_weight; |
| _unprocessed_weight = o._unprocessed_weight; |
| _processed = std::move(o._processed); |
| _unprocessed = std::move(o._unprocessed); |
| _cumulative = std::move(o._cumulative); |
| _min = o._min; |
| _max = o._max; |
| return *this; |
| } |
| |
| TDigest(TDigest&& o) |
| : TDigest(std::move(o._processed), std::move(o._unprocessed), o._compression, |
| o._max_unprocessed, o._max_processed) {} |
| |
| static inline Index processedSize(Index size, Value compression) noexcept { |
| return (size == 0) ? static_cast<Index>(2 * std::ceil(compression)) : size; |
| } |
| |
| static inline Index unprocessedSize(Index size, Value compression) noexcept { |
| return (size == 0) ? static_cast<Index>(8 * std::ceil(compression)) : size; |
| } |
| |
| // merge in another t-digest |
| void merge(const TDigest* other) { |
| std::vector<const TDigest*> others {other}; |
| add(others.cbegin(), others.cend()); |
| } |
| |
| const std::vector<Centroid>& processed() const { return _processed; } |
| |
| const std::vector<Centroid>& unprocessed() const { return _unprocessed; } |
| |
| Index maxUnprocessed() const { return _max_unprocessed; } |
| |
| Index maxProcessed() const { return _max_processed; } |
| |
| void add(std::vector<const TDigest*> digests) { add(digests.cbegin(), digests.cend()); } |
| |
| // merge in a vector of tdigests in the most efficient manner possible |
| // in constant space |
| // works for any value of kHighWater |
| void add(std::vector<const TDigest*>::const_iterator iter, |
| std::vector<const TDigest*>::const_iterator end) { |
| if (iter != end) { |
| auto size = std::distance(iter, end); |
| TDigestQueue pq(TDigestComparator {}); |
| for (; iter != end; iter++) { |
| pq.push((*iter)); |
| } |
| std::vector<const TDigest*> batch; |
| batch.reserve(size); |
| |
| size_t totalSize = 0; |
| while (!pq.empty()) { |
| auto td = pq.top(); |
| batch.push_back(td); |
| pq.pop(); |
| totalSize += td->totalSize(); |
| if (totalSize >= kHighWater || pq.empty()) { |
| mergeProcessed(batch); |
| mergeUnprocessed(batch); |
| processIfNecessary(); |
| batch.clear(); |
| totalSize = 0; |
| } |
| } |
| updateCumulative(); |
| } |
| } |
| |
| Weight processedWeight() const { return _processed_weight; } |
| |
| Weight unprocessedWeight() const { return _unprocessed_weight; } |
| |
| bool haveUnprocessed() const { return _unprocessed.size() > 0; } |
| |
| size_t totalSize() const { return _processed.size() + _unprocessed.size(); } |
| |
| long totalWeight() const { return static_cast<long>(_processed_weight + _unprocessed_weight); } |
| |
| // return the cdf on the t-digest |
| Value cdf(Value x) { |
| if (haveUnprocessed() || isDirty()) process(); |
| return cdfProcessed(x); |
| } |
| |
| bool isDirty() { |
| return _processed.size() > _max_processed || _unprocessed.size() > _max_unprocessed; |
| } |
| |
| // return the cdf on the processed values |
| Value cdfProcessed(Value x) const { |
| VLOG_CRITICAL << "cdf value " << x; |
| VLOG_CRITICAL << "processed size " << _processed.size(); |
| if (_processed.size() == 0) { |
| // no data to examine |
| VLOG_CRITICAL << "no processed values"; |
| |
| return 0.0; |
| } else if (_processed.size() == 1) { |
| VLOG_CRITICAL << "one processed value " |
| << " _min " << _min << " _max " << _max; |
| // exactly one centroid, should have _max==_min |
| auto width = _max - _min; |
| if (x < _min) { |
| return 0.0; |
| } else if (x > _max) { |
| return 1.0; |
| } else if (x - _min <= width) { |
| // _min and _max are too close together to do any viable interpolation |
| return 0.5; |
| } else { |
| // interpolate if somehow we have weight > 0 and _max != _min |
| return (x - _min) / (_max - _min); |
| } |
| } else { |
| auto n = _processed.size(); |
| if (x <= _min) { |
| VLOG_CRITICAL << "below _min " |
| << " _min " << _min << " x " << x; |
| return 0; |
| } |
| |
| if (x >= _max) { |
| VLOG_CRITICAL << "above _max " |
| << " _max " << _max << " x " << x; |
| return 1; |
| } |
| |
| // check for the left tail |
| if (x <= mean(0)) { |
| VLOG_CRITICAL << "left tail " |
| << " _min " << _min << " mean(0) " << mean(0) << " x " << x; |
| |
                // note: checking mean(0) - _min > 0 (rather than mean(0) > _min)
                // guarantees that the interpolation below never divides by zero
| if (mean(0) - _min > 0) { |
| return (x - _min) / (mean(0) - _min) * weight(0) / _processed_weight / 2.0; |
| } else { |
| return 0; |
| } |
| } |
| |
| // and the right tail |
| if (x >= mean(n - 1)) { |
| VLOG_CRITICAL << "right tail" |
| << " _max " << _max << " mean(n - 1) " << mean(n - 1) << " x " << x; |
| |
| if (_max - mean(n - 1) > 0) { |
| return 1.0 - (_max - x) / (_max - mean(n - 1)) * weight(n - 1) / |
| _processed_weight / 2.0; |
| } else { |
| return 1; |
| } |
| } |
| |
| CentroidComparator cc; |
| auto iter = |
| std::upper_bound(_processed.cbegin(), _processed.cend(), Centroid(x, 0), cc); |
| |
| auto i = std::distance(_processed.cbegin(), iter); |
| auto z1 = x - (iter - 1)->mean(); |
| auto z2 = (iter)->mean() - x; |
| DCHECK_LE(0.0, z1); |
| DCHECK_LE(0.0, z2); |
| VLOG_CRITICAL << "middle " |
| << " z1 " << z1 << " z2 " << z2 << " x " << x; |
| |
| return weightedAverage(_cumulative[i - 1], z2, _cumulative[i], z1) / _processed_weight; |
| } |
| } |
| |
| // this returns a quantile on the t-digest |
| Value quantile(Value q) { |
| if (haveUnprocessed() || isDirty()) process(); |
| return quantileProcessed(q); |
| } |
| |
| // this returns a quantile on the currently processed values without changing the t-digest |
| // the value will not represent the unprocessed values |
| Value quantileProcessed(Value q) const { |
| if (q < 0 || q > 1) { |
| VLOG_CRITICAL << "q should be in [0,1], got " << q; |
| return NAN; |
| } |
| |
| if (_processed.size() == 0) { |
            // no centroids means no data, no way to get a quantile
| return NAN; |
| } else if (_processed.size() == 1) { |
| // with one data point, all quantiles lead to Rome |
| |
| return mean(0); |
| } |
| |
        // we know that there are at least two centroids now
| auto n = _processed.size(); |
| |
        // if values were stored in a sorted array, index would be the offset we are interested in
| const auto index = q * _processed_weight; |
| |
| // at the boundaries, we return _min or _max |
| if (index <= weight(0) / 2.0) { |
| DCHECK_GT(weight(0), 0); |
| return _min + 2.0 * index / weight(0) * (mean(0) - _min); |
| } |
| |
| auto iter = std::lower_bound(_cumulative.cbegin(), _cumulative.cend(), index); |
| |
| if (iter + 1 != _cumulative.cend()) { |
| auto i = std::distance(_cumulative.cbegin(), iter); |
| auto z1 = index - *(iter - 1); |
            auto z2 = *iter - index;
| // VLOG_CRITICAL << "z2 " << z2 << " index " << index << " z1 " << z1; |
| return weightedAverage(mean(i - 1), z2, mean(i), z1); |
| } |
| |
| DCHECK_LE(index, _processed_weight); |
| DCHECK_GE(index, _processed_weight - weight(n - 1) / 2.0); |
| |
| auto z1 = index - _processed_weight - weight(n - 1) / 2.0; |
| auto z2 = weight(n - 1) / 2 - z1; |
| return weightedAverage(mean(n - 1), z1, _max, z2); |
| } |
| |
| Value compression() const { return _compression; } |
| |
| void add(Value x) { add(x, 1); } |
| |
| void compress() { process(); } |
| |
    // add a single centroid to the unprocessed vector, processing previously unprocessed centroids
    // if our limit has been reached.
| bool add(Value x, Weight w) { |
| if (std::isnan(x)) { |
| return false; |
| } |
| _unprocessed.push_back(Centroid(x, w)); |
| _unprocessed_weight += w; |
| processIfNecessary(); |
| return true; |
| } |
| |
| void add(std::vector<Centroid>::const_iterator iter, |
| std::vector<Centroid>::const_iterator end) { |
| while (iter != end) { |
| const size_t diff = std::distance(iter, end); |
| const size_t room = _max_unprocessed - _unprocessed.size(); |
| auto mid = iter + std::min(diff, room); |
| while (iter != mid) _unprocessed.push_back(*(iter++)); |
| if (_unprocessed.size() >= _max_unprocessed) { |
| process(); |
| } |
| } |
| } |
| |
| uint32_t serialized_size() { |
| return sizeof(uint32_t) + sizeof(Value) * 5 + sizeof(Index) * 2 + sizeof(uint32_t) * 3 + |
| _processed.size() * sizeof(Centroid) + _unprocessed.size() * sizeof(Centroid) + |
| _cumulative.size() * sizeof(Weight); |
| } |
| |
| size_t serialize(uint8_t* writer) { |
| uint8_t* dst = writer; |
| uint32_t total_size = serialized_size(); |
| memcpy(writer, &total_size, sizeof(uint32_t)); |
| writer += sizeof(uint32_t); |
| memcpy(writer, &_compression, sizeof(Value)); |
| writer += sizeof(Value); |
| memcpy(writer, &_min, sizeof(Value)); |
| writer += sizeof(Value); |
| memcpy(writer, &_max, sizeof(Value)); |
| writer += sizeof(Value); |
| memcpy(writer, &_max_processed, sizeof(Index)); |
| writer += sizeof(Index); |
| memcpy(writer, &_max_unprocessed, sizeof(Index)); |
| writer += sizeof(Index); |
| memcpy(writer, &_processed_weight, sizeof(Value)); |
| writer += sizeof(Value); |
| memcpy(writer, &_unprocessed_weight, sizeof(Value)); |
| writer += sizeof(Value); |
| |
| uint32_t size = _processed.size(); |
| memcpy(writer, &size, sizeof(uint32_t)); |
| writer += sizeof(uint32_t); |
        for (uint32_t i = 0; i < size; i++) {
| memcpy(writer, &_processed[i], sizeof(Centroid)); |
| writer += sizeof(Centroid); |
| } |
| |
| size = _unprocessed.size(); |
| memcpy(writer, &size, sizeof(uint32_t)); |
| writer += sizeof(uint32_t); |
        // TODO(weixiang): a single memcpy over the whole vector may be enough here.
        for (uint32_t i = 0; i < size; i++) {
| memcpy(writer, &_unprocessed[i], sizeof(Centroid)); |
| writer += sizeof(Centroid); |
| } |
| |
| size = _cumulative.size(); |
| memcpy(writer, &size, sizeof(uint32_t)); |
| writer += sizeof(uint32_t); |
        for (uint32_t i = 0; i < size; i++) {
| memcpy(writer, &_cumulative[i], sizeof(Weight)); |
| writer += sizeof(Weight); |
| } |
| return writer - dst; |
| } |
| |
| void unserialize(const uint8_t* type_reader) { |
| uint32_t total_length = 0; |
| memcpy(&total_length, type_reader, sizeof(uint32_t)); |
| type_reader += sizeof(uint32_t); |
| memcpy(&_compression, type_reader, sizeof(Value)); |
| type_reader += sizeof(Value); |
| memcpy(&_min, type_reader, sizeof(Value)); |
| type_reader += sizeof(Value); |
| memcpy(&_max, type_reader, sizeof(Value)); |
| type_reader += sizeof(Value); |
| |
| memcpy(&_max_processed, type_reader, sizeof(Index)); |
| type_reader += sizeof(Index); |
| memcpy(&_max_unprocessed, type_reader, sizeof(Index)); |
| type_reader += sizeof(Index); |
| memcpy(&_processed_weight, type_reader, sizeof(Value)); |
| type_reader += sizeof(Value); |
| memcpy(&_unprocessed_weight, type_reader, sizeof(Value)); |
| type_reader += sizeof(Value); |
| |
| uint32_t size; |
| memcpy(&size, type_reader, sizeof(uint32_t)); |
| type_reader += sizeof(uint32_t); |
| _processed.resize(size); |
        for (uint32_t i = 0; i < size; i++) {
| memcpy(&_processed[i], type_reader, sizeof(Centroid)); |
| type_reader += sizeof(Centroid); |
| } |
| memcpy(&size, type_reader, sizeof(uint32_t)); |
| type_reader += sizeof(uint32_t); |
| _unprocessed.resize(size); |
        for (uint32_t i = 0; i < size; i++) {
| memcpy(&_unprocessed[i], type_reader, sizeof(Centroid)); |
| type_reader += sizeof(Centroid); |
| } |
| memcpy(&size, type_reader, sizeof(uint32_t)); |
| type_reader += sizeof(uint32_t); |
| _cumulative.resize(size); |
        for (uint32_t i = 0; i < size; i++) {
| memcpy(&_cumulative[i], type_reader, sizeof(Weight)); |
| type_reader += sizeof(Weight); |
| } |
| } |
| |
| private: |
| Value _compression; |
| |
| Value _min = std::numeric_limits<Value>::max(); |
| |
    // use lowest() (not min(), which is the smallest positive float) so that
    // negative values still update _max correctly
    Value _max = std::numeric_limits<Value>::lowest();
| |
| Index _max_processed; |
| |
| Index _max_unprocessed; |
| |
| Value _processed_weight = 0.0; |
| |
| Value _unprocessed_weight = 0.0; |
| |
| std::vector<Centroid> _processed; |
| |
| std::vector<Centroid> _unprocessed; |
| |
| std::vector<Weight> _cumulative; |
| |
| // return mean of i-th centroid |
| Value mean(int i) const noexcept { return _processed[i].mean(); } |
| |
| // return weight of i-th centroid |
| Weight weight(int i) const noexcept { return _processed[i].weight(); } |
| |
| // append all unprocessed centroids into current unprocessed vector |
| void mergeUnprocessed(const std::vector<const TDigest*>& tdigests) { |
| if (tdigests.size() == 0) return; |
| |
| size_t total = _unprocessed.size(); |
| for (auto& td : tdigests) { |
| total += td->_unprocessed.size(); |
| } |
| |
| _unprocessed.reserve(total); |
| for (auto& td : tdigests) { |
| _unprocessed.insert(_unprocessed.end(), td->_unprocessed.cbegin(), |
| td->_unprocessed.cend()); |
| _unprocessed_weight += td->_unprocessed_weight; |
| } |
| } |
| |
| // merge all processed centroids together into a single sorted vector |
| void mergeProcessed(const std::vector<const TDigest*>& tdigests) { |
| if (tdigests.size() == 0) return; |
| |
| size_t total = 0; |
| CentroidListQueue pq(CentroidListComparator {}); |
| for (auto& td : tdigests) { |
| auto& sorted = td->_processed; |
| auto size = sorted.size(); |
| if (size > 0) { |
| pq.push(CentroidList(sorted)); |
| total += size; |
| _processed_weight += td->_processed_weight; |
| } |
| } |
| if (total == 0) return; |
| |
| if (_processed.size() > 0) { |
| pq.push(CentroidList(_processed)); |
| total += _processed.size(); |
| } |
| |
| std::vector<Centroid> sorted; |
| VLOG_CRITICAL << "total " << total; |
| sorted.reserve(total); |
| |
| while (!pq.empty()) { |
| auto best = pq.top(); |
| pq.pop(); |
| sorted.push_back(*(best.iter)); |
| if (best.advance()) pq.push(best); |
| } |
| _processed = std::move(sorted); |
| if (_processed.size() > 0) { |
| _min = std::min(_min, _processed[0].mean()); |
| _max = std::max(_max, (_processed.cend() - 1)->mean()); |
| } |
| } |
| |
| void processIfNecessary() { |
| if (isDirty()) { |
| process(); |
| } |
| } |
| |
| void updateCumulative() { |
| const auto n = _processed.size(); |
| _cumulative.clear(); |
| _cumulative.reserve(n + 1); |
| auto previous = 0.0; |
| for (Index i = 0; i < n; i++) { |
| auto current = weight(i); |
| auto halfCurrent = current / 2.0; |
| _cumulative.push_back(previous + halfCurrent); |
| previous = previous + current; |
| } |
| _cumulative.push_back(previous); |
| } |
| |
| // merges _unprocessed centroids and _processed centroids together and processes them |
| // when complete, _unprocessed will be empty and _processed will have at most _max_processed centroids |
| void process() { |
| CentroidComparator cc; |
        // Benchmarked with: select percentile_approx(lo_orderkey, 0.5) from lineorder;
        // pdqsort beats RadixSort here when sorting struct Centroid, while RadixSort
        // is faster for plain types such as int/float in a std::vector<T>.
| pdqsort(_unprocessed.begin(), _unprocessed.end(), cc); |
| auto count = _unprocessed.size(); |
| _unprocessed.insert(_unprocessed.end(), _processed.cbegin(), _processed.cend()); |
| std::inplace_merge(_unprocessed.begin(), _unprocessed.begin() + count, _unprocessed.end(), |
| cc); |
| |
| _processed_weight += _unprocessed_weight; |
| _unprocessed_weight = 0; |
| _processed.clear(); |
| |
| _processed.push_back(_unprocessed[0]); |
| Weight wSoFar = _unprocessed[0].weight(); |
| Weight wLimit = _processed_weight * integratedQ(1.0); |
| |
| auto end = _unprocessed.end(); |
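        // each incoming centroid is folded into the current output centroid while
        // the running weight wSoFar stays within wLimit (the weight allowed up to
        // the next integer step on the k scale); otherwise a new output centroid is
        // started and wLimit advances by one k unit via integratedQ(k1 + 1).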
| for (auto iter = _unprocessed.cbegin() + 1; iter < end; iter++) { |
| auto& centroid = *iter; |
| Weight projectedW = wSoFar + centroid.weight(); |
| if (projectedW <= wLimit) { |
| wSoFar = projectedW; |
| (_processed.end() - 1)->add(centroid); |
| } else { |
| auto k1 = integratedLocation(wSoFar / _processed_weight); |
| wLimit = _processed_weight * integratedQ(k1 + 1.0); |
| wSoFar += centroid.weight(); |
| _processed.emplace_back(centroid); |
| } |
| } |
| _unprocessed.clear(); |
| _min = std::min(_min, _processed[0].mean()); |
| VLOG_CRITICAL << "new _min " << _min; |
| _max = std::max(_max, (_processed.cend() - 1)->mean()); |
| VLOG_CRITICAL << "new _max " << _max; |
| updateCumulative(); |
| } |
| |
    size_t checkWeights() { return checkWeights(_processed, _processed_weight); }
| |
| size_t checkWeights(const std::vector<Centroid>& sorted, Value total) { |
| size_t badWeight = 0; |
| auto k1 = 0.0; |
| auto q = 0.0; |
| for (auto iter = sorted.cbegin(); iter != sorted.cend(); iter++) { |
| auto w = iter->weight(); |
| auto dq = w / total; |
| auto k2 = integratedLocation(q + dq); |
| if (k2 - k1 > 1 && w != 1) { |
| VLOG_CRITICAL << "Oversize centroid at " << std::distance(sorted.cbegin(), iter) |
| << " k1 " << k1 << " k2 " << k2 << " dk " << (k2 - k1) << " w " << w |
| << " q " << q; |
| badWeight++; |
| } |
| if (k2 - k1 > 1.5 && w != 1) { |
| VLOG_CRITICAL << "Egregiously Oversize centroid at " |
| << std::distance(sorted.cbegin(), iter) << " k1 " << k1 << " k2 " |
| << k2 << " dk " << (k2 - k1) << " w " << w << " q " << q; |
| badWeight++; |
| } |
| q += dq; |
| k1 = k2; |
| } |
| |
| return badWeight; |
| } |
| |
| /** |
     * Converts a quantile into a centroid scale value. The centroid scale is nominally
     * the number k of the centroid that a quantile point q should belong to. Due to
     * round-offs, however, we can't align things perfectly without splitting points
     * and centroids. We don't want to do that, so we have to allow for offsets.
     * In the end, the criterion is that any quantile range that spans a centroid
     * scale range of more than one should be split across more than one centroid if
     * possible. This won't be possible if the quantile range refers to a single point
     * or an already existing centroid.
     * <p/>
     * This mapping is steep near q=0 or q=1 so each centroid there will correspond to
     * less q range. Near q=0.5, the mapping is flatter so that centroids there will
     * represent a larger chunk of quantiles.
| * |
| * @param q The quantile scale value to be mapped. |
| * @return The centroid scale value corresponding to q. |
| */ |
| Value integratedLocation(Value q) const { |
| return _compression * (std::asin(2.0 * q - 1.0) + M_PI / 2) / M_PI; |
| } |
| |
| Value integratedQ(Value k) const { |
| return (std::sin(std::min(k, _compression) * M_PI / _compression - M_PI / 2) + 1) / 2; |
| } |
| |
| /** |
     * Same as {@link #weightedAverageSorted(Value, Value, Value, Value)} but flips
     * the order of the variables if <code>x2</code> is less than
     * <code>x1</code>.
| */ |
| static Value weightedAverage(Value x1, Value w1, Value x2, Value w2) { |
| return (x1 <= x2) ? weightedAverageSorted(x1, w1, x2, w2) |
| : weightedAverageSorted(x2, w2, x1, w1); |
| } |
| |
| /** |
| * Compute the weighted average between <code>x1</code> with a weight of |
| * <code>w1</code> and <code>x2</code> with a weight of <code>w2</code>. |
| * This expects <code>x1</code> to be less than or equal to <code>x2</code> |
| * and is guaranteed to return a number between <code>x1</code> and |
| * <code>x2</code>. |
| */ |
| static Value weightedAverageSorted(Value x1, Value w1, Value x2, Value w2) { |
| DCHECK_LE(x1, x2); |
| const Value x = (x1 * w1 + x2 * w2) / (w1 + w2); |
| return std::max(x1, std::min(x, x2)); |
| } |
| |
| static Value interpolate(Value x, Value x0, Value x1) { return (x - x0) / (x1 - x0); } |
| |
| /** |
     * Computes an interpolated value of a quantile that lies between two centroids.
| * |
| * Index is the quantile desired multiplied by the total number of samples - 1. |
| * |
| * @param index Denormalized quantile desired |
| * @param previousIndex The denormalized quantile corresponding to the center of the previous centroid. |
| * @param nextIndex The denormalized quantile corresponding to the center of the following centroid. |
| * @param previousMean The mean of the previous centroid. |
| * @param nextMean The mean of the following centroid. |
| * @return The interpolated mean. |
| */ |
| static Value quantile(Value index, Value previousIndex, Value nextIndex, Value previousMean, |
| Value nextMean) { |
| const auto delta = nextIndex - previousIndex; |
| const auto previousWeight = (nextIndex - index) / delta; |
| const auto nextWeight = (index - previousIndex) / delta; |
| return previousMean * previousWeight + nextMean * nextWeight; |
| } |
| }; |
| |
| } // namespace doris |