be/test/exprs/zipf_distribution.h - doris - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.

 #pragma once

 #include <algorithm>
 #include <cmath>
 #include <random>

 /** Refer to https://stackoverflow.com/questions/9983239/how-to-generate-zipf-distributed-numbers-efficiently
  *  Zipf-like random distribution.
  *
  * "Rejection-inversion to generate variates from monotone discrete
  * distributions", Wolfgang Hörmann and Gerhard Derflinger
  * ACM TOMACS 6.3 (1996): 169-184
  */
 template<class IntType = unsigned long, class RealType = double>
 class zipf_distribution
 {
 public:
     typedef RealType input_type;
     typedef IntType result_type;

     static_assert(std::numeric_limits<IntType>::is_integer, "");
     static_assert(!std::numeric_limits<RealType>::is_integer, "");

     zipf_distribution(const IntType n=std::numeric_limits<IntType>::max(),
                       const RealType q=1.0)
             : n(n)
             , q(q)
             , H_x1(H(1.5) - 1.0)
             , H_n(H(n + 0.5))
             , dist(H_x1, H_n)
     {}

     IntType operator()(std::mt19937& rng)
     {
         while (true) {
             const RealType u = dist(rng);
             const RealType x = H_inv(u);
             const IntType  k = clamp<IntType>(std::round(x), 1, n);
             if (u >= H(k + 0.5) - h(k)) {
                 return k;
             }
         }
     }

 private:
     /** Clamp x to [min, max]. */
     template<typename T>
     static constexpr T clamp(const T x, const T min, const T max)
     {
         return std::max(min, std::min(max, x));
     }

     /** exp(x) - 1 / x */
     static double
     expxm1bx(const double x)
     {
         return (std::abs(x) > epsilon)
                ? std::expm1(x) / x
                : (1.0 + x/2.0 * (1.0 + x/3.0 * (1.0 + x/4.0)));
     }

     /** H(x) = log(x) if q == 1, (x^(1-q) - 1)/(1 - q) otherwise.
      * H(x) is an integral of h(x).
      *
      * Note the numerator is one less than in the paper order to work with all
      * positive q.
      */
     const RealType H(const RealType x)
     {
         const RealType log_x = std::log(x);
         return expxm1bx((1.0 - q) * log_x) * log_x;
     }

     /** log(1 + x) / x */
     static RealType
     log1pxbx(const RealType x)
     {
         return (std::abs(x) > epsilon)
                ? std::log1p(x) / x
                : 1.0 - x * ((1/2.0) - x * ((1/3.0) - x * (1/4.0)));
     }

     /** The inverse function of H(x) */
     const RealType H_inv(const RealType x)
     {
         const RealType t = std::max(-1.0, x * (1.0 - q));
         return std::exp(log1pxbx(t) * x);
     }

     /** That hat function h(x) = 1 / (x ^ q) */
     const RealType h(const RealType x)
     {
         return std::exp(-q * std::log(x));
     }

     static constexpr RealType epsilon = 1e-8;

     IntType                                  n;     ///< Number of elements
     RealType                                 q;     ///< Exponent
     RealType                                 H_x1;  ///< H(x_1)
     RealType                                 H_n;   ///< H(n)
     std::uniform_real_distribution<RealType> dist;  ///< [H(x_1), H(n)]
 };
	// Licensed to the Apache Software Foundation (ASF) under one
	// or more contributor license agreements. See the NOTICE file
	// distributed with this work for additional information
	// regarding copyright ownership. The ASF licenses this file
	// to you under the Apache License, Version 2.0 (the
	// "License"); you may not use this file except in compliance
	// with the License. You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing,
	// software distributed under the License is distributed on an
	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	// KIND, either express or implied. See the License for the
	// specific language governing permissions and limitations
	// under the License.

	#pragma once

	#include <algorithm>
	#include <cmath>
	#include <random>

	/** Refer to https://stackoverflow.com/questions/9983239/how-to-generate-zipf-distributed-numbers-efficiently
	* Zipf-like random distribution.
	*
	* "Rejection-inversion to generate variates from monotone discrete
	* distributions", Wolfgang Hörmann and Gerhard Derflinger
	* ACM TOMACS 6.3 (1996): 169-184
	*/
	template<class IntType = unsigned long, class RealType = double>
	class zipf_distribution
	{
	public:
	typedef RealType input_type;
	typedef IntType result_type;

	static_assert(std::numeric_limits<IntType>::is_integer, "");
	static_assert(!std::numeric_limits<RealType>::is_integer, "");

	zipf_distribution(const IntType n=std::numeric_limits<IntType>::max(),
	const RealType q=1.0)
	: n(n)
	, q(q)
	, H_x1(H(1.5) - 1.0)
	, H_n(H(n + 0.5))
	, dist(H_x1, H_n)
	{}

	IntType operator()(std::mt19937& rng)
	{
	while (true) {
	const RealType u = dist(rng);
	const RealType x = H_inv(u);
	const IntType k = clamp<IntType>(std::round(x), 1, n);
	if (u >= H(k + 0.5) - h(k)) {
	return k;
	}
	}
	}

	private:
	/** Clamp x to [min, max]. */
	template<typename T>
	static constexpr T clamp(const T x, const T min, const T max)
	{
	return std::max(min, std::min(max, x));
	}

	/** exp(x) - 1 / x */
	static double
	expxm1bx(const double x)
	{
	return (std::abs(x) > epsilon)
	? std::expm1(x) / x
	: (1.0 + x/2.0 * (1.0 + x/3.0 * (1.0 + x/4.0)));
	}

	/** H(x) = log(x) if q == 1, (x^(1-q) - 1)/(1 - q) otherwise.
	* H(x) is an integral of h(x).
	*
	* Note the numerator is one less than in the paper order to work with all
	* positive q.
	*/
	const RealType H(const RealType x)
	{
	const RealType log_x = std::log(x);
	return expxm1bx((1.0 - q) * log_x) * log_x;
	}

	/** log(1 + x) / x */
	static RealType
	log1pxbx(const RealType x)
	{
	return (std::abs(x) > epsilon)
	? std::log1p(x) / x
	: 1.0 - x * ((1/2.0) - x * ((1/3.0) - x * (1/4.0)));
	}

	/** The inverse function of H(x) */
	const RealType H_inv(const RealType x)
	{
	const RealType t = std::max(-1.0, x * (1.0 - q));
	return std::exp(log1pxbx(t) * x);
	}

	/** That hat function h(x) = 1 / (x ^ q) */
	const RealType h(const RealType x)
	{
	return std::exp(-q * std::log(x));
	}

	static constexpr RealType epsilon = 1e-8;

	IntType n; ///< Number of elements
	RealType q; ///< Exponent
	RealType H_x1; ///< H(x_1)
	RealType H_n; ///< H(n)
	std::uniform_real_distribution<RealType> dist; ///< [H(x_1), H(n)]
	};