// be/src/vec/aggregate_functions/aggregate_function_sem.h - doris - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.

 #pragma once

 #include <math.h>

 #include "runtime/decimalv2_value.h"
 #include "runtime/define_primitive_type.h"
 #include "runtime/primitive_type.h"
 #include "vec/aggregate_functions/aggregate_function.h"
 #include "vec/columns/column.h"
 #include "vec/common/assert_cast.h"
 #include "vec/core/types.h"
 #include "vec/data_types/data_type.h"
 #include "vec/data_types/data_type_decimal.h"

 namespace doris::vectorized {
 #include "common/compile_check_begin.h"

 // Forward declarations of types that are named in the signatures below.
 class Arena;
 class BufferReadable;
 class BufferWritable;

 /**
  * Running state for the standard error of the mean (SEM):
  *
  *     SEM = sqrt(sample_variance / count) = sqrt(m2 / (count * (count - 1)))
  *
  * The mean and m2 are maintained with Welford's one-pass update (see add()),
  * and two partial states are combined with the pairwise formula in merge().
  */
 struct AggregateFunctionSemData {
     double mean {};   // Running mean of the values accumulated so far.
     double m2 {};     // Sum of squared deviations from the running mean.
     UInt64 count = 0; // Number of values accumulated so far.

     // Folds one value into the state (Welford update).
     // With mean_{n-1} the old mean and x_n the new value:
     //     mean_n = mean_{n-1} + (x_n - mean_{n-1}) / n
     //     M2_n   = M2_{n-1} + (x_n - mean_{n-1}) * (x_n - mean_n)
     void add(const double& value) {
         count++;
         double delta = value - mean;
         mean += delta / static_cast<double>(count);
         // delta2 is measured against the *updated* mean; this is what keeps M2 exact.
         double delta2 = value - mean;
         m2 += delta * delta2;
     }

     // Combines another partial state B (rhs) into this state A:
     //     count = count_a + count_b
     //     mean  = (mean_a * count_a + mean_b * count_b) / count
     //     M2    = M2_a + M2_b + delta^2 * count_a * count_b / count
     // where delta = mean_b - mean_a.
     //
     // Empty sides are handled before the formulas are applied: with both states
     // empty the divisions above are 0 / 0, which would store NaN in mean and m2
     // and make every later add()/merge()/result() on this state return NaN.
     void merge(const AggregateFunctionSemData& rhs) {
         if (rhs.count == 0) {
             // Nothing to fold in; leave this state untouched.
             return;
         }
         if (count == 0) {
             // This side is empty: adopt rhs as-is rather than pushing its mean
             // through a multiply/divide round trip.
             mean = rhs.mean;
             m2 = rhs.m2;
             count = rhs.count;
             return;
         }
         UInt64 total_count = count + rhs.count;
         double delta = rhs.mean - mean;
         mean = (mean * static_cast<double>(count) + rhs.mean * static_cast<double>(rhs.count)) /
                static_cast<double>(total_count);
         m2 += rhs.m2 + delta * delta * static_cast<double>(count) * static_cast<double>(rhs.count) /
                                static_cast<double>(total_count);
         count = total_count;
     }

     // Writes mean, m2 and count to buf, in that order.
     void write(BufferWritable& buf) const {
         buf.write_binary(mean);
         buf.write_binary(m2);
         buf.write_binary(count);
     }

     // Reads mean, m2 and count from buf, in the same order write() emits them.
     void read(BufferReadable& buf) {
         buf.read_binary(mean);
         buf.read_binary(m2);
         buf.read_binary(count);
     }

     // Returns the state to "no values accumulated".
     void reset() {
         mean = {};
         m2 = {};
         count = 0;
     }

     // Returns sqrt(m2 / (count * (count - 1))), or 0 when fewer than two values
     // have been accumulated (the denominator would be zero).
     double result() const {
         if (count < 2) {
             return 0;
         }
         const double n = static_cast<double>(count);
         return std::sqrt(m2 / (n * (n - 1.0)));
     }
 };

 /**
  * Aggregate function "sem". Each operation is forwarded to the per-group state
  * object of type Data (add / merge / write / read / reset / result).
  *
  * The first argument column is read as ColumnFloat64 and the result is
  * appended to a ColumnFloat64.
  * NOTE(review): UnaryExpression and NullableAggregateFunction are declared
  * elsewhere; they look like tag bases - confirm against their definitions.
  */
 template <typename Data>
 class AggregateFunctionSem final
         : public IAggregateFunctionDataHelper<Data, AggregateFunctionSem<Data>>,
           UnaryExpression,
           NullableAggregateFunction {
 public:
     AggregateFunctionSem(const DataTypes& argument_types_)
             : IAggregateFunctionDataHelper<Data, AggregateFunctionSem<Data>>(argument_types_) {}

     // Name under which this function is reported.
     String get_name() const override {
         return "sem";
     }

     // The result type is always Float64.
     DataTypePtr get_return_type() const override {
         return std::make_shared<DataTypeFloat64>();
     }

     // Feeds the value at row_num of the first argument column into the state at place.
     void add(AggregateDataPtr __restrict place, const IColumn** columns, ssize_t row_num,
              Arena&) const override {
         const auto& values =
                 assert_cast<const ColumnFloat64&, TypeCheckOnRelease::DISABLE>(*columns[0])
                         .get_data();
         this->data(place).add(static_cast<double>(values[row_num]));
     }

     // Clears the state at place.
     void reset(AggregateDataPtr place) const override {
         this->data(place).reset();
     }

     // Folds the state at rhs into the state at place.
     void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs,
                Arena&) const override {
         const auto& other = this->data(rhs);
         this->data(place).merge(other);
     }

     // Writes the state at place to buf.
     void serialize(ConstAggregateDataPtr __restrict place, BufferWritable& buf) const override {
         this->data(place).write(buf);
     }

     // Overwrites the state at place with the one read from buf.
     void deserialize(AggregateDataPtr __restrict place, BufferReadable& buf,
                      Arena&) const override {
         this->data(place).read(buf);
     }

     // Appends the state's result() to the output column.
     void insert_result_into(ConstAggregateDataPtr __restrict place, IColumn& to) const override {
         auto& out = assert_cast<ColumnFloat64&>(to).get_data();
         out.push_back(this->data(place).result());
     }
 };

 #include "common/compile_check_end.h"
 } // namespace doris::vectorized
	// Licensed to the Apache Software Foundation (ASF) under one
	// or more contributor license agreements. See the NOTICE file
	// distributed with this work for additional information
	// regarding copyright ownership. The ASF licenses this file
	// to you under the Apache License, Version 2.0 (the
	// "License"); you may not use this file except in compliance
	// with the License. You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing,
	// software distributed under the License is distributed on an
	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	// KIND, either express or implied. See the License for the
	// specific language governing permissions and limitations
	// under the License.

	#pragma once

	#include <math.h>

	#include "runtime/decimalv2_value.h"
	#include "runtime/define_primitive_type.h"
	#include "runtime/primitive_type.h"
	#include "vec/aggregate_functions/aggregate_function.h"
	#include "vec/columns/column.h"
	#include "vec/common/assert_cast.h"
	#include "vec/core/types.h"
	#include "vec/data_types/data_type.h"
	#include "vec/data_types/data_type_decimal.h"

	namespace doris::vectorized {
	#include "common/compile_check_begin.h"

	// Forward declarations of types that are named in the signatures below.
	class Arena;
	class BufferReadable;
	class BufferWritable;

	/**
	 * Running state for the standard error of the mean (SEM):
	 *
	 *     SEM = sqrt(sample_variance / count) = sqrt(m2 / (count * (count - 1)))
	 *
	 * The mean and m2 are maintained with Welford's one-pass update (see add()),
	 * and two partial states are combined with the pairwise formula in merge().
	 */
	struct AggregateFunctionSemData {
	    double mean {};   // Running mean of the values accumulated so far.
	    double m2 {};     // Sum of squared deviations from the running mean.
	    UInt64 count = 0; // Number of values accumulated so far.

	    // Folds one value into the state (Welford update).
	    // With mean_{n-1} the old mean and x_n the new value:
	    //     mean_n = mean_{n-1} + (x_n - mean_{n-1}) / n
	    //     M2_n   = M2_{n-1} + (x_n - mean_{n-1}) * (x_n - mean_n)
	    void add(const double& value) {
	        count++;
	        double delta = value - mean;
	        mean += delta / static_cast<double>(count);
	        // delta2 is measured against the *updated* mean; this is what keeps M2 exact.
	        double delta2 = value - mean;
	        m2 += delta * delta2;
	    }

	    // Combines another partial state B (rhs) into this state A:
	    //     count = count_a + count_b
	    //     mean  = (mean_a * count_a + mean_b * count_b) / count
	    //     M2    = M2_a + M2_b + delta^2 * count_a * count_b / count
	    // where delta = mean_b - mean_a.
	    //
	    // Empty sides are handled before the formulas are applied: with both states
	    // empty the divisions above are 0 / 0, which would store NaN in mean and m2
	    // and make every later add()/merge()/result() on this state return NaN.
	    void merge(const AggregateFunctionSemData& rhs) {
	        if (rhs.count == 0) {
	            // Nothing to fold in; leave this state untouched.
	            return;
	        }
	        if (count == 0) {
	            // This side is empty: adopt rhs as-is rather than pushing its mean
	            // through a multiply/divide round trip.
	            mean = rhs.mean;
	            m2 = rhs.m2;
	            count = rhs.count;
	            return;
	        }
	        UInt64 total_count = count + rhs.count;
	        double delta = rhs.mean - mean;
	        mean = (mean * static_cast<double>(count) + rhs.mean * static_cast<double>(rhs.count)) /
	               static_cast<double>(total_count);
	        m2 += rhs.m2 + delta * delta * static_cast<double>(count) * static_cast<double>(rhs.count) /
	                               static_cast<double>(total_count);
	        count = total_count;
	    }

	    // Writes mean, m2 and count to buf, in that order.
	    void write(BufferWritable& buf) const {
	        buf.write_binary(mean);
	        buf.write_binary(m2);
	        buf.write_binary(count);
	    }

	    // Reads mean, m2 and count from buf, in the same order write() emits them.
	    void read(BufferReadable& buf) {
	        buf.read_binary(mean);
	        buf.read_binary(m2);
	        buf.read_binary(count);
	    }

	    // Returns the state to "no values accumulated".
	    void reset() {
	        mean = {};
	        m2 = {};
	        count = 0;
	    }

	    // Returns sqrt(m2 / (count * (count - 1))), or 0 when fewer than two values
	    // have been accumulated (the denominator would be zero).
	    double result() const {
	        if (count < 2) {
	            return 0;
	        }
	        const double n = static_cast<double>(count);
	        return std::sqrt(m2 / (n * (n - 1.0)));
	    }
	};

	/**
	 * Aggregate function "sem". Each operation is forwarded to the per-group state
	 * object of type Data (add / merge / write / read / reset / result).
	 *
	 * The first argument column is read as ColumnFloat64 and the result is
	 * appended to a ColumnFloat64.
	 * NOTE(review): UnaryExpression and NullableAggregateFunction are declared
	 * elsewhere; they look like tag bases - confirm against their definitions.
	 */
	template <typename Data>
	class AggregateFunctionSem final
	        : public IAggregateFunctionDataHelper<Data, AggregateFunctionSem<Data>>,
	          UnaryExpression,
	          NullableAggregateFunction {
	public:
	    AggregateFunctionSem(const DataTypes& argument_types_)
	            : IAggregateFunctionDataHelper<Data, AggregateFunctionSem<Data>>(argument_types_) {}

	    // Name under which this function is reported.
	    String get_name() const override {
	        return "sem";
	    }

	    // The result type is always Float64.
	    DataTypePtr get_return_type() const override {
	        return std::make_shared<DataTypeFloat64>();
	    }

	    // Feeds the value at row_num of the first argument column into the state at place.
	    void add(AggregateDataPtr __restrict place, const IColumn** columns, ssize_t row_num,
	             Arena&) const override {
	        const auto& values =
	                assert_cast<const ColumnFloat64&, TypeCheckOnRelease::DISABLE>(*columns[0])
	                        .get_data();
	        this->data(place).add(static_cast<double>(values[row_num]));
	    }

	    // Clears the state at place.
	    void reset(AggregateDataPtr place) const override {
	        this->data(place).reset();
	    }

	    // Folds the state at rhs into the state at place.
	    void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs,
	               Arena&) const override {
	        const auto& other = this->data(rhs);
	        this->data(place).merge(other);
	    }

	    // Writes the state at place to buf.
	    void serialize(ConstAggregateDataPtr __restrict place, BufferWritable& buf) const override {
	        this->data(place).write(buf);
	    }

	    // Overwrites the state at place with the one read from buf.
	    void deserialize(AggregateDataPtr __restrict place, BufferReadable& buf,
	                     Arena&) const override {
	        this->data(place).read(buf);
	    }

	    // Appends the state's result() to the output column.
	    void insert_result_into(ConstAggregateDataPtr __restrict place, IColumn& to) const override {
	        auto& out = assert_cast<ColumnFloat64&>(to).get_data();
	        out.push_back(this->data(place).result());
	    }
	};

	#include "common/compile_check_end.h"
	} // namespace doris::vectorized