// blob: 8749364d51ae6d63d92fbc914f35e9a3f3bbbce3 [file]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <faiss/impl/platform_macros.h>
#include <faiss/utils/distances.h>
#include <gen_cpp/Types_types.h>

#include <cmath>
#include <memory>
#include <optional>
#include <utility>

#include "common/exception.h"
#include "common/status.h"
#include "core/assert_cast.h"
#include "core/column/column.h"
#include "core/column/column_array.h"
#include "core/column/column_const.h"
#include "core/column/column_nullable.h"
#include "core/data_type/data_type.h"
#include "core/data_type/data_type_array.h"
#include "core/data_type/data_type_nullable.h"
#include "core/data_type/data_type_number.h"
#include "core/data_type/primitive_type.h"
#include "core/types.h"
#include "exec/common/util.hpp"
#include "exprs/function/array/function_array_utils.h"
#include "exprs/function/function.h"
namespace doris {
// Distance policy for the function named "l1_distance".
// Intended to be plugged into FunctionArrayDistance as its DistanceImpl.
class L1Distance {
public:
    static constexpr auto name = "l1_distance";

    // Returns the L1 distance between the d-element float vectors x and y,
    // as computed by faiss::fvec_L1.
    static float distance(const float* x, const float* y, size_t d) {
        const float l1 = faiss::fvec_L1(x, y, d);
        return l1;
    }
};
// Distance policy for the function named "l2_distance".
// Intended to be plugged into FunctionArrayDistance as its DistanceImpl.
class L2Distance {
public:
    static constexpr auto name = "l2_distance";

    // Returns the square root of faiss::fvec_L2sqr(x, y, d) for the d-element
    // float vectors x and y.
    static float distance(const float* x, const float* y, size_t d) {
        const float squared = faiss::fvec_L2sqr(x, y, d);
        return std::sqrt(squared);
    }
};
// Distance policy for the function named "inner_product".
// Intended to be plugged into FunctionArrayDistance as its DistanceImpl.
class InnerProduct {
public:
    static constexpr auto name = "inner_product";

    // Returns the inner product of the d-element float vectors x and y,
    // as computed by faiss::fvec_inner_product.
    static float distance(const float* x, const float* y, size_t d) {
        const float dot = faiss::fvec_inner_product(x, y, d);
        return dot;
    }
};
// Distance policy for the function named "cosine_distance".
// Exposes the same static interface (name + distance) as the other policies in this file.
class CosineDistance {
public:
static constexpr auto name = "cosine_distance";
// Takes two float vectors x and y of d elements each and returns a single float.
// Only declared here; the definition lives outside this header section.
// NOTE(review): presumably 1 - cosine similarity; confirm against the definition.
static float distance(const float* x, const float* y, size_t d);
};
// Distance policy for the function named "cosine_similarity".
// Exposes the same static interface (name + distance) as the other policies in this file.
class CosineSimilarity {
public:
static constexpr auto name = "cosine_similarity";
// Takes two float vectors x and y of d elements each and returns a single float.
// Only declared here; the definition lives outside this header section.
static float distance(const float* x, const float* y, size_t d);
};
// Policy for the function named "l2_distance_approximate".
// Inherits distance() unchanged from L2Distance; only `name` is overridden (shadowed).
// NOTE(review): presumably exists so the approximate variant can be recognized by
// name elsewhere (e.g. for index-based evaluation) - confirm with callers.
class L2DistanceApproximate : public L2Distance {
public:
static constexpr auto name = "l2_distance_approximate";
};
// Policy for the function named "inner_product_approximate".
// Inherits distance() unchanged from InnerProduct; only `name` is overridden (shadowed).
// NOTE(review): presumably exists so the approximate variant can be recognized by
// name elsewhere (e.g. for index-based evaluation) - confirm with callers.
class InnerProductApproximate : public InnerProduct {
public:
static constexpr auto name = "inner_product_approximate";
};
template <typename DistanceImpl>
class FunctionArrayDistance : public IFunction {
public:
using DataType = PrimitiveTypeTraits<TYPE_FLOAT>::DataType;
using ColumnType = PrimitiveTypeTraits<TYPE_FLOAT>::ColumnType;
static constexpr auto name = DistanceImpl::name;
String get_name() const override { return name; }
static FunctionPtr create() { return std::make_shared<FunctionArrayDistance<DistanceImpl>>(); }
size_t get_number_of_arguments() const override { return 2; }
DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
if (arguments.size() != 2) {
throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Invalid number of arguments");
}
// primitive_type of Nullable is its nested type.
if (arguments[0]->get_primitive_type() != TYPE_ARRAY ||
arguments[1]->get_primitive_type() != TYPE_ARRAY) {
throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
"Arguments for function {} must be arrays", get_name());
}
return std::make_shared<DataTypeFloat32>();
}
// All array distance functions has always not nullable return type.
// We want to make sure throw exception if input columns contain NULL.
bool use_default_implementation_for_nulls() const override { return false; }
// Extract the ColumnArray from a column, unwrapping Nullable if present.
// Validates that no NULL values exist.
static const ColumnArray* _extract_array_column(const IColumn* col, const char* arg_name,
const String& func_name) {
if (col->is_nullable()) {
if (col->has_null()) {
throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
"{} for function {} cannot be null", arg_name, func_name);
}
auto nullable = assert_cast<const ColumnNullable*>(col);
return assert_cast<const ColumnArray*>(nullable->get_nested_column_ptr().get());
}
return assert_cast<const ColumnArray*>(col);
}
// Extract the ColumnFloat32 data from an array column, unwrapping Nullable if present.
// Validates that no NULL elements exist within the array.
static const ColumnFloat32* _extract_float_data(const ColumnArray* arr, const char* arg_name,
const String& func_name) {
if (arr->get_data_ptr()->is_nullable()) {
if (arr->get_data_ptr()->has_null()) {
throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
"{} for function {} cannot have null", arg_name, func_name);
}
auto nullable = assert_cast<const ColumnNullable*>(arr->get_data_ptr().get());
return assert_cast<const ColumnFloat32*>(nullable->get_nested_column_ptr().get());
}
return assert_cast<const ColumnFloat32*>(arr->get_data_ptr().get());
}
// Holds the extracted float data pointer and dimension for a const array argument,
// avoiding repeated per-row extraction.
struct ConstArrayInfo {
const float* data = nullptr;
ssize_t dim = 0;
};
// Try to extract const array info from a column. If the column is ColumnConst,
// extract the float data pointer and dimension once; otherwise return nullopt.
std::optional<ConstArrayInfo> _try_extract_const(const ColumnPtr& col,
const char* arg_name) const {
if (!is_column_const(*col)) {
return std::nullopt;
}
auto const_col = assert_cast<const ColumnConst*>(col.get());
const IColumn* inner = const_col->get_data_column_ptr().get();
const ColumnArray* arr = _extract_array_column(inner, arg_name, get_name());
const ColumnFloat32* float_col = _extract_float_data(arr, arg_name, get_name());
ssize_t dim = static_cast<ssize_t>(float_col->size());
return ConstArrayInfo {float_col->get_data().data(), dim};
}
Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
uint32_t result, size_t input_rows_count) const override {
const auto& arg1 = block.get_by_position(arguments[0]);
const auto& arg2 = block.get_by_position(arguments[1]);
// Try to handle const columns without expanding them.
auto const_info1 = _try_extract_const(arg1.column, "First argument");
auto const_info2 = _try_extract_const(arg2.column, "Second argument");
// For non-const columns, expand and extract normally.
ColumnPtr materialized_col1, materialized_col2;
const ColumnArray* arr1 = nullptr;
const ColumnArray* arr2 = nullptr;
const ColumnFloat32* float1 = nullptr;
const ColumnFloat32* float2 = nullptr;
const ColumnOffset64* offset1 = nullptr;
const ColumnOffset64* offset2 = nullptr;
const IColumn::Offsets64* offsets_data1 = nullptr;
const IColumn::Offsets64* offsets_data2 = nullptr;
const float* float_data1 = nullptr;
const float* float_data2 = nullptr;
if (!const_info1) {
materialized_col1 = arg1.column->convert_to_full_column_if_const();
arr1 = _extract_array_column(materialized_col1.get(), "First argument", get_name());
float1 = _extract_float_data(arr1, "First argument", get_name());
offset1 = assert_cast<const ColumnArray::ColumnOffsets*>(arr1->get_offsets_ptr().get());
offsets_data1 = &offset1->get_data();
float_data1 = float1->get_data().data();
}
if (!const_info2) {
materialized_col2 = arg2.column->convert_to_full_column_if_const();
arr2 = _extract_array_column(materialized_col2.get(), "Second argument", get_name());
float2 = _extract_float_data(arr2, "Second argument", get_name());
offset2 = assert_cast<const ColumnArray::ColumnOffsets*>(arr2->get_offsets_ptr().get());
offsets_data2 = &offset2->get_data();
float_data2 = float2->get_data().data();
}
// prepare return data
auto dst = ColumnType::create(input_rows_count);
auto& dst_data = dst->get_data();
for (size_t row = 0; row < input_rows_count; ++row) {
const float* data_ptr1;
const float* data_ptr2;
ssize_t size1, size2;
const auto idx = static_cast<ssize_t>(row);
if (const_info1) {
data_ptr1 = const_info1->data;
size1 = const_info1->dim;
} else {
// -1 is valid for PaddedPODArray-backed offsets.
const auto prev_offset1 = (*offsets_data1)[idx - 1];
size1 = (*offsets_data1)[idx] - prev_offset1;
data_ptr1 = float_data1 + prev_offset1;
}
if (const_info2) {
data_ptr2 = const_info2->data;
size2 = const_info2->dim;
} else {
const auto prev_offset2 = (*offsets_data2)[idx - 1];
size2 = (*offsets_data2)[idx] - prev_offset2;
data_ptr2 = float_data2 + prev_offset2;
}
if (size1 != size2) [[unlikely]] {
return Status::InvalidArgument(
"function {} have different input element sizes of array: {} and {}",
get_name(), size1, size2);
}
dst_data[row] = DistanceImpl::distance(data_ptr1, data_ptr2, size1);
}
block.replace_by_position(result, std::move(dst));
return Status::OK();
}
};
} // namespace doris