// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This file is copied from
// https://github.com/ClickHouse/ClickHouse/blob/master/src/Functions/array/arrayIndex.h
// and modified by Doris
#pragma once
#include <stddef.h>
#include <memory>
#include <utility>
#include "common/status.h"
#include "olap/column_predicate.h"
#include "olap/rowset/segment_v2/index_reader_helper.h"
#include "olap/rowset/segment_v2/inverted_index_query_type.h"
#include "olap/rowset/segment_v2/inverted_index_reader.h"
#include "runtime/define_primitive_type.h"
#include "runtime/primitive_type.h"
#include "vec/columns/column.h"
#include "vec/columns/column_array.h"
#include "vec/columns/column_nullable.h"
#include "vec/columns/column_string.h"
#include "vec/columns/column_vector.h"
#include "vec/common/assert_cast.h"
#include "vec/common/string_ref.h"
#include "vec/core/block.h"
#include "vec/core/call_on_type_index.h"
#include "vec/core/column_numbers.h"
#include "vec/core/column_with_type_and_name.h"
#include "vec/core/types.h"
#include "vec/data_types/data_type.h"
#include "vec/data_types/data_type_array.h"
#include "vec/data_types/data_type_nullable.h"
#include "vec/data_types/data_type_number.h" // IWYU pragma: keep
#include "vec/functions/function.h"
namespace doris {
class FunctionContext;
} // namespace doris
namespace doris::vectorized {
struct ArrayContainsAction {
static constexpr auto ResultType = PrimitiveType::TYPE_BOOLEAN;
static constexpr auto name = "array_contains";
static constexpr const bool resume_execution = false;
static constexpr void apply(typename PrimitiveTypeTraits<ResultType>::CppType& current,
size_t) noexcept {
current = 1;
}
};
struct ArrayPositionAction {
static constexpr auto ResultType = PrimitiveType::TYPE_BIGINT;
static constexpr auto name = "array_position";
static constexpr const bool resume_execution = false;
static constexpr void apply(typename PrimitiveTypeTraits<ResultType>::CppType& current,
size_t j) noexcept {
current = j + 1;
}
};
/// Match policy for `countequal`: the result is the number of matching
/// elements in the array.
struct ArrayCountEqual {
    static constexpr auto ResultType = PrimitiveType::TYPE_BIGINT;
    static constexpr auto name = "countequal";
    /// true: every element must be visited, scanning continues after a match.
    static constexpr bool resume_execution = true;

    /// Counts one more match. The element position is irrelevant for counting,
    /// so the parameter is left unnamed (same shape as ArrayContainsAction::apply).
    static constexpr void apply(typename PrimitiveTypeTraits<ResultType>::CppType& current,
                                size_t /*position*/) noexcept {
        ++current;
    }
};
/// A value paired with the primitive type it was declared with.
struct ParamValue {
    /// Primitive type of `value`. Value-initialized so that a default-constructed
    /// ParamValue never carries an indeterminate enum value.
    PrimitiveType type {};
    /// The value itself.
    Field value;
};
/// Vectorized function that looks for a value (argument 1) inside an array
/// (argument 0), row by row.
///
/// ConcreteAction supplies the function name, the result primitive type, how a
/// match is folded into the result (`apply`) and whether scanning continues
/// after a match (`resume_execution`).
template <typename ConcreteAction>
class FunctionArrayIndex : public IFunction {
public:
// Result type and function name are taken over from the action.
static constexpr auto ResultType = ConcreteAction::ResultType;
static constexpr auto name = ConcreteAction::name;
/// Creates a new shared instance of this function.
static FunctionPtr create() { return std::make_shared<FunctionArrayIndex>(); }
/// Get function name.
String get_name() const override { return name; }
/// The function takes a fixed number of arguments.
bool is_variadic() const override { return false; }
/// Two arguments: the array and the value to search for.
size_t get_number_of_arguments() const override { return 2; }
/// Returns false; null maps of both arguments are inspected explicitly by the
/// execution code of this class.
bool use_default_implementation_for_nulls() const override { return false; }
/// Stores the search value as function state when it is a constant.
///
/// Nothing is done for THREAD_LOCAL scope. For any other scope, if argument 1
/// has a constant column, its first value and the argument's primitive type are
/// put into a ParamValue that is registered through set_function_state().
///
/// @param context  function context giving access to argument types and constant columns
/// @param scope    scope of the state being opened
/// @return always Status::OK()
Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override {
    if (scope == FunctionContext::THREAD_LOCAL) {
        return Status::OK();
    }
    // Argument 1 (the search value) is read below, so both arguments must exist.
    DCHECK(context->get_num_args() >= 2);
    DCHECK_EQ(context->get_arg_type(0)->get_primitive_type(), PrimitiveType::TYPE_ARRAY);

    auto constant_col = context->get_constant_col(1);
    if (!constant_col) {
        // The search value is not a constant: there is nothing to remember.
        return Status::OK();
    }
    auto state = std::make_shared<ParamValue>();
    constant_col->column_ptr->get(0, state->value);
    state->type = context->get_arg_type(1)->get_primitive_type();
    context->set_function_state(scope, state);
    return Status::OK();
}
/// Tries to answer the function with an inverted index instead of scanning rows.
///
/// Runs an EQUAL_QUERY for the search value against the single index iterator and
/// writes the matching rows, with null rows masked out, to `bitmap_result`.
/// `bitmap_result` is left untouched (and OK is returned) when the iterator is
/// null, when the index is neither a string nor a BKD index, or when the search
/// value is NULL.
///
/// @param arguments             exactly one entry: the search value
/// @param data_type_with_names  exactly one entry: name and type of the indexed column
/// @param iterators             exactly one entry: the index iterator (may be nullptr)
/// @param num_rows              number of rows, forwarded to the index query
/// @param analyzer_ctx          analyzer context, forwarded to the index query
/// @param bitmap_result         receives the result bitmap on success
/// @return OK, or the error reported by the index reader / query value factory
Status evaluate_inverted_index(
        const ColumnsWithTypeAndName& arguments,
        const std::vector<vectorized::IndexFieldNameAndTypePair>& data_type_with_names,
        std::vector<segment_v2::IndexIterator*> iterators, uint32_t num_rows,
        const InvertedIndexAnalyzerCtx* analyzer_ctx,
        segment_v2::InvertedIndexResultBitmap& bitmap_result) const override {
    DCHECK(arguments.size() == 1);
    DCHECK(data_type_with_names.size() == 1);
    DCHECK(iterators.size() == 1);
    auto* iter = iterators[0];
    if (iter == nullptr) {
        return Status::OK();
    }
    if (!segment_v2::IndexReaderHelper::has_string_or_bkd_index(iter)) {
        // With a tokenizing parser the result cannot be trusted inside larger expressions.
        // Example: for the filter !array_index(array, 'tall:120cm, weight: 35kg') a row
        // [tall:120cm, weight: 35kg, hobbies: reading book] is tokenized, the query is
        // tokenized too, and a FULLTEXT reader reports the row as a match. The function would
        // return that row id although only rows equal to 'tall:120cm, weight: 35kg' are wanted.
        return Status::OK();
    }
    Field param_value;
    arguments[0].column->get(0, param_value);
    auto param_type = arguments[0].type->get_primitive_type();
    // The inverted index of arrays cannot handle null items inside the array, so a NULL
    // search value is not answered from the index.
    if (param_value.is_null()) {
        return Status::OK();
    }
    // Empty unless the index reports null rows.
    std::shared_ptr<roaring::Roaring> null_bitmap = std::make_shared<roaring::Roaring>();
    if (iter->has_null()) {
        segment_v2::InvertedIndexQueryCacheHandle null_bitmap_cache_handle;
        RETURN_IF_ERROR(iter->read_null_bitmap(&null_bitmap_cache_handle));
        null_bitmap = null_bitmap_cache_handle.get_bitmap();
    }
    std::unique_ptr<InvertedIndexQueryParamFactory> query_param = nullptr;
    RETURN_IF_ERROR(InvertedIndexQueryParamFactory::create_query_value(param_type, &param_value,
                                                                       query_param));
    // Bound by reference and only once the query is certain to run: the name/type pair does
    // not need to be copied just to read its two members.
    const auto& data_type_with_name = data_type_with_names[0];
    InvertedIndexParam param;
    param.column_name = data_type_with_name.first;
    param.column_type = data_type_with_name.second;
    param.query_value = query_param->get_value();
    param.query_type = segment_v2::InvertedIndexQueryType::EQUAL_QUERY;
    param.num_rows = num_rows;
    param.roaring = std::make_shared<roaring::Roaring>();
    param.analyzer_ctx = analyzer_ctx;
    RETURN_IF_ERROR(iter->read_from_index(segment_v2::IndexParam {&param}));
    // Debug hook: verifies that the index really filtered the expected number of rows.
    DBUG_EXECUTE_IF("array_func.array_contains", {
        auto result_bitmap = DebugPoints::instance()->get_debug_param_or_default<int32_t>(
                "array_func.array_contains", "result_bitmap", 0);
        if (result_bitmap < 0) {
            return Status::Error<ErrorCode::INTERNAL_ERROR>(
                    "result_bitmap count cannot be negative");
        }
        if (param.roaring->cardinality() != result_bitmap) {
            return Status::Error<ErrorCode::INTERNAL_ERROR>(
                    "array_contains really filtered {} by inverted index not equal to expected "
                    "{}",
                    param.roaring->cardinality(), result_bitmap);
        }
    })
    segment_v2::InvertedIndexResultBitmap result(param.roaring, null_bitmap);
    // NOTE(review): kept as a copy-assignment on purpose. null_bitmap may come from the index
    // cache handle; confirm the copy semantics of InvertedIndexResultBitmap before turning
    // this into a move.
    bitmap_result = result;
    bitmap_result.mask_out_null();
    return Status::OK();
}
/// Result type of the function: the data type that belongs to ResultType,
/// wrapped as nullable when the first argument (the array) is nullable.
DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
    using ResultDataType = typename PrimitiveTypeTraits<ResultType>::DataType;
    DataTypePtr value_type = std::make_shared<ResultDataType>();
    return arguments[0]->is_nullable() ? make_nullable(value_type) : value_type;
}
/// Evaluates the function for all rows of `block` by delegating to _execute_dispatch(),
/// which writes the result column at position `result`.
///
/// When the "array_func.array_contains" debug point is active, the DBUG block below returns
/// an INTERNAL_ERROR instead of evaluating.
/// NOTE(review): `context` is not used by this method.
Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
uint32_t result, size_t input_rows_count) const override {
// Debug hook: fails on purpose with the configured req_id and the number of input rows.
DBUG_EXECUTE_IF("array_func.array_contains", {
auto req_id = DebugPoints::instance()->get_debug_param_or_default<int32_t>(
"array_func.array_contains", "req_id", 0);
return Status::Error<ErrorCode::INTERNAL_ERROR>(
"{} has already execute inverted index req_id {} , should not execute expr "
"with rows: {}",
get_name(), req_id, input_rows_count);
});
return _execute_dispatch(block, arguments, result, input_rows_count);
}
private:
/// Searches each array row of strings for the string found in the same row of the right column.
///
/// A NULL search value matches NULL elements only, and a NULL element matches a NULL search
/// value only; this also holds when one of the two null maps is absent.
///
/// @param offsets               cumulative element offsets of the array column; row r covers the
///                              nested elements [offsets[r - 1], offsets[r])
/// @param nested_null_map       per-element null flags, or nullptr if elements are not nullable
/// @param nested_column         the array elements, accessed as ColumnString
/// @param right_column          the search values (one per row), accessed as ColumnString
/// @param right_nested_null_map per-row null flags of the search value, or nullptr
/// @param outer_null_map        per-row null flags of the array itself, or nullptr
/// @return one result per row, produced by ConcreteAction::apply (0 when nothing matched);
///         wrapped into a ColumnNullable iff outer_null_map is not nullptr
ColumnPtr _execute_string(const ColumnArray::Offsets64& offsets, const UInt8* nested_null_map,
                          const IColumn& nested_column, const IColumn& right_column,
                          const UInt8* right_nested_null_map,
                          const UInt8* outer_null_map) const {
    // NOTE(review): both casts are unchecked; the caller must pass ColumnString columns.
    const auto& nested_strings = reinterpret_cast<const ColumnString&>(nested_column);
    const auto& str_offs = nested_strings.get_offsets();
    const auto& str_chars = nested_strings.get_chars();
    const auto& right_strings = reinterpret_cast<const ColumnString&>(right_column);
    const auto& right_offs = right_strings.get_offsets();
    const auto& right_chars = right_strings.get_chars();

    // Result values and result null map, both zero-initialized.
    auto dst = PrimitiveTypeTraits<ResultType>::ColumnType::create(offsets.size(), 0);
    auto& dst_data = dst->get_data();
    auto dst_null_column = ColumnUInt8::create(offsets.size(), 0);
    auto& dst_null_data = dst_null_column->get_data();

    for (size_t row = 0; row < offsets.size(); ++row) {
        if (outer_null_map && outer_null_map[row]) {
            // The array itself is NULL: the result is NULL as well.
            dst_null_data[row] = true;
            continue;
        }
        dst_null_data[row] = false;
        typename PrimitiveTypeTraits<ResultType>::CppType res = 0;
        // NOTE(review): index `row - 1` is also used for row 0; presumably the offsets
        // container accepts index -1 and yields 0 there - confirm.
        size_t off = offsets[row - 1];
        size_t len = offsets[row] - off;

        // Everything about the right-hand value is invariant for the whole row.
        const bool right_is_null = right_nested_null_map != nullptr && right_nested_null_map[row];
        size_t right_off = right_offs[row - 1];
        size_t right_len = right_offs[row] - right_off;
        const StringRef right_value(reinterpret_cast<const char*>(&right_chars[right_off]),
                                    right_len);

        for (size_t pos = 0; pos < len; ++pos) {
            const bool element_is_null = nested_null_map != nullptr && nested_null_map[pos + off];
            if (right_is_null || element_is_null) {
                // NULL only ever matches NULL; the stored bytes of a NULL are never compared.
                if (right_is_null && element_is_null) {
                    ConcreteAction::apply(res, pos);
                    if constexpr (!ConcreteAction::resume_execution) {
                        break;
                    }
                }
                continue;
            }
            size_t str_pos = str_offs[pos + off - 1];
            size_t str_len = str_offs[pos + off] - str_pos;
            const char* left_raw_v = reinterpret_cast<const char*>(&str_chars[str_pos]);
            // StringRef operator == using vec impl
            if (StringRef(left_raw_v, str_len) == right_value) {
                ConcreteAction::apply(res, pos);
                if constexpr (!ConcreteAction::resume_execution) {
                    break;
                }
            }
        }
        dst_data[row] = res;
    }
    if (outer_null_map == nullptr) {
        return dst;
    }
    return ColumnNullable::create(std::move(dst), std::move(dst_null_column));
}
/// Searches each array row of fixed-width values for the value found in the same row of the
/// right column, comparing with operator==.
///
/// A NULL search value matches NULL elements only, and a NULL element matches a NULL search
/// value only; this also holds when one of the two null maps is absent.
///
/// @tparam NestedColumnType  concrete column type the array elements are accessed as
/// @tparam RightColumnType   concrete column type the search values are accessed as
/// @param offsets               cumulative element offsets of the array column; row r covers the
///                              nested elements [offsets[r - 1], offsets[r])
/// @param nested_null_map       per-element null flags, or nullptr if elements are not nullable
/// @param nested_column         the array elements
/// @param right_column          the search values (one per row)
/// @param right_nested_null_map per-row null flags of the search value, or nullptr
/// @param outer_null_map        per-row null flags of the array itself, or nullptr
/// @return one result per row, produced by ConcreteAction::apply (0 when nothing matched);
///         wrapped into a ColumnNullable iff outer_null_map is not nullptr
template <typename NestedColumnType, typename RightColumnType>
ColumnPtr _execute_number(const ColumnArray::Offsets64& offsets, const UInt8* nested_null_map,
                          const IColumn& nested_column, const IColumn& right_column,
                          const UInt8* right_nested_null_map,
                          const UInt8* outer_null_map) const {
    // NOTE(review): both casts are unchecked; the caller must pass columns of exactly the
    // given template types.
    const auto& nested_data =
            reinterpret_cast<const NestedColumnType&>(nested_column).get_data();
    const auto& right_data = reinterpret_cast<const RightColumnType&>(right_column).get_data();

    // Result values and result null map, both zero-initialized.
    auto dst = PrimitiveTypeTraits<ResultType>::ColumnType::create(offsets.size(), 0);
    auto& dst_data = dst->get_data();
    auto dst_null_column = ColumnUInt8::create(offsets.size(), 0);
    auto& dst_null_data = dst_null_column->get_data();

    for (size_t row = 0; row < offsets.size(); ++row) {
        if (outer_null_map && outer_null_map[row]) {
            // The array itself is NULL: the result is NULL as well.
            dst_null_data[row] = true;
            continue;
        }
        dst_null_data[row] = false;
        typename PrimitiveTypeTraits<ResultType>::CppType res = 0;
        // NOTE(review): index `row - 1` is also used for row 0; presumably the offsets
        // container accepts index -1 and yields 0 there - confirm.
        size_t off = offsets[row - 1];
        size_t len = offsets[row] - off;
        // Invariant for the whole row, so evaluated once instead of per element.
        const bool right_is_null = right_nested_null_map != nullptr && right_nested_null_map[row];

        for (size_t pos = 0; pos < len; ++pos) {
            const bool element_is_null = nested_null_map != nullptr && nested_null_map[pos + off];
            if (right_is_null || element_is_null) {
                // NULL only ever matches NULL; the stored value of a NULL is never compared.
                if (right_is_null && element_is_null) {
                    ConcreteAction::apply(res, pos);
                    if constexpr (!ConcreteAction::resume_execution) {
                        break;
                    }
                }
                continue;
            }
            if (nested_data[pos + off] == right_data[row]) {
                ConcreteAction::apply(res, pos);
                if constexpr (!ConcreteAction::resume_execution) {
                    break;
                }
            }
        }
        dst_data[row] = res;
    }
    if (outer_null_map == nullptr) {
        return dst;
    }
    return ColumnNullable::create(std::move(dst), std::move(dst_null_column));
}
/// Type-checked entry to _execute_number() for the case where array elements and search
/// value share one column type.
///
/// @return the result of _execute_number<NestedColumnType, NestedColumnType>() when
///         `right_column` is a NestedColumnType, otherwise nullptr
template <typename NestedColumnType>
ColumnPtr _execute_number_expanded(const ColumnArray::Offsets64& offsets,
                                   const UInt8* nested_null_map, const IColumn& nested_column,
                                   const IColumn& right_column,
                                   const UInt8* right_nested_null_map,
                                   const UInt8* outer_null_map) const {
    const bool right_has_expected_type = is_column<NestedColumnType>(right_column);
    if (!right_has_expected_type) {
        // The search value is stored in a different column type: not handled here.
        return nullptr;
    }
    return _execute_number<NestedColumnType, NestedColumnType>(
            offsets, nested_null_map, nested_column, right_column, right_nested_null_map,
            outer_null_map);
}
/// Unpacks both arguments and runs the search kernel that fits their element types.
///
/// Argument 0 must be an array (optionally nullable, optionally const); argument 1 is the value
/// to search for. Supported combinations are string/string, number/number of the same primitive
/// type, date-like/date-like and IP/IP. On success the result column replaces the column at
/// position `result`.
///
/// @param block             block holding the argument columns and receiving the result
/// @param arguments         positions of the two arguments inside `block`
/// @param result            position of the result column inside `block`
/// @param input_rows_count  number of rows (not needed here: the row count is taken from the
///                          array offsets)
/// @return OK on success; InvalidArgument if argument 0 is not an array; InternalError if the
///         numeric dispatch rejects the right type; RuntimeError for unsupported combinations
Status _execute_dispatch(Block& block, const ColumnNumbers& arguments, uint32_t result,
                         size_t input_rows_count) const {
    const auto& array_arg = block.get_by_position(arguments[0]);
    const auto& value_arg = block.get_by_position(arguments[1]);

    // Validate before materializing anything: the casts below rely on this.
    if (array_arg.type->get_primitive_type() != TYPE_ARRAY) {
        return Status::InvalidArgument(get_name() + " first argument must be array, but got " +
                                       array_arg.type->get_name());
    }

    // Extract the array column and, if the array is nullable, its row-level null map.
    // `left_column` keeps the materialized column alive while raw pointers into it are used.
    auto left_column = array_arg.column->convert_to_full_column_if_const();
    const ColumnArray* array_column = nullptr;
    const UInt8* array_null_map = nullptr;
    if (left_column->is_nullable()) {
        const auto& nullable_array = assert_cast<const ColumnNullable&>(*left_column);
        array_column = &assert_cast<const ColumnArray&>(nullable_array.get_nested_column());
        array_null_map = nullable_array.get_null_map_column().get_data().data();
    } else {
        array_column = &assert_cast<const ColumnArray&>(*left_column);
    }
    const auto& offsets = array_column->get_offsets();

    // Extract the element column and, if elements are nullable, the element-level null map.
    const UInt8* nested_null_map = nullptr;
    ColumnPtr nested_column = nullptr;
    if (array_column->get_data().is_nullable()) {
        const auto& nested_null_column =
                assert_cast<const ColumnNullable&>(array_column->get_data());
        nested_null_map = nested_null_column.get_null_map_column().get_data().data();
        nested_column = nested_null_column.get_nested_column_ptr();
    } else {
        nested_column = array_column->get_data_ptr();
    }

    // Extract the search value column and its null map. `right_full_column` owns the
    // materialized column for as long as `right_nested_null_map` is used.
    ColumnPtr right_full_column = value_arg.column->convert_to_full_column_if_const();
    ColumnPtr right_column = right_full_column;
    const UInt8* right_nested_null_map = nullptr;
    if (right_column->is_nullable()) {
        const auto& nested_null_column = assert_cast<const ColumnNullable&>(*right_full_column);
        right_column = nested_null_column.get_nested_column_ptr();
        right_nested_null_map = nested_null_column.get_null_map_column().get_data().data();
    }

    // Pick the kernel from the (non-nullable) element type and search value type.
    auto array_type = remove_nullable(array_arg.type);
    auto left_element_type =
            remove_nullable(assert_cast<const DataTypeArray&>(*array_type).get_nested_type());
    auto right_type = remove_nullable(value_arg.type);
    const auto left_primitive = left_element_type->get_primitive_type();
    const auto right_primitive = right_type->get_primitive_type();

    ColumnPtr return_column = nullptr;
    if (is_string_type(right_primitive) && is_string_type(left_primitive)) {
        return_column = _execute_string(offsets, nested_null_map, *nested_column, *right_column,
                                        right_nested_null_map, array_null_map);
    } else if (is_number(right_primitive) && is_number(left_primitive)) {
        // The numeric kernel accesses BOTH columns as the column type of the right argument.
        // That is only valid when the element type is the same type; any other combination
        // falls through to the "unsupported types" error instead of misreading the elements.
        if (left_primitive == right_primitive) {
            auto call = [&](const auto& type) -> bool {
                using DispatchType = std::decay_t<decltype(type)>;
                return_column = _execute_number<typename DispatchType::ColumnType,
                                                typename DispatchType::ColumnType>(
                        offsets, nested_null_map, *nested_column, *right_column,
                        right_nested_null_map, array_null_map);
                return true;
            };
            if (!dispatch_switch_number(right_primitive, call)) {
                return Status::InternalError(get_name() + " not support right type " +
                                             right_type->get_name());
            }
        }
    } else if ((is_date_v2_or_datetime_v2(right_primitive) || right_primitive == TYPE_TIMEV2) &&
               (is_date_v2_or_datetime_v2(left_primitive) || left_primitive == TYPE_TIMEV2)) {
        // _execute_number_expanded() yields nullptr if the right column has another type.
        if (left_primitive == TYPE_DATEV2) {
            return_column = _execute_number_expanded<ColumnDateV2>(
                    offsets, nested_null_map, *nested_column, *right_column,
                    right_nested_null_map, array_null_map);
        } else if (left_primitive == TYPE_DATETIMEV2) {
            return_column = _execute_number_expanded<ColumnDateTimeV2>(
                    offsets, nested_null_map, *nested_column, *right_column,
                    right_nested_null_map, array_null_map);
        } else if (left_primitive == TYPE_TIMEV2) {
            return_column = _execute_number_expanded<ColumnTimeV2>(
                    offsets, nested_null_map, *nested_column, *right_column,
                    right_nested_null_map, array_null_map);
        }
    } else if (is_ip(right_primitive) && is_ip(left_primitive)) {
        if (left_primitive == TYPE_IPV4) {
            return_column = _execute_number_expanded<ColumnIPv4>(
                    offsets, nested_null_map, *nested_column, *right_column,
                    right_nested_null_map, array_null_map);
        } else if (left_primitive == TYPE_IPV6) {
            return_column = _execute_number_expanded<ColumnIPv6>(
                    offsets, nested_null_map, *nested_column, *right_column,
                    right_nested_null_map, array_null_map);
        }
    }

    if (return_column) {
        block.replace_by_position(result, std::move(return_column));
        return Status::OK();
    }
    return Status::RuntimeError("execute failed or unsupported types for function {}({}, {})",
                                get_name(),
                                block.get_by_position(arguments[0]).type->get_name(),
                                block.get_by_position(arguments[1]).type->get_name());
}
};
} // namespace doris::vectorized