blob: 5f0c05e984415128a85aa6238b43e1c55f57133d [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "vec/functions/function_tokenize.h"
#include <glog/logging.h>
#include <rapidjson/prettywriter.h>
#include <algorithm>
#include <boost/regex.hpp>
#include <memory>
#include <utility>
#include "CLucene/StdHeader.h"
#include "CLucene/config/repl_wchar.h"
#include "olap/inverted_index_parser.h"
#include "olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h"
#include "olap/rowset/segment_v2/inverted_index_reader.h"
#include "vec/columns/column.h"
#include "vec/common/string_ref.h"
#include "vec/core/block.h"
#include "vec/core/column_with_type_and_name.h"
#include "vec/data_types/data_type_nullable.h"
#include "vec/data_types/data_type_number.h"
namespace doris::vectorized {
#include "common/compile_check_begin.h"
using namespace doris::segment_v2::inverted_index;
// Extracts every `key = value` pair found in `str` and stores it in `result`.
//
// Both the key and the value may be written single-quoted, double-quoted, or
// bare (a bare item is a run of characters containing neither ',' nor ' ').
// Optional whitespace is allowed around '='. When the same key occurs more
// than once, the last occurrence wins because entries are assigned through
// operator[]. Entries already present in `result` are kept unless overwritten.
//
// Returns Status::OK() unconditionally; text that does not match the pattern
// is skipped rather than reported.
Status parse(const std::string& str, std::map<std::string, std::string>& result) {
    const boost::regex kv_pattern(
            R"delimiter((?:'([^']*)'|"([^"]*)"|([^, ]*))\s*=\s*(?:'([^']*)'|"([^"]*)"|([^, ]*)))delimiter");

    // Picks the first non-empty capture among (a, b); falls back to capture c.
    // The three captures are the single-quoted, double-quoted and bare
    // alternatives of one side of the '='.
    const auto first_non_empty = [](const boost::smatch& m, int a, int b, int c) {
        if (m[a].length()) {
            return m[a].str();
        }
        if (m[b].length()) {
            return m[b].str();
        }
        return m[c].str();
    };

    boost::smatch m;
    // Each iteration resumes the search right after the previous match.
    for (auto cursor = str.cbegin(); boost::regex_search(cursor, str.cend(), m, kv_pattern);
         cursor = m.suffix().first) {
        const std::string key = first_non_empty(m, 1, 2, 3);
        const std::string value = first_non_empty(m, 4, 5, 6);
        result[key] = value;
    }
    return Status::OK();
}
void FunctionTokenize::_do_tokenize_none(const ColumnString& src_column_string,
const MutableColumnPtr& dest_column_ptr) const {
ColumnArray::Offset64 src_offsets_size = src_column_string.get_offsets().size();
for (size_t i = 0; i < src_offsets_size; i++) {
const StringRef tokenize_str = src_column_string.get_data_at(i);
rapidjson::Document doc;
doc.SetArray();
rapidjson::Document::AllocatorType& allocator = doc.GetAllocator();
rapidjson::Value obj(rapidjson::kObjectType);
obj.AddMember(
"token",
rapidjson::Value(tokenize_str.data,
static_cast<rapidjson::SizeType>(tokenize_str.size), allocator)
.Move(),
allocator);
doc.PushBack(obj, allocator);
rapidjson::StringBuffer buffer;
rapidjson::PrettyWriter<rapidjson::StringBuffer> writer(buffer);
writer.SetFormatOptions(rapidjson::kFormatSingleLineArray);
doc.Accept(writer);
const std::string json_array_str = buffer.GetString();
dest_column_ptr->insert_data(json_array_str.data(), json_array_str.size());
}
}
void FunctionTokenize::_do_tokenize(const ColumnString& src_column_string,
const InvertedIndexAnalyzerCtx& analyzer_ctx,
bool support_phrase,
const MutableColumnPtr& dest_column_ptr) const {
ColumnArray::Offset64 src_offsets_size = src_column_string.get_offsets().size();
for (size_t i = 0; i < src_offsets_size; i++) {
const StringRef tokenize_str = src_column_string.get_data_at(i);
if (tokenize_str.size == 0) {
dest_column_ptr->insert_data("", 0);
continue;
}
auto reader = InvertedIndexAnalyzer::create_reader(analyzer_ctx.char_filter_map);
reader->init(tokenize_str.data, (int)tokenize_str.size, true);
auto analyzer_tokens =
InvertedIndexAnalyzer::get_analyse_result(reader, analyzer_ctx.analyzer);
rapidjson::Document doc;
doc.SetArray();
rapidjson::Document::AllocatorType& allocator = doc.GetAllocator();
for (const auto& analyzer_token : analyzer_tokens) {
rapidjson::Value obj(rapidjson::kObjectType);
obj.AddMember(
"token",
rapidjson::Value(analyzer_token.get_single_term().c_str(), allocator).Move(),
allocator);
if (support_phrase) {
obj.AddMember("position", analyzer_token.position, allocator);
}
doc.PushBack(obj, allocator);
}
rapidjson::StringBuffer buffer;
rapidjson::PrettyWriter<rapidjson::StringBuffer> writer(buffer);
writer.SetFormatOptions(rapidjson::kFormatSingleLineArray);
doc.Accept(writer);
const std::string json_array_str = buffer.GetString();
dest_column_ptr->insert_data(json_array_str.data(), json_array_str.size());
}
}
// Entry point of the tokenize function.
//
// arguments[0] is the text column to tokenize and arguments[1] is a string
// column holding the analyzer properties (parsed with parse()). The result
// column at position `result` is replaced by a string column produced by
// _do_tokenize_none() or _do_tokenize().
//
// Returns:
//  - the status of parse() if it fails;
//  - INDEX_INVALID_PARAMETERS when the parser type resolves to PARSER_UNKNOWN;
//  - INVERTED_INDEX_ANALYZER_ERROR when creating the analyzer throws;
//  - RuntimeError when either argument is not a ColumnString;
//  - OK otherwise.
//
// NOTE(review): the const flags returned by unpack_if_const are never read and
// only row 0 of the properties column is parsed, so the properties are
// effectively treated as constant for the whole block -- confirm callers
// guarantee that.
Status FunctionTokenize::execute_impl(FunctionContext* /*context*/, Block& block,
                                      const ColumnNumbers& arguments, uint32_t result,
                                      size_t /*input_rows_count*/) const {
    DCHECK_EQ(arguments.size(), 2);
    const auto& [src_column, left_const] =
            unpack_if_const(block.get_by_position(arguments[0]).column);
    const auto& [right_column, right_const] =
            unpack_if_const(block.get_by_position(arguments[1]).column);

    auto dest_column_type = std::make_shared<vectorized::DataTypeString>();
    auto dest_column_ptr = dest_column_type->create_column();

    // Both arguments must be plain string columns; anything else is unsupported.
    const auto* col_left = check_and_get_column<ColumnString>(src_column.get());
    const auto* col_right =
            col_left != nullptr ? check_and_get_column<ColumnString>(right_column.get()) : nullptr;
    if (col_left == nullptr || col_right == nullptr) {
        return Status::RuntimeError("unimplemented function {}", get_name());
    }

    // Parse the properties text taken from the first row of the second argument.
    std::map<std::string, std::string> properties;
    if (auto st = parse(col_right->get_data_at(0).to_string(), properties); !st.ok()) {
        return st;
    }

    InvertedIndexAnalyzerConfig config;
    config.analyzer_name = get_analyzer_name_from_properties(properties);
    config.parser_type = get_inverted_index_parser_type_from_string(
            get_parser_string_from_properties(properties));
    if (config.parser_type == InvertedIndexParserType::PARSER_UNKNOWN) {
        return Status::Error<doris::ErrorCode::INDEX_INVALID_PARAMETERS>(
                "unsupported parser type. currently, only 'english', 'chinese', "
                "'unicode', 'icu', 'basic' and 'ik' analyzers are supported.");
    }

    // No named analyzer and parser type NONE: emit each input string as its
    // own single token without building an analyzer.
    const bool pass_through = config.analyzer_name.empty() &&
                              config.parser_type == InvertedIndexParserType::PARSER_NONE;
    if (pass_through) {
        _do_tokenize_none(*col_left, dest_column_ptr);
        block.replace_by_position(result, std::move(dest_column_ptr));
        return Status::OK();
    }

    // Remaining analyzer settings are only needed when an analyzer is built.
    config.parser_mode = get_parser_mode_string_from_properties(properties);
    config.char_filter_map = get_parser_char_filter_map_from_properties(properties);
    config.lower_case = get_parser_lowercase_from_properties(properties);
    config.stop_words = get_parser_stopwords_from_properties(properties);
    const bool support_phrase = get_parser_phrase_support_string_from_properties(properties) ==
                                INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES;

    // Analyzer creation may throw; translate exceptions into a Status.
    std::shared_ptr<lucene::analysis::Analyzer> analyzer_holder;
    try {
        analyzer_holder = InvertedIndexAnalyzer::create_analyzer(&config);
    } catch (CLuceneError& e) {
        return Status::Error<doris::ErrorCode::INVERTED_INDEX_ANALYZER_ERROR>(
                "inverted index create analyzer failed: {}", e.what());
    } catch (Exception& e) {
        return Status::Error<doris::ErrorCode::INVERTED_INDEX_ANALYZER_ERROR>(
                "inverted index create analyzer failed: {}", e.what());
    }

    // The context only borrows the analyzer; analyzer_holder keeps it alive
    // until this function returns.
    InvertedIndexAnalyzerCtx analyzer_ctx;
    analyzer_ctx.analyzer_name = config.analyzer_name;
    analyzer_ctx.parser_type = config.parser_type;
    analyzer_ctx.char_filter_map = config.char_filter_map;
    analyzer_ctx.analyzer = analyzer_holder.get();

    _do_tokenize(*col_left, analyzer_ctx, support_phrase, dest_column_ptr);
    block.replace_by_position(result, std::move(dest_column_ptr));
    return Status::OK();
}
// Registers FunctionTokenize with the given function factory.
void register_function_tokenize(SimpleFunctionFactory& factory) {
factory.register_function<FunctionTokenize>();
}
#include "common/compile_check_end.h"
} // namespace doris::vectorized