blob: 80e775dec36c8c0f70ed2d4542bc8748464d9407 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include <cctype>
#include "common/status.h"
#include "vec/columns/column_string.h"
#include "vec/data_types/data_type_string.h"
#include "vec/functions/function.h"
#include "vec/functions/simple_function_factory.h"
namespace doris::vectorized {
#include "common/compile_check_begin.h"
class FunctionSoundex : public IFunction {
public:
static constexpr auto name = "soundex";
static FunctionPtr create() { return std::make_shared<FunctionSoundex>(); }
String get_name() const override { return name; }
size_t get_number_of_arguments() const override { return 1; }
DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
return std::make_shared<DataTypeString>();
}
Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
uint32_t result, size_t input_rows_count) const override {
const ColumnPtr col_ptr = block.get_by_position(arguments[0]).column;
auto res_column = ColumnString::create();
res_column->reserve(input_rows_count);
auto& res_data = res_column->get_chars();
auto& res_offsets = res_column->get_offsets();
res_data.reserve(input_rows_count * CODE_SIZE);
res_offsets.resize(input_rows_count);
for (size_t i = 0; i < input_rows_count; ++i) {
StringRef ref = col_ptr->get_data_at(i);
RETURN_IF_ERROR(calculate_soundex_and_insert(ref, res_data, res_offsets, i));
}
block.replace_by_position(result, std::move(res_column));
return Status::OK();
}
private:
Status calculate_soundex_and_insert(const StringRef& ref, ColumnString::Chars& chars,
ColumnString::Offsets& offsets, const size_t row) const {
uint32_t row_start = (row == 0) ? 0 : offsets[row - 1];
uint32_t expect_end = row_start + CODE_SIZE;
if (ref.size == 0) {
offsets[row] = row_start;
return Status::OK();
}
char pre_code = '\0';
for (size_t i = 0; i < ref.size; ++i) {
auto c = static_cast<unsigned char>(ref.data[i]);
if (c > 0x7f) {
return Status::InvalidArgument("soundex only supports ASCII, but got: {}",
ref.data[i]);
}
if (!std::isalpha(c)) {
continue;
}
c = static_cast<char>(std::toupper(c));
if (chars.size() == row_start) {
chars.push_back(c);
pre_code = (SOUNDEX_TABLE[c - 'A'] == 'N') ? '\0' : SOUNDEX_TABLE[c - 'A'];
} else if (char code = SOUNDEX_TABLE[c - 'A']; code != 'N') {
if (code != 'V' && code != pre_code) {
chars.push_back(code);
if (chars.size() == expect_end) {
offsets[row] = static_cast<ColumnString::Offset>(chars.size());
return Status::OK();
}
}
pre_code = code;
}
}
while (chars.size() != row_start && chars.size() < expect_end) {
chars.push_back('0');
}
offsets[row] = static_cast<ColumnString::Offset>(chars.size());
return Status::OK();
}
/** 1. If a vowel (A, E, I, O, U) separates two consonants that have the same soundex code
* the consonant to the right of the vowel is coded. Here we use 'V' to represent vowels.
* eg : **Tymczak** is coded as T-522 (T, 5 for the M, 2 for the C, Z ignored , 2 for the K).
* Since the vowel "A" separates the Z and K, the K is coded.
*
* 2. If "H" or "W" separate two consonants that have the same soundex code, the consonant to the right of the vowel is NOT coded.
* Here we use 'N' to represent these two characters.
* eg : **Ashcraft** is coded A-261 (A, 2 for the S, C ignored, 6 for the R, 1 for the F). It is not coded A-226.
*/
static constexpr char SOUNDEX_TABLE[26] = {'V', '1', '2', '3', 'V', '1', '2', 'N', 'V',
'2', '2', '4', '5', '5', 'V', '1', '2', '6',
'2', '3', 'V', '1', 'N', '2', 'V', '2'};
static constexpr uint8_t CODE_SIZE = 4;
};
void register_function_soundex(SimpleFunctionFactory& factory) {
factory.register_function<FunctionSoundex>();
}
#include "common/compile_check_end.h"
} // namespace doris::vectorized