blob: bd6ba1ea9ee0f23f0d0d5a7b6b7de50e775eaaf5 [file]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <cstdint>
#include <functional>
#include <map>
#include <memory>
#include <optional>
#include <string>
#include <vector>
#include "paimon/predicate/predicate.h"
#include "paimon/utils/roaring_bitmap64.h"
#include "paimon/visibility.h"
namespace paimon {
/// A configuration structure for full-text search operations.
struct PAIMON_EXPORT FullTextSearch {
/// Enumeration of supported full-text search types.
enum class SearchType {
/// All terms in the query must be present (AND semantics).
MATCH_ALL = 1,
/// Any term in the query can match (OR semantics).
MATCH_ANY = 2,
/// Matches the exact sequence of words (with proximity).
PHRASE = 3,
/// Matches terms starting with the given string (e.g., "run*" → running, runner).
PREFIX = 4,
/// Supports wildcards * and ? (e.g., "ap*e", "app?e" -> "apple").
WILDCARD = 5,
/// Default/fallback type for unrecognized or invalid queries.
UNKNOWN = 128
};
FullTextSearch(const std::string& _field_name, std::optional<int32_t> _limit,
const std::string& _query, const SearchType& _search_type,
const std::optional<RoaringBitmap64>& _pre_filter)
: field_name(_field_name),
limit(_limit),
query(_query),
search_type(_search_type),
pre_filter(_pre_filter) {}
std::shared_ptr<FullTextSearch> ReplacePreFilter(
const std::optional<RoaringBitmap64>& _pre_filter) const {
return std::make_shared<FullTextSearch>(field_name, limit, query, search_type, _pre_filter);
}
/// Name of the field to search within (must be a full-text indexed field).
std::string field_name;
/// Maximum number of documents to return. If set, limit ordered by top scores. Otherwise, no
/// score return.
std::optional<int32_t> limit;
/// The query string to search for. The interpretation depends on search_type:
///
/// - For MATCH_ALL/MATCH_ANY: keywords are split into terms using the **same analyzer as
/// indexing**.
/// Example: "Hello World" → terms ["hello", "world"] (after lowercasing and tokenization).
///
/// - For PHRASE: matches the exact word sequence (with optional slop). Also be analyzed.
///
/// - For PREFIX: matches terms starting with the given string (e.g., "run" → running, runner).
/// Only the prefix part is considered; analysis will not be applied.
///
/// - For WILDCARD: supports wildcards * and ? (e.g., "ap*e", "app?e").
/// Not passed through analyzer — matched directly against indexed terms.
///
/// @note Analyzer consistency between indexing and querying is critical for correctness.
std::string query;
/// Type of search to perform.
SearchType search_type;
/// A pre-filter based on **global row IDs**, implemented by leveraging another global index.
/// Only rows whose global row ID is present in `pre_filter` will be included during search.
/// If not set, all rows will be included.
std::optional<RoaringBitmap64> pre_filter;
};
} // namespace paimon