[Feat](udf) Support Python UDF for Doris (#57329)
### What problem does this PR solve?
Issue Number: close #xxx
Related PR: #xxx
Problem Summary:
Add Python UDF support to the Doris BE: new BE config options, a python environment scanner (conda/venv), a per-version worker process pool, and an Arrow Flight DoExchange channel for evaluating UDFs on Arrow RecordBatches.
### Release note
None
### Check List (For Author)
- Test <!-- At least one of them must be included. -->
- [ ] Regression test
- [ ] Unit Test
- [ ] Manual test (add detailed scripts or steps below)
- [ ] No need to test or manual test. Explain why:
- [ ] This is a refactor/code format and no logic has been changed.
- [ ] Previous test can cover this change.
- [ ] No code files have been changed.
- [ ] Other reason <!-- Add your reason? -->
- Behavior changed:
- [ ] No.
- [ ] Yes. <!-- Explain the behavior change -->
- Does this need documentation?
- [ ] No.
- [ ] Yes. <!-- Add document PR link here. eg:
https://github.com/apache/doris-website/pull/1214 -->
### Check List (For Reviewer who merge this PR)
- [ ] Confirm the release note
- [ ] Confirm test cases
- [ ] Confirm document
- [ ] Add branch pick label <!-- Add branch pick label that this PR
should merge into -->
Co-authored-by: yangshijie <jasonsjyang@tencent.com>

diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp
index c2ce46b..1aeccc9 100644
--- a/be/src/common/config.cpp
+++ b/be/src/common/config.cpp
@@ -35,6 +35,7 @@
#include <mutex>
#include <random>
#include <string>
+#include <string_view>
#include <utility>
#include <vector>
@@ -45,6 +46,7 @@
#include "io/fs/file_writer.h"
#include "io/fs/local_file_system.h"
#include "util/cpu_info.h"
+#include "util/string_util.h"
namespace doris::config {
#include "common/compile_check_avoid_begin.h"
@@ -1082,6 +1084,25 @@
// enable java udf and jdbc scannode
DEFINE_Bool(enable_java_support, "true");
+// enable python udf
+DEFINE_Bool(enable_python_udf_support, "false");
+// python env mode, options: conda, venv
+DEFINE_String(python_env_mode, "");
+// root path of the conda installation; used when python_env_mode is conda
+DEFINE_String(python_conda_root_path, "");
+// root path of the venv runtimes; used when python_env_mode is venv
+DEFINE_String(python_venv_root_path, "${DORIS_HOME}/lib/udf/python");
+// python interpreter paths used by venv, e.g. /usr/bin/python3.7:/usr/bin/python3.6
+DEFINE_String(python_venv_interpreter_paths, "");
+// package index URL for resolving python UDF dependencies
+DEFINE_String(python_deps_index_url, "https://pypi.org/simple/");
+// minimum number of python processes in the pool
+DEFINE_Int32(min_python_process_nums, "16");
+// maximum number of python processes in the pool
+DEFINE_Int32(max_python_process_nums, "256");
+// timeout in milliseconds when waiting for available python process
+DEFINE_Int32(python_process_pool_wait_timeout_ms, "30000");
+
// Set config randomly to check more issues in github workflow
DEFINE_Bool(enable_fuzzy_mode, "false");
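Taken together, the new BE flags form a small configuration surface. A minimal `be.conf` fragment for venv mode might look like this (the interpreter paths are illustrative; the remaining values are the defaults defined above):

```
enable_python_udf_support = true
python_env_mode = venv
python_venv_root_path = ${DORIS_HOME}/lib/udf/python
python_venv_interpreter_paths = /usr/bin/python3.9:/usr/bin/python3.8
python_deps_index_url = https://pypi.org/simple/
min_python_process_nums = 16
max_python_process_nums = 256
python_process_pool_wait_timeout_ms = 30000
```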
diff --git a/be/src/common/config.h b/be/src/common/config.h
index df83ae4..0d530ac 100644
--- a/be/src/common/config.h
+++ b/be/src/common/config.h
@@ -1113,6 +1113,25 @@
// enable java udf and jdbc scannode
DECLARE_Bool(enable_java_support);
+// enable python udf
+DECLARE_Bool(enable_python_udf_support);
+// python env mode, options: conda, venv
+DECLARE_String(python_env_mode);
+// root path of the conda installation; used when python_env_mode is conda
+DECLARE_String(python_conda_root_path);
+// root path of the venv runtimes; used when python_env_mode is venv
+DECLARE_String(python_venv_root_path);
+// python interpreter paths used by venv, e.g. /usr/bin/python3.7:/usr/bin/python3.6
+DECLARE_String(python_venv_interpreter_paths);
+// package index URL for resolving python UDF dependencies
+DECLARE_String(python_deps_index_url);
+// minimum number of python processes in the pool
+DECLARE_Int32(min_python_process_nums);
+// maximum number of python processes in the pool
+DECLARE_Int32(max_python_process_nums);
+// timeout in milliseconds when waiting for available python process
+DECLARE_Int32(python_process_pool_wait_timeout_ms);
+
// Set config randomly to check more issues in github workflow
DECLARE_Bool(enable_fuzzy_mode);
diff --git a/be/src/runtime/exec_env_init.cpp b/be/src/runtime/exec_env_init.cpp
index 78bdfa3..a175dd3 100644
--- a/be/src/runtime/exec_env_init.cpp
+++ b/be/src/runtime/exec_env_init.cpp
@@ -99,6 +99,7 @@
#include "service/backend_options.h"
#include "service/backend_service.h"
#include "service/point_query_executor.h"
+#include "udf/python/python_udf_server.h"
#include "util/bfd_parser.h"
#include "util/bit_util.h"
#include "util/brpc_client_cache.h"
@@ -889,6 +890,7 @@
_s_tracking_memory = false;
clear_storage_resource();
+ PythonUDFServerManager::instance().shutdown();
LOG(INFO) << "Doris exec envorinment is destoried.";
}
diff --git a/be/src/runtime/user_function_cache.cpp b/be/src/runtime/user_function_cache.cpp
index ce6453f..d54c2f4 100644
--- a/be/src/runtime/user_function_cache.cpp
+++ b/be/src/runtime/user_function_cache.cpp
@@ -20,6 +20,7 @@
// IWYU pragma: no_include <bthread/errno.h>
#include <errno.h> // IWYU pragma: keep
#include <glog/logging.h>
+#include <minizip/unzip.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
@@ -41,6 +42,7 @@
#include "io/fs/local_file_system.h"
#include "runtime/exec_env.h"
#include "runtime/plugin/cloud_plugin_downloader.h"
+#include "util/defer_op.h"
#include "util/dynamic_util.h"
#include "util/md5.h"
#include "util/string_util.h"
@@ -88,6 +90,9 @@
// And this is used to indicate whether library is downloaded.
bool is_downloaded = false;
+ // Indicates whether the zip file has been unzipped.
+ bool is_unzipped = false;
+
// used to lookup a symbol
void* lib_handle = nullptr;
@@ -144,9 +149,12 @@
lib_type = LibType::SO;
} else if (ends_with(file, ".jar")) {
lib_type = LibType::JAR;
+ } else if (ends_with(file, ".zip") && _check_cache_is_python_udf(dir, file)) {
+ lib_type = LibType::PY_ZIP;
} else {
return Status::InternalError(
- "unknown library file format. the file type is not end with xxx.jar or xxx.so : " +
+ "unknown library file format. the file type is not end with xxx.jar or xxx.so"
+ " or xxx.zip : " +
file);
}
@@ -249,15 +257,120 @@
RETURN_IF_ERROR(_download_lib(url, entry));
}
+ if (!entry->is_unzipped && entry->type == LibType::PY_ZIP) {
+ RETURN_IF_ERROR(_unzip_lib(entry->lib_file));
+ entry->lib_file = entry->lib_file.substr(0, entry->lib_file.size() - 4);
+ entry->is_unzipped = true;
+ }
+
if (entry->type == LibType::SO) {
RETURN_IF_ERROR(_load_cache_entry_internal(entry));
- } else if (entry->type != LibType::JAR) {
+ } else if (entry->type != LibType::JAR && entry->type != LibType::PY_ZIP) {
return Status::InvalidArgument(
- "Unsupported lib type! Make sure your lib type is one of 'so' and 'jar'!");
+ "Unsupported lib type! Make sure your lib type is one of 'so' and 'jar' and "
+ "python 'zip'!");
}
return Status::OK();
}
+Status UserFunctionCache::_check_cache_is_python_udf(const std::string& dir,
+ const std::string& file) {
+ const std::string& full_path = dir + "/" + file;
+ RETURN_IF_ERROR(_unzip_lib(full_path));
+ std::string unzip_dir = full_path.substr(0, full_path.size() - 4);
+
+ bool has_python_file = false;
+
+ auto scan_cb = [&has_python_file](const io::FileInfo& file) {
+ if (file.is_file && ends_with(file.file_name, ".py")) {
+ has_python_file = true;
+ return false; // Stop iteration once we find a Python file
+ }
+ return true;
+ };
+ RETURN_IF_ERROR(io::global_local_filesystem()->iterate_directory(unzip_dir, scan_cb));
+ if (!has_python_file) {
+ return Status::InternalError("No Python file found in the unzipped directory.");
+ }
+ return Status::OK();
+}
+
+Status UserFunctionCache::_unzip_lib(const std::string& zip_file) {
+ std::string unzip_dir = zip_file.substr(0, zip_file.size() - 4);
+ RETURN_IF_ERROR(io::global_local_filesystem()->create_directory(unzip_dir));
+
+ unzFile zip_file_handle = unzOpen(zip_file.c_str());
+ if (zip_file_handle == nullptr) {
+ return Status::InternalError("Failed to open zip file: " + zip_file);
+ }
+
+ Defer defer([&] { unzClose(zip_file_handle); });
+
+ unz_global_info global_info;
+ if (unzGetGlobalInfo(zip_file_handle, &global_info) != UNZ_OK) {
+ return Status::InternalError("Failed to get global info from zip file: " + zip_file);
+ }
+
+ for (uLong i = 0; i < global_info.number_entry; ++i) {
+ unz_file_info file_info;
+ char filename[256];
+ if (unzGetCurrentFileInfo(zip_file_handle, &file_info, filename, sizeof(filename), nullptr,
+ 0, nullptr, 0) != UNZ_OK) {
+ return Status::InternalError("Failed to get file info from zip file: " + zip_file);
+ }
+
+ if (std::string(filename).find("__MACOSX") != std::string::npos) {
+ if ((i + 1) < global_info.number_entry) {
+ if (unzGoToNextFile(zip_file_handle) != UNZ_OK) {
+ return Status::InternalError("Failed to go to next file in zip: " + zip_file);
+ }
+ }
+ continue;
+ }
+
+ std::string full_filename = unzip_dir + "/" + filename;
+ if (full_filename.length() > PATH_MAX) {
+ return Status::InternalError(
+ fmt::format("File path {}... is too long, maximum path length is {}",
+ full_filename.substr(0, 50), PATH_MAX));
+ }
+
+ if (filename[strlen(filename) - 1] == '/') {
+ RETURN_IF_ERROR(io::global_local_filesystem()->create_directory(full_filename));
+ } else {
+ if (unzOpenCurrentFile(zip_file_handle) != UNZ_OK) {
+ return Status::InternalError("Failed to open file in zip: " +
+ std::string(filename));
+ }
+
+ FILE* out = fopen(full_filename.c_str(), "wb");
+ if (out == nullptr) {
+ unzCloseCurrentFile(zip_file_handle);
+ return Status::InternalError("Failed to create file: " + full_filename);
+ }
+ char buffer[8192];
+ int bytes_read;
+ while ((bytes_read = unzReadCurrentFile(zip_file_handle, buffer, sizeof(buffer))) > 0) {
+ fwrite(buffer, bytes_read, 1, out);
+ }
+ fclose(out);
+ unzCloseCurrentFile(zip_file_handle);
+ if (bytes_read < 0) {
+ return Status::InternalError("Failed to read file in zip: " +
+ std::string(filename));
+ }
+ }
+
+ if ((i + 1) < global_info.number_entry) {
+ if (unzGoToNextFile(zip_file_handle) != UNZ_OK) {
+ return Status::InternalError("Failed to go to next file in zip: " + zip_file);
+ }
+ }
+ }
+
+ return Status::OK();
+}
+
// entry's lock must be held
Status UserFunctionCache::_download_lib(const std::string& url,
std::shared_ptr<UserFunctionCacheEntry> entry) {
@@ -348,6 +461,8 @@
ss << _lib_dir << '/' << shard << '/' << function_id << '.' << checksum;
if (type == LibType::JAR) {
ss << '.' << file_name;
+ } else if (type == LibType::PY_ZIP) {
+ ss << '.' << file_name;
} else {
ss << ".so";
}
@@ -362,6 +477,14 @@
return Status::OK();
}
+Status UserFunctionCache::get_pypath(int64_t fid, const std::string& url,
+ const std::string& checksum, std::string* libpath) {
+ std::shared_ptr<UserFunctionCacheEntry> entry = nullptr;
+ RETURN_IF_ERROR(_get_cache_entry(fid, url, checksum, entry, LibType::PY_ZIP));
+ *libpath = entry->lib_file;
+ return Status::OK();
+}
+
std::vector<std::string> UserFunctionCache::_split_string_by_checksum(const std::string& file) {
std::vector<std::string> result;
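For context, a sketch of how the new entry point is meant to be consumed, assuming the existing `UserFunctionCache::instance()` accessor (the fid, URL, and checksum below are illustrative):

```cpp
// Resolve the local directory of an unzipped Python UDF package.
// fid/url/checksum come from the function's catalog metadata.
std::string pypath;
Status st = UserFunctionCache::instance()->get_pypath(
        /*fid=*/10001, "http://fe-host:8030/udf/my_udf.zip",
        "a1b2c3d4", &pypath);
if (st.ok()) {
    // pypath now points at the extracted directory (the cached
    // "<fid>.<checksum>.my_udf.zip" with the ".zip" suffix stripped).
}
```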
diff --git a/be/src/runtime/user_function_cache.h b/be/src/runtime/user_function_cache.h
index f5a04a5..1596f4c 100644
--- a/be/src/runtime/user_function_cache.h
+++ b/be/src/runtime/user_function_cache.h
@@ -43,7 +43,7 @@
// with id, this function library is valid. And when user wants to
// change its implementation(URL), Doris will generate a new function
// id.
-enum class LibType { JAR, SO };
+enum class LibType { JAR, SO, PY_ZIP };
class UserFunctionCache {
public:
@@ -59,6 +59,9 @@
Status get_jarpath(int64_t fid, const std::string& url, const std::string& checksum,
std::string* libpath);
+ Status get_pypath(int64_t fid, const std::string& url, const std::string& checksum,
+ std::string* libpath);
+
private:
Status _load_cached_lib();
Status _load_entry_from_lib(const std::string& dir, const std::string& file);
@@ -66,6 +69,14 @@
std::shared_ptr<UserFunctionCacheEntry>& output_entry, LibType type);
Status _load_cache_entry(const std::string& url, std::shared_ptr<UserFunctionCacheEntry> entry);
Status _download_lib(const std::string& url, std::shared_ptr<UserFunctionCacheEntry> entry);
+ /**
+ * Unzip the user-uploaded python UDF package.
+ */
+ Status _unzip_lib(const std::string& file);
+ /**
+ * Check whether the cached zip file is a python UDF package,
+ * i.e. contains at least one .py file after unzipping.
+ */
+ Status _check_cache_is_python_udf(const std::string& dir, const std::string& file);
Status _load_cache_entry_internal(std::shared_ptr<UserFunctionCacheEntry> entry);
std::string _make_lib_file(int64_t function_id, const std::string& checksum, LibType type,
diff --git a/be/src/service/doris_main.cpp b/be/src/service/doris_main.cpp
index 401456c..69154b6 100644
--- a/be/src/service/doris_main.cpp
+++ b/be/src/service/doris_main.cpp
@@ -24,6 +24,7 @@
// IWYU pragma: no_include <bthread/errno.h>
#include <errno.h> // IWYU pragma: keep
#include <fcntl.h>
+#include <fmt/core.h>
#if !defined(__SANITIZE_ADDRESS__) && !defined(ADDRESS_SANITIZER) && !defined(LEAK_SANITIZER) && \
!defined(THREAD_SANITIZER) && !defined(USE_JEMALLOC)
#include <gperftools/malloc_extension.h> // IWYU pragma: keep
@@ -76,9 +77,11 @@
#include "service/backend_service.h"
#include "service/brpc_service.h"
#include "service/http_service.h"
+#include "udf/python/python_env.h"
#include "util/debug_util.h"
#include "util/disk_info.h"
#include "util/mem_info.h"
+#include "util/string_util.h"
#include "util/thrift_rpc_helper.h"
#include "util/thrift_server.h"
#include "util/uid_util.h"
@@ -499,6 +502,70 @@
}
}
+ if (doris::config::enable_python_udf_support) {
+ if (std::string python_udf_root_path =
+ fmt::format("{}/lib/udf/python", std::getenv("DORIS_HOME"));
+ !std::filesystem::exists(python_udf_root_path)) {
+ std::filesystem::create_directories(python_udf_root_path);
+ }
+
+ // Normalize and trim all Python-related config parameters
+ std::string python_env_mode =
+ std::string(doris::trim(doris::to_lower(doris::config::python_env_mode)));
+ std::string python_conda_root_path =
+ std::string(doris::trim(doris::config::python_conda_root_path));
+ std::string python_venv_root_path =
+ std::string(doris::trim(doris::config::python_venv_root_path));
+ std::string python_venv_interpreter_paths =
+ std::string(doris::trim(doris::config::python_venv_interpreter_paths));
+
+ if (python_env_mode == "conda") {
+ if (python_conda_root_path.empty()) {
+ LOG(ERROR)
+ << "Python conda root path is empty, please set `python_conda_root_path` "
+ "or set `enable_python_udf_support` to `false`";
+ exit(1);
+ }
+ LOG(INFO) << "Doris backend python version manager is initialized. Python conda "
+ "root path: "
+ << python_conda_root_path;
+ status = doris::PythonVersionManager::instance().init(doris::PythonEnvType::CONDA,
+ python_conda_root_path, "");
+ } else if (python_env_mode == "venv") {
+ if (python_venv_root_path.empty()) {
+ LOG(ERROR)
+ << "Python venv root path is empty, please set `python_venv_root_path` or "
+ "set `enable_python_udf_support` to `false`";
+ exit(1);
+ }
+ if (python_venv_interpreter_paths.empty()) {
+ LOG(ERROR)
+ << "Python interpreter paths is empty, please set "
+ "`python_venv_interpreter_paths` or set `enable_python_udf_support` to "
+ "`false`";
+ exit(1);
+ }
+ LOG(INFO) << "Doris backend python version manager is initialized. Python venv "
+ "root path: "
+ << python_venv_root_path
+ << ", python interpreter paths: " << python_venv_interpreter_paths;
+ status = doris::PythonVersionManager::instance().init(doris::PythonEnvType::VENV,
+ python_venv_root_path,
+ python_venv_interpreter_paths);
+ } else {
+ status = Status::InvalidArgument(
+ "Python env mode is invalid, should be `conda` or `venv`. If you don't want to "
+ "enable the Python UDF function, please set `enable_python_udf_support` to "
+ "`false`");
+ }
+
+ if (!status.ok()) {
+ LOG(ERROR) << "Failed to initialize python version manager: " << status;
+ exit(1);
+ }
+ LOG(INFO) << doris::PythonVersionManager::instance().to_string();
+ }
+
// Doris own signal handler must be register after jvm is init.
// Or our own sig-handler for SIGINT & SIGTERM will not be chained ...
// https://www.oracle.com/java/technologies/javase/signals.html
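For completeness, the conda counterpart of the venv setup validated above can be sketched as the following `be.conf` fragment (the root path is illustrative); environments are then discovered under `<python_conda_root_path>/envs/` by the scanner added below:

```
enable_python_udf_support = true
python_env_mode = conda
python_conda_root_path = /opt/miniconda3
```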
diff --git a/be/src/udf/CMakeLists.txt b/be/src/udf/CMakeLists.txt
index 60ea86c..34e6eec 100755
--- a/be/src/udf/CMakeLists.txt
+++ b/be/src/udf/CMakeLists.txt
@@ -20,7 +20,13 @@
# where to put generated binaries
set(EXECUTABLE_OUTPUT_PATH "${BUILD_DIR}/src/udf")
+set(UDF_SOURCES udf.cpp)
+
+file(GLOB PYTHON_UDF_SOURCES "python/*.cpp")
+
+list(APPEND UDF_SOURCES ${PYTHON_UDF_SOURCES})
+
# Build this library twice. Once to be linked into the main Doris. This version
# can have dependencies on our other libs. The second version is shipped as part
# of the UDF sdk, which can't use other libs.
-add_library(Udf STATIC udf.cpp)
\ No newline at end of file
+add_library(Udf STATIC ${UDF_SOURCES})
diff --git a/be/src/udf/python/python_env.cpp b/be/src/udf/python/python_env.cpp
new file mode 100644
index 0000000..0b29be8
--- /dev/null
+++ b/be/src/udf/python/python_env.cpp
@@ -0,0 +1,290 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "python_env.h"
+
+#include <fmt/core.h>
+
+#include <filesystem>
+#include <memory>
+#include <regex>
+#include <vector>
+
+#include "common/status.h"
+#include "udf/python/python_udf_server.h"
+#include "util/string_util.h"
+
+namespace doris {
+
+namespace fs = std::filesystem;
+
+// Extract the python version by running `python --version`, e.g. "3.9.16" from "Python 3.9.16".
+// @param python_path: path to python executable, e.g. "/opt/miniconda3/envs/myenv/bin/python"
+// @param version: extracted python version, e.g. "3.9.16"
+static Status extract_python_version(const std::string& python_path, std::string* version) {
+ static std::regex python_version_re(R"(^Python (\d+\.\d+\.\d+))");
+
+ if (!fs::exists(python_path)) {
+ return Status::NotFound("Python executable not found: {}", python_path);
+ }
+
+ std::string cmd = fmt::format("\"{}\" --version", python_path);
+ FILE* pipe = popen(cmd.c_str(), "r");
+ if (!pipe) {
+ return Status::InternalError("Failed to run: {}", cmd);
+ }
+
+ std::string result;
+ char buf[128];
+ while (fgets(buf, sizeof(buf), pipe)) {
+ result += buf;
+ }
+ pclose(pipe);
+
+ std::smatch match;
+ if (std::regex_search(result, match, python_version_re)) {
+ *version = match[1].str();
+ return Status::OK();
+ }
+
+ return Status::InternalError("Failed to extract Python version from path: {}, result: {}",
+ python_path, result);
+}
+
+PythonEnvironment::PythonEnvironment(const std::string& name, const PythonVersion& python_version)
+ : env_name(name), python_version(python_version) {}
+
+std::string PythonEnvironment::to_string() const {
+ return fmt::format(
+ "[env_name: {}, env_base_path: {}, python_base_path: {}, python_full_version: {}]",
+ env_name, python_version.base_path, python_version.executable_path,
+ python_version.full_version);
+}
+
+bool PythonEnvironment::is_valid() const {
+ if (!python_version.is_valid()) return false;
+
+ auto perms = fs::status(python_version.executable_path).permissions();
+ if ((perms & fs::perms::owner_exec) == fs::perms::none) {
+ return false;
+ }
+
+ std::string version;
+ if (!extract_python_version(python_version.executable_path, &version).ok()) {
+ LOG(WARNING) << "Failed to extract python version from path: "
+ << python_version.executable_path;
+ return false;
+ }
+
+ return python_version.full_version == version;
+}
+
+// Scan for python environments under the {conda_root_path}/envs directory.
+Status PythonEnvironment::scan_from_conda_root_path(const fs::path& conda_root_path,
+ std::vector<PythonEnvironment>* environments) {
+ DCHECK(!conda_root_path.empty() && environments != nullptr);
+
+ fs::path envs_dir = conda_root_path / "envs";
+ if (!fs::exists(envs_dir) || !fs::is_directory(envs_dir)) {
+ return Status::NotFound("Conda envs directory not found: {}", envs_dir.string());
+ }
+
+ for (const auto& entry : fs::directory_iterator(envs_dir)) {
+ if (!entry.is_directory()) continue;
+
+ std::string env_name = entry.path().filename(); // e.g. "myenv"
+ std::string env_base_path = entry.path(); // e.g. "/opt/miniconda3/envs/myenv"
+ std::string python_path =
+ env_base_path + "/bin/python"; // e.g. "/{env_base_path}/bin/python"
+ std::string python_full_version; // e.g. "3.9.16"
+ RETURN_IF_ERROR(extract_python_version(python_path, &python_full_version));
+ size_t pos = python_full_version.find_last_of('.');
+
+ if (UNLIKELY(pos == std::string::npos)) {
+ return Status::InvalidArgument("Invalid python version: {}", python_full_version);
+ }
+
+ PythonVersion python_version(python_full_version, env_base_path, python_path);
+ PythonEnvironment conda_env(env_name, python_version);
+
+ if (UNLIKELY(!conda_env.is_valid())) {
+ LOG(WARNING) << "Invalid conda environment: " << conda_env.to_string();
+ continue;
+ }
+
+ environments->push_back(std::move(conda_env));
+ }
+
+ if (environments->empty()) {
+ return Status::NotFound("No conda python environments found");
+ }
+
+ return Status::OK();
+}
+
+Status PythonEnvironment::scan_from_venv_root_path(
+ const fs::path& venv_root_path, const std::vector<std::string>& interpreter_paths,
+ std::vector<PythonEnvironment>* environments) {
+ DCHECK(!venv_root_path.empty() && environments != nullptr);
+
+ for (const auto& interpreter_path : interpreter_paths) {
+ if (!fs::exists(interpreter_path) || !fs::is_regular_file(interpreter_path)) {
+ return Status::NotFound("Interpreter path not found: {}", interpreter_path);
+ }
+ std::string python_full_version;
+ RETURN_IF_ERROR(extract_python_version(interpreter_path, &python_full_version));
+ size_t pos = python_full_version.find_last_of('.');
+ if (UNLIKELY(pos == std::string::npos)) {
+ return Status::InvalidArgument("Invalid python version: {}", python_full_version);
+ }
+ // Extract major.minor version (e.g., "3.12" from "3.12.0")
+ std::string python_major_minor_version = python_full_version.substr(0, pos);
+
+ std::string env_name = fmt::format("python{}", python_full_version); // e.g. "python3.9.16"
+ std::string env_base_path = fmt::format("{}/{}", venv_root_path.string(),
+ env_name); // e.g. "/opt/venv/python3.9.16"
+ std::string python_path =
+ fmt::format("{}/bin/python", env_base_path); // e.g. "/{venv_base_path}/bin/python"
+
+ if (!fs::exists(env_base_path) || !fs::exists(python_path)) {
+ fs::create_directories(env_base_path);
+ std::string create_venv_cmd =
+ fmt::format("{} -m venv {}", interpreter_path, env_base_path);
+
+ if (system(create_venv_cmd.c_str()) != 0 || !fs::exists(python_path)) {
+ return Status::RuntimeError("Failed to create python virtual environment, cmd: {}",
+ create_venv_cmd);
+ }
+ }
+
+ // Use major.minor version for site-packages path (e.g., "python3.12")
+ std::string python_dependency_path = fmt::format("{}/lib/python{}/site-packages",
+ env_base_path, python_major_minor_version);
+
+ if (!fs::exists(python_dependency_path)) {
+ return Status::NotFound("Python dependency path not found: {}", python_dependency_path);
+ }
+
+ PythonVersion python_version(python_full_version, env_base_path, python_path);
+ PythonEnvironment venv_env(env_name, python_version);
+
+ if (UNLIKELY(!venv_env.is_valid())) {
+ LOG(WARNING) << "Invalid venv environment: " << venv_env.to_string();
+ continue;
+ }
+
+ environments->push_back(std::move(venv_env));
+ }
+
+ if (environments->empty()) {
+ return Status::NotFound("No venv python environments found");
+ }
+
+ return Status::OK();
+}
+
+Status PythonEnvScanner::get_versions(std::vector<PythonVersion>* versions) const {
+ DCHECK(versions != nullptr);
+ if (_envs.empty()) {
+ return Status::InternalError("not found available version");
+ }
+ for (const auto& env : _envs) {
+ versions->push_back(env.python_version);
+ }
+ return Status::OK();
+}
+
+Status PythonEnvScanner::get_version(const std::string& runtime_version,
+ PythonVersion* version) const {
+ if (_envs.empty()) {
+ return Status::InternalError("not found available version");
+ }
+ std::string_view runtime_version_view(runtime_version);
+ runtime_version_view = trim(runtime_version_view);
+ for (const auto& env : _envs) {
+ if (env.python_version.full_version == runtime_version_view) {
+ *version = env.python_version;
+ return Status::OK();
+ }
+ }
+ return Status::NotFound("not found runtime version: {}", runtime_version);
+}
+
+Status CondaEnvScanner::scan() {
+ RETURN_IF_ERROR(PythonEnvironment::scan_from_conda_root_path(_env_root_path, &_envs));
+ return Status::OK();
+}
+
+std::string CondaEnvScanner::to_string() const {
+ std::stringstream ss;
+ ss << "Conda environments: ";
+ for (const auto& conda_env : _envs) {
+ ss << conda_env.to_string() << ", ";
+ }
+ return ss.str();
+}
+
+Status VenvEnvScanner::scan() {
+ RETURN_IF_ERROR(PythonEnvironment::scan_from_venv_root_path(_env_root_path, _interpreter_paths,
+ &_envs));
+ return Status::OK();
+}
+
+std::string VenvEnvScanner::to_string() const {
+ std::stringstream ss;
+ ss << "Venv environments: ";
+ for (const auto& venv_env : _envs) {
+ ss << venv_env.to_string() << ", ";
+ }
+ return ss.str();
+}
+
+Status PythonVersionManager::init(PythonEnvType env_type, const fs::path& python_root_path,
+ const std::string& python_venv_interpreter_paths) {
+ switch (env_type) {
+ case PythonEnvType::CONDA: {
+ if (!fs::exists(python_root_path) || !fs::is_directory(python_root_path)) {
+ return Status::InvalidArgument("Invalid conda root path: {}",
+ python_root_path.string());
+ }
+ _env_scanner = std::make_unique<CondaEnvScanner>(python_root_path);
+ break;
+ }
+ case PythonEnvType::VENV: {
+ if (!fs::exists(python_root_path) || !fs::is_directory(python_root_path)) {
+ return Status::InvalidArgument("Invalid venv root path: {}", python_root_path.string());
+ }
+ std::vector<std::string> interpreter_paths = split(python_venv_interpreter_paths, ":");
+ if (interpreter_paths.empty()) {
+ return Status::InvalidArgument("Invalid python interpreter paths: {}",
+ python_venv_interpreter_paths);
+ }
+ _env_scanner = std::make_unique<VenvEnvScanner>(python_root_path, interpreter_paths);
+ break;
+ }
+ default:
+ return Status::NotSupported("Unsupported python runtime type: {}",
+ static_cast<int>(env_type));
+ }
+ std::vector<PythonVersion> versions;
+ RETURN_IF_ERROR(_env_scanner->scan());
+ RETURN_IF_ERROR(_env_scanner->get_versions(&versions));
+ RETURN_IF_ERROR(PythonUDFServerManager::instance().init(versions));
+ return Status::OK();
+}
+
+} // namespace doris
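Once `init()` succeeds, later lookups only need the UDF's runtime version string; a minimal sketch:

```cpp
// Look up a scanned interpreter by its full version string.
doris::PythonVersion version;
doris::Status st = doris::PythonVersionManager::instance().get_version("3.9.16", &version);
if (st.ok()) {
    LOG(INFO) << "python executable: " << version.get_executable_path();
}
```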
diff --git a/be/src/udf/python/python_env.h b/be/src/udf/python/python_env.h
new file mode 100644
index 0000000..4d3a5ac
--- /dev/null
+++ b/be/src/udf/python/python_env.h
@@ -0,0 +1,164 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <filesystem>
+
+#include "common/status.h"
+
+namespace doris {
+
+namespace fs = std::filesystem;
+
+enum class PythonEnvType { CONDA, VENV };
+
+struct PythonVersion {
+ std::string full_version; // e.g. "3.9.16"
+ std::string base_path; // e.g. "/root/anaconda3/envs/python3.9"
+ std::string executable_path; // e.g. "{base_path}/bin/python3"
+
+ PythonVersion() = default;
+
+ explicit PythonVersion(std::string full_version, std::string base_path,
+ std::string executable_path)
+ : full_version(std::move(full_version)),
+ base_path(std::move(base_path)),
+ executable_path(std::move(executable_path)) {}
+
+ bool operator==(const PythonVersion& other) const {
+ return full_version == other.full_version && base_path == other.base_path &&
+ executable_path == other.executable_path;
+ }
+
+ const std::string& get_base_path() const { return base_path; }
+
+ const std::string& get_executable_path() const { return executable_path; }
+
+ bool is_valid() const {
+ return !full_version.empty() && !base_path.empty() && !executable_path.empty() &&
+ fs::exists(base_path) && fs::exists(executable_path);
+ }
+
+ std::string to_string() const {
+ return fmt::format("[full_version: {}, base_path: {}, executable_path: {}]", full_version,
+ base_path, executable_path);
+ }
+};
+
+struct PythonEnvironment {
+ std::string env_name; // e.g. "base" or "myenv"
+ PythonVersion python_version;
+
+ PythonEnvironment(const std::string& name, const PythonVersion& python_version);
+
+ std::string to_string() const;
+
+ bool is_valid() const;
+
+ static Status scan_from_conda_root_path(const fs::path& conda_root_path,
+ std::vector<PythonEnvironment>* environments);
+
+ static Status scan_from_venv_root_path(const fs::path& venv_root_path,
+ const std::vector<std::string>& interpreter_paths,
+ std::vector<PythonEnvironment>* environments);
+};
+
+class PythonEnvScanner {
+public:
+ PythonEnvScanner(const fs::path& env_root_path) : _env_root_path(env_root_path) {}
+
+ virtual ~PythonEnvScanner() = default;
+
+ virtual Status scan() = 0;
+
+ Status get_versions(std::vector<PythonVersion>* versions) const;
+
+ Status get_version(const std::string& runtime_version, PythonVersion* version) const;
+
+ std::string root_path() const { return _env_root_path.string(); }
+
+ virtual PythonEnvType env_type() const = 0;
+
+ virtual std::string to_string() const = 0;
+
+protected:
+ fs::path _env_root_path;
+ std::vector<PythonEnvironment> _envs;
+};
+
+class CondaEnvScanner : public PythonEnvScanner {
+public:
+ CondaEnvScanner(const fs::path& python_root_path) : PythonEnvScanner(python_root_path) {}
+
+ ~CondaEnvScanner() override = default;
+
+ Status scan() override;
+
+ std::string to_string() const override;
+
+ PythonEnvType env_type() const override { return PythonEnvType::CONDA; }
+};
+
+class VenvEnvScanner : public PythonEnvScanner {
+public:
+ VenvEnvScanner(const fs::path& python_root_path,
+ const std::vector<std::string>& interpreter_paths)
+ : PythonEnvScanner(python_root_path), _interpreter_paths(interpreter_paths) {}
+
+ ~VenvEnvScanner() override = default;
+
+ Status scan() override;
+
+ std::string to_string() const override;
+
+ PythonEnvType env_type() const override { return PythonEnvType::VENV; }
+
+private:
+ std::vector<std::string> _interpreter_paths;
+};
+
+class PythonVersionManager {
+public:
+ static PythonVersionManager& instance() {
+ static PythonVersionManager instance;
+ return instance;
+ }
+
+ Status init(PythonEnvType env_type, const fs::path& python_root_path,
+ const std::string& python_venv_interpreter_paths);
+
+ Status get_version(const std::string& runtime_version, PythonVersion* version) const {
+ return _env_scanner->get_version(runtime_version, version);
+ }
+
+ std::string to_string() const { return _env_scanner->to_string(); }
+
+private:
+ std::unique_ptr<PythonEnvScanner> _env_scanner;
+};
+
+} // namespace doris
+
+namespace std {
+template <>
+struct hash<doris::PythonVersion> {
+ size_t operator()(const doris::PythonVersion& v) const noexcept {
+ return hash<string> {}(v.full_version);
+ }
+};
+} // namespace std
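The `std::hash` specialization above (together with `operator==`) lets `PythonVersion` key the per-version process-pool map used later in this patch; a minimal sketch:

```cpp
#include <unordered_map>

// PythonVersion can key an unordered_map: equality compares all three
// fields, while the hash uses full_version only.
std::unordered_map<doris::PythonVersion, int> pools;
doris::PythonVersion v("3.9.16", "/opt/venv/python3.9.16",
                       "/opt/venv/python3.9.16/bin/python");
pools[v] = 1;
```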
diff --git a/be/src/udf/python/python_udf_client.cpp b/be/src/udf/python/python_udf_client.cpp
new file mode 100644
index 0000000..8164d9f
--- /dev/null
+++ b/be/src/udf/python/python_udf_client.cpp
@@ -0,0 +1,125 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "udf/python/python_udf_client.h"
+
+#include <utility>
+
+#include "arrow/flight/client.h"
+#include "arrow/flight/server.h"
+#include "common/status.h"
+#include "udf/python/python_udf_meta.h"
+#include "udf/python/python_udf_runtime.h"
+#include "util/arrow/utils.h"
+
+namespace doris {
+
+Status PythonUDFClient::create(const PythonUDFMeta& func_meta, ProcessPtr process,
+ PythonUDFClientPtr* client) {
+ PythonUDFClientPtr python_udf_client = std::make_shared<PythonUDFClient>();
+ RETURN_IF_ERROR(python_udf_client->init(func_meta, std::move(process)));
+ *client = std::move(python_udf_client);
+ return Status::OK();
+}
+
+Status PythonUDFClient::init(const PythonUDFMeta& func_meta, ProcessPtr process) {
+ if (_inited) {
+ return Status::InternalError("PythonUDFClient has already been initialized");
+ }
+ arrow::flight::Location location;
+ RETURN_DORIS_STATUS_IF_RESULT_ERROR(location,
+ arrow::flight::Location::Parse(process->get_uri()));
+ RETURN_DORIS_STATUS_IF_RESULT_ERROR(_arrow_client, FlightClient::Connect(location));
+ std::string command;
+ RETURN_IF_ERROR(func_meta.serialize_to_json(&command));
+ FlightDescriptor descriptor = FlightDescriptor::Command(command);
+ arrow::flight::FlightClient::DoExchangeResult exchange_res;
+ RETURN_DORIS_STATUS_IF_RESULT_ERROR(exchange_res, _arrow_client->DoExchange(descriptor));
+ _reader = std::move(exchange_res.reader);
+ _writer = std::move(exchange_res.writer);
+ _process = std::move(process);
+ _inited = true;
+ return Status::OK();
+}
+
+Status PythonUDFClient::evaluate(const arrow::RecordBatch& input,
+ std::shared_ptr<arrow::RecordBatch>* output) {
+ if (!_process->is_alive()) {
+ return Status::RuntimeError("Python UDF process is not alive");
+ }
+
+ // Step 1: Begin exchange with schema (only once)
+ if (UNLIKELY(!_begin)) {
+ auto begin_res = _writer->Begin(input.schema());
+ if (!begin_res.ok()) {
+ return handle_error(begin_res);
+ }
+ _begin = true;
+ }
+
+ // Step 2: Write the record batch to server
+ auto write_res = _writer->WriteRecordBatch(input);
+ if (!write_res.ok()) {
+ return handle_error(write_res);
+ }
+
+ // Step 3: Read response from server
+ auto read_res = _reader->Next();
+ if (!read_res.ok()) {
+ return handle_error(read_res.status());
+ }
+
+ arrow::flight::FlightStreamChunk chunk = std::move(*read_res);
+ if (!chunk.data) {
+ _process->shutdown();
+ return Status::InternalError("Received empty RecordBatch from Python UDF server");
+ }
+ *output = std::move(chunk.data);
+ return Status::OK();
+}
+
+Status PythonUDFClient::handle_error(arrow::Status status) {
+ DCHECK(!status.ok());
+ _writer.reset();
+ _reader.reset();
+ _process->shutdown();
+ std::string msg = status.message();
+ size_t pos = msg.find("The above exception was the direct cause");
+ if (pos != std::string::npos) {
+ msg = msg.substr(0, pos);
+ }
+ return Status::RuntimeError(trim(msg));
+}
+
+Status PythonUDFClient::close() {
+ if (!_inited || !_writer) return Status::OK();
+ auto writer_res = _writer->Close();
+ if (!writer_res.ok()) {
+ return handle_error(writer_res);
+ }
+ _inited = false;
+ _begin = false;
+ _arrow_client.reset();
+ _writer.reset();
+ _reader.reset();
+ if (auto* pool = _process->pool(); pool) {
+ pool->return_process(std::move(_process));
+ }
+ return Status::OK();
+}
+
+} // namespace doris
\ No newline at end of file
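A sketch of the intended client lifecycle (`func_meta`, `process`, and `input_batch` are assumed to come from the surrounding runtime):

```cpp
// Create a client bound to one pooled worker process, stream Arrow batches
// through the DoExchange channel, then close (which returns the process).
doris::PythonUDFClientPtr client;
RETURN_IF_ERROR(doris::PythonUDFClient::create(func_meta, std::move(process), &client));

std::shared_ptr<arrow::RecordBatch> output;
RETURN_IF_ERROR(client->evaluate(*input_batch, &output));

RETURN_IF_ERROR(client->close());
```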
diff --git a/be/src/udf/python/python_udf_client.h b/be/src/udf/python/python_udf_client.h
new file mode 100644
index 0000000..9d88b79
--- /dev/null
+++ b/be/src/udf/python/python_udf_client.h
@@ -0,0 +1,70 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <arrow/status.h>
+
+#include "arrow/flight/client.h"
+#include "common/status.h"
+#include "udf/python/python_udf_meta.h"
+#include "udf/python/python_udf_runtime.h"
+#include "util/arrow/utils.h"
+
+namespace doris {
+
+class PythonUDFClient;
+class PythonUDFProcessPool;
+
+using PythonUDFClientPtr = std::shared_ptr<PythonUDFClient>;
+
+class PythonUDFClient {
+public:
+ using FlightDescriptor = arrow::flight::FlightDescriptor;
+ using FlightClient = arrow::flight::FlightClient;
+ using FlightStreamWriter = arrow::flight::FlightStreamWriter;
+ using FlightStreamReader = arrow::flight::FlightStreamReader;
+
+ PythonUDFClient() = default;
+
+ ~PythonUDFClient() = default;
+
+ static Status create(const PythonUDFMeta& func_meta, ProcessPtr process,
+ PythonUDFClientPtr* client);
+
+ Status init(const PythonUDFMeta& func_meta, ProcessPtr process);
+
+ Status evaluate(const arrow::RecordBatch& input, std::shared_ptr<arrow::RecordBatch>* output);
+
+ Status close();
+
+ Status handle_error(arrow::Status status);
+
+ std::string print_process() const { return _process->to_string(); }
+
+private:
+ DISALLOW_COPY_AND_ASSIGN(PythonUDFClient);
+
+ bool _inited = false;
+ bool _begin = false;
+ std::unique_ptr<FlightClient> _arrow_client;
+ std::unique_ptr<FlightStreamWriter> _writer;
+ std::unique_ptr<FlightStreamReader> _reader;
+ ProcessPtr _process;
+};
+
+} // namespace doris
\ No newline at end of file
diff --git a/be/src/udf/python/python_udf_meta.cpp b/be/src/udf/python/python_udf_meta.cpp
new file mode 100644
index 0000000..ba3105d
--- /dev/null
+++ b/be/src/udf/python/python_udf_meta.cpp
@@ -0,0 +1,179 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "udf/python/python_udf_meta.h"
+
+#include <arrow/util/base64.h>
+#include <fmt/core.h>
+#include <rapidjson/stringbuffer.h>
+#include <rapidjson/writer.h>
+
+#include <sstream>
+
+#include "common/status.h"
+#include "util/arrow/utils.h"
+#include "util/string_util.h"
+
+namespace doris {
+
+Status PythonUDFMeta::convert_types_to_schema(const vectorized::DataTypes& types,
+ const std::string& timezone,
+ std::shared_ptr<arrow::Schema>* schema) {
+ assert(!types.empty());
+ arrow::SchemaBuilder builder;
+ for (size_t i = 0; i < types.size(); ++i) {
+ std::shared_ptr<arrow::DataType> arrow_type;
+ RETURN_IF_ERROR(convert_to_arrow_type(types[i], &arrow_type, timezone));
+ std::shared_ptr<arrow::Field> field = std::make_shared<arrow::Field>(
+ "arg" + std::to_string(i), arrow_type, types[i]->is_nullable());
+ RETURN_DORIS_STATUS_IF_ERROR(builder.AddField(field));
+ }
+ RETURN_DORIS_STATUS_IF_RESULT_ERROR(schema, builder.Finish());
+ return Status::OK();
+}
+
+Status PythonUDFMeta::serialize_arrow_schema(const std::shared_ptr<arrow::Schema>& schema,
+ std::shared_ptr<arrow::Buffer>* out) {
+ RETURN_DORIS_STATUS_IF_RESULT_ERROR(
+ out, arrow::ipc::SerializeSchema(*schema, arrow::default_memory_pool()));
+ return Status::OK();
+}
+
+/*
+ json format:
+ {
+ "name": "xxx",
+ "symbol": "xxx",
+ "location": "xxx",
+ "udf_load_type": 0 or 1,
+ "runtime_version": "x.xx.xx",
+ "always_nullable": true,
+ "inline_code": "base64_inline_code",
+ "input_types": "base64_input_types",
+ "return_type": "base64_return_type"
+ }
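+
+ Example (illustrative values only; base64 payloads truncated):
+ {
+ "name": "add_one",
+ "symbol": "add_one",
+ "location": "",
+ "udf_load_type": 0,
+ "runtime_version": "3.9.16",
+ "always_nullable": true,
+ "inline_code": "ZGVmIGFkZF9vbmUoeCk6...",
+ "input_types": "/////4gAAAAQ...",
+ "return_type": "/////4gAAAAQ..."
+ }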
+*/
+Status PythonUDFMeta::serialize_to_json(std::string* json_str) const {
+ rapidjson::Document doc;
+ doc.SetObject();
+ auto& allocator = doc.GetAllocator();
+ doc.AddMember("name", rapidjson::Value().SetString(_name.c_str(), allocator), allocator);
+ doc.AddMember("symbol", rapidjson::Value().SetString(_symbol.c_str(), allocator), allocator);
+ doc.AddMember("location", rapidjson::Value().SetString(_location.c_str(), allocator),
+ allocator);
+ doc.AddMember("udf_load_type", rapidjson::Value().SetInt(static_cast<int>(_type)), allocator);
+ doc.AddMember("runtime_version",
+ rapidjson::Value().SetString(_runtime_version.c_str(), allocator), allocator);
+ doc.AddMember("always_nullable", rapidjson::Value().SetBool(_always_nullable), allocator);
+
+ {
+ // Serialize base64 inline code to json
+ std::string base64_str = arrow::util::base64_encode(_inline_code);
+ doc.AddMember("inline_code", rapidjson::Value().SetString(base64_str.c_str(), allocator),
+ allocator);
+ }
+ {
+ // Serialize base64 input types to json
+ std::shared_ptr<arrow::Schema> input_schema;
+ RETURN_IF_ERROR(convert_types_to_schema(_input_types, TimezoneUtils::default_time_zone,
+ &input_schema));
+ std::shared_ptr<arrow::Buffer> input_schema_buffer;
+ RETURN_IF_ERROR(serialize_arrow_schema(input_schema, &input_schema_buffer));
+ std::string base64_str =
+ arrow::util::base64_encode({input_schema_buffer->data_as<char>(),
+ static_cast<size_t>(input_schema_buffer->size())});
+ doc.AddMember("input_types", rapidjson::Value().SetString(base64_str.c_str(), allocator),
+ allocator);
+ }
+ {
+ // Serialize base64 return type to json
+ std::shared_ptr<arrow::Schema> return_schema;
+ RETURN_IF_ERROR(convert_types_to_schema({_return_type}, TimezoneUtils::default_time_zone,
+ &return_schema));
+ std::shared_ptr<arrow::Buffer> return_schema_buffer;
+ RETURN_IF_ERROR(serialize_arrow_schema(return_schema, &return_schema_buffer));
+ std::string base64_str =
+ arrow::util::base64_encode({return_schema_buffer->data_as<char>(),
+ static_cast<size_t>(return_schema_buffer->size())});
+ doc.AddMember("return_type", rapidjson::Value().SetString(base64_str.c_str(), allocator),
+ allocator);
+ }
+
+ // Convert document to json string
+ rapidjson::StringBuffer buffer;
+ rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
+ doc.Accept(writer);
+ *json_str = std::string(buffer.GetString(), buffer.GetSize());
+ return Status::OK();
+}
+
+std::string PythonUDFMeta::to_string() const {
+ std::stringstream input_types_ss;
+ input_types_ss << "<";
+ for (size_t i = 0; i < _input_types.size(); ++i) {
+ input_types_ss << _input_types[i]->get_name();
+ if (i != _input_types.size() - 1) {
+ input_types_ss << ", ";
+ }
+ }
+ input_types_ss << ">";
+ return fmt::format(
+ "[name: {}, symbol: {}, location: {}, runtime_version: {}, always_nullable: {}, "
+ "inline_code: {}][input_types: {}][return_type: {}]",
+ _name, _symbol, _location, _runtime_version, _always_nullable, _inline_code,
+ input_types_ss.str(), _return_type->get_name());
+}
+
+Status PythonUDFMeta::check() const {
+ if (trim(_name).empty()) {
+ return Status::InvalidArgument("Python UDF name is empty");
+ }
+
+ if (trim(_symbol).empty()) {
+ return Status::InvalidArgument("Python UDF symbol is empty");
+ }
+
+ if (trim(_runtime_version).empty()) {
+ return Status::InvalidArgument("Python UDF runtime version is empty");
+ }
+
+ if (_input_types.empty()) {
+ return Status::InvalidArgument("Python UDF input types is empty");
+ }
+
+ if (!_return_type) {
+ return Status::InvalidArgument("Python UDF return type is empty");
+ }
+
+ if (_type == PythonUDFLoadType::UNKNOWN) {
+ return Status::InvalidArgument(
+ "Python UDF load type is invalid, please check inline code or file path");
+ }
+
+ if (_type == PythonUDFLoadType::MODULE) {
+ if (trim(_location).empty()) {
+ return Status::InvalidArgument("Non-inline Python UDF location is empty");
+ }
+ if (trim(_checksum).empty()) {
+ return Status::InvalidArgument("Non-inline Python UDF checksum is empty");
+ }
+ }
+
+ return Status::OK();
+}
+
+} // namespace doris
\ No newline at end of file
diff --git a/be/src/udf/python/python_udf_meta.h b/be/src/udf/python/python_udf_meta.h
new file mode 100644
index 0000000..71c808a
--- /dev/null
+++ b/be/src/udf/python/python_udf_meta.h
@@ -0,0 +1,60 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <sys/types.h>
+
+#include "arrow/buffer.h"
+#include "arrow/flight/client.h"
+#include "arrow/flight/server.h"
+#include "common/status.h"
+#include "util/arrow/row_batch.h"
+#include "vec/data_types/data_type.h"
+
+namespace doris {
+
+enum class PythonUDFLoadType : uint8_t { INLINE = 0, MODULE = 1, UNKNOWN = 2 };
+
+struct PythonUDFMeta {
+ int64_t _id;
+ std::string _name;
+ std::string _symbol;
+ std::string _location;
+ std::string _checksum;
+ std::string _runtime_version;
+ std::string _inline_code;
+ bool _always_nullable;
+ vectorized::DataTypes _input_types;
+ vectorized::DataTypePtr _return_type;
+ PythonUDFLoadType _type;
+
+ static Status convert_types_to_schema(const vectorized::DataTypes& types,
+ const std::string& timezone,
+ std::shared_ptr<arrow::Schema>* schema);
+
+ static Status serialize_arrow_schema(const std::shared_ptr<arrow::Schema>& schema,
+ std::shared_ptr<arrow::Buffer>* out);
+
+ Status serialize_to_json(std::string* json_str) const;
+
+ std::string to_string() const;
+
+ Status check() const;
+};
+
+} // namespace doris
\ No newline at end of file
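A sketch of how the BE might populate this struct for an inline UDF before handing it to the client (the `DataTypeInt32` input/return types are illustrative):

```cpp
// Assemble metadata for an inline Python UDF and validate it.
doris::PythonUDFMeta meta;
meta._name = "add_one";
meta._symbol = "add_one";
meta._runtime_version = "3.9.16";
meta._inline_code = "def add_one(x):\n    return x + 1";
meta._always_nullable = true;
meta._input_types = {std::make_shared<doris::vectorized::DataTypeInt32>()};
meta._return_type = std::make_shared<doris::vectorized::DataTypeInt32>();
meta._type = doris::PythonUDFLoadType::INLINE;
RETURN_IF_ERROR(meta.check());

std::string command;
RETURN_IF_ERROR(meta.serialize_to_json(&command));
```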
diff --git a/be/src/udf/python/python_udf_runtime.cpp b/be/src/udf/python/python_udf_runtime.cpp
new file mode 100644
index 0000000..3683fc4
--- /dev/null
+++ b/be/src/udf/python/python_udf_runtime.cpp
@@ -0,0 +1,185 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "udf/python/python_udf_runtime.h"
+
+#include <butil/fd_utility.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include <boost/process.hpp>
+
+#include "common/config.h"
+#include "common/logging.h"
+#include "common/status.h"
+#include "udf/python/python_udf_server.h"
+
+namespace doris {
+
+void PythonUDFProcess::remove_unix_socket() {
+ if (_uri.empty() || _unix_socket_file_path.empty()) return;
+
+ if (unlink(_unix_socket_file_path.c_str()) == 0) {
+ LOG(INFO) << "Successfully removed unix socket: " << _unix_socket_file_path;
+ return;
+ }
+
+ if (errno == ENOENT) {
+ // File does not exist, this is fine, no need to warn
+ LOG(INFO) << "Unix socket not found (already removed): " << _uri;
+ } else {
+ LOG(WARNING) << "Failed to remove unix socket " << _uri << ": " << std::strerror(errno)
+ << " (errno=" << errno << ")";
+ }
+}
+
+void PythonUDFProcess::shutdown() {
+ if (!_child.valid() || _is_shutdown) return;
+
+ _child.terminate();
+ bool graceful = false;
+ constexpr std::chrono::milliseconds retry_interval(100); // 100ms
+
+ for (int i = 0; i < TERMINATE_RETRY_TIMES; ++i) {
+ if (!_child.running()) {
+ graceful = true;
+ break;
+ }
+ std::this_thread::sleep_for(retry_interval);
+ }
+
+ if (!graceful) {
+ LOG(WARNING) << "Python process did not terminate gracefully, sending SIGKILL";
+ ::kill(_child.id(), SIGKILL);
+ _child.wait();
+ }
+
+ if (int exit_code = _child.exit_code(); exit_code > 128 && exit_code <= 255) {
+ int signal = exit_code - 128;
+ LOG(INFO) << "Python process was killed by signal " << signal;
+ } else {
+ LOG(INFO) << "Python process exited normally with code: " << exit_code;
+ }
+
+ _output_stream.close();
+ remove_unix_socket();
+ _is_shutdown = true;
+}
+
+std::string PythonUDFProcess::to_string() const {
+ return fmt::format(
+ "PythonUDFProcess(child_pid={}, uri={}, "
+ "unix_socket_file_path={}, is_shutdown={})",
+ _child.id(), _uri, _unix_socket_file_path, _is_shutdown);
+}
+
+Status PythonUDFProcessPool::init() {
+ if (_init_pool_size > _max_pool_size) {
+ return Status::InvalidArgument("min_idle cannot be greater than max_pool_size");
+ }
+
+ std::lock_guard<std::mutex> lock(_mtx);
+ for (size_t i = 0; i < _init_pool_size; ++i) {
+ ProcessPtr process;
+ RETURN_IF_ERROR(PythonUDFServerManager::instance().fork(this, &process));
+ _idle_processes.push(std::move(process));
+ ++_current_size;
+ }
+
+ return Status::OK();
+}
+
+Status PythonUDFProcessPool::borrow_process(ProcessPtr* process) {
+ std::unique_lock<std::mutex> lock(_mtx);
+
+ if (_is_shutdown) {
+ return Status::RuntimeError("UDF process pool is shutdown");
+ }
+
+ // Try to get an idle process or create a new one
+ while (true) {
+ // If there's an idle process, return it immediately
+ if (!_idle_processes.empty()) {
+ *process = std::move(_idle_processes.front());
+ _idle_processes.pop();
+ return Status::OK();
+ }
+
+ // If we can create a new process, do it
+ if (_current_size < _max_pool_size) {
+ RETURN_IF_ERROR(PythonUDFServerManager::instance().fork(this, process));
+ ++_current_size;
+ return Status::OK();
+ }
+
+ // Pool is exhausted, wait for a process to be returned
+ LOG(INFO) << "Python UDF process pool exhausted (current size: " << _current_size
+ << ", max size: " << _max_pool_size << "), waiting for available process...";
+
+ auto timeout = std::chrono::milliseconds(config::python_process_pool_wait_timeout_ms);
+ std::cv_status wait_result = _cv.wait_for(lock, timeout);
+
+ // Check if shutdown during wait
+ if (_is_shutdown) {
+ return Status::RuntimeError("UDF process pool is shutdown");
+ }
+
+ // If timeout occurred and still no idle processes
+ if (wait_result == std::cv_status::timeout && _idle_processes.empty()) {
+ return Status::RuntimeError(
+ "UDF process pool exhausted (max size = {}), waited for {} ms but no "
+ "process became available. Please increase max_python_process_nums parameter "
+ "or python_process_pool_wait_timeout_ms and restart BE",
+ _max_pool_size, config::python_process_pool_wait_timeout_ms);
+ }
+
+ // If notified or spurious wakeup, loop back to check conditions
+ }
+}
+
+void PythonUDFProcessPool::return_process(ProcessPtr process) {
+ {
+ std::lock_guard<std::mutex> lock(_mtx);
+
+ if (!process || _is_shutdown) return;
+
+ if (!process->is_alive()) {
+ --_current_size;
+ LOG(WARNING) << "return dead process: " << process->to_string();
+ return;
+ }
+
+ _idle_processes.push(std::move(process));
+ }
+ // Notify one waiting thread that a process is available
+ _cv.notify_one();
+}
+
+void PythonUDFProcessPool::shutdown() {
+ std::lock_guard<std::mutex> lock(_mtx);
+
+ if (_is_shutdown) return;
+
+ while (!_idle_processes.empty()) {
+ _idle_processes.front()->shutdown();
+ _idle_processes.pop();
+ }
+
+ _is_shutdown = true;
+}
+
+} // namespace doris
\ No newline at end of file
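A sketch of how a pool is driven (`version` is an already-scanned `PythonVersion`; error handling elided):

```cpp
// Borrow a worker, use it, hand it back. return_process() wakes one thread
// blocked in borrow_process() via the condition variable.
doris::PythonUDFProcessPool pool(version, /*max_pool_size=*/16, /*min_idle=*/4);
RETURN_IF_ERROR(pool.init());

doris::ProcessPtr process;
RETURN_IF_ERROR(pool.borrow_process(&process));
// ... hand the process to a PythonUDFClient ...
pool.return_process(std::move(process));
```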
diff --git a/be/src/udf/python/python_udf_runtime.h b/be/src/udf/python/python_udf_runtime.h
new file mode 100644
index 0000000..aa95414
--- /dev/null
+++ b/be/src/udf/python/python_udf_runtime.h
@@ -0,0 +1,146 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <boost/process.hpp>
+#include <condition_variable>
+#include <queue>
+
+#include "common/status.h"
+#include "python_env.h"
+
+namespace doris {
+
+static const char* UNIX_SOCKET_PREFIX = "grpc+unix://";
+static const char* BASE_UNIX_SOCKET_PATH_TEMPLATE = "{}{}/lib/udf/python/python_udf";
+static const char* UNIX_SOCKET_PATH_TEMPLATE = "{}_{}.sock";
+static const char* FLIGHT_SERVER_PATH_TEMPLATE = "{}/plugins/python_udf/{}";
+static const char* FLIGHT_SERVER_FILENAME = "python_udf_server.py";
+static const char* EXECUTABLE_PYTHON_FILENAME = "python";
+
+inline std::string get_base_unix_socket_path() {
+ return fmt::format(BASE_UNIX_SOCKET_PATH_TEMPLATE, UNIX_SOCKET_PREFIX,
+ std::getenv("DORIS_HOME"));
+}
+
+inline std::string get_unix_socket_path(pid_t child_pid) {
+ return fmt::format(UNIX_SOCKET_PATH_TEMPLATE, get_base_unix_socket_path(), child_pid);
+}
+
+inline std::string get_unix_socket_file_path(pid_t child_pid) {
+ return fmt::format(UNIX_SOCKET_PATH_TEMPLATE,
+ fmt::format(BASE_UNIX_SOCKET_PATH_TEMPLATE, "", std::getenv("DORIS_HOME")),
+ child_pid);
+}
+
+inline std::string get_fight_server_path() {
+ return fmt::format(FLIGHT_SERVER_PATH_TEMPLATE, std::getenv("DORIS_HOME"),
+ FLIGHT_SERVER_FILENAME);
+}
+
+class PythonUDFProcess;
+class PythonUDFProcessPool;
+
+using ProcessPtr = std::unique_ptr<PythonUDFProcess>;
+using PythonUDFProcessPoolPtr = std::unique_ptr<PythonUDFProcessPool>;
+
+class PythonUDFProcess {
+public:
+ PythonUDFProcess(boost::process::child child, boost::process::ipstream output_stream,
+ PythonUDFProcessPool* pool)
+ : _is_shutdown(false),
+ _uri(get_unix_socket_path(child.id())),
+ _unix_socket_file_path(get_unix_socket_file_path(child.id())),
+ _child(std::move(child)),
+ _output_stream(std::move(output_stream)),
+ _pool(pool) {}
+
+ ~PythonUDFProcess() { shutdown(); }
+
+ std::string get_uri() const { return _uri; }
+
+ const std::string& get_socket_file_path() const { return _unix_socket_file_path; }
+
+ bool is_shutdown() const { return _is_shutdown; }
+
+ bool is_alive() const {
+ if (_is_shutdown) return false;
+ return _child.running();
+ }
+
+ void remove_unix_socket();
+
+ void shutdown();
+
+ std::string to_string() const;
+
+ PythonUDFProcessPool* pool() const { return _pool; }
+
+private:
+ constexpr static int TERMINATE_RETRY_TIMES = 10;
+ constexpr static size_t MAX_ACCUMULATED_LOG_SIZE = 65536;
+
+ bool _is_shutdown {false};
+ std::string _uri;
+ std::string _unix_socket_file_path;
+ mutable boost::process::child _child;
+ boost::process::ipstream _output_stream;
+ std::string _accumulated_log;
+ PythonUDFProcessPool* _pool {nullptr};
+};
+
+class PythonUDFProcessPool {
+public:
+ explicit PythonUDFProcessPool(PythonVersion version, size_t max_pool_size, size_t min_idle)
+ : _python_version(version),
+ _max_pool_size(max_pool_size),
+ _init_pool_size(min_idle),
+ _current_size(0),
+ _is_shutdown(false) {}
+
+ explicit PythonUDFProcessPool(PythonVersion version)
+ : _python_version(version),
+ _max_pool_size(16),
+ _init_pool_size(4),
+ _current_size(0),
+ _is_shutdown(false) {}
+
+ Status init();
+
+ Status borrow_process(ProcessPtr* process);
+
+ void return_process(ProcessPtr process);
+
+ void shutdown();
+
+ const PythonVersion& get_python_version() const { return _python_version; }
+
+private:
+ PythonVersion _python_version;
+ size_t _max_pool_size;
+ size_t _init_pool_size;
+ size_t _current_size;
+ bool _is_shutdown;
+ std::queue<ProcessPtr> _idle_processes;
+ // protect _idle_processes, _is_shutdown and _current_size
+ mutable std::mutex _mtx;
+ // condition variable to notify waiting threads when a process is returned
+ std::condition_variable _cv;
+};
+
+} // namespace doris
\ No newline at end of file
diff --git a/be/src/udf/python/python_udf_server.cpp b/be/src/udf/python/python_udf_server.cpp
new file mode 100644
index 0000000..a74008d
--- /dev/null
+++ b/be/src/udf/python/python_udf_server.cpp
@@ -0,0 +1,140 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "udf/python/python_udf_server.h"
+
+#include <butil/fd_utility.h>
+#include <dirent.h>
+#include <fmt/core.h>
+#include <sys/poll.h>
+
+#include <boost/asio.hpp>
+#include <boost/process.hpp>
+
+#include "common/config.h"
+#include "udf/python/python_udf_client.h"
+
+namespace doris {
+
+Status PythonUDFServerManager::init(const std::vector<PythonVersion>& versions) {
+ std::lock_guard<std::mutex> lock(_pools_mutex);
+ for (const auto& version : versions) {
+ if (_pools.find(version) != _pools.end()) continue;
+ PythonUDFProcessPoolPtr new_pool = std::make_unique<PythonUDFProcessPool>(
+ version, config::max_python_process_nums, config::min_python_process_nums);
+ RETURN_IF_ERROR(new_pool->init());
+ _pools[version] = std::move(new_pool);
+ }
+ return Status::OK();
+}
+
+Status PythonUDFServerManager::get_client(const PythonUDFMeta& func_meta,
+ const PythonVersion& version,
+ PythonUDFClientPtr* client) {
+ PythonUDFProcessPoolPtr* pool = nullptr;
+ {
+ std::lock_guard<std::mutex> lock(_pools_mutex);
+ if (_pools.find(version) == _pools.end()) {
+ PythonUDFProcessPoolPtr new_pool = std::make_unique<PythonUDFProcessPool>(
+ version, config::max_python_process_nums, config::min_python_process_nums);
+ RETURN_IF_ERROR(new_pool->init());
+ _pools[version] = std::move(new_pool);
+ }
+ pool = &_pools[version];
+ }
+ ProcessPtr process;
+ RETURN_IF_ERROR((*pool)->borrow_process(&process));
+ RETURN_IF_ERROR(PythonUDFClient::create(func_meta, std::move(process), client));
+ return Status::OK();
+}
+
+Status PythonUDFServerManager::fork(PythonUDFProcessPool* pool, ProcessPtr* process) {
+ DCHECK(pool != nullptr);
+ const PythonVersion& version = pool->get_python_version();
+ // e.g. /usr/local/python3.7/bin/python3
+ std::string python_executable_path = version.get_executable_path();
+ // e.g. /{DORIS_HOME}/plugins/python_udf/python_udf_server.py
+    std::string flight_server_path = get_flight_server_path();
+ // e.g. grpc+unix:///home/doris/output/be/lib/udf/python/python_udf
+ std::string base_unix_socket_path = get_base_unix_socket_path();
+    std::vector<std::string> args = {"-u", // unbuffered output
+                                     flight_server_path, base_unix_socket_path};
+ boost::process::environment env = boost::this_process::environment();
+ boost::process::ipstream child_output; // input stream from child
+
+ try {
+ boost::process::child c(
+ python_executable_path, args, boost::process::std_out > child_output,
+ boost::process::env = env,
+ boost::process::on_exit([](int exit_code, const std::error_code& ec) {
+ if (ec) {
+ LOG(WARNING) << "Python UDF server exited with error: " << ec.message();
+ }
+ }));
+
+ std::string log_line;
+ std::string full_log;
+ bool started_successfully = false;
+ std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();
+ const auto timeout = std::chrono::milliseconds(5000);
+
+        // NOTE: getline() blocks, so the timeout is only re-checked between lines
+        while (std::chrono::steady_clock::now() - start < timeout) {
+ if (std::getline(child_output, log_line)) {
+ full_log += log_line + "\n";
+ LOG(INFO) << fmt::format("Start python server, log_line: {}, full_log: {}",
+ log_line, full_log);
+ if (log_line == "Start python server successfully") {
+ started_successfully = true;
+ break;
+ }
+ } else {
+ if (!c.running()) {
+ break;
+ }
+ std::this_thread::sleep_for(std::chrono::milliseconds(10));
+ }
+ }
+
+ if (!started_successfully) {
+ if (c.running()) {
+                c.terminate(); // terminate() sends SIGKILL on POSIX
+ c.wait(); // wait for exit to avoid zombie processes
+ }
+
+ std::string error_msg = full_log.empty() ? "No output from Python server" : full_log;
+ LOG(ERROR) << "Python server start failed:\n" << error_msg;
+            return Status::InternalError("Python server start failed:\n{}", error_msg);
+ }
+
+ *process = std::make_unique<PythonUDFProcess>(std::move(c), std::move(child_output), pool);
+ } catch (const std::exception& e) {
+ return Status::InternalError("Failed to start Python UDF server: {}", e.what());
+ }
+
+ return Status::OK();
+}
+
+void PythonUDFServerManager::shutdown() {
+ std::lock_guard lock(_pools_mutex);
+ for (auto& pool : _pools) {
+ pool.second->shutdown();
+ }
+ _pools.clear();
+ LOG(INFO) << "Python UDF server manager shutdown successfully";
+}
+
+} // namespace doris
\ No newline at end of file
diff --git a/be/src/udf/python/python_udf_server.h b/be/src/udf/python/python_udf_server.h
new file mode 100644
index 0000000..b21b874
--- /dev/null
+++ b/be/src/udf/python/python_udf_server.h
@@ -0,0 +1,53 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "common/status.h"
+#include "udf/python/python_udf_client.h"
+#include "udf/python/python_udf_meta.h"
+#include "udf/python/python_udf_runtime.h"
+
+namespace doris {
+
+class PythonUDFServerManager {
+public:
+ PythonUDFServerManager() = default;
+
+ ~PythonUDFServerManager() = default;
+
+ static PythonUDFServerManager& instance() {
+ static PythonUDFServerManager instance;
+ return instance;
+ }
+
+ Status init(const std::vector<PythonVersion>& versions);
+
+ Status get_client(const PythonUDFMeta& func_meta, const PythonVersion& version,
+ PythonUDFClientPtr* client);
+
+ Status fork(PythonUDFProcessPool* pool, ProcessPtr* process);
+
+ void shutdown();
+
+private:
+ std::unordered_map<PythonVersion, PythonUDFProcessPoolPtr> _pools;
+ // protect _pools
+ std::mutex _pools_mutex;
+};
+
+} // namespace doris
\ No newline at end of file
diff --git a/be/src/udf/python/python_udf_server.py b/be/src/udf/python/python_udf_server.py
new file mode 100644
index 0000000..7095b52
--- /dev/null
+++ b/be/src/udf/python/python_udf_server.py
@@ -0,0 +1,1054 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import argparse
+import base64
+import importlib
+import inspect
+import json
+import sys
+import os
+import traceback
+import logging
+import time
+import threading
+from abc import ABC, abstractmethod
+from contextlib import contextmanager
+from typing import Any, Callable, Optional, Tuple, get_origin
+from datetime import datetime
+from enum import Enum
+from pathlib import Path
+
+import pandas as pd
+import pyarrow as pa
+from pyarrow import flight
+
+
+class ServerState:
+ """Global server state container."""
+
+ unix_socket_path: str = ""
+
+ @staticmethod
+ def setup_logging():
+ """Setup logging configuration for the UDF server."""
+ doris_home = os.getenv("DORIS_HOME")
+ if not doris_home:
+ # Fallback to current directory if DORIS_HOME is not set
+ doris_home = os.getcwd()
+
+ log_dir = os.path.join(doris_home, "lib", "udf", "python")
+ os.makedirs(log_dir, exist_ok=True)
+ log_file = os.path.join(log_dir, "python_udf_output.log")
+
+ logging.basicConfig(
+ level=logging.INFO,
+ format="[%(asctime)s] [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s",
+ handlers=[
+ logging.FileHandler(log_file, mode="a", encoding="utf-8"),
+ logging.StreamHandler(sys.stderr), # Also log to stderr for debugging
+ ],
+ )
+ logging.info("Logging initialized. Log file: %s", log_file)
+
+ @staticmethod
+ def extract_base_unix_socket_path(unix_socket_uri: str) -> str:
+ """
+ Extract the file system path from a gRPC Unix socket URI.
+
+ Args:
+ unix_socket_uri: URI in format 'grpc+unix:///path/to/socket'
+
+ Returns:
+ The file system path without the protocol prefix
+ """
+ if unix_socket_uri.startswith("grpc+unix://"):
+ unix_socket_uri = unix_socket_uri[len("grpc+unix://") :]
+ return unix_socket_uri
+
+ @staticmethod
+ def remove_unix_socket(unix_socket_uri: str) -> None:
+ """
+ Remove the Unix domain socket file if it exists.
+
+ Args:
+ unix_socket_uri: URI of the Unix socket to remove
+ """
+ if unix_socket_uri is None:
+ return
+ base_unix_socket_path = ServerState.extract_base_unix_socket_path(
+ unix_socket_uri
+ )
+ if os.path.exists(base_unix_socket_path):
+ try:
+ os.unlink(base_unix_socket_path)
+ logging.info(
+ "Removed UNIX socket %s successfully", base_unix_socket_path
+ )
+ except OSError as e:
+ logging.error(
+ "Failed to remove UNIX socket %s: %s", base_unix_socket_path, e
+ )
+ else:
+ logging.warning("UNIX socket %s does not exist", base_unix_socket_path)
+
+ @staticmethod
+ def monitor_parent_exit():
+ """
+ Monitor the parent process and exit gracefully if it dies.
+ This prevents orphaned UDF server processes.
+ """
+ parent_pid = os.getppid()
+ if parent_pid == 1:
+ # Parent process is init, no need to monitor
+ logging.info("Parent process is init (PID 1), skipping parent monitoring")
+ return
+
+ logging.info("Started monitoring parent process (PID: %s)", parent_pid)
+
+ while True:
+ try:
+ # os.kill(pid, 0) only checks whether the process exists
+ # without sending an actual signal
+ os.kill(parent_pid, 0)
+ except OSError:
+ # Parent process died
+ ServerState.remove_unix_socket(ServerState.unix_socket_path)
+ logging.error(
+ "Parent process %s died, exiting UDF server, unix socket path: %s",
+ parent_pid,
+ ServerState.unix_socket_path,
+ )
+ os._exit(0)
+ # Check every 2 seconds
+ time.sleep(2)
+
+
+ServerState.setup_logging()
+monitor_thread = threading.Thread(target=ServerState.monitor_parent_exit, daemon=True)
+monitor_thread.start()
+
+
+@contextmanager
+def temporary_sys_path(path: str):
+ """
+ Context manager to temporarily add a path to sys.path.
+ Ensures the path is removed after use to avoid pollution.
+
+ Args:
+ path: Directory path to add to sys.path
+
+ Yields:
+ None
+ """
+ path_added = False
+ if path not in sys.path:
+ sys.path.insert(0, path)
+ path_added = True
+ logging.debug("Temporarily added to sys.path: %s", path)
+
+ try:
+ yield
+ finally:
+ if path_added and path in sys.path:
+ sys.path.remove(path)
+ logging.debug("Removed from sys.path: %s", path)
+
+
+class VectorType(Enum):
+ """Enum representing supported vector types."""
+
+ LIST = "list"
+ PANDAS_SERIES = "pandas.Series"
+ ARROW_ARRAY = "pyarrow.Array"
+
+ @property
+ def python_type(self):
+ """
+ Returns the Python type corresponding to this VectorType.
+
+ Returns:
+ The Python type class (list, pd.Series, or pa.Array)
+ """
+ mapping = {
+ VectorType.LIST: list,
+ VectorType.PANDAS_SERIES: pd.Series,
+ VectorType.ARROW_ARRAY: pa.Array,
+ }
+ return mapping[self]
+
+ @staticmethod
+ def resolve_vector_type(param: inspect.Parameter):
+ """
+ Resolves the param's type annotation to the corresponding VectorType enum.
+ Returns None if the type is unsupported or not a vector type.
+ """
+ if (
+ param is None
+ or param.annotation is None
+ or param.annotation is inspect.Parameter.empty
+ ):
+ return None
+
+ annotation = param.annotation
+ origin = get_origin(annotation)
+ raw_type = origin if origin is not None else annotation
+
+ if raw_type is list:
+ return VectorType.LIST
+ if raw_type is pd.Series:
+ return VectorType.PANDAS_SERIES
+
+ return None
+
+
+class PythonUDFMeta:
+ """Metadata container for a Python UDF."""
+
+ def __init__(
+ self,
+ name: str,
+ symbol: str,
+ location: str,
+ udf_load_type: int,
+ runtime_version: str,
+ always_nullable: bool,
+ inline_code: bytes,
+ input_types: pa.Schema,
+ output_type: pa.DataType,
+ ) -> None:
+ """
+ Initialize Python UDF metadata.
+
+ Args:
+ name: UDF function name
+ symbol: Symbol to load (function name or module.function)
+ location: File path or directory containing the UDF
+ udf_load_type: 0 for inline code, 1 for module
+ runtime_version: Python runtime version requirement
+ always_nullable: Whether the UDF can return NULL values
+ inline_code: Base64-encoded inline Python code (if applicable)
+ input_types: PyArrow schema for input parameters
+ output_type: PyArrow data type for return value
+ """
+ self.name = name
+ self.symbol = symbol
+ self.location = location
+ self.udf_load_type = udf_load_type
+ self.runtime_version = runtime_version
+ self.always_nullable = always_nullable
+ self.inline_code = inline_code
+ self.input_types = input_types
+ self.output_type = output_type
+
+ def __str__(self) -> str:
+ """Returns a string representation of the UDF metadata."""
+ udf_load_type_str = "INLINE" if self.udf_load_type == 0 else "MODULE"
+ return (
+ f"PythonUDFMeta(name={self.name}, symbol={self.symbol}, "
+ f"location={self.location}, udf_load_type={udf_load_type_str}, runtime_version={self.runtime_version}, "
+ f"always_nullable={self.always_nullable}, inline_code={self.inline_code}, "
+ f"input_types={self.input_types}, output_type={self.output_type})"
+ )
+
+
+class AdaptivePythonUDF:
+ """
+ A wrapper around a UDF function that supports both scalar and vectorized execution modes.
+ The mode is determined by the type hints of the function parameters.
+ """
+
+ def __init__(self, python_udf_meta: PythonUDFMeta, func: Callable) -> None:
+ """
+ Initialize the adaptive UDF wrapper.
+
+ Args:
+ python_udf_meta: Metadata describing the UDF
+ func: The actual Python function to execute
+ """
+ self.python_udf_meta = python_udf_meta
+ self._eval_func = func
+
+ def __str__(self) -> str:
+ """Returns a string representation of the UDF wrapper."""
+ input_type_strs = [str(t) for t in self.python_udf_meta.input_types.types]
+ output_type_str = str(self.python_udf_meta.output_type)
+ eval_func_str = f"{self.python_udf_meta.name}({', '.join(input_type_strs)}) -> {output_type_str}"
+ return f"AdaptivePythonUDF(python_udf_meta: {self.python_udf_meta}, eval_func: {eval_func_str})"
+
+ def __call__(self, record_batch: pa.RecordBatch) -> pa.Array:
+ """
+ Executes the UDF on the given record batch. Supports both scalar and vectorized modes.
+
+ :param record_batch: Input data with N columns, each of length num_rows
+ :return: Output array of length num_rows
+ """
+ if record_batch.num_rows == 0:
+ return pa.array([], type=self._get_output_type())
+
+ if self._should_use_vectorized():
+ logging.info("Using vectorized mode for UDF: %s", self.python_udf_meta.name)
+ return self._vectorized_call(record_batch)
+
+ logging.info("Using scalar mode for UDF: %s", self.python_udf_meta.name)
+ return self._scalar_call(record_batch)
+
+ @staticmethod
+ def _cast_arrow_to_vector(arrow_array: pa.Array, vec_type: VectorType):
+ """
+ Convert a pa.Array to an instance of the specified VectorType.
+ """
+ if vec_type == VectorType.LIST:
+ return arrow_array.to_pylist()
+ elif vec_type == VectorType.PANDAS_SERIES:
+ return arrow_array.to_pandas()
+ else:
+ raise ValueError(f"Unsupported vector type: {vec_type}")
+
+ def _should_use_vectorized(self) -> bool:
+ """
+ Determines whether to use vectorized mode based on parameter type annotations.
+ Returns True if any parameter is annotated as:
+ - list
+ - pd.Series
+ """
+ try:
+ signature = inspect.signature(self._eval_func)
+ except ValueError:
+ # Cannot inspect built-in or C functions; default to scalar
+ return False
+
+ for param in signature.parameters.values():
+ if VectorType.resolve_vector_type(param):
+ return True
+
+ return False
+
+ def _convert_from_arrow_to_py(self, field):
+ if field is None:
+ return None
+
+ if pa.types.is_map(field.type):
+ # pyarrow.lib.MapScalar's as_py() returns a list of tuples, convert to dict
+ list_of_tuples = field.as_py()
+ return dict(list_of_tuples) if list_of_tuples is not None else None
+ return field.as_py()
+
+ def _scalar_call(self, record_batch: pa.RecordBatch) -> pa.Array:
+ """
+ Applies the UDF in scalar mode: one row at a time.
+
+ Args:
+ record_batch: Input data batch
+
+ Returns:
+ Output array with results for each row
+ """
+ columns = record_batch.columns
+ num_rows = record_batch.num_rows
+ result = []
+
+ for i in range(num_rows):
+ converted_args = [self._convert_from_arrow_to_py(col[i]) for col in columns]
+
+ try:
+ res = self._eval_func(*converted_args)
+ # Check if result is None when always_nullable is False
+ if res is None and not self.python_udf_meta.always_nullable:
+ raise RuntimeError(
+ f"the result of row {i} is null, but the return type is not nullable, "
+ f"please check the always_nullable property in create function statement, "
+ f"it should be true"
+ )
+ result.append(res)
+ except Exception as e:
+ logging.error(
+ "Error in scalar UDF execution at row %s: %s\nArgs: %s\nTraceback: %s",
+ i,
+ e,
+ converted_args,
+ traceback.format_exc(),
+ )
+ # Return None for failed rows if always_nullable is True
+ if self.python_udf_meta.always_nullable:
+ result.append(None)
+ else:
+ raise
+
+ return pa.array(result, type=self._get_output_type(), from_pandas=True)
+
+ def _vectorized_call(self, record_batch: pa.RecordBatch) -> pa.Array:
+ """
+ Applies the UDF in vectorized mode: processes entire columns at once.
+
+ Args:
+ record_batch: Input data batch
+
+ Returns:
+ Output array with results
+ """
+ column_args = record_batch.columns
+ logging.info("Vectorized call with %s columns", len(column_args))
+
+ sig = inspect.signature(self._eval_func)
+ params = list(sig.parameters.values())
+
+ if len(column_args) != len(params):
+ raise ValueError(f"UDF expects {len(params)} args, got {len(column_args)}")
+
+ converted_args = []
+ for param, arrow_col in zip(params, column_args):
+ vec_type = VectorType.resolve_vector_type(param)
+
+ if vec_type is None:
+ # For scalar types (int, float, str, etc.), extract the first value
+ # instead of converting to list
+ pylist = arrow_col.to_pylist()
+ if len(pylist) > 0:
+ converted = pylist[0]
+ logging.info(
+ "Converted %s to scalar (first value): %s",
+ param.name,
+ type(converted).__name__,
+ )
+ else:
+ converted = None
+ logging.info(
+ "Converted %s to scalar (None, empty column)", param.name
+ )
+ else:
+ converted = self._cast_arrow_to_vector(arrow_col, vec_type)
+ logging.info("Converted %s: %s", param.name, vec_type)
+
+ converted_args.append(converted)
+
+ try:
+ result = self._eval_func(*converted_args)
+ except Exception as e:
+ logging.error(
+ "Error in vectorized UDF: %s\nTraceback: %s", e, traceback.format_exc()
+ )
+ raise RuntimeError(f"Error in vectorized UDF: {e}") from e
+
+ # Convert result to PyArrow Array
+ result_array = None
+ if isinstance(result, pa.Array):
+ result_array = result
+ elif isinstance(result, pa.ChunkedArray):
+ # Combine chunks into a single array
+ result_array = pa.concat_arrays(result.chunks)
+ elif isinstance(result, pd.Series):
+ result_array = pa.array(result, type=self._get_output_type())
+ elif isinstance(result, list):
+ result_array = pa.array(
+ result, type=self._get_output_type(), from_pandas=True
+ )
+ else:
+ # Scalar result - broadcast to all rows
+ out_type = self._get_output_type()
+ logging.warning(
+ "UDF returned scalar value, broadcasting to %s rows",
+ record_batch.num_rows,
+ )
+ result_array = pa.array([result] * record_batch.num_rows, type=out_type)
+
+ # Check for None values when always_nullable is False
+ if not self.python_udf_meta.always_nullable:
+ null_count = result_array.null_count
+ if null_count > 0:
+ # Find the first null index for error message
+ for i, value in enumerate(result_array):
+                    if not value.is_valid:
+                        raise RuntimeError(
+                            f"the result of row {i} is null, but the return type is not "
+                            f"nullable; set always_nullable to true in the CREATE "
+                            f"FUNCTION statement"
+                        )
+
+ return result_array
+
+ def _get_output_type(self) -> pa.DataType:
+ """
+ Returns the expected output type for the UDF.
+
+ Returns:
+ PyArrow DataType for the output
+ """
+ return self.python_udf_meta.output_type or pa.null()
+
+
+class UDFLoader(ABC):
+ """Abstract base class for loading UDFs from different sources."""
+
+ def __init__(self, python_udf_meta: PythonUDFMeta) -> None:
+ """
+ Initialize the UDF loader.
+
+ Args:
+ python_udf_meta: Metadata describing the UDF to load
+ """
+ self.python_udf_meta = python_udf_meta
+
+ @abstractmethod
+ def load(self) -> AdaptivePythonUDF:
+ """Load the UDF and return an AdaptivePythonUDF wrapper."""
+ raise NotImplementedError("Subclasses must implement load().")
+
+
+class InlineUDFLoader(UDFLoader):
+ """Loads a UDF defined directly in inline code."""
+
+ def load(self) -> AdaptivePythonUDF:
+ """
+ Load and execute inline Python code to extract the UDF function.
+
+ Returns:
+ AdaptivePythonUDF wrapper around the loaded function
+
+ Raises:
+ RuntimeError: If code execution fails
+ ValueError: If the function is not found or not callable
+ """
+ symbol = self.python_udf_meta.symbol
+ inline_code = self.python_udf_meta.inline_code.decode("utf-8")
+ env: dict[str, Any] = {}
+ logging.info("Loading inline code for function '%s'", symbol)
+ logging.debug("Inline code:\n%s", inline_code)
+
+ try:
+ # Execute the code in a clean environment
+ # pylint: disable=exec-used
+ # Note: exec() is necessary here for dynamic UDF loading from inline code
+ exec(inline_code, env) # nosec B102
+ except Exception as e:
+ logging.error(
+ "Failed to exec inline code: %s\nTraceback: %s",
+ e,
+ traceback.format_exc(),
+ )
+ raise RuntimeError(f"Failed to exec inline code: {e}") from e
+
+ func = env.get(symbol)
+ if func is None:
+ available_funcs = [
+ k for k, v in env.items() if callable(v) and not k.startswith("_")
+ ]
+ logging.error(
+ "Function '%s' not found in inline code. Available functions: %s",
+ symbol,
+ available_funcs,
+ )
+ raise ValueError(f"Function '{symbol}' not found in inline code.")
+
+ if not callable(func):
+ logging.error(
+ "'%s' exists but is not callable (type: %s)", symbol, type(func)
+ )
+ raise ValueError(f"'{symbol}' is not a callable function.")
+
+ logging.info("Successfully loaded function '%s' from inline code", symbol)
+ return AdaptivePythonUDF(self.python_udf_meta, func)
+
+
+class ModuleUDFLoader(UDFLoader):
+ """Loads a UDF from a Python module file (.py)."""
+
+ def load(self) -> AdaptivePythonUDF:
+ """
+ Loads a UDF from a Python module file.
+
+ Returns:
+ AdaptivePythonUDF instance wrapping the loaded function
+
+ Raises:
+ ValueError: If module file not found
+ TypeError: If symbol is not callable
+ """
+ symbol = self.python_udf_meta.symbol # [package_name.]module_name.function_name
+ location = self.python_udf_meta.location # /path/to/module_name[.py]
+
+ if not os.path.exists(location):
+ raise ValueError(f"Module file not found: {location}")
+
+ package_name, module_name, func_name = self.parse_symbol(symbol)
+ func = self.load_udf_from_module(location, package_name, module_name, func_name)
+
+ if not callable(func):
+ raise TypeError(
+ f"'{symbol}' exists but is not callable (type: {type(func).__name__})"
+ )
+
+ logging.info(
+ "Successfully loaded function '%s' from module: %s", symbol, location
+ )
+ return AdaptivePythonUDF(self.python_udf_meta, func)
+
+ def parse_symbol(self, symbol: str):
+ """
+ Parse symbol into (package_name, module_name, func_name)
+
+ Supported formats:
+ - "module.func" → (None, module, func)
+ - "package.module.func" → (package, "module", func)
+ """
+ if not symbol or "." not in symbol:
+ raise ValueError(
+ f"Invalid symbol format: '{symbol}'. "
+ "Expected 'module.function' or 'package.module.function'"
+ )
+
+ parts = symbol.split(".")
+ if len(parts) == 2:
+ # module.func → Single-file mode
+ module_name, func_name = parts
+ package_name = None
+ if not module_name or not module_name.strip():
+ raise ValueError(f"Module name is empty in symbol: '{symbol}'")
+ if not func_name or not func_name.strip():
+ raise ValueError(f"Function name is empty in symbol: '{symbol}'")
+ elif len(parts) > 2:
+ package_name = parts[0]
+ module_name = ".".join(parts[1:-1])
+ func_name = parts[-1]
+ if not package_name or not package_name.strip():
+ raise ValueError(f"Package name is empty in symbol: '{symbol}'")
+ if not module_name or not module_name.strip():
+ raise ValueError(f"Module name is empty in symbol: '{symbol}'")
+ if not func_name or not func_name.strip():
+ raise ValueError(f"Function name is empty in symbol: '{symbol}'")
+ else:
+ raise ValueError(f"Invalid symbol format: '{symbol}'")
+
+ logging.debug(
+ "Parsed symbol: package=%s, module=%s, func=%s",
+ package_name,
+ module_name,
+ func_name,
+ )
+ return package_name, module_name, func_name
+
+ def _validate_location(self, location: str) -> None:
+ """Validate that the location is a valid directory."""
+ if not os.path.isdir(location):
+ raise ValueError(f"Location is not a directory: {location}")
+
+ def _get_or_import_module(self, location: str, full_module_name: str) -> Any:
+ """Get module from cache or import it."""
+ if full_module_name in sys.modules:
+ logging.warning(
+ "Module '%s' already loaded, using cached version", full_module_name
+ )
+ return sys.modules[full_module_name]
+
+ with temporary_sys_path(location):
+ return importlib.import_module(full_module_name)
+
+ def _extract_function(
+ self, module: Any, func_name: str, module_name: str
+ ) -> Callable:
+ """Extract and validate function from module."""
+ func = getattr(module, func_name, None)
+ if func is None:
+ raise AttributeError(
+ f"Function '{func_name}' not found in module '{module_name}'"
+ )
+ if not callable(func):
+ raise TypeError(f"'{func_name}' is not callable")
+ return func
+
+ def _load_single_file_udf(
+ self, location: str, module_name: str, func_name: str
+ ) -> Callable:
+ """Load UDF from a single Python file."""
+ py_file = os.path.join(location, f"{module_name}.py")
+ if not os.path.isfile(py_file):
+ raise ImportError(f"Python file not found: {py_file}")
+
+ try:
+ udf_module = self._get_or_import_module(location, module_name)
+ return self._extract_function(udf_module, func_name, module_name)
+ except (ImportError, AttributeError, TypeError) as e:
+ raise ImportError(
+ f"Failed to load single-file UDF '{module_name}.{func_name}': {e}"
+ ) from e
+ except Exception as e:
+ logging.error(
+ "Unexpected error loading UDF: %s\n%s", e, traceback.format_exc()
+ )
+ raise
+
+ def _ensure_package_init(self, package_path: str, package_name: str) -> None:
+ """Ensure __init__.py exists in the package directory."""
+ init_path = os.path.join(package_path, "__init__.py")
+ if not os.path.exists(init_path):
+ logging.warning(
+ "__init__.py not found in package '%s', attempting to create it",
+ package_name,
+ )
+ try:
+ with open(init_path, "w", encoding="utf-8") as f:
+ f.write(
+ "# Auto-generated by UDF loader to make directory a Python package\n"
+ )
+ logging.info("Created __init__.py in %s", package_path)
+ except OSError as e:
+ raise ImportError(
+ f"Cannot create __init__.py in package '{package_name}': {e}"
+ ) from e
+
+ def _build_full_module_name(self, package_name: str, module_name: str) -> str:
+ """Build the full module name for package mode."""
+ if module_name == "__init__":
+ return package_name
+ return f"{package_name}.{module_name}"
+
+ def _load_package_udf(
+ self, location: str, package_name: str, module_name: str, func_name: str
+ ) -> Callable:
+ """Load UDF from a Python package."""
+ package_path = os.path.join(location, package_name)
+ if not os.path.isdir(package_path):
+ raise ImportError(f"Package '{package_name}' not found in '{location}'")
+
+ self._ensure_package_init(package_path, package_name)
+
+ try:
+ full_module_name = self._build_full_module_name(package_name, module_name)
+ udf_module = self._get_or_import_module(location, full_module_name)
+ return self._extract_function(udf_module, func_name, full_module_name)
+ except (ImportError, AttributeError, TypeError) as e:
+ raise ImportError(
+ f"Failed to load packaged UDF '{package_name}.{module_name}.{func_name}': {e}"
+ ) from e
+ except Exception as e:
+ logging.error(
+ "Unexpected error loading packaged UDF: %s\n%s",
+ e,
+ traceback.format_exc(),
+ )
+ raise
+
+ def load_udf_from_module(
+ self,
+ location: str,
+ package_name: Optional[str],
+ module_name: str,
+ func_name: str,
+ ) -> Callable:
+ """
+ Load a UDF from a Python module, supporting both:
+ 1. Single-file mode: package_name=None, module_name="your_file"
+ 2. Package mode: package_name="your_pkg", module_name="submodule" or "__init__"
+
+ Args:
+ location:
+ - In package mode: parent directory of the package
+ - In single-file mode: directory containing the .py file
+ package_name:
+ - If None or empty: treat as single-file mode
+ - Else: standard package name
+ module_name:
+ - In package mode: submodule name (e.g., "main") or "__init__"
+ - In single-file mode: filename without .py (e.g., "udf_script")
+ func_name: name of the function to load
+
+ Returns:
+ The callable UDF function.
+ """
+ self._validate_location(location)
+
+ if not package_name or package_name.strip() == "":
+ return self._load_single_file_udf(location, module_name, func_name)
+ else:
+ return self._load_package_udf(
+ location, package_name, module_name, func_name
+ )
+
+
+class UDFLoaderFactory:
+ """Factory to select the appropriate loader based on UDF location."""
+
+ @staticmethod
+ def get_loader(python_udf_meta: PythonUDFMeta) -> UDFLoader:
+ """
+ Factory method to create the appropriate UDF loader based on metadata.
+
+ Args:
+ python_udf_meta: UDF metadata containing load type and location
+
+ Returns:
+ Appropriate UDFLoader instance (InlineUDFLoader or ModuleUDFLoader)
+
+ Raises:
+ ValueError: If UDF load type or location is unsupported
+ """
+ location = python_udf_meta.location
+ udf_load_type = python_udf_meta.udf_load_type # 0: inline, 1: module
+
+ if udf_load_type == 0:
+ return InlineUDFLoader(python_udf_meta)
+ elif udf_load_type == 1:
+ if UDFLoaderFactory.check_module(location):
+ return ModuleUDFLoader(python_udf_meta)
+ else:
+ raise ValueError(f"Unsupported UDF location: {location}")
+ else:
+ raise ValueError(f"Unsupported UDF load type: {udf_load_type}")
+
+ @staticmethod
+ def check_module(location: str) -> bool:
+ """
+ Checks if a location is a valid Python module or package.
+
+ A valid module is either:
+ - A .py file, or
+ - A directory containing __init__.py (i.e., a package).
+
+ Raises:
+ ValueError: If the location does not exist or contains no Python module.
+
+ Returns:
+ True if valid.
+ """
+ if not os.path.exists(location):
+ raise ValueError(f"Module not found: {location}")
+
+ if os.path.isfile(location):
+ if location.endswith(".py"):
+ return True
+ else:
+ raise ValueError(f"File is not a Python module (.py): {location}")
+
+ if os.path.isdir(location):
+ if UDFLoaderFactory.has_python_file_recursive(location):
+ return True
+ else:
+ raise ValueError(
+ f"Directory contains no Python (.py) files: {location}"
+ )
+
+ raise ValueError(f"Invalid module location (not file or directory): {location}")
+
+ @staticmethod
+ def has_python_file_recursive(location: str) -> bool:
+ """
+ Recursively checks if a directory contains any Python (.py) files.
+
+ Args:
+ location: Directory path to search
+
+ Returns:
+ True if at least one .py file is found, False otherwise
+ """
+ path = Path(location)
+ if not path.is_dir():
+ return False
+ return any(path.rglob("*.py"))
+
+
+class UDFFlightServer(flight.FlightServerBase):
+ """Arrow Flight server for executing Python UDFs."""
+
+ @staticmethod
+ def parse_python_udf_meta(
+ descriptor: flight.FlightDescriptor,
+ ) -> Optional[PythonUDFMeta]:
+ """Parses UDF metadata from a command descriptor."""
+
+ if descriptor.descriptor_type != flight.DescriptorType.CMD:
+ logging.error("Invalid descriptor type: %s", descriptor.descriptor_type)
+ return None
+
+ cmd_json = json.loads(descriptor.command)
+ name = cmd_json["name"]
+ symbol = cmd_json["symbol"]
+ location = cmd_json["location"]
+ udf_load_type = cmd_json["udf_load_type"]
+ runtime_version = cmd_json["runtime_version"]
+ always_nullable = cmd_json["always_nullable"]
+
+ inline_code = base64.b64decode(cmd_json["inline_code"])
+ input_binary = base64.b64decode(cmd_json["input_types"])
+ output_binary = base64.b64decode(cmd_json["return_type"])
+
+ input_schema = pa.ipc.read_schema(pa.BufferReader(input_binary))
+ output_schema = pa.ipc.read_schema(pa.BufferReader(output_binary))
+
+ if len(output_schema) != 1:
+ logging.error(
+ "Output schema must have exactly one field: %s", output_schema
+ )
+ return None
+
+ output_type = output_schema.field(0).type
+
+ return PythonUDFMeta(
+ name=name,
+ symbol=symbol,
+ location=location,
+ udf_load_type=udf_load_type,
+ runtime_version=runtime_version,
+ always_nullable=always_nullable,
+ inline_code=inline_code,
+ input_types=input_schema,
+ output_type=output_type,
+ )
+
+ @staticmethod
+ def check_schema(
+ record_batch: pa.RecordBatch, expected_schema: pa.Schema
+ ) -> Tuple[bool, str]:
+ """
+ Validates that the input RecordBatch schema matches the expected schema.
+ Checks that field count and types match, but field names can differ.
+
+ :return: (result, error_message)
+ """
+ actual = record_batch.schema
+ expected = expected_schema
+
+ logging.info(f"Actual schema: {actual}")
+ logging.info(f"Expected schema: {expected}")
+
+ # Check field count
+ if len(actual) != len(expected):
+ return (
+ False,
+ f"Schema length mismatch, got {len(actual)} fields, expected {len(expected)} fields",
+ )
+
+ # Check each field type (ignore field names)
+ for i, (actual_field, expected_field) in enumerate(zip(actual, expected)):
+ if not actual_field.type.equals(expected_field.type):
+ return False, (
+ f"Type mismatch at field index {i}, "
+ f"got {actual_field.type}, expected {expected_field.type}"
+ )
+
+ return True, ""
+
+ def do_exchange(
+ self,
+ context: flight.ServerCallContext,
+ descriptor: flight.FlightDescriptor,
+ reader: flight.MetadataRecordBatchReader,
+ writer: flight.MetadataRecordBatchWriter,
+ ) -> None:
+ """Handles bidirectional streaming UDF execution."""
+ logging.info("Received exchange request for UDF: %s", descriptor)
+
+ python_udf_meta = UDFFlightServer.parse_python_udf_meta(descriptor)
+ if not python_udf_meta:
+ raise ValueError("Invalid or missing UDF metadata in descriptor")
+
+ loader = UDFLoaderFactory.get_loader(python_udf_meta)
+ udf = loader.load()
+ logging.info("Loaded UDF: %s", udf)
+
+ started = False
+ for chunk in reader:
+ if not chunk.data:
+ logging.info("Empty chunk received, skipping")
+ continue
+
+ check_schema_result, error_msg = UDFFlightServer.check_schema(
+ chunk.data, python_udf_meta.input_types
+ )
+ if not check_schema_result:
+ logging.error("Schema mismatch: %s", error_msg)
+ raise ValueError(f"Schema mismatch: {error_msg}")
+
+ result_array = udf(chunk.data)
+
+ if not python_udf_meta.output_type.equals(result_array.type):
+ logging.error(
+ "Output type mismatch: got %s, expected %s",
+ result_array.type,
+ python_udf_meta.output_type,
+ )
+ raise ValueError(
+ f"Output type mismatch: got {result_array.type}, expected {python_udf_meta.output_type}"
+ )
+
+ result_batch = pa.RecordBatch.from_arrays([result_array], ["result"])
+ if not started:
+ writer.begin(result_batch.schema)
+ started = True
+ writer.write_batch(result_batch)
+
+
+def check_unix_socket_path(unix_socket_path: str) -> bool:
+ """Validates the Unix domain socket path format."""
+ if not unix_socket_path:
+ logging.error("Unix socket path is empty")
+ return False
+
+    if not unix_socket_path.startswith("grpc+unix://"):
+        logging.error("Unix socket URI must start with 'grpc+unix://': %s", unix_socket_path)
+        return False
+
+ socket_path = unix_socket_path[len("grpc+unix://") :].strip()
+ if not socket_path:
+ logging.error("Extracted socket path is empty")
+ return False
+
+ return True
+
+
+def main(unix_socket_path: str) -> None:
+ """
+ Main entry point for the Python UDF server.
+
+ Args:
+ unix_socket_path: Base path for the Unix domain socket
+
+ Raises:
+ SystemExit: If socket path is invalid or server fails to start
+ """
+ try:
+ if not check_unix_socket_path(unix_socket_path):
+ print(f"ERROR: Invalid socket path: {unix_socket_path}", flush=True)
+ sys.exit(1)
+
+ current_pid = os.getpid()
+ ServerState.unix_socket_path = f"{unix_socket_path}_{current_pid}.sock"
+ server = UDFFlightServer(ServerState.unix_socket_path)
+ print("Start python server successfully", flush=True)
+ logging.info("##### PYTHON UDF SERVER STARTED AT %s #####", datetime.now())
+ server.wait()
+
+ except Exception as e:
+ print(
+ f"ERROR: Failed to start Python UDF server: {type(e).__name__}: {e}",
+ flush=True,
+ )
+ tb_lines = traceback.format_exception(type(e), e, e.__traceback__)
+ if len(tb_lines) > 1:
+ print(f"DETAIL: {tb_lines[-2].strip()}", flush=True)
+ sys.exit(1)
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(
+ description="Run an Arrow Flight UDF server over Unix socket."
+ )
+ parser.add_argument(
+ "unix_socket_path",
+ type=str,
+ help="Path to the Unix socket (e.g., grpc+unix:///path/to/socket)",
+ )
+ args = parser.parse_args()
+ main(args.unix_socket_path)
diff --git a/be/src/util/arrow/block_convertor.cpp b/be/src/util/arrow/block_convertor.cpp
index 0da5c22..ab8da71 100644
--- a/be/src/util/arrow/block_convertor.cpp
+++ b/be/src/util/arrow/block_convertor.cpp
@@ -53,38 +53,7 @@
namespace doris {
#include "common/compile_check_begin.h"
-class FromBlockConverter {
-public:
- FromBlockConverter(const vectorized::Block& block, const std::shared_ptr<arrow::Schema>& schema,
- arrow::MemoryPool* pool, const cctz::time_zone& timezone_obj)
- : _block(block),
- _schema(schema),
- _pool(pool),
- _cur_field_idx(-1),
- _timezone_obj(timezone_obj) {}
-
- ~FromBlockConverter() = default;
-
- Status convert(std::shared_ptr<arrow::RecordBatch>* out);
-
-private:
- const vectorized::Block& _block;
- const std::shared_ptr<arrow::Schema>& _schema;
- arrow::MemoryPool* _pool;
-
- size_t _cur_field_idx;
- size_t _cur_start;
- size_t _cur_rows;
- vectorized::ColumnPtr _cur_col;
- vectorized::DataTypePtr _cur_type;
- arrow::ArrayBuilder* _cur_builder = nullptr;
-
- const cctz::time_zone& _timezone_obj;
-
- std::vector<std::shared_ptr<arrow::Array>> _arrays;
-};
-
-Status FromBlockConverter::convert(std::shared_ptr<arrow::RecordBatch>* out) {
+Status FromBlockToRecordBatchConverter::convert(std::shared_ptr<arrow::RecordBatch>* out) {
int num_fields = _schema->num_fields();
if (_block.columns() != num_fields) {
return Status::InvalidArgument("number fields not match");
@@ -127,13 +96,44 @@
return Status::OK();
}
+Status FromRecordBatchToBlockConverter::convert(vectorized::Block* block) {
+ DCHECK(block);
+ int num_fields = _batch->num_columns();
+ if ((size_t)num_fields != _types.size()) {
+ return Status::InvalidArgument("number fields not match");
+ }
+
+ int64_t num_rows = _batch->num_rows();
+ _columns.reserve(num_fields);
+
+ for (int idx = 0; idx < num_fields; ++idx) {
+ auto doris_type = _types[idx];
+ auto doris_column = doris_type->create_column();
+ auto arrow_column = _batch->column(idx);
+ DCHECK_EQ(arrow_column->length(), num_rows);
+ RETURN_IF_ERROR(doris_type->get_serde()->read_column_from_arrow(
+ *doris_column, &*arrow_column, 0, num_rows, _timezone_obj));
+ _columns.emplace_back(std::move(doris_column), std::move(doris_type), std::to_string(idx));
+ }
+
+ block->swap(_columns);
+ return Status::OK();
+}
+
Status convert_to_arrow_batch(const vectorized::Block& block,
const std::shared_ptr<arrow::Schema>& schema, arrow::MemoryPool* pool,
std::shared_ptr<arrow::RecordBatch>* result,
const cctz::time_zone& timezone_obj) {
- FromBlockConverter converter(block, schema, pool, timezone_obj);
+ FromBlockToRecordBatchConverter converter(block, schema, pool, timezone_obj);
return converter.convert(result);
}
+Status convert_from_arrow_batch(const std::shared_ptr<arrow::RecordBatch>& batch,
+ const vectorized::DataTypes& types, vectorized::Block* block,
+ const cctz::time_zone& timezone_obj) {
+ FromRecordBatchToBlockConverter converter(batch, types, timezone_obj);
+ return converter.convert(block);
+}
+
#include "common/compile_check_end.h"
} // namespace doris
diff --git a/be/src/util/arrow/block_convertor.h b/be/src/util/arrow/block_convertor.h
index 6c3163b..7cda9c3 100644
--- a/be/src/util/arrow/block_convertor.h
+++ b/be/src/util/arrow/block_convertor.h
@@ -22,7 +22,9 @@
#include <memory>
#include "common/status.h"
+#include "vec/columns/column.h"
#include "vec/core/block.h"
+#include "vec/data_types/data_type.h"
// This file will convert Doris Block to/from Arrow's RecordBatch
// Block is used by Doris query engine to exchange data between
@@ -38,9 +40,63 @@
namespace doris {
+class FromBlockToRecordBatchConverter {
+public:
+ FromBlockToRecordBatchConverter(const vectorized::Block& block,
+ const std::shared_ptr<arrow::Schema>& schema,
+ arrow::MemoryPool* pool, const cctz::time_zone& timezone_obj)
+ : _block(block),
+ _schema(schema),
+ _pool(pool),
+ _cur_field_idx(-1),
+ _timezone_obj(timezone_obj) {}
+
+ ~FromBlockToRecordBatchConverter() = default;
+
+ Status convert(std::shared_ptr<arrow::RecordBatch>* out);
+
+private:
+ const vectorized::Block& _block;
+ const std::shared_ptr<arrow::Schema>& _schema;
+ arrow::MemoryPool* _pool;
+
+ size_t _cur_field_idx;
+ size_t _cur_start;
+ size_t _cur_rows;
+ vectorized::ColumnPtr _cur_col;
+ vectorized::DataTypePtr _cur_type;
+ arrow::ArrayBuilder* _cur_builder = nullptr;
+
+ const cctz::time_zone& _timezone_obj;
+
+ std::vector<std::shared_ptr<arrow::Array>> _arrays;
+};
+
+class FromRecordBatchToBlockConverter {
+public:
+ FromRecordBatchToBlockConverter(const std::shared_ptr<arrow::RecordBatch>& batch,
+ const vectorized::DataTypes& types,
+ const cctz::time_zone& timezone_obj)
+ : _batch(batch), _types(types), _timezone_obj(timezone_obj) {}
+
+ ~FromRecordBatchToBlockConverter() = default;
+
+ Status convert(vectorized::Block* block);
+
+private:
+ const std::shared_ptr<arrow::RecordBatch>& _batch;
+ const vectorized::DataTypes& _types;
+ const cctz::time_zone& _timezone_obj;
+ vectorized::ColumnsWithTypeAndName _columns;
+};
+
Status convert_to_arrow_batch(const vectorized::Block& block,
const std::shared_ptr<arrow::Schema>& schema, arrow::MemoryPool* pool,
std::shared_ptr<arrow::RecordBatch>* result,
const cctz::time_zone& timezone_obj);
+Status convert_from_arrow_batch(const std::shared_ptr<arrow::RecordBatch>& batch,
+ const vectorized::DataTypes& types, vectorized::Block* block,
+ const cctz::time_zone& timezone_obj);
+
} // namespace doris
diff --git a/be/src/util/arrow/utils.h b/be/src/util/arrow/utils.h
index 0a731ba..7794906 100644
--- a/be/src/util/arrow/utils.h
+++ b/be/src/util/arrow/utils.h
@@ -17,8 +17,11 @@
#pragma once
+#include <arrow/result.h>
+
#include <iostream>
+#include "common/compiler_util.h"
#include "common/status.h"
// This files contains some utilities to convert Doris internal
@@ -72,4 +75,33 @@
Status to_doris_status(const arrow::Status& status);
arrow::Status to_arrow_status(const Status& status);
+template <typename T>
+inline void assign_from_result(T& output, const arrow::Result<T>& result) {
+ output = *result;
+}
+
+template <typename T>
+inline void assign_from_result(T& output, arrow::Result<T>&& result) {
+ output = std::move(*result);
+}
+
+template <typename T>
+inline void assign_from_result(T* output, const arrow::Result<T>& result) {
+ *output = *result;
+}
+
+template <typename T>
+inline void assign_from_result(T* output, arrow::Result<T>&& result) {
+ *output = std::move(*result);
+}
+
+#define RETURN_DORIS_STATUS_IF_RESULT_ERROR(output, result_expr) \
+ do { \
+ auto&& _result_ = (result_expr); \
+ if (UNLIKELY(!_result_.ok())) { \
+ return to_doris_status(_result_.status()); \
+ } \
+ assign_from_result(output, std::forward<decltype(_result_)>(_result_)); \
+ } while (0)
+
} // namespace doris
diff --git a/be/src/vec/exprs/vectorized_fn_call.cpp b/be/src/vec/exprs/vectorized_fn_call.cpp
index cc61a69..553cb76 100644
--- a/be/src/vec/exprs/vectorized_fn_call.cpp
+++ b/be/src/vec/exprs/vectorized_fn_call.cpp
@@ -17,6 +17,7 @@
#include "vec/exprs/vectorized_fn_call.h"
+#include <fmt/compile.h>
#include <fmt/format.h>
#include <fmt/ranges.h> // IWYU pragma: keep
#include <gen_cpp/Opcodes_types.h>
@@ -57,6 +58,7 @@
#include "vec/functions/function_agg_state.h"
#include "vec/functions/function_fake.h"
#include "vec/functions/function_java_udf.h"
+#include "vec/functions/function_python_udf.h"
#include "vec/functions/function_rpc.h"
#include "vec/functions/simple_function_factory.h"
#include "vec/utils/util.hpp"
@@ -115,6 +117,17 @@
"Java UDF is not enabled, you can change be config enable_java_support to true "
"and restart be.");
}
+ } else if (_fn.binary_type == TFunctionBinaryType::PYTHON_UDF) {
+ if (config::enable_python_udf_support) {
+ _function = PythonFunctionCall::create(_fn, argument_template, _data_type);
+ LOG(INFO) << fmt::format(
+ "create python function call: {}, runtime version: {}, function code: {}",
+ _fn.name.function_name, _fn.runtime_version, _fn.function_code);
+ } else {
+ return Status::InternalError(
+ "Python UDF is not enabled, you can change be config enable_python_udf_support "
+ "to true and restart be.");
+ }
} else if (_fn.binary_type == TFunctionBinaryType::AGG_STATE) {
DataTypes argument_types;
for (auto column : argument_template) {
diff --git a/be/src/vec/functions/function_python_udf.cpp b/be/src/vec/functions/function_python_udf.cpp
new file mode 100644
index 0000000..63c0c1d
--- /dev/null
+++ b/be/src/vec/functions/function_python_udf.cpp
@@ -0,0 +1,178 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "vec/functions/function_python_udf.h"
+
+#include <arrow/record_batch.h>
+#include <arrow/type_fwd.h>
+#include <fmt/core.h>
+#include <glog/logging.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <ctime>
+#include <memory>
+
+#include "common/status.h"
+#include "runtime/user_function_cache.h"
+#include "udf/python/python_udf_meta.h"
+#include "udf/python/python_udf_server.h"
+#include "util/arrow/block_convertor.h"
+#include "util/arrow/row_batch.h"
+#include "util/timezone_utils.h"
+#include "vec/core/block.h"
+#include "vec/exec/jni_connector.h"
+
+namespace doris::vectorized {
+
+PythonFunctionCall::PythonFunctionCall(const TFunction& fn, const DataTypes& argument_types,
+ const DataTypePtr& return_type)
+ : _fn(fn), _argument_types(argument_types), _return_type(return_type) {}
+
+Status PythonFunctionCall::open(FunctionContext* context,
+ FunctionContext::FunctionStateScope scope) {
+ if (scope == FunctionContext::FunctionStateScope::FRAGMENT_LOCAL) {
+ LOG(INFO) << "Open python UDF fragment local";
+ return Status::OK();
+ }
+
+ PythonVersion version;
+ PythonUDFMeta func_meta;
+ func_meta._id = _fn.id;
+ func_meta._name = _fn.name.function_name;
+ func_meta._symbol = _fn.scalar_fn.symbol;
+ if (!_fn.function_code.empty()) {
+ func_meta._type = PythonUDFLoadType::INLINE;
+ func_meta._location = "inline";
+ func_meta._inline_code = _fn.function_code;
+ } else if (!_fn.hdfs_location.empty()) {
+ func_meta._type = PythonUDFLoadType::MODULE;
+ func_meta._location = _fn.hdfs_location;
+ func_meta._checksum = _fn.checksum;
+ } else {
+ func_meta._type = PythonUDFLoadType::UNKNOWN;
+ func_meta._location = "unknown";
+ }
+
+ func_meta._input_types = _argument_types;
+ func_meta._return_type = _return_type;
+
+ if (_fn.__isset.runtime_version && !_fn.runtime_version.empty()) {
+ RETURN_IF_ERROR(
+ PythonVersionManager::instance().get_version(_fn.runtime_version, &version));
+ } else {
+ return Status::InvalidArgument("Python UDF runtime version is not set");
+ }
+
+ func_meta._runtime_version = version.full_version;
+ RETURN_IF_ERROR(func_meta.check());
+ func_meta._always_nullable = _return_type->is_nullable();
+ LOG(INFO) << fmt::format("runtime_version: {}, func_meta: {}", version.to_string(),
+ func_meta.to_string());
+
+ if (func_meta._type == PythonUDFLoadType::MODULE) {
+ RETURN_IF_ERROR(UserFunctionCache::instance()->get_pypath(
+ func_meta._id, func_meta._location, func_meta._checksum, &func_meta._location));
+ }
+
+ PythonUDFClientPtr client = nullptr;
+ RETURN_IF_ERROR(PythonUDFServerManager::instance().get_client(func_meta, version, &client));
+
+ if (!client) {
+ return Status::InternalError("Python UDF client is null");
+ }
+
+ context->set_function_state(FunctionContext::THREAD_LOCAL, client);
+ LOG(INFO) << fmt::format("Successfully get python UDF client, process: {}",
+ client->print_process());
+ return Status::OK();
+}
+
+Status PythonFunctionCall::execute_impl(FunctionContext* context, Block& block,
+ const ColumnNumbers& arguments, uint32_t result,
+ size_t num_rows) const {
+ auto client = reinterpret_cast<PythonUDFClient*>(
+ context->get_function_state(FunctionContext::THREAD_LOCAL));
+ if (!client) {
+ LOG(WARNING) << "Python UDF client is null";
+ return Status::InternalError("Python UDF client is null");
+ }
+
+ int64_t input_rows = block.rows();
+ uint32_t input_columns = block.columns();
+ DCHECK(input_columns > 0 && result < input_columns &&
+ _argument_types.size() == arguments.size());
+ vectorized::Block input_block;
+ vectorized::Block output_block;
+
+ if (!_return_type->equals(*block.get_by_position(result).type)) {
+ return Status::InternalError(fmt::format("Python UDF output type {} not equal to {}",
+ block.get_by_position(result).type->get_name(),
+ _return_type->get_name()));
+ }
+
+ for (uint32_t i = 0; i < arguments.size(); ++i) {
+ if (!_argument_types[i]->equals(*block.get_by_position(arguments[i]).type)) {
+ return Status::InternalError(
+ fmt::format("Python UDF input type {} not equal to {}",
+ block.get_by_position(arguments[i]).type->get_name(),
+ _argument_types[i]->get_name()));
+ }
+ input_block.insert(block.get_by_position(arguments[i]));
+ }
+
+ std::shared_ptr<arrow::Schema> schema;
+ RETURN_IF_ERROR(
+ get_arrow_schema_from_block(input_block, &schema, TimezoneUtils::default_time_zone));
+ std::shared_ptr<arrow::RecordBatch> input_batch;
+ std::shared_ptr<arrow::RecordBatch> output_batch;
+    cctz::time_zone timezone_obj; // default time zone (UTC)
+    RETURN_IF_ERROR(convert_to_arrow_batch(input_block, schema, arrow::default_memory_pool(),
+                                           &input_batch, timezone_obj));
+ RETURN_IF_ERROR(client->evaluate(*input_batch, &output_batch));
+ int64_t output_rows = output_batch->num_rows();
+
+ if (output_batch->num_columns() != 1) {
+ return Status::InternalError(fmt::format("Python UDF output columns {} not equal to 1",
+ output_batch->num_columns()));
+ }
+
+ if (input_rows != output_rows) {
+ return Status::InternalError(fmt::format(
+ "Python UDF output rows {} not equal to input rows {}", output_rows, input_rows));
+ }
+
+    RETURN_IF_ERROR(
+            convert_from_arrow_batch(output_batch, {_return_type}, &output_block, timezone_obj));
+ DCHECK_EQ(output_block.columns(), 1);
+ block.replace_by_position(result, std::move(output_block.get_by_position(0).column));
+ return Status::OK();
+}
+
+Status PythonFunctionCall::close(FunctionContext* context,
+ FunctionContext::FunctionStateScope scope) {
+ auto client = reinterpret_cast<PythonUDFClient*>(
+ context->get_function_state(FunctionContext::THREAD_LOCAL));
+ if (!client) {
+ LOG(WARNING) << "Python UDF client is null";
+ return Status::InternalError("Python UDF client is null");
+ }
+ RETURN_IF_ERROR(client->close());
+ return Status::OK();
+}
+
+} // namespace doris::vectorized
diff --git a/be/src/vec/functions/function_python_udf.h b/be/src/vec/functions/function_python_udf.h
new file mode 100644
index 0000000..e13bf49
--- /dev/null
+++ b/be/src/vec/functions/function_python_udf.h
@@ -0,0 +1,109 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <gen_cpp/Types_types.h>
+
+#include <functional>
+#include <memory>
+
+#include "common/status.h"
+#include "udf/udf.h"
+#include "vec/core/block.h"
+#include "vec/core/column_numbers.h"
+#include "vec/core/columns_with_type_and_name.h"
+#include "vec/core/types.h"
+#include "vec/data_types/data_type.h"
+#include "vec/functions/function.h"
+
+namespace doris::vectorized {
+
+class PythonUDFPreparedFunction : public PreparedFunctionImpl {
+public:
+ using execute_call_back = std::function<Status(FunctionContext* context, Block& block,
+ const ColumnNumbers& arguments, uint32_t result,
+ size_t input_rows_count)>;
+
+ explicit PythonUDFPreparedFunction(const execute_call_back& func, const std::string& name)
+ : callback_function(func), name(name) {}
+
+ String get_name() const override { return name; }
+
+protected:
+ Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
+ uint32_t result, size_t input_rows_count) const override {
+ return callback_function(context, block, arguments, result, input_rows_count);
+ }
+
+ bool use_default_implementation_for_nulls() const override { return false; }
+
+private:
+ execute_call_back callback_function;
+ std::string name;
+};
+
+class PythonFunctionCall : public IFunctionBase {
+public:
+ PythonFunctionCall(const TFunction& fn, const DataTypes& argument_types,
+ const DataTypePtr& return_type);
+
+ static FunctionBasePtr create(const TFunction& fn, const ColumnsWithTypeAndName& argument_types,
+ const DataTypePtr& return_type) {
+ DataTypes data_types(argument_types.size());
+ for (size_t i = 0; i < argument_types.size(); ++i) {
+ data_types[i] = argument_types[i].type;
+ }
+ return std::make_shared<PythonFunctionCall>(fn, data_types, return_type);
+ }
+
+ /// Get the main function name.
+ String get_name() const override { return _fn.name.function_name; }
+
+ const DataTypes& get_argument_types() const override { return _argument_types; }
+ const DataTypePtr& get_return_type() const override { return _return_type; }
+
+ PreparedFunctionPtr prepare(FunctionContext* context, const Block& sample_block,
+ const ColumnNumbers& arguments, uint32_t result) const override {
+ return std::make_shared<PythonUDFPreparedFunction>(
+ [this](auto&& PH1, auto&& PH2, auto&& PH3, auto&& PH4, auto&& PH5) {
+ return PythonFunctionCall::execute_impl(
+ std::forward<decltype(PH1)>(PH1), std::forward<decltype(PH2)>(PH2),
+ std::forward<decltype(PH3)>(PH3), std::forward<decltype(PH4)>(PH4),
+ std::forward<decltype(PH5)>(PH5));
+ },
+ _fn.name.function_name);
+ }
+
+ Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override;
+
+ Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
+ uint32_t result, size_t input_rows_count) const;
+
+ Status close(FunctionContext* context, FunctionContext::FunctionStateScope scope) override;
+
+ bool is_use_default_implementation_for_constants() const override { return true; }
+
+ bool is_udf_function() const override { return true; }
+
+private:
+ const TFunction& _fn;
+ const DataTypes _argument_types;
+ const DataTypePtr _return_type {nullptr};
+};
+
+} // namespace doris::vectorized
diff --git a/be/test/vec/data_types/serde/data_type_serde_arrow_test.cpp b/be/test/vec/data_types/serde/data_type_serde_arrow_test.cpp
index 24886fa..a0600d6 100644
--- a/be/test/vec/data_types/serde/data_type_serde_arrow_test.cpp
+++ b/be/test/vec/data_types/serde/data_type_serde_arrow_test.cpp
@@ -23,6 +23,7 @@
#include <arrow/record_batch.h>
#include <arrow/status.h>
#include <arrow/type.h>
+#include <arrow/type_fwd.h>
#include <arrow/util/decimal.h>
#include <arrow/visit_type_inline.h>
#include <arrow/visitor.h>
@@ -42,6 +43,7 @@
#include <vector>
#include "olap/hll.h"
+#include "runtime/define_primitive_type.h"
#include "runtime/descriptors.cpp"
#include "util/arrow/block_convertor.h"
#include "util/arrow/row_batch.h"
@@ -77,8 +79,8 @@
namespace doris::vectorized {
-void serialize_and_deserialize_arrow_test(std::vector<PrimitiveType> cols, int row_num,
- bool is_nullable) {
+std::shared_ptr<Block> create_test_block(std::vector<PrimitiveType> cols, int row_num,
+ bool is_nullable) {
auto block = std::make_shared<Block>();
for (int i = 0; i < cols.size(); i++) {
std::string col_name = std::to_string(i);
@@ -398,6 +400,12 @@
LOG(FATAL) << "error column type";
}
}
+ return block;
+}
+
+void serialize_and_deserialize_arrow_test(std::vector<PrimitiveType> cols, int row_num,
+ bool is_nullable) {
+ std::shared_ptr<Block> block = create_test_block(cols, row_num, is_nullable);
std::shared_ptr<arrow::RecordBatch> record_batch =
CommonDataTypeSerdeTest::serialize_arrow(block);
auto assert_block = std::make_shared<Block>(block->clone_empty());
@@ -405,6 +413,25 @@
CommonDataTypeSerdeTest::compare_two_blocks(block, assert_block);
}
+void block_converter_test(std::vector<PrimitiveType> cols, int row_num, bool is_nullable) {
+ std::shared_ptr<Block> source_block = create_test_block(cols, row_num, is_nullable);
+ std::shared_ptr<arrow::RecordBatch> record_batch;
+ std::shared_ptr<arrow::Schema> schema;
+ Status status = Status::OK();
+ status = get_arrow_schema_from_block(*source_block, &schema, TimezoneUtils::default_time_zone);
+ ASSERT_TRUE(status.ok() && schema);
+    cctz::time_zone default_timezone; // default UTC
+ status = convert_to_arrow_batch(*source_block, schema, arrow::default_memory_pool(),
+ &record_batch, default_timezone);
+ ASSERT_TRUE(status.ok() && record_batch);
+ auto target_block = std::make_shared<Block>(source_block->clone_empty());
+ DataTypes source_data_types = source_block->get_data_types();
+ status = convert_from_arrow_batch(record_batch, source_data_types, &*target_block,
+ default_timezone);
+ ASSERT_TRUE(status.ok() && target_block);
+ CommonDataTypeSerdeTest::compare_two_blocks(source_block, target_block);
+}
+
TEST(DataTypeSerDeArrowTest, DataTypeScalaSerDeTest) {
std::vector<PrimitiveType> cols = {
TYPE_INT, TYPE_INT, TYPE_STRING, TYPE_DECIMAL128I, TYPE_BOOLEAN,
@@ -486,4 +513,14 @@
CommonDataTypeSerdeTest::compare_two_blocks(block, assert_block);
}
+TEST(DataTypeSerDeArrowTest, BlockConverterTest) {
+ std::vector<PrimitiveType> cols = {
+ TYPE_INT, TYPE_INT, TYPE_STRING, TYPE_DECIMAL128I, TYPE_BOOLEAN,
+ TYPE_DECIMAL32, TYPE_DECIMAL64, TYPE_IPV4, TYPE_IPV6, TYPE_DATETIME,
+ TYPE_DATETIMEV2, TYPE_DATE, TYPE_DATEV2,
+ };
+ block_converter_test(cols, 7, true);
+ block_converter_test(cols, 7, false);
+}
+
} // namespace doris::vectorized
diff --git a/build.sh b/build.sh
index 048fa48..516fc2a 100755
--- a/build.sh
+++ b/build.sh
@@ -937,9 +937,11 @@
mkdir -p "${DORIS_OUTPUT}/be/storage"
mkdir -p "${DORIS_OUTPUT}/be/plugins/jdbc_drivers/"
mkdir -p "${DORIS_OUTPUT}/be/plugins/java_udf/"
+ mkdir -p "${DORIS_OUTPUT}/be/plugins/python_udf/"
mkdir -p "${DORIS_OUTPUT}/be/plugins/connectors/"
mkdir -p "${DORIS_OUTPUT}/be/plugins/hadoop_conf/"
mkdir -p "${DORIS_OUTPUT}/be/plugins/java_extensions/"
+ cp -r -p "${DORIS_HOME}/be/src/udf/python/python_udf_server.py" "${DORIS_OUTPUT}/be/plugins/python_udf/"
fi
if [[ "${BUILD_BROKER}" -eq 1 ]]; then
diff --git a/conf/be.conf b/conf/be.conf
index d7e815e..7e35191 100644
--- a/conf/be.conf
+++ b/conf/be.conf
@@ -94,4 +94,4 @@
# Error = 4
azure_log_level = 4
## If you are not running in aws cloud, you can disable EC2 metadata
-AWS_EC2_METADATA_DISABLED=true
+AWS_EC2_METADATA_DISABLED=true
\ No newline at end of file
diff --git a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
index d1b95e4..798e800 100644
--- a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
+++ b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
@@ -2832,6 +2832,14 @@
public static boolean enable_udf_in_load = false;
@ConfField(description = {
+ "开启python_udf, 默认为false。如果该配置为false,则禁止创建和使用python_udf。在一些场景下关闭该配置可防止命令注入攻击。",
+ "Used to enable python_udf, default is true. if this configuration is false, creation and use of python_udf is "
+ + "disabled. in some scenarios it may be necessary to disable this configuration to prevent "
+ + "command injection attacks."
+ })
+ public static boolean enable_python_udf = false;
+
+ @ConfField(description = {
"是否忽略 Image 文件中未知的模块。如果为 true,不在 PersistMetaModules.MODULE_NAMES 中的元数据模块将被忽略并跳过。"
+ "默认为 false,如果 Image 文件中包含未知的模块,Doris 将会抛出异常。"
+ "该参数主要用于降级操作中,老版本可以兼容新版本的 Image 文件。",
diff --git a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4
index fa9f119..35cb4b0 100644
--- a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4
+++ b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4
@@ -56,6 +56,7 @@
RIGHT_BRACKET: ']';
LEFT_BRACE: '{';
RIGHT_BRACE: '}';
+DOLLAR_QUOTED_STRING: '$$' ( ~'$' | '$' ~'$' )* '$$';
// TODO: add a doc to list reserved words
diff --git a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4
index 8a485a9..c33d5d2 100644
--- a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4
+++ b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4
@@ -215,7 +215,8 @@
(TABLES | AGGREGATE)? FUNCTION (IF NOT EXISTS)?
functionIdentifier LEFT_PAREN functionArguments? RIGHT_PAREN
RETURNS returnType=dataType (INTERMEDIATE intermediateType=dataType)?
- properties=propertyClause? #createUserDefineFunction
+ properties=propertyClause?
+ (AS functionCode=dollarQuotedString)? #createUserDefineFunction
| CREATE statementScope? ALIAS FUNCTION (IF NOT EXISTS)?
functionIdentifier LEFT_PAREN functionArguments? RIGHT_PAREN
WITH PARAMETER LEFT_PAREN parameters=identifierSeq? RIGHT_PAREN
@@ -1867,6 +1868,10 @@
| SUBTRACT? (EXPONENT_VALUE | DECIMAL_VALUE) #decimalLiteral
;
+dollarQuotedString
+ : DOLLAR_QUOTED_STRING
+ ;
+
// there are 1 kinds of keywords in Doris.
// - Non-reserved keywords:
// normal version of non-reserved keywords.
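
Together, the new lexer token and the optional `AS functionCode=dollarQuotedString` clause let `CREATE FUNCTION` carry inline Python source between `$$` delimiters; note the token definition `'$$' ( ~'$' | '$' ~'$' )* '$$'` means the body itself must not contain `$$`. A hedged sketch of the resulting DDL — `symbol` and `runtime_version` are the property names referenced later in `CreateFunctionCommand`, while `"type" = "PYTHON_UDF"` is an assumption based on the new thrift enum value, not something this diff confirms:

```sql
CREATE FUNCTION py_add_one(INT) RETURNS INT PROPERTIES (
    "type" = "PYTHON_UDF",          -- assumed property value (thrift: PYTHON_UDF)
    "symbol" = "add_one",           -- entry function inside the $$ body
    "runtime_version" = "3.10.2"    -- validated by CreateFunctionCommand
) AS $$
def add_one(x):
    return None if x is None else x + 1
$$;
```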
diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Function.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/Function.java
index 4d7d8f9..71a0a1d 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Function.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Function.java
@@ -155,6 +155,10 @@
protected boolean isStaticLoad = false;
@SerializedName("eT")
protected long expirationTime = 360; // default 6 hours;
+ @SerializedName("rv")
+ protected String runtimeVersion;
+ @SerializedName("fc")
+ protected String functionCode;
// Only used for serialization
protected Function() {
@@ -332,6 +336,22 @@
isGlobal = global;
}
+ public String getRuntimeVersion() {
+ return runtimeVersion;
+ }
+
+ public void setRuntimeVersion(String runtimeVersion) {
+ this.runtimeVersion = runtimeVersion;
+ }
+
+ public String getFunctionCode() {
+ return functionCode;
+ }
+
+ public void setFunctionCode(String functionCode) {
+ this.functionCode = functionCode;
+ }
+
// TODO(cmy): Currently we judge whether it is UDF by wheter the 'location' is set.
// Maybe we should use a separate variable to identify,
// but additional variables need to modify the persistence information.
diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionUtil.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionUtil.java
index fb6444e..edfc580 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionUtil.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionUtil.java
@@ -25,7 +25,9 @@
import org.apache.doris.nereids.trees.expressions.functions.udf.JavaUdaf;
import org.apache.doris.nereids.trees.expressions.functions.udf.JavaUdf;
import org.apache.doris.nereids.trees.expressions.functions.udf.JavaUdtf;
+import org.apache.doris.nereids.trees.expressions.functions.udf.PythonUdf;
import org.apache.doris.nereids.types.DataType;
+import org.apache.doris.thrift.TFunctionBinaryType;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
@@ -182,7 +184,11 @@
if (function.isUDTFunction()) {
JavaUdtf.translateToNereidsFunction(dbName, ((ScalarFunction) function));
} else {
- JavaUdf.translateToNereidsFunction(dbName, ((ScalarFunction) function));
+ if (function.getBinaryType() == TFunctionBinaryType.JAVA_UDF) {
+ JavaUdf.translateToNereidsFunction(dbName, ((ScalarFunction) function));
+ } else if (function.getBinaryType() == TFunctionBinaryType.PYTHON_UDF) {
+ PythonUdf.translateToNereidsFunction(dbName, (ScalarFunction) function);
+ }
}
} else if (function instanceof AggregateFunction) {
JavaUdaf.translateToNereidsFunction(dbName, ((AggregateFunction) function));
@@ -213,4 +219,9 @@
}
}
+ public static void checkEnablePythonUdf() throws AnalysisException {
+ if (!Config.enable_python_udf) {
+ throw new AnalysisException("python_udf has been disabled.");
+ }
+ }
}
diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/ScalarFunction.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/ScalarFunction.java
index c0a3f05..cba3c83 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/ScalarFunction.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/ScalarFunction.java
@@ -25,6 +25,7 @@
import org.apache.doris.thrift.TFunctionBinaryType;
import org.apache.doris.thrift.TScalarFunction;
+import com.google.common.base.Strings;
import com.google.common.collect.Maps;
import com.google.gson.Gson;
import com.google.gson.annotations.SerializedName;
@@ -253,11 +254,18 @@
public TFunction toThrift(Type realReturnType, Type[] realArgTypes, Boolean[] realArgTypeNullables) {
TFunction fn = super.toThrift(realReturnType, realArgTypes, realArgTypeNullables);
fn.setScalarFn(new TScalarFunction());
- if (getBinaryType() == TFunctionBinaryType.JAVA_UDF || getBinaryType() == TFunctionBinaryType.RPC) {
+ if (getBinaryType() == TFunctionBinaryType.JAVA_UDF || getBinaryType() == TFunctionBinaryType.RPC
+ || getBinaryType() == TFunctionBinaryType.PYTHON_UDF) {
fn.getScalarFn().setSymbol(symbolName);
} else {
fn.getScalarFn().setSymbol("");
}
+ if (getBinaryType() == TFunctionBinaryType.PYTHON_UDF) {
+ if (!Strings.isNullOrEmpty(functionCode)) {
+ fn.setFunctionCode(functionCode);
+ }
+ fn.setRuntimeVersion(runtimeVersion);
+ }
if (dictFunction != null) {
fn.setDictFunction(dictFunction);
}
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/ExpressionTranslator.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/ExpressionTranslator.java
index a6cb178..a3a9cd4 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/ExpressionTranslator.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/ExpressionTranslator.java
@@ -102,6 +102,7 @@
import org.apache.doris.nereids.trees.expressions.functions.udf.JavaUdaf;
import org.apache.doris.nereids.trees.expressions.functions.udf.JavaUdf;
import org.apache.doris.nereids.trees.expressions.functions.udf.JavaUdtf;
+import org.apache.doris.nereids.trees.expressions.functions.udf.PythonUdf;
import org.apache.doris.nereids.trees.expressions.functions.window.WindowFunction;
import org.apache.doris.nereids.trees.expressions.literal.Literal;
import org.apache.doris.nereids.trees.expressions.literal.NullLiteral;
@@ -875,6 +876,16 @@
return functionCallExpr;
}
+ @Override
+ public Expr visitPythonUdf(PythonUdf udf, PlanTranslatorContext context) {
+ FunctionParams exprs = new FunctionParams(udf.children().stream()
+ .map(expression -> expression.accept(this, context))
+ .collect(Collectors.toList()));
+ FunctionCallExpr functionCallExpr = new FunctionCallExpr(udf.getCatalogFunction(), exprs);
+ functionCallExpr.setNullableFromNereids(udf.nullable());
+ return functionCallExpr;
+ }
+
// TODO: Supports for `distinct`
private Expr translateAggregateFunction(AggregateFunction function,
List<Expression> currentPhaseArguments, List<Expr> aggFnArguments,
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java
index 5c0a4ca..dc0bea1 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java
@@ -5034,9 +5034,10 @@
Map<String, String> properties = ctx.propertyClause() != null
? Maps.newHashMap(visitPropertyClause(ctx.propertyClause()))
: Maps.newHashMap();
+ String functionCode = ctx.dollarQuotedString() != null ? ctx.dollarQuotedString().getText() : "";
return new CreateFunctionCommand(statementScope, ifNotExists, isAggFunction, false, isTableFunction,
function, functionArgTypesInfo, returnType, intermediateType,
- null, null, properties);
+ null, null, properties, functionCode);
}
@Override
@@ -5054,7 +5055,7 @@
Expression originFunction = getExpression(ctx.expression());
return new CreateFunctionCommand(statementScope, ifNotExists, false, true, false,
function, functionArgTypesInfo, VarcharType.MAX_VARCHAR_TYPE, null,
- parameters, originFunction, null);
+ parameters, originFunction, null, null);
}
@Override
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdf.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdf.java
new file mode 100644
index 0000000..464d614
--- /dev/null
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdf.java
@@ -0,0 +1,186 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.nereids.trees.expressions.functions.udf;
+
+import org.apache.doris.analysis.FunctionName;
+import org.apache.doris.catalog.Env;
+import org.apache.doris.catalog.Function;
+import org.apache.doris.catalog.Function.NullableMode;
+import org.apache.doris.catalog.FunctionSignature;
+import org.apache.doris.catalog.Type;
+import org.apache.doris.common.util.URI;
+import org.apache.doris.nereids.exceptions.AnalysisException;
+import org.apache.doris.nereids.trees.expressions.Expression;
+import org.apache.doris.nereids.trees.expressions.VirtualSlotReference;
+import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
+import org.apache.doris.nereids.trees.expressions.functions.Udf;
+import org.apache.doris.nereids.trees.expressions.functions.scalar.ScalarFunction;
+import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
+import org.apache.doris.nereids.types.DataType;
+import org.apache.doris.thrift.TFunctionBinaryType;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableList;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.Optional;
+import java.util.stream.Collectors;
+
+/**
+ * Python UDF for Nereids
+ */
+public class PythonUdf extends ScalarFunction implements ExplicitlyCastableSignature, Udf {
+ private final String dbName;
+ private final long functionId;
+ private final TFunctionBinaryType binaryType;
+ private final FunctionSignature signature;
+ private final NullableMode nullableMode;
+ private final String objectFile;
+ private final String symbol;
+ private final String prepareFn;
+ private final String closeFn;
+ private final String checkSum;
+ private final boolean isStaticLoad;
+ private final long expirationTime;
+ private final String runtimeVersion;
+ private final String functionCode;
+
+ /**
+ * Constructor of UDF
+ */
+ public PythonUdf(String name, long functionId, String dbName, TFunctionBinaryType binaryType,
+ FunctionSignature signature,
+ NullableMode nullableMode, String objectFile, String symbol, String prepareFn, String closeFn,
+ String checkSum, boolean isStaticLoad, long expirationTime,
+ String runtimeVersion, String functionCode, Expression... args) {
+ super(name, args);
+ this.dbName = dbName;
+ this.functionId = functionId;
+ this.binaryType = binaryType;
+ this.signature = signature;
+ this.nullableMode = nullableMode;
+ this.objectFile = objectFile;
+ this.symbol = symbol;
+ this.prepareFn = prepareFn;
+ this.closeFn = closeFn;
+ this.checkSum = checkSum;
+ this.isStaticLoad = isStaticLoad;
+ this.expirationTime = expirationTime;
+ this.runtimeVersion = runtimeVersion;
+ this.functionCode = functionCode;
+ }
+
+ @Override
+ public List<FunctionSignature> getSignatures() {
+ return ImmutableList.of(signature);
+ }
+
+ @Override
+ public boolean hasVarArguments() {
+ return signature.hasVarArgs;
+ }
+
+ @Override
+ public int arity() {
+ return signature.argumentsTypes.size();
+ }
+
+ @Override
+ public NullableMode getNullableMode() {
+ return nullableMode;
+ }
+
+ /**
+ * withChildren.
+ */
+ @Override
+ public PythonUdf withChildren(List<Expression> children) {
+ Preconditions.checkArgument(children.size() == this.children.size());
+ return new PythonUdf(getName(), functionId, dbName, binaryType, signature, nullableMode,
+ objectFile, symbol, prepareFn, closeFn, checkSum, isStaticLoad, expirationTime,
+ runtimeVersion, functionCode, children.toArray(new Expression[0]));
+ }
+
+ /**
+     * translate catalog python udf to nereids python udf
+ */
+ public static void translateToNereidsFunction(String dbName, org.apache.doris.catalog.ScalarFunction scalar) {
+ String fnName = scalar.functionName();
+ DataType retType = DataType.fromCatalogType(scalar.getReturnType());
+ List<DataType> argTypes = Arrays.stream(scalar.getArgs())
+ .map(DataType::fromCatalogType)
+ .collect(Collectors.toList());
+
+ FunctionSignature.FuncSigBuilder sigBuilder = FunctionSignature.ret(retType);
+ FunctionSignature sig = scalar.hasVarArgs()
+ ? sigBuilder.varArgs(argTypes.toArray(new DataType[0]))
+ : sigBuilder.args(argTypes.toArray(new DataType[0]));
+
+ VirtualSlotReference[] virtualSlots = argTypes.stream()
+ .map(type -> new VirtualSlotReference(type.toString(), type, Optional.empty(),
+ (shape) -> ImmutableList.of()))
+ .toArray(VirtualSlotReference[]::new);
+
+ PythonUdf udf = new PythonUdf(fnName, scalar.getId(), dbName, scalar.getBinaryType(), sig,
+ scalar.getNullableMode(),
+ scalar.getLocation() == null ? null : scalar.getLocation().getLocation(),
+ scalar.getSymbolName(),
+ scalar.getPrepareFnSymbol(),
+ scalar.getCloseFnSymbol(),
+ scalar.getChecksum(), scalar.isStaticLoad(), scalar.getExpirationTime(),
+ scalar.getRuntimeVersion(),
+ scalar.getFunctionCode(),
+ virtualSlots);
+
+ PythonUdfBuilder builder = new PythonUdfBuilder(udf);
+ Env.getCurrentEnv().getFunctionRegistry().addUdf(dbName, fnName, builder);
+ }
+
+ @Override
+ public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
+ return visitor.visitPythonUdf(this, context);
+ }
+
+ @Override
+ public Function getCatalogFunction() {
+ try {
+ org.apache.doris.catalog.ScalarFunction expr = org.apache.doris.catalog.ScalarFunction.createUdf(
+ binaryType,
+ new FunctionName(dbName, getName()),
+ signature.argumentsTypes.stream().map(DataType::toCatalogDataType).toArray(Type[]::new),
+ signature.returnType.toCatalogDataType(),
+ signature.hasVarArgs,
+ objectFile == null ? null : URI.create(objectFile),
+ symbol,
+ prepareFn,
+ closeFn
+ );
+ expr.setNullableMode(nullableMode);
+ expr.setChecksum(checkSum);
+ expr.setId(functionId);
+ expr.setStaticLoad(isStaticLoad);
+ expr.setExpirationTime(expirationTime);
+ expr.setRuntimeVersion(runtimeVersion);
+ expr.setFunctionCode(functionCode);
+ return expr;
+ } catch (Exception e) {
+ throw new AnalysisException(e.getMessage(), e.getCause());
+ }
+ }
+}
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdfBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdfBuilder.java
new file mode 100644
index 0000000..7185594
--- /dev/null
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdfBuilder.java
@@ -0,0 +1,108 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.nereids.trees.expressions.functions.udf;
+
+import org.apache.doris.catalog.FunctionSignature;
+import org.apache.doris.common.Pair;
+import org.apache.doris.common.util.ReflectionUtils;
+import org.apache.doris.nereids.trees.expressions.Expression;
+import org.apache.doris.nereids.trees.expressions.functions.BoundFunction;
+import org.apache.doris.nereids.types.DataType;
+import org.apache.doris.nereids.util.TypeCoercionUtils;
+
+import com.google.common.base.Suppliers;
+import com.google.common.collect.Lists;
+
+import java.util.List;
+import java.util.Optional;
+import java.util.stream.Collectors;
+
+/**
+ * function builder for python udf
+ */
+public class PythonUdfBuilder extends UdfBuilder {
+ private final PythonUdf udf;
+ private final int arity;
+ private final boolean isVarArgs;
+
+ public PythonUdfBuilder(PythonUdf udf) {
+ this.udf = udf;
+ this.isVarArgs = udf.hasVarArguments();
+ this.arity = udf.arity();
+ }
+
+ @Override
+ public List<DataType> getArgTypes() {
+ return Suppliers.memoize(() -> udf.getSignatures().get(0).argumentsTypes.stream()
+ .map(DataType.class::cast)
+ .collect(Collectors.toList())).get();
+ }
+
+ @Override
+ public List<FunctionSignature> getSignatures() {
+ return udf.getSignatures();
+ }
+
+ @Override
+ public Class<? extends BoundFunction> functionClass() {
+        return PythonUdf.class;
+ }
+
+ @Override
+ public boolean canApply(List<?> arguments) {
+ if ((isVarArgs && arity > arguments.size() + 1) || (!isVarArgs && arguments.size() != arity)) {
+ return false;
+ }
+ for (Object argument : arguments) {
+ if (!(argument instanceof Expression)) {
+ Optional<Class> primitiveType = ReflectionUtils.getPrimitiveType(argument.getClass());
+ if (!primitiveType.isPresent() || !Expression.class.isAssignableFrom(primitiveType.get())) {
+ return false;
+ }
+ }
+ }
+ return true;
+ }
+
+ @Override
+ public Pair<PythonUdf, PythonUdf> build(String name, List<?> arguments) {
+ List<Expression> exprs = arguments.stream().map(Expression.class::cast).collect(Collectors.toList());
+ List<DataType> argTypes = udf.getSignatures().get(0).argumentsTypes;
+
+ List<Expression> processedExprs = Lists.newArrayList();
+ for (int i = 0; i < exprs.size(); ++i) {
+ processedExprs.add(TypeCoercionUtils.castIfNotSameType(exprs.get(i), argTypes.get(i)));
+ }
+ return Pair.ofSame(udf.withChildren(processedExprs));
+ }
+
+ @Override
+ public String parameterDisplayString() {
+ StringBuilder string = new StringBuilder("(");
+ for (int i = 0; i < udf.getArgumentsTypes().size(); ++i) {
+ if (i > 0) {
+ string.append(", ");
+ }
+ string.append(udf.getArgumentsTypes().get(i));
+ if (isVarArgs && i + 1 == udf.getArgumentsTypes().size()) {
+ string.append("...");
+ }
+ }
+ return string.append(")").toString();
+ }
+}
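
Note that `build` runs every argument through `TypeCoercionUtils.castIfNotSameType` against the declared signature before binding, so call sites need not match the declared types exactly. A small sketch, assuming a hypothetical `py_add_one` declared as `(INT) -> INT`:

```sql
-- The SMALLINT value is implicitly cast to INT by PythonUdfBuilder.build
-- before the UDF is bound.
SELECT py_add_one(CAST(7 AS SMALLINT));
```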
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
index 630fcbe..8285cc9 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
@@ -531,6 +531,7 @@
import org.apache.doris.nereids.trees.expressions.functions.scalar.YearsSub;
import org.apache.doris.nereids.trees.expressions.functions.udf.AliasUdf;
import org.apache.doris.nereids.trees.expressions.functions.udf.JavaUdf;
+import org.apache.doris.nereids.trees.expressions.functions.udf.PythonUdf;
/**
* ScalarFunctionVisitor.
@@ -2460,6 +2461,10 @@
return visitScalarFunction(javaUdf, context);
}
+ default R visitPythonUdf(PythonUdf pythonUdf, C context) {
+ return visitScalarFunction(pythonUdf, context);
+ }
+
default R visitAliasUdf(AliasUdf aliasUdf, C context) {
return visitScalarFunction(aliasUdf, context);
}
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/CreateFunctionCommand.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/CreateFunctionCommand.java
index 317d004..f06a72e 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/CreateFunctionCommand.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/CreateFunctionCommand.java
@@ -91,6 +91,8 @@
import org.apache.commons.codec.binary.Hex;
import org.apache.commons.collections.map.CaseInsensitiveMap;
import org.apache.commons.lang3.StringUtils;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
import java.io.IOException;
import java.io.InputStream;
@@ -109,6 +111,7 @@
import java.util.List;
import java.util.Map;
import java.util.Set;
+import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
@@ -143,6 +146,10 @@
// iff is static load, BE will be cache the udf class load, so only need load once
public static final String IS_STATIC_LOAD = "static_load";
public static final String EXPIRATION_TIME = "expiration_time";
+ public static final String RUNTIME_VERSION = "runtime_version";
+
+ private static final Pattern PYTHON_VERSION_PATTERN = Pattern.compile("^3\\.\\d{1,2}(?:\\.\\d{1,2})?$");
+ private static final Logger LOG = LogManager.getLogger(CreateFunctionCommand.class);
// timeout for both connection and read. 10 seconds is long enough.
private static final int HTTP_TIMEOUT_MS = 10000;
@@ -170,14 +177,16 @@
// if not, will core dump when input is not null column, but need return null
// like https://github.com/apache/doris/pull/14002/files
private NullableMode returnNullMode = NullableMode.ALWAYS_NULLABLE;
+ private String runtimeVersion;
+ private String functionCode;
/**
* CreateFunctionCommand
*/
public CreateFunctionCommand(SetType setType, boolean ifNotExists, boolean isAggregate, boolean isAlias,
- boolean isTableFunction, FunctionName functionName, FunctionArgTypesInfo argsDef,
- DataType returnType, DataType intermediateType, List<String> parameters,
- Expression originFunction, Map<String, String> properties) {
+ boolean isTableFunction, FunctionName functionName, FunctionArgTypesInfo argsDef,
+ DataType returnType, DataType intermediateType, List<String> parameters,
+ Expression originFunction, Map<String, String> properties, String functionCode) {
super(PlanType.CREATE_FUNCTION_COMMAND);
this.setType = setType;
this.ifNotExists = ifNotExists;
@@ -199,6 +208,7 @@
} else {
this.properties = ImmutableSortedMap.copyOf(properties, String.CASE_INSENSITIVE_ORDER);
}
+ this.functionCode = functionCode;
}
@Override
@@ -336,22 +346,48 @@
if (staticLoad != null && staticLoad) {
isStaticLoad = true;
}
- String expirationTimeString = properties.get(EXPIRATION_TIME);
- if (expirationTimeString != null) {
- long timeMinutes = 0;
- try {
- timeMinutes = Long.parseLong(expirationTimeString);
- } catch (NumberFormatException e) {
- throw new AnalysisException(e.getMessage());
- }
- if (timeMinutes <= 0) {
- throw new AnalysisException("expirationTime should greater than zero: ");
- }
- this.expirationTime = timeMinutes;
+ extractExpirationTime();
+ } else if (binaryType == TFunctionBinaryType.PYTHON_UDF) {
+ FunctionUtil.checkEnablePythonUdf();
+
+        // always_nullable defaults to true; a null value means the property was not set and is treated as true
+ Boolean isReturnNull = parseBooleanFromProperties(IS_RETURN_NULL);
+ if (isReturnNull != null && !isReturnNull) {
+ returnNullMode = NullableMode.ALWAYS_NOT_NULLABLE;
}
+ extractExpirationTime();
+ String runtimeVersionString = properties.get(RUNTIME_VERSION);
+ if (runtimeVersionString == null) {
+ throw new AnalysisException("Python runtime version is not set");
+ } else if (!validatePythonRuntimeVersion(runtimeVersionString)) {
+ throw new AnalysisException(
+ String.format("Invalid Python runtime version: '%s'. Expected format:"
+ + "'3.X.X' or '3.XX.XX' (e.g. '3.10.2').", runtimeVersionString));
+ }
+ runtimeVersion = runtimeVersionString;
}
}
+ private void extractExpirationTime() throws AnalysisException {
+ String expirationTimeString = properties.get(EXPIRATION_TIME);
+ if (expirationTimeString != null) {
+ long timeMinutes = 0;
+ try {
+ timeMinutes = Long.parseLong(expirationTimeString);
+ } catch (NumberFormatException e) {
+ throw new AnalysisException(e.getMessage());
+ }
+ if (timeMinutes <= 0) {
+ throw new AnalysisException("expirationTime should greater than zero: ");
+ }
+ this.expirationTime = timeMinutes;
+ }
+ }
+
+ private static boolean validatePythonRuntimeVersion(String runtimeVersionString) {
+ return runtimeVersionString != null && PYTHON_VERSION_PATTERN.matcher(runtimeVersionString).matches();
+ }
+
private Boolean parseBooleanFromProperties(String propertyString) throws AnalysisException {
String valueOfString = properties.get(propertyString);
if (valueOfString == null) {
@@ -519,6 +555,8 @@
checkRPCUdf(symbol);
} else if (binaryType == TFunctionBinaryType.JAVA_UDF) {
analyzeJavaUdf(symbol);
+ } else if (binaryType == TFunctionBinaryType.PYTHON_UDF) {
+ analyzePythonUdf(symbol);
}
URI location;
if (!Strings.isNullOrEmpty(originalUserFile)) {
@@ -534,6 +572,8 @@
function.setNullableMode(returnNullMode);
function.setStaticLoad(isStaticLoad);
function.setExpirationTime(expirationTime);
+ function.setRuntimeVersion(runtimeVersion);
+ function.setFunctionCode(functionCode);
}
private void analyzeJavaUdaf(String clazz) throws AnalysisException {
@@ -714,6 +754,26 @@
}
}
+    private void analyzePythonUdf(String symbol) throws AnalysisException {
+        if (Strings.isNullOrEmpty(symbol)) {
+            throw new AnalysisException("No symbol (entry function name) provided for Python UDF");
+ }
+
+ if (Strings.isNullOrEmpty(this.functionCode)) {
+ return;
+ }
+
+ this.functionCode = this.functionCode.trim();
+ if (!(this.functionCode.startsWith("$$") && this.functionCode.endsWith("$$"))) {
+ throw new AnalysisException("Inline Python UDF code must be start with $$ and end with $$");
+ }
+
+ this.functionCode = this.functionCode.substring(2, this.functionCode.length() - 2);
+ if (this.functionCode.isEmpty()) {
+ throw new AnalysisException("Inline Python UDF is empty");
+ }
+ }
+
private void checkUdfClass(String clazz, ClassLoader cl) throws ClassNotFoundException, AnalysisException {
Class udfClass = cl.loadClass(clazz);
List<Method> evalList = Arrays.stream(udfClass.getMethods())
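
The FE-side validation above requires `runtime_version` to match `^3\.\d{1,2}(?:\.\d{1,2})?$` and strips the `$$` delimiters from inline code only after confirming they are present. Sketches of statements the checks reject (property names as in the earlier example; `"type" = "PYTHON_UDF"` remains an assumption):

```sql
-- Rejected: "2.7" does not match the 3.x version pattern.
CREATE FUNCTION py_bad(INT) RETURNS INT PROPERTIES (
    "type" = "PYTHON_UDF",
    "symbol" = "f",
    "runtime_version" = "2.7"
) AS $$
def f(x):
    return x
$$;

-- Rejected: "Python runtime version is not set".
CREATE FUNCTION py_bad_no_version(INT) RETURNS INT PROPERTIES (
    "type" = "PYTHON_UDF",
    "symbol" = "f"
) AS $$
def f(x):
    return x
$$;
```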
diff --git a/gensrc/thrift/Types.thrift b/gensrc/thrift/Types.thrift
index 34d2220..1594ff9 100644
--- a/gensrc/thrift/Types.thrift
+++ b/gensrc/thrift/Types.thrift
@@ -331,7 +331,9 @@
JAVA_UDF = 5,
- AGG_STATE = 6
+    AGG_STATE = 6,
+    PYTHON_UDF = 7
}
// Represents a fully qualified function name.
@@ -407,6 +409,8 @@
15: optional bool is_static_load = false
16: optional i64 expiration_time //minutes
17: optional TDictFunction dict_function
+ 18: optional string runtime_version
+ 19: optional string function_code
}
enum TJdbcOperation {
diff --git a/regression-test/data/pythonudf_p0/sanity/test_pythonudf_assertequal.out b/regression-test/data/pythonudf_p0/sanity/test_pythonudf_assertequal.out
new file mode 100644
index 0000000..3376296
--- /dev/null
+++ b/regression-test/data/pythonudf_p0/sanity/test_pythonudf_assertequal.out
@@ -0,0 +1,4 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !select --
+23.34 == 23.34
+
diff --git a/regression-test/data/pythonudf_p0/sanity/test_pythonudf_assertlessthan.out b/regression-test/data/pythonudf_p0/sanity/test_pythonudf_assertlessthan.out
new file mode 100644
index 0000000..41cb521
--- /dev/null
+++ b/regression-test/data/pythonudf_p0/sanity/test_pythonudf_assertlessthan.out
@@ -0,0 +1,5 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !select --
+0.123 < 0.124
+23.34 < 23.35
+
diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_aggregate.out b/regression-test/data/pythonudf_p0/test_pythonudf_aggregate.out
new file mode 100644
index 0000000..ac41889
--- /dev/null
+++ b/regression-test/data/pythonudf_p0/test_pythonudf_aggregate.out
@@ -0,0 +1,38 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !select_grades --
+1 Alice English 88 B
+1 Alice Math 95 A
+1 Alice Science 92 A
+2 Bob English 85 B
+2 Bob Math 78 C
+2 Bob Science 80 B
+3 Charlie English 70 C
+3 Charlie Math 65 D
+3 Charlie Science 68 D
+4 David English 60 D
+4 David Math 55 F
+4 David Science 58 F
+
+-- !select_group_by_grade --
+A 2 93.5
+B 3 84.33333333333333
+C 2 74
+D 3 64.33333333333333
+F 2 56.5
+
+-- !select_aggregate_with_udf --
+1 Alice 91.66666666666667 A
+2 Bob 81 B
+3 Charlie 67.66666666666667 D
+4 David 57.66666666666666 F
+
+-- !select_age_group_aggregate --
+Adult 2 85000 90000 80000
+Minor 1 0 0 0
+Senior 2 105000 110000 100000
+Young Adult 3 51666.66666666666 60000 45000
+
+-- !select_having_with_udf --
+1 Alice 91.66666666666667
+2 Bob 81
+
diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_always_nullable.out b/regression-test/data/pythonudf_p0/test_pythonudf_always_nullable.out
new file mode 100644
index 0000000..055bbad
--- /dev/null
+++ b/regression-test/data/pythonudf_p0/test_pythonudf_always_nullable.out
@@ -0,0 +1,42 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !select_nullable_true_normal --
+20
+
+-- !select_nullable_true_null --
+\N
+
+-- !select_nullable_true_negative --
+\N
+
+-- !select_nullable_false_normal --
+20
+
+-- !select_nullable_false_null --
+0
+
+-- !select_nullable_false_returns_none_normal --
+20
+
+-- !select_table_nullable_true --
+1 10 20
+2 \N \N
+3 -5 \N
+4 0 0
+5 100 200
+
+-- !select_table_nullable_false --
+1 10 20
+2 \N 0
+3 -5 -10
+4 0 0
+5 100 200
+
+-- !select_string_nullable --
+HELLO
+
+-- !select_string_nullable_null --
+\N
+
+-- !select_string_nullable_empty --
+\N
+
diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_array.out b/regression-test/data/pythonudf_p0/test_pythonudf_array.out
new file mode 100644
index 0000000..eda9275
--- /dev/null
+++ b/regression-test/data/pythonudf_p0/test_pythonudf_array.out
@@ -0,0 +1,109 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !select_default --
+1 2 a1b
+2 4 a2b
+3 6 a3b
+4 8 a4b
+5 10 a5b
+6 12 a6b
+7 14 a7b
+8 16 a8b
+9 18 a9b
+10 20 a10b
+
+-- !select_1 --
+1
+2
+3
+4
+5
+6
+7
+8
+9
+10
+
+-- !select_2 --
+\N
+
+-- !select_3 --
+[1] 2
+[2] 4
+[3] 6
+[4] 8
+[5] 10
+[6] 12
+[7] 14
+[8] 16
+[9] 18
+[10] 20
+
+-- !select_4 --
+[2] 2
+[4] 4
+[6] 6
+[8] 8
+[10] 10
+[12] 12
+[14] 14
+[16] 16
+[18] 18
+[20] 20
+
+-- !select_5 --
+\N
+
+-- !select_6 --
+["a1b"] 2
+["a2b"] 4
+["a3b"] 6
+["a4b"] 8
+["a5b"] 10
+["a6b"] 12
+["a7b"] 14
+["a8b"] 16
+["a9b"] 18
+["a10b"] 20
+
+-- !select_7 --
+["a1b1"] 2
+["a2b2"] 4
+["a3b3"] 6
+["a4b4"] 8
+["a5b5"] 10
+["a6b6"] 12
+["a7b7"] 14
+["a8b8"] 16
+["a9b9"] 18
+["a10b10"] 20
+
+-- !select_8 --
+\N
+
+-- !select_9 --
+a1b 2
+a2b 4
+a3b 6
+a4b 8
+a5b 10
+a6b 12
+a7b 14
+a8b 16
+a9b 18
+a10b 20
+
+-- !select_10 --
+a1b1 2
+a2b2 4
+a3b3 6
+a4b4 8
+a5b5 10
+a6b6 12
+a7b7 14
+a8b8 16
+a9b9 18
+a10b10 20
+
+-- !select_11 --
+\N
+
diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_base_data_type.out b/regression-test/data/pythonudf_p0/test_pythonudf_base_data_type.out
new file mode 100644
index 0000000..781ea31
--- /dev/null
+++ b/regression-test/data/pythonudf_p0/test_pythonudf_base_data_type.out
@@ -0,0 +1,15 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !select_1 --
+True,127,32767,2147483647,9223372036854775807,170141183460469231731687303715884105727,1.2300000190734863,4.56789,123456.780000000,12345678901.230000000,123456789012345678901.234567890,2023-01-01,2023-01-01 20:34:56+08:00,char_data_1,varchar_data_1,string_data_1
+False,-128,-32768,-2147483648,-9223372036854775808,-170141183460469231731687303715884105728,-2.3399999141693115,-5.6789,-987654.320000000,-98765432.110000000,-987654321098765432.109876540,2024-05-15,2024-05-15 16:22:10+08:00,char_data_2,varchar_data_2,string_data_2
+True,0,0,0,0,0,0.0,0.0,0E-9,0E-9,0E-9,2025-10-15,2025-10-15 08:00:00+08:00,char_zero,varchar_zero,string_zero
+False,100,20000,300000000,4000000000000000000,99999999999999999999999999999999999999,3.140000104904175,2.71828,999999.990000000,99999999999999.990000000,100000000000000000000000.000000000,2022-12-31,2023-01-01 07:59:59+08:00,char_max,varchar_max,string_max
+True,-50,-10000,-100000000,-5000000000000000000,-99999999999999999999999999999999999999,-1.409999966621399,-0.57721,-0.010000000,-0.010000000,0E-9,2021-07-04,2021-07-04 22:30:00+08:00,char_neg,varchar_neg,string_neg
+
+-- !select_2 --
+True,127,32767,2147483647,9223372036854775807,170141183460469231731687303715884105727,1.2300000190734863,4.56789,123456.780000000,12345678901.230000000,123456789012345678901.234567890,2023-01-01,2023-01-01 20:34:56+08:00,char_data_1,varchar_data_1,string_data_1
+False,-128,-32768,-2147483648,-9223372036854775808,-170141183460469231731687303715884105728,-2.3399999141693115,-5.6789,-987654.320000000,-98765432.110000000,-987654321098765432.109876540,2024-05-15,2024-05-15 16:22:10+08:00,char_data_2,varchar_data_2,string_data_2
+True,0,0,0,0,0,0.0,0.0,0E-9,0E-9,0E-9,2025-10-15,2025-10-15 08:00:00+08:00,char_zero,varchar_zero,string_zero
+False,100,20000,300000000,4000000000000000000,99999999999999999999999999999999999999,3.140000104904175,2.71828,999999.990000000,99999999999999.990000000,100000000000000000000000.000000000,2022-12-31,2023-01-01 07:59:59+08:00,char_max,varchar_max,string_max
+True,-50,-10000,-100000000,-5000000000000000000,-99999999999999999999999999999999999999,-1.409999966621399,-0.57721,-0.010000000,-0.010000000,0E-9,2021-07-04,2021-07-04 22:30:00+08:00,char_neg,varchar_neg,string_neg
+
diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_boolean.out b/regression-test/data/pythonudf_p0/test_pythonudf_boolean.out
new file mode 100644
index 0000000..e43b5026
--- /dev/null
+++ b/regression-test/data/pythonudf_p0/test_pythonudf_boolean.out
@@ -0,0 +1,28 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !select_default --
+111 true
+112 false
+113 false
+114 true
+
+-- !select --
+false
+
+-- !select --
+true
+
+-- !select --
+false
+
+-- !select --
+true
+
+-- !select --
+true
+
+-- !select --
+111 false
+112 true
+113 true
+114 false
+
diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_complex_data_type.out b/regression-test/data/pythonudf_p0/test_pythonudf_complex_data_type.out
new file mode 100644
index 0000000..3f3d821
--- /dev/null
+++ b/regression-test/data/pythonudf_p0/test_pythonudf_complex_data_type.out
@@ -0,0 +1,37 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !select_1 --
+[1,2,3]|[a,b,c]|[[1,2],[3,4]]
+[]|[]|[]
+NULL|[x,NULL,z]|NULL
+[0,-1,2147483647]|[hello,world]|[[],[1]]
+
+-- !select_2 --
+[1,2,3]|[a,b,c]|[[1,2],[3,4]]
+[]|[]|[]
+NULL|[x,NULL,z]|NULL
+[0,-1,2147483647]|[hello,world]|[[],[1]]
+
+-- !select_3 --
+{1:one,2:two}|{e:2.718,pi:3.14}
+{}|{}
+NULL|{null_key:NULL}
+{-1:minus_one,0:zero}|{max:1.79769e+308}
+
+-- !select_4 --
+{1:one,2:two}|{e:2.718,pi:3.14}
+{}|{}
+NULL|{null_key:NULL}
+{-1:minus_one,0:zero}|{max:1.79769e+308}
+
+-- !select_5 --
+(Alice,30,75000.50)|(1.5,2.5,[red,blue])
+(NULL,NULL,NULL)|(0.0,0.0,[])
+(Bob,25,60000.00)|(NULL,3.14,[tag1,NULL,tag3])
+(,0,0.00)|(-1.0,-2.0,NULL)
+
+-- !select_6 --
+(Alice,30,75000.50)|(1.5,2.5,[red,blue])
+(NULL,NULL,NULL)|(0.0,0.0,[])
+(Bob,25,60000.00)|(NULL,3.14,[tag1,NULL,tag3])
+(,0,0.00)|(-1.0,-2.0,NULL)
+
diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_data_types.out b/regression-test/data/pythonudf_p0/test_pythonudf_data_types.out
new file mode 100644
index 0000000..a79b499
--- /dev/null
+++ b/regression-test/data/pythonudf_p0/test_pythonudf_data_types.out
@@ -0,0 +1,24 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !select_tinyint --
+11
+
+-- !select_smallint --
+2000
+
+-- !select_bigint --
+1000001000000
+
+-- !select_decimal --
+\N
+
+-- !select_date --
+2024-01-15
+
+-- !select_datetime --
+2024-01-15 18:30:45+08:00
+
+-- !select_table_types --
+1 11 200 1010000
+2 21 400 1020000
+3 \N \N \N
+
diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_error_handling.out b/regression-test/data/pythonudf_p0/test_pythonudf_error_handling.out
new file mode 100644
index 0000000..2dc0c07
--- /dev/null
+++ b/regression-test/data/pythonudf_p0/test_pythonudf_error_handling.out
@@ -0,0 +1,50 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !select_divide_normal --
+5
+
+-- !select_divide_zero --
+\N
+
+-- !select_divide_null --
+\N
+
+-- !select_substring_valid --
+e
+
+-- !select_substring_invalid --
+\N
+
+-- !select_substring_negative --
+\N
+
+-- !select_parse_valid --
+123
+
+-- !select_parse_invalid --
+\N
+
+-- !select_parse_empty --
+\N
+
+-- !select_array_valid --
+20
+
+-- !select_array_invalid --
+\N
+
+-- !select_table_error_handling --
+1 100 10 10 123 123
+2 50 0 \N abc \N
+3 \N 5 \N \N
+4 75 \N \N 456 456
+5 25 5 5 xyz \N
+
+-- !select_length_normal --
+5
+
+-- !select_length_empty --
+0
+
+-- !select_length_null --
+\N
+
diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_file_protocol.out b/regression-test/data/pythonudf_p0/test_pythonudf_file_protocol.out
new file mode 100644
index 0000000..8d1d3a5
--- /dev/null
+++ b/regression-test/data/pythonudf_p0/test_pythonudf_file_protocol.out
@@ -0,0 +1,22 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !select_file_int --
+100
+
+-- !select_file_string --
+123****890
+
+-- !select_file_float --
+\N
+
+-- !select_file_bool_true --
+false
+
+-- !select_file_bool_false --
+true
+
+-- !select_table_file --
+1 10 11 hello h***o
+2 20 21 world w***d
+3 30 31 python p****n
+4 40 41 doris d***s
+
diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_float.out b/regression-test/data/pythonudf_p0/test_pythonudf_float.out
new file mode 100644
index 0000000..6baa525
--- /dev/null
+++ b/regression-test/data/pythonudf_p0/test_pythonudf_float.out
@@ -0,0 +1,45 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !select_default --
+111 11111.11 222222.3 12345678.34455677 1111112
+112 1234556.0 222222.3 222222222.3333333 4444444444444.556
+113 8.765432E7 \N 6666666666.666667 \N
+
+-- !select --
+-108.2747
+
+-- !select --
+-108.2747
+
+-- !select --
+\N
+
+-- !select --
+\N
+
+-- !select --
+111 -211111.2
+112 1012334.0
+113 \N
+
+-- !select --
+111 -211111.2
+112 1012334.0
+113 \N
+
+-- !select --
+113.9475611
+
+-- !select --
+113.9475611
+
+-- !select --
+\N
+
+-- !select --
+\N
+
+-- !select --
+111 24691356.68911354
+112 444444444.6666667
+113 13333333333.33333
+
diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_global_function.out b/regression-test/data/pythonudf_p0/test_pythonudf_global_function.out
new file mode 100644
index 0000000..ad3d84a
--- /dev/null
+++ b/regression-test/data/pythonudf_p0/test_pythonudf_global_function.out
@@ -0,0 +1,23 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !select_global_multiply --
+56
+
+-- !select_global_lower --
+hello world
+
+-- !select_local_add --
+40
+
+-- !select_table_global --
+1 5 6 30 APPLE apple
+2 10 20 200 BANANA banana
+3 3 7 21 CHERRY cherry
+4 \N 5 \N DATE date
+5 8 9 72 \N \N
+
+-- !select_global_power --
+8
+
+-- !select_global_power_decimal --
+2.23606797749979
+
diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_inline_complex.out b/regression-test/data/pythonudf_p0/test_pythonudf_inline_complex.out
new file mode 100644
index 0000000..22015af
--- /dev/null
+++ b/regression-test/data/pythonudf_p0/test_pythonudf_inline_complex.out
@@ -0,0 +1,19 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !select_array_sum --
+15
+
+-- !select_reverse --
+olleH
+
+-- !select_weighted_avg --
+84
+
+-- !select_format_name --
+DOE, John
+
+-- !select_in_range_true --
+true
+
+-- !select_in_range_false --
+false
+
diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_inline_scalar.out b/regression-test/data/pythonudf_p0/test_pythonudf_inline_scalar.out
new file mode 100644
index 0000000..9632ff9
--- /dev/null
+++ b/regression-test/data/pythonudf_p0/test_pythonudf_inline_scalar.out
@@ -0,0 +1,28 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !select_add --
+30
+
+-- !select_add_null --
+\N
+
+-- !select_concat --
+Hello World
+
+-- !select_concat_null --
+\N
+
+-- !select_square --
+25
+
+-- !select_square_negative --
+9
+
+-- !select_positive --
+true
+
+-- !select_negative --
+false
+
+-- !select_zero --
+false
+
diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_inline_vector.out b/regression-test/data/pythonudf_p0/test_pythonudf_inline_vector.out
new file mode 100644
index 0000000..de95543
--- /dev/null
+++ b/regression-test/data/pythonudf_p0/test_pythonudf_inline_vector.out
@@ -0,0 +1,85 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !vec_add_int --
+1 10 20 31
+2 30 40 71
+3 \N 50 \N
+4 60 \N \N
+5 70 80 151
+
+-- !vec_multiply_double --
+1 1.5 2.5 3.75
+2 3.5 4.5 15.75
+3 5.5 \N \N
+4 \N 6.5 \N
+5 7.5 8.5 63.75
+
+-- !vec_concat_string --
+1 hello world hello_world
+2 foo bar foo_bar
+3 \N test \N
+4 data \N \N
+5 python udf python_udf
+
+-- !vec_max_int --
+1 10 20 20
+2 30 40 40
+3 \N 50 \N
+4 60 \N \N
+5 70 80 80
+
+-- !vec_sqrt_double --
+1 1.5 1.224744871391589
+2 3.5 1.870828693386971
+3 5.5 2.345207879911715
+4 \N \N
+5 7.5 2.738612787525831
+
+-- !vec_upper_string --
+1 hello HELLO
+2 foo FOO
+3 \N \N
+4 data DATA
+5 python PYTHON
+
+-- !vec_weighted_sum --
+1 10 20 17
+2 30 40 37
+3 \N 50 \N
+4 60 \N \N
+5 70 80 77
+
+-- !vec_not_bool --
+1 true false
+2 false true
+3 true false
+4 false true
+5 true false
+
+-- !vec_greater_than --
+1 10 20 false
+2 30 40 false
+3 \N 50 false
+4 60 \N false
+5 70 80 false
+
+-- !vec_string_length --
+1 hello 5
+2 foo 3
+3 \N \N
+4 data 4
+5 python 6
+
+-- !vec_fill_null_int --
+1 10 10
+2 30 30
+3 \N 0
+4 60 60
+5 70 70
+
+-- !vec_cumsum_int --
+1 10 10
+2 30 40
+3 \N \N
+4 60 100
+5 70 170
+
diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_int.out b/regression-test/data/pythonudf_p0/test_pythonudf_int.out
new file mode 100644
index 0000000..cd3a1de
--- /dev/null
+++ b/regression-test/data/pythonudf_p0/test_pythonudf_int.out
@@ -0,0 +1,112 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !select_default --
+1 2 3 4
+2 4 6 8
+3 6 9 12
+4 8 12 16
+5 10 15 20
+6 12 18 24
+7 14 21 28
+8 16 24 32
+9 18 27 36
+10 20 30 40
+
+-- !select --
+2
+3
+4
+5
+6
+7
+8
+9
+10
+11
+
+-- !select --
+\N
+
+-- !select --
+3
+5
+7
+9
+11
+13
+15
+17
+19
+21
+
+-- !select --
+\N
+
+-- !select --
+4
+7
+10
+13
+16
+19
+22
+25
+28
+31
+
+-- !select --
+\N
+
+-- !select --
+5
+9
+13
+17
+21
+25
+29
+33
+37
+41
+
+-- !select --
+\N
+
+-- !select_global_1 --
+2
+3
+4
+5
+6
+7
+8
+9
+10
+11
+
+-- !select_global_2 --
+\N
+
+-- !select_global_3 --
+4
+4
+4
+4
+4
+4
+4
+4
+4
+4
+
+-- !select_global_4 --
+4
+4
+4
+4
+4
+4
+4
+4
+4
+4
+
diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_map.out b/regression-test/data/pythonudf_p0/test_pythonudf_map.out
new file mode 100644
index 0000000..7c7cf58
--- /dev/null
+++ b/regression-test/data/pythonudf_p0/test_pythonudf_map.out
@@ -0,0 +1,10 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !select_1 --
+{1:1, 10:1, 100:1} 111
+{2:1, 20:1, 200:1, 2000:1} 2222
+{3:1} 3
+
+-- !select_2 --
+{"114":"514", "1919":"810"} 1145141919810
+{"a":"bc", "def":"g", "hij":"k"} abcdefghijk
+
diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_mixed_params.out b/regression-test/data/pythonudf_p0/test_pythonudf_mixed_params.out
new file mode 100644
index 0000000..45d661e
--- /dev/null
+++ b/regression-test/data/pythonudf_p0/test_pythonudf_mixed_params.out
@@ -0,0 +1,77 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !select_1 --
+1 100 150
+2 200 300
+3 150 225
+4 300 450
+5 250 375
+6 180 270
+7 220 330
+8 120 180
+9 280 420
+10 350 525
+
+-- !select_2 --
+1 100 5 550
+2 200 3 660
+3 150 8 1320
+4 300 2 660
+5 250 6 1650
+
+-- !select_3 --
+1 100 0.1 90
+2 200 0.15 170
+3 150 0.2 120
+4 300 0.05 285
+5 250 0.12 220
+
+-- !select_4 --
+1 100 5 0.1 460
+2 200 3 0.15 520
+3 150 8 0.2 970
+4 300 2 0.05 580
+5 250 6 0.12 1330
+
+-- !select_5 --
+1 A CAT_A
+2 B CAT_B
+3 A CAT_A
+4 C CAT_C
+5 B CAT_B
+
+-- !select_6 --
+1 5 15
+2 3 13
+3 8 18
+4 2 12
+5 6 16
+
+-- !select_7 --
+1 100 0.1 100
+2 200 0.15 170
+3 150 0.2 150
+4 300 0.05 285
+5 250 0.12 220
+6 180 0.18 180
+7 220 0.08 202.4
+8 120 0.25 120
+9 280 0.1 252
+10 350 0.15 297.5
+
+-- !select_8 --
+1 100 5 600
+2 200 3 720
+3 150 8 1440
+
+-- !select_9 --
+1 100 5 120
+2 200 3 200
+3 150 8 180
+4 300 2 300
+5 250 6 300
+
+-- !select_10 --
+1 100 109.5
+2 200 214
+3 150 161.75
+
diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_module.out b/regression-test/data/pythonudf_p0/test_pythonudf_module.out
new file mode 100644
index 0000000..a1a8c21
--- /dev/null
+++ b/regression-test/data/pythonudf_p0/test_pythonudf_module.out
@@ -0,0 +1,13 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !select --
+1001 5 10 500 62.19368581839511
+1002 40 1 20 2.679441541679836
+1003 15 5 300 40.4622349294233
+1004 -1 3 100 \N
+1005 \N 2 200 \N
+1006 7 \N 150 \N
+1007 30 0 \N \N
+1008 0 100 5000 100
+1009 100 2 10 3.595836866004329
+1010 8 8 800 68.85254329722605
+
diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_module_advanced.out b/regression-test/data/pythonudf_p0/test_pythonudf_module_advanced.out
new file mode 100644
index 0000000..67bbf40
--- /dev/null
+++ b/regression-test/data/pythonudf_p0/test_pythonudf_module_advanced.out
@@ -0,0 +1,57 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !select_module_ltv_normal --
+100
+
+-- !select_module_ltv_null --
+\N
+
+-- !select_module_ltv_zero --
+100
+
+-- !select_customer_analytics --
+1001 Premium 5 50 10000 100
+1002 Regular 30 10 2000 67.19368581839511
+1003 Inactive 60 5 500 20.37527840768416
+1004 VIP 2 100 25000 100
+1005 Regular 15 25 5000 100
+1006 Regular \N 30 6000 \N
+1007 Regular 10 \N 3000 \N
+1008 Inactive 45 8 \N \N
+1009 VIP 0 200 50000 100
+1010 Churned 90 2 100 6.295836866004329
+
+-- !select_segment_analysis --
+Churned 1 100 6.295836866004329
+Inactive 2 500 20.37527840768416
+Premium 1 10000 100
+Regular 4 4000 83.59684290919756
+VIP 2 37500 100
+
+-- !select_high_value_customers --
+
+-- !select_sorted_by_ltv --
+1009 VIP 100
+1005 Regular 100
+1004 VIP 100
+1001 Premium 100
+1002 Regular 67.19368581839511
+
+-- !select_complex_query --
+1001 Premium 5 50 10000 100 Low Value
+1004 VIP 2 100 25000 100 Low Value
+1005 Regular 15 25 5000 100 Low Value
+1009 VIP 0 200 50000 100 Low Value
+1002 Regular 30 10 2000 67.19368581839511 Low Value
+1003 Inactive 60 5 500 20.37527840768416 Low Value
+1010 Churned 90 2 100 6.295836866004329 Low Value
+1007 Regular 10 \N 3000 \N Unknown
+1008 Inactive 45 8 \N \N Unknown
+1006 Regular \N 30 6000 \N Unknown
+
+-- !select_join_with_module_udf --
+1001 Alice Johnson Premium 10000 100
+1004 Diana Prince VIP 25000 100
+1005 Eve Wilson Regular 5000 100
+1002 Bob Smith Regular 2000 67.19368581839511
+1003 Charlie Brown Inactive 500 20.37527840768416
+
diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_module_scalar.out b/regression-test/data/pythonudf_p0/test_pythonudf_module_scalar.out
new file mode 100644
index 0000000..534da01
--- /dev/null
+++ b/regression-test/data/pythonudf_p0/test_pythonudf_module_scalar.out
@@ -0,0 +1,210 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !add_three --
+1 10 20 30 60
+2 5 15 25 45
+3 100 50 25 175
+4 7 3 11 21
+5 17 19 23 59
+
+-- !safe_div --
+1 100 10 10
+2 200 20 10
+3 150 0 \N
+4 80 5 16
+5 300 15 20
+
+-- !discount --
+1 100 90 75
+2 200 180 150
+3 150 135 112.5
+4 80 72 60
+5 300 270 225
+
+-- !compound_interest --
+1 100 162.8894626777442
+2 200 325.7789253554884
+3 150 244.3341940166163
+4 80 130.3115701421954
+5 300 488.6683880332326
+
+-- !bmi --
+1 22.86 29.39
+2 22.86 29.39
+3 22.86 29.39
+4 22.86 29.39
+5 22.86 29.39
+
+-- !fibonacci --
+1 10 55
+2 5 5
+4 7 13
+5 17 1597
+
+-- !is_prime --
+1 10 20 30 false false false
+2 5 15 25 true false false
+3 100 50 25 false false false
+4 7 3 11 true true true
+5 17 19 23 true true true
+
+-- !gcd --
+1 10 20 10
+2 5 15 5
+3 100 50 50
+4 7 3 1
+5 17 19 1
+
+-- !lcm --
+1 10 20 20
+2 5 15 15
+3 100 50 100
+4 7 3 21
+5 17 19 323
+
+-- !reverse --
+1 hello world dlrow olleh
+2 foo bar baz zab rab oof
+3 racecar racecar
+4 a man a plan a canal panama amanap lanac a nalp a nam a
+5 python udf test tset fdu nohtyp
+
+-- !count_vowels --
+1 hello world 3
+2 foo bar baz 4
+3 racecar 3
+4 a man a plan a canal panama 10
+5 python udf test 3
+
+-- !count_words --
+1 hello world 2
+2 foo bar baz 3
+3 racecar 1
+4 a man a plan a canal panama 7
+5 python udf test 3
+
+-- !capitalize --
+1 hello world Hello World
+2 foo bar baz Foo Bar Baz
+3 racecar Racecar
+4 a man a plan a canal panama A Man A Plan A Canal Panama
+5 python udf test Python Udf Test
+
+-- !is_palindrome --
+1 hello world false
+2 foo bar baz false
+3 racecar true
+4 a man a plan a canal panama true
+5 python udf test false
+
+-- !similarity --
+1 hello world 50
+2 foo bar baz 10
+3 racecar 14.29
+4 a man a plan a canal panama 10
+5 python udf test 23.08
+
+-- !mask_email --
+1 test@example.com t***@example.com
+2 user@domain.com u***@domain.com
+3 admin@test.org a***@test.org
+4 info@company.net i***@company.net
+5 contact@site.io c***@site.io
+
+-- !extract_domain --
+1 test@example.com example.com
+2 user@domain.com domain.com
+3 admin@test.org test.org
+4 info@company.net company.net
+5 contact@site.io site.io
+
+-- !levenshtein --
+1 hello world 0
+2 foo bar baz 10
+3 racecar 10
+4 a man a plan a canal panama 24
+5 python udf test 13
+
+-- !days_between --
+1 2024-01-15 2024-01-20 5
+2 2024-02-10 2024-03-15 34
+3 2023-12-01 2024-01-01 31
+4 2024-06-15 2024-06-15 0
+5 2024-03-01 2024-12-31 305
+
+-- !is_weekend --
+1 2024-01-15 false
+2 2024-02-10 true
+3 2023-12-01 false
+4 2024-06-15 true
+5 2024-03-01 false
+
+-- !get_quarter --
+1 2024-01-15 1
+2 2024-02-10 1
+3 2023-12-01 4
+4 2024-06-15 2
+5 2024-03-01 1
+
+-- !age --
+1 34
+2 34
+3 33
+4 34
+5 34
+
+-- !in_range --
+1 10 true
+2 5 false
+3 100 false
+4 7 false
+5 17 true
+
+-- !xor --
+1 true true false
+2 false true true
+3 true false true
+4 false false false
+5 true true false
+
+-- !grade --
+1 100 A
+2 200 A
+3 150 A
+4 80 B
+5 300 A
+
+-- !categorize_age --
+1 10 Child
+2 5 Child
+3 100 Senior
+4 7 Child
+5 17 Teenager
+
+-- !tax --
+1 100 15
+2 200 30
+3 150 22.5
+4 80 12
+5 300 45
+
+-- !truncate --
+1 hello world hello w...
+2 foo bar baz foo bar...
+3 racecar racecar
+4 a man a plan a canal panama a man a...
+5 python udf test python ...
+
+-- !null_handling --
+1 10 20 30 60
+2 \N 20 30 \N
+3 10 \N 30 \N
+4 10 20 \N \N
+5 \N \N \N \N
+
+-- !string_edge --
+1 normal string gnirts lamron 3 2
+2 0 0
+3 0 0
+4 a a 1 1
+5 \N \N \N \N
+
diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_module_vector.out b/regression-test/data/pythonudf_p0/test_pythonudf_module_vector.out
new file mode 100644
index 0000000..bfcab62
--- /dev/null
+++ b/regression-test/data/pythonudf_p0/test_pythonudf_module_vector.out
@@ -0,0 +1,106 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !vec_add_const --
+1 10 20 130
+2 30 15 145
+3 50 50 200
+4 5 25 130
+5 100 10 210
+
+-- !vec_multiply_round --
+1 1.5 2.5 3.75
+2 3.5 4.5 15.75
+3 5.5 2 11
+4 7.5 1.5 11.25
+5 9.5 3.5 33.25
+
+-- !vec_concat_sep --
+1 hello world python udf hello world | python udf
+2 foo bar test case foo bar | test case
+3 data science machine learning data science | machine learning
+4 apache doris database system apache doris | database system
+5 vector operations pandas series vector operations | pandas series
+
+-- !vec_title_case --
+1 hello world Hello World
+2 foo bar Foo Bar
+3 data science Data Science
+4 apache doris Apache Doris
+5 vector operations Vector Operations
+
+-- !vec_conditional --
+1 10 20 20
+2 30 15 30
+3 50 50 50
+4 5 25 25
+5 100 10 100
+
+-- !vec_percentage --
+1 1.5 2.5 60
+2 3.5 4.5 77.78
+3 5.5 2 275
+4 7.5 1.5 500
+5 9.5 3.5 271.43
+
+-- !vec_in_range --
+1 10 true
+2 30 true
+3 50 true
+4 5 false
+5 100 false
+
+-- !vec_safe_div --
+1 1.5 2.5 0.6
+2 3.5 4.5 0.7777777777777778
+3 5.5 2 2.75
+4 7.5 1.5 5
+5 9.5 3.5 2.714285714285714
+
+-- !vec_exp_decay --
+1 1.5 10 1.074796965860684
+2 3.5 30 1.287578044100048
+3 5.5 50 1.03881581560659
+4 7.5 5 6.348612936679606
+5 9.5 100 0.3389029367988978
+
+-- !vec_first_word --
+1 hello world hello
+2 foo bar foo
+3 data science data
+4 apache doris apache
+5 vector operations vector
+
+-- !vec_abs_diff --
+1 10 20 10
+2 30 15 15
+3 50 50 0
+4 5 25 20
+5 100 10 90
+
+-- !vec_power --
+1 1.5 2.25
+2 3.5 12.25
+3 5.5 30.25
+4 7.5 56.25
+5 9.5 90.25
+
+-- !vec_bool_and --
+1 true true true
+2 false true false
+3 true false false
+4 false false false
+5 true true true
+
+-- !vec_bool_or --
+1 true true true
+2 false true true
+3 true false true
+4 false false false
+5 true true true
+
+-- !vec_clip --
+1 10 20
+2 30 30
+3 50 50
+4 5 20
+5 100 60
+
diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_multiline_inline.out b/regression-test/data/pythonudf_p0/test_pythonudf_multiline_inline.out
new file mode 100644
index 0000000..a6e0c9c
--- /dev/null
+++ b/regression-test/data/pythonudf_p0/test_pythonudf_multiline_inline.out
@@ -0,0 +1,23 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !select_complex_calc --
+25
+
+-- !select_business_logic_vip --
+MEDIUM:3750.00
+
+-- !select_business_logic_regular --
+MEDIUM:1800.00
+
+-- !select_text_analyzer --
+len:15,words:3,upper:2,lower:8,digits:3
+
+-- !select_statistics --
+mean:25.00,std:11.18,max:40.00,min:10.00
+
+-- !select_table_multiline --
+1 VIP 15000 150 HIGH:11250.00 len:22,words:3,upper:1,lower:19,digits:0
+2 PREMIUM 8000 80 MEDIUM:6560.00 len:13,words:2,upper:1,lower:11,digits:0
+3 REGULAR 3000 40 MEDIUM:2700.00 len:13,words:2,upper:1,lower:11,digits:0
+4 VIP 500 10 LOW:400.00 len:15,words:3,upper:4,lower:9,digits:0
+5 REGULAR 12000 200 HIGH:10200.00 len:19,words:3,upper:1,lower:16,digits:0
+
diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_performance.out b/regression-test/data/pythonudf_p0/test_pythonudf_performance.out
new file mode 100644
index 0000000..d82f52f
--- /dev/null
+++ b/regression-test/data/pythonudf_p0/test_pythonudf_performance.out
@@ -0,0 +1,59 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !select_perf_simple --
+4990
+
+-- !select_perf_aggregate --
+A 2500 996
+B 2500 998
+C 2500 1000
+D 2500 1002
+
+-- !select_perf_multiple_udf --
+D 1880
+C 1870
+B 1870
+A 1870
+
+-- !select_perf_string --
+A 250
+B 250
+C 250
+D 250
+
+-- !select_perf_complex --
+A 3372.3 6744.6 0
+B 3373.65 6745.950000000001 1.35
+C 3375 6747.3 2.7
+D 3376.35 6748.650000000001 4.05
+
+-- !select_perf_nested --
+D 1002
+C 1000
+B 998
+A 996
+
+-- !select_perf_null --
+5000 4000 50
+
+-- !select_perf_order --
+9999 999 1998
+8999 999 1998
+7999 999 1998
+6999 999 1998
+5999 999 1998
+4999 999 1998
+3999 999 1998
+2999 999 1998
+1999 999 1998
+999 999 1998
+9998 998 1996
+8998 998 1996
+7998 998 1996
+6998 998 1996
+5998 998 1996
+4998 998 1996
+3998 998 1996
+2998 998 1996
+1998 998 1996
+998 998 1996
+
diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_ret_map.out b/regression-test/data/pythonudf_p0/test_pythonudf_ret_map.out
new file mode 100644
index 0000000..b1160eb
--- /dev/null
+++ b/regression-test/data/pythonudf_p0/test_pythonudf_ret_map.out
@@ -0,0 +1,17 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !select_1 --
+{1:1.1, 11:11.1} {10:11, 110:111}
+{2:2.2, 22:22.2} {20:22, 220:222}
+
+-- !select_2 --
+{1:1, 10:1, 100:1} {10:10, 100:10, 1000:10}
+{2:2, 20:2, 200:2} {20:20, 200:20, 2000:20}
+
+-- !select_3 --
+10 1.1 {"11410":"5141.1"}
+20 2.2 {"11420":"5142.2"}
+
+-- !select_4 --
+{"abc":"efg", "h":"i"} {"abc114":"efg514", "h114":"i514"}
+{"j":"k"} {"j114":"k514"}
+
diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_runtime_version.out b/regression-test/data/pythonudf_p0/test_pythonudf_runtime_version.out
new file mode 100644
index 0000000..2658cd5
--- /dev/null
+++ b/regression-test/data/pythonudf_p0/test_pythonudf_runtime_version.out
@@ -0,0 +1,4 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !select_version_short --
+42
+
diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_schema_check.out b/regression-test/data/pythonudf_p0/test_pythonudf_schema_check.out
new file mode 100644
index 0000000..a17c75e
--- /dev/null
+++ b/regression-test/data/pythonudf_p0/test_pythonudf_schema_check.out
@@ -0,0 +1,112 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !select_1 --
+1 10 1000 1010
+2 20 2000 2020
+3 30 3000 3030
+4 40 4000 4040
+5 50 5000 5050
+
+-- !select_2 --
+1 100 1000 1100
+2 200 2000 2200
+3 300 3000 3300
+4 400 4000 4400
+5 500 5000 5500
+
+-- !select_3 --
+1 1000 10000 11000
+2 2000 20000 22000
+3 3000 30000 33000
+4 4000 40000 44000
+5 5000 50000 55000
+
+-- !select_4 --
+1 1.5 10.5 12
+2 2.5 20.5 23
+3 3.5 30.5 34
+4 4.5 40.5 45
+5 5.5 50.5 56
+
+-- !select_5 --
+1 10 100 1000 1110
+2 20 200 2000 2220
+3 30 300 3000 3330
+4 40 400 4000 4440
+5 50 500 5000 5550
+
+-- !select_6 --
+1 10 100 1000
+2 20 200 4000
+3 30 300 9000
+4 40 400 16000
+5 50 500 25000
+
+-- !select_7 --
+1 1.5 10.5 7
+2 2.5 20.5 8.199999999999999
+3 3.5 30.5 8.714285714285714
+4 4.5 40.5 9
+5 5.5 50.5 9.181818181818182
+
+-- !select_8 --
+1 1000 1.5 2001.5
+2 2000 2.5 4002.5
+3 3000 3.5 6003.5
+4 4000 4.5 8004.5
+5 5000 5.5 10005.5
+
+-- !select_9 --
+1 test1 TEST1
+2 test2 TEST2
+3 test3 TEST3
+4 test4 TEST4
+5 test5 TEST5
+
+-- !select_10 --
+1 true false
+2 false true
+3 true false
+4 false true
+5 true false
+
+-- !select_11 --
+1 10 10000 10010
+2 20 20000 20020
+3 30 30000 30030
+4 40 40000 40040
+5 50 50000 50050
+
+-- !select_12 --
+1 1000 1500
+2 2000 3000
+3 3000 4500
+4 4000 6000
+5 5000 7500
+
+-- !select_13 --
+1 test1 \N
+
+-- !select_14 --
+1 10000 11000
+
+-- !select_15 --
+1 10.5 12.0
+
+-- !select_16 --
+1 true 1001
+
+-- !select_17 --
+1 2024-01-01 2024-01-01
+
+-- !select_18 --
+1 1000 false
+
+-- !select_19 --
+1 test1 \N
+
+-- !select_20 --
+1 test1 true \N
+
+-- !select_22 --
+1 1.5 1001
+
diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_string.out b/regression-test/data/pythonudf_p0/test_pythonudf_string.out
new file mode 100644
index 0000000..59f2f7c
--- /dev/null
+++ b/regression-test/data/pythonudf_p0/test_pythonudf_string.out
@@ -0,0 +1,67 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !select_default --
+1 1 abcdefg1 poiuytre1abcdefg
+2 2 abcdefg2 poiuytre2abcdefg
+3 3 abcdefg3 poiuytre3abcdefg
+4 4 abcdefg4 poiuytre4abcdefg
+5 5 abcdefg5 poiuytre5abcdefg
+6 6 abcdefg6 poiuytre6abcdefg
+7 7 abcdefg7 poiuytre7abcdefg
+8 8 abcdefg8 poiuytre8abcdefg
+9 9 abcdefg9 poiuytre9abcdefg
+
+-- !select_default_2 --
+1 1 abcdefg1 poiuytre1abcdefg
+2 2 abcdefg2 poiuytre2abcdefg
+3 3 abcdefg3 poiuytre3abcdefg
+4 4 abcdefg4 poiuytre4abcdefg
+5 5 abcdefg5 poiuytre5abcdefg
+6 6 abcdefg6 poiuytre6abcdefg
+7 7 abcdefg7 poiuytre7abcdefg
+8 8 abcdefg8 poiuytre8abcdefg
+9 9 abcdefg9 poiuytre9abcdefg
+
+-- !select --
+ab***fg1
+ab***fg2
+ab***fg3
+ab***fg4
+ab***fg5
+ab***fg6
+ab***fg7
+ab***fg8
+ab***fg9
+
+-- !select --
+po***********efg
+po***********efg
+po***********efg
+po***********efg
+po***********efg
+po***********efg
+po***********efg
+po***********efg
+po***********efg
+
+-- !select --
+ab*def ab**efg
+ab*def ab**efg
+ab*def ab**efg
+ab*def ab**efg
+ab*def ab**efg
+ab*def ab**efg
+ab*def ab**efg
+ab*def ab**efg
+ab*def ab**efg
+
+-- !select_4 --
+ab***fg1 ab***fg1
+ab***fg2 ab***fg2
+ab***fg3 ab***fg3
+ab***fg4 ab***fg4
+ab***fg5 ab***fg5
+ab***fg6 ab***fg6
+ab***fg7 ab***fg7
+ab***fg8 ab***fg8
+ab***fg9 ab***fg9
+
diff --git a/regression-test/data/pythonudf_p0/test_pythonudtf_array.out b/regression-test/data/pythonudf_p0/test_pythonudtf_array.out
new file mode 100644
index 0000000..866fd08
--- /dev/null
+++ b/regression-test/data/pythonudf_p0/test_pythonudtf_array.out
@@ -0,0 +1,28 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !select_default --
+1 2 a1b
+2 4 a2b
+3 6 a3b
+
+-- !select_1 --
+1 [1, 2, 3]
+1 [1, 2, 3]
+1 [1, 2, 3]
+2 [1, 2, 3]
+2 [1, 2, 3]
+2 [1, 2, 3]
+3 [1, 2, 3]
+3 [1, 2, 3]
+3 [1, 2, 3]
+
+-- !select_2 --
+1 ["Hi", "DataMind", "Good"]
+1 ["Hi", "DataMind", "Good"]
+1 ["Hi", "DataMind", "Good"]
+2 ["Hi", "DataMind", "Good"]
+2 ["Hi", "DataMind", "Good"]
+2 ["Hi", "DataMind", "Good"]
+3 ["Hi", "DataMind", "Good"]
+3 ["Hi", "DataMind", "Good"]
+3 ["Hi", "DataMind", "Good"]
+
diff --git a/regression-test/data/pythonudf_p0/test_pythonudtf_float.out b/regression-test/data/pythonudf_p0/test_pythonudtf_float.out
new file mode 100644
index 0000000..907e4e4
--- /dev/null
+++ b/regression-test/data/pythonudf_p0/test_pythonudtf_float.out
@@ -0,0 +1,24 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !select_default --
+111 11111.111 222222.33 1.234567834455677E7 1111112.0
+112 1234556.1 222222.33 2.2222222233333334E8 4.444444444444556E12
+113 8.765432E7 \N 6.666666666666667E9 \N
+
+-- !select1 --
+111 1.234567834455677E7 1.234567834455677E8
+112 2.2222222233333334E8 2.2222222233333335E9
+113 6.666666666666667E9 6.666666666666667E10
+
+-- !select2 --
+111 1111112.0 1.111112E7
+112 4.444444444444556E12 4.4444444444445555E13
+
+-- !select3 --
+111 11111.111 11101.111
+112 1234556.1 1234546.1
+113 8.765432E7 8.7654312E7
+
+-- !select4 --
+111 222222.33 222212.33
+112 222222.33 222212.33
+
diff --git a/regression-test/data/pythonudf_p0/test_pythonudtf_int.out b/regression-test/data/pythonudf_p0/test_pythonudtf_int.out
new file mode 100644
index 0000000..7d35ab6
--- /dev/null
+++ b/regression-test/data/pythonudf_p0/test_pythonudtf_int.out
@@ -0,0 +1,128 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !select_default --
+0 3 300 3000
+0 6 600 6000
+0 9 900 9000
+1 1 100 1000
+1 4 400 4000
+1 7 700 7000
+2 2 200 2000
+2 5 500 5000
+2 8 800 8000
+
+-- !select1 --
+0 0
+0 0
+0 0
+0 0
+0 0
+0 0
+0 0
+0 0
+0 0
+1 1
+1 1
+1 1
+1 1
+1 1
+1 1
+1 1
+1 1
+1 1
+2 2
+2 2
+2 2
+2 2
+2 2
+2 2
+2 2
+2 2
+2 2
+
+-- !select2 --
+1 1
+1 1
+1 1
+2 2
+2 2
+2 2
+3 3
+3 3
+3 3
+4 4
+4 4
+4 4
+5 5
+5 5
+5 5
+6 6
+6 6
+6 6
+7 7
+7 7
+7 7
+8 8
+8 8
+8 8
+9 9
+9 9
+9 9
+
+-- !select3 --
+100 100
+100 100
+100 100
+200 200
+200 200
+200 200
+300 300
+300 300
+300 300
+400 400
+400 400
+400 400
+500 500
+500 500
+500 500
+600 600
+600 600
+600 600
+700 700
+700 700
+700 700
+800 800
+800 800
+800 800
+900 900
+900 900
+900 900
+
+-- !select4 --
+1000 1000
+1000 1000
+1000 1000
+2000 2000
+2000 2000
+2000 2000
+3000 3000
+3000 3000
+3000 3000
+4000 4000
+4000 4000
+4000 4000
+5000 5000
+5000 5000
+5000 5000
+6000 6000
+6000 6000
+6000 6000
+7000 7000
+7000 7000
+7000 7000
+8000 8000
+8000 8000
+8000 8000
+9000 9000
+9000 9000
+9000 9000
+
diff --git a/regression-test/data/pythonudf_p0/test_pythonudtf_map.out b/regression-test/data/pythonudf_p0/test_pythonudtf_map.out
new file mode 100644
index 0000000..6255675
--- /dev/null
+++ b/regression-test/data/pythonudf_p0/test_pythonudtf_map.out
@@ -0,0 +1,9 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !select_1 --
+1 {"114":514, "1919":810}
+1 {"114":514, "1919":810}
+1 {"114":514, "1919":810}
+2 {"a":11, "def":22, "hij":33}
+2 {"a":11, "def":22, "hij":33}
+2 {"a":11, "def":22, "hij":33}
+
diff --git a/regression-test/data/pythonudf_p0/test_pythonudtf_string.out b/regression-test/data/pythonudf_p0/test_pythonudtf_string.out
new file mode 100644
index 0000000..da31e54
--- /dev/null
+++ b/regression-test/data/pythonudf_p0/test_pythonudtf_string.out
@@ -0,0 +1,32 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !select_default --
+1 1 abc,defg poiuytre,abcdefg
+2 2 abc,defg poiuytre,abcdefg
+0 3 abc,defg poiuytre,abcdefg
+1 4 abc,defg poiuytre,abcdefg
+2 5 abc,defg poiuytre,abcdefg
+0 6 abc,defg poiuytre,abcdefg
+1 7 abc,defg poiuytre,abcdefg
+2 8 abc,defg poiuytre,abcdefg
+9 9 ab,cdefg poiuytreabcde,fg
+
+-- !select1 --
+0 abc,defg abc
+0 abc,defg defg
+0 abc,defg abc
+0 abc,defg defg
+1 abc,defg abc
+1 abc,defg defg
+1 abc,defg abc
+1 abc,defg defg
+1 abc,defg abc
+1 abc,defg defg
+2 abc,defg abc
+2 abc,defg defg
+2 abc,defg abc
+2 abc,defg defg
+2 abc,defg abc
+2 abc,defg defg
+9 ab,cdefg ab
+9 ab,cdefg cdefg
+
diff --git a/regression-test/data/pythonudf_p0/test_pythonudtf_struct.out b/regression-test/data/pythonudf_p0/test_pythonudtf_struct.out
new file mode 100644
index 0000000..f641040
--- /dev/null
+++ b/regression-test/data/pythonudf_p0/test_pythonudtf_struct.out
@@ -0,0 +1,17 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !select_default --
+0
+1
+2
+
+-- !select1 --
+0 1 0.112 Hello, DataMind
+0 1 0.112 Hello, DataMind
+0 1 0.112 Hello, DataMind
+1 1 0.112 Hello, DataMind
+1 1 0.112 Hello, DataMind
+1 1 0.112 Hello, DataMind
+2 1 0.112 Hello, DataMind
+2 1 0.112 Hello, DataMind
+2 1 0.112 Hello, DataMind
+
diff --git a/regression-test/pipeline/p0/conf/be.conf b/regression-test/pipeline/p0/conf/be.conf
index aa533b0..ed07170 100644
--- a/regression-test/pipeline/p0/conf/be.conf
+++ b/regression-test/pipeline/p0/conf/be.conf
@@ -90,3 +90,8 @@
enable_graceful_exit_check=true
enable_prefill_all_dbm_agg_cache_after_compaction=true
+
+# enable python udf
+enable_python_udf_support = true
+python_env_mode = venv
+python_venv_interpreter_paths = /usr/bin/python3
diff --git a/regression-test/pipeline/p0/conf/fe.conf b/regression-test/pipeline/p0/conf/fe.conf
index d7fae1f..cef89d5 100644
--- a/regression-test/pipeline/p0/conf/fe.conf
+++ b/regression-test/pipeline/p0/conf/fe.conf
@@ -92,3 +92,6 @@
max_spilled_profile_num = 2000
check_table_lock_leaky=true
+
+# enable python udf
+enable_python_udf = true
\ No newline at end of file
diff --git a/regression-test/suites/pythonudf_p0/sanity/test_pythonudf_assertequal.groovy b/regression-test/suites/pythonudf_p0/sanity/test_pythonudf_assertequal.groovy
new file mode 100644
index 0000000..09a15f5
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/sanity/test_pythonudf_assertequal.groovy
@@ -0,0 +1,56 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_pythonudf_assertequal") {
+ def pyPath = """${context.file.parent}/../udf_scripts/pyudf.zip"""
+ scp_udf_file_to_all_be(pyPath)
+ def runtime_version = "3.10.12"
+ log.info("Python Zip path: ${pyPath}".toString())
+ try {
+ sql """ DROP TABLE IF EXISTS test_pythonudf_assertequal """
+ sql """
+ CREATE TABLE IF NOT EXISTS test_pythonudf_assertequal (
+ `col` varchar(10) NOT NULL,
+ `col_1` double NOT NULL,
+ `col_2` double NOT NULL
+ )
+ DISTRIBUTED BY HASH(col) PROPERTIES("replication_num" = "1");
+ """
+
+ sql """ INSERT INTO test_pythonudf_assertequal VALUES ('abc', 23.34, 23.34); """
+
+ File path1 = new File(pyPath)
+ if (!path1.exists()) {
+ throw new IllegalStateException("""${pyPath} doesn't exist! """)
+ }
+
+ sql """ CREATE FUNCTION asser_equal(double, double) RETURNS string PROPERTIES (
+ "file"="file://${pyPath}",
+ "symbol"="assert_equal_test.evaluate",
+ "type"="PYTHON_UDF",
+ "always_nullable" = "true",
+ "runtime_version" = "${runtime_version}"
+ ); """
+
+ qt_select """ SELECT asser_equal(col_1, col_2) as a FROM test_pythonudf_assertequal ORDER BY a; """
+
+
+ } finally {
+ try_sql("DROP FUNCTION IF EXISTS asser_equal(double, double); ")
+ try_sql("DROP TABLE IF EXISTS test_pythonudf_assertequal")
+ }
+}
diff --git a/regression-test/suites/pythonudf_p0/sanity/test_pythonudf_assertlessthan.groovy b/regression-test/suites/pythonudf_p0/sanity/test_pythonudf_assertlessthan.groovy
new file mode 100644
index 0000000..08a6b1f
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/sanity/test_pythonudf_assertlessthan.groovy
@@ -0,0 +1,56 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_pythonudf_assertlessthan") {
+ def tableName = "test_pythonudf_assertlessthan"
+ def pyPath = """${context.file.parent}/../udf_scripts/pyudf.zip"""
+ scp_udf_file_to_all_be(pyPath)
+ def runtime_version = "3.10.12"
+ log.info("Python Zip path: ${pyPath}".toString())
+ try {
+ sql """ DROP TABLE IF EXISTS test_pythonudf_assertlessthan """
+ sql """
+ CREATE TABLE IF NOT EXISTS test_pythonudf_assertlessthan (
+ `col` varchar(10) NOT NULL,
+ `col_1` double NOT NULL,
+ `col_2` double NOT NULL
+ )
+ DISTRIBUTED BY HASH(col) PROPERTIES("replication_num" = "1");
+ """
+
+ sql """ INSERT INTO test_pythonudf_assertlessthan VALUES ('abc', 23.34, 23.35), ('bcd', 0.123, 0.124); """
+
+ File path1 = new File(pyPath)
+ if (!path1.exists()) {
+ throw new IllegalStateException("""${pyPath} doesn't exist! """)
+ }
+
+ sql """ CREATE FUNCTION asser_lessthan(double, double) RETURNS string PROPERTIES (
+ "file"="file://${pyPath}",
+ "symbol"="assert_lessthan_test.evaluate",
+ "type"="PYTHON_UDF",
+ "always_nullable" = "true",
+ "runtime_version" = "${runtime_version}"
+ ); """
+
+ qt_select """ SELECT asser_lessthan(col_1, col_2) as a FROM test_pythonudf_assertlessthan ORDER BY a; """
+
+ } finally {
+ try_sql("DROP FUNCTION IF EXISTS asser_lessthan(double, double); ")
+ try_sql("DROP TABLE IF EXISTS test_pythonudf_assertlessthan")
+ }
+}
diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_aggregate.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_aggregate.groovy
new file mode 100644
index 0000000..ef74f9f
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/test_pythonudf_aggregate.groovy
@@ -0,0 +1,195 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_pythonudf_aggregate") {
+ def runtime_version = "3.10.12"
+
+ try {
+        // Test 1: Python UDFs are mainly scalar functions, so rather than creating a
+        // true aggregate function, verify that a scalar UDF composes with aggregate queries
+ sql """ DROP FUNCTION IF EXISTS py_score_grade(DOUBLE); """
+ sql """
+ CREATE FUNCTION py_score_grade(DOUBLE)
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+def evaluate(score):
+ if score is None:
+ return None
+ if score >= 90:
+ return 'A'
+ elif score >= 80:
+ return 'B'
+ elif score >= 70:
+ return 'C'
+ elif score >= 60:
+ return 'D'
+ else:
+ return 'F'
+\$\$;
+ """
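+        // Note: the function above is created inline -- the Python source is embedded
+        // between the $$ delimiters of the statement itself, so no zip file needs to be
+        // shipped to the BEs (contrast with the file://-based suites above).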
+
+ // Create test table
+ sql """ DROP TABLE IF EXISTS student_scores; """
+ sql """
+ CREATE TABLE student_scores (
+ student_id INT,
+ student_name STRING,
+ subject STRING,
+ score DOUBLE
+ ) ENGINE=OLAP
+ DUPLICATE KEY(student_id)
+ DISTRIBUTED BY HASH(student_id) BUCKETS 1
+ PROPERTIES("replication_num" = "1");
+ """
+
+ sql """
+ INSERT INTO student_scores VALUES
+ (1, 'Alice', 'Math', 95.0),
+ (1, 'Alice', 'English', 88.0),
+ (1, 'Alice', 'Science', 92.0),
+ (2, 'Bob', 'Math', 78.0),
+ (2, 'Bob', 'English', 85.0),
+ (2, 'Bob', 'Science', 80.0),
+ (3, 'Charlie', 'Math', 65.0),
+ (3, 'Charlie', 'English', 70.0),
+ (3, 'Charlie', 'Science', 68.0),
+ (4, 'David', 'Math', 55.0),
+ (4, 'David', 'English', 60.0),
+ (4, 'David', 'Science', 58.0);
+ """
+
+ // Test using UDF in SELECT
+ qt_select_grades """
+ SELECT
+ student_id,
+ student_name,
+ subject,
+ score,
+ py_score_grade(score) AS grade
+ FROM student_scores
+ ORDER BY student_id, subject;
+ """
+
+ // Test using UDF in GROUP BY
+ qt_select_group_by_grade """
+ SELECT
+ py_score_grade(score) AS grade,
+ COUNT(*) AS count,
+ AVG(score) AS avg_score
+ FROM student_scores
+ GROUP BY py_score_grade(score)
+ ORDER BY grade;
+ """
+
+ // Test using UDF in aggregate functions
+ qt_select_aggregate_with_udf """
+ SELECT
+ student_id,
+ student_name,
+ AVG(score) AS avg_score,
+ py_score_grade(AVG(score)) AS avg_grade
+ FROM student_scores
+ GROUP BY student_id, student_name
+ ORDER BY student_id;
+ """
+
+ // Test 2: Create classification function for aggregate analysis
+ sql """ DROP FUNCTION IF EXISTS py_age_group(INT); """
+ sql """
+ CREATE FUNCTION py_age_group(INT)
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+def evaluate(age):
+ if age is None:
+ return None
+ if age < 18:
+ return 'Minor'
+ elif age < 30:
+ return 'Young Adult'
+ elif age < 50:
+ return 'Adult'
+ else:
+ return 'Senior'
+\$\$;
+ """
+
+ sql """ DROP TABLE IF EXISTS users; """
+ sql """
+ CREATE TABLE users (
+ user_id INT,
+ name STRING,
+ age INT,
+ salary DOUBLE
+ ) ENGINE=OLAP
+ DUPLICATE KEY(user_id)
+ DISTRIBUTED BY HASH(user_id) BUCKETS 1
+ PROPERTIES("replication_num" = "1");
+ """
+
+ sql """
+ INSERT INTO users VALUES
+ (1, 'User1', 16, 0),
+ (2, 'User2', 25, 50000),
+ (3, 'User3', 35, 80000),
+ (4, 'User4', 55, 100000),
+ (5, 'User5', 28, 60000),
+ (6, 'User6', 45, 90000),
+ (7, 'User7', 22, 45000),
+ (8, 'User8', 60, 110000);
+ """
+
+ qt_select_age_group_aggregate """
+ SELECT
+ py_age_group(age) AS age_group,
+ COUNT(*) AS user_count,
+ AVG(salary) AS avg_salary,
+ MAX(salary) AS max_salary,
+ MIN(salary) AS min_salary
+ FROM users
+ GROUP BY py_age_group(age)
+ ORDER BY age_group;
+ """
+
+ // Test 3: Use UDF in HAVING clause
+ qt_select_having_with_udf """
+ SELECT
+ student_id,
+ student_name,
+ AVG(score) AS avg_score
+ FROM student_scores
+ GROUP BY student_id, student_name
+ HAVING py_score_grade(AVG(score)) IN ('A', 'B')
+ ORDER BY student_id;
+ """
+
+ } finally {
+ try_sql("DROP FUNCTION IF EXISTS py_score_grade(DOUBLE);")
+ try_sql("DROP FUNCTION IF EXISTS py_age_group(INT);")
+ try_sql("DROP TABLE IF EXISTS student_scores;")
+ try_sql("DROP TABLE IF EXISTS users;")
+ }
+}
diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_always_nullable.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_always_nullable.groovy
new file mode 100644
index 0000000..a66317b
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/test_pythonudf_always_nullable.groovy
@@ -0,0 +1,178 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_pythonudf_always_nullable") {
+ // Test different configurations of always_nullable parameter
+
+ def runtime_version = "3.10.12"
+ try {
+ // Test 1: always_nullable = true (default value)
+ sql """ DROP FUNCTION IF EXISTS py_nullable_true(INT); """
+ sql """
+ CREATE FUNCTION py_nullable_true(INT)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "always_nullable" = "true",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+def evaluate(x):
+ if x is None:
+ return None
+ if x < 0:
+ return None
+ return x * 2
+\$\$;
+ """
+
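+        // With always_nullable = true a Python None maps to SQL NULL, so both the NULL
+        // input and the negative input below are expected to return NULL.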
+ qt_select_nullable_true_normal """ SELECT py_nullable_true(10) AS result; """
+ qt_select_nullable_true_null """ SELECT py_nullable_true(NULL) AS result; """
+ qt_select_nullable_true_negative """ SELECT py_nullable_true(-5) AS result; """
+
+ // Test 2: always_nullable = false
+ sql """ DROP FUNCTION IF EXISTS py_nullable_false(INT); """
+ sql """
+ CREATE FUNCTION py_nullable_false(INT)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "always_nullable" = "false",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+def evaluate(x):
+ if x is None:
+ return 0
+ return x * 2
+\$\$;
+ """
+
+ qt_select_nullable_false_normal """ SELECT py_nullable_false(10) AS result; """
+ qt_select_nullable_false_null """ SELECT py_nullable_false(NULL) AS result; """
+
+ // Test 3: always_nullable = false but function returns None
+ // This tests the edge case where the function violates the always_nullable contract
+ sql """ DROP FUNCTION IF EXISTS py_nullable_false_returns_none(INT); """
+ sql """
+ CREATE FUNCTION py_nullable_false_returns_none(INT)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "always_nullable" = "false",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+def evaluate(x):
+ if x < 0:
+ return None # Returns None even though always_nullable is false
+ return x * 2
+\$\$;
+ """
+
+ qt_select_nullable_false_returns_none_normal """ SELECT py_nullable_false_returns_none(10) AS result; """
+
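+        // With always_nullable = false the result column is declared non-nullable, so a
+        // Python None coming back at runtime is rejected by the BE (error asserted below)
+        // instead of being converted to NULL.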
+ test {
+ sql """ SELECT py_nullable_false_returns_none(-5) AS result; """
+ exception "but the return type is not nullable, please check the always_nullable property in create function statement, it should be true"
+ }
+
+ // Test 4: Test nullable behavior on table data
+ sql """ DROP TABLE IF EXISTS nullable_test_table; """
+ sql """
+ CREATE TABLE nullable_test_table (
+ id INT,
+ value INT
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES("replication_num" = "1");
+ """
+
+ sql """
+ INSERT INTO nullable_test_table VALUES
+ (1, 10),
+ (2, NULL),
+ (3, -5),
+ (4, 0),
+ (5, 100);
+ """
+
+ qt_select_table_nullable_true """
+ SELECT
+ id,
+ value,
+ py_nullable_true(value) AS result
+ FROM nullable_test_table
+ ORDER BY id;
+ """
+
+ qt_select_table_nullable_false """
+ SELECT
+ id,
+ value,
+ py_nullable_false(value) AS result
+ FROM nullable_test_table
+ ORDER BY id;
+ """
+
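+        // py_nullable_false_returns_none has no None guard, so the NULL value in row
+        // id = 2 raises a Python TypeError inside evaluate; the assertion below expects
+        // that interpreter error message.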
+ test {
+ sql """
+ SELECT
+ id,
+ value,
+ py_nullable_false_returns_none(value) AS result
+ FROM nullable_test_table
+ ORDER BY id;
+ """
+ exception "'<' not supported between instances of 'NoneType' and 'int'"
+ }
+
+ // Test 5: Nullable test for string type
+ sql """ DROP FUNCTION IF EXISTS py_string_nullable(STRING); """
+ sql """
+ CREATE FUNCTION py_string_nullable(STRING)
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "always_nullable" = "true",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+def evaluate(s):
+ if s is None or s == "":
+ return None
+ return s.upper()
+\$\$;
+ """
+
+ qt_select_string_nullable """ SELECT py_string_nullable('hello') AS result; """
+ qt_select_string_nullable_null """ SELECT py_string_nullable(NULL) AS result; """
+ qt_select_string_nullable_empty """ SELECT py_string_nullable('') AS result; """
+
+ } finally {
+ try_sql("DROP FUNCTION IF EXISTS py_nullable_true(INT);")
+ try_sql("DROP FUNCTION IF EXISTS py_nullable_false(INT);")
+ try_sql("DROP FUNCTION IF EXISTS py_nullable_false_returns_none(INT);")
+ try_sql("DROP FUNCTION IF EXISTS py_string_nullable(STRING);")
+ try_sql("DROP TABLE IF EXISTS nullable_test_table;")
+ }
+}
diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_array.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_array.groovy
new file mode 100644
index 0000000..c107fa9
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/test_pythonudf_array.groovy
@@ -0,0 +1,109 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_pythonudf_array") {
+ def pyPath = """${context.file.parent}/udf_scripts/pyudf.zip"""
+ scp_udf_file_to_all_be(pyPath)
+ def runtime_version = "3.10.12"
+ log.info("Python Zip path: ${pyPath}".toString())
+ try {
+ sql """ DROP TABLE IF EXISTS test_pythonudf_array """
+ sql """
+ CREATE TABLE IF NOT EXISTS test_pythonudf_array (
+ `user_id` INT NOT NULL COMMENT "",
+ `tinyint_col` TINYINT NOT NULL COMMENT "",
+ `string_col` STRING NOT NULL COMMENT ""
+ )
+ DISTRIBUTED BY HASH(user_id) PROPERTIES("replication_num" = "1");
+ """
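+    // Build a 10-row VALUES list of the form (i, i*2, 'a{i}b'); the final row is
+    // appended without a trailing comma so the INSERT below stays valid SQL.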
+ StringBuilder sb = new StringBuilder()
+ int i = 1
+ for (; i < 10; i ++) {
+ sb.append("""
+ (${i},${i}*2,'a${i}b'),
+ """)
+ }
+ sb.append("""
+ (${i},${i}*2,'a${i}b')
+ """)
+ sql """ INSERT INTO test_pythonudf_array VALUES
+ ${sb.toString()}
+ """
+ qt_select_default """ SELECT * FROM test_pythonudf_array t ORDER BY user_id; """
+
+ File path = new File(pyPath)
+ if (!path.exists()) {
+ throw new IllegalStateException("""${pyPath} doesn't exist! """)
+ }
+
+ sql """ DROP FUNCTION IF EXISTS python_udf_array_int_test(array<int>); """
+ sql """ CREATE FUNCTION python_udf_array_int_test(array<int>) RETURNS int PROPERTIES (
+ "file"="file://${pyPath}",
+ "symbol"="array_int_test.evaluate",
+ "type"="PYTHON_UDF",
+ "always_nullable" = "true",
+ "runtime_version" = "${runtime_version}"
+ ); """
+ qt_select_1 """ SELECT python_udf_array_int_test(array(user_id)) result FROM test_pythonudf_array ORDER BY result; """
+ qt_select_2 """ SELECT python_udf_array_int_test(null) result ; """
+
+
+ sql """ DROP FUNCTION IF EXISTS python_udf_array_return_int_test(array<int>); """
+ sql """ CREATE FUNCTION python_udf_array_return_int_test(array<int>) RETURNS array<int> PROPERTIES (
+ "file"="file://${pyPath}",
+ "symbol"="array_return_array_int_test.evaluate",
+ "type"="PYTHON_UDF",
+ "always_nullable" = "true",
+ "runtime_version" = "${runtime_version}"
+ ); """
+ qt_select_3 """ SELECT python_udf_array_return_int_test(array(user_id)), tinyint_col as result FROM test_pythonudf_array ORDER BY result; """
+ qt_select_4 """ SELECT python_udf_array_return_int_test(array(user_id,user_id)), tinyint_col as result FROM test_pythonudf_array ORDER BY result; """
+ qt_select_5 """ SELECT python_udf_array_return_int_test(null) result ; """
+
+
+ sql """ DROP FUNCTION IF EXISTS python_udf_array_return_string_test(array<string>); """
+ sql """ CREATE FUNCTION python_udf_array_return_string_test(array<string>) RETURNS array<string> PROPERTIES (
+ "file"="file://${pyPath}",
+ "symbol"="array_return_array_string_test.evaluate",
+ "type"="PYTHON_UDF",
+ "always_nullable" = "true",
+ "runtime_version" = "${runtime_version}"
+ ); """
+ qt_select_6 """ SELECT python_udf_array_return_string_test(array(string_col)), tinyint_col as result FROM test_pythonudf_array ORDER BY result; """
+ qt_select_7 """ SELECT python_udf_array_return_string_test(array(string_col, cast(user_id as string))), tinyint_col as result FROM test_pythonudf_array ORDER BY result; """
+ qt_select_8 """ SELECT python_udf_array_return_string_test(null) result ; """
+
+ sql """ DROP FUNCTION IF EXISTS python_udf_array_string_test(array<string>); """
+ sql """ CREATE FUNCTION python_udf_array_string_test(array<string>) RETURNS string PROPERTIES (
+ "file"="file://${pyPath}",
+ "symbol"="array_string_test.evaluate",
+ "type"="PYTHON_UDF",
+ "always_nullable" = "true",
+ "runtime_version" = "${runtime_version}"
+ ); """
+ qt_select_9 """ SELECT python_udf_array_string_test(array(string_col)), tinyint_col as result FROM test_pythonudf_array ORDER BY result; """
+ qt_select_10 """ SELECT python_udf_array_string_test(array(string_col, cast(user_id as string))), tinyint_col as result FROM test_pythonudf_array ORDER BY result; """
+ qt_select_11 """ SELECT python_udf_array_string_test(null) result ; """
+
+ } finally {
+ try_sql("DROP FUNCTION IF EXISTS python_udf_array_int_test(array<int>);")
+ try_sql("DROP FUNCTION IF EXISTS python_udf_array_return_int_test(array<int>);")
+ try_sql("DROP FUNCTION IF EXISTS python_udf_array_return_string_test(array<string>);")
+ try_sql("DROP FUNCTION IF EXISTS python_udf_array_string_test(array<string>);")
+ try_sql("DROP TABLE IF EXISTS test_pythonudf_array")
+ }
+}
diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_base_data_type.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_base_data_type.groovy
new file mode 100644
index 0000000..a7fa5ff
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/test_pythonudf_base_data_type.groovy
@@ -0,0 +1,323 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_pythonudf_base_data_type") {
+ def pyPath = """${context.file.parent}/udf_scripts/pyudf.zip"""
+ scp_udf_file_to_all_be(pyPath)
+ def runtime_version = "3.10.12"
+ log.info("Python Zip path: ${pyPath}".toString())
+
+ // TEST INLINE CASE
+ try {
+ sql """
+ DROP FUNCTION IF EXISTS row_to_csv_all(
+ BOOLEAN,
+ TINYINT,
+ SMALLINT,
+ INT,
+ BIGINT,
+ LARGEINT,
+ FLOAT,
+ DOUBLE,
+ DECIMAL,
+ DECIMAL,
+ DECIMAL,
+ DATE,
+ DATETIME,
+ CHAR,
+ VARCHAR,
+ STRING
+ );
+ """
+ sql """
+CREATE FUNCTION row_to_csv_all(
+ BOOLEAN,
+ TINYINT,
+ SMALLINT,
+ INT,
+ BIGINT,
+ LARGEINT,
+ FLOAT,
+ DOUBLE,
+ DECIMAL,
+ DECIMAL,
+ DECIMAL,
+ DATE,
+ DATETIME,
+ CHAR,
+ VARCHAR,
+ STRING
+)
+RETURNS STRING
+PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "row_to_csv_all_impl",
+ "always_nullable" = "true",
+ "runtime_version" = "${runtime_version}"
+)
+AS \$\$
+def row_to_csv_all_impl(
+ bool_col, tinyint_col, smallint_col, int_col, bigint_col, largeint_col,
+ float_col, double_col, decimal32_col, decimal64_col, decimal128_col,
+ date_col, datetime_col, char_col, varchar_col, string_col
+):
+ cols = [
+ bool_col, tinyint_col, smallint_col, int_col, bigint_col, largeint_col,
+ float_col, double_col, decimal32_col, decimal64_col, decimal128_col,
+ date_col, datetime_col, char_col, varchar_col, string_col
+ ]
+
+ def safe_str(x):
+ return 'NULL' if x is None else str(x)
+
+ return ','.join(safe_str(col) for col in cols)
+\$\$;
+ """
+ sql """ DROP TABLE IF EXISTS test_datatype_table; """
+ sql """
+ CREATE TABLE test_datatype_table (
+ id INT,
+ bool_value BOOLEAN,
+ tinyint_value TINYINT,
+ smallint_value SMALLINT,
+ int_value INT,
+ bigint_value BIGINT,
+ largeint_value LARGEINT,
+ float_value float,
+ double_value DOUBLE,
+ decimal32_value DECIMAL(8, 2),
+ decimal64_value DECIMAL(16, 2),
+ decimal128_value DECIMAL(32, 8),
+ -- decimal256_value DECIMAL(64, 10),
+ date_value DATE,
+ datetime_value DATETIME,
+ char_value CHAR(100),
+ varchar_value VARCHAR(100),
+ string_value STRING
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES("replication_num" = "1");
+ """
+ sql """
+ INSERT INTO test_datatype_table VALUES
+ (1, TRUE, 127, 32767, 2147483647, 9223372036854775807, 170141183460469231731687303715884105727,
+ 1.23, 4.56789, 123456.78, 12345678901.2345, 123456789012345678901.234567890,
+ '2023-01-01', '2023-01-01 12:34:56', 'char_data_1', 'varchar_data_1', 'string_data_1'),
+
+ (2, FALSE, -128, -32768, -2147483648, -9223372036854775808, -170141183460469231731687303715884105728,
+ -2.34, -5.6789, -987654.32, -98765432.109876543, -987654321098765432.10987654321,
+ '2024-05-15', '2024-05-15 08:22:10', 'char_data_2', 'varchar_data_2', 'string_data_2'),
+
+ (3, TRUE, 0, 0, 0, 0, 0,
+ 0.0, 0.0, 0.00, 0.00, 0.00000000,
+ '2025-10-15', '2025-10-15 00:00:00', 'char_zero', 'varchar_zero', 'string_zero'),
+
+ (4, FALSE, 100, 20000, 300000000, 4000000000000000000, 99999999999999999999999999999999999999,
+ 3.14, 2.71828, 999999.99, 99999999999999.99, 99999999999999999999999.999999999999999,
+ '2022-12-31', '2022-12-31 23:59:59', 'char_max', 'varchar_max', 'string_max'),
+
+ (5, TRUE, -50, -10000, -100000000, -5000000000000000000, -99999999999999999999999999999999999999,
+ -1.41, -0.57721, -0.01, -0.01, -0.000000001,
+ '2021-07-04', '2021-07-04 14:30:00', 'char_neg', 'varchar_neg', 'string_neg');
+ """
+
+ qt_select_1 """
+ SELECT row_to_csv_all(
+ bool_value,
+ tinyint_value,
+ smallint_value,
+ int_value,
+ bigint_value,
+ largeint_value,
+ float_value,
+ double_value,
+ decimal32_value,
+ decimal64_value,
+ decimal128_value,
+ date_value,
+ datetime_value,
+ char_value,
+ varchar_value,
+ string_value
+ ) AS csv_row
+ FROM test_datatype_table;
+ """
+ } finally {
+ try_sql("""DROP FUNCTION IF EXISTS row_to_csv_all(
+ BOOLEAN,
+ TINYINT,
+ SMALLINT,
+ INT,
+ BIGINT,
+ LARGEINT,
+ FLOAT,
+ DOUBLE,
+ DECIMAL,
+ DECIMAL,
+ DECIMAL,
+ DATE,
+ DATETIME,
+ CHAR,
+ VARCHAR,
+ STRING
+ );""")
+ try_sql("DROP TABLE IF EXISTS test_datatype_table;")
+ }
+
+ // TEST MODULE CASE
+ try {
+ sql """
+ DROP FUNCTION IF EXISTS row_to_csv_all(
+ BOOLEAN,
+ TINYINT,
+ SMALLINT,
+ INT,
+ BIGINT,
+ LARGEINT,
+ FLOAT,
+ DOUBLE,
+ DECIMAL,
+ DECIMAL,
+ DECIMAL,
+ DATE,
+ DATETIME,
+ CHAR,
+ VARCHAR,
+ STRING
+ );
+ """
+ sql """
+ CREATE FUNCTION row_to_csv_all(
+ BOOLEAN,
+ TINYINT,
+ SMALLINT,
+ INT,
+ BIGINT,
+ LARGEINT,
+ FLOAT,
+ DOUBLE,
+ DECIMAL,
+ DECIMAL,
+ DECIMAL,
+ DATE,
+ DATETIME,
+ CHAR,
+ VARCHAR,
+ STRING
+ )
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${pyPath}",
+ "symbol" = "python_udf_data_type.row_to_csv_all_impl",
+ "always_nullable" = "true",
+ "runtime_version" = "${runtime_version}"
+ );
+ """
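+    // Same conversion function as the inline case above, but this time resolved from
+    // the shipped zip via "file" and a dotted module.function symbol path.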
+ sql """ DROP TABLE IF EXISTS test_datatype_table; """
+ sql """
+ CREATE TABLE test_datatype_table (
+ id INT,
+ bool_value BOOLEAN,
+ tinyint_value TINYINT,
+ smallint_value SMALLINT,
+ int_value INT,
+ bigint_value BIGINT,
+ largeint_value LARGEINT,
+ float_value float,
+ double_value DOUBLE,
+ decimal32_value DECIMAL(8, 2),
+ decimal64_value DECIMAL(16, 2),
+ decimal128_value DECIMAL(32, 8),
+ -- decimal256_value DECIMAL(64, 10),
+ date_value DATE,
+ datetime_value DATETIME,
+ char_value CHAR(100),
+ varchar_value VARCHAR(100),
+ string_value STRING
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES("replication_num" = "1");
+ """
+ sql """
+ INSERT INTO test_datatype_table VALUES
+ (1, TRUE, 127, 32767, 2147483647, 9223372036854775807, 170141183460469231731687303715884105727,
+ 1.23, 4.56789, 123456.78, 12345678901.2345, 123456789012345678901.234567890,
+ '2023-01-01', '2023-01-01 12:34:56', 'char_data_1', 'varchar_data_1', 'string_data_1'),
+
+ (2, FALSE, -128, -32768, -2147483648, -9223372036854775808, -170141183460469231731687303715884105728,
+ -2.34, -5.6789, -987654.32, -98765432.109876543, -987654321098765432.10987654321,
+ '2024-05-15', '2024-05-15 08:22:10', 'char_data_2', 'varchar_data_2', 'string_data_2'),
+
+ (3, TRUE, 0, 0, 0, 0, 0,
+ 0.0, 0.0, 0.00, 0.00, 0.00000000,
+ '2025-10-15', '2025-10-15 00:00:00', 'char_zero', 'varchar_zero', 'string_zero'),
+
+ (4, FALSE, 100, 20000, 300000000, 4000000000000000000, 99999999999999999999999999999999999999,
+ 3.14, 2.71828, 999999.99, 99999999999999.99, 99999999999999999999999.999999999999999,
+ '2022-12-31', '2022-12-31 23:59:59', 'char_max', 'varchar_max', 'string_max'),
+
+ (5, TRUE, -50, -10000, -100000000, -5000000000000000000, -99999999999999999999999999999999999999,
+ -1.41, -0.57721, -0.01, -0.01, -0.000000001,
+ '2021-07-04', '2021-07-04 14:30:00', 'char_neg', 'varchar_neg', 'string_neg');
+ """
+
+ qt_select_2 """
+ SELECT row_to_csv_all(
+ bool_value,
+ tinyint_value,
+ smallint_value,
+ int_value,
+ bigint_value,
+ largeint_value,
+ float_value,
+ double_value,
+ decimal32_value,
+ decimal64_value,
+ decimal128_value,
+ date_value,
+ datetime_value,
+ char_value,
+ varchar_value,
+ string_value
+ ) AS csv_row
+ FROM test_datatype_table;
+ """
+ } finally {
+ try_sql("""DROP FUNCTION IF EXISTS row_to_csv_all(
+ BOOLEAN,
+ TINYINT,
+ SMALLINT,
+ INT,
+ BIGINT,
+ LARGEINT,
+ FLOAT,
+ DOUBLE,
+ DECIMAL,
+ DECIMAL,
+ DECIMAL,
+ DATE,
+ DATETIME,
+ CHAR,
+ VARCHAR,
+ STRING
+ );""")
+ try_sql("DROP TABLE IF EXISTS test_datatype_table;")
+ }
+}
diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_benchmark.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_benchmark.groovy
new file mode 100644
index 0000000..2ce0564
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/test_pythonudf_benchmark.groovy
@@ -0,0 +1,341 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_pythonudf_benchmark") {
+ // Benchmark test for Python UDF with large dataset
+
+ def scalarPyPath = """${context.file.parent}/udf_scripts/python_udf_scalar_ops.zip"""
+ def vectorPyPath = """${context.file.parent}/udf_scripts/python_udf_vector_ops.zip"""
+ scp_udf_file_to_all_be(scalarPyPath)
+ scp_udf_file_to_all_be(vectorPyPath)
+ def runtime_version = "3.10.12"
+
+ sql "CREATE DATABASE IF NOT EXISTS test_pythonudf_benchmark"
+ sql "USE test_pythonudf_benchmark"
+
+ log.info("Python scalar module path: ${scalarPyPath}".toString())
+ log.info("Python vector module path: ${vectorPyPath}".toString())
+
+ try {
+ // ==================== Create Large Test Table ====================
+ sql """ DROP TABLE IF EXISTS python_udf_benchmark_table; """
+ sql """
+ CREATE TABLE python_udf_benchmark_table (
+ id BIGINT,
+ int_val INT,
+ double_val DOUBLE,
+ string_val STRING,
+ email STRING,
+ bool_val BOOLEAN,
+ date_val DATE
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 10
+ PROPERTIES("replication_num" = "1");
+ """
+
+ log.info("Creating benchmark table with large dataset...")
+
+        // Load 1 million rows using streamLoad (much faster than row-by-row INSERTs)
+ def totalRows = 1000000
+
+ log.info("Loading ${totalRows} rows using streamLoad from CSV file...")
+ def loadStartTime = System.currentTimeMillis()
+
+ streamLoad {
+ db 'test_pythonudf_benchmark'
+ table "python_udf_benchmark_table"
+ set 'column_separator', '\t'
+ file 'benchmark_data_1m.csv'
+ time 120000 // 120 seconds timeout
+
+ check { result, exception, startTime, endTime ->
+ if (exception != null) {
+ throw exception
+ }
+ log.info("Stream load result: ${result}".toString())
+ def json = parseJson(result)
+ assertEquals("success", json.Status.toLowerCase())
+ assertEquals(json.NumberTotalRows, json.NumberLoadedRows)
+ assertTrue(json.NumberLoadedRows > 0 && json.LoadBytes > 0)
+ }
+ }
+
+ def loadEndTime = System.currentTimeMillis()
+ log.info("Data loaded in ${loadEndTime - loadStartTime} ms (${String.format('%.2f', totalRows / ((loadEndTime - loadStartTime) / 1000.0))} rows/sec)")
+
+ sql "sync"
+
+ // Verify row count
+ def rowCount = sql "SELECT COUNT(*) FROM python_udf_benchmark_table"
+ log.info("Verified row count: ${rowCount[0][0]}")
+
+ // ==================== Benchmark 1: Simple Scalar UDF ====================
+ log.info("=== Benchmark 1: Simple Scalar UDF (multiply_with_default) ===")
+
+ sql """ DROP FUNCTION IF EXISTS py_multiply(INT, INT, INT); """
+ sql """
+ CREATE FUNCTION py_multiply(INT, INT, INT)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${scalarPyPath}",
+ "symbol" = "python_udf_scalar_ops.multiply_with_default",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
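+        // Each benchmark wraps the UDF call in COUNT(*) so the UDF is evaluated over
+        // every row while only a single aggregate value is returned to the client.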
+ def startTime1 = System.currentTimeMillis()
+ sql """
+ SELECT COUNT(*)
+ FROM (
+ SELECT id, py_multiply(int_val, 2, 1) AS result
+ FROM python_udf_benchmark_table
+ ) t;
+ """
+ def endTime1 = System.currentTimeMillis()
+ log.info("Scalar UDF (simple): ${endTime1 - startTime1} ms for ${totalRows} rows")
+
+ // ==================== Benchmark 2: Complex Scalar UDF ====================
+ log.info("=== Benchmark 2: Complex Scalar UDF (Levenshtein distance) ===")
+
+ sql """ DROP FUNCTION IF EXISTS py_levenshtein(STRING, STRING); """
+ sql """
+ CREATE FUNCTION py_levenshtein(STRING, STRING)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${scalarPyPath}",
+ "symbol" = "python_udf_scalar_ops.levenshtein_distance",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ def startTime2 = System.currentTimeMillis()
+ sql """
+ SELECT COUNT(*)
+ FROM (
+ SELECT id, py_levenshtein(string_val, 'test_string_50') AS distance
+ FROM python_udf_benchmark_table
+ LIMIT 100000
+ ) t;
+ """
+ def endTime2 = System.currentTimeMillis()
+ log.info("Scalar UDF (complex): ${endTime2 - startTime2} ms for 100000 rows")
+
+ // ==================== Benchmark 3: String Processing Scalar UDF ====================
+ log.info("=== Benchmark 3: String Processing Scalar UDF (extract_domain) ===")
+
+ sql """ DROP FUNCTION IF EXISTS py_extract_domain(STRING); """
+ sql """
+ CREATE FUNCTION py_extract_domain(STRING)
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${scalarPyPath}",
+ "symbol" = "python_udf_scalar_ops.extract_domain",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ def startTime3 = System.currentTimeMillis()
+ sql """
+ SELECT COUNT(*)
+ FROM (
+ SELECT id, py_extract_domain(email) AS domain
+ FROM python_udf_benchmark_table
+ ) t;
+ """
+ def endTime3 = System.currentTimeMillis()
+ log.info("Scalar UDF (string): ${endTime3 - startTime3} ms for ${totalRows} rows")
+
+ // ==================== Benchmark 4: Simple Vector UDF ====================
+ log.info("=== Benchmark 4: Simple Vector UDF (add_constant) ===")
+
+ sql """ DROP FUNCTION IF EXISTS py_vec_add(INT, INT); """
+ sql """
+ CREATE FUNCTION py_vec_add(INT, INT)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${vectorPyPath}",
+ "symbol" = "python_udf_vector_ops.add_constant",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true",
+ "vectorized" = "true"
+ );
+ """
+
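+        // "vectorized" = "true" hands the UDF whole column batches rather than one row
+        // per call (the vector_ops module presumably operates on pandas Series); the
+        // later benchmarks compare this path against the scalar one.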
+ def startTime4 = System.currentTimeMillis()
+ sql """
+ SELECT COUNT(*)
+ FROM (
+ SELECT id, py_vec_add(int_val, 100) AS result
+ FROM python_udf_benchmark_table
+ ) t;
+ """
+ def endTime4 = System.currentTimeMillis()
+ log.info("Vector UDF (simple): ${endTime4 - startTime4} ms for ${totalRows} rows")
+
+ // ==================== Benchmark 5: Complex Vector UDF ====================
+ log.info("=== Benchmark 5: Complex Vector UDF (string_length) ===")
+
+ sql """ DROP FUNCTION IF EXISTS py_vec_strlen(STRING); """
+ sql """
+ CREATE FUNCTION py_vec_strlen(STRING)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${vectorPyPath}",
+ "symbol" = "python_udf_vector_ops.string_length",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true",
+ "vectorized" = "true"
+ );
+ """
+
+ def startTime5 = System.currentTimeMillis()
+ sql """
+ SELECT COUNT(*)
+ FROM (
+ SELECT id, py_vec_strlen(string_val) AS len
+ FROM python_udf_benchmark_table
+ ) t;
+ """
+ def endTime5 = System.currentTimeMillis()
+ log.info("Vector UDF (string): ${endTime5 - startTime5} ms for ${totalRows} rows")
+
+ // ==================== Benchmark 6: Scalar UDF with Grouping ====================
+ log.info("=== Benchmark 6: Scalar UDF with Grouping ===")
+
+ def startTime6 = System.currentTimeMillis()
+ sql """
+ SELECT
+ int_val % 100 AS bucket,
+ COUNT(*) AS cnt,
+ SUM(int_val) AS total
+ FROM python_udf_benchmark_table
+ WHERE py_multiply(int_val, 2, 1) > 1000
+ GROUP BY int_val % 100
+ ORDER BY bucket
+ LIMIT 10;
+ """
+ def endTime6 = System.currentTimeMillis()
+ log.info("Scalar UDF with Grouping (WHERE clause): ${endTime6 - startTime6} ms")
+
+ // ==================== Benchmark 7: Vector UDF with Grouping ====================
+ log.info("=== Benchmark 7: Vector UDF with Grouping ===")
+
+ def startTime7 = System.currentTimeMillis()
+ sql """
+ SELECT
+ int_val % 100 AS bucket,
+ COUNT(*) AS cnt,
+ SUM(int_val) AS total
+ FROM python_udf_benchmark_table
+ WHERE py_vec_add(int_val, 100) > 1000
+ GROUP BY int_val % 100
+ ORDER BY bucket
+ LIMIT 10;
+ """
+ def endTime7 = System.currentTimeMillis()
+ log.info("Vector UDF with Grouping (WHERE clause): ${endTime7 - startTime7} ms")
+
+ // ==================== Benchmark 8: Multiple UDFs in Single Query ====================
+ log.info("=== Benchmark 8: Multiple UDFs in Single Query ===")
+
+ def startTime8 = System.currentTimeMillis()
+ sql """
+ SELECT COUNT(*)
+ FROM (
+ SELECT
+ id,
+ py_multiply(int_val, 2, 1) AS mul_result,
+ py_extract_domain(email) AS domain,
+ py_vec_add(int_val, 100) AS vec_result
+ FROM python_udf_benchmark_table
+ LIMIT 100000
+ ) t;
+ """
+ def endTime8 = System.currentTimeMillis()
+ log.info("Multiple UDFs: ${endTime8 - startTime8} ms for 100000 rows")
+
+ // ==================== Benchmark 9: Filter with UDF ====================
+ log.info("=== Benchmark 9: Filter with UDF ===")
+
+ sql """ DROP FUNCTION IF EXISTS py_is_prime(INT); """
+ sql """
+ CREATE FUNCTION py_is_prime(INT)
+ RETURNS BOOLEAN
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${scalarPyPath}",
+ "symbol" = "python_udf_scalar_ops.is_prime",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ def startTime9 = System.currentTimeMillis()
+ sql """
+ SELECT COUNT(*)
+ FROM python_udf_benchmark_table
+        WHERE py_is_prime(int_val) = true;
+ """
+ def endTime9 = System.currentTimeMillis()
+ log.info("Filter with UDF: ${endTime9 - startTime9} ms")
+
+ // ==================== Benchmark Summary ====================
+ log.info("\n" + "=" * 80 + "\nBENCHMARK SUMMARY\n" + "=" * 80 + "\nDataset size: ${totalRows} rows\n" +
+ "1. Scalar UDF (simple multiply): ${endTime1 - startTime1} ms\n" +
+ "2. Scalar UDF (complex Levenshtein): ${endTime2 - startTime2} ms (100K rows)\n" +
+ "3. Scalar UDF (string extract_domain): ${endTime3 - startTime3} ms\n" +
+ "4. Vector UDF (simple add): ${endTime4 - startTime4} ms\n" +
+ "5. Vector UDF (string length): ${endTime5 - startTime5} ms\n" +
+ "6. Aggregation with Scalar UDF: ${endTime6 - startTime6} ms\n" +
+ "7. Aggregation with Vector UDF: ${endTime7 - startTime7} ms\n" +
+ "8. Multiple UDFs in query: ${endTime8 - startTime8} ms (100K rows)\n" +
+ "9. Filter with UDF: ${endTime9 - startTime9} ms\n" +
+ "=" * 80)
+
+    // Calculate throughput; clamp elapsed time to 1 ms so a sub-millisecond run
+    // cannot cause division by zero
+    def scalarElapsedMs = Math.max(1, endTime1 - startTime1)
+    def vectorElapsedMs = Math.max(1, endTime4 - startTime4)
+    def throughput1 = totalRows / (scalarElapsedMs / 1000.0)
+    def throughput4 = totalRows / (vectorElapsedMs / 1000.0)
+    log.info("Scalar UDF throughput: ${String.format('%.2f', throughput1)} rows/sec")
+    log.info("Vector UDF throughput: ${String.format('%.2f', throughput4)} rows/sec")
+    log.info("Vector speedup: ${String.format('%.2f', scalarElapsedMs / vectorElapsedMs)}x")
+
+ } finally {
+ // Cleanup
+ log.info("Cleaning up benchmark resources...")
+
+ try_sql("DROP FUNCTION IF EXISTS py_multiply(INT, INT, INT);")
+ try_sql("DROP FUNCTION IF EXISTS py_levenshtein(STRING, STRING);")
+ try_sql("DROP FUNCTION IF EXISTS py_extract_domain(STRING);")
+ try_sql("DROP FUNCTION IF EXISTS py_vec_add(INT, INT);")
+ try_sql("DROP FUNCTION IF EXISTS py_vec_strlen(STRING);")
+ try_sql("DROP FUNCTION IF EXISTS py_is_prime(INT);")
+
+ try_sql("DROP TABLE IF EXISTS python_udf_benchmark_table;")
+ try_sql("DROP DATABASE IF EXISTS test_pythonudf_benchmark;")
+ log.info("Benchmark cleanup completed.")
+ }
+}
diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_boolean.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_boolean.groovy
new file mode 100644
index 0000000..7d2ba90
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/test_pythonudf_boolean.groovy
@@ -0,0 +1,67 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_pythonudf_boolean") {
+ def pyPath = """${context.file.parent}/udf_scripts/pyudf.zip"""
+ scp_udf_file_to_all_be(pyPath)
+ def runtime_version = "3.10.12"
+ log.info("Python Zip path: ${pyPath}".toString())
+ try {
+ sql """ DROP TABLE IF EXISTS test_pythonudf_boolean """
+ sql """
+ CREATE TABLE IF NOT EXISTS test_pythonudf_boolean (
+ `user_id` INT NOT NULL COMMENT "",
+ `boo_1` BOOLEAN NOT NULL COMMENT ""
+ )
+ DISTRIBUTED BY HASH(user_id) PROPERTIES("replication_num" = "1");
+ """
+
+ sql """ INSERT INTO test_pythonudf_boolean (`user_id`,`boo_1`) VALUES
+ (111,true),
+ (112,false),
+ (113,0),
+ (114,1)
+ """
+ qt_select_default """ SELECT * FROM test_pythonudf_boolean t ORDER BY user_id; """
+
+ File path1 = new File(pyPath)
+ if (!path1.exists()) {
+ throw new IllegalStateException("""${pyPath} doesn't exist! """)
+ }
+
+ sql """ CREATE FUNCTION python_udf_boolean_test(BOOLEAN) RETURNS BOOLEAN PROPERTIES (
+ "file"="file://${pyPath}",
+ "symbol"="boolean_test.evaluate",
+ "type"="PYTHON_UDF",
+ "always_nullable" = "true",
+ "runtime_version" = "${runtime_version}"
+ ); """
+
+ qt_select """ SELECT python_udf_boolean_test(1) as result; """
+ qt_select """ SELECT python_udf_boolean_test(0) as result ; """
+ qt_select """ SELECT python_udf_boolean_test(true) as result ; """
+ qt_select """ SELECT python_udf_boolean_test(false) as result ; """
+ qt_select """ SELECT python_udf_boolean_test(null) as result ; """
+ qt_select """ SELECT user_id,python_udf_boolean_test(boo_1) as result FROM test_pythonudf_boolean order by user_id; """
+
+ } finally {
+ try_sql("DROP FUNCTION IF EXISTS python_udf_boolean_test(BOOLEAN);")
+ try_sql("DROP TABLE IF EXISTS test_pythonudf_boolean")
+ }
+}
diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_complex_data_type.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_complex_data_type.groovy
new file mode 100644
index 0000000..b071654
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/test_pythonudf_complex_data_type.groovy
@@ -0,0 +1,408 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_pythonudf_complex_data_type") {
+ def pyPath = """${context.file.parent}/udf_scripts/pyudf.zip"""
+ scp_udf_file_to_all_be(pyPath)
+ def runtime_version = "3.10.12"
+ log.info("Python Zip path: ${pyPath}".toString())
+
+ // TEST ARRAY INLINE CASE
+ try {
+ sql """
+ DROP FUNCTION IF EXISTS array_to_csv(
+ ARRAY<INT>,
+ ARRAY<STRING>,
+ ARRAY<ARRAY<INT>>
+ );
+ """
+ sql """
+CREATE FUNCTION array_to_csv(
+ ARRAY<INT>,
+ ARRAY<STRING>,
+ ARRAY<ARRAY<INT>>
+)
+RETURNS STRING
+PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "array_to_csv_impl",
+ "always_nullable" = "true",
+ "runtime_version" = "${runtime_version}"
+)
+AS \$\$
+def array_to_csv_impl(int_arr, str_arr, nested_arr):
+ def safe_str(x):
+ return 'NULL' if x is None else str(x)
+
+ def format_array(arr):
+ if arr is None:
+ return 'NULL'
+ return '[' + ','.join(safe_str(item) for item in arr) + ']'
+
+ def format_nested_array(arr):
+ if arr is None:
+ return 'NULL'
+ return '[' + ','.join(format_array(inner) for inner in arr) + ']'
+
+ parts = [
+ format_array(int_arr),
+ format_array(str_arr),
+ format_nested_array(nested_arr)
+ ]
+ return '|'.join(parts)
+\$\$;
+ """
+ sql """ DROP TABLE IF EXISTS test_array_table; """
+ sql """
+ CREATE TABLE test_array_table (
+ id INT,
+ int_array ARRAY<INT>,
+ string_array ARRAY<STRING>,
+ nested_array ARRAY<ARRAY<INT>>
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES("replication_num" = "1");
+ """
+ sql """
+ INSERT INTO test_array_table VALUES
+ (1, [1, 2, 3], ['a', 'b', 'c'], [[1,2], [3,4]]),
+ (2, [], [], []),
+ (3, NULL, ['x', NULL, 'z'], NULL),
+ (4, [0, -1, 2147483647], ['hello', 'world'], [[], [1]]);
+ """
+
+ qt_select_1 """
+ SELECT array_to_csv(int_array, string_array, nested_array) AS result FROM test_array_table;
+ """
+ } finally {
+ try_sql("""DROP FUNCTION IF EXISTS array_to_csv(
+ ARRAY<INT>,
+ ARRAY<STRING>,
+ ARRAY<ARRAY<INT>>
+ );""")
+ try_sql("DROP TABLE IF EXISTS test_array_table;")
+ }
+
+ // TEST ARRAY MODULE CASE
+ try {
+ sql """
+ DROP FUNCTION IF EXISTS array_to_csv(
+ ARRAY<INT>,
+ ARRAY<STRING>,
+ ARRAY<ARRAY<INT>>
+ );
+ """
+ sql """
+ CREATE FUNCTION array_to_csv(
+ ARRAY<INT>,
+ ARRAY<STRING>,
+ ARRAY<ARRAY<INT>>
+ )
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file"="file://${pyPath}",
+ "symbol" = "python_udf_array_type.array_to_csv_impl",
+ "always_nullable" = "true",
+ "runtime_version" = "${runtime_version}"
+ );
+ """
+ sql """ DROP TABLE IF EXISTS test_array_table; """
+ sql """
+ CREATE TABLE test_array_table (
+ id INT,
+ int_array ARRAY<INT>,
+ string_array ARRAY<STRING>,
+ nested_array ARRAY<ARRAY<INT>>
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES("replication_num" = "1");
+ """
+ sql """
+ INSERT INTO test_array_table VALUES
+ (1, [1, 2, 3], ['a', 'b', 'c'], [[1,2], [3,4]]),
+ (2, [], [], []),
+ (3, NULL, ['x', NULL, 'z'], NULL),
+ (4, [0, -1, 2147483647], ['hello', 'world'], [[], [1]]);
+ """
+
+ qt_select_2 """
+ SELECT array_to_csv(int_array, string_array, nested_array) AS result FROM test_array_table;
+ """
+ } finally {
+ try_sql("""DROP FUNCTION IF EXISTS array_to_csv(
+ ARRAY<INT>,
+ ARRAY<STRING>,
+ ARRAY<ARRAY<INT>>
+ );""")
+ try_sql("DROP TABLE IF EXISTS test_array_table;")
+ }
+
+ // TEST MAP INLINE CASE
+ try {
+ sql """
+ DROP FUNCTION IF EXISTS map_to_csv(
+ MAP<INT, STRING>,
+ MAP<STRING, DOUBLE>
+ );
+ """
+ sql """
+CREATE FUNCTION map_to_csv(
+ MAP<INT, STRING>,
+ MAP<STRING, DOUBLE>
+)
+RETURNS STRING
+PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "map_to_csv_impl",
+ "always_nullable" = "true",
+ "runtime_version" = "${runtime_version}"
+)
+AS \$\$
+def map_to_csv_impl(map1, map2):
+ def safe_str(x):
+ return 'NULL' if x is None else str(x)
+
+ def format_map(m):
+ if m is None:
+ return 'NULL'
+ # Doris passes MAP as Python dict
+ items = [f"{safe_str(k)}:{safe_str(v)}" for k, v in m.items()]
+ return '{' + ','.join(sorted(items)) + '}'
+
+ return '|'.join([format_map(map1), format_map(map2)])
+\$\$;
+ """
+ sql """ DROP TABLE IF EXISTS test_map_table; """
+ sql """
+ CREATE TABLE test_map_table (
+ id INT,
+ int_string_map MAP<INT, STRING>,
+ string_double_map MAP<STRING, DOUBLE>
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES("replication_num" = "1");
+ """
+ sql """
+ INSERT INTO test_map_table VALUES
+ (1, {1:'one', 2:'two'}, {'pi':3.14, 'e':2.718}),
+ (2, {}, {}),
+ (3, NULL, {'null_key': NULL}),
+ (4, {0:'zero', -1:'minus_one'}, {'max':1.79769e308});
+ """
+
+ qt_select_3 """
+ SELECT map_to_csv(int_string_map, string_double_map) AS result FROM test_map_table;
+ """
+ } finally {
+ try_sql("""DROP FUNCTION IF EXISTS map_to_csv(
+ MAP<INT, STRING>,
+ MAP<STRING, DOUBLE>
+ );""")
+ try_sql("DROP TABLE IF EXISTS test_map_table;")
+ }
+
+ // TEST MAP MODULE CASE
+ try {
+ sql """
+ DROP FUNCTION IF EXISTS map_to_csv(
+ MAP<INT, STRING>,
+ MAP<STRING, DOUBLE>
+ );
+ """
+ sql """
+ CREATE FUNCTION map_to_csv(
+ MAP<INT, STRING>,
+ MAP<STRING, DOUBLE>
+ )
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file"="file://${pyPath}",
+ "symbol" = "python_udf_map_type.map_to_csv_impl",
+ "always_nullable" = "true",
+ "runtime_version" = "${runtime_version}"
+ );
+ """
+ sql """ DROP TABLE IF EXISTS test_map_table; """
+ sql """
+ CREATE TABLE test_map_table (
+ id INT,
+ int_string_map MAP<INT, STRING>,
+ string_double_map MAP<STRING, DOUBLE>
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES("replication_num" = "1");
+ """
+ sql """
+ INSERT INTO test_map_table VALUES
+ (1, {1:'one', 2:'two'}, {'pi':3.14, 'e':2.718}),
+ (2, {}, {}),
+ (3, NULL, {'null_key': NULL}),
+ (4, {0:'zero', -1:'minus_one'}, {'max':1.79769e308});
+ """
+
+ qt_select_4 """
+ SELECT map_to_csv(int_string_map, string_double_map) AS result FROM test_map_table;
+ """
+ } finally {
+ try_sql("""DROP FUNCTION IF EXISTS map_to_csv(
+ MAP<INT, STRING>,
+ MAP<STRING, DOUBLE>
+ );""")
+ try_sql("DROP TABLE IF EXISTS test_map_table;")
+ }
+
+ // TEST STRUCT INLINE CASE
+ try {
+ sql """
+ DROP FUNCTION IF EXISTS struct_to_csv(
+ STRUCT<name: STRING, age: INT, salary: DECIMAL(12,2)>,
+ STRUCT<x: DOUBLE, y: DOUBLE, tags: ARRAY<STRING>>
+ );
+ """
+ sql """
+CREATE FUNCTION struct_to_csv(
+ STRUCT<name: STRING, age: INT, salary: DECIMAL(12,2)>,
+ STRUCT<x: DOUBLE, y: DOUBLE, tags: ARRAY<STRING>>
+)
+RETURNS STRING
+PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "struct_to_csv_impl",
+ "always_nullable" = "true",
+ "runtime_version" = "${runtime_version}"
+)
+AS \$\$
+def struct_to_csv_impl(person, point):
+ def safe_str(x):
+ return 'NULL' if x is None else str(x)
+
+ def format_array(arr):
+ if arr is None:
+ return 'NULL'
+ return '[' + ','.join(safe_str(item) for item in arr) + ']'
+
+ def format_struct_dict(s, field_names):
+ if s is None:
+ return 'NULL'
+ parts = []
+ for field in field_names:
+ val = s.get(field)
+ parts.append(safe_str(val))
+ return '(' + ','.join(parts) + ')'
+
+ person_str = format_struct_dict(person, ['name', 'age', 'salary'])
+
+ if point is None:
+ point_str = 'NULL'
+ else:
+ x_val = safe_str(point.get('x'))
+ y_val = safe_str(point.get('y'))
+ tags_val = format_array(point.get('tags'))
+ point_str = f"({x_val},{y_val},{tags_val})"
+
+ return '|'.join([person_str, point_str])
+\$\$;
+ """
+ sql """ DROP TABLE IF EXISTS test_struct_table; """
+ sql """
+ CREATE TABLE test_struct_table(
+ id INT,
+ person STRUCT<name: STRING, age: INT, salary: DECIMAL(12,2)>,
+ point STRUCT<x: DOUBLE, y: DOUBLE, tags: ARRAY<STRING>>
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES("replication_num" = "1");
+ """
+ sql """
+ INSERT INTO test_struct_table VALUES
+ (1, {'Alice', 30, 75000.50}, {1.5, 2.5, ['red', 'blue']}),
+ (2, {NULL, NULL, NULL}, {0.0, 0.0, []}),
+ (3, {'Bob', 25, 60000.00}, {NULL, 3.14, ['tag1', NULL, 'tag3']}),
+ (4, {'', 0, 0.0}, {-1.0, -2.0, NULL});
+ """
+
+ qt_select_5 """
+ SELECT struct_to_csv(person, point) AS result FROM test_struct_table;
+ """
+ } finally {
+ try_sql("""DROP FUNCTION IF EXISTS struct_to_csv(
+ STRUCT<name: STRING, age: INT, salary: DECIMAL(12,2)>,
+ STRUCT<x: DOUBLE, y: DOUBLE, tags: ARRAY<STRING>>
+ );""")
+ try_sql("DROP TABLE IF EXISTS test_struct_table;")
+ }
+
+ // TEST STRUCT MODULE CASE
+ try {
+ sql """
+ DROP FUNCTION IF EXISTS struct_to_csv(
+ STRUCT<name: STRING, age: INT, salary: DECIMAL(12,2)>,
+ STRUCT<x: DOUBLE, y: DOUBLE, tags: ARRAY<STRING>>
+ );
+ """
+ sql """
+ CREATE FUNCTION struct_to_csv(
+ STRUCT<name: STRING, age: INT, salary: DECIMAL(12,2)>,
+ STRUCT<x: DOUBLE, y: DOUBLE, tags: ARRAY<STRING>>
+ )
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file"="file://${pyPath}",
+ "symbol" = "python_udf_struct_type.struct_to_csv_impl",
+ "always_nullable" = "true",
+ "runtime_version" = "${runtime_version}"
+ );
+ """
+ sql """ DROP TABLE IF EXISTS test_struct_table; """
+ sql """
+ CREATE TABLE test_struct_table(
+ id INT,
+ person STRUCT<name: STRING, age: INT, salary: DECIMAL(12,2)>,
+ point STRUCT<x: DOUBLE, y: DOUBLE, tags: ARRAY<STRING>>
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES("replication_num" = "1");
+ """
+ sql """
+ INSERT INTO test_struct_table VALUES
+ (1, {'Alice', 30, 75000.50}, {1.5, 2.5, ['red', 'blue']}),
+ (2, {NULL, NULL, NULL}, {0.0, 0.0, []}),
+ (3, {'Bob', 25, 60000.00}, {NULL, 3.14, ['tag1', NULL, 'tag3']}),
+ (4, {'', 0, 0.0}, {-1.0, -2.0, NULL});
+ """
+
+ qt_select_6 """
+ SELECT struct_to_csv(person, point) AS result FROM test_struct_table;
+ """
+ } finally {
+ try_sql("""DROP FUNCTION IF EXISTS struct_to_csv(
+ STRUCT<name: STRING, age: INT, salary: DECIMAL(12,2)>,
+ STRUCT<x: DOUBLE, y: DOUBLE, tags: ARRAY<STRING>>
+ );""")
+ try_sql("DROP TABLE IF EXISTS test_struct_table;")
+ }
+}
diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_data_types.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_data_types.groovy
new file mode 100644
index 0000000..abbb22f
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/test_pythonudf_data_types.groovy
@@ -0,0 +1,189 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_pythonudf_data_types") {
+ // Test various data types supported by Python UDF
+ def runtime_version = "3.10.12"
+
+ try {
+ // Test 1: TINYINT type
+ sql """ DROP FUNCTION IF EXISTS py_tinyint_test(TINYINT); """
+ sql """
+ CREATE FUNCTION py_tinyint_test(TINYINT)
+ RETURNS TINYINT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+def evaluate(x):
+ if x is None:
+ return None
+ return x + 1
+\$\$;
+ """
+
+ qt_select_tinyint """ SELECT py_tinyint_test(CAST(10 AS TINYINT)) AS result; """
+
+ // Test 2: SMALLINT type
+ sql """ DROP FUNCTION IF EXISTS py_smallint_test(SMALLINT); """
+ sql """
+ CREATE FUNCTION py_smallint_test(SMALLINT)
+ RETURNS SMALLINT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+def evaluate(x):
+ if x is None:
+ return None
+ return x * 2
+\$\$;
+ """
+
+ qt_select_smallint """ SELECT py_smallint_test(CAST(1000 AS SMALLINT)) AS result; """
+
+ // Test 3: BIGINT type
+ sql """ DROP FUNCTION IF EXISTS py_bigint_test(BIGINT); """
+ sql """
+ CREATE FUNCTION py_bigint_test(BIGINT)
+ RETURNS BIGINT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+def evaluate(x):
+ if x is None:
+ return None
+ return x + 1000000
+\$\$;
+ """
+
+ qt_select_bigint """ SELECT py_bigint_test(1000000000000) AS result; """
+
+ // Test 4: DECIMAL type
+ sql """ DROP FUNCTION IF EXISTS py_decimal_test(DECIMAL(10,2)); """
+ sql """
+ CREATE FUNCTION py_decimal_test(DECIMAL(10,2))
+ RETURNS DECIMAL(10,2)
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+from decimal import Decimal
+
+def evaluate(x):
+    if x is None:
+        return None
+    # DECIMAL arguments are assumed to arrive as decimal.Decimal; multiplying a
+    # Decimal by a float literal raises TypeError, so use a Decimal factor
+    return Decimal(x) * Decimal('1.1')
+\$\$;
+ """
+
+ qt_select_decimal """ SELECT py_decimal_test(100.50) AS result; """
+
+ // Test 5: DATE type
+ sql """ DROP FUNCTION IF EXISTS py_date_test(DATE); """
+ sql """
+ CREATE FUNCTION py_date_test(DATE)
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+def evaluate(d):
+ if d is None:
+ return None
+ return str(d)
+\$\$;
+ """
+
+ qt_select_date """ SELECT py_date_test('2024-01-15') AS result; """
+
+ // Test 6: DATETIME type
+ sql """ DROP FUNCTION IF EXISTS py_datetime_test(DATETIME); """
+ sql """
+ CREATE FUNCTION py_datetime_test(DATETIME)
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+def evaluate(dt):
+ if dt is None:
+ return None
+ return str(dt)
+\$\$;
+ """
+
+ qt_select_datetime """ SELECT py_datetime_test('2024-01-15 10:30:45') AS result; """
+
+ // Test 7: Comprehensive test - create table and test multiple data types
+ sql """ DROP TABLE IF EXISTS data_types_test_table; """
+ sql """
+ CREATE TABLE data_types_test_table (
+ id INT,
+ tiny_val TINYINT,
+ small_val SMALLINT,
+ int_val INT,
+ big_val BIGINT,
+ float_val FLOAT,
+ double_val DOUBLE,
+ decimal_val DECIMAL(10,2),
+ string_val STRING,
+ bool_val BOOLEAN
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES("replication_num" = "1");
+ """
+
+ sql """
+ INSERT INTO data_types_test_table VALUES
+ (1, 10, 100, 1000, 10000, 1.5, 2.5, 100.50, 'test1', true),
+ (2, 20, 200, 2000, 20000, 2.5, 3.5, 200.75, 'test2', false),
+ (3, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL);
+ """
+
+ qt_select_table_types """
+ SELECT
+ id,
+ py_tinyint_test(tiny_val) AS tiny_result,
+ py_smallint_test(small_val) AS small_result,
+ py_bigint_test(big_val) AS big_result
+ FROM data_types_test_table
+ ORDER BY id;
+ """
+
+ } finally {
+ try_sql("DROP FUNCTION IF EXISTS py_tinyint_test(TINYINT);")
+ try_sql("DROP FUNCTION IF EXISTS py_smallint_test(SMALLINT);")
+ try_sql("DROP FUNCTION IF EXISTS py_bigint_test(BIGINT);")
+ try_sql("DROP FUNCTION IF EXISTS py_decimal_test(DECIMAL(10,2));")
+ try_sql("DROP FUNCTION IF EXISTS py_date_test(DATE);")
+ try_sql("DROP FUNCTION IF EXISTS py_datetime_test(DATETIME);")
+ try_sql("DROP TABLE IF EXISTS data_types_test_table;")
+ }
+}
diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_error_handling.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_error_handling.groovy
new file mode 100644
index 0000000..1871d81
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/test_pythonudf_error_handling.groovy
@@ -0,0 +1,190 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_pythonudf_error_handling") {
+ // Test error handling and exception cases for Python UDF
+
+ def runtime_version = "3.10.12"
+ try {
+ // Test 1: Division by zero error handling
+ sql """ DROP FUNCTION IF EXISTS py_safe_divide(DOUBLE, DOUBLE); """
+ sql """
+ CREATE FUNCTION py_safe_divide(DOUBLE, DOUBLE)
+ RETURNS DOUBLE
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "always_nullable" = "true",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+def evaluate(a, b):
+ if a is None or b is None:
+ return None
+ if b == 0:
+ return None
+ return a / b
+\$\$;
+ """
+
+ qt_select_divide_normal """ SELECT py_safe_divide(10.0, 2.0) AS result; """
+ qt_select_divide_zero """ SELECT py_safe_divide(10.0, 0.0) AS result; """
+ qt_select_divide_null """ SELECT py_safe_divide(10.0, NULL) AS result; """
+
+ // Test 2: String index out of bounds handling
+ sql """ DROP FUNCTION IF EXISTS py_safe_substring(STRING, INT); """
+ sql """
+ CREATE FUNCTION py_safe_substring(STRING, INT)
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "always_nullable" = "true",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+def evaluate(s, index):
+ if s is None or index is None:
+ return None
+ if index < 0 or index >= len(s):
+ return None
+ return s[index]
+\$\$;
+ """
+
+ qt_select_substring_valid """ SELECT py_safe_substring('hello', 1) AS result; """
+ qt_select_substring_invalid """ SELECT py_safe_substring('hello', 10) AS result; """
+ qt_select_substring_negative """ SELECT py_safe_substring('hello', -1) AS result; """
+
+ // Test 3: Type conversion error handling
+ sql """ DROP FUNCTION IF EXISTS py_safe_int_parse(STRING); """
+ sql """
+ CREATE FUNCTION py_safe_int_parse(STRING)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "always_nullable" = "true",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+def evaluate(s):
+ if s is None:
+ return None
+ try:
+ return int(s)
+ except (ValueError, TypeError):
+ return None
+\$\$;
+ """
+
+ qt_select_parse_valid """ SELECT py_safe_int_parse('123') AS result; """
+ qt_select_parse_invalid """ SELECT py_safe_int_parse('abc') AS result; """
+ qt_select_parse_empty """ SELECT py_safe_int_parse('') AS result; """
+
+ // Test 4: Array out of bounds handling
+ sql """ DROP FUNCTION IF EXISTS py_safe_array_get(ARRAY<INT>, INT); """
+ sql """
+ CREATE FUNCTION py_safe_array_get(ARRAY<INT>, INT)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "always_nullable" = "true",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+def evaluate(arr, index):
+ if arr is None or index is None:
+ return None
+ if index < 0 or index >= len(arr):
+ return None
+ return arr[index]
+\$\$;
+ """
+
+ qt_select_array_valid """ SELECT py_safe_array_get([10, 20, 30], 1) AS result; """
+ qt_select_array_invalid """ SELECT py_safe_array_get([10, 20, 30], 5) AS result; """
+
+ // Test 5: Test error handling on table data
+ sql """ DROP TABLE IF EXISTS error_handling_test_table; """
+ sql """
+ CREATE TABLE error_handling_test_table (
+ id INT,
+ numerator DOUBLE,
+ denominator DOUBLE,
+ text STRING,
+ arr_index INT
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES("replication_num" = "1");
+ """
+
+ sql """
+ INSERT INTO error_handling_test_table VALUES
+ (1, 100.0, 10.0, '123', 0),
+ (2, 50.0, 0.0, 'abc', 1),
+ (3, NULL, 5.0, '', 2),
+ (4, 75.0, NULL, '456', -1),
+ (5, 25.0, 5.0, 'xyz', 10);
+ """
+
+ qt_select_table_error_handling """
+ SELECT
+ id,
+ numerator,
+ denominator,
+ py_safe_divide(numerator, denominator) AS divide_result,
+ text,
+ py_safe_int_parse(text) AS parse_result
+ FROM error_handling_test_table
+ ORDER BY id;
+ """
+
+ // Test 6: Empty string handling
+ sql """ DROP FUNCTION IF EXISTS py_safe_length(STRING); """
+ sql """
+ CREATE FUNCTION py_safe_length(STRING)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "always_nullable" = "true",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+def evaluate(s):
+ if s is None:
+ return None
+ return len(s)
+\$\$;
+ """
+
+ qt_select_length_normal """ SELECT py_safe_length('hello') AS result; """
+ qt_select_length_empty """ SELECT py_safe_length('') AS result; """
+ qt_select_length_null """ SELECT py_safe_length(NULL) AS result; """
+
+ } finally {
+ try_sql("DROP FUNCTION IF EXISTS py_safe_divide(DOUBLE, DOUBLE);")
+ try_sql("DROP FUNCTION IF EXISTS py_safe_substring(STRING, INT);")
+ try_sql("DROP FUNCTION IF EXISTS py_safe_int_parse(STRING);")
+ try_sql("DROP FUNCTION IF EXISTS py_safe_array_get(ARRAY<INT>, INT);")
+ try_sql("DROP FUNCTION IF EXISTS py_safe_length(STRING);")
+ try_sql("DROP TABLE IF EXISTS error_handling_test_table;")
+ }
+}
diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_file_protocol.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_file_protocol.groovy
new file mode 100644
index 0000000..d4f8f21
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/test_pythonudf_file_protocol.groovy
@@ -0,0 +1,127 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_pythonudf_file_protocol") {
+ // Test loading Python UDF from zip package using file:// protocol
+
+ def zipPath = """${context.file.parent}/udf_scripts/pyudf.zip"""
+ scp_udf_file_to_all_be(zipPath)
+ def runtime_version = "3.10.12"
+ log.info("Python zip path: ${zipPath}".toString())
+
+ try {
+ // Test 1: Load int_test.py from zip package using file:// protocol
+ sql """ DROP FUNCTION IF EXISTS py_file_int_add(INT); """
+ sql """
+ CREATE FUNCTION py_file_int_add(INT)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${zipPath}",
+ "symbol" = "int_test.evaluate",
+ "runtime_version" = "${runtime_version}"
+ );
+ """
+
+ qt_select_file_int """ SELECT py_file_int_add(99) AS result; """
+
+ // Test 2: Load string_test.py from zip package using file:// protocol
+ sql """ DROP FUNCTION IF EXISTS py_file_string_mask(STRING, INT, INT); """
+ sql """
+ CREATE FUNCTION py_file_string_mask(STRING, INT, INT)
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${zipPath}",
+ "symbol" = "string_test.evaluate",
+ "runtime_version" = "${runtime_version}"
+ );
+ """
+
+ qt_select_file_string """ SELECT py_file_string_mask('1234567890', 3, 3) AS result; """
+
+ // Test 3: Load float_test.py from zip package using file:// protocol
+ sql """ DROP FUNCTION IF EXISTS py_file_float_process(FLOAT); """
+ sql """
+ CREATE FUNCTION py_file_float_process(FLOAT)
+ RETURNS FLOAT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${zipPath}",
+ "symbol" = "float_test.evaluate",
+ "runtime_version" = "${runtime_version}"
+ );
+ """
+
+ qt_select_file_float """ SELECT py_file_float_process(3.14) AS result; """
+
+ // Test 4: Load boolean_test.py from zip package using file:// protocol
+ sql """ DROP FUNCTION IF EXISTS py_file_bool_not(BOOLEAN); """
+ sql """
+ CREATE FUNCTION py_file_bool_not(BOOLEAN)
+ RETURNS BOOLEAN
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${zipPath}",
+ "symbol" = "boolean_test.evaluate",
+ "runtime_version" = "${runtime_version}"
+ );
+ """
+
+ qt_select_file_bool_true """ SELECT py_file_bool_not(true) AS result; """
+ qt_select_file_bool_false """ SELECT py_file_bool_not(false) AS result; """
+
+ // Test 5: Test UDF with file:// protocol on table data
+ sql """ DROP TABLE IF EXISTS file_protocol_test_table; """
+ sql """
+ CREATE TABLE file_protocol_test_table (
+ id INT,
+ num INT,
+ text STRING
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES("replication_num" = "1");
+ """
+
+ sql """
+ INSERT INTO file_protocol_test_table VALUES
+ (1, 10, 'hello'),
+ (2, 20, 'world'),
+ (3, 30, 'python'),
+ (4, 40, 'doris');
+ """
+
+ qt_select_table_file """
+ SELECT
+ id,
+ num,
+ py_file_int_add(num) AS num_result,
+ text,
+ py_file_string_mask(text, 1, 1) AS text_result
+ FROM file_protocol_test_table
+ ORDER BY id;
+ """
+
+ } finally {
+ try_sql("DROP FUNCTION IF EXISTS py_file_int_add(INT);")
+ try_sql("DROP FUNCTION IF EXISTS py_file_string_mask(STRING, INT, INT);")
+ try_sql("DROP FUNCTION IF EXISTS py_file_float_process(FLOAT);")
+ try_sql("DROP FUNCTION IF EXISTS py_file_bool_not(BOOLEAN);")
+ try_sql("DROP TABLE IF EXISTS file_protocol_test_table;")
+ }
+}
diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_float.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_float.groovy
new file mode 100644
index 0000000..5ac06aa
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/test_pythonudf_float.groovy
@@ -0,0 +1,91 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_pythonudf_float") {
+ def pyPath = """${context.file.parent}/udf_scripts/pyudf.zip"""
+ scp_udf_file_to_all_be(pyPath)
+ def runtime_version = "3.12.0"
+ log.info("Python Zip path: ${pyPath}".toString())
+ try {
+ sql """ DROP TABLE IF EXISTS test_pythonudf_float """
+ sql """
+ CREATE TABLE IF NOT EXISTS test_pythonudf_float (
+ `user_id` INT NOT NULL COMMENT "",
+ `float_1` FLOAT NOT NULL COMMENT "",
+ `float_2` FLOAT COMMENT "",
+ `double_1` DOUBLE NOT NULL COMMENT "",
+ `double_2` DOUBLE COMMENT ""
+ )
+ DISTRIBUTED BY HASH(user_id) PROPERTIES("replication_num" = "1");
+ """
+
+ sql """ INSERT INTO test_pythonudf_float (`user_id`,`float_1`,`float_2`,double_1,double_2) VALUES
+ (111,11111.11111,222222.3333333,12345678.34455677,1111111.999999999999),
+ (112,1234556.11111,222222.3333333,222222222.3333333333333,4444444444444.555555555555),
+ (113,87654321.11111,null,6666666666.6666666666,null)
+ """
+ qt_select_default """ SELECT * FROM test_pythonudf_float t ORDER BY user_id; """
+
+ File path = new File(pyPath)
+ if (!path.exists()) {
+ throw new IllegalStateException("""${pyPath} doesn't exist! """)
+ }
+
+ sql """ DROP FUNCTION IF EXISTS python_udf_float_test(FLOAT,FLOAT) """
+
+ sql """ CREATE FUNCTION python_udf_float_test(FLOAT,FLOAT) RETURNS FLOAT PROPERTIES (
+ "file"="file://${pyPath}",
+ "symbol"="float_test.evaluate",
+ "type"="PYTHON_UDF",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ ); """
+
+ qt_select """ SELECT python_udf_float_test(cast(2.83645 as float),cast(111.1111111 as float)) as result; """
+ qt_select """ SELECT python_udf_float_test(2.83645,111.1111111) as result ; """
+ qt_select """ SELECT python_udf_float_test(2.83645,null) as result ; """
+ qt_select """ SELECT python_udf_float_test(cast(2.83645 as float),null) as result ; """
+ qt_select """ SELECT user_id,python_udf_float_test(float_1, float_2) as sum FROM test_pythonudf_float order by user_id; """
+ createMV("create materialized view udf_mv as SELECT user_id as a1,python_udf_float_test(float_1, float_2) as sum FROM test_pythonudf_float order by user_id;")
+ qt_select """ SELECT user_id,python_udf_float_test(float_1, float_2) as sum FROM test_pythonudf_float order by user_id; """
+
+ explain {
+ sql("SELECT user_id,python_udf_float_test(float_1, float_2) as sum FROM test_pythonudf_float order by user_id; ")
+ contains "(udf_mv)"
+ }
+
+ sql """ CREATE FUNCTION python_udf_double_test(DOUBLE,DOUBLE) RETURNS DOUBLE PROPERTIES (
+ "file"="file://${pyPath}",
+ "symbol"="double_test.evaluate",
+ "type"="PYTHON_UDF",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ ); """
+
+ qt_select """ SELECT python_udf_double_test(cast(2.83645 as DOUBLE),cast(111.1111111 as DOUBLE)) as result; """
+ qt_select """ SELECT python_udf_double_test(2.83645,111.1111111) as result ; """
+ qt_select """ SELECT python_udf_double_test(2.83645,null) as result ; """
+ qt_select """ SELECT python_udf_double_test(cast(2.83645 as DOUBLE),null) as result ; """
+ qt_select """ SELECT user_id,python_udf_double_test(double_1, double_1) as sum FROM test_pythonudf_float order by user_id; """
+
+ } finally {
+ try_sql("DROP FUNCTION IF EXISTS python_udf_double_test(DOUBLE,DOUBLE);")
+ try_sql("DROP FUNCTION IF EXISTS python_udf_float_test(FLOAT,FLOAT);")
+ try_sql("DROP TABLE IF EXISTS test_pythonudf_float")
+ }
+}
diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_global_function.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_global_function.groovy
new file mode 100644
index 0000000..099ca46
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/test_pythonudf_global_function.groovy
@@ -0,0 +1,146 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_pythonudf_global_function") {
+ // Test creating global Python UDF with GLOBAL keyword
+
+ def runtime_version = "3.10.12"
+ try {
+ // Test 1: Create GLOBAL function
+ sql """ DROP GLOBAL FUNCTION IF EXISTS py_global_multiply(INT, INT); """
+ sql """
+ CREATE GLOBAL FUNCTION py_global_multiply(INT, INT)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+def evaluate(a, b):
+ if a is None or b is None:
+ return None
+ return a * b
+\$\$;
+ """
+
+ qt_select_global_multiply """ SELECT py_global_multiply(7, 8) AS result; """
+
+ // Test 2: Create GLOBAL string function
+ sql """ DROP GLOBAL FUNCTION IF EXISTS py_global_lower(STRING); """
+ sql """
+ CREATE GLOBAL FUNCTION py_global_lower(STRING)
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+def evaluate(s):
+ if s is None:
+ return None
+ return s.lower()
+\$\$;
+ """
+
+ qt_select_global_lower """ SELECT py_global_lower('HELLO WORLD') AS result; """
+
+ // Test 3: Create regular (non-GLOBAL) function for comparison
+ sql """ DROP FUNCTION IF EXISTS py_local_add(INT, INT); """
+ sql """
+ CREATE FUNCTION py_local_add(INT, INT)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+def evaluate(a, b):
+ if a is None or b is None:
+ return None
+ return a + b
+\$\$;
+ """
+
+ qt_select_local_add """ SELECT py_local_add(15, 25) AS result; """
+
+ // Test 4: Test GLOBAL function on table data
+ sql """ DROP TABLE IF EXISTS global_function_test_table; """
+ sql """
+ CREATE TABLE global_function_test_table (
+ id INT,
+ val1 INT,
+ val2 INT,
+ text STRING
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES("replication_num" = "1");
+ """
+
+ sql """
+ INSERT INTO global_function_test_table VALUES
+ (1, 5, 6, 'APPLE'),
+ (2, 10, 20, 'BANANA'),
+ (3, 3, 7, 'CHERRY'),
+ (4, NULL, 5, 'DATE'),
+ (5, 8, 9, NULL);
+ """
+
+ qt_select_table_global """
+ SELECT
+ id,
+ val1,
+ val2,
+ py_global_multiply(val1, val2) AS multiply_result,
+ text,
+ py_global_lower(text) AS lower_result
+ FROM global_function_test_table
+ ORDER BY id;
+ """
+
+ // Test 5: Mathematical calculation with GLOBAL function
+ sql """ DROP GLOBAL FUNCTION IF EXISTS py_global_power(DOUBLE, DOUBLE); """
+ sql """
+ CREATE GLOBAL FUNCTION py_global_power(DOUBLE, DOUBLE)
+ RETURNS DOUBLE
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+def evaluate(base, exponent):
+ if base is None or exponent is None:
+ return None
+ return base ** exponent
+\$\$;
+ """
+
+ qt_select_global_power """ SELECT py_global_power(2.0, 3.0) AS result; """
+ qt_select_global_power_decimal """ SELECT py_global_power(5.0, 0.5) AS result; """
+
+ } finally {
+ try_sql("DROP GLOBAL FUNCTION IF EXISTS py_global_multiply(INT, INT);")
+ try_sql("DROP GLOBAL FUNCTION IF EXISTS py_global_lower(STRING);")
+ try_sql("DROP GLOBAL FUNCTION IF EXISTS py_global_power(DOUBLE, DOUBLE);")
+ try_sql("DROP FUNCTION IF EXISTS py_local_add(INT, INT);")
+ try_sql("DROP TABLE IF EXISTS global_function_test_table;")
+ }
+}
diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_inline_complex.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_inline_complex.groovy
new file mode 100644
index 0000000..0caf6c8
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/test_pythonudf_inline_complex.groovy
@@ -0,0 +1,134 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_pythonudf_inline_complex") {
+ // Test complex Python UDF using Inline mode
+
+ def runtime_version = "3.10.12"
+ try {
+ // Test 1: Array processing
+ sql """ DROP FUNCTION IF EXISTS py_array_sum(ARRAY<INT>); """
+ sql """
+ CREATE FUNCTION py_array_sum(ARRAY<INT>)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+def evaluate(arr):
+ if arr is None:
+ return None
+ return sum(arr)
+\$\$;
+ """
+
+ qt_select_array_sum """ SELECT py_array_sum([1, 2, 3, 4, 5]) AS result; """
+
+ // Test 2: String processing - reverse
+ sql """ DROP FUNCTION IF EXISTS py_reverse_string(STRING); """
+ sql """
+ CREATE FUNCTION py_reverse_string(STRING)
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+def evaluate(s):
+ if s is None:
+ return None
+ return s[::-1]
+\$\$;
+ """
+
+ qt_select_reverse """ SELECT py_reverse_string('Hello') AS result; """
+
+ // Test 3: Multi-parameter complex calculation
+ sql """ DROP FUNCTION IF EXISTS py_weighted_avg(DOUBLE, DOUBLE, DOUBLE, DOUBLE); """
+ sql """
+ CREATE FUNCTION py_weighted_avg(DOUBLE, DOUBLE, DOUBLE, DOUBLE)
+ RETURNS DOUBLE
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+def evaluate(val1, weight1, val2, weight2):
+ if any(x is None for x in [val1, weight1, val2, weight2]):
+ return None
+ total_weight = weight1 + weight2
+ if total_weight == 0:
+ return None
+ return (val1 * weight1 + val2 * weight2) / total_weight
+\$\$;
+ """
+
+ qt_select_weighted_avg """ SELECT py_weighted_avg(80.0, 0.6, 90.0, 0.4) AS result; """
+
+ // Test 4: String formatting
+ sql """ DROP FUNCTION IF EXISTS py_format_name(STRING, STRING); """
+ sql """
+ CREATE FUNCTION py_format_name(STRING, STRING)
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+def evaluate(first_name, last_name):
+ if first_name is None or last_name is None:
+ return None
+ return f"{last_name.upper()}, {first_name.capitalize()}"
+\$\$;
+ """
+
+ qt_select_format_name """ SELECT py_format_name('john', 'doe') AS result; """
+
+ // Test 5: Numeric range validation
+ sql """ DROP FUNCTION IF EXISTS py_in_range(INT, INT, INT); """
+ sql """
+ CREATE FUNCTION py_in_range(INT, INT, INT)
+ RETURNS BOOLEAN
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+def evaluate(value, min_val, max_val):
+ if any(x is None for x in [value, min_val, max_val]):
+ return None
+ return min_val <= value <= max_val
+\$\$;
+ """
+
+ qt_select_in_range_true """ SELECT py_in_range(50, 0, 100) AS result; """
+ qt_select_in_range_false """ SELECT py_in_range(150, 0, 100) AS result; """
+
+ } finally {
+ try_sql("DROP FUNCTION IF EXISTS py_array_sum(ARRAY<INT>);")
+ try_sql("DROP FUNCTION IF EXISTS py_reverse_string(STRING);")
+ try_sql("DROP FUNCTION IF EXISTS py_weighted_avg(DOUBLE, DOUBLE, DOUBLE, DOUBLE);")
+ try_sql("DROP FUNCTION IF EXISTS py_format_name(STRING, STRING);")
+ try_sql("DROP FUNCTION IF EXISTS py_in_range(INT, INT, INT);")
+ }
+}
diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_inline_priority.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_inline_priority.groovy
new file mode 100644
index 0000000..5693f7b
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/test_pythonudf_inline_priority.groovy
@@ -0,0 +1,153 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_pythonudf_inline_priority") {
+ // Test that inline code has higher priority when both file and inline code are specified
+
+ // Disabled temporarily
+ return
+
+ def zipPath = """${context.file.parent}/udf_scripts/pyudf.zip"""
+ scp_udf_file_to_all_be(zipPath)
+ def runtime_version = "3.10.12"
+ log.info("Python zip path: ${zipPath}".toString())
+
+ try {
+ // Test 1: Specify both file and inline code, verify inline code takes priority
+ // Function in int_test.py returns arg + 1
+ // But inline code returns arg * 10
+ sql """ DROP FUNCTION IF EXISTS py_priority_test(INT); """
+ sql """
+ CREATE FUNCTION py_priority_test(INT)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${zipPath}",
+ "symbol" = "int_test.evaluate",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+def evaluate(arg):
+ # inline code: returns arg * 10
+ if arg is None:
+ return None
+ return arg * 10
+\$\$;
+ """
+
+ // If using code from file, result should be 6 (5 + 1)
+ // If using inline code, result should be 50 (5 * 10)
+ qt_select_priority_inline """ SELECT py_priority_test(5) AS result; """
+
+ // Test 2: Another priority test - string processing
+ sql """ DROP FUNCTION IF EXISTS py_priority_string_test(STRING); """
+ sql """
+ CREATE FUNCTION py_priority_string_test(STRING)
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+def evaluate(s):
+ # inline code: returns reversed string
+ if s is None:
+ return None
+ return s[::-1]
+\$\$;
+ """
+
+ // inline code should return reversed string
+ qt_select_priority_string """ SELECT py_priority_string_test('hello') AS result; """
+
+ // Test 3: Verify priority on table data
+ sql """ DROP TABLE IF EXISTS priority_test_table; """
+ sql """
+ CREATE TABLE priority_test_table (
+ id INT,
+ num INT
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES("replication_num" = "1");
+ """
+
+ sql """
+ INSERT INTO priority_test_table VALUES
+ (1, 1),
+ (2, 2),
+ (3, 3),
+ (4, 4),
+ (5, 5);
+ """
+
+ // Verify inline code priority: should return num * 10
+ qt_select_table_priority """
+ SELECT
+ id,
+ num,
+ py_priority_test(num) AS result
+ FROM priority_test_table
+ ORDER BY id;
+ """
+
+ // Test 4: Only file parameter, no inline code
+ sql """ DROP FUNCTION IF EXISTS py_file_only_test(INT); """
+ sql """
+ CREATE FUNCTION py_file_only_test(INT)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${zipPath}",
+ "symbol" = "int_test.evaluate",
+ "runtime_version" = "${runtime_version}"
+ );
+ """
+
+ // Should use code from file: returns arg + 1
+ qt_select_file_only """ SELECT py_file_only_test(5) AS result; """
+
+ // Test 5: Only inline code, no file parameter
+ sql """ DROP FUNCTION IF EXISTS py_inline_only_test(INT); """
+ sql """
+ CREATE FUNCTION py_inline_only_test(INT)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+def evaluate(arg):
+ if arg is None:
+ return None
+ return arg * 100
+\$\$;
+ """
+
+ // Should use inline code: returns arg * 100
+ qt_select_inline_only """ SELECT py_inline_only_test(5) AS result; """
+
+ } finally {
+ try_sql("DROP FUNCTION IF EXISTS py_priority_test(INT);")
+ try_sql("DROP FUNCTION IF EXISTS py_priority_string_test(STRING);")
+ try_sql("DROP FUNCTION IF EXISTS py_file_only_test(INT);")
+ try_sql("DROP FUNCTION IF EXISTS py_inline_only_test(INT);")
+ try_sql("DROP TABLE IF EXISTS priority_test_table;")
+ }
+}
diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_inline_scalar.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_inline_scalar.groovy
new file mode 100644
index 0000000..1ab003a
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/test_pythonudf_inline_scalar.groovy
@@ -0,0 +1,112 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_pythonudf_inline_basic") {
+ // Test basic Python UDF using Inline mode
+
+ def runtime_version = "3.10.12"
+ try {
+ // Test 1: Simple integer addition
+ sql """ DROP FUNCTION IF EXISTS py_add(INT, INT); """
+ sql """
+ CREATE FUNCTION py_add(INT, INT)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+def evaluate(a, b):
+    if a is None or b is None:
+        return None
+    return a + b
+\$\$;
+ """
+
+ qt_select_add """ SELECT py_add(10, 20) AS result; """
+ qt_select_add_null """ SELECT py_add(NULL, 20) AS result; """
+
+ // Test 2: String concatenation
+ sql """ DROP FUNCTION IF EXISTS py_concat(STRING, STRING); """
+ sql """
+ CREATE FUNCTION py_concat(STRING, STRING)
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+def evaluate(s1, s2):
+ if s1 is None or s2 is None:
+ return None
+ return s1 + s2
+\$\$;
+ """
+
+ qt_select_concat """ SELECT py_concat('Hello', ' World') AS result; """
+ qt_select_concat_null """ SELECT py_concat('Hello', NULL) AS result; """
+
+ // Test 3: Mathematical operations
+ sql """ DROP FUNCTION IF EXISTS py_square(DOUBLE); """
+ sql """
+ CREATE FUNCTION py_square(DOUBLE)
+ RETURNS DOUBLE
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+def evaluate(x):
+ if x is None:
+ return None
+ return x * x
+\$\$;
+ """
+
+ qt_select_square """ SELECT py_square(5.0) AS result; """
+ qt_select_square_negative """ SELECT py_square(-3.0) AS result; """
+
+ // Test 4: Conditional logic
+ sql """ DROP FUNCTION IF EXISTS py_is_positive(INT); """
+ sql """
+ CREATE FUNCTION py_is_positive(INT)
+ RETURNS BOOLEAN
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+def evaluate(num):
+ if num is None:
+ return None
+ return num > 0
+\$\$;
+ """
+
+ qt_select_positive """ SELECT py_is_positive(10) AS result; """
+ qt_select_negative """ SELECT py_is_positive(-5) AS result; """
+ qt_select_zero """ SELECT py_is_positive(0) AS result; """
+
+ } finally {
+ try_sql("DROP FUNCTION IF EXISTS py_add(INT, INT);")
+ try_sql("DROP FUNCTION IF EXISTS py_concat(STRING, STRING);")
+ try_sql("DROP FUNCTION IF EXISTS py_square(DOUBLE);")
+ try_sql("DROP FUNCTION IF EXISTS py_is_positive(INT);")
+ }
+}
diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_inline_vector.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_inline_vector.groovy
new file mode 100644
index 0000000..f321191
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/test_pythonudf_inline_vector.groovy
@@ -0,0 +1,409 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_pythonudf_inline_vector") {
+ // Test vectorized Python UDF using Inline mode with pandas.Series
+
+ def runtime_version = "3.10.12"
+ try {
+ // Create test table
+ sql """ DROP TABLE IF EXISTS vector_udf_test_table; """
+ sql """
+ CREATE TABLE vector_udf_test_table (
+ id INT,
+ int_col1 INT,
+ int_col2 INT,
+ double_col1 DOUBLE,
+ double_col2 DOUBLE,
+ string_col1 STRING,
+ string_col2 STRING,
+ bool_col BOOLEAN
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES("replication_num" = "1");
+ """
+
+ sql """
+ INSERT INTO vector_udf_test_table VALUES
+ (1, 10, 20, 1.5, 2.5, 'hello', 'world', true),
+ (2, 30, 40, 3.5, 4.5, 'foo', 'bar', false),
+ (3, NULL, 50, 5.5, NULL, NULL, 'test', true),
+ (4, 60, NULL, NULL, 6.5, 'data', NULL, false),
+ (5, 70, 80, 7.5, 8.5, 'python', 'udf', true);
+ """
+
+ // Test 1: Vector INT addition with pandas.Series
+ sql """ DROP FUNCTION IF EXISTS py_vec_add_int(INT, INT); """
+ sql """
+ CREATE FUNCTION py_vec_add_int(INT, INT)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "add",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ )
+ AS \$\$
+import pandas as pd
+
+def add(a: pd.Series, b: pd.Series) -> pd.Series:
+ return a + b + 1
+\$\$;
+ """
+
+ qt_vec_add_int """
+ SELECT
+ id,
+ int_col1,
+ int_col2,
+ py_vec_add_int(int_col1, int_col2) AS result
+ FROM vector_udf_test_table
+ ORDER BY id;
+ """
+
+ // Test 2: Vector DOUBLE multiplication with pandas.Series
+ sql """ DROP FUNCTION IF EXISTS py_vec_multiply_double(DOUBLE, DOUBLE); """
+ sql """
+ CREATE FUNCTION py_vec_multiply_double(DOUBLE, DOUBLE)
+ RETURNS DOUBLE
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "multiply",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ )
+ AS \$\$
+import pandas as pd
+
+def multiply(a: pd.Series, b: pd.Series) -> pd.Series:
+ return a * b
+\$\$;
+ """
+
+ qt_vec_multiply_double """
+ SELECT
+ id,
+ double_col1,
+ double_col2,
+ py_vec_multiply_double(double_col1, double_col2) AS result
+ FROM vector_udf_test_table
+ ORDER BY id;
+ """
+
+ // Test 3: Vector STRING concatenation with pandas.Series
+ sql """ DROP FUNCTION IF EXISTS py_vec_concat_string(STRING, STRING); """
+ sql """
+ CREATE FUNCTION py_vec_concat_string(STRING, STRING)
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "concat",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ )
+ AS \$\$
+import pandas as pd
+
+def concat(s1: pd.Series, s2: pd.Series) -> pd.Series:
+ return s1 + '_' + s2
+\$\$;
+ """
+
+ qt_vec_concat_string """
+ SELECT
+ id,
+ string_col1,
+ string_col2,
+ py_vec_concat_string(string_col1, string_col2) AS result
+ FROM vector_udf_test_table
+ ORDER BY id;
+ """
+
+ // Test 4: Vector INT with conditional logic using pandas.Series
+ sql """ DROP FUNCTION IF EXISTS py_vec_max_int(INT, INT); """
+ sql """
+ CREATE FUNCTION py_vec_max_int(INT, INT)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "get_max",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ )
+ AS \$\$
+import pandas as pd
+import numpy as np
+
+def get_max(a: pd.Series, b: pd.Series) -> pd.Series:
+ return pd.Series(np.maximum(a, b))
+\$\$;
+ """
+
+ qt_vec_max_int """
+ SELECT
+ id,
+ int_col1,
+ int_col2,
+ py_vec_max_int(int_col1, int_col2) AS result
+ FROM vector_udf_test_table
+ ORDER BY id;
+ """
+
+ // Test 5: Vector DOUBLE with mathematical operations
+ sql """ DROP FUNCTION IF EXISTS py_vec_sqrt_double(DOUBLE); """
+ sql """
+ CREATE FUNCTION py_vec_sqrt_double(DOUBLE)
+ RETURNS DOUBLE
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "sqrt",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ )
+ AS \$\$
+import pandas as pd
+import numpy as np
+
+def sqrt(x: pd.Series) -> pd.Series:
+ return np.sqrt(x)
+\$\$;
+ """
+
+ qt_vec_sqrt_double """
+ SELECT
+ id,
+ double_col1,
+ py_vec_sqrt_double(double_col1) AS result
+ FROM vector_udf_test_table
+ ORDER BY id;
+ """
+
+ // Test 6: Vector STRING with upper case transformation
+ sql """ DROP FUNCTION IF EXISTS py_vec_upper_string(STRING); """
+ sql """
+ CREATE FUNCTION py_vec_upper_string(STRING)
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "to_upper",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ )
+ AS \$\$
+import pandas as pd
+
+def to_upper(s: pd.Series) -> pd.Series:
+ return s.str.upper()
+\$\$;
+ """
+
+ qt_vec_upper_string """
+ SELECT
+ id,
+ string_col1,
+ py_vec_upper_string(string_col1) AS result
+ FROM vector_udf_test_table
+ ORDER BY id;
+ """
+
+ // Test 7: Vector INT with complex calculation
+ sql """ DROP FUNCTION IF EXISTS py_vec_weighted_sum(INT, INT); """
+ sql """
+ CREATE FUNCTION py_vec_weighted_sum(INT, INT)
+ RETURNS DOUBLE
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "weighted_sum",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ )
+ AS \$\$
+import pandas as pd
+
+def weighted_sum(a: pd.Series, b: pd.Series) -> pd.Series:
+ return a * 0.3 + b * 0.7
+\$\$;
+ """
+
+ qt_vec_weighted_sum """
+ SELECT
+ id,
+ int_col1,
+ int_col2,
+ py_vec_weighted_sum(int_col1, int_col2) AS result
+ FROM vector_udf_test_table
+ ORDER BY id;
+ """
+
+ // Test 8: Vector BOOLEAN operations
+ sql """ DROP FUNCTION IF EXISTS py_vec_not_bool(BOOLEAN); """
+ sql """
+ CREATE FUNCTION py_vec_not_bool(BOOLEAN)
+ RETURNS BOOLEAN
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "negate",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ )
+ AS \$\$
+import pandas as pd
+
+def negate(b: pd.Series) -> pd.Series:
+ return ~b
+\$\$;
+ """
+
+ qt_vec_not_bool """
+ SELECT
+ id,
+ bool_col,
+ py_vec_not_bool(bool_col) AS result
+ FROM vector_udf_test_table
+ ORDER BY id;
+ """
+
+ // Test 9: Vector INT comparison returning BOOLEAN
+ sql """ DROP FUNCTION IF EXISTS py_vec_greater_than(INT, INT); """
+ sql """
+ CREATE FUNCTION py_vec_greater_than(INT, INT)
+ RETURNS BOOLEAN
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "greater",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ )
+ AS \$\$
+import pandas as pd
+
+def greater(a: pd.Series, b: pd.Series) -> pd.Series:
+ return a > b
+\$\$;
+ """
+
+ qt_vec_greater_than """
+ SELECT
+ id,
+ int_col1,
+ int_col2,
+ py_vec_greater_than(int_col1, int_col2) AS result
+ FROM vector_udf_test_table
+ ORDER BY id;
+ """
+
+ // Test 10: Vector STRING length calculation
+ sql """ DROP FUNCTION IF EXISTS py_vec_string_length(STRING); """
+ sql """
+ CREATE FUNCTION py_vec_string_length(STRING)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "str_len",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ )
+ AS \$\$
+import pandas as pd
+
+def str_len(s: pd.Series) -> pd.Series:
+ return s.str.len()
+\$\$;
+ """
+
+ qt_vec_string_length """
+ SELECT
+ id,
+ string_col1,
+ py_vec_string_length(string_col1) AS result
+ FROM vector_udf_test_table
+ ORDER BY id;
+ """
+
+ // Test 11: Vector with NULL handling using fillna
+ sql """ DROP FUNCTION IF EXISTS py_vec_fill_null_int(INT); """
+ sql """
+ CREATE FUNCTION py_vec_fill_null_int(INT)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "fill_null",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ )
+ AS \$\$
+import pandas as pd
+
+def fill_null(x: pd.Series) -> pd.Series:
+ return x.fillna(0)
+\$\$;
+ """
+
+ qt_vec_fill_null_int """
+ SELECT
+ id,
+ int_col1,
+ py_vec_fill_null_int(int_col1) AS result
+ FROM vector_udf_test_table
+ ORDER BY id;
+ """
+
+        // Test 12: Vector with an aggregation-like operation (cumulative sum;
+        // the Series covers one execution batch, so this is a per-batch running total)
+ sql """ DROP FUNCTION IF EXISTS py_vec_cumsum_int(INT); """
+ sql """
+ CREATE FUNCTION py_vec_cumsum_int(INT)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "cumsum",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ )
+ AS \$\$
+import pandas as pd
+
+def cumsum(x: pd.Series) -> pd.Series:
+ return x.cumsum()
+\$\$;
+ """
+
+ qt_vec_cumsum_int """
+ SELECT
+ id,
+ int_col1,
+ py_vec_cumsum_int(int_col1) AS result
+ FROM vector_udf_test_table
+ ORDER BY id;
+ """
+
+ } finally {
+ try_sql("DROP FUNCTION IF EXISTS py_vec_add_int(INT, INT);")
+ try_sql("DROP FUNCTION IF EXISTS py_vec_multiply_double(DOUBLE, DOUBLE);")
+ try_sql("DROP FUNCTION IF EXISTS py_vec_concat_string(STRING, STRING);")
+ try_sql("DROP FUNCTION IF EXISTS py_vec_max_int(INT, INT);")
+ try_sql("DROP FUNCTION IF EXISTS py_vec_sqrt_double(DOUBLE);")
+ try_sql("DROP FUNCTION IF EXISTS py_vec_upper_string(STRING);")
+ try_sql("DROP FUNCTION IF EXISTS py_vec_weighted_sum(INT, INT);")
+ try_sql("DROP FUNCTION IF EXISTS py_vec_not_bool(BOOLEAN);")
+ try_sql("DROP FUNCTION IF EXISTS py_vec_greater_than(INT, INT);")
+ try_sql("DROP FUNCTION IF EXISTS py_vec_string_length(STRING);")
+ try_sql("DROP FUNCTION IF EXISTS py_vec_fill_null_int(INT);")
+ try_sql("DROP FUNCTION IF EXISTS py_vec_cumsum_int(INT);")
+ try_sql("DROP TABLE IF EXISTS vector_udf_test_table;")
+ }
+}
diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_int.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_int.groovy
new file mode 100644
index 0000000..2a3906c
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/test_pythonudf_int.groovy
@@ -0,0 +1,124 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_pythonudf_int") {
+ def pyPath = """${context.file.parent}/udf_scripts/pyudf.zip"""
+ scp_udf_file_to_all_be(pyPath)
+ def runtime_version = "3.10.12"
+ log.info("Python Zip path: ${pyPath}".toString())
+ try {
+ sql """ DROP TABLE IF EXISTS test_pythonudf_int """
+ sql """
+ CREATE TABLE IF NOT EXISTS test_pythonudf_int (
+ `user_id` INT NOT NULL COMMENT "",
+ `tinyint_col` TINYINT NOT NULL COMMENT "",
+ `smallint_col` SMALLINT NOT NULL COMMENT "",
+ `bigint_col` BIGINT NOT NULL COMMENT ""
+ )
+ DISTRIBUTED BY HASH(user_id) PROPERTIES("replication_num" = "1");
+ """
+ StringBuilder sb = new StringBuilder()
+ int i = 1
+ for (; i < 10; i++) {
+ sb.append("""
+ (${i},${i}*2,${i}*3,${i}*4),
+ """)
+ }
+ sb.append("""
+ (${i},${i}*2,${i}*3,${i}*4)
+ """)
+ sql """ INSERT INTO test_pythonudf_int VALUES
+ ${sb.toString()}
+ """
+ qt_select_default """ SELECT * FROM test_pythonudf_int t ORDER BY user_id; """
+
+ File path = new File(pyPath)
+ if (!path.exists()) {
+ throw new IllegalStateException("""${pyPath} doesn't exist! """)
+ }
+
+ sql """ DROP FUNCTION IF EXISTS python_udf_int_test(int) """
+
+ sql """ CREATE FUNCTION python_udf_int_test(int) RETURNS int PROPERTIES (
+ "file"="file://${pyPath}",
+ "symbol"="int_test.evaluate",
+ "type"="PYTHON_UDF",
+ "always_nullable" = "true",
+ "runtime_version" = "${runtime_version}"
+ ); """
+
+ qt_select """ SELECT python_udf_int_test(user_id) result FROM test_pythonudf_int ORDER BY result; """
+ qt_select """ SELECT python_udf_int_test(null) result ; """
+
+
+ sql """ CREATE FUNCTION python_udf_tinyint_test(tinyint) RETURNS tinyint PROPERTIES (
+ "file"="file://${pyPath}",
+ "symbol"="int_test.evaluate",
+ "type"="PYTHON_UDF",
+ "always_nullable" = "true",
+ "runtime_version" = "${runtime_version}"
+ ); """
+
+ qt_select """ SELECT python_udf_tinyint_test(tinyint_col) result FROM test_pythonudf_int ORDER BY result; """
+ qt_select """ SELECT python_udf_tinyint_test(null) result ; """
+
+
+ sql """ CREATE FUNCTION python_udf_smallint_test(smallint) RETURNS smallint PROPERTIES (
+ "file"="file://${pyPath}",
+ "symbol"="int_test.evaluate",
+ "type"="PYTHON_UDF",
+ "always_nullable" = "true",
+ "runtime_version" = "${runtime_version}"
+ ); """
+
+ qt_select """ SELECT python_udf_smallint_test(smallint_col) result FROM test_pythonudf_int ORDER BY result; """
+ qt_select """ SELECT python_udf_smallint_test(null) result ; """
+
+
+ sql """ CREATE FUNCTION python_udf_bigint_test(bigint) RETURNS bigint PROPERTIES (
+ "file"="file://${pyPath}",
+ "symbol"="int_test.evaluate",
+ "type"="PYTHON_UDF",
+ "always_nullable" = "true",
+ "runtime_version" = "${runtime_version}"
+ ); """
+
+ qt_select """ SELECT python_udf_bigint_test(bigint_col) result FROM test_pythonudf_int ORDER BY result; """
+ qt_select """ SELECT python_udf_bigint_test(null) result ; """
+
+ sql """ CREATE GLOBAL FUNCTION python_udf_int_test_global(int) RETURNS int PROPERTIES (
+ "file"="file://${pyPath}",
+ "symbol"="int_test.evaluate",
+ "type"="PYTHON_UDF",
+ "always_nullable" = "true",
+ "runtime_version" = "${runtime_version}"
+ ); """
+
+ qt_select_global_1 """ SELECT python_udf_int_test_global(user_id) result FROM test_pythonudf_int ORDER BY result; """
+ qt_select_global_2 """ SELECT python_udf_int_test_global(null) result ; """
+ qt_select_global_3 """ SELECT python_udf_int_test_global(3) result FROM test_pythonudf_int ORDER BY result; """
+ qt_select_global_4 """ SELECT abs(python_udf_int_test_global(3)) result FROM test_pythonudf_int ORDER BY result; """
+
+ } finally {
+ try_sql("DROP GLOBAL FUNCTION IF EXISTS python_udf_int_test_global(int);")
+ try_sql("DROP FUNCTION IF EXISTS python_udf_tinyint_test(tinyint);")
+ try_sql("DROP FUNCTION IF EXISTS python_udf_smallint_test(smallint);")
+ try_sql("DROP FUNCTION IF EXISTS python_udf_bigint_test(bigint);")
+ try_sql("DROP FUNCTION IF EXISTS python_udf_int_test(int);")
+ try_sql("DROP TABLE IF EXISTS test_pythonudf_int")
+ }
+}
diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_map.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_map.groovy
new file mode 100644
index 0000000..58be735
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/test_pythonudf_map.groovy
@@ -0,0 +1,85 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_pythonudf_map") {
+ def pyPath = """${context.file.parent}/udf_scripts/pyudf.zip"""
+ scp_udf_file_to_all_be(pyPath)
+ def runtime_version = "3.10.12"
+ log.info("Python Zip path: ${pyPath}".toString())
+ try {
+ try_sql("DROP FUNCTION IF EXISTS udfii(Map<INT, INT>);")
+ try_sql("DROP FUNCTION IF EXISTS udfss(Map<String, String>);")
+ try_sql("DROP TABLE IF EXISTS map_ii")
+ try_sql("DROP TABLE IF EXISTS map_ss")
+ sql """
+ CREATE TABLE IF NOT EXISTS map_ii (
+ `id` INT(11) NULL COMMENT "",
+ `m` Map<INT, INT> NULL COMMENT ""
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`id`)
+ DISTRIBUTED BY HASH(`id`) BUCKETS 1
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1",
+ "storage_format" = "V2"
+ );
+ """
+ sql """ """
+ sql """ INSERT INTO map_ii VALUES(1, {1:1,10:1,100:1}); """
+ sql """ INSERT INTO map_ii VALUES(2, {2:1,20:1,200:1,2000:1}); """
+ sql """ INSERT INTO map_ii VALUES(3, {3:1}); """
+ sql """ DROP FUNCTION IF EXISTS udfii(Map<INT, INT>); """
+ sql """ CREATE FUNCTION udfii(Map<INT, INT>) RETURNS INT PROPERTIES (
+ "file"="file://${pyPath}",
+ "symbol"="map_int_int_test.evaluate",
+ "type"="PYTHON_UDF",
+ "always_nullable" = "true",
+ "runtime_version" = "${runtime_version}"
+ ); """
+
+
+ qt_select_1 """ select m,udfii(m) from map_ii order by id; """
+
+ sql """ CREATE TABLE IF NOT EXISTS map_ss (
+ `id` INT(11) NULL COMMENT "",
+ `m` Map<String, String> NULL COMMENT ""
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`id`)
+ DISTRIBUTED BY HASH(`id`) BUCKETS 1
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1",
+ "storage_format" = "V2"
+ ); """
+ sql """ INSERT INTO map_ss VALUES(1, {"114":"514","1919":"810"}); """
+ sql """ INSERT INTO map_ss VALUES(2, {"a":"bc","def":"g","hij":"k"}); """
+ sql """ DROP FUNCTION IF EXISTS udfss(Map<String, String>); """
+
+ sql """ CREATE FUNCTION udfss(Map<String, String>) RETURNS STRING PROPERTIES (
+ "file"="file://${pyPath}",
+ "symbol"="map_string_string_test.evaluate",
+ "type"="PYTHON_UDF",
+ "always_nullable" = "true",
+ "runtime_version" = "${runtime_version}"
+ ); """
+
+ qt_select_2 """ select m,udfss(m) from map_ss order by id; """
+ } finally {
+ try_sql("DROP FUNCTION IF EXISTS udfii(Map<INT, INT>);")
+ try_sql("DROP FUNCTION IF EXISTS udfss(Map<String, String>);")
+ try_sql("DROP TABLE IF EXISTS map_ii")
+ try_sql("DROP TABLE IF EXISTS map_ss")
+ }
+}
diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_mixed_params.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_mixed_params.groovy
new file mode 100644
index 0000000..a9e6f6f
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/test_pythonudf_mixed_params.groovy
@@ -0,0 +1,443 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_pythonudf_mixed_params") {
+ // Test vectorized Python UDF with mixed parameter types (pd.Series + scalar)
+ // This tests the scenario where some parameters are vectorized (pd.Series)
+ // and some are scalar values (int, float, str)
+ //
+ // Key concept: In vectorized UDF, you can mix:
+ // - pd.Series parameters (process entire column)
+ // - scalar parameters (single value like int, float, str)
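+    //
+    // Assumed mapping at the call site (illustrative): a constant argument is
+    // handed to Python as a plain scalar, a column reference as a Series, e.g.
+    //
+    //   SELECT f(price, 1.5) ...  ->  def f(price: pd.Series, x: float)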
+
+ def runtime_version = "3.10.12"
+
+ try {
+ // Create test table
+ sql """ DROP TABLE IF EXISTS test_mixed_params_table; """
+ sql """
+ CREATE TABLE test_mixed_params_table (
+ id INT,
+ price DOUBLE,
+ quantity INT,
+ discount_rate DOUBLE,
+ category STRING
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 3
+ PROPERTIES("replication_num" = "1");
+ """
+
+ // Insert test data
+ sql """
+ INSERT INTO test_mixed_params_table VALUES
+ (1, 100.0, 5, 0.1, 'A'),
+ (2, 200.0, 3, 0.15, 'B'),
+ (3, 150.0, 8, 0.2, 'A'),
+ (4, 300.0, 2, 0.05, 'C'),
+ (5, 250.0, 6, 0.12, 'B'),
+ (6, 180.0, 4, 0.18, 'A'),
+ (7, 220.0, 7, 0.08, 'C'),
+ (8, 120.0, 9, 0.25, 'B'),
+ (9, 280.0, 1, 0.1, 'A'),
+ (10, 350.0, 5, 0.15, 'C');
+ """
+
+ sql "sync"
+
+ // ==================== Test 1: pd.Series + scalar float ====================
+ log.info("=== Test 1: pd.Series + scalar float ===")
+
+ sql """ DROP FUNCTION IF EXISTS py_vec_multiply_constant(DOUBLE, DOUBLE); """
+ sql """
+ CREATE FUNCTION py_vec_multiply_constant(DOUBLE, DOUBLE)
+ RETURNS DOUBLE
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "py_vec_multiply_constant",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ )
+ AS \$\$
+import pandas as pd
+
+def py_vec_multiply_constant(values: pd.Series, multiplier: float) -> pd.Series:
+ # values: pd.Series (vectorized column data)
+ # multiplier: float (scalar constant)
+ return values * multiplier
+\$\$;
+ """
+
+ qt_select_1 """
+ SELECT
+ id,
+ price,
+ py_vec_multiply_constant(price, 1.5) AS price_multiplied
+ FROM test_mixed_params_table
+ ORDER BY id;
+ """
+
+ // ==================== Test 2: Multiple pd.Series + scalar float ====================
+ log.info("=== Test 2: Multiple pd.Series + scalar float ===")
+
+ sql """ DROP FUNCTION IF EXISTS py_vec_calc_total(DOUBLE, INT, DOUBLE); """
+ sql """
+ CREATE FUNCTION py_vec_calc_total(DOUBLE, INT, DOUBLE)
+ RETURNS DOUBLE
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "py_vec_calc_total",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ )
+ AS \$\$
+import pandas as pd
+
+def py_vec_calc_total(price: pd.Series, quantity: pd.Series, tax_rate: float) -> pd.Series:
+ # price: pd.Series (vectorized)
+ # quantity: pd.Series (vectorized)
+ # tax_rate: float (scalar constant)
+ subtotal = price * quantity
+ return subtotal * (1 + tax_rate)
+\$\$;
+ """
+
+ qt_select_2 """
+ SELECT
+ id,
+ price,
+ quantity,
+ py_vec_calc_total(price, quantity, 0.1) AS total_with_tax
+ FROM test_mixed_params_table
+ ORDER BY id
+ LIMIT 5;
+ """
+
+ // ==================== Test 3: Two pd.Series (both vectorized) ====================
+ log.info("=== Test 3: Two pd.Series (both vectorized) ===")
+
+ sql """ DROP FUNCTION IF EXISTS py_vec_apply_discount(DOUBLE, DOUBLE); """
+ sql """
+ CREATE FUNCTION py_vec_apply_discount(DOUBLE, DOUBLE)
+ RETURNS DOUBLE
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "py_vec_apply_discount",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ )
+ AS \$\$
+import pandas as pd
+
+def py_vec_apply_discount(price: pd.Series, discount_rate: pd.Series) -> pd.Series:
+ # Both are pd.Series (vectorized)
+ # Each row has its own discount rate from the column
+ return price * (1 - discount_rate)
+\$\$;
+ """
+
+ qt_select_3 """
+ SELECT
+ id,
+ price,
+ discount_rate,
+ py_vec_apply_discount(price, discount_rate) AS final_price
+ FROM test_mixed_params_table
+ ORDER BY id
+ LIMIT 5;
+ """
+
+ // ==================== Test 4: Complex Mixed Parameters (3 Series + 1 scalar) ====================
+ log.info("=== Test 4: Complex calculation with mixed params (3 Series + 1 scalar) ===")
+
+ sql """ DROP FUNCTION IF EXISTS py_vec_complex_calc(DOUBLE, INT, DOUBLE, DOUBLE); """
+ sql """
+ CREATE FUNCTION py_vec_complex_calc(DOUBLE, INT, DOUBLE, DOUBLE)
+ RETURNS DOUBLE
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "py_vec_complex_calc",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ )
+ AS \$\$
+import pandas as pd
+
+def py_vec_complex_calc(price: pd.Series, quantity: pd.Series, discount_rate: pd.Series, shipping_fee: float) -> pd.Series:
+ # price: pd.Series (vectorized)
+ # quantity: pd.Series (vectorized)
+ # discount_rate: pd.Series (vectorized, per-row discount)
+ # shipping_fee: float (scalar constant)
+
+ # Calculate: (price * quantity) * (1 - discount) + shipping_fee
+ subtotal = price * quantity
+ after_discount = subtotal * (1 - discount_rate)
+ return after_discount + shipping_fee
+\$\$;
+ """
+
+ qt_select_4 """
+ SELECT
+ id,
+ price,
+ quantity,
+ discount_rate,
+ py_vec_complex_calc(price, quantity, discount_rate, 10.0) AS final_total
+ FROM test_mixed_params_table
+ ORDER BY id
+ LIMIT 5;
+ """
+
+ // ==================== Test 5: String pd.Series + scalar str ====================
+ log.info("=== Test 5: String pd.Series + scalar str ===")
+
+ sql """ DROP FUNCTION IF EXISTS py_vec_add_prefix(STRING, STRING); """
+ sql """
+ CREATE FUNCTION py_vec_add_prefix(STRING, STRING)
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "py_vec_add_prefix",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ )
+ AS \$\$
+import pandas as pd
+
+def py_vec_add_prefix(categories: pd.Series, prefix: str) -> pd.Series:
+ # categories: pd.Series (vectorized string column)
+ # prefix: str (scalar constant)
+ return prefix + '_' + categories
+\$\$;
+ """
+
+ qt_select_5 """
+ SELECT
+ id,
+ category,
+ py_vec_add_prefix(category, 'CAT') AS prefixed_category
+ FROM test_mixed_params_table
+ ORDER BY id
+ LIMIT 5;
+ """
+
+ // ==================== Test 6: pd.Series + scalar int ====================
+ log.info("=== Test 6: pd.Series + scalar int ===")
+
+ sql """ DROP FUNCTION IF EXISTS py_vec_add_int(INT, INT); """
+ sql """
+ CREATE FUNCTION py_vec_add_int(INT, INT)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "py_vec_add_int",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ )
+ AS \$\$
+import pandas as pd
+
+def py_vec_add_int(quantities: pd.Series, bonus: int) -> pd.Series:
+ # quantities: pd.Series (vectorized int column)
+ # bonus: int (scalar constant)
+ return quantities + bonus
+\$\$;
+ """
+
+ qt_select_6 """
+ SELECT
+ id,
+ quantity,
+ py_vec_add_int(quantity, 10) AS quantity_with_bonus
+ FROM test_mixed_params_table
+ ORDER BY id
+ LIMIT 5;
+ """
+
+ // ==================== Test 7: Conditional Logic with Mixed Params ====================
+ log.info("=== Test 7: Conditional logic with mixed params (2 Series + 1 scalar) ===")
+
+ sql """ DROP FUNCTION IF EXISTS py_vec_conditional_discount(DOUBLE, DOUBLE, DOUBLE); """
+ sql """
+ CREATE FUNCTION py_vec_conditional_discount(DOUBLE, DOUBLE, DOUBLE)
+ RETURNS DOUBLE
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "py_vec_conditional_discount",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ )
+ AS \$\$
+import pandas as pd
+import numpy as np
+
+def py_vec_conditional_discount(price: pd.Series, discount_rate: pd.Series, threshold: float) -> pd.Series:
+ # price: pd.Series (vectorized)
+ # discount_rate: pd.Series (vectorized)
+ # threshold: float (scalar constant - minimum price for discount)
+
+ # Apply discount only if price >= threshold
+ result = np.where(price >= threshold,
+ price * (1 - discount_rate),
+ price)
+ return pd.Series(result)
+\$\$;
+ """
+
+ qt_select_7 """
+ SELECT
+ id,
+ price,
+ discount_rate,
+ py_vec_conditional_discount(price, discount_rate, 200.0) AS final_price
+ FROM test_mixed_params_table
+ ORDER BY id;
+ """
+
+ // ==================== Test 8: Scalar first, then Series ====================
+ log.info("=== Test 8: Scalar parameter first, then Series ===")
+
+ sql """ DROP FUNCTION IF EXISTS py_vec_scale_and_add(DOUBLE, DOUBLE, INT); """
+ sql """
+ CREATE FUNCTION py_vec_scale_and_add(DOUBLE, DOUBLE, INT)
+ RETURNS DOUBLE
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "py_vec_scale_and_add",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ )
+ AS \$\$
+import pandas as pd
+
+def py_vec_scale_and_add(scale_factor: float, prices: pd.Series, quantities: pd.Series) -> pd.Series:
+ # scale_factor: float (scalar constant)
+ # prices: pd.Series (vectorized)
+ # quantities: pd.Series (vectorized)
+ return (prices * quantities) * scale_factor
+\$\$;
+ """
+
+ qt_select_8 """
+ SELECT
+ id,
+ price,
+ quantity,
+ py_vec_scale_and_add(1.2, price, quantity) AS scaled_total
+ FROM test_mixed_params_table
+ ORDER BY id
+ LIMIT 3;
+ """
+
+ // ==================== Test 9: Alternating Series and Scalar ====================
+ log.info("=== Test 9: Alternating Series and scalar parameters ===")
+
+ sql """ DROP FUNCTION IF EXISTS py_vec_alternating(DOUBLE, DOUBLE, INT, INT); """
+ sql """
+ CREATE FUNCTION py_vec_alternating(DOUBLE, DOUBLE, INT, INT)
+ RETURNS DOUBLE
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "py_vec_alternating",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ )
+ AS \$\$
+import pandas as pd
+
+def py_vec_alternating(prices: pd.Series, markup: float, quantities: pd.Series, min_qty: int) -> pd.Series:
+ # prices: pd.Series (vectorized)
+ # markup: float (scalar constant)
+ # quantities: pd.Series (vectorized)
+ # min_qty: int (scalar constant)
+
+ import numpy as np
+ # Apply markup only if quantity >= min_qty
+ result = np.where(quantities >= min_qty,
+ prices * (1 + markup),
+ prices)
+ return pd.Series(result)
+\$\$;
+ """
+
+ qt_select_9 """
+ SELECT
+ id,
+ price,
+ quantity,
+ py_vec_alternating(price, 0.2, quantity, 5) AS conditional_price
+ FROM test_mixed_params_table
+ ORDER BY id
+ LIMIT 5;
+ """
+
+ // ==================== Test 10: Multiple scalars with one Series ====================
+ log.info("=== Test 10: Multiple scalar parameters with one Series ===")
+
+ sql """ DROP FUNCTION IF EXISTS py_vec_multi_scalar(DOUBLE, DOUBLE, DOUBLE, DOUBLE); """
+ sql """
+ CREATE FUNCTION py_vec_multi_scalar(DOUBLE, DOUBLE, DOUBLE, DOUBLE)
+ RETURNS DOUBLE
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "py_vec_multi_scalar",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ )
+ AS \$\$
+import pandas as pd
+
+def py_vec_multi_scalar(prices: pd.Series, tax: float, discount: float, fee: float) -> pd.Series:
+ # prices: pd.Series (vectorized)
+ # tax: float (scalar constant)
+ # discount: float (scalar constant)
+ # fee: float (scalar constant)
+
+ # Calculate: (price * (1 - discount)) * (1 + tax) + fee
+ after_discount = prices * (1 - discount)
+ with_tax = after_discount * (1 + tax)
+ return with_tax + fee
+\$\$;
+ """
+
+ qt_select_10 """
+ SELECT
+ id,
+ price,
+ py_vec_multi_scalar(price, 0.1, 0.05, 5.0) AS final_price
+ FROM test_mixed_params_table
+ ORDER BY id
+ LIMIT 3;
+ """
+
+ log.info("All mixed parameter tests passed!")
+
+ } finally {
+        // Cleanup; try_sql so one failed drop does not abort the remaining drops
+        try_sql("DROP FUNCTION IF EXISTS py_vec_multiply_constant(DOUBLE, DOUBLE);")
+        try_sql("DROP FUNCTION IF EXISTS py_vec_calc_total(DOUBLE, INT, DOUBLE);")
+        try_sql("DROP FUNCTION IF EXISTS py_vec_apply_discount(DOUBLE, DOUBLE);")
+        try_sql("DROP FUNCTION IF EXISTS py_vec_complex_calc(DOUBLE, INT, DOUBLE, DOUBLE);")
+        try_sql("DROP FUNCTION IF EXISTS py_vec_add_prefix(STRING, STRING);")
+        try_sql("DROP FUNCTION IF EXISTS py_vec_add_int(INT, INT);")
+        try_sql("DROP FUNCTION IF EXISTS py_vec_conditional_discount(DOUBLE, DOUBLE, DOUBLE);")
+        try_sql("DROP FUNCTION IF EXISTS py_vec_scale_and_add(DOUBLE, DOUBLE, INT);")
+        try_sql("DROP FUNCTION IF EXISTS py_vec_alternating(DOUBLE, DOUBLE, INT, INT);")
+        try_sql("DROP FUNCTION IF EXISTS py_vec_multi_scalar(DOUBLE, DOUBLE, DOUBLE, DOUBLE);")
+        try_sql("DROP TABLE IF EXISTS test_mixed_params_table;")
+ }
+}
diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_module.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_module.groovy
new file mode 100644
index 0000000..a22fd88
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/test_pythonudf_module.groovy
@@ -0,0 +1,74 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_pythonudf_module") {
+ def pyPath = """${context.file.parent}/udf_scripts/python_udf_module_test.zip"""
+ scp_udf_file_to_all_be(pyPath)
+ def runtime_version = "3.12.0"
+ log.info("Python Zip path: ${pyPath}".toString())
+ try {
+ sql """ DROP FUNCTION IF EXISTS python_udf_ltv_score(BIGINT, BIGINT, DOUBLE); """
+ sql """
+ CREATE FUNCTION python_udf_ltv_score(BIGINT, BIGINT, DOUBLE)
+ RETURNS DOUBLE
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file"="file://${pyPath}",
+ "symbol" = "python_udf_module_test.main.safe_ltv",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+ sql """ DROP TABLE IF EXISTS user_behavior_test; """
+ sql """
+ CREATE TABLE user_behavior_test (
+ user_id BIGINT,
+ days_since_last_action BIGINT,
+ total_actions BIGINT,
+ total_spend DOUBLE
+ ) ENGINE=OLAP
+ DUPLICATE KEY(user_id)
+ DISTRIBUTED BY HASH(user_id) BUCKETS 1
+ PROPERTIES("replication_num" = "1");
+ """
+ sql """
+ INSERT INTO user_behavior_test VALUES
+ (1001, 5, 10, 500.0),
+ (1002, 40, 1, 20.0),
+ (1003, 15, 5, 300.0),
+ (1004, -1, 3, 100.0),
+ (1005, NULL, 2, 200.0),
+ (1006, 7, NULL, 150.0),
+ (1007, 30, 0, NULL),
+ (1008, 0, 100, 5000.0),
+ (1009, 100, 2, 10.0),
+ (1010, 8, 8, 800.0);
+ """
+
+ qt_select """ SELECT
+ user_id,
+ days_since_last_action,
+ total_actions,
+ total_spend,
+ python_udf_ltv_score(days_since_last_action, total_actions, total_spend) AS ltv_score
+ FROM user_behavior_test
+ ORDER BY user_id; """
+ } finally {
+ try_sql("DROP FUNCTION IF EXISTS python_udf_ltv_score(BIGINT, BIGINT, DOUBLE);")
+ try_sql("DROP TABLE IF EXISTS user_behavior_test;")
+ }
+}
diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_module_advanced.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_module_advanced.groovy
new file mode 100644
index 0000000..77001bf
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/test_pythonudf_module_advanced.groovy
@@ -0,0 +1,180 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_pythonudf_module_advanced") {
+ // Test advanced Python UDF features using Module mode
+
+ def zipPath = """${context.file.parent}/udf_scripts/python_udf_module_test.zip"""
+ scp_udf_file_to_all_be(zipPath)
+ def runtime_version = "3.12.0"
+ log.info("Python Zip path: ${zipPath}".toString())
+
+ try {
+        // Test 1: Use a dotted module path inside the zip package
+ sql """ DROP FUNCTION IF EXISTS py_module_ltv(BIGINT, BIGINT, DOUBLE); """
+ sql """
+ CREATE FUNCTION py_module_ltv(BIGINT, BIGINT, DOUBLE)
+ RETURNS DOUBLE
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${zipPath}",
+ "symbol" = "python_udf_module_test.main.safe_ltv",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ qt_select_module_ltv_normal """ SELECT py_module_ltv(10, 100, 5000.0) AS result; """
+ qt_select_module_ltv_null """ SELECT py_module_ltv(NULL, 100, 5000.0) AS result; """
+ qt_select_module_ltv_zero """ SELECT py_module_ltv(0, 0, 5000.0) AS result; """
+
+ // Test 2: Use Module UDF in complex queries
+ sql """ DROP TABLE IF EXISTS customer_analytics; """
+ sql """
+ CREATE TABLE customer_analytics (
+ customer_id BIGINT,
+ days_inactive BIGINT,
+ total_orders BIGINT,
+ total_revenue DOUBLE,
+ customer_segment STRING
+ ) ENGINE=OLAP
+ DUPLICATE KEY(customer_id)
+ DISTRIBUTED BY HASH(customer_id) BUCKETS 1
+ PROPERTIES("replication_num" = "1");
+ """
+
+ sql """
+ INSERT INTO customer_analytics VALUES
+ (1001, 5, 50, 10000.0, 'Premium'),
+ (1002, 30, 10, 2000.0, 'Regular'),
+ (1003, 60, 5, 500.0, 'Inactive'),
+ (1004, 2, 100, 25000.0, 'VIP'),
+ (1005, 15, 25, 5000.0, 'Regular'),
+ (1006, NULL, 30, 6000.0, 'Regular'),
+ (1007, 10, NULL, 3000.0, 'Regular'),
+ (1008, 45, 8, NULL, 'Inactive'),
+ (1009, 0, 200, 50000.0, 'VIP'),
+ (1010, 90, 2, 100.0, 'Churned');
+ """
+
+ qt_select_customer_analytics """
+ SELECT
+ customer_id,
+ customer_segment,
+ days_inactive,
+ total_orders,
+ total_revenue,
+ py_module_ltv(days_inactive, total_orders, total_revenue) AS ltv_score
+ FROM customer_analytics
+ ORDER BY customer_id;
+ """
+
+ // Test 3: Use Module UDF for group aggregation
+ qt_select_segment_analysis """
+ SELECT
+ customer_segment,
+ COUNT(*) AS customer_count,
+ AVG(total_revenue) AS avg_revenue,
+ AVG(py_module_ltv(days_inactive, total_orders, total_revenue)) AS avg_ltv_score
+ FROM customer_analytics
+ GROUP BY customer_segment
+ ORDER BY customer_segment;
+ """
+
+ // Test 4: Use Module UDF for filtering
+ qt_select_high_value_customers """
+ SELECT
+ customer_id,
+ customer_segment,
+ total_revenue,
+ py_module_ltv(days_inactive, total_orders, total_revenue) AS ltv_score
+ FROM customer_analytics
+ WHERE py_module_ltv(days_inactive, total_orders, total_revenue) > 100
+ ORDER BY ltv_score DESC;
+ """
+
+ // Test 5: Use Module UDF for sorting
+ qt_select_sorted_by_ltv """
+ SELECT
+ customer_id,
+ customer_segment,
+ py_module_ltv(days_inactive, total_orders, total_revenue) AS ltv_score
+ FROM customer_analytics
+ ORDER BY py_module_ltv(days_inactive, total_orders, total_revenue) DESC
+ LIMIT 5;
+ """
+
+ // Test 6: Use Module UDF with multiple conditions
+ qt_select_complex_query """
+ SELECT
+ customer_id,
+ customer_segment,
+ days_inactive,
+ total_orders,
+ total_revenue,
+ py_module_ltv(days_inactive, total_orders, total_revenue) AS ltv_score,
+ CASE
+ WHEN py_module_ltv(days_inactive, total_orders, total_revenue) > 200 THEN 'High Value'
+ WHEN py_module_ltv(days_inactive, total_orders, total_revenue) > 100 THEN 'Medium Value'
+ WHEN py_module_ltv(days_inactive, total_orders, total_revenue) IS NOT NULL THEN 'Low Value'
+ ELSE 'Unknown'
+ END AS value_category
+ FROM customer_analytics
+ ORDER BY ltv_score DESC;
+ """
+
+ // Test 7: Use Module UDF with JOIN operations
+ sql """ DROP TABLE IF EXISTS customer_info; """
+ sql """
+ CREATE TABLE customer_info (
+ customer_id BIGINT,
+ customer_name STRING,
+ registration_date DATE
+ ) ENGINE=OLAP
+ DUPLICATE KEY(customer_id)
+ DISTRIBUTED BY HASH(customer_id) BUCKETS 1
+ PROPERTIES("replication_num" = "1");
+ """
+
+ sql """
+ INSERT INTO customer_info VALUES
+ (1001, 'Alice Johnson', '2023-01-15'),
+ (1002, 'Bob Smith', '2023-03-20'),
+ (1003, 'Charlie Brown', '2022-11-10'),
+ (1004, 'Diana Prince', '2023-05-01'),
+ (1005, 'Eve Wilson', '2023-02-14');
+ """
+
+ qt_select_join_with_module_udf """
+ SELECT
+ ci.customer_id,
+ ci.customer_name,
+ ca.customer_segment,
+ ca.total_revenue,
+ py_module_ltv(ca.days_inactive, ca.total_orders, ca.total_revenue) AS ltv_score
+ FROM customer_info ci
+ JOIN customer_analytics ca ON ci.customer_id = ca.customer_id
+ WHERE py_module_ltv(ca.days_inactive, ca.total_orders, ca.total_revenue) IS NOT NULL
+ ORDER BY ltv_score DESC;
+ """
+
+ } finally {
+ try_sql("DROP FUNCTION IF EXISTS py_module_ltv(BIGINT, BIGINT, DOUBLE);")
+ try_sql("DROP TABLE IF EXISTS customer_analytics;")
+ try_sql("DROP TABLE IF EXISTS customer_info;")
+ }
+}
diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_module_scalar.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_module_scalar.groovy
new file mode 100644
index 0000000..af0a43f
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/test_pythonudf_module_scalar.groovy
@@ -0,0 +1,818 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_pythonudf_module_scalar") {
+ // Comprehensive test for scalar Python UDF using module mode
+
+ def pyPath = """${context.file.parent}/udf_scripts/python_udf_scalar_ops.zip"""
+ scp_udf_file_to_all_be(pyPath)
+ def runtime_version = "3.10.12"
+
+ log.info("Python module path: ${pyPath}".toString())
+
+ try {
+ // Create test table with diverse data types
+ sql """ DROP TABLE IF EXISTS scalar_module_test_table; """
+ sql """
+ CREATE TABLE scalar_module_test_table (
+ id INT,
+ int_a INT,
+ int_b INT,
+ int_c INT,
+ double_a DOUBLE,
+ double_b DOUBLE,
+ string_a STRING,
+ string_b STRING,
+ bool_a BOOLEAN,
+ bool_b BOOLEAN,
+ date_a DATE,
+ date_b DATE
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES("replication_num" = "1");
+ """
+
+ sql """
+ INSERT INTO scalar_module_test_table VALUES
+ (1, 10, 20, 30, 100.0, 10.0, 'hello world', 'test@example.com', true, true, '2024-01-15', '2024-01-20'),
+ (2, 5, 15, 25, 200.0, 20.0, 'foo bar baz', 'user@domain.com', false, true, '2024-02-10', '2024-03-15'),
+ (3, 100, 50, 25, 150.0, 0.0, 'racecar', 'admin@test.org', true, false, '2023-12-01', '2024-01-01'),
+ (4, 7, 3, 11, 80.0, 5.0, 'a man a plan a canal panama', 'info@company.net', false, false, '2024-06-15', '2024-06-15'),
+ (5, 17, 19, 23, 300.0, 15.0, 'python udf test', 'contact@site.io', true, true, '2024-03-01', '2024-12-31');
+ """
+
+ // ==================== Numeric Operations Tests ====================
+
+ // Test 1: Add three numbers
+ sql """ DROP FUNCTION IF EXISTS py_add_three(INT, INT, INT); """
+ sql """
+ CREATE FUNCTION py_add_three(INT, INT, INT)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${pyPath}",
+ "symbol" = "python_udf_scalar_ops.add_three_numbers",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ qt_add_three """
+ SELECT
+ id,
+ int_a, int_b, int_c,
+ py_add_three(int_a, int_b, int_c) AS result
+ FROM scalar_module_test_table
+ ORDER BY id;
+ """
+
+ // Test 2: Safe division with precision
+ sql """ DROP FUNCTION IF EXISTS py_safe_div(DOUBLE, DOUBLE, INT); """
+ sql """
+ CREATE FUNCTION py_safe_div(DOUBLE, DOUBLE, INT)
+ RETURNS DOUBLE
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${pyPath}",
+ "symbol" = "python_udf_scalar_ops.safe_divide_with_precision",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ qt_safe_div """
+ SELECT
+ id,
+ double_a, double_b,
+ py_safe_div(double_a, double_b, 2) AS result
+ FROM scalar_module_test_table
+ ORDER BY id;
+ """
+
+ // Test 3: Calculate discount price
+ sql """ DROP FUNCTION IF EXISTS py_discount(DOUBLE, DOUBLE); """
+ sql """
+ CREATE FUNCTION py_discount(DOUBLE, DOUBLE)
+ RETURNS DOUBLE
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${pyPath}",
+ "symbol" = "python_udf_scalar_ops.calculate_discount_price",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ qt_discount """
+ SELECT
+ id,
+ double_a,
+ py_discount(double_a, 10.0) AS price_10_off,
+ py_discount(double_a, 25.0) AS price_25_off
+ FROM scalar_module_test_table
+ ORDER BY id;
+ """
+
+ // Test 4: Compound interest
+ sql """ DROP FUNCTION IF EXISTS py_compound_interest(DOUBLE, DOUBLE, INT); """
+ sql """
+ CREATE FUNCTION py_compound_interest(DOUBLE, DOUBLE, INT)
+ RETURNS DOUBLE
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${pyPath}",
+ "symbol" = "python_udf_scalar_ops.compound_interest",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ qt_compound_interest """
+ SELECT
+ id,
+ double_a,
+ py_compound_interest(double_a, 5.0, 10) AS future_value
+ FROM scalar_module_test_table
+ ORDER BY id;
+ """
+
+ // Test 5: Calculate BMI
+ sql """ DROP FUNCTION IF EXISTS py_bmi(DOUBLE, DOUBLE); """
+ sql """
+ CREATE FUNCTION py_bmi(DOUBLE, DOUBLE)
+ RETURNS DOUBLE
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${pyPath}",
+ "symbol" = "python_udf_scalar_ops.calculate_bmi",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ qt_bmi """
+ SELECT
+ id,
+ py_bmi(70.0, 1.75) AS bmi_normal,
+ py_bmi(90.0, 1.75) AS bmi_overweight
+ FROM scalar_module_test_table
+ ORDER BY id;
+ """
+
+ // Test 6: Fibonacci number
+ sql """ DROP FUNCTION IF EXISTS py_fibonacci(INT); """
+ sql """
+ CREATE FUNCTION py_fibonacci(INT)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${pyPath}",
+ "symbol" = "python_udf_scalar_ops.fibonacci",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ qt_fibonacci """
+ SELECT
+ id,
+ int_a,
+ py_fibonacci(int_a) AS fib_result
+ FROM scalar_module_test_table
+ WHERE int_a <= 20
+ ORDER BY id;
+ """
+
+ // Test 7: Is prime number
+ sql """ DROP FUNCTION IF EXISTS py_is_prime(INT); """
+ sql """
+ CREATE FUNCTION py_is_prime(INT)
+ RETURNS BOOLEAN
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${pyPath}",
+ "symbol" = "python_udf_scalar_ops.is_prime",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ qt_is_prime """
+ SELECT
+ id,
+ int_a, int_b, int_c,
+ py_is_prime(int_a) AS a_is_prime,
+ py_is_prime(int_b) AS b_is_prime,
+ py_is_prime(int_c) AS c_is_prime
+ FROM scalar_module_test_table
+ ORDER BY id;
+ """
+
+ // Test 8: GCD (Greatest Common Divisor)
+ sql """ DROP FUNCTION IF EXISTS py_gcd(INT, INT); """
+ sql """
+ CREATE FUNCTION py_gcd(INT, INT)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${pyPath}",
+ "symbol" = "python_udf_scalar_ops.gcd",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ qt_gcd """
+ SELECT
+ id,
+ int_a, int_b,
+ py_gcd(int_a, int_b) AS gcd_result
+ FROM scalar_module_test_table
+ ORDER BY id;
+ """
+
+ // Test 9: LCM (Least Common Multiple)
+ sql """ DROP FUNCTION IF EXISTS py_lcm(INT, INT); """
+ sql """
+ CREATE FUNCTION py_lcm(INT, INT)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${pyPath}",
+ "symbol" = "python_udf_scalar_ops.lcm",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ qt_lcm """
+ SELECT
+ id,
+ int_a, int_b,
+ py_lcm(int_a, int_b) AS lcm_result
+ FROM scalar_module_test_table
+ ORDER BY id;
+ """
+
+ // ==================== String Operations Tests ====================
+
+ // Test 10: Reverse string
+ sql """ DROP FUNCTION IF EXISTS py_reverse(STRING); """
+ sql """
+ CREATE FUNCTION py_reverse(STRING)
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${pyPath}",
+ "symbol" = "python_udf_scalar_ops.reverse_string",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ qt_reverse """
+ SELECT
+ id,
+ string_a,
+ py_reverse(string_a) AS reversed
+ FROM scalar_module_test_table
+ ORDER BY id;
+ """
+
+ // Test 11: Count vowels
+ sql """ DROP FUNCTION IF EXISTS py_count_vowels(STRING); """
+ sql """
+ CREATE FUNCTION py_count_vowels(STRING)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${pyPath}",
+ "symbol" = "python_udf_scalar_ops.count_vowels",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ qt_count_vowels """
+ SELECT
+ id,
+ string_a,
+ py_count_vowels(string_a) AS vowel_count
+ FROM scalar_module_test_table
+ ORDER BY id;
+ """
+
+ // Test 12: Count words
+ sql """ DROP FUNCTION IF EXISTS py_count_words(STRING); """
+ sql """
+ CREATE FUNCTION py_count_words(STRING)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${pyPath}",
+ "symbol" = "python_udf_scalar_ops.count_words",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ qt_count_words """
+ SELECT
+ id,
+ string_a,
+ py_count_words(string_a) AS word_count
+ FROM scalar_module_test_table
+ ORDER BY id;
+ """
+
+ // Test 13: Capitalize words
+ sql """ DROP FUNCTION IF EXISTS py_capitalize(STRING); """
+ sql """
+ CREATE FUNCTION py_capitalize(STRING)
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${pyPath}",
+ "symbol" = "python_udf_scalar_ops.capitalize_words",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ qt_capitalize """
+ SELECT
+ id,
+ string_a,
+ py_capitalize(string_a) AS capitalized
+ FROM scalar_module_test_table
+ ORDER BY id;
+ """
+
+ // Test 14: Is palindrome
+ sql """ DROP FUNCTION IF EXISTS py_is_palindrome(STRING); """
+ sql """
+ CREATE FUNCTION py_is_palindrome(STRING)
+ RETURNS BOOLEAN
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${pyPath}",
+ "symbol" = "python_udf_scalar_ops.is_palindrome",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ qt_is_palindrome """
+ SELECT
+ id,
+ string_a,
+ py_is_palindrome(string_a) AS is_palindrome
+ FROM scalar_module_test_table
+ ORDER BY id;
+ """
+
+ // Test 15: String similarity
+ sql """ DROP FUNCTION IF EXISTS py_similarity(STRING, STRING); """
+ sql """
+ CREATE FUNCTION py_similarity(STRING, STRING)
+ RETURNS DOUBLE
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${pyPath}",
+ "symbol" = "python_udf_scalar_ops.string_similarity",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ qt_similarity """
+ SELECT
+ id,
+ string_a,
+ py_similarity(string_a, 'hello') AS similarity_to_hello
+ FROM scalar_module_test_table
+ ORDER BY id;
+ """
+
+ // Test 16: Mask email
+ sql """ DROP FUNCTION IF EXISTS py_mask_email(STRING); """
+ sql """
+ CREATE FUNCTION py_mask_email(STRING)
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${pyPath}",
+ "symbol" = "python_udf_scalar_ops.mask_email",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ qt_mask_email """
+ SELECT
+ id,
+ string_b,
+ py_mask_email(string_b) AS masked_email
+ FROM scalar_module_test_table
+ ORDER BY id;
+ """
+
+ // Test 17: Extract domain from email
+ sql """ DROP FUNCTION IF EXISTS py_extract_domain(STRING); """
+ sql """
+ CREATE FUNCTION py_extract_domain(STRING)
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${pyPath}",
+ "symbol" = "python_udf_scalar_ops.extract_domain",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ qt_extract_domain """
+ SELECT
+ id,
+ string_b,
+ py_extract_domain(string_b) AS domain
+ FROM scalar_module_test_table
+ ORDER BY id;
+ """
+
+ // Test 18: Levenshtein distance
+ sql """ DROP FUNCTION IF EXISTS py_levenshtein(STRING, STRING); """
+ sql """
+ CREATE FUNCTION py_levenshtein(STRING, STRING)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${pyPath}",
+ "symbol" = "python_udf_scalar_ops.levenshtein_distance",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ qt_levenshtein """
+ SELECT
+ id,
+ string_a,
+ py_levenshtein(string_a, 'hello world') AS edit_distance
+ FROM scalar_module_test_table
+ ORDER BY id;
+ """
+
+ // ==================== Date/Time Operations Tests ====================
+
+ // Test 19: Days between dates
+ sql """ DROP FUNCTION IF EXISTS py_days_between(DATE, DATE); """
+ sql """
+ CREATE FUNCTION py_days_between(DATE, DATE)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${pyPath}",
+ "symbol" = "python_udf_scalar_ops.days_between_dates",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ qt_days_between """
+ SELECT
+ id,
+ date_a, date_b,
+ py_days_between(date_a, date_b) AS days_diff
+ FROM scalar_module_test_table
+ ORDER BY id;
+ """
+
+ // Test 20: Is weekend
+ sql """ DROP FUNCTION IF EXISTS py_is_weekend(DATE); """
+ sql """
+ CREATE FUNCTION py_is_weekend(DATE)
+ RETURNS BOOLEAN
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${pyPath}",
+ "symbol" = "python_udf_scalar_ops.is_weekend",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ qt_is_weekend """
+ SELECT
+ id,
+ date_a,
+ py_is_weekend(date_a) AS is_weekend
+ FROM scalar_module_test_table
+ ORDER BY id;
+ """
+
+ // Test 21: Get quarter
+ sql """ DROP FUNCTION IF EXISTS py_get_quarter(DATE); """
+ sql """
+ CREATE FUNCTION py_get_quarter(DATE)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${pyPath}",
+ "symbol" = "python_udf_scalar_ops.get_quarter",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ qt_get_quarter """
+ SELECT
+ id,
+ date_a,
+ py_get_quarter(date_a) AS quarter
+ FROM scalar_module_test_table
+ ORDER BY id;
+ """
+
+ // Test 22: Age in years
+ sql """ DROP FUNCTION IF EXISTS py_age(DATE, DATE); """
+ sql """
+ CREATE FUNCTION py_age(DATE, DATE)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${pyPath}",
+ "symbol" = "python_udf_scalar_ops.age_in_years",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ qt_age """
+ SELECT
+ id,
+ py_age('1990-01-01', date_a) AS age
+ FROM scalar_module_test_table
+ ORDER BY id;
+ """
+
+ // ==================== Boolean/Conditional Operations Tests ====================
+
+ // Test 23: Is in range
+ sql """ DROP FUNCTION IF EXISTS py_in_range(INT, INT, INT); """
+ sql """
+ CREATE FUNCTION py_in_range(INT, INT, INT)
+ RETURNS BOOLEAN
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${pyPath}",
+ "symbol" = "python_udf_scalar_ops.is_in_range",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ qt_in_range """
+ SELECT
+ id,
+ int_a,
+ py_in_range(int_a, 10, 50) AS in_range_10_50
+ FROM scalar_module_test_table
+ ORDER BY id;
+ """
+
+ // Test 24: XOR operation
+ sql """ DROP FUNCTION IF EXISTS py_xor(BOOLEAN, BOOLEAN); """
+ sql """
+ CREATE FUNCTION py_xor(BOOLEAN, BOOLEAN)
+ RETURNS BOOLEAN
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${pyPath}",
+ "symbol" = "python_udf_scalar_ops.xor_operation",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ qt_xor """
+ SELECT
+ id,
+ bool_a, bool_b,
+ py_xor(bool_a, bool_b) AS xor_result
+ FROM scalar_module_test_table
+ ORDER BY id;
+ """
+
+ // ==================== Complex/Mixed Operations Tests ====================
+
+ // Test 25: Calculate grade
+ sql """ DROP FUNCTION IF EXISTS py_grade(DOUBLE); """
+ sql """
+ CREATE FUNCTION py_grade(DOUBLE)
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${pyPath}",
+ "symbol" = "python_udf_scalar_ops.calculate_grade",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ qt_grade """
+ SELECT
+ id,
+ double_a,
+ py_grade(double_a) AS grade
+ FROM scalar_module_test_table
+ ORDER BY id;
+ """
+
+ // Test 26: Categorize age
+ sql """ DROP FUNCTION IF EXISTS py_categorize_age(INT); """
+ sql """
+ CREATE FUNCTION py_categorize_age(INT)
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${pyPath}",
+ "symbol" = "python_udf_scalar_ops.categorize_age",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ qt_categorize_age """
+ SELECT
+ id,
+ int_a,
+ py_categorize_age(int_a) AS age_category
+ FROM scalar_module_test_table
+ ORDER BY id;
+ """
+
+ // Test 27: Calculate tax
+ sql """ DROP FUNCTION IF EXISTS py_tax(DOUBLE, DOUBLE); """
+ sql """
+ CREATE FUNCTION py_tax(DOUBLE, DOUBLE)
+ RETURNS DOUBLE
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${pyPath}",
+ "symbol" = "python_udf_scalar_ops.calculate_tax",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ qt_tax """
+ SELECT
+ id,
+ double_a,
+ py_tax(double_a, 15.0) AS tax_15_percent
+ FROM scalar_module_test_table
+ ORDER BY id;
+ """
+
+ // Test 28: Truncate string with suffix
+ sql """ DROP FUNCTION IF EXISTS py_truncate(STRING, INT, STRING); """
+ sql """
+ CREATE FUNCTION py_truncate(STRING, INT, STRING)
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${pyPath}",
+ "symbol" = "python_udf_scalar_ops.truncate_string",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ qt_truncate """
+ SELECT
+ id,
+ string_a,
+ py_truncate(string_a, 10, '...') AS truncated
+ FROM scalar_module_test_table
+ ORDER BY id;
+ """
+
+ // ==================== Edge Cases and NULL Handling Tests ====================
+
+ // Test 29: NULL handling in numeric operations
+ sql """ DROP TABLE IF EXISTS null_test_table; """
+ sql """
+ CREATE TABLE null_test_table (
+ id INT,
+ val1 INT,
+ val2 INT,
+ val3 INT
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES("replication_num" = "1");
+ """
+
+ sql """
+ INSERT INTO null_test_table VALUES
+ (1, 10, 20, 30),
+ (2, NULL, 20, 30),
+ (3, 10, NULL, 30),
+ (4, 10, 20, NULL),
+ (5, NULL, NULL, NULL);
+ """
+
+ qt_null_handling """
+ SELECT
+ id,
+ val1, val2, val3,
+ py_add_three(val1, val2, val3) AS sum_result
+ FROM null_test_table
+ ORDER BY id;
+ """
+
+ // Test 30: Empty string handling
+ sql """ DROP TABLE IF EXISTS string_edge_test; """
+ sql """
+ CREATE TABLE string_edge_test (
+ id INT,
+ str_val STRING
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES("replication_num" = "1");
+ """
+
+ sql """
+ INSERT INTO string_edge_test VALUES
+ (1, 'normal string'),
+ (2, ''),
+ (3, ' '),
+ (4, 'a'),
+ (5, NULL);
+ """
+
+ qt_string_edge """
+ SELECT
+ id,
+ str_val,
+ py_reverse(str_val) AS reversed,
+ py_count_vowels(str_val) AS vowels,
+ py_count_words(str_val) AS words
+ FROM string_edge_test
+ ORDER BY id;
+ """
+
+ } finally {
+ // Cleanup all functions
+ try_sql("DROP FUNCTION IF EXISTS py_add_three(INT, INT, INT);")
+ try_sql("DROP FUNCTION IF EXISTS py_safe_div(DOUBLE, DOUBLE, INT);")
+ try_sql("DROP FUNCTION IF EXISTS py_discount(DOUBLE, DOUBLE);")
+ try_sql("DROP FUNCTION IF EXISTS py_compound_interest(DOUBLE, DOUBLE, INT);")
+ try_sql("DROP FUNCTION IF EXISTS py_bmi(DOUBLE, DOUBLE);")
+ try_sql("DROP FUNCTION IF EXISTS py_fibonacci(INT);")
+ try_sql("DROP FUNCTION IF EXISTS py_is_prime(INT);")
+ try_sql("DROP FUNCTION IF EXISTS py_gcd(INT, INT);")
+ try_sql("DROP FUNCTION IF EXISTS py_lcm(INT, INT);")
+ try_sql("DROP FUNCTION IF EXISTS py_reverse(STRING);")
+ try_sql("DROP FUNCTION IF EXISTS py_count_vowels(STRING);")
+ try_sql("DROP FUNCTION IF EXISTS py_count_words(STRING);")
+ try_sql("DROP FUNCTION IF EXISTS py_capitalize(STRING);")
+ try_sql("DROP FUNCTION IF EXISTS py_is_palindrome(STRING);")
+ try_sql("DROP FUNCTION IF EXISTS py_similarity(STRING, STRING);")
+ try_sql("DROP FUNCTION IF EXISTS py_mask_email(STRING);")
+ try_sql("DROP FUNCTION IF EXISTS py_extract_domain(STRING);")
+ try_sql("DROP FUNCTION IF EXISTS py_levenshtein(STRING, STRING);")
+ try_sql("DROP FUNCTION IF EXISTS py_days_between(DATE, DATE);")
+ try_sql("DROP FUNCTION IF EXISTS py_is_weekend(DATE);")
+ try_sql("DROP FUNCTION IF EXISTS py_get_quarter(DATE);")
+ try_sql("DROP FUNCTION IF EXISTS py_age(DATE, DATE);")
+ try_sql("DROP FUNCTION IF EXISTS py_in_range(INT, INT, INT);")
+ try_sql("DROP FUNCTION IF EXISTS py_xor(BOOLEAN, BOOLEAN);")
+ try_sql("DROP FUNCTION IF EXISTS py_grade(DOUBLE);")
+ try_sql("DROP FUNCTION IF EXISTS py_categorize_age(INT);")
+ try_sql("DROP FUNCTION IF EXISTS py_tax(DOUBLE, DOUBLE);")
+ try_sql("DROP FUNCTION IF EXISTS py_truncate(STRING, INT, STRING);")
+
+ // Cleanup tables
+ try_sql("DROP TABLE IF EXISTS scalar_module_test_table;")
+ try_sql("DROP TABLE IF EXISTS null_test_table;")
+ try_sql("DROP TABLE IF EXISTS string_edge_test;")
+ }
+}
diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_module_vector.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_module_vector.groovy
new file mode 100644
index 0000000..c39e92c
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/test_pythonudf_module_vector.groovy
@@ -0,0 +1,429 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_pythonudf_module_vector") {
+ // Test vectorized Python UDF using module mode with pandas.Series
+
+ def pyPath = """${context.file.parent}/udf_scripts/python_udf_vector_ops.zip"""
+ scp_udf_file_to_all_be(pyPath)
+ def runtime_version = "3.10.12"
+
+ log.info("Python module path: ${pyPath}".toString())
+
+ try {
+ // Create test table
+ sql """ DROP TABLE IF EXISTS vector_module_test_table; """
+ sql """
+ CREATE TABLE vector_module_test_table (
+ id INT,
+ int_a INT,
+ int_b INT,
+ double_a DOUBLE,
+ double_b DOUBLE,
+ string_a STRING,
+ string_b STRING,
+ bool_a BOOLEAN,
+ bool_b BOOLEAN
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES("replication_num" = "1");
+ """
+
+ sql """
+ INSERT INTO vector_module_test_table VALUES
+ (1, 10, 20, 1.5, 2.5, 'hello world', 'python udf', true, true),
+ (2, 30, 15, 3.5, 4.5, 'foo bar', 'test case', false, true),
+ (3, 50, 50, 5.5, 2.0, 'data science', 'machine learning', true, false),
+ (4, 5, 25, 7.5, 1.5, 'apache doris', 'database system', false, false),
+ (5, 100, 10, 9.5, 3.5, 'vector operations', 'pandas series', true, true);
+ """
+
+ // Test 1: Vector addition with constant
+ sql """ DROP FUNCTION IF EXISTS py_vec_add_const(INT, INT); """
+ sql """
+ CREATE FUNCTION py_vec_add_const(INT, INT)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${pyPath}",
+ "symbol" = "python_udf_vector_ops.vec_add_with_constant",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ qt_vec_add_const """
+ SELECT
+ id,
+ int_a,
+ int_b,
+ py_vec_add_const(int_a, int_b) AS result
+ FROM vector_module_test_table
+ ORDER BY id;
+ """
+
+ // Test 2: Vector multiplication and rounding
+ sql """ DROP FUNCTION IF EXISTS py_vec_multiply_round(DOUBLE, DOUBLE); """
+ sql """
+ CREATE FUNCTION py_vec_multiply_round(DOUBLE, DOUBLE)
+ RETURNS DOUBLE
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${pyPath}",
+ "symbol" = "python_udf_vector_ops.vec_multiply_and_round",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ qt_vec_multiply_round """
+ SELECT
+ id,
+ double_a,
+ double_b,
+ py_vec_multiply_round(double_a, double_b) AS result
+ FROM vector_module_test_table
+ ORDER BY id;
+ """
+
+ // Test 3: Vector string concatenation with separator
+ sql """ DROP FUNCTION IF EXISTS py_vec_concat_sep(STRING, STRING); """
+ sql """
+ CREATE FUNCTION py_vec_concat_sep(STRING, STRING)
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${pyPath}",
+ "symbol" = "python_udf_vector_ops.vec_string_concat_with_separator",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ qt_vec_concat_sep """
+ SELECT
+ id,
+ string_a,
+ string_b,
+ py_vec_concat_sep(string_a, string_b) AS result
+ FROM vector_module_test_table
+ ORDER BY id;
+ """
+
+ // Test 4: Vector string title case
+ sql """ DROP FUNCTION IF EXISTS py_vec_title_case(STRING); """
+ sql """
+ CREATE FUNCTION py_vec_title_case(STRING)
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${pyPath}",
+ "symbol" = "python_udf_vector_ops.vec_string_title_case",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ qt_vec_title_case """
+ SELECT
+ id,
+ string_a,
+ py_vec_title_case(string_a) AS result
+ FROM vector_module_test_table
+ ORDER BY id;
+ """
+
+ // Test 5: Vector conditional value (max of two values)
+ sql """ DROP FUNCTION IF EXISTS py_vec_conditional(INT, INT); """
+ sql """
+ CREATE FUNCTION py_vec_conditional(INT, INT)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${pyPath}",
+ "symbol" = "python_udf_vector_ops.vec_conditional_value",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ qt_vec_conditional """
+ SELECT
+ id,
+ int_a,
+ int_b,
+ py_vec_conditional(int_a, int_b) AS result
+ FROM vector_module_test_table
+ ORDER BY id;
+ """
+
+ // Test 6: Vector percentage calculation
+ sql """ DROP FUNCTION IF EXISTS py_vec_percentage(DOUBLE, DOUBLE); """
+ sql """
+ CREATE FUNCTION py_vec_percentage(DOUBLE, DOUBLE)
+ RETURNS DOUBLE
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${pyPath}",
+ "symbol" = "python_udf_vector_ops.vec_percentage_calculation",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ qt_vec_percentage """
+ SELECT
+ id,
+ double_a,
+ double_b,
+ py_vec_percentage(double_a, double_b) AS result
+ FROM vector_module_test_table
+ ORDER BY id;
+ """
+
+ // Test 7: Vector range check
+ sql """ DROP FUNCTION IF EXISTS py_vec_in_range(INT, INT, INT); """
+ sql """
+ CREATE FUNCTION py_vec_in_range(INT, INT, INT)
+ RETURNS BOOLEAN
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${pyPath}",
+ "symbol" = "python_udf_vector_ops.vec_is_in_range",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ qt_vec_in_range """
+ SELECT
+ id,
+ int_a,
+ py_vec_in_range(int_a, 10, 50) AS result
+ FROM vector_module_test_table
+ ORDER BY id;
+ """
+
+ // Test 8: Vector safe division
+ sql """ DROP FUNCTION IF EXISTS py_vec_safe_div(DOUBLE, DOUBLE); """
+ sql """
+ CREATE FUNCTION py_vec_safe_div(DOUBLE, DOUBLE)
+ RETURNS DOUBLE
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${pyPath}",
+ "symbol" = "python_udf_vector_ops.vec_safe_divide",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ qt_vec_safe_div """
+ SELECT
+ id,
+ double_a,
+ double_b,
+ py_vec_safe_div(double_a, double_b) AS result
+ FROM vector_module_test_table
+ ORDER BY id;
+ """
+
+ // Test 9: Vector exponential decay
+ sql """ DROP FUNCTION IF EXISTS py_vec_exp_decay(DOUBLE, INT); """
+ sql """
+ CREATE FUNCTION py_vec_exp_decay(DOUBLE, INT)
+ RETURNS DOUBLE
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${pyPath}",
+ "symbol" = "python_udf_vector_ops.vec_exponential_decay",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ qt_vec_exp_decay """
+ SELECT
+ id,
+ double_a,
+ int_a,
+ py_vec_exp_decay(double_a, int_a) AS result
+ FROM vector_module_test_table
+ ORDER BY id;
+ """
+
+ // Test 10: Vector string extract first word
+ sql """ DROP FUNCTION IF EXISTS py_vec_first_word(STRING); """
+ sql """
+ CREATE FUNCTION py_vec_first_word(STRING)
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${pyPath}",
+ "symbol" = "python_udf_vector_ops.vec_string_extract_first_word",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ qt_vec_first_word """
+ SELECT
+ id,
+ string_a,
+ py_vec_first_word(string_a) AS result
+ FROM vector_module_test_table
+ ORDER BY id;
+ """
+
+ // Test 11: Vector absolute difference
+ sql """ DROP FUNCTION IF EXISTS py_vec_abs_diff(INT, INT); """
+ sql """
+ CREATE FUNCTION py_vec_abs_diff(INT, INT)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${pyPath}",
+ "symbol" = "python_udf_vector_ops.vec_abs_difference",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ qt_vec_abs_diff """
+ SELECT
+ id,
+ int_a,
+ int_b,
+ py_vec_abs_diff(int_a, int_b) AS result
+ FROM vector_module_test_table
+ ORDER BY id;
+ """
+
+ // Test 12: Vector power operation
+ sql """ DROP FUNCTION IF EXISTS py_vec_power(DOUBLE, DOUBLE); """
+ sql """
+ CREATE FUNCTION py_vec_power(DOUBLE, DOUBLE)
+ RETURNS DOUBLE
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${pyPath}",
+ "symbol" = "python_udf_vector_ops.vec_power",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ qt_vec_power """
+ SELECT
+ id,
+ double_a,
+ py_vec_power(double_a, 2.0) AS result
+ FROM vector_module_test_table
+ ORDER BY id;
+ """
+
+ // Test 13: Vector boolean AND operation
+ sql """ DROP FUNCTION IF EXISTS py_vec_bool_and(BOOLEAN, BOOLEAN); """
+ sql """
+ CREATE FUNCTION py_vec_bool_and(BOOLEAN, BOOLEAN)
+ RETURNS BOOLEAN
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${pyPath}",
+ "symbol" = "python_udf_vector_ops.vec_boolean_and",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ qt_vec_bool_and """
+ SELECT
+ id,
+ bool_a,
+ bool_b,
+ py_vec_bool_and(bool_a, bool_b) AS result
+ FROM vector_module_test_table
+ ORDER BY id;
+ """
+
+ // Test 14: Vector boolean OR operation
+ sql """ DROP FUNCTION IF EXISTS py_vec_bool_or(BOOLEAN, BOOLEAN); """
+ sql """
+ CREATE FUNCTION py_vec_bool_or(BOOLEAN, BOOLEAN)
+ RETURNS BOOLEAN
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${pyPath}",
+ "symbol" = "python_udf_vector_ops.vec_boolean_or",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ qt_vec_bool_or """
+ SELECT
+ id,
+ bool_a,
+ bool_b,
+ py_vec_bool_or(bool_a, bool_b) AS result
+ FROM vector_module_test_table
+ ORDER BY id;
+ """
+
+ // Test 15: Vector clip values
+ sql """ DROP FUNCTION IF EXISTS py_vec_clip(INT, INT, INT); """
+ sql """
+ CREATE FUNCTION py_vec_clip(INT, INT, INT)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${pyPath}",
+ "symbol" = "python_udf_vector_ops.vec_clip_values",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ qt_vec_clip """
+ SELECT
+ id,
+ int_a,
+ py_vec_clip(int_a, 20, 60) AS result
+ FROM vector_module_test_table
+ ORDER BY id;
+ """
+
+ } finally {
+ try_sql("DROP FUNCTION IF EXISTS py_vec_add_const(INT, INT);")
+ try_sql("DROP FUNCTION IF EXISTS py_vec_multiply_round(DOUBLE, DOUBLE);")
+ try_sql("DROP FUNCTION IF EXISTS py_vec_concat_sep(STRING, STRING);")
+ try_sql("DROP FUNCTION IF EXISTS py_vec_title_case(STRING);")
+ try_sql("DROP FUNCTION IF EXISTS py_vec_conditional(INT, INT);")
+ try_sql("DROP FUNCTION IF EXISTS py_vec_percentage(DOUBLE, DOUBLE);")
+ try_sql("DROP FUNCTION IF EXISTS py_vec_in_range(INT, INT, INT);")
+ try_sql("DROP FUNCTION IF EXISTS py_vec_safe_div(DOUBLE, DOUBLE);")
+ try_sql("DROP FUNCTION IF EXISTS py_vec_exp_decay(DOUBLE, INT);")
+ try_sql("DROP FUNCTION IF EXISTS py_vec_first_word(STRING);")
+ try_sql("DROP FUNCTION IF EXISTS py_vec_abs_diff(INT, INT);")
+ try_sql("DROP FUNCTION IF EXISTS py_vec_power(DOUBLE, DOUBLE);")
+ try_sql("DROP FUNCTION IF EXISTS py_vec_bool_and(BOOLEAN, BOOLEAN);")
+ try_sql("DROP FUNCTION IF EXISTS py_vec_bool_or(BOOLEAN, BOOLEAN);")
+ try_sql("DROP FUNCTION IF EXISTS py_vec_clip(INT, INT, INT);")
+ try_sql("DROP TABLE IF EXISTS vector_module_test_table;")
+ }
+}
diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_multiline_inline.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_multiline_inline.groovy
new file mode 100644
index 0000000..ff17fe2
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/test_pythonudf_multiline_inline.groovy
@@ -0,0 +1,211 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_pythonudf_multiline_inline") {
+ // Test complex multi-line inline Python code
+
+ def runtime_version = "3.10.12"
+ try {
+ // Test 1: Inline code with helper functions
+ sql """ DROP FUNCTION IF EXISTS py_complex_calculation(INT, INT); """
+ sql """
+ CREATE FUNCTION py_complex_calculation(INT, INT)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+def helper_function(x):
+ return x * x
+
+def evaluate(a, b):
+ if a is None or b is None:
+ return None
+ result = helper_function(a) + helper_function(b)
+ return result
+\$\$;
+ """
+
+ qt_select_complex_calc """ SELECT py_complex_calculation(3, 4) AS result; """
+
+ // Test 2: Complex function with conditional logic
+ sql """ DROP FUNCTION IF EXISTS py_business_logic(STRING, DOUBLE, INT); """
+ sql """
+ CREATE FUNCTION py_business_logic(STRING, DOUBLE, INT)
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+def evaluate(customer_type, amount, quantity):
+ if customer_type is None or amount is None or quantity is None:
+ return 'INVALID'
+
+ # Calculate discount
+ discount = 0
+ if customer_type == 'VIP':
+ discount = 0.2
+ elif customer_type == 'PREMIUM':
+ discount = 0.15
+ elif customer_type == 'REGULAR':
+ discount = 0.1
+ else:
+ discount = 0
+
+ # Bulk discount
+ if quantity >= 100:
+ discount += 0.05
+ elif quantity >= 50:
+ discount += 0.03
+
+ # Calculate final price
+ final_amount = amount * (1 - discount)
+
+ # Return result
+ if final_amount > 10000:
+ return f'HIGH:{final_amount:.2f}'
+ elif final_amount > 1000:
+ return f'MEDIUM:{final_amount:.2f}'
+ else:
+ return f'LOW:{final_amount:.2f}'
+\$\$;
+ """
+
+ qt_select_business_logic_vip """ SELECT py_business_logic('VIP', 5000.0, 120) AS result; """
+ qt_select_business_logic_regular """ SELECT py_business_logic('REGULAR', 2000.0, 30) AS result; """
+
+ // Test 3: Complex string processing logic
+ sql """ DROP FUNCTION IF EXISTS py_text_analyzer(STRING); """
+ sql """
+ CREATE FUNCTION py_text_analyzer(STRING)
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+def evaluate(text):
+ if text is None:
+ return None
+
+ # Collect statistics
+ length = len(text)
+ words = text.split()
+ word_count = len(words)
+
+ # Count character types
+ upper_count = sum(1 for c in text if c.isupper())
+ lower_count = sum(1 for c in text if c.islower())
+ digit_count = sum(1 for c in text if c.isdigit())
+
+ # Build result
+ result = f"len:{length},words:{word_count},upper:{upper_count},lower:{lower_count},digits:{digit_count}"
+ return result
+\$\$;
+ """
+
+ qt_select_text_analyzer """ SELECT py_text_analyzer('Hello World 123') AS result; """
+
+ // Test 4: Complex mathematical calculation function
+ sql """ DROP FUNCTION IF EXISTS py_statistics(DOUBLE, DOUBLE, DOUBLE, DOUBLE); """
+ sql """
+ CREATE FUNCTION py_statistics(DOUBLE, DOUBLE, DOUBLE, DOUBLE)
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+import math
+
+def evaluate(v1, v2, v3, v4):
+ if any(x is None for x in [v1, v2, v3, v4]):
+ return None
+
+ values = [v1, v2, v3, v4]
+
+ # Calculate statistics
+ total = sum(values)
+ count = len(values)
+ mean = total / count
+
+ # Calculate variance
+ variance = sum((x - mean) ** 2 for x in values) / count
+
+    # Calculate standard deviation
+    std_dev = math.sqrt(variance)
+
+ # Find max and min values
+ max_val = max(values)
+ min_val = min(values)
+
+ result = f"mean:{mean:.2f},std:{std_dev:.2f},max:{max_val:.2f},min:{min_val:.2f}"
+ return result
+\$\$;
+ """
+
+ qt_select_statistics """ SELECT py_statistics(10.0, 20.0, 30.0, 40.0) AS result; """
+
+ // Test 5: Apply the complex inline UDFs to table data
+ sql """ DROP TABLE IF EXISTS multiline_test_table; """
+ sql """
+ CREATE TABLE multiline_test_table (
+ id INT,
+ customer_type STRING,
+ amount DOUBLE,
+ quantity INT,
+ description STRING
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES("replication_num" = "1");
+ """
+
+ sql """
+ INSERT INTO multiline_test_table VALUES
+ (1, 'VIP', 15000.0, 150, 'Premium customer order'),
+ (2, 'PREMIUM', 8000.0, 80, 'Good customer'),
+ (3, 'REGULAR', 3000.0, 40, 'Regular order'),
+ (4, 'VIP', 500.0, 10, 'Small VIP order'),
+ (5, 'REGULAR', 12000.0, 200, 'Large regular order');
+ """
+
+ qt_select_table_multiline """
+ SELECT
+ id,
+ customer_type,
+ amount,
+ quantity,
+ py_business_logic(customer_type, amount, quantity) AS pricing_result,
+ py_text_analyzer(description) AS text_analysis
+ FROM multiline_test_table
+ ORDER BY id;
+ """
+
+ } finally {
+ try_sql("DROP FUNCTION IF EXISTS py_complex_calculation(INT, INT);")
+ try_sql("DROP FUNCTION IF EXISTS py_business_logic(STRING, DOUBLE, INT);")
+ try_sql("DROP FUNCTION IF EXISTS py_text_analyzer(STRING);")
+ try_sql("DROP FUNCTION IF EXISTS py_statistics(DOUBLE, DOUBLE, DOUBLE, DOUBLE);")
+ try_sql("DROP TABLE IF EXISTS multiline_test_table;")
+ }
+}
diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_performance.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_performance.groovy
new file mode 100644
index 0000000..f7429f6
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/test_pythonudf_performance.groovy
@@ -0,0 +1,222 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_pythonudf_performance") {
+ // Test Python UDF performance and correctness with large data volumes
+
+ def runtime_version = "3.10.12"
+ try {
+ // Create simple Python UDF
+ sql """ DROP FUNCTION IF EXISTS py_perf_double(INT); """
+ sql """
+ CREATE FUNCTION py_perf_double(INT)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+def evaluate(x):
+ if x is None:
+ return None
+ return x * 2
+\$\$;
+ """
+
+ sql """ DROP FUNCTION IF EXISTS py_perf_concat(STRING, STRING); """
+ sql """
+ CREATE FUNCTION py_perf_concat(STRING, STRING)
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+def evaluate(s1, s2):
+ if s1 is None or s2 is None:
+ return None
+ return s1 + '_' + s2
+\$\$;
+ """
+
+ // Create test table
+ sql """ DROP TABLE IF EXISTS performance_test_table; """
+ sql """
+ CREATE TABLE performance_test_table (
+ id INT,
+ value INT,
+ category STRING,
+ amount DOUBLE
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 4
+ PROPERTIES("replication_num" = "1");
+ """
+
+ // Insert test data (large data volume)
+ sql """
+ INSERT INTO performance_test_table
+ SELECT
+ number AS id,
+ number % 1000 AS value,
+ CASE
+ WHEN number % 4 = 0 THEN 'A'
+ WHEN number % 4 = 1 THEN 'B'
+ WHEN number % 4 = 2 THEN 'C'
+ ELSE 'D'
+ END AS category,
+ number * 1.5 AS amount
+ FROM numbers("number" = "10000");
+ """
+
+ // Test 1: Simple UDF call performance
+ qt_select_perf_simple """
+ SELECT COUNT(*) AS total_count
+ FROM performance_test_table
+ WHERE py_perf_double(value) > 1000;
+ """
+
+ // Test 2: UDF performance in aggregate queries
+ qt_select_perf_aggregate """
+ SELECT
+ category,
+ COUNT(*) AS count,
+ AVG(py_perf_double(value)) AS avg_doubled_value
+ FROM performance_test_table
+ GROUP BY category
+ ORDER BY category;
+ """
+
+ // Test 3: UDF used for filtering combined with grouping and LIMIT
+ qt_select_perf_multiple_udf """
+ SELECT
+ category,
+ COUNT(*) AS count
+ FROM performance_test_table
+ WHERE py_perf_double(value) > 500
+ GROUP BY category
+ ORDER BY count DESC
+ LIMIT 10;
+ """
+
+ // Test 4: String UDF performance
+ qt_select_perf_string """
+ SELECT
+ category,
+ COUNT(DISTINCT py_perf_concat(category, CAST(value AS STRING))) AS unique_combinations
+ FROM performance_test_table
+ GROUP BY category
+ ORDER BY category;
+ """
+
+ // Test 5: UDF with complex calculations
+ sql """ DROP FUNCTION IF EXISTS py_perf_complex(INT, DOUBLE); """
+ sql """
+ CREATE FUNCTION py_perf_complex(INT, DOUBLE)
+ RETURNS DOUBLE
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+def evaluate(v, a):
+ if v is None or a is None:
+ return None
+ result = (v * 1.5 + a * 0.8) / 2.0
+ return result
+\$\$;
+ """
+
+ qt_select_perf_complex """
+ SELECT
+ category,
+ AVG(py_perf_complex(value, amount)) AS avg_complex_result,
+ MAX(py_perf_complex(value, amount)) AS max_complex_result,
+ MIN(py_perf_complex(value, amount)) AS min_complex_result
+ FROM performance_test_table
+ GROUP BY category
+ ORDER BY category;
+ """
+
+ // Test 6: UDF in nested queries
+ qt_select_perf_nested """
+ SELECT
+ category,
+ avg_doubled
+ FROM (
+ SELECT
+ category,
+ AVG(py_perf_double(value)) AS avg_doubled
+ FROM performance_test_table
+ GROUP BY category
+ ) t
+ WHERE avg_doubled > 500
+ ORDER BY avg_doubled DESC;
+ """
+
+ // Test 7: Performance test with NULL value handling
+ sql """ DROP TABLE IF EXISTS performance_null_test; """
+ sql """
+ CREATE TABLE performance_null_test (
+ id INT,
+ value INT,
+ nullable_value INT
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 4
+ PROPERTIES("replication_num" = "1");
+ """
+
+ sql """
+ INSERT INTO performance_null_test
+ SELECT
+ number AS id,
+ number % 100 AS value,
+ CASE WHEN number % 5 = 0 THEN NULL ELSE number % 50 END AS nullable_value
+ FROM numbers("number" = "5000");
+ """
+
+ qt_select_perf_null """
+ SELECT
+ COUNT(*) AS total,
+ COUNT(py_perf_double(nullable_value)) AS non_null_count,
+ AVG(py_perf_double(nullable_value)) AS avg_result
+ FROM performance_null_test;
+ """
+
+ // Test 8: Sorting performance
+ qt_select_perf_order """
+ SELECT
+ id,
+ value,
+ py_perf_double(value) AS doubled_value
+ FROM performance_test_table
+ ORDER BY py_perf_double(value) DESC, id DESC
+ LIMIT 20;
+ """
+
+ } finally {
+ try_sql("DROP FUNCTION IF EXISTS py_perf_double(INT);")
+ try_sql("DROP FUNCTION IF EXISTS py_perf_concat(STRING, STRING);")
+ try_sql("DROP FUNCTION IF EXISTS py_perf_complex(INT, DOUBLE);")
+ try_sql("DROP TABLE IF EXISTS performance_test_table;")
+ try_sql("DROP TABLE IF EXISTS performance_null_test;")
+ }
+}
diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_performance_comparison.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_performance_comparison.groovy
new file mode 100644
index 0000000..35dc06c
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/test_pythonudf_performance_comparison.groovy
@@ -0,0 +1,239 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_pythonudf_performance_comparison") {
+ // Quick performance comparison: Scalar vs Vector Python UDF
+ // Lightweight test for quick performance checks
+
+ def scalarPyPath = """${context.file.parent}/udf_scripts/python_udf_scalar_ops.zip"""
+ def vectorPyPath = """${context.file.parent}/udf_scripts/python_udf_vector_ops.zip"""
+ scp_udf_file_to_all_be(scalarPyPath)
+ scp_udf_file_to_all_be(vectorPyPath)
+ def runtime_version = "3.10.12"
+
+ sql "CREATE DATABASE IF NOT EXISTS test_pythonudf_performance_comparison"
+ sql "USE test_pythonudf_performance_comparison"
+
+ // Quick test with a smaller dataset
+ def TEST_ROWS = 100000 // 100K rows for quick testing
+
+ log.info("=" * 80)
+ log.info("PYTHON UDF PERFORMANCE COMPARISON")
+ log.info("Quick test with ${TEST_ROWS} rows")
+ log.info("=" * 80)
+
+ try {
+ // Create test table
+ sql """ DROP TABLE IF EXISTS perf_comparison_table; """
+ sql """
+ CREATE TABLE perf_comparison_table (
+ id INT,
+ val1 INT,
+ val2 INT,
+ price DOUBLE,
+ discount DOUBLE,
+ text STRING
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 10
+ PROPERTIES("replication_num" = "1");
+ """
+
+ // Load test data using streamLoad from CSV file
+ log.info("Loading ${TEST_ROWS} rows using streamLoad from CSV file...")
+ def loadStartTime = System.currentTimeMillis()
+
+ streamLoad {
+ db 'test_pythonudf_performance_comparison'
+ table "perf_comparison_table"
+
+ // Set column separator to tab
+ set 'column_separator', '\t'
+
+ // File path relative to regression-test/data/pythonudf_p0/
+ file 'benchmark_data_100k.csv'
+
+ time 60000 // 60 seconds timeout
+
+ // Custom check callback
+ check { result, exception, startTime, endTime ->
+ if (exception != null) {
+ throw exception
+ }
+ log.info("Stream load result: ${result}".toString())
+ def json = parseJson(result)
+ assertEquals("success", json.Status.toLowerCase())
+ assertEquals(json.NumberTotalRows, json.NumberLoadedRows)
+ assertTrue(json.NumberLoadedRows > 0 && json.LoadBytes > 0)
+ }
+ }
+
+ def loadEndTime = System.currentTimeMillis()
+ log.info("Data loaded in ${loadEndTime - loadStartTime} ms")
+
+ sql "sync"
+
+ def actualRows = sql "SELECT COUNT(*) FROM perf_comparison_table"
+ log.info("Verified row count: ${actualRows[0][0]}\nData ready. Starting performance tests...\n")
+
+ // Define test cases
+ def testCases = [
+ [
+ name: "Integer Multiplication",
+ scalar_symbol: "python_udf_scalar_ops.multiply_with_default",
+ vector_symbol: "python_udf_vector_ops.multiply_by_constant",
+ params: "(INT, INT, INT)",
+ returns: "INT",
+ query: "SELECT COUNT(*) FROM (SELECT id, {UDF}(val1, 10, 1) AS result FROM perf_comparison_table) t"
+ ],
+ [
+ name: "Price Calculation",
+ scalar_symbol: "python_udf_scalar_ops.calculate_discount_price",
+ vector_symbol: "python_udf_vector_ops.calculate_discount",
+ params: "(DOUBLE, DOUBLE)",
+ returns: "DOUBLE",
+ query: "SELECT COUNT(*) FROM (SELECT id, {UDF}(price, discount) AS result FROM perf_comparison_table) t"
+ ],
+ [
+ name: "String Length",
+ scalar_symbol: "python_udf_scalar_ops.string_length_custom",
+ vector_symbol: "python_udf_vector_ops.string_length",
+ params: "(STRING)",
+ returns: "INT",
+ query: "SELECT COUNT(*) FROM (SELECT id, {UDF}(text) AS result FROM perf_comparison_table) t"
+ ]
+ ]
+
+ def results = []
+
+ testCases.each { testCase ->
+ log.info("-" * 80)
+ log.info("Test: ${testCase.name}")
+ log.info("-" * 80)
+
+ // Test Scalar UDF
+ sql """ DROP FUNCTION IF EXISTS py_scalar_test${testCase.params}; """
+ sql """
+ CREATE FUNCTION py_scalar_test${testCase.params}
+ RETURNS ${testCase.returns}
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${scalarPyPath}",
+ "symbol" = "${testCase.scalar_symbol}",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ def scalarQuery = testCase.query.replace("{UDF}", "py_scalar_test")
+
+ // Warm up
+ sql scalarQuery
+
+ // Timed runs - execute 3 times and take the average
+ def scalarTimes = []
+ for (int i = 0; i < 3; i++) {
+ def start = System.currentTimeMillis()
+ sql scalarQuery
+ def end = System.currentTimeMillis()
+ scalarTimes.add(end - start)
+ }
+ def scalarAvg = scalarTimes.sum() / scalarTimes.size()
+
+ log.info(" Scalar UDF: ${scalarTimes} ms, Avg: ${scalarAvg} ms")
+
+ // Test Vector UDF
+ sql """ DROP FUNCTION IF EXISTS py_vector_test${testCase.params}; """
+ sql """
+ CREATE FUNCTION py_vector_test${testCase.params}
+ RETURNS ${testCase.returns}
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${vectorPyPath}",
+ "symbol" = "${testCase.vector_symbol}",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true",
+ "vectorized" = "true"
+ );
+ """
+
+ def vectorQuery = testCase.query.replace("{UDF}", "py_vector_test")
+
+ // Warm up
+ sql vectorQuery
+
+ // Timed runs - execute 3 times and take the average
+ def vectorTimes = []
+ for (int i = 0; i < 3; i++) {
+ def start = System.currentTimeMillis()
+ sql vectorQuery
+ def end = System.currentTimeMillis()
+ vectorTimes.add(end - start)
+ }
+ def vectorAvg = vectorTimes.sum() / vectorTimes.size()
+
+ log.info(" Vector UDF: ${vectorTimes} ms, Avg: ${vectorAvg} ms")
+
+ def speedup = scalarAvg / vectorAvg
+ def improvement = ((scalarAvg - vectorAvg) / scalarAvg * 100)
+
+ log.info(" Speedup: ${String.format('%.2f', speedup)}x")
+ log.info(" Improvement: ${String.format('%.1f', improvement)}%")
+
+ results.add([
+ name: testCase.name,
+ scalar: scalarAvg,
+ vector: vectorAvg,
+ speedup: speedup,
+ improvement: improvement
+ ])
+
+ // Cleanup
+ sql """ DROP FUNCTION IF EXISTS py_scalar_test${testCase.params}; """
+ sql """ DROP FUNCTION IF EXISTS py_vector_test${testCase.params}; """
+ }
+
+ // Print summary
+ def summary = new StringBuilder()
+ summary.append("\n" + "=" * 80 + "\n")
+ summary.append("PERFORMANCE COMPARISON SUMMARY\n")
+ summary.append("=" * 80 + "\n")
+ summary.append(String.format("%-30s %12s %12s %10s %12s", "Test Case", "Scalar(ms)", "Vector(ms)", "Speedup", "Improvement") + "\n")
+ summary.append("-" * 80 + "\n")
+
+ results.each { r ->
+ summary.append(String.format("%-30s %12.1f %12.1f %9.2fx %11.1f%%",
+ r.name, r.scalar, r.vector, r.speedup, r.improvement) + "\n")
+ }
+
+ def avgSpeedup = results.collect { it.speedup }.sum() / results.size()
+ def avgImprovement = results.collect { it.improvement }.sum() / results.size()
+
+ summary.append("-" * 80 + "\n")
+ summary.append(String.format("%-30s %12s %12s %9.2fx %11.1f%%",
+ "AVERAGE", "-", "-", avgSpeedup, avgImprovement) + "\n")
+ summary.append("=" * 80)
+
+ log.info(summary.toString())
+
+ } finally {
+ // Cleanup
+ try_sql("DROP TABLE IF EXISTS perf_comparison_table;")
+ try_sql("DROP DATABASE IF EXISTS test_pythonudf_performance_comparison;")
+ log.info("\nPerformance comparison completed.")
+ }
+}
diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_ret_map.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_ret_map.groovy
new file mode 100644
index 0000000..7567a78
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/test_pythonudf_ret_map.groovy
@@ -0,0 +1,127 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_pythonudf_ret_map") {
+ def pyPath = """${context.file.parent}/udf_scripts/pyudf.zip"""
+ scp_udf_file_to_all_be(pyPath)
+ def runtime_version = "3.10.12"
+ log.info("Python Zip path: ${pyPath}".toString())
+ try {
+ try_sql("DROP FUNCTION IF EXISTS retii(map<int,int>);")
+ try_sql("DROP FUNCTION IF EXISTS retss(map<String,String>);")
+ try_sql("DROP FUNCTION IF EXISTS retid(map<int,Double>);")
+ try_sql("DROP FUNCTION IF EXISTS retidss(int ,double);")
+ try_sql("DROP TABLE IF EXISTS db")
+ try_sql("DROP TABLE IF EXISTS dbss")
+ sql """
+ CREATE TABLE IF NOT EXISTS db(
+ `id` INT NULL COMMENT "",
+ `i` INT NULL COMMENT "",
+ `d` Double NULL COMMENT "",
+ `mii` Map<INT, INT> NULL COMMENT "",
+ `mid` Map<INT, Double> NULL COMMENT ""
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`id`)
+ DISTRIBUTED BY HASH(`id`) BUCKETS 1
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1",
+ "storage_format" = "V2");
+ """
+ sql """ INSERT INTO db VALUES(1, 10,1.1,{1:1,10:1,100:1},{1:1.1,11:11.1}); """
+ sql """ INSERT INTO db VALUES(2, 20,2.2,{2:2,20:2,200:2},{2:2.2,22:22.2}); """
+
+ sql """
+ CREATE TABLE IF NOT EXISTS dbss(
+ `id` INT NULL COMMENT "",
+ `m` Map<String, String> NULL COMMENT ""
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`id`)
+ DISTRIBUTED BY HASH(`id`) BUCKETS 1
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1",
+ "storage_format" = "V2");
+ """
+
+ sql """ INSERT INTO dbss VALUES(1,{"abc":"efg","h":"i"}); """
+ sql """ INSERT INTO dbss VALUES(2,{"j":"k"}); """
+
+ sql """
+ CREATE FUNCTION retii(map<int,int>) RETURNS map<int,int> PROPERTIES (
+ "file"="file://${pyPath}",
+ "symbol"="map_ret_int_int_test.evaluate",
+ "type"="PYTHON_UDF",
+ "always_nullable" = "true",
+ "runtime_version" = "${runtime_version}"
+ );
+ """
+
+ sql """
+ CREATE FUNCTION retss(map<String,String>) RETURNS map<String,String> PROPERTIES (
+ "file"="file://${pyPath}",
+ "symbol"="map_ret_string_string_test.evaluate",
+ "type"="PYTHON_UDF",
+ "always_nullable" = "true",
+ "runtime_version" = "${runtime_version}"
+ );
+ """
+
+ sql """
+ CREATE FUNCTION retid(map<int,Double>) RETURNS map<int,Double> PROPERTIES (
+ "file"="file://${pyPath}",
+ "symbol"="map_ret_int_double_test.evaluate",
+ "type"="PYTHON_UDF",
+ "always_nullable" = "true",
+ "runtime_version" = "${runtime_version}"
+ );
+ """
+
+ sql """
+ CREATE FUNCTION retidss(int, double) RETURNS map<String,String> PROPERTIES (
+ "file"="file://${pyPath}",
+ "symbol"="map_int_double_ret_string_string_test.evaluate",
+ "type"="PYTHON_UDF",
+ "always_nullable" = "true",
+ "runtime_version" = "${runtime_version}"
+ );
+ """
+
+ qt_select_1 """ select mid, retid(mid) from db order by id; """
+
+ qt_select_2 """ select mii, retii(mii) from db order by id; """
+
+ qt_select_3 """ select i, d, retidss(i, d) from db order by id; """
+
+ qt_select_4 """ select m, retss(m) from dbss order by id; """
+ } finally {
+ try_sql("DROP FUNCTION IF EXISTS retii(map<int,int>);")
+ try_sql("DROP FUNCTION IF EXISTS retss(map<String,String>);")
+ try_sql("DROP FUNCTION IF EXISTS retid(map<int,Double>);")
+ try_sql("DROP FUNCTION IF EXISTS retidss(int ,double);")
+ try_sql("DROP TABLE IF EXISTS db")
+ try_sql("DROP TABLE IF EXISTS dbss")
+ }
+}
diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_runtime_version.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_runtime_version.groovy
new file mode 100644
index 0000000..675942f
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/test_pythonudf_runtime_version.groovy
@@ -0,0 +1,110 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_pythonudf_runtime_version") {
+ // Test different configurations of runtime_version parameter
+
+ // Temporarily disabled; remove this early return to re-enable the suite
+ return
+
+ try {
+ // Test 1: Specify short version number (x.xx format) with inline code
+ sql """ DROP FUNCTION IF EXISTS py_version_test_short(INT); """
+ sql """
+ CREATE FUNCTION py_version_test_short(INT)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "runtime_version" = "3.12"
+ )
+ AS \$\$
+def evaluate(x):
+ if x is None:
+ return None
+ return x * 2
+\$\$;
+ """
+
+ qt_select_version_short """ SELECT py_version_test_short(21) AS result; """
+
+ // Test 2: Specify full version number (x.xx.xx format) with inline code
+ sql """ DROP FUNCTION IF EXISTS py_version_test_full(INT); """
+ sql """
+ CREATE FUNCTION py_version_test_full(INT)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "runtime_version" = "3.12.10"
+ )
+ AS \$\$
+def evaluate(x):
+ if x is None:
+ return None
+ return x * 3
+\$\$;
+ """
+
+ qt_select_version_full """ SELECT py_version_test_full(10) AS result; """
+
+ // Test 3: Do not specify runtime_version (use default)
+ sql """ DROP FUNCTION IF EXISTS py_version_test_default(INT); """
+ sql """
+ CREATE FUNCTION py_version_test_default(INT)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate"
+ )
+ AS \$\$
+def evaluate(x):
+ if x is None:
+ return None
+ return x + 100
+\$\$;
+ """
+
+ qt_select_version_default """ SELECT py_version_test_default(50) AS result; """
+
+ // Test 4: String function with runtime_version
+ sql """ DROP FUNCTION IF EXISTS py_version_string_test(STRING); """
+ sql """
+ CREATE FUNCTION py_version_string_test(STRING)
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "runtime_version" = "3.12"
+ )
+ AS \$\$
+def evaluate(s):
+ if s is None:
+ return None
+ return s.upper()
+\$\$;
+ """
+
+ qt_select_version_string """ SELECT py_version_string_test('hello') AS result; """
+
+ } finally {
+ try_sql("DROP FUNCTION IF EXISTS py_version_test_short(INT);")
+ try_sql("DROP FUNCTION IF EXISTS py_version_test_full(INT);")
+ try_sql("DROP FUNCTION IF EXISTS py_version_test_default(INT);")
+ try_sql("DROP FUNCTION IF EXISTS py_version_string_test(STRING);")
+ }
+}
diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_schema_check.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_schema_check.groovy
new file mode 100644
index 0000000..07d6183
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/test_pythonudf_schema_check.groovy
@@ -0,0 +1,544 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_pythonudf_schema_check") {
+ // Test type compatibility in Python UDF
+ // Callers may pass compatible types instead of exactly matching types
+ // For example: TINYINT can be used where INT is expected
+
+ def runtime_version = "3.10.12"
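+
+ // For the vectorized cases below, type promotion is assumed to happen before
+ // the batch reaches Python, so the UDF sees one consistent dtype. A sketch of
+ // what the framework conceptually does (hypothetical helper, not this PR's API):
+ //
+ //   import pandas as pd
+ //
+ //   def promote(col: pd.Series, target: str) -> pd.Series:
+ //       # e.g. a TINYINT batch cast up to the declared INT argument type
+ //       return col.astype(target)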
+
+ try {
+ // Create test table with various integer types
+ sql """ DROP TABLE IF EXISTS test_type_compat_table; """
+ sql """
+ CREATE TABLE test_type_compat_table (
+ id INT,
+ tiny_val TINYINT,
+ small_val SMALLINT,
+ int_val INT,
+ big_val BIGINT,
+ float_val FLOAT,
+ double_val DOUBLE,
+ str_val STRING,
+ bool_val BOOLEAN,
+ date_val DATE
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 3
+ PROPERTIES("replication_num" = "1");
+ """
+
+ // Insert test data
+ sql """
+ INSERT INTO test_type_compat_table VALUES
+ (1, 10, 100, 1000, 10000, 1.5, 10.5, 'test1', true, '2024-01-01'),
+ (2, 20, 200, 2000, 20000, 2.5, 20.5, 'test2', false, '2024-01-02'),
+ (3, 30, 300, 3000, 30000, 3.5, 30.5, 'test3', true, '2024-01-03'),
+ (4, 40, 400, 4000, 40000, 4.5, 40.5, 'test4', false, '2024-01-04'),
+ (5, 50, 500, 5000, 50000, 5.5, 50.5, 'test5', true, '2024-01-05');
+ """
+
+ // ==================== Test 1: Integer Type Promotion (TINYINT -> INT) ====================
+ log.info("=== Test 1: TINYINT can be used where INT is expected ===")
+
+ sql """ DROP FUNCTION IF EXISTS py_add_int(INT, INT); """
+ sql """
+ CREATE FUNCTION py_add_int(INT, INT)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "py_add_int",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+def py_add_int(a, b):
+ if a is None or b is None:
+ return None
+ return a + b
+\$\$;
+ """
+
+ // Pass TINYINT where INT is expected
+ qt_select_1 """
+ SELECT
+ id,
+ tiny_val,
+ int_val,
+ py_add_int(tiny_val, int_val) AS result
+ FROM test_type_compat_table
+ ORDER BY id;
+ """
+
+ // ==================== Test 2: Integer Type Promotion (SMALLINT -> INT) ====================
+ log.info("=== Test 2: SMALLINT can be used where INT is expected ===")
+
+ qt_select_2 """
+ SELECT
+ id,
+ small_val,
+ int_val,
+ py_add_int(small_val, int_val) AS result
+ FROM test_type_compat_table
+ ORDER BY id;
+ """
+
+ // ==================== Test 3: Integer Type Promotion (INT -> BIGINT) ====================
+ log.info("=== Test 3: INT can be used where BIGINT is expected ===")
+
+ sql """ DROP FUNCTION IF EXISTS py_add_bigint(BIGINT, BIGINT); """
+ sql """
+ CREATE FUNCTION py_add_bigint(BIGINT, BIGINT)
+ RETURNS BIGINT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "py_add_bigint",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+def py_add_bigint(a, b):
+ return a + b
+\$\$;
+ """
+
+ qt_select_3 """
+ SELECT
+ id,
+ int_val,
+ big_val,
+ py_add_bigint(int_val, big_val) AS result
+ FROM test_type_compat_table
+ ORDER BY id;
+ """
+
+ // ==================== Test 4: Float Type Promotion (FLOAT -> DOUBLE) ====================
+ log.info("=== Test 4: FLOAT can be used where DOUBLE is expected ===")
+
+ sql """ DROP FUNCTION IF EXISTS py_add_double(DOUBLE, DOUBLE); """
+ sql """
+ CREATE FUNCTION py_add_double(DOUBLE, DOUBLE)
+ RETURNS DOUBLE
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "py_add_double",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+def py_add_double(a, b):
+ return a + b
+\$\$;
+ """
+
+ qt_select_4 """
+ SELECT
+ id,
+ float_val,
+ double_val,
+ py_add_double(float_val, double_val) AS result
+ FROM test_type_compat_table
+ ORDER BY id;
+ """
+
+ // ==================== Test 5: Mixed Integer Types ====================
+ log.info("=== Test 5: Mixed integer types (TINYINT, SMALLINT, INT) ===")
+
+ sql """ DROP FUNCTION IF EXISTS py_sum_three(INT, INT, INT); """
+ sql """
+ CREATE FUNCTION py_sum_three(INT, INT, INT)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "py_sum_three",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+def py_sum_three(a, b, c):
+ return a + b + c
+\$\$;
+ """
+
+ qt_select_5 """
+ SELECT
+ id,
+ tiny_val,
+ small_val,
+ int_val,
+ py_sum_three(tiny_val, small_val, int_val) AS result
+ FROM test_type_compat_table
+ ORDER BY id;
+ """
+
+ // ==================== Test 6: Vectorized UDF with Type Promotion ====================
+ log.info("=== Test 6: Vectorized UDF with integer type promotion ===")
+
+ sql """ DROP FUNCTION IF EXISTS py_vec_multiply(INT, INT); """
+ sql """
+ CREATE FUNCTION py_vec_multiply(INT, INT)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "py_vec_multiply",
+ "runtime_version" = "${runtime_version}",
+ "vectorized" = "true"
+ )
+ AS \$\$
+import pandas as pd
+
+def py_vec_multiply(a: pd.Series, b: pd.Series) -> pd.Series:
+ return a * b
+\$\$;
+ """
+
+ // Use TINYINT and SMALLINT where INT is expected
+ qt_select_6 """
+ SELECT
+ id,
+ tiny_val,
+ small_val,
+ py_vec_multiply(tiny_val, small_val) AS result
+ FROM test_type_compat_table
+ ORDER BY id;
+ """
+
+ // ==================== Test 7: Vectorized UDF with Float Promotion ====================
+ log.info("=== Test 7: Vectorized UDF with float type promotion ===")
+
+ sql """ DROP FUNCTION IF EXISTS py_vec_divide(DOUBLE, DOUBLE); """
+ sql """
+ CREATE FUNCTION py_vec_divide(DOUBLE, DOUBLE)
+ RETURNS DOUBLE
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "py_vec_divide",
+ "runtime_version" = "${runtime_version}",
+ "vectorized" = "true"
+ )
+ AS \$\$
+import pandas as pd
+
+def py_vec_divide(a: pd.Series, b: pd.Series) -> pd.Series:
+ return a / b
+\$\$;
+ """
+
+ // Use FLOAT where DOUBLE is expected
+ qt_select_7 """
+ SELECT
+ id,
+ float_val,
+ double_val,
+ py_vec_divide(double_val, float_val) AS result
+ FROM test_type_compat_table
+ ORDER BY id;
+ """
+
+ // ==================== Test 8: Mixed Types in Vectorized UDF ====================
+ log.info("=== Test 8: Mixed integer and float types in vectorized UDF ===")
+
+ sql """ DROP FUNCTION IF EXISTS py_vec_calc(DOUBLE, DOUBLE); """
+ sql """
+ CREATE FUNCTION py_vec_calc(DOUBLE, DOUBLE)
+ RETURNS DOUBLE
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "py_vec_calc",
+ "runtime_version" = "${runtime_version}",
+ "vectorized" = "true"
+ )
+ AS \$\$
+import pandas as pd
+
+def py_vec_calc(a: pd.Series, b: pd.Series) -> pd.Series:
+ return a * 2.0 + b
+\$\$;
+ """
+
+ // Use INT and FLOAT where DOUBLE is expected
+ qt_select_8 """
+ SELECT
+ id,
+ int_val,
+ float_val,
+ py_vec_calc(int_val, float_val) AS result
+ FROM test_type_compat_table
+ ORDER BY id;
+ """
+
+ // ==================== Test 9: String Type Compatibility ====================
+ log.info("=== Test 9: String type compatibility ===")
+
+ sql """ DROP FUNCTION IF EXISTS py_string_upper(STRING); """
+ sql """
+ CREATE FUNCTION py_string_upper(STRING)
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "py_string_upper",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+def py_string_upper(s):
+ return s.upper() if s else None
+\$\$;
+ """
+
+ qt_select_9 """
+ SELECT
+ id,
+ str_val,
+ py_string_upper(str_val) AS upper_str
+ FROM test_type_compat_table
+ ORDER BY id;
+ """
+
+ // ==================== Test 10: Boolean Type ====================
+ log.info("=== Test 10: Boolean type compatibility ===")
+
+ sql """ DROP FUNCTION IF EXISTS py_bool_not(BOOLEAN); """
+ sql """
+ CREATE FUNCTION py_bool_not(BOOLEAN)
+ RETURNS BOOLEAN
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "py_bool_not",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+def py_bool_not(b):
+ return not b if b is not None else None
+\$\$;
+ """
+
+ qt_select_10 """
+ SELECT
+ id,
+ bool_val,
+ py_bool_not(bool_val) AS negated
+ FROM test_type_compat_table
+ ORDER BY id;
+ """
+
+ // ==================== Test 11: Complex Type Promotion Chain ====================
+ log.info("=== Test 11: Complex type promotion chain (TINYINT -> BIGINT) ===")
+
+ qt_select_11 """
+ SELECT
+ id,
+ tiny_val,
+ big_val,
+ py_add_bigint(tiny_val, big_val) AS result
+ FROM test_type_compat_table
+ ORDER BY id;
+ """
+
+ // ==================== Test 12: Vectorized with Mixed Scalar and Series ====================
+ log.info("=== Test 12: Vectorized UDF with type promotion and mixed params ===")
+
+ sql """ DROP FUNCTION IF EXISTS py_vec_scale(DOUBLE, DOUBLE); """
+ sql """
+ CREATE FUNCTION py_vec_scale(DOUBLE, DOUBLE)
+ RETURNS DOUBLE
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "py_vec_scale",
+ "runtime_version" = "${runtime_version}",
+ "vectorized" = "true"
+ )
+ AS \$\$
+import pandas as pd
+
+def py_vec_scale(values: pd.Series, factor: float) -> pd.Series:
+ return values * factor
+\$\$;
+ """
+
+ // Use INT (promoted to DOUBLE) with scalar FLOAT
+ qt_select_12 """
+ SELECT
+ id,
+ int_val,
+ py_vec_scale(int_val, 1.5) AS scaled
+ FROM test_type_compat_table
+ ORDER BY id;
+ """
+
+ // ==================== Test 13: Type Incompatibility - STRING to INT ====================
+ log.info("=== Test 13: Type incompatibility - STRING cannot be used where INT is expected ===")
+
+ qt_select_13 """
+ SELECT
+ id,
+ str_val,
+ py_add_int(str_val, int_val) AS result
+ FROM test_type_compat_table
+ ORDER BY id
+ LIMIT 1;
+ """
+
+ // ==================== Test 14: Type Incompatibility - BIGINT to INT ====================
+ log.info("=== Test 14: Type incompatibility - BIGINT cannot be downcast to INT ===")
+
+ qt_select_14 """
+ SELECT
+ id,
+ big_val,
+ py_add_int(big_val, int_val) AS result
+ FROM test_type_compat_table
+ ORDER BY id
+ LIMIT 1;
+ """
+
+ // ==================== Test 15: Type Incompatibility - DOUBLE to FLOAT ====================
+ log.info("=== Test 15: Type incompatibility - DOUBLE cannot be downcast to FLOAT ===")
+
+ sql """ DROP FUNCTION IF EXISTS py_add_float(FLOAT, FLOAT); """
+ sql """
+ CREATE FUNCTION py_add_float(FLOAT, FLOAT)
+ RETURNS FLOAT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "py_add_float",
+ "runtime_version" = "${runtime_version}"
+ )
+ AS \$\$
+def py_add_float(a, b):
+ return a + b
+\$\$;
+ """
+
+ qt_select_15 """
+ SELECT
+ id,
+ double_val,
+ py_add_float(double_val, float_val) AS result
+ FROM test_type_compat_table
+ ORDER BY id
+ LIMIT 1;
+ """
+
+ // ==================== Test 16: Type Incompatibility - BOOLEAN to INT ====================
+ log.info("=== Test 16: Type incompatibility - BOOLEAN cannot be used where INT is expected ===")
+
+ qt_select_16 """
+ SELECT
+ id,
+ bool_val,
+ py_add_int(bool_val, int_val) AS result
+ FROM test_type_compat_table
+ ORDER BY id
+ LIMIT 1;
+ """
+
+ // ==================== Test 17: Type Incompatibility - DATE to STRING ====================
+ log.info("=== Test 17: Type incompatibility - DATE cannot be directly used where STRING is expected ===")
+
+ qt_select_17 """
+ SELECT
+ id,
+ date_val,
+ py_string_upper(date_val) AS result
+ FROM test_type_compat_table
+ ORDER BY id
+ LIMIT 1;
+ """
+
+ // ==================== Test 18: Type Incompatibility - INT to BOOLEAN ====================
+ log.info("=== Test 18: Type incompatibility - INT cannot be used where BOOLEAN is expected ===")
+
+ qt_select_18 """
+ SELECT
+ id,
+ int_val,
+ py_bool_not(int_val) AS result
+ FROM test_type_compat_table
+ ORDER BY id
+ LIMIT 1;
+ """
+
+ // ==================== Test 19: Type Incompatibility in Vectorized UDF - STRING to INT ====================
+ log.info("=== Test 19: Type incompatibility in vectorized UDF - STRING to INT ===")
+
+ qt_select_19 """
+ SELECT
+ id,
+ str_val,
+ py_vec_multiply(str_val, int_val) AS result
+ FROM test_type_compat_table
+ ORDER BY id
+ LIMIT 1;
+ """
+
+ // ==================== Test 20: Type Incompatibility - Mixed incompatible types ====================
+ log.info("=== Test 20: Type incompatibility - Mixed incompatible types ===")
+
+ qt_select_20 """
+ SELECT
+ id,
+ str_val,
+ bool_val,
+ py_add_int(str_val, bool_val) AS result
+ FROM test_type_compat_table
+ ORDER BY id
+ LIMIT 1;
+ """
+
+ // ==================== Test 21: Wrong number of arguments ====================
+ log.info("=== Test 21: Wrong number of arguments ===")
+
+ test {
+ sql """
+ SELECT
+ id,
+ py_add_int(int_val) AS result
+ FROM test_type_compat_table
+ ORDER BY id
+ LIMIT 1;
+ """
+ exception "Can not found function 'py_add_int' which has 1 arity. Candidate functions are: [py_add_int(INT, INT)]"
+ }
+
+ // ==================== Test 22: Type Incompatibility - FLOAT to INT ====================
+ log.info("=== Test 22: Type incompatibility - FLOAT cannot be used where INT is expected ===")
+
+ qt_select_22 """
+ SELECT
+ id,
+ float_val,
+ py_add_int(float_val, int_val) AS result
+ FROM test_type_compat_table
+ ORDER BY id
+ LIMIT 1;
+ """
+
+ log.info("All type compatibility tests (including negative tests) passed!")
+
+ } finally {
+ // Cleanup
+ sql """ DROP FUNCTION IF EXISTS py_add_int(INT, INT); """
+ sql """ DROP FUNCTION IF EXISTS py_add_bigint(BIGINT, BIGINT); """
+ sql """ DROP FUNCTION IF EXISTS py_add_double(DOUBLE, DOUBLE); """
+ sql """ DROP FUNCTION IF EXISTS py_add_float(FLOAT, FLOAT); """
+ sql """ DROP FUNCTION IF EXISTS py_sum_three(INT, INT, INT); """
+ sql """ DROP FUNCTION IF EXISTS py_vec_multiply(INT, INT); """
+ sql """ DROP FUNCTION IF EXISTS py_vec_divide(DOUBLE, DOUBLE); """
+ sql """ DROP FUNCTION IF EXISTS py_vec_calc(DOUBLE, DOUBLE); """
+ sql """ DROP FUNCTION IF EXISTS py_string_upper(STRING); """
+ sql """ DROP FUNCTION IF EXISTS py_bool_not(BOOLEAN); """
+ sql """ DROP FUNCTION IF EXISTS py_vec_scale(DOUBLE, DOUBLE); """
+ sql """ DROP TABLE IF EXISTS test_type_compat_table; """
+ }
+}
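
For reference, the inline `AS $$ ... $$` body registered in Test 15 is an ordinary Python function. A minimal local sanity check (a sketch run outside Doris, with made-up inputs) could be:

```python
# Standalone equivalent of the inline body from Test 15 (hypothetical local check).
def py_add_float(a, b):
    return a + b

# FLOAT addition: 1.5 + 2.25 == 3.75, checked with a small tolerance.
assert abs(py_add_float(1.5, 2.25) - 3.75) < 1e-9
```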
diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_stress.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_stress.groovy
new file mode 100644
index 0000000..e3b0ff8
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/test_pythonudf_stress.groovy
@@ -0,0 +1,314 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_pythonudf_stress") {
+ // Stress test for Python UDF - configurable dataset size
+ // This test is designed to push Python UDF to its limits
+
+ def scalarPyPath = """${context.file.parent}/udf_scripts/python_udf_scalar_ops.zip"""
+ def vectorPyPath = """${context.file.parent}/udf_scripts/python_udf_vector_ops.zip"""
+ scp_udf_file_to_all_be(scalarPyPath)
+ scp_udf_file_to_all_be(vectorPyPath)
+ def runtime_version = "3.10.12"
+
+ sql "CREATE DATABASE IF NOT EXISTS test_pythonudf_stress"
+ sql "USE test_pythonudf_stress"
+
+ // Configuration: Adjust these for different stress levels
+ def TOTAL_ROWS = 5000000 // 5 million rows (change to 10M, 50M for extreme stress)
+ def BATCH_SIZE = 50000 // Insert batch size
+ def BUCKETS = 32 // Number of buckets for distribution
+
+ log.info("\n" + "=" * 80 + "\nPYTHON UDF STRESS TEST\n" + "=" * 80 + "\nConfiguration:\n" +
+ " Total Rows: ${TOTAL_ROWS}\n" +
+ " Batch Size: ${BATCH_SIZE}\n" +
+ " Buckets: ${BUCKETS}\n" +
+ "=" * 80)
+
+ try {
+ // ==================== Create Stress Test Table ====================
+ sql """ DROP TABLE IF EXISTS python_udf_stress_table; """
+ sql """
+ CREATE TABLE python_udf_stress_table (
+ id BIGINT,
+ category INT,
+ value1 INT,
+ value2 INT,
+ price DOUBLE,
+ discount DOUBLE,
+ name STRING,
+ description STRING,
+ email STRING,
+ is_active BOOLEAN,
+ created_date DATE
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS ${BUCKETS}
+ PROPERTIES("replication_num" = "1");
+ """
+
+ log.info("Loading ${TOTAL_ROWS} rows using streamLoad from CSV file...")
+ def loadStartTime = System.currentTimeMillis()
+
+ streamLoad {
+ db 'test_pythonudf_stress'
+ table "python_udf_stress_table"
+ set 'column_separator', '\t'
+ file 'benchmark_data_5m.csv'
+ time 300000 // 300 seconds (5 minutes) timeout
+
+ check { result, exception, startTime, endTime ->
+ if (exception != null) {
+ throw exception
+ }
+ log.info("Stream load result: ${result}".toString())
+ def json = parseJson(result)
+ assertEquals("success", json.Status.toLowerCase())
+ assertEquals(json.NumberTotalRows, json.NumberLoadedRows)
+ assertTrue(json.NumberLoadedRows > 0 && json.LoadBytes > 0)
+ }
+ }
+
+ def loadEndTime = System.currentTimeMillis()
+ log.info("Data loaded in ${loadEndTime - loadStartTime} ms (${String.format('%.2f', TOTAL_ROWS / ((loadEndTime - loadStartTime) / 1000.0))} rows/sec)")
+
+ sql "sync"
+
+ def rowCount = sql "SELECT COUNT(*) FROM python_udf_stress_table"
+ log.info("Verified row count: ${rowCount[0][0]}")
+
+ // ==================== Define UDFs ====================
+
+ // Scalar UDF - Simple
+ sql """ DROP FUNCTION IF EXISTS py_calc_final_price(DOUBLE, DOUBLE); """
+ sql """
+ CREATE FUNCTION py_calc_final_price(DOUBLE, DOUBLE)
+ RETURNS DOUBLE
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${scalarPyPath}",
+ "symbol" = "python_udf_scalar_ops.calculate_discount_price",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ // Scalar UDF - Complex
+ sql """ DROP FUNCTION IF EXISTS py_extract_domain(STRING); """
+ sql """
+ CREATE FUNCTION py_extract_domain(STRING)
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${scalarPyPath}",
+ "symbol" = "python_udf_scalar_ops.extract_domain",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ // Vector UDF - Simple
+ sql """ DROP FUNCTION IF EXISTS py_vec_multiply(INT, INT); """
+ sql """
+ CREATE FUNCTION py_vec_multiply(INT, INT)
+ RETURNS INT
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${vectorPyPath}",
+ "symbol" = "python_udf_vector_ops.multiply_by_constant",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ // Vector UDF - String
+ sql """ DROP FUNCTION IF EXISTS py_vec_upper(STRING); """
+ sql """
+ CREATE FUNCTION py_vec_upper(STRING)
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "file" = "file://${vectorPyPath}",
+ "symbol" = "python_udf_vector_ops.to_uppercase",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ );
+ """
+
+ log.info("=" * 80)
+ log.info("STRESS TEST EXECUTION")
+ log.info("=" * 80)
+
+ // ==================== Stress Test 1: Full Table Scan with Scalar UDF ====================
+ log.info("Test 1: Full table scan with scalar UDF (${TOTAL_ROWS} rows)")
+
+ def test1Start = System.currentTimeMillis()
+ def result1 = sql """
+ SELECT COUNT(*), AVG(final_price)
+ FROM (
+ SELECT id, py_calc_final_price(price, discount) AS final_price
+ FROM python_udf_stress_table
+ ) t;
+ """
+ def test1End = System.currentTimeMillis()
+ def test1Time = test1End - test1Start
+ log.info(" Result: ${result1[0]}")
+ log.info(" Time: ${test1Time} ms")
+ log.info(" Throughput: ${String.format('%.2f', TOTAL_ROWS / (test1Time / 1000.0))} rows/sec")
+
+ // ==================== Stress Test 2: Full Table Scan with Vector UDF ====================
+ log.info("Test 2: Full table scan with vector UDF (${TOTAL_ROWS} rows)")
+
+ def test2Start = System.currentTimeMillis()
+ def result2 = sql """
+ SELECT COUNT(*), AVG(result)
+ FROM (
+ SELECT id, py_vec_multiply(value1, 10) AS result
+ FROM python_udf_stress_table
+ ) t;
+ """
+ def test2End = System.currentTimeMillis()
+ def test2Time = test2End - test2Start
+ log.info(" Result: ${result2[0]}")
+ log.info(" Time: ${test2Time} ms")
+ log.info(" Throughput: ${String.format('%.2f', TOTAL_ROWS / (test2Time / 1000.0))} rows/sec")
+ log.info(" Speedup vs Scalar: ${String.format('%.2f', test1Time / (test2Time * 1.0))}x")
+
+ // ==================== Stress Test 3: String Processing with Vector UDF ====================
+ log.info("Test 3: String processing with vector UDF (${TOTAL_ROWS} rows)")
+
+ def test3Start = System.currentTimeMillis()
+ def result3 = sql """
+ SELECT COUNT(DISTINCT upper_name)
+ FROM (
+ SELECT py_vec_upper(name) AS upper_name
+ FROM python_udf_stress_table
+ ) t;
+ """
+ def test3End = System.currentTimeMillis()
+ def test3Time = test3End - test3Start
+ log.info(" Result: ${result3[0]}")
+ log.info(" Time: ${test3Time} ms")
+ log.info(" Throughput: ${String.format('%.2f', TOTAL_ROWS / (test3Time / 1000.0))} rows/sec")
+
+ // ==================== Stress Test 4: Complex Aggregation ====================
+ log.info("Test 4: Complex aggregation with multiple UDFs")
+
+ def test4Start = System.currentTimeMillis()
+ def result4 = sql """
+ SELECT
+ category,
+ COUNT(*) AS cnt,
+ AVG(py_calc_final_price(price, discount)) AS avg_final_price,
+ AVG(py_vec_multiply(value1, 10)) AS avg_multiplied
+ FROM python_udf_stress_table
+ GROUP BY category
+ ORDER BY category
+ LIMIT 20;
+ """
+ def test4End = System.currentTimeMillis()
+ def test4Time = test4End - test4Start
+ log.info(" Processed ${result4.size()} groups")
+ log.info(" Time: ${test4Time} ms")
+
+ // ==================== Stress Test 5: Join with UDF ====================
+ log.info("Test 5: Self-join with UDF (limited to 1M rows)")
+
+ def test5Start = System.currentTimeMillis()
+ def result5 = sql """
+ SELECT COUNT(*)
+ FROM (
+ SELECT
+ a.id,
+ py_vec_multiply(a.value1, b.value2) AS result
+ FROM python_udf_stress_table a
+ JOIN python_udf_stress_table b ON a.category = b.category
+ WHERE a.id < 100000 AND b.id < 100000
+ ) t;
+ """
+ def test5End = System.currentTimeMillis()
+ def test5Time = test5End - test5Start
+ log.info(" Result: ${result5[0]}")
+ log.info(" Time: ${test5Time} ms")
+
+ // ==================== Stress Test 6: Concurrent UDF Calls ====================
+ log.info("Test 6: Multiple UDFs in single query")
+
+ def test6Start = System.currentTimeMillis()
+ def result6 = sql """
+ SELECT COUNT(*)
+ FROM (
+ SELECT
+ id,
+ py_calc_final_price(price, discount) AS final_price,
+ py_extract_domain(email) AS domain,
+ py_vec_multiply(value1, 5) AS vec_result,
+ py_vec_upper(name) AS upper_name
+ FROM python_udf_stress_table
+ LIMIT 500000
+ ) t;
+ """
+ def test6End = System.currentTimeMillis()
+ def test6Time = test6End - test6Start
+ log.info(" Result: ${result6[0]}")
+ log.info(" Time: ${test6Time} ms")
+ log.info(" Throughput: ${String.format('%.2f', 500000 / (test6Time / 1000.0))} rows/sec")
+
+ // ==================== Stress Test 7: Filter with UDF ====================
+ log.info("Test 7: Filter with UDF predicate")
+
+ def test7Start = System.currentTimeMillis()
+ def result7 = sql """
+ SELECT COUNT(*)
+ FROM python_udf_stress_table
+ WHERE py_vec_multiply(value1, 2) > 1000;
+ """
+ def test7End = System.currentTimeMillis()
+ def test7Time = test7End - test7Start
+ log.info(" Result: ${result7[0]}")
+ log.info(" Time: ${test7Time} ms")
+
+ // ==================== Final Summary ====================
+ log.info("=" * 80 + "\nSTRESS TEST SUMMARY\n" + "=" * 80 + "\nDataset: ${TOTAL_ROWS} rows\n\n" +
+ "Test Results:\n" +
+ " 1. Scalar UDF full scan: ${test1Time} ms (${String.format('%.2f', TOTAL_ROWS / (test1Time / 1000.0))} rows/sec)\n" +
+ " 2. Vector UDF full scan: ${test2Time} ms (${String.format('%.2f', TOTAL_ROWS / (test2Time / 1000.0))} rows/sec)\n" +
+ " 3. Vector string processing: ${test3Time} ms (${String.format('%.2f', TOTAL_ROWS / (test3Time / 1000.0))} rows/sec)\n" +
+ " 4. Complex aggregation: ${test4Time} ms\n" +
+ " 5. Join with UDF: ${test5Time} ms\n" +
+ " 6. Multiple UDFs: ${test6Time} ms (${String.format('%.2f', 500000 / (test6Time / 1000.0))} rows/sec)\n" +
+ " 7. Filter with UDF: ${test7Time} ms\n\n" +
+ "Performance Metrics:\n" +
+ " Vector vs Scalar speedup: ${String.format('%.2fx', test1Time / (test2Time * 1.0))}\n" +
+ " Total test time: ${(test7End - test1Start) / 1000.0} seconds\n" +
+ "=" * 80)
+
+ } finally {
+ // Cleanup
+ log.info("Cleaning up stress test resources...")
+
+ try_sql("DROP FUNCTION IF EXISTS py_calc_final_price(DOUBLE, DOUBLE);")
+ try_sql("DROP FUNCTION IF EXISTS py_extract_domain(STRING);")
+ try_sql("DROP FUNCTION IF EXISTS py_vec_multiply(INT, INT);")
+ try_sql("DROP FUNCTION IF EXISTS py_vec_upper(STRING);")
+
+ try_sql("DROP TABLE IF EXISTS python_udf_stress_table;")
+ try_sql("DROP DATABASE IF EXISTS test_pythonudf_stress;")
+ log.info("Stress test cleanup completed.")
+ }
+}
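
The suite stream-loads `benchmark_data_5m.csv`, which is not included in this patch. A hypothetical generator for a file of the same shape (11 tab-separated columns matching `python_udf_stress_table`; the real fixture's value distributions are unknown, so everything below is an assumption for illustration) might look like:

```python
# Hypothetical generator for a file shaped like benchmark_data_5m.csv.
# Column order and types follow the CREATE TABLE in the suite above;
# the actual fixture's contents are not part of this patch.
import random

def make_row(i):
    return "\t".join([
        str(i),                             # id           BIGINT
        str(i % 100),                       # category     INT
        str(random.randint(0, 10000)),      # value1       INT
        str(random.randint(0, 10000)),      # value2       INT
        f"{random.uniform(1, 1000):.2f}",   # price        DOUBLE
        f"{random.uniform(0, 50):.2f}",     # discount     DOUBLE
        f"name_{i % 1000}",                 # name         STRING
        f"description for row {i}",         # description  STRING
        f"user{i}@example.com",             # email        STRING
        str(i % 2),                         # is_active    BOOLEAN (0/1)
        "2024-01-01",                       # created_date DATE
    ])

with open("benchmark_data_5m.csv", "w") as f:
    for i in range(5_000_000):
        f.write(make_row(i) + "\n")
```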
diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_string.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_string.groovy
new file mode 100644
index 0000000..358023d
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/test_pythonudf_string.groovy
@@ -0,0 +1,90 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_pythonudf_string") {
+ def tableName = "test_pythonudf_string"
+ def pyPath = """${context.file.parent}/udf_scripts/pyudf.zip"""
+ scp_udf_file_to_all_be(pyPath)
+ def runtime_version = "3.10.12"
+ log.info("Python Zip path: ${pyPath}".toString())
+ try {
+ sql """ DROP TABLE IF EXISTS test_pythonudf_string """
+ sql """ DROP TABLE IF EXISTS test_pythonudf_string_2 """
+ sql """
+ CREATE TABLE IF NOT EXISTS test_pythonudf_string (
+ `user_id` INT NOT NULL COMMENT "user id",
+ `char_col` CHAR NOT NULL COMMENT "",
+ `varchar_col` VARCHAR(10) NOT NULL COMMENT "",
+ `string_col` STRING NOT NULL COMMENT ""
+ )
+ DISTRIBUTED BY HASH(user_id) PROPERTIES("replication_num" = "1");
+ """
+ StringBuilder sb = new StringBuilder()
+ int i = 1
+ for (; i < 9; i++) {
+ sb.append("""
+ (${i}, '${i}','abcdefg${i}','poiuytre${i}abcdefg'),
+ """)
+ }
+ sb.append("""
+ (${i}, '${i}','abcdefg${i}','poiuytre${i}abcdefg')
+ """)
+ sql """ INSERT INTO test_pythonudf_string VALUES
+ ${sb.toString()}
+ """
+ sql """ create table test_pythonudf_string_2 like test_pythonudf_string """
+ sql """ insert into test_pythonudf_string_2 select * from test_pythonudf_string; """
+ qt_select_default """ SELECT * FROM test_pythonudf_string t ORDER BY user_id; """
+ qt_select_default_2 """ SELECT * FROM test_pythonudf_string_2 t ORDER BY user_id; """
+
+ File path = new File(pyPath)
+ if (!path.exists()) {
+ throw new IllegalStateException("""${pyPath} doesn't exist! """)
+ }
+
+ sql """ CREATE FUNCTION python_udf_string_test(string, int, int) RETURNS string PROPERTIES (
+ "file"="file://${pyPath}",
+ "symbol"="string_test.evaluate",
+ "type"="PYTHON_UDF",
+ "always_nullable" = "true",
+ "runtime_version" = "${runtime_version}"
+ ); """
+
+ qt_select """ SELECT python_udf_string_test(varchar_col, 2, 3) result FROM test_pythonudf_string ORDER BY result; """
+ qt_select """ SELECT python_udf_string_test(string_col, 2, 3) result FROM test_pythonudf_string ORDER BY result; """
+ qt_select """ SELECT python_udf_string_test('abcdef', 2, 3), python_udf_string_test('abcdefg', 2, 3) result FROM test_pythonudf_string ORDER BY result; """
+
+ qt_select_4 """
+ SELECT
+ COALESCE(
+ python_udf_string_test(test_pythonudf_string.varchar_col, 2, 3),
+ 'not1'
+ ),
+ COALESCE(
+ python_udf_string_test(test_pythonudf_string.varchar_col, 2, 3),
+ 'not2'
+ )
+ FROM
+ test_pythonudf_string
+ JOIN test_pythonudf_string_2 ON test_pythonudf_string.user_id = test_pythonudf_string_2.user_id order by 1,2;
+ """
+ } finally {
+ try_sql("DROP FUNCTION IF EXISTS python_udf_string_test(string, int, int);")
+ try_sql("DROP TABLE IF EXISTS test_pythonudf_string")
+ try_sql("DROP TABLE IF EXISTS test_pythonudf_string_2")
+ }
+}
diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/array_int_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/array_int_test.py
new file mode 100644
index 0000000..ef30209
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/udf_scripts/array_int_test.py
@@ -0,0 +1,24 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+def evaluate(res):
+ value = 0
+ for data in res:
+ if data is not None:
+ value += data
+ return value
\ No newline at end of file
diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/array_return_array_int_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/array_return_array_int_test.py
new file mode 100644
index 0000000..7781d78
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/udf_scripts/array_return_array_int_test.py
@@ -0,0 +1,26 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+def evaluate(res):
+ value = 0
+ for data in res:
+ if data is not None:
+ value += data
+ result = []
+ result.append(value)
+ return result
\ No newline at end of file
diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/array_return_array_string_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/array_return_array_string_test.py
new file mode 100644
index 0000000..92864bc
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/udf_scripts/array_return_array_string_test.py
@@ -0,0 +1,26 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+def evaluate(res):
+ value = ""
+ for data in res:
+ if data is not None:
+ value += data
+ result = []
+ result.append(value)
+ return result
\ No newline at end of file
diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/array_string_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/array_string_test.py
new file mode 100644
index 0000000..ede02c1
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/udf_scripts/array_string_test.py
@@ -0,0 +1,24 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+def evaluate(res):
+ value = ""
+ for data in res:
+ if data is not None:
+ value += data
+ return value
\ No newline at end of file
diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/assert_equal_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/assert_equal_test.py
new file mode 100644
index 0000000..43501d1
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/udf_scripts/assert_equal_test.py
@@ -0,0 +1,23 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+def evaluate(val1, val2):
+ if val1 != val2:
+ raise RuntimeError("Assertion Not Met :: ! ( " + str(val1) + " == " + str(val2) + " )")
+ else:
+ return str(val1) + " == " + str(val2)
\ No newline at end of file
diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/assert_lessthan_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/assert_lessthan_test.py
new file mode 100644
index 0000000..b4ca8ff
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/udf_scripts/assert_lessthan_test.py
@@ -0,0 +1,25 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+def evaluate(smaller, bigger):
+ if smaller is None or bigger is None:
+ raise RuntimeError("Null values found :: " + str(smaller) + " < " + str(bigger))
+ if not (smaller < bigger):
+ raise RuntimeError("Assertion Not Met :: ! ( " + str(smaller) + " < " + str(bigger) + " )")
+ else:
+ return str(smaller) + " < " + str(bigger)
\ No newline at end of file
diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/boolean_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/boolean_test.py
new file mode 100644
index 0000000..b6443e3
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/udf_scripts/boolean_test.py
@@ -0,0 +1,23 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+def evaluate(arg):
+ if arg is True:
+ return False
+ else:
+ return True
\ No newline at end of file
diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/double_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/double_test.py
new file mode 100644
index 0000000..8667bc0
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/udf_scripts/double_test.py
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+def evaluate(arg1, arg2):
+ return arg1 + arg2
\ No newline at end of file
diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/float_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/float_test.py
new file mode 100644
index 0000000..3b2d726f
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/udf_scripts/float_test.py
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+def evaluate(arg1, arg2):
+ return arg1 - arg2
\ No newline at end of file
diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/int_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/int_test.py
new file mode 100644
index 0000000..b96f6b0
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/udf_scripts/int_test.py
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+def evaluate(arg):
+ return int(arg + 1)
\ No newline at end of file
diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/map_int_double_ret_string_string_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/map_int_double_ret_string_string_test.py
new file mode 100644
index 0000000..f8be7d9
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/udf_scripts/map_int_double_ret_string_string_test.py
@@ -0,0 +1,22 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+def evaluate(i, d):
+ ans = {}
+ ans["114" + str(i)] = "514" + str(d)
+ return ans
\ No newline at end of file
diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/map_int_int_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/map_int_int_test.py
new file mode 100644
index 0000000..87e27ec
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/udf_scripts/map_int_int_test.py
@@ -0,0 +1,23 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+def evaluate(hashMap):
+ mul = 0
+ for key, value in hashMap.items():
+ mul += key * value
+ return mul
\ No newline at end of file
diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/map_ret_int_double_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/map_ret_int_double_test.py
new file mode 100644
index 0000000..3fc0028
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/udf_scripts/map_ret_int_double_test.py
@@ -0,0 +1,23 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+def evaluate(mid):
+ ans = {}
+ for key, value in mid.items():
+ ans[key * 10] = value * 10
+ return ans
\ No newline at end of file
diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/map_ret_int_int_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/map_ret_int_int_test.py
new file mode 100644
index 0000000..5e57f3d
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/udf_scripts/map_ret_int_int_test.py
@@ -0,0 +1,23 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+def evaluate(mii):
+ ans = {}
+ for key, value in mii.items():
+ ans[key * 10] = value * 10
+ return ans
\ No newline at end of file
diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/map_ret_string_string_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/map_ret_string_string_test.py
new file mode 100644
index 0000000..b6eb3a3
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/udf_scripts/map_ret_string_string_test.py
@@ -0,0 +1,23 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+def evaluate(mp):
+ ans = {}
+ for key, value in mp.items():
+ ans[key + "114"] = value + "514"
+ return ans
\ No newline at end of file
diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/map_string_string_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/map_string_string_test.py
new file mode 100644
index 0000000..2121c92
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/udf_scripts/map_string_string_test.py
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+def evaluate(hashMap):
+ sb = []
+ sortSet = set()
+
+ for key, value in hashMap.items():
+ sortSet.add(key + value)
+
+ for item in sorted(sortSet):
+ sb.append(item)
+
+ ans = ''.join(sb)
+ return ans
\ No newline at end of file
diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_array_type.py b/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_array_type.py
new file mode 100644
index 0000000..6d8af80
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_array_type.py
@@ -0,0 +1,38 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+def array_to_csv_impl(int_arr, str_arr, nested_arr):
+ def safe_str(x):
+ return 'NULL' if x is None else str(x)
+
+ def format_array(arr):
+ if arr is None:
+ return 'NULL'
+ return '[' + ','.join(safe_str(item) for item in arr) + ']'
+
+ def format_nested_array(arr):
+ if arr is None:
+ return 'NULL'
+ return '[' + ','.join(format_array(inner) for inner in arr) + ']'
+
+ parts = [
+ format_array(int_arr),
+ format_array(str_arr),
+ format_nested_array(nested_arr)
+ ]
+ return '|'.join(parts)
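
A minimal local check of `array_to_csv_impl` (illustrative values, run outside Doris with the function above in scope):

```python
# ARRAY arguments arrive as Python lists; NULL elements arrive as None.
print(array_to_csv_impl([1, None, 3], ["a", "b"], [[1, 2], None]))
# -> [1,NULL,3]|[a,b]|[[1,2],NULL]
```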
diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_data_type.py b/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_data_type.py
new file mode 100644
index 0000000..4786e97
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_data_type.py
@@ -0,0 +1,33 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+def row_to_csv_all_impl(
+ bool_col, tinyint_col, smallint_col, int_col, bigint_col, largeint_col,
+ float_col, double_col, decimal32_col, decimal64_col, decimal128_col,
+ date_col, datetime_col, char_col, varchar_col, string_col
+):
+ cols = [
+ bool_col, tinyint_col, smallint_col, int_col, bigint_col, largeint_col,
+ float_col, double_col, decimal32_col, decimal64_col, decimal128_col,
+ date_col, datetime_col, char_col, varchar_col, string_col
+ ]
+
+ def safe_str(x):
+ return 'NULL' if x is None else str(x)
+
+ return ','.join(safe_str(col) for col in cols)
\ No newline at end of file
diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_map_type.py b/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_map_type.py
new file mode 100644
index 0000000..bd6f099
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_map_type.py
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+def map_to_csv_impl(map1, map2):
+ def safe_str(x):
+ return 'NULL' if x is None else str(x)
+
+ def format_map(m):
+ if m is None:
+ return 'NULL'
+ # Doris passes MAP as Python dict
+ items = [f"{safe_str(k)}:{safe_str(v)}" for k, v in m.items()]
+ return '{' + ','.join(sorted(items)) + '}'
+
+ return '|'.join([format_map(map1), format_map(map2)])
\ No newline at end of file
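
A minimal local check of `map_to_csv_impl` (run outside Doris). Per the in-code comment, Doris passes MAP values as Python dicts, and sorting the formatted entries keeps the output deterministic:

```python
# NULL map values arrive as None; a NULL map arrives as None itself.
print(map_to_csv_impl({"k1": "v1", "k2": None}, None))
# -> {k1:v1,k2:NULL}|NULL
```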
diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_module_test.zip b/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_module_test.zip
new file mode 100644
index 0000000..6dc6d95
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_module_test.zip
Binary files differ
diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_scalar_ops.py b/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_scalar_ops.py
new file mode 100644
index 0000000..95de4dc
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_scalar_ops.py
@@ -0,0 +1,413 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Scalar Python UDF operations - row-by-row processing
+"""
+
+import math
+import re
+from datetime import datetime, timedelta
+from decimal import Decimal
+
+
+# ==================== Numeric Operations ====================
+
+def add_three_numbers(a, b, c):
+ """Add three numbers"""
+ if a is None or b is None or c is None:
+ return None
+ return a + b + c
+
+
+def multiply_with_default(a, b, default=1):
+ """Multiply two numbers, return default if any is None"""
+ if a is None or b is None:
+ return default
+ return a * b
+
+
+def safe_divide_with_precision(numerator, denominator, precision=2):
+ """Safe division with specified decimal precision"""
+ if numerator is None or denominator is None or denominator == 0:
+ return None
+ result = numerator / denominator
+ return round(result, precision)
+
+
+def calculate_discount_price(original_price, discount_percent):
+ """Calculate price after discount"""
+ if original_price is None or discount_percent is None:
+ return None
+ if discount_percent < 0 or discount_percent > 100:
+ return original_price
+ return original_price * (1 - discount_percent / 100)
+
+
+def compound_interest(principal, rate, years):
+ """Calculate compound interest: P * (1 + r)^t"""
+ if principal is None or rate is None or years is None:
+ return None
+ if principal <= 0 or rate < 0 or years < 0:
+ return None
+ return principal * math.pow(1 + rate / 100, years)
+
+
+def calculate_bmi(weight_kg, height_m):
+ """Calculate Body Mass Index"""
+ if weight_kg is None or height_m is None or height_m <= 0:
+ return None
+ return round(weight_kg / (height_m * height_m), 2)
+
+
+def fibonacci(n):
+ """Calculate nth Fibonacci number"""
+ if n is None or n < 0:
+ return None
+ if n <= 1:
+ return n
+ a, b = 0, 1
+ for _ in range(2, n + 1):
+ a, b = b, a + b
+ return b
+
+
+def is_prime(n):
+ """Check if a number is prime"""
+ if n is None or n < 2:
+ return False
+ if n == 2:
+ return True
+ if n % 2 == 0:
+ return False
+ for i in range(3, int(math.sqrt(n)) + 1, 2):
+ if n % i == 0:
+ return False
+ return True
+
+
+def gcd(a, b):
+ """Calculate Greatest Common Divisor"""
+ if a is None or b is None:
+ return None
+ a, b = abs(a), abs(b)
+ while b:
+ a, b = b, a % b
+ return a
+
+
+def lcm(a, b):
+ """Calculate Least Common Multiple"""
+ if a is None or b is None or a == 0 or b == 0:
+ return None
+ return abs(a * b) // gcd(a, b)
+
+
+# ==================== String Operations ====================
+
+def reverse_string(s):
+ """Reverse a string"""
+ if s is None:
+ return None
+ return s[::-1]
+
+
+def count_vowels(s):
+ """Count number of vowels in a string"""
+ if s is None:
+ return None
+ vowels = 'aeiouAEIOU'
+ return sum(1 for char in s if char in vowels)
+
+
+def count_words(s):
+ """Count number of words in a string"""
+ if s is None:
+ return None
+ return len(s.split())
+
+
+def string_length_custom(s):
+ """Calculate string length (custom implementation for testing)"""
+ if s is None:
+ return None
+ return len(s)
+
+
+def capitalize_words(s):
+ """Capitalize first letter of each word"""
+ if s is None:
+ return None
+ return ' '.join(word.capitalize() for word in s.split())
+
+
+def remove_whitespace(s):
+ """Remove all whitespace from string"""
+ if s is None:
+ return None
+ return ''.join(s.split())
+
+
+def extract_numbers(s):
+ """Extract all numbers from string and concatenate"""
+ if s is None:
+ return None
+ numbers = re.findall(r'\d+', s)
+ return ','.join(numbers) if numbers else ''
+
+
+def is_palindrome(s):
+ """Check if string is a palindrome (case-insensitive)"""
+ if s is None:
+ return None
+ cleaned = ''.join(c.lower() for c in s if c.isalnum())
+ return cleaned == cleaned[::-1]
+
+
+def string_similarity(s1, s2):
+ """Calculate simple string similarity (0-100)"""
+ if s1 is None or s2 is None:
+ return None
+ if s1 == s2:
+ return 100.0
+ # Simple character overlap ratio
+ set1, set2 = set(s1.lower()), set(s2.lower())
+ if not set1 or not set2:
+ return 0.0
+ intersection = len(set1 & set2)
+ union = len(set1 | set2)
+ return round(intersection / union * 100, 2)
+
+
+def mask_email(email):
+ """Mask email address: user@domain.com -> u***@domain.com"""
+ if email is None or '@' not in email:
+ return None
+ parts = email.split('@')
+ if len(parts[0]) <= 1:
+ return email
+ masked_user = parts[0][0] + '***'
+ return f"{masked_user}@{parts[1]}"
+
+
+def extract_domain(email):
+ """Extract domain from email address"""
+ if email is None or '@' not in email:
+ return None
+ return email.split('@')[1]
+
+
+def truncate_string(s, max_length, suffix='...'):
+ """Truncate string to max length with suffix"""
+ if s is None:
+ return None
+ if len(s) <= max_length:
+ return s
+ return s[:max_length - len(suffix)] + suffix
+
+
+# ==================== Date/Time Operations ====================
+
+def days_between_dates(date1_str, date2_str):
+ """Calculate days between two dates (YYYY-MM-DD format)"""
+ if date1_str is None or date2_str is None:
+ return None
+ try:
+ d1 = datetime.strptime(str(date1_str), '%Y-%m-%d')
+ d2 = datetime.strptime(str(date2_str), '%Y-%m-%d')
+ return abs((d2 - d1).days)
+ except Exception:
+ return None
+
+
+def is_weekend(date_str):
+ """Check if date is weekend (Saturday or Sunday)"""
+ if date_str is None:
+ return None
+ try:
+ date = datetime.strptime(str(date_str), '%Y-%m-%d')
+ return date.weekday() >= 5 # 5=Saturday, 6=Sunday
+ except Exception:
+ return None
+
+
+def get_quarter(date_str):
+ """Get quarter (1-4) from date"""
+ if date_str is None:
+ return None
+ try:
+ date = datetime.strptime(str(date_str), '%Y-%m-%d')
+ return (date.month - 1) // 3 + 1
+ except Exception:
+ return None
+
+
+def age_in_years(birth_date_str, current_date_str):
+ """Calculate age in years"""
+ if birth_date_str is None or current_date_str is None:
+ return None
+ try:
+ birth = datetime.strptime(str(birth_date_str), '%Y-%m-%d')
+ current = datetime.strptime(str(current_date_str), '%Y-%m-%d')
+ age = current.year - birth.year
+ if (current.month, current.day) < (birth.month, birth.day):
+ age -= 1
+ return age
+ except Exception:
+ return None
+
+
+# ==================== Boolean/Conditional Operations ====================
+
+def is_in_range(value, min_val, max_val):
+ """Check if value is in range [min_val, max_val]"""
+ if value is None or min_val is None or max_val is None:
+ return None
+ return min_val <= value <= max_val
+
+
+def xor_operation(a, b):
+ """XOR operation on two booleans"""
+ if a is None or b is None:
+ return None
+ return (a or b) and not (a and b)
+
+
+def all_true(*args):
+ """Check if all arguments are True"""
+ if any(arg is None for arg in args):
+ return None
+ return all(args)
+
+
+def any_true(*args):
+ """Check if any argument is True"""
+ if any(arg is None for arg in args):
+ return None
+ return any(args)
+
+
+def count_true(*args):
+ """Count number of True values"""
+ if any(arg is None for arg in args):
+ return None
+ return sum(1 for arg in args if arg)
+
+
+# ==================== Complex/Mixed Operations ====================
+
+def calculate_grade(score):
+ """Convert numeric score to letter grade"""
+ if score is None:
+ return None
+ if score >= 90:
+ return 'A'
+ elif score >= 80:
+ return 'B'
+ elif score >= 70:
+ return 'C'
+ elif score >= 60:
+ return 'D'
+ else:
+ return 'F'
+
+
+def categorize_age(age):
+ """Categorize age into groups"""
+ if age is None:
+ return None
+ if age < 0:
+ return 'Invalid'
+ elif age < 13:
+ return 'Child'
+ elif age < 20:
+ return 'Teenager'
+ elif age < 60:
+ return 'Adult'
+ else:
+ return 'Senior'
+
+
+def calculate_tax(income, tax_rate):
+ """Calculate tax with progressive rates"""
+ if income is None or tax_rate is None:
+ return None
+ if income <= 0:
+ return 0.0
+ return round(income * tax_rate / 100, 2)
+
+
+def format_phone_number(phone):
+ """Format phone number: 1234567890 -> (123) 456-7890"""
+ if phone is None:
+ return None
+ digits = ''.join(c for c in str(phone) if c.isdigit())
+ if len(digits) != 10:
+ return phone
+ return f"({digits[:3]}) {digits[3:6]}-{digits[6:]}"
+
+
+def validate_credit_card_luhn(card_number):
+ """Validate credit card using Luhn algorithm"""
+ if card_number is None:
+ return False
+ digits = [int(d) for d in str(card_number) if d.isdigit()]
+ if not digits:
+ return False
+
+ checksum = 0
+ for i, digit in enumerate(reversed(digits)):
+ if i % 2 == 1:
+ digit *= 2
+ if digit > 9:
+ digit -= 9
+ checksum += digit
+ return checksum % 10 == 0
+
+
+def json_extract_value(json_str, key):
+ """Extract value from simple JSON string"""
+ if json_str is None or key is None:
+ return None
+ try:
+ import json
+ data = json.loads(json_str)
+ return str(data.get(key, ''))
+ except Exception:
+ return None
+
+
+def levenshtein_distance(s1, s2):
+ """Calculate Levenshtein distance between two strings"""
+ if s1 is None or s2 is None:
+ return None
+ if len(s1) < len(s2):
+ return levenshtein_distance(s2, s1)
+ if len(s2) == 0:
+ return len(s1)
+
+ previous_row = range(len(s2) + 1)
+ for i, c1 in enumerate(s1):
+ current_row = [i + 1]
+ for j, c2 in enumerate(s2):
+ insertions = previous_row[j + 1] + 1
+ deletions = current_row[j] + 1
+ substitutions = previous_row[j] + (c1 != c2)
+ current_row.append(min(insertions, deletions, substitutions))
+ previous_row = current_row
+
+ return previous_row[-1]
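
A few quick local assertions over the scalar helpers above (plain Python, run outside Doris; values chosen for illustration):

```python
# Row-by-row helpers behave like ordinary Python functions when tested locally.
assert fibonacci(10) == 55
assert mask_email("user@domain.com") == "u***@domain.com"
assert levenshtein_distance("kitten", "sitting") == 3
assert calculate_grade(85) == 'B'
assert gcd(12, 18) == 6 and lcm(4, 6) == 12
```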
diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_scalar_ops.zip b/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_scalar_ops.zip
new file mode 100644
index 0000000..15192ef
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_scalar_ops.zip
Binary files differ
diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_struct_type.py b/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_struct_type.py
new file mode 100644
index 0000000..b785691
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_struct_type.py
@@ -0,0 +1,47 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+def struct_to_csv_impl(person, point):
+ def safe_str(x):
+ return 'NULL' if x is None else str(x)
+
+ def format_array(arr):
+ if arr is None:
+ return 'NULL'
+ return '[' + ','.join(safe_str(item) for item in arr) + ']'
+
+ def format_struct_dict(s, field_names):
+ if s is None:
+ return 'NULL'
+ parts = []
+ for field in field_names:
+ val = s.get(field)
+ parts.append(safe_str(val))
+ return '(' + ','.join(parts) + ')'
+
+ person_str = format_struct_dict(person, ['name', 'age', 'salary'])
+
+ if point is None:
+ point_str = 'NULL'
+ else:
+ x_val = safe_str(point.get('x'))
+ y_val = safe_str(point.get('y'))
+ tags_val = format_array(point.get('tags'))
+ point_str = f"({x_val},{y_val},{tags_val})"
+
+ return '|'.join([person_str, point_str])
\ No newline at end of file
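
A minimal local check of `struct_to_csv_impl` (run outside Doris). STRUCT values arrive as Python dicts keyed by field name:

```python
# Nested ARRAY fields inside a STRUCT arrive as Python lists.
person = {"name": "alice", "age": 30, "salary": 1000.5}
point = {"x": 1, "y": 2, "tags": ["a", None]}
print(struct_to_csv_impl(person, point))
# -> (alice,30,1000.5)|(1,2,[a,NULL])
```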
diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_vector_ops.py b/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_vector_ops.py
new file mode 100644
index 0000000..31dd411
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_vector_ops.py
@@ -0,0 +1,168 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Vector Python UDF operations using pandas.Series
+"""
+
+import pandas as pd
+import numpy as np
+
+
+def add_constant(a: pd.Series, constant: pd.Series) -> pd.Series:
+ """Add a constant to series"""
+ # constant is a series but we use the first value
+ const_val = constant.iloc[0] if len(constant) > 0 else 0
+ return a + const_val
+
+
+def multiply_by_constant(a: pd.Series, constant: pd.Series) -> pd.Series:
+ """Multiply series by a constant"""
+ const_val = constant.iloc[0] if len(constant) > 0 else 1
+ return a * const_val
+
+
+def calculate_discount(price: pd.Series, discount_percent: pd.Series) -> pd.Series:
+ """Calculate price after discount"""
+ return price * (1 - discount_percent)
+
+
+def string_length(s: pd.Series) -> pd.Series:
+ """Calculate length of each string in series"""
+ return s.str.len()
+
+
+def to_uppercase(s: pd.Series) -> pd.Series:
+ """Convert strings to uppercase"""
+ return s.str.upper()
+
+
+def vec_add_with_constant(a: pd.Series, b: pd.Series) -> pd.Series:
+ """Add two series and add a constant"""
+ return a + b + 100
+
+
+def vec_multiply_and_round(a: pd.Series, b: pd.Series) -> pd.Series:
+ """Multiply two series and round to 2 decimal places"""
+ return (a * b).round(2)
+
+
+def vec_string_concat_with_separator(s1: pd.Series, s2: pd.Series) -> pd.Series:
+ """Concatenate two string series with a separator"""
+ return s1 + ' | ' + s2
+
+
+def vec_string_title_case(s: pd.Series) -> pd.Series:
+ """Convert string series to title case"""
+ return s.str.title()
+
+
+def vec_conditional_value(a: pd.Series, b: pd.Series) -> pd.Series:
+ """Return a if a > b, else return b"""
+ return pd.Series(np.where(a > b, a, b))
+
+
+def vec_percentage_calculation(part: pd.Series, total: pd.Series) -> pd.Series:
+ """Calculate percentage: (part / total) * 100"""
+ return (part / total * 100).round(2)
+
+
+def vec_is_in_range(value: pd.Series, min_val: pd.Series, max_val: pd.Series) -> pd.Series:
+ """Check if value is between min_val and max_val"""
+ return (value >= min_val) & (value <= max_val)
+
+
+def vec_safe_divide(numerator: pd.Series, denominator: pd.Series) -> pd.Series:
+ """Safe division, return 0 when denominator is 0 or None"""
+ result = numerator / denominator
+ # Replace inf and -inf with 0
+ result = result.replace([np.inf, -np.inf], 0)
+ # Fill NaN with 0
+ return result.fillna(0)
+
+
+def vec_exponential_decay(value: pd.Series, days: pd.Series) -> pd.Series:
+ """Calculate exponential decay: value * exp(-days/30)"""
+ return value * np.exp(-days / 30.0)
+
+
+def vec_string_extract_first_word(s: pd.Series) -> pd.Series:
+ """Extract the first word from a string"""
+ return s.str.split().str[0]
+
+
+def vec_normalize_to_range(value: pd.Series) -> pd.Series:
+ """Normalize values to 0-1 range using min-max normalization"""
+ min_val = value.min()
+ max_val = value.max()
+    if max_val == min_val:
+        # Degenerate case: all values equal, map everything to the midpoint
+        return pd.Series(0.5, index=value.index)
+ return (value - min_val) / (max_val - min_val)
+
+
+def vec_moving_average(value: pd.Series) -> pd.Series:
+ """Calculate 3-point moving average"""
+ return value.rolling(window=3, min_periods=1).mean()
+
+
+def vec_z_score(value: pd.Series) -> pd.Series:
+ """Calculate z-score: (value - mean) / std"""
+ mean = value.mean()
+ std = value.std()
+    if std == 0 or pd.isna(std):
+        # Degenerate case: zero or undefined spread, return all zeros
+        return pd.Series(0.0, index=value.index)
+ return (value - mean) / std
+
+
+def vec_clip_values(value: pd.Series, min_val: pd.Series, max_val: pd.Series) -> pd.Series:
+ """Clip values to be within min_val and max_val"""
+ return value.clip(lower=min_val, upper=max_val)
+
+
+def vec_boolean_and(a: pd.Series, b: pd.Series) -> pd.Series:
+ """Logical AND operation on two boolean series"""
+ return a & b
+
+
+def vec_boolean_or(a: pd.Series, b: pd.Series) -> pd.Series:
+ """Logical OR operation on two boolean series"""
+ return a | b
+
+
+def vec_string_contains(s: pd.Series, pattern: pd.Series) -> pd.Series:
+ """Check if string contains pattern (case-insensitive)"""
+ # For simplicity, use the first pattern value for all rows
+ if len(pattern) > 0 and not pd.isna(pattern.iloc[0]):
+ pattern_str = str(pattern.iloc[0])
+ return s.str.contains(pattern_str, case=False, na=False)
+ return pd.Series([False] * len(s))
+
+
+def vec_abs_difference(a: pd.Series, b: pd.Series) -> pd.Series:
+ """Calculate absolute difference between two series"""
+ return (a - b).abs()
+
+
+def vec_power(base: pd.Series, exponent: pd.Series) -> pd.Series:
+ """Calculate base raised to the power of exponent"""
+ return base ** exponent
+
+
+def vec_log_transform(value: pd.Series) -> pd.Series:
+ """Calculate natural logarithm, return 0 for non-positive values"""
+ result = np.log(value)
+ return result.replace([np.inf, -np.inf], 0).fillna(0)
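+
+
+# Minimal local sanity check for a couple of the functions above; a sketch for
+# manual verification only, not part of the Doris UDF contract. It assumes the
+# vector calling convention shown by the signatures: one pandas.Series per
+# argument in, one Series of equal length out.
+if __name__ == "__main__":
+    a = pd.Series([1.0, 4.0, 0.0])
+    b = pd.Series([2.0, 0.0, 0.0])
+    # 4/0 -> inf -> 0, 0/0 -> NaN -> 0
+    assert vec_safe_divide(a, b).tolist() == [0.5, 0.0, 0.0]
+    assert vec_abs_difference(a, b).tolist() == [1.0, 4.0, 0.0]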
diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_vector_ops.zip b/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_vector_ops.zip
new file mode 100644
index 0000000..3efd381
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_vector_ops.zip
Binary files differ
diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/pyudf.zip b/regression-test/suites/pythonudf_p0/udf_scripts/pyudf.zip
new file mode 100644
index 0000000..b4ed70a
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/udf_scripts/pyudf.zip
Binary files differ
diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/string_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/string_test.py
new file mode 100644
index 0000000..3505617
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/udf_scripts/string_test.py
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+def evaluate(arg1, a, b):
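+    """Mask the middle of arg1 with '*', keeping the first a and last b characters."""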
+    # Guard b == 0: arg1[-0:] would return the whole string instead of ""
+    tail = arg1[-b:] if b > 0 else ""
+    return arg1[:a] + "*" * (len(arg1) - a - b) + tail
\ No newline at end of file
diff --git a/regression-test/suites/pythonudf_p0/udtf_scripts/array_int_test.py b/regression-test/suites/pythonudf_p0/udtf_scripts/array_int_test.py
new file mode 100644
index 0000000..78c1fce
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/udtf_scripts/array_int_test.py
@@ -0,0 +1,21 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+def evaluate(count):
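+    """UDTF: yield the array [1, 2, 3] three times, ignoring count."""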
+    for _ in range(3):
+ yield [1, 2, 3]
\ No newline at end of file
diff --git a/regression-test/suites/pythonudf_p0/udtf_scripts/array_string_test.py b/regression-test/suites/pythonudf_p0/udtf_scripts/array_string_test.py
new file mode 100644
index 0000000..7fb1f02
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/udtf_scripts/array_string_test.py
@@ -0,0 +1,21 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+def evaluate(count):
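+    """UDTF: yield the array ['Hi', 'DataMind', 'Good'] three times, ignoring count."""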
+    for _ in range(3):
+ yield ['Hi', 'DataMind', 'Good']
\ No newline at end of file
diff --git a/regression-test/suites/pythonudf_p0/udtf_scripts/double_test.py b/regression-test/suites/pythonudf_p0/udtf_scripts/double_test.py
new file mode 100644
index 0000000..275c493
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/udtf_scripts/double_test.py
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+def evaluate(val):
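+    """UDTF: yield a single row, val * 10."""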
+ yield val * 10
\ No newline at end of file
diff --git a/regression-test/suites/pythonudf_p0/udtf_scripts/float_test.py b/regression-test/suites/pythonudf_p0/udtf_scripts/float_test.py
new file mode 100644
index 0000000..de321ba
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/udtf_scripts/float_test.py
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+def evaluate(val):
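+    """UDTF: yield a single row, val - 10."""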
+ yield val - 10
\ No newline at end of file
diff --git a/regression-test/suites/pythonudf_p0/udtf_scripts/int_test.py b/regression-test/suites/pythonudf_p0/udtf_scripts/int_test.py
new file mode 100644
index 0000000..15ccedb
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/udtf_scripts/int_test.py
@@ -0,0 +1,21 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+def evaluate(count):
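+    """UDTF: yield count three times."""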
+    for _ in range(3):
+ yield count
\ No newline at end of file
diff --git a/regression-test/suites/pythonudf_p0/udtf_scripts/map_test.py b/regression-test/suites/pythonudf_p0/udtf_scripts/map_test.py
new file mode 100644
index 0000000..290da85
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/udtf_scripts/map_test.py
@@ -0,0 +1,21 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+def evaluate(val):
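+    """UDTF: yield the input map three times."""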
+    for _ in range(3):
+ yield val
\ No newline at end of file
diff --git a/regression-test/suites/pythonudf_p0/udtf_scripts/string_test.py b/regression-test/suites/pythonudf_p0/udtf_scripts/string_test.py
new file mode 100644
index 0000000..78939b8
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/udtf_scripts/string_test.py
@@ -0,0 +1,21 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+def evaluate(value, separator):
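+    """UDTF: split value on separator and yield one row per part."""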
+ for part in value.split(separator):
+ yield part
\ No newline at end of file
diff --git a/regression-test/suites/pythonudf_p0/udtf_scripts/struct_test.py b/regression-test/suites/pythonudf_p0/udtf_scripts/struct_test.py
new file mode 100644
index 0000000..1a93ba3
--- /dev/null
+++ b/regression-test/suites/pythonudf_p0/udtf_scripts/struct_test.py
@@ -0,0 +1,21 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+def evaluate(val):
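+    """UDTF: yield the struct (1, 0.112, "Hello, DataMind") three times."""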
+    for _ in range(3):
+ yield 1, 0.112, "Hello, DataMind"
\ No newline at end of file