blob: f77742b76944e5596edcc4a82f371f19961a4c54 [file] [log] [blame]
#!/usr/bin/env python
# encoding: utf-8
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
# This script will generate the implementation of the simple vector functions for the BE.
# These include:
# - Arithmetic functions
# - Binary functions
# - Cast functions
#
# The script outputs (run: 'src/common/function/gen_vector_functions.py')
# - header and implemention for above functions:
# - src/gen_cpp/opcode/vector_functions.[h/cc]
# - python file that contains the metadata for those functions:
# - src/gen_cpp/generated_vector_functions.py
"""
import string
import os
import errno
filter_binary_op = string.Template("\
bool VectorComputeFunctions::${fn_signature}(\n\
Expr* expr, VectorizedRowBatch* batch) {\n\
int n = batch->size();\n\
if (0 == n) {\n\
return false;\n\
}\n\
int* sel = batch->selected();\n\
Expr* op1 = expr->children()[0];\n\
Expr* op2 = expr->children()[1];\n\
batch->add_column(expr->output_column(), expr->type());\n\
if (expr->is_constant()) {\n\
${native_type1}* val1 = reinterpret_cast<${native_type1}*>(op1->get_value(NULL));\n\
${native_type2}* val2 = reinterpret_cast<${native_type2}*>(op2->get_value(NULL));\n\
if (val1 == NULL || val2 == NULL) return false;\n\
if (!(*val1 ${native_op} *val2)) batch->set_size(0);\n\
} else if (op1->is_constant()) {\n\
${native_type1}* value = reinterpret_cast<${native_type1}*>(op1->get_value(NULL));\n\
if (NULL == value || !op2->evaluate(batch)) return false;\n\
${native_type1}* vector1\n\
= reinterpret_cast<${native_type1}*>(batch->column(op2->output_column())->col_data());\n\
\n\
int new_size = 0;\n\
if (batch->selected_in_use()) {\n\
for (int j = 0; j != n; ++j) {\n\
int i = sel[j];\n\
if (*value ${native_op} vector1[i]) {\n\
sel[new_size++] = i;\n\
}\n\
}\n\
batch->set_size(new_size);\n\
} else {\n\
for (int i = 0; i != n; ++i) {\n\
if (*value ${native_op} vector1[i]) {\n\
sel[new_size++] = i;\n\
}\n\
}\n\
\n\
if (new_size < n) {\n\
batch->set_size(new_size);\n\
batch->set_selected_in_use(true);\n\
}\n\
}\n\
} else if (op2->is_constant()) {\n\
${native_type2}* value = reinterpret_cast<${native_type2}*>(op2->get_value(NULL));\n\
if (NULL == value || !op1->evaluate(batch)) return false;\n\
${native_type1}* vector1\n\
= reinterpret_cast<${native_type1}*>(batch->column(op1->output_column())->col_data());\n\
\n\
int new_size = 0;\n\
if (batch->selected_in_use()) {\n\
for (int j = 0; j != n; ++j) {\n\
int i = sel[j];\n\
if (vector1[i] ${native_op} *value) {\n\
sel[new_size++] = i;\n\
}\n\
}\n\
batch->set_size(new_size);\n\
} else {\n\
for (int i = 0; i != n; ++i) {\n\
if (vector1[i] ${native_op} *value) {\n\
sel[new_size++] = i;\n\
}\n\
}\n\
\n\
if (new_size < n) {\n\
batch->set_size(new_size);\n\
batch->set_selected_in_use(true);\n\
}\n\
}\n\
} else {\n\
if (!op1->evaluate(batch) || !op2->evaluate(batch)) return false;\n\
${native_type1}* vector1\n\
= reinterpret_cast<${native_type1}*>(batch->column(op1->output_column())->col_data());\n\
${native_type2}* vector2\n\
= reinterpret_cast<${native_type2}*>(batch->column(op2->output_column())->col_data());\n\
\n\
int new_size = 0;\n\
if (batch->selected_in_use()) {\n\
for (int j = 0; j != n; ++j) {\n\
int i = sel[j];\n\
if (vector1[i] ${native_op} vector2[i]) {\n\
sel[new_size++] = i;\n\
}\n\
}\n\
batch->set_size(new_size);\n\
} else {\n\
for (int i = 0; i != n; ++i) {\n\
if (vector1[i] ${native_op} vector2[i]) {\n\
sel[new_size++] = i;\n\
}\n\
}\n\
if (new_size < n) {\n\
batch->set_size(new_size);\n\
batch->set_selected_in_use(true);\n\
}\n\
}\n\
}\n\
return true;\n\
}\n\n")
filter_in_op = string.Template("\
bool VectorComputeFunctions::${fn_signature}(\n\
Expr* expr, VectorizedRowBatch* batch) {\n\
int n = batch->size();\n\
if (0 == n) {\n\
return true;\n\
}\n\
batch->add_column(expr->output_column(), expr->type());\n\
int* sel = batch->selected();\n\
int num_children = expr->get_num_children();\n\
Expr* op1 = expr->children()[0];\n\
InPredicate *in_pred = static_cast<InPredicate*>(expr);\n\
\n\
if (op1->is_constant()) {\n\
void* value = op1->get_value(NULL);\n\
if (!in_pred->hybird_set()->find(value)) {\n\
batch->set_size(0);\n\
return true;\n\
}\n\
\n\
if (num_children > 1) {\n\
${native_type1}* v = reinterpret_cast<${native_type1}*>(value);\n\
${native_type1}* vectors[num_children];\n\
for (int i = 1; i < num_children; ++i) {\n\
if (expr->get_child(i)->evaluate(batch)) return false;\n\
vectors[i] = reinterpret_cast<${native_type1}*>(batch->column(expr->get_child(i)->output_column())->col_data());\n\
}\n\
\n\
int new_size = 0;\n\
if (batch->selected_in_use()) {\n\
for (int j = 0; j != n; ++j) {\n\
int i = sel[j];\n\
for (int k = 1; k < num_children; ++k) {\n\
if (*v == vectors[k][i]) {\n\
sel[new_size++] = i;\n\
break;\n\
}\n\
}\n\
}\n\
batch->set_size(new_size);\n\
} else {\n\
for (int i = 0; i != n; ++i) {\n\
for (int k = 1; k < num_children; ++k) {\n\
if (*v == vectors[k][i]) {\n\
sel[new_size++] = i;\n\
break;\n\
}\n\
}\n\
}\n\
\n\
if (new_size < n) {\n\
batch->set_size(new_size);\n\
batch->set_selected_in_use(true);\n\
}\n\
}\n\
}\n\
} else {\n\
int c1 = op1->evaluate(batch);\n\
DCHECK(c1 >= 0);\n\
${native_type1}* vector1 \n\
=reinterpret_cast<${native_type1}*>(batch->column(op1->output_column())->col_data());\n\
if (0 != in_pred->hybird_set()->size()) {\n\
int new_size = 0;\n\
if (batch->selected_in_use()) {\n\
for (int j = 0; j != n; ++j) {\n\
int i = sel[j];\n\
if (in_pred->hybird_set()->find(&vector1[i])) {\n\
sel[new_size++] = i;\n\
}\n\
}\n\
batch->set_size(new_size);\n\
} else {\n\
for (int i = 0; i != n; ++i) {\n\
if (in_pred->hybird_set()->find(&vector1[i])) {\n\
sel[new_size++] = i;\n\
}\n\
}\n\
\n\
if (new_size < n) {\n\
batch->set_size(new_size);\n\
batch->set_selected_in_use(true);\n\
}\n\
}\n\
}\n\
\n\
if (num_children > 1) {\n\
${native_type1}* vectors[num_children];\n\
for (int i = 1; i < num_children; ++i) {\n\
if (!expr->get_child(i)->evaluate(batch)) return false;\n\
vectors[i] = reinterpret_cast<${native_type1}*>(batch->column(expr->get_child(i)->output_column())->col_data());\n\
}\n\
\n\
int new_size = 0;\n\
if (batch->selected_in_use()) {\n\
for (int j = 0; j != n; ++j) {\n\
int i = sel[j];\n\
for (int k = 1; k < num_children; ++k) {\n\
if (vector1[i] == vectors[k][i]) {\n\
sel[new_size++] = i;\n\
break;\n\
}\n\
}\n\
}\n\
batch->set_size(new_size);\n\
} else {\n\
for (int i = 0; i != n; ++i) {\n\
for (int k = 1; k < num_children; ++k) {\n\
if (vector1[i] == vectors[k][i]) {\n\
sel[new_size++] = i;\n\
break;\n\
}\n\
}\n\
}\n\
\n\
if (new_size < n) {\n\
batch->set_size(new_size);\n\
batch->set_selected_in_use(true);\n\
}\n\
}\n\
}\n\
}\n\
return true;\n\
}\n\n")
python_template = string.Template("\
['${fn_name}', '${return_type}', [${args}], 'VectorComputeFunctions::${fn_signature}', []], \n")
# Mapping of function to template
templates = {
'Filter_Eq': filter_binary_op,
'Filter_Ne': filter_binary_op,
'Filter_Gt': filter_binary_op,
'Filter_Lt': filter_binary_op,
'Filter_Ge': filter_binary_op,
'Filter_Le': filter_binary_op,
'Filter_In': filter_in_op,
}
# Some aggregate types that are useful for defining functions
types = {
'BOOLEAN': ['BOOLEAN'],
'TINYINT': ['TINYINT'],
'SMALLINT': ['SMALLINT'],
'INT': ['INT'],
'BIGINT': ['BIGINT'],
'LARGEINT': ['LARGEINT'],
'FLOAT': ['FLOAT'],
'DOUBLE': ['DOUBLE'],
'STRING': ['VARCHAR'],
'DATE': ['DATE'],
'DATETIME': ['DATETIME'],
'DECIMALV2': ['DECIMALV2'],
'NATIVE_INT_TYPES': ['TINYINT', 'SMALLINT', 'INT', 'BIGINT'],
'INT_TYPES': ['TINYINT', 'SMALLINT', 'INT', 'BIGINT', 'LARGEINT'],
'FLOAT_TYPES': ['FLOAT', 'DOUBLE'],
'NUMERIC_TYPES': ['TINYINT', 'SMALLINT', 'INT', 'BIGINT', 'FLOAT', 'DOUBLE'],
'NATIVE_TYPES': ['BOOLEAN', 'TINYINT', 'SMALLINT', 'INT', 'BIGINT', 'FLOAT', 'DOUBLE'],
'STRCAST_TYPES': ['BOOLEAN', 'SMALLINT', 'INT', 'BIGINT', 'FLOAT', 'DOUBLE'],
'ALL_TYPES': ['BOOLEAN', 'TINYINT', 'SMALLINT', 'INT', 'BIGINT', 'LARGEINT', 'FLOAT',\
'DOUBLE', 'VARCHAR', 'DATETIME', 'DECIMALV2'],
'MAX_TYPES': ['BIGINT', 'LARGEINT', 'DOUBLE', 'DECIMALV2'],
}
# Operation, [ReturnType], [[Args1], [Args2], ... [ArgsN]]
functions = [
# BinaryPredicates
['Filter_Eq', ['BOOLEAN'], [['ALL_TYPES'], ['ALL_TYPES']]],
['Filter_Ne', ['BOOLEAN'], [['ALL_TYPES'], ['ALL_TYPES']]],
['Filter_Gt', ['BOOLEAN'], [['ALL_TYPES'], ['ALL_TYPES']]],
['Filter_Lt', ['BOOLEAN'], [['ALL_TYPES'], ['ALL_TYPES']]],
['Filter_Ge', ['BOOLEAN'], [['ALL_TYPES'], ['ALL_TYPES']]],
['Filter_Le', ['BOOLEAN'], [['ALL_TYPES'], ['ALL_TYPES']]],
# InPredicates
['Filter_In', ['BOOLEAN'], [['ALL_TYPES']]],
]
native_types = {
'BOOLEAN': 'bool',
'TINYINT': 'char',
'SMALLINT': 'short',
'INT': 'int',
'BIGINT': 'long',
'LARGEINT': '__int128',
'FLOAT': 'float',
'DOUBLE': 'double',
'VARCHAR': 'StringValue',
'DATE': 'DateTimeValue',
'DATETIME': 'DateTimeValue',
'DECIMALV2': 'DecimalV2Value',
}
# Portable type used in the function implementation
implemented_types = {
'BOOLEAN': 'bool',
'TINYINT': 'int8_t',
'SMALLINT': 'int16_t',
'INT': 'int32_t',
'BIGINT': 'int64_t',
'LARGEINT': '__int128',
'FLOAT': 'float',
'DOUBLE': 'double',
'VARCHAR': 'StringValue',
'DATE': 'DateTimeValue',
'DATETIME': 'DateTimeValue',
'DECIMALV2': 'DecimalV2Value',
}
native_ops = {
'Filter_Eq': '==',
'Filter_Ne': '!=',
'Filter_Gt': '>',
'Filter_Lt': '<',
'Filter_Ge': '>=',
'Filter_Le': '<=',
'Eq': '==',
'Ne': '!=',
'Gt': '>',
'Lt': '<',
'Ge': '>=',
'Le': '<=',
'BITAND': '&',
'BITNOT': '~',
'BITOR': '|',
'BITXOR': '^',
'DIVIDE': '/',
'EQ': '==',
'GT': '>',
'GE': '>=',
'INT_DIVIDE': '/',
'SUBTRACT': '-',
'MOD': '%',
'MULTIPLY': '*',
'LT': '<',
'LE': '<=',
'NE': '!=',
'ADD': '+',
}
native_funcs = {
'EQ': 'Eq',
'LE': 'Le',
'LT': 'Lt',
'NE': 'Ne',
'GE': 'Ge',
'GT': 'Gt',
}
cc_preamble = '\
// This is a generated file, DO NOT EDIT IT.\n\
// To add new functions, see impala/common/function-registry/gen_vector_functions.py\n\
\n\
#include "gen_cpp/opcode/vector-functions.h"\n\
#include "exprs/case_expr.h"\n\
#include "exprs/expr.h"\n\
#include "exprs/in_predicate.h"\n\
#include "runtime/string_value.hpp"\n\
#include "runtime/vectorized_row_batch.h"\n\
#include "util/string_parser.hpp"\n\
#include <boost/lexical_cast.hpp>\n\
\n\
using namespace boost;\n\
using namespace std;\n\
\n\
namespace doris { \n\
\n'
cc_epilogue = '\
}\n'
h_preamble = '\
// This is a generated file, DO NOT EDIT IT.\n\
// To add new functions, see impala/common/function-registry/gen_vector_functions.py\n\
\n\
#ifndef DORIS_OPCODE_VECTOR_FUNCTIONS_H\n\
#define DORIS_OPCODE_VECTOR_FUNCTIONS_H\n\
\n\
namespace doris {\n\
class Expr;\n\
class OpcodeRegistry;\n\
class VectorizedRowBatch;\n\
\n\
class VectorComputeFunctions {\n\
public:\n'
h_epilogue = '\
};\n\
\n\
}\n\
\n\
#endif\n'
python_preamble = '\
#!/usr/bin/env python\n\
# Licensed to the Apache Software Foundation (ASF) under one \n\
# or more contributor license agreements. See the NOTICE file \n\
# distributed with this work for additional information \n\
# regarding copyright ownership. The ASF licenses this file \n\
# to you under the Apache License, Version 2.0 (the \n\
# "License"); you may not use this file except in compliance \n\
# with the License. You may obtain a copy of the License at \n\
# \n\
# http://www.apache.org/licenses/LICENSE-2.0\n\
# \n\
# Unless required by applicable law or agreed to in writing, software\n\
# distributed under the License is distributed on an "AS IS" BASIS,\n\
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n\
# See the License for the specific language governing permissions and\n\
# limitations under the License.\n\
\n\
# This is a generated file, DO NOT EDIT IT.\n\
# To add new functions, see impala/common/function-registry/gen_opcodes.py\n\
\n\
functions = [\n'
python_epilogue = ']'
header_template = string.Template("\
static bool ${fn_signature}(\n\
Expr* e, VectorizedRowBatch* batch);\n")
BE_PATH = "../gen_cpp/opcode/"
def initialize_sub(op, return_type, arg_types):
"""
Expand the signature data for template substitution. Returns
a dictionary with all the entries for all the templates used in this script
"""
sub = {}
sub["fn_name"] = op
sub["fn_signature"] = op
sub["return_type"] = return_type
sub["args"] = ""
if op in native_ops:
sub["native_op"] = native_ops[op]
for idx in range(0, len(arg_types)):
arg = arg_types[idx]
sub["fn_signature"] += "_" + native_types[arg]
sub["native_type" + repr(idx + 1)] = implemented_types[arg]
sub["args"] += "'" + arg + "', "
return sub
if __name__ == "__main__":
try:
os.makedirs(BE_PATH)
except OSError as e:
if e.errno == errno.EEXIST:
pass
else:
raise
h_file = open(BE_PATH + 'vector-functions.h', 'w')
cc_file = open(BE_PATH + 'vector-functions.cc', 'w')
python_file = open('generated_vector_functions.py', 'w')
h_file.write(h_preamble)
cc_file.write(cc_preamble)
python_file.write(python_preamble)
# Generate functions and headers
for func_data in functions:
op = func_data[0]
# If a specific template has been specified, use that one.
if len(func_data) >= 4:
template = func_data[3]
else:
# Skip functions with no template (shouldn't be auto-generated)
if not op in templates:
continue
template = templates[op]
# Expand all arguments
return_types = []
for ret in func_data[1]:
for t in types[ret]:
return_types.append(t)
signatures = []
for args in func_data[2]:
expanded_arg = []
for arg in args:
for t in types[arg]:
expanded_arg.append(t)
signatures.append(expanded_arg)
# Put arguments into substitution structure
num_functions = 0
for args in signatures:
num_functions = max(num_functions, len(args))
num_functions = max(num_functions, len(return_types))
num_args = len(signatures)
# Validate the input is correct
if len(return_types) != 1 and len(return_types) != num_functions:
print("Invalid Declaration: " + func_data)
sys.exit(1)
for args in signatures:
if len(args) != 1 and len(args) != num_functions:
print("Invalid Declaration: " + func_data)
sys.exit(1)
# Iterate over every function signature to generate
for i in range(0, num_functions):
if len(return_types) == 1:
return_type = return_types[0]
else:
return_type = return_types[i]
arg_types = []
for j in range(0, num_args):
if len(signatures[j]) == 1:
arg_types.append(signatures[j][0])
else:
arg_types.append(signatures[j][i])
# At this point, 'return_type' is a single type and 'arg_types'
# is a list of single types
sub = initialize_sub(op, return_type, arg_types)
h_file.write(header_template.substitute(sub))
cc_file.write(template.substitute(sub))
python_file.write(python_template.substitute(sub))
h_file.write(h_epilogue)
cc_file.write(cc_epilogue)
python_file.write(python_epilogue)
h_file.close()
cc_file.close()
python_file.close()