blob: f208021268332cbf98978bf6b890351e515e6d5e [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include <boost/algorithm/string.hpp>
#include "codegen/llvm-codegen.h"
#include "runtime/descriptors.h"
#include "runtime/mem-pool.h"
#include "runtime/runtime-state.h"
#include "runtime/string-value.h"
#include "runtime/timestamp-value.h"
#include "runtime/tuple.h"
#include "text-converter.h"
#include "util/string-parser.h"
#include "util/runtime-profile-counters.h"
#include "common/names.h"
using namespace impala;
TextConverter::TextConverter(char escape_char, const string& null_col_val,
bool check_null, bool strict_mode)
: escape_char_(escape_char),
null_col_val_(null_col_val),
check_null_(check_null),
strict_mode_(strict_mode) {
}
void TextConverter::UnescapeString(const char* src, char* dest, int* len,
int64_t maxlen) {
const char* src_end = src + *len;
char* dest_end = dest + *len;
if (maxlen > 0) dest_end = dest + maxlen;
char* dest_ptr = dest;
bool escape_next_char = false;
while ((src < src_end) && (dest_ptr < dest_end)) {
if (*src == escape_char_) {
escape_next_char = !escape_next_char;
} else {
escape_next_char = false;
}
if (escape_next_char) {
++src;
} else {
*dest_ptr++ = *src++;
}
}
char* dest_start = reinterpret_cast<char*>(dest);
*len = dest_ptr - dest_start;
}
// Codegen for a function to parse one slot. The IR for a int slot looks like:
// define i1 @WriteSlot(<{ i32, i8 }>* %tuple_arg, i8* %data, i32 %len) #38 {
// entry:
// %parse_result = alloca i32
// %0 = call i1 @IrIsNullString(i8* %data, i32 %len)
// br i1 %0, label %set_null, label %check_zero
//
// set_null: ; preds = %check_zero, %entry
// %1 = bitcast <{ i32, i8 }>* %tuple_arg to i8*
// %null_byte_ptr2 = getelementptr inbounds i8, i8* %1, i32 4
// %null_byte3 = load i8, i8* %null_byte_ptr2
// %null_bit_set4 = or i8 %null_byte3, 1
// store i8 %null_bit_set4, i8* %null_byte_ptr2
// ret i1 true
//
// parse_slot: ; preds = %check_zero
// %slot = getelementptr inbounds <{ i32, i8 }>, <{ i32, i8 }>* %tuple_arg, i32 0, i32 0
// %2 = call i32 @IrStringToInt32(i8* %data, i32 %len, i32* %parse_result)
// %parse_result1 = load i32, i32* %parse_result
// %failed = icmp eq i32 %parse_result1, 1
// br i1 %failed, label %parse_fail, label %parse_success
//
// check_zero: ; preds = %entry
// %3 = icmp eq i32 %len, 0
// br i1 %3, label %set_null, label %parse_slot
//
// parse_success: ; preds = %parse_slot
// store i32 %2, i32* %slot
// ret i1 true
//
// parse_fail: ; preds = %parse_slot
// %4 = bitcast <{ i32, i8 }>* %tuple_arg to i8*
// %null_byte_ptr = getelementptr inbounds i8, i8* %4, i32 4
// %null_byte = load i8, i8* %null_byte_ptr
// %null_bit_set = or i8 %null_byte, 1
// store i8 %null_bit_set, i8* %null_byte_ptr
// ret i1 false
//}
// TODO: convert this function to use cross-compilation + constant substitution in whole
// or part. It is currently too complex and doesn't implement the full functionality.
Status TextConverter::CodegenWriteSlot(LlvmCodeGen* codegen,
TupleDescriptor* tuple_desc, SlotDescriptor* slot_desc, llvm::Function** fn,
const char* null_col_val, int len, bool check_null, bool strict_mode) {
DCHECK(fn != nullptr);
if (slot_desc->type().type == TYPE_CHAR) {
return Status::Expected("TextConverter::CodegenWriteSlot(): Char isn't supported for"
" CodegenWriteSlot");
}
// Codegen is_null_string
bool is_default_null = (len == 2 && null_col_val[0] == '\\' && null_col_val[1] == 'N');
llvm::Function* is_null_string_fn;
if (is_default_null) {
is_null_string_fn = codegen->GetFunction(IRFunction::IS_NULL_STRING, false);
} else {
is_null_string_fn = codegen->GetFunction(IRFunction::GENERIC_IS_NULL_STRING, false);
}
DCHECK(is_null_string_fn != NULL);
llvm::StructType* tuple_type = tuple_desc->GetLlvmStruct(codegen);
if (tuple_type == NULL) {
return Status("TextConverter::CodegenWriteSlot(): Failed to generate "
"tuple type");
}
llvm::PointerType* tuple_ptr_type = tuple_type->getPointerTo();
LlvmCodeGen::FnPrototype prototype(
codegen, "WriteSlot", codegen->bool_type());
prototype.AddArgument(LlvmCodeGen::NamedVariable("tuple_arg", tuple_ptr_type));
prototype.AddArgument(LlvmCodeGen::NamedVariable("data", codegen->ptr_type()));
prototype.AddArgument(LlvmCodeGen::NamedVariable("len", codegen->i32_type()));
LlvmBuilder builder(codegen->context());
llvm::Value* args[3];
*fn = prototype.GeneratePrototype(&builder, &args[0]);
llvm::BasicBlock *set_null_block, *parse_slot_block, *check_zero_block = NULL;
codegen->CreateIfElseBlocks(*fn, "set_null", "parse_slot",
&set_null_block, &parse_slot_block);
if (!slot_desc->type().IsVarLenStringType()) {
check_zero_block = llvm::BasicBlock::Create(codegen->context(), "check_zero", *fn);
}
// Check if the data matches the configured NULL string.
llvm::Value* is_null;
if (check_null) {
if (is_default_null) {
is_null = builder.CreateCall(
is_null_string_fn, llvm::ArrayRef<llvm::Value*>({args[1], args[2]}));
} else {
is_null = builder.CreateCall(is_null_string_fn,
llvm::ArrayRef<llvm::Value*>(
{args[1], args[2], codegen->CastPtrToLlvmPtr(codegen->ptr_type(),
const_cast<char*>(null_col_val)),
codegen->GetI32Constant(len)}));
}
} else {
// Constant FALSE as branch condition. We rely on later optimization passes
// to remove the branch and THEN block.
is_null = codegen->false_value();
}
builder.CreateCondBr(is_null, set_null_block,
(slot_desc->type().IsVarLenStringType()) ? parse_slot_block : check_zero_block);
if (!slot_desc->type().IsVarLenStringType()) {
builder.SetInsertPoint(check_zero_block);
// If len == 0 and it is not a string col, set slot to NULL
llvm::Value* null_len =
builder.CreateICmpEQ(args[2], codegen->GetI32Constant(0));
builder.CreateCondBr(null_len, set_null_block, parse_slot_block);
}
// Codegen parse slot block
builder.SetInsertPoint(parse_slot_block);
llvm::Value* slot =
builder.CreateStructGEP(NULL, args[0], slot_desc->llvm_field_idx(), "slot");
if (slot_desc->type().IsVarLenStringType()) {
llvm::Value* ptr = builder.CreateStructGEP(NULL, slot, 0, "string_ptr");
llvm::Value* len = builder.CreateStructGEP(NULL, slot, 1, "string_len");
builder.CreateStore(args[1], ptr);
// TODO codegen memory allocation for CHAR
DCHECK(slot_desc->type().type != TYPE_CHAR);
if (slot_desc->type().type == TYPE_VARCHAR) {
// determine if we need to truncate the string
llvm::Value* maxlen = codegen->GetI32Constant(slot_desc->type().len);
llvm::Value* len_lt_maxlen =
builder.CreateICmpSLT(args[2], maxlen, "len_lt_maxlen");
llvm::Value* minlen =
builder.CreateSelect(len_lt_maxlen, args[2], maxlen, "select_min_len");
builder.CreateStore(minlen, len);
} else {
builder.CreateStore(args[2], len);
}
builder.CreateRet(codegen->true_value());
} else {
IRFunction::Type parse_fn_enum;
llvm::Function* parse_fn = NULL;
switch (slot_desc->type().type) {
case TYPE_BOOLEAN:
parse_fn_enum = IRFunction::STRING_TO_BOOL;
break;
case TYPE_TINYINT:
parse_fn_enum = IRFunction::STRING_TO_INT8;
break;
case TYPE_SMALLINT:
parse_fn_enum = IRFunction::STRING_TO_INT16;
break;
case TYPE_INT:
parse_fn_enum = IRFunction::STRING_TO_INT32;
break;
case TYPE_BIGINT:
parse_fn_enum = IRFunction::STRING_TO_INT64;
break;
case TYPE_FLOAT:
parse_fn_enum = IRFunction::STRING_TO_FLOAT;
break;
case TYPE_DOUBLE:
parse_fn_enum = IRFunction::STRING_TO_DOUBLE;
break;
case TYPE_TIMESTAMP:
parse_fn_enum = IRFunction::STRING_TO_TIMESTAMP;
break;
case TYPE_DATE:
parse_fn_enum = IRFunction::STRING_TO_DATE;
break;
case TYPE_DECIMAL:
switch (slot_desc->slot_size()) {
case 4:
parse_fn_enum = IRFunction::STRING_TO_DECIMAL4;
break;
case 8:
parse_fn_enum = IRFunction::STRING_TO_DECIMAL8;
break;
case 16:
parse_fn_enum = IRFunction::STRING_TO_DECIMAL16;
break;
default:
DCHECK(false);
return Status("TextConverter::CodegenWriteSlot(): "
"Decimal slots can't be this size.");
}
break;
default:
DCHECK(false);
return Status("TextConverter::CodegenWriteSlot(): Codegen'd parser NYI for the"
"slot_desc type");
}
parse_fn = codegen->GetFunction(parse_fn_enum, false);
DCHECK(parse_fn != NULL);
// Set up trying to parse the string to the slot type
llvm::BasicBlock *parse_success_block, *parse_failed_block;
codegen->CreateIfElseBlocks(*fn, "parse_success", "parse_fail",
&parse_success_block, &parse_failed_block);
LlvmCodeGen::NamedVariable parse_result("parse_result", codegen->i32_type());
llvm::Value* parse_result_ptr = codegen->CreateEntryBlockAlloca(*fn, parse_result);
llvm::CallInst* parse_return;
// Call Impala's StringTo* function
// Function implementations in exec/hdfs-scanner-ir.cc
if (slot_desc->type().type == TYPE_DECIMAL) {
// Special case for decimal since it has additional precision/scale parameters
parse_return = builder.CreateCall(parse_fn, {args[1], args[2],
codegen->GetI32Constant(slot_desc->type().precision),
codegen->GetI32Constant(slot_desc->type().scale), parse_result_ptr});
} else if (slot_desc->type().type == TYPE_TIMESTAMP) {
// If the return value is large (more than 16 bytes in our toolchain) the first
// parameter would be a pointer to value parsed and the return value of callee
// should be ignored
parse_return =
builder.CreateCall(parse_fn, {slot, args[1], args[2], parse_result_ptr});
} else {
parse_return = builder.CreateCall(parse_fn, {args[1], args[2], parse_result_ptr});
}
llvm::Value* parse_result_val = builder.CreateLoad(parse_result_ptr, "parse_result");
llvm::Value* failed_value =
codegen->GetI32Constant(StringParser::PARSE_FAILURE);
// Check for parse error.
llvm::Value* parse_failed =
builder.CreateICmpEQ(parse_result_val, failed_value, "failed");
if (strict_mode) {
// In strict_mode, also check if parse_result is PARSE_OVERFLOW.
llvm::Value* overflow_value =
codegen->GetI32Constant(StringParser::PARSE_OVERFLOW);
llvm::Value* parse_overflow =
builder.CreateICmpEQ(parse_result_val, overflow_value, "overflowed");
parse_failed = builder.CreateOr(parse_failed, parse_overflow, "failed_or");
}
builder.CreateCondBr(parse_failed, parse_failed_block, parse_success_block);
// Parse succeeded
builder.SetInsertPoint(parse_success_block);
// If the parsed value is in parse_return, move it into slot
if (slot_desc->type().type == TYPE_DECIMAL) {
// For Decimal values, the return type generated by Clang is struct type rather than
// integer so casting is necessary
llvm::Value* cast_slot =
builder.CreateBitCast(slot, parse_return->getType()->getPointerTo());
builder.CreateStore(parse_return, cast_slot);
} else if (slot_desc->type().type != TYPE_TIMESTAMP) {
builder.CreateStore(parse_return, slot);
}
builder.CreateRet(codegen->true_value());
// Parse failed, set slot to null and return false
builder.SetInsertPoint(parse_failed_block);
slot_desc->CodegenSetNullIndicator(codegen, &builder, args[0], codegen->true_value());
builder.CreateRet(codegen->false_value());
}
// Case where data is \N or len == 0 and it is not a string col
builder.SetInsertPoint(set_null_block);
slot_desc->CodegenSetNullIndicator(codegen, &builder, args[0], codegen->true_value());
builder.CreateRet(codegen->true_value());
if (codegen->FinalizeFunction(*fn) == NULL) {
return Status("TextConverter::CodegenWriteSlot(): codegen'd "
"WriteSlot function failed verification, see log");
}
return Status::OK();
}