// be/src/codegen/llvm-codegen.cc - impala - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.

 #include "codegen/llvm-codegen.h"

 #include <fstream>
 #include <iostream>
 #include <sstream>
 #include <boost/algorithm/string.hpp>
 #include <boost/thread/mutex.hpp>
 #include <gutil/strings/substitute.h>

 #include <llvm/ADT/Triple.h>
 #include <llvm/Analysis/InstructionSimplify.h>
 #include <llvm/Analysis/Passes.h>
 #include <llvm/Analysis/TargetTransformInfo.h>
 #include <llvm/Bitcode/ReaderWriter.h>
 #include <llvm/ExecutionEngine/ExecutionEngine.h>
 #include <llvm/ExecutionEngine/MCJIT.h>
 #include <llvm/IR/Constants.h>
 #include <llvm/IR/DataLayout.h>
 #include <llvm/IR/Function.h>
 #include <llvm/IR/GlobalVariable.h>
 #include <llvm/IR/InstIterator.h>
 #include <llvm/IR/LegacyPassManager.h>
 #include <llvm/IR/NoFolder.h>
 #include <llvm/IR/Verifier.h>
 #include <llvm/Linker/Linker.h>
 #include <llvm/Support/DynamicLibrary.h>
 #include <llvm/Support/ErrorHandling.h>
 #include <llvm/Support/Host.h>
 #include <llvm/Support/TargetRegistry.h>
 #include <llvm/Support/TargetSelect.h>
 #include <llvm/Support/raw_ostream.h>
 #include <llvm/Transforms/IPO.h>
 #include <llvm/Transforms/IPO/PassManagerBuilder.h>
 #include <llvm/Transforms/Scalar.h>
 #include <llvm/Transforms/Utils/BasicBlockUtils.h>
 #include <llvm/Transforms/Utils/Cloning.h>

 #include "codegen/codegen-anyval.h"
 #include "codegen/codegen-symbol-emitter.h"
 #include "codegen/impala-ir-data.h"
 #include "codegen/instruction-counter.h"
 #include "codegen/mcjit-mem-mgr.h"
 #include "common/logging.h"
 #include "impala-ir/impala-ir-names.h"
 #include "runtime/descriptors.h"
 #include "runtime/hdfs-fs-cache.h"
 #include "runtime/lib-cache.h"
 #include "runtime/mem-pool.h"
 #include "runtime/mem-tracker.h"
 #include "runtime/string-value.h"
 #include "runtime/timestamp-value.h"
 #include "util/cpu-info.h"
 #include "util/hdfs-util.h"
 #include "util/path-builder.h"
 #include "util/runtime-profile-counters.h"
 #include "util/test-info.h"

 #include "common/names.h"

 using namespace llvm;
 using namespace strings;
 using std::fstream;
 using std::move;

 // Flag definitions used by this file. Each flag's help string describes its effect;
 // all of them are debugging/diagnostic aids for the generated code.
 DEFINE_bool(print_llvm_ir_instruction_count, false,
     "if true, prints the instruction counts of all JIT'd functions");
 DEFINE_bool(disable_optimization_passes, false,
     "if true, disables llvm optimization passes (used for testing)");
 DEFINE_bool(dump_ir, false, "if true, output IR after optimization passes");
 // Read in InitializeLlvm() and SetupJITListeners().
 DEFINE_bool(perf_map, false,
     "if true, generate /tmp/perf-<pid>.map file for linux perf symbols. "
     "This is not recommended for production use because it may affect performance.");
 DEFINE_string(unopt_module_dir, "",
     "if set, saves unoptimized generated IR modules to the specified directory.");
 DEFINE_string(opt_module_dir, "",
     "if set, saves optimized generated IR modules to the specified directory.");
 // Read in SetupJITListeners() to build the per-codegen ".asm" output path.
 DEFINE_string(asm_module_dir, "",
     "if set, saves disassembly for generated IR modules to the specified directory.");
 // Declared only; the flag is defined in another translation unit.
 DECLARE_string(local_library_dir);

 namespace impala {

 // Set to true by InitializeLlvm(); the LlvmCodeGen constructor DCHECKs it.
 bool LlvmCodeGen::llvm_initialized_ = false;

 // Host CPU name and "+feature"/"-feature" strings. Both are filled in by
 // InitializeLlvm() and handed to the EngineBuilder in Init().
 string LlvmCodeGen::cpu_name_;
 vector<string> LlvmCodeGen::cpu_attrs_;
 // Names of functions referenced by constant global variables of the module loaded in
 // InitializeLlvm(), excluding those IsDefinedInImpalad() reports as defined.
 // CreateImpalaCodegen() materializes each of them.
 unordered_set<string> LlvmCodeGen::gv_ref_ir_fns_;

 [[noreturn]] static void LlvmCodegenHandleError(
     void* user_data, const std::string& reason, bool gen_crash_diag) {
   LOG(FATAL) << "LLVM hit fatal error: " << reason.c_str();
 }

 bool LlvmCodeGen::IsDefinedInImpalad(const string& fn_name) {
   void* fn_ptr = NULL;
   Status status =
       LibCache::instance()->GetSoFunctionPtr("", fn_name, &fn_ptr, NULL, true);
   return status.ok();
 }

 // Walks the constant 'val' and inserts into 'ref_fns' the names of the functions it
 // references, skipping any function for which IsDefinedInImpalad() returns true.
 // Aliases and aggregate constants are handled by recursing into their elements.
 void LlvmCodeGen::ParseGlobalConstant(Value* val, unordered_set<string>* ref_fns) {
   // Functions referenced directly at this level; recursion adds to 'ref_fns' itself.
   vector<string> direct_refs;
   if (Function* direct_fn = dyn_cast<Function>(val)) {
     direct_refs.push_back(direct_fn->getName().str());
   } else if (const BlockAddress* block_addr = dyn_cast<BlockAddress>(val)) {
     direct_refs.push_back(block_addr->getFunction()->getName().str());
   } else if (GlobalAlias* global_alias = dyn_cast<GlobalAlias>(val)) {
     ParseGlobalConstant(global_alias->getAliasee(), ref_fns);
   } else if (const ConstantExpr* const_expr = dyn_cast<ConstantExpr>(val)) {
     // Only cast expressions are inspected for function operands.
     if (const_expr->isCast()) {
       for (User::const_op_iterator op = const_expr->op_begin();
            op != const_expr->op_end(); ++op) {
         Function* operand_fn = dyn_cast<Function>(*op);
         if (operand_fn != NULL) direct_refs.push_back(operand_fn->getName().str());
       }
     }
   } else if (isa<ConstantStruct>(val) || isa<ConstantArray>(val) ||
       isa<ConstantDataArray>(val)) {
     // Struct/array constants: visit every operand.
     const Constant* aggregate = cast<Constant>(val);
     for (unsigned idx = 0; idx < aggregate->getNumOperands(); ++idx) {
       ParseGlobalConstant(aggregate->getOperand(idx), ref_fns);
     }
   } else if (isa<ConstantVector>(val) || isa<ConstantDataVector>(val)) {
     // Vector constants: visit every element.
     const Constant* vector_const = cast<Constant>(val);
     for (unsigned idx = 0; idx < val->getType()->getVectorNumElements(); ++idx) {
       ParseGlobalConstant(vector_const->getAggregateElement(idx), ref_fns);
     }
   } else {
     // Ignore constants which cannot contain function pointers. Ignore other global
     // variables referenced by this global variable as InitializeLlvm() will parse
     // all global variables.
     DCHECK(isa<UndefValue>(val) || isa<ConstantFP>(val) || isa<ConstantInt>(val) ||
         isa<GlobalVariable>(val) || isa<ConstantTokenNone>(val) ||
         isa<ConstantPointerNull>(val) || isa<ConstantAggregateZero>(val) ||
         isa<ConstantDataSequential>(val));
   }

   // Keep only the functions that are not defined in the Impalad native binary.
   for (const string& candidate: direct_refs) {
     if (IsDefinedInImpalad(candidate)) continue;
     ref_fns->insert(candidate);
   }
 }

 // Runs ParseGlobalConstant() over the initializer of every constant global variable
 // in 'module' whose initializer has at least one operand, collecting the referenced
 // function names in 'ref_fns'.
 void LlvmCodeGen::ParseGVForFunctions(Module* module, unordered_set<string>* ref_fns) {
   for (GlobalVariable& global_var: module->globals()) {
     if (!global_var.hasInitializer() || !global_var.isConstant()) continue;
     Constant* initializer = global_var.getInitializer();
     // Initializers without operands have nothing to walk.
     if (initializer->getNumOperands() == 0) continue;
     ParseGlobalConstant(initializer, ref_fns);
   }
 }

 void LlvmCodeGen::InitializeLlvm(bool load_backend) {
   DCHECK(!llvm_initialized_);
   llvm::remove_fatal_error_handler();
   llvm::install_fatal_error_handler(LlvmCodegenHandleError);
   // These functions can *only* be called once per process and are used to set up
   // LLVM subsystems for code generation targeting the machine we're running on.
   llvm::InitializeNativeTarget();
   llvm::InitializeNativeTargetAsmPrinter();
   llvm::InitializeNativeTargetAsmParser();
   llvm::InitializeNativeTargetDisassembler();
   llvm_initialized_ = true;

   if (load_backend) {
     string path;
     // For test env, we have to load libfesupport.so to provide sym for LLVM.
     PathBuilder::GetFullBuildPath("service/libfesupport.so", &path);
     bool failed = llvm::sys::DynamicLibrary::LoadLibraryPermanently(path.c_str());
     DCHECK_EQ(failed, 0);
   }

   cpu_name_ = llvm::sys::getHostCPUName().str();
   LOG(INFO) << "CPU class for runtime code generation: " << cpu_name_;
   GetHostCPUAttrs(&cpu_attrs_);
   LOG(INFO) << "CPU flags for runtime code generation: "
             << boost::algorithm::join(cpu_attrs_, ",");

   // Write an empty map file for perf to find.
   if (FLAGS_perf_map) CodegenSymbolEmitter::WritePerfMap();

   ObjectPool init_pool;
   scoped_ptr<LlvmCodeGen> init_codegen;
   Status status = LlvmCodeGen::CreateFromMemory(&init_pool, NULL, "init", &init_codegen);
   ParseGVForFunctions(init_codegen->module_, &gv_ref_ir_fns_);

   // Validate the module by verifying that functions for all IRFunction::Type
   // can be found.
   for (int i = IRFunction::FN_START; i < IRFunction::FN_END; ++i) {
     DCHECK(FN_MAPPINGS[i].fn == i);
     const string& fn_name = FN_MAPPINGS[i].fn_name;
     DCHECK(init_codegen->module_->getFunction(fn_name) != NULL)
         << "Failed to find function " << fn_name;
   }
 }

 // Constructs a codegen object that has no module or execution engine yet (module_
 // and memory_manager_ start out NULL; Init() sets them). 'id' is stored in id_,
 // 'pool' is passed to the "CodeGen" runtime profile and 'parent_mem_tracker' becomes
 // the parent of this object's "CodeGen" MemTracker. InitializeLlvm() must have been
 // called beforehand.
 LlvmCodeGen::LlvmCodeGen(
     ObjectPool* pool, MemTracker* parent_mem_tracker, const string& id)
   : id_(id),
     profile_(pool, "CodeGen"),
     mem_tracker_(new MemTracker(&profile_, -1, "CodeGen", parent_mem_tracker)),
     optimizations_enabled_(false),
     is_corrupt_(false),
     is_compiled_(false),
     context_(new llvm::LLVMContext()),
     module_(NULL),
     memory_manager_(NULL),
     // One slot per IRFunction::Type, filled in lazily by GetFunction().
     loaded_functions_(IRFunction::FN_END, NULL) {
   DCHECK(llvm_initialized_) << "Must call LlvmCodeGen::InitializeLlvm first.";

   // Register the timers and counters reported under the "CodeGen" profile.
   load_module_timer_ = ADD_TIMER(&profile_, "LoadTime");
   prepare_module_timer_ = ADD_TIMER(&profile_, "PrepareTime");
   module_bitcode_size_ = ADD_COUNTER(&profile_, "ModuleBitcodeSize", TUnit::BYTES);
   codegen_timer_ = ADD_TIMER(&profile_, "CodegenTime");
   optimization_timer_ = ADD_TIMER(&profile_, "OptimizationTime");
   compile_timer_ = ADD_TIMER(&profile_, "CompileTime");
   num_functions_ = ADD_COUNTER(&profile_, "NumFunctions", TUnit::UNIT);
   num_instructions_ = ADD_COUNTER(&profile_, "NumInstructions", TUnit::UNIT);
 }

 // Creates a new LlvmCodeGen in '*codegen', loads the bitcode module stored in 'file'
 // and initializes the codegen object with it. '*codegen' is set even if loading or
 // initialization fails; the returned Status reports the failure.
 Status LlvmCodeGen::CreateFromFile(ObjectPool* pool, MemTracker* parent_mem_tracker,
     const string& file, const string& id, scoped_ptr<LlvmCodeGen>* codegen) {
   codegen->reset(new LlvmCodeGen(pool, parent_mem_tracker, id));
   LlvmCodeGen* new_codegen = codegen->get();
   SCOPED_TIMER(new_codegen->profile_.total_time_counter());

   unique_ptr<Module> module_from_file;
   RETURN_IF_ERROR(new_codegen->LoadModuleFromFile(file, &module_from_file));

   // Init() takes ownership of the loaded module.
   return new_codegen->Init(std::move(module_from_file));
 }

 // Creates a new LlvmCodeGen in '*codegen' from the IR embedded in the binary and
 // initializes it. The SSE4.2 flavour of the IR is used only when the CPU supports
 // SSE4.2, since that IR cannot run on a machine without it.
 Status LlvmCodeGen::CreateFromMemory(ObjectPool* pool, MemTracker* parent_mem_tracker,
     const string& id, scoped_ptr<LlvmCodeGen>* codegen) {
   codegen->reset(new LlvmCodeGen(pool, parent_mem_tracker, id));
   LlvmCodeGen* new_codegen = codegen->get();
   SCOPED_TIMER(new_codegen->profile_.total_time_counter());

   // Pick the embedded IR blob and a descriptive module name.
   StringRef embedded_ir;
   string embedded_ir_name;
   const bool has_sse42 = CpuInfo::IsSupported(CpuInfo::SSE4_2);
   if (!has_sse42) {
     embedded_ir = StringRef(reinterpret_cast<const char*>(impala_no_sse_llvm_ir),
         impala_no_sse_llvm_ir_len);
     embedded_ir_name = "Impala IR with no SSE 4.2 support";
   } else {
     embedded_ir = StringRef(reinterpret_cast<const char*>(impala_sse_llvm_ir),
         impala_sse_llvm_ir_len);
     embedded_ir_name = "Impala IR with SSE 4.2 support";
   }

   unique_ptr<MemoryBuffer> embedded_ir_buf(
       MemoryBuffer::getMemBuffer(embedded_ir, "", false));
   unique_ptr<Module> parsed_module;
   RETURN_IF_ERROR(new_codegen->LoadModuleFromMemory(std::move(embedded_ir_buf),
       embedded_ir_name, &parsed_module));
   return new_codegen->Init(std::move(parsed_module));
 }

 // Reads the bitcode file 'file' into memory and parses it into '*module' via
 // LoadModuleFromMemory(). Time spent reading the file is charged to
 // load_module_timer_. Returns an error Status if the file cannot be read or parsed.
 Status LlvmCodeGen::LoadModuleFromFile(const string& file, unique_ptr<Module>* module) {
   unique_ptr<MemoryBuffer> file_buffer;
   {
     SCOPED_TIMER(load_module_timer_);

     ErrorOr<unique_ptr<MemoryBuffer>> tmp_file_buffer = MemoryBuffer::getFile(file);
     if (!tmp_file_buffer) {
       stringstream ss;
       ss << "Could not load module " << file << ": "
          << tmp_file_buffer.getError().message();
       return Status(ss.str());
     }
     file_buffer = std::move(tmp_file_buffer.get());
   }

   // module_bitcode_size_ is deliberately not updated here: LoadModuleFromMemory()
   // already adds the size of the buffer it is given, so adding it here as well
   // would count every file-loaded module twice.
   return LoadModuleFromMemory(std::move(file_buffer), file, module);
 }

 // Lazily parses the bitcode in 'module_ir_buf' into '*module', strips its global
 // constructors/destructors and sets its identifier to 'module_name' (must be
 // non-empty). The buffer size is added to module_bitcode_size_ and the time spent is
 // charged to prepare_module_timer_. Returns an error Status if parsing fails.
 Status LlvmCodeGen::LoadModuleFromMemory(unique_ptr<MemoryBuffer> module_ir_buf,
     string module_name, unique_ptr<Module>* module) {
   DCHECK(!module_name.empty());
   SCOPED_TIMER(prepare_module_timer_);
   ErrorOr<unique_ptr<Module>> tmp_module(NULL);
   // Must read the size before the buffer is moved into the bitcode reader.
   COUNTER_ADD(module_bitcode_size_, module_ir_buf->getMemBufferRef().getBufferSize());
   tmp_module = getLazyBitcodeModule(std::move(module_ir_buf), context(), false);
   if (!tmp_module) {
     stringstream ss;
     // Report the human-readable error text, as LoadModuleFromFile() does, rather
     // than the raw "category:value" form produced by streaming a std::error_code.
     ss << "Could not parse module " << module_name << ": "
        << tmp_module.getError().message();
     return Status(ss.str());
   }

   *module = std::move(tmp_module.get());

   // We never run global constructors or destructors so let's strip them out for all
   // modules when we load them.
   StripGlobalCtorsDtors((*module).get());

   (*module)->setModuleIdentifier(module_name);
   return Status::OK();
 }

 // Loads the bitcode module stored in 'file' and links it into the main module
 // ('module_'). Does nothing if 'file' was already linked. After linking, functions
 // referenced by the new module's global variables are materialized, as are the
 // callees of functions that were materialized by the link itself.
 // Returns an error Status if loading, linking or materialization fails.
 // TODO: Create separate counters/timers (file size, load time) for each module linked
 Status LlvmCodeGen::LinkModule(const string& file) {
   // Each file is linked at most once.
   if (linked_modules_.find(file) != linked_modules_.end()) return Status::OK();

   SCOPED_TIMER(profile_.total_time_counter());
   unique_ptr<Module> new_module;
   RETURN_IF_ERROR(LoadModuleFromFile(file, &new_module));

   // The module data layout must match the one selected by the execution engine.
   new_module->setDataLayout(execution_engine_->getDataLayout());

   // Record all IR functions in 'new_module' referenced by the module's global variables
   // if they are not defined in the Impalad native code. They must be materialized to
   // avoid linking error.
   unordered_set<string> ref_fns;
   ParseGVForFunctions(new_module.get(), &ref_fns);

   // Record all the materializable functions in the new module before linking.
   // Linking the new module to the main module (i.e. 'module_') may materialize
   // functions in the new module. These materialized functions need to be parsed
   // to materialize any functions they call in 'module_'.
   unordered_set<string> materializable_fns;
   for (Function& fn: new_module->functions()) {
     if (fn.isMaterializable()) materializable_fns.insert(fn.getName().str());
   }

   // 'new_module' is consumed by the linker; only names recorded above are used below.
   bool error = Linker::linkModules(*module_, std::move(new_module));
   if (error) {
     stringstream ss;
     ss << "Problem linking " << file << " to main module.";
     return Status(ss.str());
   }
   linked_modules_.insert(file);

   for (const string& fn_name: ref_fns) {
     Function* fn = module_->getFunction(fn_name);
     // The global variable from source module which references 'fn' can have private
     // linkage and it may not be linked into 'module_'.
     if (fn != NULL && fn->isMaterializable()) {
       RETURN_IF_ERROR(MaterializeFunction(fn));
       // MaterializeFunction() already handled the callees of 'fn', so the loop
       // below does not need to visit it again.
       materializable_fns.erase(fn->getName().str());
     }
   }
   // Parse functions in the source module materialized during linking and materialize
   // their callees. Do it after linking so LLVM has "merged" functions defined in both
   // modules. LLVM may not link in functions (and their callees) from source module if
   // they're defined in destination module already.
   for (const string& fn_name: materializable_fns) {
     Function* fn = module_->getFunction(fn_name);
     if (fn != NULL && !fn->isMaterializable()) RETURN_IF_ERROR(MaterializeCallees(fn));
   }
   return Status::OK();
 }

 // Erases the "llvm.global_ctors" and "llvm.global_dtors" global variables from
 // 'module', if present.
 void LlvmCodeGen::StripGlobalCtorsDtors(llvm::Module* module) {
   static const char* const kCtorDtorArrays[] = {"llvm.global_ctors", "llvm.global_dtors"};
   for (const char* array_name : kCtorDtorArrays) {
     GlobalVariable* ctor_dtor_array = module->getGlobalVariable(array_name);
     if (ctor_dtor_array == NULL) continue;
     ctor_dtor_array->eraseFromParent();
   }
 }

 // Creates a codegen object from the embedded Impala IR module and prepares it for
 // use: looks up the LLVM struct types for StringValue and TimestampValue, verifies
 // that the LLVM layout of StringValue matches sizeof(StringValue), and materializes
 // every function recorded in gv_ref_ir_fns_. The object is returned in 'codegen_ret'.
 // Returns an error Status if module creation, the size check or materialization fails.
 Status LlvmCodeGen::CreateImpalaCodegen(ObjectPool* pool, MemTracker* parent_mem_tracker,
     const string& id, scoped_ptr<LlvmCodeGen>* codegen_ret) {
   RETURN_IF_ERROR(CreateFromMemory(pool, parent_mem_tracker, id, codegen_ret));
   LlvmCodeGen* codegen = codegen_ret->get();

   // Parse module for cross compiled functions and types
   SCOPED_TIMER(codegen->profile_.total_time_counter());
   SCOPED_TIMER(codegen->prepare_module_timer_);

   // Get type for StringValue
   codegen->string_val_type_ = codegen->GetType(StringValue::LLVM_CLASS_NAME);

   // Get type for TimestampValue
   codegen->timestamp_val_type_ = codegen->GetType(TimestampValue::LLVM_CLASS_NAME);

   // Verify size is correct
   const DataLayout& data_layout = codegen->execution_engine()->getDataLayout();
   const StructLayout* layout =
       data_layout.getStructLayout(static_cast<StructType*>(codegen->string_val_type_));
   if (layout->getSizeInBytes() != sizeof(StringValue)) {
     DCHECK_EQ(layout->getSizeInBytes(), sizeof(StringValue));
     return Status("Could not create llvm struct type for StringVal");
   }

   // Materialize functions implicitly referenced by the global variables.
   for (const string& fn_name : gv_ref_ir_fns_) {
     Function* fn = codegen->module_->getFunction(fn_name);
     DCHECK(fn != NULL);
     // Propagate materialization failures instead of silently dropping them.
     RETURN_IF_ERROR(codegen->MaterializeFunction(fn));
   }
   return Status::OK();
 }

 // Takes ownership of 'module' (must be non-NULL), creates the JIT execution engine
 // for it, and sets up the commonly used types/constants, the JIT listeners and the
 // intrinsics. Returns an error Status if the execution engine cannot be created or
 // LoadIntrinsics() fails.
 Status LlvmCodeGen::Init(unique_ptr<Module> module) {
   DCHECK(module != NULL);

   llvm::CodeGenOpt::Level opt_level = CodeGenOpt::Aggressive;
 #ifndef NDEBUG
   // For debug builds, don't generate JIT compiled optimized assembly.
   // This takes a non-negligible amount of time (~.5 ms per function) and
   // blows up the fe tests (which take ~10-20 ms each).
   opt_level = CodeGenOpt::None;
 #endif
   // Keep a raw pointer for later use; ownership moves into the EngineBuilder.
   module_ = module.get();
   EngineBuilder builder(std::move(module));
   builder.setEngineKind(EngineKind::JIT);
   builder.setOptLevel(opt_level);
   unique_ptr<ImpalaMCJITMemoryManager> memory_manager(new ImpalaMCJITMemoryManager);
   // Same pattern as module_: keep a raw pointer, hand ownership to the builder.
   memory_manager_ = memory_manager.get();
   builder.setMCJITMemoryManager(move(memory_manager));
   // Target the host CPU and features recorded by InitializeLlvm().
   builder.setMCPU(cpu_name_);
   builder.setMAttrs(cpu_attrs_);
   builder.setErrorStr(&error_string_);

   execution_engine_.reset(builder.create());
   if (execution_engine_ == NULL) {
     module_ = NULL; // module_ was owned by builder.
     stringstream ss;
     ss << "Could not create ExecutionEngine: " << error_string_;
     return Status(ss.str());
   }

   // The module data layout must match the one selected by the execution engine.
   module_->setDataLayout(execution_engine_->getDataLayout());

   // Cache frequently used types and the i1 true/false constants.
   void_type_ = Type::getVoidTy(context());
   ptr_type_ = PointerType::get(GetType(TYPE_TINYINT), 0);
   true_value_ = ConstantInt::get(context(), APInt(1, true, true));
   false_value_ = ConstantInt::get(context(), APInt(1, false, true));

   SetupJITListeners();

   RETURN_IF_ERROR(LoadIntrinsics());

   return Status::OK();
 }

 void LlvmCodeGen::SetupJITListeners() {
   bool need_symbol_emitter = !FLAGS_asm_module_dir.empty() || FLAGS_perf_map;
   if (!need_symbol_emitter) return;
   symbol_emitter_.reset(new CodegenSymbolEmitter(id_));
   execution_engine_->RegisterJITEventListener(symbol_emitter_.get());
   symbol_emitter_->set_emit_perf_map(FLAGS_perf_map);

   if (!FLAGS_asm_module_dir.empty()) {
     symbol_emitter_->set_asm_path(Substitute("$0/$1.asm", FLAGS_asm_module_dir, id_));
   }
 }

 // Releases the memory accounted to the "CodeGen" MemTracker, detaches the tracker
 // from its parent, and then tears down the execution engine followed by the symbol
 // emitter.
 LlvmCodeGen::~LlvmCodeGen() {
   // memory_manager_ is only non-NULL if Init() ran; release whatever it tracked.
   if (memory_manager_ != NULL) mem_tracker_->Release(memory_manager_->bytes_tracked());
   if (mem_tracker_->parent() != NULL) mem_tracker_->UnregisterFromParent();
   mem_tracker_.reset();

   // Execution engine executes callback on event listener, so tear down engine first.
   execution_engine_.reset();
   symbol_emitter_.reset();
 }

 // Records in optimizations_enabled_ whether optimizations should be run; this
 // function only stores the flag.
 void LlvmCodeGen::EnableOptimizations(bool enable) {
   optimizations_enabled_ = enable;
 }

 // Appends one string per host CPU feature reported by LLVM to 'attrs', in the
 // ["+feature1", "-feature2"] form that LLVM's ExecutionEngine expects: '+' marks an
 // enabled feature and '-' a disabled one.
 void LlvmCodeGen::GetHostCPUAttrs(vector<string>* attrs) {
   StringMap<bool> host_features;
   llvm::sys::getHostCPUFeatures(host_features);
   for (const StringMapEntry<bool>& feature: host_features) {
     const char* prefix = feature.second ? "+" : "-";
     attrs->emplace_back(Substitute("$0$1", prefix, feature.first().data()));
   }
 }

 // Returns the textual IR of the whole module if 'full_module' is true, otherwise
 // only the IR of the functions recorded in codegend_functions_.
 string LlvmCodeGen::GetIR(bool full_module) const {
   string str;
   raw_string_ostream stream(str);
   if (full_module) {
     module_->print(stream, NULL);
   } else {
     for (Function* fn : codegend_functions_) fn->print(stream, true);
   }
   // raw_string_ostream may buffer its output; flush explicitly so that 'str' is
   // complete before it is returned, rather than relying on the stream's destructor
   // running while 'str' is still the returned object (i.e. on NRVO).
   stream.flush();
   return str;
 }

 // Maps an Impala column type to the LLVM type used to represent it. Returns NULL
 // (after a DCHECK failure in debug builds) for TYPE_CHAR and for unknown types.
 Type* LlvmCodeGen::GetType(const ColumnType& type) {
   switch (type.type) {
     // NULL and BOOLEAN are both represented as a single bit.
     case TYPE_NULL:
     case TYPE_BOOLEAN:
       return Type::getInt1Ty(context());
     // Fixed-width integers.
     case TYPE_TINYINT:
       return Type::getInt8Ty(context());
     case TYPE_SMALLINT:
       return Type::getInt16Ty(context());
     case TYPE_INT:
       return Type::getInt32Ty(context());
     case TYPE_BIGINT:
       return Type::getInt64Ty(context());
     // Floating point.
     case TYPE_FLOAT:
       return Type::getFloatTy(context());
     case TYPE_DOUBLE:
       return Type::getDoubleTy(context());
     // Struct types looked up from the module.
     case TYPE_STRING:
     case TYPE_VARCHAR:
       return string_val_type_;
     case TYPE_TIMESTAMP:
       return timestamp_val_type_;
     // Decimals are integers as wide as the decimal's byte size.
     case TYPE_DECIMAL:
       return Type::getIntNTy(context(), type.GetByteSize() * 8);
     case TYPE_CHAR:
       // IMPALA-3207: Codegen for CHAR is not yet implemented, this should not
       // be called for TYPE_CHAR.
       DCHECK(false) << "NYI";
       return NULL;
     default:
       DCHECK(false) << "Invalid type: " << type;
       return NULL;
   }
 }

 // Returns the pointer type (address space 0) to the LLVM type for 'type'.
 PointerType* LlvmCodeGen::GetPtrType(const ColumnType& type) {
   Type* pointee_type = GetType(type);
   return PointerType::get(pointee_type, 0);
 }

 // Looks up the type called 'name' in the module. The result is DCHECKed to be
 // non-NULL; in release builds NULL is returned if the type does not exist.
 Type* LlvmCodeGen::GetType(const string& name) {
   Type* named_type = module_->getTypeByName(name);
   DCHECK(named_type != NULL) << name;
   return named_type;
 }

 // Returns the pointer type (address space 0) to the module type called 'name'.
 PointerType* LlvmCodeGen::GetPtrType(const string& name) {
   Type* pointee_type = GetType(name);
   DCHECK(pointee_type != NULL) << name;
   return PointerType::get(pointee_type, 0);
 }

 // Returns the pointer type to 'type' in address space 0.
 PointerType* LlvmCodeGen::GetPtrType(Type* type) {
   return PointerType::get(type, 0);
 }

 // Returns the pointer-to-pointer type for 'type' (both levels in address space 0).
 PointerType* LlvmCodeGen::GetPtrPtrType(Type* type) {
   PointerType* single_ptr_type = PointerType::get(type, 0);
   return PointerType::get(single_ptr_type, 0);
 }

 // LLVM has no direct way to build a pointer constant from a native pointer, so the
 // address is first turned into a 64-bit integer constant and that constant is then
 // converted to 'type' with an inttoptr constant expression.
 Value* LlvmCodeGen::CastPtrToLlvmPtr(Type* type, const void* ptr) {
   const int64_t address = reinterpret_cast<int64_t>(ptr);
   Constant* address_const = ConstantInt::get(Type::getInt64Ty(context()), address);
   return ConstantExpr::getIntToPtr(address_const, type);
 }

 // Returns an integer constant holding 'val' whose bit width matches the integer
 // type 'type' (8/16/32/64 bits for TINYINT/SMALLINT/INT/BIGINT). Any other type is a
 // DCHECK failure and yields NULL.
 Constant* LlvmCodeGen::GetIntConstant(PrimitiveType type, uint64_t val) {
   unsigned num_bits;
   switch (type) {
     case TYPE_TINYINT:
       num_bits = 8;
       break;
     case TYPE_SMALLINT:
       num_bits = 16;
       break;
     case TYPE_INT:
       num_bits = 32;
       break;
     case TYPE_BIGINT:
       num_bits = 64;
       break;
     default:
       DCHECK(false);
       return NULL;
   }
   return ConstantInt::get(context(), APInt(num_bits, val));
 }

 // Returns an integer constant that is 'num_bytes' wide (a power of two between 1 and
 // 16), built from the 64-bit words 'low_bits' and 'high_bits'.
 Constant* LlvmCodeGen::GetIntConstant(int num_bytes, uint64_t low_bits, uint64_t high_bits) {
   DCHECK_GE(num_bytes, 1);
   DCHECK_LE(num_bytes, 16);
   DCHECK(BitUtil::IsPowerOf2(num_bytes));
   // Words are given to APInt least-significant first.
   vector<uint64_t> words;
   words.push_back(low_bits);
   words.push_back(high_bits);
   const APInt value(8 * num_bytes, words);
   return ConstantInt::get(context(), value);
 }

 // Adds a constant, privately linked global to the module that holds the 'len' bytes
 // at 'data' (no terminating NUL is appended) and returns a pointer to its first
 // byte, created through 'builder'.
 Value* LlvmCodeGen::GetStringConstant(LlvmBuilder* builder, char* data, int len) {
   const StringRef contents(data, len);
   Constant* string_init = ConstantDataArray::getString(context(), contents, false);
   // Create a global string with private linkage.
   GlobalVariable* string_gv = new GlobalVariable(*module_, string_init->getType(),
       true, GlobalValue::PrivateLinkage, string_init);
   // Get a pointer to the first element of the string.
   return builder->CreateConstInBoundsGEP2_32(NULL, string_gv, 0, 0, "");
 }

 // Inserts an alloca for 'var' at the start of the entry block of 'f' and returns it.
 AllocaInst* LlvmCodeGen::CreateEntryBlockAlloca(Function* f, const NamedVariable& var) {
   BasicBlock& entry_block = f->getEntryBlock();
   IRBuilder<> entry_builder(&entry_block, entry_block.begin());
   AllocaInst* alloca_inst =
       entry_builder.CreateAlloca(var.type, NULL, var.name.c_str());
   // Generated functions may manipulate DecimalVal arguments via SIMD instructions such
   // as 'movaps' that require 16-byte memory alignment. LLVM uses 8-byte alignment by
   // default, so explicitly set the alignment for DecimalVals.
   const bool is_decimal_val = var.type == GetType(CodegenAnyVal::LLVM_DECIMALVAL_NAME);
   if (is_decimal_val) alloca_inst->setAlignment(16);
   return alloca_inst;
 }

 // Convenience overload: allocates a variable of 'type' called 'name' in the entry
 // block of the function that 'builder' is currently inserting into.
 AllocaInst* LlvmCodeGen::CreateEntryBlockAlloca(
     const LlvmBuilder& builder, Type* type, const char* name) {
   Function* enclosing_fn = builder.GetInsertBlock()->getParent();
   return CreateEntryBlockAlloca(enclosing_fn, NamedVariable(name, type));
 }

 // Allocates an array of 'num_entries' elements of 'type' with the given 'alignment'
 // at the start of the entry block of the function that 'builder' is currently
 // inserting into. The alloca is called 'name'.
 AllocaInst* LlvmCodeGen::CreateEntryBlockAlloca(const LlvmBuilder& builder, Type* type,
     int num_entries, int alignment, const char* name) {
   Function* enclosing_fn = builder.GetInsertBlock()->getParent();
   BasicBlock& entry_block = enclosing_fn->getEntryBlock();
   IRBuilder<> entry_builder(&entry_block, entry_block.begin());
   Constant* array_size = GetIntConstant(TYPE_INT, num_entries);
   AllocaInst* alloca_inst = entry_builder.CreateAlloca(type, array_size, name);
   alloca_inst->setAlignment(alignment);
   return alloca_inst;
 }

 // Creates two basic blocks in 'fn', named 'if_name' and 'else_name', and returns
 // them in '*if_block' and '*else_block'. 'insert_before' is forwarded to
 // BasicBlock::Create() for both blocks. No branch instructions are emitted here.
 void LlvmCodeGen::CreateIfElseBlocks(Function* fn, const string& if_name,
     const string& else_name, BasicBlock** if_block, BasicBlock** else_block,
     BasicBlock* insert_before) {
   *if_block = BasicBlock::Create(context(), if_name, fn, insert_before);
   *else_block = BasicBlock::Create(context(), else_name, fn, insert_before);
 }

 // Materializes every function that 'fn' calls directly through a call or invoke
 // instruction (and, via MaterializeFunctionHelper(), their callees in turn).
 // Indirect calls, which have no statically known callee, are skipped.
 // Returns the first materialization error encountered, if any.
 Status LlvmCodeGen::MaterializeCallees(Function* fn) {
   for (inst_iterator iter = inst_begin(fn); iter != inst_end(fn); ++iter) {
     Instruction* instr = &*iter;
     Function* called_fn = NULL;
     // Use LLVM's checked casts rather than reinterpret_cast for the downcasts.
     if (CallInst* call_instr = dyn_cast<CallInst>(instr)) {
       called_fn = call_instr->getCalledFunction();
     } else if (InvokeInst* invoke_instr = dyn_cast<InvokeInst>(instr)) {
       called_fn = invoke_instr->getCalledFunction();
     }
     if (called_fn != NULL) RETURN_IF_ERROR(MaterializeFunctionHelper(called_fn));
   }
   return Status::OK();
 }

 // Materializes 'fn' and, recursively, the functions it calls. Intrinsics and
 // functions that are not materializable are left untouched. Must not be called once
 // the module has been compiled.
 Status LlvmCodeGen::MaterializeFunctionHelper(Function *fn) {
   DCHECK(!is_compiled_);
   const bool needs_materializing = !fn->isIntrinsic() && fn->isMaterializable();
   if (!needs_materializing) return Status::OK();

   const std::error_code materialize_err = module_->materialize(fn);
   if (UNLIKELY(materialize_err)) {
     return Status(Substitute("Failed to materialize $0: $1",
         fn->getName().str(), materialize_err.message()));
   }

   // Materialized functions are marked as not materializable by LLVM.
   DCHECK(!fn->isMaterializable());
   return MaterializeCallees(fn);
 }

 // Timed entry point for materialization: charges the time to the profile's total
 // time counter and to prepare_module_timer_, then delegates to
 // MaterializeFunctionHelper().
 Status LlvmCodeGen::MaterializeFunction(Function *fn) {
   SCOPED_TIMER(profile_.total_time_counter());
   SCOPED_TIMER(prepare_module_timer_);
   return MaterializeFunctionHelper(fn);
 }

 // Returns the module function called 'symbol', materializing it first. If 'clone'
 // is true a clone of the function is returned instead of the function itself.
 // Returns NULL, after logging the reason, if the function cannot be found or
 // materialized.
 Function* LlvmCodeGen::GetFunction(const string& symbol, bool clone) {
   Function* fn = module_->getFunction(symbol.c_str());
   if (fn == NULL) {
     LOG(ERROR) << "Unable to locate function " << symbol;
     return NULL;
   }
   Status status = MaterializeFunction(fn);
   if (UNLIKELY(!status.ok())) {
     // Callers only see NULL, so make sure the cause is not lost.
     LOG(ERROR) << "Unable to materialize function " << symbol << ": "
                << status.GetDetail();
     return NULL;
   }
   if (clone) return CloneFunction(fn);
   return fn;
 }

 // Returns the cross-compiled function for 'ir_type', materializing it first. The
 // lookup result is cached in loaded_functions_; on first lookup the function is
 // marked AlwaysInline unless it is already NoInline. If 'clone' is true a clone is
 // returned instead of the function itself.
 // Returns NULL, after logging the reason, if the function cannot be found or
 // materialized.
 Function* LlvmCodeGen::GetFunction(IRFunction::Type ir_type, bool clone) {
   Function* fn = loaded_functions_[ir_type];
   if (fn == NULL) {
     DCHECK(FN_MAPPINGS[ir_type].fn == ir_type);
     const string& fn_name = FN_MAPPINGS[ir_type].fn_name;
     fn = module_->getFunction(fn_name);
     if (fn == NULL) {
       LOG(ERROR) << "Unable to locate function " << fn_name;
       return NULL;
     }
     // Mixing "NoInline" with "AlwaysInline" will lead to compilation failure.
     if (!fn->hasFnAttribute(Attribute::NoInline)) fn->addFnAttr(Attribute::AlwaysInline);
     loaded_functions_[ir_type] = fn;
   }
   Status status = MaterializeFunction(fn);
   if (UNLIKELY(!status.ok())) {
     // Callers only see NULL, so make sure the cause is not lost.
     LOG(ERROR) << "Unable to materialize function " << fn->getName().str() << ": "
                << status.GetDetail();
     return NULL;
   }
   if (clone) return CloneFunction(fn);
   return fn;
 }

 // Checks that 'fn' is well formed: it must contain no remaining calls to
 // Expr::GetConstant*(), every basic block must end in a terminator, and it must pass
 // LLVM's function verifier. On failure the problem is logged, is_corrupt_ is set
 // (which makes every later call return false immediately) and false is returned.
 // TODO: this should return a Status
 bool LlvmCodeGen::VerifyFunction(Function* fn) {
   if (is_corrupt_) return false;

   // Check that there are no calls to Expr::GetConstant(). These should all have been
   // inlined via Expr::InlineConstants().
   for (inst_iterator iter = inst_begin(fn); iter != inst_end(fn); ++iter) {
     CallInst* call_instr = dyn_cast<CallInst>(&*iter);
     if (call_instr == NULL) continue;
     Function* called_fn = call_instr->getCalledFunction();
     // look for call to Expr::GetConstant()
     if (called_fn != NULL &&
         called_fn->getName().find(Expr::GET_CONSTANT_INT_SYMBOL_PREFIX) != string::npos) {
       LOG(ERROR) << "Found call to Expr::GetConstant*(): " << Print(call_instr);
       is_corrupt_ = true;
       break;
     }
   }

   // There is an llvm bug (#10957) that causes the first step of the verifier to always
   // abort the process if it runs into an issue and ignores ReturnStatusAction.  This
   // would cause impalad to go down if one query has a problem.  To work around this, we
   // will copy that step here and not abort on error. Adapted from the pre-verifier
   // function pass.
   // TODO: doesn't seem there is much traction in getting this fixed but we'll see
   for (Function::iterator i = fn->begin(), e = fn->end(); i != e; ++i) {
     if (i->empty() || !i->back().isTerminator()) {
       LOG(ERROR) << "Basic block must end with terminator: \n" << Print(&(*i));
       is_corrupt_ = true;
       break;
     }
   }

   if (!is_corrupt_) {
     string str;
     raw_string_ostream stream(str);
     is_corrupt_ = verifyFunction(*fn, &stream);
     // raw_string_ostream may buffer its output; flush before reading 'str' so the
     // verifier's diagnostics are not lost or truncated.
     stream.flush();
     if (is_corrupt_) LOG(ERROR) << str;
   }

   if (is_corrupt_) {
     string fn_name = fn->getName().str();
     LOG(ERROR) << "Function corrupt: " << fn_name;
     fn->dump();
     return false;
   }
   return true;
 }

 // Marks 'function' as NoInline. AlwaysInline is removed first; as noted in
 // GetFunction(), mixing the two attributes leads to compilation failure.
 void LlvmCodeGen::SetNoInline(llvm::Function* function) const {
   function->removeFnAttr(llvm::Attribute::AlwaysInline);
   function->addFnAttr(llvm::Attribute::NoInline);
 }

 // Starts the description of a new function called 'name' returning 'ret_type' that
 // will be generated into the module of 'codegen'. Must not be used once 'codegen'
 // has been compiled.
 LlvmCodeGen::FnPrototype::FnPrototype(
     LlvmCodeGen* codegen, const string& name, Type* ret_type)
   : codegen_(codegen), name_(name), ret_type_(ret_type) {
   DCHECK(!codegen_->is_compiled_) << "Not valid to add additional functions";
 }

 // Creates the externally linked function described by this prototype in the codegen
 // module and returns it.
 //  - 'builder': if non-NULL, an "entry" block is created and the builder's insert
 //    point is set to it.
 //  - 'params': if non-NULL, receives the function's argument values, in order. It
 //    must have room for one entry per argument.
 //  - 'print_ir': if true, the function is recorded in codegend_functions_.
 Function* LlvmCodeGen::FnPrototype::GeneratePrototype(
     LlvmBuilder* builder, Value** params, bool print_ir) {
   vector<Type*> arg_types;
   for (const auto& arg : args_) arg_types.push_back(arg.type);
   FunctionType* fn_type = FunctionType::get(ret_type_, arg_types, false);

   Function* fn = Function::Create(
       fn_type, GlobalValue::ExternalLinkage, name_, codegen_->module_);
   DCHECK(fn != NULL);

   // Name the arguments
   int arg_idx = 0;
   Function::arg_iterator arg_iter = fn->arg_begin();
   while (arg_iter != fn->arg_end()) {
     arg_iter->setName(args_[arg_idx].name);
     if (params != NULL) params[arg_idx] = &*arg_iter;
     ++arg_iter;
     ++arg_idx;
   }

   if (builder != NULL) {
     builder->SetInsertPoint(BasicBlock::Create(codegen_->context(), "entry", fn));
   }

   if (print_ir) codegen_->codegend_functions_.push_back(fn);
   return fn;
 }

 // Redirects every call instruction in 'caller' whose callee name contains
 // 'target_name' so that it calls 'new_fn' instead. Returns the number of call sites
 // replaced. 'caller' must belong to this codegen's module, which must not have been
 // compiled yet.
 int LlvmCodeGen::ReplaceCallSites(Function* caller, Function* new_fn,
     const string& target_name) {
   DCHECK(!is_compiled_);
   // Check for NULL before 'caller' is dereferenced by the getParent() check.
   DCHECK(caller != NULL);
   DCHECK(new_fn != NULL);
   DCHECK(caller->getParent() == module_);

   vector<CallInst*> call_sites;
   FindCallSites(caller, target_name, &call_sites);
   int replaced = 0;
   for (CallInst* call_instr: call_sites) {
     // Replace the called function
     call_instr->setCalledFunction(new_fn);
     ++replaced;
   }
   return replaced;
 }

 // Replaces all uses of the result of every call instruction in 'caller' whose
 // callee name contains 'target_name' with 'replacement'. The call instructions
 // themselves are left in place. Returns the number of call sites processed.
 // 'caller' must belong to this codegen's module, which must not have been compiled
 // yet.
 int LlvmCodeGen::ReplaceCallSitesWithValue(Function* caller, Value* replacement,
     const string& target_name) {
   DCHECK(!is_compiled_);
   // Check for NULL before 'caller' is dereferenced by the getParent() check.
   DCHECK(caller != NULL);
   DCHECK(replacement != NULL);
   DCHECK(caller->getParent() == module_);

   vector<CallInst*> call_sites;
   FindCallSites(caller, target_name, &call_sites);
   int replaced = 0;
   for (CallInst* call_instr: call_sites) {
     call_instr->replaceAllUsesWith(replacement);
     ++replaced;
   }
   return replaced;
 }

 // Convenience wrapper around ReplaceCallSitesWithValue() that substitutes the i1
 // constant corresponding to 'constant' for the result of every matching call in
 // 'caller'. Returns the number of call sites replaced.
 int LlvmCodeGen::ReplaceCallSitesWithBoolConst(llvm::Function* caller, bool constant,
     const string& target_name) {
   Type* bool_type = Type::getInt1Ty(context());
   Value* bool_const = ConstantInt::get(bool_type, constant);
   return ReplaceCallSitesWithValue(caller, bool_const, target_name);
 }

 // Appends to 'results' every plain call instruction in 'caller' whose directly-called
 // function has a name containing 'target_name' as a substring. Invoke instructions and
 // calls without a known callee (getCalledFunction() returns NULL) are skipped.
 // 'results' is not cleared first.
 void LlvmCodeGen::FindCallSites(Function* caller, const string& target_name,
       vector<CallInst*>* results) {
   for (inst_iterator iter = inst_begin(caller); iter != inst_end(caller); ++iter) {
     // Look for call instructions. Note that we'll ignore invoke and other related
     // instructions that are not a plain function call. dyn_cast performs the type
     // check and the downcast in one step and yields NULL for anything else.
     CallInst* call_instr = llvm::dyn_cast<CallInst>(&*iter);
     if (call_instr == NULL) continue;

     Function* callee = call_instr->getCalledFunction();
     // Check for substring match.
     if (callee != NULL && callee->getName().find(target_name) != string::npos) {
       results->push_back(call_instr);
     }
   }
 }

 // Returns a copy of 'fn', with the same attributes, that has been added to this
 // object's module. Must be called before the module is compiled.
 Function* LlvmCodeGen::CloneFunction(Function* fn) {
   DCHECK(!is_compiled_);
   // Only already-materialized functions may be cloned. Callers are expected to obtain
   // the Function object through GetFunction().
   DCHECK(!fn->isMaterializable());

   // The value map is required by the LLVM API; its contents are not used afterwards.
   ValueToValueMapTy unused_vmap;
   // llvm::CloneFunction() automatically gives the copy a unique name.
   Function* copy = llvm::CloneFunction(fn, unused_vmap, false);
   copy->copyAttributesFrom(fn);
   module_->getFunctionList().push_back(copy);
   return copy;
 }

 // Marks 'function' as AlwaysInline unless it already carries the NoInline attribute,
 // then verifies it. On success the function is returned (and dumped if
 // FLAGS_dump_ir is set). On verification failure the function is erased from its
 // module and NULL is returned.
 Function* LlvmCodeGen::FinalizeFunction(Function* function) {
   const bool no_inline = function->hasFnAttribute(llvm::Attribute::NoInline);
   if (LIKELY(!no_inline)) function->addFnAttr(llvm::Attribute::AlwaysInline);

   if (VerifyFunction(function)) {
     if (FLAGS_dump_ir) function->dump();
     return function;
   }

   // Verification failed: delete the function so it cannot be used.
   function->eraseFromParent();
   return NULL;
 }

 // Materializes everything in 'module'. Returns OK on success, otherwise an error
 // status that contains the module name and the LLVM error message.
 Status LlvmCodeGen::MaterializeModule(Module* module) {
   const std::error_code err = module->materializeAll();
   if (LIKELY(!err)) return Status::OK();

   return Status(Substitute("Failed to complete materialization of module $0: $1",
       module->getName().str(), err.message()));
 }

 // Turns every still-unmaterialized function in the module into a plain external
 // declaration and then finishes materialization of the module.
 // It's okay to call this function even if the module has been materialized.
 Status LlvmCodeGen::FinalizeLazyMaterialization() {
   SCOPED_TIMER(prepare_module_timer_);
   for (Function& fn: module_->functions()) {
     if (!fn.isMaterializable()) continue;
     DCHECK(!module_->isMaterialized());

     // Declarations of unmaterialized functions may remain in the module. LLVM asserts
     // that such functions have external / external weak linkage.
     fn.setLinkage(Function::ExternalLinkage);

     // These functions are never used, so drop their personality functions and
     // comdats: DCE may otherwise claim a deleted personality function is still
     // referenced, and LLVM may complain about comdats that reference functions whose
     // definitions cannot be found.
     fn.setPersonalityFn(NULL);
     fn.setComdat(NULL);
     fn.setIsMaterializable(false);
   }

   // Nothing unused is materializable any more, so materializeAll() should be quick.
   // It still has to be called in order to destroy the materializer so that DCE will
   // not assert fail.
   return MaterializeModule(module_);
 }

 // Optimizes (if enabled) and JIT-compiles the module, then stores the address of each
 // function registered in 'fns_to_jit_compile_' through its associated pointer. The
 // module is destroyed afterwards and the memory used by the compiled code is charged
 // to 'mem_tracker_'. May be called only once.
 // Returns an error if the module is corrupt, if materialization or optimization
 // fails, or if the memory for the compiled code cannot be tracked.
 Status LlvmCodeGen::FinalizeModule() {
   DCHECK(!is_compiled_);
   is_compiled_ = true;

   // Writes the module's current IR to '<dir>/<id_><suffix>'. Failing to open the file
   // is logged and otherwise ignored.
   auto save_ir = [this](const string& dir, const char* suffix) {
     const string path = dir + "/" + id_ + suffix;
     fstream out(path.c_str(), fstream::out | fstream::trunc);
     if (out.fail()) {
       LOG(ERROR) << "Could not save IR to: " << path;
       return;
     }
     out << GetIR(true);
     out.close();
   };

   if (FLAGS_unopt_module_dir.size() != 0) save_ir(FLAGS_unopt_module_dir, "_unopt.ll");

   if (is_corrupt_) return Status("Module is corrupt.");
   SCOPED_TIMER(profile_.total_time_counter());

   // Don't waste time optimizing module if there are no functions to JIT. This can happen
   // if the codegen object is created but no functions are successfully codegen'd.
   if (fns_to_jit_compile_.empty()) {
     DestroyModule();
     return Status::OK();
   }

   RETURN_IF_ERROR(FinalizeLazyMaterialization());
   const bool run_optimizer = optimizations_enabled_ && !FLAGS_disable_optimization_passes;
   if (run_optimizer) RETURN_IF_ERROR(OptimizeModule());

   if (FLAGS_opt_module_dir.size() != 0) save_ir(FLAGS_opt_module_dir, "_opt.ll");

   {
     SCOPED_TIMER(compile_timer_);
     // Finalizing the object compiles all functions in the module.
     execution_engine_->finalizeObject();
   }

   // Hand the address of each compiled function back through its registered pointer.
   for (const auto& jit_entry : fns_to_jit_compile_) {
     Function* function = jit_entry.first;
     void* jitted_function = execution_engine_->getPointerToFunction(function);
     DCHECK(jitted_function != NULL) << "Failed to jit " << function->getName().data();
     *jit_entry.second = jitted_function;
   }

   DestroyModule();

   // Track the memory consumed by the compiled code.
   const int64_t bytes_allocated = memory_manager_->bytes_allocated();
   if (!mem_tracker_->TryConsume(bytes_allocated)) {
     const string& msg = Substitute(
         "Failed to allocate '$0' bytes for compiled code module", bytes_allocated);
     return mem_tracker_->MemLimitExceeded(NULL, msg, bytes_allocated);
   }
   memory_manager_->set_bytes_tracked(bytes_allocated);
   return Status::OK();
 }

 // Runs LLVM optimization passes over the module in three stages: (1) internalize
 // everything that is not registered to be JIT'd and remove dead globals, (2) run the
 // function-level passes on every function definition, (3) run the module-level
 // passes. An estimate of the optimizer's memory use is reserved from 'mem_tracker_'
 // for stages 2 and 3 and released at the end. Returns an error if that reservation
 // fails.
 Status LlvmCodeGen::OptimizeModule() {
   SCOPED_TIMER(optimization_timer_);

   // The builder constructs the optimization passes that are "typical" for c/c++
   // programs, i.e. we rely on llvm to pick the best passes for us.
   // TODO: we can likely muck with this to get better compile speeds or write
   // our own passes.  Our subexpression elimination optimization can be rolled into
   // a pass.
   PassManagerBuilder pass_builder;
   // OptLevel 2 maps to -O2.
   // TODO: should we switch to 3? (3 may not produce different IR than 2 while taking
   // longer, but we should check)
   pass_builder.OptLevel = 2;
   // SizeLevel 0: don't optimize for code size (this corresponds to -O2/-O3).
   pass_builder.SizeLevel = 0;
   pass_builder.Inliner = createFunctionInliningPass();

   // Optimisation passes need the TargetIRAnalysis pass to learn about the target
   // machine, e.g. for the cost model.
   TargetIRAnalysis target_analysis =
       execution_engine_->getTargetMachine()->getTargetIRAnalysis();

   // Stage 1: internalize + global DCE. The internalize pass is given the names of all
   // functions registered by AddFunctionToJit(); every other function is marked as
   // internal, and internal functions that end up unused are deleted by the DCE pass.
   // Removing the unused code first greatly decreases compile time.
   vector<const char*> exported_fn_names;
   exported_fn_names.reserve(fns_to_jit_compile_.size());
   for (const auto& jit_entry : fns_to_jit_compile_) {
     exported_fn_names.push_back(jit_entry.first->getName().data());
   }
   unique_ptr<legacy::PassManager> module_pass_manager(new legacy::PassManager());
   module_pass_manager->add(createTargetTransformInfoWrapperPass(target_analysis));
   module_pass_manager->add(createInternalizePass(exported_fn_names));
   module_pass_manager->add(createGlobalDCEPass());
   module_pass_manager->run(*module_);

   // Update the counters after unused functions are gone but before the main
   // optimization, as a rough measure of how much work optimization and compilation
   // must do.
   InstructionCounter counter;
   counter.visit(*module_);
   COUNTER_SET(num_functions_, counter.GetCount(InstructionCounter::TOTAL_FUNCTIONS));
   COUNTER_SET(num_instructions_, counter.GetCount(InstructionCounter::TOTAL_INSTS));

   int64_t estimated_memory = ESTIMATED_OPTIMIZER_BYTES_PER_INST
       * counter.GetCount(InstructionCounter::TOTAL_INSTS);
   if (!mem_tracker_->TryConsume(estimated_memory)) {
     const string& msg = Substitute(
         "Codegen failed to reserve '$0' bytes for optimization", estimated_memory);
     return mem_tracker_->MemLimitExceeded(NULL, msg, estimated_memory);
   }

   // Stage 2: function-level passes, run on function definitions only.
   unique_ptr<legacy::FunctionPassManager> fn_pass_manager(
       new legacy::FunctionPassManager(module_));
   fn_pass_manager->add(createTargetTransformInfoWrapperPass(target_analysis));
   pass_builder.populateFunctionPassManager(*fn_pass_manager);
   fn_pass_manager->doInitialization();
   for (Function& fn : module_->functions()) {
     if (fn.isDeclaration()) continue;
     fn_pass_manager->run(fn);
   }
   fn_pass_manager->doFinalization();

   // Stage 3: module-level passes, using a fresh pass manager.
   module_pass_manager.reset(new legacy::PassManager());
   module_pass_manager->add(createTargetTransformInfoWrapperPass(target_analysis));
   pass_builder.populateModulePassManager(*module_pass_manager);
   module_pass_manager->run(*module_);

   if (FLAGS_print_llvm_ir_instruction_count) {
     // Log per-function instruction counts for every function that will be JIT'd.
     for (const auto& jit_entry : fns_to_jit_compile_) {
       InstructionCounter fn_counter;
       fn_counter.visit(*jit_entry.first);
       VLOG(1) << jit_entry.first->getName().str();
       VLOG(1) << fn_counter.PrintCounters();
     }
   }

   mem_tracker_->Release(estimated_memory);
   return Status::OK();
 }

 // Drops this object's references to the module and everything in it: clears the
 // containers listed below, removes the module from the execution engine and resets
 // 'module_' to NULL. The value returned by removeModule() is ignored.
 void LlvmCodeGen::DestroyModule() {
   // Clear all references to LLVM objects owned by the module.
   loaded_functions_.clear();
   codegend_functions_.clear();
   registered_exprs_map_.clear();
   registered_exprs_.clear();
   llvm_intrinsics_.clear();
   hash_fns_.clear();
   fns_to_jit_compile_.clear();
   execution_engine_->removeModule(module_);
   module_ = NULL;
 }

 // Registers 'fn' to be JIT-compiled; its address is stored through 'fn_ptr' when the
 // module is finalized. If 'fn' returns a DecimalVal, an ABI-compliant wrapper named
 // "<fn name>ABIWrapper" is generated and registered in its place.
 void LlvmCodeGen::AddFunctionToJit(Function* fn, void** fn_ptr) {
   Type* decimal_val_type = GetType(CodegenAnyVal::LLVM_DECIMALVAL_NAME);
   if (fn->getReturnType() == decimal_val_type) {
     // Per the x86 calling convention ABI, DecimalVals should be returned via an extra
     // first DecimalVal* argument. We generate non-compliant functions that return the
     // DecimalVal directly, which we can call from generated code, but not from compiled
     // native code.  To avoid accidentally calling a non-compliant function from native
     // code, call 'function' from an ABI-compliant wrapper.
     const string wrapper_name = fn->getName().str() + "ABIWrapper";
     LlvmCodeGen::FnPrototype prototype(this, wrapper_name, void_type_);
     // Add return argument
     prototype.AddArgument(NamedVariable("result", decimal_val_type->getPointerTo()));
     // Add regular arguments
     for (Function::arg_iterator arg = fn->arg_begin(); arg != fn->arg_end(); ++arg) {
       prototype.AddArgument(NamedVariable(arg->getName(), arg->getType()));
     }
     LlvmBuilder builder(context());
     // Slot 0 receives the result pointer and slots 1..N the forwarded arguments. A
     // vector is used instead of a (non-standard) variable-length array.
     const size_t num_fn_args = fn->arg_size();
     vector<Value*> args(num_fn_args + 1);
     Function* fn_wrapper = prototype.GeneratePrototype(&builder, args.data());
     fn_wrapper->addFnAttr(llvm::Attribute::AlwaysInline);
     // Mark first argument as sret (not sure if this is necessary but it can't hurt)
     fn_wrapper->addAttribute(1, Attribute::StructRet);
     // Call 'fn' and store the result in the result argument
     Value* result = builder.CreateCall(
         fn, ArrayRef<Value*>(args.data() + 1, num_fn_args), "result");
     builder.CreateStore(result, args[0]);
     builder.CreateRetVoid();
     fn = FinalizeFunction(fn_wrapper);
     DCHECK(fn != NULL);
   }

   AddFunctionToJitInternal(fn, fn_ptr);
 }

 // Queues 'fn' for JIT compilation together with the location that will receive its
 // address. Must be called before the module is compiled.
 void LlvmCodeGen::AddFunctionToJitInternal(Function* fn, void** fn_ptr) {
   DCHECK(!is_compiled_);
   fns_to_jit_compile_.emplace_back(fn, fn_ptr);
 }

 // Emits a call to printf() at the builder's current insert point. The format string
 // is 'str' prefixed with "LLVM Trace: "; if 'v1' is non-NULL it is passed as the
 // single additional printf argument. Intended for temporary debugging only: an error
 // is logged every time this is called as a reminder to remove the trace.
 void LlvmCodeGen::CodegenDebugTrace(LlvmBuilder* builder, const char* str,
     Value* v1) {
   LOG(ERROR) << "Remove IR codegen debug traces before checking in.";

   // There is no guarantee that 'str' is still around when the debug printf executes,
   // so keep a copy in memory owned by this object and point at that instead.
   debug_strings_.push_back(Substitute("LLVM Trace: $0", str));
   const char* owned_str = debug_strings_.back().c_str();

   Function* printf = module_->getFunction("printf");
   DCHECK(printf != NULL);

   // The format string is passed to printf as a constant pointer value.
   vector<Value*> printf_args;
   printf_args.push_back(CastPtrToLlvmPtr(ptr_type_, const_cast<char*>(owned_str)));
   if (v1 != NULL) printf_args.push_back(v1);
   builder->CreateCall(printf, printf_args);
 }

 // Inserts into 'symbols' the name of every function in the module that is still
 // materializable. Existing contents of 'symbols' are kept.
 void LlvmCodeGen::GetSymbols(unordered_set<string>* symbols) {
   for (const Function& fn: module_->functions()) {
     if (!fn.isMaterializable()) continue;
     symbols->insert(fn.getName());
   }
 }

 // Generates a two-argument function "Min" or "Max" (depending on 'min') over values
 // of 'type' and returns it, or NULL if the generated function fails verification.
 // Booleans are computed directly as v1 && v2 (min) or v1 || v2 (max); every other
 // supported type compares the two arguments and returns one of them.
 // TODO: cache this function (e.g. all min(int, int) are identical).
 // we probably want some more global IR function cache, or, implement this
 // in c and precompile it with clang.
 // define i32 @Min(i32 %v1, i32 %v2) {
 // entry:
 //   %0 = icmp slt i32 %v1, %v2
 //   br i1 %0, label %ret_v1, label %ret_v2
 //
 // ret_v1:                                           ; preds = %entry
 //   ret i32 %v1
 //
 // ret_v2:                                           ; preds = %entry
 //   ret i32 %v2
 // }
 Function* LlvmCodeGen::CodegenMinMax(const ColumnType& type, bool min) {
   LlvmCodeGen::FnPrototype prototype(this, min ? "Min" : "Max", GetType(type));
   prototype.AddArgument(LlvmCodeGen::NamedVariable("v1", GetType(type)));
   prototype.AddArgument(LlvmCodeGen::NamedVariable("v2", GetType(type)));

   Value* params[2];
   LlvmBuilder builder(context());
   Function* fn = prototype.GeneratePrototype(&builder, &params[0]);
   Value* const v1 = params[0];
   Value* const v2 = params[1];

   // For booleans 'compare' is the result itself; for all other types it is the
   // condition that selects v1 over v2.
   Value* compare = NULL;
   switch (type.type) {
     case TYPE_NULL:
       compare = false_value();
       break;
     case TYPE_BOOLEAN:
       // min is x && y, max is x || y.
       compare = min ? builder.CreateAnd(v1, v2) : builder.CreateOr(v1, v2);
       break;
     case TYPE_TINYINT:
     case TYPE_SMALLINT:
     case TYPE_INT:
     case TYPE_BIGINT:
     case TYPE_DECIMAL:
       // Signed integer comparison.
       compare = min ? builder.CreateICmpSLT(v1, v2) : builder.CreateICmpSGT(v1, v2);
       break;
     case TYPE_FLOAT:
     case TYPE_DOUBLE:
       // Unordered floating point comparison.
       compare = min ? builder.CreateFCmpULT(v1, v2) : builder.CreateFCmpUGT(v1, v2);
       break;
     default:
       DCHECK(false);
   }

   if (type.type == TYPE_BOOLEAN) {
     builder.CreateRet(compare);
     return FinalizeFunction(fn);
   }

   BasicBlock* ret_v1 = NULL;
   BasicBlock* ret_v2 = NULL;
   CreateIfElseBlocks(fn, "ret_v1", "ret_v2", &ret_v1, &ret_v2);
   builder.CreateCondBr(compare, ret_v1, ret_v2);

   builder.SetInsertPoint(ret_v1);
   builder.CreateRet(v1);

   builder.SetInsertPoint(ret_v2);
   builder.CreateRet(v2);

   return FinalizeFunction(fn);
 }

 // Looks up the LLVM intrinsics used by codegen and records them in
 // 'llvm_intrinsics_', keyed by intrinsic id. Intrinsics are loaded one by one.  Some
 // are overloaded (e.g. memcpy) and the types must be specified.
 // Returns an error naming the first intrinsic that could not be found.
 // TODO: is there a better way to do this?
 Status LlvmCodeGen::LoadIntrinsics() {
   // memcpy is overloaded, so its type parameters have to be spelled out.
   Type* memcpy_types[] = { ptr_type(), ptr_type(), GetType(TYPE_INT) };
   Function* memcpy_fn =
       Intrinsic::getDeclaration(module_, Intrinsic::memcpy, memcpy_types);
   if (memcpy_fn == NULL) return Status("Could not find memcpy intrinsic.");
   llvm_intrinsics_[Intrinsic::memcpy] = memcpy_fn;

   // The remaining intrinsics are not overloaded and are looked up by id alone.
   // TODO: where is the best place to put this?
   struct {
     Intrinsic::ID id;
     const char* error;
   } non_overloaded_intrinsics[] = {
     { Intrinsic::x86_sse42_crc32_32_8, "sse4.2 crc32_u8" },
     { Intrinsic::x86_sse42_crc32_32_16, "sse4.2 crc32_u16" },
     { Intrinsic::x86_sse42_crc32_32_32, "sse4.2 crc32_u32" },
     { Intrinsic::x86_sse42_crc32_64_64, "sse4.2 crc32_u64" },
   };

   for (const auto& intrinsic : non_overloaded_intrinsics) {
     Function* fn = Intrinsic::getDeclaration(module_, intrinsic.id);
     if (fn == NULL) {
       return Status(string("Could not find ") + intrinsic.error + " intrinsic");
     }
     llvm_intrinsics_[intrinsic.id] = fn;
   }

   return Status::OK();
 }

 // Emits a memcpy of a compile-time-constant number of bytes from 'src' to 'dst'.
 // 'size' must be non-negative; nothing is emitted when it is zero.
 void LlvmCodeGen::CodegenMemcpy(LlvmBuilder* builder, Value* dst, Value* src, int size) {
   DCHECK_GE(size, 0);
   if (size != 0) {
     // Delegate to the overload that takes the size as an IR value.
     Value* size_const = GetIntConstant(TYPE_BIGINT, size);
     CodegenMemcpy(builder, dst, src, size_const);
   }
 }

 // Emits a memcpy of 'size' bytes from 'src' to 'dst' at the builder's current insert
 // point, where 'size' is an IR value. Debug builds assert that 'dst' and 'src' are
 // pointer-typed. The alignment argument is passed as 0, i.e. none is specified.
 void LlvmCodeGen::CodegenMemcpy(LlvmBuilder* builder, Value* dst, Value* src,
     Value* size) {
   DCHECK(dst->getType()->isPointerTy()) << Print(dst);
   DCHECK(src->getType()->isPointerTy()) << Print(src);
   builder->CreateMemCpy(dst, src, size, /* no alignment */ 0);
 }

 // Emits a memset that fills 'size' bytes at 'dst' with 'value'. 'dst' must be
 // pointer-typed and 'size' non-negative; nothing is emitted when 'size' is zero.
 void LlvmCodeGen::CodegenMemset(LlvmBuilder* builder, Value* dst, int value, int size) {
   DCHECK(dst->getType()->isPointerTy()) << Print(dst);
   DCHECK_GE(size, 0);
   if (size != 0) {
     // The fill value is passed to the memset as a TYPE_TINYINT constant.
     Value* fill_byte = GetIntConstant(TYPE_TINYINT, value);
     builder->CreateMemSet(dst, fill_byte, size, /* no alignment */ 0);
   }
 }

 // Emits code that zeroes the null-indicator bytes of the tuple at 'tuple_ptr'. The
 // location and number of those bytes are taken from 'tuple_desc'.
 void LlvmCodeGen::CodegenClearNullBits(LlvmBuilder* builder, Value* tuple_ptr,
     const TupleDescriptor& tuple_desc) {
   // View the tuple as raw bytes so the null bytes can be addressed by byte offset.
   Value* tuple_bytes = builder->CreateBitCast(tuple_ptr, ptr_type(), "int8_ptr");
   Value* offset = ConstantInt::get(int_type(), tuple_desc.null_bytes_offset());
   Value* null_bytes =
       builder->CreateInBoundsGEP(tuple_bytes, offset, "null_bytes_ptr");
   CodegenMemset(builder, null_bytes, 0, tuple_desc.num_null_bytes());
 }

 // Emits a call to the IR function IRFunction::MEMPOOL_ALLOCATE on 'pool', passing
 // 'size' and MemPool::DEFAULT_ALIGNMENT, and returns the value produced by the call.
 // 'size' must be an integer of at most 64 bits; narrower values are sign-extended to
 // i64 first. 'name' is the name given to the call's result.
 Value* LlvmCodeGen::CodegenAllocate(LlvmBuilder* builder, MemPool* pool, Value* size,
     const char* name) {
   DCHECK(pool != NULL);
   DCHECK(size->getType()->isIntegerTy());
   DCHECK_LE(size->getType()->getIntegerBitWidth(), 64);

   // Extend 'size' to i64 if necessary
   const bool needs_extension = size->getType()->getIntegerBitWidth() < 64;
   if (needs_extension) size = builder->CreateSExt(size, bigint_type());

   Function* allocate_fn = GetFunction(IRFunction::MEMPOOL_ALLOCATE, false);
   // The pool's address is embedded in the generated code as a constant pointer.
   Value* pool_arg = CastPtrToLlvmPtr(GetPtrType(MemPool::LLVM_CLASS_NAME), pool);
   Value* alignment_arg = GetIntConstant(TYPE_INT, MemPool::DEFAULT_ALIGNMENT);
   Value* call_args[] = {pool_arg, size, alignment_arg};
   return builder->CreateCall(allocate_fn, call_args, name);
 }

 // Emits a load of the element at constant index 'idx' of 'array' and returns the
 // loaded value, which is named 'name'. 'array' must be of pointer or array type.
 Value* LlvmCodeGen::CodegenArrayAt(LlvmBuilder* builder, Value* array, int idx,
     const char* name) {
   Type* array_type = array->getType();
   DCHECK(array_type->isPointerTy() || array_type->isArrayTy())
       << Print(array->getType());
   Value* element_ptr = builder->CreateConstGEP1_32(array, idx);
   return builder->CreateLoad(element_ptr, name);
 }

 // Empties the cache of generated hash functions ('hash_fns_'). The functions
 // themselves are not removed from the module by this call.
 void LlvmCodeGen::ClearHashFns() {
   hash_fns_.clear();
 }

 // Returns a function that hashes 'num_bytes' bytes. If SSE4.2 is not supported the
 // murmur hash function is returned. Otherwise, for num_bytes == -1 (variable length)
 // the generic loop-based CRC hash function is returned, and for any other length a
 // CRC hash function specialized for exactly that many bytes is generated and cached
 // in 'hash_fns_' under 'num_bytes'. Returns NULL if the generated function fails
 // verification.
 //
 // Codegen to compute hash for a particular byte size.  Loops are unrolled in this
 // process.  For the case where num_bytes == 11, we'd do this by calling
 //   1. crc64 (for first 8 bytes)
 //   2. crc16 (for bytes 9, 10)
 //   3. crc8 (for byte 11)
 // The resulting IR looks like:
 // define i32 @CrcHash11(i8* %data, i32 %len, i32 %seed) {
 // entry:
 //   %0 = zext i32 %seed to i64
 //   %1 = bitcast i8* %data to i64*
 //   %2 = getelementptr i64* %1, i32 0
 //   %3 = load i64* %2
 //   %4 = call i64 @llvm.x86.sse42.crc32.64.64(i64 %0, i64 %3)
 //   %5 = trunc i64 %4 to i32
 //   %6 = getelementptr i8* %data, i32 8
 //   %7 = bitcast i8* %6 to i16*
 //   %8 = load i16* %7
 //   %9 = call i32 @llvm.x86.sse42.crc32.32.16(i32 %5, i16 %8)
 //   %10 = getelementptr i8* %6, i32 2
 //   %11 = load i8* %10
 //   %12 = call i32 @llvm.x86.sse42.crc32.32.8(i32 %9, i8 %11)
 //   ret i32 %12
 // }
 // After the CRC steps, the upper and lower 16 bits of the result are swapped before
 // it is returned.
 Function* LlvmCodeGen::GetHashFunction(int num_bytes) {
   if (!CpuInfo::IsSupported(CpuInfo::SSE4_2)) return GetMurmurHashFunction(num_bytes);

   if (num_bytes == -1) {
     // -1 indicates variable length, just return the generic loop based
     // hash fn.
     return GetFunction(IRFunction::HASH_CRC, false);
   }

   map<int, Function*>::iterator cached_fn = hash_fns_.find(num_bytes);
   if (cached_fn != hash_fns_.end()) {
     return cached_fn->second;
   }

   // Generate a function to hash these bytes
   stringstream ss;
   ss << "CrcHash" << num_bytes;
   FnPrototype prototype(this, ss.str(), GetType(TYPE_INT));
   prototype.AddArgument(LlvmCodeGen::NamedVariable("data", ptr_type()));
   prototype.AddArgument(LlvmCodeGen::NamedVariable("len", GetType(TYPE_INT)));
   prototype.AddArgument(LlvmCodeGen::NamedVariable("seed", GetType(TYPE_INT)));

   Value* args[3];
   LlvmBuilder builder(context());
   Function* fn = prototype.GeneratePrototype(&builder, &args[0]);
   Value* data = args[0];
   Value* result = args[2];

   Function* crc8_fn = llvm_intrinsics_[Intrinsic::x86_sse42_crc32_32_8];
   Function* crc16_fn = llvm_intrinsics_[Intrinsic::x86_sse42_crc32_32_16];
   Function* crc32_fn = llvm_intrinsics_[Intrinsic::x86_sse42_crc32_32_32];
   Function* crc64_fn = llvm_intrinsics_[Intrinsic::x86_sse42_crc32_64_64];

   // Number of bytes that still have to be hashed. This is tracked separately from
   // 'num_bytes', which must stay intact because it is the cache key used below.
   int remaining = num_bytes;

   // Generate the crc instructions starting with the highest number of bytes
   if (remaining >= 8) {
     Value* result_64 = builder.CreateZExt(result, GetType(TYPE_BIGINT));
     Value* ptr = builder.CreateBitCast(data, GetPtrType(TYPE_BIGINT));
     int i = 0;
     while (remaining >= 8) {
       Value* index[] = {GetIntConstant(TYPE_INT, i++)};
       Value* d = builder.CreateLoad(builder.CreateInBoundsGEP(ptr, index));
       result_64 = builder.CreateCall(crc64_fn, ArrayRef<Value*>({result_64, d}));
       remaining -= 8;
     }
     result = builder.CreateTrunc(result_64, GetType(TYPE_INT));
     Value* index[] = {GetIntConstant(TYPE_INT, i * 8)};
     // Update data to past the 8-byte chunks
     data = builder.CreateInBoundsGEP(data, index);
   }

   if (remaining >= 4) {
     DCHECK_LT(remaining, 8);
     Value* ptr = builder.CreateBitCast(data, GetPtrType(TYPE_INT));
     Value* d = builder.CreateLoad(ptr);
     result = builder.CreateCall(crc32_fn, ArrayRef<Value*>({result, d}));
     Value* index[] = {GetIntConstant(TYPE_INT, 4)};
     data = builder.CreateInBoundsGEP(data, index);
     remaining -= 4;
   }

   if (remaining >= 2) {
     DCHECK_LT(remaining, 4);
     Value* ptr = builder.CreateBitCast(data, GetPtrType(TYPE_SMALLINT));
     Value* d = builder.CreateLoad(ptr);
     result = builder.CreateCall(crc16_fn, ArrayRef<Value*>({result, d}));
     Value* index[] = {GetIntConstant(TYPE_INT, 2)};
     data = builder.CreateInBoundsGEP(data, index);
     remaining -= 2;
   }

   if (remaining > 0) {
     DCHECK_EQ(remaining, 1);
     Value* d = builder.CreateLoad(data);
     result = builder.CreateCall(crc8_fn, ArrayRef<Value*>({result, d}));
     --remaining;
   }
   DCHECK_EQ(remaining, 0);

   // Swap the upper and lower 16 bits of the result.
   Value* shift_16 = GetIntConstant(TYPE_INT, 16);
   Value* upper_bits = builder.CreateShl(result, shift_16);
   Value* lower_bits = builder.CreateLShr(result, shift_16);
   result = builder.CreateOr(upper_bits, lower_bits);
   builder.CreateRet(result);

   fn = FinalizeFunction(fn);
   if (fn != NULL) {
     // Cache under the requested length so later requests for it hit the cache.
     hash_fns_[num_bytes] = fn;
   }
   return fn;
 }

 // Returns the finalized IR hash function 'f'. When 'len' is -1 the function is
 // finalized as loaded; otherwise a clone is made in which every use of the second
 // argument (index 1) is replaced by the constant 'len', and that clone is finalized
 // and returned instead.
 static Function* GetLenOptimizedHashFn(
     LlvmCodeGen* codegen, IRFunction::Type f, int len) {
   Function* hash_fn = codegen->GetFunction(f, false);
   DCHECK(hash_fn != NULL);
   if (len == -1) return codegen->FinalizeFunction(hash_fn);

   // The function is about to be modified, so work on a private copy of it.
   Function* specialized_fn = codegen->CloneFunction(hash_fn);
   Value* len_const = codegen->GetIntConstant(TYPE_INT, len);
   codegen->GetArgument(specialized_fn, 1)->replaceAllUsesWith(len_const);
   return codegen->FinalizeFunction(specialized_fn);
 }

 // Returns the IR function IRFunction::HASH_FNV via GetLenOptimizedHashFn(). Unless
 // 'len' is -1, the returned function is a clone with its length argument replaced by
 // the constant 'len'.
 Function* LlvmCodeGen::GetFnvHashFunction(int len) {
   return GetLenOptimizedHashFn(this, IRFunction::HASH_FNV, len);
 }

 // Returns the IR function IRFunction::HASH_MURMUR via GetLenOptimizedHashFn(). Unless
 // 'len' is -1, the returned function is a clone with its length argument replaced by
 // the constant 'len'.
 Function* LlvmCodeGen::GetMurmurHashFunction(int len) {
   return GetLenOptimizedHashFn(this, IRFunction::HASH_MURMUR, len);
 }

 // Thin wrapper around llvm::ReplaceInstWithValue(): hands it the instruction list of
 // the basic block containing 'from', an iterator positioned at 'from', and 'to'.
 void LlvmCodeGen::ReplaceInstWithValue(Instruction* from, Value* to) {
   BasicBlock* parent_block = from->getParent();
   BasicBlock::iterator position(from);
   llvm::ReplaceInstWithValue(parent_block->getInstList(), position, to);
 }

 // Returns the 'i'-th formal argument (0-based) of 'fn'. 'i' must be in
 // [0, fn->arg_size()); debug builds assert this, because advancing to or past
 // arg_end() and dereferencing would be invalid.
 Argument* LlvmCodeGen::GetArgument(Function* fn, int i) {
   DCHECK_GE(i, 0);
   DCHECK_LT(static_cast<size_t>(i), fn->arg_size());
   Function::arg_iterator iter = fn->arg_begin();
   for (int j = 0; j < i; ++j) ++iter;
   return &*iter;
 }

 // Spills 'v' to memory: obtains a slot of v's type named 'name' from
 // CreateEntryBlockAlloca(), emits a store of 'v' into it at the builder's current
 // insert point, and returns the slot.
 Value* LlvmCodeGen::GetPtrTo(LlvmBuilder* builder, Value* v, const char* name) {
   Value* slot = CreateEntryBlockAlloca(*builder, v->getType(), name);
   builder->CreateStore(v, slot);
   return slot;
 }

 // Creates a constant, privately-linked global variable of 'type' called 'name' in the
 // module, initialized with 'ir_constant', and returns a constant getelementptr
 // expression on that global with the single index 0.
 Constant* LlvmCodeGen::ConstantToGVPtr(Type* type, Constant* ir_constant,
     const string& name) {
   // The global is created inside 'module_', so it is not deleted here.
   const bool is_constant = true;
   GlobalVariable* global = new GlobalVariable(*module_, type, is_constant,
       GlobalValue::PrivateLinkage, ir_constant, name);
   return ConstantExpr::getGetElementPtr(NULL, global,
       ArrayRef<Constant*>({GetIntConstant(TYPE_INT, 0)}));
 }

 }

 namespace boost {

 /// Handler for exceptions in cross-compiled functions.
 /// When boost is configured with BOOST_NO_EXCEPTIONS, it calls this handler instead of
 /// throwing the exception.
 /// Exceptions cannot be handled in codegen'd code, so this logs the exception's
 /// what() message at FATAL severity. The function is declared [[noreturn]].
 [[noreturn]] void throw_exception(std::exception const& e) {
   LOG(FATAL) << "Cannot handle exceptions in codegen'd code " << e.what();
 }

 }