blob: e65dc16ebb75bc42bd2826caccdd15ddb0acd823 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#ifndef IMPALA_EXEC_HASH_JOIN_NODE_H
#define IMPALA_EXEC_HASH_JOIN_NODE_H
#include <boost/scoped_ptr.hpp>
#include <boost/thread.hpp>
#include <string>
#include "exec/exec-node.h"
#include "exec/old-hash-table.h"
#include "exec/blocking-join-node.h"
#include "util/promise.h"
#include "gen-cpp/PlanNodes_types.h" // for TJoinOp
namespace impala {
class MemPool;
class RowBatch;
class TupleRow;
/// Node for in-memory hash joins:
/// - builds up a hash table with the rows produced by our right input
/// (child(1)); build exprs are the rhs exprs of our equi-join predicates
/// - for each row from our left input, probes the hash table to retrieve
/// matching entries; the probe exprs are the lhs exprs of our equi-join predicates
//
/// Row batches:
/// - In general, we are not able to pass our output row batch on to our left child (when
/// we're fetching the probe rows): if we have a 1xn join, our output will contain
/// multiple rows per left input row
/// - TODO: fix this, so in the case of 1x1/nx1 joins (for instance, fact to dimension tbl)
/// we don't do these extra copies
class HashJoinNode : public BlockingJoinNode {
public:
HashJoinNode(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs);
virtual Status Init(const TPlanNode& tnode, RuntimeState* state);
virtual Status Prepare(RuntimeState* state);
virtual void Codegen(RuntimeState* state);
virtual Status Open(RuntimeState* state);
virtual Status GetNext(RuntimeState* state, RowBatch* row_batch, bool* eos);
virtual Status Reset(RuntimeState* state);
virtual void Close(RuntimeState* state);
static const char* LLVM_CLASS_NAME;
protected:
virtual void AddToDebugString(int indentation_level, std::stringstream* out) const;
virtual Status ProcessBuildInput(RuntimeState* state);
private:
boost::scoped_ptr<OldHashTable> hash_tbl_;
OldHashTable::Iterator hash_tbl_iterator_;
/// holds everything referenced from build side
boost::scoped_ptr<MemPool> build_pool_;
/// our equi-join predicates "<lhs> = <rhs>" are separated into
/// build_exprs_ (over child(1)) and probe_exprs_ (over child(0))
std::vector<ExprContext*> probe_expr_ctxs_;
std::vector<ExprContext*> build_expr_ctxs_;
/// Expressions used to build runtime filters, one per entry in filters_.
std::vector<ExprContext*> filter_expr_ctxs_;
/// is_not_distinct_from_[i] is true if and only if the ith equi-join predicate is IS
/// NOT DISTINCT FROM, rather than equality.
std::vector<bool> is_not_distinct_from_;
/// non-equi-join conjuncts from the JOIN clause
std::vector<ExprContext*> other_join_conjunct_ctxs_;
/// Derived from join_op_
/// Output all rows coming from the probe input. Used in LEFT_OUTER_JOIN and
/// FULL_OUTER_JOIN.
bool match_all_probe_;
/// Match at most one build row to each probe row. Used in LEFT_SEMI_JOIN.
bool match_one_build_;
/// Output all rows coming from the build input. Used in RIGHT_OUTER_JOIN and
/// FULL_OUTER_JOIN.
bool match_all_build_;
/// llvm function for build batch
llvm::Function* codegen_process_build_batch_fn_;
/// Function declaration for codegen'd function. Signature must match
/// HashJoinNode::ProcessBuildBatch
typedef void (*ProcessBuildBatchFn)(HashJoinNode*, RowBatch*);
ProcessBuildBatchFn process_build_batch_fn_;
/// HashJoinNode::ProcessProbeBatch() exactly
typedef int (*ProcessProbeBatchFn)(HashJoinNode*, RowBatch*, RowBatch*, int);
/// Jitted ProcessProbeBatch function pointer. Null if codegen is disabled.
ProcessProbeBatchFn process_probe_batch_fn_;
/// RuntimeFilters to build.
std::vector<RuntimeFilter*> filters_;
RuntimeProfile::Counter* build_buckets_counter_; // num buckets in hash table
RuntimeProfile::Counter* hash_tbl_load_factor_counter_;
/// Prepares for the first call to GetNext(). Must be called after GetFirstProbeRow().
void InitGetNext();
/// GetNext helper function for the common join cases: Inner join, left semi and left
/// outer
Status LeftJoinGetNext(RuntimeState* state, RowBatch* row_batch, bool* eos);
/// Processes a probe batch for the common (non right-outer join) cases.
/// out_batch: the batch for resulting tuple rows
/// probe_batch: the probe batch to process. This function can be called to
/// continue processing a batch in the middle
/// max_added_rows: maximum rows that can be added to out_batch
/// return the number of rows added to out_batch
int ProcessProbeBatch(RowBatch* out_batch, RowBatch* probe_batch, int max_added_rows);
/// Construct the build hash table, adding all the rows in 'build_batch'
void ProcessBuildBatch(RowBatch* build_batch);
/// Codegen function to create output row
llvm::Function* CodegenCreateOutputRow(LlvmCodeGen* codegen);
/// Codegen processing build batches. Identical signature to ProcessBuildBatch.
/// hash_fn is the codegen'd function for computing hashes over tuple rows in the
/// hash table.
/// Returns NULL if codegen was not possible.
llvm::Function* CodegenProcessBuildBatch(LlvmCodeGen* codegen, llvm::Function* hash_fn);
/// Codegen processing probe batches. Identical signature to ProcessProbeBatch.
/// hash_fn is the codegen'd function for computing hashes over tuple rows in the
/// hash table.
/// Returns NULL if codegen was not possible.
llvm::Function* CodegenProcessProbeBatch(LlvmCodeGen* codegen, llvm::Function* hash_fn);
};
}
#endif