blob: b883ed1a5f792f7d74b07fd882f1d20ea028d04b [file] [log] [blame]
/**
* Copyright 2011-2015 Quickstep Technologies LLC.
* Copyright 2015-2016 Pivotal Software, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
**/
#ifndef QUICKSTEP_RELATIONAL_OPERATORS_AGGREGATION_OPERATION_HPP_
#define QUICKSTEP_RELATIONAL_OPERATORS_AGGREGATION_OPERATION_HPP_
#include <cstddef>
#include <memory>
#include <vector>
#include "catalog/CatalogTypedefs.hpp"
#include "expressions/aggregation/AggregationHandle.hpp"
#include "expressions/aggregation/AggregationID.hpp"
#include "expressions/predicate/Predicate.hpp"
#include "expressions/scalar/Scalar.hpp"
#include "storage/AggregationOperationState.pb.h"
#include "storage/HashTableBase.hpp"
#include "storage/StorageBlockInfo.hpp"
#include "utility/Macros.hpp"
namespace quickstep {
class AggregateFunction;
class CatalogDatabaseLite;
class CatalogRelationSchema;
class InsertDestination;
class StorageManager;
/** \addtogroup Storage
* @{
*/
/**
* @brief Helper class for maintaining state during aggregation operation.
* If a GROUP BY list was provided, this class maintains a hash table
* for each aggregate computed where the key is the GROUP BY expression
* values and payload is each group's corresponding running aggregation
* state. Without GROUP BY, this class maintains a single aggregation
* state for each aggregate computed.
* @note See also AggregationHandle, which encapsulates logic for actually
* computing aggregates, and AggregateFunction, which represents a
* particular SQL aggregate in the abstract sense.
*
* This class represents the common state for an instance of
* AggregationOperator, and also encapsulates the high-level logic for
* aggregating over blocks and generating final results.
* AggregationWorkOrder::execute() is mainly just a call to aggregateBlock(),
* while FinalizeAggregationWorkOrder::execute() is mainly just a call to
* finalizeAggregate().
**/
class AggregationOperationState {
public:
/**
* @brief Constructor for aggregation operation state.
* @note The order of some of the parameters to this constructor (or the
* corresponding fields when reconstructing from a protobuf) determines
* the schema of tuples written out by finalizeAggregate(). If group_by
* is nonempty, the first attribute(s) will be the group-by values, in
* order. Following that will be the values for each aggregate
* specified by aggregate_functions (with arguments specified by
* attributes), in order.
*
* @param input_relation Input relation on which the aggregates are computed.
* @param aggregate_functions A list of the aggregate functions to be
* computed.
* @param arguments For each entry in aggregate_functions, a corresponding
* list of argument expressions to that aggregate. This is moved-from,
* with AggregationOperationState taking ownership.
* @param is_distinct For each entry in aggregate_functions, whether DISTINCT
* should be applied to the entry's arguments.
* @param group_by A list of expressions to compute the GROUP BY values. If
* empty, no grouping is used. This is moved-from, with
* AggregationOperationState taking ownership.
* @param predicate The predicate to be applied prior to aggregation. nullptr
* indicates no predicate to be applied. This object takes ownership
* of predicate.
* @param estimated_num_entries Estimated of number of entries in the hash
* table. A good estimate would be a fraction of total number of tuples
* in the input relation.
* @param hash_table_impl_type The HashTable implementation to use for
* GROUP BY. Ignored if group_by is empty.
* @param storage_manager The StorageManager to use for allocating hash
* tables. Single aggregation state (when GROUP BY list is not
* specified) is not allocated using memory from storage manager.
*/
AggregationOperationState(const CatalogRelationSchema &input_relation,
const std::vector<const AggregateFunction*> &aggregate_functions,
std::vector<std::vector<std::unique_ptr<const Scalar>>> &&arguments,
std::vector<bool> &&is_distinct,
std::vector<std::unique_ptr<const Scalar>> &&group_by,
const Predicate *predicate,
const std::size_t estimated_num_entries,
const HashTableImplType hash_table_impl_type,
StorageManager *storage_manager);
~AggregationOperationState() {}
/**
* @brief Generate the aggregation operation state from the serialized
* Protocol Buffer representation.
* @note The order of some repeated fields in the proto representation
* determines the schema of tuples written out by finalizeAggregate().
* See the note for the constructor for details.
*
* @param proto A serialized Protocol Buffer representation of an
* AggregationOperationState, originally generated by the optimizer.
* @param database The Database to resolve relation and attribute references
* in.
* @param storage_manager The StorageManager to use.
**/
static AggregationOperationState* ReconstructFromProto(
const serialization::AggregationOperationState &proto,
const CatalogDatabaseLite &database,
StorageManager *storage_manager);
/**
* @brief Check whether a serialization::AggregationOperationState is
* fully-formed and all parts are valid.
*
* @param proto A serialized Protocol Buffer representation of an
* AggregationOperationState, originally generated by the optimizer.
* @param database The Database to resolve relation and attribute references
* in.
* @return Whether proto is fully-formed and valid.
**/
static bool ProtoIsValid(const serialization::AggregationOperationState &proto,
const CatalogDatabaseLite &database);
/**
* @brief Compute aggregates on the tuples of the given storage block,
* updating the running state maintained by this
* AggregationOperationState.
*
* @param input_block The block ID of the storage block where the aggreates
* are going to be computed.
**/
void aggregateBlock(const block_id input_block);
/**
* @brief Generate the final results for the aggregates managed by this
* AggregationOperationState and write them out to StorageBlock(s).
*
* @param output_destination An InsertDestination where the finalized output
* tuple(s) from this aggregate are to be written.
**/
void finalizeAggregate(InsertDestination *output_destination);
private:
// Merge locally (per storage block) aggregated states with global aggregation
// states.
void mergeSingleState(const std::vector<std::unique_ptr<AggregationState>> &local_state);
// Aggregate on input block.
void aggregateBlockSingleState(const block_id input_block);
void aggregateBlockHashTable(const block_id input_block);
void finalizeSingleState(InsertDestination *output_destination);
void finalizeHashTable(InsertDestination *output_destination);
// Common state for all aggregates in this operation: the input relation, the
// filter predicate (if any), and the list of GROUP BY expressions (if any).
const CatalogRelationSchema &input_relation_;
std::unique_ptr<const Predicate> predicate_;
std::vector<std::unique_ptr<const Scalar>> group_by_list_;
// Each individual aggregate in this operation has an AggregationHandle and
// some number of Scalar arguments.
std::vector<std::unique_ptr<AggregationHandle>> handles_;
std::vector<std::vector<std::unique_ptr<const Scalar>>> arguments_;
// For each aggregate, whether DISTINCT should be applied to the aggregate's
// arguments.
std::vector<bool> is_distinct_;
// Hash table for obtaining distinct (i.e. unique) arguments.
std::vector<std::unique_ptr<AggregationStateHashTableBase>> distinctify_hashtables_;
#ifdef QUICKSTEP_ENABLE_VECTOR_COPY_ELISION_SELECTION
// If all an aggregate's argument expressions are simply attributes in
// 'input_relation_', then this caches the attribute IDs of those arguments.
std::vector<std::vector<attribute_id>> arguments_as_attributes_;
#endif
// Per-aggregate global states for aggregation without GROUP BY.
std::vector<std::unique_ptr<AggregationState>> single_states_;
// Per-aggregate HashTables for aggregation with GROUP BY.
//
// TODO(shoban): We should ideally store the aggregation state together in one
// hash table to prevent multiple lookups.
std::vector<std::unique_ptr<AggregationStateHashTableBase>> group_by_hashtables_;
StorageManager *storage_manager_;
DISALLOW_COPY_AND_ASSIGN(AggregationOperationState);
};
/** @} */
} // namespace quickstep
#endif // QUICKSTEP_RELATIONAL_OPERATORS_AGGREGATION_OPERATION_HPP_