blob: f05f81038d95fcb71b0823bcdcf7ed6bfb99ce30 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#ifndef KUDU_TABLET_COMPACTION_POLICY_H
#define KUDU_TABLET_COMPACTION_POLICY_H
#include <cstddef>
#include <cstdint>
#include <string>
#include <unordered_set>
#include <vector>
#include "kudu/gutil/macros.h"
#include "kudu/util/status.h"
namespace kudu {
namespace tablet {
class RowSet;
class RowSetInfo;
class RowSetTree;
// A set of rowsets selected for compaction.
typedef std::unordered_set<const RowSet*> CompactionSelection;
// A CompactionPolicy is responsible for picking which rowsets in a tablet
// should be compacted together.
class CompactionPolicy {
public:
CompactionPolicy() = default;
virtual ~CompactionPolicy() = default;
// Select a set of RowSets to compact out of 'tree'.
//
// Callers are responsible for externally synchronizing selection within a
// given tablet. This will only select rowsets whose compact_flush_lock
// is unlocked, but will not itself take the lock. Hence no other threads
// should lock or unlock the rowsets' compact_flush_lock while this method
// is running.
//
// *quality is set to represent how effectively the compaction reduces the
// tablet's average IO per write operation.
//
// If 'log' is not NULL, then a verbose log of the compaction selection
// process will be appended to it.
virtual Status PickRowSets(const RowSetTree& tree,
CompactionSelection* picked,
double* quality,
std::vector<std::string>* log) = 0;
// Return the size in bytes at which flush/compact should "roll" to new files.
// Some compaction policies may prefer to deal with small constant-size files
// whereas others may prefer large ones.
virtual uint64_t target_rowset_size() const = 0;
private:
DISALLOW_COPY_AND_ASSIGN(CompactionPolicy);
};
// A compaction policy that, given a size budget for a compaction, tries to pick
// a set of RowSets that fit into that budget and minimize the future cost of
// operations on the tablet.
//
// See docs/design-docs/compaction-policy.md for details.
class BudgetedCompactionPolicy : public CompactionPolicy {
public:
explicit BudgetedCompactionPolicy(int size_budget_mb);
Status PickRowSets(const RowSetTree &tree,
CompactionSelection* picked,
double* quality,
std::vector<std::string>* log) override;
uint64_t target_rowset_size() const override;
private:
struct SolutionAndValue {
CompactionSelection rowsets;
double value = 0.0;
};
// Set up the 'asc_min_key' and 'asc_max_key' vectors used by both the
// RunApproximation and RunExact. 'asc_min_key' will hold RowSetInfo instances
// for rowsets from 'tree' in ascending order by min key; 'asc_max_key' will
// hold the RowSetInfo instances in ascending order by max key.
void SetupKnapsackInput(const RowSetTree& tree,
std::vector<RowSetInfo>* asc_min_key,
std::vector<RowSetInfo>* asc_max_key) const;
// Runs the first pass approximate algorithm for the compaction selection
// problem. Stores the best solution found in 'best_solution'.
//
// Sets best_upper_bounds[i] to an upper bound for a solution containing
// asc_min_key[i] as its left-most rowset.
void RunApproximation(const std::vector<RowSetInfo>& asc_min_key,
const std::vector<RowSetInfo>& asc_max_key,
std::vector<double>* best_upper_bounds,
SolutionAndValue* best_solution) const;
// Runs the second pass of the algorithm.
//
// For each i, the algorithm first checks if best_upper_bounds[i] indicates
// that a solution containing asc_min_key[i] as its left-most rowset might be
// a better solution than the current 'best_solution' by at least the
// configured approximation ratio. If so, it runs the full knapsack algorithm
// to determine the value of that solution and, if it is indeed better,
// replaces '*best_solution' with the new best solution.
void RunExact(const std::vector<RowSetInfo>& asc_min_key,
const std::vector<RowSetInfo>& asc_max_key,
const std::vector<double>& best_upper_bounds,
SolutionAndValue* best_solution) const;
const size_t size_budget_mb_;
};
} // namespace tablet
} // namespace kudu
#endif