blob: 98e9ff1622fdcf0ead928f795ff4046d5ea83197 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.accumulo.core.client.admin.compaction;
import static org.apache.accumulo.core.client.summary.summarizers.DeletesSummarizer.DELETES_STAT;
import static org.apache.accumulo.core.client.summary.summarizers.DeletesSummarizer.TOTAL_STAT;
import java.util.Collection;
import java.util.List;
import java.util.function.Predicate;
import org.apache.accumulo.core.client.rfile.RFile.WriterOptions;
import org.apache.accumulo.core.client.summary.SummarizerConfiguration;
import org.apache.accumulo.core.client.summary.Summary;
import org.apache.accumulo.core.client.summary.summarizers.DeletesSummarizer;
/**
* This compaction selector works in concert with the {@link DeletesSummarizer}. Using the
* statistics from DeleteSummarizer this strategy will compact all files in a table when the number
* of deletes/non-deletes exceeds a threshold.
*
* <p>
* This strategy has two options. First the {@value #THRESHOLD_OPT} option allows setting the point
* at which a compaction will be triggered. This options defaults to {@value #THRESHOLD_OPT_DEFAULT}
* and must be in the range (0.0, 1.0]. The second option is {@value #PROCEED_ZERO_NO_SUMMARY_OPT}
* which determines if the strategy should proceed when a bulk imported file has no summary
* information.
*
* <p>
* If the delete summarizer was configured on a table that already had files, then those files will
* have not summary information. This strategy can still proceed in this situation. It will fall
* back to using Accumulo's estimated entries per file in this case. For the files without summary
* information the estimated number of deletes will be zero. This fall back method will
* underestimate deletes which will not lead to false positives, except for the case of bulk
* imported files. Accumulo estimates that bulk imported files have zero entries. The second option
* {@value #PROCEED_ZERO_NO_SUMMARY_OPT} determines if this strategy should proceed when it sees
* bulk imported files that do not have summary data. This option defaults to
* {@value #PROCEED_ZERO_NO_SUMMARY_OPT_DEFAULT}.
*
* <p>
* Bulk files can be generated with summary information by calling
* {@link WriterOptions#withSummarizers(SummarizerConfiguration...)}
*
* <p>
* When using this feature, its important to ensure summary cache is on and the summaries fit in the
* cache.
*
* @since 2.1.0
*/
public class TooManyDeletesSelector implements CompactionSelector {
private double threshold;
private boolean proceed_bns;
/**
* This option should be a floating point number between 1 and 0.
*/
public static final String THRESHOLD_OPT = "threshold";
/**
* The default threshold.
*/
public static final String THRESHOLD_OPT_DEFAULT = ".25";
public static final String PROCEED_ZERO_NO_SUMMARY_OPT = "proceed_zero_no_summary";
public static final String PROCEED_ZERO_NO_SUMMARY_OPT_DEFAULT = "false";
@Override
public void init(InitParameters iparams) {
var options = iparams.getOptions();
this.threshold = Double.parseDouble(options.getOrDefault(THRESHOLD_OPT, THRESHOLD_OPT_DEFAULT));
if (threshold <= 0.0 || threshold > 1.0) {
throw new IllegalArgumentException(
"Threshold must be in range (0.0, 1.0], saw : " + threshold);
}
this.proceed_bns = Boolean.parseBoolean(
options.getOrDefault(PROCEED_ZERO_NO_SUMMARY_OPT, PROCEED_ZERO_NO_SUMMARY_OPT_DEFAULT));
}
@Override
public Selection select(SelectionParameters sparams) {
var tableConf = sparams.getEnvironment().getConfiguration(sparams.getTableId());
Collection<SummarizerConfiguration> configuredSummarizers =
SummarizerConfiguration.fromTableProperties(tableConf);
// check if delete summarizer is configured for table
if (configuredSummarizers.stream().map(sc -> sc.getClassName())
.noneMatch(cn -> cn.equals(DeletesSummarizer.class.getName()))) {
return new Selection(List.of());
}
Predicate<SummarizerConfiguration> summarizerPredicate =
conf -> conf.getClassName().equals(DeletesSummarizer.class.getName())
&& conf.getOptions().isEmpty();
long total = 0;
long deletes = 0;
for (CompactableFile file : sparams.getAvailableFiles()) {
Collection<Summary> summaries = sparams.getSummaries(List.of(file), summarizerPredicate);
if (summaries.size() == 1) {
Summary summary = summaries.iterator().next();
total += summary.getStatistics().get(TOTAL_STAT);
deletes += summary.getStatistics().get(DELETES_STAT);
} else {
long numEntries = file.getEstimatedEntries();
if (numEntries == 0 && !proceed_bns) {
return new Selection(List.of());
} else {
// no summary data so use Accumulo's estimate of total entries in file
total += numEntries;
}
}
}
long nonDeletes = total - deletes;
if (nonDeletes >= 0) {
// check nonDeletes >= 0 because if this is not true then its clear evidence that the
// estimates are off
double ratio = deletes / (double) nonDeletes;
if (ratio >= threshold) {
return new Selection(sparams.getAvailableFiles());
}
}
return new Selection(List.of());
}
}