blob: 53039d5a76cfefe998700c896e139b991aa2652d [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.accumulo.core.client.rfile;
import java.io.IOException;
import java.io.OutputStream;
import java.util.Collection;
import java.util.Map;
import java.util.Map.Entry;
import java.util.function.Predicate;
import org.apache.accumulo.core.client.Scanner;
import org.apache.accumulo.core.client.admin.TableOperations;
import org.apache.accumulo.core.client.sample.SamplerConfiguration;
import org.apache.accumulo.core.client.summary.Summarizer;
import org.apache.accumulo.core.client.summary.SummarizerConfiguration;
import org.apache.accumulo.core.client.summary.Summary;
import org.apache.accumulo.core.client.summary.Summary.FileStatistics;
import org.apache.accumulo.core.conf.Property;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Range;
import org.apache.accumulo.core.security.Authorizations;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.Text;
/**
* RFile is Accumulo's internal storage format for Key Value pairs. This class is a Factory that
* enables creating a {@link Scanner} for reading and a {@link RFileWriter} for writing RFiles.
*
* <p>
* The {@link Scanner} created by this class makes it easy to experiment with real data from a live
* system on a developer's workstation. Also the {@link Scanner} can be used to write tools to
* analyze Accumulo's raw data.
*
* @since 1.8.0
*/
public class RFile {
/**
* This is an intermediate interface in a larger builder pattern. Supports setting the required
* input sources for reading a RFile.
*
* <p>
* This is the stage returned by {@link RFile#newScanner()}. Both methods accept varargs, so one or
* many RFiles may be supplied in a single call.
*
* @since 1.8.0
*/
public interface InputArguments {
/**
* Specify RFiles to read from. When multiple inputs are specified the {@link Scanner}
* constructed will present a merged view.
*
* <p>
* The returned {@link ScannerOptions} stage does not offer a FileSystem option.
*
* @param inputs
* one or more RFiles to read.
* @return this
*/
ScannerOptions from(RFileSource... inputs);
/**
* Specify RFiles to read from. When multiple are specified the {@link Scanner} constructed will
* present a merged view.
*
* <p>
* The returned {@link ScannerFSOptions} stage additionally allows providing the
* {@link FileSystem} used to open the named files.
*
* @param files
* one or more RFiles to read.
* @return this
*/
ScannerFSOptions from(String... files);
}
/**
* This is an intermediate interface in a larger builder pattern. Enables optionally setting a
* FileSystem to read RFile(s) from.
*
* <p>
* Because this interface extends {@link ScannerOptions}, every scanner option is also available
* directly on this stage and setting a FileSystem may be skipped.
*
* @since 1.8.0
*/
public interface ScannerFSOptions extends ScannerOptions {
/**
* Optionally provide a FileSystem to open RFiles. If not specified, the FileSystem will be
* constructed using configuration on the classpath.
*
* <p>
* The declared return type is {@link ScannerOptions}, which does not expose this method again.
*
* @param fs
* use this FileSystem to open files.
* @return this
*/
ScannerOptions withFileSystem(FileSystem fs);
}
/**
* This is an intermediate interface in a larger builder pattern. Supports setting optional
* parameters for reading RFile(s) and building a scanner over RFile(s).
*
* <p>
* Every option method returns a {@code ScannerOptions} so calls can be chained; {@link #build()}
* ends the chain.
*
* @since 1.8.0
*/
public interface ScannerOptions {
/**
* By default the {@link Scanner} created will set up the default Accumulo system iterators. The
* iterators do things like the following:
*
* <ul>
* <li>Suppress deleted data</li>
* <li>Filter based on {@link Authorizations}</li>
* <li>Filter columns specified by functions like {@link Scanner#fetchColumn(Text, Text)} and
* {@link Scanner#fetchColumnFamily(Text)}</li>
* </ul>
*
* <p>
* Calling this method will turn off these system iterators and allow reading the raw data in an
* RFile. When reading the raw data, delete data and delete markers may be seen. Delete markers
* are {@link Key}s with the delete flag set.
*
* <p>
* Disabling system iterators will cause {@link #withAuthorizations(Authorizations)},
* {@link Scanner#fetchColumn(Text, Text)}, and {@link Scanner#fetchColumnFamily(Text)} to throw
* runtime exceptions.
*
* @return this
*/
ScannerOptions withoutSystemIterators();
/**
* The authorizations passed here will be used to filter Keys, from the {@link Scanner}, based
* on the content of the column visibility field.
*
* @param auths
* scan with these authorizations
* @return this
*/
ScannerOptions withAuthorizations(Authorizations auths);
/**
* Enabling this option will cache RFiles data in memory. This option is useful when doing lots
* of random accesses.
*
* @param cacheSize
* the size of the data cache in bytes.
* @return this
*/
ScannerOptions withDataCache(long cacheSize);
/**
* Enabling this option will cache RFiles indexes in memory. Index data within a RFile is used
* to find data when seeking to a {@link Key}. This option is useful when doing lots of random
* accesses.
*
* @param cacheSize
* the size of the index cache in bytes.
* @return this
*/
ScannerOptions withIndexCache(long cacheSize);
/**
* This option allows limiting the {@link Scanner} from reading data outside of a given range. A
* scanner will not see any data outside of this range even if the RFile(s) have data outside
* the range.
*
* @param range
* the scanner will only see data that falls within this range.
* @return this
*/
ScannerOptions withBounds(Range range);
/**
* Construct the {@link Scanner} with iterators specified in a table's properties. Properties for
* a table can be obtained by calling {@link TableOperations#getProperties(String)}. Any
* property that impacts file behavior regardless of whether it has the
* {@link Property#TABLE_PREFIX} may be accepted and used. For example, cache and crypto
* properties could be passed here.
*
* @param props
* iterable over Accumulo table key value properties.
* @return this
*/
ScannerOptions withTableProperties(Iterable<Entry<String,String>> props);
/**
* Map based variant of {@link #withTableProperties(Iterable)}. Any property that impacts file
* behavior regardless of whether it has the {@link Property#TABLE_PREFIX} may be accepted and
* used. For example, cache and crypto properties could be passed here.
*
* @param props
* a map instead of an Iterable
* @return this
* @see #withTableProperties(Iterable)
*/
ScannerOptions withTableProperties(Map<String,String> props);
/**
* Finishes the builder chain and produces the {@link Scanner}.
*
* @return a Scanner over RFile using the specified options.
*/
Scanner build();
}
/**
* Starts the builder chain for constructing a {@link Scanner} that reads one or more RFiles.
*
* @return the first builder stage, on which the RFile inputs are specified
*/
public static InputArguments newScanner() {
final InputArguments scannerBuilder = new RFileScannerBuilder();
return scannerBuilder;
}
/**
* This is an intermediate interface in a larger builder pattern. Supports setting the required
* input sources for reading summary data from an RFile.
*
* <p>
* This is the stage returned by {@link RFile#summaries()}. Both methods accept varargs, so one or
* many RFiles may be supplied in a single call.
*
* @since 2.0.0
*/
public interface SummaryInputArguments {
/**
* Specify RFiles to read from. When multiple inputs are specified the summary data will be
* merged.
*
* <p>
* The returned {@link SummaryOptions} stage does not offer a FileSystem option.
*
* @param inputs
* one or more RFiles to read.
* @return this
*/
SummaryOptions from(RFileSource... inputs);
/**
* Specify RFiles to read from. When multiple are specified the summary data will be merged.
*
* <p>
* The returned {@link SummaryFSOptions} stage additionally allows providing the
* {@link FileSystem} used to open the named files.
*
* @param files
* one or more RFiles to read.
* @return this
*/
SummaryFSOptions from(String... files);
}
/**
* This is an intermediate interface in a larger builder pattern. Enables optionally setting a
* FileSystem to read RFile summary data from.
*
* <p>
* Because this interface extends {@link SummaryOptions}, every summary option is also available
* directly on this stage and setting a FileSystem may be skipped.
*
* @since 2.0.0
*/
public interface SummaryFSOptions extends SummaryOptions {
/**
* Optionally provide a FileSystem to open RFiles. If not specified, the FileSystem will be
* constructed using configuration on the classpath.
*
* <p>
* The declared return type is {@link SummaryOptions}, which does not expose this method again.
*
* @param fs
* use this FileSystem to open files.
* @return this
*/
SummaryOptions withFileSystem(FileSystem fs);
}
/**
* This is an intermediate interface in a larger builder pattern. Allows setting options for
* retrieving summary data.
*
* <p>
* Every option method returns a {@code SummaryOptions} so calls can be chained; {@link #read()}
* ends the chain.
*
* @since 2.0.0
*/
public interface SummaryOptions {
/**
* Retrieve summaries with provided table's properties. Properties for a table can be obtained by
* calling {@link TableOperations#getProperties(String)}. Any property that impacts file
* behavior regardless of whether it has the {@link Property#TABLE_PREFIX} may be accepted and
* used. For example, cache and crypto properties could be passed here.
*
* @param props
* iterable over Accumulo table key value properties.
* @return this
*/
SummaryOptions withTableProperties(Iterable<Entry<String,String>> props);
/**
* Map based variant of {@link #withTableProperties(Iterable)}. Any property that impacts file
* behavior regardless of whether it has the {@link Property#TABLE_PREFIX} may be accepted and
* used. For example, cache and crypto properties could be passed here.
*
* @param props
* a map instead of an Iterable
* @return this
* @see #withTableProperties(Iterable)
*/
SummaryOptions withTableProperties(Map<String,String> props);
/**
* This method allows retrieving a subset of summary data from a file. If a file has lots of
* separate summaries, reading a subset may be faster.
*
* @param summarySelector
* Only read summary data that was generated with configuration that this predicate
* matches.
* @return this
*/
SummaryOptions selectSummaries(Predicate<SummarizerConfiguration> summarySelector);
/**
* Summary data may possibly be stored at a more granular level than the entire file. However
* there is no guarantee of this. If the data was stored at a more granular level, then this
* will get a subset of the summary data. The subset will very likely be an inaccurate
* approximation.
*
* @param startRow
* A non-null start row. The startRow is used exclusively.
* @return this
*
* @see FileStatistics#getExtra()
*/
SummaryOptions startRow(Text startRow);
/**
* Variant of {@link #startRow(Text)} that accepts a {@link CharSequence}.
*
* @param startRow
* UTF-8 encodes startRow. The startRow is used exclusively.
* @return this
* @see #startRow(Text)
*/
SummaryOptions startRow(CharSequence startRow);
/**
* Summary data may possibly be stored at a more granular level than the entire file. However
* there is no guarantee of this. If the data was stored at a more granular level, then this
* will get a subset of the summary data. The subset will very likely be an inaccurate
* approximation.
*
* @param endRow
* A non-null end row. The end row is used inclusively.
* @return this
*
* @see FileStatistics#getExtra()
*/
SummaryOptions endRow(Text endRow);
/**
* Variant of {@link #endRow(Text)} that accepts a {@link CharSequence}.
*
* @param endRow
* UTF-8 encodes endRow. The end row is used inclusively.
* @return this
* @see #endRow(Text)
*/
SummaryOptions endRow(CharSequence endRow);
/**
* Reads summary data from file.
*
* @return The summary data in the file that satisfied the selection criteria.
* @throws IOException
* if an I/O error occurs.
*/
Collection<Summary> read() throws IOException;
}
/**
* Starts the builder chain for reading summary data out of one or more RFiles.
*
* @return the first builder stage, on which the RFile inputs are specified
* @since 2.0.0
*/
public static SummaryInputArguments summaries() {
final SummaryInputArguments summaryRetriever = new RFileSummariesRetriever();
return summaryRetriever;
}
/**
* This is an intermediate interface in a larger builder pattern. Supports setting the required
* output sink to write a RFile to.
*
* <p>
* This is the stage returned by {@link RFile#newWriter()}.
*
* @since 1.8.0
*/
public interface OutputArguments {
/**
* Specify a file, by name, to write the RFile data to. The returned {@link WriterFSOptions}
* stage additionally allows providing the {@link FileSystem} used to open the file.
*
* @param filename
* name of file to write RFile data
* @return this
*/
WriterFSOptions to(String filename);
/**
* Specify an output stream to write the RFile data to. The returned {@link WriterOptions} stage
* does not offer a FileSystem option.
*
* @param out
* output stream to write RFile data
* @return this
*/
WriterOptions to(OutputStream out);
}
/**
* This is an intermediate interface in a larger builder pattern. Enables optionally setting a
* FileSystem to write to.
*
* <p>
* Because this interface extends {@link WriterOptions}, every writer option is also available
* directly on this stage and setting a FileSystem may be skipped.
*
* @since 1.8.0
*/
public interface WriterFSOptions extends WriterOptions {
/**
* Optionally provide a FileSystem to open a file to write a RFile. If not specified, the
* FileSystem will be constructed using configuration on the classpath.
*
* <p>
* The declared return type is {@link WriterOptions}, which does not expose this method again.
*
* @param fs
* use this FileSystem to open files.
* @return this
*/
WriterOptions withFileSystem(FileSystem fs);
}
/**
* This is an intermediate interface in a larger builder pattern. Supports setting optional
* parameters for creating a RFile and building a RFileWriter.
*
* <p>
* Every option method returns a {@code WriterOptions} so calls can be chained; {@link #build()}
* ends the chain.
*
* @since 1.8.0
*/
public interface WriterOptions {
/**
* Enable generating summary data in the created RFile by running {@link Summarizer}'s based on
* the specified configuration.
*
* @param summarizerConf
* Configuration for summarizer to run.
* @return this
* @since 2.0.0
*/
WriterOptions withSummarizers(SummarizerConfiguration... summarizerConf);
/**
* An option to store sample data in the generated RFile.
*
* @param samplerConf
* configuration to use when generating sample data.
* @throws IllegalArgumentException
* if table properties were previously specified and the table properties also specify
* a sampler.
* @return this
*/
WriterOptions withSampler(SamplerConfiguration samplerConf);
/**
* Create an RFile using the same configuration as an Accumulo table. Properties for a table can
* be obtained by calling {@link TableOperations#getProperties(String)}. Any property that
* impacts file behavior regardless of whether it has the {@link Property#TABLE_PREFIX} may be
* accepted and used. For example, cache and crypto properties could be passed here.
*
* @param props
* iterable over Accumulo table key value properties.
* @throws IllegalArgumentException
* if sampler was previously specified and the table properties also specify a
* sampler.
* @return this
*/
WriterOptions withTableProperties(Iterable<Entry<String,String>> props);
/**
* Map based variant of {@link #withTableProperties(Iterable)}.
*
* @param props
* a map instead of an Iterable
* @return this
* @see #withTableProperties(Iterable)
*/
WriterOptions withTableProperties(Map<String,String> props);
/**
* As keys are added to an RFile the visibility field is validated. Validating the visibility
* field requires parsing it. In order to make validation faster, previously seen visibilities
* are cached. This option allows setting the maximum size of this cache.
*
* @param maxSize
* the maximum size of the visibility cache.
* @return this
*/
WriterOptions withVisibilityCacheSize(int maxSize);
/**
* Finishes the builder chain and produces the {@link RFileWriter}.
*
* @return a new RFileWriter created with the options previously specified.
* @throws IOException
* if an I/O error occurs.
*/
RFileWriter build() throws IOException;
}
/**
* Starts the builder chain for constructing a {@link RFileWriter} that creates a new RFile.
*
* @return the first builder stage, on which the output sink is specified
*/
public static OutputArguments newWriter() {
final OutputArguments writerBuilder = new RFileWriterBuilder();
return writerBuilder;
}
}