blob: 4537c7d81ca797c34f09fadf3211c80c37493838 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg;
import java.util.Collection;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
/**
* API for configuring a table scan.
* <p>
* TableScan objects are immutable and can be shared between threads. Refinement methods, like
* {@link #select(Collection)} and {@link #filter(Expression)}, create new TableScan instances.
*/
public interface TableScan {
/**
* Returns the {@link Table} from which this scan loads data.
*
* @return this scan's table
*/
Table table();
/**
* Create a new {@link TableScan} from this scan's configuration that will use the given snapshot
* by ID.
*
* @param snapshotId a snapshot ID
* @return a new scan based on this with the given snapshot ID
* @throws IllegalArgumentException if the snapshot cannot be found
*/
TableScan useSnapshot(long snapshotId);
/**
* Create a new {@link TableScan} from this scan's configuration that will use the most recent
* snapshot as of the given time in milliseconds.
*
* @param timestampMillis a timestamp in milliseconds.
* @return a new scan based on this with the current snapshot at the given time
* @throws IllegalArgumentException if the snapshot cannot be found
*/
TableScan asOfTime(long timestampMillis);
/**
* Create a new {@link TableScan} from this scan's configuration that will override the {@link Table}'s behavior based
* on the incoming pair. Unknown properties will be ignored.
*
* @param property name of the table property to be overridden
* @param value value to override with
* @return a new scan based on this with overridden behavior
*/
TableScan option(String property, String value);
/**
* Create a new {@link TableScan} from this with the schema as its projection.
*
* @param schema a projection schema
* @return a new scan based on this with the given projection
*/
TableScan project(Schema schema);
/**
* Create a new {@link TableScan} from this that, if data columns where selected
* via {@link #select(java.util.Collection)}, controls whether the match to the schema will be done
* with case sensitivity.
*
* @return a new scan based on this with case sensitivity as stated
*/
TableScan caseSensitive(boolean caseSensitive);
/**
* Create a new {@link TableScan} from this that loads the column stats with each data file.
* <p>
* Column stats include: value count, null value count, lower bounds, and upper bounds.
*
* @return a new scan based on this that loads column stats.
*/
TableScan includeColumnStats();
/**
* Create a new {@link TableScan} from this that will read the given data columns. This produces
* an expected schema that includes all fields that are either selected or used by this scan's
* filter expression.
*
* @param columns column names from the table's schema
* @return a new scan based on this with the given projection columns
*/
default TableScan select(String... columns) {
return select(Lists.newArrayList(columns));
}
/**
* Create a new {@link TableScan} from this that will read the given data columns. This produces
* an expected schema that includes all fields that are either selected or used by this scan's
* filter expression.
*
* @param columns column names from the table's schema
* @return a new scan based on this with the given projection columns
*/
TableScan select(Collection<String> columns);
/**
* Create a new {@link TableScan} from the results of this filtered by the {@link Expression}.
*
* @param expr a filter expression
* @return a new scan based on this with results filtered by the expression
*/
TableScan filter(Expression expr);
/**
* Returns this scan's filter {@link Expression}.
*
* @return this scan's filter expression
*/
Expression filter();
/**
* Create a new {@link TableScan} from this that applies data filtering to files but not to rows in those files.
*
* @return a new scan based on this that does not filter rows in files.
*/
TableScan ignoreResiduals();
/**
* Create a new {@link TableScan} to read appended data from {@code fromSnapshotId} exclusive to {@code toSnapshotId}
* inclusive.
*
* @param fromSnapshotId the last snapshot id read by the user, exclusive
* @param toSnapshotId read append data up to this snapshot id
* @return a table scan which can read append data from {@code fromSnapshotId}
* exclusive and up to {@code toSnapshotId} inclusive
*/
TableScan appendsBetween(long fromSnapshotId, long toSnapshotId);
/**
* Create a new {@link TableScan} to read appended data from {@code fromSnapshotId} exclusive to the current snapshot
* inclusive.
*
* @param fromSnapshotId - the last snapshot id read by the user, exclusive
* @return a table scan which can read append data from {@code fromSnapshotId}
* exclusive and up to current snapshot inclusive
*/
TableScan appendsAfter(long fromSnapshotId);
/**
* Plan the {@link FileScanTask files} that will be read by this scan.
* <p>
* Each file has a residual expression that should be applied to filter the file's rows.
* <p>
* This simple plan returns file scans for each file from position 0 to the file's length. For
* planning that will combine small files, split large files, and attempt to balance work, use
* {@link #planTasks()} instead.
*
* @return an Iterable of file tasks that are required by this scan
*/
CloseableIterable<FileScanTask> planFiles();
/**
* Plan the {@link CombinedScanTask tasks} for this scan.
* <p>
* Tasks created by this method may read partial input files, multiple input files, or both.
*
* @return an Iterable of tasks for this scan
*/
CloseableIterable<CombinedScanTask> planTasks();
/**
* Returns this scan's projection {@link Schema}.
* <p>
* If the projection schema was set directly using {@link #project(Schema)}, returns that schema.
* <p>
* If the projection schema was set by calling {@link #select(Collection)}, returns a projection
* schema that includes the selected data fields and any fields used in the filter expression.
*
* @return this scan's projection schema
*/
Schema schema();
/**
* Returns the {@link Snapshot} that will be used by this scan.
* <p>
* If the snapshot was not configured using {@link #asOfTime(long)} or {@link #useSnapshot(long)}, the current table
* snapshot will be used.
*
* @return the Snapshot this scan will use
*/
Snapshot snapshot();
/**
* Returns whether this scan should apply column name case sensitiveness as per {@link #caseSensitive(boolean)}.
* @return true if case sensitive, false otherwise.
*/
boolean isCaseSensitive();
}