| /** |
| * Copyright 2010 The Apache Software Foundation |
| * |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.hadoop.hbase.regionserver; |
| |
| import java.io.IOException; |
| import java.util.NavigableSet; |
| |
| import org.apache.hadoop.hbase.HConstants; |
| import org.apache.hadoop.hbase.KeyValue; |
| import org.apache.hadoop.hbase.client.Scan; |
| import org.apache.hadoop.hbase.filter.Filter; |
| import org.apache.hadoop.hbase.filter.Filter.ReturnCode; |
| import org.apache.hadoop.hbase.io.TimeRange; |
| import org.apache.hadoop.hbase.regionserver.DeleteTracker.DeleteResult; |
| import org.apache.hadoop.hbase.util.Bytes; |
| import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; |
| |
| /** |
| * A query matcher that is specifically designed for the scan case. |
| */ |
| public class ScanQueryMatcher { |
| // Optimization so we can skip lots of compares when we decide to skip |
| // to the next row: while this is set, match() short-circuits to |
| // SEEK_NEXT_ROW for further KeyValues of the current row. reset() clears it. |
| private boolean stickyNextRow; |
| // Stop row of the scan (Scan#getStopRow()); read by moreRowsMayExistAfter(). |
| private final byte[] stopRow; |
| |
| // Time range of the scan (Scan#getTimeRange()). |
| private final TimeRange tr; |
| |
| // Filter of the scan (Scan#getFilter()); may be null, every use in this |
| // class null-checks it first. |
| private final Filter filter; |
| |
| /** Keeps track of deletes */ |
| private final DeleteTracker deletes; |
| |
| /* |
| * The following three booleans define how we deal with deletes. |
| * There are three different aspects: |
| * 1. Whether to keep delete markers. This is used in compactions. |
| * Minor compactions always keep delete markers. |
| * 2. Whether to keep deleted rows. This is also used in compactions, |
| * if the store is set to keep deleted rows. This implies keeping |
| * the delete markers as well. |
| * In this case deleted rows are subject to the normal max version |
| * and TTL/min version rules just like "normal" rows. |
| * 3. Whether a scan can do time travel queries to a point before a |
| * delete marker in order to reach the deleted rows. |
| */ |
| /** whether to retain delete markers (minor compaction or raw scan) */ |
| private final boolean retainDeletesInOutput; |
| /** whether to return deleted rows (raw scan, or non-user scan on a store that keeps deleted cells) */ |
| private final boolean keepDeletedCells; |
| /** whether time range queries can see rows "behind" a delete (user scan on a store that keeps deleted cells) */ |
| private final boolean seePastDeleteMarkers; |
| |
| |
| /** Keeps track of columns and versions */ |
| private final ColumnTracker columns; |
| |
| /** Key to seek to in memstore and StoreFiles */ |
| private final KeyValue startKey; |
| |
| /** Row comparator for the region this query is for */ |
| private final KeyValue.KeyComparator rowComparator; |
| |
| /* row is not private for tests */ |
| /** Row the query is on; assigned by setRow() */ |
| byte [] row; |
| |
| /** |
| * Oldest put in any of the involved store files. |
| * Used to decide whether it is ok to drop a delete marker |
| * when this store keeps deleted KVs: match() stops tracking a |
| * delete marker whose timestamp is older than this value. |
| */ |
| private final long earliestPutTs; |
| |
| /** readPoint over which the KVs are unconditionally included */ |
| protected long maxReadPointToTrackVersions; |
| |
| /** |
| * Whether there is a null (empty qualifier) column in the query. A wildcard |
| * column query always has one. An explicit column query has one only if |
| * its first requested column is empty. |
| */ |
| private boolean hasNullColumn = true; |
| |
| // By default, when hbase.hstore.time.to.purge.deletes is 0ms, a delete |
| // marker is always removed during a major compaction. If set to non-zero |
| // value then major compaction will try to keep a delete marker around for |
| // the given number of milliseconds. We want to keep the delete markers |
| // around a bit longer because old puts might appear out-of-order. For |
| // example, during log replication between two clusters. |
| // |
| // If the delete marker has lived longer than its column-family's TTL then |
| // the delete marker will be removed even if time.to.purge.deletes has not |
| // passed. This is because all the Puts that this delete marker can influence |
| // would have also expired. (Removing of delete markers on col family TTL will |
| // not happen if min-versions is set to non-zero) |
| // |
| // But, if time.to.purge.deletes has not expired then a delete |
| // marker will not be removed just because there are no Puts that it is |
| // currently influencing. This is because Puts that this delete can |
| // influence may appear out of order. |
| private final long timeToPurgeDeletes; |
| |
| // True when the scan type passed to the constructor is ScanType.USER_SCAN. |
| private final boolean isUserScan; |
| |
| /** |
| * Construct a QueryMatcher for a scan |
| * @param scan |
| * @param scanInfo The store's immutable scan info |
| * @param columns |
| * @param scanType Type of the scan |
| * @param earliestPutTs Earliest put seen in any of the store files. |
| * @param oldestUnexpiredTS the oldest timestamp we are interested in, |
| * based on TTL |
| */ |
| public ScanQueryMatcher(Scan scan, Store.ScanInfo scanInfo, |
| NavigableSet<byte[]> columns, ScanType scanType, |
| long readPointToUse, long earliestPutTs, long oldestUnexpiredTS) { |
| this.tr = scan.getTimeRange(); |
| this.rowComparator = scanInfo.getComparator().getRawComparator(); |
| this.deletes = new ScanDeleteTracker(); |
| this.stopRow = scan.getStopRow(); |
| this.startKey = KeyValue.createFirstDeleteFamilyOnRow(scan.getStartRow(), |
| scanInfo.getFamily()); |
| this.filter = scan.getFilter(); |
| this.earliestPutTs = earliestPutTs; |
| this.maxReadPointToTrackVersions = readPointToUse; |
| this.timeToPurgeDeletes = scanInfo.getTimeToPurgeDeletes(); |
| |
| /* how to deal with deletes */ |
| this.isUserScan = scanType == ScanType.USER_SCAN; |
| // keep deleted cells: if compaction or raw scan |
| this.keepDeletedCells = (scanInfo.getKeepDeletedCells() && !isUserScan) || scan.isRaw(); |
| // retain deletes: if minor compaction or raw scan |
| this.retainDeletesInOutput = scanType == ScanType.MINOR_COMPACT || scan.isRaw(); |
| // seePastDeleteMarker: user initiated scans |
| this.seePastDeleteMarkers = scanInfo.getKeepDeletedCells() && isUserScan; |
| |
| int maxVersions = Math.min(scan.getMaxVersions(), scanInfo.getMaxVersions()); |
| // Single branch to deal with two types of reads (columns vs all in family) |
| if (columns == null || columns.size() == 0) { |
| // there is always a null column in the wildcard column query. |
| hasNullColumn = true; |
| |
| // use a specialized scan for wildcard column tracker. |
| this.columns = new ScanWildcardColumnTracker( |
| scanInfo.getMinVersions(), maxVersions, oldestUnexpiredTS); |
| } else { |
| // whether there is null column in the explicit column query |
| hasNullColumn = (columns.first().length == 0); |
| |
| // We can share the ExplicitColumnTracker, diff is we reset |
| // between rows, not between storefiles. |
| this.columns = new ExplicitColumnTracker(columns, |
| scanInfo.getMinVersions(), maxVersions, oldestUnexpiredTS); |
| } |
| } |
| |
/**
 * Constructor for tests: delegates to the full constructor as a user scan
 * with the largest possible read point and
 * {@code HConstants.LATEST_TIMESTAMP} as the earliest put timestamp.
 */
ScanQueryMatcher(Scan scan, Store.ScanInfo scanInfo,
    NavigableSet<byte[]> columns, long oldestUnexpiredTS) {
  this(scan,
      scanInfo,
      columns,
      ScanType.USER_SCAN,
      Long.MAX_VALUE,              // max read point to track versions
      HConstants.LATEST_TIMESTAMP, // earliest put timestamp
      oldestUnexpiredTS);
}
| |
| /** |
| * |
| * @return whether there is an null column in the query |
| */ |
| public boolean hasNullColumnInQuery() { |
| return hasNullColumn; |
| } |
| |
| /** |
| * Determines if the caller should do one of several things: |
| * - seek/skip to the next row (MatchCode.SEEK_NEXT_ROW) |
| * - seek/skip to the next column (MatchCode.SEEK_NEXT_COL) |
| * - include the current KeyValue (MatchCode.INCLUDE) |
| * - ignore the current KeyValue (MatchCode.SKIP) |
| * - got to the next row (MatchCode.DONE) |
| * |
| * @param kv KeyValue to check |
| * @return The match code instance. |
| * @throws IOException in case there is an internal consistency problem |
| * caused by a data corruption. |
| */ |
| public MatchCode match(KeyValue kv) throws IOException { |
| if (filter != null && filter.filterAllRemaining()) { |
| return MatchCode.DONE_SCAN; |
| } |
| |
| byte [] bytes = kv.getBuffer(); |
| int offset = kv.getOffset(); |
| int initialOffset = offset; |
| |
| int keyLength = Bytes.toInt(bytes, offset, Bytes.SIZEOF_INT); |
| offset += KeyValue.ROW_OFFSET; |
| |
| short rowLength = Bytes.toShort(bytes, offset, Bytes.SIZEOF_SHORT); |
| offset += Bytes.SIZEOF_SHORT; |
| |
| int ret = this.rowComparator.compareRows(row, 0, row.length, |
| bytes, offset, rowLength); |
| if (ret <= -1) { |
| return MatchCode.DONE; |
| } else if (ret >= 1) { |
| // could optimize this, if necessary? |
| // Could also be called SEEK_TO_CURRENT_ROW, but this |
| // should be rare/never happens. |
| return MatchCode.SEEK_NEXT_ROW; |
| } |
| |
| // optimize case. |
| if (this.stickyNextRow) |
| return MatchCode.SEEK_NEXT_ROW; |
| |
| if (this.columns.done()) { |
| stickyNextRow = true; |
| return MatchCode.SEEK_NEXT_ROW; |
| } |
| |
| //Passing rowLength |
| offset += rowLength; |
| |
| //Skipping family |
| byte familyLength = bytes [offset]; |
| offset += familyLength + 1; |
| |
| int qualLength = keyLength + KeyValue.ROW_OFFSET - |
| (offset - initialOffset) - KeyValue.TIMESTAMP_TYPE_SIZE; |
| |
| long timestamp = kv.getTimestamp(); |
| // check for early out based on timestamp alone |
| if (columns.isDone(timestamp)) { |
| return columns.getNextRowOrNextColumn(bytes, offset, qualLength); |
| } |
| |
| /* |
| * The delete logic is pretty complicated now. |
| * This is corroborated by the following: |
| * 1. The store might be instructed to keep deleted rows around. |
| * 2. A scan can optionally see past a delete marker now. |
| * 3. If deleted rows are kept, we have to find out when we can |
| * remove the delete markers. |
| * 4. Family delete markers are always first (regardless of their TS) |
| * 5. Delete markers should not be counted as version |
| * 6. Delete markers affect puts of the *same* TS |
| * 7. Delete marker need to be version counted together with puts |
| * they affect |
| */ |
| byte type = kv.getType(); |
| if (kv.isDelete()) { |
| if (!keepDeletedCells) { |
| // first ignore delete markers if the scanner can do so, and the |
| // range does not include the marker |
| // |
| // during flushes and compactions also ignore delete markers newer |
| // than the readpoint of any open scanner, this prevents deleted |
| // rows that could still be seen by a scanner from being collected |
| boolean includeDeleteMarker = seePastDeleteMarkers ? |
| tr.withinTimeRange(timestamp) : |
| tr.withinOrAfterTimeRange(timestamp); |
| if (includeDeleteMarker |
| && kv.getMemstoreTS() <= maxReadPointToTrackVersions) { |
| this.deletes.add(bytes, offset, qualLength, timestamp, type); |
| } |
| // Can't early out now, because DelFam come before any other keys |
| } |
| if (retainDeletesInOutput |
| || (!isUserScan && (EnvironmentEdgeManager.currentTimeMillis() - timestamp) <= timeToPurgeDeletes) |
| || kv.getMemstoreTS() > maxReadPointToTrackVersions) { |
| // always include or it is not time yet to check whether it is OK |
| // to purge deltes or not |
| return MatchCode.INCLUDE; |
| } else if (keepDeletedCells) { |
| if (timestamp < earliestPutTs) { |
| // keeping delete rows, but there are no puts older than |
| // this delete in the store files. |
| return columns.getNextRowOrNextColumn(bytes, offset, qualLength); |
| } |
| // else: fall through and do version counting on the |
| // delete markers |
| } else { |
| return MatchCode.SKIP; |
| } |
| // note the following next else if... |
| // delete marker are not subject to other delete markers |
| } else if (!this.deletes.isEmpty()) { |
| DeleteResult deleteResult = deletes.isDeleted(bytes, offset, qualLength, |
| timestamp); |
| switch (deleteResult) { |
| case FAMILY_DELETED: |
| case COLUMN_DELETED: |
| return columns.getNextRowOrNextColumn(bytes, offset, qualLength); |
| case VERSION_DELETED: |
| return MatchCode.SKIP; |
| case NOT_DELETED: |
| break; |
| default: |
| throw new RuntimeException("UNEXPECTED"); |
| } |
| } |
| |
| int timestampComparison = tr.compare(timestamp); |
| if (timestampComparison >= 1) { |
| return MatchCode.SKIP; |
| } else if (timestampComparison <= -1) { |
| return columns.getNextRowOrNextColumn(bytes, offset, qualLength); |
| } |
| |
| /** |
| * Filters should be checked before checking column trackers. If we do |
| * otherwise, as was previously being done, ColumnTracker may increment its |
| * counter for even that KV which may be discarded later on by Filter. This |
| * would lead to incorrect results in certain cases. |
| */ |
| ReturnCode filterResponse = ReturnCode.SKIP; |
| if (filter != null) { |
| filterResponse = filter.filterKeyValue(kv); |
| if (filterResponse == ReturnCode.SKIP) { |
| return MatchCode.SKIP; |
| } else if (filterResponse == ReturnCode.NEXT_COL) { |
| return columns.getNextRowOrNextColumn(bytes, offset, qualLength); |
| } else if (filterResponse == ReturnCode.NEXT_ROW) { |
| stickyNextRow = true; |
| return MatchCode.SEEK_NEXT_ROW; |
| } else if (filterResponse == ReturnCode.SEEK_NEXT_USING_HINT) { |
| return MatchCode.SEEK_NEXT_USING_HINT; |
| } |
| } |
| |
| MatchCode colChecker = columns.checkColumn(bytes, offset, qualLength, |
| timestamp, type, kv.getMemstoreTS() > maxReadPointToTrackVersions); |
| /* |
| * According to current implementation, colChecker can only be |
| * SEEK_NEXT_COL, SEEK_NEXT_ROW, SKIP or INCLUDE. Therefore, always return |
| * the MatchCode. If it is SEEK_NEXT_ROW, also set stickyNextRow. |
| */ |
| if (colChecker == MatchCode.SEEK_NEXT_ROW) { |
| stickyNextRow = true; |
| } else if (filter != null && colChecker == MatchCode.INCLUDE && |
| filterResponse == ReturnCode.INCLUDE_AND_NEXT_COL) { |
| return MatchCode.INCLUDE_AND_SEEK_NEXT_COL; |
| } |
| return colChecker; |
| |
| } |
| |
| public boolean moreRowsMayExistAfter(KeyValue kv) { |
| if (!Bytes.equals(stopRow , HConstants.EMPTY_END_ROW) && |
| rowComparator.compareRows(kv.getBuffer(),kv.getRowOffset(), |
| kv.getRowLength(), stopRow, 0, stopRow.length) >= 0) { |
| // KV >= STOPROW |
| // then NO there is nothing left. |
| return false; |
| } else { |
| return true; |
| } |
| } |
| |
| /** |
| * Set current row |
| * @param row |
| */ |
| public void setRow(byte [] row) { |
| this.row = row; |
| reset(); |
| } |
| |
| public void reset() { |
| this.deletes.reset(); |
| this.columns.reset(); |
| |
| stickyNextRow = false; |
| } |
| |
| /** |
| * |
| * @return the start key |
| */ |
| public KeyValue getStartKey() { |
| return this.startKey; |
| } |
| |
| /** |
| * |
| * @return the Filter |
| */ |
| Filter getFilter() { |
| return this.filter; |
| } |
| |
| public KeyValue getNextKeyHint(KeyValue kv) { |
| if (filter == null) { |
| return null; |
| } else { |
| return filter.getNextKeyHint(kv); |
| } |
| } |
| |
| public KeyValue getKeyForNextColumn(KeyValue kv) { |
| ColumnCount nextColumn = columns.getColumnHint(); |
| if (nextColumn == null) { |
| return KeyValue.createLastOnRow( |
| kv.getBuffer(), kv.getRowOffset(), kv.getRowLength(), |
| kv.getBuffer(), kv.getFamilyOffset(), kv.getFamilyLength(), |
| kv.getBuffer(), kv.getQualifierOffset(), kv.getQualifierLength()); |
| } else { |
| return KeyValue.createFirstOnRow( |
| kv.getBuffer(), kv.getRowOffset(), kv.getRowLength(), |
| kv.getBuffer(), kv.getFamilyOffset(), kv.getFamilyLength(), |
| nextColumn.getBuffer(), nextColumn.getOffset(), nextColumn.getLength()); |
| } |
| } |
| |
| public KeyValue getKeyForNextRow(KeyValue kv) { |
| return KeyValue.createLastOnRow( |
| kv.getBuffer(), kv.getRowOffset(), kv.getRowLength(), |
| null, 0, 0, |
| null, 0, 0); |
| } |
| |
/**
 * {@link #match} return codes. These instruct the scanner moving through
 * memstores and StoreFiles what to do with the current KeyValue.
 * <p>
 * Additionally, this contains "early-out" language to tell the scanner to
 * move on to the next File (memstore or Storefile), or to return immediately.
 */
public enum MatchCode {
  /** Include KeyValue in the returned result. */
  INCLUDE,

  /** Do not include KeyValue in the returned result. */
  SKIP,

  /** Do not include, jump to next StoreFile or memstore (in time order). */
  NEXT,

  /** Do not include, return current result. */
  DONE,

  // The codes below are used by the ScanQueryMatcher.

  /** Done with the row, seek there. */
  SEEK_NEXT_ROW,

  /** Done with column, seek to next. */
  SEEK_NEXT_COL,

  /** Done with scan, thanks to the row filter. */
  DONE_SCAN,

  /** Seek to next key which is given as hint. */
  SEEK_NEXT_USING_HINT,

  /** Include KeyValue and done with column, seek to next. */
  INCLUDE_AND_SEEK_NEXT_COL,

  /** Include KeyValue and done with row, seek to next. */
  INCLUDE_AND_SEEK_NEXT_ROW
}
| } |