| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.phoenix.hbase.index; |
| |
| import static org.apache.phoenix.hbase.index.util.IndexManagementUtil.rethrowIndexingException; |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.Collection; |
| import java.util.Collections; |
| import java.util.HashMap; |
| import java.util.Iterator; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Optional; |
| |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.hadoop.hbase.Cell; |
| import org.apache.hadoop.hbase.CellUtil; |
| import org.apache.hadoop.hbase.CoprocessorEnvironment; |
| import org.apache.hadoop.hbase.HConstants; |
| import org.apache.hadoop.hbase.HConstants.OperationStatusCode; |
| import org.apache.hadoop.hbase.HRegionInfo; |
| import org.apache.hadoop.hbase.KeyValue; |
| import org.apache.hadoop.hbase.KeyValueUtil; |
| import org.apache.hadoop.hbase.client.Delete; |
| import org.apache.hadoop.hbase.client.Durability; |
| import org.apache.hadoop.hbase.client.Increment; |
| import org.apache.hadoop.hbase.client.Mutation; |
| import org.apache.hadoop.hbase.client.Put; |
| import org.apache.hadoop.hbase.client.Result; |
| import org.apache.hadoop.hbase.client.TableDescriptor; |
| import org.apache.hadoop.hbase.client.TableDescriptorBuilder; |
| import org.apache.hadoop.hbase.coprocessor.ObserverContext; |
| import org.apache.hadoop.hbase.coprocessor.RegionCoprocessor; |
| import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment; |
| import org.apache.hadoop.hbase.coprocessor.RegionObserver; |
| import org.apache.hadoop.hbase.regionserver.MiniBatchOperationInProgress; |
| import org.apache.hadoop.hbase.regionserver.OperationStatus; |
| import org.apache.hadoop.hbase.regionserver.Region; |
| import org.apache.hadoop.hbase.util.Bytes; |
| import org.apache.hadoop.hbase.util.Pair; |
| import org.apache.hadoop.hbase.wal.WALEdit; |
| import org.apache.htrace.Span; |
| import org.apache.htrace.Trace; |
| import org.apache.htrace.TraceScope; |
| import org.apache.phoenix.coprocessor.BaseScannerRegionObserver.ReplayWrite; |
| import org.apache.phoenix.coprocessor.DelegateRegionCoprocessorEnvironment; |
| import org.apache.phoenix.hbase.index.LockManager.RowLock; |
| import org.apache.phoenix.hbase.index.builder.FatalIndexBuildingFailureException; |
| import org.apache.phoenix.hbase.index.builder.IndexBuildManager; |
| import org.apache.phoenix.hbase.index.builder.IndexBuilder; |
| import org.apache.phoenix.hbase.index.metrics.MetricsIndexerSource; |
| import org.apache.phoenix.hbase.index.metrics.MetricsIndexerSourceFactory; |
| import org.apache.phoenix.hbase.index.table.HTableInterfaceReference; |
| import org.apache.phoenix.hbase.index.util.ImmutableBytesPtr; |
| import org.apache.phoenix.hbase.index.util.IndexManagementUtil; |
| import org.apache.phoenix.hbase.index.util.VersionUtil; |
| import org.apache.phoenix.hbase.index.wal.IndexedKeyValue; |
| import org.apache.phoenix.hbase.index.write.IndexFailurePolicy; |
| import org.apache.phoenix.hbase.index.write.IndexWriter; |
| import org.apache.phoenix.hbase.index.write.RecoveryIndexWriter; |
| import org.apache.phoenix.hbase.index.write.recovery.PerRegionIndexWriteCache; |
| import org.apache.phoenix.hbase.index.write.recovery.StoreFailuresInCachePolicy; |
| import org.apache.phoenix.query.QueryServicesOptions; |
| import org.apache.phoenix.trace.TracingUtils; |
| import org.apache.phoenix.trace.util.NullSpan; |
| import org.apache.phoenix.util.EnvironmentEdgeManager; |
| import org.apache.phoenix.util.IndexUtil; |
| import org.apache.phoenix.util.ScanUtil; |
| import org.apache.phoenix.util.ServerUtil; |
| import org.apache.phoenix.util.ServerUtil.ConnectionType; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| import org.apache.phoenix.thirdparty.com.google.common.collect.Lists; |
| import org.apache.phoenix.thirdparty.com.google.common.collect.Multimap; |
| |
| /** |
| * Do all the work of managing index updates from a single coprocessor. All Puts/Deletes are passed |
| * to an {@link IndexBuilder} to determine the actual updates to make. |
| * <p> |
| * If the WAL is enabled, these updates are added to the WALEdit and written to the WAL once the |
| * WALEdit has been persisted. If any of the index updates fail, this server is immediately |
| * terminated and we rely on WAL replay to attempt the index updates again (see |
| * #preWALRestore(ObserverContext, RegionInfo, WALKey, WALEdit)). |
| * <p> |
| * If the WAL is disabled, the updates are attempted immediately. No consistency guarantees are |
| * made if the WAL is disabled - some, all, or none of the index updates may succeed. All updates |
| * in a single batch must have the same durability level - either everything gets written to the |
| * WAL or nothing does. Mixed-durability updates within a single batch are not currently |
| * supported; if you need different durability levels, split the updates into separate batches, |
| * as sketched below. |
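| * <p> |
| * A minimal client-side sketch of splitting by durability (rows, family, qualifier, and |
| * values are hypothetical): |
| * <pre>{@code |
| * Put durable = new Put(rowA).addColumn(FAM, QUAL, VAL).setDurability(Durability.SYNC_WAL); |
| * Put nonDurable = new Put(rowB).addColumn(FAM, QUAL, VAL).setDurability(Durability.SKIP_WAL); |
| * table.batch(Collections.singletonList(durable), new Object[1]); |
| * table.batch(Collections.singletonList(nonDurable), new Object[1]); |
| * }</pre> |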
| * <p> |
| * We don't need to implement {@link #postPut(ObserverContext, Put, WALEdit, Durability)} and |
| * {@link #postDelete(ObserverContext, Delete, WALEdit, Durability)} hooks because |
| * Phoenix always does batch mutations. |
| * <p> |
| */ |
| public class Indexer implements RegionObserver, RegionCoprocessor { |
| |
| private static final Logger LOGGER = LoggerFactory.getLogger(Indexer.class); |
| // Status for mutations we skip entirely (atomic ON DUPLICATE KEY mutations are committed |
| // separately via preIncrementAfterRowLock). |
| private static final OperationStatus IGNORE = new OperationStatus(OperationStatusCode.SUCCESS); |
| // Status for mutations whose data table write is suppressed (index-only replay) but which |
| // still count as successful. |
| private static final OperationStatus NOWRITE = new OperationStatus(OperationStatusCode.SUCCESS); |
| |
| |
| protected IndexWriter writer; |
| protected IndexBuildManager builder; |
| private LockManager lockManager; |
| |
| // Hack to get around not being able to save any state between |
| // coprocessor calls. TODO: remove after HBASE-18127 when available |
| private static class BatchMutateContext { |
| public final int clientVersion; |
| public Collection<Pair<Mutation, byte[]>> indexUpdates = Collections.emptyList(); |
| public List<RowLock> rowLocks = Lists.newArrayListWithExpectedSize(QueryServicesOptions.DEFAULT_MUTATE_BATCH_SIZE); |
| |
| public BatchMutateContext(int clientVersion) { |
| this.clientVersion = clientVersion; |
| } |
| } |
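| |
| // A ThreadLocal is sufficient here: preBatchMutate and postBatchMutateIndispensably for a |
| // given batch run on the same RPC handler thread, so the context never crosses threads. |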
| |
| private ThreadLocal<BatchMutateContext> batchMutateContext = |
| new ThreadLocal<BatchMutateContext>(); |
| |
| /** Configuration key for the {@link IndexBuilder} to use */ |
| public static final String INDEX_BUILDER_CONF_KEY = "index.builder"; |
| |
| /** |
| * Configuration key for whether the indexer should check the version of HBase it is running on. |
| * Generally, you only want to disable this check for testing or for custom versions of HBase. |
| */ |
| // Note: the "saleforce" misspelling is historical; the key is kept as-is for compatibility. |
| public static final String CHECK_VERSION_CONF_KEY = "com.saleforce.hbase.index.checkversion"; |
| |
| private static final String INDEX_RECOVERY_FAILURE_POLICY_KEY = "org.apache.hadoop.hbase.index.recovery.failurepolicy"; |
| |
| private static final String INDEXER_INDEX_WRITE_SLOW_THRESHOLD_KEY = "phoenix.indexer.slow.post.batch.mutate.threshold"; |
| private static final long INDEXER_INDEX_WRITE_SLOW_THRESHOLD_DEFAULT = 3_000; |
| private static final String INDEXER_INDEX_PREPARE_SLOW_THRESHOLD_KEY = "phoenix.indexer.slow.pre.batch.mutate.threshold"; |
| private static final long INDEXER_INDEX_PREPARE_SLOW_THRESHOLD_DEFAULT = 3_000; |
| private static final String INDEXER_PRE_WAL_RESTORE_SLOW_THRESHOLD_KEY = "phoenix.indexer.slow.pre.wal.restore.threshold"; |
| private static final long INDEXER_PRE_WAL_RESTORE_SLOW_THRESHOLD_DEFAULT = 3_000; |
| private static final String INDEXER_POST_OPEN_SLOW_THRESHOLD_KEY = "phoenix.indexer.slow.open.threshold"; |
| private static final long INDEXER_POST_OPEN_SLOW_THRESHOLD_DEFAULT = 3_000; |
| private static final String INDEXER_PRE_INCREMENT_SLOW_THRESHOLD_KEY = "phoenix.indexer.slow.pre.increment"; |
| private static final long INDEXER_PRE_INCREMENT_SLOW_THRESHOLD_DEFAULT = 3_000; |
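| |
| // Each threshold above can be tuned via hbase-site.xml (values in milliseconds), e.g.: |
| //   <property> |
| //     <name>phoenix.indexer.slow.pre.increment</name> |
| //     <value>5000</value> |
| //   </property> |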
| |
| /** |
| * Cache of the failed updates to the various regions. Used to make the WAL recovery mechanisms |
| * more robust when recovering index regions that were on the same server as the primary table |
| * region. |
| */ |
| private PerRegionIndexWriteCache failedIndexEdits = new PerRegionIndexWriteCache(); |
| |
| /** |
| * IndexWriter for writing the recovered index edits. Kept separate from the main writer since |
| * it needs different write/failure policies. |
| */ |
| private IndexWriter recoveryWriter; |
| |
| private MetricsIndexerSource metricSource; |
| |
| private boolean stopped; |
| private boolean disabled; |
| private long slowIndexWriteThreshold; |
| private long slowIndexPrepareThreshold; |
| private long slowPreWALRestoreThreshold; |
| private long slowPostOpenThreshold; |
| private long slowPreIncrementThreshold; |
| private int rowLockWaitDuration; |
| private String dataTableName; |
| |
| public static final String RecoveryFailurePolicyKeyForTesting = INDEX_RECOVERY_FAILURE_POLICY_KEY; |
| |
| public static final int INDEXING_SUPPORTED_MAJOR_VERSION = VersionUtil |
| .encodeMaxPatchVersion(0, 94); |
| public static final int INDEXING_SUPPORTED__MIN_MAJOR_VERSION = VersionUtil |
| .encodeVersion("0.94.0"); |
| private static final int INDEX_WAL_COMPRESSION_MINIMUM_SUPPORTED_VERSION = VersionUtil |
| .encodeVersion("0.94.9"); |
| |
| private static final int DEFAULT_ROWLOCK_WAIT_DURATION = 30000; |
| |
| @Override |
| public Optional<RegionObserver> getRegionObserver() { |
| return Optional.of(this); |
| } |
| |
| @Override |
| public void start(CoprocessorEnvironment e) throws IOException { |
| try { |
| final RegionCoprocessorEnvironment env = (RegionCoprocessorEnvironment) e; |
| String serverName = env.getServerName().getServerName(); |
| if (env.getConfiguration().getBoolean(CHECK_VERSION_CONF_KEY, true)) { |
| // make sure this HBase version <-> WAL compression combination is supported. |
| String errormsg = Indexer.validateVersion(env.getHBaseVersion(), env.getConfiguration()); |
| if (errormsg != null) { |
| throw new FatalIndexBuildingFailureException(errormsg); |
| } |
| } |
| |
| this.builder = new IndexBuildManager(env); |
| // Clone the config since it is shared |
| DelegateRegionCoprocessorEnvironment indexWriterEnv = new DelegateRegionCoprocessorEnvironment(env, ConnectionType.INDEX_WRITER_CONNECTION); |
| // setup the actual index writer |
| this.writer = new IndexWriter(indexWriterEnv, serverName + "-index-writer"); |
| |
| this.rowLockWaitDuration = env.getConfiguration().getInt("hbase.rowlock.wait.duration", |
| DEFAULT_ROWLOCK_WAIT_DURATION); |
| this.lockManager = new LockManager(); |
| |
| // Metrics impl for the Indexer -- avoiding unnecessary indirection for hadoop-1/2 compat |
| this.metricSource = MetricsIndexerSourceFactory.getInstance().getIndexerSource(); |
| setSlowThresholds(e.getConfiguration()); |
| this.dataTableName = env.getRegionInfo().getTable().getNameAsString(); |
| try { |
| // get the specified failure policy. We only ever override it in tests, but we need to do it |
| // here |
| Class<? extends IndexFailurePolicy> policyClass = |
| env.getConfiguration().getClass(INDEX_RECOVERY_FAILURE_POLICY_KEY, |
| StoreFailuresInCachePolicy.class, IndexFailurePolicy.class); |
| IndexFailurePolicy policy = |
| policyClass.getConstructor(PerRegionIndexWriteCache.class).newInstance(failedIndexEdits); |
| LOGGER.debug("Setting up recovery writter with failure policy: " + policy.getClass()); |
| recoveryWriter = |
| new RecoveryIndexWriter(policy, indexWriterEnv, serverName + "-recovery-writer"); |
| } catch (Exception ex) { |
| throw new IOException("Could not instantiate recovery failure policy!", ex); |
| } |
| } catch (NoSuchMethodError ex) { |
| disabled = true; |
| LOGGER.error("Must be too early a version of HBase. Disabled coprocessor ", ex); |
| } |
| } |
| |
| /** |
| * Extracts the slow call threshold values from the configuration. |
| */ |
| private void setSlowThresholds(Configuration c) { |
| slowIndexPrepareThreshold = c.getLong(INDEXER_INDEX_PREPARE_SLOW_THRESHOLD_KEY, |
| INDEXER_INDEX_PREPARE_SLOW_THRESHOLD_DEFAULT); |
| slowIndexWriteThreshold = c.getLong(INDEXER_INDEX_WRITE_SLOW_THRESHOLD_KEY, |
| INDEXER_INDEX_WRITE_SLOW_THRESHOLD_DEFAULT); |
| slowPreWALRestoreThreshold = c.getLong(INDEXER_PRE_WAL_RESTORE_SLOW_THRESHOLD_KEY, |
| INDEXER_PRE_WAL_RESTORE_SLOW_THRESHOLD_DEFAULT); |
| slowPostOpenThreshold = c.getLong(INDEXER_POST_OPEN_SLOW_THRESHOLD_KEY, |
| INDEXER_POST_OPEN_SLOW_THRESHOLD_DEFAULT); |
| slowPreIncrementThreshold = c.getLong(INDEXER_PRE_INCREMENT_SLOW_THRESHOLD_KEY, |
| INDEXER_PRE_INCREMENT_SLOW_THRESHOLD_DEFAULT); |
| } |
| |
| private String getCallTooSlowMessage(String callName, long duration, long threshold) { |
| StringBuilder sb = new StringBuilder(64); |
| sb.append("(callTooSlow) ").append(callName).append(" duration=").append(duration); |
| sb.append("ms, threshold=").append(threshold).append("ms"); |
| return sb.toString(); |
| } |
| |
| |
| @Override |
| public void stop(CoprocessorEnvironment e) throws IOException { |
| if (this.stopped) { |
| return; |
| } |
| if (this.disabled) { |
| return; |
| } |
| this.stopped = true; |
| String msg = "Indexer is being stopped"; |
| this.builder.stop(msg); |
| this.writer.stop(msg); |
| this.recoveryWriter.stop(msg); |
| } |
| |
| /** |
| * We use an Increment to serialize the ON DUPLICATE KEY clause so that the HBase plumbing |
| * sets up the necessary locks and mvcc to allow an atomic update. The Increment is not a |
| * real increment, though; it's really more of a Put. We translate the Increment into a |
| * list of mutations - at most a single Put and a single Delete - that represent the changes |
| * from executing the list of ON DUPLICATE KEY clauses for this row. |
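| * <p> |
| * A sketch of the client-side Phoenix statement this path serves (table and columns are |
| * hypothetical): |
| * <pre>{@code |
| * UPSERT INTO t(k, v) VALUES ('a', 1) ON DUPLICATE KEY UPDATE v = v + 1; |
| * }</pre> |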
| */ |
| @Override |
| public Result preIncrementAfterRowLock(final ObserverContext<RegionCoprocessorEnvironment> e, |
| final Increment inc) throws IOException { |
| long start = EnvironmentEdgeManager.currentTimeMillis(); |
| try { |
| List<Mutation> mutations = this.builder.executeAtomicOp(inc); |
| if (mutations == null) { |
| return null; |
| } |
| |
| // Causes the Increment to be ignored as we're committing the mutations |
| // ourselves below. |
| e.bypass(); |
| // ON DUPLICATE KEY IGNORE will return empty list if row already exists |
| // as no action is required in that case. |
| if (!mutations.isEmpty()) { |
| Region region = e.getEnvironment().getRegion(); |
| // Otherwise, submit the mutations directly here |
| region.batchMutate(mutations.toArray(new Mutation[0])); |
| } |
| return Result.EMPTY_RESULT; |
| } catch (Throwable t) { |
| throw ServerUtil.createIOException( |
| "Unable to process ON DUPLICATE IGNORE for " + |
| e.getEnvironment().getRegion().getRegionInfo().getTable().getNameAsString() + |
| "(" + Bytes.toStringBinary(inc.getRow()) + ")", t); |
| } finally { |
| long duration = EnvironmentEdgeManager.currentTimeMillis() - start; |
| if (duration >= slowPreIncrementThreshold) { |
| if (LOGGER.isDebugEnabled()) { |
| LOGGER.debug(getCallTooSlowMessage("preIncrementAfterRowLock", |
| duration, slowPreIncrementThreshold)); |
| } |
| metricSource.incrementSlowDuplicateKeyCheckCalls(dataTableName); |
| } |
| metricSource.updateDuplicateKeyCheckTime(dataTableName, duration); |
| } |
| } |
| |
| @Override |
| public void preBatchMutate(ObserverContext<RegionCoprocessorEnvironment> c, |
| MiniBatchOperationInProgress<Mutation> miniBatchOp) throws IOException { |
| if (this.disabled) { |
| return; |
| } |
| long start = EnvironmentEdgeManager.currentTimeMillis(); |
| try { |
| preBatchMutateWithExceptions(c, miniBatchOp); |
| return; |
| } catch (Throwable t) { |
| rethrowIndexingException(t); |
| } finally { |
| long duration = EnvironmentEdgeManager.currentTimeMillis() - start; |
| if (duration >= slowIndexPrepareThreshold) { |
| if (LOGGER.isDebugEnabled()) { |
| LOGGER.debug(getCallTooSlowMessage("preBatchMutate", |
| duration, slowIndexPrepareThreshold)); |
| } |
| metricSource.incrementNumSlowIndexPrepareCalls(dataTableName); |
| } |
| metricSource.updateIndexPrepareTime(dataTableName, duration); |
| } |
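| // Defensive: rethrowIndexingException should always throw, but if it ever returns |
| // normally we fail loudly rather than silently skip the index update. |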
| throw new RuntimeException( |
| "Somehow didn't return an index update but also didn't propagate the failure to the client!"); |
| } |
| |
| // Overwrite the 8-byte timestamp of the KeyValue in place within its backing buffer. |
| private static void setTimeStamp(KeyValue kv, byte[] tsBytes) { |
| int tsOffset = kv.getTimestampOffset(); |
| System.arraycopy(tsBytes, 0, kv.getBuffer(), tsOffset, Bytes.SIZEOF_LONG); |
| } |
| |
| public void preBatchMutateWithExceptions(ObserverContext<RegionCoprocessorEnvironment> c, |
| MiniBatchOperationInProgress<Mutation> miniBatchOp) throws Throwable { |
| |
| // Need to add cell tags to Delete markers before we do any index processing, |
| // since we add tags even to tables that don't have any indexes. |
| IndexUtil.setDeleteAttributes(miniBatchOp); |
| // first group all the updates for a single row into a single update to be processed |
| Map<ImmutableBytesPtr, MultiMutation> mutationsMap = |
| new HashMap<ImmutableBytesPtr, MultiMutation>(); |
| |
| Durability defaultDurability = Durability.SYNC_WAL; |
| if (c.getEnvironment().getRegion() != null) { |
| defaultDurability = c.getEnvironment().getRegion().getTableDescriptor().getDurability(); |
| defaultDurability = (defaultDurability == Durability.USE_DEFAULT) ? |
| Durability.SYNC_WAL : defaultDurability; |
| } |
| /* |
| * Exclusively lock all rows so we get a consistent read |
| * while determining the index updates |
| */ |
| BatchMutateContext context = new BatchMutateContext(this.builder.getIndexMetaData(miniBatchOp).getClientVersion()); |
| setBatchMutateContext(c, context); |
| Durability durability = Durability.SKIP_WAL; |
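| // Start from the weakest durability and promote to the strongest one seen in the batch |
| // (Durability ordinals: USE_DEFAULT < SKIP_WAL < ASYNC_WAL < SYNC_WAL < FSYNC_WAL). |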
| boolean copyMutations = false; |
| for (int i = 0; i < miniBatchOp.size(); i++) { |
| Mutation m = miniBatchOp.getOperation(i); |
| if (this.builder.isAtomicOp(m)) { |
| miniBatchOp.setOperationStatus(i, IGNORE); |
| continue; |
| } |
| if (this.builder.isEnabled(m)) { |
| context.rowLocks.add(lockManager.lockRow(m.getRow(), rowLockWaitDuration)); |
| Durability effectiveDurablity = (m.getDurability() == Durability.USE_DEFAULT) ? |
| defaultDurability : m.getDurability(); |
| if (effectiveDurablity.ordinal() > durability.ordinal()) { |
| durability = effectiveDurablity; |
| } |
| // Track whether we need to copy the mutations: copying is only needed when the |
| // same row appears more than once in this batch. |
| ImmutableBytesPtr row = new ImmutableBytesPtr(m.getRow()); |
| if (mutationsMap.containsKey(row)) { |
| copyMutations = true; |
| } else { |
| mutationsMap.put(row, null); |
| } |
| } |
| } |
| |
| // early exit if it turns out we don't have any edits |
| if (mutationsMap.isEmpty()) { |
| return; |
| } |
| |
| // If we're copying the mutations, the map's values (filled in below) become the batch; |
| // otherwise we collect the original mutation objects. |
| Collection<Mutation> originalMutations; |
| Collection<? extends Mutation> mutations; |
| if (copyMutations) { |
| originalMutations = null; |
| mutations = mutationsMap.values(); |
| } else { |
| originalMutations = Lists.newArrayListWithExpectedSize(mutationsMap.size()); |
| mutations = originalMutations; |
| } |
| |
| Mutation firstMutation = miniBatchOp.getOperation(0); |
| ReplayWrite replayWrite = this.builder.getReplayWrite(firstMutation); |
| boolean resetTimeStamp = replayWrite == null; |
| long now = EnvironmentEdgeManager.currentTimeMillis(); |
| for (int i = 0; i < miniBatchOp.size(); i++) { |
| Mutation m = miniBatchOp.getOperation(i); |
| // skip this mutation if we aren't enabling indexing |
| // unfortunately, we really should ask if the raw mutation (rather than the combined mutation) |
| // should be indexed, which means we need to expose another method on the builder. Such is the |
| // way the optimization goes, though. |
| if (miniBatchOp.getOperationStatus(i) != IGNORE && this.builder.isEnabled(m)) { |
| if (resetTimeStamp) { |
| // Unless we're replaying edits to rebuild the index, we update the time stamp |
| // of the data table to prevent overlapping time stamps (overlaps would cause index |
| // inconsistencies, as that case isn't currently handled correctly). |
| for (List<Cell> cells : m.getFamilyCellMap().values()) { |
| for (Cell cell : cells) { |
| CellUtil.setTimestamp(cell, now); |
| } |
| } |
| } |
| // No need to write the table mutations when we're rebuilding |
| // the index as they're already written and just being replayed. |
| if (replayWrite == ReplayWrite.INDEX_ONLY |
| || replayWrite == ReplayWrite.REBUILD_INDEX_ONLY) { |
| miniBatchOp.setOperationStatus(i, NOWRITE); |
| } |
| |
| // Only copy mutations if we found duplicate rows |
| // which only occurs when we're partially rebuilding |
| // the index (since we'll potentially have both a |
| // Put and a Delete mutation for the same row). |
| if (copyMutations) { |
| // Add the mutation to the batch set |
| |
| ImmutableBytesPtr row = new ImmutableBytesPtr(m.getRow()); |
| MultiMutation stored = mutationsMap.get(row); |
| // we haven't seen this row before, so add it |
| if (stored == null) { |
| stored = new MultiMutation(row); |
| mutationsMap.put(row, stored); |
| } |
| stored.addAll(m); |
| } else { |
| originalMutations.add(m); |
| } |
| } |
| } |
| |
| // dump all the index updates into a single WALEdit. They will get combined in the end |
| // anyway, so don't worry about which one we use. |
| WALEdit edit = miniBatchOp.getWalEdit(0); |
| if (edit == null) { |
| edit = new WALEdit(); |
| miniBatchOp.setWalEdit(0, edit); |
| } |
| |
| if (copyMutations || replayWrite != null) { |
| mutations = IndexManagementUtil.flattenMutationsByTimestamp(mutations); |
| } |
| |
| // get the current span, or just use a null-span to avoid a bunch of if statements |
| try (TraceScope scope = Trace.startSpan("Starting to build index updates")) { |
| Span current = scope.getSpan(); |
| if (current == null) { |
| current = NullSpan.INSTANCE; |
| } |
| long start = EnvironmentEdgeManager.currentTimeMillis(); |
| |
| // get the index updates for all elements in this batch |
| Collection<Pair<Mutation, byte[]>> indexUpdates = |
| this.builder.getIndexUpdate(miniBatchOp, mutations); |
| |
| |
| long duration = EnvironmentEdgeManager.currentTimeMillis() - start; |
| if (duration >= slowIndexPrepareThreshold) { |
| if (LOGGER.isDebugEnabled()) { |
| LOGGER.debug(getCallTooSlowMessage( |
| "indexPrepare", duration, slowIndexPrepareThreshold)); |
| } |
| metricSource.incrementNumSlowIndexPrepareCalls(dataTableName); |
| } |
| metricSource.updateIndexPrepareTime(dataTableName, duration); |
| current.addTimelineAnnotation("Built index updates, doing preStep"); |
| TracingUtils.addAnnotation(current, "index update count", indexUpdates.size()); |
| byte[] tableName = c.getEnvironment().getRegion().getTableDescriptor().getTableName().getName(); |
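| // Split the updates: those that target this same data table are applied as part of |
| // the current batch via addOperationsFromCP; the rest are handed to the index writer |
| // (and, when durable, attached to the WAL) below. |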
| Iterator<Pair<Mutation, byte[]>> indexUpdatesItr = indexUpdates.iterator(); |
| List<Mutation> localUpdates = new ArrayList<Mutation>(indexUpdates.size()); |
| while (indexUpdatesItr.hasNext()) { |
| Pair<Mutation, byte[]> next = indexUpdatesItr.next(); |
| if (Bytes.compareTo(next.getSecond(), tableName) == 0) { |
| localUpdates.add(next.getFirst()); |
| indexUpdatesItr.remove(); |
| } |
| } |
| if (!localUpdates.isEmpty()) { |
| miniBatchOp.addOperationsFromCP(0, |
| localUpdates.toArray(new Mutation[localUpdates.size()])); |
| } |
| if (!indexUpdates.isEmpty()) { |
| context.indexUpdates = indexUpdates; |
| // write index updates to WAL |
| if (durability != Durability.SKIP_WAL) { |
| // we are using WAL durability, so just attach the index updates to the WAL entry and move on |
| for (Pair<Mutation, byte[]> entry : indexUpdates) { |
| edit.add(IndexedKeyValue.newIndexedKeyValue(entry.getSecond(), |
| entry.getFirst())); |
| } |
| } |
| } |
| } |
| |
| } |
| |
| private void setBatchMutateContext(ObserverContext<RegionCoprocessorEnvironment> c, BatchMutateContext context) { |
| this.batchMutateContext.set(context); |
| } |
| |
| private BatchMutateContext getBatchMutateContext(ObserverContext<RegionCoprocessorEnvironment> c) { |
| return this.batchMutateContext.get(); |
| } |
| |
| private void removeBatchMutateContext(ObserverContext<RegionCoprocessorEnvironment> c) { |
| this.batchMutateContext.remove(); |
| } |
| |
| @Override |
| public void postBatchMutateIndispensably(ObserverContext<RegionCoprocessorEnvironment> c, |
| MiniBatchOperationInProgress<Mutation> miniBatchOp, final boolean success) throws IOException { |
| if (this.disabled) { |
| return; |
| } |
| long start = EnvironmentEdgeManager.currentTimeMillis(); |
| BatchMutateContext context = getBatchMutateContext(c); |
| if (context == null) { |
| return; |
| } |
| try { |
| for (RowLock rowLock : context.rowLocks) { |
| rowLock.release(); |
| } |
| this.builder.batchCompleted(miniBatchOp); |
| |
| if (success) { // if miniBatchOp was successfully written, write index updates |
| doPost(c, context); |
| } |
| } finally { |
| removeBatchMutateContext(c); |
| long duration = EnvironmentEdgeManager.currentTimeMillis() - start; |
| if (duration >= slowIndexWriteThreshold) { |
| if (LOGGER.isDebugEnabled()) { |
| LOGGER.debug(getCallTooSlowMessage("postBatchMutateIndispensably", |
| duration, slowIndexWriteThreshold)); |
| } |
| metricSource.incrementNumSlowIndexWriteCalls(dataTableName); |
| } |
| metricSource.updateIndexWriteTime(dataTableName, duration); |
| } |
| } |
| |
| private void doPost(ObserverContext<RegionCoprocessorEnvironment> c, BatchMutateContext context) throws IOException { |
| try { |
| doPostWithExceptions(c,context); |
| return; |
| } catch (Throwable e) { |
| rethrowIndexingException(e); |
| } |
| throw new RuntimeException( |
| "Somehow didn't complete the index update, but didn't return succesfully either!"); |
| } |
| |
| private void doPostWithExceptions(ObserverContext<RegionCoprocessorEnvironment> c, BatchMutateContext context) |
| throws IOException { |
| //short circuit, if we don't need to do any work |
| if (context == null || context.indexUpdates.isEmpty()) { |
| return; |
| } |
| |
| // get the current span, or just use a null-span to avoid a bunch of if statements |
| try (TraceScope scope = Trace.startSpan("Completing index writes")) { |
| Span current = scope.getSpan(); |
| if (current == null) { |
| current = NullSpan.INSTANCE; |
| } |
| long start = EnvironmentEdgeManager.currentTimeMillis(); |
| |
| current.addTimelineAnnotation("Actually doing index update for first time"); |
| writer.writeAndHandleFailure(context.indexUpdates, false, context.clientVersion); |
| |
| long duration = EnvironmentEdgeManager.currentTimeMillis() - start; |
| if (duration >= slowIndexWriteThreshold) { |
| if (LOGGER.isDebugEnabled()) { |
| LOGGER.debug(getCallTooSlowMessage("indexWrite", |
| duration, slowIndexWriteThreshold)); |
| } |
| metricSource.incrementNumSlowIndexWriteCalls(dataTableName); |
| } |
| metricSource.updateIndexWriteTime(dataTableName, duration); |
| } |
| } |
| |
| /** |
| * Extract the index updates from the WAL Edit |
| * @param edit to search for index updates |
| * @return the mutations to apply to the index tables |
| */ |
| private Collection<Pair<Mutation, byte[]>> extractIndexUpdate(WALEdit edit) { |
| // Avoid multiple internal array resizings. Initial size of 64, unless we have fewer cells in the edit |
| int initialSize = Math.min(edit.size(), 64); |
| Collection<Pair<Mutation, byte[]>> indexUpdates = new ArrayList<Pair<Mutation, byte[]>>(initialSize); |
| for (Cell kv : edit.getCells()) { |
| if (kv instanceof IndexedKeyValue) { |
| IndexedKeyValue ikv = (IndexedKeyValue) kv; |
| indexUpdates.add(new Pair<Mutation, byte[]>(ikv.getMutation(), ikv.getIndexTable())); |
| } |
| } |
| |
| return indexUpdates; |
| } |
| |
| @Override |
| public void postOpen(final ObserverContext<RegionCoprocessorEnvironment> c) { |
| Multimap<HTableInterfaceReference, Mutation> updates = failedIndexEdits.getEdits(c.getEnvironment().getRegion()); |
| |
| if (this.disabled) { |
| return; |
| } |
| |
| long start = EnvironmentEdgeManager.currentTimeMillis(); |
| try { |
| //if we have no pending edits to complete, then we are done |
| if (updates == null || updates.size() == 0) { |
| return; |
| } |
| |
| LOGGER.info("Found some outstanding index updates that didn't succeed during" |
| + " WAL replay - attempting to replay now."); |
| |
| // do the usual writer stuff, killing the server again, if we can't manage to make the index |
| // writes succeed again |
| try { |
| writer.writeAndHandleFailure(updates, true, ScanUtil.UNKNOWN_CLIENT_VERSION); |
| } catch (IOException e) { |
| LOGGER.error("During WAL replay of outstanding index updates, " |
| + "Exception is thrown instead of killing server during index writing", e); |
| } |
| } finally { |
| long duration = EnvironmentEdgeManager.currentTimeMillis() - start; |
| if (duration >= slowPostOpenThreshold) { |
| if (LOGGER.isDebugEnabled()) { |
| LOGGER.debug(getCallTooSlowMessage("postOpen", duration, slowPostOpenThreshold)); |
| } |
| metricSource.incrementNumSlowPostOpenCalls(dataTableName); |
| } |
| metricSource.updatePostOpenTime(dataTableName, duration); |
| } |
| } |
| |
| @Override |
| public void preWALRestore( |
| org.apache.hadoop.hbase.coprocessor.ObserverContext<? extends RegionCoprocessorEnvironment> ctx, |
| org.apache.hadoop.hbase.client.RegionInfo info, org.apache.hadoop.hbase.wal.WALKey logKey, WALEdit logEdit) |
| throws IOException { |
| |
| if (this.disabled) { |
| return; |
| } |
| |
| // TODO check the regions in transition. If the server on which the region lives is this one, |
| // then we should retry that write later in postOpen. |
| // we might be able to get even smarter here and pre-split the edits that are server-local |
| // into their own recovered.edits file. This then lets us do a straightforward recovery of each |
| // region (and more efficiently as we aren't writing quite as hectically from this one place). |
| |
| long start = EnvironmentEdgeManager.currentTimeMillis(); |
| try { |
| /* |
| * Basically, we let the index regions recover for a little while longer before retrying, in |
| * the hope that they come up before the primary table finishes recovering. |
| */ |
| Collection<Pair<Mutation, byte[]>> indexUpdates = extractIndexUpdate(logEdit); |
| recoveryWriter.writeAndHandleFailure(indexUpdates, true, ScanUtil.UNKNOWN_CLIENT_VERSION); |
| } finally { |
| long duration = EnvironmentEdgeManager.currentTimeMillis() - start; |
| if (duration >= slowPreWALRestoreThreshold) { |
| if (LOGGER.isDebugEnabled()) { |
| LOGGER.debug(getCallTooSlowMessage("preWALRestore", |
| duration, slowPreWALRestoreThreshold)); |
| } |
| metricSource.incrementNumSlowPreWALRestoreCalls(dataTableName); |
| } |
| metricSource.updatePreWALRestoreTime(dataTableName, duration); |
| } |
| } |
| |
| |
| /** |
| * Exposed for testing! |
| * @return the currently instantiated index builder |
| */ |
| public IndexBuilder getBuilderForTesting() { |
| return this.builder.getBuilderForTesting(); |
| } |
| |
| /** |
| * Validate that the version and configuration parameters are supported |
| * @param hbaseVersion current version of HBase on which <tt>this</tt> coprocessor is installed |
| * @param conf configuration to check for allowed parameters (e.g. WAL compression is only |
| * allowed on HBase versions {@code >= 0.94.9}) |
| * @return <tt>null</tt> if the version is supported, the error message to display otherwise |
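| * <p> |
| * For example, {@code validateVersion("0.94.4", conf)} returns {@code null} when WAL |
| * compression is disabled, and an error message when it is enabled (0.94.4 predates |
| * 0.94.9). |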
| */ |
| public static String validateVersion(String hbaseVersion, Configuration conf) { |
| int encodedVersion = VersionUtil.encodeVersion(hbaseVersion); |
| // above 0.94 everything should be supported |
| if (encodedVersion > INDEXING_SUPPORTED_MAJOR_VERSION) { |
| return null; |
| } |
| // check to see if it's at least 0.94 |
| if (encodedVersion < INDEXING_SUPPORTED__MIN_MAJOR_VERSION) { |
| return "Indexing not supported for versions older than 0.94.X"; |
| } |
| // if less than 0.94.9, we need to check if WAL Compression is enabled |
| if (encodedVersion < INDEX_WAL_COMPRESSION_MINIMUM_SUPPORTED_VERSION) { |
| if (conf.getBoolean(HConstants.ENABLE_WAL_COMPRESSION, false)) { |
| return "Indexing not supported with WAL Compression for versions of HBase older than 0.94.9 - found version:" |
| + hbaseVersion; |
| } |
| } |
| return null; |
| } |
| |
| /** |
| * Enable indexing on the given table |
| * @param descBuilder {@link TableDescriptorBuilder} for the table on which indexing should be enabled |
| * @param builder class to use when building the index for this table |
| * @param properties map of custom configuration options to make available to your |
| * {@link IndexBuilder} on the server-side |
| * @param priority priority at which to register the Indexer coprocessor |
| * @throws IOException the Indexer coprocessor cannot be added |
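| * <p> |
| * A minimal sketch of use (the {@code MyIndexBuilder} class and {@code admin} handle are |
| * hypothetical): |
| * <pre>{@code |
| * TableDescriptorBuilder desc = TableDescriptorBuilder.newBuilder(TableName.valueOf("t")); |
| * Indexer.enableIndexing(desc, MyIndexBuilder.class, null, Coprocessor.PRIORITY_USER); |
| * admin.createTable(desc.build()); |
| * }</pre> |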
| */ |
| public static void enableIndexing(TableDescriptorBuilder descBuilder, Class<? extends IndexBuilder> builder, |
| Map<String, String> properties, int priority) throws IOException { |
| if (properties == null) { |
| properties = new HashMap<String, String>(); |
| } |
| properties.put(Indexer.INDEX_BUILDER_CONF_KEY, builder.getName()); |
| descBuilder.addCoprocessor(Indexer.class.getName(), null, priority, properties); |
| } |
| } |
| |