| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.hadoop.hbase.regionserver; |
| |
| import static org.apache.hadoop.hbase.HConstants.REPLICATION_SCOPE_LOCAL; |
| import static org.apache.hadoop.hbase.regionserver.HStoreFile.MAJOR_COMPACTION_KEY; |
| import static org.apache.hadoop.hbase.trace.HBaseSemanticAttributes.REGION_NAMES_KEY; |
| import static org.apache.hadoop.hbase.trace.HBaseSemanticAttributes.ROW_LOCK_READ_LOCK_KEY; |
| import static org.apache.hadoop.hbase.util.ConcurrentMapUtils.computeIfAbsent; |
| |
| import com.google.errorprone.annotations.RestrictedApi; |
| import edu.umd.cs.findbugs.annotations.Nullable; |
| import io.opentelemetry.api.trace.Span; |
| import java.io.EOFException; |
| import java.io.FileNotFoundException; |
| import java.io.IOException; |
| import java.io.InterruptedIOException; |
| import java.lang.reflect.Constructor; |
| import java.nio.ByteBuffer; |
| import java.nio.charset.StandardCharsets; |
| import java.text.ParseException; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.Collection; |
| import java.util.Collections; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.Iterator; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Map.Entry; |
| import java.util.NavigableMap; |
| import java.util.NavigableSet; |
| import java.util.Objects; |
| import java.util.Optional; |
| import java.util.RandomAccess; |
| import java.util.Set; |
| import java.util.TreeMap; |
| import java.util.UUID; |
| import java.util.concurrent.Callable; |
| import java.util.concurrent.CompletionService; |
| import java.util.concurrent.ConcurrentHashMap; |
| import java.util.concurrent.ConcurrentMap; |
| import java.util.concurrent.ConcurrentSkipListMap; |
| import java.util.concurrent.ExecutionException; |
| import java.util.concurrent.ExecutorCompletionService; |
| import java.util.concurrent.ExecutorService; |
| import java.util.concurrent.Executors; |
| import java.util.concurrent.Future; |
| import java.util.concurrent.FutureTask; |
| import java.util.concurrent.ThreadFactory; |
| import java.util.concurrent.ThreadPoolExecutor; |
| import java.util.concurrent.TimeUnit; |
| import java.util.concurrent.TimeoutException; |
| import java.util.concurrent.atomic.AtomicBoolean; |
| import java.util.concurrent.atomic.AtomicInteger; |
| import java.util.concurrent.atomic.LongAdder; |
| import java.util.concurrent.locks.Lock; |
| import java.util.concurrent.locks.ReadWriteLock; |
| import java.util.concurrent.locks.ReentrantReadWriteLock; |
| import java.util.function.Function; |
| import java.util.stream.Collectors; |
| import java.util.stream.Stream; |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.hadoop.fs.FileStatus; |
| import org.apache.hadoop.fs.FileSystem; |
| import org.apache.hadoop.fs.LocatedFileStatus; |
| import org.apache.hadoop.fs.Path; |
| import org.apache.hadoop.hbase.Cell; |
| import org.apache.hadoop.hbase.CellBuilderType; |
| import org.apache.hadoop.hbase.CellComparator; |
| import org.apache.hadoop.hbase.CellComparatorImpl; |
| import org.apache.hadoop.hbase.CellScanner; |
| import org.apache.hadoop.hbase.CellUtil; |
| import org.apache.hadoop.hbase.CompareOperator; |
| import org.apache.hadoop.hbase.CompoundConfiguration; |
| import org.apache.hadoop.hbase.DoNotRetryIOException; |
| import org.apache.hadoop.hbase.DroppedSnapshotException; |
| import org.apache.hadoop.hbase.ExtendedCell; |
| import org.apache.hadoop.hbase.ExtendedCellBuilderFactory; |
| import org.apache.hadoop.hbase.HConstants; |
| import org.apache.hadoop.hbase.HConstants.OperationStatusCode; |
| import org.apache.hadoop.hbase.HDFSBlocksDistribution; |
| import org.apache.hadoop.hbase.KeyValue; |
| import org.apache.hadoop.hbase.MetaCellComparator; |
| import org.apache.hadoop.hbase.NamespaceDescriptor; |
| import org.apache.hadoop.hbase.NotServingRegionException; |
| import org.apache.hadoop.hbase.PrivateCellUtil; |
| import org.apache.hadoop.hbase.RegionTooBusyException; |
| import org.apache.hadoop.hbase.TableName; |
| import org.apache.hadoop.hbase.Tag; |
| import org.apache.hadoop.hbase.TagUtil; |
| import org.apache.hadoop.hbase.client.Append; |
| import org.apache.hadoop.hbase.client.CheckAndMutate; |
| import org.apache.hadoop.hbase.client.CheckAndMutateResult; |
| import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor; |
| import org.apache.hadoop.hbase.client.CompactionState; |
| import org.apache.hadoop.hbase.client.Delete; |
| import org.apache.hadoop.hbase.client.Durability; |
| import org.apache.hadoop.hbase.client.Get; |
| import org.apache.hadoop.hbase.client.Increment; |
| import org.apache.hadoop.hbase.client.IsolationLevel; |
| import org.apache.hadoop.hbase.client.Mutation; |
| import org.apache.hadoop.hbase.client.Put; |
| import org.apache.hadoop.hbase.client.RegionInfo; |
| import org.apache.hadoop.hbase.client.RegionInfoBuilder; |
| import org.apache.hadoop.hbase.client.RegionReplicaUtil; |
| import org.apache.hadoop.hbase.client.Result; |
| import org.apache.hadoop.hbase.client.Row; |
| import org.apache.hadoop.hbase.client.RowMutations; |
| import org.apache.hadoop.hbase.client.Scan; |
| import org.apache.hadoop.hbase.client.TableDescriptor; |
| import org.apache.hadoop.hbase.client.TableDescriptorBuilder; |
| import org.apache.hadoop.hbase.conf.ConfigurationManager; |
| import org.apache.hadoop.hbase.conf.PropagatingConfigurationObserver; |
| import org.apache.hadoop.hbase.coprocessor.CoprocessorHost; |
| import org.apache.hadoop.hbase.coprocessor.ReadOnlyConfiguration; |
| import org.apache.hadoop.hbase.errorhandling.ForeignExceptionSnare; |
| import org.apache.hadoop.hbase.exceptions.FailedSanityCheckException; |
| import org.apache.hadoop.hbase.exceptions.TimeoutIOException; |
| import org.apache.hadoop.hbase.exceptions.UnknownProtocolException; |
| import org.apache.hadoop.hbase.filter.BinaryComparator; |
| import org.apache.hadoop.hbase.filter.ByteArrayComparable; |
| import org.apache.hadoop.hbase.filter.Filter; |
| import org.apache.hadoop.hbase.io.HFileLink; |
| import org.apache.hadoop.hbase.io.HeapSize; |
| import org.apache.hadoop.hbase.io.TimeRange; |
| import org.apache.hadoop.hbase.io.hfile.BlockCache; |
| import org.apache.hadoop.hbase.io.hfile.CombinedBlockCache; |
| import org.apache.hadoop.hbase.io.hfile.HFile; |
| import org.apache.hadoop.hbase.io.hfile.bucket.BucketCache; |
| import org.apache.hadoop.hbase.ipc.CoprocessorRpcUtils; |
| import org.apache.hadoop.hbase.ipc.RpcCall; |
| import org.apache.hadoop.hbase.ipc.RpcServer; |
| import org.apache.hadoop.hbase.mob.MobFileCache; |
| import org.apache.hadoop.hbase.monitoring.MonitoredTask; |
| import org.apache.hadoop.hbase.monitoring.TaskMonitor; |
| import org.apache.hadoop.hbase.quotas.RegionServerSpaceQuotaManager; |
| import org.apache.hadoop.hbase.regionserver.MultiVersionConcurrencyControl.WriteEntry; |
| import org.apache.hadoop.hbase.regionserver.compactions.CompactionContext; |
| import org.apache.hadoop.hbase.regionserver.compactions.CompactionLifeCycleTracker; |
| import org.apache.hadoop.hbase.regionserver.metrics.MetricsTableRequests; |
| import org.apache.hadoop.hbase.regionserver.throttle.CompactionThroughputControllerFactory; |
| import org.apache.hadoop.hbase.regionserver.throttle.NoLimitThroughputController; |
| import org.apache.hadoop.hbase.regionserver.throttle.StoreHotnessProtector; |
| import org.apache.hadoop.hbase.regionserver.throttle.ThroughputController; |
| import org.apache.hadoop.hbase.regionserver.wal.WALSyncTimeoutIOException; |
| import org.apache.hadoop.hbase.regionserver.wal.WALUtil; |
| import org.apache.hadoop.hbase.replication.ReplicationUtils; |
| import org.apache.hadoop.hbase.replication.regionserver.ReplicationObserver; |
| import org.apache.hadoop.hbase.security.User; |
| import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils; |
| import org.apache.hadoop.hbase.snapshot.SnapshotManifest; |
| import org.apache.hadoop.hbase.trace.TraceUtil; |
| import org.apache.hadoop.hbase.util.Bytes; |
| import org.apache.hadoop.hbase.util.CancelableProgressable; |
| import org.apache.hadoop.hbase.util.ClassSize; |
| import org.apache.hadoop.hbase.util.CommonFSUtils; |
| import org.apache.hadoop.hbase.util.CoprocessorConfigurationUtil; |
| import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; |
| import org.apache.hadoop.hbase.util.FSUtils; |
| import org.apache.hadoop.hbase.util.HashedBytes; |
| import org.apache.hadoop.hbase.util.NonceKey; |
| import org.apache.hadoop.hbase.util.Pair; |
| import org.apache.hadoop.hbase.util.ServerRegionReplicaUtil; |
| import org.apache.hadoop.hbase.util.TableDescriptorChecker; |
| import org.apache.hadoop.hbase.util.Threads; |
| import org.apache.hadoop.hbase.wal.WAL; |
| import org.apache.hadoop.hbase.wal.WALEdit; |
| import org.apache.hadoop.hbase.wal.WALFactory; |
| import org.apache.hadoop.hbase.wal.WALKey; |
| import org.apache.hadoop.hbase.wal.WALKeyImpl; |
| import org.apache.hadoop.hbase.wal.WALSplitUtil; |
| import org.apache.hadoop.hbase.wal.WALSplitUtil.MutationReplay; |
| import org.apache.hadoop.hbase.wal.WALStreamReader; |
| import org.apache.hadoop.util.StringUtils; |
| import org.apache.yetus.audience.InterfaceAudience; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| import org.apache.hbase.thirdparty.com.google.common.base.Preconditions; |
| import org.apache.hbase.thirdparty.com.google.common.collect.Iterables; |
| import org.apache.hbase.thirdparty.com.google.common.collect.Lists; |
| import org.apache.hbase.thirdparty.com.google.common.collect.Maps; |
| import org.apache.hbase.thirdparty.com.google.common.io.Closeables; |
| import org.apache.hbase.thirdparty.com.google.protobuf.Service; |
| import org.apache.hbase.thirdparty.com.google.protobuf.TextFormat; |
| import org.apache.hbase.thirdparty.com.google.protobuf.UnsafeByteOperations; |
| import org.apache.hbase.thirdparty.org.apache.commons.collections4.CollectionUtils; |
| |
| import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; |
| import org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos; |
| import org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos.CoprocessorServiceCall; |
| import org.apache.hadoop.hbase.shaded.protobuf.generated.ClusterStatusProtos.RegionLoad; |
| import org.apache.hadoop.hbase.shaded.protobuf.generated.ClusterStatusProtos.StoreSequenceId; |
| import org.apache.hadoop.hbase.shaded.protobuf.generated.SnapshotProtos.SnapshotDescription; |
| import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos; |
| import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.CompactionDescriptor; |
| import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.FlushDescriptor; |
| import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.FlushDescriptor.FlushAction; |
| import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.FlushDescriptor.StoreFlushDescriptor; |
| import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.RegionEventDescriptor; |
| import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.RegionEventDescriptor.EventType; |
| import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.StoreDescriptor; |
| |
| /** |
| * Regions store data for a certain region of a table. It stores all columns for each row. A given |
| * table consists of one or more Regions. |
| * <p> |
| * A Region is defined by its table and its key extent. |
| * <p> |
| * Locking at the Region level serves only one purpose: preventing the region from being closed (and |
| * consequently split) while other operations are ongoing. Each row level operation obtains both a |
| * row lock and a region read lock for the duration of the operation. While a scanner is being |
| * constructed, getScanner holds a read lock. If the scanner is successfully constructed, it holds a |
| * read lock until it is closed. A close takes out a write lock and consequently will block for |
| * ongoing operations and will block new operations from starting while the close is in progress. |
| */ |
| @SuppressWarnings("deprecation") |
| @InterfaceAudience.Private |
| public class HRegion implements HeapSize, PropagatingConfigurationObserver, Region { |
| private static final Logger LOG = LoggerFactory.getLogger(HRegion.class); |
| |
| public static final String LOAD_CFS_ON_DEMAND_CONFIG_KEY = |
| "hbase.hregion.scan.loadColumnFamiliesOnDemand"; |
| |
| public static final String HBASE_MAX_CELL_SIZE_KEY = "hbase.server.keyvalue.maxsize"; |
| public static final int DEFAULT_MAX_CELL_SIZE = 10485760; |
| |
| public static final String HBASE_REGIONSERVER_MINIBATCH_SIZE = |
| "hbase.regionserver.minibatch.size"; |
| public static final int DEFAULT_HBASE_REGIONSERVER_MINIBATCH_SIZE = 20000; |
| |
| public static final String WAL_HSYNC_CONF_KEY = "hbase.wal.hsync"; |
| public static final boolean DEFAULT_WAL_HSYNC = false; |
| |
| /** Parameter name for compaction after bulkload */ |
| public static final String COMPACTION_AFTER_BULKLOAD_ENABLE = |
| "hbase.compaction.after.bulkload.enable"; |
| |
| /** Config to allow splits when the file count exceeds the configured blocking file count */ |
| public static final String SPLIT_IGNORE_BLOCKING_ENABLED_KEY = |
| "hbase.hregion.split.ignore.blocking.enabled"; |
| |
| public static final String REGION_STORAGE_POLICY_KEY = "hbase.hregion.block.storage.policy"; |
| public static final String DEFAULT_REGION_STORAGE_POLICY = "NONE"; |
| |
| /** |
| * This is for using HRegion as a local storage, where we may put the recovered edits in a |
| * special place. Once this is set, we will only replay the recovered edits under this directory |
| * and ignore the original replay directory configs. |
| */ |
| public static final String SPECIAL_RECOVERED_EDITS_DIR = |
| "hbase.hregion.special.recovered.edits.dir"; |
| |
| /** |
| * Mainly used for master local region, where we will replay the WAL file directly without |
| * splitting, so it is possible to have WAL files which are not closed cleanly, in this way, |
| * hitting EOF is expected so should not consider it as a critical problem. |
| */ |
| public static final String RECOVERED_EDITS_IGNORE_EOF = |
| "hbase.hregion.recovered.edits.ignore.eof"; |
| |
| /** |
| * Whether to use {@link MetaCellComparator} even if we are not meta region. Used when creating |
| * master local region. |
| */ |
| public static final String USE_META_CELL_COMPARATOR = "hbase.region.use.meta.cell.comparator"; |
| |
| public static final boolean DEFAULT_USE_META_CELL_COMPARATOR = false; |
| |
| final AtomicBoolean closed = new AtomicBoolean(false); |
| |
| /* |
| * Closing can take some time; use the closing flag if there is stuff we don't want to do while in |
| * closing state; e.g. like offer this region up to the master as a region to close if the |
| * carrying regionserver is overloaded. Once set, it is never cleared. |
| */ |
| final AtomicBoolean closing = new AtomicBoolean(false); |
| |
| /** |
| * The max sequence id of flushed data on this region. There is no edit in memory that is less |
| * than this sequence id. |
| */ |
| private volatile long maxFlushedSeqId = HConstants.NO_SEQNUM; |
| |
| /** |
| * Record the sequence id of last flush operation. Can be in advance of {@link #maxFlushedSeqId} |
| * when flushing a single column family. In this case, {@link #maxFlushedSeqId} will be older than |
| * the oldest edit in memory. |
| */ |
| private volatile long lastFlushOpSeqId = HConstants.NO_SEQNUM; |
| |
| /** |
| * The sequence id of the last replayed open region event from the primary region. This is used to |
| * skip entries before this due to the possibility of replay edits coming out of order from |
| * replication. |
| */ |
| protected volatile long lastReplayedOpenRegionSeqId = -1L; |
| protected volatile long lastReplayedCompactionSeqId = -1L; |
| |
| ////////////////////////////////////////////////////////////////////////////// |
| // Members |
| ////////////////////////////////////////////////////////////////////////////// |
| |
| // map from a locked row to the context for that lock including: |
| // - CountDownLatch for threads waiting on that row |
| // - the thread that owns the lock (allow reentrancy) |
| // - reference count of (reentrant) locks held by the thread |
| // - the row itself |
| private final ConcurrentHashMap<HashedBytes, RowLockContext> lockedRows = |
| new ConcurrentHashMap<>(); |
| |
| protected final Map<byte[], HStore> stores = |
| new ConcurrentSkipListMap<>(Bytes.BYTES_RAWCOMPARATOR); |
| |
| // TODO: account for each registered handler in HeapSize computation |
| private Map<String, com.google.protobuf.Service> coprocessorServiceHandlers = Maps.newHashMap(); |
| |
| // Track data size in all memstores |
| private final MemStoreSizing memStoreSizing = new ThreadSafeMemStoreSizing(); |
| RegionServicesForStores regionServicesForStores; |
| |
| // Debug possible data loss due to WAL off |
| final LongAdder numMutationsWithoutWAL = new LongAdder(); |
| final LongAdder dataInMemoryWithoutWAL = new LongAdder(); |
| |
| // Debug why CAS operations are taking a while. |
| final LongAdder checkAndMutateChecksPassed = new LongAdder(); |
| final LongAdder checkAndMutateChecksFailed = new LongAdder(); |
| |
| // Number of requests |
| // Count rows for scan |
| final LongAdder readRequestsCount = new LongAdder(); |
| final LongAdder filteredReadRequestsCount = new LongAdder(); |
| // Count rows for multi row mutations |
| final LongAdder writeRequestsCount = new LongAdder(); |
| |
| // Number of requests blocked by memstore size. |
| private final LongAdder blockedRequestsCount = new LongAdder(); |
| |
| // Compaction LongAdders |
| final LongAdder compactionsFinished = new LongAdder(); |
| final LongAdder compactionsFailed = new LongAdder(); |
| final LongAdder compactionNumFilesCompacted = new LongAdder(); |
| final LongAdder compactionNumBytesCompacted = new LongAdder(); |
| final LongAdder compactionsQueued = new LongAdder(); |
| final LongAdder flushesQueued = new LongAdder(); |
| |
| private BlockCache blockCache; |
| private MobFileCache mobFileCache; |
| private final WAL wal; |
| private final HRegionFileSystem fs; |
| protected final Configuration conf; |
| private final Configuration baseConf; |
| private final int rowLockWaitDuration; |
| static final int DEFAULT_ROWLOCK_WAIT_DURATION = 30000; |
| |
| private Path regionWalDir; |
| private FileSystem walFS; |
| |
| // set to true if the region is restored from snapshot |
| private boolean isRestoredRegion = false; |
| |
| public void setRestoredRegion(boolean restoredRegion) { |
| isRestoredRegion = restoredRegion; |
| } |
| |
| public MetricsTableRequests getMetricsTableRequests() { |
| return metricsTableRequests; |
| } |
| |
| // Handle table latency metrics |
| private MetricsTableRequests metricsTableRequests; |
| |
| // The internal wait duration to acquire a lock before read/update |
| // from the region. It is not per row. The purpose of this wait time |
| // is to avoid waiting a long time while the region is busy, so that |
| // we can release the IPC handler soon enough to improve the |
| // availability of the region server. It can be adjusted by |
| // tuning configuration "hbase.busy.wait.duration". |
| final long busyWaitDuration; |
| static final long DEFAULT_BUSY_WAIT_DURATION = HConstants.DEFAULT_HBASE_RPC_TIMEOUT; |
| |
| // If updating multiple rows in one call, wait longer, |
| // i.e. waiting for busyWaitDuration * # of rows. However, |
| // we can limit the max multiplier. |
| final int maxBusyWaitMultiplier; |
| |
| // Max busy wait duration. There is no point to wait longer than the RPC |
| // purge timeout, when a RPC call will be terminated by the RPC engine. |
| final long maxBusyWaitDuration; |
| |
| // Max cell size. If nonzero, the maximum allowed size for any given cell |
| // in bytes |
| final long maxCellSize; |
| |
| // Number of mutations for minibatch processing. |
| private final int miniBatchSize; |
| |
| // negative number indicates infinite timeout |
| static final long DEFAULT_ROW_PROCESSOR_TIMEOUT = 60 * 1000L; |
| final ExecutorService rowProcessorExecutor = Executors.newCachedThreadPool(); |
| |
| final ConcurrentHashMap<RegionScanner, Long> scannerReadPoints; |
| final ReadPointCalculationLock smallestReadPointCalcLock; |
| |
| /** |
| * The sequence ID that was encountered when this region was opened. |
| */ |
| private long openSeqNum = HConstants.NO_SEQNUM; |
| |
| /** |
| * The default setting for whether to enable on-demand CF loading for scan requests to this |
| * region. Requests can override it. |
| */ |
| private boolean isLoadingCfsOnDemandDefault = false; |
| |
| private final AtomicInteger majorInProgress = new AtomicInteger(0); |
| private final AtomicInteger minorInProgress = new AtomicInteger(0); |
| |
| // |
| // Context: During replay we want to ensure that we do not lose any data. So, we |
| // have to be conservative in how we replay wals. For each store, we calculate |
| // the maxSeqId up to which the store was flushed. And, skip the edits which |
| // are equal to or lower than maxSeqId for each store. |
| // The following map is populated when opening the region |
| Map<byte[], Long> maxSeqIdInStores = new TreeMap<>(Bytes.BYTES_COMPARATOR); |
| |
| /** Saved state from replaying prepare flush cache */ |
| private PrepareFlushResult prepareFlushResult = null; |
| |
| private volatile ConfigurationManager configurationManager; |
| |
| // Used for testing. |
| private volatile Long timeoutForWriteLock = null; |
| |
| private final CellComparator cellComparator; |
| |
| private final int minBlockSizeBytes; |
| |
| /** |
| * @return The smallest mvcc readPoint across all the scanners in this region. Writes older than |
| * this readPoint, are included in every read operation. |
| */ |
| public long getSmallestReadPoint() { |
| // We need to ensure that while we are calculating the smallestReadPoint |
| // no new RegionScanners can grab a readPoint that we are unaware of. |
| smallestReadPointCalcLock.lock(ReadPointCalculationLock.LockType.CALCULATION_LOCK); |
| try { |
| long minimumReadPoint = mvcc.getReadPoint(); |
| for (Long readPoint : this.scannerReadPoints.values()) { |
| minimumReadPoint = Math.min(minimumReadPoint, readPoint); |
| } |
| return minimumReadPoint; |
| } finally { |
| smallestReadPointCalcLock.unlock(ReadPointCalculationLock.LockType.CALCULATION_LOCK); |
| } |
| } |
| |
| /* |
| * Data structure of write state flags used coordinating flushes, compactions and closes. |
| */ |
| static class WriteState { |
| // Set while a memstore flush is happening. |
| volatile boolean flushing = false; |
| // Set when a flush has been requested. |
| volatile boolean flushRequested = false; |
| // Number of compactions running. |
| AtomicInteger compacting = new AtomicInteger(0); |
| // Gets set in close. If set, cannot compact or flush again. |
| volatile boolean writesEnabled = true; |
| // Set if region is read-only |
| volatile boolean readOnly = false; |
| // whether the reads are enabled. This is different than readOnly, because readOnly is |
| // static in the lifetime of the region, while readsEnabled is dynamic |
| volatile boolean readsEnabled = true; |
| |
| /** |
| * Set flags that make this region read-only. |
| * @param onOff flip value for region r/o setting |
| */ |
| synchronized void setReadOnly(final boolean onOff) { |
| this.writesEnabled = !onOff; |
| this.readOnly = onOff; |
| } |
| |
| boolean isReadOnly() { |
| return this.readOnly; |
| } |
| |
| boolean isFlushRequested() { |
| return this.flushRequested; |
| } |
| |
| void setReadsEnabled(boolean readsEnabled) { |
| this.readsEnabled = readsEnabled; |
| } |
| |
| static final long HEAP_SIZE = ClassSize.align(ClassSize.OBJECT + 5 * Bytes.SIZEOF_BOOLEAN); |
| } |
| |
| /** |
| * Objects from this class are created when flushing to describe all the different states that |
| * that method ends up in. The Result enum describes those states. The sequence id should only be |
| * specified if the flush was successful, and the failure message should only be specified if it |
| * didn't flush. |
| */ |
| public static class FlushResultImpl implements FlushResult { |
| final Result result; |
| final String failureReason; |
| final long flushSequenceId; |
| final boolean wroteFlushWalMarker; |
| |
| /** |
| * Convenience constructor to use when the flush is successful, the failure message is set to |
| * null. |
| * @param result Expecting FLUSHED_NO_COMPACTION_NEEDED or FLUSHED_COMPACTION_NEEDED. |
| * @param flushSequenceId Generated sequence id that comes right after the edits in the |
| * memstores. |
| */ |
| FlushResultImpl(Result result, long flushSequenceId) { |
| this(result, flushSequenceId, null, false); |
| assert result == Result.FLUSHED_NO_COMPACTION_NEEDED |
| || result == Result.FLUSHED_COMPACTION_NEEDED; |
| } |
| |
| /** |
| * Convenience constructor to use when we cannot flush. |
| * @param result Expecting CANNOT_FLUSH_MEMSTORE_EMPTY or CANNOT_FLUSH. |
| * @param failureReason Reason why we couldn't flush. |
| */ |
| FlushResultImpl(Result result, String failureReason, boolean wroteFlushMarker) { |
| this(result, -1, failureReason, wroteFlushMarker); |
| assert result == Result.CANNOT_FLUSH_MEMSTORE_EMPTY || result == Result.CANNOT_FLUSH; |
| } |
| |
| /** |
| * Constructor with all the parameters. |
| * @param result Any of the Result. |
| * @param flushSequenceId Generated sequence id if the memstores were flushed else -1. |
| * @param failureReason Reason why we couldn't flush, or null. |
| */ |
| FlushResultImpl(Result result, long flushSequenceId, String failureReason, |
| boolean wroteFlushMarker) { |
| this.result = result; |
| this.flushSequenceId = flushSequenceId; |
| this.failureReason = failureReason; |
| this.wroteFlushWalMarker = wroteFlushMarker; |
| } |
| |
| /** |
| * Convenience method, the equivalent of checking if result is FLUSHED_NO_COMPACTION_NEEDED or |
| * FLUSHED_NO_COMPACTION_NEEDED. |
| * @return true if the memstores were flushed, else false. |
| */ |
| @Override |
| public boolean isFlushSucceeded() { |
| return result == Result.FLUSHED_NO_COMPACTION_NEEDED |
| || result == Result.FLUSHED_COMPACTION_NEEDED; |
| } |
| |
| /** |
| * Convenience method, the equivalent of checking if result is FLUSHED_COMPACTION_NEEDED. |
| * @return True if the flush requested a compaction, else false (doesn't even mean it flushed). |
| */ |
| @Override |
| public boolean isCompactionNeeded() { |
| return result == Result.FLUSHED_COMPACTION_NEEDED; |
| } |
| |
| @Override |
| public String toString() { |
| return new StringBuilder().append("flush result:").append(result).append(", ") |
| .append("failureReason:").append(failureReason).append(",").append("flush seq id") |
| .append(flushSequenceId).toString(); |
| } |
| |
| @Override |
| public Result getResult() { |
| return result; |
| } |
| } |
| |
| /** A result object from prepare flush cache stage */ |
| static class PrepareFlushResult { |
| final FlushResultImpl result; // indicating a failure result from prepare |
| final TreeMap<byte[], StoreFlushContext> storeFlushCtxs; |
| final TreeMap<byte[], List<Path>> committedFiles; |
| final TreeMap<byte[], MemStoreSize> storeFlushableSize; |
| final long startTime; |
| final long flushOpSeqId; |
| final long flushedSeqId; |
| final MemStoreSizing totalFlushableSize; |
| |
| /** Constructs an early exit case */ |
| PrepareFlushResult(FlushResultImpl result, long flushSeqId) { |
| this(result, null, null, null, Math.max(0, flushSeqId), 0, 0, MemStoreSizing.DUD); |
| } |
| |
| /** Constructs a successful prepare flush result */ |
| PrepareFlushResult(TreeMap<byte[], StoreFlushContext> storeFlushCtxs, |
| TreeMap<byte[], List<Path>> committedFiles, TreeMap<byte[], MemStoreSize> storeFlushableSize, |
| long startTime, long flushSeqId, long flushedSeqId, MemStoreSizing totalFlushableSize) { |
| this(null, storeFlushCtxs, committedFiles, storeFlushableSize, startTime, flushSeqId, |
| flushedSeqId, totalFlushableSize); |
| } |
| |
| private PrepareFlushResult(FlushResultImpl result, |
| TreeMap<byte[], StoreFlushContext> storeFlushCtxs, TreeMap<byte[], List<Path>> committedFiles, |
| TreeMap<byte[], MemStoreSize> storeFlushableSize, long startTime, long flushSeqId, |
| long flushedSeqId, MemStoreSizing totalFlushableSize) { |
| this.result = result; |
| this.storeFlushCtxs = storeFlushCtxs; |
| this.committedFiles = committedFiles; |
| this.storeFlushableSize = storeFlushableSize; |
| this.startTime = startTime; |
| this.flushOpSeqId = flushSeqId; |
| this.flushedSeqId = flushedSeqId; |
| this.totalFlushableSize = totalFlushableSize; |
| } |
| |
| public FlushResult getResult() { |
| return this.result; |
| } |
| } |
| |
| /** |
| * A class that tracks exceptions that have been observed in one batch. Not thread safe. |
| */ |
| static class ObservedExceptionsInBatch { |
| private boolean wrongRegion = false; |
| private boolean failedSanityCheck = false; |
| private boolean wrongFamily = false; |
| |
| /** Returns If a {@link WrongRegionException} has been observed. */ |
| boolean hasSeenWrongRegion() { |
| return wrongRegion; |
| } |
| |
| /** |
| * Records that a {@link WrongRegionException} has been observed. |
| */ |
| void sawWrongRegion() { |
| wrongRegion = true; |
| } |
| |
| /** Returns If a {@link FailedSanityCheckException} has been observed. */ |
| boolean hasSeenFailedSanityCheck() { |
| return failedSanityCheck; |
| } |
| |
| /** |
| * Records that a {@link FailedSanityCheckException} has been observed. |
| */ |
| void sawFailedSanityCheck() { |
| failedSanityCheck = true; |
| } |
| |
| /** Returns If a {@link NoSuchColumnFamilyException} has been observed. */ |
| boolean hasSeenNoSuchFamily() { |
| return wrongFamily; |
| } |
| |
| /** |
| * Records that a {@link NoSuchColumnFamilyException} has been observed. |
| */ |
| void sawNoSuchFamily() { |
| wrongFamily = true; |
| } |
| } |
| |
| final WriteState writestate = new WriteState(); |
| |
| long memstoreFlushSize; |
| final long timestampSlop; |
| final long rowProcessorTimeout; |
| |
| // Last flush time for each Store. Useful when we are flushing for each column |
| private final ConcurrentMap<HStore, Long> lastStoreFlushTimeMap = new ConcurrentHashMap<>(); |
| |
| protected RegionServerServices rsServices; |
| private RegionServerAccounting rsAccounting; |
| private long flushCheckInterval; |
| // flushPerChanges is to prevent too many changes in memstore |
| private long flushPerChanges; |
| private long blockingMemStoreSize; |
| // Used to guard closes |
| final ReentrantReadWriteLock lock; |
| // Used to track interruptible holders of the region lock. Currently that is only RPC handler |
| // threads. Boolean value in map determines if lock holder can be interrupted, normally true, |
| // but may be false when thread is transiting a critical section. |
| final ConcurrentHashMap<Thread, Boolean> regionLockHolders; |
| |
| // Stop updates lock |
| private final ReentrantReadWriteLock updatesLock = new ReentrantReadWriteLock(); |
| |
| private final MultiVersionConcurrencyControl mvcc; |
| |
| // Coprocessor host |
| private volatile RegionCoprocessorHost coprocessorHost; |
| |
| private TableDescriptor htableDescriptor = null; |
| private RegionSplitPolicy splitPolicy; |
| private RegionSplitRestriction splitRestriction; |
| private FlushPolicy flushPolicy; |
| |
| private final MetricsRegion metricsRegion; |
| private final MetricsRegionWrapperImpl metricsRegionWrapper; |
| private final Durability regionDurability; |
| private final boolean regionStatsEnabled; |
| // Stores the replication scope of the various column families of the table |
| // that has non-default scope |
| private final NavigableMap<byte[], Integer> replicationScope = |
| new TreeMap<>(Bytes.BYTES_COMPARATOR); |
| |
| private final StoreHotnessProtector storeHotnessProtector; |
| |
/**
 * HRegion constructor that wraps the given filesystem, table directory and region info in a new
 * {@link HRegionFileSystem} and then delegates to the
 * {@link HRegionFileSystem}-based constructor. This constructor should only be used for testing
 * and extensions. Instances of HRegion should be instantiated with the
 * {@link HRegion#createHRegion} or {@link HRegion#openHRegion} method.
 * @param tableDir   qualified path of directory where region should be located, usually the table
 *                   directory.
 * @param wal        The WAL is the outbound log for any updates to the HRegion. The wal file is a
 *                   logfile from the previous execution that's custom-computed for this HRegion.
 *                   The HRegionServer computes and sorts the appropriate wal info for this
 *                   HRegion. If there is a previous wal file (implying that the HRegion has been
 *                   written-to before), then read it from the supplied path.
 * @param fs         the filesystem.
 * @param confParam  the global configuration settings.
 * @param regionInfo the {@link RegionInfo} that describes the region.
 * @param htd        the table descriptor
 * @param rsServices reference to {@link RegionServerServices} or null
 * @deprecated Use other constructors.
 */
@Deprecated
public HRegion(final Path tableDir, final WAL wal, final FileSystem fs,
  final Configuration confParam, final RegionInfo regionInfo, final TableDescriptor htd,
  final RegionServerServices rsServices) {
  this(new HRegionFileSystem(confParam, fs, tableDir, regionInfo), wal, confParam, htd,
    rsServices);
}
| |
| /** |
| * HRegion constructor. This constructor should only be used for testing and extensions. Instances |
| * of HRegion should be instantiated with the {@link HRegion#createHRegion} or |
| * {@link HRegion#openHRegion} method. |
| * @param fs is the filesystem. |
| * @param wal The WAL is the outbound log for any updates to the HRegion The wal file is a |
| * logfile from the previous execution that's custom-computed for this HRegion. |
| * The HRegionServer computes and sorts the appropriate wal info for this |
| * HRegion. If there is a previous wal file (implying that the HRegion has been |
| * written-to before), then read it from the supplied path. |
| * @param confParam is global configuration settings. |
| * @param htd the table descriptor |
| * @param rsServices reference to {@link RegionServerServices} or null |
| */ |
| public HRegion(final HRegionFileSystem fs, final WAL wal, final Configuration confParam, |
| final TableDescriptor htd, final RegionServerServices rsServices) { |
| if (htd == null) { |
| throw new IllegalArgumentException("Need table descriptor"); |
| } |
| |
| if (confParam instanceof CompoundConfiguration) { |
| throw new IllegalArgumentException("Need original base configuration"); |
| } |
| |
| this.wal = wal; |
| this.fs = fs; |
| this.mvcc = new MultiVersionConcurrencyControl(getRegionInfo().getShortNameToLog()); |
| |
| // 'conf' renamed to 'confParam' b/c we use this.conf in the constructor |
| this.baseConf = confParam; |
| this.conf = new CompoundConfiguration().add(confParam).addBytesMap(htd.getValues()); |
| this.cellComparator = htd.isMetaTable() |
| || conf.getBoolean(USE_META_CELL_COMPARATOR, DEFAULT_USE_META_CELL_COMPARATOR) |
| ? MetaCellComparator.META_COMPARATOR |
| : CellComparatorImpl.COMPARATOR; |
| this.lock = new ReentrantReadWriteLock( |
| conf.getBoolean(FAIR_REENTRANT_CLOSE_LOCK, DEFAULT_FAIR_REENTRANT_CLOSE_LOCK)); |
| this.regionLockHolders = new ConcurrentHashMap<>(); |
| this.flushCheckInterval = |
| conf.getInt(MEMSTORE_PERIODIC_FLUSH_INTERVAL, DEFAULT_CACHE_FLUSH_INTERVAL); |
| this.flushPerChanges = conf.getLong(MEMSTORE_FLUSH_PER_CHANGES, DEFAULT_FLUSH_PER_CHANGES); |
| if (this.flushPerChanges > MAX_FLUSH_PER_CHANGES) { |
| throw new IllegalArgumentException( |
| MEMSTORE_FLUSH_PER_CHANGES + " can not exceed " + MAX_FLUSH_PER_CHANGES); |
| } |
| int tmpRowLockDuration = |
| conf.getInt("hbase.rowlock.wait.duration", DEFAULT_ROWLOCK_WAIT_DURATION); |
| if (tmpRowLockDuration <= 0) { |
| LOG.info("Found hbase.rowlock.wait.duration set to {}. values <= 0 will cause all row " |
| + "locking to fail. Treating it as 1ms to avoid region failure.", tmpRowLockDuration); |
| tmpRowLockDuration = 1; |
| } |
| this.rowLockWaitDuration = tmpRowLockDuration; |
| |
| this.smallestReadPointCalcLock = new ReadPointCalculationLock(conf); |
| |
| this.isLoadingCfsOnDemandDefault = conf.getBoolean(LOAD_CFS_ON_DEMAND_CONFIG_KEY, true); |
| this.htableDescriptor = htd; |
| Set<byte[]> families = this.htableDescriptor.getColumnFamilyNames(); |
| for (byte[] family : families) { |
| if (!replicationScope.containsKey(family)) { |
| int scope = htd.getColumnFamily(family).getScope(); |
| // Only store those families that has NON-DEFAULT scope |
| if (scope != REPLICATION_SCOPE_LOCAL) { |
| // Do a copy before storing it here. |
| replicationScope.put(Bytes.copy(family), scope); |
| } |
| } |
| } |
| |
| this.rsServices = rsServices; |
| if (rsServices != null) { |
| this.blockCache = rsServices.getBlockCache().orElse(null); |
| this.mobFileCache = rsServices.getMobFileCache().orElse(null); |
| } |
| this.regionServicesForStores = new RegionServicesForStores(this, rsServices); |
| |
| setHTableSpecificConf(); |
| this.scannerReadPoints = new ConcurrentHashMap<>(); |
| |
| this.busyWaitDuration = conf.getLong("hbase.busy.wait.duration", DEFAULT_BUSY_WAIT_DURATION); |
| this.maxBusyWaitMultiplier = conf.getInt("hbase.busy.wait.multiplier.max", 2); |
| if (busyWaitDuration * maxBusyWaitMultiplier <= 0L) { |
| throw new IllegalArgumentException("Invalid hbase.busy.wait.duration (" + busyWaitDuration |
| + ") or hbase.busy.wait.multiplier.max (" + maxBusyWaitMultiplier |
| + "). Their product should be positive"); |
| } |
| this.maxBusyWaitDuration = |
| conf.getLong("hbase.ipc.client.call.purge.timeout", 2 * HConstants.DEFAULT_HBASE_RPC_TIMEOUT); |
| |
| /* |
| * timestamp.slop provides a server-side constraint on the timestamp. This assumes that you base |
| * your TS around EnvironmentEdgeManager.currentTime(). In this case, throw an error to the user |
| * if the user-specified TS is newer than now + slop. LATEST_TIMESTAMP == don't use this |
| * functionality |
| */ |
| this.timestampSlop = |
| conf.getLong("hbase.hregion.keyvalue.timestamp.slop.millisecs", HConstants.LATEST_TIMESTAMP); |
| |
| /** |
| * Timeout for the process time in processRowsWithLocks(). Use -1 to switch off time bound. |
| */ |
| this.rowProcessorTimeout = |
| conf.getLong("hbase.hregion.row.processor.timeout", DEFAULT_ROW_PROCESSOR_TIMEOUT); |
| |
| this.storeHotnessProtector = new StoreHotnessProtector(this, conf); |
| |
| boolean forceSync = conf.getBoolean(WAL_HSYNC_CONF_KEY, DEFAULT_WAL_HSYNC); |
| /** |
| * This is the global default value for durability. All tables/mutations not defining a |
| * durability or using USE_DEFAULT will default to this value. |
| */ |
| Durability defaultDurability = forceSync ? Durability.FSYNC_WAL : Durability.SYNC_WAL; |
| this.regionDurability = this.htableDescriptor.getDurability() == Durability.USE_DEFAULT |
| ? defaultDurability |
| : this.htableDescriptor.getDurability(); |
| |
| decorateRegionConfiguration(conf); |
| if (rsServices != null) { |
| this.rsAccounting = this.rsServices.getRegionServerAccounting(); |
| // don't initialize coprocessors if not running within a regionserver |
| // TODO: revisit if coprocessors should load in other cases |
| this.coprocessorHost = new RegionCoprocessorHost(this, rsServices, conf); |
| this.metricsRegionWrapper = new MetricsRegionWrapperImpl(this); |
| this.metricsRegion = new MetricsRegion(this.metricsRegionWrapper, conf); |
| } else { |
| this.metricsRegionWrapper = null; |
| this.metricsRegion = null; |
| } |
| if (LOG.isDebugEnabled()) { |
| // Write out region name, its encoded name and storeHotnessProtector as string. |
| LOG.debug("Instantiated " + this + "; " + storeHotnessProtector.toString()); |
| } |
| |
| configurationManager = null; |
| |
| // disable stats tracking system tables, but check the config for everything else |
| this.regionStatsEnabled = htd.getTableName().getNamespaceAsString() |
| .equals(NamespaceDescriptor.SYSTEM_NAMESPACE_NAME_STR) |
| ? false |
| : conf.getBoolean(HConstants.ENABLE_CLIENT_BACKPRESSURE, |
| HConstants.DEFAULT_ENABLE_CLIENT_BACKPRESSURE); |
| |
| this.maxCellSize = conf.getLong(HBASE_MAX_CELL_SIZE_KEY, DEFAULT_MAX_CELL_SIZE); |
| this.miniBatchSize = |
| conf.getInt(HBASE_REGIONSERVER_MINIBATCH_SIZE, DEFAULT_HBASE_REGIONSERVER_MINIBATCH_SIZE); |
| |
| // recover the metrics of read and write requests count if they were retained |
| if (rsServices != null && rsServices.getRegionServerAccounting() != null) { |
| Pair<Long, Long> retainedRWRequestsCnt = rsServices.getRegionServerAccounting() |
| .getRetainedRegionRWRequestsCnt().get(getRegionInfo().getEncodedName()); |
| if (retainedRWRequestsCnt != null) { |
| this.addReadRequestsCount(retainedRWRequestsCnt.getFirst()); |
| this.addWriteRequestsCount(retainedRWRequestsCnt.getSecond()); |
| // remove them since won't use again |
| rsServices.getRegionServerAccounting().getRetainedRegionRWRequestsCnt() |
| .remove(getRegionInfo().getEncodedName()); |
| } |
| } |
| |
| minBlockSizeBytes = Arrays.stream(this.htableDescriptor.getColumnFamilies()) |
| .mapToInt(ColumnFamilyDescriptor::getBlocksize).min().orElse(HConstants.DEFAULT_BLOCKSIZE); |
| } |
| |
| private void setHTableSpecificConf() { |
| if (this.htableDescriptor == null) { |
| return; |
| } |
| long flushSize = this.htableDescriptor.getMemStoreFlushSize(); |
| |
| if (flushSize <= 0) { |
| flushSize = conf.getLong(HConstants.HREGION_MEMSTORE_FLUSH_SIZE, |
| TableDescriptorBuilder.DEFAULT_MEMSTORE_FLUSH_SIZE); |
| } |
| this.memstoreFlushSize = flushSize; |
| long mult = conf.getLong(HConstants.HREGION_MEMSTORE_BLOCK_MULTIPLIER, |
| HConstants.DEFAULT_HREGION_MEMSTORE_BLOCK_MULTIPLIER); |
| this.blockingMemStoreSize = this.memstoreFlushSize * mult; |
| } |
| |
| /** |
| * Initialize this region. Used only by tests and SplitTransaction to reopen the region. You |
| * should use createHRegion() or openHRegion() |
| * @return What the next sequence (edit) id should be. |
| * @throws IOException e |
| * @deprecated use HRegion.createHRegion() or HRegion.openHRegion() |
| */ |
| @Deprecated |
| public long initialize() throws IOException { |
| return initialize(null); |
| } |
| |
| /** |
| * Initialize this region. |
| * @param reporter Tickle every so often if initialize is taking a while. |
| * @return What the next sequence (edit) id should be. |
| */ |
| long initialize(final CancelableProgressable reporter) throws IOException { |
| |
| // Refuse to open the region if there is no column family in the table |
| if (htableDescriptor.getColumnFamilyCount() == 0) { |
| throw new DoNotRetryIOException("Table " + htableDescriptor.getTableName().getNameAsString() |
| + " should have at least one column family."); |
| } |
| |
| MonitoredTask status = TaskMonitor.get().createStatus("Initializing region " + this, true); |
| long nextSeqId = -1; |
| try { |
| nextSeqId = initializeRegionInternals(reporter, status); |
| return nextSeqId; |
| } catch (IOException e) { |
| LOG.warn("Failed initialize of region= {}, starting to roll back memstore", |
| getRegionInfo().getRegionNameAsString(), e); |
| // global memstore size will be decreased when dropping memstore |
| try { |
| // drop the memory used by memstore if open region fails |
| dropMemStoreContents(); |
| } catch (IOException ioE) { |
| if (conf.getBoolean(MemStoreLAB.USEMSLAB_KEY, MemStoreLAB.USEMSLAB_DEFAULT)) { |
| LOG.warn( |
| "Failed drop memstore of region= {}, " |
| + "some chunks may not released forever since MSLAB is enabled", |
| getRegionInfo().getRegionNameAsString()); |
| } |
| |
| } |
| if (metricsTableRequests != null) { |
| metricsTableRequests.removeRegistry(); |
| } |
| throw e; |
| } finally { |
| // nextSeqid will be -1 if the initialization fails. |
| // At least it will be 0 otherwise. |
| if (nextSeqId == -1) { |
| status.abort("Exception during region " + getRegionInfo().getRegionNameAsString() |
| + " initialization."); |
| } |
| if (LOG.isDebugEnabled()) { |
| LOG.debug("Region open journal for {}:\n{}", this.getRegionInfo().getEncodedName(), |
| status.prettyPrintJournal()); |
| } |
| status.cleanup(); |
| } |
| } |
| |
| private long initializeRegionInternals(final CancelableProgressable reporter, |
| final MonitoredTask status) throws IOException { |
| if (coprocessorHost != null) { |
| status.setStatus("Running coprocessor pre-open hook"); |
| coprocessorHost.preOpen(); |
| } |
| |
| String policyName = this.conf.get(REGION_STORAGE_POLICY_KEY, DEFAULT_REGION_STORAGE_POLICY); |
| this.fs.setStoragePolicy(policyName.trim()); |
| |
| // Write HRI to a file in case we need to recover hbase:meta |
| // Only the primary replica should write .regioninfo |
| if (this.getRegionInfo().getReplicaId() == RegionInfo.DEFAULT_REPLICA_ID) { |
| status.setStatus("Writing region info on filesystem"); |
| fs.checkRegionInfoOnFilesystem(); |
| } |
| |
| // Initialize all the HStores |
| status.setStatus("Initializing all the Stores"); |
| long maxSeqId = initializeStores(reporter, status); |
| this.mvcc.advanceTo(maxSeqId); |
| if (!isRestoredRegion && ServerRegionReplicaUtil.shouldReplayRecoveredEdits(this)) { |
| Collection<HStore> stores = this.stores.values(); |
| try { |
| // update the stores that we are replaying |
| stores.forEach(HStore::startReplayingFromWAL); |
| // Recover any edits if available. |
| maxSeqId = |
| Math.max(maxSeqId, replayRecoveredEditsIfAny(maxSeqIdInStores, reporter, status)); |
| // Recover any hfiles if available |
| maxSeqId = Math.max(maxSeqId, loadRecoveredHFilesIfAny(stores)); |
| // Make sure mvcc is up to max. |
| this.mvcc.advanceTo(maxSeqId); |
| } finally { |
| // update the stores that we are done replaying |
| stores.forEach(HStore::stopReplayingFromWAL); |
| } |
| } |
| this.lastReplayedOpenRegionSeqId = maxSeqId; |
| |
| this.writestate.setReadOnly(ServerRegionReplicaUtil.isReadOnly(this)); |
| this.writestate.flushRequested = false; |
| this.writestate.compacting.set(0); |
| |
| if (this.writestate.writesEnabled) { |
| // Remove temporary data left over from old regions |
| status.setStatus("Cleaning up temporary data from old regions"); |
| fs.cleanupTempDir(); |
| } |
| |
| // Initialize split policy |
| this.splitPolicy = RegionSplitPolicy.create(this, conf); |
| |
| // Initialize split restriction |
| splitRestriction = RegionSplitRestriction.create(getTableDescriptor(), conf); |
| |
| // Initialize flush policy |
| this.flushPolicy = FlushPolicyFactory.create(this, conf); |
| |
| long lastFlushTime = EnvironmentEdgeManager.currentTime(); |
| for (HStore store : stores.values()) { |
| this.lastStoreFlushTimeMap.put(store, lastFlushTime); |
| } |
| |
| // Use maximum of log sequenceid or that which was found in stores |
| // (particularly if no recovered edits, seqid will be -1). |
| long nextSeqId = maxSeqId + 1; |
| if (!isRestoredRegion) { |
| // always get openSeqNum from the default replica, even if we are secondary replicas |
| long maxSeqIdFromFile = WALSplitUtil.getMaxRegionSequenceId(conf, |
| RegionReplicaUtil.getRegionInfoForDefaultReplica(getRegionInfo()), this::getFilesystem, |
| this::getWalFileSystem); |
| nextSeqId = Math.max(maxSeqId, maxSeqIdFromFile) + 1; |
| // The openSeqNum will always be increase even for read only region, as we rely on it to |
| // determine whether a region has been successfully reopened, so here we always need to update |
| // the max sequence id file. |
| if (RegionReplicaUtil.isDefaultReplica(getRegionInfo())) { |
| LOG.debug("writing seq id for {}", this.getRegionInfo().getEncodedName()); |
| WALSplitUtil.writeRegionSequenceIdFile(getWalFileSystem(), getWALRegionDir(), |
| nextSeqId - 1); |
| // This means we have replayed all the recovered edits and also written out the max sequence |
| // id file, let's delete the wrong directories introduced in HBASE-20734, see HBASE-22617 |
| // for more details. |
| Path wrongRegionWALDir = CommonFSUtils.getWrongWALRegionDir(conf, |
| getRegionInfo().getTable(), getRegionInfo().getEncodedName()); |
| FileSystem walFs = getWalFileSystem(); |
| if (walFs.exists(wrongRegionWALDir)) { |
| if (!walFs.delete(wrongRegionWALDir, true)) { |
| LOG.debug("Failed to clean up wrong region WAL directory {}", wrongRegionWALDir); |
| } |
| } |
| } |
| } |
| |
| LOG.info("Opened {}; next sequenceid={}; {}, {}", this.getRegionInfo().getShortNameToLog(), |
| nextSeqId, this.splitPolicy, this.flushPolicy); |
| |
| // A region can be reopened if failed a split; reset flags |
| this.closing.set(false); |
| this.closed.set(false); |
| |
| if (coprocessorHost != null) { |
| status.setStatus("Running coprocessor post-open hooks"); |
| coprocessorHost.postOpen(); |
| } |
| |
| metricsTableRequests = new MetricsTableRequests(htableDescriptor.getTableName(), conf); |
| |
| status.markComplete("Region opened successfully"); |
| return nextSeqId; |
| } |
| |
| /** |
| * Open all Stores. |
| * @return Highest sequenceId found out in a Store. |
| */ |
| private long initializeStores(CancelableProgressable reporter, MonitoredTask status) |
| throws IOException { |
| return initializeStores(reporter, status, false); |
| } |
| |
| private long initializeStores(CancelableProgressable reporter, MonitoredTask status, |
| boolean warmup) throws IOException { |
| // Load in all the HStores. |
| long maxSeqId = -1; |
| // initialized to -1 so that we pick up MemstoreTS from column families |
| long maxMemstoreTS = -1; |
| |
| if (htableDescriptor.getColumnFamilyCount() != 0) { |
| // initialize the thread pool for opening stores in parallel. |
| ThreadPoolExecutor storeOpenerThreadPool = |
| getStoreOpenAndCloseThreadPool("StoreOpener-" + this.getRegionInfo().getShortNameToLog()); |
| CompletionService<HStore> completionService = |
| new ExecutorCompletionService<>(storeOpenerThreadPool); |
| |
| // initialize each store in parallel |
| for (final ColumnFamilyDescriptor family : htableDescriptor.getColumnFamilies()) { |
| status.setStatus("Instantiating store for column family " + family); |
| completionService.submit(new Callable<HStore>() { |
| @Override |
| public HStore call() throws IOException { |
| return instantiateHStore(family, warmup); |
| } |
| }); |
| } |
| boolean allStoresOpened = false; |
| boolean hasSloppyStores = false; |
| try { |
| for (int i = 0; i < htableDescriptor.getColumnFamilyCount(); i++) { |
| Future<HStore> future = completionService.take(); |
| HStore store = future.get(); |
| this.stores.put(store.getColumnFamilyDescriptor().getName(), store); |
| if (store.isSloppyMemStore()) { |
| hasSloppyStores = true; |
| } |
| |
| long storeMaxSequenceId = store.getMaxSequenceId().orElse(0L); |
| maxSeqIdInStores.put(Bytes.toBytes(store.getColumnFamilyName()), storeMaxSequenceId); |
| if (maxSeqId == -1 || storeMaxSequenceId > maxSeqId) { |
| maxSeqId = storeMaxSequenceId; |
| } |
| long maxStoreMemstoreTS = store.getMaxMemStoreTS().orElse(0L); |
| if (maxStoreMemstoreTS > maxMemstoreTS) { |
| maxMemstoreTS = maxStoreMemstoreTS; |
| } |
| } |
| allStoresOpened = true; |
| if (hasSloppyStores) { |
| htableDescriptor = TableDescriptorBuilder.newBuilder(htableDescriptor) |
| .setFlushPolicyClassName(FlushNonSloppyStoresFirstPolicy.class.getName()).build(); |
| LOG.info("Setting FlushNonSloppyStoresFirstPolicy for the region=" + this); |
| } |
| } catch (InterruptedException e) { |
| throw throwOnInterrupt(e); |
| } catch (ExecutionException e) { |
| throw new IOException(e.getCause()); |
| } finally { |
| storeOpenerThreadPool.shutdownNow(); |
| if (!allStoresOpened) { |
| // something went wrong, close all opened stores |
| LOG.error("Could not initialize all stores for the region=" + this); |
| for (HStore store : this.stores.values()) { |
| try { |
| store.close(); |
| } catch (IOException e) { |
| LOG.warn("close store {} failed in region {}", store.toString(), this, e); |
| } |
| } |
| } |
| } |
| } |
| return Math.max(maxSeqId, maxMemstoreTS + 1); |
| } |
| |
| private void initializeWarmup(final CancelableProgressable reporter) throws IOException { |
| MonitoredTask status = TaskMonitor.get().createStatus("Initializing region " + this); |
| // Initialize all the HStores |
| status.setStatus("Warmup all stores of " + this.getRegionInfo().getRegionNameAsString()); |
| try { |
| initializeStores(reporter, status, true); |
| } finally { |
| status.markComplete("Warmed up " + this.getRegionInfo().getRegionNameAsString()); |
| } |
| } |
| |
| /** Returns Map of StoreFiles by column family */ |
| private NavigableMap<byte[], List<Path>> getStoreFiles() { |
| NavigableMap<byte[], List<Path>> allStoreFiles = new TreeMap<>(Bytes.BYTES_COMPARATOR); |
| for (HStore store : stores.values()) { |
| Collection<HStoreFile> storeFiles = store.getStorefiles(); |
| if (storeFiles == null) { |
| continue; |
| } |
| List<Path> storeFileNames = new ArrayList<>(); |
| for (HStoreFile storeFile : storeFiles) { |
| storeFileNames.add(storeFile.getPath()); |
| } |
| allStoreFiles.put(store.getColumnFamilyDescriptor().getName(), storeFileNames); |
| } |
| return allStoreFiles; |
| } |
| |
| protected void writeRegionOpenMarker(WAL wal, long openSeqId) throws IOException { |
| Map<byte[], List<Path>> storeFiles = getStoreFiles(); |
| RegionEventDescriptor regionOpenDesc = |
| ProtobufUtil.toRegionEventDescriptor(RegionEventDescriptor.EventType.REGION_OPEN, |
| getRegionInfo(), openSeqId, getRegionServerServices().getServerName(), storeFiles); |
| WALUtil.writeRegionEventMarker(wal, getReplicationScope(), getRegionInfo(), regionOpenDesc, |
| mvcc); |
| } |
| |
| private void writeRegionCloseMarker(WAL wal) throws IOException { |
| Map<byte[], List<Path>> storeFiles = getStoreFiles(); |
| RegionEventDescriptor regionEventDesc = ProtobufUtil.toRegionEventDescriptor( |
| RegionEventDescriptor.EventType.REGION_CLOSE, getRegionInfo(), mvcc.getReadPoint(), |
| getRegionServerServices().getServerName(), storeFiles); |
| WALUtil.writeRegionEventMarker(wal, getReplicationScope(), getRegionInfo(), regionEventDesc, |
| mvcc); |
| |
| // Store SeqId in WAL FileSystem when a region closes |
| // checking region folder exists is due to many tests which delete the table folder while a |
| // table is still online |
| if (getWalFileSystem().exists(getWALRegionDir())) { |
| WALSplitUtil.writeRegionSequenceIdFile(getWalFileSystem(), getWALRegionDir(), |
| mvcc.getReadPoint()); |
| } |
| } |
| |
| /** Returns True if this region has references. */ |
| public boolean hasReferences() { |
| return stores.values().stream().anyMatch(HStore::hasReferences); |
| } |
| |
| public void blockUpdates() { |
| this.updatesLock.writeLock().lock(); |
| } |
| |
| public void unblockUpdates() { |
| this.updatesLock.writeLock().unlock(); |
| } |
| |
| public HDFSBlocksDistribution getHDFSBlocksDistribution() { |
| HDFSBlocksDistribution hdfsBlocksDistribution = new HDFSBlocksDistribution(); |
| stores.values().stream().filter(s -> s.getStorefiles() != null) |
| .flatMap(s -> s.getStorefiles().stream()).map(HStoreFile::getHDFSBlockDistribution) |
| .forEachOrdered(hdfsBlocksDistribution::add); |
| return hdfsBlocksDistribution; |
| } |
| |
| /** |
| * This is a helper function to compute HDFS block distribution on demand |
| * @param conf configuration |
| * @param tableDescriptor TableDescriptor of the table |
| * @param regionInfo encoded name of the region |
| * @return The HDFS blocks distribution for the given region. |
| */ |
| public static HDFSBlocksDistribution computeHDFSBlocksDistribution(Configuration conf, |
| TableDescriptor tableDescriptor, RegionInfo regionInfo) throws IOException { |
| Path tablePath = |
| CommonFSUtils.getTableDir(CommonFSUtils.getRootDir(conf), tableDescriptor.getTableName()); |
| return computeHDFSBlocksDistribution(conf, tableDescriptor, regionInfo, tablePath); |
| } |
| |
| /** |
| * This is a helper function to compute HDFS block distribution on demand |
| * @param conf configuration |
| * @param tableDescriptor TableDescriptor of the table |
| * @param regionInfo encoded name of the region |
| * @param tablePath the table directory |
| * @return The HDFS blocks distribution for the given region. |
| */ |
| public static HDFSBlocksDistribution computeHDFSBlocksDistribution(Configuration conf, |
| TableDescriptor tableDescriptor, RegionInfo regionInfo, Path tablePath) throws IOException { |
| HDFSBlocksDistribution hdfsBlocksDistribution = new HDFSBlocksDistribution(); |
| FileSystem fs = tablePath.getFileSystem(conf); |
| |
| HRegionFileSystem regionFs = new HRegionFileSystem(conf, fs, tablePath, regionInfo); |
| for (ColumnFamilyDescriptor family : tableDescriptor.getColumnFamilies()) { |
| List<LocatedFileStatus> locatedFileStatusList = |
| HRegionFileSystem.getStoreFilesLocatedStatus(regionFs, family.getNameAsString(), true); |
| if (locatedFileStatusList == null) { |
| continue; |
| } |
| |
| for (LocatedFileStatus status : locatedFileStatusList) { |
| Path p = status.getPath(); |
| if (StoreFileInfo.isReference(p) || HFileLink.isHFileLink(p)) { |
| // Only construct StoreFileInfo object if its not a hfile, save obj |
| // creation |
| StoreFileInfo storeFileInfo = new StoreFileInfo(conf, fs, status); |
| hdfsBlocksDistribution.add(storeFileInfo.computeHDFSBlocksDistribution(fs)); |
| } else if (StoreFileInfo.isHFile(p)) { |
| // If its a HFile, then lets just add to the block distribution |
| // lets not create more objects here, not even another HDFSBlocksDistribution |
| FSUtils.addToHDFSBlocksDistribution(hdfsBlocksDistribution, status.getBlockLocations()); |
| } else { |
| throw new IOException("path=" + p + " doesn't look like a valid StoreFile"); |
| } |
| } |
| } |
| return hdfsBlocksDistribution; |
| } |
| |
| /** |
| * Increase the size of mem store in this region and the size of global mem store |
| */ |
| private void incMemStoreSize(MemStoreSize mss) { |
| incMemStoreSize(mss.getDataSize(), mss.getHeapSize(), mss.getOffHeapSize(), |
| mss.getCellsCount()); |
| } |
| |
| void incMemStoreSize(long dataSizeDelta, long heapSizeDelta, long offHeapSizeDelta, |
| int cellsCountDelta) { |
| if (this.rsAccounting != null) { |
| rsAccounting.incGlobalMemStoreSize(dataSizeDelta, heapSizeDelta, offHeapSizeDelta); |
| } |
| long dataSize = this.memStoreSizing.incMemStoreSize(dataSizeDelta, heapSizeDelta, |
| offHeapSizeDelta, cellsCountDelta); |
| checkNegativeMemStoreDataSize(dataSize, dataSizeDelta); |
| } |
| |
| void decrMemStoreSize(MemStoreSize mss) { |
| decrMemStoreSize(mss.getDataSize(), mss.getHeapSize(), mss.getOffHeapSize(), |
| mss.getCellsCount()); |
| } |
| |
| private void decrMemStoreSize(long dataSizeDelta, long heapSizeDelta, long offHeapSizeDelta, |
| int cellsCountDelta) { |
| if (this.rsAccounting != null) { |
| rsAccounting.decGlobalMemStoreSize(dataSizeDelta, heapSizeDelta, offHeapSizeDelta); |
| } |
| long dataSize = this.memStoreSizing.decMemStoreSize(dataSizeDelta, heapSizeDelta, |
| offHeapSizeDelta, cellsCountDelta); |
| checkNegativeMemStoreDataSize(dataSize, -dataSizeDelta); |
| } |
| |
| private void checkNegativeMemStoreDataSize(long memStoreDataSize, long delta) { |
| // This is extremely bad if we make memStoreSizing negative. Log as much info on the offending |
| // caller as possible. (memStoreSizing might be a negative value already -- freeing memory) |
| if (memStoreDataSize < 0) { |
| LOG.error("Asked to modify this region's (" + this.toString() |
| + ") memStoreSizing to a negative value which is incorrect. Current memStoreSizing=" |
| + (memStoreDataSize - delta) + ", delta=" + delta, new Exception()); |
| } |
| } |
| |
| @Override |
| public RegionInfo getRegionInfo() { |
| return this.fs.getRegionInfo(); |
| } |
| |
| /** |
| * Returns Instance of {@link RegionServerServices} used by this HRegion. Can be null. |
| */ |
| RegionServerServices getRegionServerServices() { |
| return this.rsServices; |
| } |
| |
| @Override |
| public long getReadRequestsCount() { |
| return readRequestsCount.sum(); |
| } |
| |
| @Override |
| public long getFilteredReadRequestsCount() { |
| return filteredReadRequestsCount.sum(); |
| } |
| |
| @Override |
| public long getWriteRequestsCount() { |
| return writeRequestsCount.sum(); |
| } |
| |
| @Override |
| public long getMemStoreDataSize() { |
| return memStoreSizing.getDataSize(); |
| } |
| |
| @Override |
| public long getMemStoreHeapSize() { |
| return memStoreSizing.getHeapSize(); |
| } |
| |
| @Override |
| public long getMemStoreOffHeapSize() { |
| return memStoreSizing.getOffHeapSize(); |
| } |
| |
| /** Returns store services for this region, to access services required by store level needs */ |
| public RegionServicesForStores getRegionServicesForStores() { |
| return regionServicesForStores; |
| } |
| |
| @Override |
| public long getNumMutationsWithoutWAL() { |
| return numMutationsWithoutWAL.sum(); |
| } |
| |
| @Override |
| public long getDataInMemoryWithoutWAL() { |
| return dataInMemoryWithoutWAL.sum(); |
| } |
| |
| @Override |
| public long getBlockedRequestsCount() { |
| return blockedRequestsCount.sum(); |
| } |
| |
| @Override |
| public long getCheckAndMutateChecksPassed() { |
| return checkAndMutateChecksPassed.sum(); |
| } |
| |
| @Override |
| public long getCheckAndMutateChecksFailed() { |
| return checkAndMutateChecksFailed.sum(); |
| } |
| |
| // TODO Needs to check whether we should expose our metrics system to CPs. If CPs themselves doing |
| // the op and bypassing the core, this might be needed? Should be stop supporting the bypass |
| // feature? |
| public MetricsRegion getMetrics() { |
| return metricsRegion; |
| } |
| |
| @Override |
| public boolean isClosed() { |
| return this.closed.get(); |
| } |
| |
| @Override |
| public boolean isClosing() { |
| return this.closing.get(); |
| } |
| |
| @Override |
| public boolean isReadOnly() { |
| return this.writestate.isReadOnly(); |
| } |
| |
| @Override |
| public boolean isAvailable() { |
| return !isClosed() && !isClosing(); |
| } |
| |
| @Override |
| public boolean isSplittable() { |
| return splitPolicy.canSplit(); |
| } |
| |
| @Override |
| public boolean isMergeable() { |
| if (!isAvailable()) { |
| LOG.debug("Region " + this + " is not mergeable because it is closing or closed"); |
| return false; |
| } |
| if (hasReferences()) { |
| LOG.debug("Region " + this + " is not mergeable because it has references"); |
| return false; |
| } |
| |
| return true; |
| } |
| |
| public boolean areWritesEnabled() { |
| synchronized (this.writestate) { |
| return this.writestate.writesEnabled; |
| } |
| } |
| |
| public MultiVersionConcurrencyControl getMVCC() { |
| return mvcc; |
| } |
| |
| @Override |
| public long getMaxFlushedSeqId() { |
| return maxFlushedSeqId; |
| } |
| |
| /** Returns readpoint considering given IsolationLevel. Pass {@code null} for default */ |
| public long getReadPoint(IsolationLevel isolationLevel) { |
| if (isolationLevel != null && isolationLevel == IsolationLevel.READ_UNCOMMITTED) { |
| // This scan can read even uncommitted transactions |
| return Long.MAX_VALUE; |
| } |
| return mvcc.getReadPoint(); |
| } |
| |
| public boolean isLoadingCfsOnDemandDefault() { |
| return this.isLoadingCfsOnDemandDefault; |
| } |
| |
| /** |
| * Close down this HRegion. Flush the cache, shut down each HStore, don't service any more calls. |
| * <p> |
| * This method could take some time to execute, so don't call it from a time-sensitive thread. |
| * @return Vector of all the storage files that the HRegion's component HStores make use of. It's |
| * a list of all StoreFile objects. Returns empty vector if already closed and null if |
| * judged that it should not close. |
| * @throws IOException e |
| * @throws DroppedSnapshotException Thrown when replay of wal is required because a Snapshot was |
| * not properly persisted. The region is put in closing mode, and |
| * the caller MUST abort after this. |
| */ |
| public Map<byte[], List<HStoreFile>> close() throws IOException { |
| return close(false); |
| } |
| |
| private final Object closeLock = new Object(); |
| |
| /** Conf key for fair locking policy */ |
| public static final String FAIR_REENTRANT_CLOSE_LOCK = |
| "hbase.regionserver.fair.region.close.lock"; |
| public static final boolean DEFAULT_FAIR_REENTRANT_CLOSE_LOCK = true; |
| /** Conf key for the periodic flush interval */ |
| public static final String MEMSTORE_PERIODIC_FLUSH_INTERVAL = |
| "hbase.regionserver.optionalcacheflushinterval"; |
| /** Default interval for the memstore flush */ |
| public static final int DEFAULT_CACHE_FLUSH_INTERVAL = 3600000; |
| /** Default interval for System tables memstore flush */ |
| public static final int SYSTEM_CACHE_FLUSH_INTERVAL = 300000; // 5 minutes |
| |
| /** Conf key to force a flush if there are already enough changes for one region in memstore */ |
| public static final String MEMSTORE_FLUSH_PER_CHANGES = "hbase.regionserver.flush.per.changes"; |
| public static final long DEFAULT_FLUSH_PER_CHANGES = 30000000; // 30 millions |
| /** |
| * The following MAX_FLUSH_PER_CHANGES is large enough because each KeyValue has 20+ bytes |
| * overhead. Therefore, even 1G empty KVs occupy at least 20GB memstore size for a single region |
| */ |
| public static final long MAX_FLUSH_PER_CHANGES = 1000000000; // 1G |
| |
| public static final String CLOSE_WAIT_ABORT = "hbase.regionserver.close.wait.abort"; |
| public static final boolean DEFAULT_CLOSE_WAIT_ABORT = false; |
| public static final String CLOSE_WAIT_TIME = "hbase.regionserver.close.wait.time.ms"; |
| public static final long DEFAULT_CLOSE_WAIT_TIME = 60000; // 1 minute |
| public static final String CLOSE_WAIT_INTERVAL = "hbase.regionserver.close.wait.interval.ms"; |
| public static final long DEFAULT_CLOSE_WAIT_INTERVAL = 10000; // 10 seconds |
| |
| public Map<byte[], List<HStoreFile>> close(boolean abort) throws IOException { |
| return close(abort, false); |
| } |
| |
| /** |
| * Close this HRegion. |
| * @param abort true if server is aborting (only during testing) |
| * @param ignoreStatus true if ignore the status (won't be showed on task list) |
| * @return Vector of all the storage files that the HRegion's component HStores make use of. It's |
| * a list of StoreFile objects. Can be null if we are not to close at this time, or we are |
| * already closed. |
| * @throws IOException e |
| * @throws DroppedSnapshotException Thrown when replay of wal is required because a Snapshot was |
| * not properly persisted. The region is put in closing mode, and |
| * the caller MUST abort after this. |
| */ |
| public Map<byte[], List<HStoreFile>> close(boolean abort, boolean ignoreStatus) |
| throws IOException { |
| return close(abort, ignoreStatus, false); |
| } |
| |
| /** |
| * Close down this HRegion. Flush the cache unless abort parameter is true, Shut down each HStore, |
| * don't service any more calls. This method could take some time to execute, so don't call it |
| * from a time-sensitive thread. |
| * @param abort true if server is aborting (only during testing) |
| * @param ignoreStatus true if ignore the status (wont be showed on task list) |
| * @param isGracefulStop true if region is being closed during graceful stop and the blocks in the |
| * BucketCache should not be evicted. |
| * @return Vector of all the storage files that the HRegion's component HStores make use of. It's |
| * a list of StoreFile objects. Can be null if we are not to close at this time or we are |
| * already closed. |
| * @throws IOException e |
| * @throws DroppedSnapshotException Thrown when replay of wal is required because a Snapshot was |
| * not properly persisted. The region is put in closing mode, and |
| * the caller MUST abort after this. |
| */ |
| public Map<byte[], List<HStoreFile>> close(boolean abort, boolean ignoreStatus, |
| boolean isGracefulStop) throws IOException { |
| // Only allow one thread to close at a time. Serialize them so dual |
| // threads attempting to close will run up against each other. |
| MonitoredTask status = TaskMonitor.get().createStatus( |
| "Closing region " + this.getRegionInfo().getEncodedName() + (abort ? " due to abort" : ""), |
| true); |
| status.setStatus("Waiting for close lock"); |
| try { |
| synchronized (closeLock) { |
| if (isGracefulStop && rsServices != null) { |
| rsServices.getBlockCache().ifPresent(blockCache -> { |
| if (blockCache instanceof CombinedBlockCache) { |
| BlockCache l2 = ((CombinedBlockCache) blockCache).getSecondLevelCache(); |
| if (l2 instanceof BucketCache) { |
| if (((BucketCache) l2).isCachePersistenceEnabled()) { |
| LOG.info( |
| "Closing region {} during a graceful stop, and cache persistence is on, " |
| + "so setting evict on close to false. ", |
| this.getRegionInfo().getRegionNameAsString()); |
| this.getStores().forEach(s -> s.getCacheConfig().setEvictOnClose(false)); |
| } |
| } |
| } |
| }); |
| } |
| return doClose(abort, status); |
| } |
| } finally { |
| if (LOG.isDebugEnabled()) { |
| LOG.debug("Region close journal for {}:\n{}", this.getRegionInfo().getEncodedName(), |
| status.prettyPrintJournal()); |
| } |
| status.cleanup(); |
| } |
| } |
| |
| /** |
| * Exposed for some very specific unit tests. |
| */ |
| public void setClosing(boolean closing) { |
| this.closing.set(closing); |
| } |
| |
| /** |
| * The {@link HRegion#doClose} will block forever if someone tries proving the dead lock via the |
| * unit test. Instead of blocking, the {@link HRegion#doClose} will throw exception if you set the |
| * timeout. |
| * @param timeoutForWriteLock the second time to wait for the write lock in |
| * {@link HRegion#doClose} |
| */ |
| public void setTimeoutForWriteLock(long timeoutForWriteLock) { |
| assert timeoutForWriteLock >= 0; |
| this.timeoutForWriteLock = timeoutForWriteLock; |
| } |
| |
| @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "UL_UNRELEASED_LOCK_EXCEPTION_PATH", |
| justification = "I think FindBugs is confused") |
| private Map<byte[], List<HStoreFile>> doClose(boolean abort, MonitoredTask status) |
| throws IOException { |
| if (isClosed()) { |
| LOG.warn("Region " + this + " already closed"); |
| return null; |
| } |
| |
| if (coprocessorHost != null) { |
| status.setStatus("Running coprocessor pre-close hooks"); |
| this.coprocessorHost.preClose(abort); |
| } |
| status.setStatus("Disabling compacts and flushes for region"); |
| boolean canFlush = true; |
| synchronized (writestate) { |
| // Disable compacting and flushing by background threads for this |
| // region. |
| canFlush = !writestate.readOnly; |
| writestate.writesEnabled = false; |
| LOG.debug("Closing {}, disabling compactions & flushes", |
| this.getRegionInfo().getEncodedName()); |
| waitForFlushesAndCompactions(); |
| } |
| // If we were not just flushing, is it worth doing a preflush...one |
| // that will clear out of the bulk of the memstore before we put up |
| // the close flag? |
| if (!abort && worthPreFlushing() && canFlush) { |
| status.setStatus("Pre-flushing region before close"); |
| LOG.info("Running close preflush of {}", this.getRegionInfo().getEncodedName()); |
| try { |
| internalFlushcache(status); |
| } catch (IOException ioe) { |
| // Failed to flush the region. Keep going. |
| status.setStatus("Failed pre-flush " + this + "; " + ioe.getMessage()); |
| } |
| } |
| |
| // Set the closing flag |
| // From this point new arrivals at the region lock will get NSRE. |
| |
| this.closing.set(true); |
| LOG.info("Closing region {}", this); |
| |
| // Acquire the close lock |
| |
| // The configuration parameter CLOSE_WAIT_ABORT is overloaded to enable both |
| // the new regionserver abort condition and interrupts for running requests. |
| // If CLOSE_WAIT_ABORT is not enabled there is no change from earlier behavior, |
| // we will not attempt to interrupt threads servicing requests nor crash out |
| // the regionserver if something remains stubborn. |
| |
| final boolean canAbort = conf.getBoolean(CLOSE_WAIT_ABORT, DEFAULT_CLOSE_WAIT_ABORT); |
| boolean useTimedWait = false; |
| if (timeoutForWriteLock != null && timeoutForWriteLock != Long.MAX_VALUE) { |
| // convert legacy use of timeoutForWriteLock in seconds to new use in millis |
| timeoutForWriteLock = TimeUnit.SECONDS.toMillis(timeoutForWriteLock); |
| useTimedWait = true; |
| } else if (canAbort) { |
| timeoutForWriteLock = conf.getLong(CLOSE_WAIT_TIME, DEFAULT_CLOSE_WAIT_TIME); |
| useTimedWait = true; |
| } |
| if (LOG.isDebugEnabled()) { |
| LOG.debug((useTimedWait ? "Time limited wait" : "Waiting without time limit") |
| + " for close lock on " + this); |
| } |
| final long closeWaitInterval = conf.getLong(CLOSE_WAIT_INTERVAL, DEFAULT_CLOSE_WAIT_INTERVAL); |
| long elapsedWaitTime = 0; |
| if (useTimedWait) { |
| // Sanity check configuration |
| long remainingWaitTime = timeoutForWriteLock; |
| if (remainingWaitTime < closeWaitInterval) { |
| LOG.warn("Time limit for close wait of " + timeoutForWriteLock |
| + " ms is less than the configured lock acquisition wait interval " + closeWaitInterval |
| + " ms, using wait interval as time limit"); |
| remainingWaitTime = closeWaitInterval; |
| } |
| boolean acquired = false; |
| do { |
| long start = EnvironmentEdgeManager.currentTime(); |
| try { |
| acquired = lock.writeLock().tryLock(Math.min(remainingWaitTime, closeWaitInterval), |
| TimeUnit.MILLISECONDS); |
| } catch (InterruptedException e) { |
| // Interrupted waiting for close lock. More likely the server is shutting down, not |
| // normal operation, so aborting upon interrupt while waiting on this lock would not |
| // provide much value. Throw an IOE (as IIOE) like we would in the case where we |
| // fail to acquire the lock. |
| String msg = "Interrupted while waiting for close lock on " + this; |
| LOG.warn(msg, e); |
| throw (InterruptedIOException) new InterruptedIOException(msg).initCause(e); |
| } |
| long elapsed = EnvironmentEdgeManager.currentTime() - start; |
| elapsedWaitTime += elapsed; |
| remainingWaitTime -= elapsed; |
| if (canAbort && !acquired && remainingWaitTime > 0) { |
| // Before we loop to wait again, interrupt all region operations that might |
| // still be in progress, to encourage them to break out of waiting states or |
| // inner loops, throw an exception to clients, and release the read lock via |
| // endRegionOperation. |
| if (LOG.isDebugEnabled()) { |
| LOG.debug("Interrupting region operations after waiting for close lock for " |
| + elapsedWaitTime + " ms on " + this + ", " + remainingWaitTime + " ms remaining"); |
| } |
| interruptRegionOperations(); |
| } |
| } while (!acquired && remainingWaitTime > 0); |
| |
| // If we fail to acquire the lock, trigger an abort if we can; otherwise throw an IOE |
| // to let the caller know we could not proceed with the close. |
| if (!acquired) { |
| String msg = |
| "Failed to acquire close lock on " + this + " after waiting " + elapsedWaitTime + " ms"; |
| LOG.error(msg); |
| if (canAbort) { |
| // If we failed to acquire the write lock, abort the server |
| rsServices.abort(msg, null); |
| } |
| throw new IOException(msg); |
| } |
| |
| } else { |
| |
| long start = EnvironmentEdgeManager.currentTime(); |
| lock.writeLock().lock(); |
| elapsedWaitTime = EnvironmentEdgeManager.currentTime() - start; |
| |
| } |
| |
| if (LOG.isDebugEnabled()) { |
| LOG.debug("Acquired close lock on " + this + " after waiting " + elapsedWaitTime + " ms"); |
| } |
| |
| status.setStatus("Disabling writes for close"); |
| try { |
| if (this.isClosed()) { |
| status.abort("Already got closed by another process"); |
| // SplitTransaction handles the null |
| return null; |
| } |
| LOG.debug("Updates disabled for region " + this); |
| // Don't flush the cache if we are aborting |
| if (!abort && canFlush) { |
| int failedfFlushCount = 0; |
| int flushCount = 0; |
| long tmp = 0; |
| long remainingSize = this.memStoreSizing.getDataSize(); |
| while (remainingSize > 0) { |
| try { |
| internalFlushcache(status); |
| if (flushCount > 0) { |
| LOG.info("Running extra flush, " + flushCount + " (carrying snapshot?) " + this); |
| } |
| flushCount++; |
| tmp = this.memStoreSizing.getDataSize(); |
| if (tmp >= remainingSize) { |
| failedfFlushCount++; |
| } |
| remainingSize = tmp; |
| if (failedfFlushCount > 5) { |
| // If we failed 5 times and are unable to clear memory, abort |
| // so we do not lose data |
| throw new DroppedSnapshotException("Failed clearing memory after " + flushCount |
| + " attempts on region: " + Bytes.toStringBinary(getRegionInfo().getRegionName())); |
| } |
| } catch (IOException ioe) { |
| status.setStatus("Failed flush " + this + ", putting online again"); |
| synchronized (writestate) { |
| writestate.writesEnabled = true; |
| } |
| // Have to throw to upper layers. I can't abort server from here. |
| throw ioe; |
| } |
| } |
| } |
| |
| Map<byte[], List<HStoreFile>> result = new TreeMap<>(Bytes.BYTES_COMPARATOR); |
| if (!stores.isEmpty()) { |
| // initialize the thread pool for closing stores in parallel. |
| ThreadPoolExecutor storeCloserThreadPool = |
| getStoreOpenAndCloseThreadPool("StoreCloser-" + getRegionInfo().getRegionNameAsString()); |
| CompletionService<Pair<byte[], Collection<HStoreFile>>> completionService = |
| new ExecutorCompletionService<>(storeCloserThreadPool); |
| |
| // close each store in parallel |
| for (HStore store : stores.values()) { |
| MemStoreSize mss = store.getFlushableSize(); |
| if (!(abort || mss.getDataSize() == 0 || writestate.readOnly)) { |
| if (getRegionServerServices() != null) { |
| getRegionServerServices().abort("Assertion failed while closing store " |
| + getRegionInfo().getRegionNameAsString() + " " + store |
| + ". flushableSize expected=0, actual={" + mss + "}. Current memStoreSize=" |
| + this.memStoreSizing.getMemStoreSize() + ". Maybe a coprocessor " |
| + "operation failed and left the memstore in a partially updated state.", null); |
| } |
| } |
| completionService.submit(new Callable<Pair<byte[], Collection<HStoreFile>>>() { |
| @Override |
| public Pair<byte[], Collection<HStoreFile>> call() throws IOException { |
| return new Pair<>(store.getColumnFamilyDescriptor().getName(), store.close()); |
| } |
| }); |
| } |
| try { |
| for (int i = 0; i < stores.size(); i++) { |
| Future<Pair<byte[], Collection<HStoreFile>>> future = completionService.take(); |
| Pair<byte[], Collection<HStoreFile>> storeFiles = future.get(); |
| List<HStoreFile> familyFiles = result.get(storeFiles.getFirst()); |
| if (familyFiles == null) { |
| familyFiles = new ArrayList<>(); |
| result.put(storeFiles.getFirst(), familyFiles); |
| } |
| familyFiles.addAll(storeFiles.getSecond()); |
| } |
| } catch (InterruptedException e) { |
| throw throwOnInterrupt(e); |
| } catch (ExecutionException e) { |
| Throwable cause = e.getCause(); |
| if (cause instanceof IOException) { |
| throw (IOException) cause; |
| } |
| throw new IOException(cause); |
| } finally { |
| storeCloserThreadPool.shutdownNow(); |
| } |
| } |
| |
| status.setStatus("Writing region close event to WAL"); |
| // Always write close marker to wal even for read only table. This is not a big problem as we |
| // do not write any data into the region; it is just a meta edit in the WAL file. |
| if ( |
| !abort && wal != null && getRegionServerServices() != null |
| && RegionReplicaUtil.isDefaultReplica(getRegionInfo()) |
| ) { |
| writeRegionCloseMarker(wal); |
| } |
| |
| this.closed.set(true); |
| |
| // Decrease refCount of table latency metric registry. |
| // Do this after closed#set to make sure only -1. |
| if (metricsTableRequests != null) { |
| metricsTableRequests.removeRegistry(); |
| } |
| |
| if (!canFlush) { |
| decrMemStoreSize(this.memStoreSizing.getMemStoreSize()); |
| } else if (this.memStoreSizing.getDataSize() != 0) { |
| LOG.error("Memstore data size is {} in region {}", this.memStoreSizing.getDataSize(), this); |
| } |
| if (coprocessorHost != null) { |
| status.setStatus("Running coprocessor post-close hooks"); |
| this.coprocessorHost.postClose(abort); |
| } |
| if (this.metricsRegion != null) { |
| this.metricsRegion.close(); |
| } |
| if (this.metricsRegionWrapper != null) { |
| Closeables.close(this.metricsRegionWrapper, true); |
| } |
| status.markComplete("Closed"); |
| LOG.info("Closed {}", this); |
| return result; |
| } finally { |
| lock.writeLock().unlock(); |
| } |
| } |
| |
| /** Wait for all current flushes and compactions of the region to complete */ |
| // TODO HBASE-18906. Check the usage (if any) in Phoenix and expose this or give alternate way for |
| // Phoenix needs. |
| public void waitForFlushesAndCompactions() { |
| synchronized (writestate) { |
| if (this.writestate.readOnly) { |
| // we should not wait for replayed flushed if we are read only (for example in case the |
| // region is a secondary replica). |
| return; |
| } |
| boolean interrupted = false; |
| try { |
| while (writestate.compacting.get() > 0 || writestate.flushing) { |
| LOG.debug("waiting for " + writestate.compacting + " compactions" |
| + (writestate.flushing ? " & cache flush" : "") + " to complete for region " + this); |
| try { |
| writestate.wait(); |
| } catch (InterruptedException iex) { |
| // essentially ignore and propagate the interrupt back up |
| LOG.warn("Interrupted while waiting in region {}", this); |
| interrupted = true; |
| break; |
| } |
| } |
| } finally { |
| if (interrupted) { |
| Thread.currentThread().interrupt(); |
| } |
| } |
| } |
| } |
| |
| /** |
| * Wait for all current flushes of the region to complete |
| */ |
| public void waitForFlushes() { |
| waitForFlushes(0);// Unbound wait |
| } |
| |
| @Override |
| public boolean waitForFlushes(long timeout) { |
| synchronized (writestate) { |
| if (this.writestate.readOnly) { |
| // we should not wait for replayed flushed if we are read only (for example in case the |
| // region is a secondary replica). |
| return true; |
| } |
| if (!writestate.flushing) return true; |
| long start = EnvironmentEdgeManager.currentTime(); |
| long duration = 0; |
| boolean interrupted = false; |
| LOG.debug("waiting for cache flush to complete for region " + this); |
| try { |
| while (writestate.flushing) { |
| if (timeout > 0 && duration >= timeout) break; |
| try { |
| long toWait = timeout == 0 ? 0 : (timeout - duration); |
| writestate.wait(toWait); |
| } catch (InterruptedException iex) { |
| // essentially ignore and propagate the interrupt back up |
| LOG.warn("Interrupted while waiting in region {}", this); |
| interrupted = true; |
| break; |
| } finally { |
| duration = EnvironmentEdgeManager.currentTime() - start; |
| } |
| } |
| } finally { |
| if (interrupted) { |
| Thread.currentThread().interrupt(); |
| } |
| } |
| LOG.debug("Waited {} ms for region {} flush to complete", duration, this); |
| return !(writestate.flushing); |
| } |
| } |
| |
| @Override |
| public Configuration getReadOnlyConfiguration() { |
| return new ReadOnlyConfiguration(this.conf); |
| } |
| |
| @Override |
| public int getMinBlockSizeBytes() { |
| return minBlockSizeBytes; |
| } |
| |
| private ThreadPoolExecutor getStoreOpenAndCloseThreadPool(final String threadNamePrefix) { |
| int numStores = Math.max(1, this.htableDescriptor.getColumnFamilyCount()); |
| int maxThreads = Math.min(numStores, conf.getInt(HConstants.HSTORE_OPEN_AND_CLOSE_THREADS_MAX, |
| HConstants.DEFAULT_HSTORE_OPEN_AND_CLOSE_THREADS_MAX)); |
| return getOpenAndCloseThreadPool(maxThreads, threadNamePrefix); |
| } |
| |
| ThreadPoolExecutor getStoreFileOpenAndCloseThreadPool(final String threadNamePrefix) { |
| int numStores = Math.max(1, this.htableDescriptor.getColumnFamilyCount()); |
| int maxThreads = Math.max(1, conf.getInt(HConstants.HSTORE_OPEN_AND_CLOSE_THREADS_MAX, |
| HConstants.DEFAULT_HSTORE_OPEN_AND_CLOSE_THREADS_MAX) / numStores); |
| return getOpenAndCloseThreadPool(maxThreads, threadNamePrefix); |
| } |
| |
| private static ThreadPoolExecutor getOpenAndCloseThreadPool(int maxThreads, |
| final String threadNamePrefix) { |
| return Threads.getBoundedCachedThreadPool(maxThreads, 30L, TimeUnit.SECONDS, |
| new ThreadFactory() { |
| private int count = 1; |
| |
| @Override |
| public Thread newThread(Runnable r) { |
| return new Thread(r, threadNamePrefix + "-" + count++); |
| } |
| }); |
| } |
| |
| /** Returns True if its worth doing a flush before we put up the close flag. */ |
| private boolean worthPreFlushing() { |
| return this.memStoreSizing.getDataSize() |
| > this.conf.getLong("hbase.hregion.preclose.flush.size", 1024 * 1024 * 5); |
| } |
| |
| ////////////////////////////////////////////////////////////////////////////// |
| // HRegion accessors |
| ////////////////////////////////////////////////////////////////////////////// |
| |
| @Override |
| public TableDescriptor getTableDescriptor() { |
| return this.htableDescriptor; |
| } |
| |
| @RestrictedApi(explanation = "Should only be called in tests", link = "", |
| allowedOnPath = ".*/src/test/.*") |
| public void setTableDescriptor(TableDescriptor desc) { |
| htableDescriptor = desc; |
| } |
| |
| /** Returns WAL in use for this region */ |
| public WAL getWAL() { |
| return this.wal; |
| } |
| |
| public BlockCache getBlockCache() { |
| return this.blockCache; |
| } |
| |
| /** |
| * Only used for unit test which doesn't start region server. |
| */ |
| public void setBlockCache(BlockCache blockCache) { |
| this.blockCache = blockCache; |
| } |
| |
| public MobFileCache getMobFileCache() { |
| return this.mobFileCache; |
| } |
| |
| /** |
| * Only used for unit test which doesn't start region server. |
| */ |
| public void setMobFileCache(MobFileCache mobFileCache) { |
| this.mobFileCache = mobFileCache; |
| } |
| |
| /** Returns split policy for this region. */ |
| RegionSplitPolicy getSplitPolicy() { |
| return this.splitPolicy; |
| } |
| |
| /** |
| * A split takes the config from the parent region & passes it to the daughter region's |
| * constructor. If 'conf' was passed, you would end up using the HTD of the parent region in |
| * addition to the new daughter HTD. Pass 'baseConf' to the daughter regions to avoid this tricky |
| * dedupe problem. |
| * @return Configuration object |
| */ |
| Configuration getBaseConf() { |
| return this.baseConf; |
| } |
| |
| /** Returns {@link FileSystem} being used by this region */ |
| public FileSystem getFilesystem() { |
| return fs.getFileSystem(); |
| } |
| |
| /** Returns the {@link HRegionFileSystem} used by this region */ |
| public HRegionFileSystem getRegionFileSystem() { |
| return this.fs; |
| } |
| |
| /** Returns the WAL {@link HRegionFileSystem} used by this region */ |
| HRegionWALFileSystem getRegionWALFileSystem() throws IOException { |
| return new HRegionWALFileSystem(conf, getWalFileSystem(), |
| CommonFSUtils.getWALTableDir(conf, htableDescriptor.getTableName()), fs.getRegionInfo()); |
| } |
| |
| /** Returns the WAL {@link FileSystem} being used by this region */ |
| FileSystem getWalFileSystem() throws IOException { |
| if (walFS == null) { |
| walFS = CommonFSUtils.getWALFileSystem(conf); |
| } |
| return walFS; |
| } |
| |
| /** |
| * @return the Region directory under WALRootDirectory |
| * @throws IOException if there is an error getting WALRootDir |
| */ |
| public Path getWALRegionDir() throws IOException { |
| if (regionWalDir == null) { |
| regionWalDir = CommonFSUtils.getWALRegionDir(conf, getRegionInfo().getTable(), |
| getRegionInfo().getEncodedName()); |
| } |
| return regionWalDir; |
| } |
| |
| @Override |
| public long getEarliestFlushTimeForAllStores() { |
| return Collections.min(lastStoreFlushTimeMap.values()); |
| } |
| |
| @Override |
| public long getOldestHfileTs(boolean majorCompactionOnly) throws IOException { |
| long result = Long.MAX_VALUE; |
| for (HStore store : stores.values()) { |
| Collection<HStoreFile> storeFiles = store.getStorefiles(); |
| if (storeFiles == null) { |
| continue; |
| } |
| for (HStoreFile file : storeFiles) { |
| StoreFileReader sfReader = file.getReader(); |
| if (sfReader == null) { |
| continue; |
| } |
| HFile.Reader reader = sfReader.getHFileReader(); |
| if (reader == null) { |
| continue; |
| } |
| if (majorCompactionOnly) { |
| byte[] val = reader.getHFileInfo().get(MAJOR_COMPACTION_KEY); |
| if (val == null || !Bytes.toBoolean(val)) { |
| continue; |
| } |
| } |
| result = Math.min(result, reader.getFileContext().getFileCreateTime()); |
| } |
| } |
| return result == Long.MAX_VALUE ? 0 : result; |
| } |
| |
| RegionLoad.Builder setCompleteSequenceId(RegionLoad.Builder regionLoadBldr) { |
| long lastFlushOpSeqIdLocal = this.lastFlushOpSeqId; |
| byte[] encodedRegionName = this.getRegionInfo().getEncodedNameAsBytes(); |
| regionLoadBldr.clearStoreCompleteSequenceId(); |
| for (byte[] familyName : this.stores.keySet()) { |
| long earliest = this.wal.getEarliestMemStoreSeqNum(encodedRegionName, familyName); |
| // Subtract - 1 to go earlier than the current oldest, unflushed edit in memstore; this will |
| // give us a sequence id that is for sure flushed. We want edit replay to start after this |
| // sequence id in this region. If NO_SEQNUM, use the regions maximum flush id. |
| long csid = (earliest == HConstants.NO_SEQNUM) ? lastFlushOpSeqIdLocal : earliest - 1; |
| regionLoadBldr.addStoreCompleteSequenceId(StoreSequenceId.newBuilder() |
| .setFamilyName(UnsafeByteOperations.unsafeWrap(familyName)).setSequenceId(csid).build()); |
| } |
| return regionLoadBldr.setCompleteSequenceId(getMaxFlushedSeqId()); |
| } |
| |
| ////////////////////////////////////////////////////////////////////////////// |
| // HRegion maintenance. |
| // |
| // These methods are meant to be called periodically by the HRegionServer for |
| // upkeep. |
| ////////////////////////////////////////////////////////////////////////////// |
| /** |
| * Do preparation for pending compaction. |
| */ |
| protected void doRegionCompactionPrep() throws IOException { |
| } |
| |
| /** |
| * Synchronously compact all stores in the region. |
| * <p> |
| * This operation could block for a long time, so don't call it from a time-sensitive thread. |
| * <p> |
| * Note that no locks are taken to prevent possible conflicts between compaction and splitting |
| * activities. The regionserver does not normally compact and split in parallel. However by |
| * calling this method you may introduce unexpected and unhandled concurrency. Don't do this |
| * unless you know what you are doing. |
| * @param majorCompaction True to force a major compaction regardless of thresholds |
| */ |
| public void compact(boolean majorCompaction) throws IOException { |
| if (majorCompaction) { |
| stores.values().forEach(HStore::triggerMajorCompaction); |
| } |
| for (HStore s : stores.values()) { |
| Optional<CompactionContext> compaction = s.requestCompaction(); |
| if (compaction.isPresent()) { |
| ThroughputController controller = null; |
| if (rsServices != null) { |
| controller = CompactionThroughputControllerFactory.create(rsServices, conf); |
| } |
| if (controller == null) { |
| controller = NoLimitThroughputController.INSTANCE; |
| } |
| compact(compaction.get(), s, controller, null); |
| } |
| } |
| } |
| |
| /** |
| * This is a helper function that compact all the stores synchronously. |
| * <p> |
| * It is used by utilities and testing |
| */ |
| public void compactStores() throws IOException { |
| for (HStore s : stores.values()) { |
| Optional<CompactionContext> compaction = s.requestCompaction(); |
| if (compaction.isPresent()) { |
| compact(compaction.get(), s, NoLimitThroughputController.INSTANCE, null); |
| } |
| } |
| } |
| |
| /** |
| * This is a helper function that compact the given store. |
| * <p> |
| * It is used by utilities and testing |
| */ |
| void compactStore(byte[] family, ThroughputController throughputController) throws IOException { |
| HStore s = getStore(family); |
| Optional<CompactionContext> compaction = s.requestCompaction(); |
| if (compaction.isPresent()) { |
| compact(compaction.get(), s, throughputController, null); |
| } |
| } |
| |
| /** |
| * Called by compaction thread and after region is opened to compact the HStores if necessary. |
| * <p> |
| * This operation could block for a long time, so don't call it from a time-sensitive thread. Note |
| * that no locking is necessary at this level because compaction only conflicts with a region |
| * split, and that cannot happen because the region server does them sequentially and not in |
| * parallel. |
| * @param compaction Compaction details, obtained by requestCompaction() |
| * @return whether the compaction completed |
| */ |
| public boolean compact(CompactionContext compaction, HStore store, |
| ThroughputController throughputController) throws IOException { |
| return compact(compaction, store, throughputController, null); |
| } |
| |
| public boolean compact(CompactionContext compaction, HStore store, |
| ThroughputController throughputController, User user) throws IOException { |
| assert compaction != null && compaction.hasSelection(); |
| assert !compaction.getRequest().getFiles().isEmpty(); |
| if (this.closing.get() || this.closed.get()) { |
| LOG.debug("Skipping compaction on " + this + " because closing/closed"); |
| store.cancelRequestedCompaction(compaction); |
| return false; |
| } |
| MonitoredTask status = null; |
| boolean requestNeedsCancellation = true; |
| /* |
| * We are trying to remove / relax the region read lock for compaction. Let's see what are the |
| * potential race conditions among the operations (user scan, region split, region close and |
| * region bulk load). user scan ---> region read lock region split --> region close first --> |
| * region write lock region close --> region write lock region bulk load --> region write lock |
| * read lock is compatible with read lock. ---> no problem with user scan/read region bulk load |
| * does not cause problem for compaction (no consistency problem, store lock will help the store |
| * file accounting). They can run almost concurrently at the region level. The only remaining |
| * race condition is between the region close and compaction. So we will evaluate, below, how |
| * region close intervenes with compaction if compaction does not acquire region read lock. Here |
| * are the steps for compaction: 1. obtain list of StoreFile's 2. create StoreFileScanner's |
| * based on list from #1 3. perform compaction and save resulting files under tmp dir 4. swap in |
| * compacted files #1 is guarded by store lock. This patch does not change this --> no worse or |
| * better For #2, we obtain smallest read point (for region) across all the Scanners (for both |
| * default compactor and stripe compactor). The read points are for user scans. Region keeps the |
| * read points for all currently open user scanners. Compaction needs to know the smallest read |
| * point so that during re-write of the hfiles, it can remove the mvcc points for the cells if |
| * their mvccs are older than the smallest since they are not needed anymore. This will not |
| * conflict with compaction. For #3, it can be performed in parallel to other operations. For #4 |
| * bulk load and compaction don't conflict with each other on the region level (for multi-family |
| * atomicy). Region close and compaction are guarded pretty well by the 'writestate'. In |
| * HRegion#doClose(), we have : synchronized (writestate) { // Disable compacting and flushing |
| * by background threads for this // region. canFlush = !writestate.readOnly; |
| * writestate.writesEnabled = false; LOG.debug("Closing " + this + |
| * ": disabling compactions & flushes"); waitForFlushesAndCompactions(); } |
| * waitForFlushesAndCompactions() would wait for writestate.compacting to come down to 0. and in |
| * HRegion.compact() try { synchronized (writestate) { if (writestate.writesEnabled) { |
| * wasStateSet = true; ++writestate.compacting; } else { String msg = "NOT compacting region " + |
| * this + ". Writes disabled."; LOG.info(msg); status.abort(msg); return false; } } Also in |
| * compactor.performCompaction(): check periodically to see if a system stop is requested if |
| * (closeChecker != null && closeChecker.isTimeLimit(store, now)) { progress.cancel(); return |
| * false; } if (closeChecker != null && closeChecker.isSizeLimit(store, len)) { |
| * progress.cancel(); return false; } |
| */ |
| try { |
| byte[] cf = Bytes.toBytes(store.getColumnFamilyName()); |
| if (stores.get(cf) != store) { |
| LOG.warn("Store " + store.getColumnFamilyName() + " on region " + this |
| + " has been re-instantiated, cancel this compaction request. " |
| + " It may be caused by the roll back of split transaction"); |
| return false; |
| } |
| |
| status = TaskMonitor.get().createStatus("Compacting " + store + " in " + this); |
| if (this.closed.get()) { |
| String msg = "Skipping compaction on " + this + " because closed"; |
| LOG.debug(msg); |
| status.abort(msg); |
| return false; |
| } |
| boolean wasStateSet = false; |
| try { |
| synchronized (writestate) { |
| if (writestate.writesEnabled) { |
| wasStateSet = true; |
| writestate.compacting.incrementAndGet(); |
| } else { |
| String msg = "NOT compacting region " + this + ". Writes disabled."; |
| LOG.info(msg); |
| status.abort(msg); |
| return false; |
| } |
| } |
| LOG.info("Starting compaction of {} in {}{}", store, this, |
| (compaction.getRequest().isOffPeak() ? " as an off-peak compaction" : "")); |
| doRegionCompactionPrep(); |
| try { |
| status.setStatus("Compacting store " + store); |
| // We no longer need to cancel the request on the way out of this |
| // method because Store#compact will clean up unconditionally |
| requestNeedsCancellation = false; |
| store.compact(compaction, throughputController, user); |
| } catch (InterruptedIOException iioe) { |
| String msg = "region " + this + " compaction interrupted"; |
| LOG.info(msg, iioe); |
| status.abort(msg); |
| return false; |
| } |
| } finally { |
| if (wasStateSet) { |
| synchronized (writestate) { |
| writestate.compacting.decrementAndGet(); |
| if (writestate.compacting.get() <= 0) { |
| writestate.notifyAll(); |
| } |
| } |
| } |
| } |
| status.markComplete("Compaction complete"); |
| return true; |
| } finally { |
| if (requestNeedsCancellation) store.cancelRequestedCompaction(compaction); |
| if (status != null) { |
| LOG.debug("Compaction status journal for {}:\n{}", this.getRegionInfo().getEncodedName(), |
| status.prettyPrintJournal()); |
| status.cleanup(); |
| } |
| } |
| } |
| |
| /** |
| * Flush the cache. |
| * <p> |
| * When this method is called the cache will be flushed unless: |
| * <ol> |
| * <li>the cache is empty</li> |
| * <li>the region is closed.</li> |
| * <li>a flush is already in progress</li> |
| * <li>writes are disabled</li> |
| * </ol> |
| * <p> |
| * This method may block for some time, so it should not be called from a time-sensitive thread. |
| * @param flushAllStores whether we want to force a flush of all stores |
| * @return FlushResult indicating whether the flush was successful or not and if the region needs |
| * compacting |
| * @throws IOException general io exceptions because a snapshot was not properly persisted. |
| */ |
| // TODO HBASE-18905. We might have to expose a requestFlush API for CPs |
| public FlushResult flush(boolean flushAllStores) throws IOException { |
| return flushcache(flushAllStores, false, FlushLifeCycleTracker.DUMMY); |
| } |
| |
/**
 * Outcome of a flush attempt: a detailed result code plus two convenience predicates.
 */
public interface FlushResult {

  /** Detailed result codes. The declaration order is kept stable. */
  enum Result {
    FLUSHED_NO_COMPACTION_NEEDED,
    FLUSHED_COMPACTION_NEEDED,
    // No flush ran because the memstores held nothing. Bulk loading relies on this code to know
    // that it can still load even though no flush happened.
    CANNOT_FLUSH_MEMSTORE_EMPTY,
    CANNOT_FLUSH
  }

  /** Returns true if the memstores were flushed, else false */
  boolean isFlushSucceeded();

  /** Returns true if the flush requested a compaction, else false */
  boolean isCompactionNeeded();

  /** Returns the detailed result code */
  Result getResult();
}
| |
| FlushResultImpl flushcache(boolean flushAllStores, boolean writeFlushRequestWalMarker, |
| FlushLifeCycleTracker tracker) throws IOException { |
| List<byte[]> families = null; |
| if (flushAllStores) { |
| families = new ArrayList<>(); |
| families.addAll(this.getTableDescriptor().getColumnFamilyNames()); |
| } |
| return this.flushcache(families, writeFlushRequestWalMarker, tracker); |
| } |
| |
| /** |
| * Flush the cache. When this method is called the cache will be flushed unless: |
| * <ol> |
| * <li>the cache is empty</li> |
| * <li>the region is closed.</li> |
| * <li>a flush is already in progress</li> |
| * <li>writes are disabled</li> |
| * </ol> |
| * <p> |
| * This method may block for some time, so it should not be called from a time-sensitive thread. |
| * @param families stores of region to flush. |
| * @param writeFlushRequestWalMarker whether to write the flush request marker to WAL |
| * @param tracker used to track the life cycle of this flush |
| * @return whether the flush is success and whether the region needs compacting |
| * @throws IOException general io exceptions |
| * @throws DroppedSnapshotException Thrown when replay of wal is required because a Snapshot was |
| * not properly persisted. The region is put in closing mode, and |
| * the caller MUST abort after this. |
| */ |
| public FlushResultImpl flushcache(List<byte[]> families, boolean writeFlushRequestWalMarker, |
| FlushLifeCycleTracker tracker) throws IOException { |
| // fail-fast instead of waiting on the lock |
| if (this.closing.get()) { |
| String msg = "Skipping flush on " + this + " because closing"; |
| LOG.debug(msg); |
| return new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false); |
| } |
| MonitoredTask status = TaskMonitor.get().createStatus("Flushing " + this); |
| status.setStatus("Acquiring readlock on region"); |
| // block waiting for the lock for flushing cache |
| lock.readLock().lock(); |
| boolean flushed = true; |
| try { |
| if (this.closed.get()) { |
| String msg = "Skipping flush on " + this + " because closed"; |
| LOG.debug(msg); |
| status.abort(msg); |
| flushed = false; |
| return new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false); |
| } |
| if (coprocessorHost != null) { |
| status.setStatus("Running coprocessor pre-flush hooks"); |
| coprocessorHost.preFlush(tracker); |
| } |
| // TODO: this should be managed within memstore with the snapshot, updated only after flush |
| // successful |
| if (numMutationsWithoutWAL.sum() > 0) { |
| numMutationsWithoutWAL.reset(); |
| dataInMemoryWithoutWAL.reset(); |
| } |
| synchronized (writestate) { |
| if (!writestate.flushing && writestate.writesEnabled) { |
| this.writestate.flushing = true; |
| } else { |
| String msg = "NOT flushing " + this + " as " |
| + (writestate.flushing ? "already flushing" : "writes are not enabled"); |
| LOG.debug(msg); |
| status.abort(msg); |
| flushed = false; |
| return new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false); |
| } |
| } |
| |
| try { |
| // The reason that we do not always use flushPolicy is, when the flush is |
| // caused by logRoller, we should select stores which must be flushed |
| // rather than could be flushed. |
| Collection<HStore> specificStoresToFlush = null; |
| if (families != null) { |
| specificStoresToFlush = getSpecificStores(families); |
| } else { |
| specificStoresToFlush = flushPolicy.selectStoresToFlush(); |
| } |
| FlushResultImpl fs = |
| internalFlushcache(specificStoresToFlush, status, writeFlushRequestWalMarker, tracker); |
| |
| if (coprocessorHost != null) { |
| status.setStatus("Running post-flush coprocessor hooks"); |
| coprocessorHost.postFlush(tracker); |
| } |
| |
| if (fs.isFlushSucceeded()) { |
| flushesQueued.reset(); |
| } |
| |
| status.markComplete("Flush successful " + fs.toString()); |
| return fs; |
| } finally { |
| synchronized (writestate) { |
| writestate.flushing = false; |
| this.writestate.flushRequested = false; |
| writestate.notifyAll(); |
| } |
| } |
| } finally { |
| lock.readLock().unlock(); |
| if (flushed) { |
| // Don't log this journal stuff if no flush -- confusing. |
| LOG.debug("Flush status journal for {}:\n{}", this.getRegionInfo().getEncodedName(), |
| status.prettyPrintJournal()); |
| } |
| status.cleanup(); |
| } |
| } |
| |
| /** |
| * get stores which matches the specified families |
| * @return the stores need to be flushed. |
| */ |
| private Collection<HStore> getSpecificStores(List<byte[]> families) { |
| Collection<HStore> specificStoresToFlush = new ArrayList<>(); |
| for (byte[] family : families) { |
| specificStoresToFlush.add(stores.get(family)); |
| } |
| return specificStoresToFlush; |
| } |
| |
| /** |
| * Should the store be flushed because it is old enough. |
| * <p> |
| * Every FlushPolicy should call this to determine whether a store is old enough to flush (except |
| * that you always flush all stores). Otherwise the method will always returns true which will |
| * make a lot of flush requests. |
| */ |
| boolean shouldFlushStore(HStore store) { |
| long earliest = this.wal.getEarliestMemStoreSeqNum(getRegionInfo().getEncodedNameAsBytes(), |
| store.getColumnFamilyDescriptor().getName()) - 1; |
| if (earliest > 0 && earliest + flushPerChanges < mvcc.getReadPoint()) { |
| if (LOG.isDebugEnabled()) { |
| LOG.debug("Flush column family " + store.getColumnFamilyName() + " of " |
| + getRegionInfo().getEncodedName() + " because unflushed sequenceid=" + earliest |
| + " is > " + this.flushPerChanges + " from current=" + mvcc.getReadPoint()); |
| } |
| return true; |
| } |
| if (this.flushCheckInterval <= 0) { |
| return false; |
| } |
| long now = EnvironmentEdgeManager.currentTime(); |
| if (store.timeOfOldestEdit() < now - this.flushCheckInterval) { |
| if (LOG.isDebugEnabled()) { |
| LOG.debug("Flush column family: " + store.getColumnFamilyName() + " of " |
| + getRegionInfo().getEncodedName() + " because time of oldest edit=" |
| + store.timeOfOldestEdit() + " is > " + this.flushCheckInterval + " from now =" + now); |
| } |
| return true; |
| } |
| return false; |
| } |
| |
| /** |
| * Should the memstore be flushed now |
| */ |
| boolean shouldFlush(final StringBuilder whyFlush) { |
| whyFlush.setLength(0); |
| // This is a rough measure. |
| if ( |
| this.maxFlushedSeqId > 0 |
| && (this.maxFlushedSeqId + this.flushPerChanges < this.mvcc.getReadPoint()) |
| ) { |
| whyFlush.append("more than max edits, " + this.flushPerChanges + ", since last flush"); |
| return true; |
| } |
| long modifiedFlushCheckInterval = flushCheckInterval; |
| if ( |
| getRegionInfo().getTable().isSystemTable() |
| && getRegionInfo().getReplicaId() == RegionInfo.DEFAULT_REPLICA_ID |
| ) { |
| modifiedFlushCheckInterval = SYSTEM_CACHE_FLUSH_INTERVAL; |
| } |
| if (modifiedFlushCheckInterval <= 0) { // disabled |
| return false; |
| } |
| long now = EnvironmentEdgeManager.currentTime(); |
| // if we flushed in the recent past, we don't need to do again now |
| if ((now - getEarliestFlushTimeForAllStores() < modifiedFlushCheckInterval)) { |
| return false; |
| } |
| // since we didn't flush in the recent past, flush now if certain conditions |
| // are met. Return true on first such memstore hit. |
| for (HStore s : stores.values()) { |
| if (s.timeOfOldestEdit() < now - modifiedFlushCheckInterval) { |
| // we have an old enough edit in the memstore, flush |
| whyFlush.append(s.toString() + " has an old edit so flush to free WALs"); |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| /** |
| * Flushing all stores. |
| * @see #internalFlushcache(Collection, MonitoredTask, boolean, FlushLifeCycleTracker) |
| */ |
| private FlushResult internalFlushcache(MonitoredTask status) throws IOException { |
| return internalFlushcache(stores.values(), status, false, FlushLifeCycleTracker.DUMMY); |
| } |
| |
| /** |
| * Flushing given stores. |
| * @see #internalFlushcache(WAL, long, Collection, MonitoredTask, boolean, FlushLifeCycleTracker) |
| */ |
| private FlushResultImpl internalFlushcache(Collection<HStore> storesToFlush, MonitoredTask status, |
| boolean writeFlushWalMarker, FlushLifeCycleTracker tracker) throws IOException { |
| return internalFlushcache(this.wal, HConstants.NO_SEQNUM, storesToFlush, status, |
| writeFlushWalMarker, tracker); |
| } |
| |
| /** |
| * Flush the memstore. Flushing the memstore is a little tricky. We have a lot of updates in the |
| * memstore, all of which have also been written to the wal. We need to write those updates in the |
| * memstore out to disk, while being able to process reads/writes as much as possible during the |
| * flush operation. |
| * <p> |
| * This method may block for some time. Every time you call it, we up the regions sequence id even |
| * if we don't flush; i.e. the returned region id will be at least one larger than the last edit |
| * applied to this region. The returned id does not refer to an actual edit. The returned id can |
| * be used for say installing a bulk loaded file just ahead of the last hfile that was the result |
| * of this flush, etc. |
| * @param wal Null if we're NOT to go via wal. |
| * @param myseqid The seqid to use if <code>wal</code> is null writing out flush file. |
| * @param storesToFlush The list of stores to flush. |
| * @return object describing the flush's state |
| * @throws IOException general io exceptions |
| * @throws DroppedSnapshotException Thrown when replay of WAL is required. |
| */ |
| protected FlushResultImpl internalFlushcache(WAL wal, long myseqid, |
| Collection<HStore> storesToFlush, MonitoredTask status, boolean writeFlushWalMarker, |
| FlushLifeCycleTracker tracker) throws IOException { |
| PrepareFlushResult result = |
| internalPrepareFlushCache(wal, myseqid, storesToFlush, status, writeFlushWalMarker, tracker); |
| if (result.result == null) { |
| return internalFlushCacheAndCommit(wal, status, result, storesToFlush); |
| } else { |
| return result.result; // early exit due to failure from prepare stage |
| } |
| } |
| |
| @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "DLS_DEAD_LOCAL_STORE", |
| justification = "FindBugs seems confused about trxId") |
| protected PrepareFlushResult internalPrepareFlushCache(WAL wal, long myseqid, |
| Collection<HStore> storesToFlush, MonitoredTask status, boolean writeFlushWalMarker, |
| FlushLifeCycleTracker tracker) throws IOException { |
| if (this.rsServices != null && this.rsServices.isAborted()) { |
| // Don't flush when server aborting, it's unsafe |
| throw new IOException("Aborting flush because server is aborted..."); |
| } |
| final long startTime = EnvironmentEdgeManager.currentTime(); |
| // If nothing to flush, return, but return with a valid unused sequenceId. |
| // Its needed by bulk upload IIRC. It flushes until no edits in memory so it can insert a |
| // bulk loaded file between memory and existing hfiles. It wants a good seqeunceId that belongs |
| // to no other that it can use to associate with the bulk load. Hence this little dance below |
| // to go get one. |
| if (this.memStoreSizing.getDataSize() <= 0) { |
| // Take an update lock so no edits can come into memory just yet. |
| this.updatesLock.writeLock().lock(); |
| WriteEntry writeEntry = null; |
| try { |
| if (this.memStoreSizing.getDataSize() <= 0) { |
| // Presume that if there are still no edits in the memstore, then there are no edits for |
| // this region out in the WAL subsystem so no need to do any trickery clearing out |
| // edits in the WAL sub-system. Up the sequence number so the resulting flush id is for |
| // sure just beyond the last appended region edit and not associated with any edit |
| // (useful as marker when bulk loading, etc.). |
| if (wal != null) { |
| writeEntry = mvcc.begin(); |
| long flushOpSeqId = writeEntry.getWriteNumber(); |
| FlushResultImpl flushResult = |
| new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY, flushOpSeqId, |
| "Nothing to flush", writeFlushRequestMarkerToWAL(wal, writeFlushWalMarker)); |
| mvcc.completeAndWait(writeEntry); |
| // Set to null so we don't complete it again down in finally block. |
| writeEntry = null; |
| return new PrepareFlushResult(flushResult, myseqid); |
| } else { |
| return new PrepareFlushResult(new FlushResultImpl( |
| FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY, "Nothing to flush", false), myseqid); |
| } |
| } |
| } finally { |
| if (writeEntry != null) { |
| // If writeEntry is non-null, this operation failed; the mvcc transaction failed... |
| // but complete it anyways so it doesn't block the mvcc queue. |
| mvcc.complete(writeEntry); |
| } |
| this.updatesLock.writeLock().unlock(); |
| } |
| } |
| logFatLineOnFlush(storesToFlush, myseqid); |
| // Stop updates while we snapshot the memstore of all of these regions' stores. We only have |
| // to do this for a moment. It is quick. We also set the memstore size to zero here before we |
| // allow updates again so its value will represent the size of the updates received |
| // during flush |
| |
| // We have to take an update lock during snapshot, or else a write could end up in both snapshot |
| // and memstore (makes it difficult to do atomic rows then) |
| status.setStatus("Obtaining lock to block concurrent updates"); |
| // block waiting for the lock for internal flush |
| this.updatesLock.writeLock().lock(); |
| status.setStatus("Preparing flush snapshotting stores in " + getRegionInfo().getEncodedName()); |
| MemStoreSizing totalSizeOfFlushableStores = new NonThreadSafeMemStoreSizing(); |
| |
| Map<byte[], Long> flushedFamilyNamesToSeq = new HashMap<>(); |
| for (HStore store : storesToFlush) { |
| flushedFamilyNamesToSeq.put(store.getColumnFamilyDescriptor().getName(), |
| store.preFlushSeqIDEstimation()); |
| } |
| |
| TreeMap<byte[], StoreFlushContext> storeFlushCtxs = new TreeMap<>(Bytes.BYTES_COMPARATOR); |
| TreeMap<byte[], List<Path>> committedFiles = new TreeMap<>(Bytes.BYTES_COMPARATOR); |
| TreeMap<byte[], MemStoreSize> storeFlushableSize = new TreeMap<>(Bytes.BYTES_COMPARATOR); |
| // The sequence id of this flush operation which is used to log FlushMarker and pass to |
| // createFlushContext to use as the store file's sequence id. It can be in advance of edits |
| // still in the memstore, edits that are in other column families yet to be flushed. |
| long flushOpSeqId = HConstants.NO_SEQNUM; |
| // The max flushed sequence id after this flush operation completes. All edits in memstore |
| // will be in advance of this sequence id. |
| long flushedSeqId = HConstants.NO_SEQNUM; |
| byte[] encodedRegionName = getRegionInfo().getEncodedNameAsBytes(); |
| try { |
| if (wal != null) { |
| Long earliestUnflushedSequenceIdForTheRegion = |
| wal.startCacheFlush(encodedRegionName, flushedFamilyNamesToSeq); |
| if (earliestUnflushedSequenceIdForTheRegion == null) { |
| // This should never happen. This is how startCacheFlush signals flush cannot proceed. |
| String msg = this.getRegionInfo().getEncodedName() + " flush aborted; WAL closing."; |
| status.setStatus(msg); |
| return new PrepareFlushResult( |
| new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false), myseqid); |
| } |
| flushOpSeqId = getNextSequenceId(wal); |
| // Back up 1, minus 1 from oldest sequence id in memstore to get last 'flushed' edit |
| flushedSeqId = earliestUnflushedSequenceIdForTheRegion.longValue() == HConstants.NO_SEQNUM |
| ? flushOpSeqId |
| : earliestUnflushedSequenceIdForTheRegion.longValue() - 1; |
| } else { |
| // use the provided sequence Id as WAL is not being used for this flush. |
| flushedSeqId = flushOpSeqId = myseqid; |
| } |
| |
| for (HStore s : storesToFlush) { |
| storeFlushCtxs.put(s.getColumnFamilyDescriptor().getName(), |
| s.createFlushContext(flushOpSeqId, tracker)); |
| // for writing stores to WAL |
| committedFiles.put(s.getColumnFamilyDescriptor().getName(), null); |
| } |
| |
| // write the snapshot start to WAL |
| if (wal != null && !writestate.readOnly) { |
| FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.START_FLUSH, |
| getRegionInfo(), flushOpSeqId, committedFiles); |
| // No sync. Sync is below where no updates lock and we do FlushAction.COMMIT_FLUSH |
| WALUtil.writeFlushMarker(wal, this.getReplicationScope(), getRegionInfo(), desc, false, |
| mvcc); |
| } |
| |
| // Prepare flush (take a snapshot) |
| storeFlushCtxs.forEach((name, flush) -> { |
| MemStoreSize snapshotSize = flush.prepare(); |
| totalSizeOfFlushableStores.incMemStoreSize(snapshotSize); |
| storeFlushableSize.put(name, snapshotSize); |
| }); |
| } catch (IOException ex) { |
| doAbortFlushToWAL(wal, flushOpSeqId, committedFiles); |
| throw ex; |
| } finally { |
| this.updatesLock.writeLock().unlock(); |
| } |
| String s = "Finished memstore snapshotting " + this + ", syncing WAL and waiting on mvcc, " |
| + "flushsize=" + totalSizeOfFlushableStores; |
| status.setStatus(s); |
| doSyncOfUnflushedWALChanges(wal, getRegionInfo()); |
| return new PrepareFlushResult(storeFlushCtxs, committedFiles, storeFlushableSize, startTime, |
| flushOpSeqId, flushedSeqId, totalSizeOfFlushableStores); |
| } |
| |
| /** |
| * Utility method broken out of internalPrepareFlushCache so that method is smaller. |
| */ |
| private void logFatLineOnFlush(Collection<HStore> storesToFlush, long sequenceId) { |
| if (!LOG.isInfoEnabled()) { |
| return; |
| } |
| // Log a fat line detailing what is being flushed. |
| StringBuilder perCfExtras = null; |
| if (!isAllFamilies(storesToFlush)) { |
| perCfExtras = new StringBuilder(); |
| for (HStore store : storesToFlush) { |
| MemStoreSize mss = store.getFlushableSize(); |
| perCfExtras.append("; ").append(store.getColumnFamilyName()); |
| perCfExtras.append("={dataSize=").append(StringUtils.byteDesc(mss.getDataSize())); |
| perCfExtras.append(", heapSize=").append(StringUtils.byteDesc(mss.getHeapSize())); |
| perCfExtras.append(", offHeapSize=").append(StringUtils.byteDesc(mss.getOffHeapSize())); |
| perCfExtras.append("}"); |
| } |
| } |
| MemStoreSize mss = this.memStoreSizing.getMemStoreSize(); |
| LOG.info("Flushing " + this.getRegionInfo().getEncodedName() + " " + storesToFlush.size() + "/" |
| + stores.size() + " column families," + " dataSize=" + StringUtils.byteDesc(mss.getDataSize()) |
| + " heapSize=" + StringUtils.byteDesc(mss.getHeapSize()) |
| + ((perCfExtras != null && perCfExtras.length() > 0) ? perCfExtras.toString() : "") |
| + ((wal != null) ? "" : "; WAL is null, using passed sequenceid=" + sequenceId)); |
| } |
| |
| private void doAbortFlushToWAL(final WAL wal, final long flushOpSeqId, |
| final Map<byte[], List<Path>> committedFiles) { |
| if (wal == null) return; |
| try { |
| FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.ABORT_FLUSH, |
| getRegionInfo(), flushOpSeqId, committedFiles); |
| WALUtil.writeFlushMarker(wal, this.getReplicationScope(), getRegionInfo(), desc, false, mvcc); |
| } catch (Throwable t) { |
| LOG.warn("Received unexpected exception trying to write ABORT_FLUSH marker to WAL: {} in " |
| + " region {}", StringUtils.stringifyException(t), this); |
| // ignore this since we will be aborting the RS with DSE. |
| } |
| // we have called wal.startCacheFlush(), now we have to abort it |
| wal.abortCacheFlush(this.getRegionInfo().getEncodedNameAsBytes()); |
| } |
| |
| /** |
| * Sync unflushed WAL changes. See HBASE-8208 for details |
| */ |
| private static void doSyncOfUnflushedWALChanges(final WAL wal, final RegionInfo hri) |
| throws IOException { |
| if (wal == null) { |
| return; |
| } |
| try { |
| wal.sync(); // ensure that flush marker is sync'ed |
| } catch (IOException ioe) { |
| wal.abortCacheFlush(hri.getEncodedNameAsBytes()); |
| throw ioe; |
| } |
| } |
| |
| /** Returns True if passed Set is all families in the region. */ |
| private boolean isAllFamilies(Collection<HStore> families) { |
| return families == null || this.stores.size() == families.size(); |
| } |
| |
| /** |
| * Writes a marker to WAL indicating a flush is requested but cannot be complete due to various |
| * reasons. Ignores exceptions from WAL. Returns whether the write succeeded. |
| * @return whether WAL write was successful |
| */ |
| private boolean writeFlushRequestMarkerToWAL(WAL wal, boolean writeFlushWalMarker) { |
| if (writeFlushWalMarker && wal != null && !writestate.readOnly) { |
| FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.CANNOT_FLUSH, |
| getRegionInfo(), -1, new TreeMap<>(Bytes.BYTES_COMPARATOR)); |
| try { |
| WALUtil.writeFlushMarker(wal, this.getReplicationScope(), getRegionInfo(), desc, true, |
| mvcc); |
| return true; |
| } catch (IOException e) { |
| LOG.warn(getRegionInfo().getEncodedName() + " : " |
| + "Received exception while trying to write the flush request to wal", e); |
| } |
| } |
| return false; |
| } |
| |
| @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "NN_NAKED_NOTIFY", |
| justification = "Intentional; notify is about completed flush") |
| FlushResultImpl internalFlushCacheAndCommit(WAL wal, MonitoredTask status, |
| PrepareFlushResult prepareResult, Collection<HStore> storesToFlush) throws IOException { |
| // prepare flush context is carried via PrepareFlushResult |
| TreeMap<byte[], StoreFlushContext> storeFlushCtxs = prepareResult.storeFlushCtxs; |
| TreeMap<byte[], List<Path>> committedFiles = prepareResult.committedFiles; |
| long startTime = prepareResult.startTime; |
| long flushOpSeqId = prepareResult.flushOpSeqId; |
| long flushedSeqId = prepareResult.flushedSeqId; |
| |
| String s = "Flushing stores of " + this; |
| status.setStatus(s); |
| if (LOG.isTraceEnabled()) LOG.trace(s); |
| |
| // Any failure from here on out will be catastrophic requiring server |
| // restart so wal content can be replayed and put back into the memstore. |
| // Otherwise, the snapshot content while backed up in the wal, it will not |
| // be part of the current running servers state. |
| boolean compactionRequested = false; |
| long flushedOutputFileSize = 0; |
| try { |
| // A. Flush memstore to all the HStores. |
| // Keep running vector of all store files that includes both old and the |
| // just-made new flush store file. The new flushed file is still in the |
| // tmp directory. |
| |
| for (StoreFlushContext flush : storeFlushCtxs.values()) { |
| flush.flushCache(status); |
| } |
| |
| // Switch snapshot (in memstore) -> new hfile (thus causing |
| // all the store scanners to reset/reseek). |
| for (Map.Entry<byte[], StoreFlushContext> flushEntry : storeFlushCtxs.entrySet()) { |
| StoreFlushContext sfc = flushEntry.getValue(); |
| boolean needsCompaction = sfc.commit(status); |
| if (needsCompaction) { |
| compactionRequested = true; |
| } |
| byte[] storeName = flushEntry.getKey(); |
| List<Path> storeCommittedFiles = sfc.getCommittedFiles(); |
| committedFiles.put(storeName, storeCommittedFiles); |
| // Flush committed no files, indicating flush is empty or flush was canceled |
| if (storeCommittedFiles == null || storeCommittedFiles.isEmpty()) { |
| MemStoreSize storeFlushableSize = prepareResult.storeFlushableSize.get(storeName); |
| prepareResult.totalFlushableSize.decMemStoreSize(storeFlushableSize); |
| } |
| flushedOutputFileSize += sfc.getOutputFileSize(); |
| } |
| storeFlushCtxs.clear(); |
| |
| // Set down the memstore size by amount of flush. |
| MemStoreSize mss = prepareResult.totalFlushableSize.getMemStoreSize(); |
| this.decrMemStoreSize(mss); |
| |
| // Increase the size of this Region for the purposes of quota. Noop if quotas are disabled. |
| // During startup, quota manager may not be initialized yet. |
| if (rsServices != null) { |
| RegionServerSpaceQuotaManager quotaManager = rsServices.getRegionServerSpaceQuotaManager(); |
| if (quotaManager != null) { |
| quotaManager.getRegionSizeStore().incrementRegionSize(this.getRegionInfo(), |
| flushedOutputFileSize); |
| } |
| } |
| |
| if (wal != null) { |
| // write flush marker to WAL. If fail, we should throw DroppedSnapshotException |
| FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.COMMIT_FLUSH, |
| getRegionInfo(), flushOpSeqId, committedFiles); |
| WALUtil.writeFlushMarker(wal, this.getReplicationScope(), getRegionInfo(), desc, true, |
| mvcc); |
| } |
| } catch (Throwable t) { |
| // An exception here means that the snapshot was not persisted. |
| // The wal needs to be replayed so its content is restored to memstore. |
| // Currently, only a server restart will do this. |
| // We used to only catch IOEs but its possible that we'd get other |
| // exceptions -- e.g. HBASE-659 was about an NPE -- so now we catch |
| // all and sundry. |
| if (wal != null) { |
| try { |
| FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.ABORT_FLUSH, |
| getRegionInfo(), flushOpSeqId, committedFiles); |
| WALUtil.writeFlushMarker(wal, this.replicationScope, getRegionInfo(), desc, false, mvcc); |
| } catch (Throwable ex) { |
| LOG.warn( |
| getRegionInfo().getEncodedName() + " : " + "failed writing ABORT_FLUSH marker to WAL", |
| ex); |
| // ignore this since we will be aborting the RS with DSE. |
| } |
| wal.abortCacheFlush(this.getRegionInfo().getEncodedNameAsBytes()); |
| } |
| DroppedSnapshotException dse = new DroppedSnapshotException( |
| "region: " + Bytes.toStringBinary(getRegionInfo().getRegionName()), t); |
| status.abort("Flush failed: " + StringUtils.stringifyException(t)); |
| |
| // Callers for flushcache() should catch DroppedSnapshotException and abort the region server. |
| // However, since we may have the region read lock, we cannot call close(true) here since |
| // we cannot promote to a write lock. Instead we are setting closing so that all other region |
| // operations except for close will be rejected. |
| this.closing.set(true); |
| |
| if (rsServices != null) { |
| // This is a safeguard against the case where the caller fails to explicitly handle aborting |
| rsServices.abort("Replay of WAL required. Forcing server shutdown", dse); |
| } |
| |
| throw dse; |
| } |
| |
| // If we get to here, the HStores have been written. |
| if (wal != null) { |
| wal.completeCacheFlush(this.getRegionInfo().getEncodedNameAsBytes(), flushedSeqId); |
| } |
| |
| // Record latest flush time |
| for (HStore store : storesToFlush) { |
| this.lastStoreFlushTimeMap.put(store, startTime); |
| } |
| |
| this.maxFlushedSeqId = flushedSeqId; |
| this.lastFlushOpSeqId = flushOpSeqId; |
| |
| // C. Finally notify anyone waiting on memstore to clear: |
| // e.g. checkResources(). |
| synchronized (this) { |
| notifyAll(); // FindBugs NN_NAKED_NOTIFY |
| } |
| |
| long time = EnvironmentEdgeManager.currentTime() - startTime; |
| MemStoreSize mss = prepareResult.totalFlushableSize.getMemStoreSize(); |
| long memstoresize = this.memStoreSizing.getMemStoreSize().getDataSize(); |
| String msg = "Finished flush of" + " dataSize ~" + StringUtils.byteDesc(mss.getDataSize()) + "/" |
| + mss.getDataSize() + ", heapSize ~" + StringUtils.byteDesc(mss.getHeapSize()) + "/" |
| + mss.getHeapSize() + ", currentSize=" + StringUtils.byteDesc(memstoresize) + "/" |
| + memstoresize + " for " + this.getRegionInfo().getEncodedName() + " in " + time |
| + "ms, sequenceid=" + flushOpSeqId + ", compaction requested=" + compactionRequested |
| + ((wal == null) ? "; wal=null" : ""); |
| LOG.info(msg); |
| status.setStatus(msg); |
| |
| if (rsServices != null && rsServices.getMetrics() != null) { |
| rsServices.getMetrics().updateFlush(getTableDescriptor().getTableName().getNameAsString(), |
| time, mss.getDataSize(), flushedOutputFileSize); |
| } |
| |
| return new FlushResultImpl(compactionRequested |
| ? FlushResult.Result.FLUSHED_COMPACTION_NEEDED |
| : FlushResult.Result.FLUSHED_NO_COMPACTION_NEEDED, flushOpSeqId); |
| } |
| |
| /** |
| * Method to safely get the next sequence number. |
| * @return Next sequence number unassociated with any actual edit. |
| */ |
| protected long getNextSequenceId(final WAL wal) throws IOException { |
| WriteEntry we = mvcc.begin(); |
| mvcc.completeAndWait(we); |
| return we.getWriteNumber(); |
| } |
| |
| ////////////////////////////////////////////////////////////////////////////// |
| // get() methods for client use. |
| ////////////////////////////////////////////////////////////////////////////// |
| |
| @Override |
| public RegionScannerImpl getScanner(Scan scan) throws IOException { |
| return getScanner(scan, null); |
| } |
| |
| @Override |
| public RegionScannerImpl getScanner(Scan scan, List<KeyValueScanner> additionalScanners) |
| throws IOException { |
| return getScanner(scan, additionalScanners, HConstants.NO_NONCE, HConstants.NO_NONCE); |
| } |
| |
| private RegionScannerImpl getScanner(Scan scan, List<KeyValueScanner> additionalScanners, |
| long nonceGroup, long nonce) throws IOException { |
| return TraceUtil.trace(() -> { |
| startRegionOperation(Operation.SCAN); |
| try { |
| // Verify families are all valid |
| if (!scan.hasFamilies()) { |
| // Adding all families to scanner |
| for (byte[] family : this.htableDescriptor.getColumnFamilyNames()) { |
| scan.addFamily(family); |
| } |
| } else { |
| for (byte[] family : scan.getFamilyMap().keySet()) { |
| checkFamily(family); |
| } |
| } |
| return instantiateRegionScanner(scan, additionalScanners, nonceGroup, nonce); |
| } finally { |
| closeRegionOperation(Operation.SCAN); |
| } |
| }, () -> createRegionSpan("Region.getScanner")); |
| } |
| |
| protected RegionScannerImpl instantiateRegionScanner(Scan scan, |
| List<KeyValueScanner> additionalScanners, long nonceGroup, long nonce) throws IOException { |
| if (scan.isReversed()) { |
| if (scan.getFilter() != null) { |
| scan.getFilter().setReversed(true); |
| } |
| return new ReversedRegionScannerImpl(scan, additionalScanners, this, nonceGroup, nonce); |
| } |
| return new RegionScannerImpl(scan, additionalScanners, this, nonceGroup, nonce); |
| } |
| |
| /** |
| * Prepare a delete for a row mutation processor |
| * @param delete The passed delete is modified by this method. WARNING! |
| */ |
| private void prepareDelete(Delete delete) throws IOException { |
| // Check to see if this is a deleteRow insert |
| if (delete.getFamilyCellMap().isEmpty()) { |
| for (byte[] family : this.htableDescriptor.getColumnFamilyNames()) { |
| // Don't eat the timestamp |
| delete.addFamily(family, delete.getTimestamp()); |
| } |
| } else { |
| for (byte[] family : delete.getFamilyCellMap().keySet()) { |
| if (family == null) { |
| throw new NoSuchColumnFamilyException("Empty family is invalid"); |
| } |
| checkFamily(family); |
| } |
| } |
| } |
| |
| @Override |
| public void delete(Delete delete) throws IOException { |
| TraceUtil.trace(() -> { |
| checkReadOnly(); |
| checkResources(); |
| startRegionOperation(Operation.DELETE); |
| try { |
| // All edits for the given row (across all column families) must happen atomically. |
| return mutate(delete); |
| } finally { |
| closeRegionOperation(Operation.DELETE); |
| } |
| }, () -> createRegionSpan("Region.delete")); |
| } |
| |
| /** |
| * Set up correct timestamps in the KVs in Delete object. |
| * <p/> |
| * Caller should have the row and region locks. |
| */ |
| private void prepareDeleteTimestamps(Mutation mutation, Map<byte[], List<ExtendedCell>> familyMap, |
| byte[] byteNow) throws IOException { |
| for (Map.Entry<byte[], List<ExtendedCell>> e : familyMap.entrySet()) { |
| |
| byte[] family = e.getKey(); |
| List<ExtendedCell> cells = e.getValue(); |
| assert cells instanceof RandomAccess; |
| |
| Map<byte[], Integer> kvCount = new TreeMap<>(Bytes.BYTES_COMPARATOR); |
| int listSize = cells.size(); |
| for (int i = 0; i < listSize; i++) { |
| ExtendedCell cell = cells.get(i); |
| // Check if time is LATEST, change to time of most recent addition if so |
| // This is expensive. |
| if ( |
| cell.getTimestamp() == HConstants.LATEST_TIMESTAMP && PrivateCellUtil.isDeleteType(cell) |
| ) { |
| byte[] qual = CellUtil.cloneQualifier(cell); |
| |
| Integer count = kvCount.get(qual); |
| if (count == null) { |
| kvCount.put(qual, 1); |
| } else { |
| kvCount.put(qual, count + 1); |
| } |
| count = kvCount.get(qual); |
| |
| Get get = new Get(CellUtil.cloneRow(cell)); |
| get.setMaxVersions(count); |
| get.addColumn(family, qual); |
| if (coprocessorHost != null) { |
| if ( |
| !coprocessorHost.prePrepareTimeStampForDeleteVersion(mutation, cell, byteNow, get) |
| ) { |
| updateDeleteLatestVersionTimestamp(cell, get, count, byteNow); |
| } |
| } else { |
| updateDeleteLatestVersionTimestamp(cell, get, count, byteNow); |
| } |
| } else { |
| PrivateCellUtil.updateLatestStamp(cell, byteNow); |
| } |
| } |
| } |
| } |
| |
| private void updateDeleteLatestVersionTimestamp(Cell cell, Get get, int count, byte[] byteNow) |
| throws IOException { |
| try (RegionScanner scanner = getScanner(new Scan(get))) { |
| // NOTE: Please don't use HRegion.get() instead, |
| // because it will copy cells to heap. See HBASE-26036 |
| List<Cell> result = new ArrayList<>(); |
| scanner.next(result); |
| |
| if (result.size() < count) { |
| // Nothing to delete |
| PrivateCellUtil.updateLatestStamp(cell, byteNow); |
| return; |
| } |
| if (result.size() > count) { |
| throw new RuntimeException("Unexpected size: " + result.size()); |
| } |
| Cell getCell = result.get(count - 1); |
| PrivateCellUtil.setTimestamp(cell, getCell.getTimestamp()); |
| } |
| } |
| |
| @Override |
| public void put(Put put) throws IOException { |
| TraceUtil.trace(() -> { |
| checkReadOnly(); |
| |
| // Do a rough check that we have resources to accept a write. The check is |
| // 'rough' in that between the resource check and the call to obtain a |
| // read lock, resources may run out. For now, the thought is that this |
| // will be extremely rare; we'll deal with it when it happens. |
| checkResources(); |
| startRegionOperation(Operation.PUT); |
| try { |
| // All edits for the given row (across all column families) must happen atomically. |
| return mutate(put); |
| } finally { |
| closeRegionOperation(Operation.PUT); |
| } |
| }, () -> createRegionSpan("Region.put")); |
| } |
| |
| /** |
| * Class that tracks the progress of a batch of operations, accumulating status codes and tracking |
| * the index at which processing is proceeding. These batch operations may get split into |
| * mini-batches for processing. |
| */ |
| private abstract static class BatchOperation<T> { |
| protected final T[] operations; |
| protected final OperationStatus[] retCodeDetails; |
| protected final WALEdit[] walEditsFromCoprocessors; |
| // reference family cell maps directly so coprocessors can mutate them if desired |
| protected final Map<byte[], List<ExtendedCell>>[] familyCellMaps; |
| // For Increment/Append operations |
| protected final Result[] results; |
| |
| protected final HRegion region; |
| protected int nextIndexToProcess = 0; |
| protected final ObservedExceptionsInBatch observedExceptions; |
| // Durability of the batch (highest durability of all operations) |
| protected Durability durability; |
| protected boolean atomic = false; |
| |
| public BatchOperation(final HRegion region, T[] operations) { |
| this.operations = operations; |
| this.retCodeDetails = new OperationStatus[operations.length]; |
| Arrays.fill(this.retCodeDetails, OperationStatus.NOT_RUN); |
| this.walEditsFromCoprocessors = new WALEdit[operations.length]; |
| familyCellMaps = new Map[operations.length]; |
| this.results = new Result[operations.length]; |
| |
| this.region = region; |
| observedExceptions = new ObservedExceptionsInBatch(); |
| durability = Durability.USE_DEFAULT; |
| } |
| |
/**
 * Callback applied to operation indexes by {@link #visitBatchOperations}.
 */
@FunctionalInterface
interface Visitor {
  /**
   * Handles the operation at the given position of the batch.
   * @param index operation index
   * @return {@code true} to keep visiting the remaining entries, {@code false} to stop
   */
  boolean visit(int index) throws IOException;
}
| |
| /** |
| * Helper method for visiting pending/ all batch operations |
| */ |
| public void visitBatchOperations(boolean pendingOnly, int lastIndexExclusive, Visitor visitor) |
| throws IOException { |
| assert lastIndexExclusive <= this.size(); |
| for (int i = nextIndexToProcess; i < lastIndexExclusive; i++) { |
| if (!pendingOnly || isOperationPending(i)) { |
| if (!visitor.visit(i)) { |
| break; |
| } |
| } |
| } |
| } |
| |
| public abstract Mutation getMutation(int index); |
| |
| public abstract long getNonceGroup(int index); |
| |
| public abstract long getNonce(int index); |
| |
| /** |
| * This method is potentially expensive and useful mostly for non-replay CP path. |
| */ |
| public abstract Mutation[] getMutationsForCoprocs(); |
| |
| public abstract boolean isInReplay(); |
| |
| public abstract long getOrigLogSeqNum(); |
| |
| public abstract void startRegionOperation() throws IOException; |
| |
| public abstract void closeRegionOperation() throws IOException; |
| |
| /** |
| * Validates each mutation and prepares a batch for write. If necessary (non-replay case), runs |
| * CP prePut()/preDelete()/preIncrement()/preAppend() hooks for all mutations in a batch. This |
| * is intended to operate on entire batch and will be called from outside of class to check and |
| * prepare batch. This can be implemented by calling helper method |
| * {@link #checkAndPrepareMutation(int, long)} in a 'for' loop over mutations. |
| */ |
| public abstract void checkAndPrepare() throws IOException; |
| |
| /** |
| * Implement any Put request specific check and prepare logic here. Please refer to |
| * {@link #checkAndPrepareMutation(Mutation, long)} for how it is used. |
| */ |
| protected abstract void checkAndPreparePut(final Put p) throws IOException; |
| |
| /** |
| * If necessary, calls preBatchMutate() CP hook for a mini-batch and updates metrics, cell |
| * count, tags and timestamp for all cells of all operations in a mini-batch. |
| */ |
| public abstract void prepareMiniBatchOperations( |
| MiniBatchOperationInProgress<Mutation> miniBatchOp, long timestamp, |
| final List<RowLock> acquiredRowLocks) throws IOException; |
| |
| /** |
| * Write mini-batch operations to MemStore |
| */ |
| public abstract WriteEntry writeMiniBatchOperationsToMemStore( |
| final MiniBatchOperationInProgress<Mutation> miniBatchOp, final WriteEntry writeEntry) |
| throws IOException; |
| |
| protected void writeMiniBatchOperationsToMemStore( |
| final MiniBatchOperationInProgress<Mutation> miniBatchOp, final long writeNumber) |
| throws IOException { |
| MemStoreSizing memStoreAccounting = new NonThreadSafeMemStoreSizing(); |
| visitBatchOperations(true, miniBatchOp.getLastIndexExclusive(), (int index) -> { |
| // We need to update the sequence id for following reasons. |
| // 1) If the op is in replay mode, FSWALEntry#stampRegionSequenceId won't stamp sequence id. |
| // 2) If no WAL, FSWALEntry won't be used |
| // we use durability of the original mutation for the mutation passed by CP. |
| if (isInReplay() || getMutation(index).getDurability() == Durability.SKIP_WAL) { |
| region.updateSequenceId(familyCellMaps[index].values(), writeNumber); |
| } |
| applyFamilyMapToMemStore(familyCellMaps[index], memStoreAccounting); |
| return true; |
| }); |
| // update memStore size |
| region.incMemStoreSize(memStoreAccounting.getDataSize(), memStoreAccounting.getHeapSize(), |
| memStoreAccounting.getOffHeapSize(), memStoreAccounting.getCellsCount()); |
| } |
| |
| public boolean isDone() { |
| return nextIndexToProcess == operations.length; |
| } |
| |
| public int size() { |
| return operations.length; |
| } |
| |
| public boolean isOperationPending(int index) { |
| return retCodeDetails[index].getOperationStatusCode() == OperationStatusCode.NOT_RUN; |
| } |
| |
| public List<UUID> getClusterIds() { |
| assert size() != 0; |
| return getMutation(0).getClusterIds(); |
| } |
| |
| boolean isAtomic() { |
| return atomic; |
| } |
| |
| /** |
| * Helper method that checks and prepares only one mutation. This can be used to implement |
| * {@link #checkAndPrepare()} for entire Batch. NOTE: As CP |
| * prePut()/preDelete()/preIncrement()/preAppend() hooks may modify mutations, this method |
| * should be called after prePut()/preDelete()/preIncrement()/preAppend() CP hooks are run for |
| * the mutation |
| */ |
| protected void checkAndPrepareMutation(Mutation mutation, final long timestamp) |
| throws IOException { |
| region.checkRow(mutation.getRow(), "batchMutate"); |
| if (mutation instanceof Put) { |
| // Check the families in the put. If bad, skip this one. |
| checkAndPreparePut((Put) mutation); |
| region.checkTimestamps(mutation.getFamilyCellMap(), timestamp); |
| } else if (mutation instanceof Delete) { |
| region.prepareDelete((Delete) mutation); |
| } else if (mutation instanceof Increment || mutation instanceof Append) { |
| region.checkFamilies(mutation.getFamilyCellMap().keySet()); |
| } |
| } |
| |
| protected void checkAndPrepareMutation(int index, long timestamp) throws IOException { |
| Mutation mutation = getMutation(index); |
| try { |
| this.checkAndPrepareMutation(mutation, timestamp); |
| |
| if (mutation instanceof Put || mutation instanceof Delete) { |
| // store the family map reference to allow for mutations |
| // we know that in mutation, only ExtendedCells are allow so here we do a fake cast, to |
| // simplify later logic |
| familyCellMaps[index] = (Map) mutation.getFamilyCellMap(); |
| } |
| |
| // store durability for the batch (highest durability of all operations in the batch) |
| Durability tmpDur = region.getEffectiveDurability(mutation.getDurability()); |
| if (tmpDur.ordinal() > durability.ordinal()) { |
| durability = tmpDur; |
| } |
| } catch (NoSuchColumnFamilyException nscfe) { |
| final String msg = "No such column family in batch mutation in region " + this; |
| if (observedExceptions.hasSeenNoSuchFamily()) { |
| LOG.warn(msg + nscfe.getMessage()); |
| } else { |
| LOG.warn(msg, nscfe); |
| observedExceptions.sawNoSuchFamily(); |
| } |
| retCodeDetails[index] = |
| new OperationStatus(OperationStatusCode.BAD_FAMILY, nscfe.getMessage()); |
| if (isAtomic()) { // fail, atomic means all or none |
| throw nscfe; |
| } |
| } catch (FailedSanityCheckException fsce) { |
| final String msg = "Batch Mutation did not pass sanity check in region " + this; |
| if (observedExceptions.hasSeenFailedSanityCheck()) { |
| LOG.warn(msg + fsce.getMessage()); |
| } else { |
| LOG.warn(msg, fsce); |
| observedExceptions.sawFailedSanityCheck(); |
| } |
| retCodeDetails[index] = |
| new OperationStatus(OperationStatusCode.SANITY_CHECK_FAILURE, fsce.getMessage()); |
| if (isAtomic()) { |
| throw fsce; |
| } |
| } catch (WrongRegionException we) { |
| final String msg = "Batch mutation had a row that does not belong to this region " + this; |
| if (observedExceptions.hasSeenWrongRegion()) { |
| LOG.warn(msg + we.getMessage()); |
| } else { |
| LOG.warn(msg, we); |
| observedExceptions.sawWrongRegion(); |
| } |
| retCodeDetails[index] = |
| new OperationStatus(OperationStatusCode.SANITY_CHECK_FAILURE, we.getMessage()); |
| if (isAtomic()) { |
| throw we; |
| } |
| } |
| } |
| |
| /** |
| * Creates Mini-batch of all operations [nextIndexToProcess, lastIndexExclusive) for which a row |
| * lock can be acquired. All mutations with locked rows are considered to be In-progress |
| * operations and hence the name {@link MiniBatchOperationInProgress}. Mini batch is window over |
| * {@link BatchOperation} and contains contiguous pending operations. |
| * @param acquiredRowLocks keeps track of rowLocks acquired. |
| */ |
| public MiniBatchOperationInProgress<Mutation> |
| lockRowsAndBuildMiniBatch(List<RowLock> acquiredRowLocks) throws IOException { |
| int readyToWriteCount = 0; |
| int lastIndexExclusive = 0; |
| RowLock prevRowLock = null; |
| for (; lastIndexExclusive < size(); lastIndexExclusive++) { |
| // It reaches the miniBatchSize, stop here and process the miniBatch |
| // This only applies to non-atomic batch operations. |
| if (!isAtomic() && (readyToWriteCount == region.miniBatchSize)) { |
| break; |
| } |
| |
| if (!isOperationPending(lastIndexExclusive)) { |
| continue; |
| } |
| |
| // HBASE-19389 Limit concurrency of put with dense (hundreds) columns to avoid exhausting |
| // RS handlers, covering both MutationBatchOperation and ReplayBatchOperation |
| // The BAD_FAMILY/SANITY_CHECK_FAILURE cases are handled in checkAndPrepare phase and won't |
| // pass the isOperationPending check |
| Map<byte[], List<Cell>> curFamilyCellMap = |
| getMutation(lastIndexExclusive).getFamilyCellMap(); |
| try { |
| // start the protector before acquiring row lock considering performance, and will finish |
| // it when encountering exception |
| region.storeHotnessProtector.start(curFamilyCellMap); |
| } catch (RegionTooBusyException rtbe) { |
| region.storeHotnessProtector.finish(curFamilyCellMap); |
| if (isAtomic()) { |
| throw rtbe; |
| } |
| retCodeDetails[lastIndexExclusive] = |
| new OperationStatus(OperationStatusCode.STORE_TOO_BUSY, rtbe.getMessage()); |
| continue; |
| } |
| |
| Mutation mutation = getMutation(lastIndexExclusive); |
| // If we haven't got any rows in our batch, we should block to get the next one. |
| RowLock rowLock = null; |
| boolean throwException = false; |
| try { |
| // if atomic then get exclusive lock, else shared lock |
| rowLock = region.getRowLock(mutation.getRow(), !isAtomic(), prevRowLock); |
| } catch (TimeoutIOException | InterruptedIOException e) { |
| // NOTE: We will retry when other exceptions, but we should stop if we receive |
| // TimeoutIOException or InterruptedIOException as operation has timed out or |
| // interrupted respectively. |
| throwException = true; |
| throw e; |
| } catch (IOException ioe) { |
| LOG.warn("Failed getting lock, row={}, in region {}", |
| Bytes.toStringBinary(mutation.getRow()), this, ioe); |
| if (isAtomic()) { // fail, atomic means all or none |
| throwException = true; |
| throw ioe; |
| } |
| } catch (Throwable throwable) { |
| throwException = true; |
| throw throwable; |
| } finally { |
| if (throwException) { |
| region.storeHotnessProtector.finish(curFamilyCellMap); |
| } |
| } |
| if (rowLock == null) { |
| // We failed to grab another lock |
| if (isAtomic()) { |
| region.storeHotnessProtector.finish(curFamilyCellMap); |
| throw new IOException("Can't apply all operations atomically!"); |
| } |
| break; // Stop acquiring more rows for this batch |
| } else { |
| if (rowLock != prevRowLock) { |
| // It is a different row now, add this to the acquiredRowLocks and |
| // set prevRowLock to the new returned rowLock |
| acquiredRowLocks.add(rowLock); |
| prevRowLock = rowLock; |
| } |
| } |
| |
| readyToWriteCount++; |
| } |
| return createMiniBatch(lastIndexExclusive, readyToWriteCount); |
| } |
| |
| protected MiniBatchOperationInProgress<Mutation> createMiniBatch(final int lastIndexExclusive, |
| final int readyToWriteCount) { |
| return new MiniBatchOperationInProgress<>(getMutationsForCoprocs(), retCodeDetails, |
| walEditsFromCoprocessors, nextIndexToProcess, lastIndexExclusive, readyToWriteCount); |
| } |
| |
| /** |
| * Builds separate WALEdit per nonce by applying input mutations. If WALEdits from CP are |
| * present, they are merged to result WALEdit. |
| */ |
| public List<Pair<NonceKey, WALEdit>> |
| buildWALEdits(final MiniBatchOperationInProgress<Mutation> miniBatchOp) throws IOException { |
| List<Pair<NonceKey, WALEdit>> walEdits = new ArrayList<>(); |
| |
| visitBatchOperations(true, nextIndexToProcess + miniBatchOp.size(), new Visitor() { |
| private Pair<NonceKey, WALEdit> curWALEditForNonce; |
| |
| @Override |
| public boolean visit(int index) throws IOException { |
| Mutation m = getMutation(index); |
| // we use durability of the original mutation for the mutation passed by CP. |
| if (region.getEffectiveDurability(m.getDurability()) == Durability.SKIP_WAL) { |
| region.recordMutationWithoutWal(m.getFamilyCellMap()); |
| return true; |
| } |
| |
| // the batch may contain multiple nonce keys (replay case). If so, write WALEdit for each. |
| // Given how nonce keys are originally written, these should be contiguous. |
| // They don't have to be, it will still work, just write more WALEdits than needed. |
| long nonceGroup = getNonceGroup(index); |
| long nonce = getNonce(index); |
| if ( |
| curWALEditForNonce == null |
| || curWALEditForNonce.getFirst().getNonceGroup() != nonceGroup |
| || curWALEditForNonce.getFirst().getNonce() != nonce |
| ) { |
| curWALEditForNonce = new Pair<>(new NonceKey(nonceGroup, nonce), |
| new WALEdit(miniBatchOp.getCellCount(), isInReplay())); |
| walEdits.add(curWALEditForNonce); |
| } |
| WALEdit walEdit = curWALEditForNonce.getSecond(); |
| |
| // Add WAL edits from CPs. |
| WALEdit fromCP = walEditsFromCoprocessors[index]; |
| if (fromCP != null) { |
| for (Cell cell : fromCP.getCells()) { |
| walEdit.add(cell); |
| } |
| } |
| walEdit.add((Map) familyCellMaps[index]); |
| |
| return true; |
| } |
| }); |
| return walEdits; |
| } |
| |
| /** |
| * This method completes mini-batch operations by calling postBatchMutate() CP hook (if |
| * required) and completing mvcc. |
| */ |
| public void completeMiniBatchOperations( |
| final MiniBatchOperationInProgress<Mutation> miniBatchOp, final WriteEntry writeEntry) |
| throws IOException { |
| if (writeEntry != null) { |
| region.mvcc.completeAndWait(writeEntry); |
| } |
| } |
| |
| public void doPostOpCleanupForMiniBatch( |
| final MiniBatchOperationInProgress<Mutation> miniBatchOp, final WALEdit walEdit, |
| boolean success) throws IOException { |
| doFinishHotnessProtector(miniBatchOp); |
| } |
| |
| private void |
| doFinishHotnessProtector(final MiniBatchOperationInProgress<Mutation> miniBatchOp) { |
| // check and return if the protector is not enabled |
| if (!region.storeHotnessProtector.isEnable()) { |
| return; |
| } |
| // miniBatchOp is null, if and only if lockRowsAndBuildMiniBatch throwing exception. |
| // This case was handled. |
| if (miniBatchOp == null) { |
| return; |
| } |
| |
| final int finalLastIndexExclusive = miniBatchOp.getLastIndexExclusive(); |
| |
| for (int i = nextIndexToProcess; i < finalLastIndexExclusive; i++) { |
| switch (retCodeDetails[i].getOperationStatusCode()) { |
| case SUCCESS: |
| case FAILURE: |
| region.storeHotnessProtector.finish(getMutation(i).getFamilyCellMap()); |
| break; |
| default: |
| // do nothing |
| // We won't start the protector for NOT_RUN/BAD_FAMILY/SANITY_CHECK_FAILURE and the |
| // STORE_TOO_BUSY case is handled in StoreHotnessProtector#start |
| break; |
| } |
| } |
| } |
| |
| /** |
| * Atomically apply the given map of family->edits to the memstore. This handles the consistency |
| * control on its own, but the caller should already have locked updatesLock.readLock(). This |
| * also does <b>not</b> check the families for validity. |
| * @param familyMap Map of Cells by family |
| */ |
| protected void applyFamilyMapToMemStore(Map<byte[], List<ExtendedCell>> familyMap, |
| MemStoreSizing memstoreAccounting) { |
| for (Map.Entry<byte[], List<ExtendedCell>> e : familyMap.entrySet()) { |
| byte[] family = e.getKey(); |
| List<ExtendedCell> cells = e.getValue(); |
| assert cells instanceof RandomAccess; |
| region.applyToMemStore(region.getStore(family), cells, false, memstoreAccounting); |
| } |
| } |
| } |
| |
| /** |
| * Batch of mutation operations. Base class is shared with {@link ReplayBatchOperation} as most of |
| * the logic is same. |
| */ |
| private static class MutationBatchOperation extends BatchOperation<Mutation> { |
| |
| // For nonce operations |
| private long nonceGroup; |
| private long nonce; |
| protected boolean canProceed; |
| |
/**
 * Creates a batch over the given mutations.
 * @param atomic     value stored in the batch's {@code atomic} flag
 * @param nonceGroup nonce group reported for every operation of the batch
 * @param nonce      nonce reported for every operation of the batch
 */
public MutationBatchOperation(final HRegion region, Mutation[] operations, boolean atomic,
  long nonceGroup, long nonce) {
  super(region, operations);
  this.nonceGroup = nonceGroup;
  this.nonce = nonce;
  this.atomic = atomic;
}
| |
| @Override |
| public Mutation getMutation(int index) { |
| return this.operations[index]; |
| } |
| |
| @Override |
| public long getNonceGroup(int index) { |
| return nonceGroup; |
| } |
| |
| @Override |
| public long getNonce(int index) { |
| return nonce; |
| } |
| |
| @Override |
| public Mutation[] getMutationsForCoprocs() { |
| return this.operations; |
| } |
| |
| @Override |
| public boolean isInReplay() { |
| return false; |
| } |
| |
| @Override |
| public long getOrigLogSeqNum() { |
| return SequenceId.NO_SEQUENCE_ID; |
| } |
| |
| @Override |
| public void startRegionOperation() throws IOException { |
| region.startRegionOperation(Operation.BATCH_MUTATE); |
| } |
| |
| @Override |
| public void closeRegionOperation() throws IOException { |
| region.closeRegionOperation(Operation.BATCH_MUTATE); |
| } |
| |
| @Override |
| public void checkAndPreparePut(Put p) throws IOException { |
| region.checkFamilies(p.getFamilyCellMap().keySet()); |
| } |
| |
| @Override |
| public void checkAndPrepare() throws IOException { |
| // index 0: puts, index 1: deletes, index 2: increments, index 3: append |
| final int[] metrics = { 0, 0, 0, 0 }; |
| |
| visitBatchOperations(true, this.size(), new Visitor() { |
| private long now = EnvironmentEdgeManager.currentTime(); |
| private WALEdit walEdit; |
| |
| @Override |
| public boolean visit(int index) throws IOException { |
| // Run coprocessor pre hook outside of locks to avoid deadlock |
| if (region.coprocessorHost != null) { |
| if (walEdit == null) { |
| walEdit = new WALEdit(); |
| } |
| callPreMutateCPHook(index, walEdit, metrics); |
| if (!walEdit.isEmpty()) { |
| walEditsFromCoprocessors[index] = walEdit; |
| walEdit = null; |
| } |
| } |
| if (isOperationPending(index)) { |
| // TODO: Currently validation is done with current time before acquiring locks and |
| // updates are done with different timestamps after acquiring locks. This behavior is |
| // inherited from the code prior to this change. Can this be changed? |
| checkAndPrepareMutation(index, now); |
| } |
| return true; |
| } |
| }); |
| |
| // FIXME: we may update metrics twice! here for all operations bypassed by CP and later in |
| // normal processing. |
| // Update metrics in same way as it is done when we go the normal processing route (we now |
| // update general metrics though a Coprocessor did the work). |
| if (region.metricsRegion != null) { |
| if (metrics[0] > 0) { |
| // There were some Puts in the batch. |
| region.metricsRegion.updatePut(); |
| } |
| if (metrics[1] > 0) { |
| // There were some Deletes in the batch. |
| region.metricsRegion.updateDelete(); |
| } |
| if (metrics[2] > 0) { |
| // There were some Increment in the batch. |
| region.metricsRegion.updateIncrement(); |
| } |
| if (metrics[3] > 0) { |
| // There were some Append in the batch. |
| region.metricsRegion.updateAppend(); |
| } |
| } |
| } |
| |
| @Override |
| public void prepareMiniBatchOperations(MiniBatchOperationInProgress<Mutation> miniBatchOp, |
| long timestamp, final List<RowLock> acquiredRowLocks) throws IOException { |
| // For nonce operations |
| canProceed = startNonceOperation(); |
| |
| visitBatchOperations(true, miniBatchOp.getLastIndexExclusive(), (int index) -> { |
| Mutation mutation = getMutation(index); |
| if (mutation instanceof Put) { |
| HRegion.updateCellTimestamps(familyCellMaps[index].values(), Bytes.toBytes(timestamp)); |
| miniBatchOp.incrementNumOfPuts(); |
| } else if (mutation instanceof Delete) { |
| region.prepareDeleteTimestamps(mutation, familyCellMaps[index], Bytes.toBytes(timestamp)); |
| miniBatchOp.incrementNumOfDeletes(); |
| } else if (mutation instanceof Increment || mutation instanceof Append) { |
| boolean returnResults; |
| if (mutation instanceof Increment) { |
| returnResults = ((Increment) mutation).isReturnResults(); |
| } else { |
| returnResults = ((Append) mutation).isReturnResults(); |
| } |
| |
| // For nonce operations |
| if (!canProceed) { |
| Result result; |
| if (returnResults) { |
| // convert duplicate increment/append to get |
| List<Cell> results = region.get(toGet(mutation), false, nonceGroup, nonce); |
| result = Result.create(results); |
| } else { |
| result = Result.EMPTY_RESULT; |
| } |
| retCodeDetails[index] = new OperationStatus(OperationStatusCode.SUCCESS, result); |
| return true; |
| } |
| |
| Result result = null; |
| if (region.coprocessorHost != null) { |
| if (mutation instanceof Increment) { |
| result = region.coprocessorHost.preIncrementAfterRowLock((Increment) mutation); |
| } else { |
| result = region.coprocessorHost.preAppendAfterRowLock((Append) mutation); |
| } |
| } |
| if (result != null) { |
| retCodeDetails[index] = new OperationStatus(OperationStatusCode.SUCCESS, |
| returnResults ? result : Result.EMPTY_RESULT); |
| return true; |
| } |
| |
| List<ExtendedCell> results = returnResults ? new ArrayList<>(mutation.size()) : null; |
| familyCellMaps[index] = reckonDeltas(mutation, results, timestamp); |
| this.results[index] = results != null ? Result.create(results) : Result.EMPTY_RESULT; |
| |
| if (mutation instanceof Increment) { |
| miniBatchOp.incrementNumOfIncrements(); |
| } else { |
| miniBatchOp.incrementNumOfAppends(); |
| } |
| } |
| region.rewriteCellTags(familyCellMaps[index], mutation); |
| |
| // update cell count |
| if (region.getEffectiveDurability(mutation.getDurability()) != Durability.SKIP_WAL) { |
| for (List<Cell> cells : mutation.getFamilyCellMap().values()) { |
| miniBatchOp.addCellCount(cells.size()); |
| } |
| } |
| |
| WALEdit fromCP = walEditsFromCoprocessors[index]; |
| if (fromCP != null) { |
| miniBatchOp.addCellCount(fromCP.size()); |
| } |
| return true; |
| }); |
| |
| if (region.coprocessorHost != null) { |
| // calling the pre CP hook for batch mutation |
| region.coprocessorHost.preBatchMutate(miniBatchOp); |
| checkAndMergeCPMutations(miniBatchOp, acquiredRowLocks, timestamp); |
| } |
| } |
| |
| /** |
| * Starts the nonce operation for a mutation, if needed. |
| * @return whether to proceed this mutation. |
| */ |
| private boolean startNonceOperation() throws IOException { |
| if ( |
| region.rsServices == null || region.rsServices.getNonceManager() == null |
| || nonce == HConstants.NO_NONCE |
| ) { |
| return true; |
| } |
| boolean canProceed; |
| try { |
| canProceed = |
| region.rsServices.getNonceManager().startOperation(nonceGroup, nonce, region.rsServices); |
| } catch (InterruptedException ex) { |
| throw new InterruptedIOException("Nonce start operation interrupted"); |
| } |
| return canProceed; |
| } |
| |
| /** |
| * Ends nonce operation for a mutation, if needed. |
| * @param success Whether the operation for this nonce has succeeded. |
| */ |
| private void endNonceOperation(boolean success) { |
| if ( |
| region.rsServices != null && region.rsServices.getNonceManager() != null |
| && nonce != HConstants.NO_NONCE |
| ) { |
| region.rsServices.getNonceManager().endOperation(nonceGroup, nonce, success); |
| } |
| } |
| |
| private static Get toGet(final Mutation mutation) throws IOException { |
| assert mutation instanceof Increment || mutation instanceof Append; |
| Get get = new Get(mutation.getRow()); |
| CellScanner cellScanner = mutation.cellScanner(); |
| while (cellScanner.advance()) { |
| Cell cell = cellScanner.current(); |
| get.addColumn(CellUtil.cloneFamily(cell), CellUtil.cloneQualifier(cell)); |
| } |
| if (mutation instanceof Increment) { |
| // Increment |
| Increment increment = (Increment) mutation; |
| get.setTimeRange(increment.getTimeRange().getMin(), increment.getTimeRange().getMax()); |
| } else { |
| // Append |
| Append append = (Append) mutation; |
| get.setTimeRange(append.getTimeRange().getMin(), append.getTimeRange().getMax()); |
| } |
| for (Entry<String, byte[]> entry : mutation.getAttributesMap().entrySet()) { |
| get.setAttribute(entry.getKey(), entry.getValue()); |
| } |
| return get; |
| } |
| |
| private Map<byte[], List<ExtendedCell>> reckonDeltas(Mutation mutation, |
| List<ExtendedCell> results, long now) throws IOException { |
| assert mutation instanceof Increment || mutation instanceof Append; |
| Map<byte[], List<ExtendedCell>> ret = new TreeMap<>(Bytes.BYTES_COMPARATOR); |
| // Process a Store/family at a time. |
| for (Map.Entry<byte[], List<Cell>> entry : mutation.getFamilyCellMap().entrySet()) { |
| final byte[] columnFamilyName = entry.getKey(); |
| List<ExtendedCell> deltas = (List) entry.getValue(); |
| // Reckon for the Store what to apply to WAL and MemStore. |
| List<ExtendedCell> toApply = |
| reckonDeltasByStore(region.stores.get(columnFamilyName), mutation, now, deltas, results); |
| if (!toApply.isEmpty()) { |
| for (ExtendedCell cell : toApply) { |
| HStore store = region.getStore(cell); |
| if (store == null) { |
| region.checkFamily(CellUtil.cloneFamily(cell)); |
| } else { |
| ret.computeIfAbsent(store.getColumnFamilyDescriptor().getName(), |
| key -> new ArrayList<>()).add(cell); |
| } |
| } |
| } |
| } |
| return ret; |
| } |
| |
| /** |
| * Reckon the Cells to apply to WAL, memstore, and to return to the Client in passed column |
| * family/Store. Does Get of current value and then adds passed in deltas for this Store |
| * returning the result. |
| * @param mutation The encompassing Mutation object |
| * @param deltas Changes to apply to this Store; either increment amount or data to append |
| * @param results In here we accumulate all the Cells we are to return to the client. If null, |
| * client doesn't want results returned. |
| * @return Resulting Cells after <code>deltas</code> have been applied to current values. Side |
| * effect is our filling out of the <code>results</code> List. |
| */ |
| private List<ExtendedCell> reckonDeltasByStore(HStore store, Mutation mutation, long now, |
| List<ExtendedCell> deltas, List<ExtendedCell> results) throws IOException { |
| assert mutation instanceof Increment || mutation instanceof Append; |
| byte[] columnFamily = store.getColumnFamilyDescriptor().getName(); |
| List<Pair<ExtendedCell, ExtendedCell>> cellPairs = new ArrayList<>(deltas.size()); |
| |
| // Sort the cells so that they match the order that they appear in the Get results. |
| // Otherwise, we won't be able to find the existing values if the cells are not specified |
| // in order by the client since cells are in an array list. |
| deltas.sort(store.getComparator()); |
| |
| // Get previous values for all columns in this family. |
| Get get = new Get(mutation.getRow()); |
| for (ExtendedCell cell : deltas) { |
| get.addColumn(columnFamily, CellUtil.cloneQualifier(cell)); |
| } |
| TimeRange tr; |
| if (mutation instanceof Increment) { |
| tr = ((Increment) mutation).getTimeRange(); |
| } else { |
| tr = ((Append) mutation).getTimeRange(); |
| } |
| |
| if (tr != null) { |
| get.setTimeRange(tr.getMin(), tr.getMax()); |
| } |
| |
| try (RegionScanner scanner = region.getScanner(new Scan(get))) { |
| // NOTE: Please don't use HRegion.get() instead, |
| // because it will copy cells to heap. See HBASE-26036 |
| List<ExtendedCell> currentValues = new ArrayList<>(); |
| scanner.next((List) currentValues); |
| // Iterate the input columns and update existing values if they were found, otherwise |
| // add new column initialized to the delta amount |
| int currentValuesIndex = 0; |
| for (int i = 0; i < deltas.size(); i++) { |
| ExtendedCell delta = deltas.get(i); |
| ExtendedCell currentValue = null; |
| if ( |
| currentValuesIndex < currentValues.size() |
| && CellUtil.matchingQualifier(currentValues.get(currentValuesIndex), delta) |
| ) { |
| currentValue = currentValues.get(currentValuesIndex); |
| if (i < (deltas.size() - 1) && !CellUtil.matchingQualifier(delta, deltas.get(i + 1))) { |
| currentValuesIndex++; |
| } |
| } |
| // Switch on whether this an increment or an append building the new Cell to apply. |
| ExtendedCell newCell; |
| if (mutation instanceof Increment) { |
| long deltaAmount = getLongValue(delta); |
| final long newValue = |
| currentValue == null ? deltaAmount : getLongValue(currentValue) + deltaAmount; |
| newCell = reckonDelta(delta, currentValue, columnFamily, now, mutation, |
| (oldCell) -> Bytes.toBytes(newValue)); |
| } else { |
| newCell = reckonDelta(delta, currentValue, columnFamily, now, mutation, |
| (oldCell) -> ByteBuffer |
| .wrap(new byte[delta.getValueLength() + oldCell.getValueLength()]) |
| .put(oldCell.getValueArray(), oldCell.getValueOffset(), oldCell.getValueLength()) |
| .put(delta.getValueArray(), delta.getValueOffset(), delta.getValueLength()) |
| .array()); |
| } |
| if (region.maxCellSize > 0) { |
| int newCellSize = PrivateCellUtil.estimatedSerializedSizeOf(newCell); |
| if (newCellSize > region.maxCellSize) { |
| String msg = "Cell with size " + newCellSize + " exceeds limit of " |
| + region.maxCellSize + " bytes in region " + this; |
| LOG.debug(msg); |
| throw new DoNotRetryIOException(msg); |
| } |
| } |
| cellPairs.add(new Pair<>(currentValue, newCell)); |
| // Add to results to get returned to the Client. If null, cilent does not want results. |
| if (results != null) { |
| results.add(newCell); |
| } |
| } |
| // Give coprocessors a chance to update the new cells before apply to WAL or memstore |
| if (region.coprocessorHost != null) { |
| // Here the operation must be increment or append. |
| cellPairs = mutation instanceof Increment |
| ? region.coprocessorHost.postIncrementBeforeWAL(mutation, (List) cellPairs) |
| : region.coprocessorHost.postAppendBeforeWAL(mutation, (List) cellPairs); |
| } |
| } |
| return cellPairs.stream().map(Pair::getSecond).collect(Collectors.toList()); |
| } |
| |
| private static ExtendedCell reckonDelta(final Cell delta, final Cell currentCell, |
| final byte[] columnFamily, final long now, Mutation mutation, Function<Cell, byte[]> supplier) |
| throws IOException { |
| // Forward any tags found on the delta. |
| List<Tag> tags = TagUtil.carryForwardTags(delta); |
| if (currentCell != null) { |
| tags = TagUtil.carryForwardTags(tags, currentCell); |
| tags = TagUtil.carryForwardTTLTag(tags, mutation.getTTL()); |
| byte[] newValue = supplier.apply(currentCell); |
| return ExtendedCellBuilderFactory.create(CellBuilderType.SHALLOW_COPY) |
| .setRow(mutation.getRow(), 0, mutation.getRow().length) |
| .setFamily(columnFamily, 0, columnFamily.length) |
| // copy the qualifier if the cell is located in shared memory. |
| .setQualifier(CellUtil.cloneQualifier(delta)) |
| .setTimestamp(Math.max(currentCell.getTimestamp() + 1, now)) |
| .setType(KeyValue.Type.Put.getCode()).setValue(newValue, 0, newValue.length) |
| .setTags(TagUtil.fromList(tags)).build(); |
| } else { |
| tags = TagUtil.carryForwardTTLTag(tags, mutation.getTTL()); |
| PrivateCellUtil.updateLatestStamp(delta, now); |
| assert delta instanceof ExtendedCell; |
| ExtendedCell deltaCell = (ExtendedCell) delta; |
| return CollectionUtils.isEmpty(tags) |
| ? deltaCell |
| : PrivateCellUtil.createCell(deltaCell, tags); |
| } |
| } |
| |
| /** Returns Get the long out of the passed in Cell */ |
| private static long getLongValue(final Cell cell) throws DoNotRetryIOException { |
| int len = cell.getValueLength(); |
| if (len != Bytes.SIZEOF_LONG) { |
| // throw DoNotRetryIOException instead of IllegalArgumentException |
| throw new DoNotRetryIOException("Field is not a long, it's " + len + " bytes wide"); |
| } |
| return PrivateCellUtil.getValueAsLong(cell); |
| } |
| |
| @Override |
| public List<Pair<NonceKey, WALEdit>> |
| buildWALEdits(final MiniBatchOperationInProgress<Mutation> miniBatchOp) throws IOException { |
| List<Pair<NonceKey, WALEdit>> walEdits = super.buildWALEdits(miniBatchOp); |
| // for MutationBatchOperation, more than one nonce is not allowed |
| if (walEdits.size() > 1) { |
| throw new IOException("Found multiple nonce keys per batch!"); |
| } |
| return walEdits; |
| } |
| |
| @Override |
| public WriteEntry writeMiniBatchOperationsToMemStore( |
| final MiniBatchOperationInProgress<Mutation> miniBatchOp, @Nullable WriteEntry writeEntry) |
| throws IOException { |
| if (writeEntry == null) { |
| writeEntry = region.mvcc.begin(); |
| } |
| super.writeMiniBatchOperationsToMemStore(miniBatchOp, writeEntry.getWriteNumber()); |
| return writeEntry; |
| } |
| |
| @Override |
| public void completeMiniBatchOperations( |
| final MiniBatchOperationInProgress<Mutation> miniBatchOp, final WriteEntry writeEntry) |
| throws IOException { |
| // TODO: can it be done after completing mvcc? |
| // calling the post CP hook for batch mutation |
| if (region.coprocessorHost != null) { |
| region.coprocessorHost.postBatchMutate(miniBatchOp); |
| } |
| super.completeMiniBatchOperations(miniBatchOp, writeEntry); |
| |
| if (nonce != HConstants.NO_NONCE) { |
| if (region.rsServices != null && region.rsServices.getNonceManager() != null) { |
| region.rsServices.getNonceManager().addMvccToOperationContext(nonceGroup, nonce, |
| writeEntry.getWriteNumber()); |
| } |
| } |
| } |
| |
| @Override |
| public void doPostOpCleanupForMiniBatch(MiniBatchOperationInProgress<Mutation> miniBatchOp, |
| final WALEdit walEdit, boolean success) throws IOException { |
| |
| super.doPostOpCleanupForMiniBatch(miniBatchOp, walEdit, success); |
| if (miniBatchOp != null) { |
| // synced so that the coprocessor contract is adhered to. |
| if (region.coprocessorHost != null) { |
| visitBatchOperations(false, miniBatchOp.getLastIndexExclusive(), (int i) -> { |
| // only for successful puts/deletes/increments/appends |
| if (retCodeDetails[i].getOperationStatusCode() == OperationStatusCode.SUCCESS) { |
| Mutation m = getMutation(i); |
| if (m instanceof Put) { |
| region.coprocessorHost.postPut((Put) m, walEdit); |
| } else if (m instanceof Delete) { |
| region.coprocessorHost.postDelete((Delete) m, walEdit); |
| } else if (m instanceof Increment) { |
| Result result = |
| region.getCoprocessorHost().postIncrement((Increment) m, results[i], walEdit); |
| if (result != results[i]) { |
| retCodeDetails[i] = |
| new OperationStatus(retCodeDetails[i].getOperationStatusCode(), result); |
| } |
| } else if (m instanceof Append) { |
| Result result = |
| region.getCoprocessorHost().postAppend((Append) m, results[i], walEdit); |
| if (result != results[i]) { |
| retCodeDetails[i] = |
| new OperationStatus(retCodeDetails[i].getOperationStatusCode(), result); |
| } |
| } |
| } |
| return true; |
| }); |
| } |
| |
| // For nonce operations |
| if (canProceed && nonce != HConstants.NO_NONCE) { |
| boolean[] areAllIncrementsAndAppendsSuccessful = new boolean[] { true }; |
| visitBatchOperations(false, miniBatchOp.getLastIndexExclusive(), (int i) -> { |
| Mutation mutation = getMutation(i); |
| if (mutation instanceof Increment || mutation instanceof Append) { |
| if (retCodeDetails[i].getOperationStatusCode() != OperationStatusCode.SUCCESS) { |
| areAllIncrementsAndAppendsSuccessful[0] = false; |
| return false; |
| } |
| } |
| return true; |
| }); |
| endNonceOperation(areAllIncrementsAndAppendsSuccessful[0]); |
| } |
| |
| // See if the column families were consistent through the whole thing. |
| // if they were then keep them. If they were not then pass a null. |
| // null will be treated as unknown. |
| // Total time taken might be involving Puts, Deletes, Increments and Appends. |
| // Split the time for puts and deletes based on the total number of Puts, Deletes, |
| // Increments and Appends. |
| if (region.metricsRegion != null) { |
| if (miniBatchOp.getNumOfPuts() > 0) { |
| // There were some Puts in the batch. |
| region.metricsRegion.updatePut(); |
| } |
| if (miniBatchOp.getNumOfDeletes() > 0) { |
| // There were some Deletes in the batch. |
| region.metricsRegion.updateDelete(); |
| } |
| if (miniBatchOp.getNumOfIncrements() > 0) { |
| // There were some Increments in the batch. |
| region.metricsRegion.updateIncrement(); |
| } |
| if (miniBatchOp.getNumOfAppends() > 0) { |
| // There were some Appends in the batch. |
| region.metricsRegion.updateAppend(); |
| } |
| } |
| } |
| |
| if (region.coprocessorHost != null) { |
| // call the coprocessor hook to do any finalization steps after the put is done |
| region.coprocessorHost.postBatchMutateIndispensably( |
| miniBatchOp != null ? miniBatchOp : createMiniBatch(size(), 0), success); |
| } |
| } |
| |
| /** |
| * Runs prePut/preDelete/preIncrement/preAppend coprocessor hook for input mutation in a batch |
| * @param metrics Array of 2 ints. index 0: count of puts, index 1: count of deletes, index 2: |
| * count of increments and 3: count of appends |
| */ |
| private void callPreMutateCPHook(int index, final WALEdit walEdit, final int[] metrics) |
| throws IOException { |
| Mutation m = getMutation(index); |
| if (m instanceof Put) { |
| if (region.coprocessorHost.prePut((Put) m, walEdit)) { |
| // pre hook says skip this Put |
| // mark as success and skip in doMiniBatchMutation |
| metrics[0]++; |
| retCodeDetails[index] = OperationStatus.SUCCESS; |
| } |
| } else if (m instanceof Delete) { |
| Delete curDel = (Delete) m; |
| if (curDel.getFamilyCellMap().isEmpty()) { |
| // handle deleting a row case |
| // TODO: prepareDelete() has been called twice, before and after preDelete() CP hook. |
| // Can this be avoided? |
| region.prepareDelete(curDel); |
| } |
| if (region.coprocessorHost.preDelete(curDel, walEdit)) { |
| // pre hook says skip this Delete |
| // mark as success and skip in doMiniBatchMutation |
| metrics[1]++; |
| retCodeDetails[index] = OperationStatus.SUCCESS; |
| } |
| } else if (m instanceof Increment) { |
| Increment increment = (Increment) m; |
| Result result = region.coprocessorHost.preIncrement(increment, walEdit); |
| if (result != null) { |
| // pre hook says skip this Increment |
| // mark as success and skip in doMiniBatchMutation |
| metrics[2]++; |
| retCodeDetails[index] = new OperationStatus(OperationStatusCode.SUCCESS, result); |
| } |
| } else if (m instanceof Append) { |
| Append append = (Append) m; |
| Result result = region.coprocessorHost.preAppend(append, walEdit); |
| if (result != null) { |
| // pre hook says skip this Append |
| // mark as success and skip in doMiniBatchMutation |
| metrics[3]++; |
| retCodeDetails[index] = new OperationStatus(OperationStatusCode.SUCCESS, result); |
| } |
| } else { |
| String msg = "Put/Delete/Increment/Append mutations only supported in a batch"; |
| retCodeDetails[index] = new OperationStatus(OperationStatusCode.FAILURE, msg); |
| if (isAtomic()) { // fail, atomic means all or none |
| throw new IOException(msg); |
| } |
| } |
| } |
| |
| // TODO Support Increment/Append operations |
| private void checkAndMergeCPMutations(final MiniBatchOperationInProgress<Mutation> miniBatchOp, |
| final List<RowLock> acquiredRowLocks, final long timestamp) throws IOException { |
| visitBatchOperations(true, nextIndexToProcess + miniBatchOp.size(), (int i) -> { |
| // we pass (i - firstIndex) below since the call expects a relative index |
| Mutation[] cpMutations = miniBatchOp.getOperationsFromCoprocessors(i - nextIndexToProcess); |
| if (cpMutations == null) { |
| return true; |
| } |
| // Else Coprocessor added more Mutations corresponding to the Mutation at this index. |
| Mutation mutation = getMutation(i); |
| for (Mutation cpMutation : cpMutations) { |
| this.checkAndPrepareMutation(cpMutation, timestamp); |
| |
| // Acquire row locks. If not, the whole batch will fail. |
| acquiredRowLocks.add(region.getRowLock(cpMutation.getRow(), true, null)); |
| |
| // Returned mutations from coprocessor correspond to the Mutation at index i. We can |
| // directly add the cells from those mutations to the familyMaps of this mutation. |
| Map<byte[], List<ExtendedCell>> cpFamilyMap = (Map) cpMutation.getFamilyCellMap(); |
| region.rewriteCellTags(cpFamilyMap, mutation); |
| // will get added to the memStore later |
| mergeFamilyMaps(familyCellMaps[i], cpFamilyMap); |
| |
| // The durability of returned mutation is replaced by the corresponding mutation. |
| // If the corresponding mutation contains the SKIP_WAL, we shouldn't count the |
| // cells of returned mutation. |
| if (region.getEffectiveDurability(mutation.getDurability()) != Durability.SKIP_WAL) { |
| for (List<ExtendedCell> cells : cpFamilyMap.values()) { |
| miniBatchOp.addCellCount(cells.size()); |
| } |
| } |
| } |
| return true; |
| }); |
| } |
| |
| private void mergeFamilyMaps(Map<byte[], List<ExtendedCell>> familyMap, |
| Map<byte[], List<ExtendedCell>> toBeMerged) { |
| for (Map.Entry<byte[], List<ExtendedCell>> entry : toBeMerged.entrySet()) { |
| List<ExtendedCell> cells = familyMap.get(entry.getKey()); |
| if (cells == null) { |
| familyMap.put(entry.getKey(), entry.getValue()); |
| } else { |
| cells.addAll(entry.getValue()); |
| } |
| } |
| } |
| } |
| |
| /** |
| * Batch of mutations for replay. Base class is shared with {@link MutationBatchOperation} as most |
| * of the logic is same. |
| */ |
| private static final class ReplayBatchOperation extends BatchOperation<MutationReplay> { |
| |
| private long origLogSeqNum = 0; |
| |
| public ReplayBatchOperation(final HRegion region, MutationReplay[] operations, |
| long origLogSeqNum) { |
| super(region, operations); |
| this.origLogSeqNum = origLogSeqNum; |
| } |
| |
| @Override |
| public Mutation getMutation(int index) { |
| return this.operations[index].mutation; |
| } |
| |
| @Override |
| public long getNonceGroup(int index) { |
| return this.operations[index].nonceGroup; |
| } |
| |
| @Override |
| public long getNonce(int index) { |
| return this.operations[index].nonce; |
| } |
| |
| @Override |
| public Mutation[] getMutationsForCoprocs() { |
| return null; |
| } |
| |
| @Override |
| public boolean isInReplay() { |
| return true; |
| } |
| |
| @Override |
| public long getOrigLogSeqNum() { |
| return this.origLogSeqNum; |
| } |
| |
| @Override |
| public void startRegionOperation() throws IOException { |
| region.startRegionOperation(Operation.REPLAY_BATCH_MUTATE); |
| } |
| |
| @Override |
| public void closeRegionOperation() throws IOException { |
| region.closeRegionOperation(Operation.REPLAY_BATCH_MUTATE); |
| } |
| |
| /** |
| * During replay, there could exist column families which are removed between region server |
| * failure and replay |
| */ |
| @Override |
| protected void checkAndPreparePut(Put p) throws IOException { |
| Map<byte[], List<Cell>> familyCellMap = p.getFamilyCellMap(); |
| List<byte[]> nonExistentList = null; |
| for (byte[] family : familyCellMap.keySet()) { |
| if (!region.htableDescriptor.hasColumnFamily(family)) { |
| if (nonExistentList == null) { |
| nonExistentList = new ArrayList<>(); |
| } |
| nonExistentList.add(family); |
| } |
| } |
| if (nonExistentList != null) { |
| for (byte[] family : nonExistentList) { |
| // Perhaps schema was changed between crash and replay |
| LOG.info("No family for {} omit from reply in region {}.", Bytes.toString(family), this); |
| familyCellMap.remove(family); |
| } |
| } |
| } |
| |
| @Override |
| public void checkAndPrepare() throws IOException { |
| long now = EnvironmentEdgeManager.currentTime(); |
| visitBatchOperations(true, this.size(), (int index) -> { |
| checkAndPrepareMutation(index, now); |
| return true; |
| }); |
| } |
| |
| @Override |
| public void prepareMiniBatchOperations(MiniBatchOperationInProgress<Mutation> miniBatchOp, |
| long timestamp, final List<RowLock> acquiredRowLocks) throws IOException { |
| visitBatchOperations(true, miniBatchOp.getLastIndexExclusive(), (int index) -> { |
| // update cell count |
| for (List<Cell> cells : getMutation(index).getFamilyCellMap().values()) { |
| miniBatchOp.addCellCount(cells.size()); |
| } |
| return true; |
| }); |
| } |
| |
| @Override |
| public WriteEntry writeMiniBatchOperationsToMemStore( |
| final MiniBatchOperationInProgress<Mutation> miniBatchOp, final WriteEntry writeEntry) |
| throws IOException { |
| super.writeMiniBatchOperationsToMemStore(miniBatchOp, getOrigLogSeqNum()); |
| return writeEntry; |
| } |
| |
| @Override |
| public void completeMiniBatchOperations( |
| final MiniBatchOperationInProgress<Mutation> miniBatchOp, final WriteEntry writeEntry) |
| throws IOException { |
| super.completeMiniBatchOperations(miniBatchOp, writeEntry); |
| region.mvcc.advanceTo(getOrigLogSeqNum()); |
| } |
| } |
| |
| public OperationStatus[] batchMutate(Mutation[] mutations, boolean atomic, long nonceGroup, |
| long nonce) throws IOException { |
| // As it stands, this is used for 3 things |
| // * batchMutate with single mutation - put/delete/increment/append, separate or from |
| // checkAndMutate. |
| // * coprocessor calls (see ex. BulkDeleteEndpoint). |
| // So nonces are not really ever used by HBase. They could be by coprocs, and checkAnd... |
| return batchMutate(new MutationBatchOperation(this, mutations, atomic, nonceGroup, nonce)); |
| } |
| |
| @Override |
| public OperationStatus[] batchMutate(Mutation[] mutations) throws IOException { |
| // If the mutations has any Increment/Append operations, we need to do batchMutate atomically |
| boolean atomic = |
| Arrays.stream(mutations).anyMatch(m -> m instanceof Increment || m instanceof Append); |
| return batchMutate(mutations, atomic); |
| } |
| |
| OperationStatus[] batchMutate(Mutation[] mutations, boolean atomic) throws IOException { |
| return TraceUtil.trace( |
| () -> batchMutate(mutations, atomic, HConstants.NO_NONCE, HConstants.NO_NONCE), |
| () -> createRegionSpan("Region.batchMutate")); |
| } |
| |
| public OperationStatus[] batchReplay(MutationReplay[] mutations, long replaySeqId) |
| throws IOException { |
| if ( |
| !RegionReplicaUtil.isDefaultReplica(getRegionInfo()) |
| && replaySeqId < lastReplayedOpenRegionSeqId |
| ) { |
| // if it is a secondary replica we should ignore these entries silently |
| // since they are coming out of order |
| if (LOG.isTraceEnabled()) { |
| LOG.trace(getRegionInfo().getEncodedName() + " : " + "Skipping " + mutations.length |
| + " mutations with replaySeqId=" + replaySeqId |
| + " which is < than lastReplayedOpenRegionSeqId=" + lastReplayedOpenRegionSeqId); |
| for (MutationReplay mut : mutations) { |
| LOG.trace(getRegionInfo().getEncodedName() + " : Skipping : " + mut.mutation); |
| } |
| } |
| |
| OperationStatus[] statuses = new OperationStatus[mutations.length]; |
| for (int i = 0; i < statuses.length; i++) { |
| statuses[i] = OperationStatus.SUCCESS; |
| } |
| return statuses; |
| } |
| return batchMutate(new ReplayBatchOperation(this, mutations, replaySeqId)); |
| } |
| |
| /** |
| * Perform a batch of mutations. |
| * <p/> |
| * Operations in a batch are stored with highest durability specified of for all operations in a |
| * batch, except for {@link Durability#SKIP_WAL}. |
| * <p/> |
| * This function is called from {@link #batchReplay(WALSplitUtil.MutationReplay[], long)} with |
| * {@link ReplayBatchOperation} instance and {@link #batchMutate(Mutation[])} with |
| * {@link MutationBatchOperation} instance as an argument. As the processing of replay batch and |
| * mutation batch is very similar, lot of code is shared by providing generic methods in base |
| * class {@link BatchOperation}. The logic for this method and |
| * {@link #doMiniBatchMutate(BatchOperation)} is implemented using methods in base class which are |
| * overridden by derived classes to implement special behavior. |
| * @param batchOp contains the list of mutations |
| * @return an array of OperationStatus which internally contains the OperationStatusCode and the |
| * exceptionMessage if any. |
| * @throws IOException if an IO problem is encountered |
| */ |
| private OperationStatus[] batchMutate(BatchOperation<?> batchOp) throws IOException { |
| boolean initialized = false; |
| batchOp.startRegionOperation(); |
| try { |
| while (!batchOp.isDone()) { |
| if (!batchOp.isInReplay()) { |
| checkReadOnly(); |
| } |
| checkResources(); |
| |
| if (!initialized) { |
| this.writeRequestsCount.add(batchOp.size()); |
| // validate and prepare batch for write, for MutationBatchOperation it also calls CP |
| // prePut()/preDelete()/preIncrement()/preAppend() hooks |
| batchOp.checkAndPrepare(); |
| initialized = true; |
| } |
| doMiniBatchMutate(batchOp); |
| requestFlushIfNeeded(); |
| } |
| } finally { |
| if (rsServices != null && rsServices.getMetrics() != null) { |
| rsServices.getMetrics().updateWriteQueryMeter(this, batchOp.size()); |
| } |
| batchOp.closeRegionOperation(); |
| } |
| return batchOp.retCodeDetails; |
| } |
| |
| /** |
| * Called to do a piece of the batch that came in to {@link #batchMutate(Mutation[])} In here we |
| * also handle replay of edits on region recover. Also gets change in size brought about by |
| * applying {@code batchOp}. |
| */ |
| private void doMiniBatchMutate(BatchOperation<?> batchOp) throws IOException { |
| boolean success = false; |
| WALEdit walEdit = null; |
| WriteEntry writeEntry = null; |
| boolean locked = false; |
| // We try to set up a batch in the range [batchOp.nextIndexToProcess,lastIndexExclusive) |
| MiniBatchOperationInProgress<Mutation> miniBatchOp = null; |
| /** Keep track of the locks we hold so we can release them in finally clause */ |
| List<RowLock> acquiredRowLocks = Lists.newArrayListWithCapacity(batchOp.size()); |
| |
| // Check for thread interrupt status in case we have been signaled from |
| // #interruptRegionOperation. |
| checkInterrupt(); |
| |
| try { |
| // STEP 1. Try to acquire as many locks as we can and build mini-batch of operations with |
| // locked rows |
| miniBatchOp = batchOp.lockRowsAndBuildMiniBatch(acquiredRowLocks); |
| |
| // We've now grabbed as many mutations off the list as we can |
| // Ensure we acquire at least one. |
| if (miniBatchOp.getReadyToWriteCount() <= 0) { |
| // Nothing to put/delete/increment/append -- an exception in the above such as |
| // NoSuchColumnFamily? |
| return; |
| } |
| |
| // Check for thread interrupt status in case we have been signaled from |
| // #interruptRegionOperation. Do it before we take the lock and disable interrupts for |
| // the WAL append. |
| checkInterrupt(); |
| |
| lock(this.updatesLock.readLock(), miniBatchOp.getReadyToWriteCount()); |
| locked = true; |
| |
| // From this point until memstore update this operation should not be interrupted. |
| disableInterrupts(); |
| |
| // STEP 2. Update mini batch of all operations in progress with LATEST_TIMESTAMP timestamp |
| // We should record the timestamp only after we have acquired the rowLock, |
| // otherwise, newer puts/deletes/increment/append are not guaranteed to have a newer |
| // timestamp |
| |
| long now = EnvironmentEdgeManager.currentTime(); |
| batchOp.prepareMiniBatchOperations(miniBatchOp, now, acquiredRowLocks); |
| |
| // STEP 3. Build WAL edit |
| |
| List<Pair<NonceKey, WALEdit>> walEdits = batchOp.buildWALEdits(miniBatchOp); |
| |
| // STEP 4. Append the WALEdits to WAL and sync. |
| |
| for (Iterator<Pair<NonceKey, WALEdit>> it = walEdits.iterator(); it.hasNext();) { |
| Pair<NonceKey, WALEdit> nonceKeyWALEditPair = it.next(); |
| walEdit = nonceKeyWALEditPair.getSecond(); |
| NonceKey nonceKey = nonceKeyWALEditPair.getFirst(); |
| |
| if (walEdit != null && !walEdit.isEmpty()) { |
| writeEntry = doWALAppend(walEdit, batchOp.durability, batchOp.getClusterIds(), now, |
| nonceKey.getNonceGroup(), nonceKey.getNonce(), batchOp.getOrigLogSeqNum()); |
| } |
| |
| // Complete mvcc for all but last writeEntry (for replay case) |
| if (it.hasNext() && writeEntry != null) { |
| mvcc.complete(writeEntry); |
| writeEntry = null; |
| } |
| } |
| |
| // STEP 5. Write back to memStore |
| // NOTE: writeEntry can be null here |
| writeEntry = batchOp.writeMiniBatchOperationsToMemStore(miniBatchOp, writeEntry); |
| |
| // STEP 6. Complete MiniBatchOperations: If required calls postBatchMutate() CP hook and |
| // complete mvcc for last writeEntry |
| batchOp.completeMiniBatchOperations(miniBatchOp, writeEntry); |
| writeEntry = null; |
| success = true; |
| } finally { |
| // Call complete rather than completeAndWait because we probably had error if walKey != null |
| if (writeEntry != null) mvcc.complete(writeEntry); |
| |
| if (locked) { |
| this.updatesLock.readLock().unlock(); |
| } |
| releaseRowLocks(acquiredRowLocks); |
| |
| enableInterrupts(); |
| |
| final int finalLastIndexExclusive = |
| miniBatchOp != null ? miniBatchOp.getLastIndexExclusive() : batchOp.size(); |
| final boolean finalSuccess = success; |
| batchOp.visitBatchOperations(true, finalLastIndexExclusive, (int i) -> { |
| Mutation mutation = batchOp.getMutation(i); |
| if (mutation instanceof Increment || mutation instanceof Append) { |
| if (finalSuccess) { |
| batchOp.retCodeDetails[i] = |
| new OperationStatus(OperationStatusCode.SUCCESS, batchOp.results[i]); |
| } else { |
| batchOp.retCodeDetails[i] = OperationStatus.FAILURE; |
| } |
| } else { |
| batchOp.retCodeDetails[i] = |
| finalSuccess ? OperationStatus.SUCCESS : OperationStatus.FAILURE; |
| } |
| return true; |
| }); |
| |
| batchOp.doPostOpCleanupForMiniBatch(miniBatchOp, walEdit, finalSuccess); |
| |
| batchOp.nextIndexToProcess = finalLastIndexExclusive; |
| } |
| } |
| |
| /** |
| * Returns effective durability from the passed durability and the table descriptor. |
| */ |
| private Durability getEffectiveDurability(Durability d) { |
| return d == Durability.USE_DEFAULT ? this.regionDurability : d; |
| } |
| |
| @Override |
| @Deprecated |
| public boolean checkAndMutate(byte[] row, byte[] family, byte[] qualifier, CompareOperator op, |
| ByteArrayComparable comparator, TimeRange timeRange, Mutation mutation) throws IOException { |
| CheckAndMutate checkAndMutate; |
| try { |
| CheckAndMutate.Builder builder = CheckAndMutate.newBuilder(row) |
| .ifMatches(family, qualifier, op, comparator.getValue()).timeRange(timeRange); |
| if (mutation instanceof Put) { |
| checkAndMutate = builder.build((Put) mutation); |
| } else if (mutation instanceof Delete) { |
| checkAndMutate = builder.build((Delete) mutation); |
| } else { |
| throw new DoNotRetryIOException( |
| "Unsupported mutate type: " + mutation.getClass().getSimpleName().toUpperCase()); |
| } |
| } catch (IllegalArgumentException e) { |
| throw new DoNotRetryIOException(e.getMessage()); |
| } |
| return checkAndMutate(checkAndMutate).isSuccess(); |
| } |
| |
| @Override |
| @Deprecated |
| public boolean checkAndMutate(byte[] row, Filter filter, TimeRange timeRange, Mutation mutation) |
| throws IOException { |
| CheckAndMutate checkAndMutate; |
| try { |
| CheckAndMutate.Builder builder = |
| CheckAndMutate.newBuilder(row).ifMatches(filter).timeRange(timeRange); |
| if (mutation instanceof Put) { |
| checkAndMutate = builder.build((Put) mutation); |
| } else if (mutation instanceof Delete) { |
| checkAndMutate = builder.build((Delete) mutation); |
| } else { |
| throw new DoNotRetryIOException( |
| "Unsupported mutate type: " + mutation.getClass().getSimpleName().toUpperCase()); |
| } |
| } catch (IllegalArgumentException e) { |
| throw new DoNotRetryIOException(e.getMessage()); |
| } |
| return checkAndMutate(checkAndMutate).isSuccess(); |
| } |
| |
| @Override |
| @Deprecated |
| public boolean checkAndRowMutate(byte[] row, byte[] family, byte[] qualifier, CompareOperator op, |
| ByteArrayComparable comparator, TimeRange timeRange, RowMutations rm) throws IOException { |
| CheckAndMutate checkAndMutate; |
| try { |
| checkAndMutate = CheckAndMutate.newBuilder(row) |
| .ifMatches(family, qualifier, op, comparator.getValue()).timeRange(timeRange).build(rm); |
| } catch (IllegalArgumentException e) { |
| throw new DoNotRetryIOException(e.getMessage()); |
| } |
| return checkAndMutate(checkAndMutate).isSuccess(); |
| } |
| |
| @Override |
| @Deprecated |
| public boolean checkAndRowMutate(byte[] row, Filter filter, TimeRange timeRange, RowMutations rm) |
| throws IOException { |
| CheckAndMutate checkAndMutate; |
| try { |
| checkAndMutate = |
| CheckAndMutate.newBuilder(row).ifMatches(filter).timeRange(timeRange).build(rm); |
| } catch (IllegalArgumentException e) { |
| throw new DoNotRetryIOException(e.getMessage()); |
| } |
| return checkAndMutate(checkAndMutate).isSuccess(); |
| } |
| |
| @Override |
| public CheckAndMutateResult checkAndMutate(CheckAndMutate checkAndMutate) throws IOException { |
| return checkAndMutate(checkAndMutate, HConstants.NO_NONCE, HConstants.NO_NONCE); |
| } |
| |
/**
 * Nonce-aware variant of {@link #checkAndMutate(CheckAndMutate)}. Hands the work to
 * {@code checkAndMutateInternal} through {@code TraceUtil.trace}, with the span supplied by
 * {@code createRegionSpan("Region.checkAndMutate")}.
 * @param checkAndMutate the condition and the action to apply
 * @param nonceGroup     passed through unchanged to {@code checkAndMutateInternal}
 * @param nonce          passed through unchanged to {@code checkAndMutateInternal}
 * @return the result produced by the traced call
 */
public CheckAndMutateResult checkAndMutate(CheckAndMutate checkAndMutate, long nonceGroup,
  long nonce) throws IOException {
  return TraceUtil.trace(() -> checkAndMutateInternal(checkAndMutate, nonceGroup, nonce),
    () -> createRegionSpan("Region.checkAndMutate"));
}
| |
/**
 * Evaluates the condition carried by {@code checkAndMutate} against the current content of its
 * row and, when it matches, applies the attached {@link Mutation} or {@link RowMutations}. The
 * read and the write both happen while this method holds a lock on the row.
 * <p>
 * The condition is either a {@link Filter} (matches when the scan returns at least one cell) or
 * a family/qualifier/operator/value tuple, which is always evaluated with a
 * {@link BinaryComparator} built from the value.
 * @param checkAndMutate the condition and the action to apply
 * @param nonceGroup     forwarded to the mutation call
 * @param nonce          forwarded to the mutation call
 * @return the non-null result of the {@code preCheckAndMutateAfterRowLock} coprocessor hook if
 *         there is one; otherwise a successful result carrying the mutation's {@link Result}
 *         when the condition matched, or an unsuccessful result with a null {@link Result}
 * @throws IOException if any validation, the read or the write fails
 */
private CheckAndMutateResult checkAndMutateInternal(CheckAndMutate checkAndMutate,
  long nonceGroup, long nonce) throws IOException {
  // Unpack the condition: a filter, or a single column compared against a value.
  byte[] row = checkAndMutate.getRow();
  Filter filter = null;
  byte[] family = null;
  byte[] qualifier = null;
  CompareOperator op = null;
  ByteArrayComparable comparator = null;
  if (checkAndMutate.hasFilter()) {
    filter = checkAndMutate.getFilter();
  } else {
    family = checkAndMutate.getFamily();
    qualifier = checkAndMutate.getQualifier();
    op = checkAndMutate.getCompareOp();
    comparator = new BinaryComparator(checkAndMutate.getValue());
  }
  TimeRange timeRange = checkAndMutate.getTimeRange();

  // Unpack the action: exactly one of mutation / rowMutations ends up non-null.
  Mutation mutation = null;
  RowMutations rowMutations = null;
  if (checkAndMutate.getAction() instanceof Mutation) {
    mutation = (Mutation) checkAndMutate.getAction();
  } else {
    rowMutations = (RowMutations) checkAndMutate.getAction();
  }

  // The action has to target the same row as the check.
  if (mutation != null) {
    checkMutationType(mutation);
    checkRow(mutation, row);
  } else {
    checkRow(rowMutations, row);
  }
  checkReadOnly();
  // TODO, add check for value length also move this check to the client
  checkResources();
  startRegionOperation();
  try {
    // Build the Get that reads the value(s) the condition is evaluated against.
    Get get = new Get(row);
    if (family != null) {
      checkFamily(family);
      get.addColumn(family, qualifier);
    }
    if (filter != null) {
      get.setFilter(filter);
    }
    if (timeRange != null) {
      get.setTimeRange(timeRange.getMin(), timeRange.getMax());
    }
    // Lock row - note that doBatchMutate will relock this row if called
    checkRow(row, "doCheckAndRowMutate");
    RowLock rowLock = getRowLock(get.getRow(), false, null);
    try {
      // A coprocessor may short-circuit the whole operation by returning a non-null result.
      if (this.getCoprocessorHost() != null) {
        CheckAndMutateResult result =
          getCoprocessorHost().preCheckAndMutateAfterRowLock(checkAndMutate);
        if (result != null) {
          return result;
        }
      }

      // NOTE: We used to wait here until mvcc caught up: mvcc.await();
      // Supposition is that now all changes are done under row locks, then when we go to read,
      // we'll get the latest on this row.
      boolean matches = false;
      // Timestamp of the first cell read; used below to keep the write from being eclipsed.
      long cellTs = 0;
      try (RegionScanner scanner = getScanner(new Scan(get))) {
        // NOTE: Please don't use HRegion.get() instead,
        // because it will copy cells to heap. See HBASE-26036
        List<Cell> result = new ArrayList<>(1);
        scanner.next(result);
        if (filter != null) {
          // Filter condition: any returned cell counts as a match.
          if (!result.isEmpty()) {
            matches = true;
            cellTs = result.get(0).getTimestamp();
          }
        } else {
          boolean valueIsNull =
            comparator.getValue() == null || comparator.getValue().length == 0;
          if (result.isEmpty() && valueIsNull) {
            // No cell and an empty expected value: matches for every operator but NOT_EQUAL.
            matches = op != CompareOperator.NOT_EQUAL;
          } else if (result.size() > 0 && valueIsNull) {
            // Empty expected value against an existing cell: compare on "cell value is empty".
            matches = (result.get(0).getValueLength() == 0) == (op != CompareOperator.NOT_EQUAL);
            cellTs = result.get(0).getTimestamp();
          } else if (result.size() == 1) {
            // Exactly one cell and a non-empty expected value: ordinary value comparison.
            Cell kv = result.get(0);
            cellTs = kv.getTimestamp();
            int compareResult = PrivateCellUtil.compareValue(kv, comparator);
            matches = matches(op, compareResult);
          }
        }
      }

      // If matches, perform the mutation or the rowMutations
      if (matches) {
        // We have acquired the row lock already. If the system clock is NOT monotonically
        // non-decreasing (see HBASE-14070) we should make sure that the mutation has a
        // larger timestamp than what was observed via Get. doBatchMutate already does this, but
        // there is no way to pass the cellTs. See HBASE-14054.
        long now = EnvironmentEdgeManager.currentTime();
        long ts = Math.max(now, cellTs); // ensure write is not eclipsed
        byte[] byteTs = Bytes.toBytes(ts);
        if (mutation != null) {
          if (mutation instanceof Put) {
            updateCellTimestamps((Iterable) mutation.getFamilyCellMap().values(), byteTs);
          }
          // And else 'delete' is not needed since it already does a second get, and sets the
          // timestamp from get (see prepareDeleteTimestamps).
        } else {
          for (Mutation m : rowMutations.getMutations()) {
            if (m instanceof Put) {
              updateCellTimestamps((Iterable) m.getFamilyCellMap().values(), byteTs);
            }
          }
          // And else 'delete' is not needed since it already does a second get, and sets the
          // timestamp from get (see prepareDeleteTimestamps).
        }
        // All edits for the given row (across all column families) must happen atomically.
        Result r;
        if (mutation != null) {
          r = mutate(mutation, true, nonceGroup, nonce).getResult();
        } else {
          r = mutateRow(rowMutations, nonceGroup, nonce);
        }
        this.checkAndMutateChecksPassed.increment();
        return new CheckAndMutateResult(true, r);
      }
      this.checkAndMutateChecksFailed.increment();
      return new CheckAndMutateResult(false, null);
    } finally {
      rowLock.release();
    }
  } finally {
    closeRegionOperation();
  }
}
| |
| private void checkMutationType(final Mutation mutation) throws DoNotRetryIOException { |
| if ( |
| !(mutation instanceof Put) && !(mutation instanceof Delete) |
| && !(mutation instanceof Increment) && !(mutation instanceof Append) |
| ) { |
| throw new org.apache.hadoop.hbase.DoNotRetryIOException( |
| "Action must be Put or Delete or Increment or Delete"); |
| } |
| } |
| |
| private void checkRow(final Row action, final byte[] row) throws DoNotRetryIOException { |
| if (!Bytes.equals(row, action.getRow())) { |
| throw new org.apache.hadoop.hbase.DoNotRetryIOException("Action's getRow must match"); |
| } |
| } |
| |
| private boolean matches(final CompareOperator op, final int compareResult) { |
| boolean matches = false; |
| switch (op) { |
| case LESS: |
| matches = compareResult < 0; |
| break; |
| case LESS_OR_EQUAL: |
| matches = compareResult <= 0; |
| break; |
| case EQUAL: |
| matches = compareResult == 0; |
| break; |
| case NOT_EQUAL: |
| matches = compareResult != 0; |
| break; |
| case GREATER_OR_EQUAL: |
| matches = compareResult >= 0; |
| break; |
| case GREATER: |
| matches = compareResult > 0; |
| break; |
| default: |
| throw new RuntimeException("Unknown Compare op " + op.name()); |
| } |
| return matches; |
| } |
| |
| private OperationStatus mutate(Mutation mutation) throws IOException { |
| return mutate(mutation, false); |
| } |
| |
| private OperationStatus mutate(Mutation mutation, boolean atomic) throws IOException { |
| return mutate(mutation, atomic, HConstants.NO_NONCE, HConstants.NO_NONCE); |
| } |
| |
| private OperationStatus mutate(Mutation mutation, boolean atomic, long nonceGroup, long nonce) |
| throws IOException { |
| OperationStatus[] status = |
| this.batchMutate(new Mutation[] { mutation }, atomic, nonceGroup, nonce); |
| if (status[0].getOperationStatusCode().equals(OperationStatusCode.SANITY_CHECK_FAILURE)) { |
| throw new FailedSanityCheckException(status[0].getExceptionMsg()); |
| } else if (status[0].getOperationStatusCode().equals(OperationStatusCode.BAD_FAMILY)) { |
| throw new NoSuchColumnFamilyException(status[0].getExceptionMsg()); |
| } else if (status[0].getOperationStatusCode().equals(OperationStatusCode.STORE_TOO_BUSY)) { |
| throw new RegionTooBusyException(status[0].getExceptionMsg()); |
| } |
| return status[0]; |
| } |
| |
| /** |
| * Complete taking the snapshot on the region. Writes the region info and adds references to the |
| * working snapshot directory. TODO for api consistency, consider adding another version with no |
| * {@link ForeignExceptionSnare} arg. (In the future other cancellable HRegion methods could |
| * eventually add a {@link ForeignExceptionSnare}, or we could do something fancier). |
| * @param desc snapshot description object |
| * @param exnSnare ForeignExceptionSnare that captures external exceptions in case we need to bail |
| * out. This is allowed to be null and will just be ignored in that case. |
| * @throws IOException if there is an external or internal error causing the snapshot to fail |
| */ |
| public void addRegionToSnapshot(SnapshotDescription desc, ForeignExceptionSnare exnSnare) |
| throws IOException { |
| Path rootDir = CommonFSUtils.getRootDir(conf); |
| Path snapshotDir = SnapshotDescriptionUtils.getWorkingSnapshotDir(desc, rootDir, conf); |
| |
| SnapshotManifest manifest = |
| SnapshotManifest.create(conf, getFilesystem(), snapshotDir, desc, exnSnare); |
| manifest.addRegion(this); |
| } |
| |
| private void updateSequenceId(final Iterable<List<ExtendedCell>> cellItr, final long sequenceId) |
| throws IOException { |
| for (List<ExtendedCell> cells : cellItr) { |
| if (cells == null) { |
| return; |
| } |
| for (ExtendedCell cell : cells) { |
| cell.setSequenceId(sequenceId); |
| } |
| } |
| } |
| |
| /** |
| * Replace any cell timestamps set to {@link org.apache.hadoop.hbase.HConstants#LATEST_TIMESTAMP} |
| * provided current timestamp. |
| */ |
| private static void updateCellTimestamps(final Iterable<List<ExtendedCell>> cellItr, |
| final byte[] now) throws IOException { |
| for (List<ExtendedCell> cells : cellItr) { |
| if (cells == null) { |
| continue; |
| } |
| // Optimization: 'foreach' loop is not used. See: |
| // HBASE-12023 HRegion.applyFamilyMapToMemstore creates too many iterator objects |
| assert cells instanceof RandomAccess; |
| int listSize = cells.size(); |
| for (int i = 0; i < listSize; i++) { |
| PrivateCellUtil.updateLatestStamp(cells.get(i), now); |
| } |
| } |
| } |
| |
| /** |
| * Possibly rewrite incoming cell tags. |
| */ |
| private void rewriteCellTags(Map<byte[], List<ExtendedCell>> familyMap, final Mutation m) { |
| // Check if we have any work to do and early out otherwise |
| // Update these checks as more logic is added here |
| if (m.getTTL() == Long.MAX_VALUE) { |
| return; |
| } |
| |
| // From this point we know we have some work to do |
| for (Map.Entry<byte[], List<ExtendedCell>> e : familyMap.entrySet()) { |
| List<ExtendedCell> cells = e.getValue(); |
| assert cells instanceof RandomAccess; |
| int listSize = cells.size(); |
| for (int i = 0; i < listSize; i++) { |
| ExtendedCell cell = cells.get(i); |
| List<Tag> newTags = TagUtil.carryForwardTags(null, cell); |
| newTags = TagUtil.carryForwardTTLTag(newTags, m.getTTL()); |
| // Rewrite the cell with the updated set of tags |
| cells.set(i, PrivateCellUtil.createCell(cell, newTags)); |
| } |
| } |
| } |
| |
| /** |
| * Check if resources to support an update. |
| * <p/> |
| * We throw RegionTooBusyException if above memstore limit and expect client to retry using some |
| * kind of backoff |
| */ |
| private void checkResources() throws RegionTooBusyException { |
| // If catalog region, do not impose resource constraints or block updates. |
| if (this.getRegionInfo().isMetaRegion()) { |
| return; |
| } |
| |
| MemStoreSize mss = this.memStoreSizing.getMemStoreSize(); |
| if (mss.getHeapSize() + mss.getOffHeapSize() > this.blockingMemStoreSize) { |
| blockedRequestsCount.increment(); |
| requestFlush(); |
| // Don't print current limit because it will vary too much. The message is used as a key |
| // over in RetriesExhaustedWithDetailsException processing. |
| final String regionName = |
| this.getRegionInfo() == null ? "unknown" : this.getRegionInfo().getEncodedName(); |
| final String serverName = this.getRegionServerServices() == null |
| ? "unknown" |
| : (this.getRegionServerServices().getServerName() == null |
| ? "unknown" |
| : this.getRegionServerServices().getServerName().toString()); |
| RegionTooBusyException rtbe = new RegionTooBusyException("Over memstore limit=" |
| + org.apache.hadoop.hbase.procedure2.util.StringUtils.humanSize(this.blockingMemStoreSize) |
| + ", regionName=" + regionName + ", server=" + serverName); |
| LOG.warn("Region is too busy due to exceeding memstore size limit.", rtbe); |
| throw rtbe; |
| } |
| } |
| |
| /** |
| * @throws IOException Throws exception if region is in read-only mode. |
| */ |
| private void checkReadOnly() throws IOException { |
| if (isReadOnly()) { |
| throw new DoNotRetryIOException("region is read only"); |
| } |
| } |
| |
| private void checkReadsEnabled() throws IOException { |
| if (!this.writestate.readsEnabled) { |
| throw new IOException(getRegionInfo().getEncodedName() |
| + ": The region's reads are disabled. Cannot serve the request"); |
| } |
| } |
| |
| public void setReadsEnabled(boolean readsEnabled) { |
| if (readsEnabled && !this.writestate.readsEnabled) { |
| LOG.info("Enabling reads for {}", getRegionInfo().getEncodedName()); |
| } |
| this.writestate.setReadsEnabled(readsEnabled); |
| } |
| |
| /** |
| * @param delta If we are doing delta changes -- e.g. increment/append -- then this flag will be |
| * set; when set we will run operations that make sense in the increment/append |
| * scenario but that do not make sense otherwise. |
| * @see #applyToMemStore(HStore, Cell, MemStoreSizing) |
| */ |
| private void applyToMemStore(HStore store, List<ExtendedCell> cells, boolean delta, |
| MemStoreSizing memstoreAccounting) { |
| // Any change in how we update Store/MemStore needs to also be done in other applyToMemStore!!!! |
| boolean upsert = delta && store.getColumnFamilyDescriptor().getMaxVersions() == 1; |
| if (upsert) { |
| store.upsert(cells, getSmallestReadPoint(), memstoreAccounting); |
| } else { |
| store.add(cells, memstoreAccounting); |
| } |
| } |
| |
| /** |
| * @see #applyToMemStore(HStore, List, boolean, MemStoreSizing) |
| */ |
| private void applyToMemStore(HStore store, ExtendedCell cell, MemStoreSizing memstoreAccounting) |
| throws IOException { |
| // Any change in how we update Store/MemStore needs to also be done in other applyToMemStore!!!! |
| if (store == null) { |
| checkFamily(CellUtil.cloneFamily(cell)); |
| // Unreachable because checkFamily will throw exception |
| } |
| store.add(cell, memstoreAccounting); |
| } |
| |
| /** |
| * Check the collection of families for validity. |
| */ |
| public void checkFamilies(Collection<byte[]> families) throws NoSuchColumnFamilyException { |
| for (byte[] family : families) { |
| checkFamily(family); |
| } |
| } |
| |
| /** |
| * Check the collection of families for valid timestamps |
| * @param now current timestamp |
| */ |
| public void checkTimestamps(final Map<byte[], List<Cell>> familyMap, long now) |
| throws FailedSanityCheckException { |
| if (timestampSlop == HConstants.LATEST_TIMESTAMP) { |
| return; |
| } |
| long maxTs = now + timestampSlop; |
| for (List<Cell> kvs : familyMap.values()) { |
| // Optimization: 'foreach' loop is not used. See: |
| // HBASE-12023 HRegion.applyFamilyMapToMemstore creates too many iterator objects |
| assert kvs instanceof RandomAccess; |
| int listSize = kvs.size(); |
| for (int i = 0; i < listSize; i++) { |
| Cell cell = kvs.get(i); |
| // see if the user-side TS is out of range. latest = server-side |
| long ts = cell.getTimestamp(); |
| if (ts != HConstants.LATEST_TIMESTAMP && ts > maxTs) { |
| throw new FailedSanityCheckException( |
| "Timestamp for KV out of range " + cell + " (too.new=" + timestampSlop + ")"); |
| } |
| } |
| } |
| } |
| |
| /* |
| * @return True if size is over the flush threshold |
| */ |
| private boolean isFlushSize(MemStoreSize size) { |
| return size.getHeapSize() + size.getOffHeapSize() > getMemStoreFlushSize(); |
| } |
| |
| private void deleteRecoveredEdits(FileSystem fs, Iterable<Path> files) throws IOException { |
| for (Path file : files) { |
| if (!fs.delete(file, false)) { |
| LOG.error("Failed delete of {}", file); |
| } else { |
| LOG.debug("Deleted recovered.edits file={}", file); |
| } |
| } |
| } |
| |
| /** |
| * Read the edits put under this region by wal splitting process. Put the recovered edits back up |
| * into this region. |
| * <p> |
| * We can ignore any wal message that has a sequence ID that's equal to or lower than minSeqId. |
| * (Because we know such messages are already reflected in the HFiles.) |
| * <p> |
| * While this is running we are putting pressure on memory yet we are outside of our usual |
| * accounting because we are not yet an onlined region (this stuff is being run as part of Region |
| * initialization). This means that if we're up against global memory limits, we'll not be flagged |
| * to flush because we are not online. We can't be flushed by usual mechanisms anyways; we're not |
| * yet online so our relative sequenceids are not yet aligned with WAL sequenceids -- not till we |
| * come up online, post processing of split edits. |
| * <p> |
| * But to help relieve memory pressure, at least manage our own heap size flushing if are in |
| * excess of per-region limits. Flushing, though, we have to be careful and avoid using the |
| * regionserver/wal sequenceid. Its running on a different line to whats going on in here in this |
| * region context so if we crashed replaying these edits, but in the midst had a flush that used |
| * the regionserver wal with a sequenceid in excess of whats going on in here in this region and |
| * with its split editlogs, then we could miss edits the next time we go to recover. So, we have |
| * to flush inline, using seqids that make sense in a this single region context only -- until we |
| * online. |
| * @param maxSeqIdInStores Any edit found in split editlogs needs to be in excess of the maxSeqId |
| * for the store to be applied, else its skipped. |
| * @return the sequence id of the last edit added to this region out of the recovered edits log or |
| * <code>minSeqId</code> if nothing added from editlogs. |
| */ |
| long replayRecoveredEditsIfAny(Map<byte[], Long> maxSeqIdInStores, |
| final CancelableProgressable reporter, final MonitoredTask status) throws IOException { |
| long minSeqIdForTheRegion = -1; |
| for (Long maxSeqIdInStore : maxSeqIdInStores.values()) { |
| if (maxSeqIdInStore < minSeqIdForTheRegion || minSeqIdForTheRegion == -1) { |
| minSeqIdForTheRegion = maxSeqIdInStore; |
| } |
| } |
| long seqId = minSeqIdForTheRegion; |
| String specialRecoveredEditsDirStr = conf.get(SPECIAL_RECOVERED_EDITS_DIR); |
| if (org.apache.commons.lang3.StringUtils.isBlank(specialRecoveredEditsDirStr)) { |
| FileSystem walFS = getWalFileSystem(); |
| FileSystem rootFS = getFilesystem(); |
| Path wrongRegionWALDir = CommonFSUtils.getWrongWALRegionDir(conf, getRegionInfo().getTable(), |
| getRegionInfo().getEncodedName()); |
| Path regionWALDir = getWALRegionDir(); |
| Path regionDir = |
| FSUtils.getRegionDirFromRootDir(CommonFSUtils.getRootDir(conf), getRegionInfo()); |
| |
| // We made a mistake in HBASE-20734 so we need to do this dirty hack... |
| NavigableSet<Path> filesUnderWrongRegionWALDir = |
| WALSplitUtil.getSplitEditFilesSorted(walFS, wrongRegionWALDir); |
| seqId = Math.max(seqId, replayRecoveredEditsForPaths(minSeqIdForTheRegion, walFS, |
| filesUnderWrongRegionWALDir, reporter, regionDir)); |
| // This is to ensure backwards compatability with HBASE-20723 where recovered edits can appear |
| // under the root dir even if walDir is set. |
| NavigableSet<Path> filesUnderRootDir = Collections.emptyNavigableSet(); |
| if (!regionWALDir.equals(regionDir)) { |
| filesUnderRootDir = WALSplitUtil.getSplitEditFilesSorted(rootFS, regionDir); |
| seqId = Math.max(seqId, replayRecoveredEditsForPaths(minSeqIdForTheRegion, rootFS, |
| filesUnderRootDir, reporter, regionDir)); |
| } |
| |
| NavigableSet<Path> files = WALSplitUtil.getSplitEditFilesSorted(walFS, regionWALDir); |
| seqId = Math.max(seqId, |
| replayRecoveredEditsForPaths(minSeqIdForTheRegion, walFS, files, reporter, regionWALDir)); |
| if (seqId > minSeqIdForTheRegion) { |
| // Then we added some edits to memory. Flush and cleanup split edit files. |
| internalFlushcache(null, seqId, stores.values(), status, false, |
| FlushLifeCycleTracker.DUMMY); |
| } |
| // Now delete the content of recovered edits. We're done w/ them. |
| if (files.size() > 0 && this.conf.getBoolean("hbase.region.archive.recovered.edits", false)) { |
| // For debugging data loss issues! |
| // If this flag is set, make use of the hfile archiving by making recovered.edits a fake |
| // column family. Have to fake out file type too by casting our recovered.edits as |
| // storefiles |
| String fakeFamilyName = WALSplitUtil.getRegionDirRecoveredEditsDir(regionWALDir).getName(); |
| Set<HStoreFile> fakeStoreFiles = new HashSet<>(files.size()); |
| for (Path file : files) { |
| fakeStoreFiles.add(new HStoreFile(walFS, file, this.conf, null, null, true)); |
| } |
| getRegionWALFileSystem().archiveRecoveredEdits(fakeFamilyName, fakeStoreFiles); |
| } else { |
| deleteRecoveredEdits(walFS, Iterables.concat(files, filesUnderWrongRegionWALDir)); |
| deleteRecoveredEdits(rootFS, filesUnderRootDir); |
| } |
| } else { |
| Path recoveredEditsDir = new Path(specialRecoveredEditsDirStr); |
| FileSystem fs = recoveredEditsDir.getFileSystem(conf); |
| FileStatus[] files = fs.listStatus(recoveredEditsDir); |
| LOG.debug("Found {} recovered edits file(s) under {}", files == null ? 0 : files.length, |
| recoveredEditsDir); |
| if (files != null) { |
| for (FileStatus file : files) { |
| // it is safe to trust the zero-length in this case because we've been through rename and |
| // lease recovery in the above. |
| if (isZeroLengthThenDelete(fs, file, file.getPath())) { |
| continue; |
| } |
| seqId = |
| Math.max(seqId, replayRecoveredEdits(file.getPath(), maxSeqIdInStores, reporter, fs)); |
| } |
| } |
| if (seqId > minSeqIdForTheRegion) { |
| // Then we added some edits to memory. Flush and cleanup split edit files. |
| internalFlushcache(null, seqId, stores.values(), status, false, |
| FlushLifeCycleTracker.DUMMY); |
| } |
| deleteRecoveredEdits(fs, |
| Stream.of(files).map(FileStatus::getPath).collect(Collectors.toList())); |
| } |
| |
| return seqId; |
| } |
| |
| private long replayRecoveredEditsForPaths(long minSeqIdForTheRegion, FileSystem fs, |
| final NavigableSet<Path> files, final CancelableProgressable reporter, final Path regionDir) |
| throws IOException { |
| long seqid = minSeqIdForTheRegion; |
| if (LOG.isDebugEnabled()) { |
| LOG.debug("Found " + (files == null ? 0 : files.size()) + " recovered edits file(s) under " |
| + regionDir); |
| } |
| |
| if (files == null || files.isEmpty()) { |
| return minSeqIdForTheRegion; |
| } |
| |
| for (Path edits : files) { |
| if (edits == null || !fs.exists(edits)) { |
| LOG.warn("Null or non-existent edits file: " + edits); |
| continue; |
| } |
| if (isZeroLengthThenDelete(fs, fs.getFileStatus(edits), edits)) { |
| continue; |
| } |
| |
| long maxSeqId; |
| String fileName = edits.getName(); |
| maxSeqId = Math.abs(Long.parseLong(fileName)); |
| if (maxSeqId <= minSeqIdForTheRegion) { |
| if (LOG.isDebugEnabled()) { |
| String msg = "Maximum sequenceid for this wal is " + maxSeqId |
| + " and minimum sequenceid for the region " + this + " is " + minSeqIdForTheRegion |
| + ", skipped the whole file, path=" + edits; |
| LOG.debug(msg); |
| } |
| continue; |
| } |
| |
| try { |
| // replay the edits. Replay can return -1 if everything is skipped, only update |
| // if seqId is greater |
| seqid = Math.max(seqid, replayRecoveredEdits(edits, maxSeqIdInStores, reporter, fs)); |
| } catch (IOException e) { |
| handleException(fs, edits, e); |
| } |
| } |
| return seqid; |
| } |
| |
| private void handleException(FileSystem fs, Path edits, IOException e) throws IOException { |
| boolean skipErrors = conf.getBoolean(HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS, |
| conf.getBoolean("hbase.skip.errors", HConstants.DEFAULT_HREGION_EDITS_REPLAY_SKIP_ERRORS)); |
| if (conf.get("hbase.skip.errors") != null) { |
| LOG.warn("The property 'hbase.skip.errors' has been deprecated. Please use " |
| + HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS + " instead."); |
| } |
| if (skipErrors) { |
| Path p = WALSplitUtil.moveAsideBadEditsFile(fs, edits); |
| LOG.error(HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS + "=true so continuing. Renamed " |
| + edits + " as " + p, e); |
| } else { |
| throw e; |
| } |
| } |
| |
| /** |
| * @param edits File of recovered edits. |
| * @param maxSeqIdInStores Maximum sequenceid found in each store. Edits in wal must be larger |
| * than this to be replayed for each store. |
| * @return the sequence id of the last edit added to this region out of the recovered edits log or |
| * <code>minSeqId</code> if nothing added from editlogs. |
| */ |
| private long replayRecoveredEdits(final Path edits, Map<byte[], Long> maxSeqIdInStores, |
| final CancelableProgressable reporter, FileSystem fs) throws IOException { |
| String msg = "Replaying edits from " + edits; |
| LOG.info(msg); |
| MonitoredTask status = TaskMonitor.get().createStatus(msg); |
| |
| status.setStatus("Opening recovered edits"); |
| try (WALStreamReader reader = WALFactory.createStreamReader(fs, edits, conf)) { |
| long currentEditSeqId = -1; |
| long currentReplaySeqId = -1; |
| long firstSeqIdInLog = -1; |
| long skippedEdits = 0; |
| long editsCount = 0; |
| long intervalEdits = 0; |
| WAL.Entry entry; |
| HStore store = null; |
| boolean reported_once = false; |
| ServerNonceManager ng = this.rsServices == null ? null : this.rsServices.getNonceManager(); |
| |
| try { |
| // How many edits seen before we check elapsed time |
| int interval = this.conf.getInt("hbase.hstore.report.interval.edits", 2000); |
| // How often to send a progress report (default 1/2 master timeout) |
| int period = this.conf.getInt("hbase.hstore.report.period", 300000); |
| long lastReport = EnvironmentEdgeManager.currentTime(); |
| |
| if (coprocessorHost != null) { |
| coprocessorHost.preReplayWALs(this.getRegionInfo(), edits); |
| } |
| |
| while ((entry = reader.next()) != null) { |
| WALKey key = entry.getKey(); |
| WALEdit val = entry.getEdit(); |
| |
| if (ng != null) { // some test, or nonces disabled |
| ng.reportOperationFromWal(key.getNonceGroup(), key.getNonce(), key.getWriteTime()); |
| } |
| |
| if (reporter != null) { |
| intervalEdits += val.size(); |
| if (intervalEdits >= interval) { |
| // Number of edits interval reached |
| intervalEdits = 0; |
| long cur = EnvironmentEdgeManager.currentTime(); |
| if (lastReport + period <= cur) { |
| status.setStatus( |
| "Replaying edits..." + " skipped=" + skippedEdits + " edits=" + editsCount); |
| // Timeout reached |
| if (!reporter.progress()) { |
| msg = "Progressable reporter failed, stopping replay for region " + this; |
| LOG.warn(msg); |
| status.abort(msg); |
| throw new IOException(msg); |
| } |
| reported_once = true; |
| lastReport = cur; |
| } |
| } |
| } |
| |
| if (firstSeqIdInLog == -1) { |
| firstSeqIdInLog = key.getSequenceId(); |
| } |
| if (currentEditSeqId > key.getSequenceId()) { |
| // when this condition is true, it means we have a serious defect because we need to |
| // maintain increasing SeqId for WAL edits per region |
| LOG.error(getRegionInfo().getEncodedName() + " : " + "Found decreasing SeqId. PreId=" |
| + currentEditSeqId + " key=" + key + "; edit=" + val); |
| } else { |
| currentEditSeqId = key.getSequenceId(); |
| } |
| currentReplaySeqId = |
| (key.getOrigLogSeqNum() > 0) ? key.getOrigLogSeqNum() : currentEditSeqId; |
| |
| // Start coprocessor replay here. The coprocessor is for each WALEdit |
| // instead of a KeyValue. |
| if (coprocessorHost != null) { |
| status.setStatus("Running pre-WAL-restore hook in coprocessors"); |
| if (coprocessorHost.preWALRestore(this.getRegionInfo(), key, val)) { |
| // if bypass this wal entry, ignore it ... |
| continue; |
| } |
| } |
| boolean checkRowWithinBoundary = false; |
| // Check this edit is for this region. |
| if ( |
| !Bytes.equals(key.getEncodedRegionName(), this.getRegionInfo().getEncodedNameAsBytes()) |
| ) { |
| checkRowWithinBoundary = true; |
| } |
| |
| boolean flush = false; |
| MemStoreSizing memStoreSizing = new NonThreadSafeMemStoreSizing(); |
| for (Cell c : val.getCells()) { |
| assert c instanceof ExtendedCell; |
| ExtendedCell cell = (ExtendedCell) c; |
| // Check this edit is for me. Also, guard against writing the special |
| // METACOLUMN info such as HBASE::CACHEFLUSH entries |
| if (WALEdit.isMetaEditFamily(cell)) { |
| // if region names don't match, skipp replaying compaction marker |
| if (!checkRowWithinBoundary) { |
| // this is a special edit, we should handle it |
| CompactionDescriptor compaction = WALEdit.getCompaction(cell); |
| if (compaction != null) { |
| // replay the compaction |
| replayWALCompactionMarker(compaction, false, true, Long.MAX_VALUE); |
| } |
| } |
| skippedEdits++; |
| continue; |
| } |
| // Figure which store the edit is meant for. |
| if ( |
| store == null |
| || !CellUtil.matchingFamily(cell, store.getColumnFamilyDescriptor().getName()) |
| ) { |
| store = getStore(cell); |
| } |
| if (store == null) { |
| // This should never happen. Perhaps schema was changed between |
| // crash and redeploy? |
| LOG.warn("No family for cell {} in region {}", cell, this); |
| skippedEdits++; |
| continue; |
| } |
| if ( |
| checkRowWithinBoundary && !rowIsInRange(this.getRegionInfo(), cell.getRowArray(), |
| cell.getRowOffset(), cell.getRowLength()) |
| ) { |
| LOG.warn("Row of {} is not within region boundary for region {}", cell, this); |
| skippedEdits++; |
| continue; |
| } |
| // Now, figure if we should skip this edit. |
| if ( |
| key.getSequenceId() |
| <= maxSeqIdInStores.get(store.getColumnFamilyDescriptor().getName()) |
| ) { |
| skippedEdits++; |
| continue; |
| } |
| PrivateCellUtil.setSequenceId(cell, currentReplaySeqId); |
| |
| restoreEdit(store, cell, memStoreSizing); |
| editsCount++; |
| } |
| MemStoreSize mss = memStoreSizing.getMemStoreSize(); |
| incMemStoreSize(mss); |
| flush = isFlushSize(this.memStoreSizing.getMemStoreSize()); |
| if (flush) { |
| internalFlushcache(null, currentEditSeqId, stores.values(), status, false, |
| FlushLifeCycleTracker.DUMMY); |
| } |
| |
| if (coprocessorHost != null) { |
| coprocessorHost.postWALRestore(this.getRegionInfo(), key, val); |
| } |
| } |
| |
| if (coprocessorHost != null) { |
| coprocessorHost.postReplayWALs(this.getRegionInfo(), edits); |
| } |
| } catch (EOFException eof) { |
| if (!conf.getBoolean(RECOVERED_EDITS_IGNORE_EOF, false)) { |
| Path p = WALSplitUtil.moveAsideBadEditsFile(walFS, edits); |
| msg = "EnLongAddered EOF. Most likely due to Master failure during " |
| + "wal splitting, so we have this data in another edit. Continuing, but renaming " |
| + edits + " as " + p + " for region " + this; |
| LOG.warn(msg, eof); |
| status.abort(msg); |
| } else { |
| LOG.warn("EOF while replaying recover edits and config '{}' is true so " |
| + "we will ignore it and continue", RECOVERED_EDITS_IGNORE_EOF, eof); |
| } |
| } catch (IOException ioe) { |
| // If the IOE resulted from bad file format, |
| // then this problem is idempotent and retrying won't help |
| if (ioe.getCause() instanceof ParseException) { |
| Path p = WALSplitUtil.moveAsideBadEditsFile(walFS, edits); |
| msg = |
| "File corruption enLongAddered! " + "Continuing, but renaming " + edits + " as " + p; |
| LOG.warn(msg, ioe); |
| status.setStatus(msg); |
| } else { |
| status.abort(StringUtils.stringifyException(ioe)); |
| // other IO errors may be transient (bad network connection, |
| // checksum exception on one datanode, etc). throw & retry |
| throw ioe; |
| } |
| } |
| if (reporter != null && !reported_once) { |
| reporter.progress(); |
| } |
| msg = "Applied " + editsCount + ", skipped " + skippedEdits + ", firstSequenceIdInLog=" |
| + firstSeqIdInLog + ", maxSequenceIdInLog=" + currentEditSeqId + ", path=" + edits; |
| status.markComplete(msg); |
| LOG.debug(msg); |
| return currentEditSeqId; |
| } finally { |
| status.cleanup(); |
| } |
| } |
| |
| /** |
| * Call to complete a compaction. Its for the case where we find in the WAL a compaction that was |
| * not finished. We could find one recovering a WAL after a regionserver crash. See HBASE-2331. |
| */ |
| void replayWALCompactionMarker(CompactionDescriptor compaction, boolean pickCompactionFiles, |
| boolean removeFiles, long replaySeqId) throws IOException { |
| try { |
| checkTargetRegion(compaction.getEncodedRegionName().toByteArray(), |
| "Compaction marker from WAL ", compaction); |
| } catch (WrongRegionException wre) { |
| if (RegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) { |
| // skip the compaction marker since it is not for this region |
| return; |
| } |
| throw wre; |
| } |
| |
| synchronized (writestate) { |
| if (replaySeqId < lastReplayedOpenRegionSeqId) { |
| LOG.warn(getRegionInfo().getEncodedName() + " : " + "Skipping replaying compaction event :" |
| + TextFormat.shortDebugString(compaction) + " because its sequence id " + replaySeqId |
| + " is smaller than this regions " + "lastReplayedOpenRegionSeqId of " |
| + lastReplayedOpenRegionSeqId); |
| return; |
| } |
| if (replaySeqId < lastReplayedCompactionSeqId) { |
| LOG.warn(getRegionInfo().getEncodedName() + " : " + "Skipping replaying compaction event :" |
| + TextFormat.shortDebugString(compaction) + " because its sequence id " + replaySeqId |
| + " is smaller than this regions " + "lastReplayedCompactionSeqId of " |
| + lastReplayedCompactionSeqId); |
| return; |
| } else { |
| lastReplayedCompactionSeqId = replaySeqId; |
| } |
| |
| if (LOG.isDebugEnabled()) { |
| LOG.debug(getRegionInfo().getEncodedName() + " : " + "Replaying compaction marker " |
| + TextFormat.shortDebugString(compaction) + " with seqId=" + replaySeqId |
| + " and lastReplayedOpenRegionSeqId=" + lastReplayedOpenRegionSeqId); |
| } |
| |
| startRegionOperation(Operation.REPLAY_EVENT); |
| try { |
| HStore store = this.getStore(compaction.getFamilyName().toByteArray()); |
| if (store == null) { |
| LOG.warn(getRegionInfo().getEncodedName() + " : " |
| + "Found Compaction WAL edit for deleted family:" |
| + Bytes.toString(compaction.getFamilyName().toByteArray())); |
| return; |
| } |
| store.replayCompactionMarker(compaction, pickCompactionFiles, removeFiles); |
| logRegionFiles(); |
| } catch (FileNotFoundException ex) { |
| LOG.warn(getRegionInfo().getEncodedName() + " : " |
| + "At least one of the store files in compaction: " |
| + TextFormat.shortDebugString(compaction) |
| + " doesn't exist any more. Skip loading the file(s)", ex); |
| } finally { |
| closeRegionOperation(Operation.REPLAY_EVENT); |
| } |
| } |
| } |
| |
| void replayWALFlushMarker(FlushDescriptor flush, long replaySeqId) throws IOException { |
| checkTargetRegion(flush.getEncodedRegionName().toByteArray(), "Flush marker from WAL ", flush); |
| |
| if (ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) { |
| return; // if primary nothing to do |
| } |
| |
| if (LOG.isDebugEnabled()) { |
| LOG.debug(getRegionInfo().getEncodedName() + " : " + "Replaying flush marker " |
| + TextFormat.shortDebugString(flush)); |
| } |
| |
| startRegionOperation(Operation.REPLAY_EVENT); // use region close lock to guard against close |
| try { |
| FlushAction action = flush.getAction(); |
| switch (action) { |
| case START_FLUSH: |
| replayWALFlushStartMarker(flush); |
| break; |
| case COMMIT_FLUSH: |
| replayWALFlushCommitMarker(flush); |
| break; |
| case ABORT_FLUSH: |
| replayWALFlushAbortMarker(flush); |
| break; |
| case CANNOT_FLUSH: |
| replayWALFlushCannotFlushMarker(flush, replaySeqId); |
| break; |
| default: |
| LOG.warn(getRegionInfo().getEncodedName() + " : " |
| + "Received a flush event with unknown action, ignoring. " |
| + TextFormat.shortDebugString(flush)); |
| break; |
| } |
| |
| logRegionFiles(); |
| } finally { |
| closeRegionOperation(Operation.REPLAY_EVENT); |
| } |
| } |
| |
| /** |
| * Replay the flush marker from primary region by creating a corresponding snapshot of the store |
| * memstores, only if the memstores do not have a higher seqId from an earlier wal edit (because |
| * the events may be coming out of order). |
| */ |
| PrepareFlushResult replayWALFlushStartMarker(FlushDescriptor flush) throws IOException { |
| long flushSeqId = flush.getFlushSequenceNumber(); |
| |
| HashSet<HStore> storesToFlush = new HashSet<>(); |
| for (StoreFlushDescriptor storeFlush : flush.getStoreFlushesList()) { |
| byte[] family = storeFlush.getFamilyName().toByteArray(); |
| HStore store = getStore(family); |
| if (store == null) { |
| LOG.warn(getRegionInfo().getEncodedName() + " : " |
| + "Received a flush start marker from primary, but the family is not found. Ignoring" |
| + " StoreFlushDescriptor:" + TextFormat.shortDebugString(storeFlush)); |
| continue; |
| } |
| storesToFlush.add(store); |
| } |
| |
| MonitoredTask status = TaskMonitor.get().createStatus("Preparing flush " + this); |
| |
| // we will use writestate as a coarse-grain lock for all the replay events |
| // (flush, compaction, region open etc) |
| synchronized (writestate) { |
| try { |
| if (flush.getFlushSequenceNumber() < lastReplayedOpenRegionSeqId) { |
| LOG.warn(getRegionInfo().getEncodedName() + " : " + "Skipping replaying flush event :" |
| + TextFormat.shortDebugString(flush) |
| + " because its sequence id is smaller than this regions lastReplayedOpenRegionSeqId " |
| + " of " + lastReplayedOpenRegionSeqId); |
| return null; |
| } |
| if (numMutationsWithoutWAL.sum() > 0) { |
| numMutationsWithoutWAL.reset(); |
| dataInMemoryWithoutWAL.reset(); |
| } |
| |
| if (!writestate.flushing) { |
| // we do not have an active snapshot and corresponding this.prepareResult. This means |
| // we can just snapshot our memstores and continue as normal. |
| |
| // invoke prepareFlushCache. Send null as wal since we do not want the flush events in wal |
| PrepareFlushResult prepareResult = internalPrepareFlushCache(null, flushSeqId, |
| storesToFlush, status, false, FlushLifeCycleTracker.DUMMY); |
| if (prepareResult.result == null) { |
| // save the PrepareFlushResult so that we can use it later from commit flush |
| this.writestate.flushing = true; |
| this.prepareFlushResult = prepareResult; |
| status.markComplete("Flush prepare successful"); |
| if (LOG.isDebugEnabled()) { |
| LOG.debug(getRegionInfo().getEncodedName() + " : " + " Prepared flush with seqId:" |
| + flush.getFlushSequenceNumber()); |
| } |
| } else { |
| // special case empty memstore. We will still save the flush result in this case, since |
| // our memstore ie empty, but the primary is still flushing |
| if ( |
| prepareResult.getResult().getResult() |
| == FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY |
| ) { |
| this.writestate.flushing = true; |
| this.prepareFlushResult = prepareResult; |
| if (LOG.isDebugEnabled()) { |
| LOG.debug(getRegionInfo().getEncodedName() + " : " |
| + " Prepared empty flush with seqId:" + flush.getFlushSequenceNumber()); |
| } |
| } |
| status.abort("Flush prepare failed with " + prepareResult.result); |
| // nothing much to do. prepare flush failed because of some reason. |
| } |
| return prepareResult; |
| } else { |
| // we already have an active snapshot. |
| if (flush.getFlushSequenceNumber() == this.prepareFlushResult.flushOpSeqId) { |
| // They define the same flush. Log and continue. |
| LOG.warn(getRegionInfo().getEncodedName() + " : " |
| + "Received a flush prepare marker with the same seqId: " |
| + +flush.getFlushSequenceNumber() + " before clearing the previous one with seqId: " |
| + prepareFlushResult.flushOpSeqId + ". Ignoring"); |
| // ignore |
| } else if (flush.getFlushSequenceNumber() < this.prepareFlushResult.flushOpSeqId) { |
| // We received a flush with a smaller seqNum than what we have prepared. We can only |
| // ignore this prepare flush request. |
| LOG.warn(getRegionInfo().getEncodedName() + " : " |
| + "Received a flush prepare marker with a smaller seqId: " |
| + +flush.getFlushSequenceNumber() + " before clearing the previous one with seqId: " |
| + prepareFlushResult.flushOpSeqId + ". Ignoring"); |
| // ignore |
| } else { |
| // We received a flush with a larger seqNum than what we have prepared |
| LOG.warn(getRegionInfo().getEncodedName() + " : " |
| + "Received a flush prepare marker with a larger seqId: " |
| + +flush.getFlushSequenceNumber() + " before clearing the previous one with seqId: " |
| + prepareFlushResult.flushOpSeqId + ". Ignoring"); |
| // We do not have multiple active snapshots in the memstore or a way to merge current |
| // memstore snapshot with the contents and resnapshot for now. We cannot take |
| // another snapshot and drop the previous one because that will cause temporary |
| // data loss in the secondary. So we ignore this for now, deferring the resolution |
| // to happen when we see the corresponding flush commit marker. If we have a memstore |
| // snapshot with x, and later received another prepare snapshot with y (where x < y), |
| // when we see flush commit for y, we will drop snapshot for x, and can also drop all |
| // the memstore edits if everything in memstore is < y. This is the usual case for |
| // RS crash + recovery where we might see consequtive prepare flush wal markers. |
| // Otherwise, this will cause more memory to be used in secondary replica until a |
| // further prapare + commit flush is seen and replayed. |
| } |
| } |
| } finally { |
| status.cleanup(); |
| writestate.notifyAll(); |
| } |
| } |
| return null; |
| } |
| |
| @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "NN_NAKED_NOTIFY", |
| justification = "Intentional; post memstore flush") |
| void replayWALFlushCommitMarker(FlushDescriptor flush) throws IOException { |
| MonitoredTask status = TaskMonitor.get().createStatus("Committing flush " + this); |
| |
| // check whether we have the memstore snapshot with the corresponding seqId. Replay to |
| // secondary region replicas are in order, except for when the region moves or then the |
| // region server crashes. In those cases, we may receive replay requests out of order from |
| // the original seqIds. |
| synchronized (writestate) { |
| try { |
| if (flush.getFlushSequenceNumber() < lastReplayedOpenRegionSeqId) { |
| LOG.warn(getRegionInfo().getEncodedName() + " : " + "Skipping replaying flush event :" |
| + TextFormat.shortDebugString(flush) |
| + " because its sequence id is smaller than this regions lastReplayedOpenRegionSeqId " |
| + " of " + lastReplayedOpenRegionSeqId); |
| return; |
| } |
| |
| if (writestate.flushing) { |
| PrepareFlushResult prepareFlushResult = this.prepareFlushResult; |
| if (flush.getFlushSequenceNumber() == prepareFlushResult.flushOpSeqId) { |
| if (LOG.isDebugEnabled()) { |
| LOG.debug(getRegionInfo().getEncodedName() + " : " |
| + "Received a flush commit marker with seqId:" + flush.getFlushSequenceNumber() |
| + " and a previous prepared snapshot was found"); |
| } |
| // This is the regular case where we received commit flush after prepare flush |
| // corresponding to the same seqId. |
| replayFlushInStores(flush, prepareFlushResult, true); |
| |
| // Set down the memstore size by amount of flush. |
| this.decrMemStoreSize(prepareFlushResult.totalFlushableSize.getMemStoreSize()); |
| this.prepareFlushResult = null; |
| writestate.flushing = false; |
| } else if (flush.getFlushSequenceNumber() < prepareFlushResult.flushOpSeqId) { |
| // This should not happen normally. However, lets be safe and guard against these cases |
| // we received a flush commit with a smaller seqId than what we have prepared |
| // we will pick the flush file up from this commit (if we have not seen it), but we |
| // will not drop the memstore |
| LOG.warn(getRegionInfo().getEncodedName() + " : " |
| + "Received a flush commit marker with smaller seqId: " |
| + flush.getFlushSequenceNumber() + " than what we have prepared with seqId: " |
| + prepareFlushResult.flushOpSeqId + ". Picking up new file, but not dropping" |
| + " prepared memstore snapshot"); |
| replayFlushInStores(flush, prepareFlushResult, false); |
| |
| // snapshot is not dropped, so memstore sizes should not be decremented |
| // we still have the prepared snapshot, flushing should still be true |
| } else { |
| // This should not happen normally. However, lets be safe and guard against these cases |
| // we received a flush commit with a larger seqId than what we have prepared |
| // we will pick the flush file for this. We will also obtain the updates lock and |
| // look for contents of the memstore to see whether we have edits after this seqId. |
| // If not, we will drop all the memstore edits and the snapshot as well. |
| LOG.warn(getRegionInfo().getEncodedName() + " : " |
| + "Received a flush commit marker with larger seqId: " |
| + flush.getFlushSequenceNumber() + " than what we have prepared with seqId: " |
| + prepareFlushResult.flushOpSeqId + ". Picking up new file and dropping prepared" |
| + " memstore snapshot"); |
| |
| replayFlushInStores(flush, prepareFlushResult, true); |
| |
| // Set down the memstore size by amount of flush. |
| this.decrMemStoreSize(prepareFlushResult.totalFlushableSize.getMemStoreSize()); |
| |
| // Inspect the memstore contents to see whether the memstore contains only edits |
| // with seqId smaller than the flush seqId. If so, we can discard those edits. |
| dropMemStoreContentsForSeqId(flush.getFlushSequenceNumber(), null); |
| |
| this.prepareFlushResult = null; |
| writestate.flushing = false; |
| } |
| // If we were waiting for observing a flush or region opening event for not showing |
| // partial data after a secondary region crash, we can allow reads now. We can only make |
| // sure that we are not showing partial data (for example skipping some previous edits) |
| // until we observe a full flush start and flush commit. So if we were not able to find |
| // a previous flush we will not enable reads now. |
| this.setReadsEnabled(true); |
| } else { |
| LOG.warn( |
| getRegionInfo().getEncodedName() + " : " + "Received a flush commit marker with seqId:" |
| + flush.getFlushSequenceNumber() + ", but no previous prepared snapshot was found"); |
| // There is no corresponding prepare snapshot from before. |
| // We will pick up the new flushed file |
| replayFlushInStores(flush, null, false); |
| |
| // Inspect the memstore contents to see whether the memstore contains only edits |
| // with seqId smaller than the flush seqId. If so, we can discard those edits. |
| dropMemStoreContentsForSeqId(flush.getFlushSequenceNumber(), null); |
| } |
| |
| status.markComplete("Flush commit successful"); |
| |
| // Update the last flushed sequence id for region. |
| this.maxFlushedSeqId = flush.getFlushSequenceNumber(); |
| |
| // advance the mvcc read point so that the new flushed file is visible. |
| mvcc.advanceTo(flush.getFlushSequenceNumber()); |
| |
| } catch (FileNotFoundException ex) { |
| LOG.warn(getRegionInfo().getEncodedName() + " : " |
| + "At least one of the store files in flush: " + TextFormat.shortDebugString(flush) |
| + " doesn't exist any more. Skip loading the file(s)", ex); |
| } finally { |
| status.cleanup(); |
| writestate.notifyAll(); |
| } |
| } |
| |
| // C. Finally notify anyone waiting on memstore to clear: |
| // e.g. checkResources(). |
| synchronized (this) { |
| notifyAll(); // FindBugs NN_NAKED_NOTIFY |
| } |
| } |
| |
| /** |
| * Replays the given flush descriptor by opening the flush files in stores and dropping the |
| * memstore snapshots if requested. |
| */ |
| private void replayFlushInStores(FlushDescriptor flush, PrepareFlushResult prepareFlushResult, |
| boolean dropMemstoreSnapshot) throws IOException { |
| for (StoreFlushDescriptor storeFlush : flush.getStoreFlushesList()) { |
| byte[] family = storeFlush.getFamilyName().toByteArray(); |
| HStore store = getStore(family); |
| if (store == null) { |
| LOG.warn(getRegionInfo().getEncodedName() + " : " |
| + "Received a flush commit marker from primary, but the family is not found." |
| + "Ignoring StoreFlushDescriptor:" + storeFlush); |
| continue; |
| } |
| List<String> flushFiles = storeFlush.getFlushOutputList(); |
| StoreFlushContext ctx = null; |
| long startTime = EnvironmentEdgeManager.currentTime(); |
| if (prepareFlushResult == null || prepareFlushResult.storeFlushCtxs == null) { |
| ctx = store.createFlushContext(flush.getFlushSequenceNumber(), FlushLifeCycleTracker.DUMMY); |
| } else { |
| ctx = prepareFlushResult.storeFlushCtxs.get(family); |
| startTime = prepareFlushResult.startTime; |
| } |
| |
| if (ctx == null) { |
| LOG.warn(getRegionInfo().getEncodedName() + " : " |
| + "Unexpected: flush commit marker received from store " + Bytes.toString(family) |
| + " but no associated flush context. Ignoring"); |
| continue; |
| } |
| |
| ctx.replayFlush(flushFiles, dropMemstoreSnapshot); // replay the flush |
| |
| // Record latest flush time |
| this.lastStoreFlushTimeMap.put(store, startTime); |
| } |
| } |
| |
| private long loadRecoveredHFilesIfAny(Collection<HStore> stores) throws IOException { |
| Path regionDir = fs.getRegionDir(); |
| long maxSeqId = -1; |
| for (HStore store : stores) { |
| String familyName = store.getColumnFamilyName(); |
| FileStatus[] files = |
| WALSplitUtil.getRecoveredHFiles(fs.getFileSystem(), regionDir, familyName); |
| if (files != null && files.length != 0) { |
| for (FileStatus file : files) { |
| Path filePath = file.getPath(); |
| // If file length is zero then delete it |
| if (isZeroLengthThenDelete(fs.getFileSystem(), file, filePath)) { |
| continue; |
| } |
| try { |
| HStoreFile storefile = store.tryCommitRecoveredHFile(file.getPath()); |
| maxSeqId = Math.max(maxSeqId, storefile.getReader().getSequenceID()); |
| } catch (IOException e) { |
| handleException(fs.getFileSystem(), filePath, e); |
| continue; |
| } |
| } |
| if (this.rsServices != null && store.needsCompaction()) { |
| this.rsServices.getCompactionRequestor().requestCompaction(this, store, |
| "load recovered hfiles request compaction", Store.PRIORITY_USER + 1, |
| CompactionLifeCycleTracker.DUMMY, null); |
| } |
| } |
| } |
| return maxSeqId; |
| } |
| |
| /** |
| * Be careful, this method will drop all data in the memstore of this region. Currently, this |
| * method is used to drop memstore to prevent memory leak when replaying recovered.edits while |
| * opening region. |
| */ |
| private MemStoreSize dropMemStoreContents() throws IOException { |
| MemStoreSizing totalFreedSize = new NonThreadSafeMemStoreSizing(); |
| this.updatesLock.writeLock().lock(); |
| try { |
| for (HStore s : stores.values()) { |
| MemStoreSize memStoreSize = doDropStoreMemStoreContentsForSeqId(s, HConstants.NO_SEQNUM); |
| LOG.info("Drop memstore for Store " + s.getColumnFamilyName() + " in region " |
| + this.getRegionInfo().getRegionNameAsString() + " , dropped memstoresize: [" |
| + memStoreSize + " }"); |
| totalFreedSize.incMemStoreSize(memStoreSize); |
| } |
| return totalFreedSize.getMemStoreSize(); |
| } finally { |
| this.updatesLock.writeLock().unlock(); |
| } |
| } |
| |
| /** |
| * Drops the memstore contents after replaying a flush descriptor or region open event replay if |
| * the memstore edits have seqNums smaller than the given seq id |
| */ |
| private MemStoreSize dropMemStoreContentsForSeqId(long seqId, HStore store) throws IOException { |
| MemStoreSizing totalFreedSize = new NonThreadSafeMemStoreSizing(); |
| this.updatesLock.writeLock().lock(); |
| try { |
| |
| long currentSeqId = mvcc.getReadPoint(); |
| if (seqId >= currentSeqId) { |
| // then we can drop the memstore contents since everything is below this seqId |
| LOG.info(getRegionInfo().getEncodedName() + " : " |
| + "Dropping memstore contents as well since replayed flush seqId: " + seqId |
| + " is greater than current seqId:" + currentSeqId); |
| |
| // Prepare flush (take a snapshot) and then abort (drop the snapshot) |
| if (store == null) { |
| for (HStore s : stores.values()) { |
| totalFreedSize.incMemStoreSize(doDropStoreMemStoreContentsForSeqId(s, currentSeqId)); |
| } |
| } else { |
| totalFreedSize.incMemStoreSize(doDropStoreMemStoreContentsForSeqId(store, currentSeqId)); |
| } |
| } else { |
| LOG.info(getRegionInfo().getEncodedName() + " : " |
| + "Not dropping memstore contents since replayed flush seqId: " + seqId |
| + " is smaller than current seqId:" + currentSeqId); |
| } |
| } finally { |
| this.updatesLock.writeLock().unlock(); |
| } |
| return totalFreedSize.getMemStoreSize(); |
| } |
| |
| private MemStoreSize doDropStoreMemStoreContentsForSeqId(HStore s, long currentSeqId) |
| throws IOException { |
| MemStoreSize flushableSize = s.getFlushableSize(); |
| this.decrMemStoreSize(flushableSize); |
| StoreFlushContext ctx = s.createFlushContext(currentSeqId, FlushLifeCycleTracker.DUMMY); |
| ctx.prepare(); |
| ctx.abort(); |
| return flushableSize; |
| } |
| |
| private void replayWALFlushAbortMarker(FlushDescriptor flush) { |
| // nothing to do for now. A flush abort will cause a RS abort which means that the region |
| // will be opened somewhere else later. We will see the region open event soon, and replaying |
| // that will drop the snapshot |
| } |
| |
| private void replayWALFlushCannotFlushMarker(FlushDescriptor flush, long replaySeqId) { |
| synchronized (writestate) { |
| if (this.lastReplayedOpenRegionSeqId > replaySeqId) { |
| LOG.warn(getRegionInfo().getEncodedName() + " : " + "Skipping replaying flush event :" |
| + TextFormat.shortDebugString(flush) + " because its sequence id " + replaySeqId |
| + " is smaller than this regions " + "lastReplayedOpenRegionSeqId of " |
| + lastReplayedOpenRegionSeqId); |
| return; |
| } |
| |
| // If we were waiting for observing a flush or region opening event for not showing partial |
| // data after a secondary region crash, we can allow reads now. This event means that the |
| // primary was not able to flush because memstore is empty when we requested flush. By the |
| // time we observe this, we are guaranteed to have up to date seqId with our previous |
| // assignment. |
| this.setReadsEnabled(true); |
| } |
| } |
| |
| PrepareFlushResult getPrepareFlushResult() { |
| return prepareFlushResult; |
| } |
| |
| @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "NN_NAKED_NOTIFY", |
| justification = "Intentional; cleared the memstore") |
| void replayWALRegionEventMarker(RegionEventDescriptor regionEvent) throws IOException { |
| checkTargetRegion(regionEvent.getEncodedRegionName().toByteArray(), |
| "RegionEvent marker from WAL ", regionEvent); |
| |
| startRegionOperation(Operation.REPLAY_EVENT); |
| try { |
| if (ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) { |
| return; // if primary nothing to do |
| } |
| |
| if (regionEvent.getEventType() == EventType.REGION_CLOSE) { |
| // nothing to do on REGION_CLOSE for now. |
| return; |
| } |
| if (regionEvent.getEventType() != EventType.REGION_OPEN) { |
| LOG.warn(getRegionInfo().getEncodedName() + " : " |
| + "Unknown region event received, ignoring :" + TextFormat.shortDebugString(regionEvent)); |
| return; |
| } |
| |
| if (LOG.isDebugEnabled()) { |
| LOG.debug(getRegionInfo().getEncodedName() + " : " + "Replaying region open event marker " |
| + TextFormat.shortDebugString(regionEvent)); |
| } |
| |
| // we will use writestate as a coarse-grain lock for all the replay events |
| synchronized (writestate) { |
| // Replication can deliver events out of order when primary region moves or the region |
| // server crashes, since there is no coordination between replication of different wal files |
| // belonging to different region servers. We have to safe guard against this case by using |
| // region open event's seqid. Since this is the first event that the region puts (after |
| // possibly flushing recovered.edits), after seeing this event, we can ignore every edit |
| // smaller than this seqId |
| if (this.lastReplayedOpenRegionSeqId <= regionEvent.getLogSequenceNumber()) { |
| this.lastReplayedOpenRegionSeqId = regionEvent.getLogSequenceNumber(); |
| } else { |
| LOG.warn(getRegionInfo().getEncodedName() + " : " + "Skipping replaying region event :" |
| + TextFormat.shortDebugString(regionEvent) |
| + " because its sequence id is smaller than this regions lastReplayedOpenRegionSeqId " |
| + " of " + lastReplayedOpenRegionSeqId); |
| return; |
| } |
| |
| // region open lists all the files that the region has at the time of the opening. Just pick |
| // all the files and drop prepared flushes and empty memstores |
| for (StoreDescriptor storeDescriptor : regionEvent.getStoresList()) { |
| // stores of primary may be different now |
| byte[] family = storeDescriptor.getFamilyName().toByteArray(); |
| HStore store = getStore(family); |
| if (store == null) { |
| LOG.warn(getRegionInfo().getEncodedName() + " : " |
| + "Received a region open marker from primary, but the family is not found. " |
| + "Ignoring. StoreDescriptor:" + storeDescriptor); |
| continue; |
| } |
| |
| long storeSeqId = store.getMaxSequenceId().orElse(0L); |
| List<String> storeFiles = storeDescriptor.getStoreFileList(); |
| try { |
| store.refreshStoreFiles(storeFiles); // replace the files with the new ones |
| } catch (FileNotFoundException ex) { |
| LOG.warn(getRegionInfo().getEncodedName() + " : " + "At least one of the store files: " |
| + storeFiles + " doesn't exist any more. Skip loading the file(s)", ex); |
| continue; |
| } |
| if (store.getMaxSequenceId().orElse(0L) != storeSeqId) { |
| // Record latest flush time if we picked up new files |
| lastStoreFlushTimeMap.put(store, EnvironmentEdgeManager.currentTime()); |
| } |
| |
| if (writestate.flushing) { |
| // only drop memstore snapshots if they are smaller than last flush for the store |
| if (this.prepareFlushResult.flushOpSeqId <= regionEvent.getLogSequenceNumber()) { |
| StoreFlushContext ctx = this.prepareFlushResult.storeFlushCtxs == null |
| ? null |
| : this.prepareFlushResult.storeFlushCtxs.get(family); |
| if (ctx != null) { |
| MemStoreSize mss = store.getFlushableSize(); |
| ctx.abort(); |
| this.decrMemStoreSize(mss); |
| this.prepareFlushResult.storeFlushCtxs.remove(family); |
| } |
| } |
| } |
| |
| // Drop the memstore contents if they are now smaller than the latest seen flushed file |
| dropMemStoreContentsForSeqId(regionEvent.getLogSequenceNumber(), store); |
| if (storeSeqId > this.maxFlushedSeqId) { |
| this.maxFlushedSeqId = storeSeqId; |
| } |
| } |
| |
| // if all stores ended up dropping their snapshots, we can safely drop the |
| // prepareFlushResult |
| dropPrepareFlushIfPossible(); |
| |
| // advance the mvcc read point so that the new flushed file is visible. |
| mvcc.await(); |
| |
| // If we were waiting for observing a flush or region opening event for not showing partial |
| // data after a secondary region crash, we can allow reads now. |
| this.setReadsEnabled(true); |
| |
| // C. Finally notify anyone waiting on memstore to clear: |
| // e.g. checkResources(). |
| synchronized (this) { |
| notifyAll(); // FindBugs NN_NAKED_NOTIFY |
| } |
| } |
| logRegionFiles(); |
| } finally { |
| closeRegionOperation(Operation.REPLAY_EVENT); |
| } |
| } |
| |
| void replayWALBulkLoadEventMarker(WALProtos.BulkLoadDescriptor bulkLoadEvent) throws IOException { |
| checkTargetRegion(bulkLoadEvent.getEncodedRegionName().toByteArray(), |
| "BulkLoad marker from WAL ", bulkLoadEvent); |
| |
| if (ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) { |
| return; // if primary nothing to do |
| } |
| |
| if (LOG.isDebugEnabled()) { |
| LOG.debug(getRegionInfo().getEncodedName() + " : " + "Replaying bulkload event marker " |
| + TextFormat.shortDebugString(bulkLoadEvent)); |
| } |
| // check if multiple families involved |
| boolean multipleFamilies = false; |
| byte[] family = null; |
| for (StoreDescriptor storeDescriptor : bulkLoadEvent.getStoresList()) { |
| byte[] fam = storeDescriptor.getFamilyName().toByteArray(); |
| if (family == null) { |
| family = fam; |
| } else if (!Bytes.equals(family, fam)) { |
| multipleFamilies = true; |
| break; |
| } |
| } |
| |
| startBulkRegionOperation(multipleFamilies); |
| try { |
| // we will use writestate as a coarse-grain lock for all the replay events |
| synchronized (writestate) { |
| // Replication can deliver events out of order when primary region moves or the region |
| // server crashes, since there is no coordination between replication of different wal files |
| // belonging to different region servers. We have to safe guard against this case by using |
| // region open event's seqid. Since this is the first event that the region puts (after |
| // possibly flushing recovered.edits), after seeing this event, we can ignore every edit |
| // smaller than this seqId |
| if ( |
| bulkLoadEvent.getBulkloadSeqNum() >= 0 |
| && this.lastReplayedOpenRegionSeqId >= bulkLoadEvent.getBulkloadSeqNum() |
| ) { |
| LOG.warn(getRegionInfo().getEncodedName() + " : " + "Skipping replaying bulkload event :" |
| + TextFormat.shortDebugString(bulkLoadEvent) |
| + " because its sequence id is smaller than this region's lastReplayedOpenRegionSeqId" |
| + " =" + lastReplayedOpenRegionSeqId); |
| |
| return; |
| } |
| |
| for (StoreDescriptor storeDescriptor : bulkLoadEvent.getStoresList()) { |
| // stores of primary may be different now |
| family = storeDescriptor.getFamilyName().toByteArray(); |
| HStore store = getStore(family); |
| if (store == null) { |
| LOG.warn(getRegionInfo().getEncodedName() + " : " |
| + "Received a bulk load marker from primary, but the family is not found. " |
| + "Ignoring. StoreDescriptor:" + storeDescriptor); |
| continue; |
| } |
| |
| List<String> storeFiles = storeDescriptor.getStoreFileList(); |
| for (String storeFile : storeFiles) { |
| StoreFileInfo storeFileInfo = null; |
| try { |
| storeFileInfo = fs.getStoreFileInfo(Bytes.toString(family), storeFile); |
| store.bulkLoadHFile(storeFileInfo); |
| } catch (FileNotFoundException ex) { |
| LOG.warn(getRegionInfo().getEncodedName() + " : " |
| + ((storeFileInfo != null) |
| ? storeFileInfo.toString() |
| : (new Path(Bytes.toString(family), storeFile)).toString()) |
| + " doesn't exist any more. Skip loading the file"); |
| } |
| } |
| } |
| } |
| if (bulkLoadEvent.getBulkloadSeqNum() > 0) { |
| mvcc.advanceTo(bulkLoadEvent.getBulkloadSeqNum()); |
| } |
| } finally { |
| closeBulkRegionOperation(); |
| } |
| } |
| |
| /** |
| * If all stores ended up dropping their snapshots, we can safely drop the prepareFlushResult |
| */ |
| private void dropPrepareFlushIfPossible() { |
| if (writestate.flushing) { |
| boolean canDrop = true; |
| if (prepareFlushResult.storeFlushCtxs != null) { |
| for (Entry<byte[], StoreFlushContext> entry : prepareFlushResult.storeFlushCtxs |
| .entrySet()) { |
| HStore store = getStore(entry.getKey()); |
| if (store == null) { |
| continue; |
| } |
| if (store.getSnapshotSize().getDataSize() > 0) { |
| canDrop = false; |
| break; |
| } |
| } |
| } |
| |
| // this means that all the stores in the region has finished flushing, but the WAL marker |
| // may not have been written or we did not receive it yet. |
| if (canDrop) { |
| writestate.flushing = false; |
| this.prepareFlushResult = null; |
| } |
| } |
| } |
| |
/**
 * Refreshes the store files of this region without forcing; simply delegates to
 * {@link #refreshStoreFiles(boolean)} with {@code force = false}.
 * @return whatever {@code refreshStoreFiles(false)} returns
 */
@Override
public boolean refreshStoreFiles() throws IOException {
  return refreshStoreFiles(false);
}
| |
| @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "NN_NAKED_NOTIFY", |
| justification = "Notify is about post replay. Intentional") |
| protected boolean refreshStoreFiles(boolean force) throws IOException { |
| if (!force && ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) { |
| return false; // if primary nothing to do |
| } |
| |
| if (LOG.isDebugEnabled()) { |
| LOG.debug(getRegionInfo().getEncodedName() + " : " |
| + "Refreshing store files to see whether we can free up memstore"); |
| } |
| |
| long totalFreedDataSize = 0; |
| |
| long smallestSeqIdInStores = Long.MAX_VALUE; |
| |
| startRegionOperation(); // obtain region close lock |
| try { |
| Map<HStore, Long> map = new HashMap<>(); |
| synchronized (writestate) { |
| for (HStore store : stores.values()) { |
| // TODO: some stores might see new data from flush, while others do not which |
| // MIGHT break atomic edits across column families. |
| long maxSeqIdBefore = store.getMaxSequenceId().orElse(0L); |
| |
| // refresh the store files. This is similar to observing a region open wal marker. |
| store.refreshStoreFiles(); |
| |
| long storeSeqId = store.getMaxSequenceId().orElse(0L); |
| if (storeSeqId < smallestSeqIdInStores) { |
| smallestSeqIdInStores = storeSeqId; |
| } |
| |
| // see whether we can drop the memstore or the snapshot |
| if (storeSeqId > maxSeqIdBefore) { |
| if (writestate.flushing) { |
| // only drop memstore snapshots if they are smaller than last flush for the store |
| if (this.prepareFlushResult.flushOpSeqId <= storeSeqId) { |
| StoreFlushContext ctx = this.prepareFlushResult.storeFlushCtxs == null |
| ? null |
| : this.prepareFlushResult.storeFlushCtxs |
| .get(store.getColumnFamilyDescriptor().getName()); |
| if (ctx != null) { |
| MemStoreSize mss = store.getFlushableSize(); |
| ctx.abort(); |
| this.decrMemStoreSize(mss); |
| this.prepareFlushResult.storeFlushCtxs |
| .remove(store.getColumnFamilyDescriptor().getName()); |
| totalFreedDataSize += mss.getDataSize(); |
| } |
| } |
| } |
| |
| map.put(store, storeSeqId); |
| } |
| } |
| |
| // if all stores ended up dropping their snapshots, we can safely drop the |
| // prepareFlushResult |
| dropPrepareFlushIfPossible(); |
| |
| // advance the mvcc read point so that the new flushed files are visible. |
| // either greater than flush seq number or they were already picked up via flush. |
| for (HStore s : stores.values()) { |
| mvcc.advanceTo(s.getMaxMemStoreTS().orElse(0L)); |
| } |
| |
| // smallestSeqIdInStores is the seqId that we have a corresponding hfile for. We can safely |
| // skip all edits that are to be replayed in the future with that has a smaller seqId |
| // than this. We are updating lastReplayedOpenRegionSeqId so that we can skip all edits |
| // that we have picked the flush files for |
| if (this.lastReplayedOpenRegionSeqId < smallestSeqIdInStores) { |
| this.lastReplayedOpenRegionSeqId = smallestSeqIdInStores; |
| } |
| } |
| if (!map.isEmpty()) { |
| for (Map.Entry<HStore, Long> entry : map.entrySet()) { |
| // Drop the memstore contents if they are now smaller than the latest seen flushed file |
| totalFreedDataSize += |
| dropMemStoreContentsForSeqId(entry.getValue(), entry.getKey()).getDataSize(); |
| } |
| } |
| // C. Finally notify anyone waiting on memstore to clear: |
| // e.g. checkResources(). |
| synchronized (this) { |
| notifyAll(); // FindBugs NN_NAKED_NOTIFY |
| } |
| return totalFreedDataSize > 0; |
| } finally { |
| closeRegionOperation(); |
| } |
| } |
| |
| private void logRegionFiles() { |
| if (LOG.isTraceEnabled()) { |
| LOG.trace(getRegionInfo().getEncodedName() + " : Store files for region: "); |
| stores.values().stream().filter(s -> s.getStorefiles() != null) |
| .flatMap(s -> s.getStorefiles().stream()) |
| .forEachOrdered(sf -> LOG.trace(getRegionInfo().getEncodedName() + " : " + sf)); |
| } |
| } |
| |
| /** |
| * Checks whether the given regionName is either equal to our region, or that the regionName is |
| * the primary region to our corresponding range for the secondary replica. |
| */ |
| private void checkTargetRegion(byte[] encodedRegionName, String exceptionMsg, Object payload) |
| throws WrongRegionException { |
| if (Bytes.equals(this.getRegionInfo().getEncodedNameAsBytes(), encodedRegionName)) { |
| return; |
| } |
| |
| if ( |
| !RegionReplicaUtil.isDefaultReplica(this.getRegionInfo()) |
| && Bytes.equals(encodedRegionName, this.fs.getRegionInfoForFS().getEncodedNameAsBytes()) |
| ) { |
| return; |
| } |
| |
| throw new WrongRegionException( |
| exceptionMsg + payload + " targetted for region " + Bytes.toStringBinary(encodedRegionName) |
| + " does not match this region: " + this.getRegionInfo()); |
| } |
| |
/**
 * Used by tests. Adds a single cell to the given store by calling
 * {@code s.add(cell, memstoreAccounting)}.
 * @param s                  Store to add the edit to.
 * @param cell               Cell to add.
 * @param memstoreAccounting accounting object handed through unchanged to the store's
 *                           {@code add} call
 */
protected void restoreEdit(HStore s, ExtendedCell cell, MemStoreSizing memstoreAccounting) {
  s.add(cell, memstoreAccounting);
}
| |
| /** |
| * make sure have been through lease recovery before get file status, so the file length can be |
| * trusted. |
| * @param p File to check. |
| * @return True if file was zero-length (and if so, we'll delete it in here). |
| */ |
| private static boolean isZeroLengthThenDelete(final FileSystem fs, final FileStatus stat, |
| final Path p) throws IOException { |
| if (stat.getLen() > 0) { |
| return false; |
| } |
| LOG.warn("File " + p + " is zero-length, deleting."); |
| fs.delete(p, false); |
| return true; |
| } |
| |
| protected HStore instantiateHStore(final ColumnFamilyDescriptor family, boolean warmup) |
| throws IOException { |
| if (family.isMobEnabled()) { |
| if (HFile.getFormatVersion(this.conf) < HFile.MIN_FORMAT_VERSION_WITH_TAGS) { |
| throw new IOException("A minimum HFile version of " + HFile.MIN_FORMAT_VERSION_WITH_TAGS |
| + " is required for MOB feature. Consider setting " + HFile.FORMAT_VERSION_KEY |
| + " accordingly."); |
| } |
| return new HMobStore(this, family, this.conf, warmup); |
| } |
| return new HStore(this, family, this.conf, warmup); |
| } |
| |
/**
 * Looks up the store registered for the given column family name.
 * @param column column family name used as the key into {@code stores}
 * @return the matching store, or whatever {@code stores.get} yields when there is none (callers
 *         in this class treat {@code null} as "no such family")
 */
@Override
public HStore getStore(byte[] column) {
  return this.stores.get(column);
}
| |
| /** |
| * Return HStore instance. Does not do any copy: as the number of store is limited, we iterate on |
| * the list. |
| */ |
| private HStore getStore(Cell cell) { |
| return stores.entrySet().stream().filter(e -> CellUtil.matchingFamily(cell, e.getKey())) |
| .map(e -> e.getValue()).findFirst().orElse(null); |
| } |
| |
/**
 * Returns the stores of this region.
 * @return a new {@link ArrayList} holding the current values of {@code stores}; changes to the
 *         returned list do not affect the region's own map
 */
@Override
public List<HStore> getStores() {
  return new ArrayList<>(stores.values());
}
| |
| @Override |
| public List<String> getStoreFileList(byte[][] columns) throws IllegalArgumentException { |
| List<String> storeFileNames = new ArrayList<>(); |
| synchronized (closeLock) { |
| for (byte[] column : columns) { |
| HStore store = this.stores.get(column); |
| if (store == null) { |
| throw new IllegalArgumentException( |
| "No column family : " + new String(column, StandardCharsets.UTF_8) + " available"); |
| } |
| Collection<HStoreFile> storeFiles = store.getStorefiles(); |
| if (storeFiles == null) { |
| continue; |
| } |
| for (HStoreFile storeFile : storeFiles) { |
| storeFileNames.add(storeFile.getPath().toString()); |
| } |
| |
| logRegionFiles(); |
| } |
| } |
| return storeFileNames; |
| } |
| |
| ////////////////////////////////////////////////////////////////////////////// |
| // Support code |
| ////////////////////////////////////////////////////////////////////////////// |
| |
| /** Make sure this is a valid row for the HRegion */ |
| void checkRow(byte[] row, String op) throws IOException { |
| if (!rowIsInRange(getRegionInfo(), row)) { |
| throw new WrongRegionException("Requested row out of range for " + op + " on HRegion " + this |
| + ", startKey='" + Bytes.toStringBinary(getRegionInfo().getStartKey()) + "', getEndKey()='" |
| + Bytes.toStringBinary(getRegionInfo().getEndKey()) + "', row='" + Bytes.toStringBinary(row) |
| + "'"); |
| } |
| } |
| |
/**
 * Get an exclusive (write) lock on a given row. Delegates to
 * {@link #getRowLock(byte[], boolean)} with {@code readLock = false}.
 * @param row Which row to lock.
 * @return A locked RowLock. The lock is exclusive and already acquired.
 */
public RowLock getRowLock(byte[] row) throws IOException {
  return getRowLock(row, false);
}
| |
/**
 * Validates that the row belongs to this region and then acquires a lock on it.
 * @param row      row to lock; checked with {@link #checkRow(byte[], String)} first
 * @param readLock true to take the shared read lock, false to take the write lock
 * @return the acquired lock
 * @throws IOException if the row is out of range or the lock could not be acquired
 */
@Override
public RowLock getRowLock(byte[] row, boolean readLock) throws IOException {
  checkRow(row, "row lock");
  // no previously held lock is offered for reuse here
  return getRowLock(row, readLock, null);
}
| |
| Span createRegionSpan(String name) { |
| return TraceUtil.createSpan(name).setAttribute(REGION_NAMES_KEY, |
| Collections.singletonList(getRegionInfo().getRegionNameAsString())); |
| } |
| |
| // will be override in tests |
| protected RowLock getRowLockInternal(byte[] row, boolean readLock, RowLock prevRowLock) |
| throws IOException { |
| // create an object to use a a key in the row lock map |
| HashedBytes rowKey = new HashedBytes(row); |
| |
| RowLockContext rowLockContext = null; |
| RowLockImpl result = null; |
| |
| boolean success = false; |
| try { |
| // Keep trying until we have a lock or error out. |
| // TODO: do we need to add a time component here? |
| while (result == null) { |
| rowLockContext = computeIfAbsent(lockedRows, rowKey, () -> new RowLockContext(rowKey)); |
| // Now try an get the lock. |
| // This can fail as |
| if (readLock) { |
| // For read lock, if the caller has locked the same row previously, it will not try |
| // to acquire the same read lock. It simply returns the previous row lock. |
| RowLockImpl prevRowLockImpl = (RowLockImpl) prevRowLock; |
| if ( |
| (prevRowLockImpl != null) |
| && (prevRowLockImpl.getLock() == rowLockContext.readWriteLock.readLock()) |
| ) { |
| success = true; |
| return prevRowLock; |
| } |
| result = rowLockContext.newReadLock(); |
| } else { |
| result = rowLockContext.newWriteLock(); |
| } |
| } |
| |
| int timeout = rowLockWaitDuration; |
| boolean reachDeadlineFirst = false; |
| Optional<RpcCall> call = RpcServer.getCurrentCall(); |
| if (call.isPresent()) { |
| long deadline = call.get().getDeadline(); |
| if (deadline < Long.MAX_VALUE) { |
| int timeToDeadline = (int) (deadline - EnvironmentEdgeManager.currentTime()); |
| if (timeToDeadline <= this.rowLockWaitDuration) { |
| reachDeadlineFirst = true; |
| timeout = timeToDeadline; |
| } |
| } |
| } |
| |
| if (timeout <= 0 || !result.getLock().tryLock(timeout, TimeUnit.MILLISECONDS)) { |
| String message = "Timed out waiting for lock for row: " + rowKey + " in region " |
| + getRegionInfo().getEncodedName(); |
| if (reachDeadlineFirst) { |
| throw new TimeoutIOException(message); |
| } else { |
| // If timeToDeadline is larger than rowLockWaitDuration, we can not drop the request. |
| throw new IOException(message); |
| } |
| } |
| rowLockContext.setThreadName(Thread.currentThread().getName()); |
| success = true; |
| return result; |
| } catch (InterruptedException ie) { |
| if (LOG.isDebugEnabled()) { |
| LOG.debug("Thread interrupted waiting for lock on row: {}, in region {}", rowKey, |
| getRegionInfo().getRegionNameAsString()); |
| } |
| throw throwOnInterrupt(ie); |
| } catch (Error error) { |
| // The maximum lock count for read lock is 64K (hardcoded), when this maximum count |
| // is reached, it will throw out an Error. This Error needs to be caught so it can |
| // go ahead to process the minibatch with lock acquired. |
| LOG.warn("Error to get row lock for {}, in region {}, cause: {}", Bytes.toStringBinary(row), |
| getRegionInfo().getRegionNameAsString(), error); |
| IOException ioe = new IOException(error); |
| throw ioe; |
| } finally { |
| // Clean up the counts just in case this was the thing keeping the context alive. |
| if (!success && rowLockContext != null) { |
| rowLockContext.cleanUp(); |
| } |
| } |
| } |
| |
/**
 * Acquires a row lock through {@link #getRowLockInternal(byte[], boolean, RowLock)}, running the
 * call via {@code TraceUtil.trace} with a span created by
 * {@code createRegionSpan("Region.getRowLock")} that carries the {@code readLock} flag as the
 * {@code ROW_LOCK_READ_LOCK_KEY} attribute.
 * @param row         row to lock
 * @param readLock    true for a read lock, false for a write lock
 * @param prevRowLock lock already held by the caller, or null; handed to the internal method
 * @return the lock returned by {@code getRowLockInternal}
 */
private RowLock getRowLock(byte[] row, boolean readLock, final RowLock prevRowLock)
  throws IOException {
  return TraceUtil.trace(() -> getRowLockInternal(row, readLock, prevRowLock),
    () -> createRegionSpan("Region.getRowLock").setAttribute(ROW_LOCK_READ_LOCK_KEY, readLock));
}
| |
| private void releaseRowLocks(List<RowLock> rowLocks) { |
| if (rowLocks != null) { |
| for (RowLock rowLock : rowLocks) { |
| rowLock.release(); |
| } |
| rowLocks.clear(); |
| } |
| } |
| |
/** Returns the read-lock count reported by this region's {@code lock}. */
public int getReadLockCount() {
  return lock.getReadLockCount();
}
| |
/**
 * Returns the {@code lockedRows} map itself (not a copy): the per-row lock contexts keyed by the
 * hashed row bytes.
 */
public ConcurrentHashMap<HashedBytes, RowLockContext> getLockedRows() {
  return lockedRows;
}
| |
| class RowLockContext { |
| private final HashedBytes row; |
| final ReadWriteLock readWriteLock = new ReentrantReadWriteLock(true); |
| final AtomicBoolean usable = new AtomicBoolean(true); |
| final AtomicInteger count = new AtomicInteger(0); |
| final Object lock = new Object(); |
| private String threadName; |
| |
| RowLockContext(HashedBytes row) { |
| this.row = row; |
| } |
| |
| RowLockImpl newWriteLock() { |
| Lock l = readWriteLock.writeLock(); |
| return getRowLock(l); |
| } |
| |
| RowLockImpl newReadLock() { |
| Lock l = readWriteLock.readLock(); |
| return getRowLock(l); |
| } |
| |
| private RowLockImpl getRowLock(Lock l) { |
| count.incrementAndGet(); |
| synchronized (lock) { |
| if (usable.get()) { |
| return new RowLockImpl(this, l); |
| } else { |
| return null; |
| } |
| } |
| } |
| |
| void cleanUp() { |
| long c = count.decrementAndGet(); |
| if (c <= 0) { |
| synchronized (lock) { |
| if (count.get() <= 0 && usable.get()) { // Don't attempt to remove row if already removed |
| usable.set(false); |
| RowLockContext removed = lockedRows.remove(row); |
| assert removed == this : "we should never remove a different context"; |
| } |
| } |
| } |
| } |
| |
| public void setThreadName(String threadName) { |
| this.threadName = threadName; |
| } |
| |
| @Override |
| public String toString() { |
| return "RowLockContext{" + "row=" + row + ", readWriteLock=" + readWriteLock + ", count=" |
| + count + ", threadName=" + threadName + '}'; |
| } |
| } |
| |
| /** |
| * Class used to represent a lock on a row. |
| */ |
| public static class RowLockImpl implements RowLock { |
| private final RowLockContext context; |
| private final Lock lock; |
| |
| public RowLockImpl(RowLockContext context, Lock lock) { |
| this.context = context; |
| this.lock = lock; |
| } |
| |
| public Lock getLock() { |
| return lock; |
| } |
| |
| public RowLockContext getContext() { |
| return context; |
| } |
| |
| @Override |
| public void release() { |
| lock.unlock(); |
| context.cleanUp(); |
| } |
| |
| @Override |
| public String toString() { |
| return "RowLockImpl{" + "context=" + context + ", lock=" + lock + '}'; |
| } |
| } |
| |
| /** |
| * Determines whether multiple column families are present Precondition: familyPaths is not null |
| * @param familyPaths List of (column family, hfilePath) |
| */ |
| private static boolean hasMultipleColumnFamilies(Collection<Pair<byte[], String>> familyPaths) { |
| boolean multipleFamilies = false; |
| byte[] family = null; |
| for (Pair<byte[], String> pair : familyPaths) { |
| byte[] fam = pair.getFirst(); |
| if (family == null) { |
| family = fam; |
| } else if (!Bytes.equals(family, fam)) { |
| multipleFamilies = true; |
| break; |
| } |
| } |
| return multipleFamilies; |
| } |
| |
/**
 * Attempts to atomically load a group of hfiles. This is critical for loading rows with multiple
 * column families atomically.
 * <p>
 * Delegates to the six-argument overload with {@code copyFile = false},
 * {@code clusterIds = null} and {@code replicate = true}.
 * @param familyPaths      List of Pair&lt;byte[] column family, String hfilePath&gt;
 * @param assignSeqId      passed through to the six-argument overload
 * @param bulkLoadListener Internal hooks enabling massaging/preparation of a file about to be
 *                         bulk loaded
 * @return Map from family to List of store file paths if successful, null if failed recoverably
 * @throws IOException if failed unrecoverably.
 */
public Map<byte[], List<Path>> bulkLoadHFiles(Collection<Pair<byte[], String>> familyPaths,
  boolean assignSeqId, BulkLoadListener bulkLoadListener) throws IOException {
  return bulkLoadHFiles(familyPaths, assignSeqId, bulkLoadListener, false, null, true);
}
| |
/**
 * Listener class to enable callers of bulkLoadHFile() to perform any necessary pre/post
 * processing of a given bulkload call
 */
public interface BulkLoadListener {
  /**
   * Called before an HFile is actually loaded
   * @param family        family being loaded to
   * @param srcPath       path of HFile
   * @param copyFile      the {@code copyFile} flag given to {@code bulkLoadHFiles}
   *                      (documented there as "always copy hfiles if true")
   * @param customStaging {@code bulkLoadHFiles} passes null here when the store engine requires
   *                      writing to a tmp dir first, and the region directory otherwise;
   *                      presumably an alternative staging location - confirm against
   *                      implementations
   * @return final path to be used for actual loading
   */
  String prepareBulkLoad(byte[] family, String srcPath, boolean copyFile, String customStaging)
    throws IOException;

  /**
   * Called after a successful HFile load
   * @param family  family being loaded to
   * @param srcPath path of HFile
   */
  void doneBulkLoad(byte[] family, String srcPath) throws IOException;

  /**
   * Called after a failed HFile load
   * @param family  family being loaded to
   * @param srcPath path of HFile
   */
  void failedBulkLoad(byte[] family, String srcPath) throws IOException;
}
| |
| /** |
| * Attempts to atomically load a group of hfiles. This is critical for loading rows with multiple |
| * column families atomically. |
| * @param familyPaths List of Pair<byte[] column family, String hfilePath> |
| * @param bulkLoadListener Internal hooks enabling massaging/preparation of a file about to be |
| * bulk loaded |
| * @param copyFile always copy hfiles if true |
| * @param clusterIds ids from clusters that had already handled the given bulkload event. |
| * @return Map from family to List of store file paths if successful, null if failed recoverably |
| * @throws IOException if failed unrecoverably. |
| */ |
| public Map<byte[], List<Path>> bulkLoadHFiles(Collection<Pair<byte[], String>> familyPaths, |
| boolean assignSeqId, BulkLoadListener bulkLoadListener, boolean copyFile, |
| List<String> clusterIds, boolean replicate) throws IOException { |
| long seqId = -1; |
| Map<byte[], List<Path>> storeFiles = new TreeMap<>(Bytes.BYTES_COMPARATOR); |
| Map<String, Long> storeFilesSizes = new HashMap<>(); |
| Preconditions.checkNotNull(familyPaths); |
| // we need writeLock for multi-family bulk load |
| startBulkRegionOperation(hasMultipleColumnFamilies(familyPaths)); |
| boolean isSuccessful = false; |
| try { |
| this.writeRequestsCount.increment(); |
| |
| // There possibly was a split that happened between when the split keys |
| // were gathered and before the HRegion's write lock was taken. We need |
| // to validate the HFile region before attempting to bulk load all of them |
| IOException ioException = null; |
| List<Pair<byte[], String>> failures = new ArrayList<>(); |
| for (Pair<byte[], String> p : familyPaths) { |
| byte[] familyName = p.getFirst(); |
| String path = p.getSecond(); |
| |
| HStore store = getStore(familyName); |
| if (store == null) { |
| ioException = new org.apache.hadoop.hbase.DoNotRetryIOException( |
| "No such column family " + Bytes.toStringBinary(familyName)); |
| } else { |
| try { |
| store.assertBulkLoadHFileOk(new Path(path)); |
| } catch (WrongRegionException wre) { |
| // recoverable (file doesn't fit in region) |
| failures.add(p); |
| } catch (IOException ioe) { |
| // unrecoverable (hdfs problem) |
| ioException = ioe; |
| } |
| } |
| |
| // validation failed because of some sort of IO problem. |
| if (ioException != null) { |
| LOG.error("There was IO error when checking if the bulk load is ok in region {}.", this, |
| ioException); |
| throw ioException; |
| } |
| } |
| // validation failed, bail out before doing anything permanent. |
| if (failures.size() != 0) { |
| StringBuilder list = new StringBuilder(); |
| for (Pair<byte[], String> p : failures) { |
| list.append("\n").append(Bytes.toString(p.getFirst())).append(" : ") |
| .append(p.getSecond()); |
| } |
| // problem when validating |
| LOG.warn("There was a recoverable bulk load failure likely due to a split. These (family," |
| + " HFile) pairs were not loaded: {}, in region {}", list.toString(), this); |
| return null; |
| } |
| |
| // We need to assign a sequential ID that's in between two memstores in order to preserve |
| // the guarantee that all the edits lower than the highest sequential ID from all the |
| // HFiles are flushed on disk. See HBASE-10958. The sequence id returned when we flush is |
| // guaranteed to be one beyond the file made when we flushed (or if nothing to flush, it is |
| // a sequence id that we can be sure is beyond the last hfile written). |
| if (assignSeqId) { |
| FlushResult fs = flushcache(true, false, FlushLifeCycleTracker.DUMMY); |
| if (fs.isFlushSucceeded()) { |
| seqId = ((FlushResultImpl) fs).flushSequenceId; |
| } else if (fs.getResult() == FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY) { |
| seqId = ((FlushResultImpl) fs).flushSequenceId; |
| } else if (fs.getResult() == FlushResult.Result.CANNOT_FLUSH) { |
| // CANNOT_FLUSH may mean that a flush is already on-going |
| // we need to wait for that flush to complete |
| waitForFlushes(); |
| } else { |
| throw new IOException("Could not bulk load with an assigned sequential ID because the " |
| + "flush didn't run. Reason for not flushing: " + ((FlushResultImpl) fs).failureReason); |
| } |
| } |
| |
| Map<byte[], List<Pair<Path, Path>>> familyWithFinalPath = |
| new TreeMap<>(Bytes.BYTES_COMPARATOR); |
| for (Pair<byte[], String> p : familyPaths) { |
| byte[] familyName = p.getFirst(); |
| String path = p.getSecond(); |
| HStore store = getStore(familyName); |
| if (!familyWithFinalPath.containsKey(familyName)) { |
| familyWithFinalPath.put(familyName, new ArrayList<>()); |
| } |
| List<Pair<Path, Path>> lst = familyWithFinalPath.get(familyName); |
| String finalPath = path; |
| try { |
| boolean reqTmp = store.storeEngine.requireWritingToTmpDirFirst(); |
| if (bulkLoadListener != null) { |
| finalPath = bulkLoadListener.prepareBulkLoad(familyName, path, copyFile, |
| reqTmp ? null : fs.getRegionDir().toString()); |
| } |
| Pair<Path, Path> pair = null; |
| if (reqTmp || !StoreFileInfo.isHFile(finalPath)) { |
| pair = store.preBulkLoadHFile(finalPath, seqId); |
| } else { |
| Path livePath = new Path(finalPath); |
| pair = new Pair<>(livePath, livePath); |
| } |
| lst.add(pair); |
| } catch (IOException ioe) { |
| // A failure here can cause an atomicity violation that we currently |
| // cannot recover from since it is likely a failed HDFS operation. |
| |
| LOG.error("There was a partial failure due to IO when attempting to" + " load " |
| + Bytes.toString(p.getFirst()) + " : " + p.getSecond(), ioe); |
| if (bulkLoadListener != null) { |
| try { |
| bulkLoadListener.failedBulkLoad(familyName, finalPath); |
| } catch (Exception ex) { |
| LOG.error("Error while calling failedBulkLoad for family " |
| + Bytes.toString(familyName) + " with path " + path, ex); |
| } |
| } |
| throw ioe; |
| } |
| } |
| |
| if (this.getCoprocessorHost() != null) { |
| for (Map.Entry<byte[], List<Pair<Path, Path>>> entry : familyWithFinalPath.entrySet()) { |
| this.getCoprocessorHost().preCommitStoreFile(entry.getKey(), entry.getValue()); |
| } |
| } |
| for (Map.Entry<byte[], List<Pair<Path, Path>>> entry : familyWithFinalPath.entrySet()) { |
| byte[] familyName = entry.getKey(); |
| for (Pair<Path, Path> p : entry.getValue()) { |
| String path = p.getFirst().toString(); |
| Path commitedStoreFile = p.getSecond(); |
| HStore store = getStore(familyName); |
| try { |
| store.bulkLoadHFile(familyName, path, commitedStoreFile); |
| // Note the size of the store file |
| try { |
| FileSystem fs = commitedStoreFile.getFileSystem(baseConf); |
| storeFilesSizes.put(commitedStoreFile.getName(), |
| fs.getFileStatus(commitedStoreFile).getLen()); |
| } catch (IOException e) { |
| LOG.warn("Failed to find the size of hfile " + commitedStoreFile, e); |
| storeFilesSizes.put(commitedStoreFile.getName(), 0L); |
| } |
| |
| if (storeFiles.containsKey(familyName)) { |
| storeFiles.get(familyName).add(commitedStoreFile); |
| } else { |
| List<Path> storeFileNames = new ArrayList<>(); |
| storeFileNames.add(commitedStoreFile); |
| storeFiles.put(familyName, storeFileNames); |
| } |
| if (bulkLoadListener != null) { |
| bulkLoadListener.doneBulkLoad(familyName, path); |
| } |
| } catch (IOException ioe) { |
| // A failure here can cause an atomicity violation that we currently |
| // cannot recover from since it is likely a failed HDFS operation. |
| |
| // TODO Need a better story for reverting partial failures due to HDFS. |
| LOG.error("There was a partial failure due to IO when attempting to" + " load " |
| + Bytes.toString(familyName) + " : " + p.getSecond(), ioe); |
| if (bulkLoadListener != null) { |
| try { |
| bulkLoadListener.failedBulkLoad(familyName, path); |
| } catch (Exception ex) { |
| LOG.error("Error while calling failedBulkLoad for family " |
| + Bytes.toString(familyName) + " with path " + path, ex); |
| } |
| } |
| throw ioe; |
| } |
| } |
| } |
| |
| isSuccessful = true; |
| if (conf.getBoolean(COMPACTION_AFTER_BULKLOAD_ENABLE, true)) { |
| // request compaction |
| familyWithFinalPath.keySet().forEach(family -> { |
| HStore store = getStore(family); |
| try { |
| if (this.rsServices != null && store.needsCompaction()) { |
| this.rsServices.getCompactionRequestor().requestSystemCompaction(this, store, |
| "bulkload hfiles request compaction", true); |
| LOG.info("Request compaction for region {} family {} after bulk load", |
| this.getRegionInfo().getEncodedName(), store.getColumnFamilyName()); |
| } |
| } catch (IOException e) { |
| LOG.error("bulkload hfiles request compaction error ", e); |
| } |
| }); |
| } |
| } finally { |
| if (wal != null && !storeFiles.isEmpty()) { |
| // Write a bulk load event for hfiles that are loaded |
| try { |
| WALProtos.BulkLoadDescriptor loadDescriptor = |
| ProtobufUtil.toBulkLoadDescriptor(this.getRegionInfo().getTable(), |
| UnsafeByteOperations.unsafeWrap(this.getRegionInfo().getEncodedNameAsBytes()), |
| storeFiles, storeFilesSizes, seqId, clusterIds, replicate); |
| WALUtil.writeBulkLoadMarkerAndSync(this.wal, this.getReplicationScope(), getRegionInfo(), |
| loadDescriptor, mvcc); |
| } catch (IOException ioe) { |
| if (this.rsServices != null) { |
| // Have to abort region server because some hfiles has been loaded but we can't write |
| // the event into WAL |
| isSuccessful = false; |
| this.rsServices.abort("Failed to write bulk load event into WAL.", ioe); |
| } |
| } |
| } |
| |
| closeBulkRegionOperation(); |
| } |
| return isSuccessful ? storeFiles : null; |
| } |
| |
| @Override |
| public boolean equals(Object o) { |
| return o instanceof HRegion && Bytes.equals(getRegionInfo().getRegionName(), |
| ((HRegion) o).getRegionInfo().getRegionName()); |
| } |
| |
/** Hash of the region name bytes, consistent with {@link #equals(Object)}. */
@Override
public int hashCode() {
  return Bytes.hashCode(getRegionInfo().getRegionName());
}
| |
/** Returns the region name as a string. */
@Override
public String toString() {
  return getRegionInfo().getRegionNameAsString();
}
| |
| // Utility methods |
| /** |
| * A utility method to create new instances of HRegion based on the {@link HConstants#REGION_IMPL} |
| * configuration property. |
| * @param tableDir qualified path of directory where region should be located, usually the table |
| * directory. |
| * @param wal The WAL is the outbound log for any updates to the HRegion The wal file is a |
| * logfile from the previous execution that's custom-computed for this HRegion. |
| * The HRegionServer computes and sorts the appropriate wal info for this |
| * HRegion. If there is a previous file (implying that the HRegion has been |
| * written-to before), then read it from the supplied path. |
| * @param fs is the filesystem. |
| * @param conf is global configuration settings. |
| * @param regionInfo - RegionInfo that describes the region is new), then read them from the |
| * supplied path. |
| * @param htd the table descriptor |
| * @return the new instance |
| */ |
| public static HRegion newHRegion(Path tableDir, WAL wal, FileSystem fs, Configuration conf, |
| RegionInfo regionInfo, final TableDescriptor htd, RegionServerServices rsServices) { |
| try { |
| @SuppressWarnings("unchecked") |
| Class<? extends HRegion> regionClass = |
| (Class<? extends HRegion>) conf.getClass(HConstants.REGION_IMPL, HRegion.class); |
| |
| Constructor<? extends HRegion> c = |
| regionClass.getConstructor(Path.class, WAL.class, FileSystem.class, Configuration.class, |
| RegionInfo.class, TableDescriptor.class, RegionServerServices.class); |
| |
| return c.newInstance(tableDir, wal, fs, conf, regionInfo, htd, rsServices); |
| } catch (Throwable e) { |
| // todo: what should I throw here? |
| throw new IllegalStateException("Could not instantiate a region instance.", e); |
| } |
| } |
| |
/**
 * Convenience method creating new HRegions. Used by createTable. Delegates to the overload that
 * also takes region server services, passing {@code null} for them.
 * @param info             Info for region to create.
 * @param rootDir          Root directory for HBase instance
 * @param conf             configuration handed to the overload
 * @param hTableDescriptor the table descriptor
 * @param wal              shared WAL
 * @param initialize       - true to initialize the region
 * @return new HRegion
 */
public static HRegion createHRegion(final RegionInfo info, final Path rootDir,
  final Configuration conf, final TableDescriptor hTableDescriptor, final WAL wal,
  final boolean initialize) throws IOException {
  return createHRegion(info, rootDir, conf, hTableDescriptor, wal, initialize, null);
}
| |
| /** |
| * Convenience method creating new HRegions. Used by createTable. |
| * @param info Info for region to create. |
| * @param rootDir Root directory for HBase instance |
| * @param wal shared WAL |
| * @param initialize - true to initialize the region |
| * @param rsRpcServices An interface we can request flushes against. |
| * @return new HRegion |
| */ |
| public static HRegion createHRegion(final RegionInfo info, final Path rootDir, |
| final Configuration conf, final TableDescriptor hTableDescriptor, final WAL wal, |
| final boolean initialize, RegionServerServices rsRpcServices) throws IOException { |
| LOG.info("creating " + info + ", tableDescriptor=" |
| + (hTableDescriptor == null ? "null" : hTableDescriptor) + ", regionDir=" + rootDir); |
| createRegionDir(conf, info, rootDir); |
| FileSystem fs = rootDir.getFileSystem(conf); |
| Path tableDir = CommonFSUtils.getTableDir(rootDir, info.getTable()); |
| HRegion region = |
| HRegion.newHRegion(tableDir, wal, fs, conf, info, hTableDescriptor, rsRpcServices); |
| if (initialize) { |
| region.initialize(null); |
| } |
| return region; |
| } |
| |
| /** |
| * Create a region under the given table directory. |
| */ |
| public static HRegion createHRegion(Configuration conf, RegionInfo regionInfo, FileSystem fs, |
| Path tableDir, TableDescriptor tableDesc) throws IOException { |
| LOG.info("Creating {}, tableDescriptor={}, under table dir {}", regionInfo, tableDesc, |
| tableDir); |
| HRegionFileSystem.createRegionOnFileSystem(conf, fs, tableDir, regionInfo); |
| HRegion region = HRegion.newHRegion(tableDir, null, fs, conf, regionInfo, tableDesc, null); |
| return region; |
| } |
| |
| /** |
| * Create the region directory in the filesystem. |
| */ |
| public static HRegionFileSystem createRegionDir(Configuration configuration, RegionInfo ri, |
| Path rootDir) throws IOException { |
| FileSystem fs = rootDir.getFileSystem(configuration); |
| Path tableDir = CommonFSUtils.getTableDir(rootDir, ri.getTable()); |
| // If directory already exists, will log warning and keep going. Will try to create |
| // .regioninfo. If one exists, will overwrite. |
| return HRegionFileSystem.createRegionOnFileSystem(configuration, fs, tableDir, ri); |
| } |
| |
/**
 * Convenience method creating a new, initialized HRegion. Delegates to the six-argument overload
 * with {@code initialize = true}.
 * @param info             Info for region to create.
 * @param rootDir          Root directory for HBase instance
 * @param conf             configuration handed to the overload
 * @param hTableDescriptor the table descriptor
 * @param wal              shared WAL
 * @return new HRegion
 */
public static HRegion createHRegion(final RegionInfo info, final Path rootDir,
  final Configuration conf, final TableDescriptor hTableDescriptor, final WAL wal)
  throws IOException {
  return createHRegion(info, rootDir, conf, hTableDescriptor, wal, true);
}
| |
| /** |
| * Open a Region. |
| * @param info Info for region to be opened. |
| * @param wal WAL for region to use. This method will call WAL#setSequenceNumber(long) passing |
| * the result of the call to HRegion#getMinSequenceId() to ensure the wal id is |
| * properly kept up. HRegionStore does this every time it opens a new region. |
| * @return new HRegion |
| */ |
| public static HRegion openHRegion(final RegionInfo info, final TableDescriptor htd, final WAL wal, |
| final Configuration conf) throws IOException { |
| return openHRegion(info, htd, wal, conf, null, null); |
| } |
| |
| /** |
| * Open a Region. |
| * @param info Info for region to be opened |
| * @param htd the table descriptor |
| * @param wal WAL for region to use. This method will call WAL#setSequenceNumber(long) |
| * passing the result of the call to HRegion#getMinSequenceId() to ensure the |
| * wal id is properly kept up. HRegionStore does this every time it opens a new |
| * region. |
| * @param conf The Configuration object to use. |
| * @param rsServices An interface we can request flushes against. |
| * @param reporter An interface we can report progress against. |
| * @return new HRegion |
| */ |
| public static HRegion openHRegion(final RegionInfo info, final TableDescriptor htd, final WAL wal, |
| final Configuration conf, final RegionServerServices rsServices, |
| final CancelableProgressable reporter) throws IOException { |
| return openHRegion(CommonFSUtils.getRootDir(conf), info, htd, wal, conf, rsServices, reporter); |
| } |
| |
| /** |
| * Open a Region. |
| * @param rootDir Root directory for HBase instance |
| * @param info Info for region to be opened. |
| * @param htd the table descriptor |
| * @param wal WAL for region to use. This method will call WAL#setSequenceNumber(long) passing |
| * the result of the call to HRegion#getMinSequenceId() to ensure the wal id is |
| * properly kept up. HRegionStore does this every time it opens a new region. |
| * @param conf The Configuration object to use. |
| * @return new HRegion |
| */ |
| public static HRegion openHRegion(Path rootDir, final RegionInfo info, final TableDescriptor htd, |
| final WAL wal, final Configuration conf) throws IOException { |
| return openHRegion(rootDir, info, htd, wal, conf, null, null); |
| } |
| |
| /** |
| * Open a Region. |
| * @param rootDir Root directory for HBase instance |
| * @param info Info for region to be opened. |
| * @param htd the table descriptor |
| * @param wal WAL for region to use. This method will call WAL#setSequenceNumber(long) |
| * passing the result of the call to HRegion#getMinSequenceId() to ensure the |
| * wal id is properly kept up. HRegionStore does this every time it opens a new |
| * region. |
| * @param conf The Configuration object to use. |
| * @param rsServices An interface we can request flushes against. |
| * @param reporter An interface we can report progress against. |
| * @return new HRegion |
| */ |
| public static HRegion openHRegion(final Path rootDir, final RegionInfo info, |
| final TableDescriptor htd, final WAL wal, final Configuration conf, |
| final RegionServerServices rsServices, final CancelableProgressable reporter) |
| throws IOException { |
| FileSystem fs = null; |
| if (rsServices != null) { |
| fs = rsServices.getFileSystem(); |
| } |
| if (fs == null) { |
| fs = rootDir.getFileSystem(conf); |
| } |
| return openHRegion(conf, fs, rootDir, info, htd, wal, rsServices, reporter); |
| } |
| |
| /** |
| * Open a Region. |
| * @param conf The Configuration object to use. |
| * @param fs Filesystem to use |
| * @param rootDir Root directory for HBase instance |
| * @param info Info for region to be opened. |
| * @param htd the table descriptor |
| * @param wal WAL for region to use. This method will call WAL#setSequenceNumber(long) passing |
| * the result of the call to HRegion#getMinSequenceId() to ensure the wal id is |
| * properly kept up. HRegionStore does this every time it opens a new region. |
| * @return new HRegion |
| */ |
| public static HRegion openHRegion(final Configuration conf, final FileSystem fs, |
| final Path rootDir, final RegionInfo info, final TableDescriptor htd, final WAL wal) |
| throws IOException { |
| return openHRegion(conf, fs, rootDir, info, htd, wal, null, null); |
| } |
| |
| /** |
| * Open a Region. |
| * @param conf The Configuration object to use. |
| * @param fs Filesystem to use |
| * @param rootDir Root directory for HBase instance |
| * @param info Info for region to be opened. |
| * @param htd the table descriptor |
| * @param wal WAL for region to use. This method will call WAL#setSequenceNumber(long) |
| * passing the result of the call to HRegion#getMinSequenceId() to ensure the |
| * wal id is properly kept up. HRegionStore does this every time it opens a new |
| * region. |
| * @param rsServices An interface we can request flushes against. |
| * @param reporter An interface we can report progress against. |
| * @return new HRegion |
| */ |
| public static HRegion openHRegion(final Configuration conf, final FileSystem fs, |
| final Path rootDir, final RegionInfo info, final TableDescriptor htd, final WAL wal, |
| final RegionServerServices rsServices, final CancelableProgressable reporter) |
| throws IOException { |
| Path tableDir = CommonFSUtils.getTableDir(rootDir, info.getTable()); |
| return openHRegionFromTableDir(conf, fs, tableDir, info, htd, wal, rsServices, reporter); |
| } |
| |
| /** |
| * Open a Region. |
| * @param conf The Configuration object to use. |
| * @param fs Filesystem to use |
| * @param info Info for region to be opened. |
| * @param htd the table descriptor |
| * @param wal WAL for region to use. This method will call WAL#setSequenceNumber(long) |
| * passing the result of the call to HRegion#getMinSequenceId() to ensure the |
| * wal id is properly kept up. HRegionStore does this every time it opens a new |
| * region. |
| * @param rsServices An interface we can request flushes against. |
| * @param reporter An interface we can report progress against. |
| * @return new HRegion |
| */ |
| public static HRegion openHRegionFromTableDir(final Configuration conf, final FileSystem fs, |
| final Path tableDir, final RegionInfo info, final TableDescriptor htd, final WAL wal, |
| final RegionServerServices rsServices, final CancelableProgressable reporter) |
| throws IOException { |
| Objects.requireNonNull(info, "RegionInfo cannot be null"); |
| LOG.debug("Opening region: {}", info); |
| HRegion r = HRegion.newHRegion(tableDir, wal, fs, conf, info, htd, rsServices); |
| return r.openHRegion(reporter); |
| } |
| |
| public NavigableMap<byte[], Integer> getReplicationScope() { |
| return this.replicationScope; |
| } |
| |
| /** |
| * Useful when reopening a closed region (normally for unit tests) |
| * @param other original object |
| * @param reporter An interface we can report progress against. |
| * @return new HRegion |
| */ |
| public static HRegion openHRegion(final HRegion other, final CancelableProgressable reporter) |
| throws IOException { |
| HRegionFileSystem regionFs = other.getRegionFileSystem(); |
| HRegion r = newHRegion(regionFs.getTableDir(), other.getWAL(), regionFs.getFileSystem(), |
| other.baseConf, other.getRegionInfo(), other.getTableDescriptor(), null); |
| return r.openHRegion(reporter); |
| } |
| |
| public static Region openHRegion(final Region other, final CancelableProgressable reporter) |
| throws IOException { |
| return openHRegion((HRegion) other, reporter); |
| } |
| |
| /** |
| * Open HRegion. |
| * <p/> |
| * Calls initialize and sets sequenceId. |
| * @return Returns <code>this</code> |
| */ |
| private HRegion openHRegion(final CancelableProgressable reporter) throws IOException { |
| try { |
| CompoundConfiguration cConfig = |
| new CompoundConfiguration().add(conf).addBytesMap(htableDescriptor.getValues()); |
| // Refuse to open the region if we are missing local compression support |
| TableDescriptorChecker.checkCompression(cConfig, htableDescriptor); |
| // Refuse to open the region if encryption configuration is incorrect or |
| // codec support is missing |
| LOG.debug("checking encryption for " + this.getRegionInfo().getEncodedName()); |
| TableDescriptorChecker.checkEncryption(cConfig, htableDescriptor); |
| // Refuse to open the region if a required class cannot be loaded |
| LOG.debug("checking classloading for " + this.getRegionInfo().getEncodedName()); |
| TableDescriptorChecker.checkClassLoading(cConfig, htableDescriptor); |
| this.openSeqNum = initialize(reporter); |
| this.mvcc.advanceTo(openSeqNum); |
| // The openSeqNum must be increased every time when a region is assigned, as we rely on it to |
| // determine whether a region has been successfully reopened. So here we always write open |
| // marker, even if the table is read only. |
| if ( |
| wal != null && getRegionServerServices() != null |
| && RegionReplicaUtil.isDefaultReplica(getRegionInfo()) |
| ) { |
| writeRegionOpenMarker(wal, openSeqNum); |
| } |
| } catch (Throwable t) { |
| // By coprocessor path wrong region will open failed, |
| // MetricsRegionWrapperImpl is already init and not close, |
| // add region close when open failed |
| try { |
| // It is not required to write sequence id file when region open is failed. |
| // Passing true to skip the sequence id file write. |
| this.close(true); |
| } catch (Throwable e) { |
| LOG.warn("Open region: {} failed. Try close region but got exception ", |
| this.getRegionInfo(), e); |
| } |
| throw t; |
| } |
| return this; |
| } |
| |
| /** |
| * Open a Region on a read-only file-system (like hdfs snapshots) |
| * @param conf The Configuration object to use. |
| * @param fs Filesystem to use |
| * @param info Info for region to be opened. |
| * @param htd the table descriptor |
| * @return new HRegion |
| */ |
| public static HRegion openReadOnlyFileSystemHRegion(final Configuration conf, final FileSystem fs, |
| final Path tableDir, RegionInfo info, final TableDescriptor htd) throws IOException { |
| if (info == null) { |
| throw new NullPointerException("Passed region info is null"); |
| } |
| if (LOG.isDebugEnabled()) { |
| LOG.debug("Opening region (readOnly filesystem): " + info); |
| } |
| if (info.getReplicaId() <= 0) { |
| info = RegionInfoBuilder.newBuilder(info).setReplicaId(1).build(); |
| } |
| HRegion r = HRegion.newHRegion(tableDir, null, fs, conf, info, htd, null); |
| r.writestate.setReadOnly(true); |
| return r.openHRegion(null); |
| } |
| |
| public static HRegion warmupHRegion(final RegionInfo info, final TableDescriptor htd, |
| final WAL wal, final Configuration conf, final RegionServerServices rsServices, |
| final CancelableProgressable reporter) throws IOException { |
| |
| Objects.requireNonNull(info, "RegionInfo cannot be null"); |
| LOG.debug("Warmup {}", info); |
| Path rootDir = CommonFSUtils.getRootDir(conf); |
| Path tableDir = CommonFSUtils.getTableDir(rootDir, info.getTable()); |
| FileSystem fs = null; |
| if (rsServices != null) { |
| fs = rsServices.getFileSystem(); |
| } |
| if (fs == null) { |
| fs = rootDir.getFileSystem(conf); |
| } |
| HRegion r = HRegion.newHRegion(tableDir, wal, fs, conf, info, htd, null); |
| r.initializeWarmup(reporter); |
| r.close(); |
| return r; |
| } |
| |
| /** |
| * Computes the Path of the HRegion |
| * @param tabledir qualified path for table |
| * @param name ENCODED region name |
| * @return Path of HRegion directory |
| * @deprecated For tests only; to be removed. |
| */ |
| @Deprecated |
| public static Path getRegionDir(final Path tabledir, final String name) { |
| return new Path(tabledir, name); |
| } |
| |
| /** |
| * Determines if the specified row is within the row range specified by the specified RegionInfo |
| * @param info RegionInfo that specifies the row range |
| * @param row row to be checked |
| * @return true if the row is within the range specified by the RegionInfo |
| */ |
| public static boolean rowIsInRange(RegionInfo info, final byte[] row) { |
| return ((info.getStartKey().length == 0) || (Bytes.compareTo(info.getStartKey(), row) <= 0)) |
| && ((info.getEndKey().length == 0) || (Bytes.compareTo(info.getEndKey(), row) > 0)); |
| } |
| |
| public static boolean rowIsInRange(RegionInfo info, final byte[] row, final int offset, |
| final short length) { |
| return ((info.getStartKey().length == 0) |
| || (Bytes.compareTo(info.getStartKey(), 0, info.getStartKey().length, row, offset, length) |
| <= 0)) |
| && ((info.getEndKey().length == 0) |
| || (Bytes.compareTo(info.getEndKey(), 0, info.getEndKey().length, row, offset, length) |
| > 0)); |
| } |
| |
| @Override |
| public Result get(final Get get) throws IOException { |
| prepareGet(get); |
| List<Cell> results = get(get, true); |
| boolean stale = this.getRegionInfo().getReplicaId() != 0; |
| return Result.create(results, get.isCheckExistenceOnly() ? !results.isEmpty() : null, stale); |
| } |
| |
| void prepareGet(final Get get) throws IOException { |
| checkRow(get.getRow(), "Get"); |
| // Verify families are all valid |
| if (get.hasFamilies()) { |
| for (byte[] family : get.familySet()) { |
| checkFamily(family); |
| } |
| } else { // Adding all families to scanner |
| for (byte[] family : this.htableDescriptor.getColumnFamilyNames()) { |
| get.addFamily(family); |
| } |
| } |
| } |
| |
| @Override |
| public List<Cell> get(Get get, boolean withCoprocessor) throws IOException { |
| return get(get, withCoprocessor, HConstants.NO_NONCE, HConstants.NO_NONCE); |
| } |
| |
| private List<Cell> get(Get get, boolean withCoprocessor, long nonceGroup, long nonce) |
| throws IOException { |
| return TraceUtil.trace(() -> getInternal(get, withCoprocessor, nonceGroup, nonce), |
| () -> createRegionSpan("Region.get")); |
| } |
| |
| private List<Cell> getInternal(Get get, boolean withCoprocessor, long nonceGroup, long nonce) |
| throws IOException { |
| List<Cell> results = new ArrayList<>(); |
| long before = EnvironmentEdgeManager.currentTime(); |
| |
| // pre-get CP hook |
| if (withCoprocessor && (coprocessorHost != null)) { |
| if (coprocessorHost.preGet(get, results)) { |
| metricsUpdateForGet(results, before); |
| return results; |
| } |
| } |
| Scan scan = new Scan(get); |
| if (scan.getLoadColumnFamiliesOnDemandValue() == null) { |
| scan.setLoadColumnFamiliesOnDemand(isLoadingCfsOnDemandDefault()); |
| } |
| try (RegionScanner scanner = getScanner(scan, null, nonceGroup, nonce)) { |
| List<Cell> tmp = new ArrayList<>(); |
| scanner.next(tmp); |
| // Copy EC to heap, then close the scanner. |
| // This can be an EXPENSIVE call. It may make an extra copy from offheap to onheap buffers. |
| // See more details in HBASE-26036. |
| for (Cell cell : tmp) { |
| results.add(CellUtil.cloneIfNecessary(cell)); |
| } |
| } |
| |
| // post-get CP hook |
| if (withCoprocessor && (coprocessorHost != null)) { |
| coprocessorHost.postGet(get, results); |
| } |
| |
| metricsUpdateForGet(results, before); |
| |
| return results; |
| } |
| |
| void metricsUpdateForGet(List<Cell> results, long before) { |
| if (this.metricsRegion != null) { |
| this.metricsRegion.updateGet(EnvironmentEdgeManager.currentTime() - before); |
| } |
| if (rsServices != null && this.rsServices.getMetrics() != null) { |
| rsServices.getMetrics().updateReadQueryMeter(this, 1); |
| } |
| } |
| |
| @Override |
| public Result mutateRow(RowMutations rm) throws IOException { |
| return mutateRow(rm, HConstants.NO_NONCE, HConstants.NO_NONCE); |
| } |
| |
| public Result mutateRow(RowMutations rm, long nonceGroup, long nonce) throws IOException { |
| final List<Mutation> m = rm.getMutations(); |
| OperationStatus[] statuses = batchMutate(m.toArray(new Mutation[0]), true, nonceGroup, nonce); |
| |
| List<Result> results = new ArrayList<>(); |
| for (OperationStatus status : statuses) { |
| if (status.getResult() != null) { |
| results.add(status.getResult()); |
| } |
| } |
| |
| if (results.isEmpty()) { |
| return null; |
| } |
| |
| // Merge the results of the Increment/Append operations |
| List<Cell> cells = new ArrayList<>(); |
| for (Result result : results) { |
| if (result.rawCells() != null) { |
| cells.addAll(Arrays.asList(result.rawCells())); |
| } |
| } |
| return Result.create(cells); |
| } |
| |
| /** |
| * Perform atomic (all or none) mutations within the region. |
| * @param mutations The list of mutations to perform. <code>mutations</code> can contain |
| * operations for multiple rows. Caller has to ensure that all rows are |
| * contained in this region. |
| * @param rowsToLock Rows to lock |
| * @param nonceGroup Optional nonce group of the operation (client Id) |
| * @param nonce Optional nonce of the operation (unique random id to ensure "more |
| * idempotence") If multiple rows are locked care should be taken that |
| * <code>rowsToLock</code> is sorted in order to avoid deadlocks. |
| */ |
| @Override |
| public void mutateRowsWithLocks(Collection<Mutation> mutations, Collection<byte[]> rowsToLock, |
| long nonceGroup, long nonce) throws IOException { |
| batchMutate(new MutationBatchOperation(this, mutations.toArray(new Mutation[mutations.size()]), |
| true, nonceGroup, nonce) { |
| @Override |
| public MiniBatchOperationInProgress<Mutation> |
| lockRowsAndBuildMiniBatch(List<RowLock> acquiredRowLocks) throws IOException { |
| RowLock prevRowLock = null; |
| for (byte[] row : rowsToLock) { |
| try { |
| RowLock rowLock = region.getRowLock(row, false, prevRowLock); // write lock |
| if (rowLock != prevRowLock) { |
| acquiredRowLocks.add(rowLock); |
| prevRowLock = rowLock; |
| } |
| } catch (IOException ioe) { |
| LOG.warn("Failed getting lock, row={}, in region {}", Bytes.toStringBinary(row), this, |
| ioe); |
| throw ioe; |
| } |
| } |
| return createMiniBatch(size(), size()); |
| } |
| }); |
| } |
| |
| /** Returns statistics about the current load of the region */ |
| public ClientProtos.RegionLoadStats getLoadStatistics() { |
| if (!regionStatsEnabled) { |
| return null; |
| } |
| ClientProtos.RegionLoadStats.Builder stats = ClientProtos.RegionLoadStats.newBuilder(); |
| stats.setMemStoreLoad((int) (Math.min(100, |
| (this.memStoreSizing.getMemStoreSize().getHeapSize() * 100) / this.memstoreFlushSize))); |
| if (rsServices.getHeapMemoryManager() != null) { |
| // the HeapMemoryManager uses -0.0 to signal a problem asking the JVM, |
| // so we could just do the calculation below and we'll get a 0. |
| // treating it as a special case analogous to no HMM instead so that it can be |
| // programatically treated different from using <1% of heap. |
| final float occupancy = rsServices.getHeapMemoryManager().getHeapOccupancyPercent(); |
| if (occupancy != HeapMemoryManager.HEAP_OCCUPANCY_ERROR_VALUE) { |
| stats.setHeapOccupancy((int) (occupancy * 100)); |
| } |
| } |
| stats.setCompactionPressure((int) (rsServices.getCompactionPressure() * 100 > 100 |
| ? 100 |
| : rsServices.getCompactionPressure() * 100)); |
| return stats.build(); |
| } |
| |
| @Override |
| public void processRowsWithLocks(RowProcessor<?, ?> processor) throws IOException { |
| processRowsWithLocks(processor, rowProcessorTimeout, HConstants.NO_NONCE, HConstants.NO_NONCE); |
| } |
| |
| @Override |
| public void processRowsWithLocks(RowProcessor<?, ?> processor, long nonceGroup, long nonce) |
| throws IOException { |
| processRowsWithLocks(processor, rowProcessorTimeout, nonceGroup, nonce); |
| } |
| |
| @Override |
| public void processRowsWithLocks(RowProcessor<?, ?> processor, long timeout, long nonceGroup, |
| long nonce) throws IOException { |
| for (byte[] row : processor.getRowsToLock()) { |
| checkRow(row, "processRowsWithLocks"); |
| } |
| if (!processor.readOnly()) { |
| checkReadOnly(); |
| } |
| checkResources(); |
| startRegionOperation(); |
| WALEdit walEdit = new WALEdit(); |
| |
| // STEP 1. Run pre-process hook |
| preProcess(processor, walEdit); |
| // Short circuit the read only case |
| if (processor.readOnly()) { |
| try { |
| long now = EnvironmentEdgeManager.currentTime(); |
| doProcessRowWithTimeout(processor, now, this, null, null, timeout); |
| processor.postProcess(this, walEdit, true); |
| } finally { |
| closeRegionOperation(); |
| } |
| return; |
| } |
| |
| boolean locked = false; |
| List<RowLock> acquiredRowLocks = null; |
| List<Mutation> mutations = new ArrayList<>(); |
| Collection<byte[]> rowsToLock = processor.getRowsToLock(); |
| // This is assigned by mvcc either explicity in the below or in the guts of the WAL append |
| // when it assigns the edit a sequencedid (A.K.A the mvcc write number). |
| WriteEntry writeEntry = null; |
| MemStoreSizing memstoreAccounting = new NonThreadSafeMemStoreSizing(); |
| |
| // Check for thread interrupt status in case we have been signaled from |
| // #interruptRegionOperation. |
| checkInterrupt(); |
| |
| try { |
| boolean success = false; |
| try { |
| // STEP 2. Acquire the row lock(s) |
| acquiredRowLocks = new ArrayList<>(rowsToLock.size()); |
| RowLock prevRowLock = null; |
| for (byte[] row : rowsToLock) { |
| // Attempt to lock all involved rows, throw if any lock times out |
| // use a writer lock for mixed reads and writes |
| RowLock rowLock = getRowLockInternal(row, false, prevRowLock); |
| if (rowLock != prevRowLock) { |
| acquiredRowLocks.add(rowLock); |
| prevRowLock = rowLock; |
| } |
| } |
| |
| // Check for thread interrupt status in case we have been signaled from |
| // #interruptRegionOperation. Do it before we take the lock and disable interrupts for |
| // the WAL append. |
| checkInterrupt(); |
| |
| // STEP 3. Region lock |
| lock(this.updatesLock.readLock(), acquiredRowLocks.isEmpty() ? 1 : acquiredRowLocks.size()); |
| locked = true; |
| |
| // From this point until memstore update this operation should not be interrupted. |
| disableInterrupts(); |
| |
| long now = EnvironmentEdgeManager.currentTime(); |
| // STEP 4. Let the processor scan the rows, generate mutations and add waledits |
| doProcessRowWithTimeout(processor, now, this, mutations, walEdit, timeout); |
| if (!mutations.isEmpty()) { |
| writeRequestsCount.add(mutations.size()); |
| // STEP 5. Call the preBatchMutate hook |
| processor.preBatchMutate(this, walEdit); |
| |
| // STEP 6. Append and sync if walEdit has data to write out. |
| if (!walEdit.isEmpty()) { |
| writeEntry = doWALAppend(walEdit, getEffectiveDurability(processor.useDurability()), |
| processor.getClusterIds(), now, nonceGroup, nonce); |
| } else { |
| // We are here if WAL is being skipped. |
| writeEntry = this.mvcc.begin(); |
| } |
| |
| // STEP 7. Apply to memstore |
| long sequenceId = writeEntry.getWriteNumber(); |
| for (Mutation m : mutations) { |
| // Handle any tag based cell features. |
| // TODO: Do we need to call rewriteCellTags down in applyToMemStore()? Why not before |
| // so tags go into WAL? |
| rewriteCellTags((Map) m.getFamilyCellMap(), m); |
| for (CellScanner cellScanner = m.cellScanner(); cellScanner.advance();) { |
| Cell cell = cellScanner.current(); |
| if (walEdit.isEmpty()) { |
| // If walEdit is empty, we put nothing in WAL. WAL stamps Cells with sequence id. |
| // If no WAL, need to stamp it here. |
| PrivateCellUtil.setSequenceId(cell, sequenceId); |
| } |
| applyToMemStore(getStore(cell), (ExtendedCell) cell, memstoreAccounting); |
| } |
| } |
| |
| // STEP 8. call postBatchMutate hook |
| processor.postBatchMutate(this); |
| |
| // STEP 9. Complete mvcc. |
| mvcc.completeAndWait(writeEntry); |
| writeEntry = null; |
| |
| // STEP 10. Release region lock |
| if (locked) { |
| this.updatesLock.readLock().unlock(); |
| locked = false; |
| } |
| |
| // STEP 11. Release row lock(s) |
| releaseRowLocks(acquiredRowLocks); |
| |
| if (rsServices != null && rsServices.getMetrics() != null) { |
| rsServices.getMetrics().updateWriteQueryMeter(this, mutations.size()); |
| } |
| } |
| success = true; |
| } finally { |
| // Call complete rather than completeAndWait because we probably had error if walKey != null |
| if (writeEntry != null) mvcc.complete(writeEntry); |
| if (locked) { |
| this.updatesLock.readLock().unlock(); |
| } |
| // release locks if some were acquired but another timed out |
| releaseRowLocks(acquiredRowLocks); |
| |
| enableInterrupts(); |
| } |
| |
| // 12. Run post-process hook |
| processor.postProcess(this, walEdit, success); |
| } finally { |
| closeRegionOperation(); |
| if (!mutations.isEmpty()) { |
| this.incMemStoreSize(memstoreAccounting.getMemStoreSize()); |
| requestFlushIfNeeded(); |
| } |
| } |
| } |
| |
| private void preProcess(final RowProcessor<?, ?> processor, final WALEdit walEdit) |
| throws IOException { |
| try { |
| processor.preProcess(this, walEdit); |
| } catch (IOException e) { |
| closeRegionOperation(); |
| throw e; |
| } |
| } |
| |
| private void doProcessRowWithTimeout(final RowProcessor<?, ?> processor, final long now, |
| final HRegion region, final List<Mutation> mutations, final WALEdit walEdit, final long timeout) |
| throws IOException { |
| // Short circuit the no time bound case. |
| if (timeout < 0) { |
| try { |
| processor.process(now, region, mutations, walEdit); |
| } catch (IOException e) { |
| String row = processor.getRowsToLock().isEmpty() |
| ? "" |
| : " on row(s):" + Bytes.toStringBinary(processor.getRowsToLock().iterator().next()) |
| + "..."; |
| LOG.warn("RowProcessor: {}, in region {}, throws Exception {}", |
| processor.getClass().getName(), getRegionInfo().getRegionNameAsString(), row, e); |
| throw e; |
| } |
| return; |
| } |
| |
| // Case with time bound |
| FutureTask<Void> task = new FutureTask<>(new Callable<Void>() { |
| @Override |
| public Void call() throws IOException { |
| try { |
| processor.process(now, region, mutations, walEdit); |
| return null; |
| } catch (IOException e) { |
| String row = processor.getRowsToLock().isEmpty() |
| ? "" |
| : " on row(s):" + Bytes.toStringBinary(processor.getRowsToLock().iterator().next()) |
| + "..."; |
| LOG.warn("RowProcessor: {}, in region {}, throws Exception {}", |
| processor.getClass().getName(), getRegionInfo().getRegionNameAsString(), row, e); |
| throw e; |
| } |
| } |
| }); |
| rowProcessorExecutor.execute(task); |
| try { |
| task.get(timeout, TimeUnit.MILLISECONDS); |
| } catch (InterruptedException ie) { |
| throw throwOnInterrupt(ie); |
| } catch (TimeoutException te) { |
| String row = processor.getRowsToLock().isEmpty() |
| ? "" |
| : " on row(s):" + Bytes.toStringBinary(processor.getRowsToLock().iterator().next()) + "..."; |
| LOG.error("RowProcessor timeout: {} ms, in region {}, {}", timeout, |
| getRegionInfo().getRegionNameAsString(), row); |
| throw new IOException(te); |
| } catch (Exception e) { |
| throw new IOException(e); |
| } |
| } |
| |
| @Override |
| public Result append(Append append) throws IOException { |
| return append(append, HConstants.NO_NONCE, HConstants.NO_NONCE); |
| } |
| |
| public Result append(Append append, long nonceGroup, long nonce) throws IOException { |
| return TraceUtil.trace(() -> { |
| checkReadOnly(); |
| checkResources(); |
| startRegionOperation(Operation.APPEND); |
| try { |
| // All edits for the given row (across all column families) must happen atomically. |
| return mutate(append, true, nonceGroup, nonce).getResult(); |
| } finally { |
| closeRegionOperation(Operation.APPEND); |
| } |
| }, () -> createRegionSpan("Region.append")); |
| } |
| |
| @Override |
| public Result increment(Increment increment) throws IOException { |
| return increment(increment, HConstants.NO_NONCE, HConstants.NO_NONCE); |
| } |
| |
| public Result increment(Increment increment, long nonceGroup, long nonce) throws IOException { |
| return TraceUtil.trace(() -> { |
| checkReadOnly(); |
| checkResources(); |
| startRegionOperation(Operation.INCREMENT); |
| try { |
| // All edits for the given row (across all column families) must happen atomically. |
| return mutate(increment, true, nonceGroup, nonce).getResult(); |
| } finally { |
| closeRegionOperation(Operation.INCREMENT); |
| } |
| }, () -> createRegionSpan("Region.increment")); |
| } |
| |
| private WriteEntry doWALAppend(WALEdit walEdit, Durability durability, List<UUID> clusterIds, |
| long now, long nonceGroup, long nonce) throws IOException { |
| return doWALAppend(walEdit, durability, clusterIds, now, nonceGroup, nonce, |
| SequenceId.NO_SEQUENCE_ID); |
| } |
| |
| /** Returns writeEntry associated with this append */ |
| private WriteEntry doWALAppend(WALEdit walEdit, Durability durability, List<UUID> clusterIds, |
| long now, long nonceGroup, long nonce, long origLogSeqNum) throws IOException { |
| Preconditions.checkArgument(walEdit != null && !walEdit.isEmpty(), "WALEdit is null or empty!"); |
| Preconditions.checkArgument(!walEdit.isReplay() || origLogSeqNum != SequenceId.NO_SEQUENCE_ID, |
| "Invalid replay sequence Id for replay WALEdit!"); |
| // Using default cluster id, as this can only happen in the originating cluster. |
| // A slave cluster receives the final value (not the delta) as a Put. We use HLogKey |
| // here instead of WALKeyImpl directly to support legacy coprocessors. |
| WALKeyImpl walKey = walEdit.isReplay() |
| ? new WALKeyImpl(this.getRegionInfo().getEncodedNameAsBytes(), |
| this.htableDescriptor.getTableName(), SequenceId.NO_SEQUENCE_ID, now, clusterIds, |
| nonceGroup, nonce, mvcc) |
| : new WALKeyImpl(this.getRegionInfo().getEncodedNameAsBytes(), |
| this.htableDescriptor.getTableName(), SequenceId.NO_SEQUENCE_ID, now, clusterIds, |
| nonceGroup, nonce, mvcc, this.getReplicationScope()); |
| if (walEdit.isReplay()) { |
| walKey.setOrigLogSeqNum(origLogSeqNum); |
| } |
| // don't call the coproc hook for writes to the WAL caused by |
| // system lifecycle events like flushes or compactions |
| if (this.coprocessorHost != null && !walEdit.isMetaEdit()) { |
| this.coprocessorHost.preWALAppend(walKey, walEdit); |
| } |
| WriteEntry writeEntry = null; |
| try { |
| long txid = this.wal.appendData(this.getRegionInfo(), walKey, walEdit); |
| // Call sync on our edit. |
| if (txid != 0) { |
| sync(txid, durability); |
| } |
| writeEntry = walKey.getWriteEntry(); |
| } catch (IOException ioe) { |
| if (walKey != null && walKey.getWriteEntry() != null) { |
| mvcc.complete(walKey.getWriteEntry()); |
| } |
| |
| /** |
| * If {@link WAL#sync} get a timeout exception, the only correct way is to abort the region |
| * server, as the design of {@link WAL#sync}, is to succeed or die, there is no 'failure'. It |
| * is usually not a big deal is because we set a very large default value(5 minutes) for |
| * {@link AbstractFSWAL#WAL_SYNC_TIMEOUT_MS}, usually the WAL system will abort the region |
| * server if it can not finish the sync within 5 minutes. |
| */ |
| if (ioe instanceof WALSyncTimeoutIOException) { |
| if (rsServices != null) { |
| rsServices.abort("WAL sync timeout,forcing server shutdown", ioe); |
| } |
| } |
| throw ioe; |
| } |
| return writeEntry; |
| } |
| |
| // |
| // New HBASE-880 Helpers |
| // |
| void checkFamily(final byte[] family) throws NoSuchColumnFamilyException { |
| if (!this.htableDescriptor.hasColumnFamily(family)) { |
| throw new NoSuchColumnFamilyException("Column family " + Bytes.toString(family) |
| + " does not exist in region " + this + " in table " + this.htableDescriptor); |
| } |
| } |
| |
/** Base heap size of an {@code HRegion} instance as estimated by {@link ClassSize#estimateBase}. */
public static final long FIXED_OVERHEAD = ClassSize.estimateBase(HRegion.class, false);

/**
 * {@link #FIXED_OVERHEAD} plus the sizes of the member objects itemized line by line below. This
 * is the constant part of the value returned by {@code heapSize()}.
 */
// woefully out of date - currently missing:
// 1 x HashMap - coprocessorServiceHandlers
// 6 x LongAdder - numMutationsWithoutWAL, dataInMemoryWithoutWAL,
// checkAndMutateChecksPassed, checkAndMutateChecksFailed, readRequestsCount,
// writeRequestsCount
// 1 x HRegion$WriteState - writestate
// 1 x RegionCoprocessorHost - coprocessorHost
// 1 x RegionSplitPolicy - splitPolicy
// 1 x MetricsRegion - metricsRegion
// 1 x MetricsRegionWrapperImpl - metricsRegionWrapper
// 1 x ReadPointCalculationLock - smallestReadPointCalcLock
public static final long DEEP_OVERHEAD = FIXED_OVERHEAD + ClassSize.OBJECT + // closeLock
  (2 * ClassSize.ATOMIC_BOOLEAN) + // closed, closing
  (3 * ClassSize.ATOMIC_LONG) + // numPutsWithoutWAL, dataInMemoryWithoutWAL,
                                // compactionsFailed
  (3 * ClassSize.CONCURRENT_HASHMAP) + // lockedRows, scannerReadPoints, regionLockHolders
  WriteState.HEAP_SIZE + // writestate
  ClassSize.CONCURRENT_SKIPLISTMAP + ClassSize.CONCURRENT_SKIPLISTMAP_ENTRY + // stores
  (2 * ClassSize.REENTRANT_LOCK) + // lock, updatesLock
  MultiVersionConcurrencyControl.FIXED_SIZE // mvcc
  + 2 * ClassSize.TREEMAP // maxSeqIdInStores, replicationScopes
  + 2 * ClassSize.ATOMIC_INTEGER // majorInProgress, minorInProgress
  + ClassSize.STORE_SERVICES // store services
  + StoreHotnessProtector.FIXED_SIZE;
| |
| @Override |
| public long heapSize() { |
| // this does not take into account row locks, recent flushes, mvcc entries, and more |
| return DEEP_OVERHEAD + stores.values().stream().mapToLong(HStore::heapSize).sum(); |
| } |
| |
| /** |
| * Registers a new protocol buffer {@link Service} subclass as a coprocessor endpoint to be |
| * available for handling Region#execService(com.google.protobuf.RpcController, |
| * org.apache.hadoop.hbase.protobuf.generated.ClientProtos.CoprocessorServiceCall) calls. |
| * <p> |
| * Only a single instance may be registered per region for a given {@link Service} subclass (the |
| * instances are keyed on {@link com.google.protobuf.Descriptors.ServiceDescriptor#getFullName()}. |
| * After the first registration, subsequent calls with the same service name will fail with a |
| * return value of {@code false}. |
| * </p> |
| * @param instance the {@code Service} subclass instance to expose as a coprocessor endpoint |
| * @return {@code true} if the registration was successful, {@code false} otherwise |
| */ |
| public boolean registerService(com.google.protobuf.Service instance) { |
| /* |
| * No stacking of instances is allowed for a single service name |
| */ |
| com.google.protobuf.Descriptors.ServiceDescriptor serviceDesc = instance.getDescriptorForType(); |
| String serviceName = CoprocessorRpcUtils.getServiceName(serviceDesc); |
| if (coprocessorServiceHandlers.containsKey(serviceName)) { |
| LOG.error("Coprocessor service {} already registered, rejecting request from {} in region {}", |
| serviceName, instance, this); |
| return false; |
| } |
| |
| coprocessorServiceHandlers.put(serviceName, instance); |
| if (LOG.isDebugEnabled()) { |
| LOG.debug("Registered coprocessor service: region=" |
| + Bytes.toStringBinary(getRegionInfo().getRegionName()) + " service=" + serviceName); |
| } |
| return true; |
| } |
| |
| /** |
| * Executes a single protocol buffer coprocessor endpoint {@link Service} method using the |
| * registered protocol handlers. {@link Service} implementations must be registered via the |
| * {@link #registerService(com.google.protobuf.Service)} method before they are available. |
| * @param controller an {@code RpcContoller} implementation to pass to the invoked service |
| * @param call a {@code CoprocessorServiceCall} instance identifying the service, method, |
| * and parameters for the method invocation |
| * @return a protocol buffer {@code Message} instance containing the method's result |
| * @throws IOException if no registered service handler is found or an error occurs during the |
| * invocation |
| * @see #registerService(com.google.protobuf.Service) |
| */ |
| public com.google.protobuf.Message execService(com.google.protobuf.RpcController controller, |
| CoprocessorServiceCall call) throws IOException { |
| String serviceName = call.getServiceName(); |
| com.google.protobuf.Service service = coprocessorServiceHandlers.get(serviceName); |
| if (service == null) { |
| throw new UnknownProtocolException(null, "No registered coprocessor service found for " |
| + serviceName + " in region " + Bytes.toStringBinary(getRegionInfo().getRegionName())); |
| } |
| com.google.protobuf.Descriptors.ServiceDescriptor serviceDesc = service.getDescriptorForType(); |
| |
| String methodName = call.getMethodName(); |
| com.google.protobuf.Descriptors.MethodDescriptor methodDesc = |
| CoprocessorRpcUtils.getMethodDescriptor(methodName, serviceDesc); |
| |
| com.google.protobuf.Message.Builder builder = |
| service.getRequestPrototype(methodDesc).newBuilderForType(); |
| |
| org.apache.hadoop.hbase.protobuf.ProtobufUtil.mergeFrom(builder, |
| call.getRequest().toByteArray()); |
| com.google.protobuf.Message request = |
| CoprocessorRpcUtils.getRequest(service, methodDesc, call.getRequest()); |
| |
| if (coprocessorHost != null) { |
| request = coprocessorHost.preEndpointInvocation(service, methodName, request); |
| } |
| |
| final com.google.protobuf.Message.Builder responseBuilder = |
| service.getResponsePrototype(methodDesc).newBuilderForType(); |
| service.callMethod(methodDesc, controller, request, |
| new com.google.protobuf.RpcCallback<com.google.protobuf.Message>() { |
| @Override |
| public void run(com.google.protobuf.Message message) { |
| if (message != null) { |
| responseBuilder.mergeFrom(message); |
| } |
| } |
| }); |
| |
| if (coprocessorHost != null) { |
| coprocessorHost.postEndpointInvocation(service, methodName, request, responseBuilder); |
| } |
| IOException exception = |
| org.apache.hadoop.hbase.ipc.CoprocessorRpcUtils.getControllerException(controller); |
| if (exception != null) { |
| throw exception; |
| } |
| |
| return responseBuilder.build(); |
| } |
| |
| public Optional<byte[]> checkSplit() { |
| return checkSplit(false); |
| } |
| |
| /** |
| * Return the split point. An empty result indicates the region isn't splittable. |
| */ |
| public Optional<byte[]> checkSplit(boolean force) { |
| // Can't split META |
| if ( |
| this.getRegionInfo().isMetaRegion() |
| || TableName.NAMESPACE_TABLE_NAME.equals(this.getRegionInfo().getTable()) |
| ) { |
| return Optional.empty(); |
| } |
| |
| // Can't split a region that is closing. |
| if (this.isClosing()) { |
| return Optional.empty(); |
| } |
| |
| if (!force && !splitPolicy.shouldSplit()) { |
| return Optional.empty(); |
| } |
| |
| byte[] ret = splitPolicy.getSplitPoint(); |
| if (ret != null && ret.length > 0) { |
| ret = splitRestriction.getRestrictedSplitPoint(ret); |
| } |
| |
| if (ret != null) { |
| try { |
| checkRow(ret, "calculated split"); |
| } catch (IOException e) { |
| LOG.error("Ignoring invalid split for region {}", this, e); |
| return Optional.empty(); |
| } |
| return Optional.of(ret); |
| } else { |
| return Optional.empty(); |
| } |
| } |
| |
| /** Returns The priority that this region should have in the compaction queue */ |
| public int getCompactPriority() { |
| if (conf.getBoolean(SPLIT_IGNORE_BLOCKING_ENABLED_KEY, false) && checkSplit().isPresent()) { |
| // if a region should split, split it before compact |
| return Store.PRIORITY_USER; |
| } |
| return stores.values().stream().mapToInt(HStore::getCompactPriority).min() |
| .orElse(Store.NO_PRIORITY); |
| } |
| |
| /** Returns the coprocessor host */ |
| public RegionCoprocessorHost getCoprocessorHost() { |
| return coprocessorHost; |
| } |
| |
| /** @param coprocessorHost the new coprocessor host */ |
| public void setCoprocessorHost(final RegionCoprocessorHost coprocessorHost) { |
| this.coprocessorHost = coprocessorHost; |
| } |
| |
| @Override |
| public void startRegionOperation() throws IOException { |
| startRegionOperation(Operation.ANY); |
| } |
| |
| @Override |
| public void startRegionOperation(Operation op) throws IOException { |
| boolean isInterruptableOp = false; |
| switch (op) { |
| case GET: // interruptible read operations |
| case SCAN: |
| isInterruptableOp = true; |
| checkReadsEnabled(); |
| break; |
| case INCREMENT: // interruptible write operations |
| case APPEND: |
| case PUT: |
| case DELETE: |
| case BATCH_MUTATE: |
| case CHECK_AND_MUTATE: |
| isInterruptableOp = true; |
| break; |
| default: // all others |
| break; |
| } |
| if ( |
| op == Operation.MERGE_REGION || op == Operation.SPLIT_REGION || op == Operation.COMPACT_REGION |
| || op == Operation.COMPACT_SWITCH |
| ) { |
| // split, merge or compact region doesn't need to check the closing/closed state or lock the |
| // region |
| return; |
| } |
| if (this.closing.get()) { |
| throw new NotServingRegionException(getRegionInfo().getRegionNameAsString() + " is closing"); |
| } |
| lock(lock.readLock()); |
| // Update regionLockHolders ONLY for any startRegionOperation call that is invoked from |
| // an RPC handler |
| Thread thisThread = Thread.currentThread(); |
| if (isInterruptableOp) { |
| regionLockHolders.put(thisThread, true); |
| } |
| if (this.closed.get()) { |
| lock.readLock().unlock(); |
| throw new NotServingRegionException(getRegionInfo().getRegionNameAsString() + " is closed"); |
| } |
| // The unit for snapshot is a region. So, all stores for this region must be |
| // prepared for snapshot operation before proceeding. |
| if (op == Operation.SNAPSHOT) { |
| stores.values().forEach(HStore::preSnapshotOperation); |
| } |
| try { |
| if (coprocessorHost != null) { |
| coprocessorHost.postStartRegionOperation(op); |
| } |
| } catch (Exception e) { |
| if (isInterruptableOp) { |
| // would be harmless to remove what we didn't add but we know by 'isInterruptableOp' |
| // if we added this thread to regionLockHolders |
| regionLockHolders.remove(thisThread); |
| } |
| lock.readLock().unlock(); |
| throw new IOException(e); |
| } |
| } |
| |
| @Override |
| public void closeRegionOperation() throws IOException { |
| closeRegionOperation(Operation.ANY); |
| } |
| |
| @Override |
| public void closeRegionOperation(Operation operation) throws IOException { |
| if (operation == Operation.SNAPSHOT) { |
| stores.values().forEach(HStore::postSnapshotOperation); |
| } |
| Thread thisThread = Thread.currentThread(); |
| regionLockHolders.remove(thisThread); |
| lock.readLock().unlock(); |
| if (coprocessorHost != null) { |
| coprocessorHost.postCloseRegionOperation(operation); |
| } |
| } |
| |
| /** |
| * This method needs to be called before any public call that reads or modifies stores in bulk. It |
| * has to be called just before a try. #closeBulkRegionOperation needs to be called in the try's |
| * finally block Acquires a writelock and checks if the region is closing or closed. |
| * @throws NotServingRegionException when the region is closing or closed |
| * @throws RegionTooBusyException if failed to get the lock in time |
| * @throws InterruptedIOException if interrupted while waiting for a lock |
| */ |
| private void startBulkRegionOperation(boolean writeLockNeeded) throws IOException { |
| if (this.closing.get()) { |
| throw new NotServingRegionException(getRegionInfo().getRegionNameAsString() + " is closing"); |
| } |
| if (writeLockNeeded) lock(lock.writeLock()); |
| else lock(lock.readLock()); |
| if (this.closed.get()) { |
| if (writeLockNeeded) lock.writeLock().unlock(); |
| else lock.readLock().unlock(); |
| throw new NotServingRegionException(getRegionInfo().getRegionNameAsString() + " is closed"); |
| } |
| regionLockHolders.put(Thread.currentThread(), true); |
| } |
| |
| /** |
| * Closes the lock. This needs to be called in the finally block corresponding to the try block of |
| * #startRegionOperation |
| */ |
| private void closeBulkRegionOperation() { |
| regionLockHolders.remove(Thread.currentThread()); |
| if (lock.writeLock().isHeldByCurrentThread()) lock.writeLock().unlock(); |
| else lock.readLock().unlock(); |
| } |
| |
| /** |
| * Update LongAdders for number of puts without wal and the size of possible data loss. These |
| * information are exposed by the region server metrics. |
| */ |
| private void recordMutationWithoutWal(final Map<byte[], List<Cell>> familyMap) { |
| numMutationsWithoutWAL.increment(); |
| if (numMutationsWithoutWAL.sum() <= 1) { |
| LOG.info("writing data to region " + this |
| + " with WAL disabled. Data may be lost in the event of a crash."); |
| } |
| |
| long mutationSize = 0; |
| for (List<Cell> cells : familyMap.values()) { |
| // Optimization: 'foreach' loop is not used. See: |
| // HBASE-12023 HRegion.applyFamilyMapToMemstore creates too many iterator objects |
| assert cells instanceof RandomAccess; |
| int listSize = cells.size(); |
| for (int i = 0; i < listSize; i++) { |
| Cell cell = cells.get(i); |
| mutationSize += cell.getSerializedSize(); |
| } |
| } |
| |
| dataInMemoryWithoutWAL.add(mutationSize); |
| } |
| |
| private void lock(final Lock lock) throws IOException { |
| lock(lock, 1); |
| } |
| |
| /** |
| * Try to acquire a lock. Throw RegionTooBusyException if failed to get the lock in time. Throw |
| * InterruptedIOException if interrupted while waiting for the lock. |
| */ |
| private void lock(final Lock lock, final int multiplier) throws IOException { |
| try { |
| final long waitTime = Math.min(maxBusyWaitDuration, |
| busyWaitDuration * Math.min(multiplier, maxBusyWaitMultiplier)); |
| if (!lock.tryLock(waitTime, TimeUnit.MILLISECONDS)) { |
| // Don't print millis. Message is used as a key over in |
| // RetriesExhaustedWithDetailsException processing. |
| final String regionName = |
| this.getRegionInfo() == null ? "unknown" : this.getRegionInfo().getRegionNameAsString(); |
| final String serverName = this.getRegionServerServices() == null |
| ? "unknown" |
| : (this.getRegionServerServices().getServerName() == null |
| ? "unknown" |
| : this.getRegionServerServices().getServerName().toString()); |
| RegionTooBusyException rtbe = new RegionTooBusyException( |
| "Failed to obtain lock; regionName=" + regionName + ", server=" + serverName); |
| LOG.warn("Region is too busy to allow lock acquisition.", rtbe); |
| throw rtbe; |
| } |
| } catch (InterruptedException ie) { |
| if (LOG.isDebugEnabled()) { |
| LOG.debug("Interrupted while waiting for a lock in region {}", this); |
| } |
| throw throwOnInterrupt(ie); |
| } |
| } |
| |
| /** |
| * Calls sync with the given transaction ID |
| * @param txid should sync up to which transaction |
| * @throws IOException If anything goes wrong with DFS |
| */ |
| private void sync(long txid, Durability durability) throws IOException { |
| if (this.getRegionInfo().isMetaRegion()) { |
| this.wal.sync(txid); |
| } else { |
| switch (durability) { |
| case USE_DEFAULT: |
| // do what table defaults to |
| if (shouldSyncWAL()) { |
| this.wal.sync(txid); |
| } |
| break; |
| case SKIP_WAL: |
| // nothing do to |
| break; |
| case ASYNC_WAL: |
| // nothing do to |
| break; |
| case SYNC_WAL: |
| this.wal.sync(txid, false); |
| break; |
| case FSYNC_WAL: |
| this.wal.sync(txid, true); |
| break; |
| default: |
| throw new RuntimeException("Unknown durability " + durability); |
| } |
| } |
| } |
| |
| /** |
| * Check whether we should sync the wal from the table's durability settings |
| */ |
| private boolean shouldSyncWAL() { |
| return regionDurability.ordinal() > Durability.ASYNC_WAL.ordinal(); |
| } |
| |
| /** Returns the latest sequence number that was read from storage when this region was opened */ |
| public long getOpenSeqNum() { |
| return this.openSeqNum; |
| } |
| |
| @Override |
| public Map<byte[], Long> getMaxStoreSeqId() { |
| return this.maxSeqIdInStores; |
| } |
| |
| public long getOldestSeqIdOfStore(byte[] familyName) { |
| return wal.getEarliestMemStoreSeqNum(getRegionInfo().getEncodedNameAsBytes(), familyName); |
| } |
| |
| @Override |
| public CompactionState getCompactionState() { |
| boolean hasMajor = majorInProgress.get() > 0, hasMinor = minorInProgress.get() > 0; |
| return (hasMajor |
| ? (hasMinor ? CompactionState.MAJOR_AND_MINOR : CompactionState.MAJOR) |
| : (hasMinor ? CompactionState.MINOR : CompactionState.NONE)); |
| } |
| |
| public void reportCompactionRequestStart(boolean isMajor) { |
| (isMajor ? majorInProgress : minorInProgress).incrementAndGet(); |
| } |
| |
| public void reportCompactionRequestEnd(boolean isMajor, int numFiles, long filesSizeCompacted) { |
| int newValue = (isMajor ? majorInProgress : minorInProgress).decrementAndGet(); |
| |
| // metrics |
| compactionsFinished.increment(); |
| compactionNumFilesCompacted.add(numFiles); |
| compactionNumBytesCompacted.add(filesSizeCompacted); |
| |
| assert newValue >= 0; |
| } |
| |
| public void reportCompactionRequestFailure() { |
| compactionsFailed.increment(); |
| } |
| |
| public void incrementCompactionsQueuedCount() { |
| compactionsQueued.increment(); |
| } |
| |
| public void decrementCompactionsQueuedCount() { |
| compactionsQueued.decrement(); |
| } |
| |
| public void incrementFlushesQueuedCount() { |
| flushesQueued.increment(); |
| } |
| |
| protected void decrementFlushesQueuedCount() { |
| flushesQueued.decrement(); |
| } |
| |
| /** |
| * If a handler thread is eligible for interrupt, make it ineligible. Should be paired with |
| * {{@link #enableInterrupts()}. |
| */ |
| void disableInterrupts() { |
| regionLockHolders.computeIfPresent(Thread.currentThread(), (t, b) -> false); |
| } |
| |
| /** |
| * If a handler thread was made ineligible for interrupt via {{@link #disableInterrupts()}, make |
| * it eligible again. No-op if interrupts are already enabled. |
| */ |
| void enableInterrupts() { |
| regionLockHolders.computeIfPresent(Thread.currentThread(), (t, b) -> true); |
| } |
| |
| /** |
| * Interrupt any region options that have acquired the region lock via |
| * {@link #startRegionOperation(org.apache.hadoop.hbase.regionserver.Region.Operation)}, or |
| * {@link #startBulkRegionOperation(boolean)}. |
| */ |
| private void interruptRegionOperations() { |
| for (Map.Entry<Thread, Boolean> entry : regionLockHolders.entrySet()) { |
| // An entry in this map will have a boolean value indicating if it is currently |
| // eligible for interrupt; if so, we should interrupt it. |
| if (entry.getValue().booleanValue()) { |
| entry.getKey().interrupt(); |
| } |
| } |
| } |
| |
| /** |
| * Check thread interrupt status and throw an exception if interrupted. |
| * @throws NotServingRegionException if region is closing |
| * @throws InterruptedIOException if interrupted but region is not closing |
| */ |
| // Package scope for tests |
| void checkInterrupt() throws NotServingRegionException, InterruptedIOException { |
| if (Thread.interrupted()) { |
| if (this.closing.get()) { |
| throw new NotServingRegionException( |
| getRegionInfo().getRegionNameAsString() + " is closing"); |
| } |
| throw new InterruptedIOException(); |
| } |
| } |
| |
| /** |
| * Throw the correct exception upon interrupt |
| * @param t cause |
| */ |
| // Package scope for tests |
| IOException throwOnInterrupt(Throwable t) { |
| if (this.closing.get()) { |
| return (NotServingRegionException) new NotServingRegionException( |
| getRegionInfo().getRegionNameAsString() + " is closing").initCause(t); |
| } |
| return (InterruptedIOException) new InterruptedIOException().initCause(t); |
| } |
| |
| /** |
| * {@inheritDoc} |
| */ |
| @Override |
| public void onConfigurationChange(Configuration conf) { |
| this.storeHotnessProtector.update(conf); |
| // update coprocessorHost if the configuration has changed. |
| if ( |
| CoprocessorConfigurationUtil.checkConfigurationChange(getReadOnlyConfiguration(), conf, |
| CoprocessorHost.REGION_COPROCESSOR_CONF_KEY, |
| CoprocessorHost.USER_REGION_COPROCESSOR_CONF_KEY) |
| ) { |
| LOG.info("Update the system coprocessors because the configuration has changed"); |
| decorateRegionConfiguration(conf); |
| this.coprocessorHost = new RegionCoprocessorHost(this, rsServices, conf); |
| } |
| } |
| |
| /** |
| * {@inheritDoc} |
| */ |
| @Override |
| public void registerChildren(ConfigurationManager manager) { |
| configurationManager = manager; |
| stores.values().forEach(manager::registerObserver); |
| } |
| |
| /** |
| * {@inheritDoc} |
| */ |
| @Override |
| public void deregisterChildren(ConfigurationManager manager) { |
| stores.values().forEach(configurationManager::deregisterObserver); |
| } |
| |
| @Override |
| public CellComparator getCellComparator() { |
| return cellComparator; |
| } |
| |
| public long getMemStoreFlushSize() { |
| return this.memstoreFlushSize; |
| } |
| |
| //// method for debugging tests |
| void throwException(String title, String regionName) { |
| StringBuilder buf = new StringBuilder(); |
| buf.append(title + ", "); |
| buf.append(getRegionInfo().toString()); |
| buf.append(getRegionInfo().isMetaRegion() ? " meta region " : " "); |
| buf.append("stores: "); |
| for (HStore s : stores.values()) { |
| buf.append(s.getColumnFamilyDescriptor().getNameAsString()); |
| buf.append(" size: "); |
| buf.append(s.getMemStoreSize().getDataSize()); |
| buf.append(" "); |
| } |
| buf.append("end-of-stores"); |
| buf.append(", memstore size "); |
| buf.append(getMemStoreDataSize()); |
| if (getRegionInfo().getRegionNameAsString().startsWith(regionName)) { |
| throw new RuntimeException(buf.toString()); |
| } |
| } |
| |
| @Override |
| public void requestCompaction(String why, int priority, boolean major, |
| CompactionLifeCycleTracker tracker) throws IOException { |
| if (major) { |
| stores.values().forEach(HStore::triggerMajorCompaction); |
| } |
| rsServices.getCompactionRequestor().requestCompaction(this, why, priority, tracker, |
| RpcServer.getRequestUser().orElse(null)); |
| } |
| |
| @Override |
| public void requestCompaction(byte[] family, String why, int priority, boolean major, |
| CompactionLifeCycleTracker tracker) throws IOException { |
| HStore store = stores.get(family); |
| if (store == null) { |
| throw new NoSuchColumnFamilyException("column family " + Bytes.toString(family) |
| + " does not exist in region " + getRegionInfo().getRegionNameAsString()); |
| } |
| if (major) { |
| store.triggerMajorCompaction(); |
| } |
| rsServices.getCompactionRequestor().requestCompaction(this, store, why, priority, tracker, |
| RpcServer.getRequestUser().orElse(null)); |
| } |
| |
| private void requestFlushIfNeeded() throws RegionTooBusyException { |
| if (isFlushSize(this.memStoreSizing.getMemStoreSize())) { |
| requestFlush(); |
| } |
| } |
| |
| private void requestFlush() { |
| if (this.rsServices == null) { |
| return; |
| } |
| requestFlush0(FlushLifeCycleTracker.DUMMY); |
| } |
| |
| private void requestFlush0(FlushLifeCycleTracker tracker) { |
| boolean shouldFlush = false; |
| synchronized (writestate) { |
| if (!this.writestate.isFlushRequested()) { |
| shouldFlush = true; |
| writestate.flushRequested = true; |
| } |
| } |
| if (shouldFlush) { |
| // Make request outside of synchronize block; HBASE-818. |
| this.rsServices.getFlushRequester().requestFlush(this, tracker); |
| if (LOG.isDebugEnabled()) { |
| LOG.debug("Flush requested on " + this.getRegionInfo().getEncodedName()); |
| } |
| } else { |
| tracker.notExecuted("Flush already requested on " + this); |
| } |
| } |
| |
| @Override |
| public void requestFlush(FlushLifeCycleTracker tracker) throws IOException { |
| requestFlush0(tracker); |
| } |
| |
| /** |
| * This method modifies the region's configuration in order to inject replication-related features |
| * @param conf region configurations |
| */ |
| private static void decorateRegionConfiguration(Configuration conf) { |
| if (ReplicationUtils.isReplicationForBulkLoadDataEnabled(conf)) { |
| String plugins = conf.get(CoprocessorHost.REGION_COPROCESSOR_CONF_KEY, ""); |
| String replicationCoprocessorClass = ReplicationObserver.class.getCanonicalName(); |
| if (!plugins.contains(replicationCoprocessorClass)) { |
| conf.set(CoprocessorHost.REGION_COPROCESSOR_CONF_KEY, |
| (plugins.equals("") ? "" : (plugins + ",")) + replicationCoprocessorClass); |
| } |
| } |
| } |
| |
| public void addReadRequestsCount(long readRequestsCount) { |
| this.readRequestsCount.add(readRequestsCount); |
| } |
| |
| public void addWriteRequestsCount(long writeRequestsCount) { |
| this.writeRequestsCount.add(writeRequestsCount); |
| } |
| |
| @RestrictedApi(explanation = "Should only be called in tests", link = "", |
| allowedOnPath = ".*/src/test/.*") |
| boolean isReadsEnabled() { |
| return this.writestate.readsEnabled; |
| } |
| } |