| /*- |
| * Copyright (C) 2002, 2018, Oracle and/or its affiliates. All rights reserved. |
| * |
| * This file was distributed by Oracle as part of a version of Oracle Berkeley |
| * DB Java Edition made available at: |
| * |
| * http://www.oracle.com/technetwork/database/database-technologies/berkeleydb/downloads/index.html |
| * |
| * Please see the LICENSE file included in the top-level directory of the |
| * appropriate version of Oracle Berkeley DB Java Edition for a copy of the |
| * license and additional information. |
| */ |
| |
| package com.sleepycat.je.rep.vlsn; |
| |
| import static com.sleepycat.je.utilint.VLSN.NULL_VLSN; |
| |
| import java.util.ArrayList; |
| import java.util.NoSuchElementException; |
| import java.util.SortedMap; |
| import java.util.TreeMap; |
| |
| import com.sleepycat.bind.tuple.LongBinding; |
| import com.sleepycat.je.Cursor; |
| import com.sleepycat.je.CursorConfig; |
| import com.sleepycat.je.DatabaseEntry; |
| import com.sleepycat.je.DatabaseException; |
| import com.sleepycat.je.DbInternal; |
| import com.sleepycat.je.EnvironmentFailureException; |
| import com.sleepycat.je.LockMode; |
| import com.sleepycat.je.OperationStatus; |
| import com.sleepycat.je.cleaner.DataEraser; |
| import com.sleepycat.je.cleaner.FileProtector; |
| import com.sleepycat.je.cleaner.FileProtector.ProtectedFileSet; |
| import com.sleepycat.je.dbi.DatabaseImpl; |
| import com.sleepycat.je.dbi.EnvironmentImpl; |
| import com.sleepycat.je.rep.impl.RepParams; |
| import com.sleepycat.je.rep.vlsn.VLSNRange.VLSNRangeBinding; |
| import com.sleepycat.je.txn.BasicLocker; |
| import com.sleepycat.je.txn.Locker; |
| import com.sleepycat.je.txn.Txn; |
| import com.sleepycat.je.utilint.DbLsn; |
| import com.sleepycat.je.utilint.LongStat; |
| import com.sleepycat.je.utilint.Pair; |
| import com.sleepycat.je.utilint.StatGroup; |
| import com.sleepycat.je.utilint.VLSN; |
| |
| /** |
| * See @link{VLSNIndex} for an overview of the mapping system. The VLSNTracker |
| * packages the VLSNRange and the cached, in-memory VLSNBuckets. |
| * |
| * The tracker has a notion of the "currentBucket", which is the one receiving |
| * updates. All other cached buckets are finished and are awaiting a write to |
| * the database. Those finished buckets will only be updated in special |
| * circumstances, such as log cleaning or replication stream truncation, when |
| * we can assume that there will no readers viewing the buckets. |
| */ |
| class VLSNTracker { |
| private final EnvironmentImpl envImpl; |
| |
| /* The first mapping that is is the tracker cache. */ |
| private VLSN firstTrackedVLSN = NULL_VLSN; |
| |
| /* The last VLSN that is stored on disk. */ |
| private VLSN lastOnDiskVLSN = NULL_VLSN; |
| |
| /* |
| * A cache of buckets that are not on disk. The map key is the bucket's |
| * first VLSN. |
| */ |
| SortedMap<Long, VLSNBucket> bucketCache; |
| |
| /* |
| * The range should always be updated through assignment to a new |
| * VLSNRange, to ensure that the values stay consistent. The current bucket |
| * mutex must be taken in order to update this value, so that the current |
| * bucket is updated before the range is updated. When reading range |
| * fields, the caller must be sure to get a reference to a single range |
| * instance, so that there's no danger of getting inconsistent values. |
| */ |
| protected volatile VLSNRange range; |
| private boolean rangeTruncated; |
| |
| /* |
| * ProtectedFileRange protects files in 'range' from being deleted. The |
| * range start is changed during initialization and when the head of the |
| * index is truncated. It is changed while synchronized on 'this' to |
| * guarantees that files are not deleted while the range head is locked. |
| */ |
| private final FileProtector.ProtectedFileRange protectedFileRange; |
| |
| /* |
| * In the future, we may want to vary stride, maxMappings and maxDistance |
| * dynamically, in reaction to efficient the mappings look. |
| */ |
| private final int stride; |
| private final int maxMappings; |
| private final int maxDistance; |
| |
| private final LongStat nBucketsCreated; |
| |
| /* |
| * Create an VLSNTracker, with the range initialized from the mapping db. |
| */ |
| VLSNTracker(EnvironmentImpl envImpl, |
| DatabaseImpl mappingDbImpl, |
| int stride, |
| int maxMappings, |
| int maxDistance, |
| StatGroup statistics) |
| throws DatabaseException { |
| |
| this.stride = stride; |
| this.maxMappings = maxMappings; |
| this.maxDistance = maxDistance; |
| this.envImpl = envImpl; |
| nBucketsCreated = |
| new LongStat(statistics, VLSNIndexStatDefinition.N_BUCKETS_CREATED); |
| |
| bucketCache = new TreeMap<>(); |
| |
| /* |
| * Protect all files initially. The range lower bound will be advanced |
| * later during initialization. This special ProtectedFileRange does |
| * not impact LogSizeStats -- see FileProtector. |
| */ |
| final FileProtector fileProtector = envImpl.getFileProtector(); |
| |
| protectedFileRange = fileProtector.protectFileRange( |
| FileProtector.VLSN_INDEX_NAME, 0 /*rangeStart*/, |
| true /*protectVlsnIndex*/, false /*protectBarrenFiles*/); |
| |
| fileProtector.setVLSNIndexProtectedFileRange(protectedFileRange); |
| |
| /* Read the current range information. */ |
| DatabaseEntry key = new DatabaseEntry(); |
| DatabaseEntry data = new DatabaseEntry(); |
| LongBinding.longToEntry(VLSNRange.RANGE_KEY, key); |
| |
| Cursor cursor = null; |
| Locker locker = null; |
| try { |
| locker = BasicLocker.createBasicLocker(envImpl); |
| cursor = DbInternal.makeCursor(mappingDbImpl, |
| locker, |
| CursorConfig.DEFAULT); |
| DbInternal.getCursorImpl(cursor).setAllowEviction(false); |
| |
| OperationStatus status = cursor.getSearchKey(key, data, |
| LockMode.DEFAULT); |
| if (status == OperationStatus.SUCCESS) { |
| /* initialize the range from the database. */ |
| VLSNRangeBinding rangeBinding = new VLSNRangeBinding(); |
| range = rangeBinding.entryToObject(data); |
| lastOnDiskVLSN = range.getLast(); |
| } else if (status == OperationStatus.NOTFOUND) { |
| /* No mappings exist before. */ |
| range = VLSNRange.EMPTY; |
| } else { |
| throw EnvironmentFailureException.unexpectedState |
| ("VLSNTracker init: status=" + status); |
| } |
| } finally { |
| if (cursor != null) { |
| cursor.close(); |
| } |
| |
| if (locker != null) { |
| locker.operationEnd(true); |
| } |
| } |
| } |
| |
| /* |
| * Create an empty VLSNTracker. Used during recovery. |
| */ |
| VLSNTracker(EnvironmentImpl envImpl, |
| int stride, |
| int maxMappings, |
| int maxDistance) { |
| |
| this.envImpl = envImpl; |
| this.stride = stride; |
| this.maxMappings = maxMappings; |
| this.maxDistance = maxDistance; |
| this.protectedFileRange = null; |
| |
| /* Set up a temporary stat group for use during recovery */ |
| StatGroup statistics = |
| new StatGroup(VLSNIndexStatDefinition.GROUP_NAME, |
| VLSNIndexStatDefinition.GROUP_DESC); |
| nBucketsCreated = |
| new LongStat(statistics, |
| VLSNIndexStatDefinition.N_BUCKETS_CREATED); |
| |
| initEmpty(); |
| } |
| |
| void initEmpty() { |
| bucketCache = new TreeMap<>(); |
| range = VLSNRange.EMPTY; |
| } |
| |
| /** |
| * Return a bucket for reading a mapping for this VLSN. If vlsn is > |
| * lastOnDisk, a bucket is guaranteed to be returned. |
| */ |
| synchronized VLSNBucket getGTEBucket(VLSN vlsn) { |
| |
| if (lastOnDiskVLSN.compareTo(vlsn) >= 0) { |
| /* The bucket is in the database, not the cache. */ |
| return null; |
| } |
| |
| Long pivot = vlsn.getSequence() + 1; |
| SortedMap<Long, VLSNBucket> head = bucketCache.headMap(pivot); |
| VLSNBucket prevBucket = null; |
| if (head.size() > 0) { |
| prevBucket = head.get(head.lastKey()); |
| if (prevBucket.owns(vlsn)) { |
| return prevBucket; |
| } |
| } |
| |
| /* |
| * If the key is not in the headMap, we must return the next bucket |
| * with mappings that follow on. |
| */ |
| SortedMap<Long, VLSNBucket> tail = bucketCache.tailMap(pivot); |
| if (tail.size() > 0) { |
| VLSNBucket bucket = tail.get(tail.firstKey()); |
| assert (bucket.owns(vlsn) || bucket.follows(vlsn)) : |
| "VLSN " + vlsn + " got wrong bucket " + bucket; |
| return bucket; |
| } |
| |
| throw EnvironmentFailureException.unexpectedState |
| (envImpl, "VLSN " + vlsn + " should be held within this tracker. " + |
| this + " prevBucket=" + prevBucket); |
| } |
| |
| /** |
| * Get the bucket which holds a mapping for this VLSN. If there is no such |
| * bucket, get the one directly preceding it. If this VLSN is >= |
| * firstTrackedVLSN, then we should be able to guarantee that a bucket is |
| * returned. |
| */ |
| synchronized VLSNBucket getLTEBucket(VLSN vlsn) { |
| |
| if (firstTrackedVLSN.equals(NULL_VLSN) || |
| (firstTrackedVLSN.compareTo(vlsn) > 0)) { |
| return null; |
| } |
| |
| Long pivot = vlsn.getSequence() + 1; |
| SortedMap<Long, VLSNBucket> head = bucketCache.headMap(pivot); |
| if (head.size() > 0) { |
| return head.get(head.lastKey()); |
| } |
| |
| /* |
| * We shouldn't get here. Get the tail purely for creating a debugging |
| * message. |
| */ |
| SortedMap<Long, VLSNBucket> tail = bucketCache.tailMap(pivot); |
| VLSNBucket nextBucket = null; |
| if (tail.size() > 0) { |
| nextBucket = tail.get(tail.firstKey()); |
| } |
| throw EnvironmentFailureException.unexpectedState |
| (envImpl, "VLSN " + vlsn + " should be held within this tracker. " + |
| this + " nextBucket=" + nextBucket); |
| } |
| |
| /** |
| * Record a new VLSN->LSN mapping. We guarantee that the first and last |
| * VLSNs in the range have a mapping. If a VLSN comes out of order, we will |
| * discard it, and will not update the range. |
| */ |
| synchronized void track(VLSN vlsn, long lsn, byte entryTypeNum) { |
| |
| VLSNBucket currentBucket; |
| |
| if (vlsn.compareTo(lastOnDiskVLSN) < 0) { |
| |
| /* |
| * This VLSN is a laggard. It belongs to a bucket that has already |
| * gone to disk. Since on disk buckets can't be modified, throw |
| * this mapping away. Do be sure to update the range in case |
| * lastSync or lastTxnEnd should be updated as a result of this |
| * mapping. |
| */ |
| updateRange(vlsn, entryTypeNum); |
| return; |
| } |
| |
| if (bucketCache.size() == 0) { |
| /* Nothing in the tracker, add a new bucket. */ |
| currentBucket = new VLSNBucket(DbLsn.getFileNumber(lsn), stride, |
| maxMappings, maxDistance, vlsn); |
| nBucketsCreated.increment(); |
| |
| bucketCache.put(currentBucket.getFirst().getSequence(), |
| currentBucket); |
| } else { |
| /* Find the last bucket. Only the last bucket is updateable. */ |
| currentBucket = bucketCache.get(bucketCache.lastKey()); |
| } |
| |
| /* |
| * This VLSN is a laggard that was preceded by an earlier mapping which |
| * came unseasonably early. This VLSN can't fit into the current |
| * bucket, and since we only want to update the last bucket, we'll |
| * throw this mapping away. |
| */ |
| if (currentBucket.follows(vlsn)) { |
| updateRange(vlsn, entryTypeNum); |
| return; |
| } |
| |
| if (!currentBucket.put(vlsn, lsn)) { |
| |
| /* |
| * Couldn't put the mapping in this bucket. Close it and move to a |
| * new one. |
| */ |
| currentBucket = |
| new VLSNBucket(DbLsn.getFileNumber(lsn), stride, |
| maxMappings, maxDistance, vlsn); |
| nBucketsCreated.increment(); |
| bucketCache.put(currentBucket.getFirst().getSequence(), |
| currentBucket); |
| if (!currentBucket.put(vlsn, lsn)) { |
| throw EnvironmentFailureException.unexpectedState |
| (envImpl, "Couldn't put VLSN " + vlsn + " into " + |
| currentBucket); |
| } |
| } |
| |
| updateRange(vlsn, entryTypeNum); |
| |
| /* |
| * Only update the firstTrackedVLSN if this mapping is really resident |
| * in a bucket. Don't update it if it's an out-of-order, skipped |
| * mapping. |
| */ |
| if (firstTrackedVLSN == NULL_VLSN) { |
| firstTrackedVLSN = vlsn; |
| } |
| } |
| |
| /** |
| * Update the range with a newly arrived VLSN. |
| */ |
| private void updateRange(VLSN vlsn, byte entryTypeNum) { |
| |
| /* |
| * Get a reference to the volatile range field so that we always see |
| * consistent values. |
| */ |
| VLSNRange currentRange = range; |
| range = currentRange.getUpdateForNewMapping(vlsn, entryTypeNum); |
| } |
| |
| /** |
| * Flush the tracker cache to disk. |
| * Ideally, we'd allow concurrent VLSN put() calls while a flush to database |
| * is happening. To do that, we need a more sophisticated synchronization |
| * scheme that defines the immutable vs. mutable portion of the tracker |
| * cache. See SR [#17689] |
| */ |
| synchronized void flushToDatabase(DatabaseImpl mappingDbImpl, Txn txn) { |
| |
| VLSNRange flushRange = range; |
| |
| if (bucketCache.size() == 0) { |
| /* |
| * No mappings to write out, but it's possible that the range |
| * changed due to a truncation, in which case we must update the |
| * database. RangeTruncated is used to reduce the chance that |
| * we are doing unnecessary writes of the range record to the db. |
| */ |
| if (rangeTruncated) { |
| lastOnDiskVLSN = flushRange.writeToDatabase(envImpl, |
| mappingDbImpl, txn); |
| rangeTruncated = false; |
| } |
| return; |
| } |
| |
| /* |
| * Save information about the portion of the cache that we are trying |
| * to flush. Close off the last bucket, so the flush portion becomes |
| * immutable. In the past, this was in a synchronization block. |
| * This isn't strictly needed right now, since the method is currently |
| * fully synchronized, but is a good practice, and will be |
| * necessary if future changes make it possible to have concurrent |
| * puts. |
| */ |
| VLSNBucket lastBucket = bucketCache.get(bucketCache.lastKey()); |
| lastBucket.close(); |
| |
| /* |
| * Write the flush portion to disk. Ideally, this portion would be done |
| * without synchronization on the tracker, to permit concurrent reads |
| * and writes to the tracker. Note that because there is no |
| * synchronization, the buckets must be flushed to disk in vlsn order, |
| * so that the vlsn index always looks consistent. There should be no |
| * gaps in the bucket range. This assumption permits concurrent access |
| * by VLSNIndex.getGTEBucketFromDatabase. |
| */ |
| VLSN currentLastVLSN = lastOnDiskVLSN; |
| for (Long key : bucketCache.keySet()) { |
| |
| VLSNBucket target = bucketCache.get(key); |
| |
| /* Do some sanity checking before we write this bucket to disk. */ |
| validateBeforeWrite(target, |
| flushRange, |
| currentLastVLSN, |
| target==lastBucket); |
| target.writeToDatabase(envImpl, mappingDbImpl, txn); |
| currentLastVLSN = target.getLast(); |
| } |
| |
| lastOnDiskVLSN = flushRange.writeToDatabase(envImpl, |
| mappingDbImpl, txn); |
| rangeTruncated = false; |
| |
| /* |
| * Update the tracker to remove the parts that have been written to |
| * disk. Update firstTrackedVLSN, lastOnDiskVLSN, and clear the bucket |
| * cache. |
| * |
| * In the current implementation, bucketCache is guaranteed not to |
| * change, so we can just clear the cache. If we had written the |
| * buckets out without synchronization, the cache might have gained |
| * mappings, and we wouuld have to be careful to set the |
| * firstTrackedVLSN to be the first VLSN of the remaining buckets, and |
| * to set lastTrackedVLSN to be the last VLSN in flushRange. |
| * |
| * We would also have to explicitly call SortedMap.remove() to remove |
| * the written buckets. Do not use TreeMap.tailSet, which would |
| * inadvertently leave us with the semantics of a SubMap, which has |
| * mandated key ranges. With rollbacks, we may end up adding buckets |
| * that are "repeats" in the future. |
| */ |
| bucketCache.clear(); |
| firstTrackedVLSN = NULL_VLSN; |
| } |
| |
| /** |
| * Do some sanity checking before the write, to help prevent any corruption. |
| */ |
| private void validateBeforeWrite(VLSNBucket target, |
| VLSNRange flushRange, |
| VLSN currentLastVLSN, |
| boolean isLastBucket) { |
| |
| if (!(currentLastVLSN.equals(NULL_VLSN)) && |
| (target.getFirst().compareTo(currentLastVLSN) <= 0)) { |
| throw EnvironmentFailureException.unexpectedState |
| (envImpl, "target bucket overlaps previous bucket. " + |
| "currentLastVLSN= " + currentLastVLSN + " target=" + |
| target); |
| } |
| |
| if (target.getLast().compareTo(flushRange.getLast()) > 0) { |
| throw EnvironmentFailureException.unexpectedState |
| (envImpl, "target bucket exceeds flush range. " + |
| "range= " + flushRange + " target=" + |
| target); |
| } |
| |
| if (isLastBucket) { |
| if (target.getLast().compareTo(flushRange.getLast()) != 0) { |
| throw EnvironmentFailureException.unexpectedState |
| (envImpl, "end of last bucket should match end of range. " + |
| "range= " + flushRange + " target=" + target); |
| } |
| } |
| } |
| |
| /** |
| * Initially the protectedFileRange is set to the file of the range start. |
| * It is advanced later when the range head is truncated. |
| */ |
| void initProtectedFileRange(long firstFile) { |
| protectedFileRange.advanceRange(firstFile); |
| } |
| |
| /** |
| * @see VLSNIndex#protectRangeHead |
| */ |
| synchronized ProtectedFileSet protectRangeHead(String lockerName) { |
| |
| /* |
| * Protect all files in the range, including barren files. Because we |
| * are synchronized, we know that protectedFileRange will not be |
| * advanced and the range head will remain stable. |
| */ |
| return envImpl.getFileProtector().protectFileRange( |
| lockerName, protectedFileRange.getRangeStart(), |
| true /*protectVlsnIndex*/, true /*protectBarrenFiles*/); |
| } |
| |
| /** |
| * Returns the file at the lower bound of the current range. This method |
| * does not synchronize. |
| */ |
| long getProtectedRangeStartFile() { |
| return protectedFileRange.getRangeStart(); |
| } |
| |
| /** |
| * Try to advance the VLSNIndex ProtectedFileRange and truncate the head |
| * of the tracker's range, so that bytesNeeded can be freed by deleting |
| * files in this range. |
| * |
| * @return {deleteEnd, deleteFileNum} pair if the range was changed, or |
| * null if no change is currently needed/possible. |
| * -- deleteEnd is the last VLSN to be truncated. |
| * -- deleteFileNum the file having deleteEnd as its last VLSN. |
| */ |
| synchronized Pair<VLSN, Long> tryTruncateFromHead( |
| final long bytesNeeded, |
| final LogItemCache logItemCache) { |
| |
| /* Preserve the last matchpoint and a minimum number of VLSNs. */ |
| final long preserveVlsn = getPreserveVLSN(); |
| |
| if (preserveVlsn < 0) { |
| return null; |
| } |
| |
| /* Determine the vlsn/file after which to truncate the range. */ |
| final Pair<VLSN, Long> truncateInfo = |
| envImpl.getFileProtector().checkVLSNIndexTruncation( |
| bytesNeeded, new VLSN(preserveVlsn)); |
| |
| if (truncateInfo == null) { |
| return null; |
| } |
| |
| /* |
| * It is possible that after checkVLSNIndexTruncation returns, a file |
| * in the range we expect to delete becomes protected. This is safe, |
| * because we know that a syncup will not begin while synchronized on |
| * the VLSNTracker, and feeders always advance their range (never |
| * retreat). It is also still beneficial to truncate the index because |
| * we expect the other protection for such a file to be removed soon, |
| * and then the file can be deleted quickly. |
| */ |
| return truncateFromHead( |
| truncateInfo.first(), truncateInfo.second(), logItemCache) ? |
| truncateInfo : null; |
| } |
| |
| /** |
| * Like {@link #tryTruncateFromHead(long, LogItemCache)} but allows |
| * specifying a specific {deleteEnd, deleteFileNum}. |
| * |
| * <p>There are several reasons for returning false. TODO: Return more |
| * info so that {@link DataEraser#getNextFile} can distinguish between |
| * these cases.</p> |
| * <ol> |
| * <li>The files are protected.</li> |
| * <li>The files are needed to retain at least |
| * {@link RepParams#MIN_VLSN_INDEX_SIZE} VLSNs.</li> |
| * <li>The truncation truncation was performed earlier, perhaps by |
| * another thread.</li> |
| * </ol> |
| */ |
| synchronized boolean tryTruncateFromHead( |
| final VLSN deleteEnd, |
| final long deleteFileNum, |
| final LogItemCache logItemCache) { |
| |
| /* Preserve the last matchpoint and a minimum number of VLSNs. */ |
| if (deleteEnd.getSequence() >= getPreserveVLSN()) { |
| return false; |
| } |
| |
| /* Determine whether the VLSNIndex for this file is protected. */ |
| if (!envImpl.getFileProtector().checkVLSNIndexTruncation( |
| deleteFileNum)) { |
| return false; |
| } |
| |
| return truncateFromHead(deleteEnd, deleteFileNum, logItemCache); |
| } |
| |
| /** |
| * Returns the start of the VLSN range that must be preserved. This is |
| * the minimum of two values: |
| * |
| * 1. The lastSync VLSN, which is the last matchpoint and is needed |
| * to perform syncup. |
| * |
| * 2. The last VLSN minus MIN_VLSN_INDEX_SIZE, which is the minimum |
| * number of VLSNs to preserve for good measure and to prevent the |
| * VLSNIndex from becoming empty. |
| * |
| * Note that a negative value may be returned. |
| */ |
| private long getPreserveVLSN() { |
| return Math.min( |
| range.getLastSync().getSequence(), |
| range.getLast().getSequence() - |
| envImpl.getConfigManager().getInt( |
| RepParams.MIN_VLSN_INDEX_SIZE)); |
| } |
| |
| /** |
| * Truncate the head of the tracker's range, and advance the |
| * ProtectedFileRange accordingly. |
| * |
| * @return true if range is changed, or false if no change is needed. |
| */ |
| synchronized boolean truncateFromHead( |
| VLSN deleteEnd, |
| long deleteFileNum, |
| LogItemCache logItemCache) { |
| |
| /* The caller should pass a non-null VLSN, but check for safety. */ |
| if (deleteEnd.equals(VLSN.NULL_VLSN)) { |
| return false; |
| } |
| |
| /* |
| * Check the VLSN found in the deleted file against the existing |
| * mappings. The range should not be empty, and doing the truncation |
| * should not remove the last sync point. We assume that once this |
| * environment has received any portion of the replication stream, it |
| * will maintain a minimum set of VLSNs. |
| */ |
| VLSNRange oldRange = range; |
| |
| if (oldRange.getFirst().compareTo(deleteEnd) > 0) { |
| /* deleteEnd has already been cast out of the index. */ |
| return false; |
| } |
| |
| if (oldRange.isEmpty()) { |
| throw EnvironmentFailureException.unexpectedState( |
| envImpl, "Didn't expect current range to be empty. " + |
| " End of delete range = " + deleteEnd); |
| } |
| |
| if (!oldRange.getLastSync().equals(NULL_VLSN) && |
| (deleteEnd.compareTo(oldRange.getLastSync()) > 0)) { |
| |
| throw EnvironmentFailureException.unexpectedState( |
| envImpl, "Can't log clean away last matchpoint. DeleteEnd= " + |
| deleteEnd + " lastSync=" + oldRange.getLastSync()); |
| } |
| |
| /* |
| * Sanity checks are over and we are committed to changing the range. |
| * |
| * Clear the LogItemCache in case it contains entries with VLSNs being |
| * truncated. This seems extremely unlikely, since the cache is a small |
| * number of entries at the end of the VLSN range. But perhaps it is |
| * simplest to clear it in order to guarantee safely, and incur the |
| * infrequent cost of a few cache misses. |
| */ |
| if (logItemCache != null) { |
| logItemCache.clear(); |
| } |
| |
| /* |
| * Advance the ProtectedFileRange to allow files in the truncated |
| * portion to be deleted. deleteFileNum is the file having deleteEnd as |
| * its highest VLSN. The VLSN range will now start with deleteEnd+1, so |
| * now we only need to protect files starting at deleteFileNum+1. |
| */ |
| protectedFileRange.advanceRange(deleteFileNum + 1); |
| |
| /* |
| * Update the in-memory, cached portion of the index. |
| * |
| * Update range first, to prevent others from attempting to read the |
| * truncated portions. Update the firstTrackedLSN last, after the |
| * buckets are removed; getLTEBucket relies on the firstTrackedLSN |
| * value. |
| */ |
| range = oldRange.shortenFromHead(deleteEnd); |
| |
| /* |
| * Ensure that the range is written to the db even if the resulting |
| * number of buckets is 0. |
| */ |
| rangeTruncated = true; |
| |
| /* |
| * afterDelete may not == range.getFirst, becayse range.getFirst may |
| * be NULL_VLSN if the entire vlsn index has been deleted. |
| */ |
| VLSN afterDelete = deleteEnd.getNext(); |
| |
| /* |
| * Check if the cached buckets needs to be modified. Suppose vlsns 1-8 |
| * are on disk, and vlsns 12->40 are in the tracker cache, with an out |
| * of order gap from 9 - 11. |
| * |
| * case 1: deleteEnd <= 8, Do not need to truncate anything in the |
| * bucket cacke, but we may have to make a ghost bucket to fill the |
| * gap. |
| * case 2: deleteEnd == 11, we don't need to modify the cache or |
| * create a ghost bucket. |
| * case 3: deleteEnd >8 and < 11 is in the gap, we need to create a |
| * ghost bucket to hold the beginning of the range.. |
| * case 4: deleteEnd >= 12, we need to modify the bucket cache. |
| */ |
| if (!lastOnDiskVLSN.equals(VLSN.NULL_VLSN) && |
| (lastOnDiskVLSN.compareTo(deleteEnd) >= 0)) { |
| |
| /* |
| * case 1: the buckets in the tracker cache are unaffected, all |
| * the pertinent mappings are on disk. |
| */ |
| if (lastOnDiskVLSN.equals(deleteEnd)) { |
| if (firstTrackedVLSN.compareTo(afterDelete) > 0) { |
| checkForGhostBucket(bucketCache, deleteFileNum); |
| } |
| lastOnDiskVLSN = NULL_VLSN; |
| } |
| return true; |
| } |
| |
| assert(!firstTrackedVLSN.equals(NULL_VLSN)); |
| |
| if (firstTrackedVLSN.equals(afterDelete)) { |
| /* case 2: The tracker is lined up perfectly with the new range. */ |
| lastOnDiskVLSN = NULL_VLSN; |
| return true; |
| } |
| |
| if (firstTrackedVLSN.compareTo(afterDelete) > 0) { |
| /* |
| * case 3: we have to make a ghost bucket. |
| */ |
| checkForGhostBucket(bucketCache, deleteFileNum); |
| lastOnDiskVLSN = NULL_VLSN; |
| return true; |
| } |
| |
| /* |
| * case 4: we have to prune the buckets. |
| */ |
| VLSNBucket owningBucket = getLTEBucket(deleteEnd); |
| Long retainBucketKey = owningBucket.getFirst().getNext().getSequence(); |
| SortedMap<Long, VLSNBucket> tail; |
| try { |
| /* |
| * We need to chop off part of the bucket cache. Find the portion |
| * to retain. |
| */ |
| tail = bucketCache.tailMap(retainBucketKey); |
| checkForGhostBucket(tail, deleteFileNum); |
| bucketCache = tail; |
| } catch (NoSuchElementException e) { |
| firstTrackedVLSN = NULL_VLSN; |
| bucketCache = new TreeMap<>(); |
| } |
| |
| lastOnDiskVLSN = NULL_VLSN; |
| return true; |
| } |
| |
| private void checkForGhostBucket(SortedMap<Long, VLSNBucket> buckets, |
| long deleteFileNum) { |
| /* |
| * The range has just been truncated. If the first bucket's first vlsn |
| * != the first in range, insert a GhostBucket. |
| */ |
| Long firstKey = buckets.firstKey(); |
| VLSNBucket firstBucket = buckets.get(firstKey); |
| VLSN firstRangeVLSN = range.getFirst(); |
| VLSN firstBucketVLSN = firstBucket.getFirst(); |
| if (!firstBucketVLSN.equals(firstRangeVLSN)) { |
| long nextFile = |
| envImpl.getFileManager().getFollowingFileNum(deleteFileNum, |
| true /* forward */); |
| long lastPossibleLsn = firstBucket.getLsn(firstBucketVLSN); |
| VLSNBucket placeholder = new GhostBucket(firstRangeVLSN, |
| DbLsn.makeLsn(nextFile, 0), |
| lastPossibleLsn); |
| buckets.put(firstRangeVLSN.getSequence(), placeholder); |
| } |
| } |
| |
| /** |
| * Remove the mappings for VLSNs >= deleteStart. This should only be used |
| * at syncup, or recovery. We can assume no new mappings are coming in, but |
| * the VLSNIndex may be read during this time; checkpoints continue to |
| * happen during rollbacks. Since syncup is always at a sync-able log |
| * entry, and since we don't roll back past a commit, we can assume that |
| * the lastSync can be changed to deleteStart.getPrev(), and lastTxnEnd is |
| * unchanged. The VLSNRange itself is updated before the tracker and |
| * on-disk buckets are pruned. |
| * |
| * The VLSNIndex guarantees that the beginning and end vlsns in the range |
| * have mappings so LTE and GTE search operations work. When truncating the |
| * mappings in the tracker, check to see if the new end vlsn has a mapping, |
| * since that may not happen if the buckets are cut in the middle, or have |
| * a gap due to out of order mapping. For example, suppose mapping for vlsn |
| * 20 came before the mappings for vlsn 17, 18 and 19. The buckets would |
| * have this gap: |
| * Bucket A: vlsns 10-16 |
| * Bucket B: vlsns 20-25 |
| * Bucket C: vlsns 26 - 30 |
| * If the new end of vlsn range is 11-14, 17, 18, 19, 21 - 24, or 27-29, |
| * we have to end the vlsn tracker with a new mapping of |
| * (vlsn (deleteStart-1) -> prevLsn) to cap off the buckets. |
| * |
| * Cases: |
| * End of range is in the gap -- either 17, 18, 19 |
| * 1. on disk buckets exist, but don't encompass the gap: bucket A on disk |
| * 2. all buckets are in the tracker cache: bucket A in tracker |
| * 3. on disk buckets encompass the gap: bucket A and B on disk. |
| */ |
| synchronized void truncateFromTail(VLSN deleteStart, long prevLsn) { |
| |
| /* |
| * Update the range first, it will define the range that the vlsnIndex |
| * covers. Then adjust any mappings held in the bucket cache. Don't |
| * update the lastOnDiskVLSN marker, which says what is on disk. Since |
| * the on-disk portion may also get truncated, the caller will update |
| * the lastOnDiskVLSN field when it inspects the database. |
| */ |
| VLSNRange oldRange = range; |
| range = oldRange.shortenFromEnd(deleteStart); |
| |
| /* |
| * Ensure that the range is written to the db even if the resulting |
| * number of buckets is 0. |
| */ |
| rangeTruncated = true; |
| |
| if (firstTrackedVLSN.equals(NULL_VLSN)) { |
| /* No mappings in this tracker */ |
| return; |
| } |
| |
| if (firstTrackedVLSN.compareTo(deleteStart) >= 0) { |
| |
| /* |
| * Everything in this tracker should be removed. In addition, the |
| * caller may be removing some items from the database. |
| */ |
| bucketCache.clear(); |
| firstTrackedVLSN = NULL_VLSN; |
| return; |
| } |
| |
| /* |
| * We need to do some pruning of the bucket in the cache. Find the |
| * buckets in the bucketCache that have mappings >= deleteStart, and |
| * remove their mappings. First establish the headset of buckets that |
| * should be preserved. |
| */ |
| VLSNBucket targetBucket = getGTEBucket(deleteStart); |
| Long targetKey = targetBucket.getFirst().getSequence(); |
| |
| /* The newCache has buckets that should be preserved. */ |
| SortedMap<Long, VLSNBucket> newCache = |
| new TreeMap<>(bucketCache.headMap(targetKey)); |
| |
| /* |
| * Prune any mappings >= deleteStart out of targetBucket. If it has any |
| * mappings left, add it to newCache. |
| */ |
| targetBucket.removeFromTail(deleteStart, prevLsn); |
| if (!targetBucket.empty()) { |
| newCache.put(targetBucket.getFirst().getSequence(), |
| targetBucket); |
| } |
| |
| bucketCache = newCache; |
| |
| /* |
| * Now all truncated mappings have been removed from the index. Check |
| * that the end point of the vlsn range is represented in this |
| * tracker. Since vlsn mappings can come out of order, it's |
| * possible that the buckets have a gap, and that truncation will |
| * cleave the bucket cache just at a point where a mapping is |
| * missing. For example, vlsn mappings come in this order: |
| * vlsn 16, vlsn 18, vlsn 17 |
| * causing the buckets to have this gap: |
| * bucket N: vlsn 10 - 16 |
| * bucket N+1: vlsn 18 - 20 |
| * and that the vlsn range is truncated with a deleteStart of vlsn 18. |
| * The new end range becomes 17, but the buckets do not have a |
| * mapping for vlsn 17. If so, create a mapping now. |
| * |
| * If we are at this point, we know that the tracker should contain |
| * a mapping for the last VLSN in the range. Fix it now, so the |
| * datastructure is as tidy as possible. |
| */ |
| boolean needEndMapping; |
| if (bucketCache.isEmpty()) { |
| needEndMapping = true; |
| } else { |
| VLSNBucket lastBucket = bucketCache.get(bucketCache.lastKey()); |
| needEndMapping = |
| (lastBucket.getLast().compareTo(range.getLast()) < 0); |
| } |
| |
| if (needEndMapping) { |
| addEndMapping(range.getLast(), prevLsn); |
| } |
| } |
| |
| /** |
| * Called by the VLSNIndex to see if we have to add a bucket in the tracker |
| * after pruning the on-disk storage. The get{LTE,GTE}Bucket methods |
| * assume that there are mappings for the first and last VLSN in the range. |
| * But since out of order vlsn mappings can cause non-contiguous buckets, |
| * the pruning may have lost the end mapping. [#23491] |
| */ |
| synchronized void ensureRangeEndIsMapped(VLSN lastVLSN, long lastLsn) { |
| |
| /* |
| * if lastOnDiskVLSN < lastVLSN < firstTrackedVLSN or |
| * lastOnDiskVLSN < lastVLSN and firstTrackedVLSN is null |
| * then we need to add a mapping for lastVLSN->lastLsn. Otherwise a |
| * mapping already exists. |
| */ |
| if (lastOnDiskVLSN.compareTo(lastVLSN) < 0) { |
| |
| /* |
| * The on-disk vlsn mappings aren't sufficient to cover the |
| * lastVLSN. There needs to be a mapping in the tracker for the |
| * end point of the range. |
| */ |
| if (!firstTrackedVLSN.equals(NULL_VLSN)) { |
| |
| /* |
| * There are mappings in the tracker, so we can assume |
| * that they cover the end point, because truncateFromTail() |
| * should have adjusted for that. But assert to be sure! |
| */ |
| if (lastVLSN.compareTo(firstTrackedVLSN) < 0) { |
| throw EnvironmentFailureException.unexpectedState |
| (envImpl, |
| "Expected this tracker to cover vlsn " + lastVLSN + |
| " after truncateFromTail. LastOnDiskVLSN= " + |
| lastOnDiskVLSN + " tracker=" + this); |
| } |
| |
| return; |
| } |
| |
| /* |
| * There are no mappings in the tracker. The on disk mappings were |
| * pruned, and a gap was created between what was on disk and what |
| * is in the tracker cache. Add a mapping. |
| */ |
| addEndMapping(lastVLSN, lastLsn); |
| } |
| } |
| |
| /** |
| * Add a bucket holding a lastVLSN -> lastLsn mapping |
| */ |
| private void addEndMapping(VLSN lastVLSN, long lastLsn) { |
| /* Assert that it's in the range */ |
| assert (lastVLSN.compareTo(range.getLast()) == 0) : |
| "lastVLSN=" + lastVLSN + " lastLsn = " + lastLsn + |
| " range=" + range; |
| |
| VLSNBucket addBucket = |
| new VLSNBucket(DbLsn.getFileNumber(lastLsn), stride, |
| maxMappings, maxDistance, lastVLSN); |
| addBucket.put(lastVLSN, lastLsn); |
| nBucketsCreated.increment(); |
| bucketCache.put(addBucket.getFirst().getSequence(), addBucket); |
| if (firstTrackedVLSN.equals(NULL_VLSN)) { |
| firstTrackedVLSN = lastVLSN; |
| } |
| } |
| |
| /** |
| * Attempt to replace the mappings in this vlsnIndex for |
| * deleteStart->lastVLSN with those from the recovery mapper. |
| * Since we do not supply a prevLsn argument, we may not be able to "cap" |
| * the truncated vlsn bucket the same what that we can in |
| * truncateFromTail. Because of that, we may need to remove mappings that |
| * are < deleteStart. |
| * |
| * For example, suppose a bucket holds mappings |
| * 10 -> 101 |
| * 15 -> 201 |
| * 18 -> 301 |
| * If deleteStart == 17, we will have to delete all the way to vlsn 15. |
| * |
| * The maintenance of VLSNRange.lastSync and lastTxnEnd are simplified |
| * because of assumptions we can make because we are about to append |
| * mappings found by the recovery tracker that begin at deleteStart. If |
| * lastSync and lastTxnEnd are <= deleteStart, we know that they will |
| * either stay the same, or be replaced by lastSync and lastTxnEnd from the |
| * recoveryTracker. Even we delete mappings that are < deleteStart, we know |
| * that we did not roll back past an abort or commit, so that we do not |
| * have to updated the lastSync or lastTxnEnd value. |
| */ |
| void merge(VLSN prunedLastOnDiskVLSN, VLSNRecoveryTracker recoveryTracker) { |
| |
| VLSNRange oldRange = range; |
| range = oldRange.merge(recoveryTracker.range); |
| VLSN recoveryFirst = recoveryTracker.getRange().getFirst(); |
| |
| lastOnDiskVLSN = prunedLastOnDiskVLSN; |
| |
| /* |
| * Find the buckets in the bucketCache that have mappings >= |
| * recoveryFirst, and remove their mappings. First establish the |
| * headset of buckets that should be preserved. At this point, we |
| * have already pruned the database, so the bucket set may not |
| * be fully contiguous -- we may have pruned away mappings that |
| * would normally be in the database. |
| */ |
| if (bucketCache.size() == 0) { |
| /* Just take all the recovery tracker's mappings */ |
| bucketCache = recoveryTracker.bucketCache; |
| } else { |
| VLSNBucket targetBucket = getGTEBucket(recoveryFirst); |
| Long targetKey = targetBucket.getFirst().getSequence(); |
| |
| /* The newCache holds buckets that should be preserved. */ |
| SortedMap<Long, VLSNBucket> newCache = |
| new TreeMap<>(bucketCache.headMap(targetKey)); |
| |
| /* |
| * Prune any mappings >= recoveryFirst out of targetBucket. If it |
| * has any mappings left, add it to newCache. |
| */ |
| targetBucket.removeFromTail(recoveryFirst, DbLsn.NULL_LSN); |
| if (!targetBucket.empty()) { |
| newCache.put(targetBucket.getFirst().getSequence(), |
| targetBucket); |
| } |
| |
| newCache.putAll(recoveryTracker.bucketCache); |
| bucketCache = newCache; |
| } |
| |
| if (bucketCache.size() > 0) { |
| VLSNBucket firstBucket = bucketCache.get(bucketCache.firstKey()); |
| firstTrackedVLSN = firstBucket.getFirst(); |
| } |
| } |
| |
| void append(VLSNRecoveryTracker recoveryTracker) { |
| |
| /* |
| * This method assume that there is no overlap between this tracker |
| * and the recovery tracker. Everything in this tracker should precede |
| * the recovery tracker. |
| */ |
| if (!range.getLast().isNull()) { |
| if (range.getLast().compareTo |
| (recoveryTracker.getFirstTracked()) >= 0) { |
| |
| throw EnvironmentFailureException.unexpectedState |
| (envImpl, |
| "Expected this tracker to precede recovery tracker. " + |
| "This tracker= " + this + " recoveryTracker = " + |
| recoveryTracker); |
| } |
| } |
| |
| bucketCache.putAll(recoveryTracker.bucketCache); |
| VLSNRange currentRange = range; |
| range = currentRange.getUpdate(recoveryTracker.getRange()); |
| if (bucketCache.size() > 0) { |
| VLSNBucket firstBucket = bucketCache.get(bucketCache.firstKey()); |
| firstTrackedVLSN = firstBucket.getFirst(); |
| } |
| } |
| |
| VLSNRange getRange() { |
| return range; |
| } |
| |
| @Override |
| public String toString() { |
| StringBuilder sb = new StringBuilder(); |
| sb.append(range); |
| sb.append(" firstTracked=").append(firstTrackedVLSN); |
| sb.append(" lastOnDiskVLSN=").append(lastOnDiskVLSN); |
| |
| for (VLSNBucket bucket: bucketCache.values()) { |
| sb.append("\n"); |
| sb.append(bucket); |
| } |
| return sb.toString(); |
| } |
| |
| /** |
| * For unit test support. |
| */ |
| synchronized boolean verify(boolean verbose) { |
| |
| if (!range.verify(verbose)) { |
| return false; |
| } |
| |
| /* Check that all the buckets are in order. */ |
| ArrayList<VLSN> firstVLSN = new ArrayList<>(); |
| ArrayList<VLSN> lastVLSN = new ArrayList<>(); |
| |
| for (VLSNBucket b : bucketCache.values()) { |
| firstVLSN.add(b.getFirst()); |
| lastVLSN.add(b.getLast()); |
| } |
| |
| if (!verifyBucketBoundaries(firstVLSN, lastVLSN)) { |
| return false; |
| } |
| |
| if (firstVLSN.size() > 0) { |
| if (!firstVLSN.get(0).equals(firstTrackedVLSN)) { |
| if (verbose) { |
| System.err.println("firstBucketVLSN = " + |
| firstVLSN.get(0) + " should equal " + |
| firstTrackedVLSN); |
| } |
| return false; |
| } |
| |
| VLSN lastBucketVLSN = lastVLSN.get(lastVLSN.size() -1); |
| if (!lastBucketVLSN.equals(range.getLast())) { |
| if (verbose) { |
| System.err.println("lastBucketVLSN = " + lastBucketVLSN + |
| " should equal " + range.getLast()); |
| } |
| return false; |
| } |
| } |
| |
| return true; |
| } |
| |
| /* |
| * Used to check bucket boundaries for both the in-memory flush list and |
| * the on-disk database buckets. Buckets may not be adjacent, but must be |
| * in order. |
| */ |
| static boolean verifyBucketBoundaries(ArrayList<VLSN> firstVLSN, |
| ArrayList<VLSN> lastVLSN) { |
| for (int i = 1; i < firstVLSN.size(); i++) { |
| VLSN first = firstVLSN.get(i); |
| VLSN prevLast = lastVLSN.get(i-1); |
| if (first.compareTo(prevLast.getNext()) < 0) { |
| System.out.println("Boundary problem: bucket " + i + |
| " first " + first + |
| " follows bucket.last " + prevLast); |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| VLSN getFirstTracked() { |
| return firstTrackedVLSN; |
| } |
| |
| VLSN getLastOnDisk() { |
| return lastOnDiskVLSN; |
| } |
| |
| void setLastOnDiskVLSN(VLSN lastOnDisk) { |
| lastOnDiskVLSN = lastOnDisk; |
| } |
| |
| /* |
| * For unit test support only. Can only be called when replication stream |
| * is quiescent. |
| */ |
| boolean isFlushedToDisk() { |
| return lastOnDiskVLSN.equals(range.getLast()); |
| } |
| |
| void close() { |
| if (protectedFileRange != null) { |
| final FileProtector fileProtector = envImpl.getFileProtector(); |
| fileProtector.removeFileProtection(protectedFileRange); |
| fileProtector.setVLSNIndexProtectedFileRange(null); |
| } |
| } |
| } |