blob: 7677b62a81051dd0cb5a4fdfc484cca8af5e377d [file] [log] [blame]
/*-
* Copyright (C) 2002, 2018, Oracle and/or its affiliates. All rights reserved.
*
* This file was distributed by Oracle as part of a version of Oracle Berkeley
* DB Java Edition made available at:
*
* http://www.oracle.com/technetwork/database/database-technologies/berkeleydb/downloads/index.html
*
* Please see the LICENSE file included in the top-level directory of the
* appropriate version of Oracle Berkeley DB Java Edition for a copy of the
* license and additional information.
*/
package com.sleepycat.je.recovery;
import static com.sleepycat.je.recovery.CheckpointStatDefinition.CKPT_CHECKPOINTS;
import static com.sleepycat.je.recovery.CheckpointStatDefinition.CKPT_DELTA_IN_FLUSH;
import static com.sleepycat.je.recovery.CheckpointStatDefinition.CKPT_FULL_BIN_FLUSH;
import static com.sleepycat.je.recovery.CheckpointStatDefinition.CKPT_FULL_IN_FLUSH;
import static com.sleepycat.je.recovery.CheckpointStatDefinition.CKPT_LAST_CKPTID;
import static com.sleepycat.je.recovery.CheckpointStatDefinition.CKPT_LAST_CKPT_END;
import static com.sleepycat.je.recovery.CheckpointStatDefinition.CKPT_LAST_CKPT_INTERVAL;
import static com.sleepycat.je.recovery.CheckpointStatDefinition.CKPT_LAST_CKPT_START;
import static com.sleepycat.je.recovery.CheckpointStatDefinition.GROUP_DESC;
import static com.sleepycat.je.recovery.CheckpointStatDefinition.GROUP_NAME;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import com.sleepycat.je.CacheMode;
import com.sleepycat.je.CheckpointConfig;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.DiskLimitException;
import com.sleepycat.je.EnvironmentMutableConfig;
import com.sleepycat.je.StatsConfig;
import com.sleepycat.je.cleaner.Cleaner;
import com.sleepycat.je.cleaner.ExtinctionScanner;
import com.sleepycat.je.cleaner.FileSelector.CheckpointStartCleanerState;
import com.sleepycat.je.config.EnvironmentParams;
import com.sleepycat.je.dbi.DatabaseId;
import com.sleepycat.je.dbi.DatabaseImpl;
import com.sleepycat.je.dbi.DbConfigManager;
import com.sleepycat.je.dbi.DbTree;
import com.sleepycat.je.dbi.EnvConfigObserver;
import com.sleepycat.je.dbi.EnvironmentImpl;
import com.sleepycat.je.evictor.OffHeapCache;
import com.sleepycat.je.log.LogEntryType;
import com.sleepycat.je.log.LogManager;
import com.sleepycat.je.log.Provisional;
import com.sleepycat.je.log.ReplicationContext;
import com.sleepycat.je.log.entry.INLogEntry;
import com.sleepycat.je.log.entry.SingleItemEntry;
import com.sleepycat.je.tree.BIN;
import com.sleepycat.je.tree.ChildReference;
import com.sleepycat.je.tree.IN;
import com.sleepycat.je.tree.SearchResult;
import com.sleepycat.je.tree.Tree;
import com.sleepycat.je.tree.WithRootLatched;
import com.sleepycat.je.utilint.DaemonThread;
import com.sleepycat.je.utilint.DbLsn;
import com.sleepycat.je.utilint.LSNStat;
import com.sleepycat.je.utilint.LoggerUtils;
import com.sleepycat.je.utilint.LongStat;
import com.sleepycat.je.utilint.StatGroup;
import com.sleepycat.je.utilint.TestHook;
import com.sleepycat.je.utilint.TestHookExecute;
import com.sleepycat.je.utilint.VLSN;
/**
* The Checkpointer looks through the tree for internal nodes that must be
* flushed to the log. Checkpoint flushes must be done in ascending order from
* the bottom of the tree up.
*
* Checkpoint and IN Logging Rules
* -------------------------------
* The checkpoint must log, and make accessible via non-provisional ancestors,
* all INs that are dirty at CkptStart. If we crash and recover from that
* CkptStart onward, any IN that became dirty (before the crash) after the
* CkptStart must become dirty again as the result of replaying the action that
* caused it to originally become dirty.
*
* Therefore, when an IN is dirtied at some point in the checkpoint interval,
* but is not logged by the checkpoint, the log entry representing the action
* that dirtied the IN must follow either the CkptStart or the FirstActiveLSN
* that is recorded in the CkptEnd entry. The FirstActiveLSN is less than or
* equal to the CkptStart LSN. Recovery will process LNs between the
* FirstActiveLSN and the end of the log. Other entries are only processed
* from the CkptStart forward. And provisional entries are not processed.
*
* Example: Non-transactional LN logging. We take two actions: 1) log the LN
* and then 2) dirty the parent BIN. What if the LN is logged before CkptStart
* and the BIN is dirtied after CkptStart? How do we avoid breaking the rules?
* The answer is that we log the LN while holding the latch on the parent BIN,
* and we don't release the latch until after we dirty the BIN. The
* construction of the checkpoint dirty map requires latching the BIN. Since
* the LN was logged before CkptStart, the BIN will be dirtied before the
* checkpointer latches it during dirty map construction. So the BIN will
* always be included in the dirty map and logged by the checkpoint.
*
* Example: Abort. We take two actions: 1) log the abort and then 2) undo the
* changes, which modifies (dirties) the BIN parents of the undone LNs. There
* is nothing to prevent logging CkptStart in between these two actions, so how
* do we avoid breaking the rules? The answer is that we do not unregister the
* transaction until after the undo phase. So although the BINs may be dirtied
* by the undo after CkptStart is logged, the FirstActiveLSN will be prior to
* CkptStart. Therefore, we will process the Abort and replay the action that
* modifies the BINs.
*
* Exception: Lazy migration. The log cleaner will make an IN dirty without
* logging an action that makes it dirty. This is an exception to the general
* rule that actions should be logged when they cause dirtiness. The reasons
* this is safe are:
* 1. The IN contents are not modified, so there is no information lost if the
* IN is never logged, or is logged provisionally and no ancestor is logged
* non-provisionally.
* 2. If the IN is logged non-provisionally, this will have the side effect of
* recording the old LSN as being obsolete. However, the general rules for
* checkpointing and recovery will ensure that the new version is used in
* the Btree. The new version will either be replayed by recovery or
* referenced in the active Btree via a non-provisional ancestor.
*
* Checkpoint Algorithm TODO update this
* --------------------
* The final checkpointDirtyMap field is used to hold (in addition to the dirty
* INs) the state of the checkpoint and highest flush levels. Access to this
* object is synchronized so that eviction and checkpointing can access it
* concurrently. When a checkpoint is not active, the state is CkptState.NONE
* and the dirty map is empty. When a checkpoint runs, we do this:
*
* 1. Get set of files from cleaner that can be deleted after this checkpoint.
* 2. Set checkpointDirtyMap state to DIRTY_MAP_INCOMPLETE, meaning that dirty
* map construction is in progress.
* 3. Log CkptStart
* 4. Construct dirty map, organized by Btree level, from dirty INs in INList.
* The highest flush levels are calculated during dirty map construction.
* Set checkpointDirtyMap state to DIRTY_MAP_COMPLETE.
* 5. Flush INs in dirty map.
* + First, flush the bottom two levels a sub-tree at a time, where a
* sub-tree is one IN at level two and all its BIN children. Higher
* levels (above level two) are logged strictly by level, not using
* subtrees.
* o If je.checkpointer.highPriority=false, we log one IN at a
* time, whether or not the IN is logged as part of a subtree,
* and do a Btree search for the parent of each IN.
* o If je.checkpointer.highPriority=true, for the bottom two
* levels we log each sub-tree in a single call to the
* LogManager with the parent IN latched, and we only do one
* Btree search for each level two IN. Higher levels are logged
* one IN at a time as with highPriority=false.
* + The Provisional property is set as follows, depending on the level
* of the IN:
* o level is max flush level: Provisional.NO
* o level is bottom level: Provisional.YES
* o Otherwise (middle levels): Provisional.BEFORE_CKPT_END
* 6. Flush VLSNIndex cache to make VLSNIndex recoverable.
* 7. Flush UtilizationTracker (write FileSummaryLNs) to persist all
* tracked obsolete offsets and utilization summary info, to make this info
* recoverable.
* 8. Log CkptEnd
* 9. Delete cleaned files from step 1.
* 10. Set checkpointDirtyMap state to NONE.
*
* Per-DB Highest Flush Level
* --------------------------
* As mentioned above, when the dirty map is constructed we also determine the
* highest flush level for each database. This is the maximum Btree level at
* which a dirty node exists in the DB.
*
* When logging a node below the maxFlushLevel, we add the parent to the dirty
* map. It may or may not have been added when the dirty map was constructed.
* The idea is to flush all ancestors of all nodes in the dirty map, up to and
* including the maxFlushLevel, even if those ancestors were not dirty when the
* dirty map was constructed.
*
* This is done to avoid orphaning a dirty node as shown in this example.
*
* IN-A (root level=4)
* / \
* (d) IN-B IN-C (maxFlushLevel=3)
* \
* (d) IN-D
*
* IN-C is not dirty (d) when the dirty map is constructed, but it will be
* logged because its child (IN-D) is dirty, and it is not above maxFlushLevel.
*
* If IN-C were not logged, and there were a crash after the checkpoint, the
* changes to IN-D would be lost. IN-D would not be replayed by recovery
* because it is logged provisionally, and it would not be accessible via its
* parent. This is because only nodes at maxFlushLevel are logged
* non-provisionally. The actions that led to the changes in IN-D may not be
* replayed either, because they may appear before the firstActiveLsn
* associated with the checkpoint.
*
* When log files are to be deleted at the end of the checkpoint (after being
* processed by the log cleaner), the maxFlushLevel is increased by one.
* This is to ensure that LSNs in deleted files will not be fetched during
* recovery. Such files are in the FileSelector.CLEANED state, which means
* they have been processed by the cleaner since the last checkpoint.
*
* TODO: Document circumstances and motivation for the extra flush level.
*
* Lastly, for Database.sync or a checkpoint with MinimizeRecoveryTime
* configured, we will flush all the way to the root rather than using the
* maxFlushLevel computed as described above.
*
* Provisional.BEFORE_CKPT_END
* ---------------------------
* See Provisional.java for a description of the relationship between the
* checkpoint algorithm above and the BEFORE_CKPT_END property.
*
* Coordination of Eviction and Checkpointing
* ------------------------------------------
* Eviction can proceed concurrently with all phases of a checkpoint, and
* eviction may take place concurrently in multiple threads. This concurrency
* is crucial to avoid blocking application threads that perform eviction and
* to reduce the amount of eviction required in application threads.
*
* Eviction calls Checkpointer.coordinateEvictionWithCheckpoint, which calls
* DirtyINMap.coordinateEvictionWithCheckpoint, just before logging an IN.
* coordinateEvictionWithCheckpoint returns whether the IN should be logged
* provisionally (Provisional.YES) or non-provisionally (Provisional.NO).
*
* Other coordination necessary depends on the state of the checkpoint:
* + NONE: No additional action.
* o return Provisional.NO
* + DIRTY_MAP_INCOMPLETE: The parent IN is added to the dirty map, exactly
* as if it were encountered as dirty in the INList during dirty map
* construction.
* o IN is root: return Provisional.NO
* o IN is not root: return Provisional.YES
* + DIRTY_MAP_COMPLETE:
* o IN level GTE highest flush level: return Provisional.NO
* o IN level LT highest flush level: return Provisional.YES
*
* In general this is designed so that eviction will use the same provisional
* value that would be used by the checkpoint, as if the checkpoint itself were
* logging the IN. However, there are several conditions where this is not
* exactly the case.
*
* 1. Eviction may log an IN with Provisional.YES when the IN was not dirty at
* the time of dirty map creation, if it became dirty afterwards. In this
* case, the checkpointer would not have logged the IN at all. This is safe
* because the actions that made that IN dirty are logged in the recovery
* period.
* 2. Eviction may log an IN with Provisional.YES after the checkpoint has
* logged it, if it becomes dirty again. In this case the IN is logged
* twice, which would not have been done by the checkpoint alone. This is
* safe because the actions that made that IN dirty are logged in the
* recovery period.
* 3. An intermediate level IN (not bottom most and not the highest flush
* level) will be logged by the checkpoint with Provisional.BEFORE_CKPT_END
* but will be logged by eviction with Provisional.YES. See below for why
* this is safe.
* 4. Between checkpoint step 8 (log CkptEnd) and 10 (set checkpointDirtyMap
* state to NONE), eviction may log an IN with Provisional.YES, although a
* checkpoint is not strictly active during this interval. See below for
* why this is safe.
*
* It is safe for eviction to log an IN as Provisional.YES for the last two
* special cases, because this does not cause incorrect recovery behavior. For
* recovery to work properly, it is only necessary that:
*
* + Provisional.NO is used for INs at the max flush level during an active
* checkpoint.
* + Provisional.YES or BEFORE_CKPT_END is used for INs below the max flush
* level, to avoid replaying an IN during recovery that may depend on a file
* deleted as the result of the checkpoint.
*
* You may ask why we don't use Provisional.YES for eviction when a checkpoint
* is not active. There are two reasons, both related to performance:
*
* 1. This would be wasteful when an IN is evicted in between checkpoints, and
* that portion of the log is processed by recovery later, in the event of a
* crash. The evicted INs would be ignored by recovery, but the actions
* that caused them to be dirty would be replayed and the INs would be
* logged again redundantly.
* 2. Logging an IN provisionally will not count the old LSN as obsolete
* immediately, so cleaner utilization will be inaccurate until a
* non-provisional parent is logged, typically by the next checkpoint. It
* is always important to keep the cleaner from stalling and spiking, to
* keep latency and throughput as level as possible.
*
* Therefore, it is safe to log with Provisional.YES in between checkpoints,
* but not desirable.
*
* Although we don't do this, it would be safe and optimal to evict with
* BEFORE_CKPT_END in between checkpoints, because it would be treated by
* recovery as if it were Provisional.NO. This is because the interval between
* checkpoints is only processed by recovery if it follows the last CkptEnd,
* and BEFORE_CKPT_END is treated as Provisional.NO if the IN follows the last
* CkptEnd.
*
* However, it would not be safe to evict an IN with BEFORE_CKPT_END during a
* checkpoint, when logging of the IN's ancestors does not occur according to
* the rules of the checkpoint. If this were done, then if the checkpoint
* completes and is used during a subsequent recovery, an obsolete offset for
* the old version of the IN will mistakenly be recorded. Below are two cases
* where BEFORE_CKPT_END is used correctly and one showing how it could be used
* incorrectly.
*
* 1. Correct use of BEFORE_CKPT_END when the checkpoint does not complete.
*
* 050 BIN-A
* 060 IN-B parent of BIN-A
* 100 CkptStart
* 200 BIN-A logged with BEFORE_CKPT_END
* 300 FileSummaryLN with obsolete offset for BIN-A at 050
* Crash and recover
*
* Recovery will process BIN-A at 200 (it will be considered
* non-provisional) because there is no following CkptEnd. It is
* therefore correct that BIN-A at 050 is obsolete.
*
* 2. Correct use of BEFORE_CKPT_END when the checkpoint does complete.
*
* 050 BIN-A
* 060 IN-B parent of BIN-A
* 100 CkptStart
* 200 BIN-A logged with BEFORE_CKPT_END
* 300 FileSummaryLN with obsolete offset for BIN-A at 050
* 400 IN-B parent of BIN-A, non-provisional
* 500 CkptEnd
* Crash and recover
*
* Recovery will not process BIN-A at 200 (it will be considered
* provisional) because there is a following CkptEnd, but it will
* process its parent IN-B at 400, and therefore the BIN-A at 200 will be
* active in the tree. It is therefore correct that BIN-A at 050 is
* obsolete.
*
* 3. Incorrect use of BEFORE_CKPT_END when the checkpoint does complete.
*
* 050 BIN-A
* 060 IN-B parent of BIN-A
* 100 CkptStart
* 200 BIN-A logged with BEFORE_CKPT_END
* 300 FileSummaryLN with obsolete offset for BIN-A at 050
* 400 CkptEnd
* Crash and recover
*
* Recovery will not process BIN-A at 200 (it will be considered
* provisional) because there is a following CkptEnd, but no parent
* IN-B is logged, and therefore the IN-B at 060 and BIN-A at 050 will be
* active in the tree. It is therefore incorrect that BIN-A at 050 is
* obsolete.
*
* This last case is what caused the LFNF in SR [#19422], when BEFORE_CKPT_END
* was mistakenly used for logging evicted BINs via CacheMode.EVICT_BIN.
* During the checkpoint, we evict BIN-A and log it with BEFORE_CKPT_END, yet
* neither it nor its parent are part of the checkpoint. After being counted
* obsolete, we crash and recover. Then the file containing the BIN (BIN-A at
* 050 above) is cleaned and deleted. During cleaning, it is not migrated
* because an obsolete offset was previously recorded. The LFNF occurs when
* trying to access this BIN during a user operation.
*
* CacheMode.EVICT_BIN
* -------------------
* Unlike in JE 4.0 where EVICT_BIN was first introduced, in JE 4.1 and later
* we do not use special rules when an IN is evicted. Since concurrent
* eviction and checkpointing are supported in JE 4.1, the above rules apply to
* EVICT_BIN as well as all other types of eviction.
*/
public class Checkpointer extends DaemonThread implements EnvConfigObserver {
/**
* For unit testing only. Called before we flush the max level. This
* field is static because it is called from the static flushIN method.
*/
private static TestHook<?> maxFlushLevelHook = null;
/*
* For unit testing only. Installed via setBeforeFlushHook and executed by
* doCheckpoint after the dirty map is selected and before nodes are flushed.
*/
private static TestHook<?> beforeFlushHook = null;
/*
* Package-private hook that receives an IN; presumably for unit testing
* like the hooks above -- TODO confirm against its call sites.
*/
static TestHook<IN> examineINForCheckpointHook = null;
/*
* Checkpoint sequence, initialized at recovery. Incremented by doCheckpoint
* each time a checkpoint is started.
*/
private long checkpointId;
/*
* How much the log should grow between checkpoints. If 0, we're using time
* based checkpointing.
*/
private final long logSizeBytesInterval;
/*
* Value of EnvironmentParams.LOG_FILE_MAX, passed to
* DbLsn.getNoCleaningDistance when measuring log growth.
*/
private final long logFileMax;
/*
* The wait time given to the constructor; used by isRunnable as the time
* based interval when no other interval applies.
*/
private final long timeInterval;
/*
* System.currentTimeMillis() taken when the most recent checkpoint started,
* or the value supplied to initIntervals. Zero until either happens.
*/
private long lastCheckpointMillis;
/*
* Set by wakeupAfterNoWrites() when a checkpoint is wanted for cleaned
* files; cleared by onWakeup() after the checkpoint attempt returns.
*/
private volatile boolean wakeupAfterNoWrites;
/* Configured to true to minimize checkpoint duration. */
private boolean highPriority;
/*
* Number of checkpoints started; reset to zero by loadStats when the stats
* config requests clearing.
*/
private long nCheckpoints;
/* LSN returned when logging the CkptStart of the last completed checkpoint. */
private long lastCheckpointStart;
/* LSN returned when logging the CkptEnd of the last completed checkpoint. */
private long lastCheckpointEnd;
/*
* DbLsn.getNoCleaningDistance between the CkptStart of the last completed
* checkpoint and the CkptStart of the one before it.
*/
private long lastCheckpointInterval;
/*
* The firstActiveLsn stored in the last completed checkpoint's CkptEnd.
* Read through the unsynchronized getLastCheckpointFirstActiveLsn().
*/
private volatile long lastCheckpointFirstActiveLsn;
/* Flush counters reported by loadStats and by the checkpoint trace message. */
private final FlushStats flushStats;
/**
* The DirtyINMap for checkpointing is created once and is reset after each
* checkpoint is complete. Access to this object is synchronized so that
* eviction and checkpointing can access it concurrently.
*/
private final DirtyINMap checkpointDirtyMap;
public Checkpointer(EnvironmentImpl envImpl,
long waitTime,
String name) {
super(waitTime, name, envImpl);
logSizeBytesInterval =
envImpl.getConfigManager().getLong
(EnvironmentParams.CHECKPOINTER_BYTES_INTERVAL);
logFileMax =
envImpl.getConfigManager().getLong(EnvironmentParams.LOG_FILE_MAX);
timeInterval = waitTime;
lastCheckpointMillis = 0;
nCheckpoints = 0;
flushStats = new FlushStats();
checkpointDirtyMap = new DirtyINMap(envImpl);
/* Initialize mutable properties and register for notifications. */
envConfigUpdate(envImpl.getConfigManager(), null);
envImpl.addConfigObserver(this);
}
/**
 * Refreshes the mutable checkpointer properties after a configuration
 * change. Also called once from the constructor to load initial values.
 *
 * @param cm the configuration manager to read the current values from.
 * @param ignore not used.
 */
@Override
public void envConfigUpdate(DbConfigManager cm,
                            EnvironmentMutableConfig ignore) {
    final boolean configuredHighPriority =
        cm.getBoolean(EnvironmentParams.CHECKPOINTER_HIGH_PRIORITY);
    this.highPriority = configuredHighPriority;
}
/**
 * Seeds the state describing the last checkpoint. Used when the
 * environment is opened without performing a checkpoint, so that interval
 * calculations start from the supplied values.
 *
 * @param lastCheckpointStart LSN of the last CkptStart entry.
 * @param lastCheckpointEnd LSN of the last CkptEnd entry.
 * @param lastCheckpointFirstActiveLsn firstActiveLsn of the last checkpoint.
 * @param lastCheckpointMillis time of the last checkpoint, in millis.
 */
void initIntervals(final long lastCheckpointStart,
                   final long lastCheckpointEnd,
                   final long lastCheckpointFirstActiveLsn,
                   final long lastCheckpointMillis) {

    /* Each parameter shadows the field it initializes. */
    this.lastCheckpointStart = lastCheckpointStart;
    this.lastCheckpointEnd = lastCheckpointEnd;
    this.lastCheckpointFirstActiveLsn = lastCheckpointFirstActiveLsn;
    this.lastCheckpointMillis = lastCheckpointMillis;
}
/**
 * Gets the firstActiveLsn that was recorded for the most recently
 * completed checkpoint.
 *
 * @return the current value of the volatile lastCheckpointFirstActiveLsn
 * field.
 */
public long getLastCheckpointFirstActiveLsn() {
    return this.lastCheckpointFirstActiveLsn;
}
/**
 * Lets an eviction coordinate with a checkpoint that may be in progress.
 * The decision is delegated to the checkpoint dirty map.
 *
 * @param db the database containing the target.
 * @param targetLevel the Btree level of the target being evicted.
 * @param parent the parent IN of the target.
 *
 * @return the provisional status to use for logging the target.
 */
public Provisional coordinateEvictionWithCheckpoint(
    final DatabaseImpl db,
    final int targetLevel,
    final IN parent) {

    final Provisional provisional =
        checkpointDirtyMap.coordinateEvictionWithCheckpoint(
            db, targetLevel, parent);
    return provisional;
}
/**
 * Lets a split coordinate with a checkpoint that may be in progress. The
 * work is delegated to the checkpoint dirty map.
 *
 * @param newSibling the sibling IN created by the split.
 */
public void coordinateSplitWithCheckpoint(final IN newSibling) {
    this.checkpointDirtyMap.coordinateSplitWithCheckpoint(newSibling);
}
/**
 * Computes the daemon wakeup period. This is a static method because the
 * period must be passed to the superclass constructor, so the calculation
 * has to happen outside this class's constructor.
 *
 * @param configManager the source of the checkpointer interval settings.
 *
 * @return the configured wakeup interval when checkpointing is time based,
 * or 0 when a byte interval is configured.
 *
 * @throws IllegalArgumentException via Environment ctor and
 * setMutableConfig.
 */
public static long getWakeupPeriod(DbConfigManager configManager)
    throws IllegalArgumentException {

    final long timePeriod = configManager.getDuration(
        EnvironmentParams.CHECKPOINTER_WAKEUP_INTERVAL);
    final long bytePeriod = configManager.getLong(
        EnvironmentParams.CHECKPOINTER_BYTES_INTERVAL);

    /* Checkpointing by log size takes precedence over the time period. */
    if (bytePeriod != 0) {
        return 0;
    }

    /* Checkpointing period must be set either by time or by log size. */
    if (timePeriod == 0) {
        throw new IllegalArgumentException(
            EnvironmentParams.CHECKPOINTER_BYTES_INTERVAL.getName() +
            " and " +
            EnvironmentParams.CHECKPOINTER_WAKEUP_INTERVAL.getName() +
            " cannot both be 0. ");
    }

    return timePeriod;
}
/**
 * Sets the checkpoint id -- this can only be done after recovery.
 *
 * @param lastCheckpointId the id to continue the checkpoint sequence from.
 */
synchronized void setCheckpointId(final long lastCheckpointId) {
    this.checkpointId = lastCheckpointId;
}
/**
 * Builds a stat group holding the current checkpoint statistics.
 *
 * @param config if config.getClear() is true, the checkpoint count and the
 * cumulative flush counters are reset to zero after being captured.
 *
 * @return a new StatGroup populated with the checkpoint stats.
 */
@SuppressWarnings("unused")
public StatGroup loadStats(StatsConfig config) {
    final StatGroup group = new StatGroup(GROUP_NAME, GROUP_DESC);

    /*
     * The stat objects are not retained here; presumably each constructor
     * adds the new stat to the group it is given.
     */
    new LongStat(group, CKPT_LAST_CKPTID, checkpointId);
    new LongStat(group, CKPT_CHECKPOINTS, nCheckpoints);
    new LongStat(group, CKPT_LAST_CKPT_INTERVAL, lastCheckpointInterval);
    new LSNStat(group, CKPT_LAST_CKPT_START, lastCheckpointStart);
    new LSNStat(group, CKPT_LAST_CKPT_END, lastCheckpointEnd);
    new LongStat(group, CKPT_FULL_IN_FLUSH, flushStats.nFullINFlush);
    new LongStat(group, CKPT_FULL_BIN_FLUSH, flushStats.nFullBINFlush);
    new LongStat(group, CKPT_DELTA_IN_FLUSH, flushStats.nDeltaINFlush);

    if (!config.getClear()) {
        return group;
    }

    /* Clearing was requested: zero the cumulative counters. */
    nCheckpoints = 0;
    flushStats.nFullINFlush = 0;
    flushStats.nFullBINFlush = 0;
    flushStats.nDeltaINFlush = 0;
    return group;
}
/**
 * Returns how many times to retry when a deadlock exception occurs.
 *
 * @return the value of EnvironmentParams.CHECKPOINTER_RETRY.
 */
@Override
protected long nDeadlockRetries() {
    final DbConfigManager configMgr = envImpl.getConfigManager();
    return configMgr.getInt(EnvironmentParams.CHECKPOINTER_RETRY);
}
/**
 * Invoked each time the DaemonThread wakes up from a sleep. Unless the
 * environment is closing, attempts a default checkpoint on behalf of the
 * daemon and then clears the wakeupAfterNoWrites request flag.
 */
@Override
protected void onWakeup() {
    if (!envImpl.isClosing()) {
        doCheckpoint(
            CheckpointConfig.DEFAULT, "daemon", true /*invokedFromDaemon*/);

        /* The request has been serviced if doCheckpoint returned. */
        wakeupAfterNoWrites = false;
    }
}
/**
 * Wakes the checkpointer when a byte interval is configured and the log
 * has grown by at least that many bytes since the last checkpoint start.
 * Nothing is done when no byte interval is configured or when the
 * checkpointer is already running.
 */
public void wakeupAfterWrite() {
    if ((logSizeBytesInterval == 0) || isRunning()) {
        return;
    }

    final long nextLsn = envImpl.getFileManager().getNextLsn();
    final long bytesSinceCkpt = DbLsn.getNoCleaningDistance(
        nextLsn, lastCheckpointStart, logFileMax);

    if (bytesSinceCkpt >= logSizeBytesInterval) {
        wakeup();
    }
}
/**
 * Wakes the checkpointer when a checkpoint is needed to reclaim disk space
 * for files that have already been cleaned. Called after an idle period
 * with no writes. The wakeupAfterNoWrites flag is set before waking so
 * that isRunnable can recognize this request.
 */
public void wakeupAfterNoWrites() {
    if (isRunning()) {
        return;
    }
    if (!needCheckpointForCleanedFiles()) {
        return;
    }
    wakeupAfterNoWrites = true;
    wakeup();
}
/**
 * Asks the cleaner's file selector whether a checkpoint is needed.
 *
 * @return the result of FileSelector.isCheckpointNeeded().
 */
private boolean needCheckpointForCleanedFiles() {
    final Cleaner cleaner = envImpl.getCleaner();
    return cleaner.getFileSelector().isCheckpointNeeded();
}
/**
 * Determines whether a checkpoint should be run.
 *
 * A checkpoint is runnable when it is forced, when the checkpointer was
 * woken after an idle period and cleaned files still need a checkpoint, or
 * when the applicable interval has been reached. The interval is chosen in
 * this order of precedence: config.getKBytes(), config.getMinutes(), the
 * configured byte interval, and finally the time interval.
 *
 * The decision and the values it was based on are logged at FINEST level.
 *
 * @param config the checkpoint configuration supplied by the caller.
 *
 * @return true if a checkpoint should be performed now.
 */
private boolean isRunnable(CheckpointConfig config) {
    /* Figure out if we're using log size or time to determine interval.*/
    long useBytesInterval = 0;
    long useTimeInterval = 0;
    long nextLsn = DbLsn.NULL_LSN;
    boolean runnable = false;
    try {
        if (config.getForce()) {
            runnable = true;
            return true;
        }
        if (wakeupAfterNoWrites && needCheckpointForCleanedFiles()) {
            runnable = true;
            return true;
        }
        if (config.getKBytes() != 0) {
            /*
             * Widen to long before shifting. Shifting in int arithmetic
             * overflows for intervals of 2GB or more, and can even yield
             * zero, which would silently disable the size check.
             */
            useBytesInterval = ((long) config.getKBytes()) << 10;
        } else if (config.getMinutes() != 0) {
            /*
             * Convert to millis using long arithmetic. An int product
             * overflows for large minute values and can go negative,
             * which would make every checkpoint appear due.
             */
            useTimeInterval = config.getMinutes() * 60L * 1000L;
        } else if (logSizeBytesInterval != 0) {
            useBytesInterval = logSizeBytesInterval;
        } else {
            useTimeInterval = timeInterval;
        }

        /*
         * If our checkpoint interval is defined by log size, check on how
         * much log has grown since the last checkpoint.
         */
        if (useBytesInterval != 0) {
            nextLsn = envImpl.getFileManager().getNextLsn();
            if (DbLsn.getNoCleaningDistance(
                    nextLsn, lastCheckpointStart, logFileMax) >=
                useBytesInterval) {
                runnable = true;
            }
        } else if (useTimeInterval != 0) {

            /*
             * Our checkpoint is determined by time. If enough time has
             * passed and some log data has been written, do a checkpoint.
             */
            final long lastUsedLsn =
                envImpl.getFileManager().getLastUsedLsn();
            if (((System.currentTimeMillis() - lastCheckpointMillis) >=
                 useTimeInterval) &&
                (DbLsn.compareTo(lastUsedLsn, lastCheckpointEnd) != 0)) {
                runnable = true;
            }
        }
        return runnable;
    } finally {
        /* Runs on every exit path so the decision is always traceable. */
        if (logger.isLoggable(Level.FINEST)) {
            final StringBuilder sb = new StringBuilder();
            sb.append("size interval=").append(useBytesInterval);
            if (nextLsn != DbLsn.NULL_LSN) {
                sb.append(" nextLsn=").
                    append(DbLsn.getNoFormatString(nextLsn));
            }
            if (lastCheckpointEnd != DbLsn.NULL_LSN) {
                sb.append(" lastCkpt=");
                sb.append(DbLsn.getNoFormatString(lastCheckpointEnd));
            }
            sb.append(" time interval=").append(useTimeInterval);
            sb.append(" force=").append(config.getForce());
            sb.append(" runnable=").append(runnable);
            LoggerUtils.finest(logger, envImpl, sb.toString());
        }
    }
}
/**
* The real work to do a checkpoint. This may be called by the checkpoint
* thread when waking up, or it may be invoked programmatically through the
* api.
*
* Returns without doing anything if the environment is read-only or if
* isRunnable(config) is false. The checkpoint dirty map is always reset
* before returning once the checkpoint has begun, whether or not it
* succeeded.
*
* @param config the checkpoint configuration; its MinimizeRecoveryTime
* setting causes flushing all the way to the root.
*
* @param invokingSource a debug aid, to indicate who invoked this
* checkpoint. (i.e. recovery, the checkpointer daemon, the cleaner,
* programmatically)
*
* @param invokedFromDaemon true if called by the daemon thread. In that
* case a DiskLimitException is not propagated to the caller; otherwise it
* is rethrown. Other DatabaseExceptions are logged and always rethrown.
*/
public synchronized void doCheckpoint(CheckpointConfig config,
String invokingSource,
boolean invokedFromDaemon) {
if (envImpl.isReadOnly()) {
return;
}
if (!isRunnable(config)) {
return;
}
/* Stop if we cannot write because of a disk limit violation. */
try {
envImpl.checkDiskLimitViolation();
} catch (DiskLimitException e) {
if (!invokedFromDaemon) {
throw e;
}
return;
}
/*
* If minimizing recovery time is desired, then flush all the way to
* the top of the dbtree instead of stopping at the highest level last
* modified, so that only the root INs are processed by recovery.
*/
final boolean flushAll = config.getMinimizeRecoveryTime();
/*
* If there are cleaned files to be deleted, flush an extra level to
* write out the parents of cleaned nodes. This ensures that no node
* will contain the LSN of a cleaned file.
*/
final Cleaner cleaner = envImpl.getCleaner();
final CheckpointStartCleanerState cleanerState =
cleaner.getFilesAtCheckpointStart();
/*
* Any scans that are complete before the checkpoint, can be deleted
* after the checkpoint.
*/
final ExtinctionScanner extinctionScanner =
envImpl.getExtinctionScanner();
final Set<Long> completedScans =
extinctionScanner.getCompletedRecordScans();
final boolean flushExtraLevel = !cleanerState.isEmpty();
/* Record the start time used by the time-based isRunnable check. */
lastCheckpointMillis = System.currentTimeMillis();
flushStats.resetPerRunCounters();
/* Get the next checkpoint id. */
checkpointId++;
nCheckpoints++;
boolean success = false;
/* True once the success trace message has been emitted in the try. */
boolean traced = false;
final LogManager logManager = envImpl.getLogManager();
/*
* Set the checkpoint state so that concurrent eviction can be
* coordinated.
*/
checkpointDirtyMap.beginCheckpoint(flushAll, flushExtraLevel);
try {
/* Log the checkpoint start. */
final SingleItemEntry<CheckpointStart> startEntry =
SingleItemEntry.create(
LogEntryType.LOG_CKPT_START,
new CheckpointStart(checkpointId, invokingSource));
final long checkpointStart =
logManager.log(startEntry, ReplicationContext.NO_REPLICATE);
/*
* Note the first active LSN point. The definition of
* firstActiveLsn is that all log entries for active transactions
* are equal to or after that LSN. This is the starting point for
* replaying LNs during recovery and will be stored in the CkptEnd
* entry.
*
* Use the checkpointStart as the firstActiveLsn if firstActiveLsn
* is null, meaning that no txns are active.
*
* The current value must be retrieved from TxnManager after
* logging CkptStart. If it were instead retrieved before logging
* CkptStart, the following failure could occur. [#20270]
*
* ... getFirstActiveLsn returns NULL_LSN, will use 200 CkptStart
* 100 LN-A in Txn-1
* 200 CkptStart
* 300 BIN-B refers to 100 LN-A
* 400 CkptEnd
* ... Crash and recover. Recovery does not undo 100 LN-A.
* ... Txn-1 is uncommitted, yet 100 LN-A takes effect.
*/
long firstActiveLsn = envImpl.getTxnManager().getFirstActiveLsn();
if (firstActiveLsn == DbLsn.NULL_LSN) {
firstActiveLsn = checkpointStart;
}
/*
* In a replicated system, the checkpointer will be flushing out
* the VLSNIndex, which is HA metadata. Check that the in-memory
* version encompasses all metadata up to the point of the
* CheckpointStart record. This is no-op for non-replicated
* systems. [#19754]
*/
envImpl.awaitVLSNConsistency();
/* Find the set of dirty INs that must be logged. */
checkpointDirtyMap.selectDirtyINsForCheckpoint();
/* Call hook after dirty map creation and before flushing. */
TestHookExecute.doHookIfSet(beforeFlushHook);
/* Flush IN nodes. */
flushDirtyNodes(
envImpl, checkpointDirtyMap, checkpointStart, highPriority,
flushStats);
if (DirtyINMap.DIRTY_SET_DEBUG_TRACE) {
LoggerUtils.logMsg(
envImpl.getLogger(), envImpl, Level.INFO,
"Ckpt flushed" +
" nFullINFlushThisRun = " +
flushStats.nFullINFlushThisRun +
" nFullBINFlushThisRun = " +
flushStats.nFullBINFlushThisRun +
" nDeltaINFlushThisRun = " +
flushStats.nDeltaINFlushThisRun);
}
/*
* Flush MapLNs if not already done by flushDirtyNodes. Only flush
* a database if it has not already been flushed since checkpoint
* start. Lastly, flush the DB mapping tree root.
*/
checkpointDirtyMap.flushMapLNs(checkpointStart);
checkpointDirtyMap.flushRoot(checkpointStart);
/*
* Flush replication information if necessary so that the VLSNIndex
* cache is flushed and is recoverable.
*/
envImpl.preCheckpointEndFlush();
/*
* Flush utilization info AFTER flushing IN nodes to reduce the
* inaccuracies caused by the sequence FileSummaryLN-LN-BIN. This
* also reduces the chance of lost IN obsolete info when there is
* a crash after logging the non-provisional INs but before
* logging the FileSummaryLNs.
*/
envImpl.getUtilizationProfile().flushFileUtilization
(envImpl.getUtilizationTracker().getTrackedFiles());
final DbTree dbTree = envImpl.getDbTree();
/* Same condition as flushExtraLevel, evaluated again here. */
final boolean willDeleteFiles = !cleanerState.isEmpty();
final CheckpointEnd ckptEnd = new CheckpointEnd(
invokingSource, checkpointStart, envImpl.getRootLsn(),
firstActiveLsn,
envImpl.getNodeSequence().getLastLocalNodeId(),
envImpl.getNodeSequence().getLastReplicatedNodeId(),
dbTree.getLastLocalDbId(), dbTree.getLastReplicatedDbId(),
envImpl.getTxnManager().getLastLocalTxnId(),
envImpl.getTxnManager().getLastReplicatedTxnId(),
envImpl.getExtinctionScanner().getLastLocalId(),
envImpl.getExtinctionScanner().getLastReplicatedId(),
checkpointId, willDeleteFiles);
final SingleItemEntry<CheckpointEnd> endEntry =
SingleItemEntry.create(LogEntryType.LOG_CKPT_END, ckptEnd);
/*
* Log checkpoint end and update state kept about the last
* checkpoint location. Send a trace message *before* the
* checkpoint end log entry. This is done so that the normal trace
* message doesn't affect the time-based isRunnable() calculation,
* which only issues a checkpoint if a log record has been written
* since the last checkpoint.
*/
trace(envImpl, invokingSource, true);
traced = true;
/* Must be computed before lastCheckpointStart is overwritten below. */
lastCheckpointInterval = DbLsn.getNoCleaningDistance(
checkpointStart, lastCheckpointStart, logFileMax);
/*
* We must flush and fsync to ensure that cleaned files are not
* referenced. This also ensures that this checkpoint is not wasted
* if we crash.
*/
lastCheckpointEnd = logManager.logForceFlush(
endEntry, true /*fsyncRequired*/,
ReplicationContext.NO_REPLICATE);
lastCheckpointStart = checkpointStart;
lastCheckpointFirstActiveLsn = firstActiveLsn;
success = true;
/* Post-checkpoint cleanup runs only after CkptEnd has been logged. */
cleaner.updateFilesAtCheckpointEnd(cleanerState);
extinctionScanner.deleteCompletedRecordScans(completedScans);
} catch (DiskLimitException e) {
LoggerUtils.logMsg(
envImpl.getLogger(), envImpl, Level.WARNING,
"Ckpt id=" + checkpointId + " success=" + success +
" aborted because of disk limit violation: " + e);
/* The daemon absorbs the failure; other callers see it. */
if (!invokedFromDaemon) {
throw e;
}
} catch (DatabaseException e) {
LoggerUtils.traceAndLogException(envImpl, "Checkpointer",
"doCheckpoint", "checkpointId=" +
checkpointId, e);
throw e;
} finally {
/*
* Reset the checkpoint state so evictor activity knows there's no
* further requirement for provisional logging. SR 11163.
*/
checkpointDirtyMap.reset();
/* Emit the trace here only if the try block did not get that far. */
if (!traced) {
trace(envImpl, invokingSource, success);
}
}
}
/**
 * Logs a one-line, CONFIG-level summary of the current checkpoint run: the
 * checkpoint id, the invoking source, the success flag passed by the caller
 * and the per-run full and delta IN flush counters.
 *
 * @param envImpl the environment the message is logged for
 * @param invokingSource description of who requested the checkpoint
 * @param success the value reported in the message's "success=" field
 */
private void trace(EnvironmentImpl envImpl,
                   String invokingSource,
                   boolean success ) {
    final String summary =
        "Checkpoint " + checkpointId +
        ": source=" + invokingSource +
        " success=" + success +
        " nFullINFlushThisRun=" + flushStats.nFullINFlushThisRun +
        " nDeltaINFlushThisRun=" + flushStats.nDeltaINFlushThisRun;
    LoggerUtils.logMsg(logger, envImpl, Level.CONFIG, summary);
}
/**
 * Flush the dirty INs of a single database to the log. As with a
 * checkpoint, nodes are logged from the bottom up so that parents properly
 * represent their children.
 *
 * Does nothing when the environment is read-only. A DiskLimitException is
 * rethrown as is; any other DatabaseException is traced and logged before
 * being rethrown. The local dirty map is always reset on exit.
 *
 * @param envImpl the environment that owns the database
 * @param dbImpl the database whose dirty INs are written
 * @param flushLog if true, and at least one dirty IN was selected, the log
 * is flushed synchronously after the nodes are written
 */
public void syncDatabase(EnvironmentImpl envImpl,
                         DatabaseImpl dbImpl,
                         boolean flushLog) {

    if (envImpl.isReadOnly()) {
        return;
    }

    envImpl.checkDiskLimitViolation();

    final DirtyINMap dbDirtyMap = new DirtyINMap(envImpl);
    final FlushStats syncStats = new FlushStats();

    try {
        /* Collect the dirty set for this database only. */
        dbDirtyMap.selectDirtyINsForDbSync(dbImpl);

        /* Nothing dirty: skip both the node flush and the log flush. */
        if (dbDirtyMap.getNumEntries() <= 0) {
            return;
        }

        /* Write out all selected INs. */
        flushDirtyNodes(
            envImpl, dbDirtyMap, DbLsn.NULL_LSN /*ckptStart*/,
            false /*highPriority*/, syncStats);

        /* Make changes durable. [#15254] */
        if (flushLog) {
            envImpl.getLogManager().flushSync();
        }
    } catch (DiskLimitException e) {
        /* Rethrown here so that it bypasses the logging handler below. */
        throw e;
    } catch (DatabaseException e) {
        LoggerUtils.traceAndLogException(
            envImpl, "Checkpointer", "syncDatabase",
            "of " + dbImpl.getName(), e);
        throw e;
    } finally {
        dbDirtyMap.reset();
    }
}
/**
* For unit testing only. Stores the given hook in the static
* maxFlushLevelHook field. flushIN passes that field to
* TestHookExecute.doHookIfSet from within an assert statement, so the hook
* is only consulted when assertions are enabled and the target's level is
* not below the max flush level.
*
* @param hook the hook to install
*/
public static void setMaxFlushLevelHook(TestHook<?> hook) {
maxFlushLevelHook = hook;
}
/**
* For unit testing only. Stores the given hook in the static
* beforeFlushHook field.
*
* NOTE(review): the name suggests the hook runs before dirty nodes are
* flushed -- confirm against the code that reads beforeFlushHook.
*
* @param hook the hook to install
*/
public static void setBeforeFlushHook(TestHook<?> hook) {
beforeFlushHook = hook;
}
/**
 * Drain the dirty map one level at a time, from the lowest level to the
 * highest. Flushing a node makes flushIN add its parent to the dirty map,
 * so writes cascade up the tree. A node is removed from the map in every
 * case, but it is only flushed when its level does not exceed the highest
 * flush level that the dirty map reports for its database.
 *
 * Note that all but the top level INs are logged provisionally. That's
 * because we don't need to process lower INs during recovery because the
 * higher INs will end up pointing at them.
 *
 * FileSummaryLNs/MapLNs are deliberately not flushed here (no call to
 * UtilizationProfile.flushLocalTracker) because that flushing is already
 * done by the checkpoint.
 *
 * @param envImpl the environment being flushed
 * @param dirtyMap the set of nodes to write; it is emptied level by level
 * @param checkpointStart passed to DirtyINMap.flushMapLNs when the mapping
 * tree's first level is reached
 * @param highPriority passed through to flushIN
 * @param fstats passed through to flushIN, which updates the counters
 */
private static void flushDirtyNodes(EnvironmentImpl envImpl,
                                    DirtyINMap dirtyMap,
                                    long checkpointStart,
                                    boolean highPriority,
                                    FlushStats fstats) {

    final DbTree dbTree = envImpl.getDbTree();

    /* DBs obtained via getDb below; all are released in the finally. */
    final Map<DatabaseId, DatabaseImpl> openedDbs = new HashMap<>();

    try {
        while (dirtyMap.getNumLevels() > 0) {

            /* Always work on the lowest level still present in the map. */
            final Integer level = dirtyMap.getLowestLevelSet();
            final int levelVal = level;

            /*
             * MapLNs are flushed just before the first level of the
             * mapping tree. Only databases not already flushed since
             * checkpoint start are flushed.
             */
            if (levelVal == IN.DBMAP_LEVEL) {
                dirtyMap.flushMapLNs(checkpointStart);
            }

            /* Pull nodes off this level until none remain. */
            for (CheckpointReference ref = dirtyMap.removeNextNode(level);
                 ref != null;
                 ref = dirtyMap.removeNextNode(level)) {

                envImpl.checkDiskLimitViolation();

                /*
                 * The DB may have been deleted after the node was put in
                 * the dirty map. Obtaining it here also prevents it from
                 * being deleted while we work with it.
                 */
                final DatabaseImpl db = dbTree.getDb(
                    ref.dbId, -1 /*lockTimeout*/, openedDbs);

                /* Flush only at or below the DB's highest flush level. */
                if (db != null &&
                    levelVal <= dirtyMap.getHighestFlushLevel(db)) {

                    flushIN(
                        db, ref, dirtyMap,
                        dirtyMap.getHighestFlushLevel(db),
                        highPriority, fstats, true /*allowLogSubtree*/);

                    /* Sleep if background read/write limit was exceeded. */
                    envImpl.sleepAfterBackgroundIO();
                }

                /*
                 * If other activity invalidated the environment, this
                 * throws the invalidating exception, ending the loop and
                 * signalling that the checkpoint did not succeed.
                 */
                envImpl.checkIfInvalid();
            }

            /* This level is exhausted. */
            dirtyMap.removeLevel(level);
        }
    } finally {
        dbTree.releaseDbs(openedDbs);
    }
}
/**
* Flush the target IN.
*
* Where applicable, also attempt to flush the subtree that houses this
* target, which means we flush the siblings of this target to promote
* better cleaning throughput. The problem lies in the fact that
* provisionally logged nodes are not available for log cleaning until
* their parent is logged non-provisionally. On the other hand, we want to
* log nodes in provisional mode as much as possible, both for recovery
* performance, and for correctness to avoid fetches against cleaned log
* files. (See [#16037].) These conflicting goals are reconciled by
* flushing nodes in subtree grouping, because writing the non-provisional
* parent of a set of provisionally written nodes frees the cleaner to work
* on that set of provisional nodes as soon as possible. For example, if a
* tree consists of:
*
*                      INa
*               +-------+-------+
*              INb             INc
*         +-----+----+       +--+--+
*       BINd  BINe  BINf   BINg  BINh
*
* It is more efficient for cleaning throughput to log in this order:
*    BINd, BINe, BINf, INb, BINg, BINh, INc, INa
* rather than:
*    BINd, BINe, BINf, BINg, BINh, INb, INc, INa
*
* Suppose the subtree in question is INb->{BINd, BINe, BINf}
*
* Suppose we see BINd in the dirty map first, before BINe and BINf.
*  - flushIN(BINd) is called
*  - we fetch and latch its parent, INb
*
* If this is a high priority checkpoint (and the DB is not a durable
* deferred-write DB), we'll hold the INb latch across the time it takes to
* flush all three children: after logging BINd we walk through INb and log
* each sibling that can be found in (and removed from) the dirty map, all
* while INb is still latched. Then we'll write out INb.
*
* If high priority is false, we will not hold the INb latch across
* multiple IOs. Instead, we
*  - write BINd out
*  - while still holding the INb latch, we create a list of dirty siblings
*  - release the INb latch
*  - call flushIN() recursively on each entry in the local sibling list,
*    which will result in a search and write of each sibling. These
*    recursive calls to flushIN are called with the allowLogSubtree
*    parameter of false to halt the recursion and prevent a repeat of the
*    sibling examination.
*  - write INb
*
* @param db the database containing the target
* @param targetRef identifies the IN to flush
* @param dirtyMap the dirty map; the parent is added to it, and siblings
* and the parent may be removed from it during sub-tree logging
* @param maxFlushLevel a target at or above this level is logged
* non-provisionally
* @param highPriority if true and the DB is not durable deferred-write,
* siblings are logged while the parent latch is held
* @param fstats counters updated for each IN that is logged
* @param allowLogSubtree false for the recursive calls made during sub-tree
* logging, which then log only their own target
*/
private static void flushIN(final DatabaseImpl db,
final CheckpointReference targetRef,
final DirtyINMap dirtyMap,
final int maxFlushLevel,
final boolean highPriority,
final FlushStats fstats,
final boolean allowLogSubtree) {
final EnvironmentImpl envImpl = db.getEnv();
final Tree tree = db.getTree();
final int targetLevel = targetRef.nodeLevel;
/*
* Call test hook when we reach the max level. Because the hook call is
* part of the assert expression, it only happens when assertions are
* enabled.
*/
assert (targetLevel < maxFlushLevel) ||
TestHookExecute.doHookIfSet(maxFlushLevelHook);
if (targetRef.isRoot) {
/* The root is logged by RootFlusher while the root latch is held. */
final RootFlusher flusher =
new RootFlusher(db, targetRef.nodeId);
tree.withRootLatchedExclusive(flusher);
/*
* Update the tree's owner, whether it's the env root or the
* db-mapping tree.
*/
if (flusher.getFlushed()) {
DbTree dbTree = envImpl.getDbTree();
dbTree.modifyDbRoot(db);
fstats.nFullINFlushThisRun++;
fstats.nFullINFlush++;
}
/*
* If this target isn't the root anymore, we'll have to handle it
* like a regular node.
*/
if (flusher.stillRoot()) {
return;
}
}
/*
* The following applies to two cases:
* (1) the target was not ever the root
* (2) the target was the root, when the checkpoint dirty set was
* assembled but is not the root now.
*
* When a parent is returned it is released in the finally block below,
* i.e., the search hands it back latched.
*/
final SearchResult result = tree.getParentINForChildIN(
-1 /*nodeId*/, targetRef.treeKey,
targetRef.nodeLevel /*targetLevel*/,
targetRef.nodeLevel + 1 /*exclusiveLevel*/,
false /*requireExactMatch*/, false /*doFetch*/,
CacheMode.UNCHANGED, null /*trackingList*/);
/*
* If no possible parent is found, the compressor may have deleted
* this item before we got to processing it. (Although it seems this
* cannot currently happen since we never delete the root node.)
*/
if (result.parent == null) {
return;
}
final IN parent = result.parent;
final int index = result.index;
final int parentLevel = parent.getLevel();
/*
* Reference to the parent, taken from the dirty map while the parent is
* latched, so that the parent can be flushed after the latch is
* released. Null if the parent is above maxFlushLevel or was not found
* in the dirty map.
*/
final CheckpointReference parentRef;
/*
* List of siblings to log after releasing the parent latch. Null when
* the siblings are logged with the parent latch held instead.
*/
final List<CheckpointReference> logSiblingsSeparately;
try {
/*
* If bottomLevelTarget is true, the parent IN contains bottom
* level BINs. The masking is used to normalize the level for
* ordinary DBs and the mapping tree DB.
*/
final boolean bottomLevelTarget =
((parentLevel & IN.LEVEL_MASK) == 2);
/*
* INs at the max flush level are always non-provisional and
* INs at the bottom level (when this is not also the max flush
* level) are always provisional. In between INs are
* provisional BEFORE_CKPT_END (see Provisional).
*/
final Provisional provisional;
if (targetLevel >= maxFlushLevel) {
provisional = Provisional.NO;
} else if (bottomLevelTarget) {
provisional = Provisional.YES;
} else {
provisional = Provisional.BEFORE_CKPT_END;
}
/*
* If we didn't reach the target level, a child wasn't resident
* and there is nothing to log at this level. To be on the safe
* side, we'll put the parent into the dirty set to be logged when
* that level is processed.
*
* Only do this if the parent we found is at a higher level than
* the child. This ensures that the non-exact search does not
* find a sibling rather than a parent. [#11555]
*/
if (!result.exactParentFound) {
if (parentLevel > targetLevel) {
dirtyMap.addIN(
parent, -1 /*index*/,
false /*updateFlushLevels*/,
true /*updateMemoryBudget*/);
}
return;
}
/*
* We found the parent. Add it unconditionally to the dirty map. We
* must make sure that every IN that was selected for the
* checkpointer's dirty IN set at the beginning of checkpoint is
* written into the log and can be properly accessed from
* ancestors. Eviction or a split may have written out a member of
* this dirty set before the checkpointer got to it. See [#10249].
*/
assert parentLevel == targetLevel + 1;
dirtyMap.addIN(
parent, -1 /*index*/,
false /*updateFlushLevels*/,
true /*updateMemoryBudget*/);
/*
* Determine whether our search found the IN identified by either
* targetRef.nodeId or targetRef.lsn. If there is not a match, then
* the node was deleted, logged or split since creating the
* reference.
*
* For a non-DW DB, targetRef.lsn will be not null and we match on
* it. If the LSN has changed then of course the node was logged,
* and possibly split, and we will not log this target here.
*
* For a DW DB we also match on LSN if it is non-null. If the LSN
* is null then the reference was created for a never-logged IN and
* targetRef.nodeId >= 0. In that case we match on the nodeId. If
* the LSN or nodeId doesn't match, there must have been a split,
* and we will not log this target here. However, because splits
* are not logged for DW, this is not sufficient to cause both
* siblings that were part of the split to be logged, when one node
* was added to the dirty map. We account for this when the parent is
* logged by calling logDirtyChildren. This approach relies on the
* fact that a split will dirty the parent.
*
* TODO:
* Why not always call logDirtyIN for a DW IN, whether or not the
* LSN or nodeId matches? logDirtyChildren is going to log it
* anyway if it is dirty.
*/
if (targetRef.lsn != DbLsn.NULL_LSN) {
if (targetRef.lsn != parent.getLsn(index)) {
return;
}
} else {
assert targetRef.nodeId >= 0;
assert db.isDeferredWriteMode();
final IN target = (IN) parent.getTarget(index);
if (target == null ||
targetRef.nodeId != target.getNodeId()) {
return;
}
}
/* Log the target, if dirty. */
logDirtyIN(envImpl, parent, index, provisional, fstats);
/*
* We will log a sub-tree when the target is at the bottom level
* and this is not a recursive call to flushIN during sub-tree
* logging. Return if we are only logging the target node here.
*/
if (!bottomLevelTarget || !allowLogSubtree) {
return;
}
/*
* Log sub-tree siblings with the latch held when highPriority
* is configured and this is not a DW DB. For a DW DB, dirty LNs
* are logged for each BIN. If we were to log a DW sub-tree with
* the parent latch held, the amount of logging may cause the latch
* to be held for too long a period.
*/
if (highPriority && !db.isDurableDeferredWrite()) {
logSiblingsSeparately = null;
} else {
logSiblingsSeparately = new ArrayList<>();
}
/*
* Visit every other slot of the parent. A sibling is handled only if
* it can be removed from the dirty map, matching on the slot's LSN
* and, when the child is resident, its node ID (-1 otherwise).
*/
for (int i = 0; i < parent.getNEntries(); i += 1) {
if (i == index) {
continue;
}
final IN child = (IN) parent.getTarget(i);
final long childId = (child != null) ? child.getNodeId() : -1;
final long childLsn = parent.getLsn(i);
final CheckpointReference childRef =
dirtyMap.removeNode(targetLevel, childLsn, childId);
if (childRef == null) {
continue;
}
if (logSiblingsSeparately != null) {
logSiblingsSeparately.add(childRef);
} else {
logDirtyIN(envImpl, parent, i, provisional, fstats);
}
}
/* Get parentRef before releasing the latch. */
if (parentLevel <= maxFlushLevel) {
parentRef = dirtyMap.removeNode(
parentLevel, parent.getLastLoggedLsn(),
parent.getNodeId());
} else {
parentRef = null;
}
} finally {
/* Runs on every exit from the try block, including early returns. */
parent.releaseLatch();
}
/*
* If highPriority is false, we don't hold the latch while logging
* the bottom level siblings. We log them here with flushIN,
* performing a separate search for each one, after releasing the
* parent latch above.
*/
if (logSiblingsSeparately != null) {
for (final CheckpointReference childRef : logSiblingsSeparately) {
flushIN(
db, childRef, dirtyMap, maxFlushLevel, highPriority,
fstats, false /*allowLogSubtree*/);
}
}
/*
* Log the sub-tree parent, which will be logged non-provisionally,
* in order to update cleaner utilization. This must be done with
* flushIN after releasing the parent latch above, since we must search
* and acquire the grandparent latch.
*/
if (parentRef != null) {
flushIN(
db, parentRef, dirtyMap, maxFlushLevel, highPriority, fstats,
false /*allowLogSubtree*/);
}
}
/**
 * Logs the child in the given parent slot, if there is something dirty to
 * log, then records the new LSN in the parent slot and updates the flush
 * counters.
 *
 * Note that if this method is called, the parent must also be logged. This
 * is true even if this method finds that the child is not dirty. In that
 * case the child has already been flushed (e.g., by eviction) and the
 * parent must be logged according to the rule for max flush level.
 *
 * @param envImpl the environment, used to reach the off-heap cache
 * @param parent the parent IN of the node to log
 * @param index the slot in the parent that refers to the node to log
 * @param provisional the provisional status to log the node with
 * @param fstats counters incremented when a node is logged
 */
private static void logDirtyIN(
    final EnvironmentImpl envImpl,
    final IN parent,
    final int index,
    final Provisional provisional,
    final FlushStats fstats) {

    final IN residentChild = (IN) parent.getTarget(index);

    final long loggedLsn;
    final boolean loggedBIN;
    final boolean loggedDelta;

    if (residentChild == null) {
        /*
         * The slot has no resident child object. Ask the off-heap cache
         * for a BIN log entry for this slot; a null entry means there is
         * nothing to log.
         */
        final OffHeapCache offHeap = envImpl.getOffHeapCache();
        final INLogEntry<BIN> binEntry =
            offHeap.createBINLogEntryForCheckpoint(parent, index);

        if (binEntry == null) {
            return;
        }

        loggedBIN = true;
        loggedDelta = binEntry.isBINDelta();

        loggedLsn = IN.logEntry(
            binEntry, provisional, true /*backgroundIO*/, parent);

        offHeap.postBINLog(parent, index, binEntry, loggedLsn);
    } else {
        residentChild.latch(CacheMode.UNCHANGED);
        try {
            /* A clean child was already flushed by someone else. */
            if (!residentChild.getDirty()) {
                return;
            }

            if (residentChild.getDatabase().isDurableDeferredWrite()) {
                /*
                 * Find dirty descendants to avoid logging nodes with
                 * never-logged children. See [#13936] and
                 * IN.logDirtyChildren for description of the case.
                 *
                 * Both dirty and never-logged descendants must be logged
                 * to get a consistent view of a split. Otherwise the log
                 * could hold the post-split version of a new sibling and
                 * the pre-split version of a split sibling, and recovery
                 * could incorrectly duplicate descendants because they
                 * appear in both.
                 */
                residentChild.logDirtyChildren();
            }

            loggedLsn = residentChild.log(
                true /*allowDeltas*/, provisional,
                true /*backgroundIO*/, parent);

            assert (loggedLsn != DbLsn.NULL_LSN);

            loggedBIN = residentChild.isBIN();

            /* A delta was written iff the new LSN is the last delta LSN. */
            loggedDelta = (loggedLsn == residentChild.getLastDeltaLsn());
        } finally {
            residentChild.releaseLatch();
        }
    }

    /* Point the parent slot at the newly logged version. */
    parent.updateEntry(index, loggedLsn, VLSN.NULL_VLSN_SEQUENCE, 0);

    if (loggedDelta) {
        fstats.nDeltaINFlushThisRun += 1;
        fstats.nDeltaINFlush += 1;
        return;
    }

    fstats.nFullINFlushThisRun += 1;
    fstats.nFullINFlush += 1;

    if (loggedBIN) {
        fstats.nFullBINFlush += 1;
        fstats.nFullBINFlushThisRun += 1;
    }
}
/*
 * RootFlusher lets us write out the root IN within the root latch. After
 * doWork has run, getFlushed() tells whether the root was logged and
 * stillRoot() tells whether the current root was the targeted node.
 */
private static class RootFlusher implements WithRootLatched {

    private final DatabaseImpl db;
    private final long targetNodeId;

    /* Set by doWork when the root was dirty and has been logged. */
    private boolean flushed = false;

    /* Set by doWork when the current root has the targeted node ID. */
    private boolean stillRoot = false;

    RootFlusher(final DatabaseImpl db,
                final long targetNodeId) {
        this.db = db;
        this.targetNodeId = targetNodeId;
    }

    /**
     * Flush the rootIN if dirty, provided it is still the targeted node.
     * Always returns null.
     */
    @Override
    public IN doWork(ChildReference root) {

        if (root == null) {
            return null;
        }

        final IN rootIN = (IN) root.fetchTarget(db, null);
        rootIN.latch(CacheMode.UNCHANGED);
        try {
            /*
             * A different node ID means the root was split after it was
             * placed in the checkpointer's dirty set. Leave stillRoot
             * false so the caller handles the target as a regular node.
             */
            if (rootIN.getNodeId() != targetNodeId) {
                return null;
            }

            /*
             * Find dirty descendants to avoid logging nodes with
             * never-logged children. See [#13936]
             */
            if (rootIN.getDatabase().isDurableDeferredWrite()) {
                rootIN.logDirtyChildren();
            }

            stillRoot = true;

            if (rootIN.getDirty()) {
                final long rootLsn = rootIN.log();
                root.setLsn(rootLsn);
                flushed = true;
            }

            return null;
        } finally {
            rootIN.releaseLatch();
        }
    }

    boolean getFlushed() {
        return flushed;
    }

    boolean stillRoot() {
        return stillRoot;
    }
}
/*
 * CheckpointReferences are used to identify nodes that must be flushed as
 * part of the checkpoint. We don't keep an actual reference to the node
 * because that prevents nodes from being GC'ed during checkpoint.
 *
 * Using a CheckpointReference introduces a window between the point when
 * the checkpoint dirty set is created and when the node is flushed. Some
 * of the fields saved in the reference are immutable: dbId, nodeId. The
 * others are not and we have to handle potential change:
 *
 * isRoot: it's possible for isRoot to go from true->false, but not
 *         false->true. True->false is handled by the flushIN method
 *         by finding the root and checking if it is the target.
 * treeKey: This can change only in the event of a split. If it does, there
 *          is the chance that the checkpointer will find the wrong node to
 *          flush, but that's okay because the split guarantees flushing to
 *          the root, so the target will be properly logged within the
 *          checkpoint period.
 *
 * Equality and hashing are based on nodeId alone.
 *
 * The class is public for the Sizeof program. (The constructor is declared
 * without an access modifier, i.e., it is package-private.)
 */
public static class CheckpointReference {

    final DatabaseId dbId;
    final long nodeId;
    final int nodeLevel;
    final boolean isRoot;
    final byte[] treeKey;
    final long lsn;

    CheckpointReference(final DatabaseId dbId,
                        final long nodeId,
                        final int nodeLevel,
                        final boolean isRoot,
                        final byte[] treeKey,
                        final long lsn) {
        this.dbId = dbId;
        this.nodeId = nodeId;
        this.nodeLevel = nodeLevel;
        this.isRoot = isRoot;
        this.treeKey = treeKey;
        this.lsn = lsn;
    }

    /* Two references are equal when they carry the same nodeId. */
    @Override
    public boolean equals(Object o) {
        return (o instanceof CheckpointReference) &&
            (((CheckpointReference) o).nodeId == nodeId);
    }

    /* The nodeId narrowed to an int; consistent with equals. */
    @Override
    public int hashCode() {
        return (int) nodeId;
    }

    @Override
    public String toString() {
        return "db=" + dbId + " nodeId=" + nodeId;
    }
}
/**
 * A struct to hold log flushing stats for checkpoint and database sync.
 * Each statistic is kept twice: a counter that resetPerRunCounters leaves
 * untouched and a "ThisRun" counter that resetPerRunCounters sets to zero.
 */
public static class FlushStats {

    /* Counters that are not affected by resetPerRunCounters. */
    public long nFullINFlush;
    public long nFullBINFlush;
    public long nDeltaINFlush;

    /* Per-run counters. */
    long nFullINFlushThisRun;
    long nFullBINFlushThisRun;
    long nDeltaINFlushThisRun;

    /* For future addition to stats:
       private int nAlreadyEvictedThisRun;
    */

    /* Zero the per-run counters; the other counters keep their values. */
    void resetPerRunCounters() {
        nFullINFlushThisRun =
            nFullBINFlushThisRun =
            nDeltaINFlushThisRun = 0;
        /* nAlreadyEvictedThisRun = 0; -- for future */
    }
}
}