| /*- |
| * Copyright (C) 2002, 2018, Oracle and/or its affiliates. All rights reserved. |
| * |
| * This file was distributed by Oracle as part of a version of Oracle Berkeley |
| * DB Java Edition made available at: |
| * |
| * http://www.oracle.com/technetwork/database/database-technologies/berkeleydb/downloads/index.html |
| * |
| * Please see the LICENSE file included in the top-level directory of the |
| * appropriate version of Oracle Berkeley DB Java Edition for a copy of the |
| * license and additional information. |
| */ |
| |
| package com.sleepycat.je.recovery; |
| |
| import static com.sleepycat.je.recovery.CheckpointStatDefinition.CKPT_CHECKPOINTS; |
| import static com.sleepycat.je.recovery.CheckpointStatDefinition.CKPT_DELTA_IN_FLUSH; |
| import static com.sleepycat.je.recovery.CheckpointStatDefinition.CKPT_FULL_BIN_FLUSH; |
| import static com.sleepycat.je.recovery.CheckpointStatDefinition.CKPT_FULL_IN_FLUSH; |
| import static com.sleepycat.je.recovery.CheckpointStatDefinition.CKPT_LAST_CKPTID; |
| import static com.sleepycat.je.recovery.CheckpointStatDefinition.CKPT_LAST_CKPT_END; |
| import static com.sleepycat.je.recovery.CheckpointStatDefinition.CKPT_LAST_CKPT_INTERVAL; |
| import static com.sleepycat.je.recovery.CheckpointStatDefinition.CKPT_LAST_CKPT_START; |
| import static com.sleepycat.je.recovery.CheckpointStatDefinition.GROUP_DESC; |
| import static com.sleepycat.je.recovery.CheckpointStatDefinition.GROUP_NAME; |
| |
| import java.util.ArrayList; |
| import java.util.HashMap; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Set; |
| import java.util.logging.Level; |
| |
| import com.sleepycat.je.CacheMode; |
| import com.sleepycat.je.CheckpointConfig; |
| import com.sleepycat.je.DatabaseException; |
| import com.sleepycat.je.DiskLimitException; |
| import com.sleepycat.je.EnvironmentMutableConfig; |
| import com.sleepycat.je.StatsConfig; |
| import com.sleepycat.je.cleaner.Cleaner; |
| import com.sleepycat.je.cleaner.ExtinctionScanner; |
| import com.sleepycat.je.cleaner.FileSelector.CheckpointStartCleanerState; |
| import com.sleepycat.je.config.EnvironmentParams; |
| import com.sleepycat.je.dbi.DatabaseId; |
| import com.sleepycat.je.dbi.DatabaseImpl; |
| import com.sleepycat.je.dbi.DbConfigManager; |
| import com.sleepycat.je.dbi.DbTree; |
| import com.sleepycat.je.dbi.EnvConfigObserver; |
| import com.sleepycat.je.dbi.EnvironmentImpl; |
| import com.sleepycat.je.evictor.OffHeapCache; |
| import com.sleepycat.je.log.LogEntryType; |
| import com.sleepycat.je.log.LogManager; |
| import com.sleepycat.je.log.Provisional; |
| import com.sleepycat.je.log.ReplicationContext; |
| import com.sleepycat.je.log.entry.INLogEntry; |
| import com.sleepycat.je.log.entry.SingleItemEntry; |
| import com.sleepycat.je.tree.BIN; |
| import com.sleepycat.je.tree.ChildReference; |
| import com.sleepycat.je.tree.IN; |
| import com.sleepycat.je.tree.SearchResult; |
| import com.sleepycat.je.tree.Tree; |
| import com.sleepycat.je.tree.WithRootLatched; |
| import com.sleepycat.je.utilint.DaemonThread; |
| import com.sleepycat.je.utilint.DbLsn; |
| import com.sleepycat.je.utilint.LSNStat; |
| import com.sleepycat.je.utilint.LoggerUtils; |
| import com.sleepycat.je.utilint.LongStat; |
| import com.sleepycat.je.utilint.StatGroup; |
| import com.sleepycat.je.utilint.TestHook; |
| import com.sleepycat.je.utilint.TestHookExecute; |
| import com.sleepycat.je.utilint.VLSN; |
| |
| /** |
| * The Checkpointer looks through the tree for internal nodes that must be |
| * flushed to the log. Checkpoint flushes must be done in ascending order from |
| * the bottom of the tree up. |
| * |
| * Checkpoint and IN Logging Rules |
| * ------------------------------- |
| * The checkpoint must log, and make accessible via non-provisional ancestors, |
| * all INs that are dirty at CkptStart. If we crash and recover from that |
| * CkptStart onward, any IN that became dirty (before the crash) after the |
| * CkptStart must become dirty again as the result of replaying the action that |
| * caused it to originally become dirty. |
| * |
| * Therefore, when an IN is dirtied at some point in the checkpoint interval, |
| * but is not logged by the checkpoint, the log entry representing the action |
| * that dirtied the IN must follow either the CkptStart or the FirstActiveLSN |
| * that is recorded in the CkptEnd entry. The FirstActiveLSN is less than or |
| * equal to the CkptStart LSN. Recovery will process LNs between the |
| * FirstActiveLSN and the end of the log. Other entries are only processed |
| * from the CkptStart forward. And provisional entries are not processed. |
| * |
| * Example: Non-transactional LN logging. We take two actions: 1) log the LN |
| * and then 2) dirty the parent BIN. What if the LN is logged before CkptStart |
| * and the BIN is dirtied after CkptStart? How do we avoid breaking the rules? |
| * The answer is that we log the LN while holding the latch on the parent BIN, |
| * and we don't release the latch until after we dirty the BIN. The |
| * construction of the checkpoint dirty map requires latching the BIN. Since |
| * the LN was logged before CkptStart, the BIN will be dirtied before the |
| * checkpointer latches it during dirty map construction. So the BIN will |
| * always be included in the dirty map and logged by the checkpoint. |
| * |
| * Example: Abort. We take two actions: 1) log the abort and then 2) undo the |
| * changes, which modifies (dirties) the BIN parents of the undone LNs. There |
| * is nothing to prevent logging CkptStart in between these two actions, so how |
| * do we avoid breaking the rules? The answer is that we do not unregister the |
| * transaction until after the undo phase. So although the BINs may be dirtied |
| * by the undo after CkptStart is logged, the FirstActiveLSN will be prior to |
| * CkptStart. Therefore, we will process the Abort and replay the action that |
| * modifies the BINs. |
| * |
| * Exception: Lazy migration. The log cleaner will make an IN dirty without |
| * logging an action that makes it dirty. This is an exception to the general |
| * rule that actions should be logged when they cause dirtiness. The reasons |
| * this is safe are: |
| * 1. The IN contents are not modified, so there is no information lost if the |
| * IN is never logged, or is logged provisionally and no ancestor is logged |
| * non-provisionally. |
| * 2. If the IN is logged non-provisionally, this will have the side effect of |
| * recording the old LSN as being obsolete. However, the general rules for |
| * checkpointing and recovery will ensure that the new version is used in |
| * the Btree. The new version will either be replayed by recovery or |
| * referenced in the active Btree via a non-provisional ancestor. |
| * |
| * Checkpoint Algorithm TODO update this |
| * -------------------- |
| * The final checkpointDirtyMap field is used to hold (in addition to the dirty |
| * INs) the state of the checkpoint and highest flush levels. Access to this |
| * object is synchronized so that eviction and checkpointing can access it |
| * concurrently. When a checkpoint is not active, the state is CkptState.NONE |
| * and the dirty map is empty. When a checkpoint runs, we do this: |
| * |
| * 1. Get set of files from cleaner that can be deleted after this checkpoint. |
| * 2. Set checkpointDirtyMap state to DIRTY_MAP_INCOMPLETE, meaning that dirty |
| * map construction is in progress. |
| * 3. Log CkptStart |
| * 4. Construct dirty map, organized by Btree level, from dirty INs in INList. |
| * The highest flush levels are calculated during dirty map construction. |
| * Set checkpointDirtyMap state to DIRTY_MAP_COMPLETE. |
| * 5. Flush INs in dirty map. |
| * + First, flush the bottom two levels a sub-tree at a time, where a |
| * sub-tree is one IN at level two and all its BIN children. Higher |
| * levels (above level two) are logged strictly by level, not using |
| * subtrees. |
| * o If je.checkpointer.highPriority=false, we log one IN at a |
| * time, whether or not the IN is logged as part of a subtree, |
| * and do a Btree search for the parent of each IN. |
| * o If je.checkpointer.highPriority=true, for the bottom two |
| * levels we log each sub-tree in a single call to the |
| * LogManager with the parent IN latched, and we only do one |
| * Btree search for each level two IN. Higher levels are logged |
| * one IN at a time as with highPriority=false. |
| * + The Provisional property is set as follows, depending on the level |
| * of the IN: |
| * o level is max flush level: Provisional.NO |
| * o level is bottom level: Provisional.YES |
| * o Otherwise (middle levels): Provisional.BEFORE_CKPT_END |
| * 6. Flush VLSNIndex cache to make VLSNIndex recoverable. |
| * 7. Flush UtilizationTracker (write FileSummaryLNs) to persist all |
| * tracked obsolete offsets and utilization summary info, to make this info |
| * recoverable. |
| * 8. Log CkptEnd |
| * 9. Delete cleaned files from step 1. |
| * 10. Set checkpointDirtyMap state to NONE. |
| * |
| * Per-DB Highest Flush Level |
| * -------------------------- |
| * As mentioned above, when the dirty map is constructed we also determine the |
| * highest flush level for each database. This is the maximum Btree level at |
| * which a dirty node exists in the DB. |
| * |
| * When logging a node below the maxFlushLevel, we add the parent to the dirty |
| * map. It may or may not have been added when the dirty map was constructed. |
| * The idea is to flush all ancestors of all nodes in the dirty map, up to and |
| * including the maxFlushLevel, even if those ancestors were not dirty when the |
| * dirty map was constructed. |
| * |
| * This is done to avoid orphaning a dirty node as shown in this example. |
| * |
| * IN-A (root level=4) |
| * / \ |
| * (d) IN-B IN-C (maxFlushLevel=3) |
| * \ |
| * (d) IN-D |
| * |
| * IN-C is not dirty (d) when the dirty map is constructed, but it will be |
| * logged because its child (IN-D) is dirty, and it is not above maxFlushLevel. |
| * |
| * If IN-C were not logged, and there were a crash after the checkpoint, the |
| * changes to IN-D would be lost. IN-D would not be replayed by recovery |
| * because it is logged provisionally, and it would not be accessible via its |
| * parent. This is because only nodes at maxFlushLevel are logged |
| * non-provisionally. The actions that led to the changes in IN-D may not be |
| * replayed either, because they may appear before the firstActiveLsn |
| * associated with the checkpoint. |
| * |
| * When log files are to be deleted at the end of the checkpoint (after being |
| * processed by the log cleaner), the maxFlushLevel is increased by one. |
| * This is to ensure that LSNs in deleted files will not be fetched during |
| * recovery. Such files are in the FileSelector.CLEANED state, which means |
| * they have been processed by the cleaner since the last checkpoint. |
| * |
| * TODO: Document circumstances and motivation for the extra flush level. |
| * |
| * Lastly, for Database.sync or a checkpoint with MinimizeRecoveryTime |
| * configured, we will flush all the way to the root rather than using the |
| * maxFlushLevel computed as described above. |
| * |
| * Provisional.BEFORE_CKPT_END |
| * --------------------------- |
| * See Provisional.java for a description of the relationship between the |
| * checkpoint algorithm above and the BEFORE_CKPT_END property. |
| * |
| * Coordination of Eviction and Checkpointing |
| * ------------------------------------------ |
| * Eviction can proceed concurrently with all phases of a checkpoint, and |
| * eviction may take place concurrently in multiple threads. This concurrency |
| * is crucial to avoid blocking application threads that perform eviction and |
| * to reduce the amount of eviction required in application threads. |
| * |
| * Eviction calls Checkpointer.coordinateEvictionWithCheckpoint, which calls |
| * DirtyINMap.coordinateEvictionWithCheckpoint, just before logging an IN. |
| * coordinateEvictionWithCheckpoint returns whether the IN should be logged |
| * provisionally (Provisional.YES) or non-provisionally (Provisional.NO). |
| * |
| * Other coordination necessary depends on the state of the checkpoint: |
| * + NONE: No additional action. |
| * o return Provisional.NO |
| * + DIRTY_MAP_INCOMPLETE: The parent IN is added to the dirty map, exactly |
| * as if it were encountered as dirty in the INList during dirty map |
| * construction. |
| * o IN is root: return Provisional.NO |
| * o IN is not root: return Provisional.YES |
| * + DIRTY_MAP_COMPLETE: |
| * o IN level GTE highest flush level: return Provisional.NO |
| * o IN level LT highest flush level: return Provisional.YES |
| * |
| * In general this is designed so that eviction will use the same provisional |
| * value that would be used by the checkpoint, as if the checkpoint itself were |
| * logging the IN. However, there are several conditions where this is not |
| * exactly the case. |
| * |
| * 1. Eviction may log an IN with Provisional.YES when the IN was not dirty at |
| * the time of dirty map creation, if it became dirty afterwards. In this |
| * case, the checkpointer would not have logged the IN at all. This is safe |
| * because the actions that made that IN dirty are logged in the recovery |
| * period. |
| * 2. Eviction may log an IN with Provisional.YES after the checkpoint has |
| * logged it, if it becomes dirty again. In this case the IN is logged |
| * twice, which would not have been done by the checkpoint alone. This is |
| * safe because the actions that made that IN dirty are logged in the |
| * recovery period. |
| * 3. An intermediate level IN (not bottom most and not the highest flush |
| * level) will be logged by the checkpoint with Provisional.BEFORE_CKPT_END |
| * but will be logged by eviction with Provisional.YES. See below for why |
| * this is safe. |
| * 4. Between checkpoint step 8 (log CkptEnd) and 10 (set checkpointDirtyMap |
| * state to NONE), eviction may log an IN with Provisional.YES, although a |
| * checkpoint is not strictly active during this interval. See below for |
| * why this is safe. |
| * |
| * It is safe for eviction to log an IN as Provisional.YES for the last two |
| * special cases, because this does not cause incorrect recovery behavior. For |
| * recovery to work properly, it is only necessary that: |
| * |
| * + Provisional.NO is used for INs at the max flush level during an active |
| * checkpoint. |
| * + Provisional.YES or BEFORE_CKPT_END is used for INs below the max flush |
| * level, to avoid replaying an IN during recovery that may depend on a file |
| * deleted as the result of the checkpoint. |
| * |
| * You may ask why we don't use Provisional.YES for eviction when a checkpoint |
 * is not active. There are two reasons, both related to performance:
| * |
| * 1. This would be wasteful when an IN is evicted in between checkpoints, and |
| * that portion of the log is processed by recovery later, in the event of a |
| * crash. The evicted INs would be ignored by recovery, but the actions |
| * that caused them to be dirty would be replayed and the INs would be |
| * logged again redundantly. |
 * 2. Logging an IN provisionally will not count the old LSN as obsolete
 *    immediately, so cleaner utilization will be inaccurate until a
| * non-provisional parent is logged, typically by the next checkpoint. It |
| * is always important to keep the cleaner from stalling and spiking, to |
| * keep latency and throughput as level as possible. |
| * |
| * Therefore, it is safe to log with Provisional.YES in between checkpoints, |
| * but not desirable. |
| * |
| * Although we don't do this, it would be safe and optimal to evict with |
| * BEFORE_CKPT_END in between checkpoints, because it would be treated by |
| * recovery as if it were Provisional.NO. This is because the interval between |
| * checkpoints is only processed by recovery if it follows the last CkptEnd, |
| * and BEFORE_CKPT_END is treated as Provisional.NO if the IN follows the last |
| * CkptEnd. |
| * |
| * However, it would not be safe to evict an IN with BEFORE_CKPT_END during a |
| * checkpoint, when logging of the IN's ancestors does not occur according to |
| * the rules of the checkpoint. If this were done, then if the checkpoint |
| * completes and is used during a subsequent recovery, an obsolete offset for |
| * the old version of the IN will mistakenly be recorded. Below are two cases |
| * where BEFORE_CKPT_END is used correctly and one showing how it could be used |
| * incorrectly. |
| * |
| * 1. Correct use of BEFORE_CKPT_END when the checkpoint does not complete. |
| * |
| * 050 BIN-A |
| * 060 IN-B parent of BIN-A |
| * 100 CkptStart |
| * 200 BIN-A logged with BEFORE_CKPT_END |
| * 300 FileSummaryLN with obsolete offset for BIN-A at 050 |
| * Crash and recover |
| * |
| * Recovery will process BIN-A at 200 (it will be considered |
| * non-provisional) because there is no following CkptEnd. It is |
| * therefore correct that BIN-A at 050 is obsolete. |
| * |
| * 2. Correct use of BEFORE_CKPT_END when the checkpoint does complete. |
| * |
| * 050 BIN-A |
| * 060 IN-B parent of BIN-A |
| * 100 CkptStart |
| * 200 BIN-A logged with BEFORE_CKPT_END |
| * 300 FileSummaryLN with obsolete offset for BIN-A at 050 |
| * 400 IN-B parent of BIN-A, non-provisional |
| * 500 CkptEnd |
| * Crash and recover |
| * |
| * Recovery will not process BIN-A at 200 (it will be considered |
| * provisional) because there is a following CkptEnd, but it will |
| * process its parent IN-B at 400, and therefore the BIN-A at 200 will be |
| * active in the tree. It is therefore correct that BIN-A at 050 is |
| * obsolete. |
| * |
| * 3. Incorrect use of BEFORE_CKPT_END when the checkpoint does complete. |
| * |
| * 050 BIN-A |
| * 060 IN-B parent of BIN-A |
| * 100 CkptStart |
| * 200 BIN-A logged with BEFORE_CKPT_END |
| * 300 FileSummaryLN with obsolete offset for BIN-A at 050 |
| * 400 CkptEnd |
| * Crash and recover |
| * |
| * Recovery will not process BIN-A at 200 (it will be considered |
| * provisional) because there is a following CkptEnd, but no parent |
| * IN-B is logged, and therefore the IN-B at 060 and BIN-A at 050 will be |
| * active in the tree. It is therefore incorrect that BIN-A at 050 is |
| * obsolete. |
| * |
| * This last case is what caused the LFNF in SR [#19422], when BEFORE_CKPT_END |
| * was mistakenly used for logging evicted BINs via CacheMode.EVICT_BIN. |
| * During the checkpoint, we evict BIN-A and log it with BEFORE_CKPT_END, yet |
| * neither it nor its parent are part of the checkpoint. After being counted |
| * obsolete, we crash and recover. Then the file containing the BIN (BIN-A at |
| * 050 above) is cleaned and deleted. During cleaning, it is not migrated |
| * because an obsolete offset was previously recorded. The LFNF occurs when |
| * trying to access this BIN during a user operation. |
| * |
| * CacheMode.EVICT_BIN |
| * ------------------- |
| * Unlike in JE 4.0 where EVICT_BIN was first introduced, in JE 4.1 and later |
| * we do not use special rules when an IN is evicted. Since concurrent |
| * eviction and checkpointing are supported in JE 4.1, the above rules apply to |
| * EVICT_BIN as well as all other types of eviction. |
| */ |
| public class Checkpointer extends DaemonThread implements EnvConfigObserver { |
| |
| /** |
| * For unit testing only. Called before we flush the max level. This |
| * field is static because it is called from the static flushIN method. |
| */ |
| private static TestHook<?> maxFlushLevelHook = null; |
| |
| private static TestHook<?> beforeFlushHook = null; |
| |
| static TestHook<IN> examineINForCheckpointHook = null; |
| |
| /* Checkpoint sequence, initialized at recovery. */ |
| private long checkpointId; |
| |
| /* |
| * How much the log should grow between checkpoints. If 0, we're using time |
| * based checkpointing. |
| */ |
| private final long logSizeBytesInterval; |
| private final long logFileMax; |
| private final long timeInterval; |
| private long lastCheckpointMillis; |
| private volatile boolean wakeupAfterNoWrites; |
| |
| /* Configured to true to minimize checkpoint duration. */ |
| private boolean highPriority; |
| |
| private long nCheckpoints; |
| private long lastCheckpointStart; |
| private long lastCheckpointEnd; |
| private long lastCheckpointInterval; |
| private volatile long lastCheckpointFirstActiveLsn; |
| private final FlushStats flushStats; |
| |
| /** |
| * The DirtyINMap for checkpointing is created once and is reset after each |
| * checkpoint is complete. Access to this object is synchronized so that |
| * eviction and checkpointing can access it concurrently. |
| */ |
| private final DirtyINMap checkpointDirtyMap; |
| |
| public Checkpointer(EnvironmentImpl envImpl, |
| long waitTime, |
| String name) { |
| super(waitTime, name, envImpl); |
| logSizeBytesInterval = |
| envImpl.getConfigManager().getLong |
| (EnvironmentParams.CHECKPOINTER_BYTES_INTERVAL); |
| logFileMax = |
| envImpl.getConfigManager().getLong(EnvironmentParams.LOG_FILE_MAX); |
| timeInterval = waitTime; |
| lastCheckpointMillis = 0; |
| |
| nCheckpoints = 0; |
| flushStats = new FlushStats(); |
| |
| checkpointDirtyMap = new DirtyINMap(envImpl); |
| |
| /* Initialize mutable properties and register for notifications. */ |
| envConfigUpdate(envImpl.getConfigManager(), null); |
| envImpl.addConfigObserver(this); |
| } |
| |
| /** |
| * Process notifications of mutable property changes. |
| */ |
| @Override |
| public void envConfigUpdate(DbConfigManager cm, |
| EnvironmentMutableConfig ignore) { |
| highPriority = cm.getBoolean |
| (EnvironmentParams.CHECKPOINTER_HIGH_PRIORITY); |
| } |
| |
| /** |
| * Initializes the checkpoint intervals when no checkpoint is performed |
| * while opening the environment. |
| */ |
| void initIntervals(long lastCheckpointStart, |
| long lastCheckpointEnd, |
| long lastCheckpointFirstActiveLsn, |
| long lastCheckpointMillis) { |
| this.lastCheckpointStart = lastCheckpointStart; |
| this.lastCheckpointEnd = lastCheckpointEnd; |
| this.lastCheckpointFirstActiveLsn = lastCheckpointFirstActiveLsn; |
| this.lastCheckpointMillis = lastCheckpointMillis; |
| } |
| |
| /** |
| * Returns the firstActiveLsn of the last completed checkpoint. |
| */ |
| public long getLastCheckpointFirstActiveLsn() { |
| return lastCheckpointFirstActiveLsn; |
| } |
| |
| /** |
| * Coordinates an eviction with an in-progress checkpoint and returns |
| * whether provisional logging is needed. |
| * |
| * @return the provisional status to use for logging the target. |
| */ |
| public Provisional coordinateEvictionWithCheckpoint( |
| final DatabaseImpl db, |
| final int targetLevel, |
| final IN parent) { |
| |
| return checkpointDirtyMap. |
| coordinateEvictionWithCheckpoint(db, targetLevel, parent); |
| } |
| |
| /** |
| * Coordinates a split with an in-progress checkpoint. |
| * |
| * @param newSibling the sibling IN created by the split. |
| */ |
| public void coordinateSplitWithCheckpoint(final IN newSibling) { |
| checkpointDirtyMap.coordinateSplitWithCheckpoint(newSibling); |
| } |
| |
| /** |
| * Figure out the wakeup period. Supplied through this static method |
| * because we need to pass wakeup period to the superclass and need to do |
| * the calcuation outside this constructor. |
| * |
| * @throws IllegalArgumentException via Environment ctor and |
| * setMutableConfig. |
| */ |
| public static long getWakeupPeriod(DbConfigManager configManager) |
| throws IllegalArgumentException { |
| |
| long wakeupPeriod = configManager.getDuration |
| (EnvironmentParams.CHECKPOINTER_WAKEUP_INTERVAL); |
| long bytePeriod = configManager.getLong |
| (EnvironmentParams.CHECKPOINTER_BYTES_INTERVAL); |
| |
| /* Checkpointing period must be set either by time or by log size. */ |
| if ((wakeupPeriod == 0) && (bytePeriod == 0)) { |
| throw new IllegalArgumentException |
| (EnvironmentParams.CHECKPOINTER_BYTES_INTERVAL.getName() + |
| " and " + |
| EnvironmentParams.CHECKPOINTER_WAKEUP_INTERVAL.getName() + |
| " cannot both be 0. "); |
| } |
| |
| /* |
| * Checkpointing by log size takes precendence over time based period. |
| */ |
| if (bytePeriod == 0) { |
| return wakeupPeriod; |
| } else { |
| return 0; |
| } |
| } |
| |
| /** |
| * Set checkpoint id -- can only be done after recovery. |
| */ |
| synchronized void setCheckpointId(long lastCheckpointId) { |
| checkpointId = lastCheckpointId; |
| } |
| |
| /** |
| * Load stats. |
| */ |
| @SuppressWarnings("unused") |
| public StatGroup loadStats(StatsConfig config) { |
| StatGroup stats = new StatGroup(GROUP_NAME, GROUP_DESC); |
| new LongStat(stats, CKPT_LAST_CKPTID, checkpointId); |
| new LongStat(stats, CKPT_CHECKPOINTS, nCheckpoints); |
| new LongStat(stats, CKPT_LAST_CKPT_INTERVAL, lastCheckpointInterval); |
| new LSNStat(stats, CKPT_LAST_CKPT_START, lastCheckpointStart); |
| new LSNStat(stats, CKPT_LAST_CKPT_END, lastCheckpointEnd); |
| new LongStat(stats, CKPT_FULL_IN_FLUSH, flushStats.nFullINFlush); |
| new LongStat(stats, CKPT_FULL_BIN_FLUSH, flushStats.nFullBINFlush); |
| new LongStat(stats, CKPT_DELTA_IN_FLUSH, flushStats.nDeltaINFlush); |
| |
| if (config.getClear()) { |
| nCheckpoints = 0; |
| flushStats.nFullINFlush = 0; |
| flushStats.nFullBINFlush = 0; |
| flushStats.nDeltaINFlush = 0; |
| } |
| |
| return stats; |
| } |
| |
| /** |
| * Return the number of retries when a deadlock exception occurs. |
| */ |
| @Override |
| protected long nDeadlockRetries() { |
| return envImpl.getConfigManager().getInt |
| (EnvironmentParams.CHECKPOINTER_RETRY); |
| } |
| |
| /** |
| * Called whenever the DaemonThread wakes up from a sleep. |
| */ |
| @Override |
| protected void onWakeup() { |
| |
| if (envImpl.isClosing()) { |
| return; |
| } |
| |
| doCheckpoint( |
| CheckpointConfig.DEFAULT, "daemon", true /*invokedFromDaemon*/); |
| |
| wakeupAfterNoWrites = false; |
| } |
| |
| /** |
| * Wakes up the checkpointer if a checkpoint log interval is configured and |
| * the number of bytes written since the last checkpoint exceeds the size |
| * of the interval. |
| */ |
| public void wakeupAfterWrite() { |
| |
| if ((logSizeBytesInterval != 0) && !isRunning()) { |
| |
| long nextLsn = envImpl.getFileManager().getNextLsn(); |
| |
| if (DbLsn.getNoCleaningDistance( |
| nextLsn, lastCheckpointStart, logFileMax) >= |
| logSizeBytesInterval) { |
| |
| wakeup(); |
| } |
| } |
| } |
| |
| /** |
| * Wakes up the checkpointer if a checkpoint is needed to reclaim disk |
| * space for already cleaned files. This method is called after an idle |
| * period with no writes. |
| */ |
| public void wakeupAfterNoWrites() { |
| |
| if (!isRunning() && needCheckpointForCleanedFiles()) { |
| wakeupAfterNoWrites = true; |
| wakeup(); |
| } |
| } |
| |
| private boolean needCheckpointForCleanedFiles() { |
| return envImpl.getCleaner().getFileSelector().isCheckpointNeeded(); |
| } |
| |
| /** |
| * Determine whether a checkpoint should be run. |
| */ |
| private boolean isRunnable(CheckpointConfig config) { |
| /* Figure out if we're using log size or time to determine interval.*/ |
| long useBytesInterval = 0; |
| long useTimeInterval = 0; |
| long nextLsn = DbLsn.NULL_LSN; |
| boolean runnable = false; |
| try { |
| if (config.getForce()) { |
| runnable = true; |
| return true; |
| } |
| |
| if (wakeupAfterNoWrites && needCheckpointForCleanedFiles()) { |
| runnable = true; |
| return true; |
| } |
| |
| if (config.getKBytes() != 0) { |
| useBytesInterval = config.getKBytes() << 10; |
| |
| } else if (config.getMinutes() != 0) { |
| /* Convert to millis. */ |
| useTimeInterval = config.getMinutes() * 60 * 1000; |
| |
| } else if (logSizeBytesInterval != 0) { |
| useBytesInterval = logSizeBytesInterval; |
| |
| } else { |
| useTimeInterval = timeInterval; |
| } |
| |
| /* |
| * If our checkpoint interval is defined by log size, check on how |
| * much log has grown since the last checkpoint. |
| */ |
| if (useBytesInterval != 0) { |
| nextLsn = envImpl.getFileManager().getNextLsn(); |
| |
| if (DbLsn.getNoCleaningDistance( |
| nextLsn, lastCheckpointStart, logFileMax) >= |
| useBytesInterval) { |
| |
| runnable = true; |
| } |
| |
| } else if (useTimeInterval != 0) { |
| |
| /* |
| * Our checkpoint is determined by time. If enough time has |
| * passed and some log data has been written, do a checkpoint. |
| */ |
| final long lastUsedLsn = |
| envImpl.getFileManager().getLastUsedLsn(); |
| |
| if (((System.currentTimeMillis() - lastCheckpointMillis) >= |
| useTimeInterval) && |
| (DbLsn.compareTo(lastUsedLsn, lastCheckpointEnd) != 0)) { |
| |
| runnable = true; |
| } |
| } |
| return runnable; |
| |
| } finally { |
| if (logger.isLoggable(Level.FINEST)) { |
| final StringBuilder sb = new StringBuilder(); |
| sb.append("size interval=").append(useBytesInterval); |
| if (nextLsn != DbLsn.NULL_LSN) { |
| sb.append(" nextLsn="). |
| append(DbLsn.getNoFormatString(nextLsn)); |
| } |
| if (lastCheckpointEnd != DbLsn.NULL_LSN) { |
| sb.append(" lastCkpt="); |
| sb.append(DbLsn.getNoFormatString(lastCheckpointEnd)); |
| } |
| sb.append(" time interval=").append(useTimeInterval); |
| sb.append(" force=").append(config.getForce()); |
| sb.append(" runnable=").append(runnable); |
| |
| LoggerUtils.finest(logger, envImpl, sb.toString()); |
| } |
| } |
| } |
| |
| /** |
| * The real work to do a checkpoint. This may be called by the checkpoint |
| * thread when waking up, or it may be invoked programatically through the |
| * api. |
| * |
| * @param invokingSource a debug aid, to indicate who invoked this |
| * checkpoint. (i.e. recovery, the checkpointer daemon, the cleaner, |
| * programatically) |
| */ |
| public synchronized void doCheckpoint(CheckpointConfig config, |
| String invokingSource, |
| boolean invokedFromDaemon) { |
| if (envImpl.isReadOnly()) { |
| return; |
| } |
| |
| if (!isRunnable(config)) { |
| return; |
| } |
| |
| /* Stop if we cannot write because of a disk limit violation. */ |
| try { |
| envImpl.checkDiskLimitViolation(); |
| } catch (DiskLimitException e) { |
| if (!invokedFromDaemon) { |
| throw e; |
| } |
| return; |
| } |
| |
| /* |
| * If minimizing recovery time is desired, then flush all the way to |
| * the top of the dbtree instead of stopping at the highest level last |
| * modified, so that only the root INs are processed by recovery. |
| */ |
| final boolean flushAll = config.getMinimizeRecoveryTime(); |
| |
| /* |
| * If there are cleaned files to be deleted, flush an extra level to |
| * write out the parents of cleaned nodes. This ensures that no node |
| * will contain the LSN of a cleaned file. |
| */ |
| final Cleaner cleaner = envImpl.getCleaner(); |
| |
| final CheckpointStartCleanerState cleanerState = |
| cleaner.getFilesAtCheckpointStart(); |
| |
| /* |
| * Any scans that are complete before the checkpoint, can be deleted |
| * after the checkpoint. |
| */ |
| final ExtinctionScanner extinctionScanner = |
| envImpl.getExtinctionScanner(); |
| |
| final Set<Long> completedScans = |
| extinctionScanner.getCompletedRecordScans(); |
| |
| final boolean flushExtraLevel = !cleanerState.isEmpty(); |
| |
| lastCheckpointMillis = System.currentTimeMillis(); |
| flushStats.resetPerRunCounters(); |
| |
| /* Get the next checkpoint id. */ |
| checkpointId++; |
| nCheckpoints++; |
| |
| boolean success = false; |
| boolean traced = false; |
| |
| final LogManager logManager = envImpl.getLogManager(); |
| |
| /* |
| * Set the checkpoint state so that concurrent eviction can be |
| * coordinated. |
| */ |
| checkpointDirtyMap.beginCheckpoint(flushAll, flushExtraLevel); |
| |
| try { |
| /* Log the checkpoint start. */ |
| final SingleItemEntry<CheckpointStart> startEntry = |
| SingleItemEntry.create( |
| LogEntryType.LOG_CKPT_START, |
| new CheckpointStart(checkpointId, invokingSource)); |
| |
| final long checkpointStart = |
| logManager.log(startEntry, ReplicationContext.NO_REPLICATE); |
| |
| /* |
| * Note the first active LSN point. The definition of |
| * firstActiveLsn is that all log entries for active transactions |
| * are equal to or after that LSN. This is the starting point for |
| * replaying LNs during recovery and will be stored in the CkptEnd |
| * entry. |
| * |
| * Use the checkpointStart as the firstActiveLsn if firstActiveLsn |
| * is null, meaning that no txns are active. |
| * |
| * The current value must be retrieved from TxnManager after |
| * logging CkptStart. If it were instead retrieved before logging |
| * CkptStart, the following failure could occur. [#20270] |
| * |
| * ... getFirstActiveLsn returns NULL_LSN, will use 200 CkptStart |
| * 100 LN-A in Txn-1 |
| * 200 CkptStart |
| * 300 BIN-B refers to 100 LN-A |
| * 400 CkptEnd |
| * ... Crash and recover. Recovery does not undo 100 LN-A. |
| * ... Txn-1 is uncommitted, yet 100 LN-A takes effect. |
| */ |
| long firstActiveLsn = envImpl.getTxnManager().getFirstActiveLsn(); |
| if (firstActiveLsn == DbLsn.NULL_LSN) { |
| firstActiveLsn = checkpointStart; |
| } |
| |
| /* |
| * In a replicated system, the checkpointer will be flushing out |
| * the VLSNIndex, which is HA metadata. Check that the in-memory |
| * version encompasses all metadata up to the point of the |
| * CheckpointStart record. This is no-op for non-replicated |
| * systems. [#19754] |
| */ |
| envImpl.awaitVLSNConsistency(); |
| |
| /* Find the set of dirty INs that must be logged. */ |
| checkpointDirtyMap.selectDirtyINsForCheckpoint(); |
| |
| /* Call hook after dirty map creation and before flushing. */ |
| TestHookExecute.doHookIfSet(beforeFlushHook); |
| |
| /* Flush IN nodes. */ |
| flushDirtyNodes( |
| envImpl, checkpointDirtyMap, checkpointStart, highPriority, |
| flushStats); |
| |
| if (DirtyINMap.DIRTY_SET_DEBUG_TRACE) { |
| LoggerUtils.logMsg( |
| envImpl.getLogger(), envImpl, Level.INFO, |
| "Ckpt flushed" + |
| " nFullINFlushThisRun = " + |
| flushStats.nFullINFlushThisRun + |
| " nFullBINFlushThisRun = " + |
| flushStats.nFullBINFlushThisRun + |
| " nDeltaINFlushThisRun = " + |
| flushStats.nDeltaINFlushThisRun); |
| |
| } |
| |
| /* |
| * Flush MapLNs if not already done by flushDirtyNodes. Only flush |
| * a database if it has not already been flushed since checkpoint |
| * start. Lastly, flush the DB mapping tree root. |
| */ |
| checkpointDirtyMap.flushMapLNs(checkpointStart); |
| checkpointDirtyMap.flushRoot(checkpointStart); |
| |
| /* |
| * Flush replication information if necessary so that the VLSNIndex |
| * cache is flushed and is recoverable. |
| */ |
| envImpl.preCheckpointEndFlush(); |
| |
| /* |
| * Flush utilization info AFTER flushing IN nodes to reduce the |
| * inaccuracies caused by the sequence FileSummaryLN-LN-BIN. This |
| * also reduces the chance of lost IN obsolete info when there is |
| * a crash after logging the non-provisional INs but before |
| * logging the FileSummaryLNs. |
| */ |
| envImpl.getUtilizationProfile().flushFileUtilization |
| (envImpl.getUtilizationTracker().getTrackedFiles()); |
| |
| final DbTree dbTree = envImpl.getDbTree(); |
| final boolean willDeleteFiles = !cleanerState.isEmpty(); |
| |
| final CheckpointEnd ckptEnd = new CheckpointEnd( |
| invokingSource, checkpointStart, envImpl.getRootLsn(), |
| firstActiveLsn, |
| envImpl.getNodeSequence().getLastLocalNodeId(), |
| envImpl.getNodeSequence().getLastReplicatedNodeId(), |
| dbTree.getLastLocalDbId(), dbTree.getLastReplicatedDbId(), |
| envImpl.getTxnManager().getLastLocalTxnId(), |
| envImpl.getTxnManager().getLastReplicatedTxnId(), |
| envImpl.getExtinctionScanner().getLastLocalId(), |
| envImpl.getExtinctionScanner().getLastReplicatedId(), |
| checkpointId, willDeleteFiles); |
| |
| final SingleItemEntry<CheckpointEnd> endEntry = |
| SingleItemEntry.create(LogEntryType.LOG_CKPT_END, ckptEnd); |
| |
| /* |
| * Log checkpoint end and update state kept about the last |
| * checkpoint location. Send a trace message *before* the |
| * checkpoint end log entry. This is done so that the normal trace |
| * message doesn't affect the time-based isRunnable() calculation, |
| * which only issues a checkpoint if a log record has been written |
| * since the last checkpoint. |
| */ |
| trace(envImpl, invokingSource, true); |
| traced = true; |
| |
| lastCheckpointInterval = DbLsn.getNoCleaningDistance( |
| checkpointStart, lastCheckpointStart, logFileMax); |
| |
| /* |
| * We must flush and fsync to ensure that cleaned files are not |
| * referenced. This also ensures that this checkpoint is not wasted |
| * if we crash. |
| */ |
| lastCheckpointEnd = logManager.logForceFlush( |
| endEntry, true /*fsyncRequired*/, |
| ReplicationContext.NO_REPLICATE); |
| |
| lastCheckpointStart = checkpointStart; |
| lastCheckpointFirstActiveLsn = firstActiveLsn; |
| |
| success = true; |
| cleaner.updateFilesAtCheckpointEnd(cleanerState); |
| extinctionScanner.deleteCompletedRecordScans(completedScans); |
| |
| } catch (DiskLimitException e) { |
| |
| LoggerUtils.logMsg( |
| envImpl.getLogger(), envImpl, Level.WARNING, |
| "Ckpt id=" + checkpointId + " success=" + success + |
| " aborted because of disk limit violation: " + e); |
| |
| if (!invokedFromDaemon) { |
| throw e; |
| } |
| |
| } catch (DatabaseException e) { |
| LoggerUtils.traceAndLogException(envImpl, "Checkpointer", |
| "doCheckpoint", "checkpointId=" + |
| checkpointId, e); |
| throw e; |
| } finally { |
| |
| /* |
| * Reset the checkpoint state so evictor activity knows there's no |
| * further requirement for provisional logging. SR 11163. |
| */ |
| checkpointDirtyMap.reset(); |
| |
| if (!traced) { |
| trace(envImpl, invokingSource, success); |
| } |
| } |
| } |
| |
| private void trace(EnvironmentImpl envImpl, |
| String invokingSource, |
| boolean success ) { |
| |
| final StringBuilder sb = new StringBuilder(); |
| sb.append("Checkpoint ").append(checkpointId); |
| sb.append(": source=" ).append(invokingSource); |
| sb.append(" success=").append(success); |
| sb.append(" nFullINFlushThisRun="); |
| sb.append(flushStats.nFullINFlushThisRun); |
| sb.append(" nDeltaINFlushThisRun="); |
| sb.append(flushStats.nDeltaINFlushThisRun); |
| LoggerUtils.logMsg(logger, envImpl, Level.CONFIG, sb.toString()); |
| } |
| |
| /** |
| * Flush a given database to disk. Like checkpoint, log from the bottom |
| * up so that parents properly represent their children. |
| */ |
| public void syncDatabase(EnvironmentImpl envImpl, |
| DatabaseImpl dbImpl, |
| boolean flushLog) { |
| if (envImpl.isReadOnly()) { |
| return; |
| } |
| |
| envImpl.checkDiskLimitViolation(); |
| |
| final DirtyINMap dirtyMap = new DirtyINMap(envImpl); |
| final FlushStats fstats = new FlushStats(); |
| |
| try { |
| /* Find the dirty set. */ |
| dirtyMap.selectDirtyINsForDbSync(dbImpl); |
| |
| if (dirtyMap.getNumEntries() > 0) { |
| /* Write all dirtyINs out.*/ |
| flushDirtyNodes( |
| envImpl, dirtyMap, DbLsn.NULL_LSN /*ckptStart*/, |
| false /*highPriority*/, fstats); |
| |
| /* Make changes durable. [#15254] */ |
| if (flushLog) { |
| envImpl.getLogManager().flushSync(); |
| } |
| } |
| } catch (DiskLimitException e) { |
| throw e; |
| } catch (DatabaseException e) { |
| LoggerUtils.traceAndLogException |
| (envImpl, "Checkpointer", "syncDatabase", |
| "of " + dbImpl.getName(), e); |
| throw e; |
| } finally { |
| dirtyMap.reset(); |
| } |
| } |
| |
    /*
     * For unit testing only. Sets the hook fired (via an assert in
     * flushIN) when a target at or above the max flush level is flushed.
     */
    public static void setMaxFlushLevelHook(TestHook<?> hook) {
        maxFlushLevelHook = hook;
    }
| |
    /*
     * For unit testing only. Sets the hook fired in doCheckpoint after the
     * dirty map is created and before dirty nodes are flushed.
     */
    public static void setBeforeFlushHook(TestHook<?> hook) {
        beforeFlushHook = hook;
    }
| |
| /** |
| * Flush the nodes in order, from the lowest level to highest level. As a |
| * flush dirties its parent, add it to the dirty map, thereby cascading the |
| * writes up the tree. If flushAll wasn't specified, we need only cascade |
| * up to the highest level set at the start of checkpointing. |
| * |
| * Note that all but the top level INs are logged provisionally. That's |
| * because we don't need to process lower INs during recovery because the |
| * higher INs will end up pointing at them. |
| */ |
| private static void flushDirtyNodes(EnvironmentImpl envImpl, |
| DirtyINMap dirtyMap, |
| long checkpointStart, |
| boolean highPriority, |
| FlushStats fstats) { |
| |
| final DbTree dbTree = envImpl.getDbTree(); |
| final Map<DatabaseId, DatabaseImpl> dbCache = new HashMap<>(); |
| |
| try { |
| while (dirtyMap.getNumLevels() > 0) { |
| |
| /* |
| * Work on one level's worth of nodes in ascending level order. |
| */ |
| final Integer currentLevel = dirtyMap.getLowestLevelSet(); |
| final int currentLevelVal = currentLevel; |
| |
| /* |
| * Flush MapLNs just prior to flushing the first level of the |
| * mapping tree. Only flush a database if it has not already |
| * been flushed since checkpoint start. |
| */ |
| if (currentLevelVal == IN.DBMAP_LEVEL) { |
| dirtyMap.flushMapLNs(checkpointStart); |
| } |
| |
| /* Flush the nodes at the current level. */ |
| while (true) { |
| final CheckpointReference targetRef = |
| dirtyMap.removeNextNode(currentLevel); |
| |
| if (targetRef == null) { |
| break; |
| } |
| |
| envImpl.checkDiskLimitViolation(); |
| |
| /* |
| * Check to make sure the DB was not deleted after putting |
| * it in the dirty map, and prevent the DB from being |
| * deleted while we're working with it. |
| */ |
| final DatabaseImpl db = dbTree.getDb( |
| targetRef.dbId, -1 /*lockTimeout*/, dbCache); |
| |
| if (db != null) { |
| |
| /* Flush if we're below maxFlushLevel. */ |
| final int maxFlushLevel = |
| dirtyMap.getHighestFlushLevel(db); |
| |
| if (currentLevelVal <= maxFlushLevel) { |
| |
| flushIN( |
| db, targetRef, dirtyMap, maxFlushLevel, |
| highPriority, fstats, true /*allowLogSubtree*/); |
| |
| /* |
| * Sleep if background read/write limit was |
| * exceeded. |
| */ |
| envImpl.sleepAfterBackgroundIO(); |
| } |
| } |
| |
| /* |
| * If the environment was invalidated by other activity, |
| * get out of this loop, and re-throw the invalidating |
| * exception to indicate that the checkpoint did not |
| * succeed. |
| */ |
| envImpl.checkIfInvalid(); |
| } |
| |
| /* We're done with this level. */ |
| dirtyMap.removeLevel(currentLevel); |
| } |
| } finally { |
| dbTree.releaseDbs(dbCache); |
| } |
| |
| /* |
| * Do not flush FileSummaryLNs/MapLNs (do not call |
| * UtilizationProfile.flushLocalTracker) here because that flushing is |
| * already done by the checkpoint. |
| */ |
| } |
| |
| /** |
| * Flush the target IN. |
| * |
| * Where applicable, also attempt to flush the subtree that houses this |
| * target, which means we flush the siblings of this target to promote |
| * better cleaning throughput. The problem lies in the fact that |
| * provisionally logged nodes are not available for log cleaning until |
| * their parent is logged non-provisionally. On the other hand, we want to |
| * log nodes in provisional mode as much as possible, both for recovery |
| * performance, and for correctness to avoid fetches against cleaned log |
| * files. (See [#16037].) These conflicting goals are reconciled by |
| * flushing nodes in subtree grouping, because writing the non-provisional |
| * parent of a set of provisionally written nodes frees the cleaner to work |
| * on that set of provisional nodes as soon as possible. For example, if a |
| * tree consists of: |
| * |
| * INa |
| * +------+-------+ |
| * INb INc |
| * +-----+----+ +-----+ |
| * BINd BINe BINf BINg BINh |
| * |
| * It is more efficient for cleaning throughput to log in this order: |
| * BINd, BINe, BINf, INb, BINg, BINh, INc, INa |
| * rather than: |
| * BINd, BINe, BINf, BINg, BINh, INb, INc, INa |
| * |
| * Suppose the subtree in question is INb->{BINd, BINe, BINf} |
| * |
| * Suppose we see BINd in the dirty map first, before BINe and BINf. |
| * - flushIN(BINd) is called |
| * - we fetch and latch its parent, INb |
| * |
| * If this is a high priority checkpoint, we'll hold the INb latch across |
| * the time it takes to flush all three children. In flushIN(BINd), we |
| * walk through INb, create a local map of all the siblings that can be |
| * found in the dirty map, and then call logSiblings with that local map. |
| * Then we'll write out INb. |
| * |
| * If high priority is false, we will not hold the INb latch across |
| * multiple IOs. Instead, we |
| * - write BINd out, using logSiblings |
| * - while still holding the INb latch, we create a list of dirty siblings |
| * - release the INb latch |
| * - call flushIN() recursively on each entry in the local sibling map, |
| * which will result in a search and write of each sibling. These |
| * recursive calls to flushIN are called with the allowLogSubtree |
| * parameter of false to halt the recursion and prevent a repeat of the |
| * sibling examination. |
| * - write INb |
| */ |
    private static void flushIN(final DatabaseImpl db,
                                final CheckpointReference targetRef,
                                final DirtyINMap dirtyMap,
                                final int maxFlushLevel,
                                final boolean highPriority,
                                final FlushStats fstats,
                                final boolean allowLogSubtree) {

        final EnvironmentImpl envImpl = db.getEnv();
        final Tree tree = db.getTree();
        final int targetLevel = targetRef.nodeLevel;

        /* Call test hook when we reach the max level. */
        assert (targetLevel < maxFlushLevel) ||
            TestHookExecute.doHookIfSet(maxFlushLevelHook);

        /* Roots are flushed under the root latch, via RootFlusher. */
        if (targetRef.isRoot) {

            final RootFlusher flusher =
                new RootFlusher(db, targetRef.nodeId);

            tree.withRootLatchedExclusive(flusher);

            /*
             * Update the tree's owner, whether it's the env root or the
             * db-mapping tree.
             */
            if (flusher.getFlushed()) {
                DbTree dbTree = envImpl.getDbTree();
                dbTree.modifyDbRoot(db);
                fstats.nFullINFlushThisRun++;
                fstats.nFullINFlush++;
            }

            /*
             * If this target isn't the root anymore, we'll have to handle it
             * like a regular node.
             */
            if (flusher.stillRoot()) {
                return;
            }
        }

        /*
         * The following applies to two cases:
         * (1) the target was not ever the root
         * (2) the target was the root, when the checkpoint dirty set was
         * assembled but is not the root now.
         */
        final SearchResult result = tree.getParentINForChildIN(
            -1 /*nodeId*/, targetRef.treeKey,
            targetRef.nodeLevel /*targetLevel*/,
            targetRef.nodeLevel + 1 /*exclusiveLevel*/,
            false /*requireExactMatch*/, false /*doFetch*/,
            CacheMode.UNCHANGED, null /*trackingList*/);

        /*
         * If no possible parent is found, the compressor may have deleted
         * this item before we got to processing it. (Although it seems this
         * cannot currently happen since we never delete the root node.)
         */
        if (result.parent == null) {
            return;
        }

        final IN parent = result.parent;
        final int index = result.index;
        final int parentLevel = parent.getLevel();
        final CheckpointReference parentRef;

        /* List of siblings to log after releasing the parent latch. */
        final List<CheckpointReference> logSiblingsSeparately;

        /* The parent is latched here; the finally below releases it. */
        try {
            /*
             * If bottomLevelTarget is true, the parent IN contains bottom
             * level BINs. The masking is used to normalize the level for
             * ordinary DBs and the mapping tree DB.
             */
            final boolean bottomLevelTarget =
                ((parentLevel & IN.LEVEL_MASK) == 2);

            /*
             * INs at the max flush level are always non-provisional and
             * INs at the bottom level (when this is not also the max flush
             * level) are always provisional. In between INs are
             * provisional BEFORE_CKPT_END (see Provisional).
             */
            final Provisional provisional;
            if (targetLevel >= maxFlushLevel) {
                provisional = Provisional.NO;
            } else if (bottomLevelTarget) {
                provisional = Provisional.YES;
            } else {
                provisional = Provisional.BEFORE_CKPT_END;
            }

            /*
             * If we didn't reach the target level, a child wasn't resident
             * and there is nothing to log at this level. To be on the safe
             * side, we'll put the parent into the dirty set to be logged when
             * that level is processed.
             *
             * Only do this if the parent we found is at a higher level than
             * the child. This ensures that the non-exact search does not
             * find a sibling rather than a parent. [#11555]
             */
            if (!result.exactParentFound) {
                if (parentLevel > targetLevel) {
                    dirtyMap.addIN(
                        parent, -1 /*index*/,
                        false /*updateFlushLevels*/,
                        true /*updateMemoryBudget*/);
                }
                return;
            }

            /*
             * We found the parent. Add it unconditionally to the dirty map. We
             * must make sure that every IN that was selected for the
             * checkpointer's dirty IN set at the beginning of checkpoint is
             * written into the log and can be properly accessed from
             * ancestors. Eviction or a split may have written out a member of
             * this dirty set before the checkpointer got to it. See [#10249].
             */

            /*
             * The search above was bounded by exclusiveLevel =
             * targetLevel + 1, so an exact parent must sit exactly one
             * level above the target.
             */
            assert parentLevel == targetLevel + 1;

            dirtyMap.addIN(
                parent, -1 /*index*/,
                false /*updateFlushLevels*/,
                true /*updateMemoryBudget*/);

            /*
             * Determine whether our search found the IN identified by either
             * targetRef.nodeId or targetRef.lsn. If there is not a match, then
             * the node was deleted, logged or split since creating the
             * reference.
             *
             * For a non-DW DB, targetRef.lsn will be not null and we match on
             * it. If the LSN has changed then of course the node was logged,
             * and possibly split, and we will not log this target here.
             *
             * For a DW DB we also match on LSN if it is non-null. If the LSN
             * is null then the reference was created for a never-logged IN and
             * targetRef.nodeId >= 0. In that case we match on the nodeId. If
             * the LSN or nodeId doesn't match, there must have been a split,
             * and we will not log this target here. However, because splits
             * are not logged for DW, this is not sufficient to cause both
             * siblings that were part of split to be logged, when one node was
             * added to the dirty map. We account for this when the parent is
             * logged by calling logDirtyChildren. This approach relies on the
             * fact that a split will dirty the parent.
             *
             * TODO:
             * Why not always call logDirtyIN for a DW IN, whether or not the
             * LSN or nodeId matches? logDirtyChildren is going to log it
             * anyway if it is dirty.
             */
            if (targetRef.lsn != DbLsn.NULL_LSN) {

                if (targetRef.lsn != parent.getLsn(index)) {
                    return;
                }
            } else {
                assert targetRef.nodeId >= 0;
                assert db.isDeferredWriteMode();

                final IN target = (IN) parent.getTarget(index);

                if (target == null ||
                    targetRef.nodeId != target.getNodeId()) {
                    return;
                }
            }

            /* Log the target, if dirty. */
            logDirtyIN(envImpl, parent, index, provisional, fstats);

            /*
             * We will log a sub-tree when the target is at the bottom level
             * and this is not a recursive call to flushIN during sub-tree
             * logging. Return if we are only logging the target node here.
             */
            if (!bottomLevelTarget || !allowLogSubtree) {
                return;
            }

            /*
             * Log sub-tree siblings with the latch held when highPriority
             * is configured and this is not a DW DB. For a DW DB, dirty LNs
             * are logged for each BIN. If we were to log a DW sub-tree with
             * the parent latch held, the amount of logging may cause the latch
             * to be held for too long a period.
             */
            if (highPriority && !db.isDurableDeferredWrite()) {
                logSiblingsSeparately = null;
            } else {
                logSiblingsSeparately = new ArrayList<>();
            }

            /*
             * Scan the parent's other slots for siblings that are present
             * in the dirty map; per the choice above, either collect them
             * for logging after the latch is released, or log them now.
             */
            for (int i = 0; i < parent.getNEntries(); i += 1) {

                if (i == index) {
                    continue;
                }

                final IN child = (IN) parent.getTarget(i);
                final long childId = (child != null) ? child.getNodeId() : -1;
                final long childLsn = parent.getLsn(i);

                final CheckpointReference childRef =
                    dirtyMap.removeNode(targetLevel, childLsn, childId);

                if (childRef == null) {
                    continue;
                }

                if (logSiblingsSeparately != null) {
                    logSiblingsSeparately.add(childRef);
                } else {
                    logDirtyIN(envImpl, parent, i, provisional, fstats);
                }
            }

            /* Get parentRef before releasing the latch. */
            if (parentLevel <= maxFlushLevel) {
                parentRef = dirtyMap.removeNode(
                    parentLevel, parent.getLastLoggedLsn(),
                    parent.getNodeId());
            } else {
                parentRef = null;
            }
        } finally {
            parent.releaseLatch();
        }

        /*
         * If highPriority is false, we don't hold the latch while logging
         * the bottom level siblings. We log them here with flushIN,
         * performing a separate search for each one, after releasing the
         * parent latch above.
         */
        if (logSiblingsSeparately != null) {
            for (final CheckpointReference childRef : logSiblingsSeparately) {
                flushIN(
                    db, childRef, dirtyMap, maxFlushLevel, highPriority,
                    fstats, false /*allowLogSubtree*/);
            }
        }

        /*
         * Log the sub-tree parent, which will be logged non-provisionally,
         * in order to update cleaner utilization. This must be done with
         * flushIN after releasing the parent latch above, since we must search
         * and acquire the grandparent latch.
         */
        if (parentRef != null) {
            flushIN(
                db, parentRef, dirtyMap, maxFlushLevel, highPriority, fstats,
                false /*allowLogSubtree*/);
        }
    }
| |
| /** |
| * Note that if this method is called, the parent must also be logged. This |
| * is true even if this method finds that the child is not dirty. In that |
| * case the child has already been flushed (e.g., by eviction) and the |
| * parent must be logged according to the rule for max flush level. |
| */ |
    private static void logDirtyIN(
        final EnvironmentImpl envImpl,
        final IN parent,
        final int index,
        final Provisional provisional,
        final FlushStats fstats) {

        /*
         * Callers in this file (flushIN) hold the parent latch across this
         * call, which also protects the parent.updateEntry below.
         */
        final IN child = (IN) parent.getTarget(index);
        final long newLsn;
        final boolean isBIN;
        final boolean isDelta;

        if (child != null) {
            /* Child is resident in the main cache: latch it, then log. */
            child.latch(CacheMode.UNCHANGED);
            try {
                if (!child.getDirty()) {
                    /* Already flushed (e.g., by eviction); nothing to do. */
                    return;
                }

                if (child.getDatabase().isDurableDeferredWrite()) {

                    /*
                     * Find dirty descendants to avoid logging nodes with
                     * never-logged children. See [#13936] and
                     * IN.logDirtyChildren for description of the case.
                     *
                     * Note that we must log both dirty and never-logged
                     * descendants to be sure to have a consistent view of
                     * the split. If we didn't, we could end up with the
                     * post-split version of a new sibling and the
                     * pre-split version of an split sibling in the log,
                     * which could result in a recovery where descendants
                     * are incorrectly duplicated, because they are in both
                     * the pre-split split sibling, and the post-split
                     * version of the new sibling.
                     */
                    child.logDirtyChildren();
                }

                newLsn = child.log(
                    true /*allowDeltas*/, provisional,
                    true /*backgroundIO*/, parent);

                assert (newLsn != DbLsn.NULL_LSN);

                isBIN = child.isBIN();
                isDelta = (newLsn == child.getLastDeltaLsn());
            } finally {
                child.releaseLatch();
            }
        } else {
            /*
             * Child is not resident in the main cache; ask the off-heap
             * cache to materialize a BIN log entry for this slot, if one
             * is needed.
             */
            final OffHeapCache ohCache = envImpl.getOffHeapCache();

            final INLogEntry<BIN> logEntry =
                ohCache.createBINLogEntryForCheckpoint(parent, index);

            if (logEntry == null) {
                /* The off-heap cache has nothing to flush for this slot. */
                return;
            }

            isBIN = true;
            isDelta = logEntry.isBINDelta();

            newLsn = IN.logEntry(
                logEntry, provisional, true /*backgroundIO*/, parent);

            ohCache.postBINLog(parent, index, logEntry, newLsn);
        }

        /* Record the child's new location in the parent's slot. */
        parent.updateEntry(index, newLsn, VLSN.NULL_VLSN_SEQUENCE, 0);

        /* Maintain both cumulative and per-run flush statistics. */
        if (isDelta) {
            fstats.nDeltaINFlushThisRun++;
            fstats.nDeltaINFlush++;
        } else {
            fstats.nFullINFlushThisRun++;
            fstats.nFullINFlush++;
            if (isBIN) {
                fstats.nFullBINFlush++;
                fstats.nFullBINFlushThisRun++;
            }
        }
    }
| |
| /* |
| * RootFlusher lets us write out the root IN within the root latch. |
| */ |
| private static class RootFlusher implements WithRootLatched { |
| private final DatabaseImpl db; |
| private boolean flushed; |
| private boolean stillRoot; |
| private final long targetNodeId; |
| |
| RootFlusher(final DatabaseImpl db, |
| final long targetNodeId) { |
| this.db = db; |
| flushed = false; |
| this.targetNodeId = targetNodeId; |
| stillRoot = false; |
| } |
| |
| /** |
| * Flush the rootIN if dirty. |
| */ |
| @Override |
| public IN doWork(ChildReference root) { |
| |
| if (root == null) { |
| return null; |
| } |
| |
| IN rootIN = (IN) root.fetchTarget(db, null); |
| rootIN.latch(CacheMode.UNCHANGED); |
| try { |
| if (rootIN.getNodeId() == targetNodeId) { |
| |
| /* |
| * Find dirty descendants to avoid logging nodes with |
| * never-logged children. See [#13936] |
| */ |
| if (rootIN.getDatabase().isDurableDeferredWrite()) { |
| rootIN.logDirtyChildren(); |
| } |
| |
| /* |
| * stillRoot handles the situation where the root was split |
| * after it was placed in the checkpointer's dirty set. |
| */ |
| stillRoot = true; |
| |
| if (rootIN.getDirty()) { |
| long newLsn = rootIN.log(); |
| root.setLsn(newLsn); |
| flushed = true; |
| } |
| } |
| } finally { |
| rootIN.releaseLatch(); |
| } |
| return null; |
| } |
| |
| boolean getFlushed() { |
| return flushed; |
| } |
| |
| boolean stillRoot() { |
| return stillRoot; |
| } |
| } |
| |
| /* |
| * CheckpointReferences are used to identify nodes that must be flushed as |
| * part of the checkpoint. We don't keep an actual reference to the node |
| * because that prevents nodes from being GC'ed during checkpoint. |
| * |
| * Using a checkpointReference introduces a window between the point when |
| * the checkpoint dirty set is created and when the node is flushed. Some |
| * of the fields saved in the reference are immutable: db, nodeId. The |
| * others are not and we have to handle potential change: |
| * |
| * isRoot: it's possible for isRoot to go from true->false, but not |
| * false->true. True->false is handled by the flushIN method |
| * by finding the root and checking if it is the target. |
| * treeKey: This can change only in the event of a split. If it does, there |
| * is the chance that the checkpointer will find the wrong node to |
| * flush, but that's okay because the split guarantees flushing to |
| * the root, so the target will be properly logged within the |
| * checkpoint period. |
| * |
| * The class and ctor are public for the Sizeof program. |
| */ |
| public static class CheckpointReference { |
| final DatabaseId dbId; |
| final long nodeId; |
| final int nodeLevel; |
| final boolean isRoot; |
| final byte[] treeKey; |
| final long lsn; |
| |
| CheckpointReference(final DatabaseId dbId, |
| final long nodeId, |
| final int nodeLevel, |
| final boolean isRoot, |
| final byte[] treeKey, |
| final long lsn) { |
| this.dbId = dbId; |
| this.nodeId = nodeId; |
| this.nodeLevel = nodeLevel; |
| this.isRoot = isRoot; |
| this.treeKey = treeKey; |
| this.lsn = lsn; |
| } |
| |
| @Override |
| public boolean equals(Object o) { |
| if (!(o instanceof CheckpointReference)) { |
| return false; |
| } |
| |
| CheckpointReference other = (CheckpointReference) o; |
| return nodeId == other.nodeId; |
| } |
| |
| @Override |
| public int hashCode() { |
| return (int) nodeId; |
| } |
| |
| @Override |
| public String toString() { |
| StringBuilder sb = new StringBuilder(); |
| sb.append("db=").append(dbId); |
| sb.append(" nodeId=").append(nodeId); |
| return sb.toString(); |
| } |
| } |
| |
| /** |
| * A struct to hold log flushing stats for checkpoint and database sync. |
| */ |
| public static class FlushStats { |
| |
| public long nFullINFlush; |
| public long nFullBINFlush; |
| public long nDeltaINFlush; |
| long nFullINFlushThisRun; |
| long nFullBINFlushThisRun; |
| long nDeltaINFlushThisRun; |
| |
| /* For future addition to stats: |
| private int nAlreadyEvictedThisRun; |
| */ |
| |
| /* Reset per-run counters. */ |
| void resetPerRunCounters() { |
| nFullINFlushThisRun = 0; |
| nFullBINFlushThisRun = 0; |
| nDeltaINFlushThisRun = 0; |
| /* nAlreadyEvictedThisRun = 0; -- for future */ |
| } |
| } |
| } |