blob: 695c1c68d63e462430dbb68c23b606a2d1c90495 [file] [log] [blame]
/*-
* Copyright (C) 2002, 2018, Oracle and/or its affiliates. All rights reserved.
*
* This file was distributed by Oracle as part of a version of Oracle Berkeley
* DB Java Edition made available at:
*
* http://www.oracle.com/technetwork/database/database-technologies/berkeleydb/downloads/index.html
*
* Please see the LICENSE file included in the top-level directory of the
* appropriate version of Oracle Berkeley DB Java Edition for a copy of the
* license and additional information.
*/
package com.sleepycat.je.tree.dupConvert;
import java.util.ArrayList;
import com.sleepycat.je.CacheMode;
import com.sleepycat.je.EnvironmentFailureException;
import com.sleepycat.je.PreloadConfig;
import com.sleepycat.je.cleaner.LocalUtilizationTracker;
import com.sleepycat.je.config.EnvironmentParams;
import com.sleepycat.je.dbi.DatabaseId;
import com.sleepycat.je.dbi.DatabaseImpl;
import com.sleepycat.je.dbi.DbTree;
import com.sleepycat.je.dbi.DupKeyData;
import com.sleepycat.je.dbi.EnvironmentImpl;
import com.sleepycat.je.log.LogEntryType;
import com.sleepycat.je.tree.BIN;
import com.sleepycat.je.tree.ChildReference;
import com.sleepycat.je.tree.IN;
import com.sleepycat.je.tree.Key;
import com.sleepycat.je.tree.LN;
import com.sleepycat.je.tree.Node;
import com.sleepycat.je.txn.BasicLocker;
import com.sleepycat.je.txn.LockGrantType;
import com.sleepycat.je.txn.LockResult;
import com.sleepycat.je.txn.LockType;
import com.sleepycat.je.utilint.DbLsn;
/**
* Performs post-recovery conversion of all dup DBs during Environment
* construction, when upgrading from JE 4.1 and earlier. In JE 5.0, duplicates
* are represented by a two-part (key + data) key, and empty data. In JE 4.1
* and earlier, the key and data were separate as with non-dup DBs.
*
* Uses the DbTree.DUPS_CONVERTED_BIT to determine whether conversion of the
* environment is necessary. When all databases are successfully converted,
* this bit is set and the mapping tree is flushed. See
* EnvironmentImpl.convertDupDatabases.
*
* Uses DatabaseImpl.DUPS_CONVERTED to determine whether an individual database
* has been converted, to handle the case where the conversion crashes and is
* restarted later. When a database is successfully converted, this bit is set
* and the entire database is flushed using Database.sync.
*
* The conversion of each database is atomic -- either all INs or none are
* converted and made durable. This is accomplished by putting the database
* into Deferred Write mode so that splits won't log and eviction will be
* provisional (eviction will not flush the root IN if it is dirty). The
* Deferred Write mode is cleared after conversion is complete and
* Database.sync has been called.
*
* The memory budget is updated during conversion and daemon eviction is
* invoked periodically. This provides support for arbitrarily large DBs.
*
* Uses preload to load all dup trees (DINs/DBINs) prior to conversion, to
* minimize random I/O. See EnvironmentConfig.ENV_DUP_CONVERT_PRELOAD_ALL.
*
* The preload config does not specify loading of LNs, because we do not need
* to load LNs from DBINs. The fact that DBIN LNs are not loaded is the main
* reason that conversion is quick. LNs are converted lazily instead; see
* LNLogEntry.postFetchInit. The DBIN LNs do not need to be loaded because the
* DBIN slot key contains the LN 'data' that is needed to create the two-part
* key.
*
* Even when LN loading is not configured, it turns out that preload does load
* BIN (not DBIN) LNs in a dup DB, which is what we want. The singleton LNs
* must be loaded in order to get the LN data to create the two-part key. When
* preload has not loaded a singleton LN, it will be fetched during conversion.
*
* The DIN, DBIN and DupCount LSN are counted obsolete during conversion using
* a local utilization tracker. The tracker must not be flushed until the
* conversion of a database is complete. Inexact counting can be used, because
* DIN/DBIN/DupCountLN entries are automatically considered obsolete by the
* cleaner. Since only totals are tracked, the memory overhead of the local
* tracker is not substantial.
*
* Database Conversion Algorithm
* -----------------------------
* 1. Set Deferred Write mode for the database. Preload the database, including
* INs/BINs/DINs/DBINs, but not LNs except for singleton LNs (LNs with a BIN
* parent).
*
* 2. Convert all IN/BIN keys to "prefix keys", which are defined by the
* DupKeyData class. This allows tree searches and slot insertions to work
* correctly as the conversion is performed.
*
* 3. Traverse through the BIN slots in forward order.
*
* 4. If a singleton LN is encountered, ensure it is loaded. IN.fetchLN()
* automatically updates the slot key if the LNLogEntry's key is different
* from the one already in the slot. Because LNLogEntry's key is converted
* on the fly, a two-part key is set in the slot as a side effect of
* fetching the LN.
*
* 5. If a DIN is encountered, first delete the BIN slot containing the DIN.
* Then iterate through all LNs in the DBINs of this dup tree, assign each
* a two-part key, and insert the slot into a BIN. The LSN and state flags
* of the DBIN slot are copied to the new BIN slot.
*
* 6. If a deleted singleton (BIN) LN is encountered, delete the slot rather
* than converting the key. If a deleted DBIN LN is encountered, simply
* discard it.
*
* 7. Count the DIN and DupCount LSN obsolete for each DIN encountered, using
* a local utilization tracker.
*
* 8. When all BIN slots have been processed, set the
* DatabaseImpl.DUPS_CONVERTED flag, call Database.sync to flush all INs and
* the MapLN, clear Deferred Write mode, and flush the local utilization
* tracker.
*/
public class DupConvert {
private final static boolean DEBUG = false;
private final EnvironmentImpl envImpl;
private final DbTree dbTree;
private final boolean preloadAll;
private final PreloadConfig preloadConfig;
private LocalUtilizationTracker localTracker;
private long nConverted; // for debugging
/* Current working tree position. */
private BIN bin;
private int index;
/**
* Creates a conversion object.
*/
public DupConvert(final EnvironmentImpl envImpl, final DbTree dbTree) {
this.envImpl = envImpl;
this.dbTree = dbTree;
this.preloadAll = envImpl.getConfigManager().getBoolean
(EnvironmentParams.ENV_DUP_CONVERT_PRELOAD_ALL);
this.preloadConfig = (envImpl.getDupConvertPreloadConfig() != null) ?
envImpl.getDupConvertPreloadConfig() : (new PreloadConfig());
}
/**
* Converts all dup DBs that need conversion.
*/
public void convertDatabases() {
if (DEBUG) {
System.out.println("DupConvert.convertDatabases");
}
if (preloadAll) {
preloadAllDatabases();
}
for (DatabaseId dbId : dbTree.getDbNamesAndIds().keySet()) {
final DatabaseImpl dbImpl = dbTree.getDb(dbId);
try {
if (!needsConversion(dbImpl)) {
continue;
}
convertDatabase(dbImpl);
} finally {
dbTree.releaseDb(dbImpl);
}
}
assert noDupNodesPresent();
}
private boolean noDupNodesPresent() {
for (IN in : envImpl.getInMemoryINs()) {
if (in instanceof DIN || in instanceof DBIN) {
System.out.println(in.toString());
return false;
}
}
return true;
}
/**
* Preload all dup DBs to be converted.
*/
private void preloadAllDatabases() {
final ArrayList<DatabaseImpl> dbsToConvert =
new ArrayList<DatabaseImpl>();
try {
for (DatabaseId dbId : dbTree.getDbNamesAndIds().keySet()) {
final DatabaseImpl dbImpl = dbTree.getDb(dbId);
boolean releaseDbImpl = true;
try {
if (!needsConversion(dbImpl)) {
continue;
}
dbsToConvert.add(dbImpl);
releaseDbImpl = false;
} finally {
if (releaseDbImpl) {
dbTree.releaseDb(dbImpl);
}
}
}
if (dbsToConvert.size() == 0) {
return;
}
final DatabaseImpl[] dbArray =
new DatabaseImpl[dbsToConvert.size()];
dbsToConvert.toArray(dbArray);
envImpl.preload(dbArray, preloadConfig);
} finally {
for (DatabaseImpl dbImpl : dbsToConvert) {
dbTree.releaseDb(dbImpl);
}
}
}
/**
* Returns whether the given DB needs conversion.
*/
public static boolean needsConversion(final DatabaseImpl dbImpl) {
return (dbImpl.getSortedDuplicates() &&
!dbImpl.getDupsConverted() &&
!dbImpl.isDeleting());
}
/**
* Converts a single database.
*/
private void convertDatabase(final DatabaseImpl dbImpl) {
if (DEBUG) {
System.out.println("DupConvert.convertDatabase " +
dbImpl.getId());
}
final boolean saveDeferredWrite = dbImpl.isDurableDeferredWrite();
try {
localTracker = new LocalUtilizationTracker(envImpl);
dbImpl.setDeferredWrite(true);
dbImpl.setKeyPrefixing();
if (!preloadAll) {
dbImpl.preload(preloadConfig);
}
bin = dbImpl.getTree().getFirstNode(CacheMode.UNCHANGED);
if (bin == null) {
return;
}
index = -1;
while (getNextBinSlot()) {
convertBinSlot();
}
dbImpl.setDupsConverted();
dbImpl.sync(false /*flushLog*/);
envImpl.getUtilizationProfile().flushLocalTracker(localTracker);
} finally {
dbImpl.setDeferredWrite(saveDeferredWrite);
}
}
/**
* Advances the bin/index fields to the next BIN slot. When moving past
* the last BIN slot, the bin field is set to null and false is returned.
*
* Enter/leave with bin field latched.
*/
private boolean getNextBinSlot() {
index += 1;
if (index < bin.getNEntries()) {
return true;
}
/* Compact keys after finishing with a BIN. */
bin.compactMemory();
assert bin.verifyMemorySize();
/* Cannot evict between BINs here, because a latch is held. */
bin = bin.getDatabase().getTree().getNextBin(bin, CacheMode.UNCHANGED);
if (bin == null) {
return false;
}
index = 0;
return true;
}
/**
* Converts the bin/index slot, whether a singleton LN or a DIN root.
*
* Enter/leave with bin field latched, although bin field may change.
*
* When a singleton LN is converted, leaves with bin/index fields
* unchanged.
*
* When a dup tree is converted, leaves with bin/index fields set to last
* inserted slot. This is the slot of the highest key in the dup tree.
*/
private void convertBinSlot() {
if (DEBUG) {
System.out.println("DupConvert BIN LSN " +
DbLsn.getNoFormatString(bin.getLsn(index)) +
" index " + index +
" nEntries " + bin.getNEntries());
}
/* Delete slot if LN is deleted. */
if (isLNDeleted(bin, index)) {
deleteSlot();
return;
}
final Node node = bin.fetchLNOrDIN(index, CacheMode.DEFAULT);
if (!node.containsDuplicates()) {
if (DEBUG) {
System.out.println("DupConvert BIN LN " +
Key.dumpString(bin.getKey(index), 0));
}
/* Fetching a non-deleted LN updates the slot key; we're done. */
assert node instanceof LN;
nConverted += 1;
return;
}
/*
* Delete the slot containing the DIN before re-inserting the dup tree,
* so that the DIN slot key doesn't interfere with insertions.
*
* The DIN is evicted and memory usage is decremented. This is not
* exactly correct because we keep a local reference to the DIN until
* the dup tree is converted, but we tolerate this temporary
* inaccuracy.
*/
final byte[] binKey = bin.getKey(index);
final DIN din = (DIN) node;
deleteSlot();
convertDin(din, binKey);
}
/**
* Returns true if the LN at the given bin/index slot is permanently
* deleted. Returns false if it is not deleted, or if it is deleted but
* part of an unclosed, resurrected txn.
*
* Enter/leave with bin field latched.
*/
private boolean isLNDeleted(BIN checkBin, int checkIndex) {
if (!checkBin.isEntryKnownDeleted(checkIndex) &&
!checkBin.isEntryPendingDeleted(checkIndex)) {
/* Not deleted. */
return false;
}
final long lsn = checkBin.getLsn(checkIndex);
if (lsn == DbLsn.NULL_LSN) {
/* Can discard a NULL_LSN entry without locking. */
return true;
}
/* Lock LSN to guarantee deletedness. */
final BasicLocker lockingTxn = BasicLocker.createBasicLocker(envImpl);
/* Don't allow this short-lived lock to be preempted/stolen. */
lockingTxn.setPreemptable(false);
try {
final LockResult lockRet = lockingTxn.nonBlockingLock
(lsn, LockType.READ, false /*jumpAheadOfWaiters*/,
checkBin.getDatabase());
if (lockRet.getLockGrant() == LockGrantType.DENIED) {
/* Is locked by a resurrected txn. */
return false;
}
return true;
} finally {
lockingTxn.operationEnd();
}
}
/**
* Deletes the bin/index slot, assigned a new identifier key if needed.
*
* Enter/leave with bin field latched.
*/
private void deleteSlot() {
bin.deleteEntry(index);
if (index == 0 && bin.getNEntries() != 0) {
bin.setIdentifierKey(bin.getKey(0), true /*makeDirty*/);
}
index -= 1;
}
/**
* Converts the given DIN and its descendants.
*
* Enter/leave with bin field latched, although bin field will change to
* last inserted slot.
*/
private void convertDin(final DIN din, final byte[] binKey) {
din.latch();
try {
for (int i = 0; i < din.getNEntries(); i += 1) {
final IN child = din.fetchIN(i, CacheMode.DEFAULT);
assert(!child.isBINDelta(false));
if (child instanceof DBIN) {
final DBIN dbin = (DBIN) child;
dbin.latch();
try {
for (int j = 0; j < dbin.getNEntries(); j += 1) {
if (!isLNDeleted(dbin, j)) {
convertDbinSlot(dbin, j, binKey);
}
}
assert dbin.verifyMemorySize();
/* Count DBIN obsolete. */
if (dbin.getLastLoggedLsn() != DbLsn.NULL_LSN) {
localTracker.countObsoleteNodeInexact(
dbin.getLastLoggedLsn(), dbin.getLogType(), 0);
}
} finally {
dbin.releaseLatch();
}
} else {
convertDin((DIN) child, binKey);
}
/* Evict DIN child. */
din.detachNode(i, false/*updateLsn*/, -1/*lsn*/);
}
assert din.verifyMemorySize();
/* Count DIN and DupCountLN obsolete. */
if (din.getLastLoggedLsn() != DbLsn.NULL_LSN) {
localTracker.countObsoleteNodeInexact(
din.getLastLoggedLsn(), din.getLogType(), 0);
}
final ChildReference dupCountRef = din.getDupCountLNRef();
if (dupCountRef != null &&
dupCountRef.getLsn() != DbLsn.NULL_LSN) {
localTracker.countObsoleteNodeInexact(
dupCountRef.getLsn(), LogEntryType.LOG_DUPCOUNTLN, 0);
}
} finally {
din.releaseLatch();
}
}
/**
* Converts the given DBIN slot, leaving bin/index set to the inserted
* BIN slot.
*
* Enter/leave with bin field latched, although bin field may change.
*
* If slot is inserted into current bin, leave bin field unchanged and
* set index field to inserted slot.
*
* If slot is inserted into a different bin, set bin/index fields to
* inserted slot.
*/
private void convertDbinSlot(
final DBIN dbin,
final int dbinIndex,
final byte[] binKey) {
final byte[] newKey =
DupKeyData.replaceData(binKey, dbin.getKey(dbinIndex));
if (DEBUG) {
System.out.println("DupConvert DBIN LN " +
Key.dumpString(newKey, 0));
}
/*
* If the current BIN can hold the new slot, don't bother to do a
* search to find it.
*/
if (bin.needsSplitting() || !bin.isKeyInBounds(newKey)) {
/* Compact keys after finishing with a BIN. */
bin.compactMemory();
/* Evict without latches, before moving to a new BIN. */
bin.releaseLatch();
envImpl.daemonEviction(false /*backgroundIO*/);
/* Find a BIN for insertion, split if necessary. */
bin = dbin.getDatabase().getTree().searchSplitsAllowed(
newKey, CacheMode.UNCHANGED);
}
final int newIndex = bin.insertEntry1(
null/*ln*/, newKey, null/*data*/, dbin.getLsn(dbinIndex),
dbin.getState(dbinIndex), false);
if ((newIndex & IN.INSERT_SUCCESS) == 0) {
throw EnvironmentFailureException.unexpectedState
("Key not inserted: " + Key.dumpString(newKey, 0) +
" DB: " + dbin.getDatabase().getId());
}
index = newIndex & ~IN.INSERT_SUCCESS;
/*
* Evict LN from DBIN slot. Although we don't explicitly load DBIN LNs,
* it may have been loaded by recovery.
*/
dbin.detachNode(dbinIndex, false/*updateLsn*/, -1/*lsn*/);
nConverted += 1;
}
/**
* Changes all keys to "prefix keys" in the given IN. Called after reading
* an IN from disk via IN.postFetchInit.
*
* The conversion of IN keys is invoked from the IN class when an IN is
* fetched, rather than invoked from the DupConvert class directly, for
* performance and simplicity. If it were invoked from the DupConvert
* class, we would have to iterate over all INs in a separate initial pass.
* This is both more time consuming, and more complex to implement properly
* so that eviction is possible. Instead, conversion occurs when an old
* format IN is loaded.
*
* Enter/leave with 'in' unlatched.
*/
public static void convertInKeys(final DatabaseImpl dbImpl, final IN in) {
/* Nothing to convert for non-duplicates DB. */
if (!dbImpl.getSortedDuplicates()) {
return;
}
/* DIN/DBIN do not need conversion either. */
if (in instanceof DIN || in instanceof DBIN) {
return;
}
for (int i = 0; i < in.getNEntries(); i += 1) {
byte[] oldKey = in.getKey(i);
byte[] newKey =
DupKeyData.makePrefixKey(oldKey, 0, oldKey.length);
in.convertKey(i, newKey);
}
byte[] oldKey = in.getIdentifierKey();
byte[] newKey = DupKeyData.makePrefixKey(oldKey, 0, oldKey.length);
in.setIdentifierKey(newKey, true /*makeDirty*/);
assert in.verifyMemorySize();
}
}