blob: 8a8ccd8037afdf13b010e64bd0e8f0b8b2f8270c [file] [log] [blame]
/*-
* Copyright (C) 2002, 2018, Oracle and/or its affiliates. All rights reserved.
*
* This file was distributed by Oracle as part of a version of Oracle Berkeley
* DB Java Edition made available at:
*
* http://www.oracle.com/technetwork/database/database-technologies/berkeleydb/downloads/index.html
*
* Please see the LICENSE file included in the top-level directory of the
* appropriate version of Oracle Berkeley DB Java Edition for a copy of the
* license and additional information.
*/
package com.sleepycat.je.tree;
import static com.sleepycat.je.ExtinctionFilter.ExtinctionStatus.EXTINCT;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.Set;
import com.sleepycat.je.CacheMode;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.EnvironmentFailureException;
import com.sleepycat.je.cleaner.LocalUtilizationTracker;
import com.sleepycat.je.dbi.CursorImpl;
import com.sleepycat.je.dbi.DatabaseImpl;
import com.sleepycat.je.dbi.EnvironmentFailureReason;
import com.sleepycat.je.dbi.EnvironmentImpl;
import com.sleepycat.je.dbi.MemoryBudget;
import com.sleepycat.je.evictor.OffHeapCache;
import com.sleepycat.je.log.LogEntryType;
import com.sleepycat.je.log.LogItem;
import com.sleepycat.je.log.ReplicationContext;
import com.sleepycat.je.txn.LockManager;
import com.sleepycat.je.utilint.DatabaseUtil;
import com.sleepycat.je.utilint.DbLsn;
import com.sleepycat.je.utilint.SizeofMarker;
import com.sleepycat.je.utilint.TinyHashSet;
import com.sleepycat.je.utilint.VLSN;
/**
* A BIN represents a Bottom Internal Node in the JE tree.
*
* BIN-deltas
* ==========
* A BIN-delta is a BIN with the non-dirty slots omitted. A "full BIN", OTOH
* contains all slots. On disk and in memory, the format of a BIN-delta is the
* same as that of a BIN. In memory, a BIN object is actually a BIN-delta when
* the BIN-delta flag is set (IN.isBINDelta). On disk, the NewBINDelta log
* entry type (class BINDeltaLogEntry) is the only thing that distinguishes it
* from a full BIN, which has the BIN log entry type.
*
* BIN-deltas provides two benefits: Reduced writing and reduced memory usage.
*
* Reduced Writing
* ---------------
* Logging a BIN-delta rather a full BIN reduces writing significantly. The
* cost, however, is that two reads are necessary to reconstruct a full BIN
* from scratch. The reduced writing is worth this cost, particularly because
* less writing means less log cleaning.
*
* A BIN-delta is logged when 25% or less (configured with EnvironmentConfig
* TREE_BIN_DELTA) of the slots in a BIN are dirty. When a BIN-delta is logged,
* the dirty flag is cleared on the the BIN in cache. If more slots are
* dirtied and another BIN-delta is logged, it will contain all entries dirtied
* since the last full BIN was logged. In other words, BIN-deltas are
* cumulative and not chained, to avoid reading many (more than two) log
* entries to reconstruct a full BIN. The dirty flag on each slot is cleared
* only when a full BIN is logged.
*
* In addition to the cost of fetching two entries on a BIN cache miss, another
* drawback of the current approach is that dirtiness propagates upward in the
* Btree due to BIN-delta logging, causing repeated logging of upper INs. The
* slot of the parent IN contains the LSN of the most recent BIN-delta or full
* BIN that was logged. A BINDeltaLogEntry in turn contains the LSN of the
* last full BIN logged.
*
* Historical note: The pre-JE 5 implementation of OldBINDeltas worked
* differently and had a different cost/benefit trade-off. When an
* OldBINDelta was logged, its dirty flag was not cleared, causing it to be
* logged repeatedly at every checkpoint. A full BIN was logged after 10
* deltas, to prevent endless logging of the same BIN. One benefit of this
* approach is that the BIN's parent IN was not dirtied when logging the
* OldBINDelta, preventing dirtiness from propagating upward. Another
* benefit is that the OldBINDelta was only processed by recovery, and did
* not have to be fetched to reconstruct a full BIN from scratch on a cache
* miss. But the cost (the logging of an OldBINDelta every checkpoint, even
* when it hadn't changed since the last time logged) outweighed the
* benefits. When the current approach was implemented in JE 5, performance
* improved due to less logging.
*
* In JE 6, deltas were also maintained in the Btree cache. This was done to
* provide the reduced memory benefits described in the next section. The
* log format for a delta was also changed. The OldBINDelta log format is
* different (not the same as the BIN format) and is supported for backward
* compatibility as the OldBINDeltaLogEntry. Its log entry type name is
* still BINDelta, which is why the new type is named NewBINDelta (for
* backward compatibility, log entry type names cannot be changed.) This is
* also why the spelling "BIN-delta" is used to refer to deltas in the new
* approach. The old BINDelta class was renamed to OldBINDelta and there is
* no longer a class named BINDelta.
*
* Reduced Memory Usage
* --------------------
* In the Btree cache, a BIN may be represented as a full BIN or a BIN-delta.
* Eviction will mutate a full BIN to a BIN-delta in preference to discarding
* the entire BIN. A BIN-delta in cache occupies less memory than a full BIN,
* and can be exploited as follows:
*
* - When a full BIN is needed, it can be constructed with only one fetch
* rather than two, reducing IO overall. IN.fetchIN implements this
* optimization.
*
* - Certain operations can sometimes be performed using the BIN-delta alone,
* allowing such operations on a given data set to take place using less
* less IO (for a given cache size).
*
* The latter benefit is not yet implemented. No user CRUD operations are
* currently implemented using BIN-deltas. In the future we plan to implement
* the following operations using the BIN-delta alone.
*
* - Consider recording deletions in a BIN-delta. Currently, slot deletion
* prohibits a BIN-delta from being logged. To record deletion in
* BIN-deltas, slot deletion will have to be deferred until a full BIN is
* logged.
*
* - User reads by key, updates and deletions can be implemented if the key
* happens to appear in the BIN-delta.
*
* - The Cleaner can migrate an LN if its key happens to appear in the
* BIN-delta. This is similar to a user update operation, but in a
* different code path.
*
* - Insertions, deletions and updates can always be performed in a BIN-delta
* during replica replay, since the Master operation has already determined
* whether the key exists.
*
* - Recovery LN redo could also apply insertions, updates and inserts in the
* manner described.
*
* - Add idempotent put/delete operations, which can always be applied in a
* BIN-delta.
*
* - Store a hash of the keys in the full BIN in the BIN-delta and use it to
* perform the following in the delta:
* - putIfAbsent (true insertion)
* - get/delete/putIfPresent operations that return NOTFOUND
* - to avoid accumulating unnecessary deletions
*
* However, some internal operations do currently exploit BIN-deltas to avoid
* unnecessary IO. The following are currently implemented.
*
* - The Evictor and Checkpointer log a BIN-delta that is present in the
* cache, without having to fetch the full BIN.
*
* - The Cleaner can use the BIN-delta to avoid fetching when processing a BIN
* log entry (delta or full) and the BIN is not present in cache,
*
* To support BIB-delta-aware operations, the IN.fetchIN() and IN.getTarget()
* methods may return a BIN delta. IN.getTarget() will return whatever object
* is cached under the parent IN, and IN.fetchIN() will do a single I/O to
* fetch the most recently log record for the requested BIN, which may be a
* full BIN or a delta. Callers of these methods must be prepared to handle
* a BIN delta; either doing their operation directly on the delta, if
* possible, or mutating the delta to a full BIN by calling
* BIN.mutateToFullBIN().
*/
public class BIN extends IN {
private static final String BEGIN_TAG = "<bin>";
private static final String END_TAG = "</bin>";
/**
* Used as the "empty rep" for the INLongRep lastLoggedSizes field.
*
* minLength is 1 because log sizes are unpredictable.
*
* allowSparseRep is false because all slots have log sizes and less
* mutation is better.
*/
private static final INLongRep.EmptyRep EMPTY_LAST_LOGGED_SIZES =
new INLongRep.EmptyRep(1, false);
/**
* Used as the "empty rep" for the INLongRep vlsnCache field.
*
* minLength is 5 because VLSNS grow that large fairly quickly, and less
* mutation is better. The value 5 accomodates data set sizes up to 100
* billion. If we want to improve memory utilization for smaller data sets
* or reduce mutation for larger data sets, we could dynamically determine
* a value based on the last assigned VLSN.
*
* allowSparseRep is false because either all slots typically have VLSNs,
* or none do, and less mutation is better.
*/
private static final INLongRep.EmptyRep EMPTY_VLSNS =
new INLongRep.EmptyRep(5, false);
/**
* Used as the "empty rep" for the INLongRep offHeapLNIds field.
*
* minLength is 8 because memory IDs are 64-bit pointers.
*
* allowSparseRep is true because some workloads will only load LN IDs for
* a subset of the LNs in the BIN.
*/
private static final INLongRep.EmptyRep EMPTY_OFFHEAP_LN_IDS =
new INLongRep.EmptyRep(8, true);
/**
* Used as the "empty rep" for the INLongRep expirationValues field.
*
* minLength is 1 because we expect most expiration values, which are an
* offset from a base day/hour, to fit in one byte.
*
* allowSparseRep is true because some workloads only set TTLs on some of
* the LNs in a BIN.
*/
private static final INLongRep.EmptyRep EMPTY_EXPIRATION =
new INLongRep.EmptyRep(1, true);
/*
* The set of cursors that are currently referring to this BIN.
* This field is set to null when there are no cursors on this BIN.
*/
private TinyHashSet<CursorImpl> cursorSet;
/*
* Support for logging BIN deltas. (Partial BIN logging)
*/
/*
* If this is a delta, fullBinNEntries stores the number of entries
* in the full version of the BIN. This is a persistent field for
* BIN-delta logrecs only, and for log versions >= 10.
*/
private int fullBinNEntries = -1;
/*
* If this is a delta, fullBinMaxEntries stores the max number of
* entries (capacity) in the full version of the BIN. This is a
* persistent field for BIN-delta logrecs only, and for log versions >= 10.
*/
private int fullBinMaxEntries = -1;
/*
* If "this" is a BIN-delta, bloomFilter is a bloom-filter representation
* of the set of keys in the clean slots of the full version of the same
* BIN. It is used to allow blind put operations in deltas, by answering
* the question whether the put key is in the full BIN or not. See the
* javadoc of the TREE_BIN_DELTA_BLIND_PUTS config param for more info.
* This is a persistent field for BIN-delta logrecs only, and for log
* versions >= 10.
*/
byte[] bloomFilter;
/*
* See comment in IN.java, right after the lastFullVersion data field.
*/
private long lastDeltaVersion = DbLsn.NULL_LSN;
/*
* Caches the VLSN sequence for the LN entries in a BIN, when VLSN
* preservation and caching are configured.
*
* A VLSN is added to the cache when an LN is evicted from a BIN. When the
* LN is resident, there is no need for caching because the LN contains the
* VLSN. See BIN.setTarget. This strategy works because an LN is always
* cached during a read or write operation, and only evicted after that,
* based on eviction policies.
*
* For embedded LNs a VLSN is added to the cache every time the record is
* logged. Furthermore, the vlsn cache is made persistent for such LNs.
*
* An EMPTY_REP is used initially until the need arises to add a non-zero
* value. The cache will remain empty if LNs are never evicted or version
* caching is not configured, which is always the case for standalone JE.
*/
private INLongRep vlsnCache = EMPTY_VLSNS;
/*
* Stores the size of the most recently written logrec of each LN, or zero
* if the size is unknown.
*
* We use INLongRep in spite of the fact that sizes are int not long;
* INLongRep will store the minimum number of bytes. An EMPTY_REP is
* used initially until the need arises to add a non-zero value.
*/
private INLongRep lastLoggedSizes = EMPTY_LAST_LOGGED_SIZES;
/**
* When some LNs are in the off-heap cache, the offHeapLruId is this BIN's
* index in the off-heap LRU list.
*/
private INLongRep offHeapLNIds = EMPTY_OFFHEAP_LN_IDS;
private int offHeapLruId = -1;
/**
* An expirationValues slot value is one more than the number of days/hours
* to add to the expirationBase to get the true expiration days/hours. A
* slot value of zero means no expiration, and a non-zero slot value is one
* greater than the actual offset to be added. The base is the smallest
* non-zero offset that has been encountered.
*/
private INLongRep expirationValues = EMPTY_EXPIRATION;
private int expirationBase = -1;
/**
* Can be set to true by tests to prevent last logged sizes from being
* stored.
*/
public static boolean TEST_NO_LAST_LOGGED_SIZES = false;
public BIN() {
}
public BIN(
DatabaseImpl db,
byte[] identifierKey,
int capacity,
int level) {
super(db, identifierKey, capacity, level);
}
/**
* For Sizeof.
*/
public BIN(@SuppressWarnings("unused") SizeofMarker marker) {
super(marker);
}
/**
* Create a new BIN. Need this because we can't call newInstance()
* without getting a 0 for nodeId.
*/
@Override
protected IN createNewInstance(
byte[] identifierKey,
int maxEntries,
int level) {
return new BIN(getDatabase(), identifierKey, maxEntries, level);
}
public BINReference createReference() {
return new BINReference(
getNodeId(), getDatabase().getId(), getIdentifierKey());
}
@Override
public boolean isBIN() {
return true;
}
/*
* Return whether the shared latch for this kind of node should be of the
* "always exclusive" variety. Presently, only IN's are actually latched
* shared. BINs are latched exclusive only.
*/
@Override
boolean isAlwaysLatchedExclusively() {
return true;
}
@Override
public String shortClassName() {
return "BIN";
}
@Override
public String beginTag() {
return BEGIN_TAG;
}
@Override
public String endTag() {
return END_TAG;
}
boolean isVLSNCachingEnabled() {
return (!databaseImpl.getSortedDuplicates() && getEnv().getCacheVLSN());
}
public void setCachedVLSN(int idx, long vlsn) {
/*
* We do not cache the VLSN for dup DBs, because dup DBs are typically
* used only for indexes, and the overhead of VLSN maintenance would be
* wasted. Plus, although technically VLSN preservation might apply to
* dup DBs, the VLSNs are not reliably available since the LNs are
* immediately obsolete.
*/
if (!isVLSNCachingEnabled()) {
return;
}
setCachedVLSNUnconditional(idx, vlsn);
}
void setCachedVLSNUnconditional(int idx, long vlsn) {
vlsnCache = vlsnCache.set(
idx,
(vlsn == VLSN.NULL_VLSN_SEQUENCE ? 0 : vlsn),
this);
}
long getCachedVLSN(int idx) {
final long vlsn = vlsnCache.get(idx);
return (vlsn == 0 ? VLSN.NULL_VLSN_SEQUENCE : vlsn);
}
/**
* Returns the VLSN. VLSN.NULL_VLSN_SEQUENCE (-1) is returned in two
* cases:
* 1) This is a standalone environment.
* 2) The VLSN is not cached (perhaps VLSN caching is not configured), and
* the allowFetch param is false.
*
* WARNING: Because the vlsnCache is only updated when an LN is evicted, it
* is critical that getVLSN returns the VLSN for a resident LN before
* getting the VLSN from the cache.
*/
public long getVLSN(int idx, boolean allowFetch, CacheMode cacheMode) {
/* Must return the VLSN from the LN, if it is resident. */
LN ln = (LN) getTarget(idx);
if (ln != null) {
return ln.getVLSNSequence();
}
/* Next try the vlsnCache. */
long vlsn = getCachedVLSN(idx);
if (!VLSN.isNull(vlsn)) {
return vlsn;
}
/* Next try the off-heap cache. */
final OffHeapCache ohCache = getOffHeapCache();
if (ohCache.isEnabled()) {
vlsn = ohCache.loadVLSN(this, idx);
if (!VLSN.isNull(vlsn)) {
return vlsn;
}
}
/* As the last resort, fetch the LN if fetching is allowed. */
if (!allowFetch || isEmbeddedLN(idx)) {
return vlsn;
}
ln = fetchLN(idx, cacheMode);
if (ln != null) {
return ln.getVLSNSequence();
}
return VLSN.NULL_VLSN_SEQUENCE;
}
/** For unit testing. */
public INLongRep getVLSNCache() {
return vlsnCache;
}
/**
* The last logged size is never needed when the LN is counted obsolete
* immediately, since it is only needed for counting an LN obsolete
* during an update or deletion.
*
* This method may not be called until after the database is initialized,
* i,e., it may not be called during readFromLog.
*/
@Override
boolean isLastLoggedSizeStored(int idx) {
return mayHaveLastLoggedSizeStored() && !isEmbeddedLN(idx);
}
@Override
boolean mayHaveLastLoggedSizeStored() {
/* Check final static first so all test code is optimized away. */
if (DatabaseUtil.TEST) {
/* Don't skew test measurements with internal DBs. */
if (TEST_NO_LAST_LOGGED_SIZES &&
!databaseImpl.getDbType().isInternal()) {
return false;
}
}
return !databaseImpl.isLNImmediatelyObsolete();
}
/**
* Sets last logged size if necessary.
*
* This method does not dirty the IN because the caller methods dirty it,
* for example, when setting the LSN, key, or node.
*
* This method is sometimes called to add the logged size for a pre log
* version 9 BIN, for example, during fetchTarget and preload. This makes
* the logged size available for obsolete counting but does not dirty the
* IN, since that could cause an unexpected write of the IN being read.
*
* @param lastLoggedSize is positive if the size is known, zero if the size
* is unknown, or -1 if the size should not be changed because logging of
* the LN was deferred.
*/
@Override
public void setLastLoggedSize(int idx, int lastLoggedSize) {
if ((lastLoggedSize < 0) || !isLastLoggedSizeStored(idx)) {
return;
}
setLastLoggedSizeUnconditional(idx, lastLoggedSize);
}
@Override
public void clearLastLoggedSize(int idx) {
setLastLoggedSizeUnconditional(idx, 0);
}
/**
* Sets the size without checking whether it is necessary.
*
* This method is used when reading from the log because the databaseImpl
* is not yet initialized and isLastLoggedSizeStored cannot be called.
* It is also called for efficiency reasons when it is known that storing
* the logged size is necessary, for example, when copying values between
* slots.
*/
@Override
void setLastLoggedSizeUnconditional(int idx, int lastLoggedSize) {
lastLoggedSizes = lastLoggedSizes.set(idx, lastLoggedSize, this);
}
/**
* @return a positive value if the size is known, or zero if unknown.
*/
@Override
public int getLastLoggedSize(int idx) {
if (isLastLoggedSizeStored(idx)) {
return (int) lastLoggedSizes.get(idx);
}
return 0;
}
/**
* Sets the expiration time for a slot in days or hours.
*/
public void setExpiration(final int idx, int value, final boolean hours) {
/* This slot has no expiration. */
if (value == 0) {
expirationValues = expirationValues.set(idx, 0, this);
return;
}
/*
* If this is the first slot with an expiration, initialize the base to
* the value and set the offset (slot value) to one.
*/
if (expirationBase == -1 || nEntries == 1) {
expirationBase = value;
setExpirationOffset(idx, 1);
setExpirationInHours(hours);
return;
}
if (hours) {
/* Convert existing values to hours if necessary. */
if (!isExpirationInHours()) {
expirationBase *= 24;
setExpirationInHours(true);
for (int i = 0; i < nEntries; i += 1) {
if (i == idx) {
continue;
}
final int offset = (int) expirationValues.get(i);
if (offset == 0) {
continue;
}
setExpirationOffset(i, ((offset - 1) * 24) + 1);
}
}
} else {
/* If values are stored in hours, convert days to hours. */
if (isExpirationInHours()) {
value *= 24;
}
}
/*
* Slot's expiration must not be less than the base. If it is, decrease
* the base and increase the offset in other slots accordingly.
*/
if (value < expirationBase) {
final int adjustment = expirationBase - value;
expirationBase = value;
for (int i = 0; i < nEntries; i += 1) {
if (i == idx) {
continue;
}
final int offset = (int) expirationValues.get(i);
if (offset == 0) {
continue;
}
setExpirationOffset(i, offset + adjustment);
}
}
setExpirationOffset(idx, value - expirationBase + 1);
}
public boolean hasExpirationValues() {
return !expirationValues.isEmpty();
}
/**
* Returns the expiration time for a slot. The return value is in days or
* hours, depending on isExpirationTimeInHours.
*/
public int getExpiration(int idx) {
final int offset = (int) expirationValues.get(idx);
if (offset == 0) {
return 0;
}
return offset - 1 + expirationBase;
}
int getExpirationBase() {
return expirationBase;
}
int getExpirationOffset(int idx) {
return (int) expirationValues.get(idx);
}
void setExpirationBase(int base) {
expirationBase = base;
}
void setExpirationOffset(int idx, int offset) {
expirationValues = expirationValues.set(idx, offset, this);
}
/**
* Returns whether the slot is known-deleted, pending-deleted, or expired.
*/
public boolean isDefunct(int idx) {
return isDeleted(idx) || isExpired(idx);
}
/**
* Returns whether the slot is known-deleted or pending-deleted.
*/
public boolean isDeleted(int idx) {
return isEntryKnownDeleted(idx) || isEntryPendingDeleted(idx);
}
/**
* Returns whether the slot is expired.
*/
public boolean isExpired(int idx) {
return getEnv().isExpired(getExpiration(idx), isExpirationInHours());
}
public boolean isProbablyExpired(int idx) {
return getEnv().expiresWithin(
getExpiration(idx), isExpirationInHours(),
getEnv().getTtlClockTolerance());
}
public int getLastLoggedSizeUnconditional(int idx) {
return (int) lastLoggedSizes.get(idx);
}
public void setOffHeapLNId(int idx, long memId) {
if (offHeapLNIds.get(idx) == memId) {
return;
}
offHeapLNIds = offHeapLNIds.set(idx, memId, this);
}
public void clearOffHeapLNIds() {
offHeapLNIds = offHeapLNIds.clear(this, EMPTY_OFFHEAP_LN_IDS);
}
public long getOffHeapLNIdsMemorySize() {
return offHeapLNIds.getMemorySize();
}
public long getOffHeapLNId(int idx) {
return offHeapLNIds.get(idx);
}
public boolean hasOffHeapLNs() {
return !offHeapLNIds.isEmpty();
}
public void setOffHeapLruId(int id) {
assert id >= 0 || !hasOffHeapLNs();
offHeapLruId = id;
}
public int getOffHeapLruId() {
return offHeapLruId;
}
void freeOffHeapLN(int idx) {
getOffHeapCache().freeLN(this, idx);
}
/**
* Updates the vlsnCache when an LN target is evicted. See vlsnCache.
*/
@Override
void setTarget(int idx, Node target) {
if (target == null) {
final Node oldTarget = getTarget(idx);
if (oldTarget instanceof LN) {
setCachedVLSN(idx, ((LN) oldTarget).getVLSNSequence());
}
}
super.setTarget(idx, target);
}
/**
* Overridden to account for BIN-specific slot info.
*/
@Override
void appendEntryFromOtherNode(IN from, int fromIdx) {
super.appendEntryFromOtherNode(from, fromIdx);
final BIN fromBin = (BIN) from;
final int idx = nEntries - 1;
setCachedVLSNUnconditional(idx, fromBin.getCachedVLSN(fromIdx));
setLastLoggedSizeUnconditional(idx, from.getLastLoggedSize(fromIdx));
setExpiration(
idx, fromBin.getExpiration(fromIdx),
fromBin.isExpirationInHours());
final OffHeapCache ohCache = getOffHeapCache();
if (ohCache.isEnabled()) {
offHeapLNIds = offHeapLNIds.set(
idx, fromBin.offHeapLNIds.get(fromIdx), this);
ohCache.ensureOffHeapLNsInLRU(this);
}
}
/**
* Overridden to account for BIN-specific slot info.
*/
@Override
void copyEntries(int from, int to, int n) {
super.copyEntries(from, to, n);
vlsnCache = vlsnCache.copy(from, to, n, this);
lastLoggedSizes = lastLoggedSizes.copy(from, to, n, this);
expirationValues = expirationValues.copy(from, to, n, this);
offHeapLNIds = offHeapLNIds.copy(from, to, n, this);
}
/**
* Overridden to account for BIN-specific slot info.
*/
@Override
void clearEntry(int idx) {
super.clearEntry(idx);
setCachedVLSNUnconditional(idx, VLSN.NULL_VLSN_SEQUENCE);
setLastLoggedSizeUnconditional(idx, 0);
setExpiration(idx, 0, false);
offHeapLNIds = offHeapLNIds.set(idx, 0, this);
}
/*
* Cursors
*/
/* public for the test suite. */
public Set<CursorImpl> getCursorSet() {
if (cursorSet == null) {
return Collections.emptySet();
}
return cursorSet.copy();
}
/**
* Register a cursor with this BIN. Caller has this BIN already latched.
* @param cursor Cursor to register.
*/
public void addCursor(CursorImpl cursor) {
assert isLatchExclusiveOwner();
if (cursorSet == null) {
cursorSet = new TinyHashSet<CursorImpl>();
}
cursorSet.add(cursor);
}
/**
* Unregister a cursor with this bin. Caller has this BIN already
* latched.
*
* @param cursor Cursor to unregister.
*/
public void removeCursor(CursorImpl cursor) {
assert isLatchExclusiveOwner();
if (cursorSet == null) {
return;
}
cursorSet.remove(cursor);
if (cursorSet.size() == 0) {
cursorSet = null;
}
}
/**
* @return the number of cursors currently referring to this BIN.
*/
public int nCursors() {
/*
* Use a local var to concurrent assignment to the cursorSet field by
* another thread. This method is called via eviction without latching.
* LRU-TODO: with the new evictor this method is called with the node
* EX-latched. So, cleanup after the old evictor is scrapped.
*/
final TinyHashSet<CursorImpl> cursors = cursorSet;
if (cursors == null) {
return 0;
}
return cursors.size();
}
/**
* Adjust any cursors that are referring to this BIN. This method is
* called during a split operation. "this" is the BIN being split.
* newSibling is the new BIN into which the entries from "this" between
* newSiblingLow and newSiblingHigh have been copied.
*
* @param newSibling - the newSibling into which "this" has been split.
* @param newSiblingLow
* @param newSiblingHigh - the low and high entry of
* "this" that were moved into newSibling.
*/
@Override
void adjustCursors(
IN newSibling,
int newSiblingLow,
int newSiblingHigh)
{
assert newSibling.isLatchExclusiveOwner();
assert this.isLatchExclusiveOwner();
if (cursorSet == null) {
return;
}
int adjustmentDelta = (newSiblingHigh - newSiblingLow);
Iterator<CursorImpl> iter = cursorSet.iterator();
while (iter.hasNext()) {
CursorImpl cursor = iter.next();
int cIdx = cursor.getIndex();
cursor.assertBIN(this);
assert newSibling instanceof BIN;
/*
* There are four cases to consider for cursor adjustments,
* depending on (1) how the existing node gets split, and (2) where
* the cursor points to currently. In cases 1 and 2, the id key of
* the node being split is to the right of the splitindex so the
* new sibling gets the node entries to the left of that index.
* This is indicated by "new sibling" to the left of the vertical
* split line below. The right side of the node contains entries
* that will remain in the existing node (although they've been
* shifted to the left). The vertical bar (^) indicates where the
* cursor currently points.
*
* case 1:
*
* We need to set the cursor's "bin" reference to point at the
* new sibling, but we don't need to adjust its index since that
* continues to be correct post-split.
*
* +=======================================+
* | new sibling | existing node |
* +=======================================+
* cursor ^
*
* case 2:
*
* We only need to adjust the cursor's index since it continues
* to point to the current BIN post-split.
*
* +=======================================+
* | new sibling | existing node |
* +=======================================+
* cursor ^
*
* case 3:
*
* Do nothing. The cursor continues to point at the correct BIN
* and index.
*
* +=======================================+
* | existing Node | new sibling |
* +=======================================+
* cursor ^
*
* case 4:
*
* Adjust the "bin" pointer to point at the new sibling BIN and
* also adjust the index.
*
* +=======================================+
* | existing Node | new sibling |
* +=======================================+
* cursor ^
*/
BIN ns = (BIN) newSibling;
if (newSiblingLow == 0) {
if (cIdx < newSiblingHigh) {
/* case 1 */
iter.remove();
cursor.setBIN(ns);
ns.addCursor(cursor);
} else {
/* case 2 */
cursor.setIndex(cIdx - adjustmentDelta);
}
} else {
if (cIdx >= newSiblingLow) {
/* case 4 */
cursor.setIndex(cIdx - newSiblingLow);
iter.remove();
cursor.setBIN(ns);
ns.addCursor(cursor);
}
}
}
}
/**
* For each cursor in this BIN's cursor set, ensure that the cursor is
* actually referring to this BIN.
*/
public void verifyCursors() {
if (cursorSet == null) {
return;
}
for (CursorImpl cursor : cursorSet) {
cursor.assertBIN(this);
}
}
/**
* Adjust cursors referring to this BIN following an insert.
*
* @param insertIndex - The index of the new entry.
*/
@Override
void adjustCursorsForInsert(int insertIndex) {
assert this.isLatchExclusiveOwner();
if (cursorSet == null) {
return;
}
for (CursorImpl cursor : cursorSet) {
int cIdx = cursor.getIndex();
if (insertIndex <= cIdx) {
cursor.setIndex(cIdx + 1);
}
}
}
/**
* Called when we know we are about to split on behalf of a key that is the
* minimum (leftSide) or maximum (!leftSide) of this node. This is
* achieved by just forcing the split to occur either one element in from
* the left or the right (i.e. splitIndex is 1 or nEntries - 1).
*/
@Override
IN splitSpecial(
IN parent,
int parentIndex,
IN grandParent,
int maxEntriesPerNode,
byte[] key,
boolean leftSide)
throws DatabaseException {
int nEntries = getNEntries();
int index = findEntry(key, true, false);
boolean exact = (index & IN.EXACT_MATCH) != 0;
index &= ~IN.EXACT_MATCH;
if (leftSide && index < 0) {
return splitInternal(
parent, parentIndex, grandParent, maxEntriesPerNode, 1);
} else if (!leftSide && !exact && index == (nEntries - 1)) {
return splitInternal(
parent, parentIndex, grandParent, maxEntriesPerNode,
nEntries - 1);
} else {
return split(
parent, parentIndex, grandParent, maxEntriesPerNode);
}
}
/**
* Compress a full BIN by removing any slots that are deleted or expired.
*
* This must not be a BIN-delta. No cursors can be present on the BIN,
* unless the env is closing. Caller is responsible for latching and
* unlatching this node.
*
* If the slot containing the identifier is removed, the identifier key
* will be changed to the key in the first remaining slot.
*
* Normally when a slot is removed, the IN is dirtied. However, during
* compression the BIN is not dirtied when a slot is removed. This is safe
* for the reasons described below. Note that the BIN being compressed is
* always a full BIN, not a delta.
*
* + If the BIN is not dirty and it does not become dirty before shutdown,
* i.e., it is not logged, then it is possible that this compression will
* be "lost". However, the state of the slot on disk is expired/deleted,
* and when the BIN is later fetched from disk, this state will be
* restored and the compression will be performed again.
*
* + If the slot is dirty, the BIN may also be dirty or may become dirty
* later, and be logged. Logging a delta would cause the information in
* the dirty slot to be lost. Therefore, when a dirty slot is removed, we
* set a flag that prohibits the next BIN logged from being a delta.
*
* This optimization (that we don't dirty the BIN and we allow logging a
* delta after removing a non-dirty slot) has one minor and one major
* impact:
*
* 1. When a slot is removed for a deleted record, normally the slot and
* the BIN will be dirty. Although it is unusual, we may encounter a
* non-dirty slot for a deleted record. This happens if the slot could not
* be removed by this method when a full BIN is logged, due to a lock or a
* cursor, and we compress the full BIN later.
*
* 2. When a slot is removed for an expired record, it is common that the
* slot will not be be dirty. In this case, without the optimization, the
* removal of expired slots would cause more logging and less deltas would
* be logged.
*
* @param localTracker is used only for temporary DBs, and may be specified
* to consolidate multiple tracking operations. If null, the tracking is
* performed immediately in this method.
*
* @return true if all deleted and expired slots were compressed, or false
* if one or more slots could not be compressed because we were unable to
* obtain a lock. A false return value means "try again later".
*/
public boolean compress(boolean compressDirtySlots,
final LocalUtilizationTracker localTracker) {
/*
* If the environment is not yet recovered we can't rely on locks
* being set up to safeguard active data and so we can't compress
* safely.
*/
if (!databaseImpl.getEnv().isValid()) {
return true;
}
if (nCursors() > 0) {
throw EnvironmentFailureException.unexpectedState();
}
if (isBINDelta()) {
throw EnvironmentFailureException.unexpectedState();
}
final DatabaseImpl db = getDatabase();
final EnvironmentImpl envImpl = db.getEnv();
final LockManager lockManager =
envImpl.getTxnManager().getLockManager();
boolean setNewIdKey = false;
boolean anyLocked = false;
for (int i = 0; i < getNEntries(); i++) {
if (!compressDirtySlots && isDirty(i)) {
continue;
}
final boolean expired = isExpired(i);
final boolean deleted = isDeleted(i);
if (!deleted && !expired) {
continue;
}
/*
* We have to be able to lock the LN before we can compress the
* entry. If we can't, then skip over it. For a deleted record, a
* read lock is sufficient because it means the deletion has been
* committed, and other lockers don't hold read locks on a deleted
* record. For an expired record, a write lock is needed to prevent
* removal of a slot for a record that is read-locked elsewhere.
* In both cases it is more efficient to call isLockUncontended
* than to actually lock the LN, since we would release the lock
* immediately.
*
* We must be able to lock the LN even if isKnownDeleted is true,
* because locks protect the aborts. (Aborts may execute multiple
* operations, where each operation latches and unlatches. It's the
* LN lock that protects the integrity of the whole multi-step
* process.)
*
* For example, during abort, there may be cases where we have
* deleted and then added an LN during the same txn. This means
* that to undo/abort it, we first delete the LN (leaving
* knownDeleted set), and then add it back into the tree. We want
* to make sure the entry is in the BIN when we do the insert back
* in.
*/
final long lsn = getLsn(i);
/* Can discard a NULL_LSN entry without locking. */
if (lsn != DbLsn.NULL_LSN &&
!lockManager.isLockUncontended(lsn)) {
/*
* Check whether the extinct scanner has set the KD flag. If
* so, it is OK to delete a locked slot (since no cursor is
* positioned on it). getExtinctionStatus has overhead, but a
* KD slot should be locked infrequently.
*/
if (!isEntryKnownDeleted(i) ||
envImpl.getExtinctionState(db, getKey(i)) != EXTINCT) {
/* Try again later. */
anyLocked = true;
continue;
}
}
/* At this point, we know we can remove the slot. */
if (entryKeys.compareKeys(
getIdentifierKey(), keyPrefix, i, haveEmbeddedData(i),
getKeyComparator()) == 0) {
/*
* We're about to remove the entry with the idKey so the
* node will need a new idkey.
*/
setNewIdKey = true;
}
/*
* When we compress a deleted slot in a deferred-write DB, we
* must either log a dirty LN or count it obsolete. However, if
* we compress an expired slot, neither is necessary; see TTL.
*/
if (!expired && db.isDeferredWriteMode()) {
final LN ln = (LN) getTarget(i);
if (ln != null &&
ln.isDirty() &&
!DbLsn.isTransient(lsn)) {
if (db.isTemporary()) {
/*
* When a previously logged LN in a temporary DB is
* dirty, we can count the LSN of the last logged LN as
* obsolete without logging. There is no requirement
* for the dirty deleted LN to be durable past
* recovery. There is no danger of the last logged LN
* being accessed again (after log cleaning, for
* example), since temp DBs do not survive recovery.
*/
if (localTracker != null) {
localTracker.countObsoleteNode(
lsn, ln.getGenericLogType(),
getLastLoggedSize(i));
} else {
envImpl.getLogManager().countObsoleteNode(
lsn, ln.getGenericLogType(),
getLastLoggedSize(i),
true /*countExact*/);
}
} else {
/*
* When a previously logged deferred-write LN is dirty,
* we log the dirty deleted LN to make the deletion
* durable. The act of logging will also count the last
* logged LSN as obsolete.
*/
logDirtyLN(i, ln, true /*allowEviction*/);
}
}
}
deleteEntry(i, false /*makeDirty*/, true /*validate*/);
/* Since we're deleting the current entry, decrement the index. */
i--;
}
if (getNEntries() != 0 && setNewIdKey) {
setIdentifierKey(getKey(0), false /*makeDirty*/);
}
if (getNEntries() == 0) {
/* This BIN is empty and expendable. */
updateLRU(CacheMode.MAKE_COLD); // TODO actually make cold
}
/*
* Reduce capacity if this BIN is larger than the configured capacity,
* and has less entries then the configured capacity. This could be due
* to enlarging the BIN during recovery (see reconstituteBIN) or
* because the configured capacity was changed.
*/
final int configuredCapacity = databaseImpl.getNodeMaxTreeEntries();
if (getMaxEntries() > configuredCapacity &&
getNEntries() < configuredCapacity) {
resize(configuredCapacity);
}
return !anyLocked;
}
/**
* This method is called opportunistically at certain places where a
* deleted slot is observed (when the slot's PendingDeleted or KnownDeleted
* flag is set), to ensure that the slot is compressed away. This is an
* attempt to process slots that were not compressed during the mainstream
* record deletion process because of cursors on the BIN during compress,
* or a crash prior to compression.
*/
public void queueSlotDeletion(final int idx) {
/*
* If the next logrec for this BIN should be a BIN-delta, don't queue
* the BIN if the deleted slot is dirty, because removing dirty BIN
* slots prevents logging a delta.
*/
if (isDirty(idx) && shouldLogDelta()) {
return;
}
getEnv().addToCompressorQueue(this);
}
/* For debugging. Overrides method in IN. */
@Override
boolean validateSubtreeBeforeDelete(int index) {
assert(!isBINDelta());
return true;
}
/**
* Check if this node fits the qualifications for being part of a deletable
* subtree. It may not have any LN children.
*
* We assume that this is only called under an assert.
*/
@Override
boolean isValidForDelete()
throws DatabaseException {
assert(isLatchExclusiveOwner());
if (isBINDelta()) {
return false;
}
int numValidEntries = 0;
for (int i = 0; i < getNEntries(); i++) {
if (!isEntryKnownDeleted(i)) {
numValidEntries++;
}
}
if (numValidEntries > 0) { // any valid entries, not eligible
return false;
}
if (nCursors() > 0) { // cursors on BIN, not eligible
return false;
}
return true; // 0 entries, no cursors
}
@Override
public long compactMemory() {
final long oldSize = inMemorySize;
super.compactMemory();
offHeapLNIds = offHeapLNIds.compact(this, EMPTY_OFFHEAP_LN_IDS);
expirationValues = expirationValues.compact(this, EMPTY_EXPIRATION);
return oldSize - inMemorySize;
}
/**
* Adds vlsnCache size to computed memory size.
*/
@Override
public long computeMemorySize() {
long size = super.computeMemorySize();
/*
* vlsnCache, lastLoggedSizes, etc, are null only when this method is
* called by the superclass constructor, i.e., before this class
* constructor has run. Luckily the initial representations have a
* memory size of zero, so we can ignore them in this case.
*/
if (vlsnCache != null) {
size += vlsnCache.getMemorySize();
}
if (lastLoggedSizes != null) {
size += lastLoggedSizes.getMemorySize();
}
if (expirationValues != null) {
size += expirationValues.getMemorySize();
}
if (offHeapLNIds != null) {
size += offHeapLNIds.getMemorySize();
}
if (bloomFilter != null) {
size += BINDeltaBloomFilter.getMemorySize(bloomFilter);
}
return size;
}
/* Utility method used during unit testing. */
@Override
protected long printMemorySize() {
final long inTotal = super.printMemorySize();
final long vlsnCacheOverhead = vlsnCache.getMemorySize();
final long logSizesOverhead = lastLoggedSizes.getMemorySize();
final long expirationOverhead = expirationValues.getMemorySize();
final long offHeapLNIdOverhead = offHeapLNIds.getMemorySize();
final long binTotal = inTotal +
vlsnCacheOverhead + logSizesOverhead + offHeapLNIdOverhead;
System.out.format(
"BIN: %d vlsns: %d logSizes: %d expiration: %d offHeapLNIds: %d %n",
binTotal, vlsnCacheOverhead, logSizesOverhead, expirationOverhead,
offHeapLNIdOverhead);
return binTotal;
}
@Override
protected long getFixedMemoryOverhead() {
return MemoryBudget.BIN_FIXED_OVERHEAD;
}
/**
* Reduce memory consumption. Note that evicting deferred-write LNs may
* require logging them, which will mark this BIN dirty. Compression of
* deleted slots will also mark the BIN dirty.
*
* The BIN should be latched by the caller.
*
* @return a long number encoding (a) the number of evicted bytes, and
* (b) whether this BIN is evictable. (b) will be false if the BIN has
* any cursors on it, or has any non-evictable children.
*/
@Override
public long partialEviction() {
/* Try compressing non-dirty slots. */
final long oldMemSize = inMemorySize;
getEnv().lazyCompress(this);
if (oldMemSize > inMemorySize) {
return oldMemSize - inMemorySize;
}
/* Try LN eviction. Return if any were evicted. */
final long lnBytesAndStatus = evictLNs();
if ((lnBytesAndStatus & ~IN.NON_EVICTABLE_IN) != 0) {
return lnBytesAndStatus;
}
/* Try discarding the VLSNCache. Return bytes and evictable status. */
return discardVLSNCache() | lnBytesAndStatus;
}
public long discardVLSNCache() {
final long vlsnBytes = vlsnCache.getMemorySize();
if (vlsnBytes > 0) {
int numEntries = getNEntries();
for (int i = 0; i < numEntries; ++i) {
if (isEmbeddedLN(i)) {
return 0;
}
}
vlsnCache = EMPTY_VLSNS;
updateMemorySize(0 - vlsnBytes);
}
return vlsnBytes;
}
/**
* Reduce memory consumption by evicting all LN targets. Note that this may
* cause LNs to be logged, which will mark this BIN dirty.
*
* The BIN should be latched by the caller.
*
* @return a long number encoding (a) the number of evicted bytes, and
* (b) whether this BIN is evictable. (b) will be false if the BIN has
* any cursors on it, or has any non-evictable children.
*/
public long evictLNs()
throws DatabaseException {
assert isLatchExclusiveOwner() :
"BIN must be latched before evicting LNs";
/*
* We can't evict an LN which is pointed to by a cursor, in case that
* cursor has a reference to the LN object. We'll take the cheap choice
* and avoid evicting any LNs if there are cursors on this BIN. We
* could do a more expensive, precise check to see entries have which
* cursors. This is something we might move to later.
*/
if (nCursors() > 0) {
return IN.NON_EVICTABLE_IN;
}
/* Try to evict each child LN. */
long totalRemoved = 0;
long numLNsEvicted = 0;
boolean haveNonEvictableLN = false;
for (int i = 0; i < getNEntries(); i++) {
if (getTarget(i) == null) {
continue;
}
long lnRemoved = evictLNInternal(i, false /*ifFetchedCold*/);
if (lnRemoved < 0) {
haveNonEvictableLN = true;
} else {
totalRemoved += lnRemoved;
++numLNsEvicted;
}
}
/*
* compactMemory() may decrease the memory footprint by mutating the
* representations of the target and key sets.
*/
if (totalRemoved > 0) {
updateMemorySize(totalRemoved, 0);
totalRemoved += compactMemory();
}
getEvictor().incNumLNsEvicted(numLNsEvicted);
if (haveNonEvictableLN) {
return (totalRemoved | IN.NON_EVICTABLE_IN);
} else {
return totalRemoved;
}
}
public void evictLN(int index) {
evictLN(index, false /*ifFetchedCold*/);
}
public void evictLN(int index, boolean ifFetchedCold)
throws DatabaseException {
final long removed = evictLNInternal(index, ifFetchedCold);
/* May decrease the memory footprint by changing the INTargetRep. */
if (removed > 0) {
updateMemorySize(removed, 0);
compactMemory();
}
}
/**
* Evict a single LN if allowed. The amount of memory freed is returned
* and must be subtracted from the memory budget by the caller.
*
* @param ifFetchedCold If true, evict the LN only if it has the
* FetchedCold flag set.
*
* @return number of evicted bytes or -1 if the LN is not evictable.
*/
private long evictLNInternal(int index, boolean ifFetchedCold)
throws DatabaseException {
final Node n = getTarget(index);
assert(n == null || n instanceof LN);
if (n == null) {
return 0;
}
final LN ln = (LN) n;
if (ifFetchedCold && !ln.getFetchedCold()) {
return 0;
}
/*
* Don't evict MapLNs for open databases (LN.isEvictable) [#13415].
*/
if (!ln.isEvictable(getLsn(index))) {
return -1;
}
/*
* Log target if necessary. Do not allow eviction since we evict
* here and that would cause double-counting of the memory freed.
*/
logDirtyLN(index, ln, false /*allowEviction*/);
/* Clear target. */
setTarget(index, null);
final OffHeapCache ohCache = getOffHeapCache();
if (ohCache.isEnabled()) {
ohCache.storeEvictedLN(this, index, ln);
}
return n.getMemorySizeIncludedByParent();
}
/**
* @see IN#logDirtyChildren
*/
@Override
public void logDirtyChildren()
throws DatabaseException {
/* Look for LNs that are dirty or have never been logged before. */
for (int i = 0; i < getNEntries(); i++) {
Node node = getTarget(i);
if (node != null) {
logDirtyLN(i, (LN) node, true /*allowEviction*/);
}
}
}
/**
* Logs the LN at the given index if it is dirty.
*/
private void logDirtyLN(
int idx,
LN ln,
boolean allowEviction)
throws DatabaseException {
final long currLsn = getLsn(idx);
final boolean force = getDatabase().isDeferredWriteMode() &&
DbLsn.isTransientOrNull(currLsn);
if (force || ln.isDirty()) {
final DatabaseImpl dbImpl = getDatabase();
final EnvironmentImpl envImpl = dbImpl.getEnv();
/* Only deferred write databases should have dirty LNs. */
assert(dbImpl.isDeferredWriteMode() || ln instanceof MapLN);
/*
* Do not lock while logging. Locking of new LSN is performed by
* lockAfterLsnChange. This should never be part of the replication
* stream, because this is a deferred-write DB.
*
* No reason to include the previous record version in this logrec
* because this logrec will never be undone (DW databases are
* non-transactional)
*/
final LogItem logItem = ln.log(
envImpl, dbImpl, null /*locker*/, null /*writeLockInfo*/,
isEmbeddedLN(idx), getKey(idx),
getExpiration(idx), isExpirationInHours(),
isEmbeddedLN(idx), currLsn, getLastLoggedSize(idx),
false/*isInsertion*/, true /*backgroundIO*/,
ReplicationContext.NO_REPLICATE);
updateEntry(
idx, logItem.lsn, ln.getVLSNSequence(),
logItem.size);
/* Lock new LSN on behalf of existing lockers. */
CursorImpl.lockAfterLsnChange(
dbImpl, currLsn, logItem.lsn, null /*excludeLocker*/);
/*
* It is desirable to evict a non-dirty LN that is immediately
* obsolete, because it will never be fetched again.
*/
if (allowEviction &&
(databaseImpl.isLNImmediatelyObsolete() ||
isEmbeddedLN(idx))) {
evictLN(idx);
}
}
}
/*
* Logging support
*/
/**
* @see IN#getLogType
*/
@Override
public LogEntryType getLogType() {
return LogEntryType.LOG_BIN;
}
/**
* Overrides the IN method to account for deltas.
* Public for unit testing.
*/
@Override
public long getLastDeltaLsn() {
return lastDeltaVersion;
}
public void setLastDeltaLsn(long lsn) {
lastDeltaVersion = lsn;
}
/*
* BIN delta support
*/
public int getFullBinNEntries() {
if (isBINDelta()) {
return fullBinNEntries;
} else {
return nEntries;
}
}
public void setFullBinNEntries(int n) {
assert(isBINDelta(false));
fullBinNEntries = n;
}
void incFullBinNEntries() {
assert(isBINDelta());
++fullBinNEntries;
}
public int getFullBinMaxEntries() {
if (isBINDelta()) {
return fullBinMaxEntries;
} else {
return getMaxEntries();
}
}
public void setFullBinMaxEntries(int n) {
assert(isBINDelta(false));
fullBinMaxEntries = n;
}
int getDeltaCapacity(int numDirtyEntries) {
boolean blindOps =
(getEnv().allowBlindOps() || getEnv().allowBlindPuts());
if (isBINDelta()) {
return getMaxEntries();
}
if (blindOps) {
return (getNEntries() * databaseImpl.getBinDeltaPercent()) / 100;
}
return numDirtyEntries;
}
boolean allowBlindPuts() {
boolean res = getEnv().allowBlindPuts();
if (res) {
res = res && databaseImpl.hasBtreeBinaryEqualityComparator();
res = res && databaseImpl.hasDuplicateBinaryEqualityComparator();
}
return res;
}
/*
* It is called in 3 cases listed below. In all cases, if blind puts are
* not allowed, the method returns null.
*
* 1. A full BIN is being mutated to an in-memory delta. A new filter will
* be created here and will be stored in the delta by the caller.
* 2. A full BIN is being logged as a delta. A new filter will be created
* here and will be written in the delta logrec by the caller.
* 3. An in-memory BIN-delta is being logged. If the delta has a bloom
* filter already, that filter will be returned and written into the
* logrec. The delta may not have a filter already because it was read
* from an older-version logfile; in this case we return null.
*/
byte[] createBloomFilter() {
assert(bloomFilter == null || isBINDelta());
boolean blindPuts = allowBlindPuts();
if (!blindPuts) {
assert(bloomFilter == null);
return null;
}
if (bloomFilter != null) {
/*
* We are here because we are logging a delta that has a filter
* already. We just need to log the existing filter.
*/
return bloomFilter;
}
if (isBINDelta()) {
return null;
}
int numKeys = getNEntries() - getNDeltas();
int nbytes = BINDeltaBloomFilter.getByteSize(numKeys);
byte[] bf = new byte[nbytes];
BINDeltaBloomFilter.HashContext hc =
new BINDeltaBloomFilter.HashContext();
if (keyPrefix != null) {
hc.hashKeyPrefix(keyPrefix);
}
for (int i = 0; i < getNEntries(); ++i) {
if (isDirty(i)) {
continue;
}
byte[] suffix = entryKeys.getKey(i, haveEmbeddedData(i));
if (suffix == null) {
suffix = Key.EMPTY_KEY;
}
BINDeltaBloomFilter.add(bf, suffix, hc);
}
return bf;
}
public boolean mayHaveKeyInFullBin(byte[] key) {
assert(isBINDelta());
if (bloomFilter == null) {
return true;
}
return BINDeltaBloomFilter.contains(bloomFilter, key);
}
/*
* Used in IN.getLogSize() only
*/
int getBloomFilterLogSize() {
if (!allowBlindPuts()) {
return 0;
}
if (isBINDelta()) {
if (bloomFilter != null) {
return BINDeltaBloomFilter.getLogSize(bloomFilter);
}
return 0;
} else {
assert(bloomFilter == null);
int numKeys = getNEntries() - getNDeltas();
return BINDeltaBloomFilter.getLogSize(numKeys);
}
}
boolean isDeltaProhibited() {
return (getProhibitNextDelta() ||
getDatabase().isDeferredWriteMode() ||
getLastFullLsn() == DbLsn.NULL_LSN);
}
/**
* Decide whether to log a full or partial BIN, depending on the ratio of
* the delta size to full BIN size.
*
* Other factors are taken into account:
* + a delta cannot be logged if the BIN has never been logged before
* + deltas are not currently supported for DeferredWrite databases
* + this particular delta may have been prohibited because the cleaner is
* migrating the BIN or a dirty slot has been removed
* + if there are no dirty slots, we might as well log a full BIN
*
* The restriction on using BIN-deltas for deferred-write DBs is for
* reasons that are probably no longer relevant. However, we have not
* tested deltas with DW, so we still prohibit them. Because BIN-deltas
* may be in cache at the time a DB is opened in DW mode, a workaround is
* currently necessary: see Database.mutateDeferredWriteBINDeltas.
*
* @return true if we should log the deltas of this BIN
*/
public boolean shouldLogDelta() {
if (isBINDelta()) {
/*
* Cannot assert that db is not in DeferredWrite mode.
* See Database.mutateDeferredWriteBINDeltas.
*/
assert !getProhibitNextDelta();
assert getLastFullLsn() != DbLsn.NULL_LSN;
return true;
}
/* Cheapest checks first. */
if (isDeltaProhibited()) {
return false;
}
/* Must count deltas to check further. */
final int numDeltas = getNDeltas();
/* A delta with zero items is not valid. */
if (numDeltas <= 0) {
return false;
}
/* Check the configured BinDeltaPercent. */
final int deltaLimit =
(getNEntries() * databaseImpl.getBinDeltaPercent()) / 100;
return numDeltas <= deltaLimit;
}
/**
* Returns whether mutateToBINDelta can be called.
*/
public boolean canMutateToBINDelta() {
return (!isBINDelta() &&
shouldLogDelta() &&
(nCursors() == 0));
}
/**
* Mutate to a delta (discard non-dirty entries and resize arrays).
*
* This method must be called with this node latched exclusively, and
* canMutateToBINDelta must return true.
*
* @return the number of bytes freed.
*/
public long mutateToBINDelta() {
assert isLatchExclusiveOwner();
assert canMutateToBINDelta();
if (getInListResident()) {
getEnv().getInMemoryINs().updateBINDeltaStat(1);
}
final long oldSize = getInMemorySize();
final int nDeltas = getNDeltas();
final int capacity = getDeltaCapacity(nDeltas);
bloomFilter = createBloomFilter();
initBINDelta(this, nDeltas, capacity, true);
return oldSize - getInMemorySize();
}
/**
* This method assumes that "this" BIN is a delta and creates a clone of
* it. It is currently used by the DiskOrderedScanner only. The method
* does not clone the targets array.
*/
public BIN cloneBINDelta() {
assert(isBINDelta());
final BIN bin = new BIN(
databaseImpl, getIdentifierKey(), 0/*capacity*/, getLevel());
bin.nodeId = nodeId;
bin.flags = flags;
bin.lastFullVersion = lastFullVersion;
final int nDeltas = getNDeltas();
initBINDelta(bin, nDeltas, nDeltas, false);
return bin;
}
/**
* Replaces the contents of destBIN with the deltas in this BIN.
*/
private void initBINDelta(
final BIN destBIN,
final int nDeltas,
final int capacity,
final boolean copyTargets) {
long[] longLSNs = null;
byte[] compactLSNs = null;
if (entryLsnLongArray == null) {
compactLSNs = new byte[nDeltas * 4];
} else {
longLSNs = new long[nDeltas];
}
final long[] vlsns = new long[nDeltas];
final int[] sizes = new int[nDeltas];
final byte[][] keys = new byte[nDeltas][];
final byte[] states = new byte[nDeltas];
long[] memIds = null;
Node[] targets = null;
int[] expiration = null;
if (copyTargets) {
targets = new Node[nDeltas];
memIds = new long[nDeltas];
}
if (expirationBase != -1) {
expiration = new int[nDeltas];
}
int j = 0;
for (int i = 0; i < getNEntries(); i += 1) {
if (!isDirty(i)) {
freeOffHeapLN(i);
continue;
}
if (entryLsnLongArray == null) {
int doff = j << 2;
int soff = i << 2;
compactLSNs[doff] = entryLsnByteArray[soff];
compactLSNs[doff+1] = entryLsnByteArray[soff+1];
compactLSNs[doff+2] = entryLsnByteArray[soff+2];
compactLSNs[doff+3] = entryLsnByteArray[soff+3];
} else {
longLSNs[j] = getLsn(i);
}
keys[j] = entryKeys.get(i);
states[j] = getState(i);
if (targets != null) {
targets[j] = getTarget(i);
}
if (memIds != null) {
memIds[j] = getOffHeapLNId(i);
}
vlsns[j] = getCachedVLSN(i);
sizes[j] = getLastLoggedSize(i);
if (expiration != null) {
expiration[j] = getExpiration(i);
}
j += 1;
}
/*
* Do this before resetContent() because destBIN and "this" may be the
* same java obj
*/
destBIN.fullBinNEntries = getFullBinNEntries();
destBIN.fullBinMaxEntries = getFullBinMaxEntries();
destBIN.resetContent(
capacity, nDeltas,
baseFileNumber, compactLSNs, longLSNs,
states, keyPrefix, keys, targets,
sizes, memIds, vlsns,
expiration, isExpirationInHours());
destBIN.setBINDelta(true);
destBIN.compactMemory();
}
/**
* Replaces the contents of this BIN with the given contents.
* Used in mutating a full BIN to a BIN-delta or for creating
* a new BIN delta with the given content.
*/
private void resetContent(
final int capacity,
final int newNEntries,
final long baseFileNumber,
final byte[] compactLSNs,
final long[] longLSNs,
final byte[] states,
final byte[] keyPrefix,
final byte[][] keys,
final Node[] targets,
final int[] loggedSizes,
final long[] memIds,
final long[] vlsns,
final int[] expiration,
final boolean expirationInHours) {
updateRepCacheStats(false);
nEntries = newNEntries;
this.baseFileNumber = baseFileNumber;
if (longLSNs == null) {
entryLsnByteArray = new byte[capacity << 2];
entryLsnLongArray = null;
} else {
entryLsnByteArray = null;
entryLsnLongArray = new long[capacity];
}
this.keyPrefix = keyPrefix;
entryKeys = new INKeyRep.Default(capacity);
entryTargets = INTargetRep.NONE;
vlsnCache = EMPTY_VLSNS;
lastLoggedSizes = EMPTY_LAST_LOGGED_SIZES;
expirationValues = EMPTY_EXPIRATION;
expirationBase = -1;
offHeapLNIds = EMPTY_OFFHEAP_LN_IDS;
updateRepCacheStats(true);
entryStates = new byte[capacity];
for (int i = 0; i < newNEntries; i += 1) {
if (longLSNs == null) {
int off = i << 2;
entryLsnByteArray[off] = compactLSNs[off];
entryLsnByteArray[off+1] = compactLSNs[off+1];
entryLsnByteArray[off+2] = compactLSNs[off+2];
entryLsnByteArray[off+3] = compactLSNs[off+3];
} else {
entryLsnLongArray[i] = longLSNs[i];
}
entryKeys = entryKeys.set(i, keys[i], this);
entryStates[i] = states[i];
if (targets != null) {
entryTargets = entryTargets.set(i, targets[i], this);
}
if (memIds != null) {
setOffHeapLNId(i, memIds[i]);
}
if (expiration != null) {
setExpiration(i, expiration[i], expirationInHours);
}
setLastLoggedSizeUnconditional(i, loggedSizes[i]);
setCachedVLSNUnconditional(i, vlsns[i]);
}
updateMemorySize(inMemorySize, computeMemorySize());
}
/**
* Fetch the full BIN and apply the deltas in this BIN to it, then use the
* merged result to replace the contents of this BIN.
*
* This method must be called with this node latched exclusively. If 'this'
* is not a delta, this method does nothing.
*/
@Override
public void mutateToFullBIN(boolean leaveFreeSlot) {
if (!isBINDelta()) {
return;
}
final BIN fullBIN = fetchFullBIN(databaseImpl);
mutateToFullBIN(fullBIN, leaveFreeSlot);
getEvictor().incFullBINMissStats();
}
/**
* Mutates this delta to a full BIN by applying this delta to the fullBIN
* param and then replacing this BIN's contents with it.
*
* This method must be called with this node latched exclusively. 'this'
* must be a delta.
*
* After mutation, the full BIN is compressed and compacted. The
* compression is particularly important, since BIN-deltas in cache cannot
* be compressed.
*
* The method is public because it is called directly from FileProcessor
* when it finds a BIN that must be migrated. In that case, fullBIN is a
* full BIN that has just been read from the log, and it is not part of
* the memory-resident tree.
*/
public void mutateToFullBIN(BIN fullBIN, boolean leaveFreeSlot) {
assert isLatchExclusiveOwner();
assert isBINDelta() : this;
byte[][] keys = null;
int i = 0;
if (cursorSet != null) {
keys = new byte[cursorSet.size()][];
for (CursorImpl cursor : cursorSet) {
final int index = cursor.getIndex();
if (index >= 0 && index < getNEntries()) {
keys[i] = cursor.getCurrentKey(true/*isLatched*/);
}
++i;
}
}
reconstituteBIN(databaseImpl, fullBIN, leaveFreeSlot);
resetContent(fullBIN);
setBINDelta(false);
/*
* The fullBIN identifierKey may have changed when reconstituteBIN
* called BIN.compress. We cannot call setIdentifierKey in resetContent
* because assert(!isBINDelta()) will fail, so call it here.
*/
setIdentifierKey(fullBIN.getIdentifierKey(), false);
if (cursorSet != null) {
i = 0;
for (CursorImpl cursor : cursorSet) {
if (keys[i] != null) {
/*
* Do not ask for an exact match from findEntry because if
* the cursor was on a KD slot, findEntry would return -1.
*/
int index = findEntry(keys[i], true, false);
if ((index & IN.EXACT_MATCH) == 0) {
throw EnvironmentFailureException.unexpectedState(
getEnv(), "Failed to reposition cursor during " +
"mutation of a BIN delta to a full BIN");
}
index &= ~IN.EXACT_MATCH;
assert(index >= 0 && index < getNEntries());
cursor.setIndex(index);
}
++i;
}
}
getEnv().lazyCompress(this);
compactMemory();
if (getInListResident()) {
getEnv().getInMemoryINs().updateBINDeltaStat(-1);
}
}
private BIN fetchFullBIN(DatabaseImpl dbImpl) {
final EnvironmentImpl envImpl = dbImpl.getEnv();
final long lsn = getLastFullLsn();
try {
return (BIN)
envImpl.getLogManager().getEntryHandleNotFound(lsn);
} catch (EnvironmentFailureException e) {
e.addErrorMessage(makeFetchErrorMsg(null, lsn, -1));
throw e;
} catch (RuntimeException e) {
throw new EnvironmentFailureException(
envImpl, EnvironmentFailureReason.LOG_INTEGRITY,
makeFetchErrorMsg(e.toString(), lsn, -1), e);
}
}
/**
* Replaces the contents of this BIN with the contents of the given BIN,
* including lsns, states, keys and targets. Key prefixing and key/target
* representations will also be those of the given BIN.
*/
private void resetContent(final BIN other) {
updateRepCacheStats(false);
nEntries = other.nEntries;
baseFileNumber = other.baseFileNumber;
entryLsnByteArray = other.entryLsnByteArray;
entryLsnLongArray = other.entryLsnLongArray;
keyPrefix = other.keyPrefix;
entryKeys = other.entryKeys;
entryTargets = other.entryTargets;
entryStates = other.entryStates;
lastLoggedSizes = other.lastLoggedSizes;
expirationValues = other.expirationValues;
expirationBase = other.expirationBase;
offHeapLNIds = other.offHeapLNIds;
assert (getOffHeapLruId() >= 0) || !hasOffHeapLNs();
vlsnCache = other.vlsnCache;
bloomFilter = null;
updateMemorySize(inMemorySize, computeMemorySize());
updateRepCacheStats(true);
}
private void resize(final int newCapacity) {
assert newCapacity >= getNEntries();
updateRepCacheStats(false);
if (entryLsnByteArray != null) {
entryLsnByteArray = Arrays.copyOfRange(
entryLsnByteArray, 0, newCapacity * 4);
}
if (entryLsnLongArray != null) {
entryLsnLongArray = Arrays.copyOfRange(
entryLsnLongArray, 0, newCapacity);
}
if (entryStates != null) {
entryStates = Arrays.copyOfRange(
entryStates, 0, newCapacity);
}
entryKeys = entryKeys.resize(newCapacity);
entryTargets = entryTargets.resize(newCapacity);
lastLoggedSizes = lastLoggedSizes.resize(newCapacity);
expirationValues = expirationValues.resize(newCapacity);
offHeapLNIds = offHeapLNIds.resize(newCapacity);
vlsnCache = vlsnCache.resize(newCapacity);
updateMemorySize(inMemorySize, computeMemorySize());
updateRepCacheStats(true);
}
/**
* Create a BIN by fetching its most recent full version from the log and
* applying to it the deltas in this BIN delta. The new BIN is not added
* to the INList or the BTree.
*
* Called from DiskOrderedScanner.fetchAndProcessBINs() and
* DiskOrderedScanner.accumulateLNs()
*
* @return the full BIN with deltas applied.
*/
public BIN reconstituteBIN(DatabaseImpl dbImpl) {
final BIN fullBIN = fetchFullBIN(dbImpl);
reconstituteBIN(dbImpl, fullBIN, false /*leaveFreeSlot*/);
return fullBIN;
}
/**
* Given a full version BIN, apply to it the deltas in this BIN delta. The
* fullBIN will then be complete, but its memory will not be compacted.
*
* Called from mutateToFullBIN() above and from SortedLSNTreewalker.
*
* @param leaveFreeSlot should be true if a slot will be inserted into the
* resulting full BIN, without first checking whether the full BIN must be
* split, and performing the split if necessary. If this param is true, the
* returned BIN will contain at least one free slot. If this param is
* false, a BIN with no free slots may be returned. For example, it is
* important that false is passed when a split will be performed, since if
* true were passed, the BIN would grow beyond its bounds unnecessarily.
*/
public void reconstituteBIN(
DatabaseImpl dbImpl,
BIN fullBIN,
boolean leaveFreeSlot) {
fullBIN.setDatabase(dbImpl);
fullBIN.latch(CacheMode.UNCHANGED);
try {
if (databaseImpl == null) {
setDatabase(dbImpl);
}
assert fullBIN.getOffHeapLruId() < 0;
assert !fullBIN.hasOffHeapLNs();
/*
* The BIN's lastFullLsn is set here, while its lastLoggedLsn is
* set by postFetchInit or postRecoveryInit.
*/
fullBIN.setLastFullLsn(getLastFullLsn());
/*
* Compress the full BIN before applying deltas, to handle the
* following scenario: Non-dirty slots were compressed away
* earlier, leaving room for inserted records, and a delta was
* logged with the inserted records. The full version of the BIN
* (after compression) was not logged, because the BIN is not
* dirtied when non-dirty slots were compressed away. If we don't
* compress here, there may not be room in the original BIN for the
* slots inserted when applying the deltas.
*
* However, during recovery we can't compress because locking is
* not used during recovery, and the compressor may delete a slot
* for a record that is part of an active transaction. In addition,
* even when compression is performed here, it is possible that it
* doesn't compress all deleted/expired slots that were compressed
* originally in the scenario described, for one of the following
* reasons:
*
* + The record is locked temporarily by a read operation that
* will skip the record. Note that the compressor uses
* non-blocking locking.
*
* + If expiration has been disabled, or the system clock has been
* changed, slots that were expired originally may not be
* expired now.
*
* Therefore, in all cases we enlarge the BIN if necessary to hold
* all slots to be inserted when applying the delta. An extra slot
* is added if leaveFreeSlot is true, to handle cases
* where mutation to a full BIN is performed after calling
* Tree.searchSplitsAllowed, or one of the methods that calls it
* such as Tree.findBinForInsert and Tree.getParentBINForChildLN.
* If the search returns a BIN-delta without splitting, and then we
* must mutate to full BIN in order to insert, because blind
* insertions do not apply, then the scenario described can occur.
*
* If the BIN is enlarged, we add it to the compressor queue so it
* will be shrunk back down to the Database's configured maxEntries
* during normal compression.
*/
if (!dbImpl.getEnv().isInInit()) {
fullBIN.compress(
false /*compressDirtySlots*/, null /*localTracker*/);
}
int nInsertions = leaveFreeSlot ? 1 : 0;
for (int i = 0; i < getNEntries(); i += 1) {
final int foundIndex = fullBIN.findEntry(
getKey(i), true, false);
if (foundIndex < 0 || (foundIndex & IN.EXACT_MATCH) == 0) {
nInsertions += 1;
}
}
final int maxEntries = nInsertions + fullBIN.getNEntries();
if (maxEntries > fullBIN.getMaxEntries()) {
fullBIN.resize(maxEntries);
dbImpl.getEnv().addToCompressorQueue(fullBIN);
}
/* Process each delta. */
for (int i = 0; i < getNEntries(); i++) {
assert isDirty(i) : this;
fullBIN.applyDelta(
getKey(i), getData(i), getLsn(i), getState(i),
getLastLoggedSize(i), getOffHeapLNId(i),
getCachedVLSN(i), getTarget(i),
getExpiration(i), isExpirationInHours());
}
/*
* The applied deltas will leave some slots dirty, which is
* necessary as a record of changes that will be included in the
* next delta. However, the BIN itself should not be dirty,
* because this delta is a persistent record of those changes.
*/
fullBIN.setDirty(false);
} finally {
fullBIN.releaseLatch();
}
}
/**
* Apply (insert, update) a given delta slot in this full BIN.
* Note: also called from OldBINDelta class.
*/
void applyDelta(
final byte[] key,
final byte[] data,
final long lsn,
final byte state,
final int lastLoggedSize,
final long ohLnId,
final long vlsn,
final Node child,
final int expiration,
final boolean expirationInHours) {
/*
* The delta is the authoritative version of the entry. In all cases,
* it should supersede the entry in the full BIN. This is true even if
* the BIN Delta's entry is knownDeleted or if the full BIN's version
* is knownDeleted. Therefore we use the flavor of findEntry that will
* return a knownDeleted entry if the entry key matches (i.e. true,
* false) but still indicates exact matches with the return index.
* findEntry only returns deleted entries if third arg is false, but we
* still need to know if it's an exact match or not so indicateExact is
* true.
*/
int foundIndex = findEntry(key, true, false);
if (foundIndex >= 0 && (foundIndex & IN.EXACT_MATCH) != 0) {
foundIndex &= ~IN.EXACT_MATCH;
/*
* The entry exists in the full version, update it with the delta
* info. Note that all state flags should be restored [#22848].
*/
applyDeltaSlot(
foundIndex, child, lsn, lastLoggedSize, state, key, data);
} else {
/*
* The entry doesn't exist, insert the delta entry. We insert the
* entry even when it is known or pending deleted, since the
* deleted (and dirty) entry will be needed to log the next delta.
* [#20737]
*/
final int result = insertEntry1(
child, key, data, lsn, state, false/*blindInsertion*/);
assert (result & INSERT_SUCCESS) != 0;
foundIndex = result & ~IN.INSERT_SUCCESS;
setLastLoggedSizeUnconditional(foundIndex, lastLoggedSize);
}
setCachedVLSNUnconditional(foundIndex, vlsn);
setOffHeapLNId(foundIndex, ohLnId);
setExpiration(foundIndex, expiration, expirationInHours);
}
/*
* DbStat support.
*/
@Override
void accumulateStats(TreeWalkerStatsAccumulator acc) {
acc.processBIN(this, Long.valueOf(getNodeId()), getLevel());
}
}