src/main/java/com/sleepycat/je/tree/CountEstimator.java - doris-thirdparty - Git at Google

 /*-
  * Copyright (C) 2002, 2018, Oracle and/or its affiliates. All rights reserved.
  *
  * This file was distributed by Oracle as part of a version of Oracle Berkeley
  * DB Java Edition made available at:
  *
  * http://www.oracle.com/technetwork/database/database-technologies/berkeleydb/downloads/index.html
  *
  * Please see the LICENSE file included in the top-level directory of the
  * appropriate version of Oracle Berkeley DB Java Edition for a copy of the
  * license and additional information.
  */

 package com.sleepycat.je.tree;

 import java.util.ArrayList;
 import java.util.List;

 import com.sleepycat.je.EnvironmentFailureException;
 import com.sleepycat.je.OperationResult;
 import com.sleepycat.je.dbi.CursorImpl;
 import com.sleepycat.je.dbi.DatabaseImpl;
 import com.sleepycat.je.txn.LockType;

 /**
  * Estimates the number of non-deleted BIN entries between two end points,
  * using information returned in TrackingInfo from Tree.getParentINForChildIN.
  * Used for estimating dup counts, e.g., for join query optimization.  Accuracy
  * is limited by the number of samples taken to compute the average number of
  * entries at each level.  Currently only two samples (at the end points) are
  * taken, and because the tree is not balanced making the number of entries
  * highly variable, the count can easily be off by a factor of two.
  */
 public class CountEstimator {

     /* If exceeded, there must be a bug of some kind. */
     private static final int MAX_RETRIES_AFTER_SPLIT = 100;

     /**
      * Returns an estimate of the number of records between two end points
      * specified by begin/end cursor positions.
      */
     public static long count(DatabaseImpl dbImpl,
                              CursorImpl beginCursor,
                              boolean beginInclusive,
                              CursorImpl endCursor,
                              boolean endInclusive) {

         /* If the two cursors are at the same position, return 1. */
         if (beginCursor.isOnSamePosition(endCursor)) {
             return 1;
         }

         /* Compute estimate for cursors at different positions. */
         final CountEstimator estimator = new CountEstimator(dbImpl);

         return estimator.count(beginCursor, endCursor) +
                (beginInclusive ? 1 : 0) +
                (endInclusive ? 1 : 0);
     }

     private final DatabaseImpl dbImpl;

     private List<TrackingInfo> beginStack;
     private List<TrackingInfo> endStack;

     private final List<List<TrackingInfo>> avgEntriesStacks =
         new ArrayList<List<TrackingInfo>>();

     private int levelCount;
     private int rootLevel;
     private float[] avgEntries;

     private CountEstimator(DatabaseImpl dbImpl) {
         this.dbImpl = dbImpl;
     }

     private long count(CursorImpl beginCursor, CursorImpl endCursor) {

         for (int numRetries = 0;; numRetries += 1) {

             /*
              * If we have retried too many times, give up.  This is probably
              * due to a bug of some kind, and we shouldn't loop forever.
              */
             if (numRetries > MAX_RETRIES_AFTER_SPLIT) {
                 throw EnvironmentFailureException.unexpectedState();
             }

             /*
              * Set up the initial info for the computation.  Retry if a split
              * occurs.
              */
             beginStack = beginCursor.getAncestorPath();
             if (beginStack == null) {
                 continue;
             }
             endStack = endCursor.getAncestorPath();
             if (endStack == null) {
                 continue;
             }

             if (!findCommonAncestor()) {
                 continue;
             }

             /* Get the the average entries from the two end points.  */
             getAvgEntries(beginCursor, endCursor);

             /*
              * Return the count.  FUTURE: Taking more samples between the two
              * end points would increase accuracy.
              */
             return countTotal();
         }
     }

     /**
      * Find the common ancestor node for the two end points, which we'll call
      * the root level.  If no common ancestor can be found, return false to
      * restart the process, because a split must have occurred in between
      * getting the two stacks for the end points.
      */
     private boolean findCommonAncestor() {

         levelCount = beginStack.size();
         if (levelCount != endStack.size()) {
             /* Must have been a root split. */
             return false;
         }

         rootLevel = -1;

         for (int level = levelCount - 1; level >= 0; level -= 1) {

             if (beginStack.get(level).nodeId == endStack.get(level).nodeId) {
                 rootLevel = level;
                 break;
             }
         }
         if (rootLevel < 0) {
             /* Must have been a split. */
             return false;
         }
         return true;
     }

     /**
      * This method starts with a preliminary average using just the two end
      * points.
      */
     private void getAvgEntries(CursorImpl beginCursor, CursorImpl endCursor) {

         avgEntriesStacks.clear();

         if (!addAvgEntriesSample(beginStack)) {
             sampleNextBIN(beginCursor, true /*moveForward*/);
         }

         if (!addAvgEntriesSample(endStack)) {
             sampleNextBIN(endCursor, false /*moveForward*/);
         }

         computeAvgEntries();
     }

     /**
      * FUTURE: use internal skip method instead, saving a btree lookup.
      */
     private void sampleNextBIN(
         CursorImpl beginOrEndCursor,
         boolean moveForward) {

         final CursorImpl cursor =
             beginOrEndCursor.cloneCursor(true /*samePosition*/);

         try {
             cursor.latchBIN();
             if (moveForward) {
                 cursor.setOnLastSlot();
             } else {
                 cursor.setOnFirstSlot();
             }

             final OperationResult result = cursor.getNext(
                 null /*foundKey*/, null /*foundData*/,
                 LockType.NONE, false /*dirtyReadAll*/,
                 moveForward, true /*alreadyLatched*/,
                 null /*rangeConstraint*/);

             if (result != null) {
                 final List<TrackingInfo> stack = cursor.getAncestorPath();
                 if (stack != null) {
                     addAvgEntriesSample(stack);
                 }
             }
         } finally {
             cursor.close();
         }
     }

     /**
      * At each level we compute the average number of entries.  This will be
      * used as a multipler to estimate the number of nodes for any IN at that
      * level.
      */
     private void computeAvgEntries() {

         avgEntries = new float[levelCount];

         avgEntries[levelCount - 1] = 1.0F;

         if (avgEntriesStacks.size() == 0) {
             return;
         }

         for (int level = levelCount - 1; level > 0; level -= 1) {
             long totalEntries = 0;
             for (List<TrackingInfo> stack : avgEntriesStacks) {
                 totalEntries += stack.get(level).entries;
             }
             final float avg = totalEntries / ((float) avgEntriesStacks.size());
             avgEntries[level - 1] = avg * avgEntries[level];
         }
     }

     private boolean addAvgEntriesSample(List<TrackingInfo> stack) {
         if (isFirstBIN(stack) || isLastBIN(stack)) {
             return false;
         }
         avgEntriesStacks.add(stack);
         return true;
     }

     private boolean isFirstBIN(List<TrackingInfo> stack) {
         for (int i = 0; i < stack.size() - 1; i += 1) {
             final TrackingInfo info = stack.get(i);
             if (info.index != 0) {
                 return false;
             }
         }
         return true;
     }

     private boolean isLastBIN(List<TrackingInfo> stack) {
         for (int i = 0; i < stack.size() - 1; i += 1) {
             final TrackingInfo info = stack.get(i);
             if (info.index != info.entries - 1) {
                 return false;
             }
         }
         return true;
     }

     /**
      * Count the total for each node that is between the two end points.
      */
     private long countTotal() {
         long total = 0;

         /* Add nodes between the end points at the root level. */
         final int rootIndex1 = beginStack.get(rootLevel).index + 1;
         final int rootIndex2 = endStack.get(rootLevel).index;
         if (rootIndex2 > rootIndex1) {
             total += Math.round((rootIndex2 - rootIndex1) *
                                 avgEntries[rootLevel]);
         }

         /* Add nodes under the end points at lower levels. */
         for (int level = rootLevel + 1; level < levelCount; level += 1) {

             /* Add nodes under left end point that are to its right. */
             final int leftIndex = beginStack.get(level).index;
             final int lastIndex = beginStack.get(level).entries - 1;
             if (lastIndex > leftIndex) {
                 total += Math.round((lastIndex - leftIndex) *
                                     avgEntries[level]);
             }

             /* Add nodes under right end point that are to its left. */
             final int rightIndex = endStack.get(level).index;
             final int firstIndex = 0;
             if (rightIndex > firstIndex) {
                 total += Math.round((rightIndex - firstIndex) *
                                     avgEntries[level]);
             }
         }

         return total;
     }

     /* For future use, if getKeyRatios is exposed in the API. */
     static class KeyRatios {
         final double less;
         final double equal;
         final double greater;

         KeyRatios(double less, double equal, double greater) {
             this.less = less;
             this.equal = equal;
             this.greater = greater;
         }

         @Override
         public String toString() {
             return "less: " + less +
                    " equal: " + equal +
                    " greater: " + greater;
         }
     }

     /*
      * For future use, if getKeyRatios is exposed in the API.  Be sure to test
      * boundary conditions when index is 0 or nEntries.
      *
      * Algorithm copied from __bam_key_range in BDB btree/bt_stat.c.
      */
     static KeyRatios getKeyRatios(List<TrackingInfo> infoByLevel,
                                   boolean exact) {
         double factor = 1.0;
         double less = 0.0;
         double greater = 0.0;

         /*
          * At each level we know that INs greater than index contain keys
          * greater than what we are looking for and those less than index are
          * less than.  The one pointed to by index may have some less, some
          * greater or even equal.  If index is equal to the number of entries,
          * then the key is out of range and everything is less.
          */
         for (final TrackingInfo info : infoByLevel) {
             if (info.index == 0) {
                 greater += (factor * (info.entries - 1)) / info.entries;
             } else if (info.index == info.entries) {
                 less += factor;
             } else {
                 less += (factor * info.index) / info.entries;
                 greater += (factor * ((info.entries - info.index) - 1)) /
                            info.entries;
             }

             /* Factor at next level down is 1/n'th the amount at this level. */
             factor /= info.entries;

             /*
             System.out.println("factor: " + factor +
                                " less: " + less +
                                " greater: " + greater);
             */
         }

         /*
          * If there was an exact match then assign the 1/n'th factor to the key
          * itself.  Otherwise that factor belongs to those greater than the
          * key, unless the key was out of range.
          */
         final double equal;
         if (exact) {
             equal = factor;
         } else {
             if (less != 1.0) {
                 greater += factor;
             }
             equal = 0.0;
         }

         return new KeyRatios(less, equal, greater);
     }
 }
	/*-
	* Copyright (C) 2002, 2018, Oracle and/or its affiliates. All rights reserved.
	*
	* This file was distributed by Oracle as part of a version of Oracle Berkeley
	* DB Java Edition made available at:
	*
	* http://www.oracle.com/technetwork/database/database-technologies/berkeleydb/downloads/index.html
	*
	* Please see the LICENSE file included in the top-level directory of the
	* appropriate version of Oracle Berkeley DB Java Edition for a copy of the
	* license and additional information.
	*/

	package com.sleepycat.je.tree;

	import java.util.ArrayList;
	import java.util.List;

	import com.sleepycat.je.EnvironmentFailureException;
	import com.sleepycat.je.OperationResult;
	import com.sleepycat.je.dbi.CursorImpl;
	import com.sleepycat.je.dbi.DatabaseImpl;
	import com.sleepycat.je.txn.LockType;

	/**
	* Estimates the number of non-deleted BIN entries between two end points,
	* using information returned in TrackingInfo from Tree.getParentINForChildIN.
	* Used for estimating dup counts, e.g., for join query optimization. Accuracy
	* is limited by the number of samples taken to compute the average number of
	* entries at each level. Currently only two samples (at the end points) are
	* taken, and because the tree is not balanced making the number of entries
	* highly variable, the count can easily be off by a factor of two.
	*/
	public class CountEstimator {

	/* If exceeded, there must be a bug of some kind. */
	private static final int MAX_RETRIES_AFTER_SPLIT = 100;

	/**
	* Returns an estimate of the number of records between two end points
	* specified by begin/end cursor positions.
	*/
	public static long count(DatabaseImpl dbImpl,
	CursorImpl beginCursor,
	boolean beginInclusive,
	CursorImpl endCursor,
	boolean endInclusive) {

	/* If the two cursors are at the same position, return 1. */
	if (beginCursor.isOnSamePosition(endCursor)) {
	return 1;
	}

	/* Compute estimate for cursors at different positions. */
	final CountEstimator estimator = new CountEstimator(dbImpl);

	return estimator.count(beginCursor, endCursor) +
	(beginInclusive ? 1 : 0) +
	(endInclusive ? 1 : 0);
	}

	private final DatabaseImpl dbImpl;

	private List<TrackingInfo> beginStack;
	private List<TrackingInfo> endStack;

	private final List<List<TrackingInfo>> avgEntriesStacks =
	new ArrayList<List<TrackingInfo>>();

	private int levelCount;
	private int rootLevel;
	private float[] avgEntries;

	private CountEstimator(DatabaseImpl dbImpl) {
	this.dbImpl = dbImpl;
	}

	private long count(CursorImpl beginCursor, CursorImpl endCursor) {

	for (int numRetries = 0;; numRetries += 1) {

	/*
	* If we have retried too many times, give up. This is probably
	* due to a bug of some kind, and we shouldn't loop forever.
	*/
	if (numRetries > MAX_RETRIES_AFTER_SPLIT) {
	throw EnvironmentFailureException.unexpectedState();
	}

	/*
	* Set up the initial info for the computation. Retry if a split
	* occurs.
	*/
	beginStack = beginCursor.getAncestorPath();
	if (beginStack == null) {
	continue;
	}
	endStack = endCursor.getAncestorPath();
	if (endStack == null) {
	continue;
	}

	if (!findCommonAncestor()) {
	continue;
	}

	/* Get the the average entries from the two end points. */
	getAvgEntries(beginCursor, endCursor);

	/*
	* Return the count. FUTURE: Taking more samples between the two
	* end points would increase accuracy.
	*/
	return countTotal();
	}
	}

	/**
	* Find the common ancestor node for the two end points, which we'll call
	* the root level. If no common ancestor can be found, return false to
	* restart the process, because a split must have occurred in between
	* getting the two stacks for the end points.
	*/
	private boolean findCommonAncestor() {

	levelCount = beginStack.size();
	if (levelCount != endStack.size()) {
	/* Must have been a root split. */
	return false;
	}

	rootLevel = -1;

	for (int level = levelCount - 1; level >= 0; level -= 1) {

	if (beginStack.get(level).nodeId == endStack.get(level).nodeId) {
	rootLevel = level;
	break;
	}
	}
	if (rootLevel < 0) {
	/* Must have been a split. */
	return false;
	}
	return true;
	}

	/**
	* This method starts with a preliminary average using just the two end
	* points.
	*/
	private void getAvgEntries(CursorImpl beginCursor, CursorImpl endCursor) {

	avgEntriesStacks.clear();

	if (!addAvgEntriesSample(beginStack)) {
	sampleNextBIN(beginCursor, true /moveForward/);
	}

	if (!addAvgEntriesSample(endStack)) {
	sampleNextBIN(endCursor, false /moveForward/);
	}

	computeAvgEntries();
	}

	/**
	* FUTURE: use internal skip method instead, saving a btree lookup.
	*/
	private void sampleNextBIN(
	CursorImpl beginOrEndCursor,
	boolean moveForward) {

	final CursorImpl cursor =
	beginOrEndCursor.cloneCursor(true /samePosition/);

	try {
	cursor.latchBIN();
	if (moveForward) {
	cursor.setOnLastSlot();
	} else {
	cursor.setOnFirstSlot();
	}

	final OperationResult result = cursor.getNext(
	null /foundKey/, null /foundData/,
	LockType.NONE, false /dirtyReadAll/,
	moveForward, true /alreadyLatched/,
	null /rangeConstraint/);

	if (result != null) {
	final List<TrackingInfo> stack = cursor.getAncestorPath();
	if (stack != null) {
	addAvgEntriesSample(stack);
	}
	}
	} finally {
	cursor.close();
	}
	}

	/**
	* At each level we compute the average number of entries. This will be
	* used as a multipler to estimate the number of nodes for any IN at that
	* level.
	*/
	private void computeAvgEntries() {

	avgEntries = new float[levelCount];

	avgEntries[levelCount - 1] = 1.0F;

	if (avgEntriesStacks.size() == 0) {
	return;
	}

	for (int level = levelCount - 1; level > 0; level -= 1) {
	long totalEntries = 0;
	for (List<TrackingInfo> stack : avgEntriesStacks) {
	totalEntries += stack.get(level).entries;
	}
	final float avg = totalEntries / ((float) avgEntriesStacks.size());
	avgEntries[level - 1] = avg * avgEntries[level];
	}
	}

	private boolean addAvgEntriesSample(List<TrackingInfo> stack) {
	if (isFirstBIN(stack) \|\| isLastBIN(stack)) {
	return false;
	}
	avgEntriesStacks.add(stack);
	return true;
	}

	private boolean isFirstBIN(List<TrackingInfo> stack) {
	for (int i = 0; i < stack.size() - 1; i += 1) {
	final TrackingInfo info = stack.get(i);
	if (info.index != 0) {
	return false;
	}
	}
	return true;
	}

	private boolean isLastBIN(List<TrackingInfo> stack) {
	for (int i = 0; i < stack.size() - 1; i += 1) {
	final TrackingInfo info = stack.get(i);
	if (info.index != info.entries - 1) {
	return false;
	}
	}
	return true;
	}

	/**
	* Count the total for each node that is between the two end points.
	*/
	private long countTotal() {
	long total = 0;

	/* Add nodes between the end points at the root level. */
	final int rootIndex1 = beginStack.get(rootLevel).index + 1;
	final int rootIndex2 = endStack.get(rootLevel).index;
	if (rootIndex2 > rootIndex1) {
	total += Math.round((rootIndex2 - rootIndex1) *
	avgEntries[rootLevel]);
	}

	/* Add nodes under the end points at lower levels. */
	for (int level = rootLevel + 1; level < levelCount; level += 1) {

	/* Add nodes under left end point that are to its right. */
	final int leftIndex = beginStack.get(level).index;
	final int lastIndex = beginStack.get(level).entries - 1;
	if (lastIndex > leftIndex) {
	total += Math.round((lastIndex - leftIndex) *
	avgEntries[level]);
	}

	/* Add nodes under right end point that are to its left. */
	final int rightIndex = endStack.get(level).index;
	final int firstIndex = 0;
	if (rightIndex > firstIndex) {
	total += Math.round((rightIndex - firstIndex) *
	avgEntries[level]);
	}
	}

	return total;
	}

	/* For future use, if getKeyRatios is exposed in the API. */
	static class KeyRatios {
	final double less;
	final double equal;
	final double greater;

	KeyRatios(double less, double equal, double greater) {
	this.less = less;
	this.equal = equal;
	this.greater = greater;
	}

	@Override
	public String toString() {
	return "less: " + less +
	" equal: " + equal +
	" greater: " + greater;
	}
	}

	/*
	* For future use, if getKeyRatios is exposed in the API. Be sure to test
	* boundary conditions when index is 0 or nEntries.
	*
	* Algorithm copied from __bam_key_range in BDB btree/bt_stat.c.
	*/
	static KeyRatios getKeyRatios(List<TrackingInfo> infoByLevel,
	boolean exact) {
	double factor = 1.0;
	double less = 0.0;
	double greater = 0.0;

	/*
	* At each level we know that INs greater than index contain keys
	* greater than what we are looking for and those less than index are
	* less than. The one pointed to by index may have some less, some
	* greater or even equal. If index is equal to the number of entries,
	* then the key is out of range and everything is less.
	*/
	for (final TrackingInfo info : infoByLevel) {
	if (info.index == 0) {
	greater += (factor * (info.entries - 1)) / info.entries;
	} else if (info.index == info.entries) {
	less += factor;
	} else {
	less += (factor * info.index) / info.entries;
	greater += (factor * ((info.entries - info.index) - 1)) /
	info.entries;
	}

	/* Factor at next level down is 1/n'th the amount at this level. */
	factor /= info.entries;

	/*
	System.out.println("factor: " + factor +
	" less: " + less +
	" greater: " + greater);
	*/
	}

	/*
	* If there was an exact match then assign the 1/n'th factor to the key
	* itself. Otherwise that factor belongs to those greater than the
	* key, unless the key was out of range.
	*/
	final double equal;
	if (exact) {
	equal = factor;
	} else {
	if (less != 1.0) {
	greater += factor;
	}
	equal = 0.0;
	}

	return new KeyRatios(less, equal, greater);
	}
	}