src/main/java/org/apache/datasketches/fdt/FdtSketch.java - datasketches-java - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 package org.apache.datasketches.fdt;

 import static org.apache.datasketches.Util.MAX_LG_NOM_LONGS;

 import java.util.List;

 import org.apache.datasketches.SketchesArgumentException;
 import org.apache.datasketches.memory.Memory;
 import org.apache.datasketches.tuple.strings.ArrayOfStringsSketch;

 /**
  * A Frequent Distinct Tuples sketch.
  *
  * <p>Suppose our data is a stream of pairs {IP address, User ID} and we want to identify the
  * IP addresses that have the most distinct User IDs.  Or conversely, we would like to identify
  * the User IDs that have the most distinct IP addresses. This is a common challenge in the
  * analysis of big data and the FDT sketch helps solve this problem using probabilistic techniques.
  *
  * <p>More generally, given a multiset of tuples with dimensions <i>{d1,d2, d3, ..., dN}</i>,
  * and a primary subset of dimensions <i>M &lt; N</i>, our task is to identify the combinations of
  * <i>M</i> subset dimensions that have the most frequent number of distinct combinations of
  * the <i>N-M</i> non-primary dimensions.
  *
  * <p>Please refer to the web page
  * <a href="https://datasketches.apache.org/docs/Frequency/FrequentDistinctTuplesSketch.html">
  * https://datasketches.apache.org/docs/Frequency/FrequentDistinctTuplesSketch.html</a> for a more
  * complete discussion about this sketch.
  *
  * @author Lee Rhodes
  */
 public class FdtSketch extends ArrayOfStringsSketch {

   /**
    * Create new instance of Frequent Distinct Tuples sketch with the given
    * Log-base2 of required nominal entries.
    * @param lgK Log-base2 of required nominal entries.
    */
   public FdtSketch(final int lgK) {
     super(lgK);
   }

   /**
    * Used by deserialization.
    * @param mem the image of a FdtSketch
    */
   FdtSketch(final Memory mem) {
     super(mem);
   }

   /**
    * Create a new instance of Frequent Distinct Tuples sketch with a size determined by the given
    * threshold and rse.
    * @param threshold : the fraction, between zero and 1.0, of the total distinct stream length
    * that defines a "Frequent" (or heavy) item.
    * @param rse the maximum Relative Standard Error for the estimate of the distinct population of a
    * reported tuple (selected with a primary key) at the threshold.
    */
   public FdtSketch(final double threshold, final double rse) {
     super(computeLgK(threshold, rse));
   }

   /**
    * Update the sketch with the given string array tuple.
    * @param tuple the given string array tuple.
    */
   public void update(final String[] tuple) {
     super.update(tuple, tuple);
   }

   /**
    * Returns an ordered List of Groups of the most frequent distinct population of subset tuples
    * represented by the count of entries of each group.
    * @param priKeyIndices these indices define the dimensions used for the Primary Keys.
    * @param limit the maximum number of groups to return. If this value is &le; 0, all
    * groups will be returned.
    * @param numStdDev the number of standard deviations for the upper and lower error bounds,
    * this value is an integer and must be one of 1, 2, or 3.
    * <a href="{@docRoot}/resources/dictionary.html#numStdDev">See Number of Standard Deviations</a>
    * @param sep the separator character
    * @return an ordered List of Groups of the most frequent distinct population of subset tuples
    * represented by the count of entries of each group.
    */
   public List<Group> getResult(final int[] priKeyIndices, final int limit, final int numStdDev,
       final char sep) {
     final PostProcessor proc = new PostProcessor(this, new Group(), sep);
     return proc.getGroupList(priKeyIndices, numStdDev, limit);
   }

   /**
    * Returns the PostProcessor that enables multiple queries against the sketch results.
    * This assumes the default Group and the default separator character '|'.
    * @return the PostProcessor
    */
   public PostProcessor getPostProcessor() {
     return getPostProcessor(new Group(), '|');
   }

   /**
    * Returns the PostProcessor that enables multiple queries against the sketch results.
    * @param group the Group class to use during post processing.
    * @param sep the separator character.
    * @return the PostProcessor
    */
   public PostProcessor getPostProcessor(final Group group, final char sep) {
     return new PostProcessor(this, group, sep);
   }

   // Restricted

   /**
    * Computes LgK given the threshold and RSE.
    * @param threshold the fraction, between zero and 1.0, of the total stream length that defines
    * a "Frequent" (or heavy) tuple.
    * @param rse the maximum Relative Standard Error for the estimate of the distinct population of a
    * reported tuple (selected with a primary key) at the threshold.
    * @return LgK
    */
   static int computeLgK(final double threshold, final double rse) {
     final double v = Math.ceil(1.0 / (threshold * rse * rse));
     final int lgK = (int) Math.ceil(Math.log(v) / Math.log(2));
     if (lgK > MAX_LG_NOM_LONGS) {
       throw new SketchesArgumentException("Requested Sketch (LgK = " + lgK + " &gt; 2^26), "
           + "either increase the threshold, the rse or both.");
     }
     return lgK;
   }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	package org.apache.datasketches.fdt;

	import static org.apache.datasketches.Util.MAX_LG_NOM_LONGS;

	import java.util.List;

	import org.apache.datasketches.SketchesArgumentException;
	import org.apache.datasketches.memory.Memory;
	import org.apache.datasketches.tuple.strings.ArrayOfStringsSketch;

	/**
	* A Frequent Distinct Tuples sketch.
	*
	* <p>Suppose our data is a stream of pairs {IP address, User ID} and we want to identify the
	* IP addresses that have the most distinct User IDs. Or conversely, we would like to identify
	* the User IDs that have the most distinct IP addresses. This is a common challenge in the
	* analysis of big data and the FDT sketch helps solve this problem using probabilistic techniques.
	*
	* <p>More generally, given a multiset of tuples with dimensions <i>{d1,d2, d3, ..., dN}</i>,
	* and a primary subset of dimensions <i>M < N</i>, our task is to identify the combinations of
	* <i>M</i> subset dimensions that have the most frequent number of distinct combinations of
	* the <i>N-M</i> non-primary dimensions.
	*
	* <p>Please refer to the web page
	* <a href="https://datasketches.apache.org/docs/Frequency/FrequentDistinctTuplesSketch.html">
	* https://datasketches.apache.org/docs/Frequency/FrequentDistinctTuplesSketch.html</a> for a more
	* complete discussion about this sketch.
	*
	* @author Lee Rhodes
	*/
	public class FdtSketch extends ArrayOfStringsSketch {

	/**
	* Create new instance of Frequent Distinct Tuples sketch with the given
	* Log-base2 of required nominal entries.
	* @param lgK Log-base2 of required nominal entries.
	*/
	public FdtSketch(final int lgK) {
	super(lgK);
	}

	/**
	* Used by deserialization.
	* @param mem the image of a FdtSketch
	*/
	FdtSketch(final Memory mem) {
	super(mem);
	}

	/**
	* Create a new instance of Frequent Distinct Tuples sketch with a size determined by the given
	* threshold and rse.
	* @param threshold : the fraction, between zero and 1.0, of the total distinct stream length
	* that defines a "Frequent" (or heavy) item.
	* @param rse the maximum Relative Standard Error for the estimate of the distinct population of a
	* reported tuple (selected with a primary key) at the threshold.
	*/
	public FdtSketch(final double threshold, final double rse) {
	super(computeLgK(threshold, rse));
	}

	/**
	* Update the sketch with the given string array tuple.
	* @param tuple the given string array tuple.
	*/
	public void update(final String[] tuple) {
	super.update(tuple, tuple);
	}

	/**
	* Returns an ordered List of Groups of the most frequent distinct population of subset tuples
	* represented by the count of entries of each group.
	* @param priKeyIndices these indices define the dimensions used for the Primary Keys.
	* @param limit the maximum number of groups to return. If this value is ≤ 0, all
	* groups will be returned.
	* @param numStdDev the number of standard deviations for the upper and lower error bounds,
	* this value is an integer and must be one of 1, 2, or 3.
	* <a href="{@docRoot}/resources/dictionary.html#numStdDev">See Number of Standard Deviations</a>
	* @param sep the separator character
	* @return an ordered List of Groups of the most frequent distinct population of subset tuples
	* represented by the count of entries of each group.
	*/
	public List<Group> getResult(final int[] priKeyIndices, final int limit, final int numStdDev,
	final char sep) {
	final PostProcessor proc = new PostProcessor(this, new Group(), sep);
	return proc.getGroupList(priKeyIndices, numStdDev, limit);
	}

	/**
	* Returns the PostProcessor that enables multiple queries against the sketch results.
	* This assumes the default Group and the default separator character '\|'.
	* @return the PostProcessor
	*/
	public PostProcessor getPostProcessor() {
	return getPostProcessor(new Group(), '\|');
	}

	/**
	* Returns the PostProcessor that enables multiple queries against the sketch results.
	* @param group the Group class to use during post processing.
	* @param sep the separator character.
	* @return the PostProcessor
	*/
	public PostProcessor getPostProcessor(final Group group, final char sep) {
	return new PostProcessor(this, group, sep);
	}

	// Restricted

	/**
	* Computes LgK given the threshold and RSE.
	* @param threshold the fraction, between zero and 1.0, of the total stream length that defines
	* a "Frequent" (or heavy) tuple.
	* @param rse the maximum Relative Standard Error for the estimate of the distinct population of a
	* reported tuple (selected with a primary key) at the threshold.
	* @return LgK
	*/
	static int computeLgK(final double threshold, final double rse) {
	final double v = Math.ceil(1.0 / (threshold * rse * rse));
	final int lgK = (int) Math.ceil(Math.log(v) / Math.log(2));
	if (lgK > MAX_LG_NOM_LONGS) {
	throw new SketchesArgumentException("Requested Sketch (LgK = " + lgK + " > 2^26), "
	+ "either increase the threshold, the rse or both.");
	}
	return lgK;
	}

	}