// blob: 7ac8c601e2b8308c6532d9600ae4f90b8d24d29b [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.datasketches.tuple.aninteger;
import static java.lang.Math.exp;
import static java.lang.Math.log;
import static java.lang.Math.round;
import static org.apache.datasketches.tuple.aninteger.IntegerSummary.Mode.AlwaysOne;
import static org.apache.datasketches.tuple.aninteger.IntegerSummary.Mode.Sum;
import org.apache.datasketches.tuple.CompactSketch;
import org.apache.datasketches.tuple.SketchIterator;
import org.apache.datasketches.tuple.Union;
import org.testng.annotations.Test;
/**
 * Example that builds an engagement histogram from tuple sketches: one {@code IntegerSketch}
 * per day is filled with synthetic visitor IDs, the daily sketches are merged with a
 * {@code Union}, and the retained entries of the result are tallied by summary value.
 *
 * <p>All output is routed through the private {@code printf} helper, whose actual print
 * statement is commented out, so the test is silent unless that line is re-enabled.</p>
 *
 * @author Lee Rhodes
 */
@SuppressWarnings("javadoc")
public class EngagementTest {
  public static final int numStdDev = 2;

  /**
   * Creates 30 daily sketches, feeds 31 groups of synthetic visitor IDs into them and
   * passes the sketches to {@code unionOps} for merging and reporting.
   */
  @Test
  public void computeEngagementHistogram() {
    final int lgK = 8; //Using a larger sketch >= 9 will produce exact results for this little example
    final int K = 1 << lgK;
    final int days = 30;

    final IntegerSketch[] dailySketches = new IntegerSketch[days];
    for (int day = 0; day < days; day++) {
      dailySketches[day] = new IntegerSketch(lgK, AlwaysOne);
    }

    int nextId = 0;
    for (int index = 0; index <= days; index++) { //31 generating indices for symmetry
      final int groupSize = numIDs(days, index);
      final int daysVisited = numDays(days, index);
      final int firstId = nextId;
      //Every ID of this group is fed to the sketches of days [0, daysVisited).
      for (int day = 0; day < daysVisited; day++) {
        final IntegerSketch sketch = dailySketches[day];
        for (int offset = 0; offset < groupSize; offset++) {
          sketch.update(firstId + offset, 1);
        }
      }
      //Move past this group's IDs; one additional ID is skipped between groups.
      nextId += groupSize + 1;
    }
    unionOps(K, Sum, dailySketches);
  }

  /**
   * Returns the size of the ID group generated for the given index.
   *
   * @param totalDays the total number of days
   * @param index the generating index
   * @return {@code totalDays^(index / totalDays)} rounded to the nearest integer
   */
  private static int numIDs(final int totalDays, final int index) {
    return roundedPower(totalDays, index);
  }

  /**
   * Returns the number of days on which the group generated for the given index is recorded.
   *
   * @param totalDays the total number of days
   * @param index the generating index
   * @return {@code totalDays^((totalDays - index) / totalDays)} rounded to the nearest integer
   */
  private static int numDays(final int totalDays, final int index) {
    final double span = totalDays;
    return roundedPower(span, span - index);
  }

  /**
   * Computes {@code exp(steps * log(span) / span)} and rounds it to an int.
   *
   * @param span the base, which is also the divisor of the exponent
   * @param steps the numerator of the exponent
   * @return the rounded result
   */
  private static int roundedPower(final double span, final double steps) {
    return (int) round(exp(steps * log(span) / span));
  }

  /**
   * Unions the given sketches, counts the retained entries of the result by summary value
   * and reports the histogram and the totals through {@code printf}.
   *
   * @param K the nominal entries value handed to the {@code Union}
   * @param mode the summary mode used for both arguments of the set operations
   * @param sketches the sketches to merge; summary values of the result are used as indices
   *     into an array of length {@code sketches.length + 1}
   */
  private static void unionOps(final int K, final IntegerSummary.Mode mode, final IntegerSketch ... sketches) {
    final Union<IntegerSummary> union = new Union<>(K, new IntegerSummarySetOperations(mode, mode));
    final int numSketches = sketches.length;
    for (int s = 0; s < numSketches; s++) {
      union.union(sketches[s]);
    }
    final CompactSketch<IntegerSummary> result = union.getResult();

    final int[] visitorsByDays = new int[numSketches + 1]; //zero index is ignored
    final SketchIterator<IntegerSummary> entries = result.iterator();
    while (entries.next()) {
      //The summary value of a retained unique visitor is its number of days visited.
      final int daysVisited = entries.getSummary().getValue();
      visitorsByDays[daysVisited]++; //values range from 1 to 30
    }

    final double theta = result.getTheta();
    final int sumVisits = printHistogram(result, visitorsByDays, theta);
    printTotals(result, sumVisits, theta);
  }

  /**
   * Reports one row per non-empty histogram bucket and accumulates the retained visit count.
   *
   * @param result the compact result of the union
   * @param visitorsByDays retained-entry counts indexed by number of days visited
   * @param theta the theta of {@code result}
   * @return the sum over all buckets of (retained count * days visited)
   */
  private static int printHistogram(final CompactSketch<IntegerSummary> result,
      final int[] visitorsByDays, final double theta) {
    println("\nEngagement Histogram:");
    println("Number of Unique Visitors by Number of Days Visited");
    printf("%12s%12s%12s%12s\n","Days Visited", "Estimate", "LB", "UB");
    int sumVisits = 0;
    for (int days = 0; days < visitorsByDays.length; days++) {
      final int retained = visitorsByDays[days];
      if (retained != 0) {
        sumVisits += retained * days;
        //Scale the retained count by theta; the bounds are requested for that same count.
        final double estimate = retained / theta;
        final double lowerBound = result.getLowerBound(numStdDev, retained);
        final double upperBound = result.getUpperBound(numStdDev, retained);
        printf("%12d%12.0f%12.0f%12.0f\n",
            days, estimate, lowerBound, upperBound);
      }
    }
    return sumVisits;
  }

  /**
   * Reports the totals for visitors and visits.
   *
   * @param result the compact result of the union
   * @param sumVisits the retained visit count returned by {@code printHistogram}
   * @param theta the theta of {@code result}
   */
  private static void printTotals(final CompactSketch<IntegerSummary> result,
      final int sumVisits, final double theta) {
    //The estimate and bounds of the total number of visitors come directly from the sketch.
    final double visitors = result.getEstimate();
    final double lbVisitors = result.getLowerBound(numStdDev);
    final double ubVisitors = result.getUpperBound(numStdDev);
    printf("\n%12s%12s%12s%12s\n","Totals", "Estimate", "LB", "UB");
    printf("%12s%12.0f%12.0f%12.0f\n", "Visitors", visitors, lbVisitors, ubVisitors);

    //The total number of visits, however, is a scaled metric and takes advantage of the fact
    //that the retained entries in the sketch are a uniform random sample of all unique
    //visitors, and the rest of the unique users will likely behave in the same way.
    final double estVisits = sumVisits / theta;
    final double lbVisits = estVisits * lbVisitors / visitors;
    final double ubVisits = estVisits * ubVisitors / visitors;
    printf("%12s%12.0f%12.0f%12.0f\n\n", "Visits", estVisits, lbVisits, ubVisits);
  }

  /**
   * Reports the string form of the given object followed by a newline.
   *
   * @param o object to print
   */
  private static void println(final Object o) {
    final String text = o.toString();
    printf("%s\n", text);
  }

  /**
   * Single choke point for all output of this class; currently does nothing because the
   * print statement below is commented out.
   *
   * @param fmt format
   * @param args arguments
   */
  private static void printf(final String fmt, final Object ... args) {
    //System.out.printf(fmt, args); //Enable/Disable printing here
  }
}