/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.tools.rumen;

import java.io.PrintStream;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;

/**
 * {@link Histogram} represents an ordered summary of a sequence of
 * {@code long}s which can be queried to produce a discrete approximation of
 * its cumulative distribution function.
 *
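 * <p>A minimal usage sketch; the values are illustrative only:
 *
 * <pre>{@code
 * Histogram hist = new Histogram("latencies");
 * hist.enter(10L);
 * hist.enter(20L);
 * hist.enter(20L);
 * hist.enter(30L);
 * // two extra output slots: the minimum first, the maximum last
 * long[] cdf = hist.getCDF(1000, new int[] { 500 });
 * // cdf == { 10L, 20L, 30L } : min, median, max
 * }</pre>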
 */
class Histogram implements Iterable<Map.Entry<Long, Long>> {
  private TreeMap<Long, Long> content = new TreeMap<Long, Long>();

  private String name;

  private long totalCount;

  public Histogram() {
    this("(anonymous)");
  }

  public Histogram(String name) {
    super();
    this.name = name;
    totalCount = 0L;
  }

  /**
   * Writes each (value, count) pair in this histogram to {@code stream},
   * one pair per line.
   */
  public void dump(PrintStream stream) {
    stream.print("dumping Histogram " + name + ":\n");
    Iterator<Map.Entry<Long, Long>> iter = iterator();
    while (iter.hasNext()) {
      Map.Entry<Long, Long> ent = iter.next();
      stream.print("val/count pair: " + (long) ent.getKey() + ", "
          + (long) ent.getValue() + "\n");
    }
    stream.print("*** end *** \n");
  }

  @Override
  public Iterator<Map.Entry<Long, Long>> iterator() {
    return content.entrySet().iterator();
  }

  /** Returns the number of times {@code key} has been entered, or 0 if never. */
  public long get(long key) {
    Long result = content.get(key);
    return result == null ? 0 : result;
  }

  /** Returns the total number of values entered so far. */
  public long getTotalCount() {
    return totalCount;
  }

  /** Records one occurrence of {@code value} in this histogram. */
  public void enter(long value) {
    Long existingValue = content.get(value);
    if (existingValue == null) {
      content.put(value, 1L);
    } else {
      content.put(value, existingValue + 1L);
    }
    ++totalCount;
  }

  /**
   * Produces a discrete approximation of the CDF. The caller provides the
   * desired points on the {@code Y} axis, and we give the corresponding
   * points on the {@code X} axis, plus the minimum and maximum from the
   * data.
   *
   * @param scale
   *          the denominator applied to every element of {@code buckets}. For
   *          example, if {@code scale} is {@code 1000}, a {@code buckets}
   *          element of 500 will specify the median in that output slot.
   * @param buckets
   *          an array of int, each less than {@code scale} and each strictly
   *          greater than its predecessor, if any. We don't check these
   *          requirements.
   * @return a {@code long[]} with two more elements than {@code buckets} has,
   *         or {@code null} if the histogram is empty. The first and last
   *         elements are the minimum and maximum values that were ever
   *         {@code enter}ed. The remaining elements correspond to the
   *         elements of {@code buckets} and carry the first value whose
   *         cumulative count is no less than
   *         {@code buckets[i] * totalCount / scale}.
   *
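   * <p>For example, with {@code scale = 1000} and
   * {@code buckets = {250, 500, 750}}, the returned array holds the minimum,
   * the approximate 25th, 50th, and 75th percentiles, and the maximum, in
   * that order.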
   */
  public long[] getCDF(int scale, int[] buckets) {
    if (totalCount == 0) {
      return null;
    }

    long[] result = new long[buckets.length + 2];

    // fill in the min and the max
    result[0] = content.firstEntry().getKey();
    result[buckets.length + 1] = content.lastEntry().getKey();

    Iterator<Map.Entry<Long, Long>> iter = content.entrySet().iterator();
    long cumulativeCount = 0;
    int bucketCursor = 0;

    // Loop invariant: the item at buckets[bucketCursor] can still be reached
    // from iter, and the number of logged elements no longer available from
    // iter is cumulativeCount.
    //
    // cumulativeCount/totalCount is therefore strictly less than
    // buckets[bucketCursor]/scale.
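    //
    // A concrete trace with illustrative numbers: given entries
    // {10 -> 1, 20 -> 2, 30 -> 1} (totalCount = 4), scale = 1000, and
    // buckets = {500}, the target cumulative count is 500 * 4 / 1000 = 2;
    // consuming 10 leaves cumulativeCount = 1 (< 2), and consuming 20 leaves
    // cumulativeCount = 3 (>= 2), so result[1] = 20.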
    while (iter.hasNext()) {
      long targetCumulativeCount = buckets[bucketCursor] * totalCount / scale;
      Map.Entry<Long, Long> elt = iter.next();
      cumulativeCount += elt.getValue();

      while (cumulativeCount >= targetCumulativeCount) {
        result[bucketCursor + 1] = elt.getKey();
        ++bucketCursor;

        if (bucketCursor < buckets.length) {
          targetCumulativeCount = buckets[bucketCursor] * totalCount / scale;
        } else {
          break;
        }
      }

      if (bucketCursor == buckets.length) {
        break;
      }
    }

    return result;
  }
}