/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
| |
| package org.apache.hadoop.mapreduce.lib.aggregate; |
| |
| import java.util.ArrayList; |
| import java.util.Iterator; |
| import java.util.TreeMap; |
| import java.util.Map.Entry; |
| import java.util.Arrays; |
| |
| import org.apache.hadoop.classification.InterfaceAudience; |
| import org.apache.hadoop.classification.InterfaceStability; |
| |
| |
| /** |
| * This class implements a value aggregator that computes the |
| * histogram of a sequence of strings. |
| * |
| */ |
| @InterfaceAudience.Public |
| @InterfaceStability.Stable |
| public class ValueHistogram implements ValueAggregator<String> { |
| |
| TreeMap<Object, Object> items = null; |
| |
| public ValueHistogram() { |
| items = new TreeMap<Object, Object>(); |
| } |
| |
| /** |
| * add the given val to the aggregator. |
| * |
| * @param val the value to be added. It is expected to be a string |
| * in the form of xxxx\tnum, meaning xxxx has num occurrences. |
| */ |
| public void addNextValue(Object val) { |
| String valCountStr = val.toString(); |
| int pos = valCountStr.lastIndexOf("\t"); |
| String valStr = valCountStr; |
| String countStr = "1"; |
| if (pos >= 0) { |
| valStr = valCountStr.substring(0, pos); |
| countStr = valCountStr.substring(pos + 1); |
| } |
| |
| Long count = (Long) this.items.get(valStr); |
| long inc = Long.parseLong(countStr); |
| |
| if (count == null) { |
| count = inc; |
| } else { |
| count = count.longValue() + inc; |
| } |
| items.put(valStr, count); |
| } |
| |
| /** |
| * @return the string representation of this aggregator. |
| * It includes the following basic statistics of the histogram: |
| * the number of unique values |
| * the minimum value |
| * the media value |
| * the maximum value |
| * the average value |
| * the standard deviation |
| */ |
| public String getReport() { |
| long[] counts = new long[items.size()]; |
| |
| StringBuffer sb = new StringBuffer(); |
| Iterator<Object> iter = items.values().iterator(); |
| int i = 0; |
| while (iter.hasNext()) { |
| Long count = (Long) iter.next(); |
| counts[i] = count.longValue(); |
| i += 1; |
| } |
| Arrays.sort(counts); |
| sb.append(counts.length); |
| i = 0; |
| long acc = 0; |
| while (i < counts.length) { |
| long nextVal = counts[i]; |
| int j = i + 1; |
| while (j < counts.length && counts[j] == nextVal) { |
| j++; |
| } |
| acc += nextVal * (j - i); |
| i = j; |
| } |
| double average = 0.0; |
| double sd = 0.0; |
| if (counts.length > 0) { |
| sb.append("\t").append(counts[0]); |
| sb.append("\t").append(counts[counts.length / 2]); |
| sb.append("\t").append(counts[counts.length - 1]); |
| |
| average = acc * 1.0 / counts.length; |
| sb.append("\t").append(average); |
| |
| i = 0; |
| while (i < counts.length) { |
| double nextDiff = counts[i] - average; |
| sd += nextDiff * nextDiff; |
| i += 1; |
| } |
| sd = Math.sqrt(sd / counts.length); |
| sb.append("\t").append(sd); |
| |
| } |
| return sb.toString(); |
| } |
| |
| /** |
| * |
| * @return a string representation of the list of value/frequence pairs of |
| * the histogram |
| */ |
| public String getReportDetails() { |
| StringBuffer sb = new StringBuffer(); |
| Iterator<Entry<Object,Object>> iter = items.entrySet().iterator(); |
| while (iter.hasNext()) { |
| Entry<Object,Object> en = iter.next(); |
| Object val = en.getKey(); |
| Long count = (Long) en.getValue(); |
| sb.append("\t").append(val.toString()).append("\t"). |
| append(count.longValue()).append("\n"); |
| } |
| return sb.toString(); |
| } |
| |
| /** |
| * @return a list value/frequence pairs. |
| * The return value is expected to be used by the reducer. |
| */ |
| public ArrayList<String> getCombinerOutput() { |
| ArrayList<String> retv = new ArrayList<String>(); |
| Iterator<Entry<Object,Object>> iter = items.entrySet().iterator(); |
| |
| while (iter.hasNext()) { |
| Entry<Object,Object> en = iter.next(); |
| Object val = en.getKey(); |
| Long count = (Long) en.getValue(); |
| retv.add(val.toString() + "\t" + count.longValue()); |
| } |
| return retv; |
| } |
| |
| /** |
| * |
| * @return a TreeMap representation of the histogram |
| */ |
| public TreeMap<Object,Object> getReportItems() { |
| return items; |
| } |
| |
| /** |
| * reset the aggregator |
| */ |
| public void reset() { |
| items = new TreeMap<Object, Object>(); |
| } |
| |
| } |