| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.pig; |
| |
| import java.io.Serializable; |
| import java.util.Arrays; |
| |
| import org.apache.pig.classification.InterfaceAudience; |
| import org.apache.pig.classification.InterfaceStability; |
| |
| /** |
| * A class that represents statistics about data to be loaded or stored. It is marked unstable |
| * because Pig does very little statistics collection at this point. If and when that |
| * functionality is added it is expected that this interface will change. |
| * @since Pig 0.7 |
| */ |
| @InterfaceAudience.Public |
| @InterfaceStability.Unstable |
public class ResourceStatistics implements Cloneable {

    /* Getters intentionally return mutable arrays instead of copies,
     * to simplify updates without unnecessary copying.
     * Setters make a copy of the arrays in order to prevent an array
     * from being shared by two objects, with modifications in one
     * accidentally changing the other.
     */

    // arrays are initialized to empty so we don't have to worry about NPEs
    // setters disallow setting them to null.

    // NOTE(review): this class does not declare Serializable itself, so this
    // constant only matters if a subclass (or a future revision) does.
    private static final long serialVersionUID = 1L;
    private Long numRecords; // number of records; null when unknown
    private Long avgRecordSize; // average record size in bytes; null when not set explicitly
    private ResourceFieldStatistics[] fields = new ResourceFieldStatistics[0];
    private Long bytes; // total size in bytes; null when unknown

    /** Null-tolerant equality: two nulls are equal, a null never equals a non-null. */
    private static boolean nullSafeEquals(Object left, Object right) {
        return (left == null) ? (right == null) : left.equals(right);
    }

    /** Null-tolerant hash: null hashes to 0, anything else to its own hashCode(). */
    private static int nullSafeHashCode(Object value) {
        return (value == null) ? 0 : value.hashCode();
    }

    /**
     * Statistics for a given field in the data.
     */
    public static class ResourceFieldStatistics implements Serializable {

        private static final long serialVersionUID = 1L;

        private int version;

        private Long numDistinctValues; // number of distinct values represented
                                        // in this field; null until set

        /**
         * We need some way to represent a histogram of values in the field,
         * as those will be useful. However, we can't count on being
         * able to hold such histograms in memory. Have to figure out
         * how they can be kept on disk and represented here.
         *
         * for now.. don't create so many buckets you can't hold them in memory
         *
         * an ordered array of the most common values,
         * in descending order of frequency
         */
        private Object[] mostCommonValues = new Object[0];

        /**
         * an array that matches the mostCommonValues array, and lists
         * the frequencies of those values as a fraction (0 through 1) of
         * the total number of records
         */
        private float[] mostCommonValuesFreq = new float[0];

        /**
         * an ordered array of values, from min val to max val
         * such that the number of records with values
         * between valueHistogram[i] and valueHistogram[i+1] is
         * roughly equal for all values of i.
         * NOTE: if mostCommonValues is non-empty, the values in that array
         * should not be included in the histogram. Adjust accordingly.
         */
        private Object[] valueHistogram = new Object[0];


        /** @return the version number stored on this object */
        public int getVersion() {
            return version;
        }

        /**
         * @param version the version number to store
         * @return this object, to allow call chaining
         */
        public ResourceFieldStatistics setVersion(int version) {
            this.version = version;
            return this;
        }

        /** @return the number of distinct values, or null if it was never set */
        public Long getNumDistinctValues() {
            return numDistinctValues;
        }

        /**
         * @param numDistinctValues the number of distinct values; may be null
         * @return this object, to allow call chaining
         */
        public ResourceFieldStatistics setNumDistinctValues(Long numDistinctValues) {
            this.numDistinctValues = numDistinctValues;
            return this;
        }

        /** @return the internal (mutable, never null) array of most common values */
        public Object[] getMostCommonValues() {
            return mostCommonValues;
        }

        /**
         * Stores a copy of the given array. A null argument is ignored and the
         * current value is kept.
         *
         * @param mostCommonValues the most common values
         * @return this object, to allow call chaining
         */
        public ResourceFieldStatistics setMostCommonValues(Object[] mostCommonValues) {
            if (mostCommonValues != null) {
                this.mostCommonValues =
                    Arrays.copyOf(mostCommonValues, mostCommonValues.length);
            }
            return this;
        }

        /** @return the internal (mutable, never null) array of frequencies */
        public float[] getMostCommonValuesFreq() {
            return mostCommonValuesFreq;
        }

        /**
         * Stores a copy of the given array. A null argument is ignored and the
         * current value is kept.
         *
         * @param mostCommonValuesFreq frequencies matching the most common values
         * @return this object, to allow call chaining
         */
        public ResourceFieldStatistics setMostCommonValuesFreq(float[] mostCommonValuesFreq) {
            if (mostCommonValuesFreq != null) {
                this.mostCommonValuesFreq =
                    Arrays.copyOf(mostCommonValuesFreq, mostCommonValuesFreq.length);
            }
            return this;
        }

        /** @return the internal (mutable, never null) histogram array */
        public Object[] getValueHistogram() {
            return valueHistogram;
        }

        /**
         * Stores a copy of the given array. A null argument is ignored and the
         * current value is kept.
         *
         * @param valueHistogram the histogram boundaries
         * @return this object, to allow call chaining
         */
        public ResourceFieldStatistics setValueHistogram(Object[] valueHistogram) {
            if (valueHistogram != null) {
                this.valueHistogram = Arrays.copyOf(valueHistogram, valueHistogram.length);
            }
            return this;
        }


        /*
         * equals() and hashCode() overridden mostly for ease of testing
         * you shouldn't encounter a situation in which you need to .equals()
         * two sets of statistics on different objects "in the wild"
         */
        @Override
        public boolean equals(Object anOther) {
            if (anOther == this) {
                return true;
            }
            if (anOther == null || !(anOther.getClass().equals(this.getClass()))) {
                return false;
            }
            ResourceFieldStatistics other = (ResourceFieldStatistics) anOther;
            // The array setters ignore null, so the arrays are never null here.
            // numDistinctValues, however, starts out null and its setter accepts
            // null, so it must be compared null-safely.
            return Arrays.equals(mostCommonValues, other.mostCommonValues)
                && Arrays.equals(mostCommonValuesFreq, other.mostCommonValuesFreq)
                && Arrays.equals(valueHistogram, other.valueHistogram)
                && nullSafeEquals(numDistinctValues, other.numDistinctValues)
                && this.version == other.version;
        }

        /**
         * A naive hashCode implementation following the example in IBM's developerworks:
         * http://www.ibm.com/developerworks/java/library/j-jtp05273.html
         * Combines exactly the fields compared by {@link #equals(Object)}.
         */
        @Override
        public int hashCode() {
            int hash = 1;
            hash = 31 * hash + Arrays.hashCode(mostCommonValues);
            hash = 31 * hash + Arrays.hashCode(mostCommonValuesFreq);
            hash = 31 * hash + nullSafeHashCode(numDistinctValues);
            hash = 31 * hash + Arrays.hashCode(valueHistogram);
            hash = 31 * hash + version;
            return hash;
        }

        /**
         * Renders the version, most common values, their frequencies, the
         * distinct-value count and the histogram. Null array elements are
         * printed as "null".
         */
        @Override
        public String toString() {
            StringBuilder sb = new StringBuilder("ResourceStatistics. Version: " + version + "\n");
            sb.append("MCV:\n");
            for (Object value : mostCommonValues) {
                sb.append('[').append(value).append(']');
            }
            sb.append("\n MCVfreq:\n");
            for (float freq : mostCommonValuesFreq) {
                sb.append('[').append(freq).append(']');
            }
            sb.append("\n");
            sb.append("numDistVals: ").append(numDistinctValues);
            sb.append("valHistogram: \n");
            for (Object boundary : valueHistogram) {
                sb.append('[').append(boundary).append(']');
            }
            sb.append("\n");
            return sb.toString();
        }
    }

    /**
     * Returns the size in megabytes, i.e. the stored byte count divided by
     * 1024 * 1024 using integer division. Prefer {@link #getSizeInBytes()}.
     *
     * @return size in megabytes, or null if the size is not set
     */
    public Long getmBytes() {
        if (bytes == null) {
            return null;
        }
        return bytes / 1024 / 1024;
    }

    /**
     * Sets the size from a value in megabytes; it is stored as bytes.
     *
     * @param mBytes size in megabytes; null clears the stored size
     * @return this object, to allow call chaining
     * @deprecated Use {@link ResourceStatistics#setSizeInBytes(Long)} instead
     */
    @Deprecated
    public ResourceStatistics setmBytes(Long mBytes) {
        this.bytes = (mBytes == null) ? null : Long.valueOf(mBytes * 1024 * 1024);
        return this;
    }

    /**
     * Sets the size in bytes
     *
     * @param bytes size in bytes; may be null
     */
    public void setSizeInBytes(Long bytes) {
        this.bytes = bytes;
    }

    /**
     * @return size in bytes.
     */
    public Long getSizeInBytes() {
        return this.bytes;
    }

    /** @return the number of records, or null if it was never set */
    public Long getNumRecords() {
        return numRecords;
    }

    /**
     * @param numRecords the number of records; may be null
     * @return this object, to allow call chaining
     */
    public ResourceStatistics setNumRecords(Long numRecords) {
        this.numRecords = numRecords;
        return this;
    }

    /**
     * Returns average record size in bytes. This number can be explicitly
     * specified by statistics, or if absent, computed using
     * totalbytes/totalrecords.
     *
     * @return the average record size, or null if it can't be computed
     *         (nothing set explicitly and either size or record count is
     *         missing, or the record count is zero)
     */
    public Long getAvgRecordSize() {
        if (avgRecordSize != null) {
            return avgRecordSize;
        }
        if (bytes == null || numRecords == null) {
            return null;
        }
        // Zero records means no average exists; return null rather than
        // letting the division throw ArithmeticException.
        if (numRecords.longValue() == 0L) {
            return null;
        }
        return bytes / numRecords;
    }

    /**
     * Set average record size in bytes
     *
     * @param sizeInBytes the average record size; may be null
     */
    public void setAvgRecordSize(Long sizeInBytes) {
        avgRecordSize = sizeInBytes;
    }

    /** @return the internal (mutable, never null) array of per-field statistics */
    public ResourceFieldStatistics[] getFields() {
        return fields;
    }

    /**
     * Stores a copy of the given array. A null argument is ignored and the
     * current value is kept.
     *
     * @param fields the per-field statistics
     * @return this object, to allow call chaining
     */
    public ResourceStatistics setFields(ResourceFieldStatistics[] fields) {
        if (fields != null) {
            this.fields = Arrays.copyOf(fields, fields.length);
        }
        return this;
    }


    /*
     * equals() and hashCode() overridden mostly for ease of testing
     * you shouldn't encounter a situation in which you need to .equals()
     * two sets of statistics on different objects "in the wild"
     *
     * Only fields, bytes and numRecords take part in the comparison.
     */
    @Override
    public boolean equals(Object anOther) {
        if (anOther == this) {
            return true;
        }
        if (anOther == null || !(anOther.getClass().equals(this.getClass()))) {
            return false;
        }
        ResourceStatistics other = (ResourceStatistics) anOther;
        return Arrays.equals(fields, other.fields)
            && nullSafeEquals(bytes, other.bytes)
            && nullSafeEquals(numRecords, other.numRecords);
    }

    /** Combines exactly the fields compared by {@link #equals(Object)}. */
    @Override
    public int hashCode() {
        int hash = 1;
        hash = 31 * hash + Arrays.hashCode(fields);
        hash = 31 * hash + nullSafeHashCode(bytes);
        hash = 31 * hash + nullSafeHashCode(numRecords);
        return hash;
    }
    // Probably more in here

    /** Renders the per-field statistics followed by the byte and record counts. */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder("Field Stats: \n");
        for (ResourceFieldStatistics field : fields) {
            sb.append(field); // append(Object) prints "null" for a null element
        }
        sb.append("bytes: ").append(bytes);
        sb.append("numRecords: ").append(numRecords);
        return sb.toString();
    }

    /**
     * Returns a copy of this object. The copy gets its own fields array so
     * that replacing an element in one object does not affect the other; the
     * ResourceFieldStatistics elements themselves are shared, as they are
     * after {@link #setFields(ResourceFieldStatistics[])}.
     */
    @Override
    public Object clone() throws CloneNotSupportedException {
        ResourceStatistics copy = (ResourceStatistics) super.clone();
        copy.fields = Arrays.copyOf(fields, fields.length);
        return copy;
    }
}