// blob: 9f83580cc6a1290f2d9f1f7d2192c0cf600c5cec [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pig;
import java.io.Serializable;
import java.util.Arrays;
import org.apache.pig.classification.InterfaceAudience;
import org.apache.pig.classification.InterfaceStability;
/**
* A class that represents statistics about data to be loaded or stored. It is marked unstable
* because Pig does very little statistics collection at this point. If and when that
* functionality is added it is expected that this interface will change.
* @since Pig 0.7
*/
@InterfaceAudience.Public
@InterfaceStability.Unstable
public class ResourceStatistics implements Cloneable {
/* Getters intentionally return mutable arrays instead of copies,
* to simplify updates without unnecessary copying.
* Setters make a copy of the arrays in order to prevent an array
* from being shared by two objects, with modifications in one
* accidentally changing the other.
*/
// Arrays are initialized to empty so we don't have to worry about NPEs;
// the array setters ignore null arguments and keep the current value.
// NOTE(review): the visible class header implements only Cloneable, not
// Serializable, so this serialVersionUID looks unused -- confirm before relying on it.
private static final long serialVersionUID = 1L;
private Long numRecords; // number of records; null until set
private Long avgRecordSize; // average record size in bytes; null until set explicitly
// Per-field statistics; starts as an empty array and setFields() ignores null,
// so this reference is never null.
private ResourceFieldStatistics[] fields = new ResourceFieldStatistics[0];
// Size of the data in bytes (see setSizeInBytes); null until set.
private Long bytes;
/**
 * Statistics for a given field in the data.
 *
 * <p>The array-valued properties start out empty and are never {@code null}:
 * their setters ignore a {@code null} argument and otherwise store a copy of
 * the array that is passed in. Getters return the internal array itself, not
 * a copy. {@code numDistinctValues} may be {@code null} (its initial value).
 */
public static class ResourceFieldStatistics implements Serializable {
    private static final long serialVersionUID = 1L;

    private int version;

    /** Number of distinct values represented in this field; {@code null} until set. */
    private Long numDistinctValues;

    /**
     * We need some way to represent a histogram of values in the field,
     * as those will be useful. However, we can't count on being
     * able to hold such histograms in memory. Have to figure out
     * how they can be kept on disk and represented here.
     *
     * For now: don't create so many buckets you can't hold them in memory.
     *
     * An ordered array of the most common values,
     * in descending order of frequency.
     */
    private Object[] mostCommonValues = new Object[0];

    /**
     * An array that matches the mostCommonValues array, and lists
     * the frequencies of those values as a fraction (0 through 1) of
     * the total number of records.
     */
    private float[] mostCommonValuesFreq = new float[0];

    /**
     * An ordered array of values, from min val to max val,
     * such that the number of records with values
     * between valueHistogram[i] and valueHistogram[i+1] is
     * roughly equal for all values of i.
     * NOTE: if mostCommonValues is non-empty, the values in that array
     * should not be included in the histogram. Adjust accordingly.
     */
    private Object[] valueHistogram = new Object[0];

    /** @return the version number stored by {@link #setVersion(int)} (0 by default) */
    public int getVersion() {
        return version;
    }

    /**
     * @param version version number to store
     * @return this object, to allow call chaining
     */
    public ResourceFieldStatistics setVersion(int version) {
        this.version = version;
        return this;
    }

    /** @return the number of distinct values, or {@code null} if it was never set */
    public Long getNumDistinctValues() {
        return numDistinctValues;
    }

    /**
     * @param numDistinctValues number of distinct values; {@code null} is stored as given
     * @return this object, to allow call chaining
     */
    public ResourceFieldStatistics setNumDistinctValues(Long numDistinctValues) {
        this.numDistinctValues = numDistinctValues;
        return this;
    }

    /** @return the internal (mutable, never {@code null}) array of most common values */
    public Object[] getMostCommonValues() {
        return mostCommonValues;
    }

    /**
     * Stores a copy of the given array; a {@code null} argument is ignored.
     *
     * @param mostCommonValues most common values, in descending order of frequency
     * @return this object, to allow call chaining
     */
    public ResourceFieldStatistics setMostCommonValues(Object[] mostCommonValues) {
        if (mostCommonValues != null) {
            this.mostCommonValues = Arrays.copyOf(mostCommonValues, mostCommonValues.length);
        }
        return this;
    }

    /** @return the internal (mutable, never {@code null}) array of frequencies */
    public float[] getMostCommonValuesFreq() {
        return mostCommonValuesFreq;
    }

    /**
     * Stores a copy of the given array; a {@code null} argument is ignored.
     *
     * @param mostCommonValuesFreq frequencies matching the most-common-values array
     * @return this object, to allow call chaining
     */
    public ResourceFieldStatistics setMostCommonValuesFreq(float[] mostCommonValuesFreq) {
        if (mostCommonValuesFreq != null) {
            this.mostCommonValuesFreq =
                    Arrays.copyOf(mostCommonValuesFreq, mostCommonValuesFreq.length);
        }
        return this;
    }

    /** @return the internal (mutable, never {@code null}) value histogram array */
    public Object[] getValueHistogram() {
        return valueHistogram;
    }

    /**
     * Stores a copy of the given array; a {@code null} argument is ignored.
     *
     * @param valueHistogram ordered histogram boundary values
     * @return this object, to allow call chaining
     */
    public ResourceFieldStatistics setValueHistogram(Object[] valueHistogram) {
        if (valueHistogram != null) {
            this.valueHistogram = Arrays.copyOf(valueHistogram, valueHistogram.length);
        }
        return this;
    }

    /*
     * equals() and hashCode() overridden mostly for ease of testing
     * you shouldn't encounter a situation in which you need to .equals()
     * two sets of statistics on different objects "in the wild"
     */
    /**
     * Compares all five properties. Objects of a different runtime class are
     * never equal.
     *
     * @param anOther object to compare with; may be {@code null}
     * @return {@code true} if {@code anOther} has the same class and equal contents
     */
    @Override
    public boolean equals(Object anOther) {
        if (this == anOther) {
            return true;
        }
        if (anOther == null || !(anOther.getClass().equals(this.getClass()))) {
            return false;
        }
        ResourceFieldStatistics other = (ResourceFieldStatistics) anOther;
        // The array fields are never null (see the setters), but numDistinctValues
        // is null until set, so it must be compared null-safely.
        boolean sameDistinct = (numDistinctValues == null)
                ? (other.numDistinctValues == null)
                : numDistinctValues.equals(other.numDistinctValues);
        return Arrays.equals(mostCommonValues, other.mostCommonValues)
                && Arrays.equals(mostCommonValuesFreq, other.mostCommonValuesFreq)
                && Arrays.equals(valueHistogram, other.valueHistogram)
                && sameDistinct
                && this.version == other.version;
    }

    /**
     * A naive hashCode implementation following the example in IBM's developerworks:
     * http://www.ibm.com/developerworks/java/library/j-jtp05273.html
     *
     * @return a hash combining the same five properties that {@link #equals(Object)} compares
     */
    @Override
    public int hashCode() {
        int hash = 1;
        hash = 31 * hash + Arrays.hashCode(mostCommonValues);
        hash = 31 * hash + Arrays.hashCode(mostCommonValuesFreq);
        // numDistinctValues is null until set; treat null as 0 instead of dereferencing it.
        hash = 31 * hash + (numDistinctValues == null ? 0 : numDistinctValues.hashCode());
        hash = 31 * hash + Arrays.hashCode(valueHistogram);
        hash = 31 * hash + version;
        return hash;
    }

    /**
     * Renders all properties as multi-line text. Null array elements are
     * printed as {@code null}.
     *
     * @return a human-readable dump of these statistics
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder("ResourceStatistics. Version: " + version + "\n");
        sb.append("MCV:\n");
        for (Object o : mostCommonValues) {
            // append(Object) prints "null" for a null element instead of throwing.
            sb.append('[').append(o).append(']');
        }
        sb.append("\n MCVfreq:\n");
        for (float f : mostCommonValuesFreq) {
            sb.append('[').append(f).append(']');
        }
        sb.append("\n");
        sb.append("numDistVals: " + numDistinctValues);
        sb.append("valHistogram: \n");
        for (Object o : valueHistogram) {
            sb.append('[').append(o).append(']');
        }
        sb.append("\n");
        return sb.toString();
    }
}
/**
 * Returns the data size expressed in whole mebibytes.
 *
 * @return the size in bytes divided by 1024 twice (integer division), or
 *         {@code null} when no size has been set
 */
public Long getmBytes() {
    // bytes is a nullable Long; unboxing it unguarded would throw a NullPointerException.
    if (this.bytes == null) {
        return null;
    }
    return this.bytes / 1024 / 1024;
}
/**
 * Records the size of the data, in bytes.
 *
 * @param bytes the size in bytes; stored exactly as given, including {@code null}
 */
public void setSizeInBytes(Long bytes) {
    this.bytes = bytes;
}
/**
 * Returns the size of the data as last stored by {@link #setSizeInBytes(Long)}.
 *
 * @return size in bytes, or {@code null} if it was never set
 */
public Long getSizeInBytes() {
    return bytes;
}
/**
 * Returns the record count as last stored by {@link #setNumRecords(Long)}.
 *
 * @return number of records, or {@code null} if it was never set
 */
public Long getNumRecords() {
    return this.numRecords;
}
/**
 * Stores the record count.
 *
 * @param numRecords number of records; stored exactly as given, including {@code null}
 * @return this object, to allow call chaining
 */
public ResourceStatistics setNumRecords(Long numRecords) {
    this.numRecords = numRecords;
    return this;
}
/**
 * Returns the average record size in bytes. This number can be explicitly
 * specified by statistics, or if absent, computed using
 * totalbytes/totalrecords (integer division).
 *
 * @return the explicitly set average if there is one; otherwise the computed
 *         average; {@code null} if it can't be computed because the size or
 *         the record count is missing, or the record count is zero
 */
public Long getAvgRecordSize() {
    if (avgRecordSize != null) {
        return avgRecordSize;
    }
    if (bytes == null || numRecords == null) {
        return null;
    }
    // A zero record count would make the division throw ArithmeticException;
    // the average is simply not computable in that case.
    if (numRecords.longValue() == 0L) {
        return null;
    }
    return bytes / numRecords;
}
/**
 * Stores an explicit average record size, in bytes.
 *
 * @param sizeInBytes average record size in bytes; stored exactly as given,
 *                    including {@code null}
 */
public void setAvgRecordSize(Long sizeInBytes) {
    this.avgRecordSize = sizeInBytes;
}
/**
 * Returns the per-field statistics. The internal array itself is returned,
 * not a copy, so changes made by the caller are visible to this object.
 *
 * @return the internal array of field statistics
 */
public ResourceFieldStatistics[] getFields() {
    return this.fields;
}
/**
 * Replaces the per-field statistics with a copy of the given array. The
 * array is copied but its elements are not. A {@code null} argument leaves
 * the current value untouched.
 *
 * @param fields new field statistics, or {@code null} to change nothing
 * @return this object, to allow call chaining
 */
public ResourceStatistics setFields(ResourceFieldStatistics[] fields) {
    if (fields == null) {
        return this;
    }
    // Copy so the caller's array and ours can be modified independently.
    this.fields = fields.clone();
    return this;
}
/*
 * equals() and hashCode() exist mainly to make testing easier; comparing
 * statistics objects is not expected to be needed "in the wild".
 */
/**
 * Compares the field statistics, the size in bytes and the record count.
 * The average record size is not part of the comparison. Objects of a
 * different runtime class are never equal.
 *
 * @param anOther object to compare with; may be {@code null}
 * @return {@code true} if {@code anOther} has the same class and equal contents
 */
@Override
public boolean equals(Object anOther) {
    if (anOther == null) {
        return false;
    }
    if (anOther.getClass() != this.getClass()) {
        return false;
    }
    ResourceStatistics that = (ResourceStatistics) anOther;
    if (!Arrays.equals(fields, that.fields)) {
        return false;
    }
    if (bytes == null) {
        if (that.bytes != null) {
            return false;
        }
    } else if (!bytes.equals(that.bytes)) {
        return false;
    }
    if (numRecords == null) {
        return that.numRecords == null;
    }
    return numRecords.equals(that.numRecords);
}
/**
 * Combines the hashes of the field statistics, the size in bytes and the
 * record count with the usual multiply-by-31 scheme; a {@code null} value
 * contributes 0.
 *
 * @return hash code over the same properties that {@link #equals(Object)} compares
 */
@Override
public int hashCode() {
    final int fieldsHash = Arrays.hashCode(fields);
    final int bytesHash = (bytes != null) ? bytes.hashCode() : 0;
    final int recordsHash = (numRecords != null) ? numRecords.hashCode() : 0;

    int result = 31 + fieldsHash; // 31 * 1 + fieldsHash
    result = result * 31 + bytesHash;
    result = result * 31 + recordsHash;
    return result;
}
// Probably more in here
/**
 * Renders the per-field statistics followed by the size in bytes and the
 * record count.
 *
 * @return a human-readable dump of these statistics
 */
@Override
public String toString() {
    StringBuilder text = new StringBuilder("Field Stats: \n");
    for (int i = 0; i < fields.length; i++) {
        text.append(fields[i].toString());
    }
    return text.append("bytes: ").append(bytes)
            .append("numRecords: ").append(numRecords)
            .toString();
}
/**
 * Creates a copy of this object that has its own {@code fields} array, so
 * that replacing an element of the array in one object does not affect the
 * other. The array elements themselves are shared between the original and
 * the copy, as they are after {@link #setFields(ResourceFieldStatistics[])}.
 *
 * @return a copy of this object
 * @throws CloneNotSupportedException if {@code Object.clone()} refuses to clone this object
 */
@Override
public Object clone() throws CloneNotSupportedException {
    ResourceStatistics copy = (ResourceStatistics) super.clone();
    // super.clone() is shallow and would leave both objects pointing at the
    // same array; fields is never null, so it can be copied unconditionally.
    copy.fields = Arrays.copyOf(fields, fields.length);
    return copy;
}
}