| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.pig.data; |
| |
| import java.io.ByteArrayInputStream; |
| import java.io.DataInput; |
| import java.io.DataInputStream; |
| import java.io.DataOutput; |
| import java.io.IOException; |
| import java.io.UnsupportedEncodingException; |
| import java.math.BigDecimal; |
| import java.math.BigInteger; |
| import java.nio.ByteBuffer; |
| import java.util.ArrayList; |
| import java.util.Iterator; |
| import java.util.List; |
| |
| import org.apache.commons.logging.Log; |
| import org.apache.commons.logging.LogFactory; |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.hadoop.io.WritableComparator; |
| import org.apache.hadoop.mapred.JobConf; |
| import org.apache.pig.PigException; |
| import org.apache.pig.backend.executionengine.ExecException; |
| import org.apache.pig.impl.io.NullableTuple; |
| import org.apache.pig.impl.util.ObjectSerializer; |
| |
| /** |
| * A default implementation of Tuple. This class will be created by the DefaultTupleFactory. |
| */ |
| public class DefaultTuple extends AbstractTuple { |
| |
| private static final long serialVersionUID = 2L; |
| protected List<Object> mFields; |
| |
| /** |
| * Default constructor. This constructor is public so that hadoop can call it directly. However, inside pig you |
| * should never be calling this function. Use TupleFactory instead. |
| * <br>Time complexity: O(1), after allocation |
| */ |
| public DefaultTuple() { |
| mFields = new ArrayList<Object>(); |
| } |
| |
| /** |
| * Construct a tuple with a known number of fields. Package level so that callers cannot directly invoke it. |
| * <br>Resulting tuple is filled pre-filled with null elements. Time complexity: O(N), after allocation |
| * |
| * @param size |
| * Number of fields to allocate in the tuple. |
| */ |
| DefaultTuple(int size) { |
| mFields = new ArrayList<Object>(size); |
| for (int i = 0; i < size; i++) |
| mFields.add(null); |
| } |
| |
| /** |
| * Construct a tuple from an existing list of objects. Package level so that callers cannot directly invoke it. |
| * <br>Time complexity: O(N) plus running time of input object iteration, after allocation |
| * @param c |
| * List of objects to turn into a tuple. |
| */ |
| DefaultTuple(List<Object> c) { |
| mFields = new ArrayList<Object>(c); |
| } |
| |
| /** |
| * Construct a tuple from an existing list of objects. Package level so that callers cannot directly invoke it. |
| * <br>Time complexity: O(1) |
| * |
| * @param c |
| * List of objects to turn into a tuple. This list will be kept as part of the tuple. |
| * @param junk |
| * Just used to differentiate from the constructor above that copies the list. |
| */ |
| DefaultTuple(List<Object> c, int junk) { |
| mFields = c; |
| } |
| |
| /** |
| * Find the size of the tuple. Used to be called arity(). |
| * |
| * @return number of fields in the tuple. |
| */ |
| @Override |
| public int size() { |
| return mFields.size(); |
| } |
| |
| /** |
| * Get the value in a given field. |
| * |
| * @param fieldNum |
| * Number of the field to get the value for. |
| * @return value, as an Object. |
| * @throws ExecException |
| * if the field number is greater than or equal to the number of fields in the tuple. |
| */ |
| @Override |
| public Object get(int fieldNum) throws ExecException { |
| return mFields.get(fieldNum); |
| } |
| |
| /** |
| * Get all of the fields in the tuple as a list. |
| * |
| * @return List<Object> containing the fields of the tuple in order. |
| */ |
| @Override |
| public List<Object> getAll() { |
| return mFields; |
| } |
| |
| /** |
| * Set the value in a given field. |
| * |
| * @param fieldNum |
| * Number of the field to set the value for. |
| * @param val |
| * Object to put in the indicated field. |
| * @throws ExecException |
| * if the field number is greater than or equal to the number of fields in the tuple. |
| */ |
| @Override |
| public void set(int fieldNum, Object val) throws ExecException { |
| mFields.set(fieldNum, val); |
| } |
| |
| /** |
| * Append a field to a tuple. This method is not efficient as it may force copying of existing data in order to grow |
| * the data structure. Whenever possible you should construct your Tuple with the newTuple(int) method and then fill |
| * in the values with set(), rather than construct it with newTuple() and append values. |
| * |
| * @param val |
| * Object to append to the tuple. |
| */ |
| @Override |
| public void append(Object val) { |
| mFields.add(val); |
| } |
| |
| /** |
| * Determine the size of tuple in memory. This is used by data bags to determine their memory size. This need not be |
| * exact, but it should be a decent estimation. |
| * |
| * @return estimated memory size. |
| */ |
| @Override |
| public long getMemorySize() { |
| Iterator<Object> i = mFields.iterator(); |
| |
| // rest of the fixed portion of mfields size is accounted within empty_tuple_size |
| long mfields_var_size = SizeUtil.roundToEight(4 + 4 * mFields.size()); |
| // in java hotspot 32bit vm, there seems to be a minimum tuple size of 96 |
| // which is probably from the minimum size of this array list |
| mfields_var_size = Math.max(40, mfields_var_size); |
| |
| // fixed overhead = 48 bytes |
| //8 - tuple object header |
| //8 - mFields reference |
| //32 - mFields array list fixed size |
| long sum = 48 + mfields_var_size; |
| while (i.hasNext()) { |
| sum += SizeUtil.getPigObjMemSize(i.next()); |
| } |
| return sum; |
| } |
| |
| @Override |
| public int compareTo(Object other) { |
| if (other instanceof Tuple) { |
| Tuple t = (Tuple) other; |
| int mySz = mFields.size(); |
| int tSz = t.size(); |
| if (tSz < mySz) { |
| return 1; |
| } else if (tSz > mySz) { |
| return -1; |
| } else { |
| for (int i = 0; i < mySz; i++) { |
| try { |
| int c = DataType.compare(mFields.get(i), t.get(i)); |
| if (c != 0) { |
| return c; |
| } |
| } catch (ExecException e) { |
| throw new RuntimeException("Unable to compare tuples", e); |
| } |
| } |
| return 0; |
| } |
| } else { |
| return DataType.compare(this, other); |
| } |
| } |
| |
| public static class DefaultTupleRawComparator extends WritableComparator implements TupleRawComparator { |
| private final Log mLog = LogFactory.getLog(getClass()); |
| private boolean[] mAsc; |
| private boolean mWholeTuple; |
| private boolean mHasNullField; |
| private TupleFactory mFact; |
| |
        /**
         * Registers DefaultTuple as the key class with Hadoop's WritableComparator machinery.
         */
        public DefaultTupleRawComparator() {
            super(DefaultTuple.class);
        }
| |
        /**
         * {@inheritDoc}
         * <p>
         * NOTE(review): always returns null, even after setConf() has run — the Configuration is not retained, so
         * this does not honor the usual Configurable round-trip contract. Confirm no caller relies on it.
         */
        @Override
        public Configuration getConf() {
            return null;
        }
| |
| @Override |
| public void setConf(Configuration conf) { |
| try { |
| mAsc = (boolean[]) ObjectSerializer.deserialize(conf.get("pig.sortOrder")); |
| } catch (IOException ioe) { |
| mLog.error("Unable to deserialize pig.sortOrder " + ioe.getMessage()); |
| throw new RuntimeException(ioe); |
| } |
| if (mAsc == null) { |
| mAsc = new boolean[1]; |
| mAsc[0] = true; |
| } |
| // If there's only one entry in mAsc, it means it's for the whole |
| // tuple. So we can't be looking for each column. |
| mWholeTuple = (mAsc.length == 1); |
| mFact = TupleFactory.getInstance(); |
| } |
| |
        /**
         * Reports whether the most recent raw-bytes comparison encountered a null field in the top-level tuple
         * (the flag is reset and set by compareDefaultTuple).
         *
         * @return true if the last compared top-level tuple contained a null field.
         */
        @Override
        public boolean hasComparedTupleNull() {
            return mHasNullField;
        }
| |
| /** |
| * Compare two DefaultTuples as raw bytes. We assume the Tuples are NOT PigNullableWritable, so client classes |
| * need to deal with Null and Index. |
| */ |
| @Override |
| public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) { |
| ByteBuffer bb1 = ByteBuffer.wrap(b1, s1, l1); |
| ByteBuffer bb2 = ByteBuffer.wrap(b2, s2, l2); |
| int rc = compareDefaultTuple(bb1, bb2, true); // FIXME adjust for secondary sort asc |
| return rc; |
| } |
| |
| /** |
| * Compare two DefaultTuples as raw bytes. |
| */ |
| private int compareDefaultTuple(ByteBuffer bb1, ByteBuffer bb2, boolean topLevel) { |
| mHasNullField = false; |
| // store the position in case of deserialization |
| int s1 = bb1.position(); |
| int s2 = bb2.position(); |
| int rc = 0; |
| byte tupleType1 = bb1.get(); |
| byte tupleType2 = bb2.get(); |
| assert (tupleType1 == tupleType2 && tupleType1 == DataType.TUPLE); |
| // first compare sizes |
| int sz1 = bb1.getInt(); |
| int sz2 = bb2.getInt(); |
| if (sz1 > sz2) { |
| return 1; |
| } else if (sz1 < sz2) { |
| return -1; |
| } else { |
| // if sizes are the same, compare field by field |
| for (int i = 0; i < sz1 && rc == 0; i++) { |
| byte dt1 = bb1.get(); |
| byte dt2 = bb2.get(); |
| if (dt1 == dt2) { |
| switch (dt1) { |
| case DataType.NULL: |
| if (topLevel) // we are scanning the top-level Tuple (original call) |
| mHasNullField = true; |
| rc = 0; |
| break; |
| case DataType.BOOLEAN: |
| case DataType.BYTE: |
| byte bv1 = bb1.get(); |
| byte bv2 = bb2.get(); |
| rc = (bv1 < bv2 ? -1 : (bv1 == bv2 ? 0 : 1)); |
| break; |
| case DataType.INTEGER: |
| int iv1 = bb1.getInt(); |
| int iv2 = bb2.getInt(); |
| rc = (iv1 < iv2 ? -1 : (iv1 == iv2 ? 0 : 1)); |
| break; |
| case DataType.LONG: |
| long lv1 = bb1.getLong(); |
| long lv2 = bb2.getLong(); |
| rc = (lv1 < lv2 ? -1 : (lv1 == lv2 ? 0 : 1)); |
| break; |
| case DataType.FLOAT: |
| float fv1 = bb1.getFloat(); |
| float fv2 = bb2.getFloat(); |
| rc = Float.compare(fv1, fv2); |
| break; |
| case DataType.DOUBLE: |
| double dv1 = bb1.getDouble(); |
| double dv2 = bb2.getDouble(); |
| rc = Double.compare(dv1, dv2); |
| break; |
| case DataType.BIGINTEGER: { |
| if (bb1.get() != DataType.BYTEARRAY || bb2.get() != DataType.BYTEARRAY) { |
| throw new RuntimeException("Issue in comparing raw bytes for DefaultTuple! BIGINTEGER was not serialized with BYTEARRAY"); |
| } |
| |
| int basz1 = bb1.getInt(); |
| int basz2 = bb2.getInt(); |
| byte[] ba1 = new byte[basz1]; |
| byte[] ba2 = new byte[basz2]; |
| bb1.get(ba1); |
| bb2.get(ba2); |
| rc = new BigInteger(ba1).compareTo(new BigInteger(ba2)); |
| break; |
| } |
| case DataType.BIGDECIMAL: { |
| byte catype1 = bb1.get(); |
| byte catype2 = bb2.get(); |
| int casz1 = (catype1 == DataType.CHARARRAY) ? bb1.getShort() : bb1.getInt(); |
| int casz2 = (catype2 == DataType.CHARARRAY) ? bb2.getShort() : bb2.getInt(); |
| byte[] ca1 = new byte[casz1]; |
| byte[] ca2 = new byte[casz2]; |
| bb1.get(ca1); |
| bb2.get(ca2); |
| String str1 = null, |
| str2 = null; |
| try { |
| str1 = new String(ca1, DataReaderWriter.UTF8); |
| str2 = new String(ca2, DataReaderWriter.UTF8); |
| } catch (UnsupportedEncodingException uee) { |
| mLog.warn("Unsupported string encoding", uee); |
| uee.printStackTrace(); |
| } |
| if (str1 != null && str2 != null) |
| rc = new BigDecimal(str1).compareTo(new BigDecimal(str2)); |
| break; |
| } |
| case DataType.DATETIME: |
| long dtv1 = bb1.getLong(); |
| bb1.position(bb1.position() + 2); // move cursor forward without read the timezone bytes |
| long dtv2 = bb2.getLong(); |
| bb2.position(bb2.position() + 2); |
| rc = (dtv1 < dtv2 ? -1 : (dtv1 == dtv2 ? 0 : 1)); |
| break; |
| case DataType.BYTEARRAY: |
| int basz1 = bb1.getInt(); |
| int basz2 = bb2.getInt(); |
| byte[] ba1 = new byte[basz1]; |
| byte[] ba2 = new byte[basz2]; |
| bb1.get(ba1); |
| bb2.get(ba2); |
| rc = DataByteArray.compare(ba1, ba2); |
| break; |
| case DataType.CHARARRAY: |
| case DataType.BIGCHARARRAY: |
| int casz1 = (dt1 == DataType.CHARARRAY) ? bb1.getShort() : bb1.getInt(); |
| int casz2 = (dt1 == DataType.CHARARRAY) ? bb2.getShort() : bb2.getInt(); |
| byte[] ca1 = new byte[casz1]; |
| byte[] ca2 = new byte[casz2]; |
| bb1.get(ca1); |
| bb2.get(ca2); |
| String str1 = null, |
| str2 = null; |
| try { |
| str1 = new String(ca1, DataReaderWriter.UTF8); |
| str2 = new String(ca2, DataReaderWriter.UTF8); |
| } catch (UnsupportedEncodingException uee) { |
| mLog.warn("Unsupported string encoding", uee); |
| uee.printStackTrace(); |
| } |
| if (str1 != null && str2 != null) |
| rc = str1.compareTo(str2); |
| break; |
| case DataType.TUPLE: |
| // put back the cursor to before DataType.TUPLE |
| bb1.position(bb1.position() - 1); |
| bb2.position(bb2.position() - 1); |
| rc = compareDefaultTuple(bb1, bb2, false); |
| break; |
| default: |
| mLog.info("Unsupported DataType for binary comparison, switching to object deserialization: " |
| + DataType.genTypeToNameMap().get(dt1) + "(" + dt1 + ")"); |
| Tuple t1 = mFact.newTuple(); |
| Tuple t2 = mFact.newTuple(); |
| try { |
| t1.readFields(new DataInputStream( |
| new ByteArrayInputStream(bb1.array(), s1, bb1.limit()))); |
| t2.readFields(new DataInputStream( |
| new ByteArrayInputStream(bb2.array(), s2, bb2.limit()))); |
| } catch (IOException ioe) { |
| mLog.error("Unable to instantiate tuples for comparison: " + ioe.getMessage()); |
| throw new RuntimeException(ioe.getMessage(), ioe); |
| } |
| // delegate to compareTuple |
| return compareTuple(t1, t2); |
| } |
| } else { // compare DataTypes |
| if (dt1 < dt2) |
| rc = -1; |
| else |
| rc = 1; |
| } |
| // flip if the order is descending |
| if (rc != 0) { |
| if (!mWholeTuple && !mAsc[i]) |
| rc *= -1; |
| else if (mWholeTuple && !mAsc[0]) |
| rc *= -1; |
| } |
| } |
| } |
| return rc; |
| } |
| |
| @Override |
| public int compare(Object o1, Object o2) { |
| NullableTuple nt1 = (NullableTuple) o1; |
| NullableTuple nt2 = (NullableTuple) o2; |
| int rc = 0; |
| |
| // if either are null, handle differently |
| if (!nt1.isNull() && !nt2.isNull()) { |
| rc = compareTuple((Tuple) nt1.getValueAsPigType(), (Tuple) nt2.getValueAsPigType()); |
| } else { |
| // for sorting purposes two nulls are equal |
| if (nt1.isNull() && nt2.isNull()) |
| rc = 0; |
| else if (nt1.isNull()) |
| rc = -1; |
| else |
| rc = 1; |
| if (mWholeTuple && !mAsc[0]) |
| rc *= -1; |
| } |
| return rc; |
| } |
| |
| private int compareTuple(Tuple t1, Tuple t2) { |
| int sz1 = t1.size(); |
| int sz2 = t2.size(); |
| if (sz2 < sz1) { |
| return 1; |
| } else if (sz2 > sz1) { |
| return -1; |
| } else { |
| for (int i = 0; i < sz1; i++) { |
| try { |
| int c = DataType.compare(t1.get(i), t2.get(i)); |
| if (c != 0) { |
| if (!mWholeTuple && !mAsc[i]) |
| c *= -1; |
| else if (mWholeTuple && !mAsc[0]) |
| c *= -1; |
| return c; |
| } |
| } catch (ExecException e) { |
| throw new RuntimeException("Unable to compare tuples", e); |
| } |
| } |
| return 0; |
| } |
| } |
| |
| } |
| |
| @Override |
| public int hashCode() { |
| int hash = 17; |
| for (Iterator<Object> it = mFields.iterator(); it.hasNext();) { |
| Object o = it.next(); |
| if (o != null) { |
| hash = 31 * hash + o.hashCode(); |
| } |
| } |
| return hash; |
| } |
| |
| @Override |
| public void write(DataOutput out) throws IOException { |
| out.writeByte(DataType.TUPLE); |
| int sz = size(); |
| out.writeInt(sz); |
| for (int i = 0; i < sz; i++) { |
| DataReaderWriter.writeDatum(out, mFields.get(i)); |
| } |
| } |
| |
| @Override |
| public void readFields(DataInput in) throws IOException { |
| // Clear our fields, in case we're being reused. |
| mFields.clear(); |
| |
| // Make sure it's a tuple. |
| byte b = in.readByte(); |
| if (b != DataType.TUPLE) { |
| int errCode = 2112; |
| String msg = "Unexpected data while reading tuple " + "from binary file."; |
| throw new ExecException(msg, errCode, PigException.BUG); |
| } |
| // Read the number of fields |
| int sz = in.readInt(); |
| for (int i = 0; i < sz; i++) { |
| try { |
| append(DataReaderWriter.readDatum(in)); |
| } catch (ExecException ee) { |
| throw ee; |
| } |
| } |
| } |
| |
    /**
     * Expose the raw comparator used to compare serialized DefaultTuples byte-by-byte.
     *
     * @return the DefaultTupleRawComparator class.
     */
    public static Class<? extends TupleRawComparator> getComparatorClass() {
        return DefaultTupleRawComparator.class;
    }
| } |