| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.pig.piggybank.storage; |
| |
| import java.io.IOException; |
| import java.io.OutputStream; |
| import java.util.Iterator; |
| import java.util.Map; |
| import java.util.Properties; |
| |
| import org.apache.hadoop.hive.ql.io.RCFileOutputFormat; |
| import org.apache.hadoop.hive.serde2.ByteStream; |
| import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable; |
| import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable; |
| import org.apache.hadoop.mapreduce.Job; |
| import org.apache.hadoop.mapreduce.OutputFormat; |
| import org.apache.pig.PigException; |
| import org.apache.pig.ResourceSchema; |
| import org.apache.pig.backend.executionengine.ExecException; |
| import org.apache.pig.builtin.PigStorage; |
| import org.apache.pig.data.DataBag; |
| import org.apache.pig.data.DataByteArray; |
| import org.apache.pig.data.DataType; |
| import org.apache.pig.data.Tuple; |
| import org.apache.pig.impl.util.UDFContext; |
| import org.apache.pig.piggybank.storage.hiverc.HiveRCOutputFormat; |
| |
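/**
 * Store function that writes Pig tuples as Hive RCFiles (record-columnar
 * files), so the output can be read directly by Hive. Scalar fields are
 * serialized as text; maps, tuples, and bags are flattened using Hive's
 * control-character delimiters.
 * <p>
 * A minimal usage sketch (the paths and aliases are hypothetical):
 * <pre>
 * a = LOAD 'input' AS (name:chararray, age:int);
 * STORE a INTO '/tmp/rcfile_out'
 *     USING org.apache.pig.piggybank.storage.HiveColumnarStorage();
 * </pre>
 */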
| public class HiveColumnarStorage extends PigStorage { |
| private static final String UTF8 = "UTF-8"; |
| |
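    // Hive's standard nested-data delimiters: Ctrl-B (\002) separates the
    // items of a collection, Ctrl-C (\003) separates a map key from its value.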
| private static final char LIST_DELIMITER = 2; |
| private static final char MAP_DELIMITER = 3; |
| |
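    // Reusable per-writer buffers: all column bytes for the current row go
    // into one shared stream, and each column reference points at its slice.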
| private int numColumns = -1; |
| private ByteStream.Output byteStream; |
| private BytesRefArrayWritable rowWritable; |
| private BytesRefWritable[] colValRefs; |
| |
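    /** Rows are written with Piggybank's output format for Hive RCFiles. */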
| @Override |
| public OutputFormat getOutputFormat() { |
| return new HiveRCOutputFormat(); |
| } |
| |
| @Override |
| public void setStoreLocation(String location, Job job) throws IOException { |
| super.setStoreLocation(location, job); |
        // Recover the column count that checkSchema() recorded on the front end.
        Properties p = getUDFProperties();
        if (p != null) {
            numColumns = Integer.parseInt(p.getProperty("numColumns", "-1"));
        }

        // RCFile lays data out column by column, so the writer must know the
        // column count before any rows are written.
        if (numColumns > 0) {
            RCFileOutputFormat.setColumnNumber(job.getConfiguration(), numColumns);
        }
| } |
| |
| @Override |
| public void checkSchema(ResourceSchema s) throws IOException { |
| super.checkSchema(s); |
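        // Record the column count so setStoreLocation() can retrieve it on
        // the backend, where the schema itself is no longer available.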
| getUDFProperties().setProperty("numColumns", Integer.toString(s.getFields().length)); |
| } |
| |
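    /**
     * Serializes one tuple into a reusable BytesRefArrayWritable row and
     * hands it to the RCFile record writer.
     */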
| @SuppressWarnings("unchecked") |
| @Override |
| public void putNext(Tuple t) throws IOException { |
| |
        if (rowWritable == null) { // lazily initialize reusable buffers on the first call
            if (numColumns < 1) {
                throw new IOException("number of columns is not set; HiveColumnarStorage requires a store schema");
            }
| |
| byteStream = new ByteStream.Output(); |
| rowWritable = new BytesRefArrayWritable(); |
| colValRefs = new BytesRefWritable[numColumns]; |
| |
| for (int i = 0; i < numColumns; i++) { |
| colValRefs[i] = new BytesRefWritable(); |
| rowWritable.set(i, colValRefs[i]); |
| } |
| } |
| |
| byteStream.reset(); |
| |
        int sz = t.size();
        int startPos = 0;

        // Serialize each field into the shared stream and point the matching
        // column reference at its slice; fields beyond the declared column
        // count are silently dropped.
        for (int i = 0; i < sz && i < numColumns; i++) {
            putField(byteStream, t.get(i));
            colValRefs[i].set(byteStream.getData(), startPos, byteStream.getLength() - startPos);
            startPos = byteStream.getLength();
        }

        // A tuple shorter than the declared schema would otherwise leave its
        // trailing columns pointing at the previous row's bytes; emit them as
        // empty columns instead.
        for (int i = sz; i < numColumns; i++) {
            colValRefs[i].set(byteStream.getData(), startPos, 0);
        }
| |
| try { |
| writer.write(null, rowWritable); |
| } catch (InterruptedException e) { |
| throw new IOException(e); |
| } |
| } |
| |
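    /**
     * Returns properties scoped to this store instance via the UDFContext,
     * keyed by class and store signature so that multiple instances in one
     * script do not collide.
     */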
| private Properties getUDFProperties() { |
| return UDFContext.getUDFContext().getUDFProperties(this.getClass(), |
| new String[] { signature }); |
| } |
| |
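    /**
     * Writes a single field using Hive's text serialization: scalars as
     * their string form, map entries as key MAP_DELIMITER value, and the
     * items of maps, tuples, and bags joined with LIST_DELIMITER.
     */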
    @SuppressWarnings("unchecked") // the Map casts below cannot be checked at runtime
    public void putField(OutputStream out, Object field) throws IOException {
| |
| switch (DataType.findType(field)) { |
| case DataType.NULL: |
| break; // just leave it empty |
| |
        // Encode scalars explicitly as UTF-8 rather than relying on the
        // platform default charset.
        case DataType.BOOLEAN:
            out.write(((Boolean) field).toString().getBytes(UTF8));
            break;

        case DataType.INTEGER:
            out.write(((Integer) field).toString().getBytes(UTF8));
            break;

        case DataType.LONG:
            out.write(((Long) field).toString().getBytes(UTF8));
            break;

        case DataType.FLOAT:
            out.write(((Float) field).toString().getBytes(UTF8));
            break;

        case DataType.DOUBLE:
            out.write(((Double) field).toString().getBytes(UTF8));
            break;
| |
| case DataType.BYTEARRAY: |
| byte[] b = ((DataByteArray) field).get(); |
| out.write(b, 0, b.length); |
| break; |
| |
| case DataType.CHARARRAY: |
| out.write(((String) field).getBytes(UTF8)); |
| break; |
| |
        // MAP and INTERNALMAP serialize identically, so they share one case.
        case DataType.MAP:
        case DataType.INTERNALMAP:
            boolean mapHasNext = false;
            Map<String, Object> m = (Map<String, Object>) field;

            for (Map.Entry<String, Object> e : m.entrySet()) {
                if (mapHasNext) {
                    out.write(LIST_DELIMITER);
                } else {
                    mapHasNext = true;
                }
                putField(out, e.getKey());
                out.write(MAP_DELIMITER);
                putField(out, e.getValue());
            }

            break;
| |
| case DataType.TUPLE: |
| boolean tupleHasNext = false; |
| Tuple t = (Tuple) field; |
| |
| for (int i = 0; i < t.size(); ++i) { |
| if (tupleHasNext) { |
| out.write(LIST_DELIMITER); |
| } else { |
| tupleHasNext = true; |
| } |
                // t.get(i) throws ExecException, which is already an
                // IOException, so no catch-and-rethrow is needed here.
                putField(out, t.get(i));
| } |
| |
| break; |
| |
| case DataType.BAG: |
| boolean bagHasNext = false; |
| Iterator<Tuple> tupleIter = ((DataBag) field).iterator(); |
| while (tupleIter.hasNext()) { |
| if (bagHasNext) { |
| out.write(LIST_DELIMITER); |
| } else { |
| bagHasNext = true; |
| } |
| putField(out, tupleIter.next()); |
| } |
| |
| break; |
| |
        default: {
            int errCode = 2108;
            String msg = "Unsupported data type for field: " + field;
            throw new ExecException(msg, errCode, PigException.BUG);
        }
| |
| } |
| } |
| |
| } |