| /** |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.tajo.storage.avro; |
| |
| import com.google.protobuf.InvalidProtocolBufferException; |
| import com.google.protobuf.Message; |
| import org.apache.avro.Schema; |
| import org.apache.avro.file.DataFileReader; |
| import org.apache.avro.file.SeekableInput; |
| import org.apache.avro.generic.GenericDatumReader; |
| import org.apache.avro.generic.GenericFixed; |
| import org.apache.avro.generic.GenericRecord; |
| import org.apache.avro.io.DatumReader; |
| import org.apache.avro.mapred.FsInput; |
| import org.apache.avro.util.Utf8; |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.tajo.catalog.Column; |
| import org.apache.tajo.catalog.TableMeta; |
| import org.apache.tajo.common.TajoDataTypes; |
| import org.apache.tajo.common.TajoDataTypes.DataType; |
| import org.apache.tajo.datum.*; |
| import org.apache.tajo.exception.TajoRuntimeException; |
| import org.apache.tajo.exception.UnsupportedException; |
| import org.apache.tajo.plan.expr.EvalNode; |
| import org.apache.tajo.storage.FileScanner; |
| import org.apache.tajo.storage.Tuple; |
| import org.apache.tajo.storage.VTuple; |
| import org.apache.tajo.storage.fragment.Fragment; |
| |
| import java.io.IOException; |
| import java.nio.ByteBuffer; |
| import java.util.List; |
| |
| /** |
| * FileScanner for reading Avro files |
| */ |
| public class AvroScanner extends FileScanner { |
| private List<Schema.Field> avroFields; |
| private DataFileReader<GenericRecord> dataFileReader; |
| private int[] projectionMap; |
| private Tuple outTuple; |
| |
| /** |
| * Creates a new AvroScanner. |
| * |
| * @param conf |
| * @param schema |
| * @param meta |
| * @param fragment |
| */ |
| public AvroScanner(Configuration conf, |
| final org.apache.tajo.catalog.Schema schema, |
| final TableMeta meta, final Fragment fragment) { |
| super(conf, schema, meta, fragment); |
| } |
| |
| /** |
| * Initializes the AvroScanner. |
| */ |
| @Override |
| public void init() throws IOException { |
| if (targets == null) { |
| targets = schema.toArray(); |
| } |
| prepareProjection(targets); |
| outTuple = new VTuple(projectionMap.length); |
| |
| Schema avroSchema = AvroUtil.getAvroSchema(meta, conf); |
| avroFields = avroSchema.getFields(); |
| |
| DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(avroSchema); |
| SeekableInput input = new FsInput(fragment.getPath(), conf); |
| dataFileReader = new DataFileReader<>(input, datumReader); |
| super.init(); |
| } |
| |
| private void prepareProjection(Column[] targets) { |
| projectionMap = new int[targets.length]; |
| for (int i = 0; i < targets.length; ++i) { |
| projectionMap[i] = schema.getColumnId(targets[i].getQualifiedName()); |
| } |
| } |
| |
| private static String fromAvroString(Object value) { |
| if (value instanceof Utf8) { |
| Utf8 utf8 = (Utf8)value; |
| return utf8.toString(); |
| } |
| return value.toString(); |
| } |
| |
| private static Schema getNonNull(Schema schema) { |
| if (!schema.getType().equals(Schema.Type.UNION)) { |
| return schema; |
| } |
| List<Schema> schemas = schema.getTypes(); |
| if (schemas.size() != 2) { |
| return schema; |
| } |
| if (schemas.get(0).getType().equals(Schema.Type.NULL)) { |
| return schemas.get(1); |
| } else if (schemas.get(1).getType().equals(Schema.Type.NULL)) { |
| return schemas.get(0); |
| } else { |
| return schema; |
| } |
| } |
| |
| private Datum convertInt(Object value, TajoDataTypes.Type tajoType) { |
| int intValue = (Integer)value; |
| switch (tajoType) { |
| case BIT: |
| return DatumFactory.createBit((byte)(intValue & 0xff)); |
| case INT2: |
| return DatumFactory.createInt2((short)intValue); |
| default: |
| return DatumFactory.createInt4(intValue); |
| } |
| } |
| |
| private Datum convertBytes(Object value, TajoDataTypes.Type tajoType, |
| DataType dataType) { |
| ByteBuffer buffer = (ByteBuffer)value; |
| byte[] bytes = new byte[buffer.capacity()]; |
| buffer.get(bytes, 0, bytes.length); |
| switch (tajoType) { |
| case PROTOBUF: |
| try { |
| ProtobufDatumFactory factory = |
| ProtobufDatumFactory.get(dataType.getCode()); |
| Message.Builder builder = factory.newBuilder(); |
| builder.mergeFrom(bytes); |
| return ProtobufDatumFactory.createDatum(builder); |
| } catch (InvalidProtocolBufferException e) { |
| throw new RuntimeException(e); |
| } |
| default: |
| return new BlobDatum(bytes); |
| } |
| } |
| |
| private Datum convertString(Object value, TajoDataTypes.Type tajoType) { |
| switch (tajoType) { |
| case CHAR: |
| return DatumFactory.createChar(fromAvroString(value)); |
| default: |
| return DatumFactory.createText(fromAvroString(value)); |
| } |
| } |
| |
| /** |
| * Reads the next Tuple from the Avro file. |
| * |
| * @return The next Tuple from the Avro file or null if end of file is |
| * reached. |
| */ |
| @Override |
| public Tuple next() throws IOException { |
| if (!dataFileReader.hasNext()) { |
| return null; |
| } |
| |
| GenericRecord record = dataFileReader.next(); |
| for (int i = 0; i < projectionMap.length; ++i) { |
| int columnIndex = projectionMap[i]; |
| Object value = record.get(columnIndex); |
| if (value == null) { |
| outTuple.put(i, NullDatum.get()); |
| continue; |
| } |
| |
| // Get Avro type. |
| Schema.Field avroField = avroFields.get(columnIndex); |
| Schema nonNullAvroSchema = getNonNull(avroField.schema()); |
| Schema.Type avroType = nonNullAvroSchema.getType(); |
| |
| // Get Tajo type. |
| Column column = schema.getColumn(columnIndex); |
| DataType dataType = column.getDataType(); |
| TajoDataTypes.Type tajoType = dataType.getType(); |
| switch (avroType) { |
| case NULL: |
| outTuple.put(i, NullDatum.get()); |
| break; |
| case BOOLEAN: |
| outTuple.put(i, DatumFactory.createBool((Boolean) value)); |
| break; |
| case INT: |
| outTuple.put(i, convertInt(value, tajoType)); |
| break; |
| case LONG: |
| outTuple.put(i, DatumFactory.createInt8((Long) value)); |
| break; |
| case FLOAT: |
| outTuple.put(i, DatumFactory.createFloat4((Float) value)); |
| break; |
| case DOUBLE: |
| outTuple.put(i, DatumFactory.createFloat8((Double) value)); |
| break; |
| case BYTES: |
| outTuple.put(i, convertBytes(value, tajoType, dataType)); |
| break; |
| case STRING: |
| outTuple.put(i, convertString(value, tajoType)); |
| break; |
| case RECORD: |
| throw new RuntimeException("Avro RECORD not supported."); |
| case ENUM: |
| throw new RuntimeException("Avro ENUM not supported."); |
| case MAP: |
| throw new RuntimeException("Avro MAP not supported."); |
| case UNION: |
| throw new RuntimeException("Avro UNION not supported."); |
| case FIXED: |
| outTuple.put(i, new BlobDatum(((GenericFixed) value).bytes())); |
| break; |
| default: |
| throw new RuntimeException("Unknown type."); |
| } |
| } |
| return outTuple; |
| } |
| |
| /** |
| * Resets the scanner |
| */ |
| @Override |
| public void reset() throws IOException { |
| } |
| |
| /** |
| * Closes the scanner. |
| */ |
| @Override |
| public void close() throws IOException { |
| if (dataFileReader != null) { |
| dataFileReader.close(); |
| } |
| outTuple = null; |
| } |
| |
| /** |
| * Returns whether this scanner is projectable. |
| * |
| * @return true |
| */ |
| @Override |
| public boolean isProjectable() { |
| return true; |
| } |
| |
| /** |
| * Returns whether this scanner is selectable. |
| * |
| * @return false |
| */ |
| @Override |
| public boolean isSelectable() { |
| return false; |
| } |
| |
| @Override |
| public void setFilter(EvalNode filter) { |
| throw new TajoRuntimeException(new UnsupportedException()); |
| } |
| |
| /** |
| * Returns whether this scanner is splittable. |
| * |
| * @return false |
| */ |
| @Override |
| public boolean isSplittable() { |
| return false; |
| } |
| |
| @Override |
| public float getProgress() { |
| if (!inited) return super.getProgress(); |
| |
| if (!dataFileReader.hasNext()) { |
| return 1.0f; |
| } else { |
| return 0.0f; |
| } |
| } |
| } |