/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.trevni.avro;

import java.io.IOException;
import java.io.Closeable;
import java.io.File;
import java.nio.ByteBuffer;
import java.util.Iterator;
import java.util.Map;
import java.util.HashMap;
import java.util.List;

import org.apache.trevni.ColumnMetaData;
import org.apache.trevni.ColumnFileReader;
import org.apache.trevni.ColumnValues;
import org.apache.trevni.Input;
import org.apache.trevni.InputFile;
import org.apache.trevni.TrevniRuntimeException;

import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.avro.generic.GenericData;

import static org.apache.trevni.avro.AvroColumnator.isSimple;

/** Read files written with {@link AvroColumnWriter}. A subset of the schema
 * used for writing may be specified when reading. In this case only columns
 * of the subset schema are read.
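 *
 * <p>A minimal usage sketch: {@code file} and the projection schema
 * {@code subset} are assumptions, not part of this API; {@code subset}
 * must be a subset of the schema the file was written with.
 * <pre>{@code
 * AvroColumnReader.Params params =
 *   new AvroColumnReader.Params(file).setSchema(subset);
 * AvroColumnReader<GenericRecord> reader =
 *   new AvroColumnReader<GenericRecord>(params);
 * try {
 *   for (GenericRecord record : reader)
 *     System.out.println(record);
 * } finally {
 *   reader.close();
 * }
 * }</pre>
 */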
public class AvroColumnReader<D>
  implements Iterator<D>, Iterable<D>, Closeable {

  private ColumnFileReader reader;
  private GenericData model;
  private Schema fileSchema;
  private Schema readSchema;
  private ColumnValues[] values;
  private int[] arrayWidths;
  private int column;                          // current index in values

  // defaults for fields in readSchema that are missing from fileSchema,
  // keyed by record full name, then by field name
  private Map<String,Map<String,Object>> defaults =
    new HashMap<String,Map<String,Object>>();

  /** Parameters for reading an Avro column file.
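   *
   * <p>A construction sketch (the path {@code "data.trv"} is hypothetical):
   * <pre>{@code
   * AvroColumnReader.Params params =
   *   new AvroColumnReader.Params(new File("data.trv"))
   *     .setModel(GenericData.get());
   * }</pre>
   */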
  public static class Params {
    Input input;
    Schema schema;
    GenericData model = GenericData.get();

    /** Construct reading from a file. */
    public Params(File file) throws IOException {
      this(new InputFile(file));
    }

    /** Construct reading from input. */
    public Params(Input input) { this.input = input; }

    /** Set subset schema to project data down to. */
    public Params setSchema(Schema schema) {
      this.schema = schema;
      return this;
    }

    /** Set data representation. */
    public Params setModel(GenericData model) {
      this.model = model;
      return this;
    }
  }

  /** Construct a reader for a file. */
  public AvroColumnReader(Params params) throws IOException {
    this.reader = new ColumnFileReader(params.input);
    this.model = params.model;
    this.fileSchema = new Schema.Parser()
      .parse(reader.getMetaData().getString(AvroColumnWriter.SCHEMA_KEY));
    this.readSchema = params.schema == null ? fileSchema : params.schema;
    initialize();
  }

  /** Return the schema for data in this file. */
  public Schema getFileSchema() { return fileSchema; }

  void initialize() throws IOException {
    // compute a mapping from column name to column number in the file
    Map<String,Integer> fileColumnNumbers = new HashMap<String,Integer>();
    int i = 0;
    for (ColumnMetaData c : new AvroColumnator(fileSchema).getColumns())
      fileColumnNumbers.put(c.getName(), i++);

    // create a value iterator for each column of readSchema that is present
    // in the file; fields without a column are filled from defaults when
    // rows are read
    AvroColumnator readColumnator = new AvroColumnator(readSchema);
    this.arrayWidths = readColumnator.getArrayWidths();
    ColumnMetaData[] readColumns = readColumnator.getColumns();
    this.values = new ColumnValues[readColumns.length];
    int j = 0;
    for (ColumnMetaData c : readColumns) {
      Integer n = fileColumnNumbers.get(c.getName());
      if (n != null)
        values[j++] = reader.getValues(n);
    }
    findDefaults(readSchema, fileSchema);
  }

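  // For example (a hypothetical scenario, not tied to any particular file):
  // if the file was written with record fields {a, b} and readSchema also
  // declares a field c with a default value, values[] receives readers for
  // a and b only, and findDefaults() records c's default so that read()
  // can supply it for every row.
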
  // collect defaults for fields in the read schema that are not in the
  // write (file) schema; mismatched types are an error
  private void findDefaults(Schema read, Schema write) {
    switch (read.getType()) {
    case NULL: case BOOLEAN:
    case INT: case LONG:
    case FLOAT: case DOUBLE:
    case BYTES: case STRING:
    case ENUM: case FIXED:
      if (read.getType() != write.getType())
        throw new TrevniRuntimeException("Type mismatch: "+read+" & "+write);
      break;
    case MAP:
      findDefaults(read.getValueType(), write.getValueType());
      break;
    case ARRAY:
      findDefaults(read.getElementType(), write.getElementType());
      break;
    case UNION:
      for (Schema s : read.getTypes()) {
        Integer index = write.getIndexNamed(s.getFullName());
        if (index == null)
          throw new TrevniRuntimeException("No matching branch: "+s);
        findDefaults(s, write.getTypes().get(index));
      }
      break;
    case RECORD:
      for (Field f : read.getFields()) {
        Field g = write.getField(f.name());
        if (g == null)
          setDefault(read, f);                 // field is new in the read schema
        else
          findDefaults(f.schema(), g.schema());
      }
      break;
    default:
      throw new TrevniRuntimeException("Unknown schema: "+read);
    }
  }

  private void setDefault(Schema record, Field f) {
    String recordName = record.getFullName();
    Map<String,Object> recordDefaults = defaults.get(recordName);
    if (recordDefaults == null) {
      recordDefaults = new HashMap<String,Object>();
      defaults.put(recordName, recordDefaults);
    }
    recordDefaults.put(f.name(), model.getDefaultValue(f));
  }

  @Override
  public Iterator<D> iterator() { return this; }

  @Override
  public boolean hasNext() {
    return values[0].hasNext();
  }

  /** Return the number of rows in this file. */
  public long getRowCount() { return reader.getRowCount(); }

  @Override
  @SuppressWarnings("unchecked")
  public D next() {
    try {
      // align every column reader on the next row, then assemble the row
      for (int i = 0; i < values.length; i++)
        if (values[i] != null)
          values[i].startRow();
      this.column = 0;
      return (D)read(readSchema);
    } catch (IOException e) {
      throw new TrevniRuntimeException(e);
    }
  }

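  // A column-layout sketch to make read() easier to follow (the schema is
  // hypothetical): a record {"name":"R","fields":[{"name":"a","type":"int"},
  // {"name":"b","type":{"type":"array","items":"int"}}]} yields two
  // columns, "a" and "b". Each row consumes one value from column "a", and
  // from array column "b" a length followed by that many elements.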
  private Object read(Schema s) throws IOException {
    if (isSimple(s))
      return nextValue(s, column++);

    final int startColumn = column;

    switch (s.getType()) {
    case MAP:
      int size = values[column].nextLength();
      Map map = new HashMap(size);
      for (int i = 0; i < size; i++) {
        this.column = startColumn;
        values[column++].nextValue();                      // null in parent
        String key = (String)values[column++].nextValue(); // key
        map.put(key, read(s.getValueType()));              // value
      }
      column = startColumn + arrayWidths[startColumn];     // skip child columns
      return map;
    case RECORD:
      Object record = model.newRecord(null, s);
      Map<String,Object> rDefaults = defaults.get(s.getFullName());
      for (Field f : s.getFields()) {
        // a field with no column in the file takes its default value
        Object value = ((rDefaults != null) && rDefaults.containsKey(f.name()))
          ? model.deepCopy(f.schema(), rDefaults.get(f.name()))
          : read(f.schema());
        model.setField(record, f.name(), f.pos(), value);
      }
      return record;
    case ARRAY:
      int length = values[column].nextLength();
      List elements = new GenericData.Array(length, s);
      for (int i = 0; i < length; i++) {
        this.column = startColumn;
        // pass the element schema so that enum and fixed elements are
        // decoded; for non-simple elements this just consumes the parent
        // marker and the real value is read below
        Object value = nextValue(s.getElementType(), column++);
        if (!isSimple(s.getElementType()))
          value = read(s.getElementType());
        elements.add(value);
      }
      column = startColumn + arrayWidths[startColumn];     // skip child columns
      return elements;
    case UNION:
      // each non-null branch is stored as a length-0/1 array column;
      // a length of one means this branch holds the row's value
      Object value = null;
      for (Schema branch : s.getTypes()) {
        if (branch.getType() == Schema.Type.NULL) continue;
        if (values[column].nextLength() == 1) {
          value = nextValue(branch, column);
          column++;
          if (!isSimple(branch))
            value = read(branch);
        } else {
          column += arrayWidths[column];
        }
      }
      return value;
    default:
      throw new TrevniRuntimeException("Unknown schema: "+s);
    }
  }

  private Object nextValue(Schema s, int column) throws IOException {
    Object v = values[column].nextValue();
    switch (s.getType()) {
    case ENUM:                                 // decode ordinal into a symbol
      return model.createEnum(s.getEnumSymbols().get((Integer)v), s);
    case FIXED:                                // wrap the bytes in a fixed
      return model.createFixed(null, ((ByteBuffer)v).array(), s);
    }
    return v;
  }

  @Override
  public void remove() { throw new UnsupportedOperationException(); }

  @Override
  public void close() throws IOException {
    reader.close();
  }

}