testdata/TableFlattener/src/main/java/org/apache/impala/infra/tableflattener/FlattenedSchema.java - impala - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.

 package org.apache.impala.infra.tableflattener;

 import com.google.common.base.Preconditions;
 import com.google.common.collect.Maps;
 import org.apache.avro.Schema.Field;
 import org.apache.avro.generic.GenericRecord;
 import org.kitesdk.data.Dataset;
 import org.kitesdk.data.DatasetWriter;

 import java.util.Map;

 // This class contains information about how unnested datasets are related.
 public class FlattenedSchema {

   // If a dataset has nesting, an id field will be created so that records in the child
   // dataset can reference the parent.
   private long referenceId;

   // If this dataset has a parent, the parentIdField_ indicates which field in this
   // dataset is the foreign key to the parent dataset. If this dataset does not have a
   // parent, this should be null??
   private Field parentIdField_;

   // The name of the data set, mainly used to find a child dataset.
   private String name_;
   private Map<String, FlattenedSchema> childrenByName_ = Maps.newHashMap();

   // The actual dataset object.
   private Dataset<GenericRecord> dataset_;
   private DatasetWriter<GenericRecord> datasetWriter_;

   private final String idFieldName_ = "id";

   public FlattenedSchema(String name) {
     name_ = name;
     referenceId = 0;
   }

   public FlattenedSchema(String name, FlattenedSchema parent) {
     this(name);
     parent.childrenByName_.put(name, this);
   }

   // Opens this dataset and all children for writing.
   public void open() {
     if (datasetWriter_ != null) return;
     datasetWriter_ = dataset_.newWriter();
     for (FlattenedSchema child : childrenByName_.values()) {
       child.open();
     }
   }

   // Write a record to this dataset.
   public void write(GenericRecord record) {
     Preconditions.checkNotNull(datasetWriter_, "open() must be called before writing");
     datasetWriter_.write(record);
   }

   // Close this dataset and all children.
   public void close() {
     if (datasetWriter_ == null) return;
     datasetWriter_.close();
     for (FlattenedSchema child : childrenByName_.values()) {
       child.close();
     }
     datasetWriter_ = null;
   }

   // Generates a new id for a new record in this dataset.
   public Long nextId() { return ++referenceId; }

   // Get the name to use when creating an id field.
   public String getIdFieldName() { return idFieldName_; }

   // Get the name of the field used to store the values of an array or map.
   public String getCollectionValueFieldName() { return "value"; }

   // Get the name of the field used to store the index of an array value..
   public String getArrayIdxFieldName() { return "idx"; }

   // Get the name of the field used to store the key of a map entry.
   public String getMapKeyFieldName() { return "key"; }

   // Get the name of a child dataset if this dataset corresponds to an array or map.
   public String getChildOfCollectionName() {
     return name_ + getNameSeparator() + "_values";
   }

   public String getIsNullFieldName(String fieldName) { return fieldName + "_is_null"; }

   // Get the separator when concatenating field or dataset names.
   public static String getNameSeparator() { return "_"; }

   // Get the child of this dataset if this dataset corresponds to an array or map.
   public FlattenedSchema getChildOfCollection() {
     FlattenedSchema child = childrenByName_.get(getChildOfCollectionName());
     Preconditions.checkNotNull(child);
     return child;
   }

   // Get the name of a child dataset if this dataset corresponds to a record.
   public String getChildOfRecordName(String parentFieldName) {
     return name_ + getNameSeparator() + parentFieldName;
   }

   // Get the child of this dataset if this dataset corresponds to a record.
   public FlattenedSchema getChildOfRecord(String parentFieldName) {
     FlattenedSchema child = childrenByName_.get(getChildOfRecordName(parentFieldName));
     Preconditions.checkNotNull(child);
     return child;
   }

   @Override
   public String toString() {
     StringBuilder builder = new StringBuilder();
     builder.append(dataset_.getDescriptor().getSchema().toString(true));
     for (FlattenedSchema child : childrenByName_.values()) {
       builder.append("\n\nChild: ")
           .append(child.name_)
           .append("\n")
           .append(child.toString());

     }
     return builder.toString();
   }

   public String getName() { return name_; }

   public Field getParentIdField() { return parentIdField_; }

   public void setParentIdField(Field parentIdField) {
     parentIdField_ = parentIdField;
   }

   public Dataset getDataset() { return dataset_; }

   public void setDataset(Dataset<GenericRecord> dataset) { dataset_ = dataset; }
 }
	// Licensed to the Apache Software Foundation (ASF) under one
	// or more contributor license agreements. See the NOTICE file
	// distributed with this work for additional information
	// regarding copyright ownership. The ASF licenses this file
	// to you under the Apache License, Version 2.0 (the
	// "License"); you may not use this file except in compliance
	// with the License. You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing,
	// software distributed under the License is distributed on an
	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	// KIND, either express or implied. See the License for the
	// specific language governing permissions and limitations
	// under the License.

	package org.apache.impala.infra.tableflattener;

	import com.google.common.base.Preconditions;
	import com.google.common.collect.Maps;
	import org.apache.avro.Schema.Field;
	import org.apache.avro.generic.GenericRecord;
	import org.kitesdk.data.Dataset;
	import org.kitesdk.data.DatasetWriter;

	import java.util.Map;

	// This class contains information about how unnested datasets are related.
	public class FlattenedSchema {

	// If a dataset has nesting, an id field will be created so that records in the child
	// dataset can reference the parent.
	private long referenceId;

	// If this dataset has a parent, the parentIdField_ indicates which field in this
	// dataset is the foreign key to the parent dataset. If this dataset does not have a
	// parent, this should be null??
	private Field parentIdField_;

	// The name of the data set, mainly used to find a child dataset.
	private String name_;
	private Map<String, FlattenedSchema> childrenByName_ = Maps.newHashMap();

	// The actual dataset object.
	private Dataset<GenericRecord> dataset_;
	private DatasetWriter<GenericRecord> datasetWriter_;

	private final String idFieldName_ = "id";

	public FlattenedSchema(String name) {
	name_ = name;
	referenceId = 0;
	}

	public FlattenedSchema(String name, FlattenedSchema parent) {
	this(name);
	parent.childrenByName_.put(name, this);
	}

	// Opens this dataset and all children for writing.
	public void open() {
	if (datasetWriter_ != null) return;
	datasetWriter_ = dataset_.newWriter();
	for (FlattenedSchema child : childrenByName_.values()) {
	child.open();
	}
	}

	// Write a record to this dataset.
	public void write(GenericRecord record) {
	Preconditions.checkNotNull(datasetWriter_, "open() must be called before writing");
	datasetWriter_.write(record);
	}

	// Close this dataset and all children.
	public void close() {
	if (datasetWriter_ == null) return;
	datasetWriter_.close();
	for (FlattenedSchema child : childrenByName_.values()) {
	child.close();
	}
	datasetWriter_ = null;
	}

	// Generates a new id for a new record in this dataset.
	public Long nextId() { return ++referenceId; }

	// Get the name to use when creating an id field.
	public String getIdFieldName() { return idFieldName_; }

	// Get the name of the field used to store the values of an array or map.
	public String getCollectionValueFieldName() { return "value"; }

	// Get the name of the field used to store the index of an array value..
	public String getArrayIdxFieldName() { return "idx"; }

	// Get the name of the field used to store the key of a map entry.
	public String getMapKeyFieldName() { return "key"; }

	// Get the name of a child dataset if this dataset corresponds to an array or map.
	public String getChildOfCollectionName() {
	return name_ + getNameSeparator() + "_values";
	}

	public String getIsNullFieldName(String fieldName) { return fieldName + "_is_null"; }

	// Get the separator when concatenating field or dataset names.
	public static String getNameSeparator() { return "_"; }

	// Get the child of this dataset if this dataset corresponds to an array or map.
	public FlattenedSchema getChildOfCollection() {
	FlattenedSchema child = childrenByName_.get(getChildOfCollectionName());
	Preconditions.checkNotNull(child);
	return child;
	}

	// Get the name of a child dataset if this dataset corresponds to a record.
	public String getChildOfRecordName(String parentFieldName) {
	return name_ + getNameSeparator() + parentFieldName;
	}

	// Get the child of this dataset if this dataset corresponds to a record.
	public FlattenedSchema getChildOfRecord(String parentFieldName) {
	FlattenedSchema child = childrenByName_.get(getChildOfRecordName(parentFieldName));
	Preconditions.checkNotNull(child);
	return child;
	}

	@Override
	public String toString() {
	StringBuilder builder = new StringBuilder();
	builder.append(dataset_.getDescriptor().getSchema().toString(true));
	for (FlattenedSchema child : childrenByName_.values()) {
	builder.append("\n\nChild: ")
	.append(child.name_)
	.append("\n")
	.append(child.toString());

	}
	return builder.toString();
	}

	public String getName() { return name_; }

	public Field getParentIdField() { return parentIdField_; }

	public void setParentIdField(Field parentIdField) {
	parentIdField_ = parentIdField;
	}

	public Dataset getDataset() { return dataset_; }

	public void setDataset(Dataset<GenericRecord> dataset) { dataset_ = dataset; }
	}