blob: cc1897762a806586faac324044ba2939629d79ab [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package org.apache.impala.infra.tableflattener;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.avro.Schema.Type;
import org.apache.avro.generic.GenericRecord;
import org.apache.commons.lang.NotImplementedException;
import org.apache.hadoop.conf.Configuration;
import org.kitesdk.data.Dataset;
import org.kitesdk.data.DatasetDescriptor;
import org.kitesdk.data.Datasets;
import org.kitesdk.data.Formats;
import java.io.IOException;
import java.net.URI;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
/**
 * Converts a nested Avro schema into a set of flat schemas, each backed by a Kite
 * dataset. Nested records are inlined into their parent (field names joined with the
 * dataset's name separator); arrays and maps are split into child datasets linked to
 * the parent by a generated id field. Only schemas are created here -- no data is
 * migrated.
 */
public class SchemaFlattener {
  // The dir to write the flat datasets to. The dir should either not exist or be
  // empty. The URI can either point to a local dir or an HDFS dir.
  private final URI outputDir_;

  public SchemaFlattener(URI outputDir) { outputDir_ = outputDir; }

  /**
   * Creates a flattened schema but does not migrate any data.
   *
   * @param srcSchema the source schema; must be of type RECORD.
   * @return the root of the tree of flattened datasets created under outputDir_.
   */
  public FlattenedSchema flatten(Schema srcSchema) {
    Preconditions.checkState(srcSchema.getType() == Type.RECORD);
    FlattenedSchema dstDataset = new FlattenedSchema(srcSchema.getName());
    LinkedList<Field> fields = Lists.newLinkedList();
    addRecordFields(srcSchema, dstDataset, fields, "");
    finishCreatingDataset(fields, dstDataset);
    return dstDataset;
  }

  /**
   * Recursively appends flattened versions of 'srcSchema's fields (srcSchema must be
   * a RECORD) to 'dstSchemaFields'. Simple fields are copied with 'fieldNamePrefix'
   * prepended to their name; nullable complex fields additionally get a BOOLEAN
   * is-null marker field; arrays and maps are moved into child datasets.
   */
  private void addRecordFields(Schema srcSchema, FlattenedSchema dstDataset,
      LinkedList<Field> dstSchemaFields, String fieldNamePrefix) {
    Preconditions.checkState(srcSchema.getType() == Type.RECORD);
    for (Field field : srcSchema.getFields()) {
      Schema fieldSchema = field.schema();
      if (SchemaUtil.isSimpleType(fieldSchema)) {
        dstSchemaFields.add(SchemaUtil.createField(fieldNamePrefix + field.name(),
            fieldSchema, field.doc(), field.defaultValue()));
        continue;
      }
      if (SchemaUtil.isNullable(fieldSchema)) {
        // Record nullness in a separate BOOLEAN column, then flatten the non-null
        // branch of the union.
        dstSchemaFields.add(SchemaUtil.createField(
            fieldNamePrefix + dstDataset.getIsNullFieldName(field.name()), Type.BOOLEAN));
        fieldSchema = SchemaUtil.reduceUnionToNonNull(fieldSchema);
      }
      if (SchemaUtil.requiresChildDataset(fieldSchema)) {
        createChildDataset(dstDataset.getChildOfRecordName(field.name()), fieldSchema,
            dstSchemaFields, dstDataset);
      } else {
        // Nested record: inline its fields with an extended name prefix.
        addRecordFields(fieldSchema, dstDataset, dstSchemaFields,
            fieldNamePrefix + field.name() + dstDataset.getNameSeparator());
      }
    }
  }

  /**
   * Creates a child dataset for an ARRAY or MAP schema. Adds an id field to the
   * parent (if not already present) so child rows can reference their parent row, and
   * adds an array-index or map-key field to the child. Collection values are either
   * stored directly (simple types), flattened inline (records), or recursively split
   * into further child datasets (nested collections).
   */
  private void createChildDataset(String name, Schema srcSchema,
      LinkedList<Field> parentFields, FlattenedSchema parentDataset) {
    // Ensure that the parent schema has an id field so the child can reference the
    // parent. A single id field is sufficient.
    if (parentFields.isEmpty()
        || !parentFields.getFirst().name().equals(parentDataset.getIdFieldName())) {
      parentFields.addFirst(SchemaUtil.createField(
          parentDataset.getIdFieldName(), Type.LONG));
    }
    FlattenedSchema childDataset = new FlattenedSchema(name, parentDataset);
    LinkedList<Field> fields = Lists.newLinkedList();
    String parentIdFieldName = parentDataset.getName() + childDataset.getNameSeparator()
        + childDataset.getIdFieldName();
    Field parentIdField = SchemaUtil.createField(parentIdFieldName, Type.LONG);
    childDataset.setParentIdField(parentIdField);
    fields.add(parentIdField);
    Schema valueSchema;
    if (srcSchema.getType() == Type.ARRAY) {
      fields.add(
          SchemaUtil.createField(childDataset.getArrayIdxFieldName(), Type.LONG));
      valueSchema = srcSchema.getElementType();
    } else {
      Preconditions.checkState(srcSchema.getType() == Type.MAP);
      fields.add(
          SchemaUtil.createField(childDataset.getMapKeyFieldName(), Type.STRING));
      valueSchema = srcSchema.getValueType();
    }
    if (SchemaUtil.isSimpleType(valueSchema)) {
      fields.add(SchemaUtil.createField(
          childDataset.getCollectionValueFieldName(), valueSchema));
    } else {
      if (SchemaUtil.isNullable(valueSchema)) {
        fields.add(SchemaUtil.createField(childDataset.getIsNullFieldName(
            childDataset.getCollectionValueFieldName()), Type.BOOLEAN));
        valueSchema = SchemaUtil.reduceUnionToNonNull(valueSchema);
      }
      if (SchemaUtil.requiresChildDataset(valueSchema)) {
        createChildDataset(childDataset.getChildOfCollectionName(), valueSchema, fields,
            childDataset);
      } else {
        addRecordFields(valueSchema, childDataset, fields,
            childDataset.getCollectionValueFieldName() + childDataset.getNameSeparator());
      }
    }
    finishCreatingDataset(fields, childDataset);
  }

  /**
   * Builds the final Avro record schema from 'fields' and creates the backing
   * Parquet-format Kite dataset under outputDir_. All fields must be flat at this
   * point.
   */
  private void finishCreatingDataset(List<Field> fields, FlattenedSchema dataset) {
    Schema childSchema = Schema.createRecord(dataset.getName(), null, null, false);
    for (Field field : fields) {
      // Sanity check: flattening should have eliminated all nesting.
      Preconditions.checkState(!SchemaUtil.schemaHasNesting(field.schema()));
    }
    childSchema.setFields(fields);
    DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
        .format(Formats.PARQUET)
        .schema(childSchema)
        .build();
    // Datasets.create() returns a raw Dataset; the cast is safe because the dataset
    // holds GenericRecords conforming to 'childSchema'.
    @SuppressWarnings("unchecked")
    Dataset<GenericRecord> created = (Dataset<GenericRecord>) Datasets.create(
        "dataset:" + createDir(dataset.getName()), descriptor);
    dataset.setDataset(created);
  }

  /**
   * Creates a dir named 'name' under outputDir_ (local file system or HDFS, chosen by
   * the URI scheme) and returns its URI.
   *
   * @throws RuntimeException if the dir cannot be created or on I/O errors.
   * @throws NotImplementedException if the output dir scheme is not supported.
   */
  private URI createDir(String name) {
    try {
      // Use Locale.ROOT so scheme matching is not broken by locale-sensitive casing
      // (e.g. "file" uppercases to "FİLE" under the Turkish locale).
      switch (outputDir_.getScheme().toUpperCase(Locale.ROOT)) {
        case "FILE": {
          Path datasetPath = Paths.get(outputDir_).resolve(name);
          // mkdirs() returns false on failure (or if the dir already exists); fail
          // fast rather than hitting an obscure error during dataset creation.
          if (!datasetPath.toFile().mkdirs() && !datasetPath.toFile().isDirectory()) {
            throw new RuntimeException(String.format(
                "Unable to create local dir: %s", datasetPath));
          }
          return datasetPath.toUri();
        }
        case "HDFS": {
          org.apache.hadoop.fs.Path outputDirPath
              = new org.apache.hadoop.fs.Path(outputDir_);
          org.apache.hadoop.fs.Path datasetPath
              = new org.apache.hadoop.fs.Path(outputDirPath, name);
          if (!outputDirPath.getFileSystem(new Configuration()).mkdirs(datasetPath)) {
            throw new RuntimeException(String.format(
                "Unable to create HDFS dir: %s", datasetPath));
          }
          return datasetPath.toUri();
        }
        default:
          throw new NotImplementedException(String.format(
              "Unexpected output dir scheme: %s", outputDir_.getScheme()));
      }
    } catch (IOException ex) {
      throw new RuntimeException(ex);
    }
  }
}