blob: b87511020e4c74bfa88f3fece9695ab0dd438a41 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package org.apache.impala.infra.tableflattener;
import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;
import org.apache.avro.Schema.Field;
import org.apache.avro.generic.GenericRecord;
import org.kitesdk.data.Dataset;
import org.kitesdk.data.DatasetWriter;
import java.util.Map;
// This class contains information about how unnested datasets are related.
public class FlattenedSchema {
  // Name returned by getIdFieldName() for the generated id field.
  private static final String ID_FIELD_NAME = "id";

  // Last id handed out by nextId(). If a dataset has nesting, an id field will be
  // created so that records in the child dataset can reference the parent.
  private long referenceId_;

  // If this dataset has a parent, the parentIdField_ indicates which field in this
  // dataset is the foreign key to the parent dataset. The field stays null until
  // setParentIdField() is called; presumably it is never set for a dataset without a
  // parent -- TODO confirm against callers.
  private Field parentIdField_;

  // The name of the data set, mainly used to find a child dataset.
  private final String name_;

  // Child datasets keyed by their name. Children register themselves here through the
  // two-argument constructor.
  private final Map<String, FlattenedSchema> childrenByName_ = Maps.newHashMap();

  // The actual dataset object. Must be provided through setDataset() before open().
  private Dataset<GenericRecord> dataset_;

  // Writer for dataset_. Non-null only between open() and close().
  private DatasetWriter<GenericRecord> datasetWriter_;

  // Creates a schema with no parent. Ids generated by nextId() start at 1.
  public FlattenedSchema(String name) {
    name_ = name;
  }

  // Creates a schema and registers it as a child of 'parent' under 'name'. An existing
  // child registered under the same name is replaced. Throws NullPointerException if
  // 'parent' is null.
  public FlattenedSchema(String name, FlattenedSchema parent) {
    this(name);
    Preconditions.checkNotNull(parent, "parent must not be null for child dataset %s",
        name);
    parent.childrenByName_.put(name, this);
  }

  // Opens this dataset and all children for writing. Does nothing (and does not visit
  // the children) if this dataset is already open. Throws NullPointerException if no
  // dataset has been set.
  public void open() {
    if (datasetWriter_ != null) return;
    Preconditions.checkNotNull(dataset_,
        "setDataset() must be called before open() on dataset %s", name_);
    datasetWriter_ = dataset_.newWriter();
    for (FlattenedSchema child : childrenByName_.values()) {
      child.open();
    }
  }

  // Write a record to this dataset. Throws NullPointerException if the dataset is not
  // open.
  public void write(GenericRecord record) {
    Preconditions.checkNotNull(datasetWriter_, "open() must be called before writing");
    datasetWriter_.write(record);
  }

  // Close this dataset and all children. Does nothing (and does not visit the
  // children) if this dataset is not open. Every writer is given a chance to close
  // even if an earlier one fails; the first failure is rethrown once all writers have
  // been attempted, with any later failures attached as suppressed exceptions.
  public void close() {
    if (datasetWriter_ == null) return;
    DatasetWriter<GenericRecord> writer = datasetWriter_;
    // Clear the field up front so that this dataset is considered closed, and can be
    // re-opened, even when closing the writer fails.
    datasetWriter_ = null;
    RuntimeException firstFailure = null;
    try {
      writer.close();
    } catch (RuntimeException e) {
      firstFailure = e;
    }
    for (FlattenedSchema child : childrenByName_.values()) {
      try {
        child.close();
      } catch (RuntimeException e) {
        if (firstFailure == null) {
          firstFailure = e;
        } else {
          firstFailure.addSuppressed(e);
        }
      }
    }
    if (firstFailure != null) throw firstFailure;
  }

  // Generates a new id for a new record in this dataset. The first id returned is 1.
  public Long nextId() { return ++referenceId_; }

  // Get the name to use when creating an id field.
  public String getIdFieldName() { return ID_FIELD_NAME; }

  // Get the name of the field used to store the values of an array or map.
  public String getCollectionValueFieldName() { return "value"; }

  // Get the name of the field used to store the index of an array value.
  public String getArrayIdxFieldName() { return "idx"; }

  // Get the name of the field used to store the key of a map entry.
  public String getMapKeyFieldName() { return "key"; }

  // Get the name of a child dataset if this dataset corresponds to an array or map.
  // The result is this dataset's name, the name separator and "_values", i.e. the two
  // parts end up joined by a double underscore.
  // NOTE(review): the double underscore may be deliberate (to avoid clashing with a
  // record field named "values") -- confirm before changing.
  public String getChildOfCollectionName() {
    return name_ + getNameSeparator() + "_values";
  }

  // Get the name of the field that is derived from 'fieldName' by appending
  // "_is_null".
  public String getIsNullFieldName(String fieldName) { return fieldName + "_is_null"; }

  // Get the separator when concatenating field or dataset names.
  public static String getNameSeparator() { return "_"; }

  // Get the child of this dataset if this dataset corresponds to an array or map.
  // Throws NullPointerException if no such child has been registered.
  public FlattenedSchema getChildOfCollection() {
    return getChild(getChildOfCollectionName());
  }

  // Get the name of a child dataset if this dataset corresponds to a record.
  public String getChildOfRecordName(String parentFieldName) {
    return name_ + getNameSeparator() + parentFieldName;
  }

  // Get the child of this dataset if this dataset corresponds to a record.
  // Throws NullPointerException if no such child has been registered.
  public FlattenedSchema getChildOfRecord(String parentFieldName) {
    return getChild(getChildOfRecordName(parentFieldName));
  }

  // Looks up a registered child by its dataset name, failing with a descriptive
  // NullPointerException when there is none.
  private FlattenedSchema getChild(String childName) {
    return Preconditions.checkNotNull(childrenByName_.get(childName),
        "Dataset %s has no child dataset named %s", name_, childName);
  }

  // Returns the pretty-printed schema of this dataset followed by the same for each
  // child. A placeholder is printed for a dataset that has not been set yet so that
  // this method is safe to use in log and error messages.
  @Override
  public String toString() {
    StringBuilder builder = new StringBuilder();
    if (dataset_ == null) {
      builder.append("<no dataset set for ").append(name_).append(">");
    } else {
      builder.append(dataset_.getDescriptor().getSchema().toString(true));
    }
    for (FlattenedSchema child : childrenByName_.values()) {
      builder.append("\n\nChild: ")
          .append(child.name_)
          .append("\n")
          .append(child.toString());
    }
    return builder.toString();
  }

  public String getName() { return name_; }

  // Returns the foreign key field referencing the parent dataset, or null if it has
  // not been set.
  public Field getParentIdField() { return parentIdField_; }

  public void setParentIdField(Field parentIdField) {
    parentIdField_ = parentIdField;
  }

  // Returns the dataset set through setDataset(), or null if none has been set.
  // NOTE(review): the return type is raw; it is left as is because narrowing it to
  // Dataset<GenericRecord> could break the compilation of existing callers.
  public Dataset getDataset() { return dataset_; }

  // Sets the dataset that open() will create a writer for.
  public void setDataset(Dataset<GenericRecord> dataset) { dataset_ = dataset; }
}