blob: fc6d3538d9a4cfd9ed3e247220ecc6fd1fe077a5 [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tajo.catalog;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import org.apache.tajo.exception.TajoRuntimeException;
import org.apache.tajo.exception.UnsupportedDataTypeException;
import java.util.*;
import static org.apache.tajo.common.TajoDataTypes.DataType;
import static org.apache.tajo.common.TajoDataTypes.Type;
public class SchemaUtil {
  // See TAJO-914 bug.
  //
  // Its essential problem is that constant value is evaluated multiple times at each scan.
  // As a result, join nodes can take the child nodes which have the same named fields.
  // Because current schema does not allow the same name and ignore the duplicated schema,
  // it finally causes the in-out schema mismatch between the parent and child nodes.
  //
  // tmpColumnSeq is a hack to avoid the above problem by keeping duplicated constant values
  // as different name fields. The essential solution would be
  // https://issues.apache.org/jira/browse/TAJO-895.
  //
  // NOTE(review): this mutable static counter is not thread-safe; concurrent calls to
  // merge() could produce duplicate "?fake" names — confirm whether merge() is ever
  // invoked from multiple threads.
  static int tmpColumnSeq = 0;

  /**
   * Merges two schemas into a single schema. Columns of {@code right} whose qualified
   * names already occur in {@code left} are kept under a unique synthetic name of the
   * form {@code "?fakeN"} instead of being dropped (see the TAJO-914 note above).
   *
   * @param left  schema whose columns come first in the result
   * @param right schema whose columns are appended, renamed on name collision
   * @return the merged schema
   */
  public static Schema merge(Schema left, Schema right) {
    SchemaBuilder merged = SchemaBuilder.builder();
    Set<String> nameSet = new HashSet<>();
    for (Column col : left.getRootColumns()) {
      merged.add(col);
      nameSet.add(col.getQualifiedName());
    }
    for (Column col : right.getRootColumns()) {
      if (nameSet.contains(col.getQualifiedName())) {
        // Duplicated name: keep the column anyway, under a unique synthetic name.
        merged.add("?fake" + (tmpColumnSeq++), col.getDataType());
      } else {
        merged.add(col);
        nameSet.add(col.getQualifiedName());
      }
    }
    // Reset the sequence if the counter overflowed into negative values.
    if (tmpColumnSeq < 0) {
      tmpColumnSeq = 0;
    }
    return merged.build();
  }

  /**
   * Get common columns to be used as join keys of natural joins.
   *
   * <p>A column is common when its simple name occurs in both schemas; each common name
   * is emitted once, unqualified, with the type taken from {@code left}.
   *
   * @param left  left-side schema
   * @param right right-side schema
   * @return schema containing the common (join key) columns
   */
  public static Schema getNaturalJoinColumns(Schema left, Schema right) {
    SchemaBuilder common = SchemaBuilder.builder();
    Set<String> commonNames = new HashSet<>();
    for (Column outer : left.getRootColumns()) {
      if (!commonNames.contains(outer.getSimpleName()) && right.containsByName(outer.getSimpleName())) {
        common.add(new Column(outer.getSimpleName(), outer.getDataType()));
        commonNames.add(outer.getSimpleName());
      }
    }
    return common.build();
  }

  /**
   * Returns a copy of the table's logical schema, qualified with the given table name.
   *
   * @param tableDesc table descriptor providing the logical schema
   * @param tableName qualifier to apply; if {@code null}, the schema is left unqualified
   * @return the (possibly qualified) logical schema copy
   */
  public static Schema getQualifiedLogicalSchema(TableDesc tableDesc, String tableName) {
    Schema logicalSchema = SchemaBuilder.builder().addAll(tableDesc.getLogicalSchema().getRootColumns()).build();
    if (tableName != null) {
      logicalSchema.setQualifier(tableName);
    }
    return logicalSchema;
  }

  /**
   * Clones a schema, rethrowing {@link CloneNotSupportedException} as an unchecked
   * {@link RuntimeException}.
   *
   * @param schema schema to clone
   * @param <T>    expected concrete schema type; the cast is unchecked, so the caller
   *               must request a type compatible with the runtime class of {@code schema}
   * @return a clone of the given schema
   */
  public static <T extends Schema> T clone(Schema schema) {
    try {
      // Safe as long as callers bind T to the runtime type of schema; Schema.clone()
      // returns the same concrete class as its receiver.
      @SuppressWarnings("unchecked")
      T copy = (T) schema.clone();
      return copy;
    } catch (CloneNotSupportedException e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * Extracts the {@link DataType} of every column, in schema order.
   *
   * @param schema input schema
   * @return array of column data types
   */
  public static DataType [] toDataTypes(Schema schema) {
    DataType[] types = new DataType[schema.size()];
    for (int i = 0; i < schema.size(); i++) {
      types[i] = schema.getColumn(i).getDataType();
    }
    return types;
  }

  /**
   * Extracts the base {@link Type} of every column, in schema order.
   *
   * @param schema input schema
   * @return array of column base types
   */
  public static Type [] toTypes(Schema schema) {
    Type [] types = new Type[schema.size()];
    for (int i = 0; i < schema.size(); i++) {
      types[i] = schema.getColumn(i).getDataType().getType();
    }
    return types;
  }

  /**
   * Extracts the simple (unqualified) name of every column, in schema order.
   *
   * @param schema input schema
   * @return array of simple column names
   */
  public static String [] toSimpleNames(Schema schema) {
    String [] names = new String[schema.size()];
    for (int i = 0; i < schema.size(); i++) {
      names[i] = schema.getColumn(i).getSimpleName();
    }
    return names;
  }

  /**
   * Converts columns into their simple-name paths.
   *
   * @param columns    columns to convert
   * @param onlyLeaves if {@code true}, RECORD (non-leaf) columns are skipped
   * @return array of simple-name paths
   */
  public static String [] convertColumnsToPaths(Iterable<Column> columns, boolean onlyLeaves) {
    List<String> paths = Lists.newArrayList();
    for (Column c : columns) {
      if (onlyLeaves && c.getDataType().getType() == Type.RECORD) {
        continue;
      }
      paths.add(c.getSimpleName());
    }
    return paths.toArray(new String [paths.size()]);
  }

  /**
   * Builds a map from column simple name to base type, keeping only the columns that
   * lie on one of the projected paths.
   *
   * @param schema      columns to consider
   * @param targetPaths projected paths; a column is kept when some path starts with its
   *                    simple name (NOTE(review): plain prefix match — "name" would also
   *                    match a path "name2/x"; confirm paths are '/'-delimited as intended)
   * @return immutable name-to-type map
   */
  public static ImmutableMap<String, Type> buildTypeMap(Iterable<Column> schema, String [] targetPaths) {
    HashMap<String, Type> builder = new HashMap<>();
    for (Column column : schema) {
      // Keep types which only belong to projected paths
      // For example, assume that a projected path is 'name/first_name', where name is RECORD and first_name is TEXT.
      // In this case, we should keep two types:
      // * name - RECORD
      // * name/first_name TEXT
      for (String p : targetPaths) {
        if (p.startsWith(column.getSimpleName())) {
          builder.put(column.getSimpleName(), column.getDataType().getType());
          break; // one match is enough; further puts would be redundant
        }
      }
    }
    return ImmutableMap.copyOf(builder);
  }

  /**
   * Column visitor interface
   */
  public interface ColumnVisitor {
    void visit(int depth, List<String> path, Column column);
  }

  /**
   * It allows a column visitor to traverse all columns in a schema in a depth-first order.
   * @param schema
   * @param function
   */
  public static void visitSchema(Schema schema, ColumnVisitor function) {
    for(Column col : schema.getRootColumns()) {
      visitInDepthFirstOrder(0, NestedPathUtil.ROOT_PATH, function, col);
    }
  }

  /**
   * A recursive function to traverse all columns in a schema in a depth-first order.
   *
   * @param depth    Nested depth. 0 is root column.
   * @param path     Path of ancestor column names leading to this column
   * @param function Visitor
   * @param column   Current visiting column
   */
  private static void visitInDepthFirstOrder(int depth,
                                             final List<String> path,
                                             ColumnVisitor function,
                                             Column column) {
    // For RECORD columns, descend into the nested fields first (post-order visit);
    // leaf columns are visited directly.
    if (column.getDataType().getType() == Type.RECORD) {
      for (Column nestedColumn : TypeConverter.convert(column.type).nestedRecordSchema.getRootColumns()) {
        List<String> newPath = new ArrayList<>(path);
        newPath.add(column.getQualifiedName());
        visitInDepthFirstOrder(depth + 1, newPath, function, nestedColumn);
      }
    }
    function.visit(depth, path, column);
  }

  /**
   * Renders the schema as a DDL-style display string.
   *
   * @param schema schema to render
   * @return human-readable schema string
   */
  public static String toDisplayString(Schema schema) {
    StringBuilder sb = new StringBuilder();
    DDLBuilder.buildSchema(sb, schema);
    return sb.toString();
  }

  /**
   * Calculate the row size from the given schema.
   *
   * @param schema input schema
   * @return estimated row size in bytes
   */
  public static int estimateRowByteSizeWithSchema(Schema schema) {
    int size = 0;
    for (Column column : schema.getAllColumns()) {
      size += getColByteSize(column);
    }
    return size;
  }

  /**
   * Return the size of the given column. For the variable-length columns, it returns a prefixed value.
   *
   * @param col input column
   * @return column length in bytes
   * @throws TajoRuntimeException if the column's type has no known size estimate
   */
  public static int getColByteSize(Column col) {
    // A declared positive length (e.g. CHAR(n)) wins over the per-type defaults below.
    if (col.getDataType().getLength() > 0) {
      return col.getDataType().getLength();
    }
    switch (col.getDataType().getType()) {
      case BOOLEAN:
        return 1;
      case CHAR:
        // NOTE(review): only reachable when getLength() <= 0 (the guard above returned
        // otherwise), so this yields a non-positive size — confirm whether CHAR without
        // a declared length should have a default estimate instead.
        return col.getDataType().getLength();
      case BIT:
        return 1;
      case INT2:
        return 2;
      case INT4:
        return 4;
      case INT8:
        return 8;
      case FLOAT4:
        return 4;
      case FLOAT8:
        return 8;
      case INET4:
        return 4;
      case INET6:
        return 16;
      case TEXT:
        return 256; // variable-length: fixed heuristic estimate
      case BLOB:
        return 256; // variable-length: fixed heuristic estimate
      case DATE:
        return 4;
      case TIME:
        return 8;
      case TIMESTAMP:
        return 8;
      default:
        throw new TajoRuntimeException(new UnsupportedDataTypeException(col.getDataType().toString()));
    }
  }
}