/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.orc.impl;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.orc.TypeDescription;

/**
 * Take the file types and the (optional) configuration column names/types,
 * determine whether schema evolution has occurred, and map each reader column
 * to its corresponding file type.
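 *
 * <p>A minimal usage sketch (hypothetical column name {@code x}, assuming a
 * file written with an int column that the reader treats as a long):
 * <pre>{@code
 *   TypeDescription fileSchema = TypeDescription.createStruct()
 *       .addField("x", TypeDescription.createInt());
 *   TypeDescription readerSchema = TypeDescription.createStruct()
 *       .addField("x", TypeDescription.createLong());
 *   SchemaEvolution evolution =
 *       new SchemaEvolution(fileSchema, readerSchema, null);
 *   evolution.hasConversion();        // true, the int column is widened
 *   evolution.getFileType(1);         // the file's int type for column id 1
 *   evolution.isPPDSafeConversion(1); // true, integer widening is PPD safe
 * }</pre>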
 */
public class SchemaEvolution {
  // indexed by reader column id
  private final TypeDescription[] readerFileTypes;
  // indexed by reader column id
  private final boolean[] included;
  private final TypeDescription fileSchema;
  private final TypeDescription readerSchema;
  private boolean hasConversion = false;
  // indexed by reader column id
  private final boolean[] ppdSafeConversion;

  public SchemaEvolution(TypeDescription fileSchema, boolean[] includedCols) {
    this(fileSchema, null, includedCols);
  }

  public SchemaEvolution(TypeDescription fileSchema,
                         TypeDescription readerSchema,
                         boolean[] includedCols) {
    this.included = includedCols == null ? null :
        Arrays.copyOf(includedCols, includedCols.length);
    this.hasConversion = false;
    this.fileSchema = fileSchema;
    if (readerSchema != null) {
      if (checkAcidSchema(fileSchema)) {
        this.readerSchema = createEventSchema(readerSchema);
      } else {
        this.readerSchema = readerSchema;
      }
      this.readerFileTypes =
          new TypeDescription[this.readerSchema.getMaximumId() + 1];
      buildConversionFileTypesArray(fileSchema, this.readerSchema);
    } else {
      this.readerSchema = fileSchema;
      this.readerFileTypes =
          new TypeDescription[this.readerSchema.getMaximumId() + 1];
      buildSameSchemaFileTypesArray();
    }
    this.ppdSafeConversion = populatePpdSafeConversion();
  }

  public TypeDescription getReaderSchema() {
    return readerSchema;
  }

  /**
   * Is there schema evolution data type conversion?
   * @return true if at least one reader column requires conversion from its
   *         file type
   */
  public boolean hasConversion() {
    return hasConversion;
  }

  public TypeDescription getFileType(TypeDescription readerType) {
    return getFileType(readerType.getId());
  }

  /**
   * Get the file type by reader type id.
   * @param id reader column id
   * @return the file type mapped to that reader column, or null if the column
   *         is not included
   */
  public TypeDescription getFileType(int id) {
    return readerFileTypes[id];
  }

  /**
   * Check if a column is safe for predicate push-down (PPD) evaluation.
   * @param colId reader column id
   * @return true if the specified column is safe for PPD evaluation, else false
   */
  public boolean isPPDSafeConversion(final int colId) {
    if (hasConversion()) {
      if (colId < 0 || colId >= ppdSafeConversion.length) {
        return false;
      }
      return ppdSafeConversion[colId];
    }

    // when there is no schema evolution PPD is safe
    return true;
  }

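  /**
   * Precompute, for each reader column id, whether the file-to-reader type
   * conversion is safe for predicate push-down. Note that only the root type
   * and its direct children are evaluated here.
   */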
  private boolean[] populatePpdSafeConversion() {
    if (fileSchema == null || readerSchema == null || readerFileTypes == null) {
      return null;
    }

    boolean[] result = new boolean[readerSchema.getMaximumId() + 1];
    boolean safePpd = validatePPDConversion(fileSchema, readerSchema);
    result[readerSchema.getId()] = safePpd;
    List<TypeDescription> children = readerSchema.getChildren();
    if (children != null) {
      for (TypeDescription child : children) {
        TypeDescription fileType = getFileType(child.getId());
        safePpd = validatePPDConversion(fileType, child);
        result[child.getId()] = safePpd;
      }
    }
    return result;
  }

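  /**
   * Decide whether predicate push-down can safely evaluate file data of
   * {@code fileType} against a predicate expressed in {@code readerType}.
   * Identical primitive types and a small set of widening conversions
   * (integer widening and string/varchar changes) are considered safe.
   */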
  private boolean validatePPDConversion(final TypeDescription fileType,
                                        final TypeDescription readerType) {
    if (fileType == null) {
      return false;
    }
    if (fileType.getCategory().isPrimitive()) {
      if (fileType.getCategory().equals(readerType.getCategory())) {
        // for decimals alone do equality check to not mess up with precision change
        if (fileType.getCategory().equals(TypeDescription.Category.DECIMAL) &&
            !fileType.equals(readerType)) {
          return false;
        }
        return true;
      }

      // only integer and string evolutions are safe
      // byte -> short -> int -> long
      // string <-> char <-> varchar
      // NOTE: Float to double evolution is not safe as floats are stored as doubles in ORC's
      // internal index, but when doing predicate evaluation for queries like "select * from
      // orc_float where f = 74.72" the constant on the filter is converted from string -> double
      // so the precisions will be different and the comparison will fail.
      // Soon, we should convert all sargs that compare equality between floats or
      // doubles to range predicates.

      // Similarly string -> char and varchar -> char and vice versa is not possible, as ORC stores
      // char with padded spaces in its internal index.
      switch (fileType.getCategory()) {
        case BYTE:
          if (readerType.getCategory().equals(TypeDescription.Category.SHORT) ||
              readerType.getCategory().equals(TypeDescription.Category.INT) ||
              readerType.getCategory().equals(TypeDescription.Category.LONG)) {
            return true;
          }
          break;
        case SHORT:
          if (readerType.getCategory().equals(TypeDescription.Category.INT) ||
              readerType.getCategory().equals(TypeDescription.Category.LONG)) {
            return true;
          }
          break;
        case INT:
          if (readerType.getCategory().equals(TypeDescription.Category.LONG)) {
            return true;
          }
          break;
        case STRING:
          if (readerType.getCategory().equals(TypeDescription.Category.VARCHAR)) {
            return true;
          }
          break;
        case VARCHAR:
          if (readerType.getCategory().equals(TypeDescription.Category.STRING)) {
            return true;
          }
          break;
        default:
          break;
      }
    }
    return false;
  }

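  /**
   * Map each included reader column to the corresponding file type and record
   * whether any data type conversion is required.
   * @param fileType the file's type description
   * @param readerType the reader's type description
   * @throws IllegalArgumentException if a required conversion is not supported
   */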
  void buildConversionFileTypesArray(TypeDescription fileType,
                                     TypeDescription readerType) {
    // if the column isn't included, don't map it
    if (included != null && !included[readerType.getId()]) {
      return;
    }
    boolean isOk = true;
    // check the easy case first
    if (fileType.getCategory() == readerType.getCategory()) {
      switch (readerType.getCategory()) {
        case BOOLEAN:
        case BYTE:
        case SHORT:
        case INT:
        case LONG:
        case DOUBLE:
        case FLOAT:
        case STRING:
        case TIMESTAMP:
        case BINARY:
        case DATE:
          // these are always a match
          break;
        case CHAR:
        case VARCHAR:
          // We do conversion when same CHAR/VARCHAR type but different
          // maxLength.
          if (fileType.getMaxLength() != readerType.getMaxLength()) {
            hasConversion = true;
          }
          break;
        case DECIMAL:
          // We do conversion when same DECIMAL type but different
          // precision/scale.
          if (fileType.getPrecision() != readerType.getPrecision() ||
              fileType.getScale() != readerType.getScale()) {
            hasConversion = true;
          }
          break;
        case UNION:
        case MAP:
        case LIST: {
          // these must be an exact match
          List<TypeDescription> fileChildren = fileType.getChildren();
          List<TypeDescription> readerChildren = readerType.getChildren();
          if (fileChildren.size() == readerChildren.size()) {
            for (int i = 0; i < fileChildren.size(); ++i) {
              buildConversionFileTypesArray(fileChildren.get(i),
                                            readerChildren.get(i));
            }
          } else {
            isOk = false;
          }
          break;
        }
        case STRUCT: {
          // allow either side to have fewer fields than the other
          List<TypeDescription> fileChildren = fileType.getChildren();
          List<TypeDescription> readerChildren = readerType.getChildren();
          if (fileChildren.size() != readerChildren.size()) {
            hasConversion = true;
          }
          int jointSize = Math.min(fileChildren.size(), readerChildren.size());
          for (int i = 0; i < jointSize; ++i) {
            buildConversionFileTypesArray(fileChildren.get(i),
                                          readerChildren.get(i));
          }
          break;
        }
        default:
          throw new IllegalArgumentException("Unknown type " + readerType);
      }
    } else {
      /*
       * Check for the few cases where we will not convert...
       */

      isOk = ConvertTreeReaderFactory.canConvert(fileType, readerType);
      hasConversion = true;
    }
    if (isOk) {
      int id = readerType.getId();
      if (readerFileTypes[id] != null) {
        throw new RuntimeException("reader to file type entry already" +
            " assigned");
      }
      readerFileTypes[id] = fileType;
    } else {
      throw new IllegalArgumentException(
          String.format("ORC does not support type conversion from file" +
                  " type %s (%d) to reader type %s (%d)",
              fileType.toString(), fileType.getId(),
              readerType.toString(), readerType.getId()));
    }
  }

  /**
   * Build the reader-to-file type array when the reader and file schemas are
   * the same.
   */
  private void buildSameSchemaFileTypesArray() {
    buildSameSchemaFileTypesArrayRecurse(readerSchema);
  }

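  /**
   * Recursively map every included reader column to itself, since no
   * conversion is needed when the schemas match.
   * @param readerType the current reader type being mapped
   */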
  void buildSameSchemaFileTypesArrayRecurse(TypeDescription readerType) {
    if (included != null && !included[readerType.getId()]) {
      return;
    }
    int id = readerType.getId();
    if (readerFileTypes[id] != null) {
      throw new RuntimeException("reader to file type entry already assigned");
    }
    readerFileTypes[id] = readerType;
    List<TypeDescription> children = readerType.getChildren();
    if (children != null) {
      for (TypeDescription child : children) {
        buildSameSchemaFileTypesArrayRecurse(child);
      }
    }
  }

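  /**
   * Return true if the file schema is the ACID event struct, i.e. a struct
   * whose root field names are exactly {@link #acidEventFieldNames}.
   */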
  private static boolean checkAcidSchema(TypeDescription type) {
    if (type.getCategory().equals(TypeDescription.Category.STRUCT)) {
      List<String> rootFields = type.getFieldNames();
      if (acidEventFieldNames.equals(rootFields)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Create the ACID event schema that wraps the given row schema.
   * @param typeDescr the row's type description
   * @return ORC types for the ACID event based on the row's type description
   */
  public static TypeDescription createEventSchema(TypeDescription typeDescr) {
    TypeDescription result = TypeDescription.createStruct()
        .addField("operation", TypeDescription.createInt())
        .addField("originalTransaction", TypeDescription.createLong())
        .addField("bucket", TypeDescription.createInt())
        .addField("rowId", TypeDescription.createLong())
        .addField("currentTransaction", TypeDescription.createLong())
        .addField("row", typeDescr.clone());
    return result;
  }

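  /** The root field names of an ACID event file, in declaration order. */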
  public static final List<String> acidEventFieldNames =
      new ArrayList<String>();
  static {
    acidEventFieldNames.add("operation");
    acidEventFieldNames.add("originalTransaction");
    acidEventFieldNames.add("bucket");
    acidEventFieldNames.add("rowId");
    acidEventFieldNames.add("currentTransaction");
    acidEventFieldNames.add("row");
  }
}