exec/java-exec/src/main/java/org/apache/drill/exec/record/BatchSchema.java - drill - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.drill.exec.record;

 import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.List;

 import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
 import org.apache.drill.shaded.guava.com.google.common.collect.Sets;
 import org.apache.drill.common.types.TypeProtos.MajorType;

 /**
  * Historically {@link BatchSchema} is used to represent the schema of a batch. However, it does not handle complex types well. If you have a choice, use
  * {@link org.apache.drill.exec.record.metadata.TupleMetadata} instead.
  */
 public class BatchSchema implements Iterable<MaterializedField> {
   // Logger for this class.
   static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(BatchSchema.class);

   // Selection-vector mode recorded for this schema; assigned once in the constructor.
   private final SelectionVectorMode selectionVectorMode;
   // Ordered list of fields; the constructor stores the caller's list reference without copying it.
   private final List<MaterializedField> fields;

   /**
    * Creates a schema over the given fields.
    * The list reference is retained as-is; no defensive copy is made.
    *
    * @param selectionVector the selection-vector mode recorded for this schema
    * @param fields the fields of this schema, in order
    */
   public BatchSchema(SelectionVectorMode selectionVector, List<MaterializedField> fields) {
     this.selectionVectorMode = selectionVector;
     this.fields = fields;
   }

   /**
    * Creates a fresh {@link SchemaBuilder}.
    *
    * @return a new builder instance
    */
   public static SchemaBuilder newBuilder() {
     final SchemaBuilder builder = new SchemaBuilder();
     return builder;
   }

   /**
    * Reports how many fields this schema holds.
    *
    * @return the size of the underlying field list
    */
   public int getFieldCount() {
     return this.fields.size();
   }

   /**
    * Looks up a field by position.
    *
    * @param index zero-based position of the field
    * @return the field at {@code index}, or {@code null} when {@code index}
    * is negative or not less than the number of fields
    */
   public MaterializedField getColumn(int index) {
     final boolean inRange = index >= 0 && index < fields.size();
     return inRange ? fields.get(index) : null;
   }

   /**
    * Iterates over the fields in schema order.
    *
    * @return the iterator obtained directly from the underlying field list
    */
   @Override
   public Iterator<MaterializedField> iterator() {
     return this.fields.iterator();
   }

   /**
    * Returns the selection-vector mode given at construction time.
    *
    * @return the selection-vector mode of this schema
    */
   public SelectionVectorMode getSelectionVectorMode() {
     return this.selectionVectorMode;
   }

   /**
    * Produces a copy of this schema backed by a new list. The copy is shallow:
    * the new list holds the same {@link MaterializedField} instances.
    *
    * @return a new schema with the same selection-vector mode and fields
    */
   @Override
   public BatchSchema clone() {
     final List<MaterializedField> copiedFields = new ArrayList<>(fields);
     return new BatchSchema(selectionVectorMode, copiedFields);
   }

   /**
    * Renders the field list and selection-vector mode for diagnostics.
    *
    * @return a string of the form {@code BatchSchema [fields=..., selectionVector=...]}
    */
   @Override
   public String toString() {
     return new StringBuilder("BatchSchema [fields=")
         .append(fields)
         .append(", selectionVector=")
         .append(selectionVectorMode)
         .append("]")
         .toString();
   }

   /**
    * The kinds of selection vector a batch may carry. Each constant records
    * whether a selection vector is present and an associated size value
    * (presumably the width in bytes of one selection-vector entry; confirm
    * against callers).
    */
   public enum SelectionVectorMode {
     NONE(-1, false), TWO_BYTE(2, true), FOUR_BYTE(4, true);

     /**
      * Whether this mode carries a selection vector, as passed to the constructor.
      * NOTE(review): kept non-final so the public field keeps its existing
      * (writable) shape; consider making it final once it is confirmed that
      * no caller assigns to it.
      */
     public boolean hasSelectionVector;

     /** Size value passed to the constructor: -1 for NONE, 2 for TWO_BYTE, 4 for FOUR_BYTE. */
     public final int size;

     SelectionVectorMode(int size, boolean hasSelectionVector) {
       this.size = size;
       // This argument used to be dropped, which left the flag false for every constant.
       this.hasSelectionVector = hasSelectionVector;
     }

     public static SelectionVectorMode[] DEFAULT = {NONE};
     public static SelectionVectorMode[] NONE_AND_TWO = {NONE, TWO_BYTE};
     public static SelectionVectorMode[] NONE_AND_FOUR = {NONE, FOUR_BYTE};
     public static SelectionVectorMode[] ALL = {NONE, TWO_BYTE, FOUR_BYTE};
   }

   /**
    * Combines the hash of the field list with the hash of the selection-vector
    * mode, treating a {@code null} component as 0.
    *
    * @return the hash code of this schema
    */
   @Override
   public int hashCode() {
     final int fieldsHash = (fields == null) ? 0 : fields.hashCode();
     final int modeHash = (selectionVectorMode == null) ? 0 : selectionVectorMode.hashCode();
     // Same arithmetic as the usual prime-31 accumulation that starts from 1.
     return 31 * (31 + fieldsHash) + modeHash;
   }

   /**
    * DRILL-5525: the semantics of this method are badly broken.
    * Caveat emptor.
    *
    * This check used for detecting actual schema change inside operator record batch will not work for
    * AbstractContainerVectors (like MapVector). In each record batch a reference to incoming batch schema is
    * stored (let say S:{a: int}) and then equals is called on that stored reference and current incoming batch schema.
    * Internally schema object has references to Materialized fields from vectors in container. If there is change in
    * incoming batch schema, then the upstream will create a new ValueVector in its output container with the new
    * detected type, which in turn will have new instance for Materialized Field. Then later a new BatchSchema object
    * is created for this new incoming batch (let say S":{a":varchar}). The operator calling equals will have reference
    * to old schema object (S) and hence first check will not be satisfied and then it will call equals on each of the
    * Materialized Field (a.equals(a")). Since new materialized field is created for newly created vector the equals
    * check on field will return false. And schema change will be detected in this case.
    * Now consider, instead of an int vector, a MapVector whose initial schema is (let's say) S:{a:{b:int, c:int}},
    * and suppose the schema of map member c later changes. The MapVector for field a is found in the container and
    * reused; only the child vector for field c is replaced. The new schema object is created as
    * S":{a:{b:int, c":varchar}}. When S.equals(S") is called, it eventually calls a.equals(a), which returns true
    * even though the schema of child value vector c has changed. This is because no new vector is created for
    * field a, so its MaterializedField reference is unchanged and is shared by both the old and new schema instances.
    * Hence callers should use {@link BatchSchema#isEquivalent(BatchSchema)} instead, since
    * {@link MaterializedField#isEquivalent(MaterializedField)} has been updated to remove the reference check.
    */

   @Override
   public boolean equals(Object obj) {
     if (this == obj) {
       return true;
     }
     // Only an instance of exactly the same class can be equal.
     if (obj == null || getClass() != obj.getClass()) {
       return false;
     }
     final BatchSchema other = (BatchSchema) obj;
     if (selectionVectorMode != other.selectionVectorMode) {
       return false;
     }
     if (fields == null) {
       return other.fields == null;
     }

     // Compare the field lists element by element.
     // (DRILL-5525: this compares whole fields, not just names.)
     if (!fields.equals(other.fields)) {
       return false;
     }

     // Compare types.
     // (DRILL-5525: this code is redundant because any differences
     // will fail above.)
     for (int i = 0; i < fields.size(); i++) {
       final MajorType t1 = fields.get(i).getType();
       final MajorType t2 = other.fields.get(i).getType();
       if (t1 == null || t2 == null) {
         // majorTypeEqual() dereferences both arguments, so a missing type on
         // either side is settled here: equal only when both are missing.
         if (t1 != t2) {
           return false;
         }
       } else if (!majorTypeEqual(t1, t2)) {
         return false;
       }
     }
     return true;
   }

   /**
    * Tests whether two schemas are equivalent under the rules defined by
    * {@link MaterializedField#isEquivalent(MaterializedField)}. The fields of
    * the two schemas must correspond 1:1 and in the same order. The
    * selection-vector mode is not consulted by this method.
    *
    * @param other another non-null batch schema
    * @return {@code true} if the two schemas are equivalent according to
    * the {@link MaterializedField#isEquivalent(MaterializedField)} rules,
    * {@code false} otherwise
    */
   public boolean isEquivalent(BatchSchema other) {
     if (other == this) {
       return true;
     }
     // With a missing list on either side, the schemas match only if both are missing.
     if (fields == null || other.fields == null) {
       return fields == other.fields;
     }
     final int count = fields.size();
     if (count != other.fields.size()) {
       return false;
     }
     for (int pos = 0; pos < count; pos++) {
       final MaterializedField mine = fields.get(pos);
       final MaterializedField theirs = other.fields.get(pos);
       if (!mine.isEquivalent(theirs)) {
         return false;
       }
     }
     return true;
   }

   /**
    * Compares two major types, treating types with the same set of subtypes as
    * equal even if the subtypes are listed in a different order. When the two
    * types are not {@code equals()}, only the minor type, the mode and the
    * subtype lists (compared as sets) are consulted.
    *
    * @param t1 the first type; must not be null
    * @param t2 the second type; must not be null
    * @return {@code true} if the types are equal under the rule above
    */
   private boolean majorTypeEqual(MajorType t1, MajorType t2) {
     if (t1.equals(t2)) {
       return true;
     }
     return t1.getMinorType().equals(t2.getMinorType())
         && t1.getMode().equals(t2.getMode())
         && Sets.newHashSet(t1.getSubTypeList()).equals(Sets.newHashSet(t2.getSubTypeList()));
   }

   /**
    * Merges this schema with another to produce a new schema. This schema's
    * fields come first, in their existing order, followed by the other schema's
    * fields in the order defined there. The caller is responsible for ensuring
    * that column names are unique.
    * <p>
    * Merging data with selection vectors is unlikely to be useful, or work well:
    * the two record batches would have to be correlated both in their selection
    * vectors AND in the underlying vectors, and such a use case is hard to
    * imagine. So, for now, merging is forbidden if either schema carries a
    * selection vector. If a meaningful use case is discovered, this can be
    * revisited.
    *
    * @param otherSchema the schema to merge with this one
    * @return the new, merged, schema
    * @throws IllegalArgumentException if either schema has a selection-vector
    * mode other than {@link SelectionVectorMode#NONE}
    */
   public BatchSchema merge(BatchSchema otherSchema) {
     final boolean neitherHasSelectionVector =
         selectionVectorMode == SelectionVectorMode.NONE
             && otherSchema.selectionVectorMode == SelectionVectorMode.NONE;
     if (!neitherHasSelectionVector) {
       throw new IllegalArgumentException("Cannot merge schemas with selection vectors");
     }
     final int combinedSize = fields.size() + otherSchema.fields.size();
     final List<MaterializedField> mergedFields = new ArrayList<>(combinedSize);
     mergedFields.addAll(this.fields);
     mergedFields.addAll(otherSchema.fields);
     return new BatchSchema(selectionVectorMode, mergedFields);
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.drill.exec.record;

	import java.util.ArrayList;
	import java.util.Iterator;
	import java.util.List;

	import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
	import org.apache.drill.shaded.guava.com.google.common.collect.Sets;
	import org.apache.drill.common.types.TypeProtos.MajorType;

	/**
	* Historically {@link BatchSchema} is used to represent the schema of a batch. However, it does not handle complex types well. If you have a choice, use
	* {@link org.apache.drill.exec.record.metadata.TupleMetadata} instead.
	*/
	public class BatchSchema implements Iterable<MaterializedField> {
	// Logger for this class.
	static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(BatchSchema.class);

	// Selection-vector mode recorded for this schema; assigned once in the constructor.
	private final SelectionVectorMode selectionVectorMode;
	// Ordered list of fields; the constructor stores the caller's list reference without copying it.
	private final List<MaterializedField> fields;

	/**
	 * Creates a schema over the given fields.
	 * The list reference is retained as-is; no defensive copy is made.
	 *
	 * @param selectionVector the selection-vector mode recorded for this schema
	 * @param fields the fields of this schema, in order
	 */
	public BatchSchema(SelectionVectorMode selectionVector, List<MaterializedField> fields) {
	  this.selectionVectorMode = selectionVector;
	  this.fields = fields;
	}

	/**
	 * Creates a fresh {@link SchemaBuilder}.
	 *
	 * @return a new builder instance
	 */
	public static SchemaBuilder newBuilder() {
	  final SchemaBuilder builder = new SchemaBuilder();
	  return builder;
	}

	/**
	 * Reports how many fields this schema holds.
	 *
	 * @return the size of the underlying field list
	 */
	public int getFieldCount() {
	  return this.fields.size();
	}

	public MaterializedField getColumn(int index) {
	if (index < 0 \|\| index >= fields.size()) {
	return null;
	}
	return fields.get(index);
	}

	/**
	 * Iterates over the fields in schema order.
	 *
	 * @return the iterator obtained directly from the underlying field list
	 */
	@Override
	public Iterator<MaterializedField> iterator() {
	  return this.fields.iterator();
	}

	/**
	 * Returns the selection-vector mode given at construction time.
	 *
	 * @return the selection-vector mode of this schema
	 */
	public SelectionVectorMode getSelectionVectorMode() {
	  return this.selectionVectorMode;
	}

	/**
	 * Produces a copy of this schema backed by a new list. The copy is shallow:
	 * the new list holds the same {@link MaterializedField} instances.
	 *
	 * @return a new schema with the same selection-vector mode and fields
	 */
	@Override
	public BatchSchema clone() {
	  final List<MaterializedField> copiedFields = new ArrayList<>(fields);
	  return new BatchSchema(selectionVectorMode, copiedFields);
	}

	/**
	 * Renders the field list and selection-vector mode for diagnostics.
	 *
	 * @return a string of the form {@code BatchSchema [fields=..., selectionVector=...]}
	 */
	@Override
	public String toString() {
	  return new StringBuilder("BatchSchema [fields=")
	      .append(fields)
	      .append(", selectionVector=")
	      .append(selectionVectorMode)
	      .append("]")
	      .toString();
	}

	/**
	 * The kinds of selection vector a batch may carry. Each constant records
	 * whether a selection vector is present and an associated size value
	 * (presumably the width in bytes of one selection-vector entry; confirm
	 * against callers).
	 */
	public enum SelectionVectorMode {
	  NONE(-1, false), TWO_BYTE(2, true), FOUR_BYTE(4, true);

	  /**
	   * Whether this mode carries a selection vector, as passed to the constructor.
	   * NOTE(review): kept non-final so the public field keeps its existing
	   * (writable) shape; consider making it final once it is confirmed that
	   * no caller assigns to it.
	   */
	  public boolean hasSelectionVector;

	  /** Size value passed to the constructor: -1 for NONE, 2 for TWO_BYTE, 4 for FOUR_BYTE. */
	  public final int size;

	  SelectionVectorMode(int size, boolean hasSelectionVector) {
	    this.size = size;
	    // This argument used to be dropped, which left the flag false for every constant.
	    this.hasSelectionVector = hasSelectionVector;
	  }

	  public static SelectionVectorMode[] DEFAULT = {NONE};
	  public static SelectionVectorMode[] NONE_AND_TWO = {NONE, TWO_BYTE};
	  public static SelectionVectorMode[] NONE_AND_FOUR = {NONE, FOUR_BYTE};
	  public static SelectionVectorMode[] ALL = {NONE, TWO_BYTE, FOUR_BYTE};
	}

	/**
	 * Combines the hash of the field list with the hash of the selection-vector
	 * mode, treating a {@code null} component as 0.
	 *
	 * @return the hash code of this schema
	 */
	@Override
	public int hashCode() {
	  final int fieldsHash = (fields == null) ? 0 : fields.hashCode();
	  final int modeHash = (selectionVectorMode == null) ? 0 : selectionVectorMode.hashCode();
	  // Same arithmetic as the usual prime-31 accumulation that starts from 1.
	  return 31 * (31 + fieldsHash) + modeHash;
	}

	/**
	* DRILL-5525: the semantics of this method are badly broken.
	* Caveat emptor.
	*
	* This check used for detecting actual schema change inside operator record batch will not work for
	* AbstractContainerVectors (like MapVector). In each record batch a reference to incoming batch schema is
	* stored (let say S:{a: int}) and then equals is called on that stored reference and current incoming batch schema.
	* Internally schema object has references to Materialized fields from vectors in container. If there is change in
	* incoming batch schema, then the upstream will create a new ValueVector in its output container with the new
	* detected type, which in turn will have new instance for Materialized Field. Then later a new BatchSchema object
	* is created for this new incoming batch (let say S":{a":varchar}). The operator calling equals will have reference
	* to old schema object (S) and hence first check will not be satisfied and then it will call equals on each of the
	* Materialized Field (a.equals(a")). Since new materialized field is created for newly created vector the equals
	* check on field will return false. And schema change will be detected in this case.
	* Now consider, instead of an int vector, a MapVector whose initial schema is (let's say) S:{a:{b:int, c:int}},
	* and suppose the schema of map member c later changes. The MapVector for field a is found in the container and
	* reused; only the child vector for field c is replaced. The new schema object is created as
	* S":{a:{b:int, c":varchar}}. When S.equals(S") is called, it eventually calls a.equals(a), which returns true
	* even though the schema of child value vector c has changed. This is because no new vector is created for
	* field a, so its MaterializedField reference is unchanged and is shared by both the old and new schema instances.
	* Hence callers should use {@link BatchSchema#isEquivalent(BatchSchema)} instead, since
	* {@link MaterializedField#isEquivalent(MaterializedField)} has been updated to remove the reference check.
	*/

	@Override
	public boolean equals(Object obj) {
	  if (this == obj) {
	    return true;
	  }
	  // Only an instance of exactly the same class can be equal.
	  if (obj == null || getClass() != obj.getClass()) {
	    return false;
	  }
	  final BatchSchema other = (BatchSchema) obj;
	  if (selectionVectorMode != other.selectionVectorMode) {
	    return false;
	  }
	  if (fields == null) {
	    return other.fields == null;
	  }

	  // Compare the field lists element by element.
	  // (DRILL-5525: this compares whole fields, not just names.)
	  if (!fields.equals(other.fields)) {
	    return false;
	  }

	  // Compare types.
	  // (DRILL-5525: this code is redundant because any differences
	  // will fail above.)
	  for (int i = 0; i < fields.size(); i++) {
	    final MajorType t1 = fields.get(i).getType();
	    final MajorType t2 = other.fields.get(i).getType();
	    if (t1 == null || t2 == null) {
	      // majorTypeEqual() dereferences both arguments, so a missing type on
	      // either side is settled here: equal only when both are missing.
	      if (t1 != t2) {
	        return false;
	      }
	    } else if (!majorTypeEqual(t1, t2)) {
	      return false;
	    }
	  }
	  return true;
	}

	/**
	* Compare that two schemas are identical according to the rules defined
	* in {@link MaterializedField#isEquivalent(MaterializedField)}. In particular,
	* this method requires that the fields have a 1:1 ordered correspondence
	* in the two schemas.
	*
	* @param other another non-null batch schema
	* @return <tt>true</tt> if the two schemas are equivalent according to
	* the {@link MaterializedField#isEquivalent(MaterializedField)} rules,
	* false otherwise
	*/

	public boolean isEquivalent(BatchSchema other) {
	if (this == other) {
	return true;
	}
	if (fields == null \|\| other.fields == null) {
	return fields == other.fields;
	}
	if (fields.size() != other.fields.size()) {
	return false;
	}
	for (int i = 0; i < fields.size(); i++) {
	if (! fields.get(i).isEquivalent(other.fields.get(i))) {
	return false;
	}
	}
	return true;
	}

	/**
	 * Compares two major types, treating types with the same set of subtypes as
	 * equal even if the subtypes are listed in a different order. When the two
	 * types are not {@code equals()}, only the minor type, the mode and the
	 * subtype lists (compared as sets) are consulted.
	 *
	 * @param t1 the first type; must not be null
	 * @param t2 the second type; must not be null
	 * @return {@code true} if the types are equal under the rule above
	 */
	private boolean majorTypeEqual(MajorType t1, MajorType t2) {
	  if (t1.equals(t2)) {
	    return true;
	  }
	  return t1.getMinorType().equals(t2.getMinorType())
	      && t1.getMode().equals(t2.getMode())
	      && Sets.newHashSet(t1.getSubTypeList()).equals(Sets.newHashSet(t2.getSubTypeList()));
	}

	/**
	* Merge two schemas to produce a new, merged schema. The caller is responsible
	* for ensuring that column names are unique. The order of the fields in the
	* new schema is the same as that of this schema, with the other schema's fields
	* appended in the order defined in the other schema.
	* <p>
	* Merging data with selection vectors is unlikely to be useful, or work well.
	* With a selection vector, the two record batches would have to be correlated
	* both in their selection vectors AND in the underlying vectors. Such a use case
	* is hard to imagine. So, for now, this method forbids merging schemas if either
	* of them carry a selection vector. If we discover a meaningful use case, we can
	* revisit the issue.
	* @param otherSchema the schema to merge with this one
	* @return the new, merged, schema
	*/

	public BatchSchema merge(BatchSchema otherSchema) {
	if (selectionVectorMode != SelectionVectorMode.NONE \|\|
	otherSchema.selectionVectorMode != SelectionVectorMode.NONE) {
	throw new IllegalArgumentException("Cannot merge schemas with selection vectors");
	}
	List<MaterializedField> mergedFields =
	new ArrayList<>(fields.size() + otherSchema.fields.size());
	mergedFields.addAll(this.fields);
	mergedFields.addAll(otherSchema.fields);
	return new BatchSchema(selectionVectorMode, mergedFields);
	}
	}