blob: 84757a5a3048845a542985bfc8e18ed0aa48ecef [file] [log] [blame]
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
package org.apache.pig;
import java.io.IOException;
import java.io.Serializable;
import java.util.Arrays;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pig.classification.InterfaceAudience;
import org.apache.pig.classification.InterfaceStability;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
import org.apache.pig.newplan.logical.relational.LogicalSchema;
import org.apache.pig.newplan.logical.relational.LogicalSchema.LogicalFieldSchema;
import org.codehaus.jackson.annotate.JsonPropertyOrder;
* A representation of a schema used to communicate with load and store functions. This is
* separate from {@link Schema}, which is an internal Pig representation of a schema.
* @since Pig 0.7
@JsonPropertyOrder({ "fields", "version", "sortKeys", "sortKeyOrders" })
public class ResourceSchema implements Serializable {
// Serialization version identifier for this Serializable class.
private static final long serialVersionUID = 1L;
// Commons-logging logger for this class.
private static Log log = LogFactory.getLog(ResourceSchema.class);
/* Array getters intentionally return the internal mutable arrays instead of copies,
 * to simplify updates without unnecessary copying.
 * Setters make a copy of the arrays in order to prevent an array
 * from being shared by two objects, with modifications in one
 * accidentally changing the other.
 */
// initializing arrays to empty so we don't have to worry about NPEs
// setters won't set to null
// One entry per field of this schema; starts as an empty array rather than null.
private ResourceFieldSchema[] fields = new ResourceFieldSchema[0];
// Direction in which a sort key is ordered.
public enum Order { ASCENDING, DESCENDING }
private int[] sortKeys = new int[0]; // each entry is an offset into the fields array.
// Sort direction for the sort keys; starts as an empty array rather than null.
private Order[] sortKeyOrders = new Order[0];
// Schema version number; defaults to 0.
private int version = 0;
@JsonPropertyOrder({ "name", "type", "description", "schema" })
public static class ResourceFieldSchema implements Serializable {
private static final long serialVersionUID = 1L;
private String name;
// values are constants from DataType
private byte type;
private String description;
// nested tuples and bags will have their own schema
private ResourceSchema schema;
* Construct an empty field schema.
public ResourceFieldSchema() {
* Construct using a {@link org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema} as the template.
* @param fieldSchema fieldSchema to copy from
public ResourceFieldSchema(FieldSchema fieldSchema) {
type = fieldSchema.type;
name = fieldSchema.alias;
description = "autogenerated from Pig Field Schema";
Schema inner = fieldSchema.schema;
// allow partial schema
if ((type == DataType.BAG || type == DataType.TUPLE || type == DataType.MAP)
&& inner != null) {
schema = new ResourceSchema(inner);
} else {
schema = null;
* Construct using a {@link org.apache.pig.newplan.logical.relational.LogicalSchema.LogicalFieldSchema} as the template.
* @param fieldSchema fieldSchema to copy from
public ResourceFieldSchema(LogicalFieldSchema fieldSchema) {
type = fieldSchema.type;
name = fieldSchema.alias;
description = "autogenerated from Pig Field Schema";
LogicalSchema inner = fieldSchema.schema;
// allow partial schema
if (DataType.isSchemaType(type) && inner != null) {
schema = new ResourceSchema(inner);
} else {
schema = null;
* Get the name of this field.
* @return name
public String getName() {
return name;
* Set the name of this filed.
* @param name new name
* @return this
public ResourceFieldSchema setName(String name) { = name;
return this;
* Get the type of this field.
* @return type, as a {@link DataType} static final byte
public byte getType() {
return type;
* Set the type of this field
* @param type new type
* @return this
public ResourceFieldSchema setType(byte type) {
this.type = type;
return this;
* Get a free form text description of this field.
* @return description
public String getDescription() {
return description;
* Set the description
* @param description new description
* @return this
public ResourceFieldSchema setDescription(String description) {
this.description = description;
return this;
* Get the schema for this field. Type tuple/bag/map may have a schema.
* @return schema
public ResourceSchema getSchema() {
return schema;
* Set the schema for this field. Type tuple/bag/map may have a schema.
* @param schema new schema
* @return this
public ResourceFieldSchema setSchema(ResourceSchema schema) throws
IOException {
this.schema = schema;
return this;
private void validateSchema(ResourceSchema schema) throws IOException {
if(type == DataType.BAG && schema != null) {
ResourceFieldSchema[] subFields = schema.getFields();
if (subFields.length == 1) {
if (subFields[0].type != DataType.TUPLE) {
} else {
public static void throwInvalidSchemaException() throws FrontendException {
int errCode = 2218;
throw new FrontendException("Invalid resource schema: " +
"bag schema must have tuple as its field", errCode, PigException.BUG);
public String toString() {
return getDescription(true);
public String calcCastString() {
return getDescription(false);
private String getDescription(boolean printAlias) {
StringBuilder sb = new StringBuilder();
if (printAlias && != null)
if (DataType.isAtomic(this.type)) {
} else {
//if (this.schema!=null)
stringifyResourceSchema(sb, this.schema, this.type, printAlias);
return sb.toString();
* Construct an empty ResourceSchema.
/**
 * Create a schema with no fields, no sort keys and version 0
 * (the field initializers' defaults).
 */
public ResourceSchema() {
}
* Construct a ResourceSchema from a {@link Schema}
* @param pigSchema Schema to use
public ResourceSchema(Schema pigSchema) {
List<FieldSchema> pigSchemaFields = pigSchema.getFields();
fields = new ResourceFieldSchema[pigSchemaFields.size()];
for (int i=0; i<fields.length; i++) {
fields[i] = new ResourceFieldSchema(pigSchemaFields.get(i));
* Construct a ResourceSchema from a {@link LogicalSchema}
* @param pigSchema Schema to use
public ResourceSchema(LogicalSchema pigSchema) {
List<LogicalFieldSchema> pigSchemaFields = pigSchema.getFields();
fields = new ResourceFieldSchema[pigSchemaFields.size()];
for (int i=0; i<fields.length; i++) {
fields[i] = new ResourceFieldSchema(pigSchemaFields.get(i));
* Only for use by Pig internal code.
* Construct a ResourceSchema from a {@link Schema}
* @param pigSchema Schema to use
* @param sortInfo information on how data is sorted
/**
 * Only for use by Pig internal code.
 * Construct a ResourceSchema from a {@link Schema} and record how the data is sorted.
 * @param pigSchema Schema to use
 * @param sortInfo information on how data is sorted; may be null, in which
 *        case the sort keys stay empty
 */
public ResourceSchema(Schema pigSchema, SortInfo sortInfo) {
    // Populate the fields from the schema; without this delegation pigSchema
    // would be ignored and the field array would stay empty.
    this(pigSchema);
    if (sortInfo == null) {
        return;
    }
    List<? extends SortColInfo> sortCols = sortInfo.getSortColInfoList();
    int count = sortCols.size();
    if (count == 0) {
        return;
    }
    sortKeys = new int[count];
    sortKeyOrders = new Order[count];
    for (int i = 0; i < count; i++) {
        SortColInfo colInfo = sortCols.get(i);
        sortKeys[i] = colInfo.getColIndex();
        // Anything other than ASCENDING is mapped to DESCENDING.
        sortKeyOrders[i] =
                colInfo.getSortOrder() == org.apache.pig.SortColInfo.Order.ASCENDING
                        ? Order.ASCENDING
                        : Order.DESCENDING;
    }
}
* Only for use by Pig internal code.
* Construct a ResourceSchema from a {@link LogicalSchema}
* @param pigSchema LogicalSchema to use
* @param sortInfo information on how data is sorted
/**
 * Only for use by Pig internal code.
 * Construct a ResourceSchema from a {@link LogicalSchema} and record how the data is sorted.
 * @param pigSchema LogicalSchema to use
 * @param sortInfo information on how data is sorted; may be null, in which
 *        case the sort keys stay empty
 */
public ResourceSchema(LogicalSchema pigSchema, SortInfo sortInfo) {
    // Populate the fields from the schema; without this delegation pigSchema
    // would be ignored and the field array would stay empty.
    this(pigSchema);
    if (sortInfo == null) {
        return;
    }
    List<? extends SortColInfo> sortCols = sortInfo.getSortColInfoList();
    int count = sortCols.size();
    if (count == 0) {
        return;
    }
    sortKeys = new int[count];
    sortKeyOrders = new Order[count];
    for (int i = 0; i < count; i++) {
        SortColInfo colInfo = sortCols.get(i);
        sortKeys[i] = colInfo.getColIndex();
        // Anything other than ASCENDING is mapped to DESCENDING.
        sortKeyOrders[i] =
                colInfo.getSortOrder() == org.apache.pig.SortColInfo.Order.ASCENDING
                        ? Order.ASCENDING
                        : Order.DESCENDING;
    }
}
* Get the version of this schema.
* @return version
public int getVersion() {
return version;
public ResourceSchema setVersion(int version) {
this.version = version;
return this;
* Get field schema for each field
* @return array of field schemas.
public ResourceFieldSchema[] getFields() {
return fields;
* Get all field names.
* @return array of field names
public String[] fieldNames() {
String[] names = new String[fields.length];
for (int i=0; i<fields.length; i++) {
names[i] = fields[i].getName();
return names;
* Set all the fields. A null argument is silently ignored and the current fields are kept;
* otherwise a copy of the given array is stored.
* @param fields to use as fields in this schema
* @return this
public ResourceSchema setFields(ResourceFieldSchema[] fields) {
if (fields != null)
this.fields = Arrays.copyOf(fields, fields.length);
return this;
* Get the sort keys for this data.
* @return array of ints. Each integer in the array represents the field number. So if the
* schema of the data is (a, b, c, d) and the data is sorted on c, b, the returned sort keys
* will be [2, 1]. Field numbers are zero based. If the data is not sorted a zero length
* array will be returned.
public int[] getSortKeys() {
return sortKeys;
* Set the sort keys for this data. A null argument is silently ignored and the current sort
* keys are kept; otherwise a copy of the given array is stored.
* @param sortKeys Each integer in the array represents the field number. So if the
* schema of the data is (a, b, c, d) and the data is sorted on c, b, the sort keys
* should be [2, 1]. Field numbers are zero based.
* @return this
public ResourceSchema setSortKeys(int[] sortKeys) {
if (sortKeys != null)
this.sortKeys = Arrays.copyOf(sortKeys, sortKeys.length);
return this;
* Get order for sort keys.
* @return array of Order. This array will be the same length as the int[] array returned by
* {@link #getSortKeys}.
public Order[] getSortKeyOrders() {
return sortKeyOrders;
* Set the order for each sort key. A null argument is silently ignored and the current
* orders are kept; otherwise a copy of the given array is stored.
* @param sortKeyOrders array of Order. Should be the same length as int[] passed to
* {@link #setSortKeys}.
* @return this
public ResourceSchema setSortKeyOrders(Order[] sortKeyOrders) {
if (sortKeyOrders != null)
this.sortKeyOrders = Arrays.copyOf(sortKeyOrders, sortKeyOrders.length);
return this;
* Test whether two ResourceSchemas are the same. Two schemas are said to be the same if they
* are both null, or if they have the same version, sort keys and sort key orders,
* have the same number of fields, and for each field the name and type are the same. For fields
* that may have schemas (i.e. tuples) both schemas must be equal. Field
* descriptions are not used in testing equality.
* @return true if equal according to the above definition, otherwise false.
public static boolean equals(ResourceSchema rs1, ResourceSchema rs2) {
if (rs1 == null) {
return rs2 == null ? true : false;
if (rs2 == null) {
return false;
if (rs1.getVersion() != rs2.getVersion()
|| !Arrays.equals(rs1.getSortKeys(), rs2.getSortKeys())
|| !Arrays.equals(rs1.getSortKeyOrders(), rs2.getSortKeyOrders())) {
return false;
ResourceFieldSchema[] rfs1 = rs1.getFields();
ResourceFieldSchema[] rfs2 = rs1.getFields();
if (rfs1.length != rfs2.length) return false;
for (int i=0; i<rfs1.length; i++) {
if (rfs1[i].getName()==null && rfs2[i].getName()!=null ||
rfs1[i].getName()!=null && rfs2[i].getName()==null)
return false;
if (rfs1[i].getName()==null && rfs2[i].getName()==null) {
if (rfs1[i].getType() == rfs2[i].getType())
return true;
return false;
if (!rfs1[i].getName().equals(rfs2[i].getName())
|| rfs1[i].getType() != rfs2[i].getType()) {
return false;
if (!equals(rfs1[i].getSchema(), rfs2[i].getSchema())) {
return false;
return true;
public String toString() {
StringBuilder sb = new StringBuilder();
stringifyResourceSchema(sb, this, DataType.UNKNOWN, true) ;
return sb.toString();
private static void stringifyResourceSchema(StringBuilder sb,
ResourceSchema rs, byte type, boolean printAlias) {
if (type == DataType.BAG) {
} else if (type == DataType.TUPLE) {
} else if (type == DataType.MAP) {
if (rs != null) {
for (int i=0; i<rs.getFields().length; i++) {
if (i < rs.getFields().length - 1) {
if (type == DataType.BAG) {
} else if (type == DataType.TUPLE) {
} else if (type == DataType.MAP) {