blob: bf65bb790a46adc52406b7ff5328341e3d19f4eb [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.drill.exec.store.easy.json.loader;
import org.apache.drill.exec.record.metadata.TupleMetadata;
import org.apache.drill.exec.store.easy.json.parser.ElementParser;
import org.apache.drill.exec.store.easy.json.parser.JsonStructureParser;
import org.apache.drill.exec.store.easy.json.parser.ObjectParser;
import org.apache.drill.exec.store.easy.json.parser.TokenIterator;
import org.apache.drill.exec.vector.accessor.TupleWriter;
/**
* Accepts { name : value ... }
* <p>
* The structure parser maintains a map of known fields. Each time a
* field is parsed, looks up the field in the map. If not found, the parser
* looks ahead to find a value token, if any, and calls this class to add
* a new column. This class creates a column writer based either on the
* type provided in a provided schema, or inferred from the JSON token.
* <p>
* As it turns out, most of the semantic action occurs at the tuple level:
* that is where fields are defined, types inferred, and projection is
* computed.
*
* <h4>Nulls</h4>
*
* Much code here deals with null types, especially leading nulls, leading
* empty arrays, and so on. The object parser creates a parser for each
* value; a parser which "does the right thing" based on the data type.
* For example, for a Boolean, the parser recognizes {@code true},
* {@code false} and {@code null}.
* <p>
* But what happens if the first value for a field is {@code null}? We
* don't know what kind of parser to create because we don't have a schema.
* Instead, we have to create a temporary placeholder parser that will consume
* nulls, waiting for a real type to show itself. Once that type appears, the
* null parser can replace itself with the correct form. Each vector's
* "fill empties" logic will back-fill the newly created vector with nulls
* for prior rows.
* <p>
 * Two null parsers are needed: one when we see an empty list, and one for
 * when we only see {@code null}. The one for {@code null} must morph into
 * the one for empty lists if we see:<br>
* {@code {a: null} {a: [ ] }}<br>
* <p>
* If we get all the way through the batch, but have still not seen a type,
* then we have to guess. A prototype type system can tell us, otherwise we
* guess {@code VARCHAR}. ({@code VARCHAR} is the right choice for all-text
* mode, it is as good a guess as any for other cases.)
*
* <h4>Projection List Hints</h4>
*
* To help, we consult the projection list, if any, for a column. If the
* projection is of the form {@code a[0]}, we know the column had better
* be an array. Similarly, if the projection list has {@code b.c}, then
* {@code b} had better be an object.
*
* <h4>Array Handling</h4>
*
* The code here handles arrays in two ways. JSON normally uses the
* {@code LIST} type. But, that can be expensive if lists are
* well-behaved. So, the code here also implements arrays using the
* classic {@code REPEATED} types. The repeated type option is disabled
* by default. It can be enabled, for efficiency, if Drill ever supports
* a JSON schema. If an array is well-behaved, mark that column as able
* to use a repeated type.
*
* <h4>Ambiguous Types</h4>
*
* JSON nulls are untyped. A run of nulls does not tell us what type will
* eventually appear. The best solution is to provide a schema. Without a
* schema, the code is forgiving: defers selection of the column type until
* the first non-null value (or, forces a type at the end of the batch.)
* <p>
* For scalars the pattern is: <code>{a: null} {a: "foo"}</code>. Type
* selection happens on the value {@code "foo"}.
* <p>
 * For arrays, the pattern is: <code>{a: []} {a: ["foo"]}</code>. Type
 * selection happens on the first array element. Note that type selection
 * must happen on the first element, even if that element is null (which,
 * as we just said, is ambiguous.)
* <p>
 * If we are forced to pick a type (because we hit the end of a batch, or
 * we see {@code [null]}), then we pick {@code VARCHAR} as we allow any
* scalar to be converted to {@code VARCHAR}. This helps for a single-file
* query, but not if multiple fragments each make their own (inconsistent)
* decisions. Only a schema provides a consistent answer.
*/
public class TupleParser extends ObjectParser {

  /** Owning loader; supplies the field factory and listener column map. */
  private final JsonLoaderImpl loader;
  /** Writer for the tuple (row or map) that this parser populates. */
  private final TupleWriter tupleWriter;
  /** Optional schema used to choose column types; may be null. */
  private final TupleMetadata providedSchema;

  // Bootstrap case: struct parser not yet set on the JSON loader
  public TupleParser(JsonStructureParser structParser, JsonLoaderImpl loader,
      TupleWriter tupleWriter, TupleMetadata providedSchema) {
    super(structParser);
    this.loader = loader;
    this.tupleWriter = tupleWriter;
    this.providedSchema = providedSchema;
  }

  public TupleParser(JsonLoaderImpl loader, TupleWriter tupleWriter, TupleMetadata providedSchema) {
    this(loader.parser(), loader, tupleWriter, providedSchema);
  }

  public JsonLoaderImpl loader() { return loader; }

  public TupleWriter writer() { return tupleWriter; }

  protected TupleMetadata providedSchema() { return providedSchema; }

  protected FieldFactory fieldFactory() { return loader.fieldFactory(); }

  /**
   * Called when a new field appears in the JSON object. Projected fields
   * (and fields needed by column listeners) get a real parser; all others
   * get a "dummy" parser that consumes and discards the value.
   */
  @Override
  public ElementParser onField(String key, TokenIterator tokenizer) {
    return isNeeded(key)
        ? parserForField(key, tokenizer)
        : fieldFactory().ignoredFieldParser();
  }

  /**
   * A field must be read either when it is projected, or when a column
   * listener has registered interest in it.
   */
  private boolean isNeeded(String key) {
    return tupleWriter.isProjected(key)
        || (loader.listenerColumnMap() != null
            && loader.listenerColumnMap().containsKey(key));
  }

  /** Builds a value parser for the given field via the field factory. */
  private ElementParser parserForField(String key, TokenIterator tokenizer) {
    return fieldFactory().fieldParser(new FieldDefn(this, key, tokenizer));
  }

  /**
   * Replaces a placeholder (null) parser once a concrete value type has
   * been seen for the field.
   */
  public ElementParser resolveField(String key, TokenIterator tokenizer) {
    return replaceFieldParser(key, parserForField(key, tokenizer));
  }

  /**
   * Replaces a placeholder parser for a field now known to be an array.
   */
  public ElementParser resolveArray(String key, TokenIterator tokenizer) {
    FieldDefn arrayDefn = new FieldDefn(this, key, tokenizer, true);
    return replaceFieldParser(key, fieldFactory().fieldParser(arrayDefn));
  }

  /**
   * Forces a type choice for a field that has seen only nulls (e.g. at
   * the end of a batch, when we can wait no longer).
   */
  public void forceNullResolution(String key) {
    replaceFieldParser(key,
        fieldFactory().forceNullResolution(new FieldDefn(this, key, null)));
  }

  /**
   * Forces a type choice for a field that has seen only empty arrays.
   */
  public void forceEmptyArrayResolution(String key) {
    replaceFieldParser(key,
        fieldFactory().forceArrayResolution(new FieldDefn(this, key, null)));
  }
}