exec/java-exec/src/main/java/org/apache/drill/exec/store/easy/json/parser/ObjectParser.java - drill - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.drill.exec.store.easy.json.parser;

 import java.util.Map;

 import org.apache.drill.common.map.CaseInsensitiveMap;
 import com.google.common.annotations.VisibleForTesting;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import com.fasterxml.jackson.core.JsonToken;

 /**
  * Parses a JSON object: <code>{ name : value ... }</code>
  * <p>
  * The object value may be the root object (the row), a top-level
  * field or may be the element of an array. The event methods are called when
  * an object is started and ended, as well as when a new field is discovered.
  * <p>
  * Creates a map of known fields. Each time a field is parsed,
  * looks up the field in the map. If not found, the value is "sniffed"
  * to determine its type, and a matching parser and listener created.
  * Thereafter, the previous parser is reused.
  * <p>
  * The object listener provides semantics. One key decision is whether
  * to project a field or not. An unprojected field is parsed with
  * a "dummy" parser that "free-wheels" over all valid JSON structures.
  * Otherwise, the listener is given whatever type information that the
  * parser can discover when creating the field.
  * <p>
  * Work is divided between this class, which discovers fields, and
  * the listeners which determine the meaning of field values. A field,
  * via a properly-defined listener, can accept one or more different
  * value kinds.
  * <p>
  * The parser accepts JSON tokens as they appear in the file. The
  * question of whether those tokens make sense is left to the listeners.
  * The listeners decide if the tokens make sense for a particular column.
  * The listener should provide a clear error if a particular token is not
  * valid for a given listener.
  *
  * <h4>Fields</h4>
  *
  * The structure of an object is:
  * <ul>
  * <li>{@code ObjectListener} which represents the object (tuple) as a whole.
  * Each field, indexed by name, is represented as a</li>
  * <li>{@code ValueListener} which represents the value "slot". That value
  * can be scalar, or can be structured, in which case the value listener
  * contains either a</li>
  * <li>{@code ArrayListener} for an array, or a</li>
  * <li>{@code ObjectListener} for a nested object (tuple).</li>
  * </ul>
  *
  * <h4>Nulls</h4>
  *
  * Null values are handled at the semantic, not syntax level. If the
  * first appearance of a field contains a null value, then the parser can
  * provide no hints about the expected field type. The listener must
  * implement a solution such as referring to a schema, waiting for a
  * non-null value to appear, etc.
  * <p>
  * Since the parser classes handle syntax, they are blissfully ignorant
  * of any fancy logic needed for null handling. Each field is
  * represented by a field parser whether that field is null or not.
  * It is the listener that may have to swap out one mechanism for
  * another as types are discovered.
  *
  * <h4>Complex Types</h4>
  *
  * Parsers handle arrays and objects using a two-level system. Each field
  * always is driven by a field parser. If the field is discovered to be an
  * array, then we add an array parser to the field parser to handle array
  * contents. The same is true of objects.
  * <p>
  * Both objects and arrays are collections of values, and a value can
  * optionally contain an array or object. (JSON allows any given field
  * name to map to both objects and arrays in different rows. The parser
  * structure reflects this syntax. The listeners can enforce more
  * relational-like semantics).
  * <p>
  * If an array is single-dimension, then the field parser contains an array
  * parser which contains another value parser for the array contents. If
  * the array is multi-dimensional, there will be multiple array/value
  * parser pairs: one for each dimension.
  */
 public abstract class ObjectParser extends AbstractElementParser {
   protected static final Logger logger = LoggerFactory.getLogger(ObjectParser.class);

   /**
    * Parsers for the fields seen so far, keyed by field name and held in a
    * {@link CaseInsensitiveMap}.
    */
   private final Map<String, ElementParser> members = CaseInsensitiveMap.newHashMap();

   /**
    * Creates an object parser bound to the given structure parser.
    *
    * @param structParser the structure parser handed to the superclass
    */
   public ObjectParser(JsonStructureParser structParser) {
     super(structParser);
   }

   /**
    * Looks up the parser currently registered for a field. Exposed for tests.
    *
    * @param key name of the field
    * @return the parser registered under {@code key}, or {@code null} if
    * the field has not been seen
    */
   @VisibleForTesting
   public ElementParser fieldParser(String key) {
     return members.get(key);
   }

   /**
    * Event hook invoked by {@link #parse(TokenIterator)} before any member
    * is read, that is, once the structure parser has accepted the
    * <code>{</code> token. Does nothing by default.
    */
   protected void onStart() { }

   /**
    * Event hook invoked the first time a field name is seen in this object.
    * The implementation returns the parser to use for the field, along with
    * an optional listener to handle events within the field. The field
    * typically uses a value parser created by the {@link FieldParserFactory}
    * class, though special cases (such as Mongo extended types) can create
    * a custom parser.
    * <p>
    * For an unprojected field, the method should return a dummy parser
    * from {@link FieldParserFactory#ignoredFieldParser()}. The dummy
    * parser "free-wheels" over whatever values the field contains. (This
    * is one way to avoid structure errors in a JSON file: just ignore
    * them.) Otherwise, the parser will look ahead to guess the field type
    * and will call one of the "add" methods, each of which should return
    * a value listener for the field itself.
    * <p>
    * A normal field responds to the structure of the JSON file as it
    * appears. The associated value listener receives events for the field
    * value, and may be asked to create additional structure, such as
    * arrays or nested objects.
    * <p>
    * Parse position: <code>{ ... field : ^ ?</code> for a newly-seen field.
    * A value parser and its listeners are built by looking ahead some
    * number of tokens to "sniff" the type of the value. For example:
    * <ul>
    * <li>{@code foo: <value>} - Field value</li>
    * <li>{@code foo: [ <value> ]} - 1D array value</li>
    * <li>{@code foo: [ [<value> ] ]} - 2D array value</li>
    * <li>Etc.</li>
    * </ul>
    * <p>
    * No type estimation is possible in two cases:
    * <ul>
    * <li>{@code foo: null}</li>
    * <li>{@code foo: []}</li>
    * </ul>
    * <p>
    * If this method returns {@code null}, this class logs a warning and
    * treats the field as unprojected.
    *
    * @param key name of the field
    * @param tokenizer an instance of a token iterator
    * @return a parser for the newly-created field
    */
   protected abstract ElementParser onField(String key, TokenIterator tokenizer);

   /**
    * Event hook invoked by {@link #parse(TokenIterator)} after the last
    * member, that is, once the structure parser has accepted the
    * <code>}</code> token. Does nothing by default.
    */
   protected void onEnd() { }

   /**
    * Parses <code>{ ^ ... }</code>: fires {@link #onStart()}, consumes
    * members until the closing brace, then fires {@link #onEnd()}.
    * {@link #onEnd()} is not reached if a member or token raises an error.
    *
    * @param tokenizer an instance of a token iterator
    */
   @Override
   public void parse(TokenIterator tokenizer) {
     onStart();

     // Consume (field: value)* }
     boolean insideObject = true;
     do {
       final JsonToken next = tokenizer.requireNext();
       // Position: { (key: value)* ? ^
       switch (next) {
         case FIELD_NAME:
           // Position: { (key: value)* key: ^
           parseMember(tokenizer);
           break;

         case END_OBJECT:
           // Position: { (key: value)* } ^
           insideObject = false;
           break;

         default:
           // Position: { (key: value)* ~(key | }) ^
           // Malformed JSON. In practice the underlying JSON parser
           // is expected to raise its own exception before this point.
           throw errorFactory().syntaxError(next);
       }
     } while (insideObject);

     onEnd();
   }

   /**
    * Parses one member of the object. A field seen before reuses the
    * parser already registered for it. A field seen for the first time
    * has its parser created (which may "sniff" tokens to determine the
    * field type) and registered before the value is parsed.
    *
    * @param tokenizer an instance of a token iterator
    */
   private void parseMember(TokenIterator tokenizer) {
     // Position: key: ^ ?
     final String fieldName = tokenizer.textValue().trim();
     final ElementParser known = members.get(fieldName);
     final ElementParser valueParser;
     if (known != null) {
       valueParser = known;
     } else {
       // First sighting of this key: build a parser for it (this also
       // determines the kind of column to create in Drill) and remember it.
       // Position: key: ^
       valueParser = detectValueParser(fieldName, tokenizer);
       members.put(fieldName, valueParser);
     }

     // Position: key: ^ value ...
     valueParser.parse(tokenizer);
   }

   /**
    * Obtains the parser for a newly-seen field by delegating to
    * {@link #onField(String, TokenIterator)}. If that call yields no
    * parser, the field is assumed to be unprojected and a dummy parser
    * that "free wheels" over the value is used instead.
    *
    * @param key name of the field; must not be empty
    * @param tokenizer an instance of a token iterator
    * @return parser for the field, never {@code null}
    */
   private ElementParser detectValueParser(String key, TokenIterator tokenizer) {
     if (key.isEmpty()) {
       throw errorFactory().structureError(
           "Drill does not allow empty keys in JSON key/value pairs");
     }

     final ElementParser created = onField(key, tokenizer);
     if (created != null) {
       return created;
     }
     logger.warn("No JSON element parser returned for field {}, assuming unprojected", key);
     return DummyValueParser.INSTANCE;
   }

   /**
    * Registers {@code fieldParser} as the parser for {@code key},
    * overwriting any parser previously registered for that field.
    *
    * @param key name of the field
    * @param fieldParser the parser to use for the field from now on
    * @return the {@code fieldParser} argument, unchanged
    */
   public ElementParser replaceFieldParser(String key, ElementParser fieldParser) {
     members.put(key, fieldParser);
     return fieldParser;
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.drill.exec.store.easy.json.parser;

	import java.util.Map;

	import org.apache.drill.common.map.CaseInsensitiveMap;
	import com.google.common.annotations.VisibleForTesting;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;

	import com.fasterxml.jackson.core.JsonToken;

	/**
	* Parses a JSON object: <code>{ name : value ... }</code>
	* <p>
	* The object value may be the root object (the row), a top-level
	* field or may be the element of an array. The event methods are called when
	* an object is started and ended, as well as when a new field is discovered.
	* <p>
	* Creates a map of known fields. Each time a field is parsed,
	* looks up the field in the map. If not found, the value is "sniffed"
	* to determine its type, and a matching parser and listener created.
	* Thereafter, the previous parser is reused.
	* <p>
	* The object listener provides semantics. One key decision is whether
	* to project a field or not. An unprojected field is parsed with
	* a "dummy" parser that "free-wheels" over all valid JSON structures.
	* Otherwise, the listener is given whatever type information that the
	* parser can discover when creating the field.
	* <p>
	* Work is divided between this class, which discovers fields, and
	* the listeners which determine the meaning of field values. A field,
	* via a properly-defined listener, can accept one or more different
	* value kinds.
	* <p>
	* The parser accepts JSON tokens as they appear in the file. The
	* question of whether those tokens make sense is left to the listeners.
	* The listeners decide if the tokens make sense for a particular column.
	* The listener should provide a clear error if a particular token is not
	* valid for a given listener.
	*
	* <h4>Fields</h4>
	*
	* The structure of an object is:
	* <ul>
	* <li>{@code ObjectListener} which represents the object (tuple) as a whole.
	* Each field, indexed by name, is represented as a</li>
	* <li>{@code ValueListener} which represents the value "slot". That value
	* can be scalar, or can be structured, in which case the value listener
	* contains either a</li>
	* <li>{@code ArrayListener} for an array, or a</li>
	* <li>{@code ObjectListener} for a nested object (tuple).</li>
	* </ul>
	*
	* <h4>Nulls</h4>
	*
	* Null values are handled at the semantic, not syntax level. If the
	* first appearance of a field contains a null value, then the parser can
	* provide no hints about the expected field type. The listener must
	* implement a solution such as referring to a schema, waiting for a
	* non-null value to appear, etc.
	* <p>
	* Since the parser classes handle syntax, they are blissfully ignorant
	* of any fancy logic needed for null handling. Each field is
	* represented by a field parser whether that field is null or not.
	* It is the listener that may have to swap out one mechanism for
	* another as types are discovered.
	*
	* <h4>Complex Types</h4>
	*
	* Parsers handle arrays and objects using a two-level system. Each field
	* always is driven by a field parser. If the field is discovered to be an
	* array, then we add an array parser to the field parser to handle array
	* contents. The same is true of objects.
	* <p>
	* Both objects and arrays are collections of values, and a value can
	* optionally contain an array or object. (JSON allows any given field
	* name to map to both objects and arrays in different rows. The parser
	* structure reflects this syntax. The listeners can enforce more
	* relational-like semantics).
	* <p>
	* If an array is single-dimension, then the field parser contains an array
	* parser which contains another value parser for the array contents. If
	* the array is multi-dimensional, there will be multiple array/value
	* parser pairs: one for each dimension.
	*/
	public abstract class ObjectParser extends AbstractElementParser {
	  protected static final Logger logger = LoggerFactory.getLogger(ObjectParser.class);

	  /**
	   * Field parsers discovered so far, indexed by field name in a
	   * {@link CaseInsensitiveMap}.
	   */
	  private final Map<String, ElementParser> members = CaseInsensitiveMap.newHashMap();

	  /**
	   * Builds an object parser attached to the supplied structure parser.
	   *
	   * @param structParser the structure parser forwarded to the superclass
	   */
	  public ObjectParser(JsonStructureParser structParser) {
	    super(structParser);
	  }

	  /**
	   * Returns the parser registered for the named field. Intended for tests.
	   *
	   * @param key name of the field
	   * @return the registered parser, or {@code null} when the field has
	   * not been seen
	   */
	  @VisibleForTesting
	  public ElementParser fieldParser(String key) {
	    return members.get(key);
	  }

	  /**
	   * Called at the start of a set of values for an object, that is, when
	   * the structure parser accepts the <code>{</code> token. The default
	   * implementation is empty.
	   */
	  protected void onStart() { }

	  /**
	   * The structure parser has just encountered a new field for this
	   * object. Implementations return a parser for the field, along with an
	   * optional listener to handle events within the field. The field
	   * typically uses a value parser created by the
	   * {@link FieldParserFactory} class; special cases (such as Mongo
	   * extended types) can create a custom parser.
	   * <p>
	   * When the field is not projected, return a dummy parser from
	   * {@link FieldParserFactory#ignoredFieldParser()}, which "free-wheels"
	   * over whatever values the field contains. (This is one way to avoid
	   * structure errors in a JSON file: just ignore them.) Otherwise, the
	   * parser will look ahead to guess the field type and will call one of
	   * the "add" methods, each of which should return a value listener for
	   * the field itself.
	   * <p>
	   * A normal field responds to the structure of the JSON file as it
	   * appears. Its value listener receives events for the field value and
	   * may be asked to create additional structure, such as arrays or
	   * nested objects.
	   * <p>
	   * Parse position: <code>{ ... field : ^ ?</code> for a newly-seen field.
	   * The value parser and its listeners are constructed by looking ahead
	   * some number of tokens to "sniff" the type of the value. For example:
	   * <ul>
	   * <li>{@code foo: <value>} - Field value</li>
	   * <li>{@code foo: [ <value> ]} - 1D array value</li>
	   * <li>{@code foo: [ [<value> ] ]} - 2D array value</li>
	   * <li>Etc.</li>
	   * </ul>
	   * <p>
	   * There are two cases in which no type estimation is possible:
	   * <ul>
	   * <li>{@code foo: null}</li>
	   * <li>{@code foo: []}</li>
	   * </ul>
	   * <p>
	   * A {@code null} return is tolerated: this class logs a warning and
	   * handles the field as if it were unprojected.
	   *
	   * @param key name of the field
	   * @param tokenizer an instance of a token iterator
	   * @return a parser for the newly-created field
	   */
	  protected abstract ElementParser onField(String key, TokenIterator tokenizer);

	  /**
	   * Called at the end of a set of values for an object, that is, when
	   * the structure parser accepts the <code>}</code> token. The default
	   * implementation is empty.
	   */
	  protected void onEnd() { }

	  /**
	   * Parses <code>{ ^ ... }</code>. Calls {@link #onStart()}, parses each
	   * member in turn until the end-of-object token, then calls
	   * {@link #onEnd()}. If an error is thrown along the way,
	   * {@link #onEnd()} is not called.
	   *
	   * @param tokenizer an instance of a token iterator
	   */
	  @Override
	  public void parse(TokenIterator tokenizer) {
	    onStart();

	    // Parse (field: value)* }
	    boolean endOfObject = false;
	    while (!endOfObject) {
	      final JsonToken current = tokenizer.requireNext();
	      // Position: { (key: value)* ? ^
	      switch (current) {
	        case END_OBJECT:
	          // Position: { (key: value)* } ^
	          endOfObject = true;
	          break;

	        case FIELD_NAME:
	          // Position: { (key: value)* key: ^
	          parseMember(tokenizer);
	          break;

	        default:
	          // Position: { (key: value)* ~(key | }) ^
	          // Invalid JSON. This branch is unlikely to be reached since
	          // the JSON parser itself should throw an exception first.
	          throw errorFactory().syntaxError(current);
	      }
	    }

	    onEnd();
	  }

	  /**
	   * Parses a single field. If the field has been seen before, its
	   * existing parser is looked up and reused. If this is the first time
	   * the field is seen, a parser is created (possibly "sniffing" tokens
	   * to determine the field type), stored, and then used.
	   *
	   * @param tokenizer an instance of a token iterator
	   */
	  private void parseMember(TokenIterator tokenizer) {
	    // Position: key: ^ ?
	    final String name = tokenizer.textValue().trim();
	    ElementParser parser = members.get(name);
	    if (parser == null) {
	      // Unknown key: create the parser to use for it (which also tells
	      // us the kind of column to create in Drill) and record it.
	      // Position: key: ^
	      parser = detectValueParser(name, tokenizer);
	      members.put(name, parser);
	    }

	    // Now parse the field value.
	    // Position: key: ^ value ...
	    parser.parse(tokenizer);
	  }

	  /**
	   * Asks {@link #onField(String, TokenIterator)} for the parser of a new
	   * field. When no parser comes back, the column is assumed not to be
	   * projected and a dummy parser is substituted to "free wheel" over the
	   * value.
	   *
	   * @param key name of the field; an empty name is rejected
	   * @param tokenizer an instance of a token iterator
	   * @return parser for the field, never {@code null}
	   */
	  private ElementParser detectValueParser(String key, TokenIterator tokenizer) {
	    if (key.isEmpty()) {
	      throw errorFactory().structureError(
	          "Drill does not allow empty keys in JSON key/value pairs");
	    }

	    final ElementParser supplied = onField(key, tokenizer);
	    if (supplied == null) {
	      logger.warn("No JSON element parser returned for field {}, assuming unprojected", key);
	      return DummyValueParser.INSTANCE;
	    }
	    return supplied;
	  }

	  /**
	   * Stores {@code fieldParser} as the parser for {@code key}, replacing
	   * whatever parser was registered for that field before.
	   *
	   * @param key name of the field
	   * @param fieldParser the parser that should handle the field
	   * @return the same {@code fieldParser} that was passed in
	   */
	  public ElementParser replaceFieldParser(String key, ElementParser fieldParser) {
	    members.put(key, fieldParser);
	    return fieldParser;
	  }
	}