blob: ad405c16417fce87fa7cbd9562f35b14bb150c59 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.drill.exec.store.easy.json.parser;
import java.util.Map;
import org.apache.drill.common.map.CaseInsensitiveMap;
import com.google.common.annotations.VisibleForTesting;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.core.JsonToken;
/**
* Parses a JSON object: <code>{ name : value ... }</code>
* <p>
* The object value may be the root object (the row), a top-level
* field or may be the element of an array. The event methods are called when
* an object is started and ended, as well as when a new field is discovered.
* <p>
* Creates a map of known fields. Each time a field is parsed,
* looks up the field in the map. If not found, the value is "sniffed"
* to determine its type, and a matching parser and listener created.
* Thereafter, the previous parser is reused.
* <p>
* The object listener provides semantics. One key decision is whether
* to project a field or not. An unprojected field is parsed with
* a "dummy" parser that "free-wheels" over all valid JSON structures.
* Otherwise, the listener is given whatever type information that the
* parser can discover when creating the field.
* <p>
* Work is divided between this class, which discovers fields, and
* the listeners which determine the meaning of field values. A field,
* via a properly-defined listener, can accept one or more different
* value kinds.
* <p>
* The parser accepts JSON tokens as they appear in the file. The
* question of whether those tokens make sense is left to the listeners.
* The listeners decide if the tokens make sense for a particular column.
* The listener should provide a clear error if a particular token is not
* valid for a given listener.
*
* <h4>Fields</h4>
*
* The structure of an object is:
* <ul>
* <li>{@code ObjectListener} which represents the object (tuple) as a whole.
* Each field, indexed by name, is represented as a</li>
* <li>{@code ValueListener} which represents the value "slot". That value
* can be scalar, or can be structured, in which case the value listener
* contains either a</li>
* <li>{@code ArrayListener} for an array, or a</li>
* <li>{@code ObjectListener} for a nested object (tuple).</li>
* </ul>
*
* <h4>Nulls</h4>
*
* Null values are handled at the semantic, not syntax level. If the
* first appearance of a field contains a null value, then the parser can
* provide no hints about the expected field type. The listener must
* implement a solution such as referring to a schema, waiting for a
* non-null value to appear, etc.
* <p>
* Since the parser classes handle syntax, they are blissfully ignorant
* of any fancy logic needed for null handling. Each field is
* represented by a field parser whether that field is null or not.
* It is the listener that may have to swap out one mechanism for
* another as types are discovered.
*
* <h4>Complex Types</h4>
*
* Parsers handle arrays and objects using a two-level system. Each field
* always is driven by a field parser. If the field is discovered to be an
* array, then we add an array parser to the field parser to handle array
* contents. The same is true of objects.
* <p>
* Both objects and arrays are collections of values, and a value can
* optionally contain an array or object. (JSON allows any given field
* name to map to both objects and arrays in different rows. The parser
* structure reflects this syntax. The listeners can enforce more
* relational-like semantics).
* <p>
* If an array is single-dimension, then the field parser contains an array
* parser which contains another value parser for the array contents. If
* the array is multi-dimensional, there will be multiple array/value
* parser pairs: one for each dimension.
*/
public abstract class ObjectParser extends AbstractElementParser {

  protected static final Logger logger = LoggerFactory.getLogger(ObjectParser.class);

  /**
   * Parsers for the fields seen so far (or explicitly registered), indexed
   * by field name. The map is created by {@code CaseInsensitiveMap}.
   */
  private final Map<String, ElementParser> fieldParsers = CaseInsensitiveMap.newHashMap();

  public ObjectParser(JsonStructureParser structParser) {
    super(structParser);
  }

  /**
   * Looks up the parser currently registered for a field.
   *
   * @param key name of the field
   * @return the registered parser, or {@code null} if the field has no entry
   */
  @VisibleForTesting
  public ElementParser fieldParser(String key) {
    return fieldParsers.get(key);
  }

  /**
   * Event raised when an object begins: that is, when the structure
   * parser has accepted the <code>{</code> token. Does nothing by default.
   */
  protected void onStart() { }

  /**
   * Event raised the first time a field name is encountered in this
   * object. The implementation returns the parser to use for the field,
   * optionally wiring in a listener that handles events within the field.
   * Typically the parser is a value parser created by the
   * {@link FieldParserFactory} class, though special cases (such as Mongo
   * extended types) may supply a custom parser.
   * <p>
   * For a field that is not projected, return the dummy parser from
   * {@link FieldParserFactory#ignoredFieldParser()}. That parser
   * "free-wheels" over whatever the field contains. (Ignoring a field is
   * one way to sidestep structure errors in a JSON file.) For a projected
   * field, the parser looks ahead to guess the field type and calls one
   * of the "add" methods, each of which should return a value listener
   * for the field.
   * <p>
   * A normal field follows the structure of the JSON file as it appears.
   * Its value listener receives the events for the field value and may be
   * asked to create further structure, such as arrays or nested objects.
   * <p>
   * Parse position: <code>{ ... field : ^ ?</code> for a newly-seen field.
   * The value parser and its listeners are built by looking ahead some
   * number of tokens to "sniff" the type of the value. For example:
   * <ul>
   * <li>{@code foo: <value>} - Field value</li>
   * <li>{@code foo: [ <value> ]} - 1D array value</li>
   * <li>{@code foo: [ [<value> ] ]} - 2D array value</li>
   * <li>Etc.</li>
   * </ul>
   * <p>
   * No type estimate is possible in two cases:
   * <ul>
   * <li>{@code foo: null}</li>
   * <li>{@code foo: []}</li>
   * </ul>
   *
   * @param key name of the field
   * @param tokenizer an instance of a token iterator
   * @return a parser for the newly-created field
   */
  protected abstract ElementParser onField(String key, TokenIterator tokenizer);

  /**
   * Event raised when an object ends: that is, when the structure
   * parser has accepted the <code>}</code> token. Does nothing by default.
   */
  protected void onEnd() { }

  /**
   * Parses the body of an object, starting at <code>{ ^ ... }</code>.
   * Fires {@link #onStart()}, consumes {@code (field: value)*} pairs until
   * the closing brace, then fires {@link #onEnd()}.
   *
   * @param tokenizer an instance of a token iterator
   */
  @Override
  public void parse(TokenIterator tokenizer) {
    onStart();

    // Consume (field: value)* }
    while (true) {
      final JsonToken next = tokenizer.requireNext();

      // Position: { (key: value)* ? ^
      switch (next) {
        case FIELD_NAME:
          // Position: { (key: value)* key: ^
          parseMember(tokenizer);
          continue;

        case END_OBJECT:
          // Position: { (key: value)* } ^
          onEnd();
          return;

        default:
          // Position: { (key: value)* ~(key | }) ^
          // Invalid JSON. In practice the JSON parser itself will
          // probably throw before we ever get here.
          throw errorFactory().syntaxError(next);
      }
    }
  }

  /**
   * Parses one field of the object. If the field has been seen before,
   * its existing parser is reused. Otherwise a parser is obtained for the
   * field, recorded under the field name, and then used to parse the value.
   *
   * @param tokenizer an instance of a token iterator
   */
  private void parseMember(TokenIterator tokenizer) {
    // Position: key: ^ ?
    final String name = tokenizer.textValue().trim();

    ElementParser parser = fieldParsers.get(name);
    if (parser == null) {
      // First sighting of this key: sniff the value to pick a parser
      // (which also tells us the kind of column to create in Drill.)
      // Position: key: ^
      parser = resolveFieldParser(name, tokenizer);
      fieldParsers.put(name, parser);
    }

    // Now parse the field value itself.
    // Position: key: ^ value ...
    parser.parse(tokenizer);
  }

  /**
   * Obtains the parser for a newly-seen field by delegating to
   * {@link #onField(String, TokenIterator)}. If no parser comes back, the
   * field is assumed to be unprojected and a dummy parser is substituted
   * to "free wheel" over the value.
   *
   * @param key name of the field; an empty name is rejected with a
   * structure error
   * @param tokenizer an instance of a token iterator
   * @return parser for the field, never {@code null}
   */
  private ElementParser resolveFieldParser(String key, TokenIterator tokenizer) {
    if (key.isEmpty()) {
      throw errorFactory().structureError(
          "Drill does not allow empty keys in JSON key/value pairs");
    }

    final ElementParser created = onField(key, tokenizer);
    if (created != null) {
      return created;
    }

    logger.warn("No JSON element parser returned for field {}, assuming unprojected", key);
    return DummyValueParser.INSTANCE;
  }

  /**
   * Registers a parser for a field, overwriting any parser previously
   * recorded under the same name.
   *
   * @param key name of the field
   * @param fieldParser the parser to record for the field
   * @return the {@code fieldParser} argument, unchanged
   */
  public ElementParser replaceFieldParser(String key, ElementParser fieldParser) {
    fieldParsers.put(key, fieldParser);
    return fieldParser;
  }
}