blob: ce331a346833b9af709026b060c19fd73f0694b4 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.drill.exec.store.easy.json.parser;
import java.util.Map;
import org.apache.drill.common.map.CaseInsensitiveMap;
import org.apache.drill.exec.store.easy.json.parser.ObjectListener.FieldType;
import org.apache.drill.exec.store.easy.json.parser.ValueDef.JsonType;
import com.fasterxml.jackson.core.JsonToken;
/**
* Parses a JSON object: <code>{ name : value ... }</code>
* <p>
* Creates a map of known fields. Each time a field is parsed,
* looks up the field in the map. If not found, the value is "sniffed"
* to determine its type, and a matching parser and listener created.
* Thereafter, the previous parser is reused.
* <p>
* The object listener provides semantics. One key decision is whether
* to project a field or not. An unprojected field is parsed with
* a "dummy" parser that "free-wheels" over all valid JSON structures.
* Otherwise, the listener is given whatever type information that the
* parser can discover when creating the field.
* <p>
* Work is divided between this class, which discovers fields, and
* the listeners which determine the meaning of field values. A field,
* via a properly-defined listener, can accept one or more different
* value kinds.
* <p>
* The parser accepts JSON tokens as they appear in the file. The
* question of whether those tokens make sense is left to the listeners.
* The listeners decide if the tokens make sense for a particular column.
* The listener should provide a clear error if a particular token is not
* valid for a given listener.
*
* <h4>Nulls</h4>
*
* Null values are handled at the semantic, not syntax level. If the
* first appearance of a field contains a null value, then the parser can
* provide no hints about the expected field type. The listener must
* implement a solution such as referring to a schema, waiting for a
* non-null value to appear, etc.
* <p>
* Since the parser classes handle syntax, they are blissfully ignorant
* of any fancy logic needed for null handling. Each field is
* represented by a field parser whether that field is null or not.
* It is the listener that may have to swap out one mechanism for
* another as types are discovered.
*
* <h4>Complex Types</h4>
*
* Parsers handle arrays and objects using a two-level system. Each field
* always is driven by a field parser. If the field is discovered to be an
* array, then we add an array parser to the field parser to handle array
* contents. The same is true of objects.
* <p>
* Both objects and arrays are collections of values, and a value can
* optionally contain an array or object. (JSON allows any given field
* name to map to both objects and arrays in different rows. The parser
* structure reflects this syntax. The listeners can enforce more
* relational-like semantics).
* <p>
* If an array is single-dimension, then the field parser contains an array
* parser which contains another value parser for the array contents. If
* the array is multi-dimensional, there will be multiple array/value
* parser pairs: one for each dimension.
*/
public class ObjectParser extends AbstractElementParser {

  /** Listener notified of object events and asked to define each new field. */
  private final ObjectListener listener;

  /**
   * Parsers created so far, one per field name, held in a
   * {@link CaseInsensitiveMap}.
   */
  private final Map<String, ElementParser> members = CaseInsensitiveMap.newHashMap();

  /**
   * Creates an object parser.
   *
   * @param parent the parent element parser, handed to the base class
   * @param listener the listener this parser reports to
   */
  public ObjectParser(ElementParser parent, ObjectListener listener) {
    super(parent);
    this.listener = listener;
  }

  /**
   * @return the listener bound to this parser
   */
  public ObjectListener listener() {
    return listener;
  }

  /**
   * Parses the members of an object, starting at <code>{ ^ ... }</code>.
   * <p>
   * Calls {@code listener.onStart()} before reading any token and
   * {@code listener.onEnd()} once the end-of-object token has been read.
   * Each field-name token is handed to the member parser for that field.
   * Any other token is reported as a syntax error.
   *
   * @param tokenizer source of JSON tokens
   */
  @Override
  public void parse(TokenIterator tokenizer) {
    listener.onStart();

    // Consume (field: value)* followed by the closing brace.
    boolean insideObject = true;
    while (insideObject) {
      // Position: { (key: value)* ? ^
      final JsonToken current = tokenizer.requireNext();
      switch (current) {
        case FIELD_NAME:
          // Position: { (key: value)* key: ^
          parseMember(tokenizer);
          break;

        case END_OBJECT:
          // Position: { (key: value)* } ^
          insideObject = false;
          break;

        default:
          // Position: { (key: value)* ~(key | }) ^
          // Not valid JSON. Most likely unreachable in practice: the
          // JSON parser itself is expected to throw before we get here.
          throw errorFactory().syntaxError(current);
      }
    }

    listener.onEnd();
  }

  /**
   * Parses one member of the object. The field name is the current text
   * value of the tokenizer, with surrounding whitespace trimmed. The parser
   * for that name is looked up (or created on first sight) and then asked
   * to parse the value.
   *
   * @param tokenizer source of JSON tokens, positioned just after the key
   */
  private void parseMember(TokenIterator tokenizer) {
    // Position: key: ^ ?
    final String fieldName = tokenizer.textValue().trim();

    // Position: key: ^ value ...
    memberParserFor(tokenizer, fieldName).parse(tokenizer);
  }

  /**
   * Returns the parser registered for a field, creating and registering
   * one if the field has not been seen before.
   *
   * @param tokenizer source of JSON tokens, used to sniff the value of a
   * newly-seen field
   * @param fieldName trimmed name of the field
   * @return the parser to use for the field value
   */
  private ElementParser memberParserFor(TokenIterator tokenizer, String fieldName) {
    final ElementParser existing = members.get(fieldName);
    if (existing != null) {
      return existing;
    }

    // First appearance of this key: sniff the value to choose the parser
    // (which also tells us the kind of column to create in Drill.)
    // Position: key: ^
    final ElementParser created = detectValueParser(tokenizer, fieldName);
    members.put(fieldName, created);
    return created;
  }

  /**
   * Chooses the parser for a newly-seen field based on the field type
   * reported by the listener. An ignored (unprojected) field gets a dummy
   * parser that "free wheels" over the value. A JSON field gets a
   * {@code JsonValueParser} whose field is declared to the listener as a
   * string with zero dimensions. Any other type is resolved by looking
   * ahead in the token stream; see
   * {@link #createFieldParser(String, FieldType, TokenIterator)}.
   * <p>
   * An empty key is rejected with a structure error.
   *
   * @param tokenizer source of JSON tokens, used for look-ahead
   * @param key name of the field
   * @return parser for the field
   */
  private ElementParser detectValueParser(TokenIterator tokenizer, final String key) {
    if (key.isEmpty()) {
      throw errorFactory().structureError(
          "Drill does not allow empty keys in JSON key/value pairs");
    }

    final FieldType fieldType = listener.fieldType(key);
    switch (fieldType) {
      case JSON:
        return new JsonValueParser(this, key,
            listener.addField(key, new ValueDef(JsonType.STRING, 0)));

      case IGNORE:
        return new DummyValueParser(this);

      default:
        return createFieldParser(key, fieldType, tokenizer);
    }
  }

  /**
   * Builds the value parser for a newly-seen field. Parse position:
   * <code>{ ... field : ^ ?</code>.
   * <p>
   * The value's shape is "sniffed" by looking ahead some number of tokens:
   * <ul>
   * <li>{@code foo: <value>} - Field value</li>
   * <li>{@code foo: [ <value> ]} - 1D array value</li>
   * <li>{@code foo: [ [<value> ] ]} - 2D array value</li>
   * <li>Etc.</li>
   * </ul>
   * <p>
   * No type estimate is possible in two cases:
   * <ul>
   * <li>The value is {@code null}, indicated by
   * {@link JsonType#NULL}.</li>
   * <li>The value is an array, and the array is empty, indicated
   * by {@link JsonType#EMPTY}.</li>
   * </ul>
   * {@link ValueDefFactory} performs the syntactic type inference; the
   * listener enforces semantic rules. For example, if a schema says that
   * field "x" must be an Integer, but the look-ahead reports an object,
   * then the listener should raise an exception.
   * <p>
   * The parser cannot enforce type consistency either: only the first
   * appearance of the value is examined, a sample size of one, and JSON
   * allows anything. The listener must decide whether a different type is
   * acceptable for later values.
   *
   * @param key the name of the field
   * @param type the kind of field parser to create
   * @param tokenizer the token parser
   * @return the value parser for the element, which may contain additional
   * structure for objects or arrays
   */
  public ElementParser createFieldParser(String key, FieldType type,
      TokenIterator tokenizer) {
    final ValueParser valueParser = new ValueParser(this, key, type);
    final ValueDef sniffedDef = ValueDefFactory.lookAhead(tokenizer);

    // Declare the field to the listener, then grow any array/object
    // structure implied by the sniffed definition.
    valueParser.bindListener(listener.addField(key, sniffedDef));
    valueParser.expandStructure(sniffedDef);
    return valueParser;
  }
}