blob: bf65bb790a46adc52406b7ff5328341e3d19f4eb [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.drill.exec.store.easy.json.loader;
import org.apache.drill.exec.record.metadata.TupleMetadata;
import org.apache.drill.exec.store.easy.json.parser.ElementParser;
import org.apache.drill.exec.store.easy.json.parser.JsonStructureParser;
import org.apache.drill.exec.store.easy.json.parser.ObjectParser;
import org.apache.drill.exec.store.easy.json.parser.TokenIterator;
import org.apache.drill.exec.vector.accessor.TupleWriter;
/**
* Accepts { name : value ... }
* <p>
* The structure parser maintains a map of known fields. Each time a
* field is parsed, looks up the field in the map. If not found, the parser
* looks ahead to find a value token, if any, and calls this class to add
* a new column. This class creates a column writer based either on the
* type provided in a provided schema, or inferred from the JSON token.
* <p>
* As it turns out, most of the semantic action occurs at the tuple level:
* that is where fields are defined, types inferred, and projection is
* computed.
*
* <h4>Nulls</h4>
*
* Much code here deals with null types, especially leading nulls, leading
* empty arrays, and so on. The object parser creates a parser for each
* value; a parser which "does the right thing" based on the data type.
* For example, for a Boolean, the parser recognizes {@code true},
* {@code false} and {@code null}.
* <p>
* But what happens if the first value for a field is {@code null}? We
* don't know what kind of parser to create because we don't have a schema.
* Instead, we have to create a temporary placeholder parser that will consume
* nulls, waiting for a real type to show itself. Once that type appears, the
* null parser can replace itself with the correct form. Each vector's
* "fill empties" logic will back-fill the newly created vector with nulls
* for prior rows.
* <p>
 * Two null parsers are needed: one when we see an empty list, and one for
 * when we only see {@code null}. The one for {@code null} must morph into
 * the one for empty lists if we see:<br>
* {@code {a: null} {a: [ ] }}<br>
* <p>
* If we get all the way through the batch, but have still not seen a type,
* then we have to guess. A prototype type system can tell us, otherwise we
* guess {@code VARCHAR}. ({@code VARCHAR} is the right choice for all-text
* mode, it is as good a guess as any for other cases.)
*
* <h4>Projection List Hints</h4>
*
* To help, we consult the projection list, if any, for a column. If the
* projection is of the form {@code a[0]}, we know the column had better
* be an array. Similarly, if the projection list has {@code b.c}, then
* {@code b} had better be an object.
*
* <h4>Array Handling</h4>
*
* The code here handles arrays in two ways. JSON normally uses the
* {@code LIST} type. But, that can be expensive if lists are
* well-behaved. So, the code here also implements arrays using the
* classic {@code REPEATED} types. The repeated type option is disabled
* by default. It can be enabled, for efficiency, if Drill ever supports
* a JSON schema. If an array is well-behaved, mark that column as able
* to use a repeated type.
*
* <h4>Ambiguous Types</h4>
*
* JSON nulls are untyped. A run of nulls does not tell us what type will
* eventually appear. The best solution is to provide a schema. Without a
* schema, the code is forgiving: defers selection of the column type until
* the first non-null value (or, forces a type at the end of the batch.)
* <p>
* For scalars the pattern is: <code>{a: null} {a: "foo"}</code>. Type
* selection happens on the value {@code "foo"}.
* <p>
 * For arrays, the pattern is: <code>{a: []} {a: ["foo"]}</code>. Type
 * selection happens on the first array element. Note that type selection
 * must happen on the first element, even if that element is null (which,
 * as we just said, is ambiguous.)
* <p>
 * If we are forced to pick a type (because we hit the end of a batch, or
 * we see {@code [null]}), then we pick {@code VARCHAR} as we allow any
* scalar to be converted to {@code VARCHAR}. This helps for a single-file
* query, but not if multiple fragments each make their own (inconsistent)
* decisions. Only a schema provides a consistent answer.
*/
public class TupleParser extends ObjectParser {

  /** Owning loader; supplies the field factory and listener column map. */
  private final JsonLoaderImpl loader;
  /** Writer for the tuple (row or map) that this parser populates. */
  private final TupleWriter tupleWriter;
  /** Optional schema used to choose column types; may be null. */
  private final TupleMetadata providedSchema;

  // Bootstrap case: struct parser not yet set on the JSON loader
  public TupleParser(JsonStructureParser structParser, JsonLoaderImpl loader,
      TupleWriter tupleWriter, TupleMetadata providedSchema) {
    super(structParser);
    this.loader = loader;
    this.tupleWriter = tupleWriter;
    this.providedSchema = providedSchema;
  }

  public TupleParser(JsonLoaderImpl loader, TupleWriter tupleWriter, TupleMetadata providedSchema) {
    this(loader.parser(), loader, tupleWriter, providedSchema);
  }

  public JsonLoaderImpl loader() { return loader; }

  public TupleWriter writer() { return tupleWriter; }

  protected TupleMetadata providedSchema() { return providedSchema; }

  protected FieldFactory fieldFactory() { return loader.fieldFactory(); }

  /**
   * Called when a new field appears in the JSON object. Projected fields
   * (and fields needed by column listeners) get a real parser; all others
   * get a "dummy" parser that consumes and discards the value.
   */
  @Override
  public ElementParser onField(String key, TokenIterator tokenizer) {
    return isNeeded(key)
        ? parserForField(key, tokenizer)
        : fieldFactory().ignoredFieldParser();
  }

  /**
   * A field must be read either when it is projected, or when a column
   * listener has registered interest in it.
   */
  private boolean isNeeded(String key) {
    return tupleWriter.isProjected(key)
        || (loader.listenerColumnMap() != null
            && loader.listenerColumnMap().containsKey(key));
  }

  /** Builds a value parser for the given field via the field factory. */
  private ElementParser parserForField(String key, TokenIterator tokenizer) {
    return fieldFactory().fieldParser(new FieldDefn(this, key, tokenizer));
  }

  /**
   * Replaces a placeholder (null) parser once a concrete value type has
   * been seen for the field.
   */
  public ElementParser resolveField(String key, TokenIterator tokenizer) {
    return replaceFieldParser(key, parserForField(key, tokenizer));
  }

  /**
   * Replaces a placeholder parser for a field now known to be an array.
   */
  public ElementParser resolveArray(String key, TokenIterator tokenizer) {
    FieldDefn arrayDefn = new FieldDefn(this, key, tokenizer, true);
    return replaceFieldParser(key, fieldFactory().fieldParser(arrayDefn));
  }

  /**
   * Forces a type choice for a field that has seen only nulls (e.g. at
   * the end of a batch, when we can wait no longer).
   */
  public void forceNullResolution(String key) {
    replaceFieldParser(key,
        fieldFactory().forceNullResolution(new FieldDefn(this, key, null)));
  }

  /**
   * Forces a type choice for a field that has seen only empty arrays.
   */
  public void forceEmptyArrayResolution(String key) {
    replaceFieldParser(key,
        fieldFactory().forceArrayResolution(new FieldDefn(this, key, null)));
  }
}