blob: 9e21f5399f7bca94b5624a6782ec4b09231cb930 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.vxquery.xmlparser;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hyracks.api.comm.IFrameFieldAppender;
import org.apache.hyracks.api.comm.IFrameWriter;
import org.apache.hyracks.api.exceptions.HyracksDataException;
import org.apache.hyracks.data.std.util.ArrayBackedValueStorage;
import org.apache.hyracks.data.std.util.GrowableArray;
import org.apache.hyracks.data.std.util.UTF8StringBuilder;
import org.apache.hyracks.dataflow.common.comm.util.FrameUtils;
import org.apache.hyracks.util.string.UTF8StringUtil;
import org.apache.vxquery.datamodel.accessors.TaggedValuePointable;
import org.apache.vxquery.datamodel.accessors.nodes.NodeTreePointable;
import org.apache.vxquery.datamodel.builders.nodes.AbstractNodeBuilder;
import org.apache.vxquery.datamodel.builders.nodes.AttributeNodeBuilder;
import org.apache.vxquery.datamodel.builders.nodes.CommentNodeBuilder;
import org.apache.vxquery.datamodel.builders.nodes.DictionaryBuilder;
import org.apache.vxquery.datamodel.builders.nodes.DocumentNodeBuilder;
import org.apache.vxquery.datamodel.builders.nodes.ElementNodeBuilder;
import org.apache.vxquery.datamodel.builders.nodes.PINodeBuilder;
import org.apache.vxquery.datamodel.builders.nodes.TextNodeBuilder;
import org.apache.vxquery.datamodel.values.ValueTag;
import org.apache.vxquery.runtime.functions.util.FunctionHelper;
import org.apache.vxquery.types.BuiltinTypeQNames;
import org.apache.vxquery.types.ElementType;
import org.apache.vxquery.types.NameTest;
import org.apache.vxquery.types.NodeType;
import org.apache.vxquery.types.SequenceType;
import org.apache.vxquery.xmlquery.query.XQueryConstants;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.ext.LexicalHandler;
public class SAXContentHandler implements ContentHandler, LexicalHandler {
private static final int STRING_EXPECTED_LENGTH = 300;
// XML node builders
protected final AttributeNodeBuilder anb;
protected final CommentNodeBuilder cnb;
protected final DictionaryBuilder db;
protected final DocumentNodeBuilder docb;
protected final PINodeBuilder pinb;
protected final TextNodeBuilder tnb;
protected final UTF8StringBuilder utf8b;
private final UTF8StringBuilder utf8bInternal;
protected final List<ElementNodeBuilder> enbStack;
protected final List<ElementNodeBuilder> freeENBList;
protected boolean isIndexHandler;
// Frame writing variables
protected IFrameFieldAppender appender;
private int tupleIndex;
private IFrameWriter writer;
// Element writing and path step variables
protected boolean skipping;
private String[] childLocalName = null;
private String[] childUri = null;
private boolean[] subElement = null;
private final TaggedValuePointable tvp;
// Basic tracking and setting variables
protected final boolean attachTypes;
protected final boolean createNodeIds;
private int depth;
protected final ArrayBackedValueStorage resultABVS;
protected boolean pendingText;
protected int nodeIdCounter;
protected final ITreeNodeIdProvider nodeIdProvider;
protected final ArrayBackedValueStorage tempABVS;
private final GrowableArray textGA;
private final GrowableArray textGAInternal;
/**
 * Creates a handler with no frame appender attached.
 * <p>
 * Node ids are generated only when {@code nodeIdProvider} is non-null. An index handler never skips
 * content; any other handler starts out in skipping mode.
 *
 * @param attachTypes whether type names are attached to element and attribute nodes
 * @param nodeIdProvider source of the tree id written with each node tree; may be {@code null}
 * @param isIndexHandler {@code true} when the handler is used without child path steps or an appender
 */
public SAXContentHandler(boolean attachTypes, ITreeNodeIdProvider nodeIdProvider, boolean isIndexHandler) {
    // Settings supplied by the caller.
    this.attachTypes = attachTypes;
    this.nodeIdProvider = nodeIdProvider;
    this.createNodeIds = (nodeIdProvider != null);
    this.isIndexHandler = isIndexHandler;

    // Reusable XML node builders.
    this.anb = new AttributeNodeBuilder();
    this.cnb = new CommentNodeBuilder();
    this.db = new DictionaryBuilder();
    this.docb = new DocumentNodeBuilder();
    this.pinb = new PINodeBuilder();
    this.tnb = new TextNodeBuilder();
    this.utf8b = new UTF8StringBuilder();
    this.utf8bInternal = new UTF8StringBuilder();
    this.enbStack = new ArrayList<>();
    this.freeENBList = new ArrayList<>();

    // Scratch storage and pointables.
    this.tvp = (TaggedValuePointable) TaggedValuePointable.FACTORY.createPointable();
    this.resultABVS = new ArrayBackedValueStorage();
    this.tempABVS = new ArrayBackedValueStorage();
    this.textGA = new GrowableArray();
    this.textGAInternal = new GrowableArray();

    // Parse state.
    this.depth = 0;
    this.pendingText = false;
    this.nodeIdCounter = 0;
    this.skipping = !isIndexHandler;
    if (isIndexHandler) {
        this.appender = null;
    }
}
/**
 * Creates a non-index handler that writes matching elements through {@code appender}.
 *
 * @param attachTypes whether type names are attached to element and attribute nodes
 * @param nodeIdProvider source of the tree id written with each node tree; may be {@code null}
 * @param appender appender used when a node tree is written out
 * @param childSequenceTypes child path steps, one per nesting level; an empty list disables filtering
 */
public SAXContentHandler(boolean attachTypes, ITreeNodeIdProvider nodeIdProvider, IFrameFieldAppender appender,
        List<SequenceType> childSequenceTypes) {
    this(attachTypes, nodeIdProvider, false);
    setChildPathSteps(childSequenceTypes);
    this.appender = appender;
}
/**
 * Records the URI and local name of each child path step.
 * <p>
 * When {@code childSeq} is empty the step arrays stay {@code null}, which the rest of the class treats
 * as "no path filtering". Every item type is cast to {@link ElementType}.
 *
 * @param childSeq the child path steps in document-depth order
 */
private void setChildPathSteps(List<SequenceType> childSeq) {
    final int stepCount = childSeq.size();
    if (stepCount > 0) {
        subElement = new boolean[stepCount];
        childUri = new String[stepCount];
        childLocalName = new String[stepCount];
    }
    int step = 0;
    for (SequenceType stepType : childSeq) {
        NameTest test = ((ElementType) (NodeType) stepType.getItemType()).getNameTest();
        childUri[step] = getStringFromBytes(test.getUri());
        childLocalName[step] = getStringFromBytes(test.getLocalName());
        step++;
    }
}
/**
 * Stores the frame writer and tuple index used when a node tree is written out.
 *
 * @param writer frame writer that receives appended fields
 * @param tupleIndex tuple index passed along when a node is added to the output
 */
public void setupElementWriter(IFrameWriter writer, int tupleIndex) {
    this.tupleIndex = tupleIndex;
    this.writer = writer;
}
/**
 * Buffers character data for the text node currently being assembled.
 * <p>
 * The characters are appended to the shared UTF-8 builder and only become a text node on the next
 * {@link #flushText()} call. Nothing is buffered while the handler is skipping.
 *
 * @param ch array holding the characters
 * @param start index of the first character to read
 * @param length number of characters to read
 * @throws SAXException wrapping any {@link IOException} raised while appending
 */
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
    if (skipping) {
        return;
    }
    try {
        appendCharArray(ch, start, length);
    } catch (IOException e) {
        // The cause travels with the SAXException; the caller decides how to report it.
        throw new SAXException(e);
    }
    pendingText = true;
}
/**
 * Finishes the document node and, when an appender is attached, writes the node tree out.
 * <p>
 * Does nothing while the handler is skipping.
 *
 * @throws SAXException wrapping any {@link IOException} raised while finishing or writing
 */
@Override
public void endDocument() throws SAXException {
    if (skipping) {
        return;
    }
    try {
        flushText();
        docb.endChildrenChunk();
        docb.finish();
        if (appender != null) {
            writeElement();
        }
    } catch (IOException e) {
        // The cause travels with the SAXException; the caller decides how to report it.
        throw new SAXException(e);
    }
}
/**
 * Clears the match flag of the path step at the current depth, if path steps are configured and the
 * current depth falls within them.
 */
private void endElementChildPathStep() throws IOException {
    if (subElement == null || depth > subElement.length) {
        return;
    }
    subElement[depth - 1] = false;
}
/**
 * Closes the element on top of the builder stack.
 * <p>
 * While skipping, only the depth counter is decremented. Otherwise pending text is flushed, the
 * element builder is finished and returned to the free list, and, for non-index handlers, the
 * element is written out if it is the one selected by the child path steps.
 *
 * @param uri namespace URI of the element (unused here)
 * @param localName local name of the element (unused here)
 * @param name qualified name of the element (unused here)
 * @throws SAXException wrapping any {@link IOException} raised while finishing or writing
 */
@Override
public void endElement(String uri, String localName, String name) throws SAXException {
    if (skipping) {
        --depth;
        return;
    }
    try {
        // Must be evaluated before the path-step flag for this depth is cleared below.
        boolean nonSkipped = false;
        if (!isIndexHandler) {
            nonSkipped = foundFirstNonSkippedElement();
        }
        flushText();
        ElementNodeBuilder enb = enbStack.remove(enbStack.size() - 1);
        enb.endChildrenChunk();
        endChildInParent(enb, nonSkipped);
        freeENB(enb);
        if (!isIndexHandler) {
            if (nonSkipped) {
                writeElement();
            }
            endElementChildPathStep();
        }
    } catch (IOException e) {
        // The cause travels with the SAXException; the caller decides how to report it.
        throw new SAXException(e);
    }
    --depth;
}
/**
 * No-op: this handler does not track namespace prefix scopes.
 */
@Override
public void endPrefixMapping(String prefix) throws SAXException {
}
/**
 * No-op: ignorable whitespace reported through this callback is discarded and produces no node.
 */
@Override
public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
}
/**
 * Adds a processing-instruction node to the current parent.
 * <p>
 * Does nothing while the handler is skipping. Pending text is flushed first.
 *
 * @param target the processing instruction target
 * @param data the processing instruction data
 * @throws SAXException wrapping any {@link IOException} raised while building the node
 */
@Override
public void processingInstruction(String target, String data) throws SAXException {
    if (skipping) {
        return;
    }
    try {
        flushText();
        startChildInParent(pinb);
        if (createNodeIds) {
            pinb.setLocalNodeId(nodeIdCounter++);
        }
        // Both conversions reuse the same internal buffer, so each value is handed over before the next.
        pinb.setTarget(stringToGrowableArray(target));
        pinb.setContent(stringToGrowableArray(data));
        endChildInParent(pinb);
    } catch (IOException e) {
        // The cause travels with the SAXException; the caller decides how to report it.
        throw new SAXException(e);
    }
}
/**
 * Converts {@code value} into the handler's internal growable array.
 * <p>
 * The returned array is the same shared instance on every call, so its content is overwritten by
 * the next conversion.
 *
 * @param value the string to convert
 * @return the shared internal array holding the converted value
 * @throws IOException if the conversion fails
 */
private GrowableArray stringToGrowableArray(String value) throws IOException {
    final GrowableArray target = textGAInternal;
    FunctionHelper.stringToGrowableArray(value, target, utf8bInternal, STRING_EXPECTED_LENGTH);
    return target;
}
/**
 * No-op: the locator is not retained by this handler.
 */
@Override
public void setDocumentLocator(Locator locator) {
}
/**
 * No-op: skipped entities are not represented in the tree built by this handler.
 */
@Override
public void skippedEntity(String name) throws SAXException {
}
/**
 * Resets per-document state and, unless the handler is skipping, opens the document node.
 * <p>
 * Skipping is switched off for index handlers and for handlers without child path steps. The
 * dictionary and the text buffer are reset in every case.
 *
 * @throws SAXException wrapping any {@link IOException} raised while resetting or building
 */
@Override
public void startDocument() throws SAXException {
    if (isIndexHandler || subElement == null) {
        skipping = false;
    }
    db.reset();
    try {
        textGA.reset();
        utf8b.reset(textGA, STRING_EXPECTED_LENGTH);
    } catch (IOException e) {
        throw new SAXException(e);
    }
    if (skipping) {
        return;
    }
    try {
        resultABVS.reset();
        docb.reset(resultABVS);
        if (createNodeIds) {
            docb.setLocalNodeId(nodeIdCounter++);
        }
        docb.startChildrenChunk();
        flushText();
    } catch (IOException e) {
        // Same handling as the reset above: the cause travels with the SAXException.
        throw new SAXException(e);
    }
}
/**
 * Updates the path-step match flag for the current depth and reports whether the element being
 * opened is the one selected by the complete child path.
 * <p>
 * A step matches unless a non-null element URI or local name differs from a non-null value
 * configured for that step. When the complete path matches, skipping is switched off. The
 * filter rules are similar to those of
 * {@code org.apache.vxquery.runtime.functions.step.NodeTestFilter}.
 *
 * @param uri namespace URI of the element, possibly {@code null}
 * @param localName local name of the element, possibly {@code null}
 * @return {@code true} if this element is the first element that satisfies every path step
 * @throws SAXException declared for callers; not thrown by the current implementation
 */
private boolean startElementChildPathStep(String uri, String localName) throws SAXException {
    if (subElement != null && depth <= subElement.length) {
        final int step = depth - 1;
        boolean matches = true;
        if (uri != null && childUri[step] != null && !uri.equals(childUri[step])) {
            matches = false;
        }
        if (localName != null && childLocalName[step] != null && !localName.equals(childLocalName[step])) {
            matches = false;
        }
        subElement[step] = matches;
    }
    final boolean selected = foundFirstNonSkippedElement();
    if (selected) {
        skipping = false;
    }
    return selected;
}
/**
 * Opens an element node, including all of its attributes, and pushes its builder on the stack.
 * <p>
 * The depth counter is always incremented. For non-index handlers the child path steps are
 * evaluated first; if the handler is still skipping afterwards nothing else happens. When this
 * element is the one selected by the path steps it becomes the root of a new result buffer,
 * otherwise it is added as a child of the current parent.
 *
 * @param uri namespace URI of the element
 * @param localName local name of the element
 * @param name qualified name of the element; the part before {@code ':'} is used as the prefix
 * @param atts attributes of the element
 * @throws SAXException wrapping any {@link IOException} raised while building the node
 */
@Override
public void startElement(String uri, String localName, String name, Attributes atts) throws SAXException {
    ++depth;
    boolean start = false;
    if (!isIndexHandler) {
        start = startElementChildPathStep(uri, localName);
    }
    if (skipping) {
        return;
    }
    try {
        flushText();
        int idx = name.indexOf(':');
        String prefix = idx < 0 ? "" : name.substring(0, idx);
        ElementNodeBuilder enb = createENB();
        startChildInParent(enb, start);
        // The order of the dictionary lookups is kept as-is on purpose.
        int uriCode = db.lookup(uri);
        int localNameCode = db.lookup(localName);
        int prefixCode = db.lookup(prefix);
        enb.setName(uriCode, localNameCode, prefixCode);
        if (attachTypes) {
            int typeUriCode = db.lookup(XQueryConstants.XS_NSURI);
            int typeLocalNameCode = db.lookup(BuiltinTypeQNames.UNTYPED_STR);
            int typePrefixCode = db.lookup(XQueryConstants.XS_PREFIX);
            enb.setType(typeUriCode, typeLocalNameCode, typePrefixCode);
        }
        if (createNodeIds) {
            enb.setLocalNodeId(nodeIdCounter++);
        }
        enb.startAttributeChunk();
        final int nAttrs = atts.getLength();
        for (int i = 0; i < nAttrs; ++i) {
            String aName = atts.getQName(i);
            int aIdx = aName.indexOf(':');
            int aPrefixCode = db.lookup(aIdx < 0 ? "" : aName.substring(0, aIdx));
            int aLocalNameCode = db.lookup(atts.getLocalName(i));
            int aUriCode = db.lookup(atts.getURI(i));
            String aValue = atts.getValue(i);
            // Attribute value = untyped-atomic tag followed by the encoded string.
            tempABVS.reset();
            DataOutput tempOut = tempABVS.getDataOutput();
            tempOut.write(ValueTag.XS_UNTYPED_ATOMIC_TAG);
            GrowableArray encodedValue = stringToGrowableArray(aValue);
            tempOut.write(encodedValue.getByteArray(), 0, encodedValue.getLength());
            enb.startAttribute(anb);
            anb.setName(aUriCode, aLocalNameCode, aPrefixCode);
            if (attachTypes) {
                int typeUriCode = db.lookup(XQueryConstants.XS_NSURI);
                int typeLocalNameCode = db.lookup(BuiltinTypeQNames.UNTYPED_ATOMIC_STR);
                int typePrefixCode = db.lookup(XQueryConstants.XS_PREFIX);
                anb.setType(typeUriCode, typeLocalNameCode, typePrefixCode);
            }
            if (createNodeIds) {
                anb.setLocalNodeId(nodeIdCounter++);
            }
            anb.setValue(tempABVS);
            enb.endAttribute(anb);
        }
        enb.endAttributeChunk();
        enb.startChildrenChunk();
        enbStack.add(enb);
    } catch (IOException e) {
        // The cause travels with the SAXException; the caller decides how to report it.
        throw new SAXException(e);
    }
}
/**
 * No-op: this handler does not track namespace prefix scopes.
 */
@Override
public void startPrefixMapping(String prefix, String uri) throws SAXException {
}
/**
 * Adds a comment node to the current parent.
 * <p>
 * Does nothing while the handler is skipping. Pending text is flushed first so the shared text
 * buffer can be reused for the comment value. The buffer is reset again afterwards.
 *
 * @param ch array holding the comment characters
 * @param start index of the first character to read
 * @param length number of characters to read
 * @throws SAXException wrapping any {@link IOException} raised while building the node
 */
@Override
public void comment(char[] ch, int start, int length) throws SAXException {
    if (skipping) {
        return;
    }
    try {
        flushText();
        startChildInParent(cnb);
        if (createNodeIds) {
            cnb.setLocalNodeId(nodeIdCounter++);
        }
        appendCharArray(ch, start, length);
        utf8b.finish();
        cnb.setValue(textGA);
        endChildInParent(cnb);
        textGA.reset();
        utf8b.reset(textGA, STRING_EXPECTED_LENGTH);
    } catch (IOException e) {
        // The cause travels with the SAXException; the caller decides how to report it.
        throw new SAXException(e);
    }
}
/**
 * Appends {@code length} characters of {@code ch}, beginning at {@code start}, to the shared
 * UTF-8 string builder.
 *
 * @param ch source characters
 * @param start index of the first character to append
 * @param length number of characters to append
 * @throws IOException if the builder fails to append
 */
private void appendCharArray(char[] ch, int start, int length) throws IOException {
    int pos = start;
    int remaining = length;
    while (remaining > 0) {
        utf8b.appendChar(ch[pos]);
        ++pos;
        --remaining;
    }
}
/**
 * Turns buffered character data, if any, into a text node under the element on top of the stack
 * and resets the text buffer.
 *
 * @throws IOException if building the text node fails
 */
protected void flushText() throws IOException {
    if (!pendingText) {
        return;
    }
    peekENBStackTop().startChild(tnb);
    if (createNodeIds) {
        tnb.setLocalNodeId(nodeIdCounter++);
    }
    utf8b.finish();
    tnb.setValue(textGA);
    peekENBStackTop().endChild(tnb);

    // Make the shared buffer ready for the next run of characters.
    textGA.reset();
    utf8b.reset(textGA, STRING_EXPECTED_LENGTH);
    pendingText = false;
}
/**
 * No-op: CDATA section boundaries are not recorded by this handler.
 */
@Override
public void endCDATA() throws SAXException {
}
/**
 * No-op: DTD boundaries are not recorded by this handler.
 */
@Override
public void endDTD() throws SAXException {
}
/**
 * No-op: entity boundaries are not recorded by this handler.
 */
@Override
public void endEntity(String name) throws SAXException {
}
/**
 * No-op: CDATA section boundaries are not recorded by this handler.
 */
@Override
public void startCDATA() throws SAXException {
}
/**
 * No-op: DTD boundaries are not recorded by this handler.
 */
@Override
public void startDTD(String name, String publicId, String systemId) throws SAXException {
}
/**
 * No-op: entity boundaries are not recorded by this handler.
 */
@Override
public void startEntity(String name) throws SAXException {
}
/**
 * Serializes the current result as a node tree and appends it to the output tuple.
 * <p>
 * The node tree is assembled in the temporary storage, handed to the frame appender, and the
 * handler is put back into skipping mode afterwards.
 *
 * @throws IOException if serialization or appending fails
 */
public void writeElement() throws IOException {
    tempABVS.reset();
    writeNodeTree(tempABVS);
    tvp.set(tempABVS.getByteArray(), tempABVS.getStartOffset(), tempABVS.getLength());
    addNodeToTuple(tvp, tupleIndex);
    skipping = true;
}

/**
 * Serializes the current result as a node tree into {@code abvs}.
 * <p>
 * The storage is not reset first; the node tree is written at its current position.
 *
 * @param abvs storage that receives the serialized node tree
 * @throws IOException if serialization fails
 */
public void writeDocument(ArrayBackedValueStorage abvs) throws IOException {
    writeNodeTree(abvs);
}

/**
 * Writes the node-tree tag, the header byte, the optional tree id, the dictionary and the content
 * of the result buffer to {@code abvs}, in that order.
 */
private void writeNodeTree(ArrayBackedValueStorage abvs) throws IOException {
    DataOutput out = abvs.getDataOutput();
    out.write(ValueTag.NODE_TREE_TAG);
    // A dictionary is always present; the type and node-id flags depend on the handler settings.
    byte header = NodeTreePointable.HEADER_DICTIONARY_EXISTS_MASK;
    if (attachTypes) {
        header |= NodeTreePointable.HEADER_TYPE_EXISTS_MASK;
    }
    if (createNodeIds) {
        header |= NodeTreePointable.HEADER_NODEID_EXISTS_MASK;
    }
    out.write(header);
    if (createNodeIds) {
        out.writeInt(nodeIdProvider.getId());
    }
    db.writeFromCache(abvs);
    out.write(resultABVS.getByteArray(), resultABVS.getStartOffset(), resultABVS.getLength());
}
/**
 * Supplies an element builder, reusing the most recently freed one when available.
 *
 * @return a builder taken from the free list, or a new one if the free list is empty
 */
protected ElementNodeBuilder createENB() {
    final int last = freeENBList.size() - 1;
    return last < 0 ? new ElementNodeBuilder() : freeENBList.remove(last);
}
/**
 * Returns {@code enb} to the free list so that {@link #createENB()} can hand it out again.
 */
private void freeENB(ElementNodeBuilder enb) {
freeENBList.add(enb);
}
/**
 * Looks at the builder of the innermost open element without removing it.
 *
 * @return the element builder on top of the stack
 */
protected ElementNodeBuilder peekENBStackTop() {
    final int top = enbStack.size() - 1;
    return enbStack.get(top);
}
/**
 * Starts {@code anb} as a child of the current parent by delegating to the two-argument overload
 * with {@code startNewElement} set to {@code false}.
 */
private void startChildInParent(AbstractNodeBuilder anb) throws IOException {
startChildInParent(anb, false);
}
/**
 * Starts a node either as the root of a fresh result buffer or as a child of the current parent.
 *
 * @param anb builder of the node being started
 * @param startNewElement {@code true} to reset the result buffer and make {@code anb} its root;
 *            otherwise the node is started under the innermost open element, or under the
 *            document node when no element is open
 * @throws IOException if the underlying builder fails
 */
private void startChildInParent(AbstractNodeBuilder anb, boolean startNewElement) throws IOException {
    if (startNewElement) {
        resultABVS.reset();
        anb.reset(resultABVS);
        return;
    }
    if (enbStack.isEmpty()) {
        docb.startChild(anb);
        return;
    }
    peekENBStackTop().startChild(anb);
}
/**
 * Ends {@code anb} as a child of the current parent by delegating to the two-argument overload
 * with {@code endNewElement} set to {@code false}.
 */
private void endChildInParent(AbstractNodeBuilder anb) throws IOException {
endChildInParent(anb, false);
}
/**
 * Ends a node either as the root of the result buffer or as a child of the current parent.
 *
 * @param anb builder of the node being ended
 * @param endNewElement {@code true} to finish {@code anb} as a root node; otherwise the node is
 *            ended under the innermost open element, or under the document node when no element
 *            is open
 * @throws IOException if the underlying builder fails
 */
private void endChildInParent(AbstractNodeBuilder anb, boolean endNewElement) throws IOException {
    if (endNewElement) {
        anb.finish();
        return;
    }
    if (enbStack.isEmpty()) {
        docb.endChild(anb);
        return;
    }
    peekENBStackTop().endChild(anb);
}
/**
 * Appends the bytes referenced by {@code result} as a field through the frame appender and writer.
 *
 * @param result pointable referencing the serialized node
 * @param t tuple index; currently not used by this method
 * @throws HyracksDataException if the field cannot be appended
 */
private void addNodeToTuple(TaggedValuePointable result, int t) throws HyracksDataException {
    final byte[] bytes = result.getByteArray();
    final int offset = result.getStartOffset();
    final int length = result.getLength();
    // Appending and any frame flushing are handled by FrameUtils.
    FrameUtils.appendFieldToWriter(writer, appender, bytes, offset, length);
}
//
// private boolean addNodeToTupleAppender(TaggedValuePointable result, int t) throws HyracksDataException {
// // First copy all new fields over.
// if (fta.getFieldCount() > 0) {
// for (int f = 0; f < fta.getFieldCount(); ++f) {
// if (!appender.appendField(fta, t, f)) {
// return false;
// }
// }
// }
// return appender.appendField(result.getByteArray(), result.getStartOffset(), result.getLength());
// }
/**
 * Decodes {@code bytes} into a Java string with {@code UTF8StringUtil.toString}, reading from
 * offset 0.
 *
 * @param bytes encoded string, or {@code null}
 * @return the decoded string, or {@code null} when {@code bytes} is {@code null}
 */
private String getStringFromBytes(byte[] bytes) {
    if (bytes != null) {
        final StringBuilder decoded = new StringBuilder();
        UTF8StringUtil.toString(decoded, bytes, 0);
        return decoded.toString();
    }
    return null;
}
/**
 * Determines if the correct path step is active.
 *
 * @return {@code true} only when path steps are configured, the current depth equals the number
 *         of steps, and every step is currently flagged as matched
 */
private boolean foundFirstNonSkippedElement() {
    if (subElement != null && subElement.length == depth) {
        // At the right depth: every step along the path has to be matched.
        int step = 0;
        while (step < subElement.length && subElement[step]) {
            ++step;
        }
        return step == subElement.length;
    }
    return false;
}
}