blob: c8eac0b31aa75baa7059d88d68c363fffad8ac81 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.loader;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.stream.FactoryConfigurationError;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.dom.DOMResult;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.sax.SAXSource;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicBoolean;
import com.google.common.collect.Lists;
import org.apache.commons.io.IOUtils;
import org.apache.solr.common.EmptyEntityResolver;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.ShardParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.params.UpdateParams;
import org.apache.solr.common.util.ContentStream;
import org.apache.solr.common.util.ContentStreamBase;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.common.util.XMLErrorLogger;
import org.apache.solr.core.SolrConfig;
import org.apache.solr.handler.RequestHandlerUtils;
import org.apache.solr.handler.UpdateRequestHandler;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.update.AddUpdateCommand;
import org.apache.solr.update.CommitUpdateCommand;
import org.apache.solr.update.DeleteUpdateCommand;
import org.apache.solr.update.RollbackUpdateCommand;
import org.apache.solr.update.processor.UpdateRequestProcessor;
import org.apache.solr.util.xslt.TransformerProvider;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.InputSource;
import org.xml.sax.XMLReader;
import static org.apache.solr.common.params.CommonParams.ID;
import static org.apache.solr.common.params.CommonParams.NAME;
public class XMLLoader extends ContentStreamLoader {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
private static final AtomicBoolean WARNED_ABOUT_INDEX_TIME_BOOSTS = new AtomicBoolean();
static final XMLErrorLogger xmllog = new XMLErrorLogger(log);
public static final String CONTEXT_TRANSFORMER_KEY = "xsltupdater.transformer";
private static final String XSLT_CACHE_PARAM = "xsltCacheLifetimeSeconds";
public static final int XSLT_CACHE_DEFAULT = 60;
int xsltCacheLifetimeSeconds;
XMLInputFactory inputFactory;
SAXParserFactory saxFactory;
@Override
public XMLLoader init(SolrParams args) {
// Init StAX parser:
inputFactory = XMLInputFactory.newInstance();
EmptyEntityResolver.configureXMLInputFactory(inputFactory);
inputFactory.setXMLReporter(xmllog);
try {
// The java 1.6 bundled stax parser (sjsxp) does not currently have a thread-safe
// XMLInputFactory, as that implementation tries to cache and reuse the
// XMLStreamReader. Setting the parser-specific "reuse-instance" property to false
// prevents this.
// All other known open-source stax parsers (and the bea ref impl)
// have thread-safe factories.
inputFactory.setProperty("reuse-instance", Boolean.FALSE);
} catch (IllegalArgumentException ex) {
// Other implementations will likely throw this exception since "reuse-instance"
// isimplementation specific.
log.debug("Unable to set the 'reuse-instance' property for the input chain: {}", inputFactory);
}
// Init SAX parser (for XSL):
saxFactory = SAXParserFactory.newInstance();
saxFactory.setNamespaceAware(true); // XSL needs this!
EmptyEntityResolver.configureSAXParserFactory(saxFactory);
xsltCacheLifetimeSeconds = XSLT_CACHE_DEFAULT;
if(args != null) {
xsltCacheLifetimeSeconds = args.getInt(XSLT_CACHE_PARAM,XSLT_CACHE_DEFAULT);
log.debug("xsltCacheLifetimeSeconds={}", xsltCacheLifetimeSeconds);
}
return this;
}
@Override
public String getDefaultWT() {
return "xml";
}
@Override
public void load(SolrQueryRequest req, SolrQueryResponse rsp, ContentStream stream, UpdateRequestProcessor processor) throws Exception {
final String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType());
InputStream is = null;
XMLStreamReader parser = null;
String tr = req.getParams().get(CommonParams.TR,null);
if(tr!=null) {
if (req.getCore().getCoreDescriptor().isConfigSetTrusted() == false) {
throw new SolrException(ErrorCode.UNAUTHORIZED, "The configset for this collection was uploaded without any authentication in place,"
+ " and this operation is not available for collections with untrusted configsets. To use this feature, re-upload the configset"
+ " after enabling authentication and authorization.");
}
final Transformer t = getTransformer(tr,req);
final DOMResult result = new DOMResult();
// first step: read XML and build DOM using Transformer (this is no overhead, as XSL always produces
// an internal result DOM tree, we just access it directly as input for StAX):
try {
is = stream.getStream();
final InputSource isrc = new InputSource(is);
isrc.setEncoding(charset);
final XMLReader xmlr = saxFactory.newSAXParser().getXMLReader();
xmlr.setErrorHandler(xmllog);
xmlr.setEntityResolver(EmptyEntityResolver.SAX_INSTANCE);
final SAXSource source = new SAXSource(xmlr, isrc);
t.transform(source, result);
} catch(TransformerException te) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, te.getMessage(), te);
} finally {
IOUtils.closeQuietly(is);
}
// second step: feed the intermediate DOM tree into StAX parser:
try {
parser = inputFactory.createXMLStreamReader(new DOMSource(result.getNode()));
this.processUpdate(req, processor, parser);
} catch (XMLStreamException e) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e.getMessage(), e);
} finally {
if (parser != null) parser.close();
}
}
// Normal XML Loader
else {
try {
is = stream.getStream();
if (log.isTraceEnabled()) {
final byte[] body = IOUtils.toByteArray(is);
// TODO: The charset may be wrong, as the real charset is later
// determined by the XML parser, the content-type is only used as a hint!
if (log.isTraceEnabled()) {
log.trace("body: {}", new String(body, (charset == null) ?
ContentStreamBase.DEFAULT_CHARSET : charset));
}
IOUtils.closeQuietly(is);
is = new ByteArrayInputStream(body);
}
parser = (charset == null) ?
inputFactory.createXMLStreamReader(is) : inputFactory.createXMLStreamReader(is, charset);
this.processUpdate(req, processor, parser);
} catch (XMLStreamException e) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e.getMessage(), e);
} finally {
if (parser != null) parser.close();
IOUtils.closeQuietly(is);
}
}
}
/** Get Transformer from request context, or from TransformerProvider.
* This allows either getContentType(...) or write(...) to instantiate the Transformer,
* depending on which one is called first, then the other one reuses the same Transformer
*/
Transformer getTransformer(String xslt, SolrQueryRequest request) throws IOException {
// not the cleanest way to achieve this
// no need to synchronize access to context, right?
// Nothing else happens with it at the same time
final Map<Object,Object> ctx = request.getContext();
Transformer result = (Transformer)ctx.get(CONTEXT_TRANSFORMER_KEY);
if(result==null) {
SolrConfig solrConfig = request.getCore().getSolrConfig();
result = TransformerProvider.instance.getTransformer(solrConfig, xslt, xsltCacheLifetimeSeconds);
result.setErrorListener(xmllog);
ctx.put(CONTEXT_TRANSFORMER_KEY,result);
}
return result;
}
/**
* @since solr 1.2
*/
void processUpdate(SolrQueryRequest req, UpdateRequestProcessor processor, XMLStreamReader parser)
throws XMLStreamException, IOException, FactoryConfigurationError {
AddUpdateCommand addCmd = null;
SolrParams params = req.getParams();
while (true) {
int event = parser.next();
switch (event) {
case XMLStreamConstants.END_DOCUMENT:
parser.close();
return;
case XMLStreamConstants.START_ELEMENT:
String currTag = parser.getLocalName();
if (currTag.equals(UpdateRequestHandler.ADD)) {
log.trace("SolrCore.update(add)");
addCmd = new AddUpdateCommand(req);
// First look for commitWithin parameter on the request, will be overwritten for individual <add>'s
addCmd.commitWithin = params.getInt(UpdateParams.COMMIT_WITHIN, -1);
addCmd.overwrite = params.getBool(UpdateParams.OVERWRITE, true);
for (int i = 0; i < parser.getAttributeCount(); i++) {
String attrName = parser.getAttributeLocalName(i);
String attrVal = parser.getAttributeValue(i);
if (UpdateRequestHandler.OVERWRITE.equals(attrName)) {
addCmd.overwrite = StrUtils.parseBoolean(attrVal);
} else if (UpdateRequestHandler.COMMIT_WITHIN.equals(attrName)) {
addCmd.commitWithin = Integer.parseInt(attrVal);
} else {
log.warn("XML element <add> has invalid XML attr: {}", attrName);
}
}
} else if ("doc".equals(currTag)) {
if(addCmd != null) {
log.trace("adding doc...");
addCmd.clear();
addCmd.solrDoc = readDoc(parser);
processor.processAdd(addCmd);
} else {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Unexpected <doc> tag without an <add> tag surrounding it.");
}
} else if (UpdateRequestHandler.COMMIT.equals(currTag) || UpdateRequestHandler.OPTIMIZE.equals(currTag)) {
log.trace("parsing {}", currTag);
CommitUpdateCommand cmd = new CommitUpdateCommand(req, UpdateRequestHandler.OPTIMIZE.equals(currTag));
ModifiableSolrParams mp = new ModifiableSolrParams();
for (int i = 0; i < parser.getAttributeCount(); i++) {
String attrName = parser.getAttributeLocalName(i);
String attrVal = parser.getAttributeValue(i);
mp.set(attrName, attrVal);
}
RequestHandlerUtils.validateCommitParams(mp);
SolrParams p = SolrParams.wrapDefaults(mp, req.getParams()); // default to the normal request params for commit options
RequestHandlerUtils.updateCommit(cmd, p);
processor.processCommit(cmd);
} // end commit
else if (UpdateRequestHandler.ROLLBACK.equals(currTag)) {
log.trace("parsing rollback");
RollbackUpdateCommand cmd = new RollbackUpdateCommand(req);
processor.processRollback(cmd);
} // end rollback
else if (UpdateRequestHandler.DELETE.equals(currTag)) {
log.trace("parsing delete");
processDelete(req, processor, parser);
} // end delete
break;
}
}
}
/**
* @since solr 1.3
*/
void processDelete(SolrQueryRequest req, UpdateRequestProcessor processor, XMLStreamReader parser) throws XMLStreamException, IOException {
// Parse the command
DeleteUpdateCommand deleteCmd = new DeleteUpdateCommand(req);
// First look for commitWithin parameter on the request, will be overwritten for individual <delete>'s
SolrParams params = req.getParams();
deleteCmd.commitWithin = params.getInt(UpdateParams.COMMIT_WITHIN, -1);
for (int i = 0; i < parser.getAttributeCount(); i++) {
String attrName = parser.getAttributeLocalName(i);
String attrVal = parser.getAttributeValue(i);
if ("fromPending".equals(attrName)) {
// deprecated
} else if ("fromCommitted".equals(attrName)) {
// deprecated
} else if (UpdateRequestHandler.COMMIT_WITHIN.equals(attrName)) {
deleteCmd.commitWithin = Integer.parseInt(attrVal);
} else {
log.warn("XML element <delete> has invalid XML attr: {}", attrName);
}
}
StringBuilder text = new StringBuilder();
while (true) {
int event = parser.next();
switch (event) {
case XMLStreamConstants.START_ELEMENT:
String mode = parser.getLocalName();
if (!(ID.equals(mode) || "query".equals(mode))) {
String msg = "XML element <delete> has invalid XML child element: " + mode;
log.warn(msg);
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
msg);
}
text.setLength(0);
if (ID.equals(mode)) {
for (int i = 0; i < parser.getAttributeCount(); i++) {
String attrName = parser.getAttributeLocalName(i);
String attrVal = parser.getAttributeValue(i);
if (UpdateRequestHandler.VERSION.equals(attrName)) {
deleteCmd.setVersion(Long.parseLong(attrVal));
}
if (ShardParams._ROUTE_.equals(attrName)) {
deleteCmd.setRoute(attrVal);
}
}
}
break;
case XMLStreamConstants.END_ELEMENT:
String currTag = parser.getLocalName();
if (ID.equals(currTag)) {
deleteCmd.setId(text.toString());
} else if ("query".equals(currTag)) {
deleteCmd.setQuery(text.toString());
} else if ("delete".equals(currTag)) {
return;
} else {
String msg = "XML element <delete> has invalid XML (closing) child element: " + currTag;
log.warn(msg);
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
msg);
}
processor.processDelete(deleteCmd);
deleteCmd.clear();
break;
// Add everything to the text
case XMLStreamConstants.SPACE:
case XMLStreamConstants.CDATA:
case XMLStreamConstants.CHARACTERS:
text.append(parser.getText());
break;
}
}
}
/**
* Given the input stream, read a document
*
* @since solr 1.3
*/
@SuppressWarnings({"unchecked"})
public SolrInputDocument readDoc(XMLStreamReader parser) throws XMLStreamException {
SolrInputDocument doc = new SolrInputDocument();
String attrName = "";
for (int i = 0; i < parser.getAttributeCount(); i++) {
attrName = parser.getAttributeLocalName(i);
if ("boost".equals(attrName)) {
String message = "Ignoring document boost: " + parser.getAttributeValue(i) + " as index-time boosts are not supported anymore";
if (WARNED_ABOUT_INDEX_TIME_BOOSTS.compareAndSet(false, true)) {
log.warn(message);
} else {
log.debug(message);
}
} else {
log.warn("XML element <doc> has invalid XML attr: {}", attrName);
}
}
StringBuilder text = new StringBuilder();
String name = null;
boolean isNull = false;
boolean isLabeledChildDoc = false;
String update = null;
Collection<SolrInputDocument> subDocs = null;
Map<String, Map<String, Object>> updateMap = null;
boolean complete = false;
while (!complete) {
int event = parser.next();
switch (event) {
// Add everything to the text
case XMLStreamConstants.SPACE:
case XMLStreamConstants.CDATA:
case XMLStreamConstants.CHARACTERS:
text.append(parser.getText());
break;
case XMLStreamConstants.END_ELEMENT:
if ("doc".equals(parser.getLocalName())) {
if (subDocs != null && !subDocs.isEmpty()) {
doc.addChildDocuments(subDocs);
subDocs = null;
}
complete = true;
break;
} else if ("field".equals(parser.getLocalName())) {
// should I warn in some text has been found too
Object v = isNull ? null : text.toString();
if (update != null) {
if (updateMap == null) updateMap = new HashMap<>();
Map<String, Object> extendedValues = updateMap.get(name);
if (extendedValues == null) {
extendedValues = new HashMap<>(1);
updateMap.put(name, extendedValues);
}
Object val = extendedValues.get(update);
if (val == null) {
extendedValues.put(update, v);
} else {
// multiple val are present
if (val instanceof List) {
@SuppressWarnings({"rawtypes"})
List list = (List) val;
list.add(v);
} else {
List<Object> values = new ArrayList<>();
values.add(val);
values.add(v);
extendedValues.put(update, values);
}
}
break;
}
if(!isLabeledChildDoc){
// only add data if this is not a childDoc, since it was added already
doc.addField(name, v);
} else {
// reset so next field is not treated as child doc
isLabeledChildDoc = false;
}
// field is over
name = null;
}
break;
case XMLStreamConstants.START_ELEMENT:
text.setLength(0);
String localName = parser.getLocalName();
if ("doc".equals(localName)) {
if(name != null) {
// flag to prevent spaces after doc from being added
isLabeledChildDoc = true;
if(!doc.containsKey(name)) {
doc.setField(name, Lists.newArrayList());
}
doc.addField(name, readDoc(parser));
break;
}
if (subDocs == null)
subDocs = Lists.newArrayList();
subDocs.add(readDoc(parser));
}
else {
if (!"field".equals(localName)) {
String msg = "XML element <doc> has invalid XML child element: " + localName;
log.warn(msg);
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
msg);
}
update = null;
isNull = false;
String attrVal = "";
for (int i = 0; i < parser.getAttributeCount(); i++) {
attrName = parser.getAttributeLocalName(i);
attrVal = parser.getAttributeValue(i);
if (NAME.equals(attrName)) {
name = attrVal;
} else if ("boost".equals(attrName)) {
String message = "Ignoring field boost: " + attrVal + " as index-time boosts are not supported anymore";
if (WARNED_ABOUT_INDEX_TIME_BOOSTS.compareAndSet(false, true)) {
log.warn(message);
} else {
log.debug(message);
}
} else if ("null".equals(attrName)) {
isNull = StrUtils.parseBoolean(attrVal);
} else if ("update".equals(attrName)) {
update = attrVal;
} else {
log.warn("XML element <field> has invalid XML attr: {}", attrName);
}
}
}
break;
}
}
if (updateMap != null) {
for (Map.Entry<String, Map<String, Object>> entry : updateMap.entrySet()) {
name = entry.getKey();
Map<String, Object> value = entry.getValue();
doc.addField(name, value);
}
}
return doc;
}
}