// blob: 92cca70719fcc3ee0d264e2f900c81816958243d [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.designer;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
import java.lang.invoke.MethodHandles;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.ContentStream;
import org.apache.solr.common.util.ContentStreamBase;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.handler.loader.CSVLoaderBase;
import org.apache.solr.handler.loader.JsonLoader;
import org.apache.solr.handler.loader.XMLLoader;
import org.apache.solr.request.SolrQueryRequestBase;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.update.processor.UpdateRequestProcessor;
import org.apache.solr.util.SafeXMLParsing;
import org.noggit.JSONParser;
import org.noggit.ObjectBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.xml.sax.SAXException;
import static org.apache.solr.common.params.CommonParams.JSON_MIME;
import static org.apache.solr.handler.loader.CSVLoaderBase.SEPARATOR;
/**
 * Default {@link SampleDocumentsLoader} implementation: reads an uploaded sample into memory and
 * parses it into {@link SolrInputDocument}s, picking a JSON, JSON-lines, XML, or CSV parser based on
 * the content type of the upload.
 */
public class DefaultSampleDocumentsLoader implements SampleDocumentsLoader {
/** Name of the request parameter whose value is used to split a single-valued CSV field into multiple values. */
public static final String CSV_MULTI_VALUE_DELIM_PARAM = "csvMultiValueDelimiter";
/** Largest sample upload accepted, in bytes (5 * 1024 * 1024). */
private static final int MAX_STREAM_SIZE = (5 * 1024 * 1024);
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
/**
 * Drains the given stream into a byte array.
 *
 * <p>The stream is always closed before this method returns, including when a read fails.
 *
 * @param in the stream to consume; closed by this method
 * @return every byte read from {@code in}, in order (empty if the stream had no data)
 * @throws IOException if reading from or closing the stream fails
 */
public static byte[] streamAsBytes(final InputStream in) throws IOException {
  final ByteArrayOutputStream collected = new ByteArrayOutputStream();
  final byte[] chunk = new byte[1024];
  try {
    for (int count = in.read(chunk); count != -1; count = in.read(chunk)) {
      collected.write(chunk, 0, count);
    }
  } finally {
    in.close();
  }
  return collected.toByteArray();
}
/**
 * Parses sample documents out of an uploaded content stream, choosing the parser from the stream's
 * content type (JSON, XML, CSV, or JSON lines for plain text / octet-stream uploads).
 *
 * <p>Uploads larger than {@code MAX_STREAM_SIZE} are rejected with a BAD_REQUEST. The limit is
 * enforced while reading, so a stream that does not report its size is never buffered beyond the
 * limit.
 *
 * @param params request parameters handed to the individual parsers; {@code null} is replaced by
 *     empty params
 * @param stream the uploaded sample
 * @param maxDocsToLoad maximum number of documents to return; values {@code <= 0} disable the cap
 * @return the parsed documents together with the content type and source label, or {@link
 *     SampleDocuments#NONE} when the stream has no content type
 * @throws IOException if the stream cannot be read or parsed
 * @throws SolrException (BAD_REQUEST) if the sample is too big or the content type is unsupported
 */
@Override
public SampleDocuments parseDocsFromStream(SolrParams params, ContentStream stream, final int maxDocsToLoad) throws IOException {
  final String contentType = stream.getContentType();
  if (contentType == null) {
    return SampleDocuments.NONE;
  }

  if (params == null) {
    params = new ModifiableSolrParams();
  }

  // getSize() may legitimately be null (unknown size); only reject up front when it is known
  final Long streamSize = stream.getSize();
  if (streamSize != null && streamSize > MAX_STREAM_SIZE) {
    throw sampleTooBig();
  }

  String fileSource = "paste";
  if ("file".equals(stream.getName())) {
    fileSource = stream.getSourceInfo() != null ? stream.getSourceInfo() : "file";
  }

  // bounded read: covers streams that returned null for getSize without buffering them in full
  final byte[] uploadedBytes = readSampleBytes(stream.getStream());

  // use a byte stream for the parsers in case they need to re-parse using a different strategy
  // e.g. JSON vs. JSON lines or different CSV strategies ...
  ContentStreamBase.ByteArrayStream byteStream = new ContentStreamBase.ByteArrayStream(uploadedBytes, fileSource, contentType);
  String charset = ContentStreamBase.getCharsetFromContentType(contentType);
  if (charset == null) {
    charset = ContentStreamBase.DEFAULT_CHARSET;
  }

  List<SolrInputDocument> docs = null;
  // decide on the bytes actually read; stream.getSize() can be null and must not be unboxed here
  if (uploadedBytes.length > 0) {
    if (contentType.contains(JSON_MIME)) {
      docs = loadJsonDocs(params, byteStream, maxDocsToLoad);
    } else if (contentType.contains("text/xml") || contentType.contains("application/xml")) {
      docs = loadXmlDocs(params, byteStream, maxDocsToLoad);
    } else if (contentType.contains("text/csv") || contentType.contains("application/csv")) {
      docs = loadCsvDocs(params, fileSource, uploadedBytes, charset, maxDocsToLoad);
    } else if (contentType.contains("text/plain") || contentType.contains("application/octet-stream")) {
      docs = loadJsonLines(params, byteStream, maxDocsToLoad);
    } else {
      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, contentType + " not supported yet!");
    }

    if (docs != null && maxDocsToLoad > 0 && docs.size() > maxDocsToLoad) {
      docs = docs.subList(0, maxDocsToLoad);
    }
  }

  return new SampleDocuments(docs, contentType, fileSource);
}

/**
 * Reads the stream into memory, aborting with a BAD_REQUEST as soon as more than
 * {@code MAX_STREAM_SIZE} bytes have been read. The stream is always closed.
 */
private static byte[] readSampleBytes(final InputStream in) throws IOException {
  final ByteArrayOutputStream baos = new ByteArrayOutputStream();
  final byte[] buf = new byte[8192];
  try {
    int r;
    while ((r = in.read(buf)) != -1) {
      baos.write(buf, 0, r);
      if (baos.size() > MAX_STREAM_SIZE) {
        throw sampleTooBig();
      }
    }
  } finally {
    in.close();
  }
  return baos.toByteArray();
}

/** Builds the exception reported when an upload exceeds {@code MAX_STREAM_SIZE}. */
private static SolrException sampleTooBig() {
  return new SolrException(SolrException.ErrorCode.BAD_REQUEST,
      "Sample is too big! " + MAX_STREAM_SIZE + " bytes is the max upload size for sample documents.");
}
/**
 * Parses CSV (or TSV) bytes into sample documents.
 *
 * <p>When the request does not specify a separator, the bytes are decoded with {@code charset} and
 * the separator is guessed via {@link #detectTSV(String)}; otherwise the raw bytes are handed to the
 * CSV loader unchanged.
 *
 * @param params request parameters for the CSV loader
 * @param source source label attached to the byte stream
 * @param streamBytes the uploaded CSV content
 * @param charset charset used to decode {@code streamBytes} when the separator must be detected
 * @param maxDocsToLoad cap passed to the CSV loader
 * @return the documents collected by the CSV loader
 * @throws IOException if decoding or parsing fails
 */
protected List<SolrInputDocument> loadCsvDocs(SolrParams params, String source, byte[] streamBytes, String charset, final int maxDocsToLoad) throws IOException {
  final boolean separatorProvided = params.get(SEPARATOR) != null;
  if (separatorProvided) {
    final ContentStream rawStream = new ContentStreamBase.ByteArrayStream(streamBytes, source, "text/csv");
    final SampleCSVLoader rawLoader = new SampleCSVLoader(new CSVRequest(params), maxDocsToLoad);
    return rawLoader.loadDocs(rawStream);
  }

  // no explicit separator: sniff comma vs. tab from the decoded text
  final String csvText = new String(streamBytes, charset);
  final char detected = detectTSV(csvText);
  final ModifiableSolrParams withSeparator = new ModifiableSolrParams(params);
  withSeparator.set(SEPARATOR, String.valueOf(detected));
  final ContentStream textStream = new ContentStreamBase.StringStream(csvText, "text/csv");
  final SampleCSVLoader textLoader = new SampleCSVLoader(new CSVRequest(withSeparator), maxDocsToLoad);
  return textLoader.loadDocs(textStream);
}
/**
 * Reads the stream line by line, parsing every line that looks like a JSON object ({@code {...}})
 * into a document; all other lines are skipped.
 *
 * @param params request parameters (not used by this implementation)
 * @param stream the uploaded content
 * @param maxDocsToLoad stop reading once this many documents were parsed; {@code <= 0} reads everything
 * @return one document per JSON-object line
 * @throws IOException if the stream cannot be read or a candidate line cannot be parsed
 */
@SuppressWarnings("unchecked")
protected List<SolrInputDocument> loadJsonLines(SolrParams params, ContentStreamBase.ByteArrayStream stream, final int maxDocsToLoad) throws IOException {
  final List<Map<String, Object>> parsed = new LinkedList<>();
  try (Reader source = stream.getReader()) {
    final BufferedReader lineReader = new BufferedReader(source);
    for (String raw = lineReader.readLine(); raw != null; raw = lineReader.readLine()) {
      final String candidate = raw.trim();
      // an empty line fails startsWith, so no separate emptiness check is needed
      if (candidate.startsWith("{") && candidate.endsWith("}")) {
        final Object value = ObjectBuilder.getVal(new JSONParser(candidate));
        if (value instanceof Map) {
          parsed.add((Map<String, Object>) value);
        }
      }
      if (maxDocsToLoad > 0 && parsed.size() == maxDocsToLoad) {
        break;
      }
    }
  }
  return parsed.stream()
      .map(JsonLoader::buildDoc)
      .collect(Collectors.toList());
}
/**
 * Parses a JSON upload into documents.
 *
 * <p>A top-level JSON array is treated as a list of documents. A top-level JSON object is treated as
 * a single document unless the raw text spans several lines and at least one trimmed line both
 * starts with <code>{</code> and ends with <code>}</code>, in which case the text is re-parsed as
 * JSON lines.
 *
 * @param params request parameters (not used by this implementation)
 * @param stream the uploaded content
 * @param maxDocsToLoad maximum number of documents to return; {@code <= 0} disables the cap
 * @return the parsed documents
 * @throws IOException if the stream cannot be read or parsed
 * @throws SolrException (BAD_REQUEST) if the body is empty or is neither an object nor an array
 */
@SuppressWarnings("unchecked")
protected List<SolrInputDocument> loadJsonDocs(SolrParams params, ContentStreamBase.ByteArrayStream stream, final int maxDocsToLoad) throws IOException {
  final Object parsed;
  try (Reader reader = stream.getReader()) {
    parsed = ObjectBuilder.getVal(new JSONParser(reader));
  }

  if (parsed == null) {
    throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Expected at least 1 JSON doc in the request body!");
  }
  if (!(parsed instanceof List) && !(parsed instanceof Map)) {
    throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Expected one or more JSON docs in the request body!");
  }

  List<Map<String, Object>> docs;
  if (parsed instanceof List) {
    // list of docs
    docs = (List<Map<String, Object>>) parsed;
  } else {
    // single doc ... see if this is a json lines file
    final String declaredCharset = ContentStreamBase.getCharsetFromContentType(stream.getContentType());
    final String effectiveCharset = declaredCharset != null ? declaredCharset : ContentStreamBase.DEFAULT_CHARSET;
    final String rawText = new String(streamAsBytes(stream.getStream()), effectiveCharset);
    final String[] lines = rawText.split("\n");
    final boolean looksLikeJsonLines = lines.length > 1
        && Arrays.stream(lines)
            .map(String::trim)
            .anyMatch(l -> l.startsWith("{") && l.endsWith("}"));
    if (looksLikeJsonLines) {
      docs = loadJsonLines(lines);
    } else {
      docs = Collections.singletonList((Map<String, Object>) parsed);
    }
  }

  final boolean overLimit = maxDocsToLoad > 0 && docs.size() > maxDocsToLoad;
  final List<Map<String, Object>> limited = overLimit ? docs.subList(0, maxDocsToLoad) : docs;
  return limited.stream()
      .map(JsonLoader::buildDoc)
      .collect(Collectors.toList());
}
/**
 * Parses an XML upload into documents.
 *
 * <p>Only XML that contains both an {@code <add>} and a {@code <doc>} tag is supported. Any other
 * XML is parsed just far enough to name its root element in the resulting error.
 *
 * @param params request parameters (not used by this implementation)
 * @param stream the uploaded content, decoded as UTF-8
 * @param maxDocsToLoad cap passed to {@link #parseXmlDocs(XMLStreamReader, int)}
 * @return the parsed documents
 * @throws IOException if the XML is malformed or of an unsupported shape
 */
protected List<SolrInputDocument> loadXmlDocs(SolrParams params, ContentStreamBase.ByteArrayStream stream, final int maxDocsToLoad) throws IOException {
  final String xml = readInputAsString(stream.getStream()).trim();

  final boolean isAddDocXml = xml.contains("<add>") && xml.contains("<doc>");
  if (!isAddDocXml) {
    final Document dom;
    try {
      dom = SafeXMLParsing.parseUntrustedXML(log, xml);
    } catch (SAXException e) {
      throw new IOException(e);
    }
    final Element rootElement = dom.getDocumentElement();
    // TODO: support other types of XML here
    throw new IOException("TODO: XML documents with root " + rootElement.getTagName() + " not supported yet!");
  }

  // untrusted input: keep external entities and DTDs switched off
  final XMLInputFactory factory = XMLInputFactory.newInstance();
  factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false);
  factory.setProperty(XMLInputFactory.SUPPORT_DTD, false);

  XMLStreamReader reader = null;
  try {
    reader = factory.createXMLStreamReader(new StringReader(xml));
    return parseXmlDocs(reader, maxDocsToLoad);
  } catch (XMLStreamException e) {
    throw new IOException(e);
  } finally {
    if (reader != null) {
      try {
        reader.close();
      } catch (XMLStreamException ignore) {
      }
    }
  }
}
/**
 * Walks the XML event stream and collects a document for every {@code doc} start element.
 *
 * <p>The parser is closed when the end of the document is reached or the cap is hit.
 *
 * @param parser positioned XML stream reader
 * @param maxDocsToLoad stop after this many documents; {@code <= 0} reads to the end
 * @return the documents read so far
 * @throws XMLStreamException if the underlying parser fails
 */
protected List<SolrInputDocument> parseXmlDocs(XMLStreamReader parser, final int maxDocsToLoad) throws XMLStreamException {
  final List<SolrInputDocument> collected = new LinkedList<>();
  final XMLLoader docReader = new XMLLoader().init(null);
  for (;;) {
    final int eventType;
    try {
      eventType = parser.next();
    } catch (java.util.NoSuchElementException endOfInput) {
      // the reader has nothing more to offer; hand back what was gathered
      return collected;
    }

    if (eventType == XMLStreamConstants.END_DOCUMENT) {
      parser.close();
      return collected;
    }
    if (eventType != XMLStreamConstants.START_ELEMENT || !"doc".equals(parser.getLocalName())) {
      continue;
    }

    final SolrInputDocument parsedDoc = docReader.readDoc(parser);
    if (parsedDoc == null) {
      continue;
    }
    collected.add(parsedDoc);
    if (maxDocsToLoad > 0 && collected.size() >= maxDocsToLoad) {
      parser.close();
      return collected;
    }
  }
}
/**
 * Parses every line that looks like a JSON object ({@code {...}} after trimming) into a map; other
 * lines, and lines whose parsed value is not a map, are skipped.
 *
 * @param lines the raw lines of the upload
 * @return the parsed JSON objects, in input order
 * @throws IOException if a candidate line cannot be parsed
 */
@SuppressWarnings("unchecked")
protected List<Map<String, Object>> loadJsonLines(String[] lines) throws IOException {
  final List<Map<String, Object>> parsed = new ArrayList<>(lines.length);
  for (int i = 0; i < lines.length; i++) {
    final String candidate = lines[i].trim();
    // an empty line fails startsWith, so it is skipped here as well
    if (!candidate.startsWith("{") || !candidate.endsWith("}")) {
      continue;
    }
    final Object value = ObjectBuilder.getVal(new JSONParser(candidate));
    if (value instanceof Map) {
      parsed.add((Map<String, Object>) value);
    }
  }
  return parsed;
}
/**
 * Reads the whole stream and decodes it as UTF-8. The stream is closed by
 * {@link #streamAsBytes(InputStream)}.
 *
 * @param in the stream to consume
 * @return the stream content as a string
 * @throws IOException if reading fails
 */
protected String readInputAsString(InputStream in) throws IOException {
  final byte[] rawBytes = streamAsBytes(in);
  return new String(rawBytes, StandardCharsets.UTF_8);
}
/**
 * Guesses the field separator by counting commas and tabs on the first line.
 *
 * <p>Returns a tab when the first line has at least as many tabs as commas (ties, including zero of
 * each, go to tab). Input without any {@code '\n'} is not inspected and yields a comma.
 *
 * @param csvStr the decoded CSV/TSV text
 * @return {@code '\t'} or {@code ','}
 */
protected char detectTSV(String csvStr) {
  final int firstLineEnd = csvStr.indexOf('\n');
  if (firstLineEnd == -1) {
    return ',';
  }

  int commaCount = 0;
  int tabCount = 0;
  for (int i = 0; i < firstLineEnd; i++) {
    switch (csvStr.charAt(i)) {
      case ',':
        commaCount++;
        break;
      case '\t':
        tabCount++;
        break;
      default:
        break;
    }
  }
  return tabCount >= commaCount ? '\t' : ',';
}
/**
 * Intentionally empty: this implementation takes nothing from {@code args}.
 *
 * @param args initialization arguments; ignored
 */
@Override
@SuppressWarnings({"rawtypes"})
public void init(NamedList args) {
}
/**
 * Placeholder processor handed to {@link SampleCSVLoader}'s superclass constructor; it overrides
 * nothing and is created with a {@code null} argument to the {@link UpdateRequestProcessor}
 * constructor (presumably the next processor in a chain; confirm against UpdateRequestProcessor).
 */
private static class NoOpUpdateRequestProcessor extends UpdateRequestProcessor {
NoOpUpdateRequestProcessor() {
super(null);
}
}
/**
 * Minimal request wrapper that only carries the params needed by the CSV loader. The first
 * superclass constructor argument is {@code null} (presumably the SolrCore; confirm against
 * SolrQueryRequestBase).
 */
private static class CSVRequest extends SolrQueryRequestBase {
CSVRequest(SolrParams params) {
super(null, params);
}
}
/**
 * CSV loader that gathers every parsed row into an in-memory list of documents. The update
 * processor it is constructed with is a no-op placeholder.
 *
 * <p>When the {@code csvMultiValueDelimiter} request parameter is set to a non-empty value,
 * single-valued string fields containing that delimiter are split into multiple values. The
 * delimiter is matched literally, so characters such as {@code |} or {@code .} need no escaping.
 */
private static class SampleCSVLoader extends CSVLoaderBase {
  List<SolrInputDocument> docs = new LinkedList<>();
  CSVRequest req;
  int maxDocsToLoad;
  String multiValueDelimiter;
  // literal matcher for multiValueDelimiter, compiled once; null when splitting is disabled
  private final java.util.regex.Pattern multiValueSplitter;

  /**
   * @param req request carrying the CSV params, including the optional multi-value delimiter
   * @param maxDocsToLoad stop collecting after this many documents; {@code <= 0} collects all rows
   */
  SampleCSVLoader(CSVRequest req, int maxDocsToLoad) {
    super(req, new NoOpUpdateRequestProcessor());
    this.req = req;
    this.maxDocsToLoad = maxDocsToLoad;
    this.multiValueDelimiter = req.getParams().get(CSV_MULTI_VALUE_DELIM_PARAM);
    // An empty delimiter would split between every character, so treat it as "not configured".
    // LITERAL keeps regex metacharacters in the delimiter from being interpreted.
    this.multiValueSplitter = (multiValueDelimiter != null && !multiValueDelimiter.isEmpty())
        ? java.util.regex.Pattern.compile(multiValueDelimiter, java.util.regex.Pattern.LITERAL)
        : null;
  }

  /**
   * Runs the CSV load over {@code stream} and returns the documents collected by
   * {@link #addDoc(int, String[])}.
   *
   * @param stream the CSV content
   * @return the collected documents
   * @throws IOException if loading fails
   */
  List<SolrInputDocument> loadDocs(ContentStream stream) throws IOException {
    load(req, new SolrQueryResponse(), stream, processor);
    return docs;
  }

  /**
   * Collects one CSV row as a document, splitting single-valued string fields on the configured
   * multi-value delimiter. Rows beyond {@code maxDocsToLoad} are ignored.
   */
  @Override
  public void addDoc(int line, String[] vals) throws IOException {
    if (maxDocsToLoad > 0 && docs.size() >= maxDocsToLoad) {
      return; // just a short circuit, probably doesn't help that much
    }

    templateAdd.clear();
    SolrInputDocument doc = new SolrInputDocument();
    doAdd(line, vals, doc, templateAdd);
    if (templateAdd.solrDoc != null) {
      if (multiValueSplitter != null) {
        for (SolrInputField field : templateAdd.solrDoc.values()) {
          if (field.getValueCount() == 1) {
            Object value = field.getFirstValue();
            if (value instanceof String) {
              String[] splitValue = multiValueSplitter.split((String) value);
              if (splitValue.length > 1) {
                field.setValue(Arrays.asList(splitValue));
              }
            }
          }
        }
      }
      docs.add(templateAdd.solrDoc);
    }
  }
}
}