blob: 4f77b5d4875025441d1bc567a8c87cdc064a8c1b [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.common.util;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.function.Predicate;
import java.util.zip.GZIPInputStream;
import org.apache.http.entity.ContentType;
import org.apache.solr.client.solrj.SolrRequest;
import org.apache.solr.client.solrj.request.RequestWriter;
/**
* Three concrete implementations for ContentStream - one for File/URL/String
*
*
* @since solr 1.2
*/
public abstract class ContentStreamBase implements ContentStream
{
public static final String DEFAULT_CHARSET = StandardCharsets.UTF_8.name();
private static final String TEXT_CSV = "text/csv";
private static final List<String> UNHELPFUL_TYPES = Arrays.asList(ContentType.APPLICATION_OCTET_STREAM.getMimeType(), "application/gzip", "content/unknown");
private static final List<String> XML_SUF = Arrays.asList(".xml", ".xml.gz", ".xml.gzip");
private static final List<String> JSON_SUF = Arrays.asList(".json", ".json.gz", ".json.gzip");
private static final List<String> CSV_SUF = Arrays.asList(".csv", ".csv.gz", ".csv.gzip");
protected String name;
protected String sourceInfo;
protected String contentType;
protected Long size;
//---------------------------------------------------------------------
//---------------------------------------------------------------------
public static String getCharsetFromContentType( String contentType )
{
if( contentType != null ) {
int idx = contentType.toLowerCase(Locale.ROOT).indexOf( "charset=" );
if( idx > 0 ) {
return contentType.substring( idx + "charset=".length() ).trim();
}
}
return null;
}
protected String attemptToDetermineContentType() {
String type = null;
if (name != null) {
Predicate<String> endsWith = suffix->name.toLowerCase(Locale.ROOT).endsWith(suffix);
if (XML_SUF.stream().anyMatch(endsWith)) {
type = ContentType.APPLICATION_XML.getMimeType();
} else if (JSON_SUF.stream().anyMatch(endsWith)) {
type = ContentType.APPLICATION_JSON.getMimeType();
} else if (CSV_SUF.stream().anyMatch(endsWith)) {
type = TEXT_CSV;
} else {
type = attemptToDetermineTypeFromFirstCharacter();
}
}
return type;
}
private String attemptToDetermineTypeFromFirstCharacter() {
String type = null;
try (InputStream stream = getStream()) {
// Last ditch effort to determine content, if the first non-white space
// is a '<' or '{', assume xml or json.
int data = stream.read();
while (( data != -1 ) && ( ( (char)data ) == ' ' )) {
data = stream.read();
}
if ((char)data == '<') {
type = ContentType.APPLICATION_XML.getMimeType();
} else if ((char)data == '{') {
type = ContentType.APPLICATION_JSON.getMimeType();
}
} catch (Exception ex) {
// This code just eats, the exception and leaves
// the contentType untouched.
}
return type;
}
//------------------------------------------------------------------------
//------------------------------------------------------------------------
/**
* Construct a <code>ContentStream</code> from a <code>URL</code>
*
* This uses a <code>URLConnection</code> to get the content stream
* @see URLConnection
*/
public static class URLStream extends ContentStreamBase
{
private final URL url;
public URLStream( URL url ) {
this.url = url;
sourceInfo = "url";
}
@Override
public String getContentType() {
// for file:// streams that are octet-streams, try to determine the payload
// type from payload rather than just using the mime type.
if ("file".equals(url.getProtocol())) {
Predicate<String> equals = mimeType->mimeType.equals(contentType);
if (UNHELPFUL_TYPES.stream().anyMatch(equals)) {
String type = attemptToDetermineContentType();
contentType = ( type != null ) ? type : contentType;
}
}
return contentType;
}
@Override
public InputStream getStream() throws IOException {
URLConnection conn = this.url.openConnection();
contentType = conn.getContentType();
name = url.toExternalForm();
size = conn.getContentLengthLong();
InputStream is = conn.getInputStream();
String urlFile = url.getFile().toLowerCase(Locale.ROOT);
if( "gzip".equals(conn.getContentEncoding()) || urlFile.endsWith( ".gz" ) || urlFile.endsWith( ".gzip" )){
is = new GZIPInputStream(is);
}
return is;
}
}
/**
* Construct a <code>ContentStream</code> from a <code>File</code>
*/
public static class FileStream extends ContentStreamBase
{
private final File file;
public FileStream( File f ) {
file = f;
contentType = null; // ??
name = file.getName();
size = file.length();
sourceInfo = file.toURI().toString();
}
@Override
public String getContentType() {
if(contentType==null) {
contentType = attemptToDetermineContentType();
}
return contentType;
}
@Override
public InputStream getStream() throws IOException {
InputStream is = new FileInputStream( file );
String lowerName = name.toLowerCase(Locale.ROOT);
if(lowerName.endsWith(".gz") || lowerName.endsWith(".gzip")) {
is = new GZIPInputStream(is);
}
return is;
}
}
/**
* Construct a <code>ContentStream</code> from a <code>String</code>
*/
public static class StringStream extends ContentStreamBase
{
private final String str;
public StringStream( String str ) {
this(str, detect(str));
}
public StringStream( String str, String contentType ) {
this.str = str;
this.contentType = contentType;
name = null;
try {
size = (long) str.getBytes(DEFAULT_CHARSET).length;
} catch (UnsupportedEncodingException e) {
// won't happen
throw new RuntimeException(e);
}
sourceInfo = "string";
}
public static String detect(String str) {
String detectedContentType = null;
int lim = str.length() - 1;
for (int i=0; i<lim; i++) {
char ch = str.charAt(i);
if (Character.isWhitespace(ch)) {
continue;
}
// first non-whitespace chars
if (ch == '#' // single line comment
|| (ch == '/' && (str.charAt(i + 1) == '/' || str.charAt(i + 1) == '*')) // single line or multi-line comment
|| (ch == '{' || ch == '[') // start of JSON object
)
{
detectedContentType = "application/json";
} else if (ch == '<') {
detectedContentType = "text/xml";
}
break;
}
return detectedContentType;
}
@Override
public InputStream getStream() throws IOException {
return new ByteArrayInputStream( str.getBytes(DEFAULT_CHARSET) );
}
/**
* If an charset is defined (by the contentType) use that, otherwise
* use a StringReader
*/
@Override
public Reader getReader() throws IOException {
String charset = getCharsetFromContentType( contentType );
return charset == null
? new StringReader( str )
: new InputStreamReader( getStream(), charset );
}
}
/**
* Base reader implementation. If the contentType declares a
* charset use it, otherwise use "utf-8".
*/
@Override
public Reader getReader() throws IOException {
String charset = getCharsetFromContentType( getContentType() );
return charset == null
? new InputStreamReader( getStream(), DEFAULT_CHARSET )
: new InputStreamReader( getStream(), charset );
}
//------------------------------------------------------------------
// Getters / Setters for overrideable attributes
//------------------------------------------------------------------
@Override
public String getContentType() {
return contentType;
}
public void setContentType(String contentType) {
this.contentType = contentType;
}
@Override
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
@Override
public Long getSize() {
return size;
}
public void setSize(Long size) {
this.size = size;
}
@Override
public String getSourceInfo() {
return sourceInfo;
}
public void setSourceInfo(String sourceInfo) {
this.sourceInfo = sourceInfo;
}
public static ContentStream create(RequestWriter requestWriter,
@SuppressWarnings({"rawtypes"})SolrRequest req) throws IOException {
ByteArrayOutputStream baos = new ByteArrayOutputStream();
RequestWriter.ContentWriter contentWriter = requestWriter.getContentWriter(req);
contentWriter.write(baos);
return new ByteArrayStream(baos.toByteArray(), null,contentWriter.getContentType() );
}
/**
* Construct a <code>ContentStream</code> from a <code>File</code>
*/
public static class ByteArrayStream extends ContentStreamBase
{
private final byte[] bytes;
public ByteArrayStream( byte[] bytes, String source ) {
this(bytes,source, null);
}
public ByteArrayStream( byte[] bytes, String source, String contentType ) {
this.bytes = bytes;
this.contentType = contentType;
name = source;
size = (long) bytes.length;
sourceInfo = source;
}
@Override
public InputStream getStream() throws IOException {
return new ByteArrayInputStream( bytes );
}
}
}