solr/solrj/src/java/org/apache/solr/common/util/ContentStreamBase.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.solr.common.util;

 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
 import java.io.StringReader;
 import java.io.UnsupportedEncodingException;
 import java.net.URL;
 import java.net.URLConnection;
 import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
 import java.util.List;
 import java.util.Locale;
 import java.util.function.Predicate;
 import java.util.zip.GZIPInputStream;

 import org.apache.http.entity.ContentType;
 import org.apache.solr.client.solrj.SolrRequest;
 import org.apache.solr.client.solrj.request.RequestWriter;

 /**
  * Three concrete implementations for ContentStream - one for File/URL/String
  *
  *
  * @since solr 1.2
  */
 public abstract class ContentStreamBase implements ContentStream
 {

   public static final String DEFAULT_CHARSET = StandardCharsets.UTF_8.name();
   private static final String TEXT_CSV = "text/csv";
   private static final List<String> UNHELPFUL_TYPES = Arrays.asList(ContentType.APPLICATION_OCTET_STREAM.getMimeType(), "application/gzip", "content/unknown");
   private static final List<String> XML_SUF =  Arrays.asList(".xml", ".xml.gz", ".xml.gzip");
   private static final List<String> JSON_SUF =  Arrays.asList(".json", ".json.gz", ".json.gzip");
   private static final List<String> CSV_SUF =  Arrays.asList(".csv", ".csv.gz", ".csv.gzip");

   protected String name;
   protected String sourceInfo;
   protected String contentType;
   protected Long size;

   //---------------------------------------------------------------------
   //---------------------------------------------------------------------

   public static String getCharsetFromContentType( String contentType )
   {
     if( contentType != null ) {
       int idx = contentType.toLowerCase(Locale.ROOT).indexOf( "charset=" );
       if( idx > 0 ) {
         return contentType.substring( idx + "charset=".length() ).trim();
       }
     }
     return null;
   }

   protected String attemptToDetermineContentType() {
     String type = null;
     if (name != null) {
       Predicate<String> endsWith = suffix->name.toLowerCase(Locale.ROOT).endsWith(suffix);

       if (XML_SUF.stream().anyMatch(endsWith)) {
         type = ContentType.APPLICATION_XML.getMimeType();
       } else if (JSON_SUF.stream().anyMatch(endsWith)) {
         type = ContentType.APPLICATION_JSON.getMimeType();
       } else if (CSV_SUF.stream().anyMatch(endsWith)) {
         type = TEXT_CSV;
       } else {
         type = attemptToDetermineTypeFromFirstCharacter();
       }
     }
     return type;
   }

   private String attemptToDetermineTypeFromFirstCharacter() {
     String type = null;
     try (InputStream stream = getStream()) {
       // Last ditch effort to determine content, if the first non-white space
       // is a '<' or '{', assume xml or json.
       int data = stream.read();
       while (( data != -1 ) && ( ( (char)data ) == ' ' )) {
         data = stream.read();
       }
       if ((char)data == '<') {
         type = ContentType.APPLICATION_XML.getMimeType();
       } else if ((char)data == '{') {
         type = ContentType.APPLICATION_JSON.getMimeType();
       }
     } catch (Exception ex) {
       // This code just eats, the exception and leaves
       // the contentType untouched.
     }
     return type;
   }

   //------------------------------------------------------------------------
   //------------------------------------------------------------------------

   /**
    * Construct a <code>ContentStream</code> from a <code>URL</code>
    *
    * This uses a <code>URLConnection</code> to get the content stream
    * @see  URLConnection
    */
   public static class URLStream extends ContentStreamBase
   {
     private final URL url;

     public URLStream( URL url ) {
       this.url = url;
       sourceInfo = "url";
     }

     @Override
     public String getContentType() {
       // for file:// streams that are octet-streams, try to determine the payload
       // type from payload rather than just using the mime type.
       if ("file".equals(url.getProtocol())) {
         Predicate<String> equals = mimeType->mimeType.equals(contentType);
         if (UNHELPFUL_TYPES.stream().anyMatch(equals)) {
           String type = attemptToDetermineContentType();
           contentType = ( type != null ) ? type : contentType;
         }
       }
       return contentType;
     }

     @Override
     public InputStream getStream() throws IOException {
       URLConnection conn = this.url.openConnection();

       contentType = conn.getContentType();
       name = url.toExternalForm();
       size = conn.getContentLengthLong();
       InputStream is = conn.getInputStream();
       String urlFile = url.getFile().toLowerCase(Locale.ROOT);
       if( "gzip".equals(conn.getContentEncoding()) || urlFile.endsWith( ".gz" ) || urlFile.endsWith( ".gzip" )){
         is = new GZIPInputStream(is);
       }
       return is;
     }
   }

   /**
    * Construct a <code>ContentStream</code> from a <code>File</code>
    */
   public static class FileStream extends ContentStreamBase
   {
     private final File file;

     public FileStream( File f ) {
       file = f;

       contentType = null; // ??
       name = file.getName();
       size = file.length();
       sourceInfo = file.toURI().toString();
     }

     @Override
     public String getContentType() {
       if(contentType==null) {
         contentType = attemptToDetermineContentType();
       }
       return contentType;
     }

     @Override
     public InputStream getStream() throws IOException {
       InputStream is = new FileInputStream( file );
       String lowerName = name.toLowerCase(Locale.ROOT);
       if(lowerName.endsWith(".gz") || lowerName.endsWith(".gzip")) {
         is = new GZIPInputStream(is);
       }
       return is;
     }
   }


   /**
    * Construct a <code>ContentStream</code> from a <code>String</code>
    */
   public static class StringStream extends ContentStreamBase
   {
     private final String str;

     public StringStream( String str ) {
       this(str, detect(str));
     }

     public StringStream( String str, String contentType ) {
       this.str = str;
       this.contentType = contentType;
       name = null;
       try {
         size = (long) str.getBytes(DEFAULT_CHARSET).length;
       } catch (UnsupportedEncodingException e) {
         // won't happen
         throw new RuntimeException(e);
       }
       sourceInfo = "string";
     }

     public static String detect(String str) {
       String detectedContentType = null;
       int lim = str.length() - 1;
       for (int i=0; i<lim; i++) {
         char ch = str.charAt(i);
         if (Character.isWhitespace(ch)) {
           continue;
         }
         // first non-whitespace chars
         if (ch == '#'                         // single line comment
             || (ch == '/' && (str.charAt(i + 1) == '/' || str.charAt(i + 1) == '*'))  // single line or multi-line comment
             || (ch == '{' || ch == '[')       // start of JSON object
             )
         {
           detectedContentType = "application/json";
         } else if (ch == '<') {
           detectedContentType = "text/xml";
         }
         break;
       }
       return detectedContentType;
     }

     @Override
     public InputStream getStream() throws IOException {
       return new ByteArrayInputStream( str.getBytes(DEFAULT_CHARSET) );
     }

     /**
      * If an charset is defined (by the contentType) use that, otherwise
      * use a StringReader
      */
     @Override
     public Reader getReader() throws IOException {
       String charset = getCharsetFromContentType( contentType );
       return charset == null
         ? new StringReader( str )
         : new InputStreamReader( getStream(), charset );
     }
   }

   /**
    * Base reader implementation.  If the contentType declares a
    * charset use it, otherwise use "utf-8".
    */
   @Override
   public Reader getReader() throws IOException {
     String charset = getCharsetFromContentType( getContentType() );
     return charset == null
       ? new InputStreamReader( getStream(), DEFAULT_CHARSET )
       : new InputStreamReader( getStream(), charset );
   }

   //------------------------------------------------------------------
   // Getters / Setters for overrideable attributes
   //------------------------------------------------------------------

   @Override
   public String getContentType() {
     return contentType;
   }

   public void setContentType(String contentType) {
     this.contentType = contentType;
   }

   @Override
   public String getName() {
     return name;
   }

   public void setName(String name) {
     this.name = name;
   }

   @Override
   public Long getSize() {
     return size;
   }

   public void setSize(Long size) {
     this.size = size;
   }

   @Override
   public String getSourceInfo() {
     return sourceInfo;
   }

   public void setSourceInfo(String sourceInfo) {
     this.sourceInfo = sourceInfo;
   }
   public static ContentStream create(RequestWriter requestWriter,
                                      @SuppressWarnings({"rawtypes"})SolrRequest req) throws IOException {
     ByteArrayOutputStream baos = new ByteArrayOutputStream();
     RequestWriter.ContentWriter contentWriter = requestWriter.getContentWriter(req);
     contentWriter.write(baos);
     return new ByteArrayStream(baos.toByteArray(), null,contentWriter.getContentType() );
   }

   /**
    * Construct a <code>ContentStream</code> from a <code>File</code>
    */
   public static class ByteArrayStream extends ContentStreamBase
   {
     private final byte[] bytes;
     public ByteArrayStream( byte[] bytes, String source ) {
       this(bytes,source, null);
     }

     public ByteArrayStream( byte[] bytes, String source, String contentType ) {
       this.bytes = bytes;

       this.contentType = contentType;
       name = source;
       size = (long) bytes.length;
       sourceInfo = source;
     }


     @Override
     public InputStream getStream() throws IOException {
       return new ByteArrayInputStream( bytes );
     }
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.solr.common.util;

	import java.io.ByteArrayInputStream;
	import java.io.ByteArrayOutputStream;
	import java.io.File;
	import java.io.FileInputStream;
	import java.io.IOException;
	import java.io.InputStream;
	import java.io.InputStreamReader;
	import java.io.Reader;
	import java.io.StringReader;
	import java.io.UnsupportedEncodingException;
	import java.net.URL;
	import java.net.URLConnection;
	import java.nio.charset.StandardCharsets;
	import java.util.Arrays;
	import java.util.List;
	import java.util.Locale;
	import java.util.function.Predicate;
	import java.util.zip.GZIPInputStream;

	import org.apache.http.entity.ContentType;
	import org.apache.solr.client.solrj.SolrRequest;
	import org.apache.solr.client.solrj.request.RequestWriter;

	/**
	* Three concrete implementations for ContentStream - one for File/URL/String
	*
	*
	* @since solr 1.2
	*/
	public abstract class ContentStreamBase implements ContentStream
	{

	public static final String DEFAULT_CHARSET = StandardCharsets.UTF_8.name();
	private static final String TEXT_CSV = "text/csv";
	private static final List<String> UNHELPFUL_TYPES = Arrays.asList(ContentType.APPLICATION_OCTET_STREAM.getMimeType(), "application/gzip", "content/unknown");
	private static final List<String> XML_SUF = Arrays.asList(".xml", ".xml.gz", ".xml.gzip");
	private static final List<String> JSON_SUF = Arrays.asList(".json", ".json.gz", ".json.gzip");
	private static final List<String> CSV_SUF = Arrays.asList(".csv", ".csv.gz", ".csv.gzip");

	protected String name;
	protected String sourceInfo;
	protected String contentType;
	protected Long size;

	//---------------------------------------------------------------------
	//---------------------------------------------------------------------

	public static String getCharsetFromContentType( String contentType )
	{
	if( contentType != null ) {
	int idx = contentType.toLowerCase(Locale.ROOT).indexOf( "charset=" );
	if( idx > 0 ) {
	return contentType.substring( idx + "charset=".length() ).trim();
	}
	}
	return null;
	}

	protected String attemptToDetermineContentType() {
	String type = null;
	if (name != null) {
	Predicate<String> endsWith = suffix->name.toLowerCase(Locale.ROOT).endsWith(suffix);

	if (XML_SUF.stream().anyMatch(endsWith)) {
	type = ContentType.APPLICATION_XML.getMimeType();
	} else if (JSON_SUF.stream().anyMatch(endsWith)) {
	type = ContentType.APPLICATION_JSON.getMimeType();
	} else if (CSV_SUF.stream().anyMatch(endsWith)) {
	type = TEXT_CSV;
	} else {
	type = attemptToDetermineTypeFromFirstCharacter();
	}
	}
	return type;
	}

	private String attemptToDetermineTypeFromFirstCharacter() {
	String type = null;
	try (InputStream stream = getStream()) {
	// Last ditch effort to determine content, if the first non-white space
	// is a '<' or '{', assume xml or json.
	int data = stream.read();
	while (( data != -1 ) && ( ( (char)data ) == ' ' )) {
	data = stream.read();
	}
	if ((char)data == '<') {
	type = ContentType.APPLICATION_XML.getMimeType();
	} else if ((char)data == '{') {
	type = ContentType.APPLICATION_JSON.getMimeType();
	}
	} catch (Exception ex) {
	// This code just eats, the exception and leaves
	// the contentType untouched.
	}
	return type;
	}

	//------------------------------------------------------------------------
	//------------------------------------------------------------------------

	/**
	* Construct a <code>ContentStream</code> from a <code>URL</code>
	*
	* This uses a <code>URLConnection</code> to get the content stream
	* @see URLConnection
	*/
	public static class URLStream extends ContentStreamBase
	{
	private final URL url;

	public URLStream( URL url ) {
	this.url = url;
	sourceInfo = "url";
	}

	@Override
	public String getContentType() {
	// for file:// streams that are octet-streams, try to determine the payload
	// type from payload rather than just using the mime type.
	if ("file".equals(url.getProtocol())) {
	Predicate<String> equals = mimeType->mimeType.equals(contentType);
	if (UNHELPFUL_TYPES.stream().anyMatch(equals)) {
	String type = attemptToDetermineContentType();
	contentType = ( type != null ) ? type : contentType;
	}
	}
	return contentType;
	}

	@Override
	public InputStream getStream() throws IOException {
	URLConnection conn = this.url.openConnection();

	contentType = conn.getContentType();
	name = url.toExternalForm();
	size = conn.getContentLengthLong();
	InputStream is = conn.getInputStream();
	String urlFile = url.getFile().toLowerCase(Locale.ROOT);
	if( "gzip".equals(conn.getContentEncoding()) \|\| urlFile.endsWith( ".gz" ) \|\| urlFile.endsWith( ".gzip" )){
	is = new GZIPInputStream(is);
	}
	return is;
	}
	}

	/**
	* Construct a <code>ContentStream</code> from a <code>File</code>
	*/
	public static class FileStream extends ContentStreamBase
	{
	private final File file;

	public FileStream( File f ) {
	file = f;

	contentType = null; // ??
	name = file.getName();
	size = file.length();
	sourceInfo = file.toURI().toString();
	}

	@Override
	public String getContentType() {
	if(contentType==null) {
	contentType = attemptToDetermineContentType();
	}
	return contentType;
	}

	@Override
	public InputStream getStream() throws IOException {
	InputStream is = new FileInputStream( file );
	String lowerName = name.toLowerCase(Locale.ROOT);
	if(lowerName.endsWith(".gz") \|\| lowerName.endsWith(".gzip")) {
	is = new GZIPInputStream(is);
	}
	return is;
	}
	}


	/**
	* Construct a <code>ContentStream</code> from a <code>String</code>
	*/
	public static class StringStream extends ContentStreamBase
	{
	private final String str;

	public StringStream( String str ) {
	this(str, detect(str));
	}

	public StringStream( String str, String contentType ) {
	this.str = str;
	this.contentType = contentType;
	name = null;
	try {
	size = (long) str.getBytes(DEFAULT_CHARSET).length;
	} catch (UnsupportedEncodingException e) {
	// won't happen
	throw new RuntimeException(e);
	}
	sourceInfo = "string";
	}

	public static String detect(String str) {
	String detectedContentType = null;
	int lim = str.length() - 1;
	for (int i=0; i<lim; i++) {
	char ch = str.charAt(i);
	if (Character.isWhitespace(ch)) {
	continue;
	}
	// first non-whitespace chars
	if (ch == '#' // single line comment
	\|\| (ch == '/' && (str.charAt(i + 1) == '/' \|\| str.charAt(i + 1) == '*')) // single line or multi-line comment
	\|\| (ch == '{' \|\| ch == '[') // start of JSON object
	)
	{
	detectedContentType = "application/json";
	} else if (ch == '<') {
	detectedContentType = "text/xml";
	}
	break;
	}
	return detectedContentType;
	}

	@Override
	public InputStream getStream() throws IOException {
	return new ByteArrayInputStream( str.getBytes(DEFAULT_CHARSET) );
	}

	/**
	* If an charset is defined (by the contentType) use that, otherwise
	* use a StringReader
	*/
	@Override
	public Reader getReader() throws IOException {
	String charset = getCharsetFromContentType( contentType );
	return charset == null
	? new StringReader( str )
	: new InputStreamReader( getStream(), charset );
	}
	}

	/**
	* Base reader implementation. If the contentType declares a
	* charset use it, otherwise use "utf-8".
	*/
	@Override
	public Reader getReader() throws IOException {
	String charset = getCharsetFromContentType( getContentType() );
	return charset == null
	? new InputStreamReader( getStream(), DEFAULT_CHARSET )
	: new InputStreamReader( getStream(), charset );
	}

	//------------------------------------------------------------------
	// Getters / Setters for overrideable attributes
	//------------------------------------------------------------------

	@Override
	public String getContentType() {
	return contentType;
	}

	public void setContentType(String contentType) {
	this.contentType = contentType;
	}

	@Override
	public String getName() {
	return name;
	}

	public void setName(String name) {
	this.name = name;
	}

	@Override
	public Long getSize() {
	return size;
	}

	public void setSize(Long size) {
	this.size = size;
	}

	@Override
	public String getSourceInfo() {
	return sourceInfo;
	}

	public void setSourceInfo(String sourceInfo) {
	this.sourceInfo = sourceInfo;
	}
	public static ContentStream create(RequestWriter requestWriter,
	@SuppressWarnings({"rawtypes"})SolrRequest req) throws IOException {
	ByteArrayOutputStream baos = new ByteArrayOutputStream();
	RequestWriter.ContentWriter contentWriter = requestWriter.getContentWriter(req);
	contentWriter.write(baos);
	return new ByteArrayStream(baos.toByteArray(), null,contentWriter.getContentType() );
	}

	/**
	* Construct a <code>ContentStream</code> from a <code>File</code>
	*/
	public static class ByteArrayStream extends ContentStreamBase
	{
	private final byte[] bytes;
	public ByteArrayStream( byte[] bytes, String source ) {
	this(bytes,source, null);
	}

	public ByteArrayStream( byte[] bytes, String source, String contentType ) {
	this.bytes = bytes;

	this.contentType = contentType;
	name = source;
	size = (long) bytes.length;
	sourceInfo = source;
	}


	@Override
	public InputStream getStream() throws IOException {
	return new ByteArrayInputStream( bytes );
	}
	}
	}