// opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/WordDocBuilderSingleImageSearchCall.java - opennlp-sandbox - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License. You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package opennlp.tools.similarity.apps.solr;


 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileNotFoundException;
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.ArrayList;
 import java.util.List;

 import net.billylieurance.azuresearch.AzureSearchImageResult;
 import net.billylieurance.azuresearch.AzureSearchResultSet;
 import net.billylieurance.azuresearch.AzureSearchWebResult;

 import org.apache.commons.lang.StringUtils;
 //import org.docx4j.Docx4J;
 //import org.docx4j.convert.out.FOSettings;
 import org.docx4j.openpackaging.packages.WordprocessingMLPackage;

 import opennlp.tools.similarity.apps.ContentGeneratorSupport;
 import opennlp.tools.similarity.apps.Fragment;
 import opennlp.tools.similarity.apps.HitBase;


 /**
  * Builds a Word (.docx) document from generated content and illustrates it with images
  * obtained from a single image search issued for the document title.
  *
  * <p>The search runs once per document; the n-th hit that has fragments is paired with the
  * n-th image URL, and hits beyond the number of returned images get no image.
  *
  * <p>NOTE(review): {@code absPath}, {@code IMG_REL_PATH}, {@code imageSearcher},
  * {@code saveImageFromTheWeb}, {@code convertImageToByteArray} and {@code addImageToPackage}
  * are not declared in this class; presumably they are inherited from {@code WordDocBuilder}.
  */
 public class WordDocBuilderSingleImageSearchCall extends WordDocBuilder{

 	/**
 	 * Writes a .docx file containing an upper-cased title, one section per hit that has
 	 * fragments (optional subtitle, paragraph text, optional image) and a closing
 	 * "REFERENCES" list with the title and URL of each such hit.
 	 *
 	 * <p>Generation is best-effort: a failure in one section, image or reference is printed
 	 * and skipped, and a failure while creating or saving the package is printed as well.
 	 * The returned path is therefore not a guarantee that the file was written.
 	 *
 	 * @param content hits to render; hits without fragments are skipped
 	 * @param title document title; also used as the image search query and, with spaces
 	 *        replaced by underscores, as the file name
 	 * @return the target path, {@code absPath + "/written/" + name + ".docx"}
 	 */
 	public String buildWordDoc(List<HitBase> content, String title){

 		String outputDocFinename =  absPath+"/written/"+ title.replace(' ','_').replace('\"', ' ').trim()+ ".docx";

 		List<String> imageURLs = getAllImageSearchResults(title);
 		try {
 			WordprocessingMLPackage wordMLPackage = WordprocessingMLPackage.createPackage();
 			wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Title", title.toUpperCase());

 			// index of the image paired with the current section; advances only for rendered hits
 			int count=0;
 			for(HitBase para: content){
 				if (!hasFragments(para)) {
 					continue; // no found content in this hit
 				}
 				addSection(wordMLPackage, para, count, imageURLs);
 				count++;
 			}

 			// now add URLs
 			wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Subtitle", "REFERENCES");
 			addReferences(wordMLPackage, content);

 			wordMLPackage.save(new File(outputDocFinename));
 			System.out.println("Finished creating docx ="+outputDocFinename);
 			//TODO pdf export
 			/*
 			FOSettings foSettings = Docx4J.createFOSettings();
 			foSettings.setWmlPackage(wordMLPackage);
 			OutputStream os = new java.io.FileOutputStream(outputDocFinename.replace(".docx", ".pdf"));
 			Docx4J.toFO(foSettings, os, Docx4J.FLAG_NONE);
 			System.out.println("Finished creating docx's PDF ="+outputDocFinename);
 			*/
 		} catch (Exception e) {
 			// creating or saving the package failed; report it and still return the intended path
 			e.printStackTrace();
 		}

 		return outputDocFinename;
 	}

 	/** Returns true when the hit is non-null and carries at least one fragment. */
 	private static boolean hasFragments(HitBase para) {
 		return para != null && para.getFragments() != null && para.getFragments().size() > 0;
 	}

 	/**
 	 * Appends one content section: subtitle (unless the hit title is missing or ends with
 	 * ".."), the cleaned-up fragment text, then the image at {@code imageIndex} if there is one.
 	 * Failures are printed and never propagate; when the text cannot be added the image is
 	 * skipped too.
 	 */
 	private void addSection(WordprocessingMLPackage wordMLPackage, HitBase para,
 			int imageIndex, List<String> imageURLs) {
 		try {
 			String hitTitle = para.getTitle();
 			// a missing title must not cost us the paragraph text, so only the subtitle is skipped
 			if (hitTitle != null && !hitTitle.endsWith("..")){
 				String sectTitle = ContentGeneratorSupport.getPortionOfTitleWithoutDelimiters(hitTitle);
 				wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Subtitle", sectTitle);
 			}
 			// List.toString() yields "[a, b]"; strip the brackets and tidy punctuation left by joining
 			String paraText = para.getFragments().toString().replace("[", "").replace("]", "").replace(" | ", "")
 					.replace(".,", ".").replace(".\"", "\"").replace(". .", ".")
 					.replace(",.", ".");
 			wordMLPackage.getMainDocumentPart().addParagraphOfText(paraText);
 		} catch (Exception e) {
 			e.printStackTrace();
 			return;
 		}

 		try {
 			addImageByImageURLToPackage(imageIndex, wordMLPackage, imageURLs);
 		} catch (Exception e) {
 			// images are optional decoration; keep the text that was already added
 			e.printStackTrace();
 		}
 	}

 	/** Appends the title and URL of every hit that has fragments; a failing entry is printed and skipped. */
 	private void addReferences(WordprocessingMLPackage wordMLPackage, List<HitBase> content) {
 		for(HitBase para: content){
 			if (!hasFragments(para)) {
 				continue; // no found content in this hit
 			}
 			try {
 				wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Subtitle",
 						para.getTitle());
 				wordMLPackage.getMainDocumentPart().addParagraphOfText(para.getUrl());
 			} catch (Exception e) {
 				e.printStackTrace();
 			}
 		}
 	}

 	/**
 	 * Downloads the image at position {@code count} of {@code imageURLs} into the local image
 	 * directory ({@code absPath + IMG_REL_PATH}) and adds its bytes to the document.
 	 *
 	 * <p>Does nothing when the list is null, the index is out of range, or the URL is
 	 * null/blank. Download and conversion failures are printed and otherwise ignored.
 	 *
 	 * @param count index of the image to use
 	 * @param wordMLPackage document package that receives the image
 	 * @param imageURLs image URLs in search-result order
 	 */
 	protected void addImageByImageURLToPackage(int count,
 			WordprocessingMLPackage wordMLPackage,
 			List<String>  imageURLs) {
 		if (imageURLs == null || count < 0 || count >= imageURLs.size()) {
 			return;
 		}

 		String url = imageURLs.get(count);
 		if (url == null || url.trim().length() == 0) {
 			return;
 		}

 		String localPath = absPath+IMG_REL_PATH+toLocalImageFileName(url);
 		try {
 			saveImageFromTheWeb(url, localPath);
 			byte[] bytes = convertImageToByteArray(new File(localPath));
 			addImageToPackage(wordMLPackage, bytes);
 		} catch (Exception e) {
 			// covers a missing download as well as read/convert errors
 			e.printStackTrace();
 		}

 	}

 	/**
 	 * Derives a flat, file-system-safe file name from an image URL: drops the scheme
 	 * ("http://", "https://", ...) and replaces every character other than ASCII letters,
 	 * digits, '.', '_' and '-' with '_', so separators, ':' and query characters cannot
 	 * produce an invalid or nested path.
 	 */
 	private static String toLocalImageFileName(String url) {
 		String name = url;
 		int schemeEnd = name.indexOf("://");
 		if (schemeEnd >= 0) {
 			name = name.substring(schemeEnd + 3);
 		}
 		return name.replaceAll("[^A-Za-z0-9._-]", "_");
 	}

 	/**
 	 * Runs one image search for {@code title} through {@code imageSearcher} and collects the
 	 * media URL of each result, in result order.
 	 *
 	 * @param title search query
 	 * @return mutable list of media URLs; empty when the search yields no result set
 	 */
 	protected List<String>  getAllImageSearchResults(String title) {
 		List<String> imageURLs = new ArrayList<String>();
 		AzureSearchResultSet<AzureSearchImageResult> res = imageSearcher.runImageSearch(title);
 		if (res == null) {
 			return imageURLs;
 		}
 		for(AzureSearchImageResult imResult: res){
 			if (imResult != null) {
 				imageURLs.add(imResult.getMediaUrl());
 			}
 		}
 		return imageURLs;

 	}


 	/** Manual smoke run: builds a document titled "albert einstein" from ten synthetic hits. */
 	public static void main(String[] args){
 		WordDocBuilderSingleImageSearchCall b = new WordDocBuilderSingleImageSearchCall();
 		List<HitBase> content = new ArrayList<HitBase>();
 		for(int i = 0; i<10; i++){
 			HitBase h = new HitBase();
 			h.setTitle("albert einstein "+i);
 			List<Fragment> frs = new ArrayList<Fragment>();
 			frs.add(new Fragment(" content "+i, 0));
 			h.setFragments(frs);
 			content.add(h);
 		}

 		b.buildWordDoc(content, "albert einstein");
 	}
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package opennlp.tools.similarity.apps.solr;


	import java.io.File;
	import java.io.FileInputStream;
	import java.io.FileNotFoundException;
	import java.io.FileOutputStream;
	import java.io.IOException;
	import java.io.InputStream;
	import java.io.OutputStream;
	import java.net.MalformedURLException;
	import java.net.URL;
	import java.util.ArrayList;
	import java.util.List;

	import net.billylieurance.azuresearch.AzureSearchImageResult;
	import net.billylieurance.azuresearch.AzureSearchResultSet;
	import net.billylieurance.azuresearch.AzureSearchWebResult;

	import org.apache.commons.lang.StringUtils;
	//import org.docx4j.Docx4J;
	//import org.docx4j.convert.out.FOSettings;
	import org.docx4j.openpackaging.packages.WordprocessingMLPackage;

	import opennlp.tools.similarity.apps.ContentGeneratorSupport;
	import opennlp.tools.similarity.apps.Fragment;
	import opennlp.tools.similarity.apps.HitBase;



	public class WordDocBuilderSingleImageSearchCall extends WordDocBuilder{

	public String buildWordDoc(List<HitBase> content, String title){

	String outputDocFinename = absPath+"/written/"+ title.replace(' ','_').replace('\"', ' ').trim()+ ".docx";

	WordprocessingMLPackage wordMLPackage;
	List<String> imageURLs = getAllImageSearchResults(title);
	int count=0;
	try {
	wordMLPackage = WordprocessingMLPackage.createPackage();
	wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Title", title.toUpperCase());
	for(HitBase para: content){
	if (para.getFragments()==null \|\| para.getFragments().size()<1) // no found content in this hit
	continue;
	try {
	if (!para.getTitle().endsWith("..") /\|\| StringUtils.isAlphanumeric(para.getTitle())/){
	String sectTitle = ContentGeneratorSupport.getPortionOfTitleWithoutDelimiters(para.getTitle());
	wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Subtitle",
	sectTitle);
	}
	String paraText = para.getFragments().toString().replace("[", "").replace("]", "").replace(" \| ", "")
	.replace(".,", ".").replace(".\"", "\"").replace(". .", ".")
	.replace(",.", ".");
	wordMLPackage.getMainDocumentPart().addParagraphOfText(paraText);

	try {
	addImageByImageURLToPackage(count, wordMLPackage, imageURLs);
	} catch (Exception e) {
	// TODO Auto-generated catch block
	e.printStackTrace();
	}
	} catch (Exception e) {
	// TODO Auto-generated catch block
	e.printStackTrace();
	}
	count++;
	}
	// now add URLs
	wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Subtitle", "REFERENCES");
	for(HitBase para: content){
	if (para.getFragments()==null \|\| para.getFragments().size()<1) // no found content in this hit
	continue;
	try {
	wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Subtitle",
	para.getTitle());
	String paraText = para.getUrl();
	wordMLPackage.getMainDocumentPart().addParagraphOfText(paraText);


	} catch (Exception e) {
	// TODO Auto-generated catch block
	e.printStackTrace();
	}
	}


	wordMLPackage.save(new File(outputDocFinename));
	System.out.println("Finished creating docx ="+outputDocFinename);
	//TODO pdf export
	/*
	FOSettings foSettings = Docx4J.createFOSettings();
	foSettings.setWmlPackage(wordMLPackage);
	OutputStream os = new java.io.FileOutputStream(outputDocFinename.replace(".docx", ".pdf"));
	Docx4J.toFO(foSettings, os, Docx4J.FLAG_NONE);
	System.out.println("Finished creating docx's PDF ="+outputDocFinename);
	*/

	} catch (Exception e) {
	// TODO Auto-generated catch block
	e.printStackTrace();
	}

	return outputDocFinename;
	}

	protected void addImageByImageURLToPackage(int count,
	WordprocessingMLPackage wordMLPackage,
	List<String> imageURLs) {
	if (count>imageURLs.size()-1)
	return;

	String url = imageURLs.get(count);
	String destinationFile = url.replace("http://", "").replace("/", "_");
	saveImageFromTheWeb(url, absPath+IMG_REL_PATH+destinationFile);
	File file = new File(absPath+IMG_REL_PATH+destinationFile);
	try {
	byte[] bytes = convertImageToByteArray(file);
	addImageToPackage(wordMLPackage, bytes);
	} catch (FileNotFoundException e) {
	// TODO Auto-generated catch block
	e.printStackTrace();
	} catch (IOException e) {
	// TODO Auto-generated catch block
	e.printStackTrace();
	} catch (Exception e) {
	// TODO Auto-generated catch block
	e.printStackTrace();
	}

	}

	protected List<String> getAllImageSearchResults(String title) {
	List<String> imageURLs = new ArrayList<String>();
	AzureSearchResultSet<AzureSearchImageResult> res = imageSearcher.runImageSearch(title);
	for(AzureSearchImageResult imResult: res){
	imageURLs.add(imResult.getMediaUrl());
	}
	return imageURLs;

	}


	public static void main(String[] args){
	WordDocBuilderSingleImageSearchCall b = new WordDocBuilderSingleImageSearchCall();
	List<HitBase> content = new ArrayList<HitBase>();
	for(int i = 0; i<10; i++){
	HitBase h = new HitBase();
	h.setTitle("albert einstein "+i);
	List<Fragment> frs = new ArrayList<Fragment>();
	frs.add(new Fragment(" content "+i, 0));
	h.setFragments(frs);
	content.add(h);
	}

	b.buildWordDoc(content, "albert einstein");
	}
	}