opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/WordDocBuilder.java - opennlp-sandbox - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License. You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package opennlp.tools.similarity.apps.solr;


 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileNotFoundException;
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
 import java.net.URL;
 import java.util.ArrayList;
 import java.util.List;

 import net.billylieurance.azuresearch.AzureSearchImageResult;
 import net.billylieurance.azuresearch.AzureSearchResultSet;

 import org.docx4j.dml.wordprocessingDrawing.Inline;
 import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
 import org.docx4j.openpackaging.parts.WordprocessingML.BinaryPartAbstractImage;
 import org.docx4j.wml.CTFootnotes;
 import org.docx4j.wml.CTFtnEdn;
 import org.docx4j.wml.Drawing;
 import org.docx4j.wml.P;
 import org.docx4j.wml.R;

 import opennlp.tools.similarity.apps.BingQueryRunner;
 import opennlp.tools.similarity.apps.Fragment;
 import opennlp.tools.similarity.apps.HitBase;

 public class WordDocBuilder{
 	/** Directory name, appended to {@link #absPath}, under which downloaded images are saved. */
 	protected static final String IMG_REL_PATH = "images/";
 	/** Image search client; queried with a paragraph title to find an illustration for it. */
 	protected final BingQueryRunner imageSearcher = new BingQueryRunner();
 	/** Absolute path of the working directory, set by the constructor; prefix for output and image paths. */
 	protected String absPath;

 	public WordDocBuilder(){
 	absPath = new File(".").getAbsolutePath();
 	absPath = absPath.substring(0, absPath.length()-1);
 	}

 	/**
 	 * Writes the given hits into a new .docx file: the title as a "Title" paragraph, then for
 	 * each hit a "Subtitle" paragraph with the hit title, a paragraph with its cleaned-up
 	 * fragment text and, when one can be found, an image looked up by the hit title.
 	 *
 	 * @param content hits to render, in order
 	 * @param title   document title; also used for the file name (spaces become underscores,
 	 *                double quotes become spaces, then the result is trimmed)
 	 * @return path of the target file under {@code written/}; it is returned even when building
 	 *         or saving failed, in which case the stack trace has been printed
 	 */
 	public String buildWordDoc(List<HitBase> content, String title){

 		String outputDocFilename =  absPath+"/written/"+ title.replace(' ','_').replace('\"', ' ').trim()+ ".docx";
 		File outputDoc = new File(outputDocFilename);

 		try {
 			// Create "written/" up front; saving into a missing directory would fail and the
 			// failure would only show up as a printed stack trace.
 			File outputDir = outputDoc.getParentFile();
 			if (outputDir != null && !outputDir.isDirectory() && !outputDir.mkdirs()) {
 				throw new IOException("Cannot create output directory " + outputDir);
 			}

 			WordprocessingMLPackage wordMLPackage = WordprocessingMLPackage.createPackage();
 			wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Title", title);
 			for(HitBase para: content){

 				wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Subtitle",
 						para.getTitle());
 				// Fragments are rendered via List.toString(); strip the list brackets, the " | "
 				// separators and the doubled punctuation that joining the fragments leaves behind.
 				String paraText = para.getFragments().toString().replace("[", "").replace("]", "").replace(" | ", "")
 						.replace(".,", ".").replace(".\"", "\"").replace(". .", ".")
 						.replace(",.", ".");
 				wordMLPackage.getMainDocumentPart().addParagraphOfText(paraText);

 				try {
 					addImageByImageTitleToPackage(wordMLPackage, para.getTitle());
 				} catch (RuntimeException e) {
 					// The image is optional; a failed lookup or download must not abort the
 					// whole document.
 					e.printStackTrace();
 				}
 			}

 			wordMLPackage.save(outputDoc);
 		} catch (Exception e) {
 			e.printStackTrace();
 		}

 		return outputDocFilename;
 	}

 	/**
 	 * Runs an image search for {@code title} and appends the first result that carries a
 	 * media URL to the document. Does nothing when the search yields no usable result.
 	 *
 	 * @param wordMLPackage the document to append the image to
 	 * @param title         the search query
 	 */
 	private void addImageByImageTitleToPackage(
 			WordprocessingMLPackage wordMLPackage, String title) {
 		AzureSearchResultSet<AzureSearchImageResult> res = imageSearcher.runImageSearch(title);
 		if (res == null) {
 			// No result set at all (e.g. the query failed): nothing to add.
 			return;
 		}
 		for (AzureSearchImageResult anr : res){
 			String url = (anr == null) ? null : anr.getMediaUrl();
 			if (url == null || url.isEmpty()) {
 				continue;
 			}
 			// Only one image per title is wanted.
 			addImageByURLToPackage( wordMLPackage, url);
 			return;
 		}

 	}

 	/**
 	 * Downloads the image at {@code url} into the images directory below {@code absPath} and
 	 * appends it to the document. Any failure is printed and the image is skipped.
 	 *
 	 * @param wordMLPackage the document to append the image to
 	 * @param url           location of the image
 	 */
 	private void addImageByURLToPackage(WordprocessingMLPackage wordMLPackage,
              String url){
 		// Derive a file name from the URL: drop the scheme (http, https, ...) and replace every
 		// character that is not safe in a file name ('/', ':', '?', '&', ...) with '_'.
 		String destinationFile = url.replaceFirst("^[A-Za-z][A-Za-z0-9+.-]*://", "")
 				.replaceAll("[^A-Za-z0-9._-]", "_");
 		// The same path must be used for writing and for reading the image back.
 		File file = new File(absPath+IMG_REL_PATH+destinationFile);
 		try {
 			File imageDir = file.getParentFile();
 			if (imageDir != null && !imageDir.isDirectory() && !imageDir.mkdirs()) {
 				throw new IOException("Cannot create image directory " + imageDir);
 			}
 			saveImageFromTheWeb(url, file.getPath());
 			byte[] bytes = convertImageToByteArray(file);
 			addImageToPackage(wordMLPackage, bytes);
 		} catch (Exception e) {
 			e.printStackTrace();
 		}
 	}

 	/**
 	 * Embeds an image into the document as a new paragraph.
 	 * <p>
 	 * The bytes are first turned into an image part of the package. That part is then
 	 * converted into an inline object (filename hint, alt text, two ids, and a flag saying
 	 * whether the image is linked instead of embedded), the inline is wrapped in a paragraph
 	 * and the paragraph is appended to the main document part.
 	 * <p>
 	 * NOTE(review): both ids are fixed constants, so every image added through this method
 	 * receives the same pair; confirm that this is acceptable to the document consumers.
 	 *
 	 * @param wordMLPackage the package the image is added to
 	 * @param bytes         the raw image bytes
 	 * @throws Exception    declared because createImagePart/createImageInline throw the
 	 *                      unspecific {@code Exception} type
 	 */
 	protected static void addImageToPackage(WordprocessingMLPackage wordMLPackage,
 			byte[] bytes) throws Exception {
 		// Id for the drawing object non-visual properties, and id for the picture's own
 		// non-visual drawing properties.
 		final int docPrId = 1;
 		final int cNvPrId = 2;

 		BinaryPartAbstractImage imagePart =
 				BinaryPartAbstractImage.createImagePart(wordMLPackage, bytes);
 		Inline inline = imagePart.createImageInline(
 				"Filename hint", "Alternative text", docPrId, cNvPrId, false);

 		wordMLPackage.getMainDocumentPart().addObject(addInlineImageToParagraph(inline));
 	}

     /**
      * Wraps an inline image in the paragraph &gt; run &gt; drawing structure.
      * All three elements come from one object factory: the inline is placed in the
      * drawing, the drawing in the run and the run in the returned paragraph.
      *
      * @param   inline the inline object containing the image
      * @return  the paragraph containing the image
      */
     private static P addInlineImageToParagraph(Inline inline) {
         org.docx4j.wml.ObjectFactory wmlFactory = new org.docx4j.wml.ObjectFactory();

         // Build from the inside out: drawing holds the inline ...
         Drawing imageDrawing = wmlFactory.createDrawing();
         imageDrawing.getAnchorOrInline().add(inline);

         // ... the run holds the drawing ...
         R imageRun = wmlFactory.createR();
         imageRun.getContent().add(imageDrawing);

         // ... and the paragraph holds the run.
         P imageParagraph = wmlFactory.createP();
         imageParagraph.getContent().add(imageRun);
         return imageParagraph;
     }

     /**
      * Creates a footnotes container whose parent is the given paragraph.
      * A footnote/endnote element is created and parented to the paragraph as well, but it
      * is not attached to the returned container.
      *
      * @param paragraph the paragraph set as parent of the created objects
      * @return the new footnotes container
      */
     private static CTFootnotes createFootnote(P paragraph){
         org.docx4j.wml.ObjectFactory wmlFactory = new org.docx4j.wml.ObjectFactory();

         CTFootnotes footnotes = wmlFactory.createCTFootnotes();
         footnotes.setParent(paragraph);

         CTFtnEdn note = wmlFactory.createCTFtnEdn();
         note.setParent(paragraph);

         return footnotes;
     }

     /**
      * Reads the whole image file into a byte array.
      *
      * @param file  the image file to be read
      * @return      a byte array holding exactly the contents of the file
      * @throws FileNotFoundException if the file does not exist or cannot be opened
      * @throws IOException           if the file is larger than {@code Integer.MAX_VALUE}
      *                               bytes, ends before the expected number of bytes has
      *                               been read, or another read error occurs
      */
     protected static byte[] convertImageToByteArray(File file) throws FileNotFoundException, IOException {
         try (InputStream is = new FileInputStream(file)) {
             long length = file.length();
             // An array cannot be sized with a long; casting an oversize length to int would
             // produce a wrong (possibly negative) size, so refuse such files outright.
             if (length > Integer.MAX_VALUE) {
                 throw new IOException("File too large: " + file.getName()
                         + " (" + length + " bytes)");
             }
             byte[] bytes = new byte[(int) length];
             int offset = 0;
             while (offset < bytes.length) {
                 int numRead = is.read(bytes, offset, bytes.length - offset);
                 if (numRead < 0) {
                     // End of stream before the array was filled: returning the buffer would
                     // hand back an image padded with zero bytes.
                     throw new IOException("Could not completely read file " + file.getName());
                 }
                 offset += numRead;
             }
             return bytes;
         }
     }

     /**
      * Downloads the resource at {@code imageUrl} and writes it to {@code destinationFile},
      * creating missing parent directories first. An existing file is overwritten.
      * Errors while opening the URL or copying the data are printed and otherwise ignored.
      *
      * @param imageUrl        URL of the image to fetch
      * @param destinationFile path of the file to write
      * @throws RuntimeException if the parent directory or the destination file cannot be
      *                          created
      */
     public static void saveImageFromTheWeb(String imageUrl, String destinationFile) {
         File f = new File(destinationFile);
         // createNewFile() fails when the parent directory is missing, so create it first.
         File parent = f.getAbsoluteFile().getParentFile();
         if (parent != null && !parent.isDirectory() && !parent.mkdirs() && !parent.isDirectory()) {
             throw new RuntimeException(new IOException("Cannot create directory " + parent));
         }
         if (!f.exists()) {
             try {
                 f.createNewFile();
             } catch (IOException e) {
                 throw new RuntimeException(e);
             }
         }
         try (InputStream is = new URL(imageUrl).openStream();
              OutputStream os = new FileOutputStream(f)) {

             byte[] b = new byte[2048];
             int length;

             while ((length = is.read(b)) != -1) {
                 os.write(b, 0, length);
             }

         } catch (IOException e) {
             // Best effort: a failed download is reported but does not stop the caller.
             e.printStackTrace();
         }
     }

     /**
      * Demo entry point: builds ten sample hits ("albert einstein 0" .. "albert einstein 9"),
      * each with a single fragment, and writes them into a document titled "mytitle".
      *
      * @param args ignored
      */
     public static void main(String[] args){
         WordDocBuilder builder = new WordDocBuilder();
         List<HitBase> hits = new ArrayList<>();

         int idx = 0;
         while (idx < 10) {
             List<Fragment> fragments = new ArrayList<>();
             fragments.add(new Fragment(" content "+idx, 0));

             HitBase hit = new HitBase();
             hit.setTitle("albert einstein "+idx);
             hit.setFragments(fragments);
             hits.add(hit);

             idx++;
         }

         builder.buildWordDoc(hits, "mytitle");
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package opennlp.tools.similarity.apps.solr;


	import java.io.File;
	import java.io.FileInputStream;
	import java.io.FileNotFoundException;
	import java.io.FileOutputStream;
	import java.io.IOException;
	import java.io.InputStream;
	import java.io.OutputStream;
	import java.net.URL;
	import java.util.ArrayList;
	import java.util.List;

	import net.billylieurance.azuresearch.AzureSearchImageResult;
	import net.billylieurance.azuresearch.AzureSearchResultSet;

	import org.docx4j.dml.wordprocessingDrawing.Inline;
	import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
	import org.docx4j.openpackaging.parts.WordprocessingML.BinaryPartAbstractImage;
	import org.docx4j.wml.CTFootnotes;
	import org.docx4j.wml.CTFtnEdn;
	import org.docx4j.wml.Drawing;
	import org.docx4j.wml.P;
	import org.docx4j.wml.R;

	import opennlp.tools.similarity.apps.BingQueryRunner;
	import opennlp.tools.similarity.apps.Fragment;
	import opennlp.tools.similarity.apps.HitBase;

	public class WordDocBuilder{
	/** Directory name, appended to {@link #absPath}, under which downloaded images are saved. */
	protected static final String IMG_REL_PATH = "images/";
	/** Image search client; queried with a paragraph title to find an illustration for it. */
	protected final BingQueryRunner imageSearcher = new BingQueryRunner();
	/** Absolute path of the working directory, set by the constructor; prefix for output and image paths. */
	protected String absPath;

	public WordDocBuilder(){
	absPath = new File(".").getAbsolutePath();
	absPath = absPath.substring(0, absPath.length()-1);
	}

	public String buildWordDoc(List<HitBase> content, String title){

	String outputDocFilename = absPath+"/written/"+ title.replace(' ','_').replace('\"', ' ').trim()+ ".docx";

	WordprocessingMLPackage wordMLPackage;
	try {
	wordMLPackage = WordprocessingMLPackage.createPackage();
	wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Title", title);
	for(HitBase para: content){

	wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Subtitle",
	para.getTitle());
	String paraText = para.getFragments().toString().replace("[", "").replace("]", "").replace(" \| ", "")
	.replace(".,", ".").replace(".\"", "\"").replace(". .", ".")
	.replace(",.", ".");
	wordMLPackage.getMainDocumentPart().addParagraphOfText(paraText);

	addImageByImageTitleToPackage(wordMLPackage, para.getTitle());
	}

	//File file = new File("C:/ma/personal/argCamp.png");
	//byte[] bytes = convertImageToByteArray(file);
	//addImageToPackage(wordMLPackage, bytes);

	wordMLPackage.save(new File(outputDocFilename));
	} catch (Exception e) {
	e.printStackTrace();
	}

	return outputDocFilename;
	}

	/**
	 * Runs an image search for {@code title} and appends the first result that carries a
	 * media URL to the document. Does nothing when the search yields no usable result.
	 *
	 * @param wordMLPackage the document to append the image to
	 * @param title         the search query
	 */
	private void addImageByImageTitleToPackage(
			WordprocessingMLPackage wordMLPackage, String title) {
		AzureSearchResultSet<AzureSearchImageResult> res = imageSearcher.runImageSearch(title);
		if (res == null) {
			// No result set at all (e.g. the query failed): nothing to add.
			return;
		}
		for (AzureSearchImageResult anr : res){
			String url = (anr == null) ? null : anr.getMediaUrl();
			if (url == null || url.isEmpty()) {
				continue;
			}
			// Only one image per title is wanted.
			addImageByURLToPackage( wordMLPackage, url);
			return;
		}

	}

	/**
	 * Downloads the image at {@code url} into the images directory below {@code absPath} and
	 * appends it to the document. Any failure is printed and the image is skipped.
	 *
	 * @param wordMLPackage the document to append the image to
	 * @param url           location of the image
	 */
	private void addImageByURLToPackage(WordprocessingMLPackage wordMLPackage,
			String url){
		// Derive a file name from the URL: drop the scheme (http, https, ...) and replace every
		// character that is not safe in a file name ('/', ':', '?', '&', ...) with '_'.
		String destinationFile = url.replaceFirst("^[A-Za-z][A-Za-z0-9+.-]*://", "")
				.replaceAll("[^A-Za-z0-9._-]", "_");
		// The same path must be used for writing and for reading the image back.
		File file = new File(absPath+IMG_REL_PATH+destinationFile);
		try {
			File imageDir = file.getParentFile();
			if (imageDir != null && !imageDir.isDirectory() && !imageDir.mkdirs()) {
				throw new IOException("Cannot create image directory " + imageDir);
			}
			saveImageFromTheWeb(url, file.getPath());
			byte[] bytes = convertImageToByteArray(file);
			addImageToPackage(wordMLPackage, bytes);
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Embeds an image into the document as a new paragraph.
	 * <p>
	 * The bytes are first turned into an image part of the package. That part is then
	 * converted into an inline object (filename hint, alt text, two ids, and a flag saying
	 * whether the image is linked instead of embedded), the inline is wrapped in a paragraph
	 * and the paragraph is appended to the main document part.
	 * <p>
	 * NOTE(review): both ids are fixed constants, so every image added through this method
	 * receives the same pair; confirm that this is acceptable to the document consumers.
	 *
	 * @param wordMLPackage the package the image is added to
	 * @param bytes         the raw image bytes
	 * @throws Exception    declared because createImagePart/createImageInline throw the
	 *                      unspecific {@code Exception} type
	 */
	protected static void addImageToPackage(WordprocessingMLPackage wordMLPackage,
			byte[] bytes) throws Exception {
		// Id for the drawing object non-visual properties, and id for the picture's own
		// non-visual drawing properties.
		final int docPrId = 1;
		final int cNvPrId = 2;

		BinaryPartAbstractImage imagePart =
				BinaryPartAbstractImage.createImagePart(wordMLPackage, bytes);
		Inline inline = imagePart.createImageInline(
				"Filename hint", "Alternative text", docPrId, cNvPrId, false);

		wordMLPackage.getMainDocumentPart().addObject(addInlineImageToParagraph(inline));
	}

	/**
	 * Wraps an inline image in the paragraph &gt; run &gt; drawing structure.
	 * All three elements come from one object factory: the inline is placed in the
	 * drawing, the drawing in the run and the run in the returned paragraph.
	 *
	 * @param inline the inline object containing the image
	 * @return the paragraph containing the image
	 */
	private static P addInlineImageToParagraph(Inline inline) {
		org.docx4j.wml.ObjectFactory wmlFactory = new org.docx4j.wml.ObjectFactory();

		// Build from the inside out: drawing holds the inline ...
		Drawing imageDrawing = wmlFactory.createDrawing();
		imageDrawing.getAnchorOrInline().add(inline);

		// ... the run holds the drawing ...
		R imageRun = wmlFactory.createR();
		imageRun.getContent().add(imageDrawing);

		// ... and the paragraph holds the run.
		P imageParagraph = wmlFactory.createP();
		imageParagraph.getContent().add(imageRun);
		return imageParagraph;
	}

	/**
	 * Creates a footnotes container whose parent is the given paragraph.
	 * A footnote/endnote element is created and parented to the paragraph as well, but it
	 * is not attached to the returned container.
	 *
	 * @param paragraph the paragraph set as parent of the created objects
	 * @return the new footnotes container
	 */
	private static CTFootnotes createFootnote(P paragraph){
		org.docx4j.wml.ObjectFactory wmlFactory = new org.docx4j.wml.ObjectFactory();

		CTFootnotes footnotes = wmlFactory.createCTFootnotes();
		footnotes.setParent(paragraph);

		CTFtnEdn note = wmlFactory.createCTFtnEdn();
		note.setParent(paragraph);

		return footnotes;
	}

	/**
	 * Reads the whole image file into a byte array.
	 *
	 * @param file the image file to be read
	 * @return a byte array holding exactly the contents of the file
	 * @throws FileNotFoundException if the file does not exist or cannot be opened
	 * @throws IOException if the file is larger than {@code Integer.MAX_VALUE} bytes, ends
	 *         before the expected number of bytes has been read, or another read error occurs
	 */
	protected static byte[] convertImageToByteArray(File file) throws FileNotFoundException, IOException {
		try (InputStream is = new FileInputStream(file)) {
			long length = file.length();
			// An array cannot be sized with a long; casting an oversize length to int would
			// produce a wrong (possibly negative) size, so refuse such files outright.
			if (length > Integer.MAX_VALUE) {
				throw new IOException("File too large: " + file.getName()
						+ " (" + length + " bytes)");
			}
			byte[] bytes = new byte[(int) length];
			int offset = 0;
			while (offset < bytes.length) {
				int numRead = is.read(bytes, offset, bytes.length - offset);
				if (numRead < 0) {
					// End of stream before the array was filled: returning the buffer would
					// hand back an image padded with zero bytes.
					throw new IOException("Could not completely read file " + file.getName());
				}
				offset += numRead;
			}
			return bytes;
		}
	}

	/**
	 * Downloads the resource at {@code imageUrl} and writes it to {@code destinationFile},
	 * creating missing parent directories first. An existing file is overwritten.
	 * Errors while opening the URL or copying the data are printed and otherwise ignored.
	 *
	 * @param imageUrl        URL of the image to fetch
	 * @param destinationFile path of the file to write
	 * @throws RuntimeException if the parent directory or the destination file cannot be
	 *                          created
	 */
	public static void saveImageFromTheWeb(String imageUrl, String destinationFile) {
		File f = new File(destinationFile);
		// createNewFile() fails when the parent directory is missing, so create it first.
		File parent = f.getAbsoluteFile().getParentFile();
		if (parent != null && !parent.isDirectory() && !parent.mkdirs() && !parent.isDirectory()) {
			throw new RuntimeException(new IOException("Cannot create directory " + parent));
		}
		if (!f.exists()) {
			try {
				f.createNewFile();
			} catch (IOException e) {
				throw new RuntimeException(e);
			}
		}
		try (InputStream is = new URL(imageUrl).openStream();
				OutputStream os = new FileOutputStream(f)) {

			byte[] b = new byte[2048];
			int length;

			while ((length = is.read(b)) != -1) {
				os.write(b, 0, length);
			}

		} catch (IOException e) {
			// Best effort: a failed download is reported but does not stop the caller.
			e.printStackTrace();
		}
	}

	/**
	 * Demo entry point: builds ten sample hits ("albert einstein 0" .. "albert einstein 9"),
	 * each with a single fragment, and writes them into a document titled "mytitle".
	 *
	 * @param args ignored
	 */
	public static void main(String[] args){
		WordDocBuilder builder = new WordDocBuilder();
		List<HitBase> hits = new ArrayList<>();

		int idx = 0;
		while (idx < 10) {
			List<Fragment> fragments = new ArrayList<>();
			fragments.add(new Fragment(" content "+idx, 0));

			HitBase hit = new HitBase();
			hit.setTitle("albert einstein "+idx);
			hit.setFragments(fragments);
			hits.add(hit);

			idx++;
		}

		builder.buildWordDoc(hits, "mytitle");
	}
	}