/* | |
* Licensed to the Apache Software Foundation (ASF) under one or more | |
* contributor license agreements. See the NOTICE file distributed with | |
* this work for additional information regarding copyright ownership. | |
* The ASF licenses this file to You under the Apache License, Version 2.0 | |
* (the "License"); you may not use this file except in compliance with | |
* the License. You may obtain a copy of the License at | |
* | |
* http://www.apache.org/licenses/LICENSE-2.0 | |
* | |
* Unless required by applicable law or agreed to in writing, software | |
* distributed under the License is distributed on an "AS IS" BASIS, | |
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
* See the License for the specific language governing permissions and | |
* limitations under the License. | |
*/ | |
package opennlp.tools.similarity.apps.solr; | |
import java.io.File; | |
import java.io.FileInputStream; | |
import java.io.FileNotFoundException; | |
import java.io.FileOutputStream; | |
import java.io.IOException; | |
import java.io.InputStream; | |
import java.io.OutputStream; | |
import java.net.MalformedURLException; | |
import java.net.URL; | |
import java.util.ArrayList; | |
import java.util.List; | |
import net.billylieurance.azuresearch.AzureSearchImageResult; | |
import net.billylieurance.azuresearch.AzureSearchResultSet; | |
import net.billylieurance.azuresearch.AzureSearchWebResult; | |
import org.docx4j.dml.wordprocessingDrawing.Inline; | |
import org.docx4j.openpackaging.exceptions.Docx4JException; | |
import org.docx4j.openpackaging.exceptions.InvalidFormatException; | |
import org.docx4j.openpackaging.packages.WordprocessingMLPackage; | |
import org.docx4j.openpackaging.parts.WordprocessingML.BinaryPartAbstractImage; | |
import org.docx4j.wml.CTFootnotes; | |
import org.docx4j.wml.CTFtnEdn; | |
import org.docx4j.wml.Drawing; | |
import org.docx4j.wml.P; | |
import org.docx4j.wml.R; | |
import org.docx4j.wml.STFtnEdn; | |
import opennlp.tools.similarity.apps.BingQueryRunner; | |
import opennlp.tools.similarity.apps.Fragment; | |
import opennlp.tools.similarity.apps.HitBase; | |
public class WordDocBuilder{ | |
protected static final String IMG_REL_PATH = "images/"; | |
protected BingQueryRunner imageSearcher = new BingQueryRunner(); | |
protected String absPath = null; | |
public WordDocBuilder(){ | |
absPath = new File(".").getAbsolutePath(); | |
absPath = absPath.substring(0, absPath.length()-1); | |
} | |
public String buildWordDoc(List<HitBase> content, String title){ | |
String outputDocFinename = absPath+"/written/"+ title.replace(' ','_').replace('\"', ' ').trim()+ ".docx"; | |
WordprocessingMLPackage wordMLPackage; | |
try { | |
wordMLPackage = WordprocessingMLPackage.createPackage(); | |
wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Title", title); | |
for(HitBase para: content){ | |
wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Subtitle", | |
para.getTitle()); | |
String paraText = para.getFragments().toString().replace("[", "").replace("]", "").replace(" | ", "") | |
.replace(".,", ".").replace(".\"", "\"").replace(". .", ".") | |
.replace(",.", "."); | |
wordMLPackage.getMainDocumentPart().addParagraphOfText(paraText); | |
addImageByImageTitleToPackage(wordMLPackage, para.getTitle()); | |
} | |
//File file = new File("C:/ma/personal/argCamp.png"); | |
//byte[] bytes = convertImageToByteArray(file); | |
//addImageToPackage(wordMLPackage, bytes); | |
wordMLPackage.save(new File(outputDocFinename)); | |
} catch (Exception e) { | |
// TODO Auto-generated catch block | |
e.printStackTrace(); | |
} | |
return outputDocFinename; | |
} | |
private void addImageByImageTitleToPackage( | |
WordprocessingMLPackage wordMLPackage, String title) { | |
AzureSearchResultSet<AzureSearchImageResult> res = imageSearcher.runImageSearch(title); | |
for (AzureSearchImageResult anr : res){ | |
String url = anr.getMediaUrl(); | |
addImageByURLToPackage( wordMLPackage, url); | |
return; | |
} | |
} | |
private void addImageByURLToPackage(WordprocessingMLPackage wordMLPackage, | |
String url){ | |
String destinationFile = url.replace("http://", "").replace("/", "_"); | |
saveImageFromTheWeb(url, absPath+IMG_REL_PATH+destinationFile); | |
File file = new File(absPath+destinationFile); | |
try { | |
byte[] bytes = convertImageToByteArray(file); | |
addImageToPackage(wordMLPackage, bytes); | |
} catch (FileNotFoundException e) { | |
// TODO Auto-generated catch block | |
e.printStackTrace(); | |
} catch (IOException e) { | |
// TODO Auto-generated catch block | |
e.printStackTrace(); | |
} catch (Exception e) { | |
// TODO Auto-generated catch block | |
e.printStackTrace(); | |
} | |
} | |
/** | |
* Docx4j contains a utility method to create an image part from an array of | |
* bytes and then adds it to the given package. In order to be able to add this | |
* image to a paragraph, we have to convert it into an inline object. For this | |
* there is also a method, which takes a filename hint, an alt-text, two ids | |
* and an indication on whether it should be embedded or linked to. | |
* One id is for the drawing object non-visual properties of the document, and | |
* the second id is for the non visual drawing properties of the picture itself. | |
* Finally we add this inline object to the paragraph and the paragraph to the | |
* main document of the package. | |
* | |
* @param wordMLPackage The package we want to add the image to | |
* @param bytes The bytes of the image | |
* @throws Exception Sadly the createImageInline method throws an Exception | |
* (and not a more specific exception type) | |
* | |
* | |
*/ | |
protected static void addImageToPackage(WordprocessingMLPackage wordMLPackage, | |
byte[] bytes) throws Exception { | |
BinaryPartAbstractImage imagePart = | |
BinaryPartAbstractImage.createImagePart(wordMLPackage, bytes); | |
int docPrId = 1; | |
int cNvPrId = 2; | |
Inline inline = imagePart.createImageInline("Filename hint", | |
"Alternative text", docPrId, cNvPrId, false); | |
P paragraph = addInlineImageToParagraph(inline); | |
wordMLPackage.getMainDocumentPart().addObject(paragraph); | |
} | |
/** | |
* We create an object factory and use it to create a paragraph and a run. | |
* Then we add the run to the paragraph. Next we create a drawing and | |
* add it to the run. Finally we add the inline object to the drawing and | |
* return the paragraph. | |
* | |
* @param inline The inline object containing the image. | |
* @return the paragraph containing the image | |
*/ | |
private static P addInlineImageToParagraph(Inline inline) { | |
// Now add the in-line image to a paragraph | |
org.docx4j.wml.ObjectFactory factory = new org.docx4j.wml.ObjectFactory(); | |
P paragraph = factory.createP(); | |
R run = factory.createR(); | |
paragraph.getContent().add(run); | |
Drawing drawing = factory.createDrawing(); | |
run.getContent().add(drawing); | |
drawing.getAnchorOrInline().add(inline); | |
return paragraph; | |
} | |
private static CTFootnotes createFootnote(P paragraph){ | |
org.docx4j.wml.ObjectFactory factory = new org.docx4j.wml.ObjectFactory(); | |
CTFootnotes fn = factory.createCTFootnotes(); | |
fn.setParent(paragraph); | |
//STFtnEdn sTFtnEdn = factory.createSTFtnEdn(); | |
CTFtnEdn fe = factory.createCTFtnEdn(); | |
fe.setParent(paragraph); | |
return fn; | |
} | |
/** | |
* Convert the image from the file into an array of bytes. | |
* | |
* @param file the image file to be converted | |
* @return the byte array containing the bytes from the image | |
* @throws FileNotFoundException | |
* @throws IOException | |
*/ | |
protected static byte[] convertImageToByteArray(File file) | |
throws FileNotFoundException, IOException { | |
InputStream is = new FileInputStream(file ); | |
long length = file.length(); | |
// You cannot create an array using a long, it needs to be an int. | |
if (length > Integer.MAX_VALUE) { | |
System.out.println("File too large!!"); | |
} | |
byte[] bytes = new byte[(int)length]; | |
int offset = 0; | |
int numRead = 0; | |
while (offset < bytes.length && (numRead=is.read(bytes, offset, bytes.length-offset)) >= 0) { | |
offset += numRead; | |
} | |
// Ensure all the bytes have been read | |
if (offset < bytes.length) { | |
System.out.println("Could not completely read file " | |
+file.getName()); | |
} | |
is.close(); | |
return bytes; | |
} | |
public static void saveImageFromTheWeb(String imageUrl, String destinationFile) { | |
try { | |
URL url = new URL(imageUrl); | |
InputStream is = url.openStream(); | |
if (!new File(destinationFile).exists()) { | |
new File(destinationFile).createNewFile(); | |
} | |
OutputStream os = new FileOutputStream(destinationFile); | |
byte[] b = new byte[2048]; | |
int length; | |
while ((length = is.read(b)) != -1) { | |
os.write(b, 0, length); | |
} | |
is.close(); | |
os.close(); | |
} catch (MalformedURLException e) { | |
// TODO Auto-generated catch block | |
e.printStackTrace(); | |
} catch (FileNotFoundException e) { | |
// TODO Auto-generated catch block | |
e.printStackTrace(); | |
} catch (IOException e) { | |
// TODO Auto-generated catch block | |
e.printStackTrace(); | |
} | |
} | |
public static void main(String[] args){ | |
WordDocBuilder b = new WordDocBuilder(); | |
List<HitBase> content = new ArrayList<HitBase>(); | |
for(int i = 0; i<10; i++){ | |
HitBase h = new HitBase(); | |
h.setTitle("albert einstein "+i); | |
List<Fragment> frs = new ArrayList<Fragment>(); | |
frs.add(new Fragment(" content "+i, 0)); | |
h.setFragments(frs); | |
content.add(h); | |
} | |
b.buildWordDoc(content, "mytitle"); | |
} | |
} |