blob: 461dcd08d30f5f1714290d128a4a64ef273612e1 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.similarity.apps.solr;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import net.billylieurance.azuresearch.AzureSearchImageResult;
import net.billylieurance.azuresearch.AzureSearchResultSet;
import org.docx4j.dml.wordprocessingDrawing.Inline;
import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
import org.docx4j.openpackaging.parts.WordprocessingML.BinaryPartAbstractImage;
import org.docx4j.wml.CTFootnotes;
import org.docx4j.wml.CTFtnEdn;
import org.docx4j.wml.Drawing;
import org.docx4j.wml.P;
import org.docx4j.wml.R;
import opennlp.tools.similarity.apps.BingQueryRunner;
import opennlp.tools.similarity.apps.Fragment;
import opennlp.tools.similarity.apps.HitBase;
/**
 * Builds a Word (.docx) document out of a list of {@link HitBase} entries.
 * <p>
 * Every entry contributes a sub-title (the hit title), one paragraph of text
 * assembled from the hit's fragments and, when the image search returns
 * something usable, one inline picture found by searching for the hit title.
 * Documents are written below {@code <working dir>/written/}; downloaded
 * pictures are stored below {@code <working dir>/images/}.
 */
public class WordDocBuilder {

  /** Directory (relative to {@link #absPath}) where downloaded images are stored. */
  protected static final String IMG_REL_PATH = "images/";

  /**
   * Source of the drawing ids handed to docx4j. The ids are expected to be
   * unique within a document, so they are never reused across images.
   */
  private static final java.util.concurrent.atomic.AtomicInteger NEXT_DRAWING_ID =
      new java.util.concurrent.atomic.AtomicInteger(1);

  /** Runs the image search used to illustrate each section. */
  protected final BingQueryRunner imageSearcher = new BingQueryRunner();

  /** Absolute path of the working directory, including the trailing separator. */
  protected String absPath;

  /**
   * Creates a builder rooted at the current working directory.
   */
  public WordDocBuilder() {
    // getAbsolutePath() of "." ends with "<separator>."; dropping the final
    // character leaves the directory path with its trailing separator.
    absPath = new File(".").getAbsolutePath();
    absPath = absPath.substring(0, absPath.length() - 1);
  }

  /**
   * Writes a .docx file containing the given title followed by one section per
   * hit (sub-title, text and, if available, an image).
   * <p>
   * Failures while building or saving the document are reported on stderr and
   * not rethrown, so the returned path is not a guarantee that the file exists.
   *
   * @param content the hits to render, in output order
   * @param title the document title; also used to derive the file name
   * @return the path the document was (or would have been) written to
   */
  public String buildWordDoc(List<HitBase> content, String title) {
    String outputDocFilename =
        absPath + "/written/" + title.replace(' ', '_').replace('\"', ' ').trim() + ".docx";

    try {
      WordprocessingMLPackage wordMLPackage = WordprocessingMLPackage.createPackage();
      wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Title", title);

      for (HitBase para : content) {
        wordMLPackage.getMainDocumentPart()
            .addStyledParagraphOfText("Subtitle", para.getTitle());
        String paraText = cleanFragmentText(para.getFragments().toString());
        wordMLPackage.getMainDocumentPart().addParagraphOfText(paraText);
        addImageByImageTitleToPackage(wordMLPackage, para.getTitle());
      }

      // The output directory is not guaranteed to exist on a fresh checkout.
      File outputFile = new File(outputDocFilename);
      File outputDir = outputFile.getParentFile();
      if (outputDir != null && !outputDir.isDirectory() && !outputDir.mkdirs()) {
        throw new IOException("Could not create output directory " + outputDir);
      }
      wordMLPackage.save(outputFile);
    } catch (Exception e) {
      e.printStackTrace();
    }
    return outputDocFilename;
  }

  /**
   * Removes the artefacts left by rendering a fragment list via
   * {@code toString()}: the list brackets, the {@code " | "} separators and
   * doubled-up punctuation.
   *
   * @param rawText the {@code toString()} output of the fragment list
   * @return the text ready to be placed in a paragraph
   */
  private static String cleanFragmentText(String rawText) {
    return rawText.replace("[", "").replace("]", "").replace(" | ", "")
        .replace(".,", ".").replace(".\"", "\"").replace(". .", ".")
        .replace(",.", ".");
  }

  /**
   * Searches for images matching the title and embeds the first result that
   * carries a media URL. Does nothing when the search yields no such result.
   *
   * @param wordMLPackage the document to add the image to
   * @param title the search query
   */
  private void addImageByImageTitleToPackage(
      WordprocessingMLPackage wordMLPackage, String title) {
    AzureSearchResultSet<AzureSearchImageResult> res = imageSearcher.runImageSearch(title);
    if (res == null) {
      return;
    }
    for (AzureSearchImageResult anr : res) {
      String url = anr.getMediaUrl();
      if (url == null || url.isEmpty()) {
        continue;
      }
      // Only one picture per section is wanted.
      addImageByURLToPackage(wordMLPackage, url);
      return;
    }
  }

  /**
   * Downloads the image at the given URL into the image directory and embeds
   * it into the document. Any failure is reported on stderr and the image is
   * skipped, so that the rest of the document can still be produced.
   *
   * @param wordMLPackage the document to add the image to
   * @param url the address of the image
   */
  private void addImageByURLToPackage(WordprocessingMLPackage wordMLPackage,
      String url) {
    // The same File is used for writing and reading so that the two paths
    // cannot drift apart.
    File file = new File(absPath + IMG_REL_PATH + toImageFileName(url));
    try {
      saveImageFromTheWeb(url, file.getPath());
      if (!file.isFile() || file.length() == 0) {
        // Download failed (already reported by saveImageFromTheWeb).
        return;
      }
      byte[] bytes = convertImageToByteArray(file);
      addImageToPackage(wordMLPackage, bytes);
    } catch (Exception e) {
      e.printStackTrace();
    }
  }

  /**
   * Derives a flat, file-system friendly file name from an image URL: the
   * scheme prefix (for example {@code http://} or {@code https://}) is dropped
   * and every character other than letters, digits, {@code '.'}, {@code '_'}
   * and {@code '-'} is replaced by an underscore.
   *
   * @param url the image URL
   * @return a file name without path separators
   */
  protected static String toImageFileName(String url) {
    String withoutScheme = url.replaceFirst("^[A-Za-z][A-Za-z0-9+.-]*://", "");
    return withoutScheme.replaceAll("[^A-Za-z0-9._-]", "_");
  }

  /**
   * Creates an image part from the given bytes, wraps it into an inline
   * drawing, puts that drawing into a new paragraph and appends the paragraph
   * to the main document part.
   * <p>
   * The inline object needs a filename hint, an alt-text, two ids and a flag
   * saying whether the image is linked rather than embedded. One id is for
   * the drawing object non-visual properties of the document, the other for
   * the non-visual drawing properties of the picture itself; fresh values are
   * used for every image so that ids are not repeated within a document.
   *
   * @param wordMLPackage The package we want to add the image to
   * @param bytes The bytes of the image
   * @throws Exception Sadly the createImageInline method throws an Exception
   *                   (and not a more specific exception type)
   */
  protected static void addImageToPackage(WordprocessingMLPackage wordMLPackage,
      byte[] bytes) throws Exception {
    BinaryPartAbstractImage imagePart =
        BinaryPartAbstractImage.createImagePart(wordMLPackage, bytes);

    int docPrId = NEXT_DRAWING_ID.getAndIncrement();
    int cNvPrId = NEXT_DRAWING_ID.getAndIncrement();
    Inline inline = imagePart.createImageInline("Filename hint",
        "Alternative text", docPrId, cNvPrId, false);

    P paragraph = addInlineImageToParagraph(inline);
    wordMLPackage.getMainDocumentPart().addObject(paragraph);
  }

  /**
   * Builds the paragraph &gt; run &gt; drawing nesting required to place an
   * inline image in the document body.
   *
   * @param inline The inline object containing the image.
   * @return the paragraph containing the image
   */
  private static P addInlineImageToParagraph(Inline inline) {
    org.docx4j.wml.ObjectFactory factory = new org.docx4j.wml.ObjectFactory();
    P paragraph = factory.createP();
    R run = factory.createR();
    paragraph.getContent().add(run);
    Drawing drawing = factory.createDrawing();
    run.getContent().add(drawing);
    drawing.getAnchorOrInline().add(inline);
    return paragraph;
  }

  /**
   * Creates an empty footnotes container whose parent is the given paragraph.
   * <p>
   * NOTE(review): nothing in this class calls this method, and the
   * {@code CTFtnEdn} created here is never added to the returned container.
   *
   * @param paragraph the paragraph to set as parent
   * @return the new, empty footnotes container
   */
  private static CTFootnotes createFootnote(P paragraph) {
    org.docx4j.wml.ObjectFactory factory = new org.docx4j.wml.ObjectFactory();
    CTFootnotes fn = factory.createCTFootnotes();
    fn.setParent(paragraph);
    CTFtnEdn fe = factory.createCTFtnEdn();
    fe.setParent(paragraph);
    return fn;
  }

  /**
   * Reads the complete content of a file into a byte array.
   *
   * @param file the image file to be converted
   * @return the byte array containing the bytes from the image
   * @throws FileNotFoundException if the file cannot be opened for reading
   * @throws IOException if the file is larger than {@link Integer#MAX_VALUE}
   *                     bytes, ends before its reported length was read, or
   *                     reading fails
   */
  protected static byte[] convertImageToByteArray(File file)
      throws FileNotFoundException, IOException {
    try (InputStream is = new FileInputStream(file)) {
      long length = file.length();
      // An array cannot be sized with a long; refuse instead of truncating.
      if (length > Integer.MAX_VALUE) {
        throw new IOException("File too large!! " + file.getName()
            + " (" + length + " bytes)");
      }

      byte[] bytes = new byte[(int) length];
      int offset = 0;
      while (offset < bytes.length) {
        int numRead = is.read(bytes, offset, bytes.length - offset);
        if (numRead < 0) {
          break;
        }
        offset += numRead;
      }

      // A short read would otherwise hand back a zero-padded, corrupt image.
      if (offset < bytes.length) {
        throw new IOException("Could not completely read file " + file.getName());
      }
      return bytes;
    }
  }

  /**
   * Downloads the resource at {@code imageUrl} and stores it in
   * {@code destinationFile}, creating missing parent directories first.
   * An existing destination file is overwritten.
   * <p>
   * Errors while transferring the data are reported on stderr and not
   * rethrown.
   *
   * @param imageUrl the URL to read from
   * @param destinationFile the path of the file to write
   * @throws RuntimeException if the destination file or its parent directory
   *                          cannot be created
   */
  public static void saveImageFromTheWeb(String imageUrl, String destinationFile) {
    File f = new File(destinationFile);

    File parent = f.getAbsoluteFile().getParentFile();
    if (parent != null && !parent.isDirectory() && !parent.mkdirs() && !parent.isDirectory()) {
      throw new RuntimeException(new IOException("Could not create directory " + parent));
    }

    if (!f.exists()) {
      try {
        f.createNewFile();
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }

    try (InputStream is = new URL(imageUrl).openStream();
         OutputStream os = new FileOutputStream(destinationFile)) {
      byte[] b = new byte[2048];
      int length;
      while ((length = is.read(b)) != -1) {
        os.write(b, 0, length);
      }
    } catch (IOException e) {
      e.printStackTrace();
    }
  }

  /**
   * Small manual demo: builds a ten-section sample document named "mytitle".
   *
   * @param args ignored
   */
  public static void main(String[] args) {
    WordDocBuilder b = new WordDocBuilder();
    List<HitBase> content = new ArrayList<>();
    for (int i = 0; i < 10; i++) {
      HitBase h = new HitBase();
      h.setTitle("albert einstein " + i);
      List<Fragment> frs = new ArrayList<>();
      frs.add(new Fragment(" content " + i, 0));
      h.setFragments(frs);
      content.add(h);
    }
    b.buildWordDoc(content, "mytitle");
  }
}