// blob: 7c7c1d94047b66377850bc977a3497a12c0e1057 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.similarity.apps.solr;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import net.billylieurance.azuresearch.AzureSearchImageResult;
import net.billylieurance.azuresearch.AzureSearchResultSet;
import net.billylieurance.azuresearch.AzureSearchWebResult;
import org.apache.commons.lang.StringUtils;
//import org.docx4j.Docx4J;
//import org.docx4j.convert.out.FOSettings;
import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
import opennlp.tools.similarity.apps.ContentGeneratorSupport;
import opennlp.tools.similarity.apps.Fragment;
import opennlp.tools.similarity.apps.HitBase;
/**
 * Word document builder that runs ONE image search for the whole document (the document
 * title is the query) and then attaches the i-th returned image to the i-th content hit
 * that has text.
 *
 * <p>Several members used here are not declared in this class ({@code absPath},
 * {@code IMG_REL_PATH}, {@code imageSearcher}, {@code saveImageFromTheWeb},
 * {@code convertImageToByteArray}, {@code addImageToPackage}); they are presumably
 * inherited from {@link WordDocBuilder} - confirm there before changing their usage.
 *
 * <p>Images are strictly best-effort: any failure while searching for, downloading or
 * embedding an image is reported on stderr and the document is still produced.
 */
public class WordDocBuilderSingleImageSearchCall extends WordDocBuilder{

  /**
   * Writes a .docx file containing the title, one section per hit that has fragments
   * (optional subtitle, paragraph text, optional image) and a trailing REFERENCES list
   * with each hit's title and URL.
   *
   * @param content search hits to turn into sections; hits without fragments are skipped
   * @param title   document title; also used as the image search query and, with spaces
   *                replaced by underscores, as the output file name
   * @return the path of the output file under {@code absPath + "/written/"}. The path is
   *         returned even when building or saving the document failed (the failure is only
   *         printed), so callers that need certainty must check that the file exists.
   */
  public String buildWordDoc(List<HitBase> content, String title){
    String outputDocFinename = absPath+"/written/"+ title.replace(' ','_').replace('\"', ' ').trim()+ ".docx";
    List<String> imageURLs = findImageUrlsBestEffort(title);
    try {
      WordprocessingMLPackage wordMLPackage = WordprocessingMLPackage.createPackage();
      wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Title", title.toUpperCase());

      appendContentSections(wordMLPackage, content, imageURLs);

      // now add URLs
      wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Subtitle", "REFERENCES");
      appendReferences(wordMLPackage, content);

      wordMLPackage.save(new File(outputDocFinename));
      System.out.println("Finished creating docx ="+outputDocFinename);
      //TODO pdf export (Docx4J.createFOSettings() + Docx4J.toFO(...) into a ".pdf" sibling file)
    } catch (Exception e) {
      // Kept as best-effort reporting: callers have always received the path back
      // instead of an exception.
      e.printStackTrace();
    }
    return outputDocFinename;
  }

  /**
   * Runs the image search without letting it break document generation: a failing search
   * or a {@code null} result (e.g. from an overriding subclass) yields an empty list.
   */
  private List<String> findImageUrlsBestEffort(String title) {
    try {
      List<String> urls = getAllImageSearchResults(title);
      if (urls != null) {
        return urls;
      }
    } catch (Exception e) {
      // Images are optional; the text-only document is still worth producing.
      e.printStackTrace();
    }
    return new ArrayList<String>();
  }

  /** Adds subtitle, paragraph text and (if available) an image for every hit that has fragments. */
  private void appendContentSections(WordprocessingMLPackage wordMLPackage,
      List<HitBase> content, List<String> imageURLs) {
    int count = 0; // index of the current non-empty hit == index of the image to use
    for (HitBase para : content) {
      if (!hasFragments(para)) { // no found content in this hit
        continue;
      }
      try {
        String hitTitle = para.getTitle();
        // Titles ending in ".." are skipped as subtitles; a missing title must not
        // cost us the paragraph text, so it is simply treated as "no subtitle".
        if (hitTitle != null && !hitTitle.endsWith("..")) {
          String sectTitle = ContentGeneratorSupport.getPortionOfTitleWithoutDelimiters(hitTitle);
          wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Subtitle", sectTitle);
        }
        // List.toString() output is cleaned up: brackets, " | " separators and doubled
        // punctuation produced by joining fragments are removed.
        String paraText = para.getFragments().toString().replace("[", "").replace("]", "").replace(" | ", "")
            .replace(".,", ".").replace(".\"", "\"").replace(". .", ".")
            .replace(",.", ".");
        wordMLPackage.getMainDocumentPart().addParagraphOfText(paraText);
        try {
          addImageByImageURLToPackage(count, wordMLPackage, imageURLs);
        } catch (Exception e) {
          // A broken image must not remove the already-added text.
          e.printStackTrace();
        }
      } catch (Exception e) {
        // One bad hit is reported and skipped; the remaining hits are still processed.
        e.printStackTrace();
      }
      count++;
    }
  }

  /** Adds one "title + URL" entry per hit that has fragments. */
  private void appendReferences(WordprocessingMLPackage wordMLPackage, List<HitBase> content) {
    for (HitBase para : content) {
      if (!hasFragments(para)) { // no found content in this hit
        continue;
      }
      try {
        wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Subtitle", para.getTitle());
        wordMLPackage.getMainDocumentPart().addParagraphOfText(para.getUrl());
      } catch (Exception e) {
        e.printStackTrace();
      }
    }
  }

  /** Returns true when the hit exists and carries at least one fragment. */
  private static boolean hasFragments(HitBase para) {
    if (para == null) {
      return false;
    }
    List<Fragment> fragments = para.getFragments();
    return fragments != null && !fragments.isEmpty();
  }

  /**
   * Downloads the {@code count}-th image of {@code imageURLs} into the local image
   * directory and embeds it into the document. Does nothing when there is no usable URL
   * at that index (list missing, index out of range, URL null or blank).
   *
   * @param count         zero-based index of the image to use
   * @param wordMLPackage document to add the image to
   * @param imageURLs     image URLs returned by the image search
   */
  protected void addImageByImageURLToPackage(int count,
      WordprocessingMLPackage wordMLPackage,
      List<String> imageURLs) {
    if (imageURLs == null || count < 0 || count >= imageURLs.size()) {
      return;
    }
    String url = imageURLs.get(count);
    if (url == null || url.trim().isEmpty()) {
      return;
    }

    String imagePath = absPath + IMG_REL_PATH + toImageFileName(url);
    saveImageFromTheWeb(url, imagePath);
    try {
      byte[] bytes = convertImageToByteArray(new File(imagePath));
      addImageToPackage(wordMLPackage, bytes);
    } catch (Exception e) {
      // Covers a failed download (file missing/unreadable) as well as embedding errors.
      e.printStackTrace();
    }
  }

  /**
   * Derives a flat local file name from an image URL: the http/https scheme is dropped and
   * every character that is a path separator or is not allowed in file names on common
   * file systems is replaced with an underscore.
   */
  private static String toImageFileName(String url) {
    String withoutScheme = url.replaceFirst("(?i)^https?://", "");
    return withoutScheme.replaceAll("[\\\\/:*?\"<>|]", "_");
  }

  /**
   * Runs a single image search for {@code title} and collects the media URL of every
   * result, in result order.
   *
   * @param title image search query
   * @return list of media URLs; empty (never {@code null}) when the search returned nothing
   */
  protected List<String> getAllImageSearchResults(String title) {
    List<String> imageURLs = new ArrayList<String>();
    AzureSearchResultSet<AzureSearchImageResult> res = imageSearcher.runImageSearch(title);
    if (res == null) {
      return imageURLs;
    }
    for (AzureSearchImageResult imResult : res) {
      imageURLs.add(imResult.getMediaUrl());
    }
    return imageURLs;
  }

  /** Manual smoke test: builds a ten-section sample document titled "albert einstein". */
  public static void main(String[] args){
    WordDocBuilderSingleImageSearchCall b = new WordDocBuilderSingleImageSearchCall();
    List<HitBase> content = new ArrayList<HitBase>();
    for (int i = 0; i < 10; i++) {
      HitBase h = new HitBase();
      h.setTitle("albert einstein "+i);
      List<Fragment> frs = new ArrayList<Fragment>();
      frs.add(new Fragment(" content "+i, 0));
      h.setFragments(frs);
      content.add(h);
    }
    b.buildWordDoc(content, "albert einstein");
  }
}