package opennlp.tools.parse_thicket.kernel_interface; | |
import java.util.ArrayList; | |
import java.util.Arrays; | |
import java.util.Collections; | |
import java.util.Comparator; | |
import java.util.List; | |
import java.util.logging.Logger; | |
import org.apache.commons.lang.StringUtils; | |
import opennlp.tools.parse_thicket.apps.MinedSentenceProcessor; | |
import opennlp.tools.parse_thicket.apps.SnippetToParagraph; | |
import opennlp.tools.similarity.apps.Fragment; | |
import opennlp.tools.similarity.apps.GeneratedSentenceProcessor; | |
import opennlp.tools.similarity.apps.HitBase; | |
import opennlp.tools.similarity.apps.RelatedSentenceFinder; | |
import opennlp.tools.similarity.apps.utils.PageFetcher; | |
import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer; | |
import opennlp.tools.similarity.apps.utils.Utils; | |
import opennlp.tools.textsimilarity.TextProcessor; | |
public class SnippetToParagraphFull extends SnippetToParagraph { | |
private PageFetcher pFetcher = new PageFetcher(); | |
private static Logger LOG = Logger | |
.getLogger("com.become.parse_thicket.apps.SnippetToParagraphFull"); | |
public HitBase formTextFromOriginalPageGivenSnippet(HitBase item) { | |
String[] sents = extractSentencesFromPage(item.getUrl()); | |
String title = item.getTitle().replace("<b>", " ").replace("</b>", " ") | |
.replace(" ", " ").replace(" ", " "); | |
// generation results for this sentence | |
List<String> result = new ArrayList<String>(); | |
// form plain text from snippet | |
String snapshot = item.getAbstractText().replace("<b>", " ") | |
.replace("</b>", " ").replace(" ", " ").replace(" ", " ").replace("\"", ""); | |
String snapshotMarked = snapshot.replace(" ...", "."); | |
List<String> fragments = TextProcessor.splitToSentences(snapshotMarked); | |
if (fragments.size()<3 && StringUtils.countMatches(snapshotMarked, ".")>1){ | |
snapshotMarked = snapshotMarked.replace("..", "&").replace(".", "&"); | |
String[] fragmSents = snapshotMarked.split("&"); | |
fragments = Arrays.asList(fragmSents); | |
} | |
for (String f : fragments) { | |
String followSent = null; | |
if (f.length() < 50) | |
continue; | |
String pageSentence = ""; | |
// try to find original sentence from webpage | |
try { | |
String[] mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment( | |
f, sents); | |
pageSentence = mainAndFollowSent[0]; | |
followSent = mainAndFollowSent[1]; | |
if (pageSentence!=null) | |
result.add(pageSentence); | |
else { | |
result.add(f); | |
LOG.info("Could not find the original sentence \n"+f +"\n in the page " ); | |
} | |
//if (followSent !=null) | |
// result.add(followSent); | |
} catch (Exception e) { | |
e.printStackTrace(); | |
} | |
} | |
item.setOriginalSentences(result); | |
return item; | |
} | |
} | |