/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.parse_thicket.apps;

import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;

import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import opennlp.tools.similarity.apps.ContentGeneratorSupport;
import opennlp.tools.similarity.apps.Fragment;
import opennlp.tools.similarity.apps.GeneratedSentenceProcessor;
import opennlp.tools.similarity.apps.HitBase;
import opennlp.tools.similarity.apps.utils.PageFetcher;
import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer;
import opennlp.tools.similarity.apps.utils.Utils;
import opennlp.tools.textsimilarity.TextProcessor;
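/**
 * Rebuilds fuller text from search-engine snippets: given a {@link HitBase} carrying a title,
 * an abstract (snippet) and a URL, it downloads the original page, splits it into sentences and
 * tries to recover the full sentences the snippet fragments were taken from, storing the result
 * back into the {@link HitBase}.
 * <p>
 * A rough usage sketch (assuming the usual {@code HitBase} accessors such as {@code setTitle},
 * {@code setAbstractText}, {@code setUrl} and {@code getOriginalSentences}; the URL below is a
 * placeholder):
 * <pre>{@code
 * HitBase hit = new HitBase();
 * hit.setTitle("Some page <b>title</b>");
 * hit.setAbstractText("A snippet fragment ... another snippet fragment ...");
 * hit.setUrl("http://example.com/some-page.html");
 * hit = new SnippetToParagraph().formTextFromOriginalPageGivenSnippet(hit);
 * List<String> recovered = hit.getOriginalSentences();
 * }</pre>
 */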
public class SnippetToParagraph extends ContentGeneratorSupport /*RelatedSentenceFinder */{
private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
private final PageFetcher pFetcher = new PageFetcher();
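/**
 * Variant that works from the snippet markers directly: "..." in the snippet is used to decide
 * whether the original page needs to be fetched, and recovered sentences are stored on the hit
 * as {@link Fragment}s via {@code setFragments}.
 *
 * @param item search hit carrying title, snippet and URL
 * @return the same {@code item}, enriched with fragments rebuilt from the original page
 */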
public HitBase formTextFromOriginalPageGivenSnippetDirect(HitBase item) {
// put orig sentence in structure
List<String> origs = new ArrayList<>();
item.setOriginalSentences(origs);
String title = item.getTitle().replace("<b>", " ").replace("</b>", " ")
.replace("  ", " ").replace("  ", " ");
// generation results for this sentence
List<Fragment> result = new ArrayList<>();
// form plain text from snippet
String snapshot = item.getAbstractText().replace("<b>", " ")
.replace("</b>", " ").replace("  ", " ").replace("  ", " ");
String snapshotMarked = snapshot.replace("...",
" _should_find_orig_ . _should_find_orig_");
List<String> fragments = TextProcessor.splitToSentences(snapshotMarked);
List<String> allFragms = new ArrayList<>(fragments);
List<String> sents = new ArrayList<>();
String downloadedPage;
try {
if (snapshotMarked.length() != snapshot.length()) {
downloadedPage = pFetcher.fetchPage(item.getUrl());
if (downloadedPage != null && downloadedPage.length() > 100) {
item.setPageContent(downloadedPage);
String pageContent = Utils.fullStripHTML(item.getPageContent());
pageContent = GeneratedSentenceProcessor
.normalizeForSentenceSplitting(pageContent);
// HTML line breaks are sometimes converted into runs of spaces, so insert a '.'
// before a capital letter that follows such a gap, then clean up stray dots
pageContent = pageContent.trim().replaceAll("  [A-Z]", ". $0")
.replace("..", ".").replace(". . .", " ").trim();
sents = TextProcessor.splitToSentences(pageContent);
}
}
} catch (Exception e) {
LOG.error("Problem downloading the page and splitting it into sentences", e);
return item;
}
for (String fragment : allFragms) {
String followSent = null;
if (fragment.length() < 50)
continue;
String pageSentence = "";
// try to find original sentence from webpage
if (fragment.contains("_should_find_orig_") && sents != null
&& sents.size() > 0)
try {
String[] mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(
fragment.replace("_should_find_orig_", ""), sents.toArray(new String[]{}));
pageSentence = mainAndFollowSent[0];
followSent = mainAndFollowSent[1];
} catch (Exception e) {
LOG.error(e.getLocalizedMessage(), e);
}
else
// or get original snippet
pageSentence = fragment;
if (pageSentence != null)
pageSentence = pageSentence.replace("_should_find_orig_", "");
String pageSentenceProc = GeneratedSentenceProcessor
.acceptableMinedSentence(pageSentence);
if (pageSentenceProc != null) {
pageSentenceProc = GeneratedSentenceProcessor
.processSentence(pageSentenceProc);
if (followSent != null) {
pageSentenceProc += " "
+ GeneratedSentenceProcessor.processSentence(followSent);
}
pageSentenceProc = Utils.convertToASCII(pageSentenceProc);
Fragment f = new Fragment(pageSentenceProc, 1);
f.setSourceURL(item.getUrl());
f.fragment = fragment;
result.add(f);
LOG.debug("Accepted sentence: {} | with title = {}", pageSentenceProc, title);
LOG.debug("For fragment = {}", fragment);
} else
LOG.debug("Rejected sentence due to wrong area at webpage: {}", pageSentence);
}
item.setFragments(result);
return item;
}
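/**
 * Fetches the page behind {@code item.getUrl()}, splits it into sentences and, for each snippet
 * fragment that is long enough, tries to find the full original sentence on the page; the found
 * sentence (or the fragment itself as a fallback) is stored via {@code setOriginalSentences}.
 */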
public HitBase formTextFromOriginalPageGivenSnippet(HitBase item) {
String[] sents = extractSentencesFromPage(item.getUrl());
String title = item.getTitle().replace("<b>", " ").replace("</b>", " ")
.replace("  ", " ").replace("  ", " ");
// generation results for this sentence
List<String> result = new ArrayList<>();
// form plain text from snippet
String snapshot = item.getAbstractText().replace("<b>", " ")
.replace("</b>", " ").replace("  ", " ").replace("  ", " ").replace("\"", "");
String snapshotMarked = snapshot.replace(" ...", ".");
List<String> fragments = TextProcessor.splitToSentences(snapshotMarked);
if (fragments.size()<3 && StringUtils.countMatches(snapshotMarked, ".")>1){
snapshotMarked = snapshotMarked.replace("..", "&").replace(".", "&");
String[] fragmSents = snapshotMarked.split("&");
fragments = Arrays.asList(fragmSents);
}
for (String f : fragments) {
String followSent = null;
if (f.length() < 50)
continue;
String pageSentence;
// try to find original sentence from webpage
try {
String[] mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(f, sents);
pageSentence = mainAndFollowSent[0];
followSent = mainAndFollowSent[1];
if (pageSentence!=null)
result.add(pageSentence);
else {
result.add(f);
LOG.warn("Could not find the original sentence \n {} \n in the page ", f);
}
//if (followSent !=null)
// result.add(followSent);
} catch (Exception e) {
LOG.error(e.getLocalizedMessage(), e);
}
}
item.setOriginalSentences(result);
return item;
}
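/**
 * Drops sentences that are null or too short (under roughly 30 characters once trimmed).
 */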
public List<String> cleanListOfSents(List<String> sents) {
List<String> sentsClean = new ArrayList<>();
for (String s : sents) {
if (s == null || s.trim().length() < 30 || s.length() < 20)
continue;
sentsClean.add(s);
}
return sentsClean;
}
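/**
 * Removes near-duplicate strings: when two entries have a string distance above 0.7 according
 * to {@link StringDistanceMeasurer}, the later entry is dropped. Currently only referenced by a
 * commented-out call in {@link #extractSentencesFromPage(String)}.
 */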
private String[] removeDuplicates(String[] hits) {
StringDistanceMeasurer meas = new StringDistanceMeasurer();
List<Integer> idsToRemove = new ArrayList<>();
List<String> hitsDedup = new ArrayList<>();
try {
for (int i = 0; i < hits.length; i++)
for (int j = i + 1; j < hits.length; j++) {
String title1 = hits[i];
String title2 = hits[j];
if (StringUtils.isEmpty(title1) || StringUtils.isEmpty(title2))
continue;
if (meas.measureStringDistance(title1, title2) > 0.7) {
idsToRemove.add(j); // duplicate found; the later list member will be dropped
}
}
for (int i = 0; i < hits.length; i++)
if (!idsToRemove.contains(i))
hitsDedup.add(hits[i]);
if (hitsDedup.size() < hits.length) {
LOG.debug("Removed near-duplicate sentences from the candidate list, including {}",
hits[idsToRemove.get(0)]);
}
} catch (Exception e) {
LOG.error("Problem removing duplicate sentences", e);
}
return hitsDedup.toArray(new String[0]);
}
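/**
 * Downloads a page, splits the stripped text into chunks at whitespace gaps, keeps up to
 * {@code maxSentsFromPage} of the longest chunks and cleans them into sentences.
 *
 * @param url the page to download
 * @return cleaned sentences, or {@code null} if the page could not be downloaded
 */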
public String[] extractSentencesFromPage(String url) {
int maxSentsFromPage= 100;
List<String[]> results = new ArrayList<>();
String downloadedPage = pFetcher.fetchPage(url, 20000);
if (downloadedPage == null || downloadedPage.length() < 100) {
return null;
}
String pageOrigHTML = pFetcher.fetchOrigHTML(url);
// collapse runs of whitespace (left over from stripped HTML layout) into chunk separators
downloadedPage = downloadedPage.replace("  ", "&");
downloadedPage = downloadedPage.replaceAll("(?:&)+", "#");
String[] sents = downloadedPage.split("#");
List<TextChunk> sentsList = new ArrayList<>();
for (String s : sents) {
s = s.trim().replace("  ", ". ").replace("..", ".").replace(". . .", " ")
.replace(": ", ". ").replace("- ", ". ")
.replace(". .", ".").trim();
sentsList.add(new TextChunk(s, s.length()));
}
sentsList.sort(new TextChunkComparable());
String[] longestSents = new String[maxSentsFromPage];
int j = 0;
// sentsList is sorted ascending by length, so start from the tail to keep the longest chunks
int initIndex = sentsList.size() - maxSentsFromPage;
if (initIndex < 0)
initIndex = 0;
for (int i = initIndex; i < sentsList.size() && j < maxSentsFromPage; i++) {
longestSents[j] = sentsList.get(i).text;
j++;
}
sents = cleanSplitListOfSents(longestSents);
//sents = removeDuplicates(sents);
//sents = verifyEnforceStartsUpperCase(sents);
return sents;
}
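/**
 * Filters candidate text chunks: rejects chunks not accepted by
 * {@link GeneratedSentenceProcessor#acceptableMinedSentence(String)} and chunks whose average
 * sentence or word length is too low (typically menus or link lists), then splits the survivors
 * into individual sentences.
 */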
protected String[] cleanSplitListOfSents(String[] longestSents){
float minFragmentLength = 40, minFragmentLengthSpace=4;
List<String> sentsClean = new ArrayList<>();
for (String sentenceOrMultSent : longestSents) {
if (sentenceOrMultSent==null || sentenceOrMultSent.length()<20)
continue;
if (GeneratedSentenceProcessor.acceptableMinedSentence(sentenceOrMultSent)==null){
LOG.debug("Rejected sentence by GeneratedSentenceProcessor.acceptableMinedSentence = {}", sentenceOrMultSent);
continue;
}
// reject portions made of many very short dot-separated fragments,
// e.g. "aaa. hhh hhh. kkk . kkk ll hhh. lll kkk n."
int numOfDots = sentenceOrMultSent.replace('.','&').split("&").length;
float avgSentenceLengthInTextPortion = (float)sentenceOrMultSent.length() /(float) numOfDots;
if ( avgSentenceLengthInTextPortion<minFragmentLength)
continue;
// reject portions made of many very short space-separated tokens,
// e.g. "o oo o ooo o o o ooo oo ooo o o oo"
numOfDots = sentenceOrMultSent.replace(' ','&').split("&").length;
avgSentenceLengthInTextPortion = (float)sentenceOrMultSent.length() /(float) numOfDots;
if ( avgSentenceLengthInTextPortion<minFragmentLengthSpace)
continue;
List<String> furtherSplit = TextProcessor.splitToSentences(sentenceOrMultSent);
// forced split by ',' somewhere in the middle of a sentence
// disused - Feb 26 13
//furtherSplit = furtherMakeSentencesShorter(furtherSplit);
// drop the last element: it is usually a truncated trailing fragment
furtherSplit.remove(furtherSplit.size() - 1);
for(String s : furtherSplit) {
if (s.indexOf('|') > -1)
continue;
s = s.replace("<em>"," ").replace("</em>"," ");
s = Utils.convertToASCII(s);
sentsClean.add(s);
}
}
return sentsClean.toArray(new String[0]);
}
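/**
 * Capitalizes the first character of each sentence. Currently only referenced by a commented-out
 * call in {@link #extractSentencesFromPage(String)}.
 */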
private String[] verifyEnforceStartsUpperCase(String[] sents) {
for (int i = 0; i < sents.length; i++) {
String s = StringUtils.trim(sents[i]);
if (StringUtils.isEmpty(s)) {
sents[i] = s;
continue;
}
String sFirstChar = s.substring(0, 1);
if (!sFirstChar.toUpperCase().equals(sFirstChar)) {
s = sFirstChar.toUpperCase() + s.substring(1);
}
sents[i] = s;
}
return sents;
}
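/**
 * Drops shopping-site boilerplate ("Unlimited Free ...", "View Larger ...", shipping notes)
 * from a list of product features. Currently unused.
 */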
private List<String> cleanProductFeatures(List<String> productFeaturesList) {
List<String> results = new ArrayList<>();
for (String feature : productFeaturesList) {
// drop shopping-site boilerplate lines rather than real product features
if (feature.startsWith("Unlimited Free") || feature.startsWith("View Larger")
|| feature.indexOf("shipping") > 0)
continue;
results.add(feature);
}
return results;
}
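/**
 * Simple holder for a text chunk and its length, used for ranking chunks by size.
 */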
public static class TextChunk {
public TextChunk(String s, int length) {
this.text = s;
this.len = length;
}
public final String text;
public final int len;
}
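/**
 * Orders {@link TextChunk}s by length, shortest first.
 */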
public static class TextChunkComparable implements Comparator<TextChunk> {
@Override
public int compare(TextChunk ch1, TextChunk ch2) {
return Integer.compare(ch1.len, ch2.len);
}
}
}