OPENNLP-628
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/apps/contentgen/multithreaded/BingWebQueryRunnerThread.java b/opennlp-similarity/src/main/java/opennlp/tools/apps/contentgen/multithreaded/BingWebQueryRunnerThread.java
new file mode 100644
index 0000000..b712847
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/apps/contentgen/multithreaded/BingWebQueryRunnerThread.java
@@ -0,0 +1,54 @@
+package opennlp.tools.apps.contentgen.multithreaded;

+

+import java.util.ArrayList;

+import java.util.List;

+

+import opennlp.tools.similarity.apps.BingQueryRunner;

+import opennlp.tools.similarity.apps.HitBase;

+

+public class BingWebQueryRunnerThread extends BingQueryRunner implements Runnable{

+	

+	private String query;

+	private List<HitBase> results= new ArrayList<HitBase>();

+	public BingWebQueryRunnerThread(String Query){

+		super();

+		this.query=Query;

+	}

+	public void run(){

+		results=runSearch(query);

+		fireMyEvent(new MyEvent(this));

+	}

+	public List<HitBase> getResults() {

+		return results;

+	}

+	

+	public String getQuery() {

+		return query;

+	}

+	

+	// Create the listener list

+    protected javax.swing.event.EventListenerList listenerList = new javax.swing.event.EventListenerList();

+    // This method allows classes to register for MyEvents

+

+    public void addMyEventListener(MyEventListener listener) {

+        listenerList.add(MyEventListener.class, listener);

+    }

+    // This method allows classes to unregister from MyEvents

+

+    public void removeMyEventListener(MyEventListener listener) {

+        listenerList.remove(MyEventListener.class, listener);

+    }

+

+    void fireMyEvent(MyEvent evt) {

+        Object[] listeners = listenerList.getListenerList();

+        // Each listener occupies two elements - the first is the listener class

+        // and the second is the listener instance

+        for (int i = 0; i < listeners.length; i += 2) {

+            if (listeners[i] == MyEventListener.class) {

+                ((MyEventListener) listeners[i + 1]).MyEvent(evt);

+            }

+        }

+    }

+	

+

+}

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/apps/contentgen/multithreaded/Fragment.java b/opennlp-similarity/src/main/java/opennlp/tools/apps/contentgen/multithreaded/Fragment.java
new file mode 100644
index 0000000..328d95c
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/apps/contentgen/multithreaded/Fragment.java
@@ -0,0 +1,88 @@
+package opennlp.tools.apps.contentgen.multithreaded;

+

+import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer;

+

+

+public class Fragment {

+	

+		public String resultText;      // result

+		public double score;

+		public String fragment; // original

+		public String sourceURL;

+

+		Fragment(String text, double score) {

+			this.resultText = text;

+			this.score = score;

+		}

+		

+			

+		public String getResultText() {

+			return resultText;

+		}

+

+		public void setResultText(String resultText) {

+			this.resultText = resultText;

+		}

+

+

+

+		public double getScore() {

+			return score;

+		}

+

+

+

+		public void setScore(double score) {

+			this.score = score;

+		}

+

+

+

+		public String getFragment() {

+			return fragment;

+		}

+

+

+

+		public void setFragment(String fragment) {

+			this.fragment = fragment;

+		}

+

+		

+

+		public String getSourceURL() {

+			return sourceURL;

+		}

+

+

+		public void setSourceURL(String sourceURL) {

+			this.sourceURL = sourceURL;

+		}

+

+

+		public String toString(){

+			return this.resultText;

+		}

+

+		@Override

+		public boolean equals(Object o) {

+			if (this == o) return true;

+			if (o == null || getClass() != o.getClass()) return false;

+

+			Fragment fragment = (Fragment) o;

+

+			if (resultText == null && fragment.resultText == null) {

+				return true;

+			} else if ((resultText == null && fragment.resultText != null) || (resultText != null && fragment.resultText == null)) {

+				return false;

+			}

+

+			StringDistanceMeasurer sdm = new StringDistanceMeasurer();

+			return sdm.measureStringDistance(resultText, fragment.resultText) > 0.8;

+		}

+

+		@Override

+		public int hashCode() {

+			return resultText != null ? resultText.hashCode() : 0;

+		}

+}

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/apps/contentgen/multithreaded/MyEvent.java b/opennlp-similarity/src/main/java/opennlp/tools/apps/contentgen/multithreaded/MyEvent.java
new file mode 100644
index 0000000..14e7daa
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/apps/contentgen/multithreaded/MyEvent.java
@@ -0,0 +1,12 @@
+package opennlp.tools.apps.contentgen.multithreaded;

+

+import java.util.EventObject;

+

+public class MyEvent extends EventObject {

+

+	public MyEvent(Object arg0) {

+		super(arg0);

+		// TODO Auto-generated constructor stub

+	}

+

+}

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/apps/contentgen/multithreaded/MyEventListener.java b/opennlp-similarity/src/main/java/opennlp/tools/apps/contentgen/multithreaded/MyEventListener.java
new file mode 100644
index 0000000..ecdced4
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/apps/contentgen/multithreaded/MyEventListener.java
@@ -0,0 +1,8 @@
+package opennlp.tools.apps.contentgen.multithreaded;

+

+import java.util.EventListener;

+

+

+public interface MyEventListener extends EventListener{

+	public void MyEvent(MyEvent evt);

+}

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/apps/contentgen/multithreaded/cgRequestForm.html b/opennlp-similarity/src/main/java/opennlp/tools/apps/contentgen/multithreaded/cgRequestForm.html
new file mode 100644
index 0000000..1c5dfb2
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/apps/contentgen/multithreaded/cgRequestForm.html
@@ -0,0 +1,37 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"

+"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">

+ 

+<html xmlns='http://www.w3.org/1999/xhtml'>

+   <head >

+      <meta http-equiv='Content-Type' content='text/html; charset=utf-8'/>

+      <title >Submit Your Essay Writing request here</title>

+   </head>

+<body>

+<h1>Submit Your Essay Writing request here / Envie su solicitud ensayo escrito aqui</h1>

+ 

+<form id='sampleform' method='post' action='http://173.255.254.250:8983/solr/contentgen/?resourceDir=/home/solr/solr-4.4.0/example/src/test/resources&workDir=/home/solr/solr-4.4.0/example/solr-webapp/webapp/WEB-INF/lib&relevanceThreshold=0.5&bingKey=e8ADxIjn9YyHx36EihdjH/tMqJJItUrrbPTUpKahiU0=' >

+   <p>

+   Topic for your essay/Tema de su ensayo: <input type='text' name='q' value='albert einstein' size='35' maxlength='100'/>

+   </p>

+   <p>

+   Email to receive your essay/para recibir su ensayo: <input type='text' name='email' />

+   </p>

+   

+   <p>

+   Select language/seleccionar el idioma: <select name="lang" >

+   		<option value="en-US"> English</option>

+ 		<option value="es-US"> Espaniol</option>

+ 		<option value="de-DE"> German</option>

+	</select>

+	</p>

+	<p>

+   Number of Bing calls to write this essay: <input type='text' name='stepsNum' value='20' size='5' maxlength='10'/>

+   Number of Bing search results for each call to use for writing: <input type='text' name='searchResultsNum' value='100' size='5' maxlength='10'/>

+   </p>

+<p>

+   <input type='submit' name='Submit' value='Submit/presentar' />

+   </p>

+</form>

+ 

+</body>

+</html>

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/apps/contentgen/multithreaded/nlProg2codeRequestForm.html b/opennlp-similarity/src/main/java/opennlp/tools/apps/contentgen/multithreaded/nlProg2codeRequestForm.html
new file mode 100644
index 0000000..2fbf1c9
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/apps/contentgen/multithreaded/nlProg2codeRequestForm.html
@@ -0,0 +1,47 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"

+"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">

+ 

+<html xmlns='http://www.w3.org/1999/xhtml'>

+   <head >

+      <meta http-equiv='Content-Type' content='text/html; charset=utf-8'/>

+      <title >Submit Your Code Writing request here</title>

+   </head>

+<body>

+<h1>Submit Your Code Writing request here</h1>

+ 

+<form id='sampleform' method='post' action='http://173.255.254.250:8983/solr/nlprog2code/?' >

+   <p>

+   Write what you want your program to do in natural language <input type='text' name='line' value='define a class named ...' size='35' maxlength='120'/>

+   </p>

+    <p>

+    <input type='text' name='line' value='define a function taking a string s1 and an integer i2 ' size='35' maxlength='150'/>

+   </p>

+   <p>

+     <input type='text' name='line' size='35' maxlength='200'/>

+   </p>

+   <p>

+    <input type='text' name='line' size='35' maxlength='200'/>

+   </p>

+   <p>

+    <input type='text' name='line' size='35' maxlength='200'/>

+   </p>

+   <p>

+     <input type='text' name='line' size='35' maxlength='200'/>

+   </p>

+   <p>

+    <input type='text' name='line' size='35' maxlength='200'/>

+   </p>

+   <p>

+    <input type='text' name='line' size='35' maxlength='200'/>

+   </p>

+   <p>

+    <input type='text' name='line' size='35' maxlength='200'/>

+   </p>

+   

+<p>

+   <input type='submit' name='Submit' value='Submit' />

+   </p>

+</form>

+ 

+</body>

+</html>

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/POStags.java b/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/POStags.java
new file mode 100644
index 0000000..45dadf9
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/POStags.java
@@ -0,0 +1,74 @@
+package opennlp.tools.apps.relevanceVocabs;
+
+public interface POStags {
+	// added new POS types for infinitive phrase and participle phrase
+	public static final String TYPE_STP = "STP"; // infinitive phrase
+	public static final String TYPE_SGP = "SGP"; // present participle phrase
+	public static final String TYPE_SNP = "SNP"; // past participle phrase
+
+	// below are the standard POS types,
+	// http://bulba.sdsu.edu/jeanette/thesis/PennTags.html
+	public static final String TYPE_ADJP = "ADJP";
+	public static final String TYPE_ADVP = "ADVP";
+	public static final String TYPE_CC = "CC";
+	public static final String TYPE_CD = "CD";
+	public static final String TYPE_CONJP = "CONJP";
+	public static final String TYPE_DT = "DT";
+	public static final String TYPE_EX = "EX";
+	public static final String TYPE_FRAG = "FRAG";
+	public static final String TYPE_FW = "FW";
+	public static final String TYPE_IN = "IN";
+	public static final String TYPE_INTJ = "INTJ";
+	public static final String TYPE_JJ = "JJ";
+	public static final String TYPE_JJR = "JJR";
+	public static final String TYPE_JJS = "JJS";
+	public static final String TYPE_LS = "LS";
+	public static final String TYPE_LST = "LST";
+	public static final String TYPE_MD = "MD";
+	public static final String TYPE_NAC = "NAC";
+	public static final String TYPE_NN = "NN";
+	public static final String TYPE_NNS = "NNS";
+	public static final String TYPE_NNP = "NNP";
+	public static final String TYPE_NNPS = "NNPS";
+	public static final String TYPE_NP = "NP";
+	public static final String TYPE_NX = "NX";
+	public static final String TYPE_PDT = "PDT";
+	public static final String TYPE_POS = "POS";
+	public static final String TYPE_PP = "PP";
+	public static final String TYPE_PRN = "PRN";
+	public static final String TYPE_PRP = "PRP";
+	public static final String TYPE_PRP$ = "PRP$";
+	public static final String TYPE_PRT = "PRT";
+	public static final String TYPE_QP = "QP";
+	public static final String TYPE_RB = "RB";
+	public static final String TYPE_RBR = "RBR";
+	public static final String TYPE_RBS = "RBS";
+	public static final String TYPE_RP = "RP";
+	public static final String TYPE_RRC = "RRC";
+	public static final String TYPE_S = "S";
+	public static final String TYPE_SBAR = "SBAR";
+	public static final String TYPE_SBARQ = "SBARQ";
+	public static final String TYPE_SINV = "SINV";
+	public static final String TYPE_SQ = "SQ";
+	public static final String TYPE_SYM = "SYM";
+	public static final String TYPE_TO = "TO";
+	public static final String TYPE_TOP = "TOP";
+	public static final String TYPE_UCP = "UCP";
+	public static final String TYPE_UH = "UH";
+	public static final String TYPE_VB = "VB";
+	public static final String TYPE_VBD = "VBD";
+	public static final String TYPE_VBG = "VBG";
+	public static final String TYPE_VBN = "VBN";
+	public static final String TYPE_VBP = "VBP";
+	public static final String TYPE_VBZ = "VBZ";
+	public static final String TYPE_VP = "VP";
+	public static final String TYPE_WDT = "WDT";
+	public static final String TYPE_WHADJP = "WHADJP";
+	public static final String TYPE_WHADVP = "WHADVP";
+	public static final String TYPE_WHNP = "WHNP";
+	public static final String TYPE_WHPP = "WHPP";
+	public static final String TYPE_WP = "WP";
+	public static final String TYPE_WP$ = "WP$";
+	public static final String TYPE_WRB = "WRB";
+	public static final String TYPE_X = "X";
+}
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/PhraseProcessor.java b/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/PhraseProcessor.java
new file mode 100644
index 0000000..ae2772b
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/PhraseProcessor.java
@@ -0,0 +1,215 @@
+package opennlp.tools.apps.relevanceVocabs;

+

+import java.util.ArrayList;

+import java.util.Comparator;

+import java.util.List;

+

+import org.apache.commons.lang.StringUtils;

+

+import opennlp.tools.parser.Parse;

+import opennlp.tools.textsimilarity.ParseTreeChunk;

+import opennlp.tools.textsimilarity.TextProcessor;

+import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;

+import opennlp.tools.util.Span;

+

+public class PhraseProcessor {

+	

+	private ParserChunker2MatcherProcessor nlProc = ParserChunker2MatcherProcessor.getInstance() ;

+	

+	public static boolean allChildNodesArePOSTags(Parse p)

+	{

+		Parse[] subParses = p.getChildren();

+		for (int pi = 0; pi < subParses.length; pi++)

+			if (!((Parse) subParses[pi]).isPosTag())

+				return false;

+		return true;

+	}

+	

+	public ArrayList<String> getNounPhrases(Parse p)

+	{

+		ArrayList<String> nounphrases = new ArrayList<String>();

+

+		Parse[] subparses = p.getChildren();

+		for (int pi = 0; pi < subparses.length; pi++)

+		{

+

+			if (subparses[pi].getType().equals("NP") && allChildNodesArePOSTags(subparses[pi]))

+			{

+				Span _span = subparses[pi].getSpan();

+				nounphrases.add(p.getText().substring(_span.getStart(), _span.getEnd()));

+			}

+			else if (!((Parse) subparses[pi]).isPosTag())

+				nounphrases.addAll(getNounPhrases(subparses[pi]));

+		}

+

+		return nounphrases;

+	}

+	

+	public ArrayList<String> getVerbPhrases(Parse p)

+	{

+		ArrayList<String> verbPhrases = new ArrayList<String>();

+

+		Parse[] subparses = p.getChildren();

+		for (int pi = 0; pi < subparses.length; pi++)

+		{

+

+			if (subparses[pi].getType().startsWith("VB") && allChildNodesArePOSTags(subparses[pi]))

+			{

+				Span _span = subparses[pi].getSpan();

+				verbPhrases.add(p.getText().substring(_span.getStart(), _span.getEnd()));

+			}

+			else if (!((Parse) subparses[pi]).isPosTag())

+				verbPhrases.addAll(getNounPhrases(subparses[pi]));

+		}

+

+		return verbPhrases;

+	}

+	

+	// forms phrases from text which are candidate expressions for events lookup

+			public List<ParseTreeChunk> getVerbPhrases(String sentence) {

+				if (sentence==null)

+					return null;

+				if (sentence.split(" ").length ==1) { // this is a word, return empty

+					//queryArrayStr.add( sentence);

+					return null;

+				}

+				if (sentence.length()>100)

+					return null ; // too long of a sentence to parse

+				

+				System.out.println("About to parse: "+sentence);

+				List<List<ParseTreeChunk>> groupedChunks = nlProc.formGroupedPhrasesFromChunksForPara(sentence); 

+				if (groupedChunks.size()<1)

+					return null;

+

+				List<ParseTreeChunk> vPhrases = groupedChunks.get(1);

+				

+				return vPhrases;

+			}

+

+			public List<List<ParseTreeChunk>> getPhrasesOfAllTypes(String sentence) {

+				if (sentence==null)

+					return null;

+				if (sentence.split(" ").length ==1) { // this is a word, return empty

+					//queryArrayStr.add( sentence);

+					return null;

+				}

+				if (sentence.length()>200)

+					return null ; // too long of a sentence to parse

+				

+				System.out.println("About to parse: "+sentence);

+				List<List<ParseTreeChunk>> groupedChunks = nlProc.formGroupedPhrasesFromChunksForPara(sentence); 

+				if (groupedChunks.size()<1)

+					return null;

+

+				return groupedChunks;

+			}

+	

+	// forms phrases from text which are candidate expressions for events lookup

+		public List<String> extractNounPhraseProductNameCandidate(String sentence) {

+			

+			List<String> queryArrayStr = new ArrayList<String>();

+			

+			if (sentence.split(" ").length ==1) { // this is a word, return empty

+				//queryArrayStr.add( sentence);

+				return queryArrayStr;

+			}

+			String quoted1 = StringUtils.substringBetween(sentence, "\"", "\"");

+			String quoted2 = StringUtils.substringBetween(sentence, "\'", "\'");

+			List<List<ParseTreeChunk>> groupedChunks = nlProc.formGroupedPhrasesFromChunksForPara(sentence); 

+			if (groupedChunks.size()<1)

+				return queryArrayStr;

+

+			List<ParseTreeChunk> nPhrases = groupedChunks.get(0);

+

+			for (ParseTreeChunk ch : nPhrases) {

+				String query = "";

+				int size = ch.getLemmas().size();

+				boolean phraseBeingFormed = false;

+				for (int i = 0; i < size; i++) {

+					if ((ch.getPOSs().get(i).startsWith("N") || ch.getPOSs().get(i)

+							.startsWith("J") || ch.getPOSs().get(i).startsWith("CD") ) )

+					//		&& StringUtils.isAlpha(ch.getLemmas().get(i)))

+					{

+						query += ch.getLemmas().get(i) + " ";

+						phraseBeingFormed = true;

+					} else 

+						if ((ch.getPOSs().get(i).startsWith("PR") || ch.getPOSs().get(i).startsWith("IN") || ch.getPOSs().get(i).startsWith("TO")  ) 

+								&& phraseBeingFormed )

+							break;

+						else if (ch.getPOSs().get(i).startsWith("DT") || ch.getPOSs().get(i).startsWith("CC"))

+						continue;

+				}

+				query = query.trim();

+				int len = query.split(" ").length;

+				if (len > 5 || len < 2) // too long or too short

+					continue;

+				

+	/*				

+				if (len < 4 && len>1) { // every word should start with capital

+					String[] qs = query.split(" ");

+					boolean bAccept = true;

+					for (String w : qs) {

+						if (w.toLowerCase().equals(w)) // if only two words then

+														// has to be person name,

+														// title or geo

+														// location

+							bAccept = false;

+					}

+					if (!bAccept)

+						continue;

+				}

+		*/		

+				 // individual word, possibly a frequent word

+				// if len==1 do nothing

+

+				query = query.trim();

+				queryArrayStr.add(query);

+

+			}

+	/*		

+			if (queryArrayStr.size() < 1) { // release constraints on NP down to 2

+											// keywords

+				for (ParseTreeChunk ch : nPhrases) {

+					String query = "";

+					int size = ch.getLemmas().size();

+

+					for (int i = 0; i < size; i++) {

+						if (ch.getPOSs().get(i).startsWith("N")

+								|| ch.getPOSs().get(i).startsWith("J")) {

+							query += ch.getLemmas().get(i) + " ";

+						}

+					}

+					query = query.trim();

+					int len = query.split(" ").length;

+					if (len < 2)

+						continue;

+

+					query = TextProcessor.fastTokenize(query.toLowerCase(), false)

+							.toString().replace('[', ' ').replace(']', ' ').trim();

+					if (query.length() > 6)

+						queryArrayStr.add(query);

+				}

+			}

+			//queryArrayStr = Utils

+			//		.removeDuplicatesFromQueries(queryArrayStr);

+			if (quoted1 != null

+					&& ((quoted1.length() > 5 && !stopList.isCommonWord(quoted1)) || quoted1

+							.length() > 10))

+				queryArrayStr.add(quoted1);

+			if (quoted2 != null

+					&& ((quoted2.length() > 5 && !stopList.isCommonWord(quoted2)) || quoted2

+							.length() > 10))

+				queryArrayStr.add(quoted2);

+		*/	return queryArrayStr;

+		}

+		

+

+	

+		

+		public static void main(String[] args){

+			String sent = "Appliances and Kitchen Gadgets - CNET Blogs";

+					//"The tablet phenomenon turns Silicon Valley upside down - SiliconValley.com";

+			List<String> res = new PhraseProcessor().extractNounPhraseProductNameCandidate(sent);

+			System.out.println(res);

+		}

+}

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/SentimentVocab.java b/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/SentimentVocab.java
new file mode 100644
index 0000000..150b3df
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/SentimentVocab.java
@@ -0,0 +1,199 @@
+package opennlp.tools.apps.relevanceVocabs;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+
+
+public class SentimentVocab {
+	private static final String[] POSITIVE_ADJECTTIVE_LIST = { "accessible",
+			"advanced", "affordable", "amazing", "awesome", "beautiful",
+			"brilliant", "capable", "classic", "clear", "comfortable",
+			"convenient", "cool", "courteous", "cute", "decent", "delight",
+			"easy", "elegant", "enjoyable", "enough", "excellent",
+			"exceptional", "fabulous", "fancy", "fantastic", "fast",
+			"favorable", "fine", "friendly", "fun", "good", "great", "handy",
+			"happy", "hefty", "helpful", "high", "immaculate", "impressive",
+			"incredible", "interesting", "jealous", "lovely", "lucky",
+			"luxurious", "marvelous", "maximum", "memorable", "neat", "nice",
+			"outstanding", "perfect", "pleasant", "positive", "pretty",
+			"powerful", "quiet", "reasonable", "remarkable", "right", "safe",
+			"silky", "sleek", "slick", "stylish", "suitable", "superb",
+			"tasteful", "terrific", "top", "unbelievable", "useful",
+			"welcoming", "wonderful", "worthwhile" };
+
+	private static final String[] NEGATIVE_ADJECTTIVE_LIST = { "angry",
+			"annoyed", "annoying", "anxious", "arrogant", "ashamed", "awful",
+			"bad", "bored", "boring", "broke", "broken", "clumsy",
+			"complicate", "complicated", "confused", "cranky", "crazy",
+			"cumbersome", "defective", "depressed", "dead", "depressing",
+			"difficult", "dirty", "disappointed", "disappointing", "disgusted",
+			"disgusting", "disheartened", "disheartening", "dissatisfactory",
+			"dissatisfying", "distant", "disturbed", "dizzy", "doubtful",
+			"down", "drab", "dull", "dysfunctional", "embarrassed", "evil",
+			"exhausted", "fatal", "filthy", "flawed", "fragile", "frightened",
+			"frustrating", "goofy", "grieving", "hard", "horrific",
+			"horrifying", "harsh", "horrible", "impossible", "inconvenient",
+			"insane", "lack", "lacking", "lazy", "leaking", "leaky", "lonely",
+			"low", "mediocre", "messy", "mysterious", "nasty", "naughty",
+			"negative", "noisy", "nonclean", "nutty", "outdated", "outrageous",
+			"over priced", "pathetic", "poor", "premature", "pricey", "pricy",
+			"problematic", "putrid", "puzzled", "rickety", "ridiculous",
+			"ripped off", "rugged", "slow", "stinky", "strange", "stupid",
+			"sweaty", "tedious", "terrible", "tired", "tough", "toxic",
+			"trubled", "ugly", "unbearable", "unclean", "uncomfortable",
+			"unfortunate", "unhelpful", "uninviting", "unpleasent",
+			"unsanitary", "upseting", "unusable", "weird", "worn", "worn down",
+			"wretched", "wrong" };
+
+	private static final String[] POSITIVE_ADVERB_LIST = { "absolutely",
+			"amazingly", "completely", "definitely", "easily", "fairly",
+			"highly", "immensely", "incredibly", "nicely", "really", "rich",
+			"simply", "surprisingly", "tastefully", "totally", "truly", "very",
+			"well" };
+
+	private static final String[] NEGATIVE_ADVERB_LIST = { "badly",
+			"deceptfully", "down", "horribly", "oddly", "pathetically",
+			"terribly", "too", "unfortunately" };
+
+	private static final String[] POSITIVE_NOUN_LIST = { "ability", "benefit",
+			"character", "charm", "comfort", "discount", "dream", "elegance",
+			"favourite", "feature", "improvement", "luck", "luxury", "offer",
+			"pro", "quality", "requirement", "usability" };
+
+	private static final String[] NEGATIVE_NOUN_LIST = { "blocker",
+			"challenge", "complain", "complaint", "compromise", "con",
+			"concern", "crap", "disappointment", "disillusion", "doubt",
+			"downside", "drawback", "embarrassment", "error", "failure",
+			"fault", "garbage", "glitch", "inability", "issue", "junk",
+			"long line", "malfunction", "mess", "mistake", "nightmare",
+			"noise", "odor", "pain", "pitfall", "problem", "rip off", "roach",
+			"rude", "sacrifice", "shame", "shock", "stain", "threat",
+			"trouble", "urine", "worry" };
+
+	private static final String[] POSITIVE_VERB_LIST = { "admire", "amaze",
+			"assist", "disgust", "enjoy", "help", "guarantee", "impress",
+			"improve", "like", "love", "patronize", "prefer", "recommend",
+			"want" };
+
+	private static final String[] NEGATIVE_VERB_LIST = { "annoy", "appall",
+			"break", "complain", "confuse", "depress", "disappoint",
+			"dishearten", "dislike", "dissatisfy", "embarrass", "fail", "fear",
+			"flaw", "frustrate", "hate", "ruin", "scare", "stink", "suck",
+			"think twice", "thwart", "upset", "vomit" };
+
+	public static final int SENTIMENT_POSITIVE = 1;
+	public static final int SENTIMENT_UNKNOWN = 0;
+	public static final int SENTIMENT_NEGATIVE = -1;
+
+	private static SentimentVocab instance = new SentimentVocab();
+
+	// complete sentiment word map, key = word, value = sentiment object
+	private Map<String, Sentiment> sentimentMap = new HashMap<String, Sentiment>();
+
+	// sentiment word sets, key = POS type, value = word set
+	private Map<String, HashSet<String>> wordSetMap = new HashMap<String, HashSet<String>>();
+
+	public static class Sentiment {
+		public String posType;
+		public int sentimentType;
+
+		Sentiment(String posType, int sentimentType) {
+			this.posType = posType;
+			this.sentimentType = sentimentType;
+		}
+	}
+
+	public static SentimentVocab getInstance() {
+		return instance;
+	}
+
+	public Sentiment getSentiment(String word) {
+		if (word == null)
+			return null;
+
+		// get the normalized form of the word
+		//word = WordDictionary.getInstance().getLemmaOrWord(word);
+
+		return sentimentMap.get(word);
+	}
+
+	public Sentiment getSentiment(String word, String posType) {
+		if (word == null)
+			return null;
+
+		// get the normalized form of the word
+		word = WordDictionary.getInstance().getLemmaOrWord(word, posType);
+
+		return sentimentMap.get(word);
+	}
+
+	public boolean isSentimentWord(String word) {
+		return (getSentiment(word) != null);
+	}
+
+	public boolean isSentimentWord(String word, String posType) {
+		Sentiment sentiment = getSentiment(word, posType);
+		if (sentiment == null)
+			return false;
+
+		return sentiment.posType == posType;
+	}
+
+	public HashSet<String> getSentimentWordSet(String posType) {
+		if (posType == null)
+			return null;
+
+		return wordSetMap.get(posType);
+	}
+
+	public static String getSentimentName(int sentimentType) {
+		switch (sentimentType) {
+		case SENTIMENT_POSITIVE:
+			return "positive";
+		case SENTIMENT_NEGATIVE:
+			return "negative";
+		default:
+			return "unknown";
+		}
+	}
+
+	private SentimentVocab() {
+		// populate the sentiment map
+		addWordsToSentimentMap(POSITIVE_ADJECTTIVE_LIST,
+				POStags.TYPE_JJ, SENTIMENT_POSITIVE);
+		addWordsToSentimentMap(NEGATIVE_ADJECTTIVE_LIST,
+				POStags.TYPE_JJ, SENTIMENT_NEGATIVE);
+		addWordsToSentimentMap(POSITIVE_ADVERB_LIST, POStags.TYPE_RB,
+				SENTIMENT_POSITIVE);
+		addWordsToSentimentMap(NEGATIVE_ADVERB_LIST, POStags.TYPE_RB,
+				SENTIMENT_NEGATIVE);
+		addWordsToSentimentMap(POSITIVE_NOUN_LIST, POStags.TYPE_NN,
+				SENTIMENT_POSITIVE);
+		addWordsToSentimentMap(NEGATIVE_NOUN_LIST, POStags.TYPE_NN,
+				SENTIMENT_NEGATIVE);
+		addWordsToSentimentMap(POSITIVE_VERB_LIST, POStags.TYPE_VB,
+				SENTIMENT_POSITIVE);
+		addWordsToSentimentMap(NEGATIVE_VERB_LIST, POStags.TYPE_VB,
+				SENTIMENT_NEGATIVE);
+	}
+
+	private void addWordsToSentimentMap(String[] words, String posType,
+			int sentimentType) {
+
+		// add the word to the complete sentiment word map
+		for (String word : words) {
+			sentimentMap.put(word, new Sentiment(posType, sentimentType));
+		}
+
+		// add the word to the corresponding sentiment word set
+		HashSet<String> wordSet = wordSetMap.get(posType);
+		if (wordSet == null) {
+			wordSet = new HashSet<String>();
+			wordSetMap.put(posType, wordSet);
+		}
+		for (String word : words) {
+			wordSet.add(word);
+		}
+	}
+}
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/SynonymListFilter.java b/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/SynonymListFilter.java
new file mode 100644
index 0000000..7c12c9a
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/SynonymListFilter.java
@@ -0,0 +1,88 @@
+package opennlp.tools.apps.relevanceVocabs;

+

+import java.io.BufferedReader;

+import java.io.File;

+import java.io.FileInputStream;

+import java.io.FileNotFoundException;

+import java.io.FileOutputStream;

+import java.io.FileReader;

+import java.io.IOException;

+import java.io.InputStreamReader;

+import java.io.ObjectInputStream;

+import java.io.ObjectOutputStream;

+import java.io.Serializable;

+import java.net.URL;

+import java.net.URLConnection;

+import java.util.ArrayList;

+import java.util.Arrays;

+import java.util.HashMap;

+import java.util.List;

+import java.util.Map;

+

+import org.slf4j.Logger;

+import org.slf4j.LoggerFactory;

+

+

+

+public class SynonymListFilter {

+	SynonymMap map=null;

+	

+	public SynonymListFilter(String dir){

+		dir = dir.replace("maps/analytics","");

+		try {

+			map = new SynonymMap( new FileInputStream(dir+"wn_s.pl"));

+		} catch (FileNotFoundException e) {

+			// TODO Auto-generated catch block

+			e.printStackTrace();

+		} catch (IOException e) {

+			// TODO Auto-generated catch block

+			e.printStackTrace();

+		}

+	}

+	

+	protected static Map<String, List<String>> filteredKeyword_synonyms = new HashMap<String, List<String>>();

+

+	static public List<String> getFileLines(File aFile) {

+

+		List<String> items = new ArrayList<String>();

+

+		StringBuilder contents = new StringBuilder();		    

+		try {

+

+			BufferedReader input =  new BufferedReader(new FileReader(aFile));

+			try {

+				String line = null; //not declared within while loop

+				while (( line = input.readLine()) != null){

+					int endOfWord = line.indexOf(';');

+					if (endOfWord>2)

+						line = line.substring(1, endOfWord -1 );

+

+					items.add(line);

+

+				}

+			}

+			finally {

+				input.close();

+			}

+		}

+		catch (IOException ex){

+			ex.printStackTrace();

+		}

+

+		return items;

+	}

+	public String getSynonym (String word){

+			String[] synonyms = map.getSynonyms(word);

+			if (synonyms==null || synonyms.length<1)

+				return null;

+			int index = (int) Math.floor(Math.random()*(double)synonyms.length);

+			System.out.println("Found synonyms "+Arrays.asList(synonyms).toString()+ " | selected synonym = "+synonyms[index] +" | for the input = "+ word);

+			return synonyms[index];

+			

+	}	

+	public static void main(String[] args){

+		SynonymListFilter filter = new  SynonymListFilter("/src/test/resources");

+		String syn = filter.getSynonym("bring");

+		syn = filter.getSynonym("yell");

+	}

+}

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/SynonymMap.java b/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/SynonymMap.java
new file mode 100644
index 0000000..804fc2b
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/SynonymMap.java
@@ -0,0 +1,379 @@
+package opennlp.tools.apps.relevanceVocabs;

+

+import java.io.IOException;

+  import java.io.InputStream;

+   import java.nio.ByteBuffer;

+   import java.nio.charset.Charset;

+   import java.util.ArrayList;

+   import java.util.Arrays;

+   import java.util.HashMap;

+   import java.util.Iterator;

+   import java.util.Map;

+   import java.util.TreeMap;

+   import java.util.TreeSet;

+   

+   /**

+    * Loads the <a target="_blank" 

+    * href="http://www.cogsci.princeton.edu/~wn/">WordNet </a> prolog file <a

+    * href="http://www.cogsci.princeton.edu/2.0/WNprolog-2.0.tar.gz">wn_s.pl </a>

+    * into a thread-safe main-memory hash map that can be used for fast

+    * high-frequency lookups of synonyms for any given (lowercase) word string.

+    * <p>

+    * There holds: If B is a synonym for A (A -> B) then A is also a synonym for B (B -> A).

+    * There does not necessarily hold: A -> B, B -> C then A -> C.

+    * <p>

+    * Loading typically takes some 1.5 secs, so should be done only once per

+    * (server) program execution, using a singleton pattern. Once loaded, a

+    * synonym lookup via {@link #getSynonyms(String)}takes constant time O(1).

+    * A loaded default synonym map consumes about 10 MB main memory.

+    * An instance is immutable, hence thread-safe.

+    * <p>

+    * This implementation borrows some ideas from the Lucene Syns2Index demo that 

+    * Dave Spencer originally contributed to Lucene. Dave's approach

+    * involved a persistent Lucene index which is suitable for occasional

+    * lookups or very large synonym tables, but considered unsuitable for 

+    * high-frequency lookups of medium size synonym tables.

+    * <p>

+    * Example Usage:

+    * <pre>

+    * String[] words = new String[] { "hard", "woods", "forest", "wolfish", "xxxx"};

+    * SynonymMap map = new SynonymMap(new FileInputStream("samples/fulltext/wn_s.pl"));

+    * for (int i = 0; i &lt; words.length; i++) {

+    *     String[] synonyms = map.getSynonyms(words[i]);

+    *     System.out.println(words[i] + ":" + java.util.Arrays.asList(synonyms).toString());

+    * }

+    * 

+    * Example output:

+    * hard:[arduous, backbreaking, difficult, fermented, firmly, grueling, gruelling, heavily, heavy, intemperately, knockout, laborious, punishing, severe, severely, strong, toilsome, tough]

+    * woods:[forest, wood]

+    * forest:[afforest, timber, timberland, wood, woodland, woods]

+    * wolfish:[edacious, esurient, rapacious, ravening, ravenous, voracious, wolflike]

+    * xxxx:[]

+    * </pre>

+    *

+    * @see <a target="_blank"

+    *      href="http://www.cogsci.princeton.edu/~wn/man/prologdb.5WN.html">prologdb

+    *      man page </a>

+    * @see <a target="_blank" href="http://www.hostmon.com/rfc/advanced.jsp">Dave's synonym demo site</a>

+    */

public class SynonymMap {

  /** the index data; Map<String word, String[] synonyms> */
  private final HashMap<String,String[]> table;

  /** Shared zero-length result so lookup misses never allocate. */
  private static final String[] EMPTY = new String[0];

  private static final boolean DEBUG = false;

  /**
   * Constructs an instance, loading WordNet synonym data from the given input
   * stream. Finally closes the stream. The words in the stream must be in
   * UTF-8 or a compatible subset (for example ASCII, MacRoman, etc.).
   * 
   * @param input
   *            the stream to read from (null indicates an empty synonym map)
   * @throws IOException
   *             if an error occurred while reading the stream.
   */
  public SynonymMap(InputStream input) throws IOException {
    this.table = input == null ? new HashMap<String,String[]>(0) : read(toByteArray(input));
  }

  /**
   * Returns the synonym set for the given word, sorted ascending.
   * 
   * @param word
   *            the word to lookup (must be in lowercase).
   * @return the synonyms; a set of zero or more words, sorted ascending, each
   *         word containing lowercase characters that satisfy
   *         <code>Character.isLetter()</code>.
   */
  public String[] getSynonyms(String word) {
    String[] synonyms = table.get(word);
    if (synonyms == null) return EMPTY;
    String[] copy = new String[synonyms.length]; // copy for guaranteed immutability
    System.arraycopy(synonyms, 0, copy, 0, synonyms.length);
    return copy;
  }

  /**
   * Returns a String representation of the index data for debugging purposes.
   * 
   * @return a String representation
   */
  @Override
  public String toString() {
    StringBuilder buf = new StringBuilder();
    // TreeMap view gives deterministic (sorted) iteration order
    Iterator<String> iter = new TreeMap<String,String[]>(table).keySet().iterator();
    int count = 0;
    // f0..f3 are frequency counters: how many words have 0,1,2,3 synonyms
    int f0 = 0;
    int f1 = 0;
    int f2 = 0;
    int f3 = 0;

    while (iter.hasNext()) {
      String word = iter.next();
      buf.append(word + ":");
      String[] synonyms = getSynonyms(word);
      buf.append(Arrays.asList(synonyms));
      buf.append("\n");
      count += synonyms.length;
      if (synonyms.length == 0) f0++;
      if (synonyms.length == 1) f1++;
      if (synonyms.length == 2) f2++;
      if (synonyms.length == 3) f3++;
    }

    buf.append("\n\nkeys=" + table.size() + ", synonyms=" + count + ", f0=" + f0 +", f1=" + f1 + ", f2=" + f2 + ", f3=" + f3);
    return buf.toString();
  }

  /**
   * Analyzes/transforms the given word on input stream loading. This default implementation simply
   * lowercases the word. Override this method with a custom stemming
   * algorithm or similar, if desired.
   * 
   * @param word
   *            the word to analyze
   * @return the same word, or a different word (or null to indicate that the
   *         word should be ignored)
   */
  protected String analyze(String word) {
    return word.toLowerCase();
  }

  /** Returns true iff the string consists solely of letters (no spaces, hyphens, digits). */
  private static boolean isValid(String str) {
    for (int i=str.length(); --i >= 0; ) {
      if (!Character.isLetter(str.charAt(i))) return false;
    }
    return true;
  }

  /**
   * Parses the raw bytes of the WordNet prolog file (lines of the form
   * s(synsetId,wNum,'word',...)) and builds the word -> synonyms index.
   */
  private HashMap<String,String[]> read(byte[] data) {
    int WORDS  = (int) (76401 / 0.7); // presizing
    int GROUPS = (int) (88022 / 0.7); // presizing
    HashMap<String,ArrayList<Integer>> word2Groups = new HashMap<String,ArrayList<Integer>>(WORDS);  // Map<String word, int[] groups>
    HashMap<Integer,ArrayList<String>> group2Words = new HashMap<Integer,ArrayList<String>>(GROUPS); // Map<int group, String[] words>
    HashMap<String,String> internedWords = new HashMap<String,String>(WORDS);// Map<String word, String word>

    Charset charset = Charset.forName("UTF-8");
    int lastNum = -1;
    Integer lastGroup = null;
    int len = data.length;
    int i=0;

    while (i < len) { // until EOF
      /* Part A: Parse a line */

      // scan to beginning of group
      while (i < len && data[i] != '(') i++;
      if (i >= len) break; // EOF
      i++;

      // parse group (synset id digits; 48 is ASCII '0')
      int num = 0;
      while (i < len && data[i] != ',') {
        num = 10*num + (data[i] - 48);
        i++;
      }
      i++;
//      if (DEBUG) System.err.println("num="+ num);

      // scan to beginning of word
      while (i < len && data[i] != '\'') i++;
      i++;

      // scan to end of word (inner loop skips escaped quotes inside the word)
      int start = i;
      do {
        while (i < len && data[i] != '\'') i++;
        i++;
      } while (i < len && data[i] != ','); // word must end with "',"

      if (i >= len) break; // EOF
      String word = charset.decode(ByteBuffer.wrap(data, start, i-start-1)).toString();
//      String word = new String(data, 0, start, i-start-1); // ASCII

      /*
       * Part B: ignore phrases (with spaces and hyphens) and
       * non-alphabetic words, and let user customize word (e.g. do some
       * stemming)
       */
      if (!isValid(word)) continue; // ignore
      word = analyze(word);
      if (word == null || word.length() == 0) continue; // ignore


      /* Part C: Add (group,word) to tables */

      // ensure compact string representation, minimizing memory overhead
      String w = internedWords.get(word);
      if (w == null) {
        word = new String(word); // ensure compact string
        internedWords.put(word, word);
      } else {
        word = w;
      }

      // reuse the previous boxed group id when the synset id repeats
      Integer group = lastGroup;
      if (num != lastNum) {
        group = Integer.valueOf(num);
        lastGroup = group;
        lastNum = num;
      }

      // add word --> group
      ArrayList<Integer> groups = word2Groups.get(word);
      if (groups == null) {
        groups = new ArrayList<Integer>(1);
        word2Groups.put(word, groups);
      }
      groups.add(group);

      // add group --> word
      ArrayList<String> words = group2Words.get(group);
      if (words == null) {
        words = new ArrayList<String>(1);
        group2Words.put(group, words);
      }
      words.add(word);
    }


    /* Part D: compute index data structure */
    HashMap<String,String[]> word2Syns = createIndex(word2Groups, group2Words);

    /* Part E: minimize memory consumption by a factor 3 (or so) */
//    if (true) return word2Syns;
    word2Groups = null; // help gc
    //TODO: word2Groups.clear(); would be more appropriate  ? 
    group2Words = null; // help gc
    //TODO: group2Words.clear(); would be more appropriate  ? 

    return optimize(word2Syns, internedWords);
  }

  /**
   * Joins the two intermediate tables into the final word -> sorted-synonyms
   * map: for each word, the union of all words sharing any of its groups.
   */
  private HashMap<String,String[]> createIndex(Map<String,ArrayList<Integer>> word2Groups, Map<Integer,ArrayList<String>> group2Words) {
    HashMap<String,String[]> word2Syns = new HashMap<String,String[]>();

    for (final Map.Entry<String,ArrayList<Integer>> entry : word2Groups.entrySet()) { // for each word
      ArrayList<Integer> group = entry.getValue();
      String word = entry.getKey();

//      HashSet synonyms = new HashSet();
      TreeSet<String> synonyms = new TreeSet<String>();
      for (int i=group.size(); --i >= 0; ) { // for each groupID of word
        ArrayList<String> words = group2Words.get(group.get(i));
        for (int j=words.size(); --j >= 0; ) { // add all words
          String synonym = words.get(j); // note that w and word are interned
          // reference comparison is safe here because all words were interned
          // via internedWords in read()
          if (synonym != word) { // a word is implicitly it's own synonym
            synonyms.add(synonym);
          }
        }
      }

      int size = synonyms.size();
      if (size > 0) {
        String[] syns = new String[size];
        if (size == 1)
          syns[0] = synonyms.first();
        else
          synonyms.toArray(syns);
//        if (syns.length > 1) Arrays.sort(syns);
//        if (DEBUG) System.err.println("word=" + word + ":" + Arrays.asList(syns));
        word2Syns.put(word, syns);
      }
    }

    return word2Syns;
  }

  /**
   * Re-interns all keys and synonym strings as memory-overlaid substrings of
   * one large backing string, shrinking the per-string overhead of the map.
   */
  private HashMap<String,String[]> optimize(HashMap<String,String[]> word2Syns, HashMap<String,String> internedWords) {
    if (DEBUG) {
      System.err.println("before gc");
      for (int i=0; i < 10; i++) System.gc();
      System.err.println("after gc");
    }

    // collect entries
    int len = 0;
    int size = word2Syns.size();
    String[][] allSynonyms = new String[size][];
    String[] words = new String[size];
    Iterator<Map.Entry<String,String[]>> iter = word2Syns.entrySet().iterator();
    for (int j=0; j < size; j++) {
      Map.Entry<String,String[]> entry = iter.next();
      allSynonyms[j] = entry.getValue();
      words[j] = entry.getKey();
      len += words[j].length();
    }

    // assemble large string containing all words
    StringBuilder buf = new StringBuilder(len);
    for (int j=0; j < size; j++) buf.append(words[j]);
    String allWords = new String(buf.toString()); // ensure compact string across JDK versions
    buf = null;

    // intern words at app level via memory-overlaid substrings
    for (int p=0, j=0; j < size; j++) {
      String word = words[j];
      internedWords.put(word, allWords.substring(p, p + word.length()));
      p += word.length();
    }

    // replace words with interned words
    for (int j=0; j < size; j++) {
      String[] syns = allSynonyms[j];
      for (int k=syns.length; --k >= 0; ) {
        syns[k] = internedWords.get(syns[k]);
      }
      word2Syns.remove(words[j]);
      word2Syns.put(internedWords.get(words[j]), syns);
    }

    if (DEBUG) {
      words = null;
      allSynonyms = null;
      internedWords = null;
      allWords = null;
      System.err.println("before gc");
      for (int i=0; i < 10; i++) System.gc();
      System.err.println("after gc");
    }
    return word2Syns;
  }

  // the following utility methods below are copied from Apache style Nux library - see http://dsd.lbl.gov/nux
  /** Fully reads the stream into a byte array and closes it (even on failure). */
  private static byte[] toByteArray(InputStream input) throws IOException {
    try {
      // safe and fast even if input.available() behaves weird or buggy
      int len = Math.max(256, input.available());
      byte[] buffer = new byte[len];
      byte[] output = new byte[len];

      len = 0;
      int n;
      while ((n = input.read(buffer)) >= 0) {
        if (len + n > output.length) { // grow capacity
          byte tmp[] = new byte[Math.max(output.length << 1, len + n)];
          System.arraycopy(output, 0, tmp, 0, len);
          System.arraycopy(buffer, 0, tmp, len, n);
          buffer = output; // use larger buffer for future larger bulk reads
          output = tmp;
        } else {
          System.arraycopy(buffer, 0, output, len, n);
        }
        len += n;
      }

      if (len == output.length) return output;
      buffer = null; // help gc
      buffer = new byte[len];
      System.arraycopy(output, 0, buffer, 0, len);
      return buffer;
    } finally {
      if (input != null) input.close();
    }
  }

}
\ No newline at end of file
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/TopJWNLDictionary.java b/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/TopJWNLDictionary.java
new file mode 100644
index 0000000..1505096
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/TopJWNLDictionary.java
@@ -0,0 +1,164 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.apps.relevanceVocabs;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import opennlp.tools.coref.mention.Dictionary;
+
+import net.didion.jwnl.JWNL;
+import net.didion.jwnl.JWNLException;
+import net.didion.jwnl.data.Adjective;
+import net.didion.jwnl.data.IndexWord;
+import net.didion.jwnl.data.POS;
+import net.didion.jwnl.data.Pointer;
+import net.didion.jwnl.data.PointerType;
+import net.didion.jwnl.data.Synset;
+import net.didion.jwnl.data.VerbFrame;
+import net.didion.jwnl.dictionary.MapBackedDictionary;
+import net.didion.jwnl.dictionary.MorphologicalProcessor;
+import net.didion.jwnl.dictionary.morph.DefaultMorphologicalProcessor;
+import net.didion.jwnl.dictionary.morph.DetachSuffixesOperation;
+import net.didion.jwnl.dictionary.morph.LookupExceptionsOperation;
+import net.didion.jwnl.dictionary.morph.LookupIndexWordOperation;
+import net.didion.jwnl.dictionary.morph.Operation;
+import net.didion.jwnl.dictionary.morph.TokenizerOperation;
+import net.didion.jwnl.princeton.file.PrincetonObjectDictionaryFile;
+
+/**
+ * An implementation of the Dictionary interface using the JWNL library.
+ */
+public class TopJWNLDictionary implements Dictionary {
+
+	private net.didion.jwnl.dictionary.Dictionary dict;
+	private MorphologicalProcessor morphy;
+	private static String[] empty = new String[0];
+
+	public TopJWNLDictionary(String propertiesFile) throws IOException,
+			JWNLException {
+		JWNL.initialize(this.getClass().getResourceAsStream(propertiesFile));
+		dict = net.didion.jwnl.dictionary.Dictionary.getInstance();
+		morphy = dict.getMorphologicalProcessor();
+	}
+
+	@SuppressWarnings("unchecked")
+	public String[] getLemmas(String word, String tag) {
+		try {
+			POS pos;
+			if (tag.startsWith("N") || tag.startsWith("n")) {
+				pos = POS.NOUN;
+			} else if (tag.startsWith("N") || tag.startsWith("v")) {
+				pos = POS.VERB;
+			} else if (tag.startsWith("J") || tag.startsWith("a")) {
+				pos = POS.ADJECTIVE;
+			} else if (tag.startsWith("R") || tag.startsWith("r")) {
+				pos = POS.ADVERB;
+			} else {
+				pos = POS.NOUN;
+			}
+			List<String> lemmas = morphy.lookupAllBaseForms(pos, word);
+			return lemmas.toArray(new String[lemmas.size()]);
+		} catch (JWNLException e) {
+			e.printStackTrace();
+			return null;
+		}
+	}
+
+	public String getSenseKey(String lemma, String pos, int sense) {
+		try {
+			IndexWord iw = dict.getIndexWord(POS.NOUN, lemma);
+			if (iw == null) {
+				return null;
+			}
+			return String.valueOf(iw.getSynsetOffsets()[sense]);
+		} catch (JWNLException e) {
+			e.printStackTrace();
+			return null;
+		}
+
+	}
+
+	public int getNumSenses(String lemma, String pos) {
+		try {
+			IndexWord iw = dict.getIndexWord(POS.NOUN, lemma);
+			if (iw == null) {
+				return 0;
+			}
+			return iw.getSenseCount();
+		} catch (JWNLException e) {
+			return 0;
+		}
+	}
+
+	private void getParents(Synset synset, List<String> parents)
+			throws JWNLException {
+		Pointer[] pointers = synset.getPointers();
+		for (int pi = 0, pn = pointers.length; pi < pn; pi++) {
+			if (pointers[pi].getType() == PointerType.HYPERNYM) {
+				Synset parent = pointers[pi].getTargetSynset();
+				parents.add(String.valueOf(parent.getOffset()));
+				getParents(parent, parents);
+			}
+		}
+	}
+
+	public String[] getParentSenseKeys(String lemma, String pos, int sense) {
+		// System.err.println("JWNLDictionary.getParentSenseKeys: lemma="+lemma);
+		try {
+			IndexWord iw = dict.getIndexWord(POS.NOUN, lemma);
+			if (iw != null) {
+				Synset synset = iw.getSense(sense + 1);
+				List<String> parents = new ArrayList<String>();
+				getParents(synset, parents);
+				return parents.toArray(new String[parents.size()]);
+			} else {
+				return empty;
+			}
+		} catch (JWNLException e) {
+			e.printStackTrace();
+			return null;
+		}
+	}
+
+	public static void main(String[] args) throws IOException, JWNLException {
+		String searchDir = System.getProperty("WNSEARCHDIR");
+		System.err.println("searchDir=" + searchDir);
+		searchDir = "models/WordNet_2.1";
+		if (searchDir != null) {
+			Dictionary dict = new TopJWNLDictionary(
+					System.getProperty("WNSEARCHDIR"));
+			// Dictionary dict = new TopJWNLDictionary();
+			// String word = args[0];
+			String[] lemmas = dict.getLemmas("test", "NN");
+			for (int li = 0, ln = lemmas.length; li < ln; li++) {
+				for (int si = 0, sn = dict.getNumSenses(lemmas[li], "NN"); si < sn; si++) {
+					System.out.println(lemmas[li]
+							+ " ("
+							+ si
+							+ ")\t"
+							+ java.util.Arrays.asList(dict.getParentSenseKeys(
+									lemmas[li], "NN", si)));
+				}
+			}
+		}
+	}
+}
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/WordDictionary.java b/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/WordDictionary.java
new file mode 100644
index 0000000..dbbec1d
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/WordDictionary.java
@@ -0,0 +1,137 @@
+package opennlp.tools.apps.relevanceVocabs;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import opennlp.tools.coref.mention.Dictionary;
+
+public class WordDictionary {
+	private static final String[][] SPECIAL_CASES = { { "lens", "lenses" } };
+
+	//private static final String WORDNET_PROPERTITES_KEY = "wordnet.propertites.file";
+	//private static final String PROPERTIES_FILE = null;;
+
+	// private static final String DATA_DIR;
+	private static WordDictionary instance;
+
+	private Dictionary dictionary;
+	private Map<String, String> specialCaseMap;
+
+	/*static {
+		ConfigProperties config = ConfigFactory.getInstance()
+				.getConfigProperties(ConfigFactory.NLP_CONFIG_PATH);
+		PROPERTIES_FILE = config.getProperty(WORDNET_PROPERTITES_KEY);
+	}*/
+
+	public synchronized static WordDictionary getInstance() {
+		if (instance == null)
+			instance = new WordDictionary();
+
+		return instance;
+	}
+
+	private WordDictionary() {
+		// initialize the dictionary by loading the WordNet database
+		try {
+			dictionary = new TopJWNLDictionary("PROPERTIES_FILE");
+		} catch (Exception e) {
+			e.printStackTrace();
+			System.err.println("Failed to load the WordNet database: " + e);
+		}
+
+		// build the dictionary for special cases
+		specialCaseMap = buildSpecialCaseMap();
+	}
+
+	public String getLemmaOrWord(String word, String type) {
+		String lemma = getLemma(word, type);
+		if (lemma != null)
+			return lemma;
+		else
+			return (word == null) ? null : word.trim().toLowerCase();
+	}
+
+	public String getLemma(String word, String type) {
+		if (word == null)
+			return null;
+		// skip some long word,avoid dictionary getLemmas dead
+		if (word.length() >= 20)
+			return word;
+		word = word.trim().toLowerCase();
+		if (word.length() == 0)
+			return null;
+
+		// check special cases first
+		String lemma = specialCaseMap.get(word);
+		if (lemma != null)
+			return lemma;
+
+		// use the dictionary for general cases
+		// JWNLDictionary has a bug, and we have to use lower case type
+		type = (type == null) ? null : type.toLowerCase();
+		String[] lemmas = dictionary.getLemmas(word, type);
+		if (lemmas == null || lemmas.length == 0)
+			return null;
+
+		return lemmas[0];
+	}
+
+	/**
+	 * get the lemma for a word of unknown POS type return the word if no lemma
+	 * is found
+	 * 
+	 * @param word
+	 * @return
+	 */
+	public String getLemmaOrWord(String word) {
+		if (word == null)
+			return null;
+
+		// try noun first
+		String lemma = getLemma(word, "NN");
+		if (lemma != null)
+			return lemma;
+
+		// then try verb
+		lemma = getLemma(word, "VB");
+		if (lemma != null)
+			return lemma;
+
+		// return word now
+		return word.trim().toLowerCase();
+	}
+
+	private Map<String, String> buildSpecialCaseMap() {
+
+		Map<String, String> specialCaseMap = new HashMap<String, String>();
+		for (String[] wordList : SPECIAL_CASES) {
+			String lemma = wordList[0];
+			for (String word : wordList) {
+				specialCaseMap.put(word, lemma);
+			}
+		}
+
+		return specialCaseMap;
+	}
+
+	public static void main(String[] args) {
+		String[] verbs = { "is", "has", "were", "likes", "TaKen", "going" };
+		String[] nouns = { "efficient", "Cars", "lens", "wives", "lenses",
+				"photos" };
+		String[] adverbs = { "would", "could", "should", "might" };
+		WordDictionary dictionary = WordDictionary.getInstance();
+
+		for (String word : verbs) {
+			System.out
+					.println(word + " ==> " + dictionary.getLemma(word, "VB"));
+		}
+		for (String word : nouns) {
+			System.out
+					.println(word + " ==> " + dictionary.getLemma(word, "NN"));
+		}
+		for (String word : adverbs) {
+			System.out
+					.println(word + " ==> " + dictionary.getLemma(word, "JJ"));
+		}
+	}
+}
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/BingAPIProductSearchManager.java b/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/BingAPIProductSearchManager.java
new file mode 100644
index 0000000..b1afe09
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/BingAPIProductSearchManager.java
@@ -0,0 +1,68 @@
+package opennlp.tools.apps.review_builder;

+

+import java.util.ArrayList;

+import java.util.List;

+

+import opennlp.tools.similarity.apps.BingQueryRunner;

+import opennlp.tools.similarity.apps.HitBase;

+

+import org.apache.commons.lang.StringUtils;

+

+public class BingAPIProductSearchManager {

+	BingQueryRunner search = new BingQueryRunner();

+

+	public List<HitBase> findProductByName(String name, int count){

+		List<HitBase> foundFBPages = search.runSearch("site:amazon.com"+" "+name + " reviews"

+				, 10);

+		List<HitBase> results = new ArrayList<HitBase>();

+		int ct=0;

+		for(HitBase h: foundFBPages){

+			if (ct>=count) break; ct++; 

+			String title = h.getTitle().toLowerCase();

+			if (h.getUrl().indexOf("amazon.com")<0)

+				continue;

+			String[] merchantWords = name.toLowerCase().split(" ");

+			int overlapCount=0;

+/*			for(String commonWord:merchantWords){

+				if (title.indexOf(commonWord+" ")>-1 || title.indexOf(" "+commonWord)>-1){

+					overlapCount++;

+					System.out.println(" found word "+ commonWord + " in title = "+title);

+				}

+			}

+			float coverage = (float)overlapCount/(float) (merchantWords.length);

+			if ((coverage>0.4 || (coverage>0.5f && merchantWords.length <4 )))

+*/				results.add(h);

+		}

+		return results;

+	}

+	

+	public List<HitBase> findProductByNameNoReview(String name, int count){

+		List<HitBase> foundFBPages = search.runSearch(name, count);

+		List<HitBase> results = new ArrayList<HitBase>();

+		int ct=0;

+		for(HitBase h: foundFBPages){

+			if (ct>=count) break; ct++; 

+			String title = h.getTitle().toLowerCase();

+			String[] merchantWords = name.toLowerCase().split(" ");

+			int overlapCount=0;

+			for(String commonWord:merchantWords){

+				if (title.indexOf(commonWord+" ")>-1 || title.indexOf(" "+commonWord)>-1){

+					overlapCount++;

+					System.out.println(" found word "+ commonWord + " in title = "+title);

+				}

+			}

+			float coverage = (float)overlapCount/(float) (merchantWords.length);

+			if ((coverage>0.4 || (coverage>0.5f && merchantWords.length <4 )))

+				results.add(h);

+		}

+		return results;

+	}

+

+	

+

+	public static void main(String[] args){

+		BingAPIProductSearchManager man = new BingAPIProductSearchManager ();

+		List<HitBase> res = man.findProductByName("chain saw", 5);

+		System.out.println(res);  	

+	}

+}

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/FBOpenGraphSearchManager.java b/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/FBOpenGraphSearchManager.java
new file mode 100644
index 0000000..926a723
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/FBOpenGraphSearchManager.java
@@ -0,0 +1,143 @@
+package opennlp.tools.apps.review_builder;

+

+import java.util.ArrayList;

+import java.util.Calendar;

+import java.util.List;

+import org.apache.commons.lang.StringUtils;

+

+import opennlp.tools.jsmlearning.ProfileReaderWriter;

+import opennlp.tools.similarity.apps.utils.PageFetcher;

+import com.restfb.Connection;

+import com.restfb.DefaultFacebookClient;

+import com.restfb.FacebookClient;

+import com.restfb.Parameter;

+import com.restfb.exception.FacebookException;

+import com.restfb.types.Event;

+import com.restfb.types.Page;

+

+

+public class FBOpenGraphSearchManager {

+

+	public List<String[]> profiles = null;

+	protected FacebookClient mFBClient;

+	protected PageFetcher pageFetcher = new PageFetcher();

+	protected static final int NUM_TRIES = 5;

+	protected static final long WAIT_BTW_TRIES=1000; //milliseconds between re-tries

+	

+		

+	public FBOpenGraphSearchManager(){

+		profiles = ProfileReaderWriter.readProfiles("C:\\nc\\features\\analytics\\dealanalyzer\\sweetjack-localcoupon-may12012tooct302012.csv");

+		

+	}

+	

+		

+	public void setFacebookClient(FacebookClient c){

+		this.mFBClient=c;

+	}

+	

+	public List<Event> getFBEventsByName(String event)

+	{

+	    List<Event> events = new ArrayList<Event>();

+	    

+	    for(int i=0; i < NUM_TRIES; i++)

+	    {

+    	    try

+    	    {

+        	    Connection<Event> publicSearch =

+        	            mFBClient.fetchConnection("search", Event.class,

+        	              Parameter.with("q", event), Parameter.with("type", "event"),Parameter.with("limit", 100));

+        	    System.out.println("Searching FB events for " + event);

+        	    events= publicSearch.getData();

+        	    break;

+    	    }

+    	    catch(FacebookException e)

+    	    {

+    	    	System.out.println("FBError "+e);

+    	        try

+                {

+                    Thread.sleep(WAIT_BTW_TRIES);

+                }

+                catch (InterruptedException e1)

+                {

+                    // TODO Auto-generated catch block

+                	System.out.println("Error "+e1);

+                }

+    	    }

+	    }

+	    return events;

+	}

+	

+	public Long getFBPageLikes(String merchant)

+	{

+        List<Page> groups = new ArrayList<Page>();

+        

+        for(int i=0; i < NUM_TRIES; i++)

+        {

+            try

+            {

+                Connection<Page> publicSearch =

+                        mFBClient.fetchConnection("search", Page.class,

+                          Parameter.with("q", merchant), Parameter.with("type", "page"),Parameter.with("limit", 100));

+                System.out.println("Searching FB Pages for " + merchant);

+                groups= publicSearch.getData();

+                break;

+            }

+            catch(FacebookException e)

+            {

+            	System.out.println("FBError "+e);

+                try

+                {

+                    Thread.sleep(WAIT_BTW_TRIES);

+                }

+                catch (InterruptedException e1)

+                {

+                    // TODO Auto-generated catch block

+                	System.out.println("Error "+e1);

+                }

+            }

+        }

+        

+        for (Page p: groups){

+        	if (p!=null && p.getLikes()!=null && p.getLikes()>0) 

+        		return p.getLikes();

+        }

+        

+        //stats fwb">235</span>

+        

+        for (Page p: groups){

+        	if (p.getId()==null)

+        		continue;

+        	String content = pageFetcher.fetchOrigHTML("http://www.facebook.com/"+p.getId());

+        

+        	String likes = StringUtils.substringBetween(content, "stats fwb\">", "<" );

+        	if (likes==null)

+        		continue;

+        	Integer nLikes =0;

+        	try {

+        	nLikes = Integer.parseInt(likes);

+        	} catch (Exception e){

+        		

+        	}

+        	if (nLikes>0){

+        		return (long)nLikes;

+        	}

+        	

+        }

+        

+        

+        return null;

+	}

+	

+

+    // 

+    

+    public static void main(String[] args){

+    	FBOpenGraphSearchManager man = new FBOpenGraphSearchManager ();

+    	man.setFacebookClient(new DefaultFacebookClient());

+       	

+    	

+    	long res = man.getFBPageLikes("chain saw");

+    	System.out.println(res);

+    	    	

+    }

+}

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/MachineTranslationWrapper.java b/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/MachineTranslationWrapper.java
new file mode 100644
index 0000000..8ddf502
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/MachineTranslationWrapper.java
@@ -0,0 +1,86 @@
+package opennlp.tools.apps.review_builder;

+

+import java.io.BufferedReader;

+import java.io.IOException;

+import java.io.InputStreamReader;

+import java.net.MalformedURLException;

+import java.net.URL;

+import java.net.URLConnection;

+import java.net.URLDecoder;

+import java.util.HashMap;

+import java.util.Map;

+

+import org.apache.commons.lang.StringUtils;

+import org.json.JSONArray;

+import org.json.JSONException;

+import org.json.JSONObject;

+

+public class MachineTranslationWrapper  {

+	private String translatorURL = "http://mymemory.translated.net/api/get?q=";

+	

+	public String translate(String sentence, String lang2lang){

+		if (sentence==null)

+			return null;

+		String request = translatorURL + sentence.replace(' ','+') + "&langpair="+lang2lang;//"en|es";

+		JSONArray arr=null, prodArr = null, searchURLviewArr = null;

+		try {

+			URL urlC = new URL(request);

+			URLConnection connection = urlC.openConnection();

+

+			String line;

+			String result = "";

+			BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream()));

+			int count = 0;

+			while ((line = reader.readLine()) != null)

+			{

+				result+=line;

+				count++;

+			}

+			JSONObject rootObject = new JSONObject(result);

+			JSONObject  findObject = rootObject.getJSONObject("responseData");

+			String transl = findObject.getString("translatedText");

+			try {

+				transl = URLDecoder.decode(transl, "UTF-8");

+			} catch (Exception e) {

+				

+			}

+			

+			return transl;

+			

+		} catch (MalformedURLException e) {

+			

+			e.printStackTrace();

+			return null;

+		} catch (JSONException e) {

+			e.printStackTrace();

+			return null;			

+		} catch (IOException e) {

+			e.printStackTrace();

+			return null;			

+		}	

+		

+	}

+	

+	public String rePhrase(String sentence){

+		System.out.println("orig = "+ sentence);

+		String transl = translate(sentence, "en|es");

+		System.out.println("tranls = "+transl);

+		String inverseTransl = translate(transl, "es|en");

+		if (!(inverseTransl.indexOf("NO QUERY SPECIFIED")>-1) && !(inverseTransl.indexOf("INVALID LANGUAGE")>-1) && !(inverseTransl.indexOf("MYMEMORY WARNING")>-1))

+			return inverseTransl;

+		else 

+			return sentence;

+	}

+	

+	

+	

+	public static void main(String[] args){

+		MachineTranslationWrapper rePhraser = new MachineTranslationWrapper();

+		

+		System.out.println(rePhraser.translate("I went to the nearest bookstore to buy a book written by my friend and his aunt", "en|ru"));

+		

+		System.out.println(rePhraser.rePhrase("I went to the nearest bookstore to buy a book written by my friend and his aunt"));

+

+	}

+		

+}

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/MinedSentenceProcessor.java b/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/MinedSentenceProcessor.java
new file mode 100644
index 0000000..73d8417
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/MinedSentenceProcessor.java
@@ -0,0 +1,210 @@
+/*

+ * Licensed to the Apache Software Foundation (ASF) under one or more

+ * contributor license agreements.  See the NOTICE file distributed with

+ * this work for additional information regarding copyright ownership.

+ * The ASF licenses this file to You under the Apache License, Version 2.0

+ * (the "License"); you may not use this file except in compliance with

+ * the License. You may obtain a copy of the License at

+ *

+ *     http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+

+package opennlp.tools.apps.review_builder;

+

+import java.util.Arrays;

+import java.util.List;

+

+import opennlp.tools.similarity.apps.utils.Utils;

+

+import org.apache.commons.lang.StringUtils;

+

+public class MinedSentenceProcessor {

+  public static String acceptableMinedSentence(String sent) {

+    // if too many commas => seo text

+

+    String[] commas = StringUtils.split(sent, ',');

+    String[] spaces = StringUtils.split(sent, ' ');

+    if ((float) commas.length / (float) spaces.length > 0.7) {

+      System.out.println("Rejection: too many commas");

+      return null;

+    }

+    

+    String[] otherDelimiters = StringUtils.split(sent, '/');

+    if ((float) otherDelimiters.length / (float) spaces.length > 0.7) {

+        System.out.println("Rejection: too many delimiters");

+        return null;

+    }

+    

+    otherDelimiters = StringUtils.split(sent, '.');

+    if ((float) otherDelimiters.length / (float) spaces.length > 0.7) {

+        System.out.println("Rejection: too many delimiters");

+        return null;

+    }

+    otherDelimiters = StringUtils.split(sent, '!');

+    if ((float) otherDelimiters.length / (float) spaces.length > 0.7) {

+        System.out.println("Rejection: too many delimiters");

+        return null;

+    }

+    otherDelimiters = StringUtils.split(sent, '=');

+    if ((float) otherDelimiters.length / (float) spaces.length > 0.7) {

+        System.out.println("Rejection: too many delimiters");

+        return null;

+    }

+    

+    String[] pipes = StringUtils.split(sent, '|');

+    if (StringUtils.split(sent, '|').length > 2

+        || StringUtils.split(sent, '>').length > 2) {

+      System.out.println("Rejection: too many |s or >s ");

+      return null;

+    }

+    String sentTry = sent.toLowerCase();

+    // if too many long spaces

+    String sentSpaces = sentTry.replace("   ", "");

+    if (sentSpaces.length() - sentTry.length() > 10) // too many spaces -

+      // suspicious

+      return null;

+

+    if (sentTry.indexOf("click here") > -1 || sentTry.indexOf(" wikip") > -1

+        || sentTry.indexOf("copyright") > -1

+        || sentTry.indexOf("operating hours") > -1

+        || sentTry.indexOf("days per week") > -1

+        || sentTry.indexOf("click for") > -1 || sentTry.indexOf("photos") > -1

+        || sentTry.indexOf("find the latest") > -1

+        || sentTry.startsWith("subscribe")

+        || sentTry.indexOf("Terms of Service") > -1

+        || sentTry.indexOf("clicking here") > -1

+        || sentTry.indexOf("skip to") > -1 || sentTry.indexOf("sidebar") > -1

+        || sentTry.indexOf("Tags:") > -1 || sentTry.startsWith("Posted by")

+        || sentTry.indexOf("available online") > -1

+        || sentTry.indexOf("get online") > -1

+        || sentTry.indexOf("buy online") > -1

+        || sentTry.indexOf("not valid") > -1 || sentTry.indexOf("discount") > -1

+        || sentTry.indexOf("official site") > -1

+        || sentTry.indexOf("this video") > -1

+        || sentTry.indexOf("this book") > -1

+        || sentTry.indexOf("this product") > -1

+        || sentTry.indexOf("paperback") > -1 || sentTry.indexOf("hardcover") > -1

+        || sentTry.indexOf("audio cd") > -1

+        || sentTry.indexOf("related searches") > -1

+        || sentTry.indexOf("permission is granted") > -1

+        || sentTry.indexOf("[edit") > -1

+        || sentTry.indexOf("edit categories") > -1

+        || sentTry.indexOf("free license") > -1

+        || sentTry.indexOf("permission is granted") > -1

+        || sentTry.indexOf("under the terms") > -1

+        || sentTry.indexOf("rights reserved") > -1

+        || sentTry.indexOf("wikipedia") > -1 || sentTry.endsWith("the")

+        || sentTry.endsWith("the.") || sentTry.startsWith("below") 

+        || sentTry.indexOf("recipient of")>-1 || sentTry.indexOf("this message")>-1 

+        ||sentTry.indexOf( "mailing list")>-1 ||sentTry.indexOf( "purchase order")>-1

+        ||sentTry.indexOf( "mon-fri")>-1 ||sentTry.indexOf( "email us")>-1 ||sentTry.indexOf( "privacy pol")>-1 ||sentTry.indexOf( "back to top")>-1 

+        ||sentTry.indexOf( "click here")>-1 ||sentTry.indexOf( "for details")>-1 ||sentTry.indexOf( "assistance?")>-1 ||sentTry.indexOf( "chat live")>-1

+        ||sentTry.indexOf( "free shipping")>-1 ||sentTry.indexOf( "company info")>-1 ||sentTry.indexOf( "satisfaction g")>-1 ||sentTry.indexOf( "contact us")>-1

+        ||sentTry.startsWith( "fax") ||sentTry.startsWith( "write") || sentTry.startsWith( "email")||sentTry.indexOf( "conditions")>-1 ||sentTry.indexOf( "chat live")>-1

+        ||sentTry.startsWith( "we ") ||sentTry.indexOf( "the recipient")>-1 ||sentTry.indexOf( "day return")>-1 ||sentTry.indexOf( "days return")>-1

+        

+        ||sentTry.startsWith( "fax") ||sentTry.indexOf( "refund it")>-1 || sentTry.indexOf( "your money")>-1

+        ||sentTry.startsWith( "free") ||sentTry.indexOf( "purchase orders")>-1

+        ||sentTry.startsWith( "exchange it ") ||sentTry.indexOf( "return it")>-1 ||sentTry.indexOf( "credit card")>-1 

+        

+        ||sentTry.indexOf( "storeshop")>-1 || sentTry.startsWith( "find") || sentTry.startsWith( "shop") || sentTry.startsWith( "unlimited") 

+        ||sentTry.indexOf( "for a limited time")>-1 ||sentTry.indexOf( "prime members")>-1 ||sentTry.indexOf( "amazon members")>-1 ||sentTry.indexOf( "unlimited free")>-1 

+        ||sentTry.indexOf( "shipping")>-1 || sentTry.startsWith( "amazon")

+// not a script text

+        ||sentTry.indexOf( "document.body")>-1 ||sentTry.indexOf( " var ")>-1         ||sentTry.indexOf( "search suggestions")>-1 ||sentTry.startsWith( "Search") 

+        

+    		)

+      return null;

+    

+    //Millions of Amazon Prime members enjoy instant videos, free Kindle books and unlimited free two-day shipping.

+

+    // count symbols indicating wrong parts of page to mine for text

+    // if short and contains too many symbols indicating wrong area: reject

+    String sentWrongSym = sentTry.replace(">", "&&&").replace("�", "&&&")

+        .replace("|", "&&&").replace(":", "&&&").replace("/", "&&&")

+        .replace("-", "&&&").replace("%", "&&&");

+    if ((sentWrongSym.length() - sentTry.length()) >= 4

+        && sentTry.length() < 200) // twice ot more

+      return null;

+

+    sent = sent.replace('[', ' ').replace(']', ' ')

+        .replace("_should_find_orig_", "").replace(".   .", ". ")

+        .replace("amp;", " ").replace("1.", " ").replace("2.", " ")

+        .replace("3.", " ").replace("4.", " ").replace("2009", "2011")

+        .replace("2008", "2011").replace("2006", "2011")

+        .replace("2007", "2011").replace("VIDEO:", " ").replace("Video:", " ")

+        .replace("no comments", " ").replace("  ", " ").replace("  ", " ")

+        .replace("(more.)", "").replace("more.", "").replace("<more>", "")

+        .replace("[more]", "").replace(".,", ".").replace("&lt;", "")

+        .replace("p&gt;", "").replace("product description", "");

+

+    // TODO .replace("a.", ".");

+

+    int endIndex = sent.indexOf(" posted");

+    if (endIndex > 0)

+      sent = sent.substring(0, endIndex);

+

+    return sent;

+  }

+

+  public static String processSentence(String pageSentence) {

+    if (pageSentence == null)

+      return "";

+    pageSentence = Utils.fullStripHTML(pageSentence);

+    pageSentence = StringUtils.chomp(pageSentence, "..");

+    pageSentence = StringUtils.chomp(pageSentence, ". .");

+    pageSentence = StringUtils.chomp(pageSentence, " .");

+    pageSentence = StringUtils.chomp(pageSentence, ".");

+    pageSentence = StringUtils.chomp(pageSentence, "...");

+    pageSentence = StringUtils.chomp(pageSentence, " ....");

+    pageSentence = pageSentence.replace("::", ":").replace(".,", ". ")

+        .replace("(.)", "");

+

+    pageSentence = pageSentence.trim();

+    pageSentence = pageSentence.replaceAll("\\s+", " "); // make single

+    // spaces

+    // everywhere

+

+    String[] pipes = StringUtils.split(pageSentence, '|'); // removed

+    // shorter part

+    // of sentence

+    // at the end

+    // after pipe

+    if (pipes.length == 2

+        && ((float) pipes[0].length() / (float) pipes[1].length() > 3.0)) {

+      int pipePos = pageSentence.indexOf("|");

+      if (pipePos > -1)

+        pageSentence = pageSentence.substring(0, pipePos - 1).trim();

+

+    }

+

+    if (!StringUtils.contains(pageSentence, '.')

+        && !StringUtils.contains(pageSentence, '?')

+        && !StringUtils.contains(pageSentence, '!'))

+      pageSentence = pageSentence + ". ";

+

+    pageSentence = pageSentence.replace(" .", ".").replace("..", ".").trim();

+    if (!pageSentence.endsWith("."))

+      pageSentence += ". ";

+    return pageSentence;

+  }

+

+  

+  public static String normalizeForSentenceSplitting(String pageContent) {

+    pageContent.replace("Jan.", "January").replace("Feb.", "February")

+        .replace("Mar.", "March").replace("Apr.", "April")

+        .replace("Jun.", "June").replace("Jul.", "July")

+        .replace("Aug.", "August").replace("Sep.", "September")

+        .replace("Oct.", "October").replace("Nov.", "November")

+        .replace("Dec.", "December");

+

+    return pageContent;

+

+  }

+}
\ No newline at end of file
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/ParserConstants.java b/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/ParserConstants.java
new file mode 100644
index 0000000..9862ffb
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/ParserConstants.java
@@ -0,0 +1,74 @@
+package opennlp.tools.apps.review_builder;
+
/**
 * String labels for phrase and part-of-speech tags used by the parser,
 * following the Penn Treebank tag set, plus three project-specific phrase
 * types (STP/SGP/SNP).
 *
 * NOTE(review): this is a "constant interface" (an anti-pattern per
 * Effective Java); a final class with a private constructor, or an enum,
 * would be preferable — left as-is since implementers may depend on it.
 */
public interface ParserConstants {
	// added new POS types for infinitive phrase and participle phrase
	public static final String TYPE_STP = "STP"; // infinitive phrase
	public static final String TYPE_SGP = "SGP"; // present participle phrase
	public static final String TYPE_SNP = "SNP"; // past participle phrase

	// below are the standard POS types,
	// http://bulba.sdsu.edu/jeanette/thesis/PennTags.html
	public static final String TYPE_ADJP = "ADJP";
	public static final String TYPE_ADVP = "ADVP";
	public static final String TYPE_CC = "CC";
	public static final String TYPE_CD = "CD";
	public static final String TYPE_CONJP = "CONJP";
	public static final String TYPE_DT = "DT";
	public static final String TYPE_EX = "EX";
	public static final String TYPE_FRAG = "FRAG";
	public static final String TYPE_FW = "FW";
	public static final String TYPE_IN = "IN";
	public static final String TYPE_INTJ = "INTJ";
	public static final String TYPE_JJ = "JJ";
	public static final String TYPE_JJR = "JJR";
	public static final String TYPE_JJS = "JJS";
	public static final String TYPE_LS = "LS";
	public static final String TYPE_LST = "LST";
	public static final String TYPE_MD = "MD";
	public static final String TYPE_NAC = "NAC";
	public static final String TYPE_NN = "NN";
	public static final String TYPE_NNS = "NNS";
	public static final String TYPE_NNP = "NNP";
	public static final String TYPE_NNPS = "NNPS";
	public static final String TYPE_NP = "NP";
	public static final String TYPE_NX = "NX";
	public static final String TYPE_PDT = "PDT";
	public static final String TYPE_POS = "POS";
	public static final String TYPE_PP = "PP";
	public static final String TYPE_PRN = "PRN";
	public static final String TYPE_PRP = "PRP";
	public static final String TYPE_PRP$ = "PRP$";
	public static final String TYPE_PRT = "PRT";
	public static final String TYPE_QP = "QP";
	public static final String TYPE_RB = "RB";
	public static final String TYPE_RBR = "RBR";
	public static final String TYPE_RBS = "RBS";
	public static final String TYPE_RP = "RP";
	public static final String TYPE_RRC = "RRC";
	public static final String TYPE_S = "S";
	public static final String TYPE_SBAR = "SBAR";
	public static final String TYPE_SBARQ = "SBARQ";
	public static final String TYPE_SINV = "SINV";
	public static final String TYPE_SQ = "SQ";
	public static final String TYPE_SYM = "SYM";
	public static final String TYPE_TO = "TO";
	public static final String TYPE_TOP = "TOP";
	public static final String TYPE_UCP = "UCP";
	public static final String TYPE_UH = "UH";
	public static final String TYPE_VB = "VB";
	public static final String TYPE_VBD = "VBD";
	public static final String TYPE_VBG = "VBG";
	public static final String TYPE_VBN = "VBN";
	public static final String TYPE_VBP = "VBP";
	public static final String TYPE_VBZ = "VBZ";
	public static final String TYPE_VP = "VP";
	public static final String TYPE_WDT = "WDT";
	public static final String TYPE_WHADJP = "WHADJP";
	public static final String TYPE_WHADVP = "WHADVP";
	public static final String TYPE_WHNP = "WHNP";
	public static final String TYPE_WHPP = "WHPP";
	public static final String TYPE_WP = "WP";
	public static final String TYPE_WP$ = "WP$";
	public static final String TYPE_WRB = "WRB";
	public static final String TYPE_X = "X";
}
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/ReviewBuilderRunner.java b/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/ReviewBuilderRunner.java
new file mode 100644
index 0000000..956640f
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/ReviewBuilderRunner.java
@@ -0,0 +1,166 @@
+package opennlp.tools.apps.review_builder;

+

+import java.util.ArrayList;

+import java.util.List;

+

+import opennlp.tools.jsmlearning.ProfileReaderWriter;

+import opennlp.tools.parse_thicket.Triple;

+

+public class ReviewBuilderRunner {

+

+	private List<Triple> input = new ArrayList<Triple>(); 

+

+	public ReviewBuilderRunner(){

+

+		/*	input.add( new Pair<String, Integer>("chief architect portable mobile tv", 204973051));

+

+	input.add( new Pair<String, Integer>("lg plasma tv", 215734562));

+	input.add( new Pair<String, Integer>("magnavox lcd hdtv", 215415652));

+	input.add( new Pair<String, Integer>("yamaha aventage home theater receiver", 215742271));

+	input.add( new Pair<String, Integer>("panasonic 24inch lcd tv", 215742233));

+	input.add( new Pair<String, Integer>("otterbox barnes and noble nook commuter case", 215572161));

+	input.add( new Pair<String, Integer>("sony kdl32ex340 led tv", 215743925));

+	input.add( new Pair<String, Integer>("alpine waterfall tabletop fountain lighting", 215135546));

+    input.add( new Pair<String, Integer>("ihome rechargeable speaker system", 215363231 ));

+	input.add( new Pair<String, Integer>("ion slide film scanner", 212088884));

+

+		 input.add( new Pair<String, Integer>("mens dr martens shoes black nappa", 210813142));

+		 input.add( new Pair<String, Integer>("calvin klein seamless thong panty", 201984853));

+		 input.add( new Pair<String, Integer>("mens clarks shoes wallabee beeswax leather", 210808477));

+		//? input.add( new Pair<String, Integer>("mens sperry topsider shoes", 210809238));

+		 input.add( new Pair<String, Integer>("mens giorgio brutini shoes italian calf", 210809508));

+

+		input.add( new Pair<String, Integer>("halo portable backup battery", 1640825398));

+input.add( new Pair<String, Integer>("kenwood pkgmp18 cd receiver  coaxial speakers",1642712915));

+input.add( new Pair<String, Integer>("element ultraslim hdtv",1643167865));

+input.add( new Pair<String, Integer>("westinghouse  dled hdtv black",1641930013));

+input.add( new Pair<String, Integer>("boss audio receiver speaker package system",1643532459));

+input.add( new Pair<String, Integer>("kenwood  cd receiver coaxial speakers bundle",1646566070));

+input.add( new Pair<String, Integer>("element electronics lcd tv black ",1637163018));

+input.add( new Pair<String, Integer>("stunt copter rechargeable battery pack",1636937811));

+input.add( new Pair<String, Integer>("element led ultraslim hdtv  soundbar",1637572596));

+input.add( new Pair<String, Integer>("boss  receiver speaker package system bundle",1646566067));

+input.add( new Pair<String, Integer>("coby  hd tv",1638746307));

+input.add( new Pair<String, Integer>("vizio  diag led smart hdtv",1660162001));

+input.add( new Pair<String, Integer>("sony dock for ipad ipod and iphone",1646826284));

+input.add( new Pair<String, Integer>("vizio  led  ultraslim hdtv",1642018249));

+input.add( new Pair<String, Integer>("lcd kula tv multimedia player",1640265845));

+

+input.add(new Pair<String, Integer>("liz and co alex tall leather boots",1630836375));

+input.add( new Pair<String, Integer>("total girl silvia sequin moccasin", 1630828314));

+input.add( new Pair<String, Integer>("new england patriots new era nfl sport sideline knit", 1588531904));

+input.add( new Pair<String, Integer>("betseyville sequin backpack", 1630825375));

+input.add( new Pair<String, Integer>("the north face womens osito jacket mojito", 1639791775));

+input.add( new Pair<String, Integer>("misty harbor raincoat trench removable liner", 903542613));

+input.add(new Pair<String, Integer>("ae womens camo jacket ", 1229070780));

+input.add(new Pair<String, Integer>("indianapolis colts sideline knit", 1588531896));

+input.add(new Pair<String, Integer>("b o c korah boot", 1622401738));

+input.add(new Pair<String, Integer>("adidas mens speed cut track suit", 920744865));

+input.add(new Pair<String, Integer>("liz and co lulu zipper boots", 1630836380));

+input.add(new Pair<String, Integer>("black navy  lightweight oxford shoes", 906123996));

+input.add(new Pair<String, Integer>("liz and co farley tall boots", 1639960280));

+input.add(new Pair<String, Integer>("call it spring karpin  pullon boots", 1629938981));

+input.add(new Pair<String, Integer>("ugg australia bailey bow boots", 1594029054));

+input.add(new Pair<String, Integer>("dream chasers  jacket", 1631247949));

+input.add(new Pair<String, Integer>("guess military  tiewaist coat", 1629993909));

+input.add(new Pair<String, Integer>("madden girl allstaar womens zip boots", 1581506993));

+input.add(new Pair<String, Integer>("michael womens shoes", 1590598743));

+input.add(new Pair<String, Integer>("sonoma life style suede midcalf boots women", 1617302927));

+

+		input.add(new Pair<String, Integer>("absolute pnf300 power noise filterground loop isolator with adjustable controls", 1521965454));

+		input.add(new Pair<String, Integer>("sennheiser ie8 stereo earbuds", 211969101));

+		input.add(new Pair<String, Integer>("sanus vlmf109 motorized full motion mount for tvs 37 60 up to 110 lbs", 214893385));

+		input.add(new Pair<String, Integer>("s2fmcy003 earset stereo earbud binaural open miniphone black", 214972916));

+		input.add(new Pair<String, Integer>("boconi bags and leather bryant safari bag carry on luggage brown", 1646568995));

+		input.add(new Pair<String, Integer>("diesel derik pant jyt mens pajama gray", 1645725530));

+		input.add(new Pair<String, Integer>("sole society gina sandal", 1633021283));

+		input.add(new Pair<String, Integer>("toms bimini stitchout slipon women", 1633012540));

+		input.add(new Pair<String, Integer>("the north face womens p r tka 100 microvelour glacier 14 zip tnf blackjk3 medium", 1618022193));

+		input.add(new Pair<String, Integer>("robert graham manuel dress shirt mens long sleeve button up blue", 1631119485));

+

+		input.add(new Pair<String, Integer>("b o c leesa", 1584193288));

+			input.add(new Pair<String, Integer>("blair stirrup pants", 1525621516));

+			input.add(new Pair<String, Integer>("donna karan shirtdress", 1463793963));

+			input.add(new Pair<String, Integer>("columbia sportswear terminal tackle shirt", 1661238030));

+			input.add(new Pair<String, Integer>("carters jersey pajamas", 1573999243));

+			input.add(new Pair<String, Integer>("vince camuto dena", 1626272001));

+			input.add(new Pair<String, Integer>("pistil hudson knit hats", 1660874149));

+			input.add(new Pair<String, Integer>("naturalizer trinity wide shaft womens zip", 1569191459));

+			input.add(new Pair<String, Integer>("bare traps chelby womens sandals", 1513387756));

+			input.add(new Pair<String, Integer>("overland storage hard drive 1 tb hotswap", 212107374));

+			input.add(new Pair<String, Integer>("humminbird indash depth finder", 1616650484));

+			input.add(new Pair<String, Integer>("grepsr800 gre dig scanner", 215723895));

+			input.add(new Pair<String, Integer>("humminbird kayak transducer", 215392426));

+			input.add(new Pair<String, Integer>("garmin nuvi suction cup mount ", 215728710));

+			input.add(new Pair<String, Integer>("crosley radio black", 215662289));

+

+		    input.add(new Triple<String, Integer, String >("avaya ip telephone", 1440488008, "lucent phone system"));

+			input.add(new Triple<String, Integer, String>("clarks trolley womens shoes", 1581854074, "clark womens shoes"));

+			input.add(new Triple<String, Integer, String>("mens evans shoes imperial deer", 210808400, "lb evans slippers"));

+			input.add(new Triple<String, Integer, String>("ugg classic bow shorty gloves", 1665094898, "leather gloves women"));

+			input.add(new Triple<String, Integer, String>("jumping beans man tee baby", 1667155332, "jumping beans clothing"));

+			input.add(new Triple<String, Integer, String>("asics mens shoes", 1630208773, "asics mens running shoes"));

+			input.add(new Triple<String, Integer, String>("oakley hoodie mens fleece", 1656661466, "hoodies for men"));

+			input.add(new Triple<String, Integer, String>("usb sound control digital voice recorder", 1654662662, "digital voice recorder with usb"));

+			input.add(new Triple<String, Integer, String>("motorola bluetooth headset", 215376254, "motorola oasis bluetooth headset"));

+			input.add(new Triple<String, Integer, String>("sony sound bar home theater system", 215450833, "sony sound bar"));

+			input.add(new Triple<String, Integer, String>("jvc full hd everio camcorder", 1664479999, "jvc everio camcorder"));

+		 */

+		

+		 input.add(new Triple<String, Integer, String>("dr martens beckett laceup boots", 1651452641, "doc martin shoes"));

+		 input.add(new Triple<String, Integer, String>("pioneer cd changer",204654672, "pioneer cd player"));

+		 input.add(new Triple<String, Integer, String>("tablet handler strap and desk mount", 1634326303, "tablet holder"));

+		 input.add(new Triple<String, Integer, String>("sockwell loden womens overthecalf socks", 1644572708, "compression stockings, support stockings"));

+		 input.add(new Triple<String, Integer, String>("nike eclipse womens shoes", 1657807048, "nike eclipse ii women s shoe"));

+		 input.add(new Triple<String, Integer, String>("cherokee workwear womens scrub pant black stall",211643295, "cherokee workwear scrubs"));

+		 input.add(new Triple<String, Integer, String>("columbia sportswear jacket ", 1667381935, "columbia omni heat"));

+		 input.add(new Triple<String, Integer, String>("adidas adipure jacket", 1040124787, "adidas track jacket"));

+		 input.add(new Triple<String, Integer, String>("clarks may orchid womens shoes", 1585805688, "clarks loafers"));

+		 input.add(new Triple<String, Integer, String>("levis pants empire blue", 1670283141, "skinny jeans for guys"));

+		 input.add(new Triple<String, Integer, String>("nike jordan black cat tee", 1653598764, "jordan black cat"));

+		 input.add(new Triple<String, Integer, String>("obermeyer womens kassandra down coat", 1670629180, "down winter coats"));

+/*

+		 input.add(new Triple<String, Integer, String>("paramax  surround sound", 835422569, "paramax im3"));

+		 input.add(new Triple<String, Integer, String>("mia quincy wedge", 1285886230, "mia quincy wedge"));

+		 input.add(new Triple<String, Integer, String>("able planet headphones", 1648522886, "able planet nc210g"));

+		 input.add(new Triple<String, Integer, String>("samsung replacement lamp", 695793593, "lamp code bp96"));

+		 input.add(new Triple<String, Integer, String>("paul green emerson boot castagno", 1313967918, "paul green emerson boot"));

+		 input.add(new Triple<String, Integer, String>("bandolino caresse boots", 1448643623, "bandolino caresse boots"));

+		 input.add(new Triple<String, Integer, String>("nine west modiley", 1365998968, "nine west modiley"));

+		 input.add(new Triple<String, Integer, String>("converse chuck taylor  bisay", 1555900934, "turquoise chuck taylors"));

+		 input.add(new Triple<String, Integer, String>("gentle souls bay leaf flats", 1436175162, "gentle souls bay leaf"));

+		 input.add(new Triple<String, Integer, String>("sauce hockey  back hat", 1644440355, "sauce hockey discount code"));

+		 input.add(new Triple<String, Integer, String>("aravon farren oxford shoes", 1644573438, "aravon wef07sh"));

+	*/	 input.add(new Triple<String, Integer, String>("kooba crosby hobo handbags", 1326503038, "kooba crosby"));

+		 input.add(new Triple<String, Integer, String>("bcbgmaxazria sheath dress", 1313949777, "bcbgmaxazria illusion bodice ruched sheath dress"));

+		 input.add(new Triple<String, Integer, String>("billabong boardshorts trunks", 1316823074, "la siesta boardshorts"));

+		 input.add(new Triple<String, Integer, String>("mootsies tootsies boot", 1503727310, "mootsies tootsies draker"));

+		 input.add(new Triple<String, Integer, String>("nine west bootie", 1503730060, "nine west drina"));

+		 input.add(new Triple<String, Integer, String>("playtex support cotton ", 1331026244, "playtex t723"));

+		 input.add(new Triple<String, Integer, String>("fossil morgan satchel taupe", 1355165745, "fossil morgan satchel"));

+		 input.add(new Triple<String, Integer, String>("katonah womens boots brown", 1420057844, "boc katonah boots"));

+		 input.add(new Triple<String, Integer, String>("boot cut jeans supernova", 1363356262, "levis 527 supernova"));

+		 input.add(new Triple<String, Integer, String>("steve madden buckie boot", 1313965918, "steve madden buckie boot"));

+		 input.add(new Triple<String, Integer, String>("charlies horse tshirt", 1428490587, "charlie s horse shirt"));

+		 input.add(new Triple<String, Integer, String>("igloo little playmate ice chest", 205421625, "igloo little playmate"));

+		 input.add(new Triple<String, Integer, String>("mark nason boot", 1313951044, "mark nason rudd"));

+

+

+

+	}

+

+	public static void main(String[] args){

+		//ProductFinderInAWebPage init = new ProductFinderInAWebPage("C:/workspace/relevanceEngine/src/test/resources");

+		ReviewBuilderRunner r = new ReviewBuilderRunner();

+		WebPageReviewExtractor extractor = new WebPageReviewExtractor("C:/workspace/relevanceEngine/src/test/resources");

+		for(Triple query_ID : r.input ){

+			String query = (String) query_ID.getFirst();

+			List<String> res = extractor.formReviewsForAProduct(query);

+

+			ProfileReaderWriter.writeReportListStr(res, "formedReviewSentences"+ query +".csv");

+		}

+

+

+

+	}

+}
\ No newline at end of file
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/ReviewObj.java b/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/ReviewObj.java
new file mode 100644
index 0000000..ebf42d7
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/ReviewObj.java
@@ -0,0 +1,137 @@
+package opennlp.tools.apps.review_builder;

+

+import java.util.List;

+

/**
 * Value holder for a single mined product review: identifiers, the review
 * payload scraped from the page, and the sentence lists derived from it by
 * later processing stages.
 */
public class ReviewObj {

	// Identifiers linking this review to a product.
	long bpid;
	long pid;

	// Review payload as mined from the page.
	float rating;
	String pros;
	String cons;
	String url;
	String title;
	String review;
	String keywordsName;
	float score;
	String[] origSentences;
	String[] featurePhrases;

	// Derived from the original sentences by later processing.
	List<String> originalizedSentences;
	List<String> sentimentPhrases;

	/** Creates an empty review object, to be populated via setters. */
	public ReviewObj() {
	}

	/** Creates a review object pre-filled with the mined attributes. */
	public ReviewObj(long bpid, long pid, float rating, String pros,
			String cons, String url, String title, String review,
			float score) {
		this.bpid = bpid;
		this.pid = pid;
		this.rating = rating;
		this.pros = pros;
		this.cons = cons;
		this.url = url;
		this.title = title;
		this.review = review;
		this.score = score;
	}

	public long getBpid() {
		return this.bpid;
	}

	public void setBpid(long bpid) {
		this.bpid = bpid;
	}

	public long getPid() {
		return this.pid;
	}

	public void setPid(long pid) {
		this.pid = pid;
	}

	public float getRating() {
		return this.rating;
	}

	public void setRating(float rating) {
		this.rating = rating;
	}

	public String getPros() {
		return this.pros;
	}

	public void setPros(String pros) {
		this.pros = pros;
	}

	public String getCons() {
		return this.cons;
	}

	public void setCons(String cons) {
		this.cons = cons;
	}

	public String getUrl() {
		return this.url;
	}

	public void setUrl(String url) {
		this.url = url;
	}

	public String getTitle() {
		return this.title;
	}

	public void setTitle(String title) {
		this.title = title;
	}

	public String getReview() {
		return this.review;
	}

	public void setReview(String review) {
		this.review = review;
	}

	public float getScore() {
		return this.score;
	}

	public void setScore(float score) {
		this.score = score;
	}

	public String getKeywordsName() {
		return this.keywordsName;
	}

	public void setKeywordsName(String kw) {
		this.keywordsName = kw;
	}

	public String[] getOrigSentences() {
		return this.origSentences;
	}

	public void setOrigSentences(String[] sentences) {
		this.origSentences = sentences;
	}

	public String[] getFeaturePhrases() {
		return this.featurePhrases;
	}

	public void setFeaturePhrases(String[] featurePhrases) {
		this.featurePhrases = featurePhrases;
	}

	public List<String> getOriginalizedSentences() {
		return this.originalizedSentences;
	}

	public void setOriginalizedSentences(List<String> originalizedSentences) {
		this.originalizedSentences = originalizedSentences;
	}

	public List<String> getSentimentPhrases() {
		return this.sentimentPhrases;
	}

	public void setSentimentPhrases(List<String> sentimentPhrases) {
		this.sentimentPhrases = sentimentPhrases;
	}
}

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/SentenceBeingOriginalized.java b/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/SentenceBeingOriginalized.java
new file mode 100644
index 0000000..c4bebb1
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/SentenceBeingOriginalized.java
@@ -0,0 +1,59 @@
+package opennlp.tools.apps.review_builder;

+

+import java.util.HashMap;

+import java.util.List;

+import java.util.Map;

+

+import opennlp.tools.textsimilarity.ParseTreeChunk;

+

+public class SentenceBeingOriginalized {

+	private Map<String, String> sentKey_value= new HashMap<String, String>();

+	private String sentence;

+	private List<List<ParseTreeChunk>> groupedChunks;

+	

+	

+	

+	public Map<String, String> getSentKey_value() {

+		return sentKey_value;

+	}

+

+

+

+	public void setSentKey_value(Map<String, String> sentKey_value) {

+		this.sentKey_value = sentKey_value;

+	}

+

+

+

+	public String getSentence() {

+		return sentence;

+	}

+

+

+

+	public void setSentence(String sentence) {

+		this.sentence = sentence;

+	}

+

+

+

+	public List<List<ParseTreeChunk>> getGroupedChunks() {

+		return groupedChunks;

+	}

+

+

+

+	public void setGroupedChunks(List<List<ParseTreeChunk>> groupedChunks) {

+		this.groupedChunks = groupedChunks;

+	}

+

+

+

+	public SentenceBeingOriginalized(Map<String, String> sentKey_value,

+			String sentence, List<List<ParseTreeChunk>> groupedChunks) {

+		super();

+		this.sentKey_value = sentKey_value;

+		this.sentence = sentence;

+		this.groupedChunks = groupedChunks;

+	}

+}

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/SentenceOriginalizer.java b/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/SentenceOriginalizer.java
new file mode 100644
index 0000000..a9c94dc
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/SentenceOriginalizer.java
@@ -0,0 +1,401 @@
+package opennlp.tools.apps.review_builder;

+

+import java.util.ArrayList;

+import java.util.Arrays;

+import java.util.HashSet;

+import java.util.List;

+

+import org.apache.commons.lang.StringUtils;

+

+import opennlp.tools.apps.relevanceVocabs.PhraseProcessor;

+import opennlp.tools.apps.relevanceVocabs.SentimentVocab;

+import opennlp.tools.apps.relevanceVocabs.SynonymListFilter;

+import opennlp.tools.textsimilarity.ParseTreeChunk;

+

+public class SentenceOriginalizer {

+	private String[] sents; 

+	private SentenceBeingOriginalized[] sentenceBeingOriginalized;

+	public List<String> formedPhrases = new ArrayList<String>();

+

+	private MachineTranslationWrapper rePhraser = new MachineTranslationWrapper();

+	private SentimentVocab sVocab = SentimentVocab.getInstance();

+	PhraseProcessor pProc = new PhraseProcessor();

+	SynonymListFilter filter = null;

+	private List<String> verbsShouldStayNoSubstition = Arrays.asList(new String[]{

+			"might", "can", "power", "bonk", "screw", "victimization", "victimize", "victimised", "victimized", "victimise",

+			"hump", "sluttish", "wanton"

+	});

+

+	public SentenceOriginalizer(String[] ss){

+		sentenceBeingOriginalized = new SentenceBeingOriginalized[ss.length];

+		for(int i= 0; i< ss.length; i++){

+			//sentenceBeingOriginalized[i] = new  SentenceBeingOriginalized()

+		}

+	}

+

+	public SentenceOriginalizer(String dir){

+		filter = new  SynonymListFilter(dir);

+	};

+

+	public String[] getSents() {

+		return sents;

+	}

+

+	public void setSents(String[] sents) {

+		this.sents = sents;

+	}

+

+	

+

+	private void substituteProsCons(){

+		for(int i = 0; i< sents.length; i++){

+			if (sents[i]==null)

+				continue;

+

+			sents[i] = sents[i].replace("...", " ").replace("..", " ");

+

+			if (sents[i].startsWith("Pros")){

+				sents[i]="";

+				sents[i+1] = "I liked that "+ sents[i+1];

+			}

+

+			if (sents[i].startsWith("Cons")){

+				sents[i]="";

+				sents[i+1] = "What I did not like was that "+ sents[i+1];

+			}

+		}

+	}

+

+	private void insertProductNameForRefs(String prodName){

+		prodName = prodName.toLowerCase();

+		prodName = StringUtils.trim(prodName);

+		

+		for(int i = 0; i< sents.length; i++){

+			if (sents[i]==null)

+				continue;

+			String snt = sents[i];

+			String line  = snt.replace(" it ", " "+prodName+" ");

+			if (line.equals(snt)){

+				line = snt.replace(" this ", " "+prodName+" ");

+			}

+

+			sents[i]=line;

+		}

+	}

+	

+	private void insertProductNameForRefsFullNameKeywords(String prodName, String keywordsName){

+		prodName = StringUtils.trim(prodName.toLowerCase());

+				

+		for(int i = 0; i< sents.length; i++){

+			double flag = Math.random();

+			String prodNameCurr = null;

+			if (flag>0.4)

+				prodNameCurr = prodName;

+				else

+					prodNameCurr = keywordsName;

+					

+			if (sents[i]==null)

+				continue;

+			String snt = sents[i];

+			String line  = snt.replace(" it ", " "+prodNameCurr+" ");

+			if (line.equals(snt)){

+				line = snt.replace(" this ", " "+prodNameCurr+" ");

+			}

+

+			sents[i]=line;

+		}

+	}

+

+	private void turnTenseToPast(){

+		for(int i = 0; i< sents.length; i++){

+			if (sents[i]==null)

+				continue;

+			sents[i] = sents[i].replace("to do ", "to d_o_ ");

+			sents[i]=sents[i].replace(" is ", " was ").replace(" done ", " was done ").replace(" are ", " were ")

+					.replace(" do ", " did ").replace(" yes, ", " true, ");

+			sents[i]=sents[i].replace("somebody ", "one ").replace("would like", "would want").replace("I am", "users are");

+			sents[i]=sents[i].replace("my wife", "my spouse").replace("I would definitely buy ", "I wouldn't hesitate to buy ")

+					.replace("I haven't tried ", "I did not actually have a chance to try ");

+			sents[i]=sents[i].replace("they arrived ", "they were shipped to my residence ").replace(" ive ", " I have ")

+					.replace("We have ", "I have already tried and written a review on ");

+			

+			sents[i] = sents[i].replace( "to d_o_ ", "to do ");

+	

+			if (sents[i].startsWith("We "))

+				sents[i] = sents[i].replace("We ", "I know they ");

+			if (sents[i].startsWith("You "))

+				sents[i] = sents[i].replace("You ","I believe one can ");

+			

+			if (sents[i].startsWith("Well "))

+				sents[i] = sents[i].replace("Well ","I would state that ");

+

+		}

+	}

+

+	private void turnCounterFactual(){

+		for(int i = 0; i< sents.length; i++){

+			if (sents[i]==null)

+				continue;

+			sents[i]=sents[i].replace("however ", "b1ut1 ").replace("but ", "however ")

+					.replace("b1ut1 ", "but ").replace("I say", "I repeat").

+					replace("same way", "same manner").replace(" you ", " somebody ").replace(" can ", " might ");

+

+		}

+	}

+

+	public void substituteSynonymVerbs(){

+		for(int i = 0; i< sents.length; i++){

+			String line = sents[i];

+			List<List<ParseTreeChunk>> ps = pProc.getPhrasesOfAllTypes(line);

+			if (ps==null || ps.size()<2)

+				continue;

+

+			List<ParseTreeChunk> vps = ps.get(1);

+

+			extractNounPhrasesWithSentiments(ps.get(0));

+

+			line = substituteSentimentSynonyms(line, ps);

+

+			if (vps==null)

+				continue;

+			boolean bVerbRule = false;

+			if (vps.size()==1)

+				line = rePhraser.rePhrase(line);

+			else {

+				if (vps.size()>1)

+

+					for (ParseTreeChunk v: vps){

+						String verbLemma = v.getLemmas().get(0);

+						String newVerb = filter.getSynonym(verbLemma);

+						if (newVerb!=null && newVerb.length()>3 && verbLemma.length()>3 // both old and new words should be above 3

+								&& !newVerb.endsWith("ness") // empirical rule

+								&& !verbsShouldStayNoSubstition.contains(verbLemma) &&

+								!verbsShouldStayNoSubstition.contains(newVerb)	){

+							line = line.replace(verbLemma+" ", newVerb+" "); 	

+							line = line.replace(" "+verbLemma, " "+newVerb); 

+							System.out.println("Synonym for verb substitution: "+verbLemma + "->"+newVerb);

+							bVerbRule = true;

+						}

+					}

+				if (!bVerbRule && vps.size()==2 && Math.random()>0.8) // no other means of originalization worked, so do inverse translation

+					line = rePhraser.rePhrase(line);

+			}

+			sents[i]=line;

+

+		}

+	}

+

+

+	private String substituteSentimentSynonyms(String line,

+			List<List<ParseTreeChunk>> ps) {

+		List<ParseTreeChunk> nounPhrases = ps.get(0);

+		if (nounPhrases.size()<1)

+			return line;

+

+		for(ParseTreeChunk ch: nounPhrases){

+			List<String> lemmas = ch.getLemmas();

+			for(String oldSentim:lemmas){

+				if ( sVocab.isSentimentWord(oldSentim.toLowerCase())) {

+					String newSentim = filter.getSynonym(oldSentim);

+					if (newSentim!=null && newSentim.length()>3 && !verbsShouldStayNoSubstition.contains(newSentim)

+							&& !verbsShouldStayNoSubstition.contains(oldSentim)){

+						line = line.replace(oldSentim+" ", newSentim+" "); 	

+						line = line.replace(" "+oldSentim, " "+newSentim);

+						System.out.println("Synonym for sentiment substitution: "+oldSentim + "->"+newSentim);

+					}

+				}

+			}

+		}

+

+		return line;

+	}

+

+	private void extractNounPhrasesWithSentiments(List<ParseTreeChunk> list) {

+		List<String> phrasesWithSentiments = new ArrayList<String>();

+		for(ParseTreeChunk ch: list){

+			List<String> lemmas = ch.getLemmas();

+			for(String l:lemmas){

+				if ( sVocab.isSentimentWord(l.toLowerCase())) {

+					phrasesWithSentiments.add(lemmas.toString());

+				}

+			}

+		}

+		formedPhrases.addAll(phrasesWithSentiments);

+	}

+

+	public String[] convert(String[] sents, String name, String keywordsName){

+		name = name.replace("Amazon.com:" , "").replace("Amazon.com" , "").replace("..." , " ")

+				.replace("Customer Reviews: ", "");

+

+		this.sents = sents;

+		try {

+			substituteProsCons();

+		} catch (Exception e) {

+			// TODO Auto-generated catch block

+			e.printStackTrace();

+		}

+		try {

+			//insertProductNameForRefs(name);

+			insertProductNameForRefsFullNameKeywords(name, keywordsName);

+		} catch (Exception e) {

+			// TODO Auto-generated catch block

+			e.printStackTrace();

+		}

+		try {

+			turnTenseToPast();

+		} catch (Exception e) {

+			// TODO Auto-generated catch block

+			e.printStackTrace();

+		}

+		try {

+			turnCounterFactual();

+		} catch (Exception e) {

+			// TODO Auto-generated catch block

+			e.printStackTrace();

+		}

+

+		try {

+			substituteSynonymVerbs();

+		} catch (Exception e) {

+			// TODO Auto-generated catch block

+			e.printStackTrace();

+		}

+		// remove dupes

+		this.formedPhrases = new ArrayList<String>(new HashSet<String>(this.formedPhrases));

+

+		return sents;

+

+	}

+

+	public static void main(String[] args){

+		//ProductFinderInAWebPage init = new ProductFinderInAWebPage("C:/workspace/productsearchfe/src/test/resources");

+		SentenceOriginalizer orig = new SentenceOriginalizer("src/test/resources");

+		String[] sents = new String[] {

+				"Leave the bulky stabilization rig at home and take smooth handheld videos from any angle thanks to Optical SteadyShot image stabilization with Active Mode."

+				//"Other then that, it works well, and the chain stops instantly when you let go of the trigger, or push the safety bar."	

+		};

+		String[] res = orig.convert(sents, "VIP Product", "vv propro");

+		System.out.println(Arrays.asList(res));

+	}

+

+}

+

+/*

+ * 1.	Some Amazon specific text keeps showing up so we might want to put a filter on recurring phrases such as:

+1.	Unlimited Free Two-Day Shipping

+2.	View Larger

+3.	What's in the box

+2.	Period/stop added to punctuation marks: 

+1.	!.

+2.	?.

+3.	:.

+4.	.". 

+5.	-.

+3.	Saw some HTML formatting occasionally, such as <em></em>

+4.	Redundancy with choice phrases appearing multiple times in a single review

+5.	Specific issue with words being added at the end of the letter "s," creating nonsensical words:

+1.	It mispronouncesulphur virtually every caller'sulphur name in waysulphur that..

+2.	In fact, it'southward a rare feature that I recollect southwardhould be commonplace in any southwardurround receiver.

+6.	Adding -iness to make nonsensical words: mightinessiness, powerinessiness

+

+ */

+

+

+

+/*

+ * After using a gasoline powered chain saw for many years had to stop using because of dust and fumes made my copd worse this electric saw is great has surprising amount of power without the gas fumes..

+Nice chainsaw, works great, well built.

+The instant-stop chain is very safe, but a bit abrupt when releasing the trigger.

+I wish there were a half-way release that turned off the motor but did not engage the instant stop break.

+Pros .

+inexpensive compared to gas chainsaws, lightweight, cuts with good power, will do most anything that a gas chainsaw will do. like the automatic chain oiler and easy tension adjustment.

+Cons .

+If you are cutting larger branches and trees, a gas is better.

+However this will work on 8-10" size very well.

+Bought this McCulloch electric chainsaw to replace an old Craftsman electric chain saw. (the Craftsman got ran over by a car).

+Compared to my old Craftsman electric chain saw, the McCulloch seems to be wonderful.

+The first test was to cut a 16" diameter oak branch, cut thru it like hot butter.

+The "no tools needed" chain tensioner seems to be a good design..

+Is a good saw, however it came with the handle that wraps abound the left side of the saw was broken.

+The box looked good, but the saw itself was damaged.

+However, because I had a lot of tree damage in my yard, and more storms coming, I made due with it.

+Other then take, it works well, and the chain stops instantly when you let go of the trigger, or push the safety bar.

+stump w/ this E-saw.

+It keeps doing a super job.

+In terms of a replacement chain, make sure to get the Oregon S-54 (S is style of cutter, 54 means 54 links).

+The MC literature suggests use of a S-55, but it is TOO Long and will soon wind up in the trash can.

+ALSO, the MC factory installed gasket for the lube oil, between the saw and chain bar is total trash.

+When changing out the chain, pull the bar off, pull out and throw away the MC factory gasket, clean the bar and apply a piece of electrical tape, using a knife to cut out a pathway for oil to the bar.

+Will lube perfectly now!

+This is the second electric McCilloch 16" chain saw that I have owned and it is even better and more powerful than the first.

+I still use a gas chain saw out in the woods on my property but I usually do just enough cutting with it to get the logs on a trailer so I can take them bach to my shed to cut them up and save the sawdust for on my garden and flower beds as mulch.

+This electric is lighter and more powerful than my gas saw and makes short work of even 14" well-seasoned oak and poppel logs with a minimum of effort.

+I highly recommend this sae for anyone who has an electric outlet close enough to their cutting station.

+Bought this McCulloch electric chainsaw to replace an old Craftsman electric chain saw. (the Craftsman got ran over by a car).

+Compared to my old Craftsman electric chain saw, the McCulloch seems to be wonderful.

+The first test was to cut a 16" diameter oak branch, cut thru it like hot butter.

+The "no tools needed" chain tensioner seems to be a good design (design seems to be similar to that used by other manufacturers).

+Assuming. this thing keeps cutting/running the same way in the long term, then we have a winner. (note. all the electric chain saws come with cheap looking chains with cutting blades spaced very widely apart along the chain.

+To be ready for the bigger cutting jobs I sprung for a new $18 Oregon s-54 16" chain.).

+Update .

+Having used both gas and electric chain saws for more years than I care to remember, this little beauty is far more than I'd hoped for.

+Yes, it requires a cord to function and, without a handy "Current Bush", serves no useful purpose, but for trimming trees or cutting up firewood in a yard it beats H*** out of fighting the frustration when a gas saw refuses to start or remain running.

+I have another 14" electric MuCulloch along with a 16" gas Homelite and consider this to be a combination of the best qualities of both the others, the convenience of the small electric and the greater cutting ability of the gas powered Homelite.

+This little beauty appears to have as much power as the gas saw without the hassle of mixing fuel and the ongoing maintenence associated with it and cuts far faster than it's small electric brother.

+If I was forced to have a single chainsaw, in my present position(Pleanty of fire wood handy, just in need of cutting to the proper dimensions), this baby would be may choice without any douts or reservations.

+Ordered the Mcculloch 16inch electric chain saw to do some serious pruning of trees around the house which had severe frost damage.

+Although an electric chain saw, it cut through trees up to eight inches like a hot knife through butter.

+Not once did i have problems in two days of cutting.

+The big pros I noticed while using is realtively lightweight for a chainsaw and you can hold in one hand to use.

+Once you release the power switch, the chainsaw chain immediately stops!.

+This is a good thing as it keeps body parts attached.

+One nifty thing about this chainsaw is the chain tightener is outstanding once you figure how it works.

+No tools, just move the knobs and tighten, couldn't be easier and definitely beats hunting down a wrench to tighten.

+Only con is being electric, you have to watch the power cord.

+Very easy to hit extension cord if not careful.

+But it wakes you up when you are tired from your yard work.

+Let a good buddy borrow it and he was also impressed with the ease of use.

+Outstanding for jobs around you house, two thumbs up!

+The McCulloch3516F chainsaw puts an end to my problem of gas engines that don't start when I really need them to.

+I have been cutting out maple branches this summer from trees with verticillium wilt . branches up to 8 inches are no problem at all.

+This saw has an impressive safety feature. a chain brake that stops the saw instantly as soon as the trigger is released or the safety guard is pushed forward.

+I mean instantly. there is a loud clunk as the brake engages and the chain stops dead.

+This takes some getting used to, as the brake engages if you wiggle your finger while running the chainsaw, causing the chain to start and stop.

+There is no concept of "revving" the chain.

+It also means there is no "idle" speed for the chain.

+It is on or off.

+And that is safe.

+You can also consider it a safety feature that the chain has fewer cutting teeth than my gas powered saw chains.

+I don't know the relative operating RPMs .

+if they are about the same, this saw seems to cut a little slower, and fewer teeth would do that.

+This makes the saw less aggressive and less likely to pull out of your control.

+I like that.

+As I say, the cutting ability is well in excess of the 8" branches I've been dealing with.

+The oil fill is conveniently located so that you don't have to tip the saw to fill it, although a small funnel is helpful.

+Overall, I am very happy with this chainsaw.

+The saw works very well, overall.

+I have some minor complaints:.

+1.

+The chain drive gear cover requires a Phillips screwdriver to get the cover off.

+This is just dumb !.

+There's no good reason why it shouldn't have a thumbscrew similar to, but smaller than the chain tensioner thumbscrew.

+As someone pointed out, the chain gear area regularly gets clogged with oily sawdust that needs to be cleaned out.

+I can't figure out a good excuse for this design mistake.

+2 .

+The "instant chain stop" feature woks well, but the remaining motor drivetrain makes a loud howling screech until the motor actually stops.

+Makes me think there might be something wrong with the drivetrain.

+The saw seems to work well, though.

+Time will tell.

+3 .

+The oil filler neck is titled to the side, not vertical to the saw when placed on level ground.

+This makes viewing the oil stream going in and the rising oil level unnecessarily difficult.

+This is another obvious design mistake.

+4 .

+This is my first chainsaw, but it seems the bar oil reservoir is ridiculously small !.

+I have to refill it every 10 minutes of use.

+After reading other reviews for this model I immediately threw out the stock chain without ever using it and replaced it with an Oregon model S52 chain (dual chains is model ST52).

+Note that it fits fine although it is advertized as a 14 inch chain and this saw is advertized to be 16 inches.

+Go figure..

+Also, after reading about the risk of burning up the motor due to using a too lightweight extension cord, I bought a "US Wire 65100 12/3 100-Foot SJTW Orange Heavy Duty Extension Cord".

+It's heavy, alright !

+ */

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/URLsWithReviewFinderByProductName.java b/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/URLsWithReviewFinderByProductName.java
new file mode 100644
index 0000000..467942d
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/URLsWithReviewFinderByProductName.java
@@ -0,0 +1,21 @@
+package opennlp.tools.apps.review_builder;

+

+import java.util.ArrayList;

+import java.util.List;

+

+import opennlp.tools.similarity.apps.BingQueryRunner;

+import opennlp.tools.similarity.apps.HitBase;

+

+public class URLsWithReviewFinderByProductName {

+BingQueryRunner search = new BingQueryRunner();

+	

+	public List<String> findFacebookURLByNameAndZip(String name){

+		List<HitBase> foundFBPages = search.runSearch(name, 20);

+		List<String> results = new ArrayList<String>();

+		for(HitBase h: foundFBPages){

+			results.add(h.getUrl());

+		}

+		return results;

+	}

+	

+}

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/WebPageReviewExtractor.java b/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/WebPageReviewExtractor.java
new file mode 100644
index 0000000..f9fb43b
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/WebPageReviewExtractor.java
@@ -0,0 +1,444 @@
+package opennlp.tools.apps.review_builder;

+

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import opennlp.tools.jsmlearning.ProfileReaderWriter;
import opennlp.tools.parse_thicket.apps.WebPageExtractor;
import opennlp.tools.similarity.apps.HitBase;
import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer;
import opennlp.tools.similarity.apps.utils.Utils;
import opennlp.tools.textsimilarity.TextProcessor;
import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;

import org.apache.commons.lang.StringUtils;

+

+public class WebPageReviewExtractor extends WebPageExtractor {

+	

+	BingAPIProductSearchManager prodman = new BingAPIProductSearchManager();

+	SentenceOriginalizer orig = null;

+		

	/**
	 * @param resourceDir directory with the linguistic resources (synonym
	 *        lists etc.) used to initialize the sentence originalizer
	 */
	public WebPageReviewExtractor(String resourceDir) {
		orig = new SentenceOriginalizer(resourceDir);
	}

+

+	public String[] removeDuplicates(String[] hits)

+	{

+		StringDistanceMeasurer meas = new StringDistanceMeasurer();

+

+		List<Integer> idsToRemove = new ArrayList<Integer>();

+		List<String> hitsDedup = new ArrayList<String>();

+		try

+		{

+			for (int i = 0; i < hits.length; i++)

+				for (int j = i + 1; j < hits.length; j++)

+				{

+					String title1 = hits[i];

+					String title2 = hits[j];

+					if (StringUtils.isEmpty(title1) || StringUtils.isEmpty(title2))

+						continue;

+					if (meas.measureStringDistance(title1, title2) > 0.7)

+					{

+						idsToRemove.add(j); // dupes found, later list member to

+											// be deleted

+					}

+				}

+			for (int i = 0; i < hits.length; i++)

+				if (!idsToRemove.contains(i))

+					hitsDedup.add(hits[i]);

+			if (hitsDedup.size() < hits.length)

+			{

+				System.out.println("Removed duplicates from relevant search results, including "

+					+ hits[idsToRemove.get(0)]);

+			}

+		}

+		catch (Exception e)

+		{

+			System.out.println("Problem removing duplicates from relevant images");

+		}

+

+		return hitsDedup.toArray(new String[0]);

+

+	}

+

+	public ReviewObj extractSentencesWithPotentialReviewPhrases(String url)

+	{

+		ReviewObj reviewObj = new ReviewObj();

+		int maxSentsFromPage= 20;

+		List<String[]> results = new ArrayList<String[]>();

+

+		String downloadedPage = pageFetcher.fetchPage(url, 20000);

+		if (downloadedPage == null || downloadedPage.length() < 100)

+		{

+			return null;

+		}

+

+		String pageOrigHTML = pageFetcher.fetchOrigHTML(url);

+

+		List<String> productFeaturesList = new ArrayList<String> ();

+		String[] productFeatures = StringUtils.substringsBetween(pageOrigHTML, "<li>", "</li>" );

+		if (productFeatures!=null){

+			for(String item: productFeatures ){

+				if (item.indexOf("class")>-1 || item.indexOf("www.")>-1 || item.indexOf("href")>-1)

+					continue;

+				item = item.replace("<span>","").replace("</span>","").replace("<b>","").replace("</b>","");

+				if (item.length()>80 && MinedSentenceProcessor.acceptableMinedSentence(item)==null){

+					System.out.println("Rejected sentence by GeneratedSentenceProcessor.acceptableMinedSentence = "+item);

+					continue;

+				}

+				productFeaturesList .add(item);

+			}

+		}

+		

+		productFeaturesList = cleanProductFeatures(productFeaturesList);

+		

+		String startArea = StringUtils.substringBetween(pageOrigHTML, "reviewHistoPop", "t of 5 stars");

+		String item =  StringUtils.substringBetween(startArea, "title=\"","ou" );

+		if (item==null){//title="4.0 out of 5 stars" ><span>4.0 out of 5 stars</span>

+			int index = pageOrigHTML.indexOf("of 5 stars\"");

+			startArea = StringUtils.substringBetween(pageOrigHTML, "of 5 stars\"", "of 5 stars");

+			item =  StringUtils.substringBetween(startArea, "<span>","ou" );

+		}

+

+		// if found, process

+		if (item!=null){

+			try {

+				float rating = Float.parseFloat(item);

+				reviewObj.setRating(rating);

+			} catch (NumberFormatException e) {

+				// TODO Auto-generated catch block

+				e.printStackTrace();

+			}

+		}

+		//productFeaturesList .add(item);

+

+		downloadedPage= downloadedPage.replace("     ", "&");

+		downloadedPage = downloadedPage.replaceAll("(?:&)+", "#");

+		String[] sents = downloadedPage.split("#");

+		List<TextChunk> sentsList = new ArrayList<TextChunk>();

+		for(String s: sents){

+			s = s.trim().replace("  ", ". ").replace("..", ".").replace(". . .", " ")

+					.replace(": ", ". ").replace("- ", ". ").

+					replace (". .",".").trim();

+			sentsList.add(new TextChunk(s, s.length()));

+		}

+

+		Collections.sort(sentsList, new TextChunkComparable());

+		String[] longestSents = new String[maxSentsFromPage];

+		int j=0;														// -1 removed

+		for(int i=sentsList.size()-1 -maxSentsFromPage; i< sentsList.size()&& j<longestSents.length; i++){

+			longestSents[j] = sentsList.get(i).text;

+			j++;

+		}

+

+		sents = cleanListOfSents(longestSents);

+		

+		sents = removeDuplicates(sents);

+		sents = verifyEnforceStartsUpperCase(sents);

+

+		reviewObj.setFeaturePhrases(productFeaturesList.toArray(new String[0]));

+		reviewObj.setOrigSentences(sents);

+

+		return reviewObj;

+	}

+

+	private String[] verifyEnforceStartsUpperCase(String[] sents) {

+		for(int i=0; i<sents.length; i++){

+			String s = sents[i];

+			s = StringUtils.trim(s);

+			String sFirstChar = s.substring(0, 1);

+			if (!sFirstChar.toUpperCase().equals(sFirstChar)){

+				s = sFirstChar.toUpperCase()+s.substring(1);

+			}

+			sents[i] = s;

+		}

+			return sents;

+	}

+

+	private List<String> cleanProductFeatures(List<String> productFeaturesList) {

+		List<String> results = new ArrayList<String>();

+		for(String feature: productFeaturesList){

+			if (feature.startsWith("Unlimited Free") || feature.startsWith("View Larger") || feature.startsWith("View Larger") || feature.indexOf("shipping")>0)

+				continue;

+			results.add(feature);

+		}

+		return results;

+	}

+

+	protected String[] cleanListOfSents(String[] longestSents)

+	{

+		float minFragmentLength = 40, minFragmentLengthSpace=4;

+

+		List<String> sentsClean = new ArrayList<String>();

+		for (String sentenceOrMultSent : longestSents)

+		{

+			if (MinedSentenceProcessor.acceptableMinedSentence(sentenceOrMultSent)==null){

+				System.out.println("Rejected sentence by GeneratedSentenceProcessor.acceptableMinedSentence = "+sentenceOrMultSent);

+				continue;

+			}

+			// aaa. hhh hhh.  kkk . kkk ll hhh. lll kkk n.

+			int numOfDots = sentenceOrMultSent.replace('.','&').split("&").length;

+			float avgSentenceLengthInTextPortion = (float)sentenceOrMultSent.length() /(float) numOfDots;

+			if ( avgSentenceLengthInTextPortion<minFragmentLength)

+				continue;

+			// o oo o ooo o o o ooo oo ooo o o oo

+			numOfDots = sentenceOrMultSent.replace(' ','&').split("&").length;

+			avgSentenceLengthInTextPortion = (float)sentenceOrMultSent.length() /(float) numOfDots;

+			if ( avgSentenceLengthInTextPortion<minFragmentLengthSpace)

+				continue;

+

+			List<String> furtherSplit = TextProcessor.splitToSentences(sentenceOrMultSent);

+			

+			// forced split by ',' somewhere in the middle of sentence

+			// disused - Feb 26 13

+			//furtherSplit = furtherMakeSentencesShorter(furtherSplit);

+			furtherSplit.remove(furtherSplit.size()-1);

+			for(String s : furtherSplit){

+				if (s.indexOf('|')>-1)

+					continue;

+				s = s.replace("<em>"," ").replace("</em>"," ");

+				s = Utils.convertToASCII(s);

+				sentsClean.add(s);

+			}

+		}

+

+		return (String[]) sentsClean.toArray(new String[0]);

+	}

+

+	private List<String> furtherMakeSentencesShorter(List<String> furtherSplit) {

+		int MIN_LENGTH_TO_SPLIT = 80;

+		List<String> results = new ArrayList<String>();

+		for(String sent: furtherSplit) {

+			sent = startWithCapitalSent(sent);

+			int len = sent.length(); 

+			if (len <MIN_LENGTH_TO_SPLIT)

+				results.add(sent);

+			else {

+				try {

+					int commaIndex = StringUtils.indexOf(sent, ',');

+					int lastCommaIndex = StringUtils.lastIndexOf(sent, ',');

+					int splitIndex = -1;

+					if (Math.abs(commaIndex- len/2) > Math.abs(lastCommaIndex- len/2))

+						splitIndex = commaIndex;

+					else

+						splitIndex = lastCommaIndex;

+					if (splitIndex<0)

+						results.add(sent);

+					else {

+						String sent1 = sent.substring(0, splitIndex)+". ";

+						String sent2 = startWithCapitalSent(sent.substring(splitIndex+1));

+						results.add(sent1); results.add(sent2);

+					}

+				} catch (Exception e) {

+					results.add(sent);

+					e.printStackTrace();

+				}

+

+			}

+		}

+		return results;

+	}

+

+	private String startWithCapitalSent(String sent) {

+		String firstChar = sent.substring(0,1);

+		String remainder = sent.substring(1);

+		

+		return firstChar.toUpperCase()+remainder;

+	}

+

+	public List<String> formReviewsForAProduct(String name /*long bpid, String keywordsName*/){

+		ReviewObj reviewObjTotal = null;

+		try {

+			List<HitBase> pagesForAProduct = prodman.findProductByName(name, 1);

+			reviewObjTotal = null; 

+

+			for(HitBase p: pagesForAProduct){

+				ReviewObj reviewObj = 

+						extractSentencesWithPotentialReviewPhrases(p.getUrl());

+				// init with first element

+				if (reviewObjTotal  == null)

+					reviewObjTotal = reviewObj;

+				if (reviewObj==null)

+					continue;

+				String[] afterOriginalization = orig.convert(reviewObj.getOrigSentences(), p.getTitle(), reviewObj.getKeywordsName());

+				reviewObj.setOriginalizedSentences(Arrays.asList(afterOriginalization));

+				reviewObj.setSentimentPhrases(orig.formedPhrases);

+

+				List<String> buf = reviewObjTotal.getSentimentPhrases();

+				if (orig.formedPhrases!=null && orig.formedPhrases.size()>0){

+					buf.addAll(orig.formedPhrases);

+					reviewObjTotal.setSentimentPhrases(buf);

+				}

+

+		/*		buf = reviewObjTotal.getOriginalizedSentences();

+				if (buf!=null && afterOriginalization!=null && afterOriginalization.length>0){

+					List<String> b1 = Arrays.asList(afterOriginalization);

+					List<String> b2 = new ArrayList<String>();

+					b2.addAll(buf);

+					b2.addAll(new ArrayList<String>(b1));

+					reviewObjTotal.setOriginalizedSentences(b2);

+				}

+*/

+			}

+			if (reviewObjTotal==null) return new ArrayList<String>();

+			

+			List<String> textReviews = buildManyReviewTexts(reviewObjTotal);

+

+			

+		/*	String textReview = buildText(reviewObjTotal);

+			try {

+				if (textReview!=null && textReview.length()>60)

+					ser.saveReviewsToDB(textReview, bpid, pagesForAProduct.get(0).getUrl(), pagesForAProduct.get(0).getTitle(),

+							reviewObjTotal.getSentimentPhrases().toString(), reviewObjTotal.getRating());

+			} catch (Exception e) {

+				System.out.println("Database write failed");

+			}

+			*/

+			

+		} catch (Exception e) {

+			// TODO Auto-generated catch block

+			e.printStackTrace();

+		} 

+		return reviewObjTotal.getOriginalizedSentences();

+	}

+

+	private String buildText(ReviewObj reviewObj) {

+

+		String[] features = reviewObj.getFeaturePhrases();

+		List<String> sentences =reviewObj.getOriginalizedSentences();

+		StringBuffer buf = new StringBuffer();

+		int count = 0;

+		for(String sent:sentences){

+			if (sent!=null)

+				buf.append(sent+" ");

+			if (count%2==0 && count<features.length)

+				if (features[count]!=null){

+					buf.append(features[count]);

+					if (!(features[count].endsWith("!") ||features[count].endsWith("?")||features[count].endsWith("?") 

+							||features[count].endsWith(".\"") ))

+						buf.append(". ");

+				}

+

+			if (count%5==0)

+				buf.append("\n");

+			count++;

+		}

+		return buf.toString();

+	}

+	

+	private List<String> buildManyReviewTexts(ReviewObj reviewObj) {

+

+		String[] features = reviewObj.getFeaturePhrases();

+		List<String> sentences =reviewObj.getOriginalizedSentences();

+		

+		// first count how many sentences

+				int NUM_SENTS_IN_REVIEW = 7;

+				int count=0;

+				for(String sent:sentences){

+					if (sent!=null)

+						count++;

+				}

+		int nReviews = count/NUM_SENTS_IN_REVIEW;

+		if (nReviews<1)

+			nReviews=1;

+		StringBuffer[] bufs = new StringBuffer[nReviews];

+		for(int i=0; i<bufs.length; i++){

+			bufs[i] = new StringBuffer();

+		}

+				

+		count = 0;

+		int currentRevIndex = 0;

+		for(String sent:sentences){

+			if (sent!=null)

+				bufs[currentRevIndex].append(sent+" ");

+			if (count%2==0 && count<features.length)

+				if (features[count]!=null){

+					bufs[currentRevIndex].append(features[count]);

+					if (!(features[count].endsWith("!") ||features[count].endsWith("?")||features[count].endsWith("?") 

+							||features[count].endsWith(".\"") ))

+						bufs[currentRevIndex].append(". ");

+				}

+

+			try {

+				if (bufs[currentRevIndex].toString().split(".").length>4)

+					bufs[currentRevIndex].append("\n");

+			} catch (Exception e) {

+				e.printStackTrace();

+			}

+			

+			count++;

+			currentRevIndex++;

+			if (currentRevIndex>=nReviews)

+				currentRevIndex=0;	

+		}

+		

+		List<String> results = new ArrayList<String>();

+		for(StringBuffer b:bufs){

+			String sent = b.toString().replace("!.","!").replace("?.","?");

+			results.add(sent);

+		}

+		return results;

+	}

+

	/**
	 * Manual smoke test: extracts and "originalizes" review sentences for a
	 * hard-coded product name and writes them to a CSV report.
	 * NOTE: the resource path is machine-specific and must be adjusted locally.
	 */
	public static void main(String[] args){
		String resourceDir = "C:/stanford-corenlp/src/test/resources/";
		// Force-initialize the parser singleton before any extraction runs.
		ParserChunker2MatcherProcessor proc = ParserChunker2MatcherProcessor.getInstance(resourceDir); 
			
		//ProductFinderInAWebPage init = new ProductFinderInAWebPage("C:/workspace/relevanceEngine/src/test/resources");

		WebPageReviewExtractor extractor = new WebPageReviewExtractor(resourceDir);
		// Quick sanity check of capitalization enforcement (result unused).
		String res1[] = extractor.verifyEnforceStartsUpperCase(new String[]{ "hhhh !", "Klyn mng hghj ."});
				
		// Previously tried product names (with catalog ids) kept for reference:
		List<String> res = extractor.formReviewsForAProduct(//"McCulloch 16-Inch 3.5 HP Electric Chain Saw");
				//	"WORX Electric JawSaw with Extension Handle");
				//	"Panasonic 2-Line Digital Cordless System", 215200345l);
				//	"Sport Silver Dial Women", 215475290);
				//"Rectangle Area Rug", 213885290);
				//		"40VA Replacement Transformer", 213085391);
				//		"PSYLLIUM POWDER Food", 213185391);
				//		"Leighton Toilet Tank", 213285391);
				//"Samsung Knack U310 Flip Phone", 214495493);
				//"Panasonic Cordless Phone 2 handsets", 214870820);
				//"Winegard TV Antenna Pre-Amplifier", 211924499);
				//"Atlona AT-HD-V18 HDMI Distribution Amplifier", 215162612);
				//"airport express base station", 211462827);
				//"denon  Network Streaming A/V Home Theater receiver", 209565926);
				//"sherwood receiver 400 watts stereo", 211286714);
				//"multizone music distribution system", 205333526);
				//"niles zr4", 215104912);
				//"alpine waterproof marine cd receiver", 215167695);
				//"sherwood channel receiver dolby", 215116818);
				//"sherwood lcd tv widescreen hdtv", 215481917);
				//"multiroom music distribution system", 205333526);
				//		"fusion ms compact stereo", 215649463); 
				//"pyle pro speaker", 213265125);
				// "apple iphone 4g",  213265325);
				//"sherwood high performance receiver", 215394729);
				//"sony camera housing", 211960592);
				//"sony xl2100", 1135329203);
				//"sony 18 megapixel-digital-camera", 215743208);
				//"sony m470 microcassette tape recorder", 205828052);
				//"sony monitor terminal expansion board", 213244217);
				//"sony cybershot digital-camera", 215743207);
				//"sony interchangeable lens handycam camcorder", 215153503);
				//"canon powershot digital camera", 214754207);
				//"brother ink pageyield yellow", 204743189);
				// ?? "garmin 2200 gps navigator", 215167480);
				"halo portable backup battery");

		// Write the formed review sentences to a CSV report in the working dir.
		ProfileReaderWriter.writeReportListStr(res, "formedReviewSentences4.csv");


		/*		
			res=	extractor. extractSentencesWithPotentialReviewPhrases(//"http://www.sitbetter.com/view/chair/ofm-500-l/ofm--high-back-leather-office-chair/");
		//"http://www.amazon.com/OFM-High-Back-Leather-Integral-Headrest/dp/B002SIW1E0/ref=sr_1_1?ie=UTF8&qid=1353370254&sr=8-1&keywords=OFM-High-Back-Leather-Integral-Headrest");
		//"http://www.amazon.com/Oregon-511AX-Chain-Grinder-Sharpener/dp/B0000AX0CY/ref=sr_1_4?s=industrial&ie=UTF8&qid=1353373435&sr=1-4&keywords=chain+saws");
			//			"http://www.amazon.com/Bearing-UCP204-12-Housing-Mounted-Bearings/dp/B002BBIYWM/ref=sr_1_1?s=industrial&ie=UTF8&qid=1353373786&sr=1-1&keywords=pillow+block+bearing");
			"http://www.amazon.com/ShelterLogic-20--Feet-Auto-Shelter/dp/B001OFNK8O/ref=sr_1_1?s=lawn-garden&ie=UTF8&qid=1353376677&sr=1-1&keywords=shelterlogic+62680+autoshelter+portable+garage+carport");			
						System.out.println(res);
		 */			

	}

+}

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/apps/utils/email/EmailSender.java b/opennlp-similarity/src/main/java/opennlp/tools/apps/utils/email/EmailSender.java
new file mode 100644
index 0000000..0b99fc2
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/apps/utils/email/EmailSender.java
@@ -0,0 +1,171 @@
+package opennlp.tools.apps.utils.email;

+

+import java.util.Properties;

+import java.util.regex.Matcher;

+import java.util.regex.Pattern;

+import javax.mail.*;

+import javax.mail.internet.*;

+import javax.activation.*;

/**
 * Responsible for sending e-mails through a Gmail SMTP server.
 * It will be extended to handle arbitrary SMTP servers.
 * @author GaDo
 *
 */

+public class EmailSender {

+		private static final long serialVersionUID = 1L;

+		private static final String mailboxAddress="bgalitsky@hotmail.com";

+

+		public  boolean sendMail(String smtp, String user, String pass, InternetAddress from, InternetAddress[] to, InternetAddress[] cc, InternetAddress[] bcc, String subject, String body, String file) throws Exception

+		{

+			boolean correct=true;

+			try

+			{							

+				//Eliminate spaces from addresses

+				if(from!=null){		

+					from.setAddress(from.getAddress().replace(" ","").trim());		}

+					to = eliminateSpaces(to);

+					cc = eliminateSpaces(cc);

+					bcc = eliminateSpaces(bcc);

+					correct = validateAddress(from,to,cc,bcc);

+				

+				if(correct){

+					//Configuracio of the properties -> smtp

+					Properties props = new Properties();

+					props.put("mail.smtp.host", smtp);

+					props.put("mail.smtp.auth", "true");

+					props.put("mail.smtp.port", "587");

+					props.put("mail.smtp.starttls.enable", "true");

+					Authenticator auth = new SMTP_Authenticator	(user, pass);

+					Session session = Session.getInstance(props, auth);

+					//Session session = Session.getDefaultInstance(props);

+					//props.put("mail.smtp.user",user);

+					//props.put("mail.smtp.password",pass);

+												    

+				    //Composing the message

+				    MimeMessage message = new MimeMessage(session);

+				      message.setFrom(from);

+				    message.setRecipients(Message.RecipientType.TO,to);

+				    message.setRecipients(Message.RecipientType.CC,cc);

+				    message.setRecipients(Message.RecipientType.BCC,bcc);

+				    message.setSubject(subject);

+				    if(file==null)

+				    {

+				    	

+					    //message.setText(body);

+				    	message.setContent(body, "text/html");

+				    }

+				    else

+				    {

+					    Multipart multipart = new MimeMultipart();

+					    BodyPart messageBodyPart;

+					    messageBodyPart = new MimeBodyPart();

+					    messageBodyPart.setContent(body, "text/html");

+					    //messageBodyPart.setText(body);

+					    multipart.addBodyPart(messageBodyPart);

+					    messageBodyPart = new MimeBodyPart();

+					    DataSource source = new FileDataSource(file);

+					    messageBodyPart.setDataHandler(new DataHandler(source));

+					    messageBodyPart.setFileName(file);

+					    multipart.addBodyPart(messageBodyPart);

+		

+					    message.setContent(multipart);

+				    }

+		

+					Transport tr = session.getTransport("smtp");			

+					tr.connect(smtp, mailboxAddress, pass);

+					message.saveChanges();

+					tr.sendMessage(message, message.getAllRecipients());

+					tr.close();

+				}

+		    }

+			catch(Exception e)

+			{

+				e.printStackTrace();

+				correct=false;

+			}

+			return correct;

+		}

+

+		private  boolean validateAddress(InternetAddress from,

+				InternetAddress[] to, InternetAddress[] cc,

+				InternetAddress[] bcc) {

+			boolean correct = true;

+			try{

+				correct = from!=null && !from.getAddress().equals("") && to!=null && to.length>=1;

+				String regEx="[^\\s]+@[^\\s]+.[^\\s]+";

+				Pattern pc = Pattern.compile(regEx);

+				Matcher m = null ;

+

+				if(correct){

+					m = pc.matcher(from.getAddress());

+					correct = m.matches();

+				}

+				

+				if(correct){

+					int vault = to.length;

+					while(correct && vault<to.length){

+						correct = !to[vault].getAddress().equals("");

+						if(correct){

+					    	m = pc.matcher(to[vault].getAddress());

+					    	correct = m.matches();

+						}

+						vault++;

+					}

+				}

+				

+				if(correct && cc!=null){

+					int vault = cc.length;

+					while(correct && vault<cc.length){

+						correct = !cc[vault].getAddress().equals("");

+						if(correct){

+					    	m = pc.matcher(cc[vault].getAddress());

+					    	correct = m.matches();

+						}

+						vault++;

+					}

+				}

+				

+				if(correct && bcc!=null){

+					int vault = bcc.length;

+					while(correct && vault<bcc.length){

+						correct = !bcc[vault].getAddress().equals("");

+						if(correct){

+					    	m = pc.matcher(bcc[vault].getAddress());

+					    	correct = m.matches();

+						}

+						vault++;

+					}

+				}

+				

+			}catch(Exception e){

+				e.printStackTrace();

+				correct=false;

+			}

+			return correct;

+		}

+

+		private  InternetAddress[] eliminateSpaces(InternetAddress[] address) {

+			if(address!=null){

+				for(int i=0;i<address.length;i++){

+					address[i].setAddress(address[i].getAddress().replace(" ","").trim());

+				}

+			}

+			return address;

+		}		

+

+		

+		public static void main(String[] args){

+			EmailSender s = new EmailSender();

+			try {

+				s.sendMail("smtp.live.com", "bgalitsky@hotmail.com", "******", new InternetAddress("bgalitsky@hotmail.com"), new InternetAddress[]{new InternetAddress("bgalitsky@hotmail.com")}, new InternetAddress[]{}, new InternetAddress[]{}, 

+						"Generated content for you", "body", null);

+			} catch (AddressException e) {

+				// TODO Auto-generated catch block

+				e.printStackTrace();

+			} catch (Exception e) {

+				// TODO Auto-generated catch block

+				e.printStackTrace();

+			}

+		}

+}

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/apps/utils/email/SMTP_Authenticator.java b/opennlp-similarity/src/main/java/opennlp/tools/apps/utils/email/SMTP_Authenticator.java
new file mode 100644
index 0000000..a57601b
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/apps/utils/email/SMTP_Authenticator.java
@@ -0,0 +1,24 @@
+package opennlp.tools.apps.utils.email;

+import javax.mail.*;

+

+

/**
 * Holds the information required for SMTP authorization.
 *
 */

+

+public class SMTP_Authenticator extends javax.mail.Authenticator {

+	

+	private String username="bg7550@gmail.com";

+	private String password="pill0693";	

+	

+	public SMTP_Authenticator(String user, String pwd) {

+		username=user;

+		password=pwd;

+	}

+

+		

+	public PasswordAuthentication getPasswordAuthentication() {

+		return new PasswordAuthentication(username, password);

+		}

+}

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/FeatureSpaceCoverageProcessor.java b/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/FeatureSpaceCoverageProcessor.java
new file mode 100644
index 0000000..89d12e4
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/FeatureSpaceCoverageProcessor.java
@@ -0,0 +1,317 @@
+/*

+ * Licensed to the Apache Software Foundation (ASF) under one or more

+ * contributor license agreements.  See the NOTICE file distributed with

+ * this work for additional information regarding copyright ownership.

+ * The ASF licenses this file to You under the Apache License, Version 2.0

+ * (the "License"); you may not use this file except in compliance with

+ * the License. You may obtain a copy of the License at

+ *

+ *     http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+package opennlp.tools.jsmlearning;

+

+import java.util.ArrayList;

+import java.util.Arrays;

+import java.util.HashMap;

+import java.util.List;

+import java.util.Map;

+

+import org.apache.commons.lang.StringUtils;

+

+public class FeatureSpaceCoverageProcessor {

+

+	public Map<String, Integer> paramMap = new HashMap<String, Integer>();

+	public String[] header; 

+	String[] attributes;

+

+	public FeatureSpaceCoverageProcessor (){

+		

+	}

+

+	public void initParamMap(String[] attributes, String[] header){

+		this.header = header;

+		this.attributes = attributes;

+		for(int m=0; m<header.length; m++){

+			paramMap.put(header[m], m);

+		}

+	}

+

+

+	// distance between array and array

+	public Float calcDistance(String[] seed, String[] candidate) throws Exception {

+		if (paramMap.isEmpty())

+			throw new Exception("paramMap.isEmpty()");

+

+		Float score = 0f;

+		int p1 = paramMap.get("First Level Category");	

+		int p2 = paramMap.get("Second Level Category");

+		if (seed[p1].equals(candidate[p1])) {

+			if (seed[p2].equals(candidate[p2]))

+				score = score+0.0000001f;

+			else

+				score = score+0.01f;			

+		} else return 100000f;

+

+		try {

+			int p3 = paramMap.get("Latitude");	

+			int p4 = paramMap.get("Longitude");

+			Double latDiff = Math.abs(Double.parseDouble(seed[p3]) - Double.parseDouble(candidate[p3]));

+			Double longDiff = Math.abs(Double.parseDouble(seed[p4]) - Double.parseDouble(candidate[p4]));

+			if (latDiff>1 || longDiff>1)

+				return 1000000f;

+			else 

+				score+= latDiff.floatValue()/100.0f + longDiff.floatValue()/100.0f;

+		} catch (Exception e) {

+			return 1000000f;

+		}

+

+

+		return score;

+	}

+

+	// distance between matrix and array

+	public Float calcDistance(String[][] seed, String[] candidate) throws Exception {

+		if (paramMap.isEmpty())

+			throw new Exception("paramMap.isEmpty()");

+

+		Float score = 0f, catScore = 10000f, currCatScore=10000000f;

+

+		int p1 = paramMap.get("First Level Category");	

+		int p2 = paramMap.get("Second Level Category");

+		for(int v=0; v<seed[0].length; v++){

+			if (seed[p1][v].equals(candidate[p1])) {

+				if (seed[p2][v].equals(candidate[p2]))

+					currCatScore = 0.0000001f;

+				else

+					currCatScore = 0.01f;			

+			} 

+			if ( catScore >  currCatScore) // if found closer, update

+				catScore =  currCatScore;

+		}

+		score = catScore;

+		if (score > 1000000f)

+			return 10000000f;

+

+		Float latLongScore = 100000f, currLatLongScore = 10000000f;

+		for(int v=0; v<seed[0].length; v++){

+			try {

+				int p3 = paramMap.get("Latitude");	

+				int p4 = paramMap.get("Longitude");

+				if (seed[p3][v].equals("") || seed[p4][v].equals("") 

+						|| candidate[p3].equals("") ||  candidate[p4].equals(""))

+					continue;

+				Double latDiff = Math.abs(Double.parseDouble(seed[p3][v]) - Double.parseDouble(candidate[p3]));

+				Double longDiff = Math.abs(Double.parseDouble(seed[p4][v]) - Double.parseDouble(candidate[p4]));

+				if (!(latDiff>1 || longDiff>1))

+					currLatLongScore = latDiff.floatValue()/100.0f + longDiff.floatValue()/100.0f;

+			} catch (Exception e) {

+				//return 1000000f;

+			}

+			if (latLongScore > currLatLongScore)

+				latLongScore = currLatLongScore;

+

+		}	

+		if (latLongScore> 10000)

+			return 10000f;

+		score+=latLongScore;

+		return score;

+	}

+

+	public Integer getIdForAttributeName(String key){

+		Integer res = paramMap.get(key);

+		try {

+			res.toString();

+		} catch (Exception e) {

+			// TODO Auto-generated catch block

+			e.printStackTrace();

+			System.out.println("wrong key"+key);

+		}

+		return res;

+

+	}

+

+	public String getAttribNameForId(Integer id){

+		return header[id];

+	}

+

+

+

+

+	public Map<String, String> computeIntersection(String[] line1,

+			String[] line2) {

+

+		Map<String, String> attr_value = new HashMap<String, String>();

+		for(String attr: attributes){

+			int attrIndex = getIdForAttributeName(attr);

+			String v1 = line1[attrIndex].toLowerCase().replace("\"", "").replace(",  ", ", ").replace(", ", ",");;

+			String v2 = line2[attrIndex].toLowerCase().replace("\"", "").replace(",  ", ", ").replace(", ", ",");;

+			String valArr1Str = StringUtils.substringBetween(v1, "{", "}");

+			String valArr2Str = StringUtils.substringBetween(v2, "{", "}");

+			if (valArr1Str==null || valArr2Str==null) { // we assume single value, not an array of values

+				if (v1.equals(v2)){

+					attr_value.put(attr, v1);

+				}

+			}

+			else {

+				valArr1Str = valArr1Str.replaceAll(", ", ",");

+				valArr2Str = valArr2Str.replaceAll(", ", ",");

+				String[] valArr1 = valArr1Str.split(",");

+				String[] valArr2 = valArr2Str.split(","); 

+				List<String> valList1 = new ArrayList<String>(Arrays.asList(valArr1));

+				List<String> valList2 = new ArrayList<String>(Arrays.asList(valArr2));

+				valList1.retainAll(valList2);

+				/* verification of coverage

+				valList1.retainAll(valList2);

+				

+				List<String> vl1 = new ArrayList<String>(Arrays.asList(valArr1));

+				valList1.retainAll(vl1); */

+				

+				if (!valList1.isEmpty()){

+					v1 = "{"+valList1.toString().replace("["," ").replace("]", " ").trim()+"}";

+					attr_value.put(attr, v1);

+				}

+

+			}		    		

+		}

+			return attr_value;

+	}

+

+

+		public boolean ruleCoversCase(Map<String, String> attr_value, String[] line){

+			boolean soFarCovers = true;		

+			for(String attr: attributes){

+				int attrIndex = getIdForAttributeName(attr);

+				String rule = attr_value.get(attr);

+				if (rule == null)

+					continue; // no constraint

+				rule = rule.toLowerCase().replace("\"", "").replace(",  ", ",").replace(", ", ",");

+				String vCase = line[attrIndex].toLowerCase().replace("\"", "").replace(",  ", ",").replace(", ", ",");

+				if (vCase==null){// rule for this attribute exists but case has no value

+					soFarCovers = false;

+					return false;

+				}

+				

+				String valArrCaseStr = StringUtils.substringBetween(vCase, "{", "}");

+				String valArrRuleStr = StringUtils.substringBetween(rule, "{", "}");

+				if (valArrCaseStr==null || valArrRuleStr==null) { // we assume single value, not an array of values

+					if (!vCase.equals(rule)){

+						soFarCovers = false;

+						return false;

+					}

+				}

+				else {

+					String[] valArrCase = valArrCaseStr.split(",");

+					String[] valArrRule = valArrRuleStr.split(","); 

+					List<String> valListCase = new ArrayList<String>(Arrays.asList(valArrCase));

+					List<String> valListRule = new ArrayList<String>(Arrays.asList(valArrRule));

+					

+					int ruleSize = valListRule.size();

+					//System.out.println(valListRule);

+					//System.out.println(valListCase);

+					

+					// rule members are subset of case

+					valListRule.retainAll(valListCase);

+					

+					//System.out.println(valListRule);

+					

+					if (ruleSize != valListRule.size()){

+						soFarCovers = false;

+						return false;

+					}

+					

+					

+					

+				}		    		

+			}

+			return  soFarCovers;

+		}

+		

+		public boolean ruleCoversRule(Map<String, String> attr_value, Map<String, String> line){

+			boolean soFarCovers = true;		

+			for(String attr: attributes){

+				int attrIndex = getIdForAttributeName(attr);

+				String rule = attr_value.get(attr);

+				if (rule == null)

+					continue; // no constraint

+				

+				String vRuleBeingCovered = line.get(attr);

+				if (vRuleBeingCovered==null){// rule for this attribute exists but RuleBeingCovered has no value

+					soFarCovers = false;

+					return false;

+				}

+				

+				String valArrRuleBeingCoveredStr = StringUtils.substringBetween(vRuleBeingCovered, "{", "}");

+				String valArrRuleStr = StringUtils.substringBetween(rule, "{", "}");

+				if (valArrRuleBeingCoveredStr==null || valArrRuleStr==null) { // we assume single value, not an array of values

+					if (!vRuleBeingCovered.equals(rule)){

+						soFarCovers = false;

+						return false;

+					}

+				}

+				else {

+					String[] valArrRuleBeingCovered = valArrRuleBeingCoveredStr.split(",");

+					String[] valArrRule = valArrRuleStr.split(","); 

+					List<String> valListRuleBeingCovered = new ArrayList<String>(Arrays.asList(valArrRuleBeingCovered));

+					List<String> valListRule = new ArrayList<String>(Arrays.asList(valArrRule));		

+					for(String r: valListRule){

+						if (!strListContainsMember(valListRuleBeingCovered, r)){

+							soFarCovers = false;

+							return false;

+						} 

+					}

+

+				}		    		

+			}

+			return  soFarCovers;

+		}

+

+		public Map<String, String> computeIntersection(

+				Map<String, String> rule1, Map<String, String> rule2) {

+			Map<String, String> attr_value = new HashMap<String, String>();

+			for(String attr: attributes){

+				int attrIndex = getIdForAttributeName(attr);

+				String v1 = rule1.get(attr);

+				String v2 = rule2.get(attr);

+				if (v1==null || v2==null)

+					continue;

+				String valArr1Str = StringUtils.substringBetween(v1, "{", "}");

+				String valArr2Str = StringUtils.substringBetween(v2, "{", "}");

+				if (valArr1Str==null || valArr2Str==null) { // we assume single value, not an array of values

+					if (v1.equals(v2)){

+						attr_value.put(attr, v1);

+					}

+				}

+				else {

+					valArr1Str = valArr1Str.replaceAll(", ", ",");

+					valArr2Str = valArr2Str.replaceAll(", ", ",");

+					String[] valArr1 = valArr1Str.split(",");

+					String[] valArr2 = valArr2Str.split(","); 

+					List<String> valList1 = new ArrayList<String>(Arrays.asList(valArr1));

+					List<String> valList2 = new ArrayList<String>(Arrays.asList(valArr2));

+					valList1.retainAll(valList2);

+					if (!valList1.isEmpty()){

+						v1 = "{"+valList1.toString().replace("["," ").replace("]", " ").trim()+"}";

+						attr_value.put(attr, v1);

+					}

+

+				}		    		

+			}

+				return attr_value;

+		}

+

+		private boolean strListContainsMember(List<String> valListCase, String r) {

+			boolean bContains = false;

+			for(String m: valListCase){

+				if (m.startsWith(r) || r.startsWith(m))

+					return true;

+				

+			}

+			return false;

+		}

+}
\ No newline at end of file
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/IntersectionSetBuilder.java b/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/IntersectionSetBuilder.java
new file mode 100644
index 0000000..e1d748e
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/IntersectionSetBuilder.java
@@ -0,0 +1,361 @@
+/*

+ * Licensed to the Apache Software Foundation (ASF) under one or more

+ * contributor license agreements.  See the NOTICE file distributed with

+ * this work for additional information regarding copyright ownership.

+ * The ASF licenses this file to You under the Apache License, Version 2.0

+ * (the "License"); you may not use this file except in compliance with

+ * the License. You may obtain a copy of the License at

+ *

+ *     http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+package opennlp.tools.jsmlearning;

+

+import java.util.ArrayList;

+import java.util.HashMap;

+import java.util.List;

+import java.util.Map;

+

+/*

+ * 

+ * The rule is in the form

+The report also shows how many positive cases are covered by this rule (should be 0) and how many negative cases 

+are covered by this rule (should be above 1)

+

+The rule

+{plugin_number=3, service_type=all, mime_type_number=11, review_status=pass}	0	192

+

+should be read as 

+

+plugin_number=3 & service_type=all & mime_type_number=11 & review_status=pass

+

+For a single-attribute, its value should be the one from this rule. For a multi-value attribute, the set of values in the case

+should INCLUDE the set of values from the rule.

+

+The rule checking that a case belongs to the negative set is a disjunction of all rules in the result file.

+

+input: two data files, one is negative set and another is positive set.

+in the argument, just the negative file needs to be specified:

+".../negativeSet1.csv", 

+then the system assumes that the filename for negative is obtained by replacing 'negative' with 'positive'

+".../positiveSet1.csv", 

+

+The set of attribute in analysis is hard coded

+

+

+ */

+public class IntersectionSetBuilder{

+	private FeatureSpaceCoverageProcessor distProcessorPos, distProcessorNeg;

+	private float percentageOfAllowedSetCover = 0.001f;

+	//The set of attribute in analysis is hard coded

+	String[] fieldsToAggr = new String[]{

+			"reason_code",	"risk_rating", "service_type", 	"device_match_result", 	"device_result", 	"http_referer", 	"device_id_reason_code",

+			"review_status", "tcp_os_sig_ttl", "tcp_connection_type",

+			"mime_type_number", "plugin_number", "http_connection_type", "device_last_event", "http_connection_type"

+

+

+	};

+	public IntersectionSetBuilder() {};

+	

+	/*

+	 * Takes a file generated by public String ruleFormer(String dataFile)

+	 * and performs verification of coverage for positive and negative set, as well as dedupe of rules

+	 * The input for negative positive data set is the same as the above function.

+	 * The second argument is the rule file generated by the above.

+	 * Outputs the verified rule file.

+	 */

+

+	public void ruleVerifier(String dataFile, String ruleFile){

+

+

+		List<String[]> negativeSet = ProfileReaderWriter.readProfiles(dataFile); 

+		List<String[]> positiveSet = ProfileReaderWriter.readProfiles(dataFile.replace("negative", "positive")); 

+		distProcessorPos = new FeatureSpaceCoverageProcessor(); distProcessorNeg = new FeatureSpaceCoverageProcessor();

+		distProcessorNeg.initParamMap( 	fieldsToAggr, negativeSet.get(0));		

+		distProcessorPos.initParamMap(fieldsToAggr, positiveSet.get(0));		

+		negativeSet.remove(0); positiveSet.remove(0);

+		

+		List<String[]> ruleStrings = ProfileReaderWriter.readProfiles(ruleFile);

+		List<Map<String, String>> rules = new ArrayList<Map<String, String>>(), dedupedRules = new ArrayList<Map<String, String>>() ;

+		for(String[] l : ruleStrings){

+			Map<String, String> rule = new HashMap<String, String>();

+			String lstr = l[0].substring(1, l[0].length()-1);

+			String[] ruleStr= lstr.split(",");

+			for(String attr_valueStr: ruleStr){

+				String[] attr_value =  attr_valueStr.split("=");	

+				if (attr_value.length==2)

+					rule.put(attr_value[0].trim(), attr_value[1].trim());

+				else if (attr_value.length==1)

+					rule.put(attr_value[0].trim(),"");

+				else

+					System.err.println("Problem parsing rule file "+lstr);

+			}

+			rules.add(rule);

+		}

+		

+		

+		for(int i=0; i<rules.size(); i++){

+			boolean bCovered = false;

+		

+			for(int j=i+1; j<rules.size(); j++){

+				if (distProcessorNeg.ruleCoversRule(rules.get(j), rules.get(i))){

+					bCovered = true;

+				}

+			}

+			if (!bCovered)

+				dedupedRules.add(rules.get(i));

+		}

+		

+		rules = dedupedRules;

+

+		List<String[]> output = new ArrayList<String[]>();

+		output.add(new String[]{"rule", "# covers positive", "# covers negative"});

+		for(Map<String, String> rule: rules){

+			int countCoverNeg = 0, countCoverPos=0;

+			for(String[] line: positiveSet){

+				if (distProcessorPos.ruleCoversCase(rule, line)){

+					countCoverPos++;

+				}

+			}

+			for(String[] line: negativeSet){

+				if (distProcessorNeg.ruleCoversCase(rule, line)){

+					countCoverNeg++;

+				}

+

+			}

+			output.add(new String[]{rule.toString(), new Integer(countCoverPos).toString(), new Integer(countCoverNeg).toString()});	

+

+		}

+		ProfileReaderWriter.writeReport(output, ruleFile+"Verif1.csv");

+	}

+	

+	

+	/*

+	 * Takes one argument for negative training set file, assumes the positive filename is formed by replacing 'negative'->'positive'

+	 * Outputs the filename with generated rules

+	 * 

+	 */

+	public String ruleFormer(String dataFile){

+

+

+		List<String[]> negativeSet = ProfileReaderWriter.readProfiles(dataFile); 

+		List<String[]> positiveSet = ProfileReaderWriter.readProfiles(dataFile.replace("negative", "positive")); 

+		distProcessorPos = new FeatureSpaceCoverageProcessor(); distProcessorNeg = new FeatureSpaceCoverageProcessor();

+		distProcessorNeg.initParamMap( 	fieldsToAggr, negativeSet.get(0));		

+		distProcessorPos.initParamMap(fieldsToAggr, positiveSet.get(0));		

+		negativeSet.remove(0); positiveSet.remove(0);

+

+		List<Map<String, String>> intersections = formIntersectionAmongMembersOfTrainingSetAndVerifyThatDoesNotCoverOppositeTrainingS(negativeSet, positiveSet);

+		List<Map<String, String>> superIntersections = formIntersections(intersections, negativeSet, positiveSet);

+

+		List<String[]> output = new ArrayList<String[]>();

+		for(Map<String, String> rule: superIntersections){

+			int countCover = 0;

+			for(String[] line: positiveSet){

+				if (distProcessorPos.ruleCoversCase(rule, line)){

+					countCover++;

+				}

+			}

+			output.add(new String[]{rule.toString(), new Integer(countCover).toString()});	

+

+		}

+		String outputFile = "learnedRulesForNegativeSetJune23-1.csv";

+		ProfileReaderWriter.writeReport(output, outputFile);

+		return outputFile; 

+

+	}

+

+	private List<Map<String, String>> formIntersections(List<Map<String, String>> intersectionsIn, List<String[]> negativeSet, List<String[]> positiveSet) {

+		List<Map<String, String>> intersectionsNew = new ArrayList<Map<String, String>>();

+		for(int i=0; i<intersectionsIn.size(); i++){

+			for(int j=i+1; j<intersectionsIn.size(); j++){

+				Map<String, String> intersection = distProcessorNeg.computeIntersection(intersectionsIn.get(i), intersectionsIn.get(j));

+				if (intersection.isEmpty())

+					continue;

+				

+				int countCover = 0;

+				for(String[] line: positiveSet){

+					if (distProcessorPos.ruleCoversCase(intersection, line)){

+						//countCover++;

+						countCover = 10000000;

+						break;

+					}

+				}

+				float cover = (float)countCover/(float)positiveSet.size();

+				if (!(cover<this.percentageOfAllowedSetCover))

+					continue;

+

+				List<Map<String, String>> rulesToBeRemoved = new ArrayList<Map<String, String>>();

+				boolean nothingCoversThisRule = true;

+				for(Map<String, String> intersChecker: intersectionsIn){ // more general rule covers more specific

+					if (distProcessorNeg.ruleCoversRule(intersChecker, intersection)){

+						nothingCoversThisRule = false;

+						break;

+					} // now check if this new rule defeats built rules

+					if (distProcessorNeg.ruleCoversRule( intersection, intersChecker)){

+						rulesToBeRemoved.add(intersChecker); 

+					}

+				}

+				if(nothingCoversThisRule){

+					intersectionsNew.add(intersection);

+					intersectionsNew.removeAll(rulesToBeRemoved);

+				}

+			}

+		}

+		intersectionsNew.addAll(intersectionsIn);

+		return intersectionsNew;

+	}

+

+	private List<Map<String, String>> formIntersectionAmongMembersOfTrainingSetAndVerifyThatDoesNotCoverOppositeTrainingS(List<String[]> negativeSet, List<String[]> positiveSet){

+		List<Map<String, String>> intersections = new ArrayList<Map<String, String>>();

+

+		for(int i=0; i<negativeSet.size() && i<1000; i++){

+			for(int j=i+1; j<negativeSet.size(); j++){

+				Map<String, String> intersection = distProcessorNeg.computeIntersection(negativeSet.get(i), negativeSet.get(j));

+				if (intersection.isEmpty())

+					continue;

+				

+				/* temporary code that formed rule covers at least 2 cases

+				int countCoverNeg=0;

+				for(String[] line: negativeSet){

+					if (distProcessorNeg.ruleCoversCase(intersection, line)){

+						countCoverNeg++;

+					}

+

+				} 

+				if (countCoverNeg<2){

+					System.err.println("A rule formed but it does not cover its origin! "+intersection);

+					distProcessorNeg.ruleCoversCase(intersection, negativeSet.get(i));

+					distProcessorNeg.ruleCoversCase(intersection, negativeSet.get(j));

+				} */

+				

+				

+				

+				int countCover = 0;

+				for(String[] line: positiveSet){

+					if (distProcessorPos.ruleCoversCase(intersection, line)){

+						//countCover++;

+						countCover = 10000000;

+						break;

+					}

+				}

+				float cover = (float)countCover/(float)positiveSet.size();

+				if (!(cover<this.percentageOfAllowedSetCover))

+					continue;

+

+				List<Map<String, String>> rulesToBeRemoved = new ArrayList<Map<String, String>>();

+				boolean nothingCoversThisRule = true;

+				for(Map<String, String> intersChecker: intersections){ // more general rule covers more specific

+					if (distProcessorNeg.ruleCoversRule(intersChecker, intersection)){

+						nothingCoversThisRule = false;

+						break;

+					} // now check if this new rule defeats built rules

+					if (distProcessorNeg.ruleCoversRule( intersection, intersChecker)){

+						rulesToBeRemoved.add(intersChecker); 

+					}

+				}

+				if(nothingCoversThisRule){

+					intersections.add(intersection);

+					intersections.removeAll(rulesToBeRemoved);

+				}

+			}

+		}

+		return intersections;

+	}

+

+	private List<Map<String, String>> filterIntersectionsByOppositeTrainingSet(List<Map<String, String>> intersections, List<String[]> positiveSet){

+		List<Map<String, String>> filteredIntersections = new ArrayList<Map<String, String>>();

+		for(Map<String, String> rule: intersections){

+			int countCover = 0;

+			for(String[] line: positiveSet){

+				if (!distProcessorPos.ruleCoversCase(rule, line))

+					countCover++;

+			}

+			if ((float)countCover/(float)positiveSet.size()<this.percentageOfAllowedSetCover)

+				filteredIntersections.add(rule);

+

+		}

+		return filteredIntersections;

+	}

+

+    public boolean applyRule(String[] sample){

+    	return true;

+    	// todo implement singleton which reads rule file and applies them

+    	

+    }

+

+	public static void main(String[] args){

+		IntersectionSetBuilder iBuilder = new IntersectionSetBuilder ();

+		

+		// builds the set of rules

+	    String resFile = iBuilder.ruleFormer("C:/workspace/relevanceEngine/src/test/resources/maps/anomaly/negativeSet1.csv");

+		// verifies and cleans the rules

+		iBuilder.ruleVerifier("C:/workspace/relevanceEngine/src/test/resources/maps/anomaly/negativeSet1.csv", 

+				"C:/workspace/relevanceEngine/learnedRulesForNegativeSetJune23-1.csv");

+

+	}

+

+}

+

+/*

+ * 

+ * datetime

+browser_language

+browser_string

+device_first_seen

+device_match_result

+http_os_signature

+http_os_sig_raw

+os

+device_id_reason_code

+true_ip

+proxy_ip

+http_os_sig_adv_mss

+http_os_sig_snd_mss

+http_os_sig_rcv_mss

+http_os_sig_ttl

+http_connection_type

+device_last_event

+flash_lang

+flash_os

+flash_version

+os_fonts_number

+plugin_adobe_acrobat

+plugin_flash

+plugin_silverlight

+plugin_windows_media_player

+profiling_datetime

+screen_res

+tcp_os_signature

+tcp_os_sig_raw

+time_zone

+time_zone_dst_offset

+profile_api_timedelta

+mime_type_number

+plugin_number

+plugin_quicktime

+plugin_java

+fuzzy_device_id_confidence

+fuzzy_device_match_result

+fuzzy_device_last_event

+fuzzy_device_first_seen

+true_ip_city

+true_ip_first_seen

+true_ip_geo

+true_ip_latitude

+true_ip_longitude

+account_email_first_seen

+shipping_address_first_seen

+tcp_os_sig_ttl

+tcp_connection_type

+page_time_on

+policy_score

+reason_code

+review_status

+risk_rating

+ */

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/ProfileReaderWriter.java b/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/ProfileReaderWriter.java
new file mode 100644
index 0000000..9081e1a
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/ProfileReaderWriter.java
@@ -0,0 +1,140 @@
+/*

+ * Licensed to the Apache Software Foundation (ASF) under one or more

+ * contributor license agreements.  See the NOTICE file distributed with

+ * this work for additional information regarding copyright ownership.

+ * The ASF licenses this file to You under the Apache License, Version 2.0

+ * (the "License"); you may not use this file except in compliance with

+ * the License. You may obtain a copy of the License at

+ *

+ *     http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+package opennlp.tools.jsmlearning;

+

+import java.io.FileNotFoundException;

+import java.io.FileReader;

+import java.io.IOException;

+import java.io.PrintWriter;

+import java.util.ArrayList;

+import java.util.List;

+

+import au.com.bytecode.opencsv.CSVReader;

+import au.com.bytecode.opencsv.CSVWriter;

+

+public class ProfileReaderWriter {

+	public static List<String[]> readProfiles(String filename) {

+		CSVReader reader = null;

+		List<String[]> profiles = null;

+		try	{

+			reader = new CSVReader(new FileReader(filename), ',');

+			profiles = reader.readAll();

+		} catch (FileNotFoundException e) {

+			e.printStackTrace();

+		} catch (IOException ioe) {

+			ioe.printStackTrace();

+		}

+		return profiles;

+	}

+	

+	public static List<String[]> readProfiles(String filename, char delimiter) {

+		CSVReader reader = null;

+		List<String[]> profiles = null;

+		try	{

+			reader = new CSVReader(new FileReader(filename), delimiter);

+			profiles = reader.readAll();

+		} catch (FileNotFoundException e) {

+			e.printStackTrace();

+		} catch (IOException ioe) {

+			ioe.printStackTrace();

+		}

+		return profiles;

+	}

+

+	public static void writeReportArr( String[][] allLines, String reportName){

+		List<String[]> rep = new ArrayList<String[]>();

+		for(String[] line: allLines){

+			rep.add(line);

+		}

+		writeReport( rep, reportName);

+	}

+

+	public static void writeReport( List<String[]> allLines, String reportName){

+		CSVWriter writer = null;

+		try {	

+			writer = new CSVWriter(new PrintWriter(reportName));			

+		} catch (FileNotFoundException e) {

+			e.printStackTrace();

+		}		

+		writer.writeAll(allLines);

+

+		try {

+			writer.flush();

+			writer.close();

+		} catch (IOException e) {

+			e.printStackTrace();

+		}

+	}

+

+	public static void writeReport( List<String[]> allLines, String reportName, char delimiter){

+		CSVWriter writer = null;

+		try {	

+			writer = new CSVWriter(new PrintWriter(reportName), delimiter, delimiter, delimiter);			

+		} catch (FileNotFoundException e) {

+			e.printStackTrace();

+		}	

+

+		writer.writeAll(allLines);

+

+		try {

+			writer.flush();

+			writer.close();

+		} catch (IOException e) {

+			e.printStackTrace();

+		}

+	}

+	

+	public static void appendReport( List<String[]> allLines, String reportName, char delimiter){

+		List<String[]> previous;

+		try {

+			previous = readProfiles(reportName);

+			allLines.addAll(previous);

+		} catch (Exception e1) {

+			System.out.println("Creating file "+reportName);

+		}

+		

+		CSVWriter writer = null;

+		try {	

+			writer = new CSVWriter(new PrintWriter(reportName), delimiter, delimiter, delimiter);			

+		} catch (FileNotFoundException e) {

+			e.printStackTrace();

+		}	

+

+		writer.writeAll(allLines);

+

+		try {

+			writer.flush();

+			writer.close();

+		} catch (IOException e) {

+			e.printStackTrace();

+		}

+	}

+

+	public static void writeReportListStr(List<String> res, String string) {

+		// TODO Auto-generated method stub

+

+	}

+

+	public static void main(String[] args){

+		List<String[]> allLines = new ArrayList<String[]>();

+		allLines.add(new String[] {"aa " , "  bb", "ccc" });

+		ProfileReaderWriter.writeReport( allLines, "reportName.txt", ' ');

+

+	}

+

+

+}
\ No newline at end of file
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/TreeKernelRunner.java b/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/TreeKernelRunner.java
new file mode 100644
index 0000000..88179b0
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/TreeKernelRunner.java
@@ -0,0 +1,131 @@
+/*

+ * Licensed to the Apache Software Foundation (ASF) under one or more

+ * contributor license agreements.  See the NOTICE file distributed with

+ * this work for additional information regarding copyright ownership.

+ * The ASF licenses this file to You under the Apache License, Version 2.0

+ * (the "License"); you may not use this file except in compliance with

+ * the License. You may obtain a copy of the License at

+ *

+ *     http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+package opennlp.tools.jsmlearning;

+

+import java.io.BufferedReader;

+import java.io.File;

+import java.io.IOException;

+import java.io.InputStream;

+import java.io.InputStreamReader;

+

/**
 * Thin wrapper around the SVM-light-TK (tree kernel) command-line tools:
 * runs svm_learn.exe for training and svm_classify.exe for classification,
 * echoing the child process output to this process's stdout.
 */
public class TreeKernelRunner {
	/**
	 * Launches an external executable in the given working directory and waits for it.
	 *
	 * @param command full command line (executable path first)
	 * @param runPath working directory for the child process
	 */
	private void runEXE(String[] command, String runPath) {
		Runtime r = Runtime.getRuntime();
		Process mStartProcess;
		try {
			mStartProcess = r.exec(command, null, new File(runPath));
		} catch (IOException e) {
			e.printStackTrace();
			return; // was: fell through to a NullPointerException on getInputStream()
		}

		StreamLogger outputGobbler = new StreamLogger(mStartProcess.getInputStream());
		outputGobbler.start();
		// Drain stderr as well, so the child cannot block on a full pipe buffer.
		StreamLogger errorGobbler = new StreamLogger(mStartProcess.getErrorStream());
		errorGobbler.start();

		try {
			mStartProcess.waitFor();
		} catch (InterruptedException e) {
			Thread.currentThread().interrupt(); // restore the interrupt status
			e.printStackTrace();
		}
	}

	/**
	 * Trains a tree-kernel model: svm_learn.exe -t 5 &lt;learning_file&gt; &lt;model_file&gt;.
	 * Paths are normalized to Windows-style backslashes (the tools are .exe binaries).
	 */
	public void runLearner(String dir, String learning_file, String model_file) {
		dir = dir.replace('/', '\\');

		if (!dir.endsWith("\\"))
			dir += "\\";
		String[] runString = new String[]{dir + "svm_learn.exe", "-t", "5", dir + learning_file, dir + model_file};
		runEXE(runString, dir);
	}

	/**
	 * Classifies examples with a trained model:
	 * svm_classify.exe &lt;example_file&gt; &lt;model_file&gt; &lt;predictions_file&gt;.
	 */
	public void runClassifier(String dir, String example_file, String model_file, String predictions_file) {
		dir = dir.replace('/', '\\');

		if (!dir.endsWith("\\"))
			dir += "\\";
		String[] runString = new String[]{dir + "svm_classify.exe", dir + example_file, dir + model_file, dir + predictions_file};
		runEXE(runString, dir);
	}

	/** Background thread that copies a child-process stream to System.out line by line. */
	class StreamLogger extends Thread {

		private InputStream mInputStream;

		public StreamLogger(InputStream is) {
			this.mInputStream = is;
		}

		public void run() {
			try {
				InputStreamReader isr = new InputStreamReader(mInputStream);
				BufferedReader br = new BufferedReader(isr);
				String line = null;
				while ((line = br.readLine()) != null) {
					System.out.println(line);
				}
			} catch (IOException ioe) {
				ioe.printStackTrace();
			}
		}
	}

	/** Trains and then classifies with hard-coded local paths (manual smoke test). */
	public static void main(String[] args) {
		TreeKernelRunner runner = new TreeKernelRunner();
		runner.runLearner("C:\\stanford-corenlp\\tree_kernel\\", "training.txt", "arg0.model1.txt");
		runner.runClassifier("C:\\stanford-corenlp\\tree_kernel\\", "arg0.test", "arg0.model1.txt", "arg0.output1.txt");
	}
}

+

+	/*

+exec:

+

+public Process exec(String command, String envp[], File dir) 

+

+

+

+   @param      command   a specified system command.

+   @param      envp      array of strings, each element of which 

+                         has environment variable settings in format

+                         <i>name</i>=<i>value</i>.

+   @param      dir       the working directory of the subprocess, or

+                         <tt>null</tt> if the subprocess should inherit

+                         the working directory of the current process.

+

+                         The distribution contains two executables: svm_learn.exe and svm_classify.exe.
+
+1. svm_learn.exe takes a file with training examples, processes it, and builds a model file with the learned rules.
+
+Example invocations:
+svm_learn -t 5 learning_file model_file  - the simplest variant: SubSetTreeKernel (gaps are allowed while traversing trees)
+
+svm_learn -t 5 -D 0 learning_file model_file  - the other kernel variant: SubTreeKernel
+
+A sample file and a description of the parameters are available on the author's page.
+
+2. svm_classify.exe takes a file with test examples plus the model file built by svm_learn, and writes the classification results to predictions_file.
+
+Invocation:     svm_classify example_file model_file predictions_file
+
+The file has the same format as the training examples; a sample is in the archive on Moschitti's page.
+The class of each example (1 or -1 at the start of a line) can be given directly, in which case precision and recall are evaluated automatically; otherwise put 0 there.

+	 */
\ No newline at end of file
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/nl2code/NL2Obj.java b/opennlp-similarity/src/main/java/opennlp/tools/nl2code/NL2Obj.java
index 736eb35..7f4f589 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/nl2code/NL2Obj.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/nl2code/NL2Obj.java
@@ -16,6 +16,7 @@
  */

 package opennlp.tools.nl2code;

 

+import java.io.File;

 import java.util.ArrayList;

 import java.util.Arrays;

 import java.util.List;

@@ -28,11 +29,19 @@
 public class NL2Obj {

   ObjectControlOp prevOp;

 

-  public NL2Obj() {

  /**
   * Creates a converter with an empty previous control operation and a parser
   * initialised from the given model path.
   *
   * @param path location passed to ParserChunker2MatcherProcessor.getInstance(path)
   */
  public NL2Obj(String path) {
    prevOp = new ObjectControlOp();
    prevOp.setOperatorIf("");
    prevOp.setOperatorFor("");
    parser = ParserChunker2MatcherProcessor.getInstance(path);
  }

+  

  /**
   * Creates a converter with an empty previous control operation, using the
   * default parser instance (no explicit model path).
   */
  public NL2Obj() {
    prevOp = new ObjectControlOp();
    prevOp.setOperatorIf("");
    prevOp.setOperatorFor("");
    parser = ParserChunker2MatcherProcessor.getInstance();
  }

 

   public static String[] epistemicStatesList = new String[] {

     "select", "verify", "find", "start", "stop", "go", "check"

@@ -268,6 +277,9 @@
 

 

   public static void main(String[] args){

+	  

+	String cDir = new File(".").getAbsolutePath();

+	

     String[] text = new String[]{

         "Randomly select a pixel at an image.",

         "Find a convex area this pixel belongs, so that all pixels are less than 128",      //area->REGION

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/nl2code/NL2ObjCreateAssign.java b/opennlp-similarity/src/main/java/opennlp/tools/nl2code/NL2ObjCreateAssign.java
index 706e8f6..c2e54f5 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/nl2code/NL2ObjCreateAssign.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/nl2code/NL2ObjCreateAssign.java
@@ -25,120 +25,129 @@
 

 public class NL2ObjCreateAssign extends NL2Obj {

 

	// True while a "Define a class ..." sentence has opened a class body that
	// subsequent attribute-definition sentences are placed into.
	private boolean classBeingDefined = false;
	// Verbs that start the declaration/assignment sentences this subclass handles.
	public static String[] declarationStatesList = new String[] {
		"create", "assign", "set", 
	};

	// Attribute data types recognized in "Define <type> attribute ..." sentences.
	public static String[] dataTypesList = new String[] {
		"text", "double", "array", 
	};

	// Ordinal words used in sentences to address array elements.
	public static String[] arrayElementList = new String[] {
		"first", "second", "third", "fourth" 
	};

	// Zero-based indices corresponding position-by-position to arrayElementList.
	// NOTE(review): "Insdex" is a typo, but the field is public — renaming would break callers.
	public static String[] arrayElementListInsdex = new String[] {
		"0", "1", "2", "3" 
	};

 

 

-  @Override

-  public ObjectPhraseListForSentence convertSentenceToControlObjectPhrase(String sentence){

-    String expression = null;

-    if (sentence.indexOf(":")>-1){

-      expression = sentence.split(":")[1];

-      sentence = sentence.split(":")[0]+".";

-    }

+

	/** Uses the default parser instance (delegates to NL2Obj's no-arg constructor). */
	public NL2ObjCreateAssign() {
		super();
	}

	/**
	 * Initialises the parser from the given model path (delegates to NL2Obj(String)).
	 *
	 * @param path location forwarded to the parser factory
	 */
	public NL2ObjCreateAssign(String path) {
		super(path);
	}

+

+	@Override

+	public ObjectPhraseListForSentence convertSentenceToControlObjectPhrase(String sentence){

+		String expression = null;

+		if (sentence.indexOf(":")>-1){

+			expression = sentence.split(":")[1];

+			sentence = sentence.split(":")[0]+".";

+		}

 

 

-    List<ObjectPhrase> oPhrases = new  ArrayList<ObjectPhrase>();

-    parser = ParserChunker2MatcherProcessor.getInstance();

-    List<List<ParseTreeChunk>> lingPhrases = 

-      parser.formGroupedPhrasesFromChunksForSentence(sentence);

+		List<ObjectPhrase> oPhrases = new  ArrayList<ObjectPhrase>();

+		parser = ParserChunker2MatcherProcessor.getInstance();

+		List<List<ParseTreeChunk>> lingPhrases = 

+				parser.formGroupedPhrasesFromChunksForSentence(sentence);

 

-    ObjectControlOp op = extractControlPart(lingPhrases, prevOp);

-    prevOp = op;

+		ObjectControlOp op = extractControlPart(lingPhrases, prevOp);

+		prevOp = op;

 

-    //start with verb phrases

-    List<ParseTreeChunk> actionWithObject =  lingPhrases.get(1);

-    actionWithObject.addAll( lingPhrases.get(4));

+		//start with verb phrases

+		List<ParseTreeChunk> actionWithObject =  lingPhrases.get(1);

+		actionWithObject.addAll( lingPhrases.get(4));

 

-    System.out.println("      === "+actionWithObject);

+		System.out.println("      === "+actionWithObject);

 

-    for(ParseTreeChunk verbChunk: actionWithObject){

-      List<String> lems = verbChunk.getLemmas();

-      String declarativeAction = verbChunk.getLemmas().get(0).toLowerCase();

-      if (declarativeAction.equals("define")){

-        if (verbChunk.getLemmas().get(1).toLowerCase().equals("class") ||

-            verbChunk.getLemmas().get(2).toLowerCase().equals("class")){

-          // new class

-          String className = verbChunk.getLemmas().get(verbChunk.getLemmas().size()-1).toLowerCase();

-          className = className.substring(0, 1).toUpperCase()+className.substring(1, className.length());

-          op.setOperatorIf("class "+className + "{");

-          op.setOperatorFor("{");

-          classBeingDefined = true;

-          break;

-        }

-        String dataType = verbChunk.getLemmas().get(1).toLowerCase();

+		for(ParseTreeChunk verbChunk: actionWithObject){

+			List<String> lems = verbChunk.getLemmas();

+			String declarativeAction = verbChunk.getLemmas().get(0).toLowerCase();

+			if (declarativeAction.equals("define")){

+				if (verbChunk.getLemmas().get(1).toLowerCase().equals("class") ||

+						verbChunk.getLemmas().get(2).toLowerCase().equals("class")){

+					// new class

+					String className = verbChunk.getLemmas().get(verbChunk.getLemmas().size()-1).toLowerCase();

+					className = className.substring(0, 1).toUpperCase()+className.substring(1, className.length());

+					op.setOperatorIf("class "+className + "{");

+					op.setOperatorFor("{");

+					classBeingDefined = true;

+					break;

+				}

+				String dataType = verbChunk.getLemmas().get(1).toLowerCase();

 

-        if (classBeingDefined && Arrays.asList(dataTypesList).contains(dataType) && verbChunk.getLemmas().get(2).toLowerCase().equals("attribute")){

-          op.setOperatorFor(dataType + " "+verbChunk.getLemmas().get(verbChunk.getLemmas().size()-1).toLowerCase());

-          classBeingDefined = true;

-          break;

-        }

-        if (Arrays.asList(dataTypesList).contains(dataType) && verbChunk.getLemmas().get(2).toLowerCase().equals("attribute")){

-          op.setOperatorFor(dataType + " "+verbChunk.getLemmas().get(verbChunk.getLemmas().size()-1).toLowerCase());

-          classBeingDefined = true;

-          break;

-        }

-      } else if (declarativeAction.equals("create")){

+				if (classBeingDefined && Arrays.asList(dataTypesList).contains(dataType) && verbChunk.getLemmas().get(2).toLowerCase().equals("attribute")){

+					op.setOperatorFor(dataType + " "+verbChunk.getLemmas().get(verbChunk.getLemmas().size()-1).toLowerCase());

+					classBeingDefined = true;

+					break;

+				}

+				if (Arrays.asList(dataTypesList).contains(dataType) && verbChunk.getLemmas().get(2).toLowerCase().equals("attribute")){

+					op.setOperatorFor(dataType + " "+verbChunk.getLemmas().get(verbChunk.getLemmas().size()-1).toLowerCase());

+					classBeingDefined = true;

+					break;

+				}

+			} else if (declarativeAction.equals("create")){

 

-        // now substituting array

-        if (verbChunk.getLemmas().get(1).toLowerCase().equals("array")){

+				// now substituting array

+				if (verbChunk.getLemmas().get(1).toLowerCase().equals("array")){

 

-          if(lems.contains("class")){

-            int indClass = lems.indexOf("class");

-            int numElements = lems.indexOf("elements");

-            if (numElements<0)

-              numElements = lems.indexOf("objects");

-            if (numElements<0)

-              numElements = lems.indexOf("members");

-            String arraySize = lems.get(numElements-1);

-            op.setOperatorFor(lems.get(indClass+1)+"[] "+verbChunk.getLemmas().get(verbChunk.getLemmas().size()-1).toLowerCase() 

-                +" = new "+lems.get(indClass+1)+"["+arraySize+"]");

-            classBeingDefined = false;

-            break;

-          }

-        }    

-      } else if (declarativeAction.equals("assign")){

-        int numElements = lems.indexOf("element");

-        if (numElements<0)

-          numElements = lems.indexOf("object");

-        if (numElements<0)

-          numElements = lems.indexOf("member");

-        if (Arrays.asList(arrayElementList).contains(lems.get(numElements-1))){

-          int arrIndex = Arrays.asList(arrayElementList).indexOf(lems.get(numElements-1));

-          String indexValue = arrayElementListInsdex[arrIndex]; 

+					if(lems.contains("class")){

+						int indClass = lems.indexOf("class");

+						int numElements = lems.indexOf("elements");

+						if (numElements<0)

+							numElements = lems.indexOf("objects");

+						if (numElements<0)

+							numElements = lems.indexOf("members");

+						String arraySize = lems.get(numElements-1);

+						op.setOperatorFor(lems.get(indClass+1)+"[] "+verbChunk.getLemmas().get(verbChunk.getLemmas().size()-1).toLowerCase() 

+								+" = new "+lems.get(indClass+1)+"["+arraySize+"]");

+						classBeingDefined = false;

+						break;

+					}

+				}    

+			} else if (declarativeAction.equals("assign")){

+				int numElements = lems.indexOf("element");

+				if (numElements<0)

+					numElements = lems.indexOf("object");

+				if (numElements<0)

+					numElements = lems.indexOf("member");

+				if (Arrays.asList(arrayElementList).contains(lems.get(numElements-1))){

+					int arrIndex = Arrays.asList(arrayElementList).indexOf(lems.get(numElements-1));

+					String indexValue = arrayElementListInsdex[arrIndex]; 

 

-          String arrayName = lems.get(lems.size()-1);

-          if (expression!=null)

-            op.setOperatorFor(arrayName+"["+indexValue+"]."+ expression);

-          break;

-        } 

-      } else if (declarativeAction.equals("set")){

-        int indQuantifier = lems.indexOf("all");

-        if (indQuantifier>-1 && 

-            (lems.get(indQuantifier+1).equals("elements") || lems.get(indQuantifier+1).equals("members") )){

-          

-          String arrayName = lems.get(lems.size()-1);

-          if (expression!=null)

-            op.setOperatorFor("for(int i=0; i<"+ arrayName+".size(); i++) "+

-                arrayName+"[i]."+ expression);

-          break;

-        } 

-      }

-      /*    

+					String arrayName = lems.get(lems.size()-1);

+					if (expression!=null)

+						op.setOperatorFor(arrayName+"["+indexValue+"]."+ expression);

+					break;

+				} 

+			} else if (declarativeAction.equals("set")){

+				int indQuantifier = lems.indexOf("all");

+				if (indQuantifier>-1 && 

+						(lems.get(indQuantifier+1).equals("elements") || lems.get(indQuantifier+1).equals("members") )){

+

+					String arrayName = lems.get(lems.size()-1);

+					if (expression!=null)

+						op.setOperatorFor("for(int i=0; i<"+ arrayName+".size(); i++) "+

+								arrayName+"[i]."+ expression);

+					break;

+				} 

+			}

+			/*    

         else {

           List<String> paramValues = verbChunk.getLemmas(), paramPOSs = verbChunk.getPOSs();

 

@@ -205,40 +214,40 @@
         oPhrases.add(oPhrase);      

 

       } */

-    }

+		}

 

-    ObjectPhraseListForSentence oplfs =  new ObjectPhraseListForSentence( oPhrases, op);

-    oplfs.cleanMethodNamesIsAre();

-    oplfs.substituteNullObjectIntoEmptyArg();

-      

-    return oplfs;

-  }

+		ObjectPhraseListForSentence oplfs =  new ObjectPhraseListForSentence( oPhrases, op);

+		oplfs.cleanMethodNamesIsAre();

+		oplfs.substituteNullObjectIntoEmptyArg();

 

-  public static void main(String[] args){

+		return oplfs;

+	}

 

-    String[] text = new String[]{

-        "Define a class and name it Employee. ",

-        "Define text attribute and name it m_name. ",

-        "Define double attribute and name it m_salary.",

-        "Create array of objects of class Employee for 10 elements, name the object as workforce.",

-        "Assign the first element in array workforce: m_name=\"Boss\"",

-        "Assign the second element in array workforce: m_name=\"His wife\"",

-       //  "Comment: We just started our small business company and expect to hire 8 more people soon.",

-        "Set for all elements in array workforce: m_salary=0 ",

-        "Print the list of all m_name attributes for workforce."

+	public static void main(String[] args){

 

-    };

+		String[] text = new String[]{

+				"Define a class and name it Employee. ",

+				"Define text attribute and name it m_name. ",

+				"Define double attribute and name it m_salary.",

+				"Create array of objects of class Employee for 10 elements, name the object as workforce.",

+				"Assign the first element in array workforce: m_name=\"Boss\"",

+				"Assign the second element in array workforce: m_name=\"His wife\"",

+				//  "Comment: We just started our small business company and expect to hire 8 more people soon.",

+				"Set for all elements in array workforce: m_salary=0 ",

+				"Print the list of all m_name attributes for workforce."

 

-    NL2Obj compiler = new NL2ObjCreateAssign();

-    for(String sent:text){

-      ObjectPhraseListForSentence opls=null;

-      try {

-        opls = compiler.convertSentenceToControlObjectPhrase(sent);

-      } catch (Exception e) {

-        e.printStackTrace();

-      }

-      System.out.println(sent+"\n"+opls+"\n");

-    }

+		};

 

-  }

+		NL2Obj compiler = new NL2ObjCreateAssign();

+		for(String sent:text){

+			ObjectPhraseListForSentence opls=null;

+			try {

+				opls = compiler.convertSentenceToControlObjectPhrase(sent);

+			} catch (Exception e) {

+				e.printStackTrace();

+			}

+			System.out.println(sent+"\n"+opls+"\n");

+		}

+

+	}

 }

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ArcType.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ArcType.java
new file mode 100644
index 0000000..2c75ad0
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ArcType.java
@@ -0,0 +1,54 @@
+package opennlp.tools.parse_thicket;

+

+public class ArcType{

+	private String type; // rst

+	private String subtype; // rst-explain

+	private Integer type_id;

+	private Integer subtype_id;

+	

+	public ArcType(String type, // rst

+	String subtype, // rst-explain

+	Integer type_id,

+	Integer subtype_id){

+		this.type = type; // rst

+		this.subtype = subtype; // rst-explain

+		this.type_id= type_id;

+		this.subtype_id = subtype_id;

+	}

+

+	public String getType() {

+		return type;

+	}

+

+	public void setType(String type) {

+		this.type = type;

+	}

+

+	public String getSubtype() {

+		return subtype;

+	}

+

+	public void setSubtype(String subtype) {

+		this.subtype = subtype;

+	}

+

+	public Integer getType_id() {

+		return type_id;

+	}

+

+	public void setType_id(Integer type_id) {

+		this.type_id = type_id;

+	}

+

+	public Integer getSubtype_id() {

+		return subtype_id;

+	}

+

+	public void setSubtype_id(Integer subtype_id) {

+		this.subtype_id = subtype_id;

+	}

+	

+	public String toString(){

+		return type+":"+subtype;

+	}

+}
\ No newline at end of file
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/IGeneralizer.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/IGeneralizer.java
new file mode 100644
index 0000000..03256c8
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/IGeneralizer.java
@@ -0,0 +1,12 @@
+package opennlp.tools.parse_thicket;

+

+import java.util.List;

+

public interface IGeneralizer<T> {
	/**
	 * Computes the generalization (anti-unification) of two objects. All
	 * entities in this project - words, ParseTreeNodes, phrases,
	 * communicative actions, etc. - are subject to generalization and
	 * should implement this interface.
	 *
	 * @param o1 first object to generalize; implementations cast to their own type
	 * @param o2 second object to generalize
	 * @return list of generalization results; may be empty when the two
	 *         objects have no common generalization
	 */
   public List<T> generalize(Object o1, Object o2);
}

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/PTTree.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/PTTree.java
new file mode 100644
index 0000000..9a1dfd5
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/PTTree.java
@@ -0,0 +1,89 @@
+package opennlp.tools.parse_thicket;

+

+import java.io.PrintWriter;

+import java.util.ArrayList;

+import java.util.List;

+

+import edu.stanford.nlp.trees.LabeledScoredTreeNode;

+import edu.stanford.nlp.trees.SimpleTree;

+import edu.stanford.nlp.trees.Tree;

+import edu.stanford.nlp.trees.TreeFactory;

+

+

+

+public class PTTree extends SimpleTree {

+	

+	public PTTree(){

+		super();

+	}

+

+	public PTTree(Tree t){

+		super();

+	}

+	private static final long serialVersionUID = 1L;

+

+	@Override

+	public PTTree[] children() {

+		return children();

+	}

+

+	@Override

+	public TreeFactory treeFactory() {

+		// TODO Auto-generated method stub

+		return null;

+	}

+	

+	public void doNavigate(){

+		List<LabeledScoredTreeNode> phrases = new ArrayList<LabeledScoredTreeNode>();

+		navigate(0, false, false, false, true, true, phrases);

+	}

+	

+	private static void navigateChildren(PTTree[] trChildren, int indent, boolean parentLabelNull, boolean onlyLabelValue, List<LabeledScoredTreeNode> phrases) {

+	    boolean firstSibling = true;

+	    boolean leftSibIsPreTerm = true;  // counts as true at beginning

+	    for (PTTree currentTree : trChildren) {

+	      currentTree.navigate(indent, parentLabelNull, firstSibling, leftSibIsPreTerm, false, onlyLabelValue, phrases);

+	      leftSibIsPreTerm = currentTree.isPreTerminal();

+	      // CC is a special case for English, but leave it in so we can exactly match PTB3 tree formatting

+	      if (currentTree.value() != null && currentTree.value().startsWith("CC")) {

+	        leftSibIsPreTerm = false;

+	      }

+	      firstSibling = false;

+	    }

+	  }

+	

+	/**

+	   * navigate parse tree

+	   */

+	  private void navigate(int indent, boolean parentLabelNull, boolean firstSibling, boolean leftSiblingPreTerminal, boolean topLevel, boolean onlyLabelValue, List<LabeledScoredTreeNode> phrases) {

+	    // the condition for staying on the same line in Penn Treebank

+	    boolean suppressIndent = (parentLabelNull || (firstSibling && isPreTerminal()) || (leftSiblingPreTerminal && isPreTerminal() && (label() == null || !label().value().startsWith("CC"))));

+	    if (suppressIndent) {

+	      //pw.print(" ");

+	      // pw.flush();

+	    } else {

+	      if (!topLevel) {

+	        //pw.println();

+	      }

+	      for (int i = 0; i < indent; i++) {

+	        //pw.print("  ");

+	        // pw.flush();

+	      }

+	    }

+	    if (isLeaf() || isPreTerminal()) {

+	      String terminalString = toStringBuilder(new StringBuilder(), onlyLabelValue).toString();

+	      //pw.print(terminalString);

+	      //pw.flush();

+	      return;

+	    }

+	    //pw.print("(");

+	    String nodeString = onlyLabelValue ? value() : nodeString();

+	    //pw.print(nodeString);

+	    // pw.flush();

+	    boolean parentIsNull = label() == null || label().value() == null;

+	    navigateChildren(children(), indent + 1, parentIsNull, true, phrases);

+	    //pw.print(")");

+	    

+	  }

+

+}

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/Pair.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/Pair.java
new file mode 100644
index 0000000..850e1ee
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/Pair.java
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.parse_thicket;
+
+import java.util.Comparator;
+
/**
 * Generic pair class for holding two objects. Often used as return object.
 *
 * @author Albert-Jan de Vries
 *
 * @param <T1>
 * @param <T2>
 */
public class Pair<T1, T2> {
  private T1 first;

  private T2 second;

  public Pair() {

  }

  public Pair(T1 first, T2 second) {
    this.first = first;
    this.second = second;
  }

  public T1 getFirst() {
    return first;
  }

  public void setFirst(T1 first) {
    this.first = first;
  }

  public T2 getSecond() {
    return second;
  }

  public void setSecond(T2 second) {
    this.second = second;
  }

  /**
   * Orders pairs by their second element in DESCENDING order when both
   * seconds are Float; otherwise returns the sentinel -2.
   * NOTE(review): the -2 fallback violates the Comparator contract for
   * non-Float pairs - kept for backward compatibility.
   */
  public class PairComparable implements Comparator<Pair<T1, T2>> {
    public int compare(Pair<T1, T2> o1, Pair<T1, T2> o2) {
      int b = -2;
      if (o1.second instanceof Float && o2.second instanceof Float) {
        // Fix: the previous code compared boxed Floats with '==', which is
        // reference identity, so equal values almost never compared as 0.
        // Float.compare(o2, o1) gives the intended descending order:
        // -1 when o1 > o2, 0 when equal, 1 when o1 < o2.
        b = Float.compare((Float) o2.second, (Float) o1.second);
      }
      return b;
    }
  }

  /** Space-separated rendering of both elements; NPE if either is null (unchanged). */
  public String toString(){
	  return this.first.toString()+" "+this.second.toString();
  }

}
+
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseCorefsBuilder.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseCorefsBuilder.java
new file mode 100644
index 0000000..10e9683
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseCorefsBuilder.java
@@ -0,0 +1,191 @@
+package opennlp.tools.parse_thicket;
+
+import java.io.*;
+import java.util.*;
+
+import opennlp.tools.parse_thicket.communicative_actions.CommunicativeActionsArcBuilder;
+
+import edu.stanford.nlp.dcoref.CorefChain;
+import edu.stanford.nlp.dcoref.CorefCoreAnnotations;
+import edu.stanford.nlp.dcoref.CorefChain.CorefMention;
+import edu.stanford.nlp.ling.*;
+import edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation;
+import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
+import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
+import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
+import edu.stanford.nlp.pipeline.*;
+import edu.stanford.nlp.trees.*;
+import edu.stanford.nlp.util.*;
+
+public class ParseCorefsBuilder {
+	protected static ParseCorefsBuilder instance;
+	private Annotation annotation;
+	StanfordCoreNLP pipeline;
+	CommunicativeActionsArcBuilder caFinder = new CommunicativeActionsArcBuilder();
+	
+	  /**
+	   * singleton method of instantiating the processor
+	   * 
+	   * @return the instance
+	   */
+	  public synchronized static ParseCorefsBuilder getInstance() {
+	    if (instance == null)
+	      instance = new ParseCorefsBuilder();
+
+	    return instance;
+	  }
+	
+	ParseCorefsBuilder(){
+		Properties props = new Properties();
+		props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
+		pipeline = new StanfordCoreNLP(props);
+	}
+	
+	public ParseThicket buildParseThicket(String text){
+		List<Tree> ptTrees = new ArrayList<Tree>();
+		// all numbering from 1, not 0
+		List<WordWordInterSentenceRelationArc> arcs = new ArrayList<WordWordInterSentenceRelationArc>();
+		List<List<ParseTreeNode>> nodesThicket = new ArrayList<List<ParseTreeNode>>();
+		
+		annotation = new Annotation(text);
+		try {
+			pipeline.annotate(annotation);
+			List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
+			if (sentences != null && sentences.size() > 0) 
+			for(CoreMap sentence: sentences){
+				List<ParseTreeNode> nodes = new ArrayList<ParseTreeNode>();
+				
+				// traversing the words in the current sentence
+			    // a CoreLabel is a CoreMap with additional token-specific methods
+				Class<TokensAnnotation> tokenAnn = TokensAnnotation.class;
+				List<CoreLabel> coreLabelList = sentence.get(tokenAnn);
+				int count=1;
+			    for (CoreLabel token: coreLabelList ) {
+			      // this is the text of the token
+			      String lemma = token.get(TextAnnotation.class);
+			      // this is the POS tag of the token
+			      String pos = token.get(PartOfSpeechAnnotation.class);
+			      // this is the NER label of the token
+			      String ne = token.get(NamedEntityTagAnnotation.class);     
+			      nodes.add(new ParseTreeNode(lemma, pos, ne, count));
+			      count++;
+			    }	
+			    nodesThicket.add(nodes);
+			  Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);
+			  ptTrees.add(tree);
+			}
+		} catch (Exception e) {
+			e.printStackTrace();
+		}
+	    
+	  
+	    // now coreferences
+	    Map<Integer, CorefChain> corefs = annotation.get(CorefCoreAnnotations.CorefChainAnnotation.class);
+	    List<CorefChain> chains = new ArrayList<CorefChain>(corefs.values());
+	    for(CorefChain c: chains){
+	      //System.out.println(c);
+	      List<CorefMention> mentions = c.getMentionsInTextualOrder();
+	      //System.out.println(mentions);
+	      if (mentions.size()>1)
+	      for(int i=0; i<mentions.size(); i++){
+	    	  for(int j=i+1; j<mentions.size(); j++){
+	    	  CorefMention mi = mentions.get(i), mj=mentions.get(j);
+	    	  
+	    	  
+	    	  int niSentence = mi.position.get(0);
+	    	  int niWord = mi.startIndex;
+	    	  int njSentence = mj.position.get(0);
+	    	  int njWord = mj.startIndex;
+	    	  
+	    	  ArcType arcType = new ArcType("coref-", mj.mentionType+"-"+mj.animacy, 0, 0);
+	    	  
+	    	  WordWordInterSentenceRelationArc arc = 
+	    			  new WordWordInterSentenceRelationArc(new Pair<Integer, Integer>(niSentence,niWord), 
+	    					  new Pair<Integer, Integer>(njSentence,njWord), mi.mentionSpan, mj.mentionSpan, 
+	    					  arcType);
+	    	  arcs.add(arc);
+	    	  
+	    	  /*
+	    	  System.out.println("animacy = "+m.animacy);
+	    	  System.out.println("mention span = "+m.mentionSpan);
+	    	  System.out.println(" id = "+m.mentionID);
+	    	  System.out.println(" position = "+m.position);
+	    	  System.out.println(" start index = "+m.startIndex);
+	    	  System.out.println(" end index = "+m.endIndex);   
+	    	  System.out.println(" mentionType = "+m.mentionType);   
+	    	  System.out.println(" number =  = "+m.number);  
+	    	  */
+	    	  }
+	      }
+	      
+	      
+	    }
+	    List<WordWordInterSentenceRelationArc> arcsCA = buildCAarcs(nodesThicket);
+	    
+	    ParseThicket result = new ParseThicket(ptTrees, arcs);
+	    result.setNodesThicket(nodesThicket);
+	    return result;
+	}
+
+  private List<WordWordInterSentenceRelationArc> buildCAarcs(
+			List<List<ParseTreeNode>> nodesThicket) {
+	  List<WordWordInterSentenceRelationArc> arcs = new ArrayList<WordWordInterSentenceRelationArc>();
+	  
+		for(int sentI=0; sentI<nodesThicket.size(); sentI++){
+			for(int sentJ=sentI+1; sentJ<nodesThicket.size(); sentJ++){
+				List<ParseTreeNode> sentenceI = nodesThicket.get(sentI), 
+						sentenceJ = nodesThicket.get(sentJ);
+				Pair<String, Integer[]> caI = caFinder.findCAInSentence(sentenceI);
+				Pair<String, Integer[]> caJ = caFinder.findCAInSentence(sentenceJ);
+				int indexCA1 = caFinder.findCAIndexInSentence(sentenceI);
+				int indexCA2 = caFinder.findCAIndexInSentence(sentenceJ);
+				if (caI==null || caJ==null)
+					continue;
+				Pair<String, Integer[]> caGen = caFinder.generalize(caI, caJ).get(0);
+				
+				ArcType arcType = new ArcType("ca", 
+						caGen.getFirst().toString()+printNumArray(caGen.getSecond()), 0, 0);
+				 WordWordInterSentenceRelationArc arc = 
+		    			  new WordWordInterSentenceRelationArc(new Pair<Integer, Integer>(sentI,indexCA1), 
+		    					  new Pair<Integer, Integer>(sentJ,indexCA2), caI.getFirst(), caJ.getFirst(), 
+		    					  arcType);
+		    	  arcs.add(arc);
+				
+			}
+					}
+		
+		return arcs;
+	}
+  
+    private String printNumArray(Integer[] arr){
+    	StringBuffer buf = new StringBuffer();
+    	for(Integer i: arr){
+    		buf.append(Integer.toString(i)+ " ");
+    	}
+    	return buf.toString();
+    }
+
+public static void main(String[] args) throws IOException {
+	  ParseCorefsBuilder builder = ParseCorefsBuilder.getInstance();
+	  ParseThicket  th = builder.buildParseThicket("Iran refuses to accept the UN proposal to end its dispute over its work on nuclear weapons."+
+    		  "UN nuclear watchdog passes a resolution condemning Iran for developing its second uranium enrichment site in secret. " +
+    		  "A recent IAEA report presented diagrams that suggested Iran was secretly working on nuclear weapons. " +
+    		  "Iran envoy says its nuclear development is for peaceful purpose, and the material evidence against it has been fabricated by the US. ");
+    //GraphFromPTreeBuilder gbuilder = new GraphFromPTreeBuilder();
+    //gbuilder.buildGraphFromPT(th);
+	 
+  }
+
+}
+
+/*
+ * [<sent=1-word=1..Iran> ===> <sent=3-word=9..Iran>, <sent=1-word=1..Iran> ===> <sent=4-word=1..Iran>, <sent=1-word=1..Iran> ===> <sent=4-word=4..its>, <sent=1-word=1..Iran> ===> <sent=4-word=17..it>, <sent=3-word=9..Iran> ===> <sent=4-word=1..Iran>, <sent=3-word=9..Iran> ===> <sent=4-word=4..its>, <sent=3-word=9..Iran> ===> <sent=4-word=17..it>, <sent=4-word=1..Iran> ===> <sent=4-word=4..its>, <sent=4-word=1..Iran> ===> <sent=4-word=17..it>, <sent=4-word=4..its> ===> <sent=4-word=17..it>, <sent=1-word=6..UN> ===> <sent=2-word=1..UN>, <sent=1-word=5..the UN proposal> ===> <sent=1-word=10..its>, <sent=1-word=5..the UN proposal> ===> <sent=1-word=13..its>, <sent=1-word=10..its> ===> <sent=1-word=13..its>, <sent=1-word=16..nuclear weapons> ===> <sent=3-word=14..nuclear weapons>, <sent=2-word=1..UN nuclear watchdog> ===> <sent=2-word=11..its>]
+
+[[[1]Iran:NNP>LOCATION, [2]refuses:VBZ>O, [3]to:TO>O, [4]accept:VB>O, [5]the:DT>O, [6]UN:NNP>ORGANIZATION, [7]proposal:NN>O, [8]to:TO>O, [9]end:VB>O, [10]its:PRP$>O, [11]dispute:NN>O, [12]over:IN>O, [13]its:PRP$>O, [14]work:NN>O, [15]on:IN>O, [16]nuclear:JJ>O, [17]weapons:NNS>O, [18].:.>O], 
+
+[[1]UN:NNP>ORGANIZATION, [2]nuclear:JJ>O, [3]watchdog:NN>O, [4]passes:VBZ>O, [5]a:DT>O, [6]resolution:NN>O, [7]condemning:VBG>O, [8]Iran:NNP>LOCATION, [9]for:IN>O, [10]developing:VBG>O, [11]its:PRP$>O, [12]second:JJ>ORDINAL, [13]uranium:NN>O, [14]enrichment:NN>O, [15]site:NN>O, [16]in:IN>O, [17]secret:NN>O, [18].:.>O], 
+
+[[1]A:DT>O, [2]recent:JJ>O, [3]IAEA:NNP>ORGANIZATION, [4]report:NN>O, [5]presented:VBD>O, [6]diagrams:NNS>O, [7]that:WDT>O, [8]suggested:VBD>O, [9]Iran:NNP>LOCATION, [10]was:VBD>O, [11]secretly:RB>O, [12]working:VBG>O, [13]on:IN>O, [14]nuclear:JJ>O, [15]weapons:NNS>O, [16].:.>O], 
+
+[[1]Iran:NNP>LOCATION, [2]envoy:NN>O, [3]says:VBZ>O, [4]its:PRP$>O, [5]nuclear:JJ>O, [6]development:NN>O, [7]is:VBZ>O, [8]for:IN>O, [9]peaceful:JJ>O, [10]purpose:NN>O, [11],:,>O, [12]and:CC>O, [13]the:DT>O, [14]material:NN>O, [15]evidence:NN>O, [16]against:IN>O, [17]it:PRP>O, [18]has:VBZ>O, [19]been:VBN>O, [20]fabricated:VBN>O, [21]by:IN>O, [22]the:DT>O, [23]US:NNP>LOCATION, [24].:.>O]]
+*/
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseThicket.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseThicket.java
new file mode 100644
index 0000000..e584d1e
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseThicket.java
@@ -0,0 +1,59 @@
+package opennlp.tools.parse_thicket;

+

+import java.util.List;

+

+import edu.stanford.nlp.trees.Tree;

+

+public class ParseThicket {

+	// parse trees 

+	private List<Tree> sentenceTrees;

+	// there should be an arc for each sentence

+	private List<WordWordInterSentenceRelationArc> arcs;

+	// lists of nodes for each sentence

+	// then list for all sentences

+	private List<List<ParseTreeNode>> sentenceNodes;

+	

+	public List<Tree> getSentences() {

+		return sentenceTrees;

+	}

+

+	public void setSentences(List<Tree> sentences) {

+		this.sentenceTrees = sentences;

+	}

+

+	public List<WordWordInterSentenceRelationArc> getArcs() {

+		return arcs;

+	}

+

+	public void setArcs(List<WordWordInterSentenceRelationArc> arcs) {

+		this.arcs = arcs;

+	}

+

+	public List<List<ParseTreeNode>> getNodesThicket() {

+		return sentenceNodes;

+	}

+

+	public void setNodesThicket(List<List<ParseTreeNode>> nodesThicket) {

+		this.sentenceNodes = nodesThicket;

+	}

+

+	public ParseThicket(String paragraph){

+		ParseCorefsBuilder builder = ParseCorefsBuilder.getInstance();

+		ParseThicket res = builder.buildParseThicket(paragraph);

+		this.sentenceTrees= res.sentenceTrees;

+		this.arcs = res.arcs;		

+	}

+

+	public ParseThicket(List<Tree> ptTrees,

+			List<WordWordInterSentenceRelationArc> barcs) {

+		this.sentenceTrees= ptTrees;

+		this.arcs = barcs;				

+	}

+	

+	public String toString(){

+		return this.sentenceTrees+"\n"+this.arcs;

+	}

+	

+	

+	

+}

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseTreeNode.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseTreeNode.java
new file mode 100644
index 0000000..528eb4d
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseTreeNode.java
@@ -0,0 +1,153 @@
+package opennlp.tools.parse_thicket;

+

+import java.util.ArrayList;

+import java.util.List;

+

+public class ParseTreeNode implements IGeneralizer<ParseTreeNode>{

+	String word;

+    // this is the POS tag of the token

+    String pos; 

+    // this is the NER label of the token

+    String ne; 

+    Integer id;

+    //PhraseType 

+    String phraseType;

+    

+    public enum PhraseType {NP("NP"), VP("VP"), PRP("PRP");

+    	private PhraseType(final String text) {

+        this.text = text;

+    	}

+        private final String text;

+    

+    }

+    

+	public ParseTreeNode(String word, String pos, String ne, Integer id) {

+		super();

+		this.word = word;

+		this.pos = pos;

+		this.ne = ne;

+		this.id = id;

+	}

+	

+	public ParseTreeNode(String word, String pos) {

+		super();

+		this.word = word;

+		this.pos = pos;

+		this.ne = ne;

+		this.id = id;

+	}

+	

+	public String getPhraseType() {

+		return phraseType;

+	}

+	public void setPhraseType(String pt) {

+		this.phraseType=pt;

+	}

+	public String getWord() {

+		return word;

+	}

+	public void setWord(String word) {

+		this.word = word;

+	}

+	public String getPos() {

+		return pos;

+	}

+	public void setPos(String pos) {

+		this.pos = pos;

+	}

+	public String getNe() {

+		return ne;

+	}

+	public void setNe(String ne) {

+		this.ne = ne;

+	}

+	public Integer getId() {

+		return id;

+	}

+	public void setId(Integer id) {

+		this.id = id;

+	} 

+    

+	public String toString(){

+		StringBuffer buf = new StringBuffer();

+		if (id!=null)

+			buf.append("<"+id+">");

+		if(phraseType!=null)

+			buf.append(phraseType);

+		if(word!=null)

+			buf.append("'"+word+"'");

+		if (pos!=null)

+			buf.append(":"+pos);

+		return buf.toString();

+	}

+

+	@Override

+	public List<ParseTreeNode> generalize(Object o1, Object o2) {

+		List<ParseTreeNode> result = new ArrayList<ParseTreeNode>();

+		

+		ParseTreeNode w1 = (ParseTreeNode) o1;

+		ParseTreeNode w2 = (ParseTreeNode) o2;

+		String posGen =  generalizePOS(w1.pos, w2.pos);

+		if (posGen ==null)

+			return result;

+		ParseTreeNode newNode = new ParseTreeNode(generalizeWord(w1.word, w2.word),

+				posGen, "O", -1);

+		result.add(newNode);

+		return result;

+	}

+	

+	public String generalizeWord(String lemma1, String lemma2){

+		if (lemma1.equals(lemma2))

+			return lemma1;

+		if (lemma1.equals("*"))

+			return "*";

+		if (lemma2.equals("*"))

+			return "*";

+		//TODO

+		return "*";

+		

+	}

+	

+	public String generalizePOS(String pos1, String pos2) {

+	    if ((pos1.startsWith("NN") && pos2.equals("NP") || pos2.startsWith("NN")

+	        && pos1.equals("NP"))) {

+	      return "NN";

+	    }

+	    if ((pos1.startsWith("NN") && pos2.equals("VBG") || pos2.startsWith("VBG")

+	        && pos1.equals("NN"))) {

+	      return "NN";

+	    }

+

+	    if ((pos1.startsWith("NN") && pos2.equals("ADJP") || pos2.startsWith("NN")

+	        && pos1.equals("ADJP"))) {

+	      return "NN";

+	    }

+	    if ((pos1.equals("IN") && pos2.equals("TO") || pos1.equals("TO")

+	        && pos2.equals("IN"))) {

+	      return "IN";

+	    }

+	    // VBx vs VBx = VB (does not matter which form for verb)

+	    if (pos1.startsWith("VB") && pos2.startsWith("VB")) {

+	      return "VB";

+	    }

+

+	    // ABx vs ABy always gives AB

+	    if (pos1.equalsIgnoreCase(pos2)) {

+	      return pos1;

+	    }

+	    if (pos1.length() > 2) {

+	      pos1 = pos1.substring(0, 2);

+	    }

+

+	    if (pos2.length() > 2) {

+	      pos2 = pos2.substring(0, 2);

+	    }

+	    if (pos1.equalsIgnoreCase(pos2)) {

+	      return pos1 + "*";

+	    }

+	    return null;

+	  }

+

+	

+};

+

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/Triple.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/Triple.java
new file mode 100644
index 0000000..f4a8176
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/Triple.java
@@ -0,0 +1,49 @@
+package opennlp.tools.parse_thicket;

+

+import java.util.Comparator;

+

+

/** Simple generic 3-tuple holder, analogous to Pair. */
public class Triple<T1, T2, T3> {
	private T1 first;
	private T2 second;
	private T3 third;

	/** Creates an empty triple; populate via the setters. */
	public Triple() {
	}

	/** Creates a fully populated triple. */
	public Triple(T1 first, T2 second, T3 third) {
		super();
		this.first = first;
		this.second = second;
		this.third = third;
	}

	public T1 getFirst() {
		return first;
	}

	public void setFirst(T1 first) {
		this.first = first;
	}

	public T2 getSecond() {
		return second;
	}

	public void setSecond(T2 second) {
		this.second = second;
	}

	public T3 getThird() {
		return third;
	}

	public void setThird(T3 third) {
		this.third = third;
	}
}
\ No newline at end of file
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/WordWordInterSentenceRelationArc.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/WordWordInterSentenceRelationArc.java
new file mode 100644
index 0000000..db7905d
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/WordWordInterSentenceRelationArc.java
@@ -0,0 +1,68 @@
+package opennlp.tools.parse_thicket;

+

+public class WordWordInterSentenceRelationArc {

+	

+	

+		Pair<Integer, Integer> codeFrom;

+		Pair<Integer, Integer> codeTo;

+		String lemmaFrom;

+		String lemmaTo;

+		ArcType arcType;

+		

+		public Pair<Integer, Integer> getCodeFrom() {

+			return codeFrom;

+		}

+

+		public void setCodeFrom(Pair<Integer, Integer> codeFrom) {

+			this.codeFrom = codeFrom;

+		}

+

+		public Pair<Integer, Integer> getCodeTo() {

+			return codeTo;

+		}

+

+		public void setCodeTo(Pair<Integer, Integer> codeTo) {

+			this.codeTo = codeTo;

+		}

+

+		public String getLemmaFrom() {

+			return lemmaFrom;

+		}

+

+		public void setLemmaFrom(String lemmaFrom) {

+			this.lemmaFrom = lemmaFrom;

+		}

+

+		public String getLemmaTo() {

+			return lemmaTo;

+		}

+

+		public void setLemmaTo(String lemmaTo) {

+			this.lemmaTo = lemmaTo;

+		}

+

+		public ArcType getArcType() {

+			return arcType;

+		}

+

+		public void setArcType(ArcType arcType) {

+			this.arcType = arcType;

+		}

+

+		public WordWordInterSentenceRelationArc(

+				Pair<Integer, Integer> codeFrom, Pair<Integer, Integer> codeTo,

+				String lemmaFrom, String lemmaTo, ArcType arcType) {

+			super();

+			this.codeFrom = codeFrom;

+			this.codeTo = codeTo;

+			this.lemmaFrom = lemmaFrom;

+			this.lemmaTo = lemmaTo;

+			this.arcType = arcType;

+		}

+	

+		public String toString(){

+			return "<sent="+codeFrom.getFirst()+"-word="+codeFrom.getSecond()+".."+lemmaFrom+"> ===> "+

+					"<sent="+codeTo.getFirst()+"-word="+codeTo.getSecond()+".."+lemmaTo+">";

+		}

+

+}

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/BingQueryRunnerMultipageSearchResults.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/BingQueryRunnerMultipageSearchResults.java
new file mode 100644
index 0000000..09e371a
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/BingQueryRunnerMultipageSearchResults.java
@@ -0,0 +1,72 @@
+/*

+ * Licensed to the Apache Software Foundation (ASF) under one or more

+ * contributor license agreements.  See the NOTICE file distributed with

+ * this work for additional information regarding copyright ownership.

+ * The ASF licenses this file to You under the Apache License, Version 2.0

+ * (the "License"); you may not use this file except in compliance with

+ * the License. You may obtain a copy of the License at

+ *

+ *     http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+

+package opennlp.tools.parse_thicket.apps;

+

+import java.io.BufferedReader;

+import java.io.InputStreamReader;

+import java.net.URL;

+import java.net.URLConnection;

+import java.net.URLEncoder;

+import java.util.ArrayList;

+import java.util.List;

+import java.util.logging.Logger;

+

+import net.billylieurance.azuresearch.AzureSearchResultSet;

+import net.billylieurance.azuresearch.AzureSearchWebQuery;

+import net.billylieurance.azuresearch.AzureSearchWebResult;

+

+import opennlp.tools.similarity.apps.BingQueryRunner;

+import opennlp.tools.similarity.apps.HitBase;

+

+import org.apache.commons.lang.StringUtils;

+import org.json.JSONArray;

+import org.json.JSONObject;

+

+

+public class BingQueryRunnerMultipageSearchResults extends BingQueryRunner {

+	

+	private static String BING_KEY = "e8ADxIjn9YyHx36EihdjH/tMqJJItUrrbPTUpKahiU0=";

+	private static final Logger LOG = Logger

+		      .getLogger("opennlp.tools.similarity.apps.BingQueryRunnerMultipageSearchResults");

+	private AzureSearchWebQuery aq = new AzureSearchWebQuery();

+

+	public List<HitBase> runSearch(String query, int nRes, boolean bHighRank) {

+		aq.setAppid(BING_KEY);

+		aq.setQuery(query);		  		

+		aq.doQuery();

+		if (!bHighRank)

+			aq.setPage(5);

+		aq.setPerPage(nRes);

+		

+		List<HitBase> results = new ArrayList<HitBase> ();

+		AzureSearchResultSet<AzureSearchWebResult> ars = aq.getQueryResult();

+		

+		for (AzureSearchWebResult anr : ars){

+		    HitBase h = new HitBase();

+		    h.setAbstractText(anr.getDescription());

+		    h.setTitle(anr.getTitle());

+		    h.setUrl(anr.getUrl());

+		    results.add(h);

+		}

+		return results;

+	}

+	

+	

+

+

+}

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/MinedSentenceProcessor.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/MinedSentenceProcessor.java
new file mode 100644
index 0000000..4f0512b
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/MinedSentenceProcessor.java
@@ -0,0 +1,210 @@
+/*

+ * Licensed to the Apache Software Foundation (ASF) under one or more

+ * contributor license agreements.  See the NOTICE file distributed with

+ * this work for additional information regarding copyright ownership.

+ * The ASF licenses this file to You under the Apache License, Version 2.0

+ * (the "License"); you may not use this file except in compliance with

+ * the License. You may obtain a copy of the License at

+ *

+ *     http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+

+package opennlp.tools.parse_thicket.apps;

+

+import java.util.Arrays;

+import java.util.List;

+

+import opennlp.tools.similarity.apps.utils.Utils;

+

+import org.apache.commons.lang.StringUtils;

+

+public class MinedSentenceProcessor {

+  public static String acceptableMinedSentence(String sent) {

+    // if too many commas => seo text

+

+    String[] commas = StringUtils.split(sent, ',');

+    String[] spaces = StringUtils.split(sent, ' ');

+    if ((float) commas.length / (float) spaces.length > 0.7) {

+      System.out.println("Rejection: too many commas");

+      return null;

+    }

+    

+    String[] otherDelimiters = StringUtils.split(sent, '/');

+    if ((float) otherDelimiters.length / (float) spaces.length > 0.7) {

+        System.out.println("Rejection: too many delimiters");

+        return null;

+    }

+    

+    otherDelimiters = StringUtils.split(sent, '.');

+    if ((float) otherDelimiters.length / (float) spaces.length > 0.7) {

+        System.out.println("Rejection: too many delimiters");

+        return null;

+    }

+    otherDelimiters = StringUtils.split(sent, '!');

+    if ((float) otherDelimiters.length / (float) spaces.length > 0.7) {

+        System.out.println("Rejection: too many delimiters");

+        return null;

+    }

+    otherDelimiters = StringUtils.split(sent, '=');

+    if ((float) otherDelimiters.length / (float) spaces.length > 0.7) {

+        System.out.println("Rejection: too many delimiters");

+        return null;

+    }

+    

+    String[] pipes = StringUtils.split(sent, '|');

+    if (StringUtils.split(sent, '|').length > 2

+        || StringUtils.split(sent, '>').length > 2) {

+      System.out.println("Rejection: too many |s or >s ");

+      return null;

+    }

+    String sentTry = sent.toLowerCase();

+    // if too many long spaces

+    String sentSpaces = sentTry.replace("   ", "");

+    if (sentSpaces.length() - sentTry.length() > 10) // too many spaces -

+      // suspicious

+      return null;

+

+    if (sentTry.indexOf("click here") > -1 || sentTry.indexOf(" wikip") > -1

+        || sentTry.indexOf("copyright") > -1

+        || sentTry.indexOf("operating hours") > -1

+        || sentTry.indexOf("days per week") > -1

+        || sentTry.indexOf("click for") > -1 || sentTry.indexOf("photos") > -1

+        || sentTry.indexOf("find the latest") > -1

+        || sentTry.startsWith("subscribe")

+        || sentTry.indexOf("Terms of Service") > -1

+        || sentTry.indexOf("clicking here") > -1

+        || sentTry.indexOf("skip to") > -1 || sentTry.indexOf("sidebar") > -1

+        || sentTry.indexOf("Tags:") > -1 || sentTry.startsWith("Posted by")

+        || sentTry.indexOf("available online") > -1

+        || sentTry.indexOf("get online") > -1

+        || sentTry.indexOf("buy online") > -1

+        || sentTry.indexOf("not valid") > -1 || sentTry.indexOf("discount") > -1

+        || sentTry.indexOf("official site") > -1

+        || sentTry.indexOf("this video") > -1

+        || sentTry.indexOf("this book") > -1

+        || sentTry.indexOf("this product") > -1

+        || sentTry.indexOf("paperback") > -1 || sentTry.indexOf("hardcover") > -1

+        || sentTry.indexOf("audio cd") > -1

+        || sentTry.indexOf("related searches") > -1

+        || sentTry.indexOf("permission is granted") > -1

+        || sentTry.indexOf("[edit") > -1

+        || sentTry.indexOf("edit categories") > -1

+        || sentTry.indexOf("free license") > -1

+        || sentTry.indexOf("permission is granted") > -1

+        || sentTry.indexOf("under the terms") > -1

+        || sentTry.indexOf("rights reserved") > -1

+        || sentTry.indexOf("wikipedia") > -1 || sentTry.endsWith("the")

+        || sentTry.endsWith("the.") || sentTry.startsWith("below") 

+        || sentTry.indexOf("recipient of")>-1 || sentTry.indexOf("this message")>-1 

+        ||sentTry.indexOf( "mailing list")>-1 ||sentTry.indexOf( "purchase order")>-1

+        ||sentTry.indexOf( "mon-fri")>-1 ||sentTry.indexOf( "email us")>-1 ||sentTry.indexOf( "privacy pol")>-1 ||sentTry.indexOf( "back to top")>-1 

+        ||sentTry.indexOf( "click here")>-1 ||sentTry.indexOf( "for details")>-1 ||sentTry.indexOf( "assistance?")>-1 ||sentTry.indexOf( "chat live")>-1

+        ||sentTry.indexOf( "free shipping")>-1 ||sentTry.indexOf( "company info")>-1 ||sentTry.indexOf( "satisfaction g")>-1 ||sentTry.indexOf( "contact us")>-1

+        ||sentTry.startsWith( "fax") ||sentTry.startsWith( "write") || sentTry.startsWith( "email")||sentTry.indexOf( "conditions")>-1 ||sentTry.indexOf( "chat live")>-1

+        ||sentTry.startsWith( "we ") ||sentTry.indexOf( "the recipient")>-1 ||sentTry.indexOf( "day return")>-1 ||sentTry.indexOf( "days return")>-1

+        

+        ||sentTry.startsWith( "fax") ||sentTry.indexOf( "refund it")>-1 || sentTry.indexOf( "your money")>-1

+        ||sentTry.startsWith( "free") ||sentTry.indexOf( "purchase orders")>-1

+        ||sentTry.startsWith( "exchange it ") ||sentTry.indexOf( "return it")>-1 ||sentTry.indexOf( "credit card")>-1 

+        

+        ||sentTry.indexOf( "storeshop")>-1 || sentTry.startsWith( "find") || sentTry.startsWith( "shop") || sentTry.startsWith( "unlimited") 

+        ||sentTry.indexOf( "for a limited time")>-1 ||sentTry.indexOf( "prime members")>-1 ||sentTry.indexOf( "amazon members")>-1 ||sentTry.indexOf( "unlimited free")>-1 

+        ||sentTry.indexOf( "shipping")>-1 || sentTry.startsWith( "amazon")

+// not a script text

+        ||sentTry.indexOf( "document.body")>-1 ||sentTry.indexOf( " var ")>-1         ||sentTry.indexOf( "search suggestions")>-1 ||sentTry.startsWith( "Search") 

+        

+    		)

+      return null;

+    

+    //Millions of Amazon Prime members enjoy instant videos, free Kindle books and unlimited free two-day shipping.

+

+    // count symbols indicating wrong parts of page to mine for text

+    // if short and contains too many symbols indicating wrong area: reject

+    String sentWrongSym = sentTry.replace(">", "&&&").replace("�", "&&&")

+        .replace("|", "&&&").replace(":", "&&&").replace("/", "&&&")

+        .replace("-", "&&&").replace("%", "&&&");

+    if ((sentWrongSym.length() - sentTry.length()) >= 4

+        && sentTry.length() < 200) // twice ot more

+      return null;

+

+    sent = sent.replace('[', ' ').replace(']', ' ')

+        .replace("_should_find_orig_", "").replace(".   .", ". ")

+        .replace("amp;", " ").replace("1.", " ").replace("2.", " ")

+        .replace("3.", " ").replace("4.", " ").replace("2009", "2011")

+        .replace("2008", "2011").replace("2006", "2011")

+        .replace("2007", "2011").replace("VIDEO:", " ").replace("Video:", " ")

+        .replace("no comments", " ").replace("  ", " ").replace("  ", " ")

+        .replace("(more.)", "").replace("more.", "").replace("<more>", "")

+        .replace("[more]", "").replace(".,", ".").replace("&lt;", "")

+        .replace("p&gt;", "").replace("product description", "");

+

+    // TODO .replace("a.", ".");

+

+    int endIndex = sent.indexOf(" posted");

+    if (endIndex > 0)

+      sent = sent.substring(0, endIndex);

+

+    return sent;

+  }

+

+  public static String processSentence(String pageSentence) {

+    if (pageSentence == null)

+      return "";

+    pageSentence = Utils.fullStripHTML(pageSentence);

+    pageSentence = StringUtils.chomp(pageSentence, "..");

+    pageSentence = StringUtils.chomp(pageSentence, ". .");

+    pageSentence = StringUtils.chomp(pageSentence, " .");

+    pageSentence = StringUtils.chomp(pageSentence, ".");

+    pageSentence = StringUtils.chomp(pageSentence, "...");

+    pageSentence = StringUtils.chomp(pageSentence, " ....");

+    pageSentence = pageSentence.replace("::", ":").replace(".,", ". ")

+        .replace("(.)", "");

+

+    pageSentence = pageSentence.trim();

+    pageSentence = pageSentence.replaceAll("\\s+", " "); // make single

+    // spaces

+    // everywhere

+

+    String[] pipes = StringUtils.split(pageSentence, '|'); // removed

+    // shorter part

+    // of sentence

+    // at the end

+    // after pipe

+    if (pipes.length == 2

+        && ((float) pipes[0].length() / (float) pipes[1].length() > 3.0)) {

+      int pipePos = pageSentence.indexOf("|");

+      if (pipePos > -1)

+        pageSentence = pageSentence.substring(0, pipePos - 1).trim();

+

+    }

+

+    if (!StringUtils.contains(pageSentence, '.')

+        && !StringUtils.contains(pageSentence, '?')

+        && !StringUtils.contains(pageSentence, '!'))

+      pageSentence = pageSentence + ". ";

+

+    pageSentence = pageSentence.replace(" .", ".").replace("..", ".").trim();

+    if (!pageSentence.endsWith("."))

+      pageSentence += ". ";

+    return pageSentence;

+  }

+

+  

+  public static String normalizeForSentenceSplitting(String pageContent) {

+    pageContent.replace("Jan.", "January").replace("Feb.", "February")

+        .replace("Mar.", "March").replace("Apr.", "April")

+        .replace("Jun.", "June").replace("Jul.", "July")

+        .replace("Aug.", "August").replace("Sep.", "September")

+        .replace("Oct.", "October").replace("Nov.", "November")

+        .replace("Dec.", "December");

+

+    return pageContent;

+

+  }

+}
\ No newline at end of file
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/MostFrequentWordsFromPageGetter.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/MostFrequentWordsFromPageGetter.java
new file mode 100644
index 0000000..b106ac9
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/MostFrequentWordsFromPageGetter.java
@@ -0,0 +1,70 @@
+/*

+ * Licensed to the Apache Software Foundation (ASF) under one or more

+ * contributor license agreements.  See the NOTICE file distributed with

+ * this work for additional information regarding copyright ownership.

+ * The ASF licenses this file to You under the Apache License, Version 2.0

+ * (the "License"); you may not use this file except in compliance with

+ * the License. You may obtain a copy of the License at

+ *

+ *     http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+package opennlp.tools.parse_thicket.apps;

+

+import java.util.ArrayList;

+import java.util.HashMap;

+import java.util.List;

+import java.util.Map;

+import java.util.Map.Entry;

+import java.util.Scanner;

+import java.util.TreeMap;

+

+import org.apache.commons.lang.StringUtils;

+

+import opennlp.tools.similarity.apps.utils.ValueSortMap;

+

+public class MostFrequentWordsFromPageGetter {

+	

+	public List<String> getMostFrequentWordsInText(String input)

+	{

+		int maxRes = 4;

+		Scanner in = new Scanner(input);

+        in.useDelimiter("\\s+");

+        Map<String, Integer> words = 

+                new HashMap<String, Integer>();

+        

+        while (in.hasNext()) {

+            String word = in.next();

+            if (!StringUtils.isAlpha(word) || word.length()<4 )

+            	continue;

+            

+            if (!words.containsKey(word)) {

+                words.put(word, 1);

+            } else {

+                words.put(word, words.get(word) + 1);

+            }

+        }

+        

+        words = ValueSortMap.sortMapByValue(words, false);

+        List<String> results = new ArrayList<String>(words.keySet());

+		

+		if (results.size() > maxRes )

+			results = results.subList(0, maxRes); // get maxRes elements

+       

+        return results;

+    }

+	public List<String> getMostFrequentWordsInTextArr(String[] longestSents) {

+		StringBuffer buffer = new StringBuffer();

+		for(String s: longestSents){

+			buffer.append(s);

+		}

+		

+		return getMostFrequentWordsInText(buffer.toString());

+	}

+	

+}

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/MultiSentenceSearchResultsProcessor.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/MultiSentenceSearchResultsProcessor.java
new file mode 100644
index 0000000..ce4b600
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/MultiSentenceSearchResultsProcessor.java
@@ -0,0 +1,184 @@
+/*

+ * Licensed to the Apache Software Foundation (ASF) under one or more

+ * contributor license agreements.  See the NOTICE file distributed with

+ * this work for additional information regarding copyright ownership.

+ * The ASF licenses this file to You under the Apache License, Version 2.0

+ * (the "License"); you may not use this file except in compliance with

+ * the License. You may obtain a copy of the License at

+ *

+ *     http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+package opennlp.tools.parse_thicket.apps;

+

+import java.util.ArrayList;

+import java.util.Collections;

+import java.util.List;

+import java.util.logging.Logger;

+

+

+import opennlp.tools.jsmlearning.ProfileReaderWriter;

+import opennlp.tools.parse_thicket.matching.Matcher;

+import opennlp.tools.similarity.apps.BingQueryRunner;

+import opennlp.tools.similarity.apps.HitBase;

+import opennlp.tools.similarity.apps.HitBaseComparable;

+import opennlp.tools.similarity.apps.WebSearchEngineResultsScraper;

+import opennlp.tools.textsimilarity.ParseTreeChunk;

+import opennlp.tools.textsimilarity.ParseTreeChunkListScorer;

+import opennlp.tools.textsimilarity.SentencePairMatchResult;

+import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;

+

+public class MultiSentenceSearchResultsProcessor  {

+	private static Logger LOG = Logger

+			.getLogger("opennlp.tools.similarity.apps.SearchResultsProcessor");

+

+	private WebSearchEngineResultsScraper scraper = new WebSearchEngineResultsScraper();

+	private Matcher matcher = new Matcher();

+	private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();

+	private BingQueryRunner bingSearcher = new BingQueryRunner();

+	private SnippetToParagraph snp = new SnippetToParagraph();

+

+	protected static final int NUM_OF_SEARCH_RESULTS = 10;

+

+	/*

+	 * Takes a search engine API (or scraped) search results and calculates the parse tree similarity

+	 * between the question and each snippet. Ranks those snippets with higher

+	 * similarity score up

+	 */

+

+

+	protected List<HitBase> calculateMatchScoreResortHits(List<HitBase> hits,

+			String searchQuery) {

+

+		List<HitBase> newHitList = new ArrayList<HitBase>();

+		int count = 0;

+		for (HitBase hit : hits) {

+			if (count>10)

+				break;

+			count++;

+			String[] pageSentsAndSnippet = formTextForReRankingFromHit(hit);

+					

+			Double score = 0.0;

+			try {

+				List<List<ParseTreeChunk>> match = null;

+				if (pageSentsAndSnippet!=null && pageSentsAndSnippet[0].length()>50){

+					match = matcher.assessRelevanceCache(pageSentsAndSnippet[0] ,

+							searchQuery);

+					score = parseTreeChunkListScorer.getParseTreeChunkListScore(match);

+					hit.setSource(match.toString());

+				}

+				if (score < 2){ // attempt to match with snippet, if not much luck with original text

+					match = matcher.assessRelevanceCache(pageSentsAndSnippet[0] ,

+							searchQuery);

+					score = parseTreeChunkListScorer.getParseTreeChunkListScore(match);

+				}

+				LOG.info(score + " | " +pageSentsAndSnippet[1]);

+			} catch (Exception e) {

+				LOG.severe("Problem processing snapshot " + pageSentsAndSnippet[1]);

+				e.printStackTrace();

+			}

+			hit.setGenerWithQueryScore(score);

+			newHitList.add(hit);

+		}

+		

+		System.out.println("\n\n ============= old ORDER ================= ");

+		for (HitBase hit : newHitList) {

+			System.out.println(hit.getOriginalSentences().toString() + " => "+hit.getGenerWithQueryScore());

+			System.out.println("match = "+hit.getSource());

+		}

+		Collections.sort(newHitList, new HitBaseComparable());

+

+		System.out.println("\n\n ============= NEW ORDER ================= ");

+		for (HitBase hit : newHitList) {

+			System.out.println(hit.getOriginalSentences().toString() + " => "+hit.getGenerWithQueryScore());

+			System.out.println("match = "+hit.getSource());

+		}

+

+		return newHitList;

+	}

+

+	protected String[] formTextForReRankingFromHit(HitBase hit) {

+		HitBase hitWithFullSents = snp.formTextFromOriginalPageGivenSnippet(hit);

+		String textFromOriginalPage = "";

+		try {

+			List<String> sents = hitWithFullSents.getOriginalSentences();

+			for(String s: sents){

+				textFromOriginalPage+=s+" ";

+			}

+

+			if (textFromOriginalPage.startsWith(".")){

+				textFromOriginalPage = textFromOriginalPage.substring(2);

+			}

+			textFromOriginalPage = textFromOriginalPage.replace(" . .", ". ").replace(". . ", ". ").

+					replace("..", ". ").trim();

+		} catch (Exception e1) {

+			e1.printStackTrace();

+			LOG.info("Problem processing snapshot "+hit.getAbstractText());

+		}

+		hit.setPageContent(textFromOriginalPage);

+		String snapshot = hit.getAbstractText().replace("<b>...</b>", ". ").replace("<span class='best-phrase'>", " ").replace("<span>", " ").replace("<span>", " ")

+				.replace("<b>", "").replace("</b>", "");

+		snapshot = snapshot.replace("</B>", "").replace("<B>", "")

+				.replace("<br>", "").replace("</br>", "").replace("...", ". ")

+				.replace("|", " ").replace(">", " ").replace(". .", ". ");

+		snapshot += " . " + hit.getTitle();

+		

+		return new String[] { textFromOriginalPage, snapshot };

+	}

+

+	public void close() {

+		// TODO

+		// matcher.close();

+	}

+

+	public List<HitBase> runSearch(String query) {

+

+

+		List<HitBase> hits = scraper.runSearch(query);

+		hits = calculateMatchScoreResortHits(hits, query);

+		return hits;

+	}

+

+

+	public List<HitBase> runSearchViaAPI(String query) {

+		List<String[]> reportData = new ArrayList<String[]>(); 

+		reportData.add(new String[]{query});

+		List<HitBase> hits = null;

+		try {

+			List<HitBase> resultList = bingSearcher.runSearch(query, NUM_OF_SEARCH_RESULTS);

+			reportData.add(convertListHitBaseIntoStringAr(resultList));

+			

+			// now we apply our own relevance filter

+			hits = calculateMatchScoreResortHits(resultList, query);

+			reportData.add(convertListHitBaseIntoStringAr(resultList));

+		} catch (Exception e) {

+			e.printStackTrace();

+			LOG.info("No search results for query '" + query);

+			return null;

+		}

+		ProfileReaderWriter.writeReport(reportData, "resultsForQuery_"+query.replace(' ', '_')+".csv");

+		return hits;

+	}

+	

+	private String[] convertListHitBaseIntoStringAr(List<HitBase> list){

+		List<String> results = new  ArrayList<String>(); 

+		for(HitBase h: list ){

+			results.add(h.getTitle()+ " | "+h.getAbstractText());

+		}

+		return results.toArray(new String[0]);

+	}

+

+	public static void main(String[] args){

+		String query = " I see no meaningful distinction between complacency or complicity in the military's latest failure to uphold their own " +

+				"standards of conduct. Nor do I see a distinction between the service member who orchestrated this offense and the chain of " +

+				"command that was either oblivious to or tolerant of criminal behavior";

+

+		new MultiSentenceSearchResultsProcessor().runSearchViaAPI(query);

+	}

+

+}

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/SnippetToParagraph.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/SnippetToParagraph.java
new file mode 100644
index 0000000..dd7eaf7
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/SnippetToParagraph.java
@@ -0,0 +1,382 @@
+/*

+ * Licensed to the Apache Software Foundation (ASF) under one or more

+ * contributor license agreements.  See the NOTICE file distributed with

+ * this work for additional information regarding copyright ownership.

+ * The ASF licenses this file to You under the Apache License, Version 2.0

+ * (the "License"); you may not use this file except in compliance with

+ * the License. You may obtain a copy of the License at

+ *

+ *     http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+package opennlp.tools.parse_thicket.apps;

+

+import java.util.ArrayList;

+import java.util.Arrays;

+import java.util.Collections;

+import java.util.Comparator;

+import java.util.List;

+import java.util.logging.Logger;

+

+import org.apache.commons.lang.StringUtils;

+

+

+import opennlp.tools.similarity.apps.ContentGeneratorSupport;

+import opennlp.tools.similarity.apps.Fragment;

+import opennlp.tools.similarity.apps.GeneratedSentenceProcessor;

+import opennlp.tools.similarity.apps.HitBase;

+import opennlp.tools.similarity.apps.RelatedSentenceFinder;

+import opennlp.tools.similarity.apps.utils.PageFetcher;

+import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer;

+import opennlp.tools.similarity.apps.utils.Utils;

+import opennlp.tools.textsimilarity.TextProcessor;

+

+

/**
 * Reconstructs full paragraphs from search-engine snippets: given a hit whose
 * abstract is a "..."-elided snippet, downloads the original page, splits it
 * into sentences and tries to locate the complete source sentence for each
 * snippet fragment. Heavily heuristic string processing throughout.
 */
public class SnippetToParagraph extends ContentGeneratorSupport /*RelatedSentenceFinder */{
	private PageFetcher pFetcher = new PageFetcher();
	private static Logger LOG = Logger
			.getLogger("com.become.parse_thicket.apps.SnippetToParagraph");

	/**
	 * Expands a hit's snippet into accepted {@link Fragment}s. Fragments whose
	 * snippet text contained "..." are completed from the downloaded page;
	 * others are taken verbatim. Accepted fragments are stored on the hit.
	 *
	 * @param item search hit whose abstract/snippet should be expanded
	 * @return the same hit, with fragments (and page content) populated
	 */
	public HitBase formTextFromOriginalPageGivenSnippetDirect(HitBase item) {

		// put orig sentence in structure
		List<String> origs = new ArrayList<String>();

		item.setOriginalSentences(origs);
		String title = item.getTitle().replace("<b>", " ").replace("</b>", " ")
				.replace("  ", " ").replace("  ", " ");
		// generation results for this sentence
		List<Fragment> result = new ArrayList<Fragment>();
		// form plain text from snippet
		String snapshot = item.getAbstractText().replace("<b>", " ")
				.replace("</b>", " ").replace("  ", " ").replace("  ", " ");

		// Mark each ellipsis so fragments that need completion from the original
		// page can be recognized after sentence splitting.
		String snapshotMarked = snapshot.replace("...",
				" _should_find_orig_ . _should_find_orig_");
		List<String> fragments = TextProcessor.splitToSentences(snapshotMarked);
		List<String> allFragms = new ArrayList<String>();
		allFragms.addAll(fragments);

		List<String> sents = new ArrayList<String>();
		String downloadedPage;
		try {
			// Only download the page when the snippet actually contained "..."
			// (marking changed the string length).
			if (snapshotMarked.length() != snapshot.length()) {
				downloadedPage = pFetcher.fetchPage(item.getUrl());
				if (downloadedPage != null && downloadedPage.length() > 100) {
					item.setPageContent(downloadedPage);
					String pageContent = Utils.fullStripHTML(item.getPageContent());
					pageContent = GeneratedSentenceProcessor
							.normalizeForSentenceSplitting(pageContent);
					pageContent = pageContent.trim().replaceAll("  [A-Z]", ". $0")// .replace("  ",
							// ". ")
							.replace("..", ".").replace(". . .", " ").trim(); // sometimes
					// html breaks
					// are converted
					// into ' ' (two
					// spaces), so
					// we need to
					// put '.'
					sents = TextProcessor.splitToSentences(pageContent);

				}
			}
		} catch (Exception e) {
			// TODO Auto-generated catch block
			// e.printStackTrace();
			System.err
			.println("Problem downloading  the page and splitting into sentences");
			return item;
		}

		for (String fragment : allFragms) {
			String followSent = null;
			if (fragment.length() < 50)
				continue;
			String pageSentence = "";
			// try to find original sentence from webpage
			if (fragment.indexOf("_should_find_orig_") > -1 && sents != null
					&& sents.size() > 0)
				try {
					String[] mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(
							fragment.replace("_should_find_orig_", ""), (String[])sents.toArray(new String[]{}));
					pageSentence = mainAndFollowSent[0];
					followSent = mainAndFollowSent[1];

				} catch (Exception e) {

					// TODO Auto-generated catch block
					e.printStackTrace();
				}
			else
				// or get original snippet
				pageSentence = fragment;
			if (pageSentence != null)
				pageSentence = pageSentence.replace("_should_find_orig_", "");
			// Reject boilerplate/ads; null means rejected.
			String pageSentenceProc = GeneratedSentenceProcessor
					.acceptableMinedSentence(pageSentence);
			if (pageSentenceProc != null) {
				pageSentenceProc = GeneratedSentenceProcessor
						.processSentence(pageSentenceProc);
				if (followSent != null) {
					pageSentenceProc += " "
							+ GeneratedSentenceProcessor.processSentence(followSent);
				}

				pageSentenceProc = Utils.convertToASCII(pageSentenceProc);
				Fragment f = new Fragment(pageSentenceProc, 1);
				f.setSourceURL(item.getUrl());
				f.fragment = fragment;
				result.add(f);
				System.out.println("Accepted sentence: " + pageSentenceProc
						+ "| with title= " + title);
				System.out.println("For fragment = " + fragment);
			} else
				System.out
				.println("Rejected sentence due to wrong area at webpage: "
						+ pageSentence);
		} 


		item.setFragments(result);
		return item;
	}

	/**
	 * Variant that always downloads the page first and stores matched sentences
	 * as the hit's original sentences (rather than as Fragments). Fragments
	 * that cannot be matched on the page are kept as-is from the snippet.
	 *
	 * @param item search hit to expand
	 * @return the same hit, with originalSentences populated
	 */
	public HitBase formTextFromOriginalPageGivenSnippet(HitBase item) {

		String[] sents = extractSentencesFromPage(item.getUrl());

		String title = item.getTitle().replace("<b>", " ").replace("</b>", " ")
				.replace("  ", " ").replace("  ", " ");
		// generation results for this sentence
		List<String> result = new ArrayList<String>();
		// form plain text from snippet
		String snapshot = item.getAbstractText().replace("<b>", " ")
				.replace("</b>", " ").replace("  ", " ").replace("  ", " ").replace("\"", "");

		String snapshotMarked = snapshot.replace(" ...", ".");
		List<String> fragments = TextProcessor.splitToSentences(snapshotMarked);
		// Fallback: if the splitter found too few sentences but dots exist,
		// split manually on the dots.
		if (fragments.size() < 3 && StringUtils.countMatches(snapshotMarked, ".") > 1) {
			snapshotMarked = snapshotMarked.replace("..", "&").replace(".", "&");
			String[] fragmSents = snapshotMarked.split("&");
			fragments = Arrays.asList(fragmSents);
		}

		for (String f : fragments) {
			String followSent = null;
			if (f.length() < 50)
				continue;
			String pageSentence = "";
			// try to find original sentence from webpage

			try {
				String[] mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(
						f, sents);
				pageSentence = mainAndFollowSent[0];
				followSent = mainAndFollowSent[1];
				if (pageSentence != null)
					result.add(pageSentence);
				else {
					result.add(f);
					LOG.info("Could not find the original sentence \n" + f + "\n in the page ");
				}
				//if (followSent !=null)
				//	result.add(followSent);
			} catch (Exception e) {

				e.printStackTrace();
			}
		}
		item.setOriginalSentences(result);
		return item;
	}

	/** Drops null and very short sentences (trimmed length &lt; 30). */
	public  List<String> cleanListOfSents(List<String> sents) {
		List<String> sentsClean = new ArrayList<String>();
		for (String s : sents) {
			if (s == null || s.trim().length() < 30 || s.length() < 20)
				continue;
			sentsClean.add(s);
		}
		return sentsClean;
	}

	/**
	 * Removes near-duplicate strings (string-distance similarity &gt; 0.7),
	 * keeping the earlier occurrence. Errors are swallowed deliberately:
	 * dedup is best-effort.
	 */
	private String[] removeDuplicates(String[] hits)
	{
		StringDistanceMeasurer meas = new StringDistanceMeasurer();

		List<Integer> idsToRemove = new ArrayList<Integer>();
		List<String> hitsDedup = new ArrayList<String>();
		try
		{
			for (int i = 0; i < hits.length; i++)
				for (int j = i + 1; j < hits.length; j++)
				{
					String title1 = hits[i];
					String title2 = hits[j];
					if (StringUtils.isEmpty(title1) || StringUtils.isEmpty(title2))
						continue;
					if (meas.measureStringDistance(title1, title2) > 0.7)
					{
						idsToRemove.add(j); // dupes found, later list member to
						// be deleted
					}
				}
			for (int i = 0; i < hits.length; i++)
				if (!idsToRemove.contains(i))
					hitsDedup.add(hits[i]);
			if (hitsDedup.size() < hits.length)
			{
				// idsToRemove is non-empty here, so get(0) is safe.
				System.out.println("Removed duplicates from relevant search results, including "
						+ hits[idsToRemove.get(0)]);
			}
		}
		catch (Exception e)
		{
			System.out.println("Problem removing duplicates from relevant images");
		}

		return hitsDedup.toArray(new String[0]);

	}

	/**
	 * Downloads a page and returns up to 100 of its longest cleaned sentence
	 * candidates, or {@code null} when the page cannot be fetched.
	 *
	 * NOTE(review): the result array may contain trailing nulls when the page
	 * yields fewer sentences than maxSentsFromPage; cleanSplitListOfSents
	 * filters nulls downstream.
	 */
	public String[] extractSentencesFromPage(String url)
	{

		int maxSentsFromPage= 100;
		List<String[]> results = new ArrayList<String[]>();

		String downloadedPage = pFetcher.fetchPage(url, 20000);
		if (downloadedPage == null || downloadedPage.length() < 100)
		{
			return null;
		}

		// NOTE(review): pageOrigHTML is fetched but never used below — dead
		// network round-trip; confirm before removing.
		String pageOrigHTML = pFetcher.fetchOrigHTML(url);

		// Treat runs of many spaces as block separators: long spaces -> '&',
		// then collapse runs of '&' into a single '#' and split on it.
		downloadedPage= downloadedPage.replace("     ", "&");
		downloadedPage = downloadedPage.replaceAll("(?:&)+", "#");
		String[] sents = downloadedPage.split("#");
		List<TextChunk> sentsList = new ArrayList<TextChunk>();
		for(String s: sents){
			s = s.trim().replace("  ", ". ").replace("..", ".").replace(". . .", " ")
					.replace(": ", ". ").replace("- ", ". ").
					replace (". .",".").trim();
			sentsList.add(new TextChunk(s, s.length()));
		}

		// Sort ascending by length, then copy the longest maxSentsFromPage
		// chunks (the tail of the sorted list).
		Collections.sort(sentsList, new TextChunkComparable());
		String[] longestSents = new String[maxSentsFromPage];
		int j=0;
		int initIndex = sentsList.size()-1 -maxSentsFromPage;
		if (initIndex<0)
			initIndex = 0;
		for(int i=initIndex; i< sentsList.size() && j<maxSentsFromPage ; i++){
			longestSents[j] = sentsList.get(i).text;
			j++;
		}

		sents = cleanSplitListOfSents(longestSents);

		//sents = removeDuplicates(sents);
		//sents = verifyEnforceStartsUpperCase(sents);

		return sents;
	}
	
	/**
	 * Filters raw page chunks: drops boilerplate (via
	 * GeneratedSentenceProcessor), chunks with too-short average sentence
	 * length or too-short average word length, then re-splits survivors into
	 * individual sentences and converts them to ASCII.
	 */
	protected String[] cleanSplitListOfSents(String[] longestSents){
	float minFragmentLength = 40, minFragmentLengthSpace=4;

	List<String> sentsClean = new ArrayList<String>();
	for (String sentenceOrMultSent : longestSents)
	{
		if (sentenceOrMultSent==null || sentenceOrMultSent.length()<20)
			continue;
		if (GeneratedSentenceProcessor.acceptableMinedSentence(sentenceOrMultSent)==null){
			System.out.println("Rejected sentence by GeneratedSentenceProcessor.acceptableMinedSentence = "+sentenceOrMultSent);
			continue;
		}
		// aaa. hhh hhh.  kkk . kkk ll hhh. lll kkk n.
		// Average length of a sentence within this chunk (chars per '.'-part).
		int numOfDots = sentenceOrMultSent.replace('.','&').split("&").length;
		float avgSentenceLengthInTextPortion = (float)sentenceOrMultSent.length() /(float) numOfDots;
		if ( avgSentenceLengthInTextPortion<minFragmentLength)
			continue;
		// o oo o ooo o o o ooo oo ooo o o oo
		// Average word length (chars per space-separated token).
		numOfDots = sentenceOrMultSent.replace(' ','&').split("&").length;
		avgSentenceLengthInTextPortion = (float)sentenceOrMultSent.length() /(float) numOfDots;
		if ( avgSentenceLengthInTextPortion<minFragmentLengthSpace)
			continue;

		List<String> furtherSplit = TextProcessor.splitToSentences(sentenceOrMultSent);
		
		// forced split by ',' somewhere in the middle of sentence
		// disused - Feb 26 13
		//furtherSplit = furtherMakeSentencesShorter(furtherSplit);
		// Drop the last (possibly truncated) sentence of the chunk.
		// NOTE(review): assumes splitToSentences never returns an empty list —
		// remove() would throw otherwise; confirm against TextProcessor.
		furtherSplit.remove(furtherSplit.size()-1);
		for(String s : furtherSplit){
			if (s.indexOf('|')>-1)
				continue;
			s = s.replace("<em>"," ").replace("</em>"," ");
			s = Utils.convertToASCII(s);
			sentsClean.add(s);
		}
	}

	return (String[]) sentsClean.toArray(new String[0]);
}
	/**
	 * Upper-cases the first character of each sentence in place.
	 * NOTE(review): substring(0, 1) throws on an empty string — callers must
	 * not pass empty sentences. Currently only referenced from commented-out
	 * code in extractSentencesFromPage.
	 */
	private String[] verifyEnforceStartsUpperCase(String[] sents) {
		for(int i=0; i<sents.length; i++){
			String s = sents[i];
			s = StringUtils.trim(s);
			String sFirstChar = s.substring(0, 1);
			if (!sFirstChar.toUpperCase().equals(sFirstChar)){
				s = sFirstChar.toUpperCase()+s.substring(1);
			}
			sents[i] = s;
		}
		return sents;
	}

	// Drops shopping-site boilerplate entries from a product-feature list.
	private List<String> cleanProductFeatures(List<String> productFeaturesList) {
		List<String> results = new ArrayList<String>();
		for(String feature: productFeaturesList){
			if (feature.startsWith("Unlimited Free") || feature.startsWith("View Larger") || feature.startsWith("View Larger") || feature.indexOf("shipping")>0)
				continue;
			results.add(feature);
		}
		return results;
	}
	// Simple text + length pair used for sorting page chunks by length.
	public class TextChunk {
		public TextChunk(String s, int length) {
			this.text = s;
			this.len = length;
		}
		public String text;
		public int len;
	}

	// Orders TextChunks ascending by length.
	public class TextChunkComparable implements Comparator<TextChunk>
	{
		public int compare(TextChunk ch1, TextChunk ch2)
		{
			if (ch1.len>ch2.len)
				return 1;
			else if (ch1.len<ch2.len)
				return  -1;
			else return 0;

		}
	}
	
	public static void main(String[] args){
		
	}

}

+

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/WebPageContentSentenceExtractor.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/WebPageContentSentenceExtractor.java
new file mode 100644
index 0000000..038fcfc
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/WebPageContentSentenceExtractor.java
@@ -0,0 +1,147 @@
+/*

+ * Licensed to the Apache Software Foundation (ASF) under one or more

+ * contributor license agreements.  See the NOTICE file distributed with

+ * this work for additional information regarding copyright ownership.

+ * The ASF licenses this file to You under the Apache License, Version 2.0

+ * (the "License"); you may not use this file except in compliance with

+ * the License. You may obtain a copy of the License at

+ *

+ *     http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+package opennlp.tools.parse_thicket.apps;

+

+import java.util.ArrayList;

+import java.util.Arrays;

+import java.util.Collections;

+import java.util.List;

+

+import opennlp.tools.similarity.apps.GeneratedSentenceProcessor;

+import opennlp.tools.similarity.apps.HitBase;

+import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer;

+import opennlp.tools.similarity.apps.utils.Utils;

+import opennlp.tools.textsimilarity.TextProcessor;

+

+import org.apache.commons.lang.StringUtils;

+

+public class WebPageContentSentenceExtractor extends WebPageExtractor {

+	

+	

+	

+

+	public List<String> extractSentencesWithPotentialReviewPhrases(String url)

+	{

+		int maxSentsFromPage = 30;

+		String downloadedPage = pageFetcher.fetchPage(url, 20000);

+		if (downloadedPage == null || downloadedPage.length() < 100)

+		{

+			return null;

+		}

+		downloadedPage= downloadedPage.replace("     ", "&");

+		downloadedPage = downloadedPage.replaceAll("(?:&)+", "#");

+		String[] sents = downloadedPage.split("#");

+		List<TextChunk> sentsList = new ArrayList<TextChunk>();

+		for(String s: sents){

+			s = s.trim().replace("  ", ". ").replace("..", ".").replace(". . .", " ")

+					.replace(": ", ". ").replace("- ", ". ").

+					replace (". .",".").trim();

+			sentsList.add(new TextChunk(s, s.length()));

+		}

+		

+		Collections.sort(sentsList, new TextChunkComparable());

+		

+		String[] longestSents = new String[maxSentsFromPage];

+		int j=0;														// -1 removed

+		for(int i=sentsList.size()-1 -maxSentsFromPage; i< sentsList.size()-1; i++){

+			longestSents[j] = sentsList.get(i).text;

+			j++;

+		}

+

+		sents = cleanListOfSents(longestSents);

+	/*	

+		for(int i = 0; i< sents.length; i++){

+			sents[i] = sents[i].trim().replace("  ", ". ").replace("..", ".").replace(". . .", " ")

+					.replace(": ", ". ").replace("- ", ". ").

+					replace (". .",".").trim();

+		}

+		sents = cleanListOfSents(sents);

+	*/	sents = verifyEnforceStartsUpperCase(sents);

+

+		return Arrays.asList(sents);

+	}

+

+	private String[] verifyEnforceStartsUpperCase(String[] sents) {

+		for(int i=0; i<sents.length; i++){

+			String s = sents[i];

+			s = StringUtils.trim(s);

+			String sFirstChar = s.substring(0, 1);

+			if (!sFirstChar.toUpperCase().equals(sFirstChar)){

+				s = sFirstChar.toUpperCase()+s.substring(1);

+			}

+			sents[i] = s;

+		}

+			return sents;

+	}

+

+	private List<String> cleanProductFeatures(List<String> productFeaturesList) {

+		List<String> results = new ArrayList<String>();

+		for(String feature: productFeaturesList){

+			if (feature.startsWith("Unlimited Free") || feature.startsWith("View Larger") || feature.startsWith("View Larger") || feature.indexOf("shipping")>0)

+				continue;

+			results.add(feature);

+		}

+		return results;

+	}

+

+	// extracts paragraphs from web page

+	protected String[] cleanListOfSents(String[] longestSents)

+	{

+		float minFragmentLength = 40, minFragmentLengthSpace=4;

+

+		List<String> sentsClean = new ArrayList<String>();

+		for (String sentenceOrMultSent : longestSents)

+		{

+			if (GeneratedSentenceProcessor.acceptableMinedSentence(sentenceOrMultSent)==null){

+				System.out.println("Rejected sentence by GeneratedSentenceProcessor.acceptableMinedSentence = "+sentenceOrMultSent);

+				continue;

+			}

+			// aaa. hhh hhh.  kkk . kkk ll hhh. lll kkk n.

+			int numOfDots = sentenceOrMultSent.replace('.','&').split("&").length;

+			float avgSentenceLengthInTextPortion = (float)sentenceOrMultSent.length() /(float) numOfDots;

+			if ( avgSentenceLengthInTextPortion<minFragmentLength)

+				continue;

+			// o oo o ooo o o o ooo oo ooo o o oo

+			numOfDots = sentenceOrMultSent.replace(' ','&').split("&").length;

+			avgSentenceLengthInTextPortion = (float)sentenceOrMultSent.length() /(float) numOfDots;

+			if ( avgSentenceLengthInTextPortion<minFragmentLengthSpace)

+				continue;

+

+			sentsClean.add(sentenceOrMultSent);

+		}

+

+		return (String[]) sentsClean.toArray(new String[0]);

+	}

+

+	

+

+	private String startWithCapitalSent(String sent) {

+		String firstChar = sent.substring(0,1);

+		String remainder = sent.substring(1);

+		

+		return firstChar.toUpperCase()+remainder;

+	}

+

+	public HitBase formTextFromOriginalPageGivenSnippet(HitBase hit) {

+		List<String> results = extractSentencesWithPotentialReviewPhrases(hit.getUrl());

+		hit.setOriginalSentences(results);

+		return hit;

+	}

+

+	

+	

+}

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/WebPageExtractor.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/WebPageExtractor.java
new file mode 100644
index 0000000..b91f5cb
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/WebPageExtractor.java
@@ -0,0 +1,158 @@
+/*

+ * Licensed to the Apache Software Foundation (ASF) under one or more

+ * contributor license agreements.  See the NOTICE file distributed with

+ * this work for additional information regarding copyright ownership.

+ * The ASF licenses this file to You under the Apache License, Version 2.0

+ * (the "License"); you may not use this file except in compliance with

+ * the License. You may obtain a copy of the License at

+ *

+ *     http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+package opennlp.tools.parse_thicket.apps;

+

+import java.util.ArrayList;

+import java.util.Arrays;

+import java.util.Collections;

+import java.util.Comparator;

+import java.util.List;

+

+import org.apache.commons.lang.StringUtils;

+

+import opennlp.tools.similarity.apps.GeneratedSentenceProcessor;

+import opennlp.tools.similarity.apps.utils.PageFetcher;

+import opennlp.tools.textsimilarity.TextProcessor;

+import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;

+

+public class WebPageExtractor

+{

+	protected PageFetcher pageFetcher = new PageFetcher();

+	

+	protected ParserChunker2MatcherProcessor nlProc;

+	protected MostFrequentWordsFromPageGetter mostFrequentWordsFromPageGetter = new MostFrequentWordsFromPageGetter();

+

+	protected static int sentThresholdLength = 70;

+

+	public List<String[]> extractSentencesWithPotentialProductKeywords(String url)

+	{

+		int maxSentsFromPage= 20;

+		List<String[]> results = new ArrayList<String[]>();

+

+		String downloadedPage = pageFetcher.fetchPage(url, 20000);

+		if (downloadedPage == null || downloadedPage.length() < 100)

+		{

+			return null;

+		}

+

+		String pageOrigHTML = pageFetcher.fetchOrigHTML(url);

+		String pageTitle = StringUtils.substringBetween(pageOrigHTML, "<title>", "</title>" );

+		pageTitle = pageTitle.replace("  ", ". ").replace("..", ".").replace(". . .", " ")

+				.replace(": ", ". ").replace("- ", ". ").replace(" |", ". ").

+				replace (". .",".").trim();

+		List<String> pageTitles = new ArrayList<String>();

+		pageTitles.addAll(TextProcessor.splitToSentences(pageTitle));

+		pageTitles.addAll(Arrays.asList(pageTitle.split(".")));

+

+		String[] headerSections = pageOrigHTML.split("<h2");

+		if (headerSections.length<2)

+			headerSections = pageOrigHTML.split("<h3");

+		for(String section: headerSections){

+

+			String header = StringUtils.substringBetween(section, ">", "<");

+			if (header!=null && header.length()>20)

+				pageTitles.add(header);

+		}

+

+

+		downloadedPage= downloadedPage.replace("     ", "&");

+		downloadedPage = downloadedPage.replaceAll("(?:&)+", "#");

+		String[] sents = downloadedPage.split("#");

+		List<TextChunk> sentsList = new ArrayList<TextChunk>();

+		for(String s: sents){

+			s = s.trim().replace("  ", ". ").replace("..", ".").replace(". . .", " ")

+					.replace(": ", ". ").replace("- ", ". ").

+					replace (". .",".").trim();

+			sentsList.add(new TextChunk(s, s.length()));

+		}

+

+		Collections.sort(sentsList, new TextChunkComparable());

+

+

+		String[] longestSents = new String[maxSentsFromPage];

+		int j=0;

+		for(int i=sentsList.size() -maxSentsFromPage; i< sentsList.size(); i++){

+			longestSents[j] = sentsList.get(i).text;

+			j++;

+		}

+

+		sents = cleanListOfSents(longestSents);

+

+		List<String>  mosFrequentWordsListFromPage = mostFrequentWordsFromPageGetter. getMostFrequentWordsInTextArr(sents);

+		// mostFrequentWordsFromPageGetter. getMostFrequentWordsInText(downloadedPage);

+

+		results.add(pageTitles.toArray(new String[0]));

+		results.add(mosFrequentWordsListFromPage.toArray(new String[0]));

+		results.add(sents);

+

+		return results;

+	}

+

+	protected String[] cleanListOfSents(String[] longestSents)

+	{

+		List<String> sentsClean = new ArrayList<String>();

+		for (String sentenceOrMultSent : longestSents)

+		{

+			List<String> furtherSplit = TextProcessor.splitToSentences(sentenceOrMultSent);

+			for(String s : furtherSplit){

+				if (s.replace('.','&').split("&").length>3)

+					continue;

+				if (s.indexOf('|')>-1)

+					continue;

+				if (s == null || s.trim().length() < sentThresholdLength || s.length() < sentThresholdLength + 10)

+					continue;

+				if (GeneratedSentenceProcessor.acceptableMinedSentence(s)==null){

+					System.out.println("Rejected sentence by GeneratedSentenceProcessor.acceptableMinedSentence = "+s);

+					continue;

+				}

+				sentsClean.add(s);

+			}

+		}

+		return (String[]) sentsClean.toArray(new String[0]);

+	}

+

+	public class TextChunk {

+		public TextChunk(String s, int length) {

+			this.text = s;

+			this.len = length;

+		}

+		public String text;

+		public int len;

+	}

+

+	public class TextChunkComparable implements Comparator<TextChunk>

+	{

+		public int compare(TextChunk ch1, TextChunk ch2)

+		{

+			if (ch1.len>ch2.len)

+				return 1;

+			else if (ch1.len<ch2.len)

+				return  -1;

+			else return 0;

+

+		}

+	}

+	

+	public static void main(String[] args){

+		WebPageExtractor extractor = new WebPageExtractor();

+		List<String[]> res = 

+				extractor.extractSentencesWithPotentialProductKeywords("http://www.sitbetter.com/view/chair/ofm-500-l/ofm--high-back-leather-office-chair/");

+		System.out.println(res.get(1));

+		

+	}

+

+}

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/communicative_actions/CommunicativeActionsArcBuilder.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/communicative_actions/CommunicativeActionsArcBuilder.java
new file mode 100644
index 0000000..aea85b5
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/communicative_actions/CommunicativeActionsArcBuilder.java
@@ -0,0 +1,163 @@
+package opennlp.tools.parse_thicket.communicative_actions;

+

+import java.util.ArrayList;

+import java.util.List;

+

+import opennlp.tools.parse_thicket.IGeneralizer;

+import opennlp.tools.parse_thicket.Pair;

+import opennlp.tools.parse_thicket.ParseTreeNode;

+

+

+public class CommunicativeActionsArcBuilder implements IGeneralizer<Pair<String, Integer[]>>{

+

+	private List<Pair<String, Integer[]>> commActionsAttr = new ArrayList<Pair<String, Integer[]>>();

+	public CommunicativeActionsArcBuilder(){

+

+		commActionsAttr.add(new Pair<String, Integer[]>("agree", new Integer[]{	1,	-1,	-1,	1,	-1}));

+		commActionsAttr.add(new Pair<String, Integer[]>("accept", new Integer[]{	1,	-1,	-1,	1,	1}));

+		commActionsAttr.add(new Pair<String, Integer[]>("explain", new Integer[]{	0,	-1,	1,	1,	-1}));

+

+		commActionsAttr.add(new Pair<String, Integer[]>("suggest", new Integer[]{	1,	0,	1,	-1,	-1}));

+		commActionsAttr.add(new Pair<String, Integer[]>("claim", new Integer[]{	1,	0,	1,	-1,	-1}));

+

+		// bring-attention

+		commActionsAttr.add(new Pair<String, Integer[]>("bring_attention", new Integer[]{	1,	1,	1,	1,	1}));

+		commActionsAttr.add(new Pair<String, Integer[]>("remind", new Integer[]{	-1,	0,	1,	1,	1}));

+		commActionsAttr.add(new Pair<String, Integer[]>("allow", new Integer[]{	1,	-1,	-1,	-1,	-1}));

+		commActionsAttr.add(new Pair<String, Integer[]>("try", new Integer[]{	1,	0,	-1,	-1,	-1}));

+		commActionsAttr.add(new Pair<String, Integer[]>("request", new Integer[]{	0,	1,	-1,	1,	1}));

+		commActionsAttr.add(new Pair<String, Integer[]>("understand", new Integer[]{	0,	-1,	-1,	1,	-1}));

+

+		commActionsAttr.add(new Pair<String, Integer[]>("inform", new Integer[]{	0,	0,	1,	1,	-1}));

+		commActionsAttr.add(new Pair<String, Integer[]>("notify", new Integer[]{	0,	0,	1,	1,	-1}));

+		commActionsAttr.add(new Pair<String, Integer[]>("report", new Integer[]{	0,	0,	1,	1,	-1}));

+

+

+		commActionsAttr.add(new Pair<String, Integer[]>("confirm", new Integer[]{	0,	-1,	1,	1,	1}));

+		commActionsAttr.add(new Pair<String, Integer[]>("ask", new Integer[]{	0,	1,	-1,	-1,	-1}));

+		commActionsAttr.add(new Pair<String, Integer[]>("check", new Integer[]{	-1,	1,	-1,	-1,	1}));

+

+		commActionsAttr.add(new Pair<String, Integer[]>("ignore", new Integer[]{	-1,	-1,	-1,	-1,	1}));

+		commActionsAttr.add(new Pair<String, Integer[]>("wait", new Integer[]{	-1,	-1,	-1,	-1,	1}));

+

+		commActionsAttr.add(new Pair<String, Integer[]>("convince", new Integer[]{	0,	1,	1,	1, -1}));

+		commActionsAttr.add(new Pair<String, Integer[]>("disagree", new Integer[]{	-1,	-1,	-1,	1,	-1}));

+		commActionsAttr.add(new Pair<String, Integer[]>("appeal", new Integer[]{	-1,	1,	1,	1,	1}));

+		commActionsAttr.add(new Pair<String, Integer[]>("deny", new Integer[]{	-1,	-1,	-1,	1,	1}));

+		commActionsAttr.add(new Pair<String, Integer[]>("threaten", new Integer[]{	-1,	1, -1,	1,	1}));

+

+		commActionsAttr.add(new Pair<String, Integer[]>("concern", new Integer[]{	1,	-1, -1,	1,	1}));

+		commActionsAttr.add(new Pair<String, Integer[]>("afraid", new Integer[]{	1,	-1, -1,	1,	1}));

+		commActionsAttr.add(new Pair<String, Integer[]>("worri", new Integer[]{	1,	-1, -1,	1,	1}));

+		commActionsAttr.add(new Pair<String, Integer[]>("scare", new Integer[]{	1,	-1, -1,	1,	1}));

+

+		commActionsAttr.add(new Pair<String, Integer[]>("want", new Integer[]{	1,	0,	-1,	-1,	-1}));

+		commActionsAttr.add(new Pair<String, Integer[]>("know", new Integer[]{	0,	-1,	-1,	1,	-1}));

+		commActionsAttr.add(new Pair<String, Integer[]>("believe", new Integer[]{	0,	-1,	-1,	1,	-1}));

+	}

+

+	public Pair<String, Integer[]> findCAInSentence(List<ParseTreeNode> sentence){

+		for(ParseTreeNode node: sentence){

+			for(Pair<String, Integer[]> ca: commActionsAttr){

+				String lemma = (String)ca.getFirst();

+				// canonical form lemma is a sub-string of an actual form in parseTreeNode

+				if (node.getWord().toLowerCase().startsWith(lemma))

+					return ca;

+			}

+		}

+		return null;

+	}

+

+	public int findCAIndexInSentence(List<ParseTreeNode> sentence){

+		for(int index = 1; index< sentence.size(); index++){

+			ParseTreeNode node = sentence.get(index);

+			for(Pair<String, Integer[]> ca: commActionsAttr){

+				String lemma = (String)ca.getFirst();

+				String[] lemmas = lemma.split("_");

+				if (lemmas==null || lemmas.length<2){

+					if (node.getWord().toLowerCase().startsWith(lemma))

+						return index;

+				} else { //multiword matching 

+					for(int indexM= index+1; indexM<sentence.size(); indexM++);//

+				}

+				

+			}

+		}

+		return -1;

+	}

+

+

+	public List<Pair<String, Integer[]>> generalize(Object o1, Object o2) {

+		List<Pair<String, Integer[]>> results = new ArrayList<Pair<String, Integer[]>>();

+

+

+		String ca1 = null, ca2=null;

+

+		if (o1 instanceof String){

+			ca1 = (String)o1;

+			ca2 = (String)o2;

+		} else {			

+			ca1 = ((Pair<String, Integer[]>)o1).getFirst();

+			ca2 = ((Pair<String, Integer[]>)o2).getFirst();

+		}

+

+

+		// find entry for ca1

+		Pair<String, Integer[]> caP1=null, caP2=null;

+		for(Pair<String, Integer[]> ca: commActionsAttr){

+			String lemma = (String)ca.getFirst();

+			if (lemma.equals(ca1)){

+				caP1=ca;

+				break;

+			}					

+		}

+

+		// find entry for ca2

+		for(Pair<String, Integer[]> ca: commActionsAttr){

+			String lemma = (String)ca.getFirst();

+			if (lemma.equals(ca2)){

+				caP2=ca;

+				break;

+			}					

+		}

+

+		if (ca1.equals(ca2)){

+			results.add(caP1);

+		} else {

+			// generalization of int arrays also implements IGeneralizer

+			// we take Integer[] which is a first element of as resultant list

+			Integer[] res = new CommunicativeActionsAttribute().

+					generalize(caP1.getSecond(), caP2.getSecond()).get(0);

+			results.add(new Pair<String, Integer[]>("", res ));

+		}

+

+		return results;

+	}

+

+

+

+

+	/*Pair<String, Integer[]>[] commActionsAttrAr = new Pair<String, Integer[]>[] {

+			new Pair<String, Integer[]>("agree", new Integer[]{	1,	-1,	-1,	1,	-1}),

+			new Pair<String, Integer[]>("accept", new Integer[]{	1,	-1,	-1,	1,	1}),

+			new Pair<String, Integer[]>("explain", new Integer[]{	0,	-1,	1,	1,	-1}),

+			new Pair<String, Integer[]>("suggest", new Integer[]{	1,	0,	1,	-1,	-1}),

+			new Pair<String, Integer[]>("bring attention", new Integer[]{	1,	1,	1,	1,	1}),

+			new Pair<String, Integer[]>("remind", new Integer[]{	-1,	0,	1,	1,	1}),

+		    new Pair<String, Integer[]>("allow", new Integer[]{	1,	-1,	-1,	-1,	-1}),

+			new Pair<String, Integer[]>("try", new Integer[]{	1,	0,	-1,	-1,	-1}),

+			new Pair<String, Integer[]>("request", new Integer[]{	0,	1,	-1,	1,	1}),

+			new Pair<String, Integer[]>("understand", new Integer[]{	0,	-1,	-1,	1,	-1}),

+			new Pair<String, Integer[]>("inform", new Integer[]{	0,	0,	1,	1,	-1}),

+			new Pair<String, Integer[]>("confirm", new Integer[]{	0,	-1,	1,	1,	1}),

+			new Pair<String, Integer[]>("ask", new Integer[]{	0,	1,	-1,	-1,	-1}),

+			new Pair<String, Integer[]>("check", new Integer[]{	-1,	1,	-1,	-1,	1}),

+			new Pair<String, Integer[]>("ignore", new Integer[]{	-1,	-1,	-1,	-1,	1}),

+			new Pair<String, Integer[]>("convince", new Integer[]{	0,	1,	1,	1, -1}),

+			new Pair<String, Integer[]>("disagree", new Integer[]{	-1,	-1,	-1,	1,	-1}),

+			new Pair<String, Integer[]>("appeal", new Integer[]{	-1,	1,	1,	1,	1}),

+			new Pair<String, Integer[]>("deny", new Integer[]{	-1,	-1,	-1,	1,	1}),

+			new Pair<String, Integer[]>("threaten", new Integer[]{	-1,	1, -1,	1,	1}),	

+	} */

+

+}

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/communicative_actions/CommunicativeActionsAttribute.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/communicative_actions/CommunicativeActionsAttribute.java
new file mode 100644
index 0000000..24bda54
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/communicative_actions/CommunicativeActionsAttribute.java
@@ -0,0 +1,29 @@
+package opennlp.tools.parse_thicket.communicative_actions;

+

+import java.util.ArrayList;

+import java.util.List;

+

+import opennlp.tools.parse_thicket.IGeneralizer;

+

+

+public class CommunicativeActionsAttribute implements IGeneralizer<Integer[]>{

+

+	public List<Integer[]> generalize(Object intArr1ob, Object intArr2ob) {

+		Integer[] arr1 = (Integer[])intArr1ob, arr2 = (Integer[])intArr2ob;

+		Integer[] result = new Integer[arr2.length];

+		for(int i=0; i< arr2.length; i++ ){

+			if (arr1[i].equals(arr2[i]))

+				result[i] = arr1[i];

+			else if ((arr1[i]<0 && arr2[i]>0) || (arr1[i]>0 && arr2[i]<0)){

+				result[i]=0;

+			} else if (arr1[i]==0)

+				result[i]=arr2[i];

+			else if (arr2[i]==0)

+				result[i]=arr1[i];

+		}

+		List<Integer[]> results = new ArrayList<Integer[]>();

+		results.add(result);

+		return results;

+	}

+

+}

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/MultiSentenceExtendedForestSearchResultsProcessorSetFormer.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/MultiSentenceExtendedForestSearchResultsProcessorSetFormer.java
new file mode 100644
index 0000000..eb67724
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/MultiSentenceExtendedForestSearchResultsProcessorSetFormer.java
@@ -0,0 +1,155 @@
+/*

+ * Licensed to the Apache Software Foundation (ASF) under one or more

+ * contributor license agreements.  See the NOTICE file distributed with

+ * this work for additional information regarding copyright ownership.

+ * The ASF licenses this file to You under the Apache License, Version 2.0

+ * (the "License"); you may not use this file except in compliance with

+ * the License. You may obtain a copy of the License at

+ *

+ *     http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+package opennlp.tools.parse_thicket.kernel_interface;

+

+import java.io.File;

+import java.util.ArrayList;

+import java.util.Collections;

+import java.util.List;

+import java.util.logging.Logger;

+

+import edu.stanford.nlp.trees.Tree;

+import edu.stanford.nlp.util.StringUtils;

+

+

+import opennlp.tools.jsmlearning.ProfileReaderWriter;

+import opennlp.tools.parse_thicket.ParseThicket;

+import opennlp.tools.parse_thicket.apps.MultiSentenceSearchResultsProcessor;

+import opennlp.tools.parse_thicket.apps.SnippetToParagraph;

+import opennlp.tools.parse_thicket.apps.WebPageContentSentenceExtractor;

+import opennlp.tools.parse_thicket.matching.Matcher;

+import opennlp.tools.similarity.apps.BingQueryRunner;

+import opennlp.tools.similarity.apps.HitBase;

+import opennlp.tools.similarity.apps.HitBaseComparable;

+import opennlp.tools.similarity.apps.WebSearchEngineResultsScraper;

+import opennlp.tools.textsimilarity.ParseTreeChunk;

+import opennlp.tools.textsimilarity.ParseTreeChunkListScorer;

+import opennlp.tools.textsimilarity.SentencePairMatchResult;

+import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;

+

/**
 * Forms a tree-kernel training set (SVM-light-TK style " |BT| ... |ET|" lines)
 * from Bing search results: each hit's original page is fetched, split into
 * paragraphs, parsed into parse thickets, and every sentence tree is appended
 * to the training file under the configured kernel path, labeled positive or
 * negative according to the isPositive flag.
 */
public class MultiSentenceExtendedForestSearchResultsProcessorSetFormer  extends MultiSentenceKernelBasedSearchResultsProcessor{
	private static Logger LOG = Logger
			.getLogger("opennlp.tools.similarity.apps.MultiSentenceKernelBasedExtendedForestSearchResultsProcessor");
	protected TreeExtenderByAnotherLinkedTree treeExtender = new TreeExtenderByAnotherLinkedTree();
	
	private TreeKernelRunner tkRunner = new TreeKernelRunner();
	
	// file names used by the external tree-kernel learner, relative to 'path'
	protected static final String modelFileName = "model.txt";

	private static final String trainingFileName = "training.txt";

	protected static final String unknownToBeClassified = "unknown.txt";

	private static final String classifierOutput = "classifier_output.txt";
	
	// directory holding the tree-kernel data files; callers set it via setKernelPath
	private String path;
	public void setKernelPath (String path){
		this.path=path;
	}
	
	WebPageContentSentenceExtractor extractor = new WebPageContentSentenceExtractor();
	
	/**
	 * For each hit whose page content is not already set, fetches the original
	 * page, converts each extracted paragraph into tree-kernel samples and
	 * appends them directly to the training file.
	 *
	 * @param hits search results to process
	 * @param query the query that produced the hits (currently unused here)
	 * @param isPositive label applied to every sample produced from these hits
	 * @return the hits, unchanged and in the original order
	 */
	private List<HitBase> formTreeForestDataSet(
			List<HitBase> hits, String query, boolean isPositive) {
		List<HitBase> newHitList = new ArrayList<HitBase>(), newHitListReRanked = new ArrayList<HitBase>();
		// form the training set from original documents. Since search results are ranked, we set the first half as positive set,
		//and the second half as negative set.
		// after re-classification, being re-ranked, the search results might end up in a different set
		List<String[]> treeBankBuffer = new ArrayList<String[]>();
		int count = 0;
		for (HitBase hit : hits) {
			count++;
			// if orig content has been already set in HIT object, ok; otherwise set it
			String searchResultText = hit.getPageContent();
			if (searchResultText ==null){
				try {
					HitBase hitWithFullSents = extractor.formTextFromOriginalPageGivenSnippet(hit);
					for(String paragraph: hitWithFullSents.getOriginalSentences()){
						List<String[]> res = formTreeKernelStructure(paragraph, count, hits,  isPositive);
						// samples are written straight to the training file, one per line
						for(String[] rl : res){
							StringUtils.printToFile(new File(path+trainingFileName), rl[0]+" \n", true);
						}
						//treeBankBuffer.addAll(res);
					}
				} catch (Exception e) {
					// best-effort: a hit whose page cannot be fetched/parsed is skipped
					e.printStackTrace();
				}
				
			}			
			newHitList.add(hit);
			
			
		}	
		// write the list of samples to a file
		// NOTE(review): treeBankBuffer is never filled (the addAll above is
		// commented out), so this appendReport call writes no samples — the
		// real output happens via printToFile in the loop; confirm intent
		ProfileReaderWriter.appendReport(treeBankBuffer, path+trainingFileName, ' ');
		return newHitList;

	}
	
	/**
	 * Parses a text fragment into a parse thicket and renders each sentence
	 * tree as one tree-kernel sample line, labeled by isPositive.
	 * The 'count' and 'hits' parameters are unused in this override.
	 */
	protected List<String[]> formTreeKernelStructure(String searchResultText, int count, List<HitBase> hits, boolean isPositive) {
		List<String[]> treeBankBuffer = new ArrayList<String[]> ();
		try {
			// get the parses from original documents, and form the training dataset
			ParseThicket pt = matcher.buildParseThicketFromTextWithRST(searchResultText);
			List<Tree> forest = pt.getSentences();
			// if from the first half or ranked docs, then positive, otherwise negative
			String posOrNeg = null;
			if (isPositive)
				posOrNeg=" 1 ";
			else 
				posOrNeg=" -1 ";
			// form the list of training samples
			for(Tree t: forest){
				treeBankBuffer.add(new String[] {posOrNeg+" |BT| "+t.toString()+ " |ET|"});
			}
		} catch (Exception e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		}
		return treeBankBuffer;
	}
	
	/**
	 * Runs a Bing search for the query and feeds the results into the training
	 * set former.
	 * NOTE(review): always returns null (both on success and on failure) —
	 * callers must not rely on the return value; confirm this is intended.
	 */
	public List<HitBase> runSearchViaAPI(String query, Boolean isPositive) {
		
		try {
			List<HitBase> hits = bingSearcher.runSearch(query, 20, true);
			formTreeForestDataSet(hits, query, isPositive);

		} catch (Exception e) {
			e.printStackTrace();
			LOG.info("No search results for query '" + query);
			return null;
		}


		return null;
	}

	// CLI entry point: optional args are the query and "neg..." to label samples negative
	public static void main(String[] args){
		String query = "digital camera for my mother as a gift";
		Boolean isPositive = true;
		if (args!=null && args.length>0){
			query = args[0];
			if (args.length>1 && args[1]!=null && args[1].startsWith("neg"))
				isPositive = false;
		}
		
		MultiSentenceExtendedForestSearchResultsProcessorSetFormer proc = new MultiSentenceExtendedForestSearchResultsProcessorSetFormer();
		proc.setKernelPath("C:\\stanford-corenlp\\tree_kernel_big\\");
		proc.runSearchViaAPI(query, isPositive);
	}

}

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/MultiSentenceKernelBasedExtendedForestSearchResultsProcessor.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/MultiSentenceKernelBasedExtendedForestSearchResultsProcessor.java
new file mode 100644
index 0000000..1b2790f
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/MultiSentenceKernelBasedExtendedForestSearchResultsProcessor.java
@@ -0,0 +1,92 @@
+/*

+ * Licensed to the Apache Software Foundation (ASF) under one or more

+ * contributor license agreements.  See the NOTICE file distributed with

+ * this work for additional information regarding copyright ownership.

+ * The ASF licenses this file to You under the Apache License, Version 2.0

+ * (the "License"); you may not use this file except in compliance with

+ * the License. You may obtain a copy of the License at

+ *

+ *     http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+package opennlp.tools.parse_thicket.kernel_interface;

+

+import java.util.ArrayList;

+import java.util.Collections;

+import java.util.List;

+import java.util.logging.Logger;

+

+import edu.stanford.nlp.trees.Tree;

+

+

+import opennlp.tools.jsmlearning.ProfileReaderWriter;

+import opennlp.tools.parse_thicket.ParseThicket;

+import opennlp.tools.parse_thicket.apps.MultiSentenceSearchResultsProcessor;

+import opennlp.tools.parse_thicket.apps.SnippetToParagraph;

+import opennlp.tools.parse_thicket.matching.Matcher;

+import opennlp.tools.similarity.apps.BingQueryRunner;

+import opennlp.tools.similarity.apps.HitBase;

+import opennlp.tools.similarity.apps.HitBaseComparable;

+import opennlp.tools.similarity.apps.WebSearchEngineResultsScraper;

+import opennlp.tools.textsimilarity.ParseTreeChunk;

+import opennlp.tools.textsimilarity.ParseTreeChunkListScorer;

+import opennlp.tools.textsimilarity.SentencePairMatchResult;

+import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;

+

+public class MultiSentenceKernelBasedExtendedForestSearchResultsProcessor  extends MultiSentenceKernelBasedSearchResultsProcessor{

+	private static Logger LOG = Logger

+			.getLogger("opennlp.tools.similarity.apps.MultiSentenceKernelBasedExtendedForestSearchResultsProcessor");

+	protected TreeExtenderByAnotherLinkedTree treeExtender = new TreeExtenderByAnotherLinkedTree();

+	

+	

+	

+

+	protected List<String[]> formTreeKernelStructure(String searchResultText, int count, List<HitBase> hits) {

+		List<String[]> treeBankBuffer = new ArrayList<String[]> ();

+		try {

+			// get the parses from original documents, and form the training dataset

+			ParseThicket pt = matcher.buildParseThicketFromTextWithRST(searchResultText);

+			List<String> extendedTreesDump = treeExtender.buildForestForCorefArcs(pt);

+			// if from the first half or ranked docs, then positive, otherwise negative

+			String posOrNeg = null;

+			if (count<hits.size()/2)

+				posOrNeg=" 1 ";

+			else 

+				posOrNeg=" -1 ";

+			// form the list of training samples

+			for(String t: extendedTreesDump){

+				treeBankBuffer.add(new String[] {posOrNeg+" |BT| "+t+ " |ET|"});

+			}

+		} catch (Exception e) {

+			// TODO Auto-generated catch block

+			e.printStackTrace();

+		}

+		return treeBankBuffer;

+	}

+

+	public static void main(String[] args){

+		String query = null;

+		

+		/*" I see no meaningful distinction between complacency or complicity in the military's latest failure to uphold their own " +

+				"standards of conduct. Nor do I see a distinction between the service member who orchestrated this offense and the chain of " +

+				"command that was either oblivious to or tolerant of criminal behavior";

+		

+		query = "I am now living abroad and have health insurance from Russia. How can I avoid penalty for not having health insurance in US";

+		

+		query = "ECUADOR'S PRESIDENT RAFAEL CORREA SAYS U.S. VP JOE BIDEN WANTS HIM TO REFUSE WHISTLEBLOWER EDWARD SNOWDEN'S BID FOR ASYLUM";

+		query = "how to pay tax on foreign income from real estate";

+		*/

+		if (args!=null && args.length>0)

+			query = args[0];

+		

+		MultiSentenceKernelBasedExtendedForestSearchResultsProcessor proc = new MultiSentenceKernelBasedExtendedForestSearchResultsProcessor();

+		proc.setKernelPath("C:\\stanford-corenlp\\tree_kernel\\");

+		proc.runSearchViaAPI(query);

+	}

+

+}

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/MultiSentenceKernelBasedSearchResultsProcessor.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/MultiSentenceKernelBasedSearchResultsProcessor.java
new file mode 100644
index 0000000..df6189d
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/MultiSentenceKernelBasedSearchResultsProcessor.java
@@ -0,0 +1,203 @@
+/*

+ * Licensed to the Apache Software Foundation (ASF) under one or more

+ * contributor license agreements.  See the NOTICE file distributed with

+ * this work for additional information regarding copyright ownership.

+ * The ASF licenses this file to You under the Apache License, Version 2.0

+ * (the "License"); you may not use this file except in compliance with

+ * the License. You may obtain a copy of the License at

+ *

+ *     http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+package opennlp.tools.parse_thicket.kernel_interface;

+

+import java.util.ArrayList;

+import java.util.Collections;

+import java.util.List;

+import java.util.logging.Logger;

+

+import edu.stanford.nlp.trees.Tree;

+

+

+import opennlp.tools.jsmlearning.ProfileReaderWriter;

+import opennlp.tools.parse_thicket.ParseThicket;

+import opennlp.tools.parse_thicket.apps.BingQueryRunnerMultipageSearchResults;

+import opennlp.tools.parse_thicket.apps.MultiSentenceSearchResultsProcessor;

+import opennlp.tools.parse_thicket.apps.SnippetToParagraph;

+import opennlp.tools.parse_thicket.matching.Matcher;

+import opennlp.tools.similarity.apps.BingQueryRunner;

+import opennlp.tools.similarity.apps.HitBase;

+import opennlp.tools.similarity.apps.HitBaseComparable;

+import opennlp.tools.similarity.apps.WebSearchEngineResultsScraper;

+import opennlp.tools.textsimilarity.ParseTreeChunk;

+import opennlp.tools.textsimilarity.ParseTreeChunkListScorer;

+import opennlp.tools.textsimilarity.SentencePairMatchResult;

+import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;

+

+public class MultiSentenceKernelBasedSearchResultsProcessor  extends MultiSentenceSearchResultsProcessor{

+	private static Logger LOG = Logger

+			.getLogger("opennlp.tools.similarity.apps.MultiSentenceKernelBasedSearchResultsProcessor");

+

+	private WebSearchEngineResultsScraper scraper = new WebSearchEngineResultsScraper();

+	protected Matcher matcher = new Matcher();

+	private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();

+	protected BingQueryRunnerMultipageSearchResults bingSearcher = new BingQueryRunnerMultipageSearchResults();

+	private SnippetToParagraph snp = new SnippetToParagraph();

+	private TreeKernelRunner tkRunner = new TreeKernelRunner();

+

+	private String path;

+	public void setKernelPath (String path){

+		this.path=path;

+	}

+	protected static final String modelFileName = "model.txt";

+

+	private static final String trainingFileName = "training.txt";

+

+	protected static final String unknownToBeClassified = "unknown.txt";

+

+	private static final String classifierOutput = "classifier_output.txt";

+

+

+	public List<HitBase> runSearchViaAPI(String query) {

+		List<HitBase> hits = null;

+		try {

+			List<HitBase> resultList = bingSearcher.runSearch(query);

+			// now we apply our own relevance filter

+			//hits = calculateMatchScoreResortHits(resultList, query);

+			

+			hits = resultList;

+			//once we applied our re-ranking, we set highly ranked as positive set, low-rated as negative set

+			//and classify all these search results again

+			//training set is formed from original documents for the search results, 

+			// and snippets of these search results are classified

+			hits = filterOutIrrelevantHitsByTreeKernelLearning(hits, query);

+

+		} catch (Exception e) {

+			e.printStackTrace();

+			LOG.info("No search results for query '" + query);

+			return null;

+		}

+

+

+		return hits;

+	}

+

+	private List<HitBase> filterOutIrrelevantHitsByTreeKernelLearning(

+			List<HitBase> hits, String query) {

+		List<HitBase> newHitList = new ArrayList<HitBase>(), newHitListReRanked = new ArrayList<HitBase>();

+		// form the training set from original documets. Since search results are ranked, we set the first half as positive set,

+		//and the second half as negative set.

+		// after re-classification, being re-ranked, the search results might end up in a different set

+		List<String[]> treeBankBuffer = new ArrayList<String[]>();

+		int count = 0;

+		for (HitBase hit : hits) {

+			count++;

+			// if orig content has been already set in HIT object, ok; otherwise set it

+			String searchResultText = hit.getPageContent();

+			if (searchResultText ==null){

+				String[] pageSentsAndSnippet = formTextForReRankingFromHit(hit);

+				searchResultText = pageSentsAndSnippet[0];

+				hit.setPageContent(searchResultText);

+			}			

+			newHitList.add(hit);

+			treeBankBuffer.addAll(formTreeKernelStructure(searchResultText, count, hits));

+			

+		}	

+		// write the lits of samples to a file

+		ProfileReaderWriter.writeReport(treeBankBuffer, path+trainingFileName, ' ');

+		// build the model

+		tkRunner.runLearner(path, trainingFileName, modelFileName);

+

+		// now we preparing the same answers to be classifies in/out

+		treeBankBuffer = new ArrayList<String[]>();

+		for (HitBase hit : newHitList) {			

+			// not original docs now but instead a snippet

+			String searchResultTextAbstr = hit.getAbstractText();

+			String snippet = searchResultTextAbstr.replace("<b>...</b>", ". ").replace("<span class='best-phrase'>", " ").replace("<span>", " ").replace("<span>", " ")

+					.replace("<b>", "").replace("</b>", "");

+			snippet = snippet.replace("</B>", "").replace("<B>", "")

+					.replace("<br>", "").replace("</br>", "").replace("...", ". ")

+					.replace("|", " ").replace(">", " ").replace(". .", ". ");

+			snippet =  hit.getTitle() + " " + snippet;

+			

+			ParseThicket pt = matcher.buildParseThicketFromTextWithRST(snippet);

+					//hit.getPageContent());

+			List<Tree> forest = pt.getSentences();

+			// we consider the snippet as a single sentence to be classified

+			if (forest.size()>0){

+				treeBankBuffer.add(new String[] {"0 |BT| "+forest.get(0).toString()+ " |ET|"});

+				newHitListReRanked .add(hit);

+			}

+

+		}	

+		// form a file from the snippets to be classified

+		ProfileReaderWriter.writeReport(treeBankBuffer, path+unknownToBeClassified, ' ');

+		tkRunner.runClassifier(path, unknownToBeClassified, modelFileName, classifierOutput);

+		// read classification results

+		List<String[]> classifResults = ProfileReaderWriter.readProfiles(path+classifierOutput, ' ');

+		// iterate through classification results and set them as scores for hits

+		newHitList = new ArrayList<HitBase>();

+		for(int i=0; i<newHitListReRanked.size() && i<classifResults.size() ; i++){

+			String scoreClassif = classifResults.get(i)[0];

+			float val = Float.parseFloat(scoreClassif);

+			HitBase hit = newHitListReRanked.get(i);

+			hit.setGenerWithQueryScore((double) val);

+			newHitList.add(hit);

+		}

+		

+		// sort by SVM classification results

+		Collections.sort(newHitList, new HitBaseComparable());

+		System.out.println("\n\n ============= NEW ORDER ================= ");

+		for (HitBase hit : newHitList) {

+			System.out.println(hit.getOriginalSentences().toString() + " => "+hit.getGenerWithQueryScore());

+			System.out.println("page content = "+hit.getPageContent());

+			System.out.println("title = "+hit.getAbstractText());

+			System.out.println("snippet = "+hit.getAbstractText());

+			System.out.println("match = "+hit.getSource());

+		}

+		

+		return newHitList;

+

+	}

+

+	protected List<String[]> formTreeKernelStructure(String searchResultText, int count, List<HitBase> hits) {

+		List<String[]> treeBankBuffer = new ArrayList<String[]> ();

+		try {

+			// get the parses from original documents, and form the training dataset

+			ParseThicket pt = matcher.buildParseThicketFromTextWithRST(searchResultText);

+			List<Tree> forest = pt.getSentences();

+			// if from the first half or ranked docs, then positive, otherwise negative

+			String posOrNeg = null;

+			if (count<hits.size()/2)

+				posOrNeg=" 1 ";

+			else 

+				posOrNeg=" -1 ";

+			// form the list of training samples

+			for(Tree t: forest){

+				treeBankBuffer.add(new String[] {posOrNeg+" |BT| "+t.toString()+ " |ET|"});

+			}

+		} catch (Exception e) {

+			// TODO Auto-generated catch block

+			e.printStackTrace();

+		}

+		return treeBankBuffer;

+	}

+

+	public static void main(String[] args){

+		String query = " I see no meaningful distinction between complacency or complicity in the military's latest failure to uphold their own " +

+				"standards of conduct. Nor do I see a distinction between the service member who orchestrated this offense and the chain of " +

+				"command that was either oblivious to or tolerant of criminal behavior";

+		

+		query = "I am now living abroad and have health insurance from Russia. How can I avoid penalty for not having health insurance in US";

+		

+		MultiSentenceKernelBasedSearchResultsProcessor proc = new MultiSentenceKernelBasedSearchResultsProcessor();

+		proc.setKernelPath("C:\\stanford-corenlp\\tree_kernel\\");

+		proc.runSearchViaAPI(query);

+	}

+

+}

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/PT2ExtendedTreeForestBuilder.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/PT2ExtendedTreeForestBuilder.java
new file mode 100644
index 0000000..9c1c44a
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/PT2ExtendedTreeForestBuilder.java
@@ -0,0 +1,80 @@
+package opennlp.tools.parse_thicket.kernel_interface;

+

+import java.util.ArrayList;

+import java.util.List;

+

+import edu.stanford.nlp.trees.Tree;

+

+import opennlp.tools.jsmlearning.ProfileReaderWriter;

+import opennlp.tools.parse_thicket.ParseThicket;

+import opennlp.tools.parse_thicket.matching.Matcher;

+import opennlp.tools.textsimilarity.ParseTreeChunk;

+

+public class PT2ExtendedTreeForestBuilder {

+	private Matcher matcher = new Matcher();	

+	private TreeKernelRunner tkRunner = new TreeKernelRunner();

+	private static final String modelFileName = "model.txt",

+			trainingFileName = "training.txt";

+	

+	private List<String[]> formTrainingSetFromText(String para,  boolean positive){

+		String prefix = null;

+		if (positive)

+			prefix=" 1 ";

+		else

+			prefix=" -1 ";

+			

+		ParseThicket pt = matcher.buildParseThicketFromTextWithRST(para);

+		List<Tree> forest = pt.getSentences();

+		List<String[]> treeBankBuffer = new ArrayList<String[]>();

+		for(Tree t: forest){

+			treeBankBuffer.add(new String[] {prefix+"|BT| "+t.toString()+ " |ET|"});

+		}

+		return treeBankBuffer;

+	}

+	

+	public void formPosNegTrainingSet(String pos, String neg, String path){

+		List<String[]> list = formTrainingSetFromText(pos,  true), 

+				negList= formTrainingSetFromText(neg, false);

+		list.addAll(negList);

+		ProfileReaderWriter.writeReport(list, path+trainingFileName, ' ');

+		tkRunner.runLearner(path, trainingFileName, modelFileName);

+	}

+	

+	public void classifySentences(String sentences, String path){

+		ParseThicket pt = matcher.buildParseThicketFromTextWithRST(sentences);

+		List<Tree> forest = pt.getSentences();

+		List<String[]> treeBankBuffer = new ArrayList<String[]>();

+		for(Tree t: forest){

+			treeBankBuffer.add(new String[] {" 0 |BT| "+t.toString()+ " |ET|"});

+		}

+		

+		ProfileReaderWriter.writeReport(treeBankBuffer, path+"unknown.txt", ' ');

+		tkRunner.runClassifier(path, "unknown.txt", modelFileName, "classifier_output.txt");

+		

+		

+	}

+	

+	

+	public static void main(String[] args){

+		

+		PT2ExtendedTreeForestBuilder builder = new PT2ExtendedTreeForestBuilder();

+		

+			

+		String posSents = "Iran refuses to accept the UN proposal to end its dispute over its work on nuclear weapons."+

+				"UN nuclear watchdog passes a resolution condemning Iran for developing its second uranium enrichment site in secret. " +

+				"A recent IAEA report presented diagrams that suggested Iran was secretly working on nuclear weapons. " +

+				"Iran envoy says its nuclear development is for peaceful purpose, and the material evidence against it has been fabricated by the US. ";

+

+		String negSents = "Iran refuses the UN offer to end a conflict over its nuclear weapons."+

+						"UN passes a resolution prohibiting Iran from developing its uranium enrichment site. " +

+						"A recent UN report presented charts saying Iran was working on nuclear weapons. " +

+				"Iran envoy to UN states its nuclear development is for peaceful purpose, and the evidence against its claim is fabricated by the US. ";

+		builder.formPosNegTrainingSet(posSents, negSents, "C:\\stanford-corenlp\\tree_kernel\\");

+		

+		

+		builder.classifySentences("Iran refuses Iraq's offer to end its conflict with UN. Iran passes a resolution prohibiting UN from doing second" +

+				" uranium enrichment site. Envoy to US says its nuclear development is for peaceful purposes. Material evidence againt US has been fabricated by UN.", 

+				

+				"C:\\stanford-corenlp\\tree_kernel\\");

+	}

+}

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/SnippetToParagraphFull.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/SnippetToParagraphFull.java
new file mode 100644
index 0000000..4cf3b34
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/SnippetToParagraphFull.java
@@ -0,0 +1,83 @@
+package opennlp.tools.parse_thicket.kernel_interface;

+

+import java.util.ArrayList;

+import java.util.Arrays;

+import java.util.Collections;

+import java.util.Comparator;

+import java.util.List;

+import java.util.logging.Logger;

+

+import org.apache.commons.lang.StringUtils;

+

+

+import opennlp.tools.parse_thicket.apps.MinedSentenceProcessor;

+import opennlp.tools.parse_thicket.apps.SnippetToParagraph;

+import opennlp.tools.similarity.apps.Fragment;

+import opennlp.tools.similarity.apps.GeneratedSentenceProcessor;

+import opennlp.tools.similarity.apps.HitBase;

+import opennlp.tools.similarity.apps.RelatedSentenceFinder;

+import opennlp.tools.similarity.apps.utils.PageFetcher;

+import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer;

+import opennlp.tools.similarity.apps.utils.Utils;

+import opennlp.tools.textsimilarity.TextProcessor;

+

+

+public class SnippetToParagraphFull extends SnippetToParagraph {

+	private PageFetcher pFetcher = new PageFetcher();

+	private static Logger LOG = Logger

+			.getLogger("com.become.parse_thicket.apps.SnippetToParagraphFull");

+

+	

+

+	public HitBase formTextFromOriginalPageGivenSnippet(HitBase item) {

+

+		String[] sents = extractSentencesFromPage(item.getUrl());

+

+		String title = item.getTitle().replace("<b>", " ").replace("</b>", " ")

+				.replace("  ", " ").replace("  ", " ");

+		// generation results for this sentence

+		List<String> result = new ArrayList<String>();

+		// form plain text from snippet

+		String snapshot = item.getAbstractText().replace("<b>", " ")

+				.replace("</b>", " ").replace("  ", " ").replace("  ", " ").replace("\"", "");

+

+		String snapshotMarked = snapshot.replace(" ...", ".");

+		List<String> fragments = TextProcessor.splitToSentences(snapshotMarked);

+		if (fragments.size()<3 && StringUtils.countMatches(snapshotMarked, ".")>1){

+			snapshotMarked = snapshotMarked.replace("..", "&").replace(".", "&");

+			String[] fragmSents = snapshotMarked.split("&");

+			fragments = Arrays.asList(fragmSents);

+		}

+

+		for (String f : fragments) {

+			String followSent = null;

+			if (f.length() < 50)

+				continue;

+			String pageSentence = "";

+			// try to find original sentence from webpage

+

+			try {

+				String[] mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(

+						f, sents);

+				pageSentence = mainAndFollowSent[0];

+				followSent = mainAndFollowSent[1];

+				if (pageSentence!=null)

+					result.add(pageSentence);

+				else {

+					result.add(f);

+					LOG.info("Could not find the original sentence \n"+f +"\n in the page " );

+				}

+				//if (followSent !=null)

+				//	result.add(followSent);

+			} catch (Exception e) {

+

+				e.printStackTrace();

+			}

+		}

+		item.setOriginalSentences(result);

+		return item;

+	}

+

+	

+}

+

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/TreeExtenderByAnotherLinkedTree.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/TreeExtenderByAnotherLinkedTree.java
new file mode 100644
index 0000000..47e474f
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/TreeExtenderByAnotherLinkedTree.java
@@ -0,0 +1,292 @@
+package opennlp.tools.parse_thicket.kernel_interface;

+

+import java.util.ArrayList;

+import java.util.List;

+

+import opennlp.tools.jsmlearning.ProfileReaderWriter;

+import opennlp.tools.parse_thicket.ParseThicket;

+import opennlp.tools.parse_thicket.ParseTreeNode;

+import opennlp.tools.parse_thicket.WordWordInterSentenceRelationArc;

+import opennlp.tools.parse_thicket.matching.Matcher;

+import opennlp.tools.parse_thicket.matching.PT2ThicketPhraseBuilder;

+import edu.stanford.nlp.trees.Tree;

+

/**
 * Builds "extended trees" for a parse thicket: for each coreference arc
 * between two sentences, the subtree found around the source mention is
 * spliced into the parse tree of the destination sentence. The rendered
 * forest is intended as input for tree-kernel (SVM) learning.
 */
public class TreeExtenderByAnotherLinkedTree extends  PT2ThicketPhraseBuilder {

	/**
	 * For every coreference arc in the thicket, extends the parse tree of the
	 * arc's destination sentence with the subtree found after the source word.
	 *
	 * @param pt parse thicket with sentences and inter-sentence arcs
	 * @return bracketed string renderings of the extended trees, one per arc
	 */
	public List<String> buildForestForCorefArcs(ParseThicket pt){
		List<String> results = new ArrayList<String>();
		for(WordWordInterSentenceRelationArc arc: pt.getArcs()){
			// only coreference arcs drive tree extension
			if (!arc.getArcType().getType().startsWith("coref"))
				continue;
			// arc endpoints: 1-based sentence indices plus the linked lemmas
			int fromSent = arc.getCodeFrom().getFirst();
			int toSent = arc.getCodeTo().getFirst();
			String wordFrom = arc.getLemmaFrom();
			String wordTo = arc.getLemmaTo();

			// locate the sibling subtrees to the right of wordFrom in the source sentence
			List<Tree> trees = getASubtreeWithRootAsNodeForWord1(pt.getSentences().get(fromSent-1), pt.getSentences().get(fromSent-1), new String[]{ wordFrom});
			if (trees==null || trees.size()<1)
				continue;
			System.out.println(trees);
			StringBuilder sb = new StringBuilder(10000);	
			// render the destination sentence with the found subtree spliced in after wordTo
			toStringBuilderExtenderByAnotherLinkedTree1(sb, pt.getSentences().get(toSent-1), trees.get(0), new String[]{wordTo});
			System.out.println(sb.toString());
			results.add(sb.toString());
		}
		/*
		List<String[]> treeBankBuffer = new ArrayList<String[]>();
		for(String t: results){
			treeBankBuffer.add(new String[] {" 0 |BT|"+t.toString()+ "|ET|"});
		}
		ProfileReaderWriter.writeReport(treeBankBuffer, "C:\\stanford-corenlp\\tree_kernel\\unknownForest.txt", ' ');
		*/
		return results;
	}

	/**
	 * Renders tree {@code t} in Penn-style bracketing, inserting
	 * {@code treeToInsert} after the children of the node whose yield ends
	 * with the last word in {@code corefWords}.
	 *
	 * @param sb           accumulator the rendering is appended to
	 * @param t            tree being rendered
	 * @param treeToInsert subtree to splice in (null once insertion has happened)
	 * @param corefWords   anchor words; only the last element is used for matching
	 * @return the same builder, for chaining
	 */
	public StringBuilder toStringBuilderExtenderByAnotherLinkedTree1(StringBuilder sb, Tree t, Tree treeToInsert, String[] corefWords) {
		if (t.isLeaf()) {
			if (t.label() != null) {
				sb.append(t.label().value());
			}
			return sb;
		} else {
			sb.append('(');
			if (t.label() != null) {
				if (t.value() != null) {
					sb.append(t.label().value());
				}
			}
			boolean bInsertNow=false;
			Tree[] kids = t.children();
			if (kids != null) {
				// first pass: check whether any child's yield ends with the anchor word
				for (Tree kid : kids) {
					if (corefWords!=null){
						String word = corefWords[corefWords.length-1];
						String phraseStr = kid.toString();
						// strip closing brackets so endsWith sees the last token
						phraseStr=phraseStr.replace(")", "");
						if (phraseStr.endsWith(word)){
							bInsertNow=true;
						}
					}
				}
				if (bInsertNow){ 
					// render children without further matching, then splice in the subtree
					for (Tree kid : kids) {
						sb.append(' ');
						toStringBuilderExtenderByAnotherLinkedTree1(sb, kid, null, null);
					}
					sb.append(' ');
					toStringBuilderExtenderByAnotherLinkedTree1(sb, treeToInsert, null, null);
					// apparently a leftover debugger anchor — left as-is
					int z=0; z++;

				} else {
					// keep searching for the insertion point deeper in the tree
					for (Tree kid : kids) {
						sb.append(' ');
						toStringBuilderExtenderByAnotherLinkedTree1(sb, kid, treeToInsert, corefWords);
					}

				}
			}

			return sb.append(')');
		}
	}

	/**
	 * Finds the children that follow (to the right of) the child whose yield
	 * ends with the last word of {@code corefWords}, searching depth-first.
	 *
	 * @param tree           whole sentence tree (passed through recursion, unused for matching)
	 * @param currentSubTree node currently inspected
	 * @param corefWords     anchor words; only the last element is matched
	 * @return the right-sibling subtrees of the matched child, or null when not found
	 */
	public List<Tree> getASubtreeWithRootAsNodeForWord1(Tree tree, Tree currentSubTree, String[] corefWords){
		if (currentSubTree.isLeaf()){
			return null;
		}
		List<Tree> result = null;
		Tree[] kids = currentSubTree.children();
		if (kids != null) {
			boolean bInsert=false;
			String word = corefWords[corefWords.length-1];

			for (Tree kid : kids) {
				if (bInsert){
					// once the anchor child is found, collect all following siblings
					result.add(kid);
				} else {

					String phraseStr = kid.toString();
					// strip closing brackets so endsWith sees the last token
					phraseStr=phraseStr.replace(")", "");
					if (phraseStr.endsWith(word)){
						bInsert=true;
						result = new ArrayList<Tree>();
					}
				}
			}
			if (bInsert){
				return result;
			}

			// if not a selected node, proceed with iteration
			for (Tree kid : kids) {
				List<Tree> ts = getASubtreeWithRootAsNodeForWord1(tree, kid, corefWords);
				if (ts!=null)
					return ts;
			}

		}
		return null;
	}

	/**
	 * Older variant of the subtree search based on erasing the coref words
	 * from the node's rendering; NOTE(review): it computes {@code bInsertNow}
	 * but never acts on it, and always recurses — appears superseded by
	 * {@link #getASubtreeWithRootAsNodeForWord1}. Kept for reference.
	 */
	public Tree[] getASubtreeWithRootAsNodeForWord(Tree tree, Tree currentSubTree, String[] corefWords){
		if (currentSubTree.isLeaf()){
			return null;
		}


		boolean bInsertNow=false;
		/*List<ParseTreeNode> bigTreeNodes = parsePhrase(currentSubTree.label().value());	
		for(ParseTreeNode smallNode: bigTreeNodes ){
			if (bigTreeNodes.get(0).getWord().equals("") )
				continue;
			String word = bigTreeNodes.get(0).getWord();
			for(String cWord: corefWords){

				if (word.equalsIgnoreCase(cWord))
					bInsertNow=true;
			}
		} */

		String nodePhraseStr = currentSubTree.toString();
		System.out.println(nodePhraseStr);
		// erase every coref word from the rendering
		for(String w: corefWords)
			nodePhraseStr = nodePhraseStr.replace(w, "");
		// all words are covered: only uppercase tag/bracket text remains
		if (nodePhraseStr.toUpperCase().equals(nodePhraseStr))
			bInsertNow=true;

		//if(bInsertNow)
		//	return currentSubTree;

		Tree[] kids = currentSubTree.children();
		if (kids != null) {
			/*for (Tree kid : kids) {
				List<ParseTreeNode> bigTreeNodes = parsePhrase(kid.label().value());	
				if (bigTreeNodes!=null && bigTreeNodes.size()>0 && bigTreeNodes.get(0)!=null &&
						bigTreeNodes.get(0).getWord().equalsIgnoreCase(corefWords[0])){
					bInsertNow=true;
					return kids;
				}

			}*/


			for (Tree kid : kids) {
				Tree[] t = getASubtreeWithRootAsNodeForWord(tree, kid, corefWords);
				if (t!=null)
					return t;
			}

		}
		return null;
	}


	/**
	 * Older rendering variant that finds the insertion point by matching the
	 * first word of {@code treeToInsert}'s grand-grand-child phrase against
	 * the current node's phrase (via {@code parsePhrase} from the superclass).
	 * NOTE(review): superseded by toStringBuilderExtenderByAnotherLinkedTree1.
	 */
	public StringBuilder toStringBuilderExtenderByAnotherLinkedTree(StringBuilder sb, Tree t, Tree treeToInsert) {
		if (t.isLeaf()) {
			if (t.label() != null) {
				sb.append(t.label().value());
			}
			return sb;
		} else {
			sb.append('(');
			if (t.label() != null) {
				if (t.value() != null) {
					sb.append(t.label().value());
				}
			}

			boolean bInsertNow=false;
			// we try to match trees to find out if we are at the insertion position
			if (treeToInsert!=null){
				List<ParseTreeNode> bigTreeNodes = parsePhrase(t.label().value());	
				List<ParseTreeNode> smallTreeNodes = parsePhrase(treeToInsert.getChild(0).getChild(0).getChild(0).label().value());	

				System.out.println(t + " \n "+ treeToInsert+ "\n");

				if (smallTreeNodes.size()>0 && bigTreeNodes.size()>0)
					for(ParseTreeNode smallNode: smallTreeNodes ){
						if (!bigTreeNodes.get(0).getWord().equals("") 
								&& bigTreeNodes.get(0).getWord().equalsIgnoreCase(smallNode.getWord()))
							bInsertNow=true;
					}
			}

			if (bInsertNow){ 
				// render children, then splice in the second child of the insert tree
				Tree[] kids = t.children();
				if (kids != null) {
					for (Tree kid : kids) {
						sb.append(' ');
						toStringBuilderExtenderByAnotherLinkedTree(sb, kid, null);
					}
					sb.append(' ');
					toStringBuilderExtenderByAnotherLinkedTree(sb, treeToInsert.getChild(0).getChild(1), null);
					// apparently a leftover debugger anchor — left as-is
					int z=0; z++;
				}
			} else {
				Tree[] kids = t.children();
				if (kids != null) {
					for (Tree kid : kids) {
						sb.append(' ');
						toStringBuilderExtenderByAnotherLinkedTree(sb, kid, treeToInsert);
					}

				}
			}
			return sb.append(')');
		}
	}

	/** Plain Penn-style bracketed rendering of a tree (no insertion). */
	private StringBuilder toStringBuilder(StringBuilder sb, Tree t) {
		if (t.isLeaf()) {
			if (t.label() != null) {
				sb.append(t.label().value());
			}
			return sb;
		} else {
			sb.append('(');
			if (t.label() != null) {
				if (t.value() != null) {
					sb.append(t.label().value());
				}
			}
			Tree[] kids = t.children();
			if (kids != null) {
				for (Tree kid : kids) {
					sb.append(' ');
					toStringBuilder(sb, kid);
				}
			}
			return sb.append(')');
		}
	}

	/** Demo entry point; the code after System.exit(0) is intentionally unreachable. */
	public static void main(String[] args){
		Matcher matcher = new Matcher();
		TreeExtenderByAnotherLinkedTree extender = new TreeExtenderByAnotherLinkedTree();
		
		ParseThicket pt = matcher.buildParseThicketFromTextWithRST(//"I went to the forest to look for a tree. I found out that it was thick and green");
				"Iran refuses to accept the UN proposal to end its dispute over its work on nuclear weapons."+
				"UN nuclear watchdog passes a resolution condemning Iran for developing its second uranium enrichment site in secret. " +
				"A recent IAEA report presented diagrams that suggested Iran was secretly working on nuclear weapons. " +
				"Iran envoy says its nuclear development is for peaceful purpose, and the material evidence against it has been fabricated by the US. ");

		List<String> results = extender.buildForestForCorefArcs(pt);
		System.out.println(results);
		System.exit(0);

		List<Tree> forest = pt.getSentences();
		
		List<Tree> trees = extender.getASubtreeWithRootAsNodeForWord1(forest.get(1), forest.get(1), new String[]{"it"});
		System.out.println(trees);
		StringBuilder sb = new StringBuilder(10000);	
		extender.toStringBuilderExtenderByAnotherLinkedTree1(sb, forest.get(0), trees.get(0), new String[]{"the", "forest"});
		System.out.println(sb.toString());


		//
		//extender.toStringBuilderExtenderByAnotherLinkedTree(sb, forest.get(0), forest.get(1));
		//System.out.println(sb.toString());
	}
}

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/TreeKernelRunner.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/TreeKernelRunner.java
new file mode 100644
index 0000000..f00904f
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/TreeKernelRunner.java
@@ -0,0 +1,115 @@
+package opennlp.tools.parse_thicket.kernel_interface;

+

+import java.io.BufferedReader;

+import java.io.File;

+import java.io.IOException;

+import java.io.InputStream;

+import java.io.InputStreamReader;

+

/**
 * Thin wrapper around the SVM-light-TK command-line tools (svm_learn.exe and
 * svm_classify.exe). NOTE(review): path handling is Windows-specific
 * (backslash separators, .exe suffix) — TODO make portable.
 */
public class TreeKernelRunner {

	/**
	 * Launches an external executable in the given working directory and
	 * blocks until it exits, echoing its output to the console.
	 *
	 * @param command command line (command[0] is the executable path)
	 * @param runPath working directory for the child process
	 */
	public void runEXE(String[] command, String runPath) {
		Runtime r = Runtime.getRuntime();
		Process mStartProcess = null;
		try {
			mStartProcess = r.exec(command, null, new File(runPath));
		} catch (IOException e) {
			// FIX: previously execution fell through with a null process and
			// threw a NullPointerException below; report and bail out instead.
			e.printStackTrace();
			return;
		}

		// FIX: drain stdout AND stderr so the child cannot deadlock on a full pipe buffer.
		StreamLogger outputGobbler = new StreamLogger(mStartProcess.getInputStream());
		outputGobbler.start();
		StreamLogger errorGobbler = new StreamLogger(mStartProcess.getErrorStream());
		errorGobbler.start();

		try {
			int returnCode = mStartProcess.waitFor();
			// surface non-zero exit codes instead of silently discarding them
			if (returnCode != 0)
				System.err.println(command[0] + " exited with code " + returnCode);
		} catch (InterruptedException e) {
			// FIX: restore the interrupt status so callers can observe it
			Thread.currentThread().interrupt();
			e.printStackTrace();
		}
	}

	/**
	 * Runs svm_learn.exe with the SubSetTreeKernel (-t 5) over the training file.
	 *
	 * @param dir           kernel working directory (normalized to end with '\')
	 * @param learning_file training samples in SVM-light-TK format
	 * @param model_file    name of the model file to produce
	 */
	public void runLearner(String dir, String learning_file, String model_file) {
		dir = dir.replace('/', '\\');

		if (!dir.endsWith("\\"))
			dir += "\\";
		String[] runString = new String[] { dir + "svm_learn.exe", "-t", "5", dir + learning_file, dir + model_file };
		runEXE(runString, dir);
	}

	/**
	 * Runs the classifier: svm_classify example_file model_file predictions_file.
	 *
	 * @param dir              kernel working directory (normalized to end with '\')
	 * @param example_file     samples to classify
	 * @param model_file       model produced by {@link #runLearner}
	 * @param predictions_file output file receiving one score per sample
	 */
	public void runClassifier(String dir, String example_file, String model_file, String predictions_file) {
		dir = dir.replace('/', '\\');

		if (!dir.endsWith("\\"))
			dir += "\\";
		String[] runString = new String[] { dir + "svm_classify.exe", dir + example_file, dir + model_file, dir + predictions_file };
		runEXE(runString, dir);
	}

	/** Pipes one output stream of the child process to System.out, line by line. */
	class StreamLogger extends Thread {

		private InputStream mInputStream;

		public StreamLogger(InputStream is) {
			this.mInputStream = is;
		}

		public void run() {
			try {
				InputStreamReader isr = new InputStreamReader(mInputStream);
				BufferedReader br = new BufferedReader(isr);
				String line = null;
				while ((line = br.readLine()) != null) {
					System.out.println(line);
				}
			} catch (IOException ioe) {
				ioe.printStackTrace();
			}
		}

	}

	/** Demo entry point with Windows-specific hard-coded paths. */
	public static void main(String[] args) {
		TreeKernelRunner runner = new TreeKernelRunner();
		runner.runLearner("C:\\stanford-corenlp\\tree_kernel\\", "training.txt", "arg0.model1.txt");
		runner.runClassifier("C:\\stanford-corenlp\\tree_kernel\\", "arg0.test", "arg0.model1.txt", "arg0.output1.txt");
	}
}

+

	/*
exec:

public Process exec(String command, String envp[], File dir)

   @param      command   a specified system command.
   @param      envp      array of strings, each element of which
                         has environment variable settings in format
                         <i>name</i>=<i>value</i>.
   @param      dir       the working directory of the subprocess, or
                         <tt>null</tt> if the subprocess should inherit
                         the working directory of the current process.

The SVM-light-TK distribution ships two executables: svm_learn.exe and svm_classify.exe.

1. svm_learn.exe reads a file with training examples, processes it, and builds a model file with the learned rules.

Sample invocations:
svm_learn -t 5 learning_file model_file  -- the simplest variant, SubSetTreeKernel (gaps are allowed while traversing the trees)

svm_learn -t 5 -D 0 learning_file model_file  -- an alternative kernel, SubTreeKernel

A sample input file and the description of the parameters are available on the author's page.

2. svm_classify.exe reads a file with test examples plus the model file built by svm_learn, and writes the classification results to predictions_file.

Invocation:  svm_classify example_file model_file predictions_file

The test file has the same format as the training examples (a sample is in the archive on Moschitti's page).
The true class of each example (1 or -1 at the start of the line) may be supplied, in which case precision and recall
are computed automatically; otherwise put 0 there.
	 */
\ No newline at end of file
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/GeneralizationListReducer.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/GeneralizationListReducer.java
new file mode 100644
index 0000000..ef0569a
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/GeneralizationListReducer.java
@@ -0,0 +1,148 @@
+/*

+ * Licensed to the Apache Software Foundation (ASF) under one or more

+ * contributor license agreements.  See the NOTICE file distributed with

+ * this work for additional information regarding copyright ownership.

+ * The ASF licenses this file to You under the Apache License, Version 2.0

+ * (the "License"); you may not use this file except in compliance with

+ * the License. You may obtain a copy of the License at

+ *

+ *     http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+

+package opennlp.tools.parse_thicket.matching;

+

+import java.util.ArrayList;

+import java.util.HashSet;

+import java.util.List;

+

+public class GeneralizationListReducer {

+  public List<ParseTreePath> applyFilteringBySubsumption_OLD(

+      List<ParseTreePath> result) {

+    List<ParseTreePath> resultDupl = new ArrayList<ParseTreePath>();

+    resultDupl.addAll(new HashSet<ParseTreePath>(result));

+    result = resultDupl;

+    if (result.size() < 2)

+      return result; // nothing to reduce

+    List<ParseTreePath> resultReduced = new ArrayList<ParseTreePath>();

+    int size = result.size();

+    for (int i = 0; i < size; i++) {

+      Boolean bSubChunk = false;

+      for (int j = 0; j < size; j++) {

+        if (i == j) {

+          continue;

+        }

+        if (result.get(j).isASubChunk(result.get(i))) {

+          bSubChunk = true;

+        }

+      }

+      if (!bSubChunk)

+        resultReduced.add(result.get(i));

+    }

+

+    if (resultReduced.size() < 1) {

+      System.err.println("Wrong subsumption reduction");

+    }

+

+    if (resultReduced.size() > 1) {

+      int z = 0;

+      z++;

+    }

+    return resultReduced;

+

+  }

+

+  public List<ParseTreePath> applyFilteringBySubsumptionOLD(

+      List<ParseTreePath> result) {

+    List<ParseTreePath> resultDupl = null;

+    if (result.size() < 2)

+      return result; // nothing to reduce

+    List<ParseTreePath> resultReduced = new ArrayList<ParseTreePath>();

+    int size = result.size();

+    resultDupl = new ArrayList<ParseTreePath>(result);

+    for (int s = 0; s < size; s++) {

+      for (int i = 0; i < resultDupl.size(); i++) {

+        Boolean bStop = false;

+        for (int j = 0; j < resultDupl.size(); j++) {

+          if (i == j) {

+            continue;

+          }

+          if (result.get(j).isASubChunk(result.get(i))

+              && !result.get(i).isASubChunk(result.get(j))) {

+            resultDupl.remove(i);

+            bStop = true;

+            break;

+          }

+        }

+        if (bStop) {

+          break;

+        }

+      }

+    }

+    resultReduced = resultDupl;

+    if (resultReduced.size() < 1) {

+      System.err.println("Wrong subsumption reduction");

+    }

+

+    if (resultReduced.size() > 1) {

+      int z = 0;

+      z++;

+    }

+    return resultReduced;

+

+  }

+

+  public List<ParseTreePath> applyFilteringBySubsumption(

+      List<ParseTreePath> result) {

+    List<Integer> resultDuplIndex = new ArrayList<Integer>();

+    List<ParseTreePath> resultReduced = new ArrayList<ParseTreePath>();

+

+    if (result.size() < 2) {

+      return result; // nothing to reduce

+    }

+    // remove empty

+    for (ParseTreePath ch : result) {

+      if (ch.getLemmas().size() > 0) {

+        resultReduced.add(ch);

+      }

+    }

+    result = resultReduced;

+

+    for (int i = 0; i < result.size(); i++) {

+      for (int j = i + 1; j < result.size(); j++) {

+        if (i == j) {

+          continue;

+        }

+        if (result.get(j).isASubChunk(result.get(i))) {

+          resultDuplIndex.add(i);

+        } else if (result.get(i).isASubChunk(result.get(j))) {

+          resultDuplIndex.add(j);

+        }

+      }

+

+    }

+    resultReduced = new ArrayList<ParseTreePath>();

+    for (int i = 0; i < result.size(); i++) {

+      if (!resultDuplIndex.contains(i)) {

+        resultReduced.add(result.get(i));

+      }

+    }

+

+    if (resultReduced.size() < 1) {

+      System.err.println("Wrong subsumption reduction");

+      resultReduced = result;

+    }

+

+    return resultReduced;

+

+  }

+

+  // testing sub-chunk functionality and

+  // elimination more general according to subsumption relation

+

+}

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/LemmaFormManager.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/LemmaFormManager.java
new file mode 100644
index 0000000..cb6f3e9
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/LemmaFormManager.java
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.parse_thicket.matching;
+
+import java.util.List;
+
+import opennlp.tools.stemmer.PorterStemmer;
+
+public class LemmaFormManager {
+
+  public String matchLemmas(PorterStemmer ps, String lemma1, String lemma2,
+      String POS) {
+    if (POS == null) {
+      return null;
+    }
+    lemma1 = lemma1.toLowerCase();
+    lemma2 = lemma2.toLowerCase();
+    // numbers have to be exact
+    if (POS.equals("CD")) {
+      if (lemma1.equals(lemma2)) {
+        return lemma1;
+      } else {
+        return null;
+      }
+    }
+
+    // 'must' occurrence of word - if not equal then 'fail'
+    if (lemma1.endsWith("_xyz") || lemma2.endsWith("_xyz")) {
+      lemma1 = lemma1.replace("_xyz", "");
+      lemma2 = lemma2.replace("_xyz", "");
+      if (lemma1.equals(lemma2)) {
+        return lemma1;
+      } else { // trying to check if nouns and different plural/single form
+        if (POS.equals("NN") || POS.equals("NP")) {
+          if ((lemma1.equals(lemma2 + "s") || lemma2.equals(lemma1 + "s"))
+              || lemma1.endsWith(lemma2) || lemma2.endsWith(lemma1)
+              || lemma1.startsWith(lemma2) || lemma2.startsWith(lemma1))
+            return lemma1;
+        }
+        return "fail";
+      }
+    }
+
+    if (lemma1.equals(lemma2)) {
+      return lemma1;
+    }
+
+    if (POS.equals("NN") || POS.equals("NP")) {
+      if ((lemma1.equals(lemma2 + "s") || lemma2.equals(lemma1 + "s"))
+          || lemma1.endsWith(lemma2) || lemma2.endsWith(lemma1)
+          || lemma1.startsWith(lemma2) || lemma2.startsWith(lemma1)) {
+        return lemma1;
+      }
+    }
+    try {
+      if (ps != null) {
+        if (ps.stem(lemma1).toString()
+            .equalsIgnoreCase(ps.stem(lemma2).toString())) {
+          return lemma1;
+        }
+      }
+    } catch (Exception e) {
+      System.err.println("Problem processing " + lemma1 + " " + lemma2);
+      return null;
+    }
+
+    return null;
+  }
+
+  public boolean acceptableLemmaAndPOS(String sim, String lemmaMatch) {
+    if (sim == null) {
+      return false;
+    }
+
+    if (lemmaMatch != null && !lemmaMatch.equals("fail")) {
+      return false;
+    }
+    // even if lemmaMatch==null
+    return true;
+    // if (sim!=null && (lemmaMatch!=null && !lemmaMatch.equals("fail"))){
+
+  }
+
+  // all lemmas ending with # in ch1 and/or ch2 SHOULD occur in chunkToAdd
+  public boolean mustOccurVerifier(ParseTreePath ch1, ParseTreePath ch2,
+      ParseTreePath chunkToAdd) {
+    List<String> lemmasWithMustOccur = ch1.getLemmas();
+    lemmasWithMustOccur.addAll(ch2.getLemmas());
+    List<String> res = chunkToAdd.getLemmas();
+    for (String lem : lemmasWithMustOccur) {
+      if (lem.endsWith("_xyz")) {
+        String pureLem = lem.replace("_xyz", "");
+        if (!res.contains(pureLem)) { // should occur but does not
+          return false;
+        }// failed the test
+      }
+    }
+    return true;
+  }
+
+}
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/Matcher.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/Matcher.java
new file mode 100644
index 0000000..0830276
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/Matcher.java
@@ -0,0 +1,142 @@
+package opennlp.tools.parse_thicket.matching;

+

+import java.util.ArrayList;

+import java.util.HashMap;

+import java.util.List;

+import java.util.Map;

+

+

+import opennlp.tools.parse_thicket.IGeneralizer;

+import opennlp.tools.parse_thicket.ParseCorefsBuilder;

+import opennlp.tools.parse_thicket.ParseThicket;

+import opennlp.tools.parse_thicket.ParseTreeNode;

+import opennlp.tools.textsimilarity.LemmaPair;

+import opennlp.tools.textsimilarity.ParseTreeChunk;

+import opennlp.tools.textsimilarity.ParseTreeMatcherDeterministic;

+import opennlp.tools.textsimilarity.SentencePairMatchResult;

+import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;

+

+public class Matcher implements IGeneralizer<List<List<ParseTreeNode>>>{

+	ParseTreeMatcherDeterministic md = new ParseTreeMatcherDeterministic();

+	ParseCorefsBuilder ptBuilder = ParseCorefsBuilder.getInstance();

+	PT2ThicketPhraseBuilder phraseBuilder = new PT2ThicketPhraseBuilder();

+	Map<String, ParseThicket> parseThicketHash = new HashMap<String, ParseThicket>();

+	/**	   * The key function of similarity component which takes two portions of text

+	 * and does similarity assessment by finding the set of all maximum common

+	 * subtrees of the set of parse trees for each portion of text

+	 * 

+	 * @param input

+	 *          text 1

+	 * @param input

+	 *          text 2

+	 * @return the matching results structure, which includes the similarity score

+	 */

+	

+	public Matcher(){

+		

+	}

+	

+	public List<List<ParseTreeChunk>> assessRelevance(String para1, String para2) {

+		// first build PTs for each text

+		ParseThicket pt1 = ptBuilder.buildParseThicket(para1);

+		ParseThicket pt2 = ptBuilder.buildParseThicket(para2);

+		// then build phrases and rst arcs

+		List<List<ParseTreeNode>> phrs1 = phraseBuilder.buildPT2ptPhrases(pt1);

+		List<List<ParseTreeNode>> phrs2 = phraseBuilder.buildPT2ptPhrases(pt2);

+		// group phrases by type

+		List<List<ParseTreeChunk>> sent1GrpLst = formGroupedPhrasesFromChunksForPara(phrs1), 

+				sent2GrpLst = formGroupedPhrasesFromChunksForPara(phrs2);

+

+		

+		List<List<ParseTreeChunk>> res = md

+				.matchTwoSentencesGroupedChunksDeterministic(sent1GrpLst, sent2GrpLst);

+		return res;

+

+	}

+	

+	public List<List<ParseTreeChunk>> assessRelevanceCache(String para1, String para2) {

+		// first build PTs for each text

+		

+		ParseThicket pt1 = parseThicketHash.get(para1);

+		if (pt1==null){

+			 pt1=	ptBuilder.buildParseThicket(para1);

+			 parseThicketHash.put(para1, pt1);

+		}

+		

+		ParseThicket pt2 = parseThicketHash.get(para2);

+		if (pt2==null){

+			 pt2=	ptBuilder.buildParseThicket(para2);

+			 parseThicketHash.put(para2, pt2);

+		}

+		

+		// then build phrases and rst arcs

+		List<List<ParseTreeNode>> phrs1 = phraseBuilder.buildPT2ptPhrases(pt1);

+		List<List<ParseTreeNode>> phrs2 = phraseBuilder.buildPT2ptPhrases(pt2);

+		// group phrases by type

+		List<List<ParseTreeChunk>> sent1GrpLst = formGroupedPhrasesFromChunksForPara(phrs1), 

+				sent2GrpLst = formGroupedPhrasesFromChunksForPara(phrs2);

+

+		

+		List<List<ParseTreeChunk>> res = md

+				.matchTwoSentencesGroupedChunksDeterministic(sent1GrpLst, sent2GrpLst);

+		return res;

+

+	}

+	

+	public List<List<ParseTreeChunk>> generalize(List<List<ParseTreeNode>> phrs1,

+			List<List<ParseTreeNode>> phrs2) {

+		// group phrases by type

+				List<List<ParseTreeChunk>> sent1GrpLst = formGroupedPhrasesFromChunksForPara(phrs1), 

+						sent2GrpLst = formGroupedPhrasesFromChunksForPara(phrs2);

+

+				

+				List<List<ParseTreeChunk>> res = md

+						.matchTwoSentencesGroupedChunksDeterministic(sent1GrpLst, sent2GrpLst);

+				return res;

+	}

+	private List<List<ParseTreeChunk>> formGroupedPhrasesFromChunksForPara(

+			List<List<ParseTreeNode>> phrs) {

+		List<List<ParseTreeChunk>> results = new ArrayList<List<ParseTreeChunk>>();

+		List<ParseTreeChunk> nps = new ArrayList<ParseTreeChunk>(), vps = new ArrayList<ParseTreeChunk>(), 

+				pps = new ArrayList<ParseTreeChunk>();

+		for(List<ParseTreeNode> ps:phrs){

+			ParseTreeChunk ch = convertNodeListIntoChunk(ps);

+			String ptype = ps.get(0).getPhraseType();

+			if (ptype.equals("NP")){

+				nps.add(ch);

+			} else if (ptype.equals("VP")){

+				vps.add(ch);

+			} else if (ptype.equals("PP")){

+				pps.add(ch);

+			}

+		}

+		results.add(nps); results.add(vps); results.add(pps);

+		return results;

+	}

+

+	private ParseTreeChunk convertNodeListIntoChunk(List<ParseTreeNode> ps) {

+		List<String> lemmas = new ArrayList<String>(),  poss = new ArrayList<String>();

+		for(ParseTreeNode n: ps){

+			lemmas.add(n.getWord());

+			poss.add(n.getPos());

+		}

+		ParseTreeChunk ch = new ParseTreeChunk(lemmas, poss, 0, 0);

+		ch.setMainPOS(ps.get(0).getPhraseType());

+		return ch;

+	}

+	

+	// this function is the main entry point into the PT builder if rst arcs are required

+	public ParseThicket buildParseThicketFromTextWithRST(String para){

+		ParseThicket pt = ptBuilder.buildParseThicket(para);

+		phraseBuilder.buildPT2ptPhrases(pt);

+		return pt;	

+	}

+

+

+	@Override

+	public List<List<List<ParseTreeNode>>> generalize(Object o1, Object o2) {

+		// TODO Auto-generated method stub

+		return null;

+	}

+

+}

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PT2ThicketPhraseBuilder.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PT2ThicketPhraseBuilder.java
new file mode 100644
index 0000000..7612f26
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PT2ThicketPhraseBuilder.java
@@ -0,0 +1,421 @@
+package opennlp.tools.parse_thicket.matching;

+

+import java.util.ArrayList;

+import java.util.HashMap;

+import java.util.List;

+import java.util.Map;

+

+import opennlp.tools.parse_thicket.ParseThicket;

+import opennlp.tools.parse_thicket.ParseTreeNode;

+import opennlp.tools.parse_thicket.WordWordInterSentenceRelationArc;

+import opennlp.tools.parse_thicket.rhetoric_structure.RhetoricStructureArcsBuilder;

+

+import org.jgrapht.Graph;

+import org.jgrapht.graph.DefaultEdge;

+import org.jgrapht.graph.SimpleGraph;

+

+

+import edu.stanford.nlp.trees.Tree;

+

+public class PT2ThicketPhraseBuilder {

+	

+	RhetoricStructureArcsBuilder rstBuilder = new RhetoricStructureArcsBuilder();

+	

+	/*

+	 * Building phrases takes a Parse Thicket and forms phrases for each sentence individually

+	 * Then based on built phrases and obtained arcs, it builds arcs for RST

+	 * Finally, based on all formed arcs, it extends phrases with thicket phrases

+	 */

+

+	public List<List<ParseTreeNode>> buildPT2ptPhrases(ParseThicket pt ) {

+		List<List<ParseTreeNode>> phrasesAllSent = new ArrayList<List<ParseTreeNode>> ();

+		Map<Integer, List<List<ParseTreeNode>>> sentNumPhrases = new HashMap<Integer, List<List<ParseTreeNode>>>();

+		// build regular phrases

+		for(int nSent=0; nSent<pt.getSentences().size(); nSent++){

+			

+			

+			List<ParseTreeNode> sentence = pt.getNodesThicket().get(nSent);

+			Tree ptree = pt.getSentences().get(nSent);

+			//ptree.pennPrint();

+			List<List<ParseTreeNode>> phrases = buildPT2ptPhrasesForASentence(ptree, sentence);

+			System.out.println(phrases);

+			phrasesAllSent.addAll(phrases);

+			sentNumPhrases.put(nSent, phrases);

+

+		}

+		

+		// discover and add RST arcs

+		List<WordWordInterSentenceRelationArc> arcsRST =

+				rstBuilder.buildRSTArcsFromMarkersAndCorefs(pt.getArcs(), sentNumPhrases, pt);

+		

+		List<WordWordInterSentenceRelationArc> arcs = pt.getArcs();

+		arcs.addAll(arcsRST);

+		pt.setArcs(arcs);

+		

+		

+		List<List<ParseTreeNode>> expandedPhrases = expandTowardsThicketPhrases(phrasesAllSent, pt.getArcs(), sentNumPhrases, pt);

+		return expandedPhrases;

+	}

+

+/* Take all phrases, all arcs and merge phrases into Thicket phrases.

+ * Then add the set of generalized (Thicket) phrases to the input set of phrases

+ * phrasesAllSent - list of lists of phrases for each sentence

+ * sentNumPhrase - map , gives for each sentence id, the above list

+ * arcs - arcs formed so far

+ * pt - the built Parse Thicket

+ */

+	private List<List<ParseTreeNode>> expandTowardsThicketPhrases(

+			List<List<ParseTreeNode>> phrasesAllSent,

+			List<WordWordInterSentenceRelationArc> arcs,

+			Map<Integer, List<List<ParseTreeNode>>> sentNumPhrases, 

+			ParseThicket pt ) {

+		List<List<ParseTreeNode>> thicketPhrasesAllSent = new ArrayList<List<ParseTreeNode>>();

+		

+		

+			for(int nSent=0; nSent<pt.getSentences().size(); nSent++){

+				for(int mSent=nSent+1; mSent<pt.getSentences().size(); mSent++){

+					// for given arc, find phrases connected by this arc and add to the list of phrases

+					for(WordWordInterSentenceRelationArc arc: arcs){

+						List<List<ParseTreeNode>> phrasesFrom = sentNumPhrases.get(nSent);

+						List<List<ParseTreeNode>> phrasesTo = sentNumPhrases.get(mSent);

+						int fromIndex = arc.getCodeFrom().getFirst();

+						int toIndex = arc.getCodeTo().getFirst();

+						if (nSent==fromIndex && mSent==toIndex){

+							int sentPosFrom = arc.getCodeFrom().getSecond();

+							int sentPosTo = arc.getCodeTo().getSecond();

+							// for the given arc arc, find phrases which are connected by it

+							List<ParseTreeNode> lFromFound = null, lToFound = null;

+							for(List<ParseTreeNode> lFrom: phrasesFrom){

+								if (lToFound!=null)

+									break;

+								for(ParseTreeNode lFromP: lFrom){

+									if (lFromP.getId()!=null &&  lFromP.getId()==sentPosFrom){

+											lFromFound = lFrom;

+											break;

+										}

+								}

+							}

+							for(List<ParseTreeNode> lTo: phrasesTo){

+								if (lToFound!=null)

+									break;

+								for(ParseTreeNode lToP: lTo)

+									if (lToP.getId()!=null && lToP.getId()==sentPosTo){

+										lToFound = lTo;

+										break;

+									}

+							}

+							// obtain a thicket phrase and add it to the list

+							if (lFromFound!=null && lToFound!=null){

+								

+								if (identicalSubPhrase(lFromFound, lToFound))

+									continue;

+								List<ParseTreeNode> appended = append(lFromFound, lToFound);

+								if (thicketPhrasesAllSent.contains(appended))

+									continue;

+								System.out.println("rel: "+arc);

+								System.out.println("From "+lFromFound);

+								System.out.println("TO "+lToFound);

+								thicketPhrasesAllSent.add(append(lFromFound, lToFound));	

+								//break;

+							}

+						}

+						

+					}

+				}

+			}

+			phrasesAllSent.addAll(thicketPhrasesAllSent);

+			return phrasesAllSent;

+	}

+

+/* check that one phrase is subphrase of another by lemma (ignoring other node properties)

+ * returns true if not found different word

+ */

+	

+	private boolean identicalSubPhrase(List<ParseTreeNode> lFromFound,

+			List<ParseTreeNode> lToFound) {

+		for(int pos=0; pos<lFromFound.size()&& pos<lToFound.size(); pos++){

+			if (!lFromFound.get(pos).getWord().equals(lToFound.get(pos).getWord()))

+				return false;

+		}

+		return true;

+	}

+

+	private List<ParseTreeNode> append(List<ParseTreeNode> lFromFound,

+			List<ParseTreeNode> lToFound) {

+		List<ParseTreeNode> appendList = new ArrayList<ParseTreeNode>();

+		appendList.addAll(lFromFound);

+		appendList.addAll(lToFound);

+		return appendList;

+	}

+

+

+	public List<List<ParseTreeNode>> buildPT2ptPhrasesForASentence(Tree tree, List<ParseTreeNode> sentence ) {

+		List<List<ParseTreeNode>> phrases;

+

+		phrases = new ArrayList<List<ParseTreeNode>>();		

+		navigateR(tree, sentence, phrases);

+

+		return phrases;

+	}

+

+

+	

+

+/*

+ * 

+[[<1>NP'Iran':NNP], [<2>VP'refuses':VBZ, <3>VP'to':TO, <4>VP'accept':VB, <5>VP'the':DT, <6>VP'UN':NNP, 

+<7>VP'proposal':NN, <8>VP'to':TO, <9>VP'end':VB, <10>VP'its':PRP$, <11>VP'dispute':NN, <12>VP'over':IN, <13>VP'its':PRP$,

+ <14>VP'work':NN, <15>VP'on':IN, <16>VP'nuclear':JJ, <17>VP'weapons':NNS], [<3>VP'to':TO, <4>VP'accept':VB, <5>VP'the':DT,

+  <6>VP'UN':NNP, <7>VP'proposal':NN, <8>VP'to':TO, <9>VP'end':VB, <10>VP'its':PRP$, <11>VP'dispute':NN, <12>VP'over':IN, 

+  <13>VP'its':PRP$, <14>VP'work':NN, <15>VP'on':IN, <16>VP'nuclear':JJ, <17>VP'weapons':NNS], [<4>VP'accept':VB, 

+  <5>VP'the':DT, <6>VP'UN':NNP, <7>VP'proposal':NN, <8>VP'to':TO, <9>VP'end':VB, <10>VP'its':PRP$, <11>VP'dispute':NN, 

+  <12>VP'over':IN, <13>VP'its':PRP$, <14>VP'work':NN, <15>VP'on':IN, <16>VP'nuclear':JJ, <17>VP'weapons':NNS], 

+  [<5>NP'the':DT, <6>NP'UN':NNP, <7>NP'proposal':NN], [<8>VP'to':TO, <9>VP'end':VB, <10>VP'its':PRP$, <11>VP'dispute':NN, 

+  <12>VP'over':IN, <13>VP'its':PRP$, <14>VP'work':NN, <15>VP'on':IN, <16>VP'nuclear':JJ, <17>VP'weapons':NNS], 

+  [<9>VP'end':VB, <10>VP'its':PRP$, <11>VP'dispute':NN, <12>VP'over':IN, <13>VP'its':PRP$, <14>VP'work':NN, <15>VP'on':IN,

+   <16>VP'nuclear':JJ, <17>VP'weapons':NNS], [<10>NP'its':PRP$, <11>NP'dispute':NN], [<12>PP'over':IN, <13>PP'its':PRP$, 

+   <14>PP'work':NN, <15>PP'on':IN, <16>PP'nuclear':JJ, <17>PP'weapons':NNS], [<13>NP'its':PRP$, <14>NP'work':NN, 

+   <15>NP'on':IN, <16>NP'nuclear':JJ, <17>NP'weapons':NNS], [<13>NP'its':PRP$, <14>NP'work':NN],

+ [<15>PP'on':IN, <16>PP'nuclear':JJ, <17>PP'weapons':NNS], [<16>NP'nuclear':JJ, <17>NP'weapons':NNS]]

+ *  

+ * 

+ */

+	private void navigateR(Tree t, List<ParseTreeNode> sentence,

+			List<List<ParseTreeNode>> phrases) {

+		if (!t.isPreTerminal()) {

+			if (t.label() != null) {

+				if (t.value() != null) {

+					// if ROOT or S, returns empty

+					List<ParseTreeNode> nodes = parsePhrase(t.label().value(), t.toString());

+					nodes = assignIndexToNodes(nodes, sentence);

+					if (!nodes.isEmpty())

+						phrases.add(nodes);

+					if (nodes.size()>0 && nodes.get(0).getId()==null){

+							System.err.println("Failed alignment:"+nodes);

+					}

+				}

+			}

+			Tree[] kids = t.children();

+			if (kids != null) {

+				for (Tree kid : kids) {

+					navigateR(kid,sentence,  phrases);

+				}

+			}

+			return ;

+		}

+	}

+	

+	

+	/* alignment of phrases extracted from tree against the sentence as a list of lemma-pos */

+	

+	private List<ParseTreeNode> assignIndexToNodes(List<ParseTreeNode> node,

+			List<ParseTreeNode> sentence) {

+		if (sentence==null || sentence.size()<1)

+			return node;

+		

+		List<ParseTreeNode> results = new ArrayList<ParseTreeNode>();

+		

+		for(int i= 0; i<node.size(); i++){

+			String thisLemma = node.get(i).getWord();			

+			String thisPOS = node.get(i).getPos();

+			String nextLemma = null, nextPOS = null;

+			

+			if (i+1<node.size()){

+				nextLemma = node.get(i+1).getWord();

+				nextPOS = node.get(i+1).getPos();

+			}

+			Boolean matchOccurred = false;

+			int j = 0;

+			for(j= 0; j<sentence.size(); j++){

+				if (!(sentence.get(j).getWord().equals(thisLemma) && (sentence.get(j).getPos().equals(thisPOS))))

+					continue;

+				if (i+1<node.size() && j+1 < sentence.size() && nextLemma!=null 

+						&& ! (sentence.get(j+1).getWord().equals(nextLemma)

+					  && sentence.get(j+1).getPos().equals(nextPOS)))

+					continue;

+				matchOccurred = true;

+				break;

+			}

+			

+			ParseTreeNode n = node.get(i);

+			if (matchOccurred){

+				n.setId(sentence.get(j).getId());

+				n.setNe(sentence.get(j).getNe());

+			}

+			results.add(n);

+		}

+		

+		try {

+			if (results!=null && results.size()>1 && results.get(0)!=null && results.get(0).getId()!=null &&

+					results.get(1) !=null && results.get(1).getId()!=null &&  results.get(1).getId()>0){

+				ParseTreeNode p = results.get(0);

+				p.setId(results.get(1).getId()-1);

+				results.set(0, p);

+			}

+		} catch (Exception e) {

+			// TODO Auto-generated catch block

+			e.printStackTrace();

+		}

+		return results;

+	}

+

+

+	/*

+	 * [[NP'':], ['(NNP':Iran)], [VP'':], ['(VBZ':refuses)], [VP'':], ['(TO':to)], [VP'':], ['(VB':accept)], [NP'':], 

+	 * ['(DT':the)], ['(NNP':UN)], ['(NN':proposal)], [VP'':], ['(TO':to)], [VP'':], ['(VB':end)], [NP'':], 

+	 * ['(PRP$':its)], ['(NN':dispute)], [PP'':], ['(IN':over)], [NP'':], [NP'':],

+	 *  ['(PRP$':its)], ['(NN':work)], [PP'':], ['(IN':on)], [NP'':], ['(JJ':nuclear)], ['(NNS':weapons)], ['(.':.)]]

+	 * 

+	 * [[NP'':], ['(NNP':Iran)],

+ [VP'':], ['(VBZ':refuses)], 

+ [VP'':], ['(TO':to)], 

+ [VP'':], ['(VB':accept)], 

+    [NP'':], ['(DT':the)], ['(NNP':UN)], ['(NN':proposal)], 

+    [VP'':], ['(TO':to)], [VP'':], ['(VB':end)], 

+    [NP'':], ['(PRP$':its)], ['(NN':dispute)], 

+        [PP'':], ['(IN':over)], 

+            [NP'':], [NP'':], ['(PRP$':its)], ['(NN':work)], 

+              [PP'':], ['(IN':on)], 

+                [NP'':], ['(JJ':nuclear)], ['(NNS':weapons)], 

+['(.':.)]]

+	 */

+	private void navigateR1(Tree t, List<ParseTreeNode> sentence, int l,

+			List<List<ParseTreeNode>> phrases) {

+		if (t.isPreTerminal()) {

+			if (t.label() != null) {

+				List<ParseTreeNode> node = parsePhrase(t.toString());	

+				if (!node.isEmpty())

+					phrases.add(node);

+			}

+			return;

+		} else {

+			if (t.label() != null) {

+				if (t.value() != null) {

+					List<ParseTreeNode> node = parsePhrase(t.label().value());		 

+					if (!node.isEmpty())

+						phrases.add(node);

+				}

+			}

+			Tree[] kids = t.children();

+			if (kids != null) {

+				for (Tree kid : kids) {

+					navigateR1(kid,sentence,  l, phrases);

+				}

+			}

+			return ;

+		}

+	}

+

+

+	protected List<ParseTreeNode> parsePhrase(String value) {

+		List<ParseTreeNode> nlist = new ArrayList<ParseTreeNode>(); 

+		if (value==null)

+			return nlist;

+		if (value.equals("ROOT")|| value.equals("S")) 

+			return nlist;

+		

+		String[] pos_value = value.split(" ");

+		ParseTreeNode node = null;

+		if (value.endsWith("P")){

+			node = new ParseTreeNode("", ""); 

+		    node.setPhraseType(value);

+		} else 

+		if (pos_value != null && pos_value.length==2){

+			node = new ParseTreeNode(pos_value[0], pos_value[1]);

+		} else {

+			node = new ParseTreeNode(value, "");

+		}

+			

+		nlist.add(node);

+		return nlist;

+	}

+	

+	private ParseTreeNode parsePhraseNode(String value) {

+		

+		if (value.equals("ROOT")|| value.equals("S")) 

+			return null;

+		

+		String[] pos_value = value.split(" ");

+		ParseTreeNode node = null;

+		if (value.endsWith("P")){

+			node = new ParseTreeNode("", ""); 

+		    node.setPhraseType(value);

+		} else 

+		if (pos_value != null && pos_value.length==2){

+			node = new ParseTreeNode(pos_value[0], pos_value[1]);

+		} else {

+			node = new ParseTreeNode(value, "");

+		}			

+		

+		return node;

+	}

+	

+	public List<ParseTreeNode> parsePhrase(String value, String fullDump) {

+		

+		List<ParseTreeNode> nlist = new ArrayList<ParseTreeNode>(); 

+		if (value.equals("S")|| value.equals("ROOT"))

+				return nlist;

+		

+		String flattened = fullDump.replace("(ROOT","").replace("(NP","").replace("(VP","").replace("(PP","")

+				.replace("(ADVP","").replace("(UCP","").replace("(ADJP","").replace("(SBAR","").

+				replace("(PRT", "").replace("(WHNP","").

+				 replace("))))",")").replace(")))",")").replace("))",")")

+				.replace("   ", " ").replace("  ", " ").replace("(S","")

+				.replace(") (","#").replace(")  (", "#");

+		String[] flattenedArr =  flattened.split("#");

+		for(String term: flattenedArr){

+			term = term.replace('(', ' ').replace(')',' ').trim();

+			if (term!=null && term.split(" ")!=null && term.split(" ").length==2){

+				ParseTreeNode node = new ParseTreeNode(term.split(" ")[1],term.split(" ")[0] );

+				node.setPhraseType(value);

+				nlist.add(node);

+			}

+		}

+		return nlist;

+	}

+	

+/* recursion example */

+	

+	private StringBuilder toStringBuilder(StringBuilder sb, Tree t) {

+		if (t.isLeaf()) {

+			if (t.label() != null) {

+				sb.append(t.label().value());

+			}

+			return sb;

+		} else {

+			sb.append('(');

+			if (t.label() != null) {

+				if (t.value() != null) {

+					sb.append(t.label().value());

+				}

+			}

+			Tree[] kids = t.children();

+			if (kids != null) {

+				for (Tree kid : kids) {

+					sb.append(' ');

+					toStringBuilder(sb, kid);

+				}

+			}

+			return sb.append(')');

+		}

+	}

+	

+	public static void main(String[] args){

+		PT2ThicketPhraseBuilder phraseBuilder = new PT2ThicketPhraseBuilder();

+		String line = "(NP (NNP Iran)) (VP (VBZ refuses) (S (VP (TO to) (VP (VB accept) (S (NP (DT the) " +

+				"(NNP UN) (NN proposal)) (VP (TO to) (VP (VB end) (NP (PRP$ its) (NN dispute))))))))";

+		

+		List<ParseTreeNode> res = phraseBuilder. parsePhrase("NP", line);

+		System.out.println(res);

+		

+

+		line = "(VP (VBP am) (NP (NP (DT a) (NNP US) (NN citizen)) (UCP (VP (VBG living) (ADVP (RB abroad))) (, ,) (CC and) (ADJP (JJ concerned) (PP (IN about) (NP (NP (DT the) (NN health) (NN reform) (NN regulation)) (PP (IN of) (NP (CD 2014)))))))))";

+		res = phraseBuilder. parsePhrase("VP", line);

+		System.out.println(res);

+				

+		line = "(VP (TO to) (VP (VB wait) (SBAR (IN till) (S (NP (PRP I)) (VP (VBP am) (ADJP (JJ sick) (S (VP (TO to) (VP (VB buy) (NP (NN health) (NN insurance)))))))))))";

+		res = phraseBuilder. parsePhrase("VP", line);

+		System.out.println(res);

+	}

+  

+}

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreeChunkListScorer.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreeChunkListScorer.java
new file mode 100644
index 0000000..21e7f52
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreeChunkListScorer.java
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.parse_thicket.matching;
+
+import java.util.List;
+
+public class ParseTreeChunkListScorer {
+  // find the single expression with the highest score
+  public double getParseTreeChunkListScore(
+      List<List<ParseTreePath>> matchResult) {
+    double currScore = 0.0;
+    for (List<ParseTreePath> chunksGivenPhraseType : matchResult)
+      for (ParseTreePath chunk : chunksGivenPhraseType) {
+        Double score = getScore(chunk);
+        // System.out.println(chunk+ " => score >>> "+score);
+        if (score > currScore) {
+          currScore = score;
+        }
+      }
+    return currScore;
+  }
+
+  // get max score per phrase type and then sum up
+  public double getParseTreeChunkListScoreAggregPhraseType(
+      List<List<ParseTreePath>> matchResult) {
+    double currScoreTotal = 0.0;
+    for (List<ParseTreePath> chunksGivenPhraseType : matchResult) {
+      double currScorePT = 0.0;
+      for (ParseTreePath chunk : chunksGivenPhraseType) {
+        Double score = getScore(chunk);
+        // System.out.println(chunk+ " => score >>> "+score);
+        if (score > currScorePT) {
+          currScorePT = score;
+        }
+      }
+      // if substantial for given phrase type
+      if (currScorePT > 0.5) {
+        currScoreTotal += currScorePT;
+      }
+    }
+    return currScoreTotal;
+  }
+
+  // score is meaningful only for chunks which are results of generalization
+
+  public double getScore(ParseTreePath chunk) {
+    double score = 0.0;
+    int i = 0;
+    for (String l : chunk.getLemmas()) {
+      String pos = chunk.getPOSs().get(i);
+      if (l.equals("*")) {
+        if (pos.startsWith("CD")) { // number vs number gives high score
+                                    // although different numbers
+          score += 0.7;
+        } else if (pos.endsWith("_high")) { // if query modification adds 'high'
+          score += 1.0;
+        } else {
+          score += 0.1;
+        }
+      } else {
+
+        if (pos.startsWith("NN") || pos.startsWith("NP")
+            || pos.startsWith("CD") || pos.startsWith("RB")) {
+          score += 1.0;
+        } else if (pos.startsWith("VB") || pos.startsWith("JJ")) {
+          if (l.equals("get")) { // 'common' verbs are not that important
+            score += 0.3;
+          } else {
+            score += 0.5;
+          }
+        } else {
+          score += 0.3;
+        }
+      }
+      i++;
+
+    }
+    return score;
+  }
+
+}
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreePath.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreePath.java
new file mode 100644
index 0000000..d0bf61f
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreePath.java
@@ -0,0 +1,422 @@
+/*

+ * Licensed to the Apache Software Foundation (ASF) under one or more

+ * contributor license agreements.  See the NOTICE file distributed with

+ * this work for additional information regarding copyright ownership.

+ * The ASF licenses this file to You under the Apache License, Version 2.0

+ * (the "License"); you may not use this file except in compliance with

+ * the License. You may obtain a copy of the License at

+ *

+ *     http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+

+package opennlp.tools.parse_thicket.matching;

+

+import java.util.ArrayList;

+import java.util.List;

+

+import opennlp.tools.textsimilarity.LemmaPair;

+

+public class ParseTreePath {

+  private String mainPOS;

+

+  private List<String> lemmas;

+

+  private List<String> POSs;

+  //order number of a word in a sentence

+  private List<Integer> wordUniqueCodes;

+

+  private int startPos;

+

+  private int endPos;

+

+  private int size;

+

+  private ParseTreePathMatcher parseTreeMatcher;

+

+  private LemmaFormManager lemmaFormManager;

+

+  private GeneralizationListReducer generalizationListReducer;

+

+  public ParseTreePath() {

+  }

+

+  public ParseTreePath(List<String> lemmas, List<String> POSs, int startPos,

+      int endPos) {

+    this.lemmas = lemmas;

+    this.POSs = POSs;

+    this.startPos = startPos;

+    this.endPos = endPos;

+

+  }

+

+  // constructor which takes lemmas and POS as lists so that phrases can be

+  // conveniently specified.

+  // usage: stand-alone runs

+  public ParseTreePath(String mPOS, String[] lemmas, String[] POSss) {

+    this.mainPOS = mPOS;

+    this.lemmas = new ArrayList<String>();

+    for (String l : lemmas) {

+      this.lemmas.add(l);

+    }

+    if (mPOS.equals("SENTENCE")){

+    	for(int i=0; i<lemmas.length; i++){

+    		wordUniqueCodes.add(this.lemmas.get(i).hashCode());

+    	}

+    }

+    

+    this.POSs = new ArrayList<String>();

+    for (String p : POSss) {

+      this.POSs.add(p);

+    }

+  }

+

+  // constructor which takes lemmas and POS as lists so that phrases can be

+  // conveniently specified.

+  // usage: stand-alone runs

+  public ParseTreePath(String mPOS, List<String> lemmas, List<String> POSss) {

+    this.mainPOS = mPOS;

+    this.lemmas = lemmas;

+    this.POSs = POSss;

+

+  }

+

+  // Before:

+  // [0(S-At home we like to eat great pizza deals), 0(PP-At home), 0(IN-At),

+  // 3(NP-home), 3(NN-home), 8(NP-we),

+  // 8(PRP-we), 11(VP-like to eat great pizza deals), 11(VBP-like), 16(S-to eat

+  // great pizza deals), 16(VP-to eat great

+  // pizza deals),

+  // 16(TO-to), 19(VP-eat great pizza deals), 19(VB-eat), 23(NP-great pizza

+  // deals), 23(JJ-great), 29(NN-pizza),

+  // 35(NNS-deals)]

+

+  // After:

+  // [S [IN-At NP-home NP-we VBP-like ], PP [IN-At NP-home ], IN [IN-At ], NP

+  // [NP-home ], NN [NP-home ], NP [NP-we ],

+  // PRP [NP-we ], VP [VBP-like TO-to VB-eat JJ-great ], VBP [VBP-like ], S

+  // [TO-to VB-eat JJ-great NN-pizza ], VP

+  // [TO-to VB-eat JJ-great NN-pizza ], TO [TO-to ], VP [VB-eat JJ-great

+  // NN-pizza NNS-deals ],

+  // VB [VB-eat ], NP [JJ-great NN-pizza NNS-deals ], JJ [JJ-great ], NN

+  // [NN-pizza ], NNS [NNS-deals ]]

+

+  public List<ParseTreePath> buildChunks(List<LemmaPair> parseResults) {

+    List<ParseTreePath> chunksResults = new ArrayList<ParseTreePath>();

+    for (LemmaPair chunk : parseResults) {

+      String[] lemmasAr = chunk.getLemma().split(" ");

+      List<String> poss = new ArrayList<String>(), lems = new ArrayList<String>();

+      for (String lem : lemmasAr) {

+        lems.add(lem);

+        // now looking for POSs for individual word

+        for (LemmaPair chunkCur : parseResults) {

+          if (chunkCur.getLemma().equals(lem)

+              &&

+              // check that this is a proper word in proper position

+              chunkCur.getEndPos() <= chunk.getEndPos()

+              && chunkCur.getStartPos() >= chunk.getStartPos()) {

+            poss.add(chunkCur.getPOS());

+            break;

+          }

+        }

+      }

+      if (lems.size() != poss.size()) {

+        System.err.println("lems.size()!= poss.size()");

+      }

+      if (lems.size() < 2) { // single word phrase, nothing to match

+        continue;

+      }

+      ParseTreePath ch = new ParseTreePath(lems, poss, chunk.getStartPos(),

+          chunk.getEndPos());

+      ch.setMainPOS(chunk.getPOS());

+      chunksResults.add(ch);

+    }

+    return chunksResults;

+  }

+

+  public List<List<ParseTreePath>> matchTwoSentencesGivenPairLists(

+      List<LemmaPair> sent1Pairs, List<LemmaPair> sent2Pairs) {

+

+    List<ParseTreePath> chunk1List = buildChunks(sent1Pairs);

+    List<ParseTreePath> chunk2List = buildChunks(sent2Pairs);

+

+    List<List<ParseTreePath>> sent1GrpLst = groupChunksAsParses(chunk1List);

+    List<List<ParseTreePath>> sent2GrpLst = groupChunksAsParses(chunk2List);

+

+    System.out.println("=== Grouped chunks 1 " + sent1GrpLst);

+    System.out.println("=== Grouped chunks 2 " + sent2GrpLst);

+

+    return matchTwoSentencesGroupedChunks(sent1GrpLst, sent2GrpLst);

+  }

+

+  // groups noun phrases, verb phrases, propos phrases etc. for separate match

+

+  public List<List<ParseTreePath>> groupChunksAsParses(

+      List<ParseTreePath> parseResults) {

+    List<ParseTreePath> np = new ArrayList<ParseTreePath>(), vp = new ArrayList<ParseTreePath>(), prp = new ArrayList<ParseTreePath>(), sbarp = new ArrayList<ParseTreePath>(), pp = new ArrayList<ParseTreePath>(), adjp = new ArrayList<ParseTreePath>(), whadvp = new ArrayList<ParseTreePath>(), restOfPhrasesTypes = new ArrayList<ParseTreePath>();

+    List<List<ParseTreePath>> results = new ArrayList<List<ParseTreePath>>();

+    for (ParseTreePath ch : parseResults) {

+      String mainPos = ch.getMainPOS().toLowerCase();

+

+      if (mainPos.equals("s")) {

+        continue;

+      }

+      if (mainPos.equals("np")) {

+        np.add(ch);

+      } else if (mainPos.equals("vp")) {

+        vp.add(ch);

+      } else if (mainPos.equals("prp")) {

+        prp.add(ch);

+      } else if (mainPos.equals("pp")) {

+        pp.add(ch);

+      } else if (mainPos.equals("adjp")) {

+        adjp.add(ch);

+      } else if (mainPos.equals("whadvp")) {

+        whadvp.add(ch);

+      } else if (mainPos.equals("sbar")) {

+        sbarp.add(ch);

+      } else {

+        restOfPhrasesTypes.add(ch);

+      }

+

+    }

+    results.add(np);

+    results.add(vp);

+    results.add(prp);

+    results.add(pp);

+    results.add(adjp);

+    results.add(whadvp);

+    results.add(restOfPhrasesTypes);

+

+    return results;

+

+  }

+

+  // main function to generalize two expressions grouped by phrase types

+  // returns a list of generalizations for each phrase type with filtered

+  // sub-expressions

+  public List<List<ParseTreePath>> matchTwoSentencesGroupedChunks(

+      List<List<ParseTreePath>> sent1, List<List<ParseTreePath>> sent2) {

+    List<List<ParseTreePath>> results = new ArrayList<List<ParseTreePath>>();

+    // first irerate through component

+    for (int comp = 0; comp < 2 && // just np & vp

+        comp < sent1.size() && comp < sent2.size(); comp++) {

+      List<ParseTreePath> resultComps = new ArrayList<ParseTreePath>();

+      // then iterate through each phrase in each component

+      for (ParseTreePath ch1 : sent1.get(comp)) {

+        for (ParseTreePath ch2 : sent2.get(comp)) { // simpler version

+          ParseTreePath chunkToAdd = parseTreeMatcher

+              .generalizeTwoGroupedPhrasesRandomSelectHighestScoreWithTransforms(

+                  ch1, ch2);

+

+          if (!lemmaFormManager.mustOccurVerifier(ch1, ch2, chunkToAdd)) {

+            continue; // if the words which have to stay do not stay, proceed to

+                      // other elements

+          }

+          Boolean alreadyThere = false;

+          for (ParseTreePath chunk : resultComps) {

+            if (chunk.equalsTo(chunkToAdd)) {

+              alreadyThere = true;

+              break;

+            }

+

+            if (parseTreeMatcher

+                .generalizeTwoGroupedPhrasesRandomSelectHighestScore(chunk,

+                    chunkToAdd).equalsTo(chunkToAdd)) {

+              alreadyThere = true;

+              break;

+            }

+          }

+

+          if (!alreadyThere) {

+            resultComps.add(chunkToAdd);

+          }

+

+          List<ParseTreePath> resultCompsReduced = generalizationListReducer

+              .applyFilteringBySubsumption(resultComps);

+          // if (resultCompsReduced.size() != resultComps.size())

+          // System.out.println("reduction of gen list occurred");

+        }

+      }

+      results.add(resultComps);

+    }

+

+    return results;

+  }

+

+  public Boolean equals(ParseTreePath ch) {

+    List<String> lems = ch.getLemmas();

+    List<String> poss = ch.POSs;

+

+    if (this.lemmas.size() <= lems.size())

+      return false; // sub-chunk should be shorter than chunk

+

+    for (int i = 0; i < lems.size() && i < this.lemmas.size(); i++) {

+      if (!(this.lemmas.get(i).equals(lems.get(i)) && this.POSs.get(i).equals(

+          poss.get(i))))

+        return false;

+    }

+    return true;

+  }

+

+  // 'this' is super - chunk of ch, ch is sub-chunk of 'this'

+  public Boolean isASubChunk(ParseTreePath ch) {

+    List<String> lems = ch.getLemmas();

+    List<String> poss = ch.POSs;

+

+    if (this.lemmas.size() < lems.size())

+      return false; // sub-chunk should be shorter than chunk

+

+    for (int i = 0; i < lems.size() && i < this.lemmas.size(); i++) {

+      if (!(this.lemmas.get(i).equals(lems.get(i)) && this.POSs.get(i).equals(

+          poss.get(i))))

+        return false;

+    }

+    return true;

+  }

+

+  public Boolean equalsTo(ParseTreePath ch) {

+    List<String> lems = ch.getLemmas();

+    List<String> poss = ch.POSs;

+    if (this.lemmas.size() != lems.size() || this.POSs.size() != poss.size())

+      return false;

+

+    for (int i = 0; i < lems.size(); i++) {

+      if (!(this.lemmas.get(i).equals(lems.get(i)) && this.POSs.get(i).equals(

+          poss.get(i))))

+        return false;

+    }

+

+    return true;

+  }

+

+  public String toString() {

+    String buf = " [";

+    if (mainPOS != null)

+      buf = mainPOS + " [";

+    for (int i = 0; i < lemmas.size() && i < POSs.size() // && i<=3

+    ; i++) {

+      buf += POSs.get(i) + "-" + lemmas.get(i) + " ";

+    }

+    return buf + "]";

+  }

+

+  public int compareTo(ParseTreePath o) {

+    if (this.size > o.size)

+      return -1;

+    else

+      return 1;

+

+  }

+

+  public String listToString(List<List<ParseTreePath>> chunks) {

+    StringBuffer buf = new StringBuffer();

+    if (chunks.get(0).size() > 0) {

+      buf.append(" np " + chunks.get(0).toString());

+    }

+    if (chunks.get(1).size() > 0) {

+      buf.append(" vp " + chunks.get(1).toString());

+    }

+    if (chunks.size() < 3) {

+      return buf.toString();

+    }

+    if (chunks.get(2).size() > 0) {

+      buf.append(" prp " + chunks.get(2).toString());

+    }

+    if (chunks.get(3).size() > 0) {

+      buf.append(" pp " + chunks.get(3).toString());

+    }

+    if (chunks.get(4).size() > 0) {

+      buf.append(" adjp " + chunks.get(4).toString());

+    }

+    if (chunks.get(5).size() > 0) {

+      buf.append(" whadvp " + chunks.get(5).toString());

+    }

+    /*

+     * if (mainPos.equals("np")) np.add(ch); else if (mainPos.equals( "vp"))

+     * vp.add(ch); else if (mainPos.equals( "prp")) prp.add(ch); else if

+     * (mainPos.equals( "pp")) pp.add(ch); else if (mainPos.equals( "adjp"))

+     * adjp.add(ch); else if (mainPos.equals( "whadvp")) whadvp.add(ch);

+     */

+    return buf.toString();

+  }

+

+  public List<List<ParseTreePath>> obtainParseTreeChunkListByParsingList(

+      String toParse) {

+    List<List<ParseTreePath>> results = new ArrayList<List<ParseTreePath>>();

+    // if (toParse.endsWith("]]]")){

+    // toParse = toParse.replace("[[","").replace("]]","");

+    // }

+    toParse = toParse.replace(" ]], [ [", "&");

+    String[] phraseTypeFragments = toParse.trim().split("&");

+    for (String toParseFragm : phraseTypeFragments) {

+      toParseFragm = toParseFragm.replace("],  [", "#");

+

+      List<ParseTreePath> resultsPhraseType = new ArrayList<ParseTreePath>();

+      String[] indivChunks = toParseFragm.trim().split("#");

+      for (String expr : indivChunks) {

+        List<String> lems = new ArrayList<String>(), poss = new ArrayList<String>();

+        expr = expr.replace("[", "").replace(" ]", "");

+        String[] pairs = expr.trim().split(" ");

+        for (String word : pairs) {

+          word = word.replace("]]", "").replace("]", "");

+          String[] pos_lem = word.split("-");

+          lems.add(pos_lem[1].trim());

+          poss.add(pos_lem[0].trim());

+        }

+        ParseTreePath ch = new ParseTreePath();

+        ch.setLemmas(lems);

+        ch.setPOSs(poss);

+        resultsPhraseType.add(ch);

+      }

+      results.add(resultsPhraseType);

+    }

+    System.out.println(results);

+    return results;

+

+    // 2.1 | Vietnam <b>embassy</b> <b>in</b> <b>Israel</b>: information on how

+    // to get your <b>visa</b> at Vietnam

+    // <b>embassy</b> <b>in</b> <b>Israel</b>. <b>...</b> <b>Spain</b>.

+    // Scotland. Sweden. Slovakia. Switzerland. T

+    // [Top of Page] <b>...</b>

+    // [[ [NN-* IN-in NP-israel ], [NP-* IN-in NP-israel ], [NP-* IN-* TO-* NN-*

+    // ], [NN-visa IN-* NN-* IN-in ]], [

+    // [VB-get NN-visa IN-* NN-* IN-in .-* ], [VBD-* IN-* NN-* NN-* .-* ], [VB-*

+    // NP-* ]]]

+

+  }

+

+  public void setMainPOS(String mainPOS) {

+    this.mainPOS = mainPOS;

+  }

+

+  public String getMainPOS() {

+    return mainPOS;

+  }

+

+  public List<String> getLemmas() {

+    return lemmas;

+  }

+

+  public void setLemmas(List<String> lemmas) {

+    this.lemmas = lemmas;

+  }

+

+  public List<String> getPOSs() {

+    return POSs;

+  }

+

+  public void setPOSs(List<String> pOSs) {

+    POSs = pOSs;

+  }

+

+  public ParseTreePathMatcher getParseTreeMatcher() {

+    return parseTreeMatcher;

+  }

+

+}

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreePathComparable.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreePathComparable.java
new file mode 100644
index 0000000..539c61e
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreePathComparable.java
@@ -0,0 +1,32 @@
+/*

+ * Licensed to the Apache Software Foundation (ASF) under one or more

+ * contributor license agreements.  See the NOTICE file distributed with

+ * this work for additional information regarding copyright ownership.

+ * The ASF licenses this file to You under the Apache License, Version 2.0

+ * (the "License"); you may not use this file except in compliance with

+ * the License. You may obtain a copy of the License at

+ *

+ *     http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+

+package opennlp.tools.parse_thicket.matching;

+

+import java.util.Comparator;

+

+public class ParseTreePathComparable implements Comparator<ParseTreePath> {

+  public int compare(ParseTreePath ch1, ParseTreePath ch2) {

+    for (int i = 0; i < ch1.getLemmas().size() && i < ch2.getLemmas().size(); i++) {

+      if (!(ch1.getLemmas().get(i).equals(ch2.getLemmas().get(i)) && ch1

+          .getPOSs().get(i).equals(ch2.getPOSs().get(i))))

+        return -1;

+    }

+    return 0;

+

+  }

+}

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreePathMatcher.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreePathMatcher.java
new file mode 100644
index 0000000..7323a8e
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreePathMatcher.java
@@ -0,0 +1,254 @@
+/*

+ * Licensed to the Apache Software Foundation (ASF) under one or more

+ * contributor license agreements.  See the NOTICE file distributed with

+ * this work for additional information regarding copyright ownership.

+ * The ASF licenses this file to You under the Apache License, Version 2.0

+ * (the "License"); you may not use this file except in compliance with

+ * the License. You may obtain a copy of the License at

+ *

+ *     http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+

+package opennlp.tools.parse_thicket.matching;

+

+import java.util.ArrayList;

+import java.util.Collections;

+import java.util.List;

+

+import opennlp.tools.textsimilarity.POSManager;

+

+public class ParseTreePathMatcher {

+

+  private static final int NUMBER_OF_ITERATIONS = 2;

+

+  private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();

+  private POSManager posManager = new POSManager();

+  private LemmaFormManager lemmaFormManager = new LemmaFormManager();

+

+  public ParseTreePathMatcher() {

+

+  }

+

+  public ParseTreePath generalizeTwoGroupedPhrasesOLD(ParseTreePath chunk1,

+      ParseTreePath chunk2) {

+    List<String> pos1 = chunk1.getPOSs();

+    List<String> pos2 = chunk1.getPOSs();

+

+    List<String> commonPOS = new ArrayList<String>(), commonLemmas = new ArrayList<String>();

+    int k1 = 0, k2 = 0;

+    Boolean incrFirst = true;

+    while (k1 < pos1.size() && k2 < pos2.size()) {

+      // first check if the same POS

+      String sim = posManager.similarPOS(pos1.get(k1), pos2.get(k2));

+      if (sim != null) {

+        commonPOS.add(pos1.get(k1));

+        if (chunk1.getLemmas().size() > k1 && chunk2.getLemmas().size() > k2

+            && chunk1.getLemmas().get(k1).equals(chunk2.getLemmas().get(k2))) {

+          commonLemmas.add(chunk1.getLemmas().get(k1));

+        } else {

+          commonLemmas.add("*");

+        }

+        k1++;

+        k2++;

+      } else if (incrFirst) {

+        k1++;

+      } else {

+        k2++;

+      }

+      incrFirst = !incrFirst;

+    }

+

+    ParseTreePath res = new ParseTreePath(commonLemmas, commonPOS, 0, 0);

+    // if (parseTreeChunkListScorer.getScore(res)> 0.6)

+    // System.out.println(chunk1 + "  + \n"+ chunk2 + " = \n" + res);

+    return res;

+  }

+

+  // A for B => B have A

+  // transforms expr { A B C prep X Y }

+  // into {A B {X Y} C}

+  // should only be applied to a noun phrase

+  public ParseTreePath prepositionalNNSTransform(ParseTreePath ch) {

+    List<String> transfPOS = new ArrayList<String>(), transfLemmas = new ArrayList<String>();

+    if (!ch.getPOSs().contains("IN"))

+      return ch;

+    int indexIN = ch.getPOSs().lastIndexOf("IN");

+

+    if (indexIN < 2)// preposition is a first word - should not be in a noun

+                    // phrase

+      return ch;

+    String Word_IN = ch.getLemmas().get(indexIN);

+    if (!(Word_IN.equals("to") || Word_IN.equals("on") || Word_IN.equals("in")

+        || Word_IN.equals("of") || Word_IN.equals("with")

+        || Word_IN.equals("by") || Word_IN.equals("from")))

+      return ch;

+

+    List<String> toShiftAfterPartPOS = ch.getPOSs().subList(indexIN + 1,

+        ch.getPOSs().size());

+    List<String> toShiftAfterPartLemmas = ch.getLemmas().subList(indexIN + 1,

+        ch.getLemmas().size());

+

+    if (indexIN - 1 > 0)

+      transfPOS.addAll(ch.getPOSs().subList(0, indexIN - 1));

+    transfPOS.addAll(toShiftAfterPartPOS);

+    transfPOS.add(ch.getPOSs().get(indexIN - 1));

+

+    if (indexIN - 1 > 0)

+      transfLemmas.addAll(ch.getLemmas().subList(0, indexIN - 1));

+    transfLemmas.addAll(toShiftAfterPartLemmas);

+    transfLemmas.add(ch.getLemmas().get(indexIN - 1));

+

+    return new ParseTreePath(transfLemmas, transfPOS, 0, 0);

+  }

+

+  public ParseTreePath generalizeTwoGroupedPhrasesRandomSelectHighestScoreWithTransforms(

+      ParseTreePath chunk1, ParseTreePath chunk2) {

+    ParseTreePath chRes1 = generalizeTwoGroupedPhrasesRandomSelectHighestScore(

+        chunk1, chunk2);

+    ParseTreePath chRes2 = generalizeTwoGroupedPhrasesRandomSelectHighestScore(

+        prepositionalNNSTransform(chunk1), chunk2);

+    ParseTreePath chRes3 = generalizeTwoGroupedPhrasesRandomSelectHighestScore(

+        prepositionalNNSTransform(chunk2), chunk1);

+

+    ParseTreePath chRes = null;

+    if (parseTreeChunkListScorer.getScore(chRes1) > parseTreeChunkListScorer

+        .getScore(chRes2))

+      if (parseTreeChunkListScorer.getScore(chRes1) > parseTreeChunkListScorer

+          .getScore(chRes3))

+        chRes = chRes1;

+      else

+        chRes = chRes3;

+    else if (parseTreeChunkListScorer.getScore(chRes2) > parseTreeChunkListScorer

+        .getScore(chRes3))

+      chRes = chRes2;

+    else

+      chRes = chRes3;

+

+    return chRes;

+  }

+

+  public ParseTreePath generalizeTwoGroupedPhrasesRandomSelectHighestScore(

+      ParseTreePath chunk1, ParseTreePath chunk2) {

+    List<String> pos1 = chunk1.getPOSs();

+    List<String> pos2 = chunk2.getPOSs();

+    // Map <ParseTreeChunk, Double> scoredResults = new HashMap <ParseTreeChunk,

+    // Double> ();

+    int timesRepetitiveRun = NUMBER_OF_ITERATIONS;

+

+    Double globalScore = -1.0;

+    ParseTreePath result = null;

+

+    for (int timesRun = 0; timesRun < timesRepetitiveRun; timesRun++) {

+      List<String> commonPOS = new ArrayList<String>(), commonLemmas = new ArrayList<String>();

+      int k1 = 0, k2 = 0;

+      Double score = 0.0;

+      while (k1 < pos1.size() && k2 < pos2.size()) {

+        // first check if the same POS

+        String sim = posManager.similarPOS(pos1.get(k1), pos2.get(k2));

+        String lemmaMatch = lemmaFormManager.matchLemmas(null, chunk1

+            .getLemmas().get(k1), chunk2.getLemmas().get(k2), sim);

+        // if (LemmaFormManager.acceptableLemmaAndPOS(sim, lemmaMatch)){

+        if ((sim != null)

+            && (lemmaMatch == null || (lemmaMatch != null && !lemmaMatch

+                .equals("fail")))) {

+          // if (sim!=null){ // && (lemmaMatch!=null &&

+          // !lemmaMatch.equals("fail"))){

+          commonPOS.add(pos1.get(k1));

+          if (chunk1.getLemmas().size() > k1 && chunk2.getLemmas().size() > k2

+              && lemmaMatch != null) {

+            commonLemmas.add(lemmaMatch);

+

+          } else {

+            commonLemmas.add("*");

+

+          }

+          k1++;

+          k2++;

+        } else if (Math.random() > 0.5) {

+          k1++;

+        } else {

+          k2++;

+        }

+

+      }

+      ParseTreePath currResult = new ParseTreePath(commonLemmas, commonPOS,

+          0, 0);

+      score = parseTreeChunkListScorer.getScore(currResult);

+      if (score > globalScore) {

+        // System.out.println(chunk1 + "  + \n"+ chunk2 + " = \n" +

+        // result+" score = "+ score +"\n\n");

+        result = currResult;

+        globalScore = score;

+      }

+    }

+

+    for (int timesRun = 0; timesRun < timesRepetitiveRun; timesRun++) {

+      List<String> commonPOS = new ArrayList<String>(), commonLemmas = new ArrayList<String>();

+      int k1 = pos1.size() - 1, k2 = pos2.size() - 1;

+      Double score = 0.0;

+      while (k1 >= 0 && k2 >= 0) {

+        // first check if the same POS

+        String sim = posManager.similarPOS(pos1.get(k1), pos2.get(k2));

+        String lemmaMatch = lemmaFormManager.matchLemmas(null, chunk1

+            .getLemmas().get(k1), chunk2.getLemmas().get(k2), sim);

+        // if (acceptableLemmaAndPOS(sim, lemmaMatch)){

+        if ((sim != null)

+            && (lemmaMatch == null || (lemmaMatch != null && !lemmaMatch

+                .equals("fail")))) {

+          commonPOS.add(pos1.get(k1));

+          if (chunk1.getLemmas().size() > k1 && chunk2.getLemmas().size() > k2

+              && lemmaMatch != null) {

+            commonLemmas.add(lemmaMatch);

+          } else {

+            commonLemmas.add("*");

+

+          }

+          k1--;

+          k2--;

+        } else if (Math.random() > 0.5) {

+          k1--;

+        } else {

+          k2--;

+        }

+

+      }

+      Collections.reverse(commonLemmas);

+      Collections.reverse(commonPOS);

+

+      ParseTreePath currResult = new ParseTreePath(commonLemmas, commonPOS,

+          0, 0);

+      score = parseTreeChunkListScorer.getScore(currResult);

+      if (score > globalScore) {

+        // System.out.println(chunk1 + "  + \n"+ chunk2 + " = \n" +

+        // currResult+" score = "+ score +"\n\n");

+        result = currResult;

+        globalScore = score;

+      }

+    }

+

+    // // System.out.println(chunk1 + "  + \n"+ chunk2 + " = \n" + result

+    // +" score = " +

+    // // parseTreeChunkListScorer.getScore(result)+"\n\n");

+    return result;

+  }

+

+  public Boolean acceptableLemmaAndPOS(String sim, String lemmaMatch) {

+    if (sim == null) {

+      return false;

+    }

+

+    if (lemmaMatch != null && !lemmaMatch.equals("fail")) {

+      return false;

+    }

+    // even if lemmaMatch==null

+    return true;

+    // if (sim!=null && (lemmaMatch!=null && !lemmaMatch.equals("fail"))){

+

+  }

+}

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreePathMatcherDeterministic.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreePathMatcherDeterministic.java
new file mode 100644
index 0000000..fc32380
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreePathMatcherDeterministic.java
@@ -0,0 +1,280 @@
+/*

+ * Licensed to the Apache Software Foundation (ASF) under one or more

+ * contributor license agreements.  See the NOTICE file distributed with

+ * this work for additional information regarding copyright ownership.

+ * The ASF licenses this file to You under the Apache License, Version 2.0

+ * (the "License"); you may not use this file except in compliance with

+ * the License. You may obtain a copy of the License at

+ *

+ *     http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+

+package opennlp.tools.parse_thicket.matching;

+

+import java.util.ArrayList;

+import java.util.List;

+

+import opennlp.tools.stemmer.PorterStemmer;

+import opennlp.tools.textsimilarity.POSManager;

+

+

+public class ParseTreePathMatcherDeterministic {

+

+  private GeneralizationListReducer generalizationListReducer = new GeneralizationListReducer();

+

+  private LemmaFormManager lemmaFormManager = new LemmaFormManager();

+

+  private POSManager posManager = new POSManager();

+

+  /**

+   * key matching function which takes two phrases, aligns them and finds a set

+   * of maximum common sub-phrase

+   * 

+   * @param chunk1

+   * @param chunk2

+   * @return

+   */

+

+  public List<ParseTreePath> generalizeTwoGroupedPhrasesDeterministic(

+      ParseTreePath chunk1, ParseTreePath chunk2) {

+    List<String> pos1 = chunk1.getPOSs();

+    List<String> pos2 = chunk2.getPOSs();

+    List<String> lem1 = chunk1.getLemmas();

+    List<String> lem2 = chunk2.getLemmas();

+

+    List<String> lem1stem = new ArrayList<String>();

+    List<String> lem2stem = new ArrayList<String>();

+

+    PorterStemmer ps = new PorterStemmer();

+    for (String word : lem1) {

+      try {

+        lem1stem.add(ps.stem(word.toLowerCase()).toString());

+      } catch (Exception e) {

+        // e.printStackTrace();

+

+        if (word.length() > 2)

+          System.err.println("Unable to stem: " + word);

+      }

+    }

+    try {

+      for (String word : lem2) {

+        lem2stem.add(ps.stem(word.toLowerCase()).toString());

+      }

+    } catch (Exception e) {

+      System.err.println("problem processing word " + lem2.toString());

+    }

+

+    List<String> overlap = new ArrayList(lem1stem);

+    overlap.retainAll(lem2stem);

+

+    if (overlap == null || overlap.size() < 1)

+      return null;

+

+    List<Integer> occur1 = new ArrayList<Integer>(), occur2 = new ArrayList<Integer>();

+    for (String word : overlap) {

+      Integer i1 = lem1stem.indexOf(word);

+      Integer i2 = lem2stem.indexOf(word);

+      occur1.add(i1);

+      occur2.add(i2);

+    }

+

+    // now we search for plausible sublists of overlaps

+    // if at some position correspondence is inverse (one of two position

+    // decreases instead of increases)

+    // then we terminate current alignment accum and start a new one

+    List<List<int[]>> overlapsPlaus = new ArrayList<List<int[]>>();

+    // starts from 1, not 0

+    List<int[]> accum = new ArrayList<int[]>();

+    accum.add(new int[] { occur1.get(0), occur2.get(0) });

+    for (int i = 1; i < occur1.size(); i++) {

+

+      if (occur1.get(i) > occur1.get(i - 1)

+          && occur2.get(i) > occur2.get(i - 1))

+        accum.add(new int[] { occur1.get(i), occur2.get(i) });

+      else {

+        overlapsPlaus.add(accum);

+        accum = new ArrayList<int[]>();

+        accum.add(new int[] { occur1.get(i), occur2.get(i) });

+      }

+    }

+    if (accum.size() > 0) {

+      overlapsPlaus.add(accum);

+    }

+

+    List<ParseTreePath> results = new ArrayList<ParseTreePath>();

+    for (List<int[]> occur : overlapsPlaus) {

+      List<Integer> occr1 = new ArrayList<Integer>(), occr2 = new ArrayList<Integer>();

+      for (int[] column : occur) {

+        occr1.add(column[0]);

+        occr2.add(column[1]);

+      }

+

+      int ov1 = 0, ov2 = 0; // iterators over common words;

+      List<String> commonPOS = new ArrayList<String>(), commonLemmas = new ArrayList<String>();

+      // we start two words before first word

+      int k1 = occr1.get(ov1) - 2, k2 = occr2.get(ov2) - 2;

+      // if (k1<0) k1=0; if (k2<0) k2=0;

+      Boolean bReachedCommonWord = false;

+      while (k1 < 0 || k2 < 0) {

+        k1++;

+        k2++;

+      }

+      int k1max = pos1.size() - 1, k2max = pos2.size() - 1;

+      while (k1 <= k1max && k2 <= k2max) {

+        // first check if the same POS

+        String sim = posManager.similarPOS(pos1.get(k1), pos2.get(k2));

+        String lemmaMatch = lemmaFormManager.matchLemmas(ps, lem1.get(k1),

+            lem2.get(k2), sim);

+        if ((sim != null)

+            && (lemmaMatch == null || (lemmaMatch != null && !lemmaMatch

+                .equals("fail")))) {

+          commonPOS.add(pos1.get(k1));

+          if (lemmaMatch != null) {

+            commonLemmas.add(lemmaMatch);

+            // System.out.println("Added "+lemmaMatch);

+            if (k1 == occr1.get(ov1) && k2 == occr2.get(ov2))

+              bReachedCommonWord = true; // now we can have different increment

+                                         // opera

+            else {

+              if (occr1.size() > ov1 + 1 && occr2.size() > ov2 + 1

+                  && k1 == occr1.get(ov1 + 1) && k2 == occr2.get(ov2 + 1)) {

+                ov1++;

+                ov2++;

+                bReachedCommonWord = true;

+              }

+              // else

+              // System.err.println("Next match reached '"+lemmaMatch+

+              // "' | k1 - k2: "+k1 + " "+k2 +

+              // "| occur index ov1-ov2 "+

+              // ov1+" "+ov2+

+              // "| identified positions of match: occr1.get(ov1) - occr2.get(ov1) "

+              // +

+              // occr1.get(ov1) + " "+ occr2.get(ov1));

+            }

+          } else {

+            commonLemmas.add("*");

+          } // the same parts of speech, proceed to the next word in both

+            // expressions

+          k1++;

+          k2++;

+

+        } else if (!bReachedCommonWord) {

+          k1++;

+          k2++;

+        } // still searching

+        else {

+          // different parts of speech, jump to the next identified common word

+          ov1++;

+          ov2++;

+          if (ov1 > occr1.size() - 1 || ov2 > occr2.size() - 1)

+            break;

+          // now trying to find

+          int kk1 = occr1.get(ov1) - 2, // new positions of iterators

+          kk2 = occr2.get(ov2) - 2;

+          int countMove = 0;

+          while ((kk1 < k1 + 1 || kk2 < k2 + 1) && countMove < 2) { // if it is

+                                                                    // behind

+                                                                    // current

+                                                                    // position,

+                                                                    // synchroneously

+                                                                    // move

+                                                                    // towards

+                                                                    // right

+            kk1++;

+            kk2++;

+            countMove++;

+          }

+          k1 = kk1;

+          k2 = kk2;

+

+          if (k1 > k1max)

+            k1 = k1max;

+          if (k2 > k2max)

+            k2 = k2max;

+          bReachedCommonWord = false;

+        }

+      }

+      ParseTreePath currResult = new ParseTreePath(commonLemmas, commonPOS,

+          0, 0);

+      results.add(currResult);

+    }

+

+    return results;

+  }

+

+  /**

+   * main function to generalize two expressions grouped by phrase types returns

+   * a list of generalizations for each phrase type with filtered

+   * sub-expressions

+   * 

+   * @param sent1

+   * @param sent2

+   * @return List<List<ParseTreeChunk>> list of list of POS-words pairs for each

+   *         resultant matched / overlapped phrase

+   */

+  public List<List<ParseTreePath>> matchTwoSentencesGroupedChunksDeterministic(

+      List<List<ParseTreePath>> sent1, List<List<ParseTreePath>> sent2) {

+    List<List<ParseTreePath>> results = new ArrayList<List<ParseTreePath>>();

+    // first iterate through component

+    for (int comp = 0; comp < 2 && // just np & vp

+        comp < sent1.size() && comp < sent2.size(); comp++) {

+      List<ParseTreePath> resultComps = new ArrayList<ParseTreePath>();

+      // then iterate through each phrase in each component

+      for (ParseTreePath ch1 : sent1.get(comp)) {

+        for (ParseTreePath ch2 : sent2.get(comp)) { // simpler version

+          List<ParseTreePath> chunkToAdd = generalizeTwoGroupedPhrasesDeterministic(

+              ch1, ch2);

+

+          if (chunkToAdd == null)

+            chunkToAdd = new ArrayList<ParseTreePath>();

+          // System.out.println("ch1 = "+

+          // ch1.toString()+" | ch2="+ch2.toString()

+          // +"\n result = "+chunkToAdd.toString() + "\n");

+          /*

+           * List<ParseTreeChunk> chunkToAdd1 =

+           * ParseTreeMatcherDeterministic.generalizeTwoGroupedPhrasesDeterministic

+           * ( ParseTreeMatcher.prepositionalNNSTransform(ch1), ch2); if

+           * (chunkToAdd1!=null) chunkToAdd.addAll(chunkToAdd1);

+           * List<ParseTreeChunk> chunkToAdd2 =

+           * ParseTreeMatcherDeterministic.generalizeTwoGroupedPhrasesDeterministic

+           * ( ParseTreeMatcher.prepositionalNNSTransform(ch2), ch1); if

+           * (chunkToAdd2!=null) chunkToAdd.addAll(chunkToAdd2);

+           */

+

+          // For generalized match not with orig sentences but with templates

+          // if (!LemmaFormManager.mustOccurVerifier(ch1, ch2, chunkToAdd))

+          // continue; // if the words which have to stay do not stay, proceed

+          // to other elements

+          Boolean alreadyThere = false;

+          for (ParseTreePath chunk : resultComps) {

+            if (chunkToAdd.contains(chunk)) {

+              alreadyThere = true;

+              break;

+            }

+

+            // }

+          }

+

+          if (!alreadyThere && chunkToAdd != null && chunkToAdd.size() > 0) {

+            resultComps.addAll(chunkToAdd);

+          }

+

+        }

+      }

+      List<ParseTreePath> resultCompsRed = generalizationListReducer

+          .applyFilteringBySubsumption(resultComps);

+

+      resultComps = resultCompsRed;

+      results.add(resultComps);

+    }

+

+    return results;

+  }

+

+}

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/EdgeProductBuilder.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/EdgeProductBuilder.java
new file mode 100644
index 0000000..fb97716
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/EdgeProductBuilder.java
@@ -0,0 +1,121 @@
+package opennlp.tools.parse_thicket.parse_thicket2graph;

+

+import java.util.ArrayList;

+import java.util.Arrays;

+import java.util.Collection;

+import java.util.List;

+import java.util.Set;

+

+import opennlp.tools.parse_thicket.ParseCorefsBuilder;

+import opennlp.tools.parse_thicket.ParseThicket;

+import opennlp.tools.parse_thicket.ParseTreeNode;

+import opennlp.tools.parse_thicket.matching.Matcher;

+import opennlp.tools.textsimilarity.ParseTreeChunk;

+

+import org.jgrapht.Graph;

+import org.jgrapht.alg.BronKerboschCliqueFinder;

+import org.jgrapht.graph.DefaultEdge;

+import org.jgrapht.graph.SimpleGraph;

+

+

+public class EdgeProductBuilder {

+	private Matcher matcher = new Matcher();

+	private ParseCorefsBuilder ptBuilder = ParseCorefsBuilder.getInstance();

+	private GraphFromPTreeBuilder graphBuilder = new GraphFromPTreeBuilder();

+	

+	

+	public Graph<ParseGraphNode[], DefaultEdge>  

+		buildEdgeProduct(Graph<ParseGraphNode, DefaultEdge> g1, Graph<ParseGraphNode, DefaultEdge> g2 ){

+			Graph<ParseGraphNode[], DefaultEdge> gp = 

+				new SimpleGraph<ParseGraphNode[], DefaultEdge>(DefaultEdge.class);

+		

+		Set<DefaultEdge> edges1 = g1.edgeSet();

+		Set<DefaultEdge> edges2 = g2.edgeSet();

+		// build nodes of product graph

+		for(DefaultEdge e1:edges1){

+			for(DefaultEdge e2:edges2){

+				ParseGraphNode sourceE1s = g1.getEdgeSource(e1), sourceE1t = g1.getEdgeTarget(e1);

+				ParseGraphNode sourceE2s = g2.getEdgeSource(e2), sourceE2t = g2.getEdgeTarget(e2);

+				

+				if (isNotEmpty(matcher.generalize(sourceE1s.getPtNodes(), sourceE2s.getPtNodes())) && 

+						isNotEmpty(matcher.generalize(sourceE1t.getPtNodes(), sourceE2t.getPtNodes()))

+					)

+					gp.addVertex(new ParseGraphNode[] {sourceE1s, sourceE1t, sourceE2s, sourceE2t } );

+			}

+		}

+		

+		Set<ParseGraphNode[]> productVerticesSet = gp.vertexSet();

+		List<ParseGraphNode[]> productVerticesList = new ArrayList<ParseGraphNode[]>(productVerticesSet);

+		for(int i=0; i<productVerticesList.size(); i++){

+			for(int j=i+1; j<productVerticesList.size(); j++){

+				ParseGraphNode[] prodVertexI = productVerticesList.get(i);

+				ParseGraphNode[] prodVertexJ = productVerticesList.get(j);

+				if (bothAjacentOrNeitherAdjacent(prodVertexI, prodVertexJ)){

+					gp.addEdge(prodVertexI, prodVertexJ);

+				}

+			}

+		}

+		

+		

+		return gp;

+		

+	}

+	/*

+	 * Finding the maximal clique is the slowest part

+	 */

+	

+	public Collection<Set<ParseGraphNode[]>> getMaximalCommonSubgraphs(Graph<ParseGraphNode[], DefaultEdge>  g){

+		BronKerboschCliqueFinder<ParseGraphNode[], DefaultEdge> finder =

+	            new BronKerboschCliqueFinder<ParseGraphNode[], DefaultEdge>(g);

+

+	        Collection<Set<ParseGraphNode[]>> cliques = finder.getBiggestMaximalCliques();

+	        return cliques;

+	}

+

+

+	private boolean bothAjacentOrNeitherAdjacent(ParseGraphNode[] prodVertexI,

+			ParseGraphNode[] prodVertexJ) {

+		List<ParseGraphNode> prodVertexIlist = 

+				new ArrayList<ParseGraphNode>(Arrays.asList(prodVertexI));

+		List<ParseGraphNode> prodVertexJlist = 

+				new ArrayList<ParseGraphNode>(Arrays.asList(prodVertexJ));

+		prodVertexIlist.retainAll(prodVertexJlist);

+		return (prodVertexIlist.size()==2 || prodVertexIlist.size()==4);

+	}

+

+

+	private boolean isNotEmpty(List<List<ParseTreeChunk>> generalize) {

+		if (generalize!=null && generalize.get(0)!=null && generalize.get(0).size()>0)

+			return true;

+		else

+			return false;

+	}

+	

+	public Collection<Set<ParseGraphNode[]>>  assessRelevanceViaMaximalCommonSubgraphs(String para1, String para2) {

+		// first build PTs for each text

+		ParseThicket pt1 = ptBuilder.buildParseThicket(para1);

+		ParseThicket pt2 = ptBuilder.buildParseThicket(para2);

+		// then build phrases and rst arcs

+		Graph<ParseGraphNode, DefaultEdge> g1 = graphBuilder.buildGraphFromPT(pt1);

+		Graph<ParseGraphNode, DefaultEdge> g2 = graphBuilder.buildGraphFromPT(pt2);

+		

+		Graph<ParseGraphNode[], DefaultEdge> gp =  buildEdgeProduct(g1, g2);

+		Collection<Set<ParseGraphNode[]>> col = getMaximalCommonSubgraphs(gp);

+		return col;

+		}

+	

+	public static void main(String[] args){

+		 EdgeProductBuilder b = new  EdgeProductBuilder();

+		 Collection<Set<ParseGraphNode[]>> col = b.assessRelevanceViaMaximalCommonSubgraphs("Iran refuses to accept the UN proposal to end its dispute over its work on nuclear weapons."+

+				"UN nuclear watchdog passes a resolution condemning Iran for developing its second uranium enrichment site in secret. " +

+				"A recent IAEA report presented diagrams that suggested Iran was secretly working on nuclear weapons. " +

+				"Iran envoy says its nuclear development is for peaceful purpose, and the material evidence against it has been fabricated by the US. "

+

+				, "Iran refuses the UN offer to end a conflict over its nuclear weapons."+

+						"UN passes a resolution prohibiting Iran from developing its uranium enrichment site. " +

+						"A recent UN report presented charts saying Iran was working on nuclear weapons. " +

+				"Iran envoy to UN states its nuclear development is for peaceful purpose, and the evidence against its claim is fabricated by the US. ");

+		System.out.print(col);

+	}

+}

+				

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/GraphFromPTreeBuilder.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/GraphFromPTreeBuilder.java
new file mode 100644
index 0000000..bad6403
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/GraphFromPTreeBuilder.java
@@ -0,0 +1,131 @@
+package opennlp.tools.parse_thicket.parse_thicket2graph;

+

+import java.io.PrintWriter;

+import java.util.List;

+

+import opennlp.tools.parse_thicket.PTTree;

+import opennlp.tools.parse_thicket.ParseThicket;

+import opennlp.tools.parse_thicket.ParseTreeNode;

+import org.jgrapht.Graph;

+import org.jgrapht.graph.DefaultDirectedWeightedGraph;

+import org.jgrapht.graph.DefaultEdge;

+import org.jgrapht.graph.SimpleGraph;

+

+

+import edu.stanford.nlp.trees.LabeledScoredTreeNode;

+import edu.stanford.nlp.trees.Tree;

+

/**
 * Converts a parse thicket's constituency tree into a JGraphT graph of
 * ParseGraphNode vertices, one vertex per non-leaf tree node.
 */
public class GraphFromPTreeBuilder {


	/**
	 * Builds a graph from the given parse thicket.
	 *
	 * NOTE(review): only the FIRST sentence (ts.get(0)) is printed and
	 * converted; remaining sentences are ignored — confirm this is intended.
	 *
	 * @param pt parse thicket whose sentences are Stanford Tree objects
	 * @return graph built from the first sentence's tree
	 */
	public Graph<ParseGraphNode, DefaultEdge> buildGraphFromPT(ParseThicket pt){
		PrintWriter out = new PrintWriter(System.out);


		List<Tree> ts = pt.getSentences();
		// debug dump of the tree in Penn Treebank format
		ts.get(0).pennPrint(out);
		Graph<ParseGraphNode, DefaultEdge> gfragment = buildGGraphFromTree(ts.get(0));

		//ParseTreeVisualizer applet = new ParseTreeVisualizer();
		//applet.showGraph(gfragment);

		return gfragment;

	}


	// Creates the graph shell with a synthetic "S 0" root and recursively
	// populates it from the tree.
	private Graph<ParseGraphNode, DefaultEdge> buildGGraphFromTree(Tree tree) {
		Graph<ParseGraphNode, DefaultEdge> g =
				new SimpleGraph<ParseGraphNode, DefaultEdge>(DefaultEdge.class);
		ParseGraphNode root = new ParseGraphNode(tree,"S 0");
		g.addVertex(root);
		navigate(tree, g, 0, root);

		return g;
	}




	/**
	 * Recursive descent: adds a vertex for each non-leaf child (labelled with
	 * either the subtree text or the phrase label, plus " #depth") and an edge
	 * from currParent.
	 *
	 * NOTE(review): when the node has exactly ONE child, that child is visited
	 * by the recursive call below AND again by the for-loop that follows, so
	 * it is processed twice — looks unintentional; confirm before changing.
	 *
	 * NOTE(review): if vertex/edge creation throws, currChildNode stays null
	 * and is passed as the parent of the next recursion, where a later
	 * g.addEdge(null, ...) would throw — confirm whether this path can occur.
	 */
	private void navigate(Tree tree, Graph<ParseGraphNode, DefaultEdge> g, int l, ParseGraphNode currParent) {
		//String currParent = tree.label().value()+" $"+Integer.toString(l);
		//g.addVertex(currParent);
		if (tree.getChildrenAsList().size()==1)
			navigate(tree.getChildrenAsList().get(0), g, l+1, currParent);
		else
			if (tree.getChildrenAsList().size()==0)
				return;

		for(Tree child: tree.getChildrenAsList()){
			String currChild = null;
			ParseGraphNode currChildNode = null;
			try {
				if (child.isLeaf()) 
					continue;
				// sentence-level nodes ("S...") are flattened: recurse into
				// their first child while keeping the same parent
				if (child.label().value().startsWith("S"))
					navigate(child.getChildrenAsList().get(0), g, l+1, currParent);

				// pre-terminals keep their full "(POS word)" text; phrasal
				// nodes use just the category label
				if (!child.isPhrasal() || child.isPreTerminal())
					currChild = child.toString()+" #"+Integer.toString(l);
				else 
					currChild = child.label().value()+" #"+Integer.toString(l);
				currChildNode = new ParseGraphNode(child, currChild);
				g.addVertex(currChildNode);
				g.addEdge(currParent, currChildNode);
			} catch (Exception e) {
				// best-effort: log and keep walking the remaining children
				e.printStackTrace();
			}
			navigate(child, g, l+1, currChildNode);
		}
	}


	// (Removed a large block of commented-out code that duplicated Stanford
	// Tree's internal pennPrint/navigation logic; it was never compiled.)

}

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/ParseGraphNode.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/ParseGraphNode.java
new file mode 100644
index 0000000..9620499
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/ParseGraphNode.java
@@ -0,0 +1,51 @@
+package opennlp.tools.parse_thicket.parse_thicket2graph;

+

+import java.util.List;

+

+import opennlp.tools.parse_thicket.ParseTreeNode;

+import opennlp.tools.parse_thicket.matching.PT2ThicketPhraseBuilder;

+

+

+import edu.stanford.nlp.trees.Tree;

+

+public class ParseGraphNode {

+	 PT2ThicketPhraseBuilder phraseBuilder = new PT2ThicketPhraseBuilder();

+	 

+	private Tree tree;

+	private String label;

+	private List<List<ParseTreeNode>> ptNodes;

+	

+	

+	

+	public List<List<ParseTreeNode>> getPtNodes() {

+		return ptNodes;

+	}

+

+	public ParseGraphNode(Tree tree, String label) {

+		super();

+		this.tree = tree;

+		this.label = label;

+		ptNodes =  phraseBuilder.buildPT2ptPhrasesForASentence(tree, null);

+	}

+

+	public Tree getTree() {

+		return tree;

+	}

+

+	public void setTree(Tree tree) {

+		this.tree = tree;

+	}

+

+	public String getLabel() {

+		return label;

+	}

+

+	public void setLabel(String label) {

+		this.label = label;

+	}

+

+	public String toString(){

+		return label;

+	}

+}

+	

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/ParseTreeVisualizer.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/ParseTreeVisualizer.java
new file mode 100644
index 0000000..d34d974
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/parse_thicket2graph/ParseTreeVisualizer.java
@@ -0,0 +1,194 @@
+/* ==========================================

+ * JGraphT : a free Java graph-theory library

+ * ==========================================

+ *

+ * Project Info:  http://jgrapht.sourceforge.net/

+ * Project Creator:  Barak Naveh (http://sourceforge.net/users/barak_naveh)

+ *

+ * (C) Copyright 2003-2008, by Barak Naveh and Contributors.

+ *

+ * This library is free software; you can redistribute it and/or modify it

+ * under the terms of the GNU Lesser General Public License as published by

+ * the Free Software Foundation; either version 2.1 of the License, or

+ * (at your option) any later version.

+ *

+ * This library is distributed in the hope that it will be useful, but

+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY

+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public

+ * License for more details.

+ *

+ * You should have received a copy of the GNU Lesser General Public License

+ * along with this library; if not, write to the Free Software Foundation,

+ * Inc.,

+ * 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

+ */

+/* ----------------------

+ * JGraphAdapterDemo.java

+ * ----------------------

+ * (C) Copyright 2003-2008, by Barak Naveh and Contributors.

+ *

+ * Original Author:  Barak Naveh

+ * Contributor(s):   -

+ *

+ * $Id: JGraphAdapterDemo.java 725 2010-11-26 01:24:28Z perfecthash $

+ *

+ * Changes

+ * -------

+ * 03-Aug-2003 : Initial revision (BN);

+ * 07-Nov-2003 : Adaptation to JGraph 3.0 (BN);

+ *

+ */

+package opennlp.tools.parse_thicket.parse_thicket2graph;

+

+import java.awt.*;

+import java.awt.geom.*;

+import java.util.HashMap;

+import java.util.Map;

+import java.util.Set;

+

+import javax.swing.*;

+

+

+import org.jgraph.*;

+import org.jgraph.graph.*;

+

+import org.jgrapht.*;

+import org.jgrapht.ext.*;

+import org.jgrapht.graph.*;

+

+

+import org.jgrapht.graph.DefaultEdge;

+

/**
 * Swing applet that renders a JGraphT graph via the JGraph adapter; adapted
 * from the JGraphT "JGraphAdapterDemo" sample. Used to display parse
 * thickets for debugging.
 */
public class ParseTreeVisualizer
extends JApplet
{
	//~ Static fields/initializers ---------------------------------------------

	private static final long serialVersionUID = 3256346823498765434L;
	private static final Color DEFAULT_BG_COLOR = Color.decode("#FAFBFF");
	private static final Dimension DEFAULT_SIZE = new Dimension(1200, 800);

	//~ Instance fields --------------------------------------------------------

	// JGraphT -> JGraph bridge; NOTE(review): typed over String vertices while
	// the graphs built elsewhere in this package use ParseGraphNode vertices —
	// see the TODO below; confirm the intended vertex type.
	private JGraphModelAdapter<String, DefaultEdge> jgAdapter;

	/** Opens a new frame showing the given graph (blocking UI side effect). */
	public void  showGraph(Graph g){
		ParseTreeVisualizer applet = new ParseTreeVisualizer();
		applet.importGraph(g);

		JFrame frame = new JFrame();
		frame.getContentPane().add(applet);
		frame.setTitle("Showing parse thicket");
		frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
		frame.pack();
		frame.setVisible(true);
	}

	// TODO cast to ParseGraphNode
	// Imports the graph into the adapter and lays vertices out roughly by
	// their "#level" suffix (see GraphFromPTreeBuilder label format).
	private void importGraph(Graph g) {
		// create a visualization using JGraph, via an adapter
		jgAdapter = new JGraphModelAdapter<String, DefaultEdge>(g);

		JGraph jgraph = new JGraph(jgAdapter);

		adjustDisplaySettings(jgraph);
		getContentPane().add(jgraph);
		resize(DEFAULT_SIZE);

		Set<String> vertexSet = ( Set<String>)g.vertexSet();
		int count=0;
		// per-level count of vertices already placed, used as an x-offset
		Map<Integer, Integer> level_count = new HashMap<Integer, Integer> ();

		for(String vertexStr: vertexSet){
			Integer key = 0;
			try {
				// labels look like "NP #3": the number after '#' is the depth
				if (vertexStr.indexOf('#')>-1)
					key = Integer.parseInt(vertexStr.split("#")[1]);
			} catch (Exception e) {
				// malformed label: fall back to level 0
				e.printStackTrace();
			}
			Integer howManyAlready = 0;

			if (key>0){
				 howManyAlready = level_count.get(key);
				if (howManyAlready==null){
					howManyAlready=0;
					level_count.put(key, 1);
				} else {
					level_count.put(key, howManyAlready+1);
				}
			}
			// x spreads siblings at a level; y grows with insertion order
			positionVertexAt(vertexStr, count+howManyAlready*50, count);
			count+=20;
		}


	}

	/**
	 * An alternative starting point for this demo, to also allow running this
	 * applet as an application.
	 *
	 * @param args ignored.
	 */
	public static void main(String [] args)
	{
		ParseTreeVisualizer applet = new ParseTreeVisualizer();
		applet.init();

		JFrame frame = new JFrame();
		frame.getContentPane().add(applet);
		frame.setTitle("JGraphT Adapter to JGraph Demo");
		frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
		frame.pack();
		frame.setVisible(true);
	}



	// Applies preferred size and background color (optionally taken from the
	// applet's "bgcolor" parameter) to the JGraph component.
	private void adjustDisplaySettings(JGraph jg)
	{
		jg.setPreferredSize(DEFAULT_SIZE);

		Color c = DEFAULT_BG_COLOR;
		String colorStr = null;

		try {
			colorStr = getParameter("bgcolor");
		} catch (Exception e) {
			// not running as an applet: keep the default background
		}

		if (colorStr != null) {
			c = Color.decode(colorStr);
		}

		jg.setBackground(c);
	}

	// Moves the given vertex's cell to (x, y), keeping its current size.
	@SuppressWarnings("unchecked") // FIXME hb 28-nov-05: See FIXME below
	private void positionVertexAt(Object vertex, int x, int y)
	{
		DefaultGraphCell cell = jgAdapter.getVertexCell(vertex);
		AttributeMap attr = cell.getAttributes();
		Rectangle2D bounds = GraphConstants.getBounds(attr);

		Rectangle2D newBounds =
				new Rectangle2D.Double(
						x,
						y,
						bounds.getWidth(),
						bounds.getHeight());

		GraphConstants.setBounds(attr, newBounds);

		// TODO: Clean up generics once JGraph goes generic
		AttributeMap cellAttr = new AttributeMap();
		cellAttr.put(cell, attr);
		jgAdapter.edit(cellAttr, null, null, null);
	}

}

+

+// End JGraphAdapterDemo.java

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PhraseConcept.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PhraseConcept.java
new file mode 100644
index 0000000..ecba4b5
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PhraseConcept.java
@@ -0,0 +1,45 @@
+package opennlp.tools.parse_thicket.pattern_structure;
+
+import java.util.*;
+import java.io.*;
+
+import opennlp.tools.textsimilarity.ParseTreeChunk;
+
+public class PhraseConcept {
+	int position;
+	//Set<Integer> intent;
+	List<List<ParseTreeChunk>> intent;
+	Set<Integer> parents;
+	public PhraseConcept() {
+		position = -1;
+		intent = new ArrayList<List<ParseTreeChunk>>();
+		parents = new HashSet<Integer>();
+	}
+	public void setPosition( int newPosition ){
+	       position = newPosition;
+	}
+	public void setIntent( List<List<ParseTreeChunk>> newIntent ){
+	       intent.clear();
+	       intent.addAll(newIntent);
+	}
+	public void setParents( Set<Integer> newParents ){
+	       //parents = newParents;
+		parents.clear();
+		parents.addAll(newParents);
+	}
+	public void printConcept() {
+		System.out.println("Concept position:" + position);
+		System.out.println("Concept intent:" + intent);
+		System.out.println("Concept parents:" + parents);
+	}
+	 public static void main(String []args) {
+		 PhraseConcept c = new PhraseConcept();
+		 c.printConcept();
+		 c.setPosition(10);
+		 c.printConcept();
+		 //List<List<ParseTreeChunk>> test = new List<List<ParseTreeChunk>>();
+		 //c.setIntent(test);
+		 c.printConcept();
+
+	 }
+}
\ No newline at end of file
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PhrasePatternStructure.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PhrasePatternStructure.java
new file mode 100644
index 0000000..23fd5a3
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PhrasePatternStructure.java
@@ -0,0 +1,166 @@
+package opennlp.tools.parse_thicket.pattern_structure;
+
+import java.util.*;
+import java.io.*;
+
+import opennlp.tools.parse_thicket.ParseCorefsBuilder;
+import opennlp.tools.parse_thicket.ParseThicket;
+import opennlp.tools.parse_thicket.ParseTreeNode;
+import opennlp.tools.parse_thicket.matching.PT2ThicketPhraseBuilder;
+import opennlp.tools.textsimilarity.ParseTreeChunk;
+import opennlp.tools.textsimilarity.ParseTreeMatcherDeterministic;
+
+
/**
 * Incrementally builds a pattern-structure lattice whose concept intents are
 * lists of grouped parse-tree chunks (phrases) rather than attribute sets.
 * The intersection (meet) of two intents is computed by deterministic
 * syntactic generalization.
 * NOTE(review): the insertion logic appears to follow the AddIntent
 * incremental lattice-construction algorithm adapted to phrase intents —
 * confirm against the published algorithm before relying on lattice shape.
 */
public class PhrasePatternStructure {
	// maximal object index; only used for reporting in printLatticeStats()
	int objectCount;
	// maximal attribute index; only used for reporting in printLatticeStats()
	int attributeCount;
	// all concepts created so far; a concept's position equals its index here
	ArrayList<PhraseConcept> conceptList;
	// computes the generalization (intersection) of two phrase-group intents
	ParseTreeMatcherDeterministic md; 
	public PhrasePatternStructure(int objectCounts, int attributeCounts) {
		objectCount = objectCounts;
		attributeCount = attributeCounts;
		conceptList = new ArrayList<PhraseConcept>();
		PhraseConcept bottom = new PhraseConcept();
		md = new ParseTreeMatcherDeterministic();
		/*Set<Integer> b_intent = new HashSet<Integer>();
		for (int index = 0; index < attributeCount; ++index) {
			b_intent.add(index);
		}
		bottom.setIntent(b_intent);*/
		// seed the lattice with a bottom concept (empty intent) at position 0
		bottom.setPosition(0);
		conceptList.add(bottom);
	}
	/**
	 * Starting from Generator, repeatedly climbs to any parent whose intent
	 * still contains the whole of the given intent.
	 *
	 * @return the position of the maximal (most general) concept whose
	 *         intent covers the given intent
	 */
	public int GetMaximalConcept(List<List<ParseTreeChunk>> intent, int Generator) {
		boolean parentIsMaximal = true;
		while(parentIsMaximal) {
			parentIsMaximal = false;
			for (int parent : conceptList.get(Generator).parents) {
				if (conceptList.get(parent).intent.containsAll(intent)) {
					Generator = parent;
					parentIsMaximal = true;
					break;
				}
			}
		}
		return Generator;
	}
	/**
	 * Inserts the given intent into the lattice below the concept at
	 * position generator, recursively creating intermediate concepts for
	 * intent intersections and rewiring parent links.
	 *
	 * @return position of the (possibly pre-existing) concept whose intent
	 *         equals the given intent
	 */
	public int AddIntent(List<List<ParseTreeChunk>> intent, int generator) {
		System.out.println("debug");
		System.out.println("called for " + intent);
		//printLattice();
		// climb to the most general concept whose intent covers the new one
		int generator_tmp = GetMaximalConcept(intent, generator);
		generator = generator_tmp;
		if (conceptList.get(generator).intent.equals(intent)) {
			System.out.println("at generator:" + conceptList.get(generator).intent);
			System.out.println("to add:" + intent);

			System.out.println("already generated");
			return generator;
		}
		// NOTE(review): the recursive AddIntent call below can modify parent
		// sets while generatorParents is being iterated — possible
		// ConcurrentModificationException; verify with representative input.
		Set<Integer> generatorParents = conceptList.get(generator).parents;
		Set<Integer> newParents = new HashSet<Integer>();
		for (int candidate : generatorParents) {
			if (!intent.containsAll(conceptList.get(candidate).intent)) {
			//if (!conceptList.get(candidate).intent.containsAll(intent)) {
				//Set<Integer> intersection = new HashSet<Integer>(conceptList.get(candidate).intent);
				//List<List<ParseTreeChunk>> intersection = new ArrayList<List<ParseTreeChunk>>(conceptList.get(candidate).intent);
				//intersection.retainAll(intent);
				// intersection of intents = deterministic syntactic generalization
				List<List<ParseTreeChunk>> intersection = md
				.matchTwoSentencesGroupedChunksDeterministic(intent, conceptList.get(candidate).intent);
				System.out.println("recursive call (inclusion)");
				candidate = AddIntent(intersection, candidate);
			}
			// keep candidate as a parent only if no already-kept parent is more
			// specific; drop kept parents whose intents candidate subsumes
			boolean addParents = true;
			System.out.println("now iterating over parents");
			Iterator<Integer> iterator = newParents.iterator();
			while (iterator.hasNext()) {
				Integer parent = iterator.next();
				if (conceptList.get(parent).intent.containsAll(conceptList.get(candidate).intent)) {
					addParents = false;
					break;
				}
				else {
					if (conceptList.get(candidate).intent.containsAll(conceptList.get(parent).intent)) {
						iterator.remove();
					}
				}
			}
			/*for (int parent : newParents) {
				System.out.println("parent = " + parent);
				System.out.println("candidate intent:"+conceptList.get(candidate).intent);
				System.out.println("parent intent:"+conceptList.get(parent).intent);
				
				if (conceptList.get(parent).intent.containsAll(conceptList.get(candidate).intent)) {
					addParents = false;
					break;
				}
				else {
					if (conceptList.get(candidate).intent.containsAll(conceptList.get(parent).intent)) {
						newParents.remove(parent);
					}
				}
			}*/
			if (addParents) {
				newParents.add(candidate);
			}
		}
		System.out.println("size of lattice: " + conceptList.size());
		// create the new concept and splice it between generator and newParents
		PhraseConcept newConcept = new PhraseConcept();
		newConcept.setIntent(intent);
		newConcept.setPosition(conceptList.size());
		conceptList.add(newConcept);
		conceptList.get(generator).parents.add(newConcept.position);
		for (int newParent: newParents) {
			if (conceptList.get(generator).parents.contains(newParent)) {
				conceptList.get(generator).parents.remove(newParent);
			}
			conceptList.get(newConcept.position).parents.add(newParent);
		}
		return newConcept.position;
	}
	/** Prints object/attribute bounds and current concept count to stdout. */
	public void printLatticeStats() {
		System.out.println("Lattice stats");
		System.out.println("max_object_index = " + objectCount);
		System.out.println("max_attribute_index = " + attributeCount);
		System.out.println("Current concept count = " + conceptList.size());
	}
	/** Dumps every concept in the lattice to stdout. */
	public void printLattice() {
		for (int i = 0; i < conceptList.size(); ++i) {
			printConceptByPosition(i);
		}
	}
	/** Dumps the single concept at the given lattice position. */
	public void printConceptByPosition(int index) {
		System.out.println("Concept at position " + index);
		conceptList.get(index).printConcept();
	}
	/**
	 * Groups a paragraph's phrases into three chunk lists, returned in the
	 * fixed order [NPs, VPs, PPs]; phrases of other types are dropped.
	 */
	public List<List<ParseTreeChunk>> formGroupedPhrasesFromChunksForPara(
			List<List<ParseTreeNode>> phrs) {
		List<List<ParseTreeChunk>> results = new ArrayList<List<ParseTreeChunk>>();
		List<ParseTreeChunk> nps = new ArrayList<ParseTreeChunk>(), vps = new ArrayList<ParseTreeChunk>(), 
				pps = new ArrayList<ParseTreeChunk>();
		for(List<ParseTreeNode> ps:phrs){
			ParseTreeChunk ch = convertNodeListIntoChunk(ps);
			// phrase type is taken from the phrase's first node
			String ptype = ps.get(0).getPhraseType();
			if (ptype.equals("NP")){
				nps.add(ch);
			} else if (ptype.equals("VP")){
				vps.add(ch);
			} else if (ptype.equals("PP")){
				pps.add(ch);
			}
		}
		results.add(nps); results.add(vps); results.add(pps);
		return results;
	}
	/** Converts a phrase (list of nodes) into a chunk of its words and POS tags. */
	private ParseTreeChunk convertNodeListIntoChunk(List<ParseTreeNode> ps) {
		List<String> lemmas = new ArrayList<String>(),  poss = new ArrayList<String>();
		for(ParseTreeNode n: ps){
			lemmas.add(n.getWord());
			poss.add(n.getPos());
		}
		ParseTreeChunk ch = new ParseTreeChunk(lemmas, poss, 0, 0);
		ch.setMainPOS(ps.get(0).getPhraseType());
		return ch;
	}
	
}
\ No newline at end of file
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/rhetoric_structure/RhetoricStructureArcsBuilder.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/rhetoric_structure/RhetoricStructureArcsBuilder.java
new file mode 100644
index 0000000..3a36e80
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/rhetoric_structure/RhetoricStructureArcsBuilder.java
@@ -0,0 +1,162 @@
+package opennlp.tools.parse_thicket.rhetoric_structure;

+

+import java.util.ArrayList;

+import java.util.Arrays;

+import java.util.HashMap;

+import java.util.List;

+import java.util.Map;

+

+import opennlp.tools.parse_thicket.ArcType;

+import opennlp.tools.parse_thicket.Pair;

+import opennlp.tools.parse_thicket.ParseThicket;

+import opennlp.tools.parse_thicket.ParseTreeNode;

+import opennlp.tools.parse_thicket.WordWordInterSentenceRelationArc;

+

+import org.jgrapht.Graph;

+import org.jgrapht.graph.DefaultEdge;

+import org.jgrapht.graph.SimpleGraph;

+

+

+import edu.stanford.nlp.trees.Tree;

+

+public class RhetoricStructureArcsBuilder {

+	private RhetoricStructureMarker markerBuilderForSentence = new RhetoricStructureMarker();

+

+	private Map<Integer, List<Pair<String, Integer[]>>> buildMarkers(ParseThicket pt){

+

+		Map<Integer, List<Pair<String, Integer[]>>> sentNumMarkers = new 

+				HashMap<Integer, List<Pair<String, Integer[]>>>();

+		int count = 0;

+		for( List<ParseTreeNode> sent: pt.getNodesThicket()){

+			List<Pair<String, Integer[]>> markersForSentence = markerBuilderForSentence.

+					extractRSTrelationInSentenceGetBoundarySpan(sent);

+			sentNumMarkers.put(count,  markersForSentence);

+			count++;

+		}

+		return sentNumMarkers;

+	}

+

+

+	/*

+	 * Induced RST algorithm

+	 * 

+	 * Input: obtained RST markers (numbers of words which 

+	 * splits sentence in potential RST relation arguments) +

+	 * Current Parse Thicket with arcs for coreferences

+	 * 

+	 * We search for parts of sentences on the opposite side of RST markers

+	 * 

+	 * $sentPosFrom$  marker

+	 *  | == == == [ ] == == == |

+	 *     \				\

+	 *       \				  \

+	 *       coref          RST arc being formed

+	 *           \ 				\

+	 *             \			 \

+	 *     | == == == == == [  ] == == ==|      

+	 *     

+	 *       Mark yelled at his dog, but it disobeyed

+	 *        |							\

+	 *       coref                 RST arc for CONTRAST being formed

+	 *        | 							\

+	 *       He was upset, however he did not show it

+	 *       $sentPosTo$

+	 */

+	public List<WordWordInterSentenceRelationArc> buildRSTArcsFromMarkersAndCorefs(

+			List<WordWordInterSentenceRelationArc> arcs,

+			Map<Integer, List<List<ParseTreeNode>>> sentNumPhrasesMap, 

+			ParseThicket pt ) {

+		List<WordWordInterSentenceRelationArc> arcsRST = new ArrayList<WordWordInterSentenceRelationArc>();		

+

+		Map<Integer, List<Pair<String, Integer[]>>> rstMarkersMap = buildMarkers(pt);

+

+		for(int nSentFrom=0; nSentFrom<pt.getSentences().size(); nSentFrom++){

+			for(int nSentTo=nSentFrom+1; nSentTo<pt.getSentences().size(); nSentTo++){

+				// for given arc, find phrases connected by this arc and add to the list of phrases

+

+				List<List<ParseTreeNode>> phrasesFrom = sentNumPhrasesMap.get(nSentFrom);

+				List<List<ParseTreeNode>> phrasesTo = sentNumPhrasesMap.get(nSentTo);

+				List<Pair<String, Integer[]>> markersFrom = rstMarkersMap.get(nSentFrom);

+				List<Pair<String, Integer[]>> markersTo = rstMarkersMap.get(nSentTo);

+				for(WordWordInterSentenceRelationArc arc: arcs){

+					// arc should be coref and link these sentences

+					if (nSentFrom != arc.getCodeFrom().getFirst() ||

+							nSentTo != arc.getCodeTo().getFirst() ||

+							!arc.getArcType().getType().startsWith("coref")

+							)

+						continue;

+					int sentPosFrom = arc.getCodeFrom().getSecond();

+					int sentPosTo = arc.getCodeTo().getSecond();

+					// not more than a single RST link for a pair of sentences

+					boolean bFound = false;

+					for(List<ParseTreeNode> vpFrom: phrasesFrom){

+						if (bFound)

+							break;

+						for(List<ParseTreeNode> vpTo: phrasesTo){

+							for(Pair<String, Integer[]> mFrom: markersFrom){

+								for(Pair<String, Integer[]> mTo: markersTo) {

+									{

+										// the phrases should be on an opposite side of rst marker for a coref link

+										if (isSequence( new Integer[] { sentPosFrom,  vpFrom.get(0).getId(), mFrom.getSecond()[0]}) &&

+												isSequence( new Integer[] { sentPosTo,  vpTo.get(0).getId(), mTo.getSecond()[0]})	){

+											ArcType arcType = new ArcType("rst", mFrom.getFirst(), 0, 0);

+

+											WordWordInterSentenceRelationArc arcRST = 

+													new WordWordInterSentenceRelationArc(new Pair<Integer, Integer>(nSentFrom, mFrom.getSecond()[1]), 

+															new Pair<Integer, Integer>(nSentTo, mTo.getSecond()[1]), "", "", arcType);

+											arcsRST.add(arcRST);

+											bFound = true;

+											break;

+										}

+									}

+								}

+							}

+						}

+					}

+				}

+			}

+		}

+

+		return arcs;

+	}

+

+// check if the word positions occur in sentence in the order Integer[]

+// TODO make more sensitive algo	

+	private static boolean isSequence(Integer[] integers) {

+		//TODO better construction of array

+		if (integers==null || integers.length<3)

+			return false;

+		try {

+			for(Integer i: integers)

+				if (i==0)

+					return false;

+		} catch (Exception e) {

+			return false;

+		}

+		

+		Boolean bWrongOrder = false;

+		for(int i=1; i< integers.length; i++){

+			if (integers[i-1]>integers[i]){

+				bWrongOrder = true;

+				break;

+			}

+		}

+		

+		Boolean bWrongInverseOrder = false;

+		for(int i=1; i< integers.length; i++){

+			if (integers[i-1]<integers[i]){

+				bWrongInverseOrder = true;

+				break;

+			}

+		}

+		

+		return !(bWrongOrder && bWrongInverseOrder);

+	}

+

+

+

+	public static void main(String[] args){

+

+

+	}

+}

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/rhetoric_structure/RhetoricStructureMarker.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/rhetoric_structure/RhetoricStructureMarker.java
new file mode 100644
index 0000000..060d32f
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/rhetoric_structure/RhetoricStructureMarker.java
@@ -0,0 +1,129 @@
+package opennlp.tools.parse_thicket.rhetoric_structure;

+

+import java.util.ArrayList;

+import java.util.Arrays;

+import java.util.List;

+

+import opennlp.tools.parse_thicket.IGeneralizer;

+import opennlp.tools.parse_thicket.Pair;

+import opennlp.tools.parse_thicket.ParseTreeNode;

+

+

/**
 * Detects rhetoric-structure (RST) discourse markers inside a sentence by
 * generalizing the sentence against a fixed list of lemma/POS templates.
 * Each template is a (relation name, node pattern) pair; "*" acts as a
 * wildcard for a lemma or POS tag.
 */
public class RhetoricStructureMarker implements IGeneralizer<Integer[]>  {
	//private static String rstRelations[] = {"antithesis", "concession", "contrast", "elaboration"};
	// template list: relation name -> sequence of lemma/POS patterns
	List<Pair<String, ParseTreeNode[]>> rstMarkers = new ArrayList<Pair<String, ParseTreeNode[]>>();

	public  RhetoricStructureMarker(){

		rstMarkers.add(new Pair<String, ParseTreeNode[]>("contrast", new ParseTreeNode[]{new ParseTreeNode(",",","),  new ParseTreeNode("than",",")  }));
		rstMarkers.add(new Pair<String, ParseTreeNode[]>( "antithesis", new ParseTreeNode[]{new ParseTreeNode("although",","),  new ParseTreeNode("*","*")  }));
		rstMarkers.add(new Pair<String, ParseTreeNode[]>( "contrast", new ParseTreeNode[]{new ParseTreeNode(",",","),  new ParseTreeNode("however","*")  }));
		rstMarkers.add(new Pair<String, ParseTreeNode[]>( "contrast", new ParseTreeNode[]{new ParseTreeNode("however","*"), new ParseTreeNode(",",","),
					new ParseTreeNode("*","prp"),   }));
		rstMarkers.add(new Pair<String, ParseTreeNode[]>( "elaboration", new ParseTreeNode[]{new ParseTreeNode(",",","),  new ParseTreeNode("*","NN")  }));
		rstMarkers.add(new Pair<String, ParseTreeNode[]>( "elaboration", new ParseTreeNode[]{new ParseTreeNode("as","*"),  new ParseTreeNode("a","*")  }));
	
		rstMarkers.add(new Pair<String, ParseTreeNode[]>("explanation", new ParseTreeNode[]{new ParseTreeNode(",",","),  new ParseTreeNode("because",",")  }));
		rstMarkers.add(new Pair<String, ParseTreeNode[]>( "example", new ParseTreeNode[]{new ParseTreeNode("for","IN"),  new ParseTreeNode("example","NN")  }));
		// NOTE(review): lemma "ye" below looks like a typo for "yet" — confirm
		// before changing, since it is matched against sentence text at runtime.
		rstMarkers.add(new Pair<String, ParseTreeNode[]>( "contrast", new ParseTreeNode[]{new ParseTreeNode(",",","),  new ParseTreeNode("ye","*")  }));
		rstMarkers.add(new Pair<String, ParseTreeNode[]>( "contrast", new ParseTreeNode[]{new ParseTreeNode("yet","*"), new ParseTreeNode(",",","),
					new ParseTreeNode("*","prp"),   }));
		
		rstMarkers.add(new Pair<String, ParseTreeNode[]>( "contrast", new ParseTreeNode[]{new ParseTreeNode("yet","*"), new ParseTreeNode("i","*"),
				  }));
		
		rstMarkers.add(new Pair<String, ParseTreeNode[]>( "explanation", new ParseTreeNode[]{new ParseTreeNode(",",","),  new ParseTreeNode("where","*")  }));
		//as long as
		rstMarkers.add(new Pair<String, ParseTreeNode[]>( "temp_sequence", new ParseTreeNode[]{/*new ParseTreeNode("as","*"),*/ new ParseTreeNode("*","RB"), 
				new ParseTreeNode("as","IN"),}));
		rstMarkers.add(new Pair<String, ParseTreeNode[]>( "temp_sequence", new ParseTreeNode[]{/*new ParseTreeNode("as","*"),*/ new ParseTreeNode("*","VB*"), 
				new ParseTreeNode("until","IN"),}));

	}

	/* For a sentence, we obtain a list of markers with the CA word and position in the sentence.
	 * Output span is an integer array with start/end occurrence of an RST marker in a sentence.
	 * Only the first occurrence per template is reported (see generalize below).
	 * */
	public List<Pair<String, Integer[]>> extractRSTrelationInSentenceGetBoundarySpan(List<ParseTreeNode> sentence){
		List<Pair<String, Integer[]>> results = new ArrayList<Pair<String, Integer[]>> ();
		
		for(Pair<String, ParseTreeNode[]> template: rstMarkers){
			List<Integer[]> spanList = generalize(sentence,template.getSecond() );
			if (!spanList.isEmpty())
				results.add(new Pair<String, Integer[]>(template.getFirst(), spanList.get(0)));
		}
		return results;
	}

	/* Rule application in the form of generalization.
	 * Generalizing a sentence with a rule (a template), we obtain the occurrence of a rhetoric marker.
	 *
	 * o1 - sentence (List<ParseTreeNode>)
	 * o2 - rule/template (ParseTreeNode[]), specifying lemmas and/or POS, including punctuation
	 * @see opennlp.tools.parse_thicket.IGeneralizer#generalize(java.lang.Object, java.lang.Object)
	 * returns at most one span Integer[]{startIndex, endIndexInclusive} —
	 * the method returns on the FIRST successful match.
	 */
	@Override
	public List<Integer[]> generalize(Object o1, Object o2) {
		List<Integer[]> result = new ArrayList<Integer[]>();

		List<ParseTreeNode> sentence = (List<ParseTreeNode> )o1;
		ParseTreeNode[] template = (ParseTreeNode[]) o2;

		// NOTE(review): bBeingMatched is written but never read — dead state.
		boolean bBeingMatched = false;
		for(int wordIndexInSentence=0; wordIndexInSentence<sentence.size(); wordIndexInSentence++){
			ParseTreeNode word = sentence.get(wordIndexInSentence);
			int wordIndexInSentenceEnd = wordIndexInSentence; //init iterators for internal loop
			int templateIterator=0;
			// try to align the whole template starting at this sentence position
			while (wordIndexInSentenceEnd<sentence.size() && templateIterator< template.length){
				ParseTreeNode tword = template[templateIterator];
				ParseTreeNode currWord=sentence.get(wordIndexInSentenceEnd);
				List<ParseTreeNode> gRes = tword.generalize(tword, currWord);
				// a fully wildcarded generalization ("*"/"*") counts as NO match
				if (gRes.isEmpty()|| gRes.get(0)==null || ( gRes.get(0).getWord().equals("*") 
						&& gRes.get(0).getPos().equals("*") )){
					bBeingMatched = false;
					break;
				} else {
					bBeingMatched = true;
				}
				wordIndexInSentenceEnd++;
				templateIterator++;
			}
			// template iteration is done
			// the only condition for successful match is IF we are at the end of template
			if (templateIterator == template.length){
				result.add(new Integer[]{wordIndexInSentence, wordIndexInSentenceEnd-1});
				return result;
			}

			// no match for current sentence word: proceed to the next
		}
		return result; 
	}
	
	/** Formats a marker list as "[relation:start end  | ...]" for debugging. */
	public String markerToString(List<Pair<String, Integer[]>> res){
		StringBuffer buf = new StringBuffer();
		buf.append("[");
		for(Pair<String, Integer[]> marker: res){
			buf.append(marker.getFirst()+":");
			for(int a: marker.getSecond()){
				buf.append(a+" ");
			}
			buf.append (" | ");
		}
		buf.append("]");
		return buf.toString();
	}

	/** Demo driver: runs marker extraction over a hand-built sentence. */
	public static void main(String[] args){
		ParseTreeNode[] sent = 	
		new ParseTreeNode[]{new ParseTreeNode("he","prn"), new ParseTreeNode("was","vbz"), new ParseTreeNode("more","jj"), 
				new ParseTreeNode(",",","),  new ParseTreeNode("than",","), new ParseTreeNode("little","jj"), new ParseTreeNode("boy","nn"),
				new ParseTreeNode(",",","), new ParseTreeNode("however","*"), new ParseTreeNode(",",","),
				new ParseTreeNode("he","prp"), new ParseTreeNode("was","vbz"), new ParseTreeNode("adult","jj")
		};
		
		List<Pair<String, Integer[]>> res = new RhetoricStructureMarker().extractRSTrelationInSentenceGetBoundarySpan(Arrays.asList(sent));
		System.out.println( new RhetoricStructureMarker().markerToString(res));
	} 
}

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java
index 9e793b3..c9b1f76 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java
@@ -17,28 +17,90 @@
 

 package opennlp.tools.similarity.apps;

 

-import java.io.BufferedReader;

-import java.io.InputStreamReader;

-import java.net.URL;

-import java.net.URLConnection;

-import java.net.URLEncoder;

 import java.util.ArrayList;

 import java.util.List;

 import java.util.logging.Logger;

 

-import org.apache.commons.lang.StringUtils;

-import org.json.JSONArray;

-import org.json.JSONObject;

+import net.billylieurance.azuresearch.AzureSearchImageQuery;

+import net.billylieurance.azuresearch.AzureSearchImageResult;

+import net.billylieurance.azuresearch.AzureSearchResultSet;

+import net.billylieurance.azuresearch.AzureSearchWebQuery;

+import net.billylieurance.azuresearch.AzureSearchWebResult;

 

 public class BingQueryRunner {

-  protected static final String APP_ID = "e8ADxIjn9YyHx36EihdjH/tMqJJItUrrbPTUpKahiU0=";

-    //"DD4E2A5DF8B7E5801ED443E47DC600D5F3E62713";

-  // TODO user needs to have own APP_ID from Bing API

+	

+	protected static String BING_KEY = "e8ADxIjn9YyHx36EihdjH/tMqJJItUrrbPTUpKahiU0=";

+	private static final Logger LOG = Logger

+		      .getLogger("opennlp.tools.similarity.apps.BingQueryRunner");

+	protected AzureSearchWebQuery aq = new AzureSearchWebQuery();

+	private AzureSearchImageQuery iq = new AzureSearchImageQuery();

+	

	/**
	 * Overrides the Azure/Bing account key used for subsequent queries.
	 * NOTE(review): assigns a static field from an instance method, so the key
	 * is shared across all BingQueryRunner instances — confirm this is intended.
	 */
	public void setKey(String key){
		BING_KEY = key;
	}

+	

	/**
	 * Sets the market/locale for web search results (e.g. "es-MX", "de-DE").
	 */
	public void setLang(String language){
		aq.setMarket(language);
	}

+  

	/**
	 * Runs a Bing web search via the Azure Search API and converts the raw
	 * results into HitBase objects (title, abstract, url).
	 *
	 * NOTE(review): account keys are hard-coded here (the field default plus a
	 * fallback key retried on failure). Credentials should not be committed to
	 * source — move them to configuration. Failures are only printed; on a
	 * double failure this method proceeds and may return partial/empty results.
	 *
	 * @param query the search expression
	 * @param nRes  requested number of results per page
	 * @return list of hits; empty when the query produced no usable results
	 */
	public List<HitBase> runSearch(String query, int nRes) {
		aq.setAppid(BING_KEY);
		aq.setQuery(query);		
		aq.setPerPage(nRes);
		try {
			aq.doQuery();
		} catch (Exception e) { // most likely exception is due to limit on bing key
			// fall back to a secondary (hard-coded) key and retry once
			aq.setAppid("pjtCgujmf9TtfjCVBdcQ2rBUQwGLmtLtgCG4Ex7kekw");
			try {
				aq.doQuery();
			} catch (Exception e1) {
				// TODO Auto-generated catch block
				e1.printStackTrace();
			}
			e.printStackTrace();
		}
		
		//org.xml.sax.SAXParseException
		
		// convert Azure result records into HitBase objects
		List<HitBase> results = new ArrayList<HitBase> ();
		AzureSearchResultSet<AzureSearchWebResult> ars = aq.getQueryResult();
		
		for (AzureSearchWebResult anr : ars){
		    HitBase h = new HitBase();
		    h.setAbstractText(anr.getDescription());
		    h.setTitle(anr.getTitle());
		    h.setUrl(anr.getUrl());
		    results.add(h);
		}
		return results;
	}

+	

+	

	/**
	 * Runs a Bing image search for the given query and returns the raw Azure
	 * result set. Unlike runSearch, any query failure propagates to the caller.
	 */
	public AzureSearchResultSet<AzureSearchImageResult> runImageSearch(String query) {
		iq.setAppid(BING_KEY);
		iq.setQuery(query);		
		iq.doQuery();
		
		AzureSearchResultSet<AzureSearchImageResult> ars = iq.getQueryResult();

		return ars;
	}

	/**
	 * Estimates the number of pages indexed for a site via a "site:" query.
	 * NOTE(review): despite requesting 1,000,000 results, the API caps results
	 * per page, so this count is at best a lower-bound approximation — verify.
	 */
	public int getTotalPagesAtASite(String site)
	{
		return runSearch("site:"+site, 1000000).size();
	}

+	

+

+	public List<HitBase> runSearch(String query) {

+		return runSearch(query, 100);

+	}	

+	

+	

+	

 

   private float snapshotSimilarityThreshold = 0.4f;

 

-  private static final Logger LOG = Logger

-      .getLogger("opennlp.tools.similarity.apps.BingQueryRunner");

+  

 

   public void setSnapshotSimilarityThreshold(float thr) {

     snapshotSimilarityThreshold = thr;

@@ -53,8 +115,7 @@
   }

 

   /*

-   * 

-   */

+ 

 

   private String constructBingUrl(String query, String domainWeb, String lang,

       int numbOfHits) throws Exception {

@@ -73,9 +134,8 @@
     return yahooRequest;

   }

 

-  /*

-     *  

-     */

+ 

+    

   public ArrayList<String> search(String query, String domainWeb, String lang,

       int numbOfHits) throws Exception {

     URL url = new URL(constructBingUrl(query, domainWeb, lang, numbOfHits));

@@ -145,6 +205,7 @@
     hits = HitBase.removeDuplicates(hits);

     return hits;

   }

+  */

 

   // TODO comment back when dependencies resolved (CopyrightViolations)

   /*

@@ -185,10 +246,16 @@
 

   public static void main(String[] args) {

     BingQueryRunner self = new BingQueryRunner();

+    

+    AzureSearchResultSet<AzureSearchImageResult> res = self.runImageSearch("albert einstein");

+    System.out.println(res);

     try {

+    	self.setLang("es-MX");

+    	self.setKey(

+    			"e8ADxIjn9YyHx36EihdjH/tMqJJItUrrbPTUpKahiU0=");

       List<HitBase> resp = self

-          .runSearch("Rates rise at weekly Treasury auction");

-      // "British Actress Lynn Redgrave dies at 67");

+          .runSearch(//"art scene");

+        		  "biomecanica las palancas");

       System.out.print(resp.get(0));

     } catch (Exception e) {

       // TODO Auto-generated catch block

@@ -196,6 +263,12 @@
     }

 

     /*

+     * 

+     * de-DE

+     * es-MX

+     * es-SP

+     */

+    /*

      * String[] submittedNews = new String[]{

      * "Asian airports had already increased security following the Christmas Day attack, but South Korea and Pakistan are thinking about additional measures."

      * ,

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGenerator.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGenerator.java
new file mode 100644
index 0000000..4bff64f
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGenerator.java
@@ -0,0 +1,467 @@
+/*

+ * Licensed to the Apache Software Foundation (ASF) under one or more

+ * contributor license agreements.  See the NOTICE file distributed with

+ * this work for additional information regarding copyright ownership.

+ * The ASF licenses this file to You under the Apache License, Version 2.0

+ * (the "License"); you may not use this file except in compliance with

+ * the License. You may obtain a copy of the License at

+ *

+ *     http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+

+package opennlp.tools.similarity.apps;

+

+import java.util.ArrayList;

+import java.util.Arrays;

+import java.util.List;

+import java.util.logging.Logger;

+

+import opennlp.tools.parse_thicket.Triple;

+import opennlp.tools.similarity.apps.utils.PageFetcher;

+import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer;

+import opennlp.tools.similarity.apps.utils.Utils;

+import opennlp.tools.textsimilarity.ParseTreeChunk;

+import opennlp.tools.textsimilarity.ParseTreeChunkListScorer;

+import opennlp.tools.textsimilarity.SentencePairMatchResult;

+import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;

+

+/*

+ * This class does content generation by using web mining and syntactic generalization to get sentences from the web, convert and combine 

+ * them in the form 

+ * expected to be readable by humans and not distinguishable from genuine content by search engines

+ * 

+ */

+

+public class ContentGenerator /*extends RelatedSentenceFinder*/ {

+	private static Logger LOG = Logger

+			.getLogger("opennlp.tools.similarity.apps.ContentGenerator");

+	PageFetcher pFetcher = new PageFetcher();

+	ParserChunker2MatcherProcessor sm = ParserChunker2MatcherProcessor

+			.getInstance();

+	protected ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();

+	protected ParseTreeChunk parseTreeChunk = new ParseTreeChunk();

+	protected static StringDistanceMeasurer stringDistanceMeasurer = new StringDistanceMeasurer();

+	protected BingQueryRunner yrunner = new BingQueryRunner();

+	protected ContentGeneratorSupport support = new ContentGeneratorSupport();

+	protected int MAX_STEPS = 1;

+	protected int MAX_SEARCH_RESULTS = 1;

+	protected float RELEVANCE_THRESHOLD = 1.1f;

+

+	//private static final int MAX_FRAGMENT_SENTS = 10;

+

	/**
	 * Configures the generator.
	 *
	 * @param ms     maximum number of search steps (verb variations tried)
	 * @param msr    maximum search results used per query
	 * @param thresh relevance threshold
	 * @param key    Bing/Azure search account key
	 */
	public ContentGenerator(int ms, int msr, float thresh, String key) {
		this.MAX_STEPS = ms;
		this.MAX_SEARCH_RESULTS = msr;
		this.RELEVANCE_THRESHOLD=thresh;
		yrunner.setKey(key);
	}

+

	/** Default configuration: field initializers above apply unchanged. */
	public ContentGenerator() {
		// TODO Auto-generated constructor stub
	}

	/** Sets the search market/locale (e.g. "es-MX") on the underlying query runner. */
	public void setLang(String lang) {
		yrunner.setLang(lang);

	}

+

+

+	/**

+	 * Main content generation function which takes a seed as a person, rock

+	 * group, or other entity name and produce a list of text fragments by web

+	 * mining for <br>

+	 * 

+	 * @param String

+	 *          entity name

+	 * @return List<HitBase> of text fragment structures which contain approved

+	 *         (in terms of relevance) mined sentences, as well as original search

+	 *         results objects such as doc titles, abstracts, and urls.

+	 */

+

+	public List<HitBase> generateContentAbout(String sentence) throws Exception {

+		List<HitBase> opinionSentencesToAdd = new ArrayList<HitBase>();

+		System.out.println(" \n=== Entity to write about = " + sentence);

+	

+		int stepCount=0;

+		for (String verbAddition : StoryDiscourseNavigator.frequentPerformingVerbs) {

+			List<HitBase> searchResult = yrunner.runSearch(sentence + " "

+					+ verbAddition, MAX_SEARCH_RESULTS); //100);

+			if (MAX_SEARCH_RESULTS<searchResult.size())

+				searchResult = searchResult.subList(0, MAX_SEARCH_RESULTS);

+			//TODO for shorter run

+			if (searchResult != null) {

+				for (HitBase item : searchResult) { // got some text from .html

+					if (item.getAbstractText() != null

+							&& !(item.getUrl().indexOf(".pdf") > 0)) { // exclude pdf

+						opinionSentencesToAdd

+						.add(buildParagraphOfGeneratedText(item, sentence, null));

+					}

+				}

+			}

+			stepCount++;

+			if (stepCount>MAX_STEPS)

+				break;

+		}

+

+		opinionSentencesToAdd = ContentGeneratorSupport.removeDuplicatesFromResultantHits(opinionSentencesToAdd);

+		return opinionSentencesToAdd;

+	}

+

+	/**

+	 * Takes a sentence and extracts noun phrases and entity names to from search

+	 * queries for finding relevant sentences on the web, which are then subject

+	 * to relevance assessment by Similarity. Search queries should not be too

+	 * general (irrelevant search results) or too specific (too few search

+	 * results)

+	 * 

+	 * @param String

+	 *          input sentence to form queries

+	 * @return List<String> of search expressions

+	 */

+	public static List<String> buildSearchEngineQueryFromSentence(String sentence) {

+		ParseTreeChunk matcher = new ParseTreeChunk();

+		ParserChunker2MatcherProcessor pos = ParserChunker2MatcherProcessor

+				.getInstance();

+		List<List<ParseTreeChunk>> sent1GrpLst = null;

+

+		List<ParseTreeChunk> nPhrases = pos

+				.formGroupedPhrasesFromChunksForSentence(sentence).get(0);

+		List<String> queryArrayStr = new ArrayList<String>();

+		for (ParseTreeChunk ch : nPhrases) {

+			String query = "";

+			int size = ch.getLemmas().size();

+

+			for (int i = 0; i < size; i++) {

+				if (ch.getPOSs().get(i).startsWith("N")

+						|| ch.getPOSs().get(i).startsWith("J")) {

+					query += ch.getLemmas().get(i) + " ";

+				}

+			}

+			query = query.trim();

+			int len = query.split(" ").length;

+			if (len < 2 || len > 5)

+				continue;

+			if (len < 4) { // every word should start with capital

+				String[] qs = query.split(" ");

+				boolean bAccept = true;

+				for (String w : qs) {

+					if (w.toLowerCase().equals(w)) // idf only two words then

+						// has to be person name,

+						// title or geo location

+						bAccept = false;

+				}

+				if (!bAccept)

+					continue;

+			}

+

+			query = query.trim().replace(" ", " +");

+			query = " +" + query;

+

+			queryArrayStr.add(query);

+

+		}

+		if (queryArrayStr.size() < 1) { // release constraints on NP down to 2

+			// keywords

+			for (ParseTreeChunk ch : nPhrases) {

+				String query = "";

+				int size = ch.getLemmas().size();

+

+				for (int i = 0; i < size; i++) {

+					if (ch.getPOSs().get(i).startsWith("N")

+							|| ch.getPOSs().get(i).startsWith("J")) {

+						query += ch.getLemmas().get(i) + " ";

+					}

+				}

+				query = query.trim();

+				int len = query.split(" ").length;

+				if (len < 2)

+					continue;

+

+				query = query.trim().replace(" ", " +");

+				query = " +" + query;

+

+				queryArrayStr.add(query);

+

+			}

+		}

+

+		queryArrayStr = ContentGeneratorSupport.removeDuplicatesFromQueries(queryArrayStr);

+		queryArrayStr.add(sentence);

+

+		return queryArrayStr;

+

+	}

+

+	private Triple<List<String>, String, String[]> formCandidateFragmentsForPage(HitBase item, String originalSentence, List<String> sentsAll){

+		if (sentsAll == null)

+			sentsAll = new ArrayList<String>();

+		// put orig sentence in structure

+		List<String> origs = new ArrayList<String>();

+		origs.add(originalSentence);

+		item.setOriginalSentences(origs);

+		String title = item.getTitle().replace("<b>", " ").replace("</b>", " ")

+				.replace("  ", " ").replace("  ", " ");

+		// generation results for this sentence

+		List<Fragment> result = new ArrayList<Fragment>();

+		// form plain text from snippet

+		String snapshot = item.getAbstractText().replace("<b>", " ")

+				.replace("</b>", " ").replace("  ", " ").replace("  ", " ");

+

+

+		// fix a template expression which can be substituted by original if

+		// relevant

+		String snapshotMarked = snapshot.replace("...",

+				" _should_find_orig_ . _should_find_orig_");

+		String[] fragments = sm.splitSentences(snapshotMarked);

+		List<String> allFragms = new ArrayList<String>();

+		allFragms.addAll(Arrays.asList(fragments));

+

+		String[] sents = null;

+		String downloadedPage = null;

+		try {

+			if (snapshotMarked.length() != snapshot.length()) {

+				downloadedPage = pFetcher.fetchPage(item.getUrl());

+				if (downloadedPage != null && downloadedPage.length() > 100) {

+					item.setPageContent(downloadedPage);

+					String pageContent = Utils.fullStripHTML(item.getPageContent());

+					pageContent = GeneratedSentenceProcessor

+							.normalizeForSentenceSplitting(pageContent);

+					pageContent = ContentGeneratorSupport.cleanSpacesInCleanedHTMLpage(pageContent);

+			

+					sents = sm.splitSentences(pageContent);

+

+					sents = ContentGeneratorSupport.cleanListOfSents(sents);

+				}

+			}

+		} catch (Exception e) {

+			// TODO Auto-generated catch block

+			// e.printStackTrace();

+			System.err

+			.println("Problem downloading  the page and splitting into sentences");

+			return new Triple(allFragms, downloadedPage, sents);

+		}

+		return new Triple(allFragms, downloadedPage, sents);

+	}

+

+	private String[] formCandidateSentences(String fragment, Triple<List<String>, String, String[]> fragmentExtractionResults){

+		String[] mainAndFollowSent = null;

+

+		List<String> allFragms = (List<String>)fragmentExtractionResults.getFirst();

+		String downloadedPage = (String)fragmentExtractionResults.getSecond();

+		String[] sents = (String[])fragmentExtractionResults.getThird();

+

+		String followSent = null;

+		if (fragment.length() < 50)

+			return null;

+		String pageSentence = "";

+		// try to find original sentence from webpage

+		if (fragment.indexOf("_should_find_orig_") > -1 && sents != null

+				&& sents.length > 0){

+			try { 

+				// first try sorted sentences from page by length approach

+				String[] sentsSortedByLength = support.extractSentencesFromPage(downloadedPage);

+

+

+				try {

+					mainAndFollowSent = ContentGeneratorSupport.getFullOriginalSentenceFromWebpageBySnippetFragment(

+							fragment.replace("_should_find_orig_", ""), sentsSortedByLength);

+				} catch (Exception e) {

+					// TODO Auto-generated catch block

+					e.printStackTrace();

+				}

+				// if the above gives null than try to match all sentences from snippet fragment

+				if (mainAndFollowSent==null || mainAndFollowSent[0]==null){

+					mainAndFollowSent = ContentGeneratorSupport.getFullOriginalSentenceFromWebpageBySnippetFragment(

+							fragment.replace("_should_find_orig_", ""), sents);

+				}

+

+

+			} catch (Exception e) {

+

+				// TODO Auto-generated catch block

+				e.printStackTrace();

+			}

+		}

+		else

+			// or get original snippet

+			pageSentence = fragment;

+		if (pageSentence != null)

+			pageSentence.replace("_should_find_orig_", "");

+

+		return mainAndFollowSent;

+

+	}	

+

+	private Fragment verifyCandidateSentencesAndFormParagraph(

+			String[] candidateSentences, HitBase item, String fragment, String originalSentence, List<String> sentsAll) {

+		Fragment result = null;	

+

+		String pageSentence = candidateSentences[0];

+		String followSent = "";

+		for(int i = 1; i< candidateSentences.length; i++)

+			followSent+= candidateSentences[i];

+		String title = item.getTitle();

+

+		// resultant sentence SHOULD NOT be longer than four times the size of

+		// snippet fragment

+		if (!(pageSentence != null && pageSentence.length()>50 

+				&& (float) pageSentence.length() / (float) fragment.length() < 4.0) )

+			return null;

+

+

+		try { // get score from syntactic match between sentence in

+			// original text and mined sentence

+			double measScore = 0.0, syntScore = 0.0, mentalScore = 0.0;

+

+			SentencePairMatchResult matchRes = sm.assessRelevance(pageSentence

+					+ " " + title, originalSentence);

+			List<List<ParseTreeChunk>> match = matchRes.getMatchResult();

+			if (!matchRes.isVerbExists() || matchRes.isImperativeVerb()) {

+				System.out

+				.println("Rejected Sentence : No verb OR Yes imperative verb :"

+						+ pageSentence);

+				return null;

+			}

+

+			syntScore = parseTreeChunkListScorer

+					.getParseTreeChunkListScore(match);

+			System.out.println(parseTreeChunk.listToString(match) + " "

+					+ syntScore + "\n pre-processed sent = '" + pageSentence);

+

+			if (syntScore < RELEVANCE_THRESHOLD){ // 1.5) { // trying other sents

+				for (String currSent : sentsAll) {

+					if (currSent.startsWith(originalSentence))

+						continue;

+					match = sm.assessRelevance(currSent, pageSentence)

+							.getMatchResult();

+					double syntScoreCurr = parseTreeChunkListScorer

+							.getParseTreeChunkListScore(match);

+					if (syntScoreCurr > syntScore) {

+						syntScore = syntScoreCurr;

+					}

+				}

+				if (syntScore > RELEVANCE_THRESHOLD) {

+					System.out.println("Got match with other sent: "

+							+ parseTreeChunk.listToString(match) + " " + syntScore);

+				}

+			}

+

+			measScore = stringDistanceMeasurer.measureStringDistance(

+					originalSentence, pageSentence);

+

+

+			if ((syntScore > RELEVANCE_THRESHOLD || measScore > 0.5)

+					&& measScore < 0.8 && pageSentence.length() > 40) // >70

+			{

+				String pageSentenceProc = GeneratedSentenceProcessor

+						.acceptableMinedSentence(pageSentence);

+				if (pageSentenceProc != null) {

+					pageSentenceProc = GeneratedSentenceProcessor

+							.processSentence(pageSentenceProc);

+					followSent = GeneratedSentenceProcessor.processSentence(followSent);

+					if (followSent != null) {

+						pageSentenceProc += " "+ followSent;

+					}

+

+					pageSentenceProc = Utils.convertToASCII(pageSentenceProc);

+					result = new Fragment(pageSentenceProc, syntScore + measScore

+							+ mentalScore + (double) pageSentenceProc.length()

+							/ (double) 50);

+					result.setSourceURL(item.getUrl());

+					result.fragment = fragment;

+

+					System.out.println("Accepted sentence: " + pageSentenceProc

+							+ "| with title= " + title);

+					System.out.println("For fragment = " + fragment);

+				} else

+					System.out

+					.println("Rejected sentence due to wrong area at webpage: "

+							+ pageSentence);

+			} else

+				System.out.println("Rejected sentence due to low score: "

+						+ pageSentence);

+			// }

+		} catch (Throwable t) {

+			t.printStackTrace();

+		}

+

+	return result;

+}

+	/**

+	 * Takes single search result for an entity which is the subject of the essay

+	 * to be written and forms essay sentences from the title, abstract, and

+	 * possibly original page

+	 * 

+	 * @param item

+	 *          search result

+	 * @param originalSentence

+	 *          : seed for the essay to be written

+	 * @param sentsAll

+	 *          : list<String> of other sentences in the seed if it is

+	 *          multi-sentence

+	 * @return search result

+	 */

+	public HitBase buildParagraphOfGeneratedText(HitBase item,

+			String originalSentence, List<String> sentsAll) {

+		List<Fragment> results = new ArrayList<Fragment>() ;

+		

+		Triple<List<String>, String, String[]> fragmentExtractionResults = formCandidateFragmentsForPage(item, originalSentence, sentsAll);

+

+		List<String> allFragms = (List<String>)fragmentExtractionResults.getFirst();

+		String downloadedPage = (String)fragmentExtractionResults.getSecond();

+		String[] sents = (String[])fragmentExtractionResults.getThird();

+

+		for (String fragment : allFragms) {

+			String[] candidateSentences = formCandidateSentences(fragment, fragmentExtractionResults);

+			if (candidateSentences == null)

+				continue;

+			Fragment res = verifyCandidateSentencesAndFormParagraph(candidateSentences, item, fragment, originalSentence, sentsAll);

+			if (res!=null)

+				results.add(res);

+

+		}

+		

+		item.setFragments(results );

+		return item;

+	}

+

+

+

+

+public static void main(String[] args) {

+	ContentGenerator f = new ContentGenerator();

+

+	List<HitBase> hits = null;

+	try {

+		// uncomment the sentence you would like to serve as a seed sentence for

+		// content generation for an event description

+

+		// uncomment the sentence you would like to serve as a seed sentence for

+		// content generation for an event description

+		hits = f.generateContentAbout("Albert Einstein"

+				// "Britney Spears - The Femme Fatale Tour"

+				// "Rush Time Machine",

+				// "Blue Man Group" ,

+				// "Belly Dance With Zaharah",

+				// "Hollander Musicology Lecture: Danielle Fosler-Lussier, Guest Lecturer",

+				// "Jazz Master and arguably the most famous jazz musician alive, trumpeter Wynton Marsalis",

+				);

+		System.out.println(HitBase.toString(hits));

+		System.out.println(HitBase.toResultantString(hits));

+		// WordFileGenerator.createWordDoc("Essey about Albert Einstein",

+		// hits.get(0).getTitle(), hits);

+

+	} catch (Exception e) {

+		e.printStackTrace();

+	}

+

+}

+

+

+

+}
\ No newline at end of file
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGeneratorRunner.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGeneratorRunner.java
new file mode 100644
index 0000000..4cc36a5
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGeneratorRunner.java
@@ -0,0 +1,99 @@
+/*

+ * Licensed to the Apache Software Foundation (ASF) under one or more

+ * contributor license agreements.  See the NOTICE file distributed with

+ * this work for additional information regarding copyright ownership.

+ * The ASF licenses this file to You under the Apache License, Version 2.0

+ * (the "License"); you may not use this file except in compliance with

+ * the License. You may obtain a copy of the License at

+ *

+ *     http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+package opennlp.tools.similarity.apps;

+

+import java.util.List;

+

+import javax.mail.internet.AddressException;

+import javax.mail.internet.InternetAddress;

+

+import opennlp.tools.apps.utils.email.EmailSender;

+import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;

+

+public class ContentGeneratorRunner {

+	public static void main(String[] args) {

+		ParserChunker2MatcherProcessor sm = null;

+	    	    

+	    try {

+			String resourceDir = args[2];

+			if (resourceDir!=null)

+				sm = ParserChunker2MatcherProcessor.getInstance(resourceDir);

+			else

+				sm = ParserChunker2MatcherProcessor.getInstance();

+	

+		} catch (Exception e) {

+			// TODO Auto-generated catch block

+			e.printStackTrace();

+		}

+	    

+	    String bingKey = args[7];

+	    if (bingKey == null){

+	    	bingKey = "e8ADxIjn9YyHx36EihdjH/tMqJJItUrrbPTUpKahiU0=";

+	    }

+	    

+	    RelatedSentenceFinder f = null;

+	    String lang = args[6];

+	    if (lang.startsWith("es")){

+	    	f = new RelatedSentenceFinderML(Integer.parseInt(args[3]), Integer.parseInt(args[4]), Float.parseFloat(args[5]), bingKey);

+	    	f.setLang(lang);

+	    } else	    

+	    

+		    if (args.length>4 && args[4]!=null)

+		    	f = new RelatedSentenceFinder(Integer.parseInt(args[3]), Integer.parseInt(args[4]), Float.parseFloat(args[5]), bingKey);

+		    else

+		    	f = new RelatedSentenceFinder();

+		    

+	    List<HitBase> hits = null;

+	    try {

+	      

+	      hits = f.generateContentAbout(args[0].replace('+', ' ').replace('"', ' ').trim());

+	      System.out.println(HitBase.toString(hits));

+	      String generatedContent = HitBase.toResultantString(hits);

+	      

+	      opennlp.tools.apps.utils.email.EmailSender s = new opennlp.tools.apps.utils.email.EmailSender();

+			

+			try {

+				s.sendMail("smtp.live.com", "bgalitsky@hotmail.com", "borgalor", new InternetAddress("bgalitsky@hotmail.com"), new InternetAddress[]{new InternetAddress(args[1])}, new InternetAddress[]{}, new InternetAddress[]{}, 

+						"Generated content for you on '"+args[0].replace('+', ' ')+"'", generatedContent, null);

+			} catch (AddressException e) {

+				// TODO Auto-generated catch block

+				e.printStackTrace();

+			} catch (Exception e) {

+		

+				e.printStackTrace();

+				try {

+					s.sendMail("smtp.live.com", "bgalitsky@hotmail.com", "borgalor", new InternetAddress("bgalitsky@hotmail.com"), new InternetAddress[]{new InternetAddress(args[1])}, new InternetAddress[]{}, new InternetAddress[]{}, 

+							"Generated content for you on '"+args[0].replace('+', ' ')+"'", generatedContent, null);

+				} catch (Exception e1) {

+					// TODO Auto-generated catch block

+					e1.printStackTrace();

+				}

+			}

+	      

+	      

+	    } catch (Exception e) {

+	      e.printStackTrace();

+	    }

+

+	  }

+}

+

+/*

+ * C:\stanford-corenlp>java -Xmx1g -jar pt.jar albert+einstein bgalitsky@hotmail.com C:/stanford-corenlp/src/test/resources

+ * 

+ * http://173.255.254.250:8983/solr/contentgen/?q=albert+einstein&email=bgalitsky@hotmail.com&resourceDir=/home/solr/solr-4.4.0/example/src/test/resources&workDir=/home/solr/solr-4.4.0/example/solr-webapp/webapp/WEB-INF/lib&stepsNum=20&searchResultsNum=100&relevanceThreshold=0.5&lang=es-US&bingKey=e8ADxIjn9YyHx36EihdjH/tMqJJItUrrbPTUpKahiU0=

+ */

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGeneratorSupport.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGeneratorSupport.java
new file mode 100644
index 0000000..428cd4e
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGeneratorSupport.java
@@ -0,0 +1,478 @@
+/*

+ * Licensed to the Apache Software Foundation (ASF) under one or more

+ * contributor license agreements.  See the NOTICE file distributed with

+ * this work for additional information regarding copyright ownership.

+ * The ASF licenses this file to You under the Apache License, Version 2.0

+ * (the "License"); you may not use this file except in compliance with

+ * the License. You may obtain a copy of the License at

+ *

+ *     http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+

+package opennlp.tools.similarity.apps;

+

+import java.util.ArrayList;

+import java.util.Arrays;

+import java.util.Collections;

+import java.util.Comparator;

+import java.util.List;

+import java.util.logging.Logger;

+

+import opennlp.tools.parse_thicket.Triple;

+import opennlp.tools.parse_thicket.apps.SnippetToParagraph;

+import opennlp.tools.parse_thicket.apps.SnippetToParagraph.TextChunk;

+import opennlp.tools.parse_thicket.apps.SnippetToParagraph.TextChunkComparable;

+import opennlp.tools.similarity.apps.utils.PageFetcher;

+import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer;

+import opennlp.tools.similarity.apps.utils.Utils;

+import opennlp.tools.textsimilarity.ParseTreeChunk;

+import opennlp.tools.textsimilarity.ParseTreeChunkListScorer;

+import opennlp.tools.textsimilarity.SentencePairMatchResult;

+import opennlp.tools.textsimilarity.TextProcessor;

+import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;

+

+import org.apache.commons.lang.StringUtils;

+

+/*

+ * This class supports content generation by static functions

+ * 

+ */

+

+public class ContentGeneratorSupport {

+	private static Logger LOG = Logger

+			.getLogger("opennlp.tools.similarity.apps.ContentGeneratorSupport");

+

+	/**

+	 * Takes a sentence and extracts noun phrases and entity names to form search

+	 * queries for finding relevant sentences on the web, which are then subject

+	 * to relevance assessment by Similarity. Search queries should not be too

+	 * general (irrelevant search results) or too specific (too few search

+	 * results)

+	 * 

+	 * @param sentence

+	 *          input sentence to form queries

+	 * @return List<String> of search expressions

+	 */

+	public static List<String> buildSearchEngineQueryFromSentence(String sentence) {

+		ParseTreeChunk matcher = new ParseTreeChunk();

+		ParserChunker2MatcherProcessor pos = ParserChunker2MatcherProcessor

+				.getInstance();

+		List<List<ParseTreeChunk>> sent1GrpLst = null;

+

+		List<ParseTreeChunk> nPhrases = pos

+				.formGroupedPhrasesFromChunksForSentence(sentence).get(0);

+		List<String> queryArrayStr = new ArrayList<String>();

+		for (ParseTreeChunk ch : nPhrases) {

+			String query = "";

+			int size = ch.getLemmas().size();

+

+			for (int i = 0; i < size; i++) {

+				if (ch.getPOSs().get(i).startsWith("N")

+						|| ch.getPOSs().get(i).startsWith("J")) {

+					query += ch.getLemmas().get(i) + " ";

+				}

+			}

+			query = query.trim();

+			int len = query.split(" ").length;

+			if (len < 2 || len > 5)

+				continue;

+			if (len < 4) { // every word should start with capital

+				String[] qs = query.split(" ");

+				boolean bAccept = true;

+				for (String w : qs) {

+					if (w.toLowerCase().equals(w)) // if only two words then

+						// has to be person name,

+						// title or geo location

+						bAccept = false;

+				}

+				if (!bAccept)

+					continue;

+			}

+

+			query = query.trim().replace(" ", " +");

+			query = " +" + query;

+

+			queryArrayStr.add(query);

+

+		}

+		if (queryArrayStr.size() < 1) { // release constraints on NP down to 2

+			// keywords

+			for (ParseTreeChunk ch : nPhrases) {

+				String query = "";

+				int size = ch.getLemmas().size();

+

+				for (int i = 0; i < size; i++) {

+					if (ch.getPOSs().get(i).startsWith("N")

+							|| ch.getPOSs().get(i).startsWith("J")) {

+						query += ch.getLemmas().get(i) + " ";

+					}

+				}

+				query = query.trim();

+				int len = query.split(" ").length;

+				if (len < 2)

+					continue;

+

+				query = query.trim().replace(" ", " +");

+				query = " +" + query;

+

+				queryArrayStr.add(query);

+

+			}

+		}

+

+		queryArrayStr = removeDuplicatesFromQueries(queryArrayStr);

+		queryArrayStr.add(sentence);

+

+		return queryArrayStr;

+

+	}

+	

+	public static String[] cleanListOfSents(String[] sents) {

+		List<String> sentsClean = new ArrayList<String>();

+		for (String s : sents) {

+			if (s == null || s.trim().length() < 30 || s.length() < 20)

+				continue;

+			sentsClean.add(s);

+		}

+		return (String[]) sentsClean.toArray(new String[0]);

+	}

+

+	public static String cleanSpacesInCleanedHTMLpage(String pageContent){ //was 4 spaces 

+		 //was 3 spaces => now back to 2

+		//TODO - verify regexp!!

+		pageContent = pageContent.trim().replaceAll("([a-z])(\\s{2,3})([A-Z])", "$1. $3")

+				//replaceAll("[a-z]  [A-Z]", ". $0")// .replace("  ",

+				// ". ")

+				.replace("..", ".").replace(". . .", " ").

+				replace(".    .",". ").trim(); // sometimes   html breaks are converted into ' ' (two spaces), so

+		// we need to put '.'

+		return pageContent;

+	}

+

+	/**

+	 * remove dupes from queries to easy cleaning dupes and repetitive search

+	 * afterwards

+	 * 

+	 * @param hits List

+	 *          <String> of sentences (search queries, or search results

+	 *          abstracts, or titles

+	 * @return List<String> of sentences where dupes are removed

+	 */

+	public static List<String> removeDuplicatesFromQueries(List<String> hits) {

+		StringDistanceMeasurer meas = new StringDistanceMeasurer();

+		double dupeThresh = 0.8; // if more similar, then considered dupes was

+		// 0.7

+		List<Integer> idsToRemove = new ArrayList<Integer>();

+		List<String> hitsDedup = new ArrayList<String>();

+		try {

+			for (int i = 0; i < hits.size(); i++)

+				for (int j = i + 1; j < hits.size(); j++) {

+					String title1 = hits.get(i);

+					String title2 = hits.get(j);

+					if (StringUtils.isEmpty(title1) || StringUtils.isEmpty(title2))

+						continue;

+					if (meas.measureStringDistance(title1, title2) > dupeThresh) {

+						idsToRemove.add(j); // dupes found, later list member to

+						// be deleted

+

+					}

+				}

+

+			for (int i = 0; i < hits.size(); i++)

+				if (!idsToRemove.contains(i))

+					hitsDedup.add(hits.get(i));

+

+			if (hitsDedup.size() < hits.size()) {

+				LOG.info("Removed duplicates from formed query, including "

+						+ hits.get(idsToRemove.get(0)));

+			}

+

+		} catch (Exception e) {

+			LOG.severe("Problem removing duplicates from query list");

+		}

+

+		return hitsDedup;

+

+	}

+

+	/**

+	 * remove dupes from search results

+	 * 

+	 * @param hits List

+	 *          <HitBase> of search results objects

+	 * @return List<String> of search results objects where dupes are removed

+	 */

+	public static List<HitBase> removeDuplicatesFromResultantHits(

+			List<HitBase> hits) {

+		StringDistanceMeasurer meas = new StringDistanceMeasurer();

+		double dupeThresh = // 0.8; // if more similar, then considered dupes was

+				0.7;

+		List<Integer> idsToRemove = new ArrayList<Integer>();

+		List<HitBase> hitsDedup = new ArrayList<HitBase>();

+		try {

+			for (int i = 0; i < hits.size(); i++)

+				for (int j = i + 1; j < hits.size(); j++) {

+					HitBase hit2 = hits.get(j);

+					List<Fragment> fragmList1 = hits.get(i).getFragments();

+					List<Fragment> fragmList2 = hits.get(j).getFragments();

+					List<Fragment> fragmList2Results = new ArrayList<Fragment>(fragmList2);

+					for (Fragment f1 : fragmList1)

+						for (Fragment f2 : fragmList2) {

+							String sf1 = f1.getResultText();

+							String sf2 = f2.getResultText();

+							if (StringUtils.isEmpty(sf1) || StringUtils.isEmpty(sf1))

+								continue;

+							if (meas.measureStringDistance(sf1, sf2) > dupeThresh) {

+								fragmList2Results.remove(f2);

+								LOG.info("Removed duplicates from formed fragments list: "

+										+ sf2);

+							}

+						}

+

+					hit2.setFragments(fragmList2Results);

+					hits.set(j, hit2);

+				}

+		} catch (Exception e) {

+			LOG.severe("Problem removing duplicates from list of fragment");

+		}

+		return hits;

+	}

+

+

+

+	// given a fragment from snippet, finds an original sentence at a webpage by

+	// optimizing alignment score

+	public static String[] getFullOriginalSentenceFromWebpageBySnippetFragment(

+			String fragment, String[] sents) {

+		if (fragment.trim().length() < 15)

+			return null;

+

+		StringDistanceMeasurer meas = new StringDistanceMeasurer();

+		Double dist = 0.0;

+		String result = null, followSent = "";

+		for (int i = 0; i < sents.length; i++) {

+			String s = sents[i];

+			if (s == null || s.length() < 30)

+				continue;

+			Double distCurr = meas.measureStringDistance(s, fragment);

+			if (distCurr > dist && distCurr > 0.4) {

+				result = s;

+				dist = distCurr;

+				try {

+					if (i < sents.length - 1 && sents[i + 1].length() > 60) { 

+						String f1 = GeneratedSentenceProcessor.acceptableMinedSentence(sents[i+1]);

+						if (f1!=null){

+							followSent = f1;

+						}

+					}

+

+					if (i < sents.length - 2 && sents[i + 2].length() > 60) {

+						String f2 = GeneratedSentenceProcessor.acceptableMinedSentence(sents[i+2]);

+						if (f2!=null){

+							followSent += " "+f2;

+						}

+					}

+				} catch (Exception e) {

+					// TODO Auto-generated catch block

+					e.printStackTrace();

+				}

+			}

+		}

+		return new String[] { result, followSent };

+	}

+

+	// given a fragment from snippet, finds an original sentence at a webpage by

+	// optimizing alignment score

+	public static String[] getBestFullOriginalSentenceFromWebpageBySnippetFragment(

+			String fragment, String[] sents) {

+		if (fragment.trim().length() < 15)

+			return null;

+		int bestSentIndex = -1;

+		StringDistanceMeasurer meas = new StringDistanceMeasurer();

+		Double distBest = 10.0; // + sup

+		String result = null, followSent = null;

+		for (int i = 0; i < sents.length; i++) {

+			String s = sents[i];

+			if (s == null || s.length() < 30)

+				continue;

+			Double distCurr = meas.measureStringDistance(s, fragment);

+			if (distCurr > distBest) {

+				distBest = distCurr;

+				bestSentIndex = i;

+			}

+

+		}

+		if (distBest > 0.4) {

+			result = sents[bestSentIndex];

+

+			if (bestSentIndex < sents.length - 1

+					&& sents[bestSentIndex + 1].length() > 60) {

+				followSent = sents[bestSentIndex + 1];

+			}

+

+		}

+

+		return new String[] { result, followSent };

+	}

+

+	public String[] extractSentencesFromPage(String downloadedPage)

+	{

+

+		int maxSentsFromPage= 100;

+		List<String[]> results = new ArrayList<String[]>();

+

+		//String pageOrigHTML = pFetcher.fetchOrigHTML(url);

+

+		downloadedPage= downloadedPage.replace("     ", "&");

+		downloadedPage = downloadedPage.replaceAll("(?:&)+", "#");

+		String[] sents = downloadedPage.split("#");

+		List<TextChunk> sentsList = new ArrayList<TextChunk>();

+		for(String s: sents){

+			s = ContentGeneratorSupport.cleanSpacesInCleanedHTMLpage(s);

+			sentsList.add(new TextChunk(s, s.length()));

+		}

+

+		Collections.sort(sentsList, new TextChunkComparable());

+		String[] longestSents = new String[maxSentsFromPage];

+		int j=0;

+		int initIndex = sentsList.size()-1 -maxSentsFromPage;

+		if (initIndex<0)

+			initIndex = 0;

+		for(int i=initIndex; i< sentsList.size() && j<maxSentsFromPage ; i++){

+			longestSents[j] = sentsList.get(i).text;

+			j++;

+		}

+

+		sents = cleanSplitListOfSents(longestSents);

+

+		//sents = removeDuplicates(sents);

+		//sents = verifyEnforceStartsUpperCase(sents);

+

+		return sents;

+	}

+

+	public class TextChunk {

+		public TextChunk(String s, int length) {

+			this.text = s;

+			this.len = length;

+		}

+		public String text;

+		public int len;

+	}

+

+	public class TextChunkComparable implements Comparator<TextChunk>

+	{

+		public int compare(TextChunk ch1, TextChunk ch2)

+		{

+			if (ch1.len>ch2.len)

+				return 1;

+			else if (ch1.len<ch2.len)

+				return  -1;

+			else return 0;

+

+		}

+	}

+

+	protected String[] cleanSplitListOfSents(String[] longestSents){

+		float minFragmentLength = 40, minFragmentLengthSpace=4;

+

+		List<String> sentsClean = new ArrayList<String>();

+		for (String sentenceOrMultSent : longestSents)

+		{

+			if (sentenceOrMultSent==null || sentenceOrMultSent.length()<20)

+				continue;

+			if (GeneratedSentenceProcessor.acceptableMinedSentence(sentenceOrMultSent)==null){

+				System.out.println("Rejected sentence by GeneratedSentenceProcessor.acceptableMinedSentence = "+sentenceOrMultSent);

+				continue;

+			}

+			// aaa. hhh hhh.  kkk . kkk ll hhh. lll kkk n.

+			int numOfDots = sentenceOrMultSent.replace('.','&').split("&").length;

+			float avgSentenceLengthInTextPortion = (float)sentenceOrMultSent.length() /(float) numOfDots;

+			if ( avgSentenceLengthInTextPortion<minFragmentLength)

+				continue;

+			// o oo o ooo o o o ooo oo ooo o o oo

+			numOfDots = sentenceOrMultSent.replace(' ','&').split("&").length;

+			avgSentenceLengthInTextPortion = (float)sentenceOrMultSent.length() /(float) numOfDots;

+			if ( avgSentenceLengthInTextPortion<minFragmentLengthSpace)

+				continue;

+

+			List<String> furtherSplit = TextProcessor.splitToSentences(sentenceOrMultSent);

+

+			// forced split by ',' somewhere in the middle of sentence

+			// disused - Feb 26 13

+			//furtherSplit = furtherMakeSentencesShorter(furtherSplit);

+			furtherSplit.remove(furtherSplit.size()-1);

+			for(String s : furtherSplit){

+				if (s.indexOf('|')>-1)

+					continue;

+				s = s.replace("<em>"," ").replace("</em>"," ");

+				s = Utils.convertToASCII(s);

+				sentsClean.add(s);

+			}

+		}

+		return (String[]) sentsClean.toArray(new String[0]);

+	}	

+

+	protected String[] cleanSplitListOfSentsFirstSplit(String[] longestSents){

+		float minFragmentLength = 40, minFragmentLengthSpace=4;

+

+		List<String> sentsClean = new ArrayList<String>();

+		for (String sentenceOrMultSent : longestSents)

+		{

+			if (sentenceOrMultSent==null || sentenceOrMultSent.length()<minFragmentLength)

+				continue;

+			List<String> furtherSplit = TextProcessor.splitToSentences(sentenceOrMultSent);

+			for(String sentence: furtherSplit ){

+				if (sentence==null || sentence.length()<20)

+					continue;

+				if (GeneratedSentenceProcessor.acceptableMinedSentence(sentence)==null){

+					//System.out.println("Rejected sentence by GeneratedSentenceProcessor.acceptableMinedSentence = "+sentenceOrMultSent);

+					continue;

+				}

+				// aaa. hhh hhh.  kkk . kkk ll hhh. lll kkk n.

+				int numOfDots = sentence.replace('.','&').split("&").length;

+				float avgSentenceLengthInTextPortion = (float)sentenceOrMultSent.length() /(float) numOfDots;

+				if ( avgSentenceLengthInTextPortion<minFragmentLength)

+					continue;

+				// o oo o ooo o o o ooo oo ooo o o oo

+				numOfDots = sentence.replace(' ','&').split("&").length;

+				avgSentenceLengthInTextPortion = (float)sentence.length() /(float) numOfDots;

+				if ( avgSentenceLengthInTextPortion<minFragmentLengthSpace)

+					continue;

+

+

+

+				// forced split by ',' somewhere in the middle of sentence

+				// disused - Feb 26 13

+				//furtherSplit = furtherMakeSentencesShorter(furtherSplit);

+				//furtherSplit.remove(furtherSplit.size()-1);

+

+				if (sentence.indexOf('|')>-1)

+					continue;

+				sentence = Utils.convertToASCII(sentence);

+				sentsClean.add(sentence);

+			}

+		}

+		return (String[]) sentsClean.toArray(new String[0]);

+	}

+	

+	public static void main(String[] args){

+		String s = "You can grouP   parts  Of your regular expression  In your pattern   You grouP  elements";

+		//with round brackets, e.g., ()." +

+		//		" This allows you to assign a repetition operator to a complete group.";

+		String sr = s.replaceAll("([a-z])(\\s{2,3})([A-Z])", "$1. $3");

+		String sr1 = s.replaceAll("  [A-Z]", ". $0");

+		sr = s.replaceAll("[a-z]  [A-Z]", ". $1");

+		sr1 = s.replaceAll("  [A-Z]", ". $1");

+	}

+

+}

+

+

+

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/GeneratedSentenceProcessor.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/GeneratedSentenceProcessor.java
index e1f6d77..3e79b7a 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/GeneratedSentenceProcessor.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/GeneratedSentenceProcessor.java
@@ -25,189 +25,297 @@
 import org.apache.commons.lang.StringUtils;

 

 public class GeneratedSentenceProcessor {

-  public static String acceptableMinedSentence(String sent) {

-    // if too many commas => seo text

 

-    String[] commas = StringUtils.split(sent, ',');

-    String[] spaces = StringUtils.split(sent, ' ');

-    if ((float) commas.length / (float) spaces.length > 0.7) {

-      System.out.println("Rejection: too many commas");

-      return null;

-    }

+	public static String[] occurs = new String[]{ "click here", "wikipedia", "retrieved", "isbn",

+		"http", "www.",

+		"copyright", "advertise",  "(accessed", "[edit]", "[citation needed]",

+		"site map",  "email updates",  "contact us", "rss feeds",  "cite this site",

+		"operating hours", "last modified", "product catalog",

+		"days per week", "leave a comment", "corporate information",  

+		"employment opportunities", "terms of use", "private policy", "parental guidelines", "copyright policy",  "ad choices",

+		"about us",  "about our ads",  "privacy policy",  "terms of use",

+		"click for", "photos",

+		"find the latest",		       

+		"terms of service",

+		"clicking here",

+		"skip to", "sidebar",

+		"Tags:", 

+		"available online",

+		"get online",

+		"buy online",

+		"not valid", "get discount",

+		"official site",

+		"this video",

+		//"this book",

+		"this product",

+		"paperback", "hardcover",

+		"audio cd",

+		"related searches",

+		"permission is granted",

+		"[edit",

+		"edit categories",

+		"free license",

+		"permission is granted",

+		"under the terms",

+		"rights reserved",

+		"wikipedia", 

+		"recipient of", "this message", 

+		"mailing list",  "purchase order",

+		"mon-fri",  "email us",  "privacy pol",  "back to top", 

+		"click here",  "for details",  "assistance?",  "chat live",

+		"free shipping",  "company info",  "satisfaction g",  "contact us",

+		"menu.", "search.",  "sign in", "home.",

+		"additional terms", "may apply"};

 

-    String[] pipes = StringUtils.split(sent, '|');

-    if (StringUtils.split(sent, '|').length > 2

-        || StringUtils.split(sent, '>').length > 2) {

-      System.out.println("Rejection: too many |s or >s ");

-      return null;

-    }

-    String sentTry = sent.toLowerCase();

-    // if too many long spaces

-    String sentSpaces = sentTry.replace("   ", "");

-    if (sentSpaces.length() - sentTry.length() > 10) // too many spaces -

-      // suspicious

-      return null;

+	public static String[] occursStartsWith = new String[]{

+		"fax",  "write","email", "contact",  "conditions",  "chat live",

+		"we ",  "the recipient",  "day return",  "days return",

+		"refund it",  "your money",

+		"purchase orders",

+		"exchange it ",  "return it",  "day return",  "days return",

+		"subscribe","posted by", "below" , "corporate",

+		"this book"};

+	public static String acceptableMinedSentence(String sent) {

+		if (sent==null || sent.length()<40)

+			return null;

+		// if too many commas => seo text

 

-    if (sentTry.indexOf("click here") > -1 || sentTry.indexOf(" wikip") > -1

-        || sentTry.indexOf("copyright") > -1

-        || sentTry.indexOf("operating hours") > -1

-        || sentTry.indexOf("days per week") > -1

-        || sentTry.indexOf("click for") > -1 || sentTry.indexOf("photos") > -1

-        || sentTry.indexOf("find the latest") > -1

-        || sentTry.startsWith("subscribe")

-        || sentTry.indexOf("Terms of Service") > -1

-        || sentTry.indexOf("clicking here") > -1

-        || sentTry.indexOf("skip to") > -1 || sentTry.indexOf("sidebar") > -1

-        || sentTry.indexOf("Tags:") > -1 || sentTry.startsWith("Posted by")

-        || sentTry.indexOf("available online") > -1

-        || sentTry.indexOf("get online") > -1

-        || sentTry.indexOf("buy online") > -1

-        || sentTry.indexOf("not valid") > -1 || sentTry.indexOf("discount") > -1

-        || sentTry.indexOf("official site") > -1

-        || sentTry.indexOf("this video") > -1

-        || sentTry.indexOf("this book") > -1

-        || sentTry.indexOf("this product") > -1

-        || sentTry.indexOf("paperback") > -1 || sentTry.indexOf("hardcover") > -1

-        || sentTry.indexOf("audio cd") > -1

-        || sentTry.indexOf("related searches") > -1

-        || sentTry.indexOf("permission is granted") > -1

-        || sentTry.indexOf("[edit") > -1

-        || sentTry.indexOf("edit categories") > -1

-        || sentTry.indexOf("free license") > -1

-        || sentTry.indexOf("permission is granted") > -1

-        || sentTry.indexOf("under the terms") > -1

-        || sentTry.indexOf("rights reserved") > -1

-        || sentTry.indexOf("wikipedia") > -1 || sentTry.endsWith("the")

-        || sentTry.endsWith("the.") || sentTry.startsWith("below") 

-        || sentTry.indexOf("recipient of")>-1 || sentTry.indexOf("this message")>-1 

-        ||sentTry.indexOf( "mailing list")>-1 ||sentTry.indexOf( "purchase order")>-1

-        ||sentTry.indexOf( "mon-fri")>-1 ||sentTry.indexOf( "email us")>-1 ||sentTry.indexOf( "privacy pol")>-1 ||sentTry.indexOf( "back to top")>-1 

-        ||sentTry.indexOf( "click here")>-1 ||sentTry.indexOf( "for details")>-1 ||sentTry.indexOf( "assistance?")>-1 ||sentTry.indexOf( "chat live")>-1

-        ||sentTry.indexOf( "free shipping")>-1 ||sentTry.indexOf( "company info")>-1 ||sentTry.indexOf( "satisfaction g")>-1 ||sentTry.indexOf( "contact us")>-1

-        ||sentTry.startsWith( "fax") ||sentTry.startsWith( "write") || sentTry.startsWith( "email")||sentTry.indexOf( "conditions")>-1 ||sentTry.indexOf( "chat live")>-1

-        ||sentTry.startsWith( "we ") ||sentTry.indexOf( "the recipient")>-1 ||sentTry.indexOf( "day return")>-1 ||sentTry.indexOf( "days return")>-1

-        

-        ||sentTry.startsWith( "fax") ||sentTry.indexOf( "refund it")>-1 || sentTry.indexOf( "your money")>-1

-        ||sentTry.indexOf( "cond???")>-1 ||sentTry.indexOf( "purchase orders")>-1

-        ||sentTry.startsWith( "exchange it ") ||sentTry.indexOf( "return it")>-1 ||sentTry.indexOf( "day return")>-1 ||sentTry.indexOf( "days return")>-1

-    )

-      return null;

+		String[] commas = StringUtils.split(sent, ',');

+		String[] spaces = StringUtils.split(sent, ' ');

+		if ((float) commas.length / (float) spaces.length > 0.5) {

+			System.out.println("Rejection: too many commas  in sent ='"+sent);

+			return null;

+		}

 

-    // count symbols indicating wrong parts of page to mine for text

-    // if short and contains too many symbols indicating wrong area: reject

-    String sentWrongSym = sentTry.replace(">", "&&&").replace("�", "&&&")

-        .replace("|", "&&&").replace(":", "&&&").replace("/", "&&&")

-        .replace("-", "&&&").replace("%", "&&&");

-    if ((sentWrongSym.length() - sentTry.length()) >= 4

-        && sentTry.length() < 200) // twice ot more

-      return null;

+		String[] periods = StringUtils.split(sent.replace('.', '#'), '#');

+		if ((float) periods.length / (float) spaces.length > 0.2) {

+			System.out.println("Rejection: too many periods in sent ='"+sent);

+			return null;

+		}

+		// commented [x], to avoid rejection sentences with refs[]

+		String[] brakets = StringUtils.split(sent.replace('(', '#').replace(')', '#')/*.replace('[', '#').replace(']', '#')*/, '#');

+		if ((float) periods.length / (float) spaces.length > 0.2) {

+			System.out.println("Rejection: too many brakets in sent ='"+sent);

+			return null;

+		}

+		

+		String[] pipes = StringUtils.split(sent, '|');

+		if (StringUtils.split(sent, '|').length > 2

+				|| StringUtils.split(sent, '>').length > 2) {

+			System.out.println("Rejection: too many |s or >s in sent ='"+sent);

+			return null;

+		}

+		String sentTry = sent.toLowerCase();

+		// if too many long spaces

+		String sentSpaces = sentTry.replace("   ", "");

+		if (sentSpaces.length() - sentTry.length() > 10) // too many spaces -

+			// suspicious

+			return null;

+		if (isProhibitiveWordsOccurOrStartWith(sentTry))

+			return null;

 

-    sent = sent.replace('[', ' ').replace(']', ' ')

-        .replace("_should_find_orig_", "").replace(".   .", ". ")

-        .replace("amp;", " ").replace("1.", " ").replace("2.", " ")

-        .replace("3.", " ").replace("4.", " ").replace("2009", "2011")

-        .replace("2008", "2011").replace("2006", "2011")

-        .replace("2007", "2011").replace("VIDEO:", " ").replace("Video:", " ")

-        .replace("no comments", " ").replace("  ", " ").replace("  ", " ")

-        .replace("(more.)", "").replace("more.", "").replace("<more>", "")

-        .replace("[more]", "").replace(".,", ".").replace("&lt;", "")

-        .replace("p&gt;", "").replace("product description", "");

+		

 

-    // TODO .replace("a.", ".");

+		// count symbols indicating wrong parts of page to mine for text

+		// if short and contains too many symbols indicating wrong area: reject

+		String sentWrongSym = sentTry.replace(">", "&&&").replace("�", "&&&")

+				.replace("|", "&&&").replace(":", "&&&").replace("/", "&&&")

+				.replace("-", "&&&").replace("%", "&&&");

+		if ((sentWrongSym.length() - sentTry.length()) >= 4

+				&& sentTry.length() < 200) // twice ot more

+			return null;

 

-    int endIndex = sent.indexOf(" posted");

-    if (endIndex > 0)

-      sent = sent.substring(0, endIndex);

+		sent = sent.replace('[', ' ').replace(']', ' ')

+				.replace("_should_find_orig_", "").replace(".   .", ". ")

+				.replace("amp;", " ").replace("1.", " ").replace("2.", " ")

+				.replace("3.", " ").replace("4.", " ").

+			/*	.replace("2009", "2011")

+				.replace("2008", "2011").replace("2006", "2011")

+				.replace("2007", "2011").

+			*/	replace("VIDEO:", " ").replace("Video:", " ")

+				.replace("no comments", " ").replace("  ", " ").replace("  ", " ")

+				.replace("(more.)", "").replace("more.", "").replace("<more>", "")

+				.replace("[more]", "").replace(".,", ".").replace("&lt;", "")

+				.replace("p&gt;", "").replace("product description", "");

 

-    return sent;

-  }

+		//sent = sent.replace("Click here. ","").replace("Share this:.","").replace("Facebook.",""). 

+		//		replace("Twitter." Email. Google. Print. Tumblr. Pinterest. More. Digg. LinkedIn. StumbleUpon. Reddit. Like this: Like Loading.. ")

 

-  public static String processSentence(String pageSentence) {

-    if (pageSentence == null)

-      return "";

-    pageSentence = Utils.fullStripHTML(pageSentence);

-    pageSentence = StringUtils.chomp(pageSentence, "..");

-    pageSentence = StringUtils.chomp(pageSentence, ". .");

-    pageSentence = StringUtils.chomp(pageSentence, " .");

-    pageSentence = StringUtils.chomp(pageSentence, ".");

-    pageSentence = StringUtils.chomp(pageSentence, "...");

-    pageSentence = StringUtils.chomp(pageSentence, " ....");

-    pageSentence = pageSentence.replace("::", ":").replace(".,", ". ")

-        .replace("(.)", "");

+		// TODO .replace("a.", ".");

 

-    pageSentence = pageSentence.trim();

-    pageSentence = pageSentence.replaceAll("\\s+", " "); // make single

-    // spaces

-    // everywhere

+		int endIndex = sent.indexOf(" posted");

+		if (endIndex > 0)

+			sent = sent.substring(0, endIndex);

 

-    String[] pipes = StringUtils.split(pageSentence, '|'); // removed

-    // shorter part

-    // of sentence

-    // at the end

-    // after pipe

-    if (pipes.length == 2

-        && ((float) pipes[0].length() / (float) pipes[1].length() > 3.0)) {

-      int pipePos = pageSentence.indexOf("|");

-      if (pipePos > -1)

-        pageSentence = pageSentence.substring(0, pipePos - 1).trim();

+		return sent;

+	}

 

-    }

+	public static String processSentence(String pageSentence) {

+		if (acceptableMinedSentence(pageSentence)==null){

+			System.out.println("Rejected sentence by GenerSentProc.processSentence.acceptableMinedSentence()");

+			return "";

+		}

+		if (pageSentence == null)

+			return "";

+		pageSentence = Utils.fullStripHTML(pageSentence);

+		pageSentence = StringUtils.chomp(pageSentence, "..");

+		pageSentence = StringUtils.chomp(pageSentence, ". .");

+		pageSentence = StringUtils.chomp(pageSentence, " .");

+		pageSentence = StringUtils.chomp(pageSentence, ".");

+		pageSentence = StringUtils.chomp(pageSentence, "...");

+		pageSentence = StringUtils.chomp(pageSentence, " ....");

+		pageSentence = pageSentence.replace("::", ":").replace(".,", ". ")

+				.replace("(.)", "");

 

-    if (!StringUtils.contains(pageSentence, '.')

-        && !StringUtils.contains(pageSentence, '?')

-        && !StringUtils.contains(pageSentence, '!'))

-      pageSentence = pageSentence + ". ";

+		pageSentence = pageSentence.trim();

+		pageSentence = pageSentence.replaceAll("\\s+", " "); // make single

+		// spaces

+		// everywhere

 

-    pageSentence = pageSentence.replace(" .", ".").replace("..", ".").trim();

-    if (!pageSentence.endsWith("."))

-      pageSentence += ". ";

-    return pageSentence;

-  }

+		String[] pipes = StringUtils.split(pageSentence, '|'); // removed

+		// shorter part

+		// of sentence

+		// at the end

+		// after pipe

+		if (pipes.length == 2

+				&& ((float) pipes[0].length() / (float) pipes[1].length() > 3.0)) {

+			int pipePos = pageSentence.indexOf("|");

+			if (pipePos > -1)

+				pageSentence = pageSentence.substring(0, pipePos - 1).trim();

 

-  public static void main(String[] args) {

+		}

 

-    String para = "About Albert Einstein     15 External links  16 Credits         Youth and schooling  Albert Einstein was born into a Jewish family";

-    para = "inventions of albert einstein                            what was albert einsteins invention                            invention of einstein                            what were albert einsteins inventions ";

+		if (!StringUtils.contains(pageSentence, '.')

+				&& !StringUtils.contains(pageSentence, '?')

+				&& !StringUtils.contains(pageSentence, '!'))

+			pageSentence = pageSentence + ". ";

 

-    para = para.replaceAll("  [A-Z]", ". $0");

-    System.out.println(para);

+		pageSentence = pageSentence.replace(" .", ".").replace("..", ".").trim();

+		if (!pageSentence.endsWith(".") && !pageSentence.endsWith(":") 

+				&&!pageSentence.endsWith("!") &&!pageSentence.endsWith("."))

+			pageSentence += ". ";

+		return pageSentence;

+	}

 

-    para = "Page 2 of 93";

+	public static boolean isProhibitiveWordsOccurOrStartWith(String sentenceLowercase){

+		for(String o: occurs){

+			if (sentenceLowercase.indexOf(o)>-1){

+				System.out.println("Found prohibited occurrence "+ o +" \n in sentence = "+  sentenceLowercase);

+				return true;

+			}

+		}

 

-    System.exit(0);

-    RelatedSentenceFinder f = new RelatedSentenceFinder();

-    try {

-      List<HitBase> hits = f

-          .findRelatedOpinionsForSentence(

-              "Give me a break, there is no reason why you can't retire in ten years if you had been a rational investor and not a crazy trader",

-              Arrays

-                  .asList(new String[] { "Give me a break there is no reason why you can't retire in ten years if you had been a rational investor and not a crazy trader. For example you went to cash in 2008 and stay in cash until now you made nothing. Whereas people who rode out the storm are doing fine so let's quit focusing on the loser who think they are so smart and went to 100% cash and are wondering what happen. Its a market that always moves unlike your mattress.", }));

-      StringBuffer buf = new StringBuffer();

+		for(String o: occursStartsWith){

+			if (sentenceLowercase.startsWith(o)){

+				System.out.println("Found prohibited occurrence Start With  "+ o +" \n in sentence = "+  sentenceLowercase);

+				return true;

+			}

+		}

 

-      for (HitBase h : hits) {

-        List<Fragment> frags = h.getFragments();

-        for (Fragment fr : frags) {

-          if (fr.getResultText() != null && fr.getResultText().length() > 3)

-            buf.append(fr.getResultText());

-        }

-      }

 

-    } catch (Exception e) {

-      // TODO Auto-generated catch block

-      e.printStackTrace();

-    }

 

-  }

+		//  || sentTry.endsWith("the")

+		//  || sentTry.endsWith("the.") || sentTry.startsWith("below") 

+		return false;

+	}

 

-  public static String normalizeForSentenceSplitting(String pageContent) {

-    pageContent.replace("Jan.", "January").replace("Feb.", "February")

-        .replace("Mar.", "March").replace("Apr.", "April")

-        .replace("Jun.", "June").replace("Jul.", "July")

-        .replace("Aug.", "August").replace("Sep.", "September")

-        .replace("Oct.", "October").replace("Nov.", "November")

-        .replace("Dec.", "December");

+	public static void main(String[] args) {

+		

+		String sentence = "Accepted sentence: Educational. Video. About Us menu. Home. Nobel Prizes and Laureates. Nobel Prizes and Laureates. Physics Prize. Chemistry Prize. Medicine Prize. Literature Prize. Peace Prize. Prize in Economic Sciences. Quick Facts. Nomination. Nomination. Physics Prize. Chemistry Prize. Medicine Prize. Literature Prize. Peace Prize. Prize in Economic Sciences. Nomination Archive. Ceremonies. Ceremonies. Ceremony Archive. Nobel Banquet Menus. Nobel Banquet Dress Code. The Queen's Gowns. Eyewitness Reports. Alfred Nobel. Alfred Nobel. Alfred Nobel's Will. Alfred Nobel's Life. Private Library of Alfred Nobel. Books on Alfred Nobel. Events. Events. Nobel Week Dialogue. Nobel Prize Inspiration Initiative. Nobel Prize Concert. Exhibitions at the Nobel Museum. Exhibitions at the Nobel Peace Center. About Us. Nobel Prizes and Laureates. Physics PrizesChemistry PrizesMedicine PrizesLiterature PrizesPeace PrizesPrize in Economic Sciences. About the Nobel Prize in Physics 1921. Albert Einstein. Facts. Biographical. Nobel Lecture. Banquet Speech. Documentary. Photo Gallery. Questions and Answers. Other Resources. All Nobel Prizes in Physics. All Nobel Prizes in 1921. The Nobel Prize in Physics 1921. Albert Einstein. Questions and Answers. Question: When was Albert Einstein born . Answer: Albert Einstein was born on 14 March 1879. Question: Where was he born . Answer: He was born in Ulm, Germany. Question: When did he die . Answer: He died 18 April 1955 in Princeton, New Jersey, USA. Question: Who were his parents . Answer: His father was Hermann Einstein and his mother was Pauline Einstein (born Koch). Question: Did he have any sisters and brothers . Answer: He had one sister named Maja. Question: Did he marry and have children . Answer: He was married to Mileva Mari between 1903 and 1919. They had three children, Lieserl (born 1902), Hans Albert (born 1904) and Eduard (born 1910). 
He married Elsa L Kwenthal in 1919 and they lived together until her death in 1936. Question: Where did he receive his education . Answer: He received his main education at the following schools:. Catholic elementary school in Munich, Germany (1885-1888). Luitpold Gymnasium in Munich, Germany (1888-1894). Cantonal school in Aarau, Switzerland (1895-1896). Swiss Federal Institute of Technology in Zurich, Switzerland (1896-1900). Ph.D. from Zurich University, Switzerland (1905). Question: When was Albert Einstein awarded the Nobel Prize in Physics . Answer: The Nobel Prize Awarding Institution, the Royal Swedish Academy of Sciences, decided to reserve the Nobel Prize in Physics in 1921, and therefore no Physics Prize was awarded that year.";

+		

+		String res = GeneratedSentenceProcessor.acceptableMinedSentence(sentence);

 

-    return pageContent;

+		String para = "About Albert Einstein     15 External links  16 Credits         Youth and schooling  Albert Einstein was born into a Jewish family";

+		para = "inventions of albert einstein                            what was albert einsteins invention                            invention of einstein                            what were albert einsteins inventions ";

 

-  }

-}
\ No newline at end of file
+		para = para.replaceAll("  [A-Z]", ". $0");

+		System.out.println(para);

+

+		para = "Page 2 of 93";

+

+		System.exit(0);

+		RelatedSentenceFinder f = new RelatedSentenceFinder();

+		try {

+			List<HitBase> hits = f

+					.findRelatedOpinionsForSentence(

+							"Give me a break, there is no reason why you can't retire in ten years if you had been a rational investor and not a crazy trader",

+							Arrays

+							.asList(new String[] { "Give me a break there is no reason why you can't retire in ten years if you had been a rational investor and not a crazy trader. For example you went to cash in 2008 and stay in cash until now you made nothing. Whereas people who rode out the storm are doing fine so let's quit focusing on the loser who think they are so smart and went to 100% cash and are wondering what happen. Its a market that always moves unlike your mattress.", }));

+			StringBuffer buf = new StringBuffer();

+

+			for (HitBase h : hits) {

+				List<Fragment> frags = h.getFragments();

+				for (Fragment fr : frags) {

+					if (fr.getResultText() != null && fr.getResultText().length() > 3)

+						buf.append(fr.getResultText());

+				}

+			}

+

+		} catch (Exception e) {

+			// TODO Auto-generated catch block

+			e.printStackTrace();

+		}

+

+	}

+

+	public static String normalizeForSentenceSplitting(String pageContent) {

+		pageContent.replace("Jan.", "January").replace("Feb.", "February")

+		.replace("Mar.", "March").replace("Apr.", "April")

+		.replace("Jun.", "June").replace("Jul.", "July")

+		.replace("Aug.", "August").replace("Sep.", "September")

+		.replace("Oct.", "October").replace("Nov.", "November")

+		.replace("Dec.", "December");

+

+		return pageContent;

+

+	}

+}

+

+/*

+

+if (sentTry.indexOf("click here")>-1 || sentTry.indexOf(" wikip") > -1

+|| sentTry.indexOf("copyright") > -1

+|| sentTry.indexOf("operating hours") > -1

+|| sentTry.indexOf("days per week") > -1

+|| sentTry.indexOf("click for") > -1 || sentTry.indexOf("photos") > -1

+|| sentTry.indexOf("find the latest") > -1

+|| sentTry.startsWith("subscribe")

+|| sentTry.indexOf("Terms of Service") > -1

+|| sentTry.indexOf("clicking here") > -1

+|| sentTry.indexOf("skip to") > -1 || sentTry.indexOf("sidebar") > -1

+|| sentTry.indexOf("Tags:") > -1 || sentTry.startsWith("Posted by")

+|| sentTry.indexOf("available online") > -1

+|| sentTry.indexOf("get online") > -1

+|| sentTry.indexOf("buy online") > -1

+|| sentTry.indexOf("not valid") > -1 || sentTry.indexOf("discount") > -1

+|| sentTry.indexOf("official site") > -1

+|| sentTry.indexOf("this video") > -1

+|| sentTry.indexOf("this book") > -1

+|| sentTry.indexOf("this product") > -1

+|| sentTry.indexOf("paperback") > -1 || sentTry.indexOf("hardcover") > -1

+|| sentTry.indexOf("audio cd") > -1

+|| sentTry.indexOf("related searches") > -1

+|| sentTry.indexOf("permission is granted") > -1

+|| sentTry.indexOf("[edit") > -1

+|| sentTry.indexOf("edit categories") > -1

+|| sentTry.indexOf("free license") > -1

+|| sentTry.indexOf("permission is granted") > -1

+|| sentTry.indexOf("under the terms") > -1

+|| sentTry.indexOf("rights reserved") > -1

+|| sentTry.indexOf("wikipedia") > -1 || sentTry.endsWith("the")

+|| sentTry.endsWith("the.") || sentTry.startsWith("below") 

+|| sentTry.indexOf("recipient of")>-1 || sentTry.indexOf("this message")>-1 

+||sentTry.indexOf( "mailing list")>-1 ||sentTry.indexOf( "purchase order")>-1

+||sentTry.indexOf( "mon-fri")>-1 ||sentTry.indexOf( "email us")>-1 ||sentTry.indexOf( "privacy pol")>-1 ||sentTry.indexOf( "back to top")>-1 

+||sentTry.indexOf( "click here")>-1 ||sentTry.indexOf( "for details")>-1 ||sentTry.indexOf( "assistance?")>-1 ||sentTry.indexOf( "chat live")>-1

+||sentTry.indexOf( "free shipping")>-1 ||sentTry.indexOf( "company info")>-1 ||sentTry.indexOf( "satisfaction g")>-1 ||sentTry.indexOf( "contact us")>-1

+||sentTry.startsWith( "fax") ||sentTry.startsWith( "write") || sentTry.startsWith( "email")||sentTry.indexOf( "conditions")>-1 ||sentTry.indexOf( "chat live")>-1

+||sentTry.startsWith( "we ") ||sentTry.indexOf( "the recipient")>-1 ||sentTry.indexOf( "day return")>-1 ||sentTry.indexOf( "days return")>-1

+

+||sentTry.startsWith( "fax") ||sentTry.indexOf( "refund it")>-1 || sentTry.indexOf( "your money")>-1

+||sentTry.indexOf( "cond???")>-1 ||sentTry.indexOf( "purchase orders")>-1

+||sentTry.startsWith( "exchange it ") ||sentTry.indexOf( "return it")>-1 ||sentTry.indexOf( "day return")>-1 ||sentTry.indexOf( "days return")>-1

+)

+return null;

+

+*/
\ No newline at end of file
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/HitBase.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/HitBase.java
index c8d4d6a..42c1e3b 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/HitBase.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/HitBase.java
@@ -26,214 +26,236 @@
 import org.apache.commons.lang.StringUtils;

 

 public class HitBase {

-  private static final Logger LOG = Logger

-      .getLogger("opennlp.tools.similarity.apps.HitBase");

+	private static final Logger LOG = Logger

+			.getLogger("opennlp.tools.similarity.apps.HitBase");

 

-  private String abstractText;

+	private String abstractText;

 

-  private String clickUrl;

+	private String clickUrl;

 

-  private String displayUrl;

+	private String displayUrl;

 

-  private String url;

+	private String url;

 

-  private String date;

+	private String date;

 

-  private String title;

+	private String title;

 

-  private Double generWithQueryScore;

+	private Double generWithQueryScore;

 

-  private String source;

+	private String source;

 

-  private List<String> originalSentences;

+	private List<String> originalSentences;

 

-  private String pageContent;

+	private String pageContent;

 

-  private List<Fragment> fragments;

+	private List<Fragment> fragments;

 

-  public HitBase() {

-    super();

-  }

+	public HitBase() {

+		super();

+	}

 

-  public String getPageContent() {

-    return pageContent;

-  }

+	public String getPageContent() {

+		return pageContent;

+	}

 

-  public HitBase(String orig, String[] generateds) {

-    originalSentences = new ArrayList<String>();

-    originalSentences.add(orig);

+	public HitBase(String orig, String[] generateds) {

+		originalSentences = new ArrayList<String>();

+		originalSentences.add(orig);

 

-    fragments = new ArrayList<Fragment>();

-    for (String sent : generateds) {

-      Fragment f = new Fragment(sent, 0.0);

-      fragments.add(f);

-    }

-    // the rest of params are null

-  }

+		fragments = new ArrayList<Fragment>();

+		for (String sent : generateds) {

+			Fragment f = new Fragment(sent, 0.0);

+			fragments.add(f);

+		}

+		// the rest of params are null

+	}

 

-  public void setPageContent(String pageContent) {

-    this.pageContent = pageContent;

-  }

+	public void setPageContent(String pageContent) {

+		this.pageContent = pageContent;

+	}

 

-  public List<Fragment> getFragments() {

-    return fragments;

-  }

+	public List<Fragment> getFragments() {

+		return fragments;

+	}

 

-  public void setFragments(List<Fragment> fragments) {

-    this.fragments = fragments;

-  }

+	public void setFragments(List<Fragment> fragments) {

+		this.fragments = fragments;

+	}

 

-  public String getSource() {

-    return source;

-  }

+	public String getSource() {

+		return source;

+	}

 

-  public void setSource(String source) {

-    this.source = source;

-  }

+	public void setSource(String source) {

+		this.source = source;

+	}

 

-  public List<String> getOriginalSentences() {

-    return originalSentences;

-  }

+	public List<String> getOriginalSentences() {

+		return originalSentences;

+	}

 

-  public void setOriginalSentences(List<String> originalSentences) {

-    this.originalSentences = originalSentences;

-  }

+	public void setOriginalSentences(List<String> originalSentences) {

+		this.originalSentences = originalSentences;

+	}

 

-  public String getTitle() {

-    return title;

-  }

+	public String getTitle() {

+		return title;

+	}

 

-  public void setTitle(String title) {

-    this.title = title;

-  }

+	public void setTitle(String title) {

+		this.title = title;

+	}

 

-  public String getAbstractText() {

-    return abstractText;

-  }

+	public String getAbstractText() {

+		return abstractText;

+	}

 

-  public void setAbstractText(String abstractText) {

-    this.abstractText = abstractText;

-  }

+	public void setAbstractText(String abstractText) {

+		this.abstractText = abstractText;

+	}

 

-  public String getClickUrl() {

-    return clickUrl;

-  }

+	public String getClickUrl() {

+		return clickUrl;

+	}

 

-  public void setClickUrl(String clickUrl) {

-    this.clickUrl = clickUrl;

-  }

+	public void setClickUrl(String clickUrl) {

+		this.clickUrl = clickUrl;

+	}

 

-  public String getDisplayUrl() {

-    return displayUrl;

-  }

+	public String getDisplayUrl() {

+		return displayUrl;

+	}

 

-  public void setDisplayUrl(String displayUrl) {

-    this.displayUrl = displayUrl;

-  }

+	public void setDisplayUrl(String displayUrl) {

+		this.displayUrl = displayUrl;

+	}

 

-  public String getUrl() {

-    return url;

-  }

+	public String getUrl() {

+		return url;

+	}

 

-  public void setUrl(String url) {

-    this.url = url;

-  }

+	public void setUrl(String url) {

+		this.url = url;

+	}

 

-  public String getDate() {

-    return date;

-  }

+	public String getDate() {

+		return date;

+	}

 

-  public void setDate(String date) {

-    this.date = date;

-  }

+	public void setDate(String date) {

+		this.date = date;

+	}

 

-  public Double getGenerWithQueryScore() {

-    return generWithQueryScore;

-  }

+	public Double getGenerWithQueryScore() {

+		return generWithQueryScore;

+	}

 

-  public void setGenerWithQueryScore(Double generWithQueryScore) {

-    this.generWithQueryScore = generWithQueryScore;

-  }

+	public void setGenerWithQueryScore(Double generWithQueryScore) {

+		this.generWithQueryScore = generWithQueryScore;

+	}

 

-  public String toString() {

-    // return "\n"+this.getUrl()+" | " +this.getTitle()+ " | "+

-    // this.abstractText ;

-    if (this.getFragments() != null && this.getFragments().size() > 0)

-      return this.getFragments().toString();

-    else

-      return this.title;

-  }

+	public String toString() {

+		// return "\n"+this.getUrl()+" | " +this.getTitle()+ " | "+

+				// this.abstractText ;

+		if (this.getFragments() != null && this.getFragments().size() > 0)

+			return this.getFragments().toString();

+		else

+			return this.title;

+	}

 

-  public static String toString(List<HitBase> hits) {

-    StringBuffer buf = new StringBuffer();

-    Boolean pBreak = true;

-    for (HitBase hit : hits) {

-      String fragm = (hit.toString());

-      if (fragm.length() > 15) {

-        if (pBreak)

-          buf.append(fragm + " | ");

-        else

-          buf.append(fragm + " | \n");

-        // switch to opposite

-        if (pBreak)

-          pBreak = false;

-        else

-          pBreak = true;

-      }

+	public static String toString(List<HitBase> hits) {

+		StringBuffer buf = new StringBuffer();

+		Boolean pBreak = true;

+		for (HitBase hit : hits) {

+			String fragm = (hit.toString());

+			if (fragm.length() > 15) {

+				if (pBreak)

+					buf.append(fragm + " | ");

+				else

+					buf.append(fragm + " | \n");

+				// switch to opposite

+				if (pBreak)

+					pBreak = false;

+				else

+					pBreak = true;

+			}

 

-    }

-    return buf.toString();

-  }

+		}

+		return buf.toString();

+	}

 

-  public static String toResultantString(List<HitBase> hits) {

-    StringBuffer buf = new StringBuffer();

-    Boolean pBreak = true;

-    for (HitBase hit : hits) {

-      String fragm = hit.getFragments().toString();

-      if (fragm.length() > 15) {

-        if (pBreak)

-          buf.append(fragm + " | 	");

-        else

-          buf.append(fragm + " | \n");

-        // switch to opposite

-        if (pBreak)

-          pBreak = false;

-        else

-          pBreak = true;

-      }

+	public static String toResultantString(List<HitBase> hits) {

+		StringBuffer buf = new StringBuffer();

+		Boolean pBreak = true;

+		for (HitBase hit : hits) {

+			try {

+				if (hit.getFragments()==null)	

+					continue;

+				String fragm = hit.getFragments().toString();

+				if (fragm.length() > 15) {

+					if (pBreak)

+						buf.append(fragm + " | 	");

+					else

+						buf.append(fragm + " | <br>\n");

+					// switch to opposite

+					if (pBreak)

+						pBreak = false;

+					else

+						pBreak = true;

+				}

+			} catch (Exception e) {

+				e.printStackTrace();

+			}

 

-    }

-    return buf.toString().replace("[", "").replace("]", "").replace(" | ", "")

-        .replace(".,", ".").replace(".\"", "\"").replace(". .", ".")

-        .replace(",.", ".");

-  }

+		}

+		return buf.toString().replace("[", "").replace("]", "").replace(" | ", "")

+				.replace(".,", ".").replace(".\"", "\"").replace(". .", ".")

+				.replace(",.", ".");

+	}

+	

+	public static String produceReferenceSection(List<HitBase> hits) {

+		StringBuffer buf = new StringBuffer();

+		for (HitBase hit : hits) {

+			try {

+				if (hit.getUrl()==null)	

+					continue;

+				buf.append(hit.getUrl());					

+			

+			} catch (Exception e) {

+				e.printStackTrace();

+			}

 

-  public static List<HitBase> removeDuplicates(List<HitBase> hits) {

-    StringDistanceMeasurer meas = new StringDistanceMeasurer();

-    double imageDupeThresh = 0.8; // if more similar, then considered dupes

-    List<Integer> idsToRemove = new ArrayList<Integer>();

-    List<HitBase> hitsDedup = new ArrayList<HitBase>();

-    try {

-      for (int i = 0; i < hits.size(); i++)

-        for (int j = i + 1; j < hits.size(); j++) {

-          String title1 = hits.get(i).getTitle();

-          String title2 = hits.get(j).getTitle();

-          if (StringUtils.isEmpty(title1) || StringUtils.isEmpty(title2))

-            continue;

-          if (meas.measureStringDistance(title1, title2) > imageDupeThresh) {

-            idsToRemove.add(j); // dupes found, later list member to be deleted

-          }

-        }

-      for (int i = 0; i < hits.size(); i++)

-        if (!idsToRemove.contains(i))

-          hitsDedup.add(hits.get(i));

-      if (hitsDedup.size() < hits.size()) {

-        LOG.info("Removed duplicates from relevant search results, including "

-            + hits.get(idsToRemove.get(0)).getTitle());

-      }

-    } catch (Exception e) {

-      LOG.severe("Problem removing duplicates from relevant images: " + e);

-    }

-    return hitsDedup;

-  }

+		}

+		return buf.toString();

+	}

+

+	public static List<HitBase> removeDuplicates(List<HitBase> hits) {

+		StringDistanceMeasurer meas = new StringDistanceMeasurer();

+		double imageDupeThresh = 0.8; // if more similar, then considered dupes

+		List<Integer> idsToRemove = new ArrayList<Integer>();

+		List<HitBase> hitsDedup = new ArrayList<HitBase>();

+		try {

+			for (int i = 0; i < hits.size(); i++)

+				for (int j = i + 1; j < hits.size(); j++) {

+					String title1 = hits.get(i).getTitle();

+					String title2 = hits.get(j).getTitle();

+					if (StringUtils.isEmpty(title1) || StringUtils.isEmpty(title2))

+						continue;

+					if (meas.measureStringDistance(title1, title2) > imageDupeThresh) {

+						idsToRemove.add(j); // dupes found, later list member to be deleted

+					}

+				}

+			for (int i = 0; i < hits.size(); i++)

+				if (!idsToRemove.contains(i))

+					hitsDedup.add(hits.get(i));

+			if (hitsDedup.size() < hits.size()) {

+				LOG.info("Removed duplicates from relevant search results, including "

+						+ hits.get(idsToRemove.get(0)).getTitle());

+			}

+		} catch (Exception e) {

+			LOG.severe("Problem removing duplicates from relevant images: " + e);

+		}

+		return hitsDedup;

+	}

 }
\ No newline at end of file
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/HitBaseComparable.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/HitBaseComparable.java
index 1f1fcc6..b41f8ec 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/HitBaseComparable.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/HitBaseComparable.java
@@ -1,3 +1,19 @@
+/*

+ * Licensed to the Apache Software Foundation (ASF) under one or more

+ * contributor license agreements.  See the NOTICE file distributed with

+ * this work for additional information regarding copyright ownership.

+ * The ASF licenses this file to You under the Apache License, Version 2.0

+ * (the "License"); you may not use this file except in compliance with

+ * the License. You may obtain a copy of the License at

+ *

+ *     http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

 package opennlp.tools.similarity.apps;

 

 import java.util.Comparator;

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinder.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinder.java
index 7ff9fc3..bfeff62 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinder.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinder.java
@@ -19,15 +19,24 @@
 

 import java.util.ArrayList;

 import java.util.Arrays;

+import java.util.Collections;

+import java.util.Comparator;

+import java.util.HashSet;

 import java.util.List;

+import java.util.Set;

 import java.util.logging.Logger;

 

+import opennlp.tools.parse_thicket.Triple;

+import opennlp.tools.parse_thicket.apps.SnippetToParagraph;

+import opennlp.tools.parse_thicket.apps.SnippetToParagraph.TextChunk;

+import opennlp.tools.parse_thicket.apps.SnippetToParagraph.TextChunkComparable;

 import opennlp.tools.similarity.apps.utils.PageFetcher;

 import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer;

 import opennlp.tools.similarity.apps.utils.Utils;

 import opennlp.tools.textsimilarity.ParseTreeChunk;

 import opennlp.tools.textsimilarity.ParseTreeChunkListScorer;

 import opennlp.tools.textsimilarity.SentencePairMatchResult;

+import opennlp.tools.textsimilarity.TextProcessor;

 import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;

 

 import org.apache.commons.lang.StringUtils;

@@ -43,575 +52,952 @@
  */

 

 public class RelatedSentenceFinder {

-  private static Logger LOG = Logger

-      .getLogger("opennlp.tools.similarity.apps.RelatedSentenceFinder");

-  PageFetcher pFetcher = new PageFetcher();

+	private static Logger LOG = Logger

+			.getLogger("opennlp.tools.similarity.apps.RelatedSentenceFinder");

+	PageFetcher pFetcher = new PageFetcher();

+	ParserChunker2MatcherProcessor sm = ParserChunker2MatcherProcessor

+			.getInstance();

+	protected ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();

+	protected ParseTreeChunk parseTreeChunk = new ParseTreeChunk();

+	protected static StringDistanceMeasurer stringDistanceMeasurer = new StringDistanceMeasurer();

+	protected BingQueryRunner yrunner = new BingQueryRunner();

+	protected int MAX_STEPS = 1;

+	protected int MAX_SEARCH_RESULTS = 1;

+	protected float RELEVANCE_THRESHOLD = 1.1f;

+	protected Set<String> visitedURLs = new HashSet();

 

-  private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();

-  private ParseTreeChunk parseTreeChunk = new ParseTreeChunk();

+	// used to indicate that a sentence is an opinion, so more appropriate

+	static List<String> MENTAL_VERBS = new ArrayList<String>(

+			Arrays.asList(new String[] { "want", "know", "believe", "appeal", "ask",

+					"accept", "agree", "allow", "appeal", "ask", "assume", "believe",

+					"check", "confirm", "convince", "deny", "disagree", "explain",

+					"ignore", "inform", "remind", "request", "suggest", "suppose",

+					"think", "threaten", "try", "understand" }));

 

-  static StringDistanceMeasurer STRING_DISTANCE_MEASURER = new StringDistanceMeasurer();

+	private static final int MAX_FRAGMENT_SENTS = 10;

 

-  // used to indicate that a sentence is an opinion, so more appropriate

-  static List<String> MENTAL_VERBS = new ArrayList<String>(

-      Arrays.asList(new String[] { "want", "know", "believe", "appeal", "ask",

-          "accept", "agree", "allow", "appeal", "ask", "assume", "believe",

-          "check", "confirm", "convince", "deny", "disagree", "explain",

-          "ignore", "inform", "remind", "request", "suggest", "suppose",

-          "think", "threaten", "try", "understand" }));

+	public RelatedSentenceFinder(int ms, int msr, float thresh, String key) {

+		this.MAX_STEPS = ms;

+		this.MAX_SEARCH_RESULTS = msr;

+		this.RELEVANCE_THRESHOLD=thresh;

+		yrunner.setKey(key);

+	}

 

-  private static final int MAX_FRAGMENT_SENTS = 10;

+	public RelatedSentenceFinder() {

+		// TODO Auto-generated constructor stub

+	}

+	public void setLang(String lang) {

+		yrunner.setLang(lang);

 

-  public RelatedSentenceFinder() {

+	}

+	public List<HitBase> findRelatedOpinionsForSentenceFastAndDummy(String word,

+			List<String> sents) throws Exception {

 

-  }

+		List<HitBase> searchResult = yrunner.runSearch(word, 100);

+		return searchResult;

+	}

 

-  public List<HitBase> findRelatedOpinionsForSentenceFastAndDummy(String word,

-      List<String> sents) throws Exception {

-    BingWebQueryRunner yrunner = new BingWebQueryRunner();

-    List<HitBase> searchResult = yrunner.runSearch(word, 100);

-    return searchResult;

-  }

+	public List<HitBase> findRelatedOpinionsForSentence(String sentence,

+			List<String> sents) throws Exception {

+		List<HitBase> opinionSentencesToAdd = new ArrayList<HitBase>();

+		System.out.println(" \n\n=== Sentence  = " + sentence);

+		List<String> nounPhraseQueries = buildSearchEngineQueryFromSentence(sentence);

 

-  public List<HitBase> findRelatedOpinionsForSentence(String sentence,

-      List<String> sents) throws Exception {

-    List<HitBase> opinionSentencesToAdd = new ArrayList<HitBase>();

-    System.out.println(" \n\n=== Sentence  = " + sentence);

-    List<String> nounPhraseQueries = buildSearchEngineQueryFromSentence(sentence);

+		BingQueryRunner yrunner = new BingQueryRunner();

+		for (String query : nounPhraseQueries) {

+			System.out.println("\nquery = " + query);

+			// query += " "+join(MENTAL_VERBS, " OR ") ;

+			List<HitBase> searchResult = yrunner.runSearch(query, 100);

+			if (searchResult != null) {

+				for (HitBase item : searchResult) { // got some text from .html

+					if (item.getAbstractText() != null

+							&& !(item.getUrl().indexOf(".pdf") > 0)) { // exclude

+						// pdf

+						opinionSentencesToAdd

+						.add(augmentWithMinedSentencesAndVerifyRelevance(item,

+								sentence, sents));

+						

+					}

+				}

+			}

+		}

 

-    BingWebQueryRunner yrunner = new BingWebQueryRunner();

-    for (String query : nounPhraseQueries) {

-      System.out.println("\nquery = " + query);

-      // query += " "+join(MENTAL_VERBS, " OR ") ;

-      List<HitBase> searchResult = yrunner.runSearch(query, 100);

-      if (searchResult != null) {

-        for (HitBase item : searchResult) { // got some text from .html

-          if (item.getAbstractText() != null

-              && !(item.getUrl().indexOf(".pdf") > 0)) { // exclude

-                                                         // pdf

-            opinionSentencesToAdd

-                .add(augmentWithMinedSentencesAndVerifyRelevance(item,

-                    sentence, sents));

-          }

-        }

-      }

-    }

+		opinionSentencesToAdd = removeDuplicatesFromResultantHits(opinionSentencesToAdd);

+		return opinionSentencesToAdd;

+	}

 

-    opinionSentencesToAdd = removeDuplicatesFromResultantHits(opinionSentencesToAdd);

-    return opinionSentencesToAdd;

-  }

+	/**

+	 * Main content generation function which takes a seed as a person, rock

+	 * group, or other entity name and produce a list of text fragments by web

+	 * mining for <br>

+	 * 

+	 * @param String

+	 *          entity name

+	 * @return List<HitBase> of text fragment structures which contain approved

+	 *         (in terms of relevance) mined sentences, as well as original search

+	 *         results objects such as doc titles, abstracts, and urls.

+	 */

 

-  /**

-   * Main content generation function which takes a seed as a person, rock

-   * group, or other entity name and produce a list of text fragments by web

-   * mining for <br>

-   * 

-   * @param String

-   *          entity name

-   * @return List<HitBase> of text fragment structures which contain approved

-   *         (in terms of relevance) mined sentences, as well as original search

-   *         results objects such as doc titles, abstracts, and urls.

-   */

+	public List<HitBase> generateContentAbout(String sentence) throws Exception {

+		List<HitBase> opinionSentencesToAdd = new ArrayList<HitBase>();

+		System.out.println(" \n=== Entity to write about = " + sentence);

+		List<String> nounPhraseQueries = new ArrayList<String>();

 

-  public List<HitBase> generateContentAbout(String sentence) throws Exception {

-    List<HitBase> opinionSentencesToAdd = new ArrayList<HitBase>();

-    System.out.println(" \n=== Entity to write about = " + sentence);

-    List<String> nounPhraseQueries = new ArrayList<String>();

+		String[] extraKeywords = new StoryDiscourseNavigator().obtainAdditionalKeywordsForAnEntity(sentence);

+		System.out.println("Found  extraKeywords "+ Arrays.asList(extraKeywords));

+		if (extraKeywords==null || extraKeywords.length<1)

+			extraKeywords = StoryDiscourseNavigator.frequentPerformingVerbs;

 

-    // nounPhraseQueries.add(sentence + frequentPerformingVerbs);

+		int stepCount=0;

+		for (String verbAddition : extraKeywords) {

+			List<HitBase> searchResult = yrunner.runSearch(sentence + " "

+					+ verbAddition, MAX_SEARCH_RESULTS); //100);

+			if (MAX_SEARCH_RESULTS<searchResult.size())

+				searchResult = searchResult.subList(0, MAX_SEARCH_RESULTS);

+			//TODO for shorter run

+			if (searchResult != null) {

+				for (HitBase item : searchResult) { // got some text from .html

+					if (item.getAbstractText() != null

+							&& !(item.getUrl().indexOf(".pdf") > 0) && !visitedURLs.contains(item.getUrl())) { // exclude pdf

+						opinionSentencesToAdd

+						.add(//augmentWithMinedSentencesAndVerifyRelevance(item,

+							//	sentence, null));

+								buildParagraphOfGeneratedText(item,	sentence, null));

+						visitedURLs.add(item.getUrl());

+					}

+				}

+			}

+			stepCount++;

+			if (stepCount>MAX_STEPS)

+				break;

+		}

 

-    BingWebQueryRunner yrunner = new BingWebQueryRunner();

-    for (String verbAddition : StoryDiscourseNavigator.frequentPerformingVerbs) {

-      List<HitBase> searchResult = yrunner.runSearch(sentence + " "

-          + verbAddition, 100);

-      if (searchResult != null) {

-        for (HitBase item : searchResult) { // got some text from .html

-          if (item.getAbstractText() != null

-              && !(item.getUrl().indexOf(".pdf") > 0)) { // exclude pdf

-            opinionSentencesToAdd

-                .add(augmentWithMinedSentencesAndVerifyRelevance(item,

-                    sentence, null));

-          }

-        }

-      }

-    }

+		opinionSentencesToAdd = removeDuplicatesFromResultantHits(opinionSentencesToAdd);

+		return opinionSentencesToAdd;

+	}

 

-    opinionSentencesToAdd = removeDuplicatesFromResultantHits(opinionSentencesToAdd);

-    return opinionSentencesToAdd;

-  }

+	/**

+	 * Takes a sentence and extracts noun phrases and entity names to from search

+	 * queries for finding relevant sentences on the web, which are then subject

+	 * to relevance assessment by Similarity. Search queries should not be too

+	 * general (irrelevant search results) or too specific (too few search

+	 * results)

+	 * 

+	 * @param String

+	 *          input sentence to form queries

+	 * @return List<String> of search expressions

+	 */

+	public static List<String> buildSearchEngineQueryFromSentence(String sentence) {

+		ParseTreeChunk matcher = new ParseTreeChunk();

+		ParserChunker2MatcherProcessor pos = ParserChunker2MatcherProcessor

+				.getInstance();

+		List<List<ParseTreeChunk>> sent1GrpLst = null;

 

-  /**

-   * Takes a sentence and extracts noun phrases and entity names to from search

-   * queries for finding relevant sentences on the web, which are then subject

-   * to relevance assessment by Similarity. Search queries should not be too

-   * general (irrelevant search results) or too specific (too few search

-   * results)

-   * 

-   * @param String

-   *          input sentence to form queries

-   * @return List<String> of search expressions

-   */

-  public static List<String> buildSearchEngineQueryFromSentence(String sentence) {

-    ParseTreeChunk matcher = new ParseTreeChunk();

-    ParserChunker2MatcherProcessor pos = ParserChunker2MatcherProcessor

-        .getInstance();

-    List<List<ParseTreeChunk>> sent1GrpLst = null;

+		List<ParseTreeChunk> nPhrases = pos

+				.formGroupedPhrasesFromChunksForSentence(sentence).get(0);

+		List<String> queryArrayStr = new ArrayList<String>();

+		for (ParseTreeChunk ch : nPhrases) {

+			String query = "";

+			int size = ch.getLemmas().size();

+

+			for (int i = 0; i < size; i++) {

+				if (ch.getPOSs().get(i).startsWith("N")

+						|| ch.getPOSs().get(i).startsWith("J")) {

+					query += ch.getLemmas().get(i) + " ";

+				}

+			}

+			query = query.trim();

+			int len = query.split(" ").length;

+			if (len < 2 || len > 5)

+				continue;

+			if (len < 4) { // every word should start with capital

+				String[] qs = query.split(" ");

+				boolean bAccept = true;

+				for (String w : qs) {

+					if (w.toLowerCase().equals(w)) // if only two words then

+						// has to be person name,

+						// title or geo location

+						bAccept = false;

+				}

+				if (!bAccept)

+					continue;

+			}

+

+			query = query.trim().replace(" ", " +");

+			query = " +" + query;

+

+			queryArrayStr.add(query);

+

+		}

+		if (queryArrayStr.size() < 1) { // release constraints on NP down to 2

+			// keywords

+			for (ParseTreeChunk ch : nPhrases) {

+				String query = "";

+				int size = ch.getLemmas().size();

+

+				for (int i = 0; i < size; i++) {

+					if (ch.getPOSs().get(i).startsWith("N")

+							|| ch.getPOSs().get(i).startsWith("J")) {

+						query += ch.getLemmas().get(i) + " ";

+					}

+				}

+				query = query.trim();

+				int len = query.split(" ").length;

+				if (len < 2)

+					continue;

+

+				query = query.trim().replace(" ", " +");

+				query = " +" + query;

+

+				queryArrayStr.add(query);

+

+			}

+		}

+

+		queryArrayStr = removeDuplicatesFromQueries(queryArrayStr);

+		queryArrayStr.add(sentence);

+

+		return queryArrayStr;

+

+	}

+

+	/**

+	 * remove dupes from queries to easy cleaning dupes and repetitive search

+	 * afterwards

+	 * 

+	 * @param List

+	 *          <String> of sentences (search queries, or search results

+	 *          abstracts, or titles

+	 * @return List<String> of sentences where dupes are removed

+	 */

+	public static List<String> removeDuplicatesFromQueries(List<String> hits) {

+		StringDistanceMeasurer meas = new StringDistanceMeasurer();

+		double dupeThresh = 0.8; // if more similar, then considered dupes was

+		// 0.7

+		List<Integer> idsToRemove = new ArrayList<Integer>();

+		List<String> hitsDedup = new ArrayList<String>();

+		try {

+			for (int i = 0; i < hits.size(); i++)

+				for (int j = i + 1; j < hits.size(); j++) {

+					String title1 = hits.get(i);

+					String title2 = hits.get(j);

+					if (StringUtils.isEmpty(title1) || StringUtils.isEmpty(title2))

+						continue;

+					if (meas.measureStringDistance(title1, title2) > dupeThresh) {

+						idsToRemove.add(j); // dupes found, later list member to

+						// be deleted

+

+					}

+				}

+

+			for (int i = 0; i < hits.size(); i++)

+				if (!idsToRemove.contains(i))

+					hitsDedup.add(hits.get(i));

+

+			if (hitsDedup.size() < hits.size()) {

+				LOG.info("Removed duplicates from formed query, including "

+						+ hits.get(idsToRemove.get(0)));

+			}

+

+		} catch (Exception e) {

+			LOG.severe("Problem removing duplicates from query list");

+		}

+

+		return hitsDedup;

+

+	}

+

+	/**

+	 * remove dupes from search results

+	 * 

+	 * @param List

+	 *          <HitBase> of search results objects

+	 * @return List<String> of search results objects where dupes are removed

+	 */

+	public static List<HitBase> removeDuplicatesFromResultantHits(

+			List<HitBase> hits) {

+		StringDistanceMeasurer meas = new StringDistanceMeasurer();

+		double dupeThresh = // 0.8; // if more similar, then considered dupes was

+				0.7;

+		List<Integer> idsToRemove = new ArrayList<Integer>();

+		List<HitBase> hitsDedup = new ArrayList<HitBase>();

+		try {

+			for (int i = 0; i < hits.size(); i++)

+				for (int j = i + 1; j < hits.size(); j++) {

+					HitBase hit2 = hits.get(j);

+					List<Fragment> fragmList1 = hits.get(i).getFragments();

+					List<Fragment> fragmList2 = hits.get(j).getFragments();

+					List<Fragment> fragmList2Results = new ArrayList<Fragment>(fragmList2);

+					for (Fragment f1 : fragmList1)

+						for (Fragment f2 : fragmList2) {

+							String sf1 = f1.getResultText();

+							String sf2 = f2.getResultText();

+							if (StringUtils.isEmpty(sf1) || StringUtils.isEmpty(sf1))

+								continue;

+							if (meas.measureStringDistance(sf1, sf2) > dupeThresh) {

+								fragmList2Results.remove(f2);

+								LOG.info("Removed duplicates from formed fragments list: "

+										+ sf2);

+							}

+						}

+

+					hit2.setFragments(fragmList2Results);

+					hits.set(j, hit2);

+				}

+		} catch (Exception e) {

+			LOG.severe("Problem removing duplicates from list of fragment");

+		}

+		return hits;

+	}

+

+	/**

+	 * Takes single search result for an entity which is the subject of the essay

+	 * to be written and forms essay sentences from the title, abstract, and

+	 * possibly original page

+	 * 

+	 * @param HitBase

+	 *          item : search result

+	 * @param originalSentence

+	 *          : seed for the essay to be written

+	 * @param sentsAll

+	 *          : list<String> of other sentences in the seed if it is

+	 *          multi-sentence

+	 * @return search result

+	 */

+

+	public HitBase augmentWithMinedSentencesAndVerifyRelevance(HitBase item,

+			String originalSentence, List<String> sentsAll) {

+		if (sentsAll == null)

+			sentsAll = new ArrayList<String>();

+		// put orig sentence in structure

+		List<String> origs = new ArrayList<String>();

+		origs.add(originalSentence);

+		item.setOriginalSentences(origs);

+		String title = item.getTitle().replace("<b>", " ").replace("</b>", " ")

+				.replace("  ", " ").replace("  ", " ");

+		// generation results for this sentence

+		List<Fragment> result = new ArrayList<Fragment>();

+		// form plain text from snippet

+		String snapshot = item.getAbstractText().replace("<b>", " ")

+				.replace("</b>", " ").replace("  ", " ").replace("  ", " ");

+

+

+		// fix a template expression which can be substituted by original if

+		// relevant

+		String snapshotMarked = snapshot.replace("...",

+				" _should_find_orig_ . _should_find_orig_");

+		String[] fragments = sm.splitSentences(snapshotMarked);

+		List<String> allFragms = new ArrayList<String>();

+		allFragms.addAll(Arrays.asList(fragments));

+

+		String[] sents = null;

+		String downloadedPage = null;

+		try {

+			if (snapshotMarked.length() != snapshot.length()) {

+				downloadedPage = pFetcher.fetchPage(item.getUrl());

+				if (downloadedPage != null && downloadedPage.length() > 100) {

+					item.setPageContent(downloadedPage);

+					String pageContent = Utils.fullStripHTML(item.getPageContent());

+					pageContent = GeneratedSentenceProcessor

+							.normalizeForSentenceSplitting(pageContent);

+					pageContent = ContentGeneratorSupport.cleanSpacesInCleanedHTMLpage(pageContent);

+					//pageContent = pageContent.trim().replaceAll("  [A-Z]", ". $0")// .replace("  ",

+					//		// ". ")

+					//		.replace("..", ".").replace(". . .", " ").trim(); // sometimes   html breaks are converted into ' ' (two spaces), so

+					// we need to put '.'

+					sents = sm.splitSentences(pageContent);

+

+					sents = ContentGeneratorSupport.cleanListOfSents(sents);

+				}

+			}

+		} catch (Exception e) {

+			// TODO Auto-generated catch block

+			// e.printStackTrace();

+			System.err

+			.println("Problem downloading  the page and splitting into sentences");

+			return item;

+		}

+

+		for (String fragment : allFragms) {

+			String followSent = "";

+			if (fragment.length() < 50)

+				continue;

+			String pageSentence = "";

+			// try to find original sentence from webpage

+			if (fragment.indexOf("_should_find_orig_") > -1 && sents != null

+					&& sents.length > 0){

+				try { 

+					// first try sorted sentences from page by length approach

+					String[] sentsSortedByLength = extractSentencesFromPage(downloadedPage);

+					String[] mainAndFollowSent = null;

+

+					try {

+						mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(

+								fragment.replace("_should_find_orig_", ""), sentsSortedByLength);

+					} catch (Exception e) {

+						// TODO Auto-generated catch block

+						e.printStackTrace();

+					}

+					// if the above gives null than try to match all sentences from snippet fragment

+					if (mainAndFollowSent==null || mainAndFollowSent[0]==null){

+						mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(

+								fragment.replace("_should_find_orig_", ""), sents);

+					}

+					

+					if (mainAndFollowSent!=null || mainAndFollowSent[0]!=null){

+						pageSentence = mainAndFollowSent[0];

+						for(int i = 1; i< mainAndFollowSent.length; i++)

+							if (mainAndFollowSent[i]!=null)

+								followSent+= mainAndFollowSent[i];

+					}

+

+				} catch (Exception e) {

+

+					// TODO Auto-generated catch block

+					e.printStackTrace();

+				}

+			}

+			

+			else

+				// or get original snippet

+				pageSentence = fragment;

+			if (pageSentence != null)

+				pageSentence.replace("_should_find_orig_", "");

+

+			// resultant sentence SHOULD NOT be longer than for times the size of

+			// snippet fragment

+			if (pageSentence != null && pageSentence.length()>50 )

+			//		&& (float) pageSentence.length() / (float) fragment.length() < 4.0)

+			{ // was 2.0,

+

+				try { // get score from syntactic match between sentence in

+					// original text and mined sentence

+					double measScore = 0.0, syntScore = 0.0, mentalScore = 0.0;

+

+					SentencePairMatchResult matchRes = sm.assessRelevance(pageSentence

+							+ " " + title, originalSentence);

+					List<List<ParseTreeChunk>> match = matchRes.getMatchResult();

+					if (!matchRes.isVerbExists() || matchRes.isImperativeVerb()) {

+						System.out

+						.println("Rejected Sentence : No verb OR Yes imperative verb :"

+								+ pageSentence);

+						continue;

+					}

+

+					syntScore = parseTreeChunkListScorer

+							.getParseTreeChunkListScore(match);

+					System.out.println(parseTreeChunk.listToString(match) + " "

+							+ syntScore + "\n pre-processed sent = '" + pageSentence);

+

+					if (syntScore < RELEVANCE_THRESHOLD){ // 1.5) { // trying other sents

+						for (String currSent : sentsAll) {

+							if (currSent.startsWith(originalSentence))

+								continue;

+							match = sm.assessRelevance(currSent, pageSentence)

+									.getMatchResult();

+							double syntScoreCurr = parseTreeChunkListScorer

+									.getParseTreeChunkListScore(match);

+							if (syntScoreCurr > syntScore) {

+								syntScore = syntScoreCurr;

+							}

+						}

+						if (syntScore > RELEVANCE_THRESHOLD) {

+							System.out.println("Got match with other sent: "

+									+ parseTreeChunk.listToString(match) + " " + syntScore);

+						}

+					}

+

+					measScore = stringDistanceMeasurer.measureStringDistance(

+							originalSentence, pageSentence);

+

+

+					if ((syntScore > RELEVANCE_THRESHOLD || measScore > 0.5)

+							&& measScore < 0.8 && pageSentence.length() > 40) // >70

+					{

+						String pageSentenceProc = GeneratedSentenceProcessor

+								.acceptableMinedSentence(pageSentence);

+						if (pageSentenceProc != null) {

+							pageSentenceProc = GeneratedSentenceProcessor

+									.processSentence(pageSentenceProc);

+							followSent = GeneratedSentenceProcessor.processSentence(followSent);

+							if (followSent != null) {

+								pageSentenceProc += " "+ followSent;

+							}

+

+							pageSentenceProc = Utils.convertToASCII(pageSentenceProc);

+							Fragment f = new Fragment(pageSentenceProc, syntScore + measScore

+									+ mentalScore + (double) pageSentenceProc.length()

+									/ (double) 50);

+							f.setSourceURL(item.getUrl());

+							f.fragment = fragment;

+							result.add(f);

+							System.out.println("Accepted sentence: " + pageSentenceProc + " | "+followSent

+									+ "| with title= " + title);

+							System.out.println("For fragment = " + fragment);

+						} else

+							System.out

+							.println("Rejected sentence due to wrong area at webpage: "

+									+ pageSentence);

+					} else

+						System.out.println("Rejected sentence due to low score: "

+								+ pageSentence);

+					// }

+				} catch (Throwable t) {

+					t.printStackTrace();

+				}

+			}

+		}

+		item.setFragments(result);

+		return item;

+	}

+

+	

+

+	// given a fragment from snippet, finds an original sentence at a webpage by

+	// optimizing alignment score

+	public static String[] getFullOriginalSentenceFromWebpageBySnippetFragment(

+			String fragment, String[] sents) {

+		if (fragment.trim().length() < 15)

+			return null;

+

+		StringDistanceMeasurer meas = new StringDistanceMeasurer();

+		Double dist = 0.0;

+		String result = null, followSent = "";

+		for (int i = 0; i < sents.length; i++) {

+			String s = sents[i];

+			if (s == null || s.length() < 30)

+				continue;

+			Double distCurr = meas.measureStringDistance(s, fragment);

+			if (distCurr > dist && distCurr > 0.4) {

+				result = s;

+				dist = distCurr;

+				try {

+					if (i < sents.length - 1 && sents[i + 1].length() > 60) { 

+						String f1 = GeneratedSentenceProcessor.acceptableMinedSentence(sents[i+1]);

+						if (f1!=null){

+							followSent = f1;

+						}

+					}

+

+					if (i < sents.length - 2 && sents[i + 2].length() > 60) {

+						String f2 = GeneratedSentenceProcessor.acceptableMinedSentence(sents[i+2]);

+						if (f2!=null){

+							followSent += " "+f2;

+						}

+					}

+				} catch (Exception e) {

+					// TODO Auto-generated catch block

+					e.printStackTrace();

+				}

+			}

+		}

+		return new String[] { result, followSent };

+	}

+

+	// given a fragment from snippet, finds an original sentence at a webpage by

+	// optimizing alignment score

+	public static String[] getBestFullOriginalSentenceFromWebpageBySnippetFragment(

+			String fragment, String[] sents) {

+		if (fragment.trim().length() < 15)

+			return null;

+		int bestSentIndex = -1;

+		StringDistanceMeasurer meas = new StringDistanceMeasurer();

+		Double distBest = 10.0; // + sup

+		String result = null, followSent = null;

+		for (int i = 0; i < sents.length; i++) {

+			String s = sents[i];

+			if (s == null || s.length() < 30)

+				continue;

+			Double distCurr = meas.measureStringDistance(s, fragment);

+			if (distCurr > distBest) {

+				distBest = distCurr;

+				bestSentIndex = i;

+			}

+

+		}

+		if (distBest > 0.4) {

+			result = sents[bestSentIndex];

+

+			if (bestSentIndex < sents.length - 1

+					&& sents[bestSentIndex + 1].length() > 60) {

+				followSent = sents[bestSentIndex + 1];

+			}

+

+		}

+

+		return new String[] { result, followSent };

+	}

+

+	public String[] extractSentencesFromPage(String downloadedPage)

+	{

+

+		int maxSentsFromPage= 100;

+		List<String[]> results = new ArrayList<String[]>();

+

+		//String pageOrigHTML = pFetcher.fetchOrigHTML(url);

+

+		downloadedPage= downloadedPage.replace("     ", "&");

+		downloadedPage = downloadedPage.replaceAll("(?:&)+", "#");

+		String[] sents = downloadedPage.split("#");

+		List<TextChunk> sentsList = new ArrayList<TextChunk>();

+		for(String s: sents){

+			s = ContentGeneratorSupport.cleanSpacesInCleanedHTMLpage(s);

+		/*	s = s.trim().replace("  ", ". ").replace("..", ".").replace(". . .", " ")

+					.replace(": ", ". ").replace("- ", ". ").

+					replace (". .",".").trim(); */

+			sentsList.add(new TextChunk(s, s.length()));

+		}

+

+		Collections.sort(sentsList, new TextChunkComparable());

+		String[] longestSents = new String[maxSentsFromPage];

+		int j=0;

+		int initIndex = sentsList.size()-1 -maxSentsFromPage;

+		if (initIndex<0)

+			initIndex = 0;

+		for(int i=initIndex; i< sentsList.size() && j<maxSentsFromPage ; i++){

+			longestSents[j] = sentsList.get(i).text;

+			j++;

+		}

+

+		sents = cleanSplitListOfSents(longestSents);

+

+		//sents = removeDuplicates(sents);

+		//sents = verifyEnforceStartsUpperCase(sents);

+

+		return sents;

+	}

 

-    List<ParseTreeChunk> nPhrases = pos

-        .formGroupedPhrasesFromChunksForSentence(sentence).get(0);

-    List<String> queryArrayStr = new ArrayList<String>();

-    for (ParseTreeChunk ch : nPhrases) {

-      String query = "";

-      int size = ch.getLemmas().size();

+	public class TextChunk {

+		public TextChunk(String s, int length) {

+			this.text = s;

+			this.len = length;

+		}

+		public String text;

+		public int len;

+	}

 

-      for (int i = 0; i < size; i++) {

-        if (ch.getPOSs().get(i).startsWith("N")

-            || ch.getPOSs().get(i).startsWith("J")) {

-          query += ch.getLemmas().get(i) + " ";

-        }

-      }

-      query = query.trim();

-      int len = query.split(" ").length;

-      if (len < 2 || len > 5)

-        continue;

-      if (len < 4) { // every word should start with capital

-        String[] qs = query.split(" ");

-        boolean bAccept = true;

-        for (String w : qs) {

-          if (w.toLowerCase().equals(w)) // idf only two words then

-            // has to be person name,

-            // title or geo location

-            bAccept = false;

-        }

-        if (!bAccept)

-          continue;

-      }

+	public class TextChunkComparable implements Comparator<TextChunk>

+	{

+		public int compare(TextChunk ch1, TextChunk ch2)

+		{

+			if (ch1.len>ch2.len)

+				return 1;

+			else if (ch1.len<ch2.len)

+				return  -1;

+			else return 0;

 

-      query = query.trim().replace(" ", " +");

-      query = " +" + query;

+		}

+	}

 

-      queryArrayStr.add(query);

+	protected String[] cleanSplitListOfSents(String[] longestSents){

+		float minFragmentLength = 40, minFragmentLengthSpace=4;

 

-    }

-    if (queryArrayStr.size() < 1) { // release constraints on NP down to 2

-                                    // keywords

-      for (ParseTreeChunk ch : nPhrases) {

-        String query = "";

-        int size = ch.getLemmas().size();

+		List<String> sentsClean = new ArrayList<String>();

+		for (String sentenceOrMultSent : longestSents)

+		{

+			if (sentenceOrMultSent==null || sentenceOrMultSent.length()<20)

+				continue;

+			if (GeneratedSentenceProcessor.acceptableMinedSentence(sentenceOrMultSent)==null){

+				//System.out.println("Rejected sentence by GeneratedSentenceProcessor.acceptableMinedSentence = "+sentenceOrMultSent);

+				continue;

+			}

+			// aaa. hhh hhh.  kkk . kkk ll hhh. lll kkk n.

+			int numOfDots = sentenceOrMultSent.replace('.','&').split("&").length;

+			float avgSentenceLengthInTextPortion = (float)sentenceOrMultSent.length() /(float) numOfDots;

+			if ( avgSentenceLengthInTextPortion<minFragmentLength)

+				continue;

+			// o oo o ooo o o o ooo oo ooo o o oo

+			numOfDots = sentenceOrMultSent.replace(' ','&').split("&").length;

+			avgSentenceLengthInTextPortion = (float)sentenceOrMultSent.length() /(float) numOfDots;

+			if ( avgSentenceLengthInTextPortion<minFragmentLengthSpace)

+				continue;

 

-        for (int i = 0; i < size; i++) {

-          if (ch.getPOSs().get(i).startsWith("N")

-              || ch.getPOSs().get(i).startsWith("J")) {

-            query += ch.getLemmas().get(i) + " ";

-          }

-        }

-        query = query.trim();

-        int len = query.split(" ").length;

-        if (len < 2)

-          continue;

+			List<String> furtherSplit = TextProcessor.splitToSentences(sentenceOrMultSent);

 

-        query = query.trim().replace(" ", " +");

-        query = " +" + query;

+			// forced split by ',' somewhere in the middle of sentence

+			// disused - Feb 26 13

+			//furtherSplit = furtherMakeSentencesShorter(furtherSplit);

+			furtherSplit.remove(furtherSplit.size()-1);

+			for(String s : furtherSplit){

+				if (s.indexOf('|')>-1)

+					continue;

+				s = s.replace("<em>"," ").replace("</em>"," ");

+				s = Utils.convertToASCII(s);

+				sentsClean.add(s);

+			}

+		}

+		return (String[]) sentsClean.toArray(new String[0]);

+	}

 

-        queryArrayStr.add(query);

+	public Triple<List<String>, String, String[]> formCandidateFragmentsForPage(HitBase item, String originalSentence, List<String> sentsAll){

+		if (sentsAll == null)

+			sentsAll = new ArrayList<String>();

+		// put orig sentence in structure

+		List<String> origs = new ArrayList<String>();

+		origs.add(originalSentence);

+		item.setOriginalSentences(origs);

+		String title = item.getTitle().replace("<b>", " ").replace("</b>", " ")

+				.replace("  ", " ").replace("  ", " ");

+		// generation results for this sentence

+		List<Fragment> result = new ArrayList<Fragment>();

+		// form plain text from snippet

+		String snapshot = item.getAbstractText().replace("<b>", " ")

+				.replace("</b>", " ").replace("  ", " ").replace("  ", " ");

 

-      }

-    }

 

-    queryArrayStr = removeDuplicatesFromQueries(queryArrayStr);

-    queryArrayStr.add(sentence);

+		// fix a template expression which can be substituted by original if

+		// relevant

+		String snapshotMarked = snapshot.replace("...",

+				" _should_find_orig_ . _should_find_orig_");

+		String[] fragments = sm.splitSentences(snapshotMarked);

+		List<String> allFragms = new ArrayList<String>();

+		allFragms.addAll(Arrays.asList(fragments));

 

-    return queryArrayStr;

+		String[] sents = null;

+		String downloadedPage = null;

+		try {

+			if (snapshotMarked.length() != snapshot.length()) {

+				downloadedPage = pFetcher.fetchPage(item.getUrl());

+				if (downloadedPage != null && downloadedPage.length() > 100) {

+					item.setPageContent(downloadedPage);

+					String pageContent = Utils.fullStripHTML(item.getPageContent());

+					pageContent = GeneratedSentenceProcessor

+							.normalizeForSentenceSplitting(pageContent);

+					pageContent = ContentGeneratorSupport.cleanSpacesInCleanedHTMLpage(pageContent);

+					//pageContent = pageContent.trim().replaceAll("    [A-Z]", ". $0")// .replace("  ",

+					//		// ". ")

+					//		.replace("..", ".").replace(". . .", " ").

+					//		replace(".    .",". ").trim(); // sometimes   html breaks are converted into ' ' (two spaces), so

+					// we need to put '.'

+					sents = sm.splitSentences(pageContent);

 

-  }

+					sents = ContentGeneratorSupport.cleanListOfSents(sents);

+				}

+			}

+		} catch (Exception e) {

+			// TODO Auto-generated catch block

+			// e.printStackTrace();

+			System.err

+			.println("Problem downloading  the page and splitting into sentences");

+			return new Triple(allFragms, downloadedPage, sents);

+		}

+		return new Triple(allFragms, downloadedPage, sents);

+	}

 

-  /**

-   * remove dupes from queries to easy cleaning dupes and repetitive search

-   * afterwards

-   * 

-   * @param List

-   *          <String> of sentences (search queries, or search results

-   *          abstracts, or titles

-   * @return List<String> of sentences where dupes are removed

-   */

-  public static List<String> removeDuplicatesFromQueries(List<String> hits) {

-    StringDistanceMeasurer meas = new StringDistanceMeasurer();

-    double dupeThresh = 0.8; // if more similar, then considered dupes was

-    // 0.7

-    List<Integer> idsToRemove = new ArrayList<Integer>();

-    List<String> hitsDedup = new ArrayList<String>();

-    try {

-      for (int i = 0; i < hits.size(); i++)

-        for (int j = i + 1; j < hits.size(); j++) {

-          String title1 = hits.get(i);

-          String title2 = hits.get(j);

-          if (StringUtils.isEmpty(title1) || StringUtils.isEmpty(title2))

-            continue;

-          if (meas.measureStringDistance(title1, title2) > dupeThresh) {

-            idsToRemove.add(j); // dupes found, later list member to

-            // be deleted

+	String[] formCandidateSentences(String fragment, Triple<List<String>, String, String[]> fragmentExtractionResults){

+		String[] mainAndFollowSent = null;

 

-          }

-        }

+		List<String> allFragms = (List<String>)fragmentExtractionResults.getFirst();

+		String downloadedPage = (String)fragmentExtractionResults.getSecond();

+		String[] sents = (String[])fragmentExtractionResults.getThird();

 

-      for (int i = 0; i < hits.size(); i++)

-        if (!idsToRemove.contains(i))

-          hitsDedup.add(hits.get(i));

+		String followSent = null;

+		if (fragment.length() < 50)

+			return null;

+		String pageSentence = "";

+		// try to find original sentence from webpage

+		if (fragment.indexOf("_should_find_orig_") > -1 && sents != null

+				&& sents.length > 0){

+			try { 

+				// first try sorted sentences from page by length approach

+				String[] sentsSortedByLength = extractSentencesFromPage(downloadedPage);

 

-      if (hitsDedup.size() < hits.size()) {

-        LOG.info("Removed duplicates from formed query, including "

-            + hits.get(idsToRemove.get(0)));

-      }

 

-    } catch (Exception e) {

-      LOG.severe("Problem removing duplicates from query list");

-    }

+				try {

+					mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(

+							fragment.replace("_should_find_orig_", ""), sentsSortedByLength);

+				} catch (Exception e) {

+					// TODO Auto-generated catch block

+					e.printStackTrace();

+				}

+				// if the above gives null than try to match all sentences from snippet fragment

+				if (mainAndFollowSent==null || mainAndFollowSent[0]==null){

+					mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(

+							fragment.replace("_should_find_orig_", ""), sents);

+				}

 

-    return hitsDedup;

 

-  }

+			} catch (Exception e) {

 

-  /**

-   * remove dupes from search results

-   * 

-   * @param List

-   *          <HitBase> of search results objects

-   * @return List<String> of search results objects where dupes are removed

-   */

-  public static List<HitBase> removeDuplicatesFromResultantHits(

-      List<HitBase> hits) {

-    StringDistanceMeasurer meas = new StringDistanceMeasurer();

-    double dupeThresh = // 0.8; // if more similar, then considered dupes was

-    0.7;

-    List<Integer> idsToRemove = new ArrayList<Integer>();

-    List<HitBase> hitsDedup = new ArrayList<HitBase>();

-    try {

-      for (int i = 0; i < hits.size(); i++)

-        for (int j = i + 1; j < hits.size(); j++) {

-          HitBase hit2 = hits.get(j);

-          List<Fragment> fragmList1 = hits.get(i).getFragments();

-          List<Fragment> fragmList2 = hits.get(j).getFragments();

-          List<Fragment> fragmList2Results = new ArrayList<Fragment>(fragmList2);

-          for (Fragment f1 : fragmList1)

-            for (Fragment f2 : fragmList2) {

-              String sf1 = f1.getResultText();

-              String sf2 = f2.getResultText();

-              if (StringUtils.isEmpty(sf1) || StringUtils.isEmpty(sf1))

-                continue;

-              if (meas.measureStringDistance(sf1, sf2) > dupeThresh) {

-                fragmList2Results.remove(f2);

-                LOG.info("Removed duplicates from formed fragments list: "

-                    + sf2);

-              }

-            }

+				// TODO Auto-generated catch block

+				e.printStackTrace();

+			}

+		}

+		else

+			// or get original snippet

+			pageSentence = fragment;

+		if (pageSentence != null)

+			pageSentence.replace("_should_find_orig_", "");

 

-          hit2.setFragments(fragmList2Results);

-          hits.set(j, hit2);

-        }

-    } catch (Exception e) {

-      LOG.severe("Problem removing duplicates from list of fragment");

-    }

-    return hits;

-  }

+		return mainAndFollowSent;

 

-  /**

-   * Takes single search result for an entity which is the subject of the essay

-   * to be written and forms essey sentences from the title, abstract, and

-   * possibly original page

-   * 

-   * @param HitBase

-   *          item : search result

-   * @param originalSentence

-   *          : seed for the essay to be written

-   * @param sentsAll

-   *          : list<String> of other sentences in the seed if it is

-   *          multi-sentence

-   * @return search result

-   */

+	}	

 

-  public HitBase augmentWithMinedSentencesAndVerifyRelevance(HitBase item,

-      String originalSentence, List<String> sentsAll) {

-    if (sentsAll == null)

-      sentsAll = new ArrayList<String>();

-    // put orig sentence in structure

-    List<String> origs = new ArrayList<String>();

-    origs.add(originalSentence);

-    item.setOriginalSentences(origs);

-    String title = item.getTitle().replace("<b>", " ").replace("</b>", " ")

-        .replace("  ", " ").replace("  ", " ");

-    // generation results for this sentence

-    List<Fragment> result = new ArrayList<Fragment>();

-    // form plain text from snippet

-    String snapshot = item.getAbstractText().replace("<b>", " ")

-        .replace("</b>", " ").replace("  ", " ").replace("  ", " ");

+	private Fragment verifyCandidateSentencesAndFormParagraph(

+			String[] candidateSentences, HitBase item, String fragment, String originalSentence, List<String> sentsAll) {

+		Fragment result = null;	

 

-    ParserChunker2MatcherProcessor sm = ParserChunker2MatcherProcessor

-        .getInstance();

-    // fix a template expression which can be substituted by original if

-    // relevant

-    String snapshotMarked = snapshot.replace("...",

-        " _should_find_orig_ . _should_find_orig_");

-    String[] fragments = sm.splitSentences(snapshotMarked);

-    List<String> allFragms = new ArrayList<String>();

-    allFragms.addAll(Arrays.asList(fragments));

+		String pageSentence = candidateSentences[0];

+		String followSent = "";

+		for(int i = 1; i< candidateSentences.length; i++)

+			followSent+= candidateSentences[i];

+		String title = item.getTitle();

 

-    String[] sents = null;

-    String downloadedPage;

-    try {

-      if (snapshotMarked.length() != snapshot.length()) {

-        downloadedPage = pFetcher.fetchPage(item.getUrl());

-        if (downloadedPage != null && downloadedPage.length() > 100) {

-          item.setPageContent(downloadedPage);

-          String pageContent = Utils.fullStripHTML(item.getPageContent());

-          pageContent = GeneratedSentenceProcessor

-              .normalizeForSentenceSplitting(pageContent);

-          pageContent = pageContent.trim().replaceAll("  [A-Z]", ". $0")// .replace("  ",

-                                                                        // ". ")

-              .replace("..", ".").replace(". . .", " ").trim(); // sometimes

-                                                                // html breaks

-                                                                // are converted

-                                                                // into ' ' (two

-                                                                // spaces), so

-                                                                // we need to

-                                                                // put '.'

-          sents = sm.splitSentences(snapshotMarked);

-          ;

-          sents = cleanListOfSents(sents);

-        }

-      }

-    } catch (Exception e) {

-      // TODO Auto-generated catch block

-      // e.printStackTrace();

-      System.err

-          .println("Problem downloading  the page and splitting into sentences");

-      return item;

-    }

+		// resultant sentence SHOULD NOT be longer than four times the size of

+		// snippet fragment

+		if (!(pageSentence != null && pageSentence.length()>50) ){

+			System.out.println("Cannot accept the sentence = "+ pageSentence +

+					"!(pageSentence != null && pageSentence.length()>50 && (float) pageSentence.length() / (float) fragment.length() < 4.0) )");

+			

+			return null;

+		}

 

-    for (String fragment : allFragms) {

-      String followSent = null;

-      if (fragment.length() < 50)

-        continue;

-      String pageSentence = "";

-      // try to find original sentence from webpage

-      if (fragment.indexOf("_should_find_orig_") > -1 && sents != null

-          && sents.length > 0)

-        try {

-          String[] mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(

-              fragment.replace("_should_find_orig_", ""), sents);

-          pageSentence = mainAndFollowSent[0];

-          followSent = mainAndFollowSent[1];

 

-        } catch (Exception e) {

+		try { // get score from syntactic match between sentence in

+			// original text and mined sentence

+			double measScore = 0.0, syntScore = 0.0, mentalScore = 0.0;

 

-          // TODO Auto-generated catch block

-          e.printStackTrace();

-        }

-      else

-        // or get original snippet

-        pageSentence = fragment;

-      if (pageSentence != null)

-        pageSentence.replace("_should_find_orig_", "");

+			SentencePairMatchResult matchRes = sm.assessRelevance(pageSentence

+					+ " " + title, originalSentence);

+			List<List<ParseTreeChunk>> match = matchRes.getMatchResult();

+			if (match==null || match.size()<1){

+				System.out

+				.println("Rejected Sentence : empty match "+ pageSentence);

+				return null;

+			}

+			

+			if (!matchRes.isVerbExists() || matchRes.isImperativeVerb()) {

+				System.out

+				.println("Rejected Sentence : No verb OR Yes imperative verb :"

+						+ pageSentence);

+				return null;

+			}

 

-      // resultant sentence SHOULD NOT be longer than twice the size of

-      // snippet fragment

-      if (pageSentence != null

-          && (float) pageSentence.length() / (float) fragment.length() < 4.0) { // was

-                                                                                // 2.0,

-                                                                                // but

-                                                                                // since

-                                                                                // snippet

-                                                                                // sentences

-                                                                                // are

-                                                                                // rather

-                                                                                // short

-                                                                                // now...

-        try { // get score from syntactic match between sentence in

-              // original text and mined sentence

-          double measScore = 0.0, syntScore = 0.0, mentalScore = 0.0;

+			syntScore = parseTreeChunkListScorer

+					.getParseTreeChunkListScore(match);

+			System.out.println(parseTreeChunk.listToString(match) + " "

+					+ syntScore + "\n pre-processed sent = '" + pageSentence);

 

-          SentencePairMatchResult matchRes = sm.assessRelevance(pageSentence

-              + " " + title, originalSentence);

-          List<List<ParseTreeChunk>> match = matchRes.getMatchResult();

-          if (!matchRes.isVerbExists() || matchRes.isImperativeVerb()) {

-            System.out

-                .println("Rejected Sentence : No verb OR Yes imperative verb :"

-                    + pageSentence);

-            continue;

-          }

+			try {

+				if (sentsAll!=null && syntScore < RELEVANCE_THRESHOLD){ // 1.5) { // trying other sents

+					for (String currSent : sentsAll) {

+						if (currSent.startsWith(originalSentence))

+							continue;

+						match = sm.assessRelevance(currSent, pageSentence)

+								.getMatchResult();

+						double syntScoreCurr = parseTreeChunkListScorer

+								.getParseTreeChunkListScore(match);

+						if (syntScoreCurr > syntScore) {

+							syntScore = syntScoreCurr;

+						}

+					}

+					if (syntScore > RELEVANCE_THRESHOLD) {

+						System.out.println("Got match with other sent: "

+								+ parseTreeChunk.listToString(match) + " " + syntScore);

+					}

+				}

+			} catch (Exception e) {

+				e.printStackTrace();

+			}

 

-          syntScore = parseTreeChunkListScorer

-              .getParseTreeChunkListScore(match);

-          System.out.println(parseTreeChunk.listToString(match) + " "

-              + syntScore + "\n pre-processed sent = '" + pageSentence);

+			measScore = stringDistanceMeasurer.measureStringDistance(

+					originalSentence, pageSentence);

 

-          if (syntScore < 1.5) { // trying other sents

-            for (String currSent : sentsAll) {

-              if (currSent.startsWith(originalSentence))

-                continue;

-              match = sm.assessRelevance(currSent, pageSentence)

-                  .getMatchResult();

-              double syntScoreCurr = parseTreeChunkListScorer

-                  .getParseTreeChunkListScore(match);

-              if (syntScoreCurr > syntScore) {

-                syntScore = syntScoreCurr;

-              }

-            }

-            if (syntScore > 1.5) {

-              System.out.println("Got match with other sent: "

-                  + parseTreeChunk.listToString(match) + " " + syntScore);

-            }

-          }

 

-          measScore = STRING_DISTANCE_MEASURER.measureStringDistance(

-              originalSentence, pageSentence);

+			if ((syntScore > RELEVANCE_THRESHOLD || measScore > 0.5)

+					&& measScore < 0.8 && pageSentence.length() > 40) // >70

+			{

+				String pageSentenceProc = GeneratedSentenceProcessor

+						.acceptableMinedSentence(pageSentence);

+				if (pageSentenceProc != null) {

+					pageSentenceProc = GeneratedSentenceProcessor

+							.processSentence(pageSentenceProc);

+					followSent = GeneratedSentenceProcessor.processSentence(followSent);

+					if (followSent != null) {

+						pageSentenceProc += " "+ followSent;

+					}

 

-          // now possibly increase score by finding mental verbs

-          // indicating opinions

-          for (String s : MENTAL_VERBS) {

-            if (pageSentence.indexOf(s) > -1) {

-              mentalScore += 0.3;

-              break;

-            }

-          }

+					pageSentenceProc = Utils.convertToASCII(pageSentenceProc);

+					result = new Fragment(pageSentenceProc, syntScore + measScore

+							+ mentalScore + (double) pageSentenceProc.length()

+							/ (double) 50);

+					result.setSourceURL(item.getUrl());

+					result.fragment = fragment;

 

-          if ((syntScore > 1.5 || measScore > 0.5 || mentalScore > 0.5)

-              && measScore < 0.8 && pageSentence.length() > 40) // >70

-          {

-            String pageSentenceProc = GeneratedSentenceProcessor

-                .acceptableMinedSentence(pageSentence);

-            if (pageSentenceProc != null) {

-              pageSentenceProc = GeneratedSentenceProcessor

-                  .processSentence(pageSentenceProc);

-              if (followSent != null) {

-                pageSentenceProc += " "

-                    + GeneratedSentenceProcessor.processSentence(followSent);

-              }

+					System.out.println("Accepted sentence: " + pageSentenceProc

+							+ "| with title= " + title);

+					System.out.println("For fragment = " + fragment);

+				} else

+					System.out

+					.println("Rejected sentence due to wrong area at webpage: "

+							+ pageSentence);

+			} else

+				System.out.println("Rejected sentence due to low score: "

+						+ pageSentence);

+			// }

+		} catch (Throwable t) {

+			t.printStackTrace();

+		}

 

-              pageSentenceProc = Utils.convertToASCII(pageSentenceProc);

-              Fragment f = new Fragment(pageSentenceProc, syntScore + measScore

-                  + mentalScore + (double) pageSentenceProc.length()

-                  / (double) 50);

-              f.setSourceURL(item.getUrl());

-              f.fragment = fragment;

-              result.add(f);

-              System.out.println("Accepted sentence: " + pageSentenceProc

-                  + "| with title= " + title);

-              System.out.println("For fragment = " + fragment);

-            } else

-              System.out

-                  .println("Rejected sentence due to wrong area at webpage: "

-                      + pageSentence);

-          } else

-            System.out.println("Rejected sentence due to low score: "

-                + pageSentence);

-          // }

-        } catch (Throwable t) {

-          t.printStackTrace();

-        }

-      }

-    }

-    item.setFragments(result);

-    return item;

-  }

+	return result;

+}

 

-  public static String[] cleanListOfSents(String[] sents) {

-    List<String> sentsClean = new ArrayList<String>();

-    for (String s : sents) {

-      if (s == null || s.trim().length() < 30 || s.length() < 20)

-        continue;

-      sentsClean.add(s);

-    }

-    return (String[]) sentsClean.toArray(new String[0]);

-  }

+public HitBase buildParagraphOfGeneratedText(HitBase item,

+		String originalSentence, List<String> sentsAll) {

+	List<Fragment> results = new ArrayList<Fragment>() ;

+	

+	Triple<List<String>, String, String[]> fragmentExtractionResults = formCandidateFragmentsForPage(item, originalSentence, sentsAll);

 

-  // given a fragment from snippet, finds an original sentence at a webpage by

-  // optimizing alignmemt score

-  public static String[] getFullOriginalSentenceFromWebpageBySnippetFragment(

-      String fragment, String[] sents) {

-    if (fragment.trim().length() < 15)

-      return null;

+	List<String> allFragms = (List<String>)fragmentExtractionResults.getFirst();

+	String downloadedPage = (String)fragmentExtractionResults.getSecond();

+	String[] sents = (String[])fragmentExtractionResults.getThird();

 

-    StringDistanceMeasurer meas = new StringDistanceMeasurer();

-    Double dist = 0.0;

-    String result = null, followSent = null;

-    for (int i = 0; i < sents.length; i++) {

-      String s = sents[i];

-      if (s == null || s.length() < 30)

-        continue;

-      Double distCurr = meas.measureStringDistance(s, fragment);

-      if (distCurr > dist && distCurr > 0.4) {

-        result = s;

-        dist = distCurr;

-        if (i < sents.length - 1 && sents[i + 1].length() > 60) {

-          followSent = sents[i + 1];

-        }

+	for (String fragment : allFragms) {

+		String[] candidateSentences = formCandidateSentences(fragment, fragmentExtractionResults);

+		if (candidateSentences == null)

+			continue;

+		Fragment res = verifyCandidateSentencesAndFormParagraph(candidateSentences, item, fragment, originalSentence, sentsAll);

+		if (res!=null)

+			results.add(res);

 

-      }

-    }

-    return new String[] { result, followSent };

-  }

+	}

+	

+	item.setFragments(results );

+	return item;

+}

 

-  // given a fragment from snippet, finds an original sentence at a webpage by

-  // optimizing alignmemt score

-  public static String[] getBestFullOriginalSentenceFromWebpageBySnippetFragment(

-      String fragment, String[] sents) {

-    if (fragment.trim().length() < 15)

-      return null;

-    int bestSentIndex = -1;

-    StringDistanceMeasurer meas = new StringDistanceMeasurer();

-    Double distBest = 10.0; // + sup

-    String result = null, followSent = null;

-    for (int i = 0; i < sents.length; i++) {

-      String s = sents[i];

-      if (s == null || s.length() < 30)

-        continue;

-      Double distCurr = meas.measureStringDistance(s, fragment);

-      if (distCurr > distBest) {

-        distBest = distCurr;

-        bestSentIndex = i;

-      }

 

-    }

-    if (distBest > 0.4) {

-      result = sents[bestSentIndex];

 

-      if (bestSentIndex < sents.length - 1

-          && sents[bestSentIndex + 1].length() > 60) {

-        followSent = sents[bestSentIndex + 1];

-      }

 

-    }

+public static void main(String[] args) {

+	RelatedSentenceFinder f = new RelatedSentenceFinder();

 

-    return new String[] { result, followSent };

-  }

+	List<HitBase> hits = null;

+	try {

+		// uncomment the sentence you would like to serve as a seed sentence for

+		// content generation for an event description

 

-  public static void main(String[] args) {

-    RelatedSentenceFinder f = new RelatedSentenceFinder();

+		// uncomment the sentence you would like to serve as a seed sentence for

+		// content generation for an event description

+		hits = f.generateContentAbout("Albert Einstein"

+				// "Britney Spears - The Femme Fatale Tour"

+				// "Rush Time Machine",

+				// "Blue Man Group" ,

+				// "Belly Dance With Zaharah",

+				// "Hollander Musicology Lecture: Danielle Fosler-Lussier, Guest Lecturer",

+				// "Jazz Master and arguably the most famous jazz musician alive, trumpeter Wynton Marsalis",

+				);

+		System.out.println(HitBase.toString(hits));

+		System.out.println(HitBase.toResultantString(hits));

+		// WordFileGenerator.createWordDoc("Essay about Albert Einstein",

+		// hits.get(0).getTitle(), hits);

 

-    List<HitBase> hits = null;

-    try {

-      // uncomment the sentence you would like to serve as a seed sentence for

-      // content generation for an event description

+	} catch (Exception e) {

+		e.printStackTrace();

+	}

 

-      // uncomment the sentence you would like to serve as a seed sentence for

-      // content generation for an event description

-      hits = f.generateContentAbout("Albert Einstein"

-      // "Britney Spears - The Femme Fatale Tour"

-      // "Rush Time Machine",

-      // "Blue Man Group" ,

-      // "Belly Dance With Zaharah",

-      // "Hollander Musicology Lecture: Danielle Fosler-Lussier, Guest Lecturer",

-      // "Jazz Master and arguably the most famous jazz musician alive, trumpeter Wynton Marsalis",

-          );

-      System.out.println(HitBase.toString(hits));

-      System.out.println(HitBase.toResultantString(hits));

-      // WordFileGenerator.createWordDoc("Essey about Albert Einstein",

-      // hits.get(0).getTitle(), hits);

+}

 

-    } catch (Exception e) {

-      e.printStackTrace();

-    }

 

-  }

 

 }
\ No newline at end of file
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinderML.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinderML.java
new file mode 100644
index 0000000..2cd5a55
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinderML.java
@@ -0,0 +1,295 @@
+/*

+ * Licensed to the Apache Software Foundation (ASF) under one or more

+ * contributor license agreements.  See the NOTICE file distributed with

+ * this work for additional information regarding copyright ownership.

+ * The ASF licenses this file to You under the Apache License, Version 2.0

+ * (the "License"); you may not use this file except in compliance with

+ * the License. You may obtain a copy of the License at

+ *

+ *     http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+

+package opennlp.tools.similarity.apps;

+

+import java.util.ArrayList;

+import java.util.Arrays;

+import java.util.List;

+import java.util.logging.Logger;

+import opennlp.tools.similarity.apps.utils.Utils;

+import opennlp.tools.textsimilarity.TextProcessor;

+

+/*

+ * This class does content generation in ES, DE etc

+ * 

+ */

+

+public class RelatedSentenceFinderML extends RelatedSentenceFinder{

+	private static Logger LOG = Logger

+			.getLogger("opennlp.tools.similarity.apps.RelatedSentenceFinderML");

+

+

+	public RelatedSentenceFinderML(int ms, int msr, float thresh, String key) {

+		this.MAX_STEPS = ms;

+		this.MAX_SEARCH_RESULTS = msr;

+		this.RELEVANCE_THRESHOLD=thresh;

+		yrunner.setKey(key);

+	}

+

+	public RelatedSentenceFinderML() {

+		// TODO Auto-generated constructor stub

+	}

+

+	public List<HitBase> generateContentAbout(String sentence) throws Exception {

+		List<HitBase> opinionSentencesToAdd = new ArrayList<HitBase>();

+		System.out.println(" \n=== Entity to write about = " + sentence);

+		List<String> nounPhraseQueries = new ArrayList<String>();

+

+		List<HitBase> searchResult = yrunner.runSearch(sentence, 100);

+		if (MAX_SEARCH_RESULTS<searchResult.size())

+			searchResult = searchResult.subList(0, MAX_SEARCH_RESULTS);

+		//TODO for shorter run

+		if (searchResult != null) {

+			for (HitBase item : searchResult) { // got some text from .html

+				if (item.getAbstractText() != null

+						&& !(item.getUrl().indexOf(".pdf") > 0)) { // exclude pdf

+					opinionSentencesToAdd

+					.add(augmentWithMinedSentencesAndVerifyRelevance(item,

+							sentence, null));

+				}

+			}

+		}

+

+		opinionSentencesToAdd = removeDuplicatesFromResultantHits(opinionSentencesToAdd);

+		return opinionSentencesToAdd;

+	}

+

+

+	/**

+	 * Takes single search result for an entity which is the subject of the essay

+	 * to be written and forms essay sentences from the title, abstract, and

+	 * possibly original page

+	 * 

+	 * @param HitBase

+	 *          item : search result

+	 * @param originalSentence

+	 *          : seed for the essay to be written

+	 * @param sentsAll

+	 *          : list<String> of other sentences in the seed if it is

+	 *          multi-sentence

+	 * @return search result

+	 */

+

+	public HitBase augmentWithMinedSentencesAndVerifyRelevance(HitBase item,

+			String originalSentence, List<String> sentsAll) {

+		if (sentsAll == null)

+			sentsAll = new ArrayList<String>();

+		// put orig sentence in structure

+		List<String> origs = new ArrayList<String>();

+		origs.add(originalSentence);

+		item.setOriginalSentences(origs);

+		String title = item.getTitle().replace("<b>", " ").replace("</b>", " ")

+				.replace("  ", " ").replace("  ", " ");

+		// generation results for this sentence

+		List<Fragment> result = new ArrayList<Fragment>();

+		// form plain text from snippet

+		String snapshot = item.getAbstractText().replace("<b>", " ")

+				.replace("</b>", " ").replace("  ", " ").replace("  ", " ");

+

+

+		// fix a template expression which can be substituted by original if

+		// relevant

+		String snapshotMarked = snapshot.replace("...",

+				" _should_find_orig_ . _should_find_orig_");

+		String[] fragments = sm.splitSentences(snapshotMarked);

+		List<String> allFragms = new ArrayList<String>();

+		allFragms.addAll(Arrays.asList(fragments));

+

+		String[] sents = null;

+		String downloadedPage = null;

+		try {

+			if (snapshotMarked.length() != snapshot.length()) {

+				downloadedPage = pFetcher.fetchPage(item.getUrl());

+				if (downloadedPage != null && downloadedPage.length() > 100) {

+					item.setPageContent(downloadedPage);

+					String pageContent = Utils.fullStripHTML(item.getPageContent());

+					pageContent = GeneratedSentenceProcessor

+							.normalizeForSentenceSplitting(pageContent);

+					pageContent = pageContent.trim().replaceAll("  [A-Z]", ". $0")// .replace("  ",

+							// ". ")

+							.replace("..", ".").replace(". . .", " ").trim(); // sometimes   html breaks are converted into ' ' (two spaces), so

+					// we need to put '.'

+					sents = sm.splitSentences(pageContent);

+

+					sents = ContentGeneratorSupport.cleanListOfSents(sents);

+				}

+			}

+		} catch (Exception e) {

+			// TODO Auto-generated catch block

+			// e.printStackTrace();

+			System.err

+			.println("Problem downloading  the page and splitting into sentences");

+			return item;

+		}

+

+		for (String fragment : allFragms) {

+			String followSent = null;

+			if (fragment.length() < 50)

+				continue;

+			String pageSentence = "";

+			// try to find original sentence from webpage

+			if (fragment.indexOf("_should_find_orig_") > -1 && sents != null

+					&& sents.length > 0)

+				try { 

+					// first try sorted sentences from page by length approach

+					String[] sentsSortedByLength = extractSentencesFromPage(downloadedPage);

+					String[] mainAndFollowSent = null;

+

+					try {

+						mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(

+								fragment.replace("_should_find_orig_", ""), sentsSortedByLength);

+					} catch (Exception e) {

+						// TODO Auto-generated catch block

+						e.printStackTrace();

+					}

+					// if the above gives null then try to match all sentences from snippet fragment

+					if (mainAndFollowSent==null || mainAndFollowSent[0]==null){

+						mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(

+								fragment.replace("_should_find_orig_", ""), sents);

+					}

+

+

+				} catch (Exception e) {

+

+					// TODO Auto-generated catch block

+					e.printStackTrace();

+				}

+			else

+				// or get original snippet

+				pageSentence = fragment;

+			if (pageSentence != null)

+				pageSentence.replace("_should_find_orig_", "");

+

+			// resultant sentence SHOULD NOT be longer than twice the size of

+			// snippet fragment

+			if (pageSentence != null

+					&& (float) pageSentence.length() / (float) fragment.length() < 4.0) { // was 2.0,

+

+				try { // get score from syntactic match between sentence in

+					// original text and mined sentence

+					double measScore = 0.0, syntScore = 0.0, mentalScore = 0.0;

+

+					syntScore = calculateKeywordScore(pageSentence + " " + title, originalSentence);

+

+

+					if (syntScore < RELEVANCE_THRESHOLD){ // 1.5) { // trying other sents

+						for (String currSent : sentsAll) {

+							if (currSent.startsWith(originalSentence))

+								continue;

+							double syntScoreCurr = calculateKeywordScore(currSent, pageSentence);

+							if (syntScoreCurr > syntScore) {

+								syntScore = syntScoreCurr;

+							}

+						}

+						if (syntScore > RELEVANCE_THRESHOLD) {

+							System.out.println("Got match with other sent: " + syntScore);

+						}

+					}

+

+					measScore = stringDistanceMeasurer.measureStringDistance(

+							originalSentence, pageSentence);

+

+					// now possibly increase score by finding mental verbs

+					// indicating opinions

+					for (String s : MENTAL_VERBS) {

+						if (pageSentence.indexOf(s) > -1) {

+							mentalScore += 0.3;

+							break;

+						}

+					}

+

+					if ((syntScore > RELEVANCE_THRESHOLD || measScore > 0.5 || mentalScore > 0.5)

+							&& measScore < 0.8 && pageSentence.length() > 40) // >70

+					{

+						String pageSentenceProc = GeneratedSentenceProcessor

+								.acceptableMinedSentence(pageSentence);

+						if (pageSentenceProc != null) {

+							pageSentenceProc = GeneratedSentenceProcessor

+									.processSentence(pageSentenceProc);

+							if (followSent != null) {

+								pageSentenceProc += " "

+										+ GeneratedSentenceProcessor.processSentence(followSent);

+							}

+

+							pageSentenceProc = Utils.convertToASCII(pageSentenceProc);

+							Fragment f = new Fragment(pageSentenceProc, syntScore + measScore

+									+ mentalScore + (double) pageSentenceProc.length()

+									/ (double) 50);

+							f.setSourceURL(item.getUrl());

+							f.fragment = fragment;

+							result.add(f);

+							System.out.println("Accepted sentence: " + pageSentenceProc

+									+ "| with title= " + title);

+							System.out.println("For fragment = " + fragment);

+						} else

+							System.out

+							.println("Rejected sentence due to wrong area at webpage: "

+									+ pageSentence);

+					} else

+						System.out.println("Rejected sentence due to low score: "

+								+ pageSentence);

+					// }

+				} catch (Throwable t) {

+					t.printStackTrace();

+				}

+			}

+		}

+		item.setFragments(result);

+		return item;

+	}

+

+	private double calculateKeywordScore(String currSent, String pageSentence) {

+		List<String>  list1 =TextProcessor.fastTokenize(currSent, false);

+		List<String>  list2 =TextProcessor.fastTokenize(pageSentence, false);

+		List<String> overlap1 = new ArrayList<String>(list1);		

+		overlap1.retainAll(list2);

+		return overlap1.size();

+

+	}

+

+

+	public static void main(String[] args) {

+		RelatedSentenceFinderML f = new RelatedSentenceFinderML();

+

+		List<HitBase> hits = null;

+		try {

+			// uncomment the sentence you would like to serve as a seed sentence for

+			// content generation for an event description

+

+			// uncomment the sentence you would like to serve as a seed sentence for

+			// content generation for an event description

+			hits = f.generateContentAbout("Albert Einstein"

+					// "Britney Spears - The Femme Fatale Tour"

+					// "Rush Time Machine",

+					// "Blue Man Group" ,

+					// "Belly Dance With Zaharah",

+					// "Hollander Musicology Lecture: Danielle Fosler-Lussier, Guest Lecturer",

+					// "Jazz Master and arguably the most famous jazz musician alive, trumpeter Wynton Marsalis",

+					);

+			System.out.println(HitBase.toString(hits));

+			System.out.println(HitBase.toResultantString(hits));

+			// WordFileGenerator.createWordDoc("Essay about Albert Einstein",

+			// hits.get(0).getTitle(), hits);

+

+		} catch (Exception e) {

+			e.printStackTrace();

+		}

+

+	}

+

+}
\ No newline at end of file
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SearchResultsProcessor.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SearchResultsProcessor.java
index 128bee2..5c40ce0 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SearchResultsProcessor.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SearchResultsProcessor.java
@@ -26,7 +26,7 @@
 import opennlp.tools.textsimilarity.SentencePairMatchResult;

 import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;

 

-public class SearchResultsProcessor extends BingWebQueryRunner {

+public class SearchResultsProcessor extends BingQueryRunner {

   private static Logger LOG = Logger

       .getLogger("opennlp.tools.similarity.apps.SearchResultsProcessor");

   private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();

@@ -92,7 +92,7 @@
   public List<HitBase> runSearchViaAPI(String query) {

 	List<HitBase> hits = null;

     try {

-      List<HitBase> resultList = runSearch(query, 30);

+      List<HitBase> resultList = runSearch(query);

       // now we apply our own relevance filter

       hits = calculateMatchScoreResortHits(resultList, query);

 

@@ -102,7 +102,6 @@
       return null;

     }

 

-    hits = removeDuplicates(hits, 0.9);

 

     return hits;

   }

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SpeechRecognitionResultsProcessor.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SpeechRecognitionResultsProcessor.java
index 1959d64..2607eb6 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SpeechRecognitionResultsProcessor.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SpeechRecognitionResultsProcessor.java
@@ -101,8 +101,6 @@
     double bestSentScore = -1;

     String bestSent = null;

     for (String sentence : sents) {

-      BingResponse resp = null, // obtained from bing

-      newResp = null; // re-sorted based on similarity

       try {

         List<HitBase> resultList = scraper.runSearch(sentence);

         double scoreForSentence = calculateTotalMatchScoreForHits(resultList,

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/StoryDiscourseNavigator.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/StoryDiscourseNavigator.java
index 1347a2b..b2d2194 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/StoryDiscourseNavigator.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/StoryDiscourseNavigator.java
@@ -17,22 +17,115 @@
 

 package opennlp.tools.similarity.apps;

 

-public class StoryDiscourseNavigator {

-  public static final String[] frequentPerformingVerbs = {

-      " born raised meet learn ", " graduated enter discover",

-      " facts inventions life ", "accomplishments childhood timeline",

-      " acquire befriend encounter", " achieve reache describe ",

-      " invent innovate improve ", " impress outstanding award",

-      " curous sceptical pessimistic", " spend enroll assume point",

-      " explain discuss dispute", " learn teach study investigate",

-      " propose suggest indicate", " pioneer explorer discoverer ",

-      " advance promote lead", " direct control simulate ",

-      " guide lead assist ", " inspire first initial",

-      " vision predict foresee", " prediction inspiration achievement",

-      " approve agree confirm", " deny argue disagree",

-      " emotional loud imagination", " release announce celebrate discover",

-      "introduce enjoy follow", " open present show",

-      "meet enjoy follow create", "discover continue produce"

+import java.util.ArrayList;

+import java.util.Arrays;

+import java.util.Collection;

+import java.util.HashSet;

+import java.util.List;

 

-  };

+import opennlp.tools.similarity.apps.utils.StringCleaner;

+import opennlp.tools.stemmer.PorterStemmer;

+import opennlp.tools.textsimilarity.ParseTreeChunk;

+import opennlp.tools.textsimilarity.SentencePairMatchResult;

+import opennlp.tools.textsimilarity.TextProcessor;

+import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;

+

+public class StoryDiscourseNavigator {

+	protected BingQueryRunner yrunner = new BingQueryRunner();

+	ParserChunker2MatcherProcessor sm = ParserChunker2MatcherProcessor

+			.getInstance();

+	private PorterStemmer ps = new PorterStemmer();

+

+	public static final String[] frequentPerformingVerbs = {

+		" born raised meet learn ", " graduated enter discover",

+		" facts inventions life ", "accomplishments childhood timeline",

+		" acquire befriend encounter", " achieve reache describe ",

+		" invent innovate improve ", " impress outstanding award",

+		" curous sceptical pessimistic", " spend enroll assume point",

+		" explain discuss dispute", " learn teach study investigate",

+		" propose suggest indicate", " pioneer explorer discoverer ",

+		" advance promote lead", " direct control simulate ",

+		" guide lead assist ", " inspire first initial",

+		" vision predict foresee", " prediction inspiration achievement",

+		" approve agree confirm", " deny argue disagree",

+		" emotional loud imagination", " release announce celebrate discover",

+		"introduce enjoy follow", " open present show",

+		"meet enjoy follow create", "discover continue produce"

+

+	};

+

+	public String[] obtainAdditionalKeywordsForAnEntity(String entity){

+		List<List<ParseTreeChunk>> matchList = runSearchForTaxonomyPath(

+				entity, "", "en", 30);

+		Collection<String> keywordsToRemove = TextProcessor.fastTokenize(entity.toLowerCase(), false);

+		List<List<String>> resList = getCommonWordsFromList_List_ParseTreeChunk(matchList);

+		String[] res = new String[resList.size()];

+		int i=0;

+		for(List<String> phrase: resList){

+			phrase.removeAll(keywordsToRemove);

+			String keywords = phrase.toString().replace('[', ' ').replace(']', ' ').replace(',',' ');

+			res[i] = keywords;

+			i++;

+		}

+		return res;

+	}

+

+	public List<List<ParseTreeChunk>> runSearchForTaxonomyPath(String query,

+			String domain, String lang, int numbOfHits) {

+		List<List<ParseTreeChunk>> genResult = new ArrayList<List<ParseTreeChunk>>();

+		try {

+			List<HitBase> resultList = yrunner.runSearch(query, numbOfHits);

+

+			for (int i = 0; i < resultList.size(); i++) {

+				{

+					for (int j = i + 1; j < resultList.size(); j++) {

+						HitBase h1 = resultList.get(i);

+						HitBase h2 = resultList.get(j);

+						String snapshot1 = StringCleaner.processSnapshotForMatching(h1

+								.getTitle() + " . " + h1.getAbstractText());

+						String snapshot2 = StringCleaner.processSnapshotForMatching(h2

+								.getTitle() + " . " + h2.getAbstractText());

+						SentencePairMatchResult matchRes = sm.assessRelevance(snapshot1,

+								snapshot2);

+						List<List<ParseTreeChunk>> matchResult = matchRes.getMatchResult();

+						genResult.addAll(matchResult);

+					}

+				}

+			}

+

+		} catch (Exception e) {

+			System.err.print("Problem extracting taxonomy node");

+		}

+

+		return genResult;

+	}

+	private List<List<String>> getCommonWordsFromList_List_ParseTreeChunk(

+			List<List<ParseTreeChunk>> matchList) {

+		List<List<String>> res = new ArrayList<List<String>>();

+		for (List<ParseTreeChunk> chunks : matchList) {

+			List<String> wordRes = new ArrayList<String>();

+			for (ParseTreeChunk ch : chunks) {

+				List<String> lemmas = ch.getLemmas();

+				for (int w = 0; w < lemmas.size(); w++)

+					if ((!lemmas.get(w).equals("*"))

+							&& ((ch.getPOSs().get(w).startsWith("NN") || ch.getPOSs().get(w)

+									.startsWith("VB"))) && lemmas.get(w).length() > 2) {

+						String formedWord = lemmas.get(w);

+						String stemmedFormedWord = ps.stem(formedWord);

+						if (!stemmedFormedWord.startsWith("invalid"))

+							wordRes.add(formedWord);

+					}

+			}

+			wordRes = new ArrayList<String>(new HashSet<String>(wordRes));	   

+			if (wordRes.size() > 0) {

+				res.add(wordRes);

+			}

+		}

+		res = new ArrayList<List<String>>(new HashSet<List<String>>(res));

+		return res;

+	}

+	public static void main(String[] args){

+		String[] res = new StoryDiscourseNavigator().obtainAdditionalKeywordsForAnEntity("Albert Einstein");

+		System.out.println(Arrays.asList(res));

+	}

 }

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooAnswersMiner.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooAnswersMiner.java
new file mode 100644
index 0000000..50db87c
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/YahooAnswersMiner.java
@@ -0,0 +1,110 @@
+/*

+ * Licensed to the Apache Software Foundation (ASF) under one or more

+ * contributor license agreements.  See the NOTICE file distributed with

+ * this work for additional information regarding copyright ownership.

+ * The ASF licenses this file to You under the Apache License, Version 2.0

+ * (the "License"); you may not use this file except in compliance with

+ * the License. You may obtain a copy of the License at

+ *

+ *     http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+

+package opennlp.tools.similarity.apps;

+

+import java.util.ArrayList;

+import java.util.List;

+import java.util.logging.Logger;

+

+import opennlp.tools.jsmlearning.ProfileReaderWriter;

+import opennlp.tools.parse_thicket.Triple;

+

+import net.billylieurance.azuresearch.AzureSearchImageQuery;

+import net.billylieurance.azuresearch.AzureSearchImageResult;

+import net.billylieurance.azuresearch.AzureSearchResultSet;

+import net.billylieurance.azuresearch.AzureSearchWebQuery;

+import net.billylieurance.azuresearch.AzureSearchWebResult;

+

+public class YahooAnswersMiner extends BingQueryRunner{

+

+	private static final Logger LOG = Logger

+			.getLogger("opennlp.tools.similarity.apps.YahooAnswersMiner");

+	private int page = 0;

+	private static final int hitsPerPage = 50;

+

+	public List<HitBase> runSearch(String query) {

+		aq.setAppid(BING_KEY);

+		aq.setQuery("site:answers.yahoo.com "+

+				query);		

+		aq.setPerPage(hitsPerPage);

+		aq.setPage(page);

+

+		aq.doQuery();

+		List<HitBase> results = new ArrayList<HitBase> ();

+		AzureSearchResultSet<AzureSearchWebResult> ars = aq.getQueryResult();

+

+		for (AzureSearchWebResult anr : ars){

+			HitBase h = new HitBase();

+			h.setAbstractText(anr.getDescription());

+			h.setTitle(anr.getTitle());

+			h.setUrl(anr.getUrl());

+			results.add(h);

+		}

+		page++;

+

+		return results;

+	}

+

+

+	public List<HitBase> runSearch(String query, int totalPages) {

+		int count=0;

+		List<HitBase> results = new ArrayList<HitBase>();

+		while(totalPages>page*hitsPerPage){

+			List<HitBase> res = runSearch(query);

+			results.addAll(res);

+			if (count>10)

+				break;

+			count++;

+		}

+

+		return results;

+	}

+

+

+	public static void main(String[] args) {

+		YahooAnswersMiner self = new YahooAnswersMiner();

+		RelatedSentenceFinder extractor = new RelatedSentenceFinder();

+		String topic = "obamacare";

+

+		List<HitBase> resp = self

+				.runSearch(topic, 150);

+		System.out.print(resp.get(0));

+		List<String[]> data = new ArrayList<String[]>();

+

+

+		for(HitBase item: resp){	      

+			Triple<List<String>, String, String[]> fragmentExtractionResults = 

+					extractor.formCandidateFragmentsForPage(item, topic, null);

+

+			List<String> allFragms = (List<String>)fragmentExtractionResults.getFirst();

+			String downloadedPage = (String)fragmentExtractionResults.getSecond();

+			String[] sents = (String[])fragmentExtractionResults.getThird();

+

+			for (String fragment : allFragms) {

+				String[] candidateSentences = extractor.formCandidateSentences(fragment, fragmentExtractionResults);

+				System.out.println(candidateSentences);

+				data.add(candidateSentences);

+			}

+			

+		}

+

+		ProfileReaderWriter.writeReport(data, "multi_sentence_queries.csv");

+

+	}

+

+}

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/ContentGeneratorRequestHandler.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/ContentGeneratorRequestHandler.java
new file mode 100644
index 0000000..0e8d743
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/ContentGeneratorRequestHandler.java
@@ -0,0 +1,223 @@
+package opennlp.tools.similarity.apps.solr;

+

+import java.io.BufferedReader;

+import java.io.File;

+import java.io.IOException;

+import java.io.InputStream;

+import java.io.InputStreamReader;

+import java.util.ArrayList;

+import java.util.Collections;

+import java.util.Comparator;

+import java.util.HashMap;

+import java.util.HashSet;

+import java.util.Iterator;

+import java.util.LinkedList;

+import java.util.List;

+import java.util.Map;

+import java.util.Set;

+import java.util.logging.Logger;

+

+import javax.mail.internet.AddressException;

+import javax.mail.internet.InternetAddress;

+

+import opennlp.tools.similarity.apps.HitBase;

+import opennlp.tools.similarity.apps.RelatedSentenceFinder;

+import opennlp.tools.similarity.apps.RelatedSentenceFinderML;

+import opennlp.tools.similarity.apps.utils.Pair;

+import opennlp.tools.textsimilarity.ParseTreeChunk;

+import opennlp.tools.textsimilarity.ParseTreeChunkListScorer;

+import opennlp.tools.textsimilarity.SentencePairMatchResult;

+import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;

+

+import org.apache.commons.lang.ArrayUtils;

+import org.apache.commons.lang.StringUtils;

+import org.apache.lucene.document.Document;

+import org.apache.lucene.index.CorruptIndexException;

+import org.apache.lucene.index.IndexReader;

+import org.apache.lucene.queryparser.classic.ParseException;

+import org.apache.lucene.search.BooleanClause.Occur;

+import org.apache.lucene.search.BooleanQuery;

+import org.apache.lucene.search.CachingWrapperFilter;

+import org.apache.lucene.search.Collector;

+import org.apache.lucene.search.Filter;

+import org.apache.lucene.search.Query;

+import org.apache.lucene.search.QueryWrapperFilter;

+import org.apache.lucene.search.ScoreDoc;

+import org.apache.solr.common.SolrDocument;

+import org.apache.solr.common.SolrDocumentList;

+import org.apache.solr.common.SolrException;

+import org.apache.solr.common.params.CommonParams;

+import org.apache.solr.common.params.ModifiableSolrParams;

+import org.apache.solr.common.params.ShardParams;

+import org.apache.solr.common.params.SolrParams;

+import org.apache.solr.common.util.NamedList;

+import org.apache.solr.handler.component.SearchHandler;

+import org.apache.solr.request.SolrQueryRequest;

+import org.apache.solr.response.SolrQueryResponse;

+

+

+

+public class ContentGeneratorRequestHandler extends SearchHandler {

+	private static Logger LOG = Logger

+			.getLogger("com.become.search.requestHandlers.SearchResultsReRankerRequestHandler");

+	private ParserChunker2MatcherProcessor sm = null;

+

+

+	public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp){

+

+		String query = req.getParams().get("q");

+		LOG.info(query);

+

+		String[] runCommand = new String[12], runInternal = new String[8];

+		runCommand[0] = "java";

+		runCommand[1] = "-Xmx1g";

+		runCommand[2] = "-jar";

+		runCommand[3] = "pt.jar";

+		runCommand[4] = "\""+query+"\"";

+		runCommand[5] = req.getParams().get("email");

+		runCommand[6] = req.getParams().get("resourceDir");

+		runCommand[7] = req.getParams().get("stepsNum");

+		runCommand[8] = req.getParams().get("searchResultsNum");

+		runCommand[9] = req.getParams().get("relevanceThreshold");

+		runCommand[10] = req.getParams().get("lang");

+		runCommand[11] = req.getParams().get("bingKey");

+

+		for(int i= 0; i<8; i++){

+			runInternal[i] = runCommand[i+4];

+		}

+		String resultText = null;

+		try {

+			resultText = cgRunner(runInternal);

+		} catch (Exception e1) {

+			

+/*

+		Runtime r = Runtime.getRuntime();

+		Process mStartProcess = null;

+		String workDir = req.getParams().get("workDir"); 

+		if (workDir == null)

+			System.err.println("workDir = null");

+

+		try {

+			mStartProcess = r.exec(runCommand, null, new File(workDir));

+		} catch (IOException e) {

+			// TODO Auto-generated catch block

+			e.printStackTrace();

+		}

+

+		StreamLogger outputGobbler = new StreamLogger(mStartProcess.getInputStream());

+		outputGobbler.start();

+		}

+*/

+		}

+		

+		NamedList<Object> values = rsp.getValues();

+		values.remove("response");

+		values.add("response", "We completed your request to write an essay on '"+query+"' and sent you an email at "+ runCommand[5]);

+		values.add("text", resultText);

+		rsp.setAllValues(values);

+

+	}

+

+

+	class StreamLogger extends Thread{

+

+		private InputStream mInputStream;

+

+		public StreamLogger(InputStream is) {

+			this.mInputStream = is;

+		}

+

+		public void run() {

+			try {

+				InputStreamReader isr = new InputStreamReader(mInputStream);

+				BufferedReader br = new BufferedReader(isr);

+				String line = null;

+				while ((line = br.readLine()) != null) {

+					System.out.println(line);

+				}

+			} catch (IOException ioe) {

+				ioe.printStackTrace();

+			}

+		}

+	}

+

+	public String cgRunner(String[] args) {

+		ParserChunker2MatcherProcessor sm = null;

+		int count=0; 

+		for(String a: args){

+			System.out.print(count+" >> " + a);

+			count++;

+		}

+		

+

+		try {

+			String resourceDir = args[2];

+			if (resourceDir!=null)

+				sm = ParserChunker2MatcherProcessor.getInstance(resourceDir);

+			else

+				sm = ParserChunker2MatcherProcessor.getInstance();

+

+		} catch (Exception e) {

+			// TODO Auto-generated catch block

+			e.printStackTrace();

+		}

+

+		String bingKey = args[7];

+		if (bingKey == null){

+			bingKey = //"e8ADxIjn9YyHx36EihdjH/tMqJJItUrrbPTUpKahiU0=";

+					"xdnRVcVf9m4vDvW1SkTAz5kS5DFYa19CrPYGelGJxnc";

+		}

+

+		RelatedSentenceFinder f = null;

+		String lang = args[6];

+		if (lang.startsWith("es")){

+			f = new RelatedSentenceFinderML(Integer.parseInt(args[3]), Integer.parseInt(args[4]), Float.parseFloat(args[5]), bingKey);

+			f.setLang(lang);

+		} else	    

+

+			if (args.length>4 && args[4]!=null)

+				f = new RelatedSentenceFinder(Integer.parseInt(args[3]), Integer.parseInt(args[4]), Float.parseFloat(args[5]), bingKey);

+			else

+				f = new RelatedSentenceFinder();

+		String generatedContent = null;

+		List<HitBase> hits = null;

+		try {

+

+			hits = f.generateContentAbout(args[0].replace('+', ' ').replace('"', ' ').trim());

+			System.out.println(HitBase.toString(hits));

+			generatedContent = HitBase.toResultantString(hits);

+

+			opennlp.tools.apps.utils.email.EmailSender s = new opennlp.tools.apps.utils.email.EmailSender();

+

+			try {

+				s.sendMail("smtp.rambler.ru", "bg7550@rambler.ru", "pill0693", new InternetAddress("bg7550@rambler.ru"), new InternetAddress[]{new InternetAddress(args[1])}, new InternetAddress[]{}, new InternetAddress[]{}, 

+						"Generated content for you on '"+args[0].replace('+', ' ')+"'", generatedContent, null);

+			} catch (AddressException e) {

+				// TODO Auto-generated catch block

+				e.printStackTrace();

+			} catch (Exception e) {

+

+				e.printStackTrace();

+				try {

+					s.sendMail("smtp.rambler.ru", "bg7550@rambler.ru", "pill0693", new InternetAddress("bg7550@rambler.ru"), new InternetAddress[]{new InternetAddress(args[1])}, new InternetAddress[]{}, new InternetAddress[]{}, 

+							"Generated content for you on '"+args[0].replace('+', ' ')+"'", generatedContent, null);

+				} catch (Exception e1) {

+					// TODO Auto-generated catch block

+					e1.printStackTrace();

+				}

+			}

+

+

+		} catch (Exception e) {

+			e.printStackTrace();

+		}

+		return generatedContent;

+	}

+

+	

+}

+

+/*

+http://173.255.254.250:8983/solr/contentgen/?q=human+body+anatomy&email=bgalitsky@hotmail.com&resourceDir=/home/solr/solr-4.4.0/example/src/test/resources&workDir=/home/solr/solr-4.4.0/example/solr-webapp/webapp/WEB-INF/lib&stepsNum=20&searchResultsNum=10&relevanceThreshold=1.5

+

+ */
\ No newline at end of file
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/IterativeQueryComponent.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/IterativeQueryComponent.java
index 68be1a6..14dc9ff 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/IterativeQueryComponent.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/IterativeQueryComponent.java
@@ -1,19 +1,3 @@
-/*

- * Licensed to the Apache Software Foundation (ASF) under one or more

- * contributor license agreements.  See the NOTICE file distributed with

- * this work for additional information regarding copyright ownership.

- * The ASF licenses this file to You under the Apache License, Version 2.0

- * (the "License"); you may not use this file except in compliance with

- * the License. You may obtain a copy of the License at

- *

- *     http://www.apache.org/licenses/LICENSE-2.0

- *

- * Unless required by applicable law or agreed to in writing, software

- * distributed under the License is distributed on an "AS IS" BASIS,

- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

- * See the License for the specific language governing permissions and

- * limitations under the License.

- */

 package opennlp.tools.similarity.apps.solr;

 

 import java.io.IOException;

@@ -21,7 +5,6 @@
 import java.util.List;

 

 import org.apache.commons.lang.StringUtils;

-

 import org.apache.lucene.queryparser.classic.ParseException;

 import org.apache.lucene.search.BooleanQuery;

 import org.apache.lucene.search.Query;

@@ -35,13 +18,11 @@
 import org.apache.solr.response.ResultContext;

 import org.apache.solr.response.SolrQueryResponse;

 import org.apache.solr.search.DocList;

-

 import org.apache.solr.search.QParser;

 import org.apache.solr.search.QParserPlugin;

 import org.apache.solr.search.QueryParsing;

 

 

-

 public class IterativeQueryComponent extends QueryComponent{

 	public static final String COMPONENT_NAME = "iterative_query";

 	public static final String[] fieldSequence = new String[]{"cat", "name", "content", "author"}; 

@@ -75,6 +56,36 @@
 				}

 			}

 		}

+/*

+		nameValuePairs = rb.rsp.getValues();

+		c = (ResultContext) nameValuePairs.get("response");

+		if (c!=null){

+			DocList dList = c.docs;

+			if (dList.size()<1){

+				nameValuePairs.remove("response");

+				rb.rsp.setAllValues(nameValuePairs);

+				rb = substituteField(rb, fieldSequence[2] );

+				super.process(rb);

+			}

+			else {

+				return;

+			}

+		}

+		nameValuePairs = rb.rsp.getValues();

+		c = (ResultContext) nameValuePairs.get("response");

+		if (c!=null){

+			DocList dList = c.docs;

+			if (dList.size()<1){

+				nameValuePairs.remove("response");

+				rb.rsp.setAllValues(nameValuePairs);

+				rb = substituteField(rb, fieldSequence[3] );

+				super.process(rb);

+			}

+			else {

+				return;

+			}

+		}

+*/

 	}

 

 	private ResponseBuilder substituteField(ResponseBuilder rb, String newFieldName) {

@@ -90,6 +101,7 @@
 		rb.req.setParams(params);

 		rb.setQueryString(query);

 

+

 		String defType = params.get(QueryParsing.DEFTYPE,QParserPlugin.DEFAULT_QTYPE);

 

 		// get it from the response builder to give a different component a chance

@@ -101,21 +113,71 @@
 			rb.setQueryString(queryString);

 		}

 

+		QParser parser = null;

 		try {

-			QParser parser = QParser.getParser(rb.getQueryString(), defType, rb.req);

-			Query q = parser.getQuery();

-			if (q == null) {

-				// normalize a null query to a query that matches nothing

-				q = new BooleanQuery();        

-			}

-			rb.setQuery( q );

-			rb.setSortSpec( parser.getSort(true) );

-			rb.setQparser(parser);

-			rb.setScoreDoc(parser.getPaging());

-

-		} catch (ParseException e) {

-			throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);

+			parser = QParser.getParser(rb.getQueryString(), defType, rb.req);

+		} catch (Exception e) {

+			// TODO Auto-generated catch block

+			e.printStackTrace();

 		}

+		Query q = null;

+		try {

+			q = parser.getQuery();

+		} catch (Exception e) {

+			// TODO Auto-generated catch block

+			e.printStackTrace();

+		}

+		if (q == null) {

+			// normalize a null query to a query that matches nothing

+			q = new BooleanQuery();        

+		}

+		rb.setQuery( q );

+		try {

+			rb.setSortSpec( parser.getSort(true) );

+		} catch (Exception e) {

+			// TODO Auto-generated catch block

+			e.printStackTrace();

+		}

+		rb.setQparser(parser);

+		try {

+			rb.setScoreDoc(parser.getPaging());

+		} catch (Exception e) {

+			// TODO Auto-generated catch block

+			e.printStackTrace();

+		}

+

+		String[] fqs = rb.req.getParams().getParams(CommonParams.FQ);

+		if (fqs!=null && fqs.length!=0) {

+			List<Query> filters = rb.getFilters();

+			if (filters==null) {

+				filters = new ArrayList<Query>(fqs.length);

+			}

+			for (String fq : fqs) {

+				if (fq != null && fq.trim().length()!=0) {

+					QParser fqp = null;

+					try {

+						fqp = QParser.getParser(fq, null, rb.req);

+					} catch (Exception e) {

+						// TODO Auto-generated catch block

+						e.printStackTrace();

+					}

+					try {

+						filters.add(fqp.getQuery());

+					} catch (Exception e) {

+						// TODO Auto-generated catch block

+						e.printStackTrace();

+					}

+				}

+			}

+			// only set the filters if they are not empty otherwise

+			// fq=&someotherParam= will trigger all docs filter for every request 

+			// if filter cache is disabled

+			if (!filters.isEmpty()) {

+				rb.setFilters( filters );

+			}

+		}

+

+

 		return rb;

 	}

 

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/IterativeSearchRequestHandler.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/IterativeSearchRequestHandler.java
new file mode 100644
index 0000000..87f5ed9
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/IterativeSearchRequestHandler.java
@@ -0,0 +1,366 @@
+package opennlp.tools.similarity.apps.solr;

+

+import java.io.IOException;

+import java.util.ArrayList;

+import java.util.Collections;

+import java.util.Comparator;

+import java.util.HashMap;

+import java.util.HashSet;

+import java.util.Iterator;

+import java.util.LinkedList;

+import java.util.List;

+import java.util.Map;

+import java.util.Set;

+

+import opennlp.tools.similarity.apps.HitBaseComparable;

+import opennlp.tools.similarity.apps.utils.Pair;

+import opennlp.tools.textsimilarity.ParseTreeChunkListScorer;

+import opennlp.tools.textsimilarity.SentencePairMatchResult;

+import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;

+

+import org.apache.commons.lang.ArrayUtils;

+import org.apache.commons.lang.StringUtils;

+import org.apache.lucene.document.Document;

+import org.apache.lucene.index.CorruptIndexException;

+import org.apache.lucene.index.IndexReader;

+import org.apache.lucene.queryparser.classic.ParseException;

+import org.apache.lucene.search.BooleanClause.Occur;

+import org.apache.lucene.search.BooleanQuery;

+import org.apache.lucene.search.CachingWrapperFilter;

+import org.apache.lucene.search.Collector;

+import org.apache.lucene.search.Filter;

+import org.apache.lucene.search.Query;

+import org.apache.lucene.search.QueryWrapperFilter;

+import org.apache.lucene.search.ScoreDoc;

+import org.apache.solr.common.SolrDocument;

+import org.apache.solr.common.SolrDocumentList;

+import org.apache.solr.common.SolrException;

+import org.apache.solr.common.params.CommonParams;

+import org.apache.solr.common.params.ModifiableSolrParams;

+import org.apache.solr.common.params.ShardParams;

+import org.apache.solr.common.params.SolrParams;

+import org.apache.solr.common.util.NamedList;

+import org.apache.solr.handler.RequestHandlerBase;

+import org.apache.solr.handler.component.ResponseBuilder;

+import org.apache.solr.handler.component.SearchComponent;

+import org.apache.solr.handler.component.SearchHandler;

+import org.apache.solr.handler.component.ShardHandler;

+import org.apache.solr.handler.component.ShardRequest;

+import org.apache.solr.handler.component.ShardResponse;

+import org.apache.solr.request.SolrQueryRequest;

+import org.apache.solr.response.ResultContext;

+import org.apache.solr.response.SolrQueryResponse;

+import org.apache.solr.schema.SchemaField;

+import org.apache.solr.search.DocIterator;

+import org.apache.solr.search.DocList;

+import org.apache.solr.search.DocSlice;

+import org.apache.solr.search.QParser;

+import org.apache.solr.search.SolrIndexSearcher;

+import org.apache.solr.util.RTimer;

+import org.apache.solr.util.SolrPluginUtils;

+

+public class IterativeSearchRequestHandler extends SearchHandler {

+

+	private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();

+

+	public SolrQueryResponse runSearchIteration(SolrQueryRequest req, SolrQueryResponse rsp, String fieldToTry){

+		try {

+			req = substituteField(req, fieldToTry);

+			super.handleRequestBody(req, rsp);

+		} catch (Exception e) {

+			// TODO Auto-generated catch block

+			e.printStackTrace();

+		}

+		return rsp;

+	}

+

+	public static SolrQueryRequest substituteField(SolrQueryRequest req, String newFieldName){

+		SolrParams params = req.getParams();

+		String query = params.get("q");

+		String currField = StringUtils.substringBetween(" "+query, " ", ":");

+		if ( currField !=null && newFieldName!=null)

+			query = query.replace(currField, newFieldName);

+		NamedList values = params.toNamedList();

+		values.remove("q");

+		values.add("q", query);

+		params = SolrParams.toSolrParams(values);

+		req.setParams(params);

+		return req;

+

+	}

+

+	public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp){

+         

+		SolrQueryResponse rsp1 = new SolrQueryResponse(), rsp2=new SolrQueryResponse(), rsp3=new SolrQueryResponse();

+		NamedList list = rsp.getValues();

+		rsp1.setAllValues(rsp.getValues().clone());

+		rsp2.setAllValues(rsp.getValues().clone());

+		rsp3.setAllValues(rsp.getValues().clone());

+		

+		

+		rsp1 = runSearchIteration(req, rsp1, "cat");

+		NamedList values = rsp1.getValues();

+		ResultContext c = (ResultContext) values.get("response");

+		if (c!=null){			

+			DocList dList = c.docs;

+			if (dList.size()<1){

+				rsp2 = runSearchIteration(req, rsp2, "name");

+			}

+			else {

+				rsp.setAllValues(rsp1.getValues());

+				return;

+			}

+		}

+

+		values = rsp2.getValues();

+		c = (ResultContext) values.get("response");

+		if (c!=null){

+			DocList dList = c.docs;

+			if (dList.size()<1){

+				rsp3 = runSearchIteration(req, rsp3, "content");

+			}

+			else {

+				rsp.setAllValues(rsp2.getValues());

+				return;

+			}

+		}

+		

+		rsp.setAllValues(rsp3.getValues());

+

+	}

+

+	

+

+

+

+public DocList filterResultsBySyntMatchReduceDocSet(DocList docList,

+		SolrQueryRequest req,  SolrParams params) {		

+	//if (!docList.hasScores()) 

+	//	return docList;

+

+	int len = docList.size();

+	if (len < 1) // do nothing

+		return docList;

+	ParserChunker2MatcherProcessor pos = ParserChunker2MatcherProcessor .getInstance();

+

+	DocIterator iter = docList.iterator();

+	float[] syntMatchScoreArr = new float[len];

+	String requestExpression = req.getParamString();

+	String[] exprParts = requestExpression.split("&");

+	for(String part: exprParts){

+		if (part.startsWith("q="))

+			requestExpression = part;			

+	}

+	String fieldNameQuery = StringUtils.substringBetween(requestExpression, "=", ":");

+	// extract phrase query (in double-quotes)

+	String[] queryParts = requestExpression.split("\"");

+	if  (queryParts.length>=2 && queryParts[1].length()>5)

+		requestExpression = queryParts[1].replace('+', ' ');	

+	else if (requestExpression.indexOf(":") > -1 ) {// still field-based expression

+		requestExpression = requestExpression.replaceAll(fieldNameQuery+":", "").replace('+',' ').replaceAll("  ", " ").replace("q=", "");

+	}

+

+	if (fieldNameQuery ==null)

+		return docList;

+	if (requestExpression==null || requestExpression.length()<5  || requestExpression.split(" ").length<3)

+		return docList;

+	int[] docIDsHits = new int[len]; 

+

+	IndexReader indexReader = req.getSearcher().getIndexReader();

+	List<Integer> bestMatchesDocIds = new ArrayList<Integer>(); List<Float> bestMatchesScore = new ArrayList<Float>();

+	List<Pair<Integer, Float>> docIdsScores = new ArrayList<Pair<Integer, Float>> ();

+	try {

+		for (int i=0; i<docList.size(); ++i) {

+			int docId = iter.nextDoc();

+			docIDsHits[i] = docId;

+			Document doc = indexReader.document(docId);

+

+			// get text for event

+			String answerText = doc.get(fieldNameQuery);

+			if (answerText==null)

+				continue;

+			SentencePairMatchResult matchResult = pos.assessRelevance( requestExpression , answerText);

+			float syntMatchScore =  new Double(parseTreeChunkListScorer.getParseTreeChunkListScore(matchResult.getMatchResult())).floatValue();

+			bestMatchesDocIds.add(docId);

+			bestMatchesScore.add(syntMatchScore);

+			syntMatchScoreArr[i] = (float)syntMatchScore; //*iter.score();

+			System.out.println(" Matched query = '"+requestExpression + "' with answer = '"+answerText +"' | doc_id = '"+docId);

+			System.out.println(" Match result = '"+matchResult.getMatchResult() + "' with score = '"+syntMatchScore +"';" );

+			docIdsScores.add(new Pair(docId, syntMatchScore));

+		}

+

+	} catch (CorruptIndexException e1) {

+		// TODO Auto-generated catch block

+		e1.printStackTrace();

+		//log.severe("Corrupt index"+e1);

+	} catch (IOException e1) {

+		// TODO Auto-generated catch block

+		e1.printStackTrace();

+		//log.severe("File read IO / index"+e1);

+	}

+

+

+	Collections.sort(docIdsScores, new PairComparable());

+	for(int i = 0; i<docIdsScores.size(); i++){

+		bestMatchesDocIds.set(i, docIdsScores.get(i).getFirst());

+		bestMatchesScore.set(i, docIdsScores.get(i).getSecond());

+	}

+	System.out.println(bestMatchesScore);

+	float maxScore = docList.maxScore(); // do not change

+	int limit = docIdsScores.size();

+	int start = 0; 

+	DocSlice ds = null;

+

+	ds = new DocSlice(start, limit, 

+			ArrayUtils.toPrimitive(bestMatchesDocIds.toArray(new Integer[0])), 

+			ArrayUtils.toPrimitive(bestMatchesScore.toArray(new Float[0])), 

+			bestMatchesDocIds.size(), maxScore);

+

+

+

+	return ds;

+}

+

+

+public void handleRequestBody1(SolrQueryRequest req, SolrQueryResponse rsp)

+throws Exception {

+

+	// extract params from request

+	SolrParams params = req.getParams();

+	String q = params.get(CommonParams.Q);

+	String[] fqs = params.getParams(CommonParams.FQ);

+	int start = 0;

+	try { start = Integer.parseInt(params.get(CommonParams.START)); } 

+	catch (Exception e) { /* default */ }

+	int rows = 0;

+	try { rows = Integer.parseInt(params.get(CommonParams.ROWS)); } 

+	catch (Exception e) { /* default */ }

+	//SolrPluginUtils.setReturnFields(req, rsp);

+

+	// build initial data structures

+

+	SolrDocumentList results = new SolrDocumentList();

+	SolrIndexSearcher searcher = req.getSearcher();

+	Map<String,SchemaField> fields = req.getSchema().getFields();

+	int ndocs = start + rows;

+	Filter filter = buildFilter(fqs, req);

+	Set<Integer> alreadyFound = new HashSet<Integer>();

+

+	// invoke the various sub-handlers in turn and return results

+	doSearch1(results, searcher, q, filter, ndocs, req, 

+			fields, alreadyFound);

+

+	// ... more sub-handler calls here ...

+

+	// build and write response

+	float maxScore = 0.0F;

+	int numFound = 0;

+	List<SolrDocument> slice = new ArrayList<SolrDocument>();

+	for (Iterator<SolrDocument> it = results.iterator(); it.hasNext(); ) {

+		SolrDocument sdoc = it.next();

+		Float score = (Float) sdoc.getFieldValue("score");

+		if (maxScore < score) {

+			maxScore = score;

+		}

+		if (numFound >= start && numFound < start + rows) {

+			slice.add(sdoc);

+		}

+		numFound++;

+	}

+	results.clear();

+	results.addAll(slice);

+	results.setNumFound(numFound);

+	results.setMaxScore(maxScore);

+	results.setStart(start);

+	rsp.add("response", results);

+

+}

+

+

+private Filter buildFilter(String[] fqs, SolrQueryRequest req) 

+throws IOException, ParseException {

+	if (fqs != null && fqs.length > 0) {

+		BooleanQuery fquery = new BooleanQuery();

+		for (int i = 0; i < fqs.length; i++) {

+			QParser parser = null;

+			try {

+				parser = QParser.getParser(fqs[i], null, req);

+			} catch (Exception e) {

+				// TODO Auto-generated catch block

+				e.printStackTrace();

+			}

+			try {

+				fquery.add(parser.getQuery(), Occur.MUST);

+			} catch (Exception e) {

+				// TODO Auto-generated catch block

+				e.printStackTrace();

+			}

+		}

+		return new CachingWrapperFilter(new QueryWrapperFilter(fquery));

+	}

+	return null;

+}

+

+private void doSearch1(SolrDocumentList results,

+		SolrIndexSearcher searcher, String q, Filter filter, 

+		int ndocs, SolrQueryRequest req,

+		Map<String,SchemaField> fields, Set<Integer> alreadyFound) 

+throws IOException {

+

+	// build custom query and extra fields

+	Query query = null; //buildCustomQuery1(q);

+	Map<String,Object> extraFields = new HashMap<String,Object>();

+	extraFields.put("search_type", "search1");

+	boolean includeScore = 

+		req.getParams().get(CommonParams.FL).contains("score");

+

+	int  maxDocsPerSearcherType = 0;

+	float maprelScoreCutoff = 2.0f;

+	append(results, searcher.search(

+			query, filter, maxDocsPerSearcherType).scoreDocs,

+			alreadyFound, fields, extraFields, maprelScoreCutoff , 

+			searcher.getIndexReader(), includeScore);

+}

+

+// ... more doSearchXXX() calls here ...

+

+private void append(SolrDocumentList results, ScoreDoc[] more, 

+		Set<Integer> alreadyFound, Map<String,SchemaField> fields,

+		Map<String,Object> extraFields, float scoreCutoff, 

+		IndexReader reader, boolean includeScore) throws IOException {

+	for (ScoreDoc hit : more) {

+		if (alreadyFound.contains(hit.doc)) {

+			continue;

+		}

+		Document doc = reader.document(hit.doc);

+		SolrDocument sdoc = new SolrDocument();

+		for (String fieldname : fields.keySet()) {

+			SchemaField sf = fields.get(fieldname);

+			if (sf.stored()) {

+				sdoc.addField(fieldname, doc.get(fieldname));

+			}

+		}

+		for (String extraField : extraFields.keySet()) {

+			sdoc.addField(extraField, extraFields.get(extraField));

+		}

+		if (includeScore) {

+			sdoc.addField("score", hit.score);

+		}

+		results.add(sdoc);

+		alreadyFound.add(hit.doc);

+	}

+}

+public class PairComparable implements Comparator<Pair> {

+	// @Override

+	public int compare(Pair o1, Pair o2) {

+		int b = -2;

+		if ( o1.getSecond() instanceof Float && o2.getSecond() instanceof Float){

+

+			b =  (((Float)o1.getSecond()> (Float)o2.getSecond()) ? -1

+					: (((Float)o1.getSecond() == (Float)o2.getSecond()) ? 0 : 1));

+		}

+		return b;

+	}

+}

+

+}

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/NLProgram2CodeRequestHandler.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/NLProgram2CodeRequestHandler.java
new file mode 100644
index 0000000..0876700
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/NLProgram2CodeRequestHandler.java
@@ -0,0 +1,119 @@
+package opennlp.tools.similarity.apps.solr;

+

+import java.io.IOException;

+import java.util.ArrayList;

+import java.util.Collections;

+import java.util.Comparator;

+import java.util.HashMap;

+import java.util.HashSet;

+import java.util.Iterator;

+import java.util.LinkedList;

+import java.util.List;

+import java.util.Map;

+import java.util.Set;

+import java.util.logging.Logger;

+

+import opennlp.tools.nl2code.NL2Obj;

+import opennlp.tools.nl2code.NL2ObjCreateAssign;

+import opennlp.tools.nl2code.ObjectPhraseListForSentence;

+import opennlp.tools.similarity.apps.HitBase;

+import opennlp.tools.similarity.apps.utils.Pair;

+import opennlp.tools.textsimilarity.ParseTreeChunk;

+import opennlp.tools.textsimilarity.ParseTreeChunkListScorer;

+import opennlp.tools.textsimilarity.SentencePairMatchResult;

+import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;

+

+import org.apache.commons.lang.ArrayUtils;

+import org.apache.commons.lang.StringUtils;

+import org.apache.lucene.document.Document;

+import org.apache.lucene.index.CorruptIndexException;

+import org.apache.lucene.index.IndexReader;

+import org.apache.lucene.queryparser.classic.ParseException;

+import org.apache.lucene.search.BooleanClause.Occur;

+import org.apache.lucene.search.BooleanQuery;

+import org.apache.lucene.search.CachingWrapperFilter;

+import org.apache.lucene.search.Collector;

+import org.apache.lucene.search.Filter;

+import org.apache.lucene.search.Query;

+import org.apache.lucene.search.QueryWrapperFilter;

+import org.apache.lucene.search.ScoreDoc;

+import org.apache.solr.common.SolrDocument;

+import org.apache.solr.common.SolrDocumentList;

+import org.apache.solr.common.SolrException;

+import org.apache.solr.common.params.CommonParams;

+import org.apache.solr.common.params.ModifiableSolrParams;

+import org.apache.solr.common.params.ShardParams;

+import org.apache.solr.common.params.SolrParams;

+import org.apache.solr.common.util.NamedList;

+import org.apache.solr.handler.component.SearchHandler;

+import org.apache.solr.request.SolrQueryRequest;

+import org.apache.solr.response.SolrQueryResponse;

+

+

+

+public class NLProgram2CodeRequestHandler extends SearchHandler {

+	private static Logger LOG = Logger

+			.getLogger("opennlp.tools.similarity.apps.solr.NLProgram2CodeRequestHandler");

+	private final static int MAX_SEARCH_RESULTS = 100;

+	private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();

+	private ParserChunker2MatcherProcessor sm = null;

+	private int MAX_QUERY_LENGTH_NOT_TO_RERANK=3;

+	private static String resourceDir = "/home/solr/solr-4.4.0/example/src/test/resources";

+	//"C:/workspace/TestSolr/src/test/resources";

+

+	//"/data1/solr/example/src/test/resources";

+	

+	NL2Obj compiler = new NL2ObjCreateAssign(resourceDir);

+

+	public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp){

+		// get query string

+		String requestExpression = req.getParamString();

+		String[] exprParts = requestExpression.split("&");

+		String[] text = new String[exprParts.length];

+			int count=0;

+			for(String val : exprParts){

+				if (val.startsWith("line=")){

+					val = StringUtils.mid(val, 5, val.length());

+					text[count] = val;

+					count++;

+				}

+

+			}

+		

+

+			StringBuffer buf = new StringBuffer();

+		    for(String sent:text){

+		      ObjectPhraseListForSentence opls=null;

+		      try {

+		        opls = compiler.convertSentenceToControlObjectPhrase(sent);

+		      } catch (Exception e) {

+		        e.printStackTrace();

+		      }

+		      System.out.println(sent+"\n"+opls+"\n");

+		      buf.append(sent+"\n |=> "+opls+"\n");

+		    }

+		

+		

+		LOG.info("re-ranking results: "+buf.toString());

+		NamedList<Object> values = rsp.getValues();

+		values.remove("response");

+		values.add("response", buf.toString().trim());

+		rsp.setAllValues(values);

+		

+	}

+

+	

+

+}

+

+/*

+

+http://dev1.exava.us:8086/solr/collection1/reranker/?q=search_keywords:I+want+style+in+my+every+day+fresh+design+iphone+cases

+&t1=Personalized+iPhone+4+Cases&d1=spend+a+day+with+a+custom+iPhone+case

+&t2=iPhone+Cases+to+spend+a+day&d2=Add+style+to+your+every+day+fresh+design+with+a+custom+iPhone+case

+&t3=Plastic+iPhone+Cases&d3=Add+style+to+your+every+day+with+mobile+case+for+your+family

+&t4=Personalized+iPhone+and+iPad+Cases&d4=Add+style+to+your+iPhone+and+iPad

+&t5=iPhone+accessories+from+Apple&d5=Add+Apple+fashion+to+your+iPhone+and+iPad

+

+http://dev1.exava.us:8086/solr/collection1/reranker/?q=search_keywords:I+want+style+in+my+every+day+fresh+design+iphone+cases&t1=Personalized+iPhone+4+Cases&d1=spend+a+day+with+a+custom+iPhone+case&t2=iPhone+Cases+to+spend+a+day&d2=Add+style+to+your+every+day+fresh+design+with+a+custom+iPhone+case&t3=Plastic+iPhone+Cases&d3=Add+style+to+your+every+day+with+mobile+case+for+your+family&t4=Personalized+iPhone+and+iPad+Cases&d4=Add+style+to+your+iPhone+and+iPad&t5=iPhone+accessories+from+Apple&d5=Add+Apple+fashion+to+your+iPhone+and+iPad

+ */
\ No newline at end of file
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SearchResultsReRankerRequestHandler.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SearchResultsReRankerRequestHandler.java
new file mode 100644
index 0000000..fbef398
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SearchResultsReRankerRequestHandler.java
@@ -0,0 +1,245 @@
+package opennlp.tools.similarity.apps.solr;

+

+import java.io.IOException;

+import java.util.ArrayList;

+import java.util.Collections;

+import java.util.Comparator;

+import java.util.HashMap;

+import java.util.HashSet;

+import java.util.Iterator;

+import java.util.LinkedList;

+import java.util.List;

+import java.util.Map;

+import java.util.Set;

+import java.util.logging.Logger;

+

+import opennlp.tools.similarity.apps.HitBase;

+import opennlp.tools.similarity.apps.utils.Pair;

+import opennlp.tools.textsimilarity.ParseTreeChunk;

+import opennlp.tools.textsimilarity.ParseTreeChunkListScorer;

+import opennlp.tools.textsimilarity.SentencePairMatchResult;

+import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;

+

+import org.apache.commons.lang.ArrayUtils;

+import org.apache.commons.lang.StringUtils;

+import org.apache.lucene.document.Document;

+import org.apache.lucene.index.CorruptIndexException;

+import org.apache.lucene.index.IndexReader;

+import org.apache.lucene.queryparser.classic.ParseException;

+import org.apache.lucene.search.BooleanClause.Occur;

+import org.apache.lucene.search.BooleanQuery;

+import org.apache.lucene.search.CachingWrapperFilter;

+import org.apache.lucene.search.Collector;

+import org.apache.lucene.search.Filter;

+import org.apache.lucene.search.Query;

+import org.apache.lucene.search.QueryWrapperFilter;

+import org.apache.lucene.search.ScoreDoc;

+import org.apache.solr.common.SolrDocument;

+import org.apache.solr.common.SolrDocumentList;

+import org.apache.solr.common.SolrException;

+import org.apache.solr.common.params.CommonParams;

+import org.apache.solr.common.params.ModifiableSolrParams;

+import org.apache.solr.common.params.ShardParams;

+import org.apache.solr.common.params.SolrParams;

+import org.apache.solr.common.util.NamedList;

+import org.apache.solr.handler.component.SearchHandler;

+import org.apache.solr.request.SolrQueryRequest;

+import org.apache.solr.response.SolrQueryResponse;

+

+

+

+public class SearchResultsReRankerRequestHandler extends SearchHandler {

+	private static Logger LOG = Logger

+			.getLogger("com.become.search.requestHandlers.SearchResultsReRankerRequestHandler");

+	private final static int MAX_SEARCH_RESULTS = 100;

+	private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();

+	private ParserChunker2MatcherProcessor sm = null;

+	private int MAX_QUERY_LENGTH_NOT_TO_RERANK=3;

+	private static String resourceDir = "/home/solr/solr-4.4.0/example/src/test/resources";

+	//"C:/workspace/TestSolr/src/test/resources";

+

+	//"/data1/solr/example/src/test/resources";

+

+	public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp){

+		// get query string

+		String requestExpression = req.getParamString();

+		String[] exprParts = requestExpression.split("&");

+		for(String part: exprParts){

+			if (part.startsWith("q="))

+				requestExpression = part;			

+		}

+		String query = StringUtils.substringAfter(requestExpression, ":");

+		LOG.info(requestExpression);

+

+

+		SolrParams ps = req.getOriginalParams();

+		Iterator<String> iter =  ps.getParameterNamesIterator();

+		List<String> keys = new ArrayList<String>();

+		while(iter.hasNext()){

+			keys.add(iter.next());

+		}

+

+		List<HitBase> searchResults = new ArrayList<HitBase>();

+

+

+

+

+

+		for ( Integer i=0; i< MAX_SEARCH_RESULTS; i++){

+			String title = req.getParams().get("t"+i.toString());

+			String descr = req.getParams().get("d"+i.toString());

+

+			if(title==null || descr==null)

+				continue;

+

+			HitBase hit = new HitBase();

+			hit.setTitle(title);

+			hit.setAbstractText(descr);

+			hit.setSource(i.toString());

+			searchResults.add(hit);

+		}

+

+		/*

+		 * http://173.255.254.250:8983/solr/collection1/reranker/?

+		 * q=search_keywords:design+iphone+cases&fields=spend+a+day+with+a+custom+iPhone+case&fields=Add+style+to+your+every+day+fresh+design+with+a+custom+iPhone+case&fields=Add+style+to+your+every+day+with+mobile+case+for+your+family&fields=Add+style+to+your+iPhone+and+iPad&fields=Add+Apple+fashion+to+your+iPhone+and+iPad

+		 * 

+		 */

+

+		if (searchResults.size()<1) {

+			int count=0;

+			for(String val : exprParts){

+				if (val.startsWith("fields=")){

+					val = StringUtils.mid(val, 7, val.length());

+					HitBase hit = new HitBase();

+					hit.setTitle("");

+					hit.setAbstractText(val);

+					hit.setSource(new Integer(count).toString());

+					searchResults.add(hit);

+					count++;

+				}

+

+			}

+		}

+

+

+		List<HitBase> reRankedResults = null;

+		query = query.replace('+', ' ');

+		if (tooFewKeywords(query)|| orQuery(query)){

+			reRankedResults = searchResults;

+			LOG.info("No re-ranking for "+query);

+		}

+		else 

+			reRankedResults = calculateMatchScoreResortHits(searchResults, query);

+		/*

+		 * <scores>

+<score index="2">3.0005</score>

+<score index="1">2.101</score>

+<score index="3">2.1003333333333334</score>

+<score index="4">2.00025</score>

+<score index="5">1.1002</score>

+</scores>

+		 * 

+		 * 

+		 */

+		StringBuffer buf = new StringBuffer(); 

+		buf.append("<scores>");

+		for(HitBase hit: reRankedResults){

+			buf.append("<score index=\""+hit.getSource()+"\">"+hit.getGenerWithQueryScore()+"</score>");				

+		}

+		buf.append("</scores>");

+

+		NamedList<Object> scoreNum = new NamedList<Object>();

+		for(HitBase hit: reRankedResults){

+			scoreNum.add(hit.getSource(), hit.getGenerWithQueryScore());				

+		}

+		

+		StringBuffer bufNums = new StringBuffer(); 

+		bufNums.append("order>");

+		for(HitBase hit: reRankedResults){

+			bufNums.append(hit.getSource()+"_");				

+		}

+		bufNums.append("/order>");

+		

+		LOG.info("re-ranking results: "+buf.toString());

+		NamedList<Object> values = rsp.getValues();

+		values.remove("response");

+		values.add("response", scoreNum); 

+		values.add("new_order", bufNums.toString().trim());

+		rsp.setAllValues(values);

+		

+	}

+

+	private boolean orQuery(String query) {

+		if (query.indexOf('|')>-1)

+			return true;

+

+		return false;

+	}

+

+	private boolean tooFewKeywords(String query) {

+		String[] parts = query.split(" ");

+		if (parts!=null && parts.length< MAX_QUERY_LENGTH_NOT_TO_RERANK)

+			return true;

+

+		return false;

+	}

+

+	private List<HitBase> calculateMatchScoreResortHits(List<HitBase> hits,

+			String searchQuery) {

+		try {

+			System.out.println("loading openNLP models...from "+resourceDir);

+			sm =  ParserChunker2MatcherProcessor.getInstance(resourceDir);

+			System.out.println("DONE loading openNLP model s.");

+		} catch (Exception e){

+			LOG.severe(e.getMessage());

+		}

+		List<HitBase> newHitList = new ArrayList<HitBase>();

+

+

+		int count=1;

+		for (HitBase hit : hits) {

+			String snapshot = hit.getAbstractText();

+			snapshot += " . " + hit.getTitle();

+			Double score = 0.0;

+			try {

+				SentencePairMatchResult matchRes = sm.assessRelevance(snapshot,

+						searchQuery);

+				List<List<ParseTreeChunk>> match = matchRes.getMatchResult(); // we need the second member

+				// so that when scores are the same, original order is maintained

+				score = parseTreeChunkListScorer.getParseTreeChunkListScore(match)+0.001/(double)count;

+			} catch (Exception e) {

+				LOG.info(e.getMessage());

+				e.printStackTrace();

+			}

+			hit.setGenerWithQueryScore(score);

+			newHitList.add(hit);

+			count++;

+		}

+		Collections.sort(newHitList, new HitBaseComparable());

+		LOG.info(newHitList.toString());

+

+		return newHitList;

+	}

+

+

+	public class HitBaseComparable implements Comparator<HitBase> {

+		// @Override

+		public int compare(HitBase o1, HitBase o2) {

+			return (o1.getGenerWithQueryScore() > o2.getGenerWithQueryScore() ? -1

+					: (o1 == o2 ? 0 : 1));

+		}

+	}

+

+}

+

+/*

+

+http://dev1.exava.us:8086/solr/collection1/reranker/?q=search_keywords:I+want+style+in+my+every+day+fresh+design+iphone+cases

+&t1=Personalized+iPhone+4+Cases&d1=spend+a+day+with+a+custom+iPhone+case

+&t2=iPhone+Cases+to+spend+a+day&d2=Add+style+to+your+every+day+fresh+design+with+a+custom+iPhone+case

+&t3=Plastic+iPhone+Cases&d3=Add+style+to+your+every+day+with+mobile+case+for+your+family

+&t4=Personalized+iPhone+and+iPad+Cases&d4=Add+style+to+your+iPhone+and+iPad

+&t5=iPhone+accessories+from+Apple&d5=Add+Apple+fashion+to+your+iPhone+and+iPad

+

+http://dev1.exava.us:8086/solr/collection1/reranker/?q=search_keywords:I+want+style+in+my+every+day+fresh+design+iphone+cases&t1=Personalized+iPhone+4+Cases&d1=spend+a+day+with+a+custom+iPhone+case&t2=iPhone+Cases+to+spend+a+day&d2=Add+style+to+your+every+day+fresh+design+with+a+custom+iPhone+case&t3=Plastic+iPhone+Cases&d3=Add+style+to+your+every+day+with+mobile+case+for+your+family&t4=Personalized+iPhone+and+iPad+Cases&d4=Add+style+to+your+iPhone+and+iPad&t5=iPhone+accessories+from+Apple&d5=Add+Apple+fashion+to+your+iPhone+and+iPad

+ */
\ No newline at end of file
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SyntGenRequestHandler.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SyntGenRequestHandler.java
index 57a45b8..b2d6295 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SyntGenRequestHandler.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SyntGenRequestHandler.java
@@ -1,26 +1,18 @@
-/*

- * Licensed to the Apache Software Foundation (ASF) under one or more

- * contributor license agreements.  See the NOTICE file distributed with

- * this work for additional information regarding copyright ownership.

- * The ASF licenses this file to You under the Apache License, Version 2.0

- * (the "License"); you may not use this file except in compliance with

- * the License. You may obtain a copy of the License at

- *

- *     http://www.apache.org/licenses/LICENSE-2.0

- *

- * Unless required by applicable law or agreed to in writing, software

- * distributed under the License is distributed on an "AS IS" BASIS,

- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

- * See the License for the specific language governing permissions and

- * limitations under the License.

- */

 package opennlp.tools.similarity.apps.solr;

 

 import java.io.IOException;

 import java.util.ArrayList;

 import java.util.Collections;

 import java.util.Comparator;

+import java.util.HashMap;

+import java.util.HashSet;

+import java.util.Iterator;

+import java.util.LinkedList;

 import java.util.List;

+import java.util.Map;

+import java.util.Set;

+

+import opennlp.tools.similarity.apps.HitBaseComparable;

 import opennlp.tools.similarity.apps.utils.Pair;

 import opennlp.tools.textsimilarity.ParseTreeChunkListScorer;

 import opennlp.tools.textsimilarity.SentencePairMatchResult;

@@ -31,138 +23,309 @@
 import org.apache.lucene.document.Document;

 import org.apache.lucene.index.CorruptIndexException;

 import org.apache.lucene.index.IndexReader;

+import org.apache.lucene.queryparser.classic.ParseException;

+import org.apache.lucene.search.BooleanClause.Occur;

+import org.apache.lucene.search.BooleanQuery;

+import org.apache.lucene.search.CachingWrapperFilter;

+import org.apache.lucene.search.Collector;

+import org.apache.lucene.search.Filter;

+import org.apache.lucene.search.Query;

+import org.apache.lucene.search.QueryWrapperFilter;

+import org.apache.lucene.search.ScoreDoc;

+import org.apache.solr.common.SolrDocument;

+import org.apache.solr.common.SolrDocumentList;

+import org.apache.solr.common.SolrException;

+import org.apache.solr.common.params.CommonParams;

+import org.apache.solr.common.params.ModifiableSolrParams;

+import org.apache.solr.common.params.ShardParams;

 import org.apache.solr.common.params.SolrParams;

 import org.apache.solr.common.util.NamedList;

+import org.apache.solr.handler.RequestHandlerBase;

+import org.apache.solr.handler.component.ResponseBuilder;

+import org.apache.solr.handler.component.SearchComponent;

 import org.apache.solr.handler.component.SearchHandler;

+import org.apache.solr.handler.component.ShardHandler;

+import org.apache.solr.handler.component.ShardRequest;

+import org.apache.solr.handler.component.ShardResponse;

 import org.apache.solr.request.SolrQueryRequest;

 import org.apache.solr.response.ResultContext;

 import org.apache.solr.response.SolrQueryResponse;

+import org.apache.solr.schema.SchemaField;

 import org.apache.solr.search.DocIterator;

 import org.apache.solr.search.DocList;

 import org.apache.solr.search.DocSlice;

+import org.apache.solr.search.QParser;

+import org.apache.solr.search.SolrIndexSearcher;

+

+import org.apache.solr.util.RTimer;

+import org.apache.solr.util.SolrPluginUtils;

 

 public class SyntGenRequestHandler extends SearchHandler {

 

 	private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();

 

 	public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp){

-      try {

-          super.handleRequestBody(req, rsp);

-      } catch (Exception e) {

-          // TODO Auto-generated catch block

-          e.printStackTrace();

-      }

-      //modify rsp

-      NamedList values = rsp.getValues();

-      ResultContext c = (ResultContext) values.get("response");

-      if (c==null)

-          return;

-

-      DocList dList = c.docs;

-      DocList dListResult=null;

-      try {

-          dListResult = filterResultsBySyntMatchReduceDocSet(dList,

-                  req,  req.getParams());

-      } catch (Exception e) {

-          dListResult = dList;

-          // TODO Auto-generated catch block

-          e.printStackTrace();

-      }

-      c.docs = dListResult;

-      values.remove("response");

-      values.add("response", c.docs);

-      rsp.setAllValues(values);

-  }

+		try {

+			super.handleRequestBody(req, rsp);

+		} catch (Exception e) {

+			// Keep serving: fall through with whatever the base handler produced.

+			e.printStackTrace();

+		}

+		

+		// NOTE(review): debug scaffolding removed.  The original printed every

+		// request parameter name to stdout and copied values.getName(0..4) and

+		// the "t1" value into dead locals; values.getName(4) throws

+		// IndexOutOfBoundsException when the NamedList has fewer than five

+		// entries, failing the whole request for no benefit.

+		

+		// Re-rank the docs the base handler stored under "response".

+		NamedList values = rsp.getValues();

+		ResultContext c = (ResultContext) values.get("response");

+		if (c==null)

+			return;

+		

+		DocList dList = c.docs;

+		DocList dListResult=null;

+		try {

+			dListResult = filterResultsBySyntMatchReduceDocSet(dList,

+					req,  req.getParams());

+		} catch (Exception e) {

+			// On re-ranking failure fall back to the original ordering.

+			dListResult = dList;

+			e.printStackTrace();

+		}

+		c.docs = dListResult;

+		

+		// Replace the "response" entry with the re-ranked doc list.  Removing

+		// it without re-adding (as this revision did) silently dropped the

+		// results from the serialized output.

+		values.remove("response");

+		values.add("response", c.docs);

+		// TODO(review): confirm clients expect the DocList here rather than

+		// the ResultContext the base handler originally stored (pre-existing

+		// behaviour of the old code, preserved here).

+		rsp.setAllValues(values);

+		

+	}

 

 

-  public DocList filterResultsBySyntMatchReduceDocSet(DocList docList,

-          SolrQueryRequest req,  SolrParams params) {     

-      //if (!docList.hasScores()) 

-      //  return docList;

+	public DocList filterResultsBySyntMatchReduceDocSet(DocList docList,

+			SolrQueryRequest req,  SolrParams params) {		

+		//if (!docList.hasScores()) 

+		//	return docList;

 

-      int len = docList.size();

-      if (len < 1) // do nothing

-          return docList;

-      ParserChunker2MatcherProcessor pos = ParserChunker2MatcherProcessor .getInstance();

+		int len = docList.size();

+		if (len < 1) // do nothing

+			return docList;

+		ParserChunker2MatcherProcessor pos = ParserChunker2MatcherProcessor .getInstance();

 

-      DocIterator iter = docList.iterator();

-      float[] syntMatchScoreArr = new float[len];

-      String requestExpression = req.getParamString();

-      String[] exprParts = requestExpression.split("&");

-      for(String part: exprParts){

-          if (part.startsWith("q="))

-              requestExpression = part;           

-      }

-      String fieldNameQuery = StringUtils.substringBetween(requestExpression, "=", ":");

-      // extract phrase query (in double-quotes)

-      String[] queryParts = requestExpression.split("\"");

-      if  (queryParts.length>=2 && queryParts[1].length()>5)

-          requestExpression = queryParts[1].replace('+', ' ');    

-      else if (requestExpression.indexOf(":") > -1 ) {// still field-based expression

-          requestExpression = requestExpression.replaceAll(fieldNameQuery+":", "").replace('+',' ').replaceAll("  ", " ").replace("q=", "");

-      }

-      

-      if (fieldNameQuery ==null)

-          return docList;

-      if (requestExpression==null || requestExpression.length()<5  || requestExpression.split(" ").length<3)

-          return docList;

-      int[] docIDsHits = new int[len]; 

+		DocIterator iter = docList.iterator();

+		float[] syntMatchScoreArr = new float[len];

+		String requestExpression = req.getParamString();

+		String[] exprParts = requestExpression.split("&");

+		for(String part: exprParts){

+			if (part.startsWith("q="))

+				requestExpression = part;			

+		}

+		String fieldNameQuery = StringUtils.substringBetween(requestExpression, "=", ":");

+		// extract phrase query (in double-quotes)

+		String[] queryParts = requestExpression.split("\"");

+		if  (queryParts.length>=2 && queryParts[1].length()>5)

+			requestExpression = queryParts[1].replace('+', ' ');	

+		else if (requestExpression.indexOf(":") > -1 ) {// still field-based expression

+			requestExpression = requestExpression.replaceAll(fieldNameQuery+":", "").replace('+',' ').replaceAll("  ", " ").replace("q=", "");

+		}

+		

+		if (fieldNameQuery ==null)

+			return docList;

+		if (requestExpression==null || requestExpression.length()<5  || requestExpression.split(" ").length<3)

+			return docList;

+		int[] docIDsHits = new int[len]; 

 

-      IndexReader indexReader = req.getSearcher().getIndexReader();

-      List<Integer> bestMatchesDocIds = new ArrayList<Integer>(); List<Float> bestMatchesScore = new ArrayList<Float>();

-      List<Pair<Integer, Float>> docIdsScores = new ArrayList<Pair<Integer, Float>> ();

-      try {

-          for (int i=0; i<docList.size(); ++i) {

-              int docId = iter.nextDoc();

-              docIDsHits[i] = docId;

-              Document doc = indexReader.document(docId);

+		IndexReader indexReader = req.getSearcher().getIndexReader();

+		List<Integer> bestMatchesDocIds = new ArrayList<Integer>(); List<Float> bestMatchesScore = new ArrayList<Float>();

+		List<Pair<Integer, Float>> docIdsScores = new ArrayList<Pair<Integer, Float>> ();

+		try {

+			for (int i=0; i<docList.size(); ++i) {

+				int docId = iter.nextDoc();

+				docIDsHits[i] = docId;

+				Document doc = indexReader.document(docId);

 

-              // get text for event

-              String answerText = doc.get(fieldNameQuery);

-              if (answerText==null)

-                  continue;

-              SentencePairMatchResult matchResult = pos.assessRelevance( requestExpression , answerText);

-              float syntMatchScore =  new Double(parseTreeChunkListScorer.getParseTreeChunkListScore(matchResult.getMatchResult())).floatValue();

-              bestMatchesDocIds.add(docId);

-              bestMatchesScore.add(syntMatchScore);

-              syntMatchScoreArr[i] = (float)syntMatchScore; //*iter.score();

-              System.out.println(" Matched query = '"+requestExpression + "' with answer = '"+answerText +"' | doc_id = '"+docId);

-              System.out.println(" Match result = '"+matchResult.getMatchResult() + "' with score = '"+syntMatchScore +"';" );

-              docIdsScores.add(new Pair(docId, syntMatchScore));

-          }

+				// get text for event

+				String answerText = doc.get(fieldNameQuery);

+				if (answerText==null)

+					continue;

+				SentencePairMatchResult matchResult = pos.assessRelevance( requestExpression , answerText);

+				float syntMatchScore =  new Double(parseTreeChunkListScorer.getParseTreeChunkListScore(matchResult.getMatchResult())).floatValue();

+				bestMatchesDocIds.add(docId);

+				bestMatchesScore.add(syntMatchScore);

+				syntMatchScoreArr[i] = (float)syntMatchScore; //*iter.score();

+				System.out.println(" Matched query = '"+requestExpression + "' with answer = '"+answerText +"' | doc_id = '"+docId);

+				System.out.println(" Match result = '"+matchResult.getMatchResult() + "' with score = '"+syntMatchScore +"';" );

+				docIdsScores.add(new Pair(docId, syntMatchScore));

+			}

 

-      } catch (CorruptIndexException e1) {

-          // TODO Auto-generated catch block

-          e1.printStackTrace();

-          //log.severe("Corrupt index"+e1);

-      } catch (IOException e1) {

-          // TODO Auto-generated catch block

-          e1.printStackTrace();

-          //log.severe("File read IO / index"+e1);

-      }

-      

-      

-      Collections.sort(docIdsScores, new PairComparable());

-      for(int i = 0; i<docIdsScores.size(); i++){

-          bestMatchesDocIds.set(i, docIdsScores.get(i).getFirst());

-          bestMatchesScore.set(i, docIdsScores.get(i).getSecond());

-      }

-      System.out.println(bestMatchesScore);

-      float maxScore = docList.maxScore(); // do not change

-      int limit = docIdsScores.size();

-      int start = 0; 

-      DocSlice ds = null;

+		} catch (CorruptIndexException e1) {

+			// TODO Auto-generated catch block

+			e1.printStackTrace();

+			//log.severe("Corrupt index"+e1);

+		} catch (IOException e1) {

+			// TODO Auto-generated catch block

+			e1.printStackTrace();

+			//log.severe("File read IO / index"+e1);

+		}

+		

+		

+		Collections.sort(docIdsScores, new PairComparable());

+		for(int i = 0; i<docIdsScores.size(); i++){

+			bestMatchesDocIds.set(i, docIdsScores.get(i).getFirst());

+			bestMatchesScore.set(i, docIdsScores.get(i).getSecond());

+		}

+		System.out.println(bestMatchesScore);

+		float maxScore = docList.maxScore(); // do not change

+		int limit = docIdsScores.size();

+		int start = 0; 

+		DocSlice ds = null;

 

-      ds = new DocSlice(start, limit, 

-              ArrayUtils.toPrimitive(bestMatchesDocIds.toArray(new Integer[0])), 

-              ArrayUtils.toPrimitive(bestMatchesScore.toArray(new Float[0])), 

-              bestMatchesDocIds.size(), maxScore);

+		ds = new DocSlice(start, limit, 

+				ArrayUtils.toPrimitive(bestMatchesDocIds.toArray(new Integer[0])), 

+				ArrayUtils.toPrimitive(bestMatchesScore.toArray(new Float[0])), 

+				bestMatchesDocIds.size(), maxScore);

 

 

 

-      return ds;

-  }

+		return ds;

+	}

 

+

+	public void handleRequestBody1(SolrQueryRequest req, SolrQueryResponse rsp)

+	throws Exception {

+

+		// extract params from request

+		SolrParams params = req.getParams();

+		String q = params.get(CommonParams.Q);

+		String[] fqs = params.getParams(CommonParams.FQ);

+		int start = 0;

+		try { start = Integer.parseInt(params.get(CommonParams.START)); } 

+		catch (Exception e) { /* default */ }

+		int rows = 0; // NOTE(review): 0 yields an empty slice when ROWS is absent; Solr's usual default is 10 — confirm intent

+		try { rows = Integer.parseInt(params.get(CommonParams.ROWS)); } 

+		catch (Exception e) { /* default */ }

+		//SolrPluginUtils.setReturnFields(req, rsp);

+

+		// build initial data structures

+

+		SolrDocumentList results = new SolrDocumentList();

+		SolrIndexSearcher searcher = req.getSearcher();

+		Map<String,SchemaField> fields = req.getSchema().getFields();

+		int ndocs = start + rows;

+		Filter filter = buildFilter(fqs, req);

+		Set<Integer> alreadyFound = new HashSet<Integer>();

+

+		// invoke the various sub-handlers in turn and return results

+		doSearch1(results, searcher, q, filter, ndocs, req, 

+				fields, alreadyFound);

+

+		// ... more sub-handler calls here ...

+

+		// build and write response

+		float maxScore = 0.0F;

+		int numFound = 0;

+		List<SolrDocument> slice = new ArrayList<SolrDocument>();

+		for (Iterator<SolrDocument> it = results.iterator(); it.hasNext(); ) {

+			SolrDocument sdoc = it.next();

+			Float score = (Float) sdoc.getFieldValue("score");

+			if (score != null && maxScore < score) { // null-safe: unboxing a missing score would NPE

+				maxScore = score;

+			}

+			if (numFound >= start && numFound < start + rows) {

+				slice.add(sdoc);

+			}

+			numFound++;

+		}

+		results.clear();

+		results.addAll(slice);

+		results.setNumFound(numFound);

+		results.setMaxScore(maxScore);

+		results.setStart(start);

+		rsp.add("response", results);

+

+	}

+

+

+	private Filter buildFilter(String[] fqs, SolrQueryRequest req) 

+	throws IOException, ParseException {

+		// Builds a cached Lucene filter from the "fq" request params.

+		// Returns null when no filter queries are supplied.

+		if (fqs != null && fqs.length > 0) {

+			BooleanQuery fquery = new BooleanQuery();

+			for (int i = 0; i < fqs.length; i++) {

+				try {

+					// Parse and add in a single guarded step: the original split

+					// try blocks could dereference a null parser after a parse

+					// failure, throwing NullPointerException.

+					QParser parser = QParser.getParser(fqs[i], null, req);

+					fquery.add(parser.getQuery(), Occur.MUST);

+				} catch (Exception e) {

+					// Best effort: skip the unparsable clause, keep the rest.

+					e.printStackTrace();

+				}

+			}

+			return new CachingWrapperFilter(new QueryWrapperFilter(fquery));

+		}

+		// No fq clauses -> no filtering.

+		return null;

+	}

+

+	private void doSearch1(SolrDocumentList results,

+			SolrIndexSearcher searcher, String q, Filter filter, 

+			int ndocs, SolrQueryRequest req,

+			Map<String,SchemaField> fields, Set<Integer> alreadyFound) 

+	throws IOException {

+

+		// build custom query and extra fields

+		Query query = null; //buildCustomQuery1(q); TODO(review): search() will NPE until this is implemented

+		Map<String,Object> extraFields = new HashMap<String,Object>();

+		extraFields.put("search_type", "search1");

+		String fl = req.getParams().get(CommonParams.FL); // may be absent -> null

+		boolean includeScore = fl != null && fl.contains("score");

+

+		int  maxDocsPerSearcherType = 0;

+		float maprelScoreCutoff = 2.0f;

+		append(results, searcher.search(

+				query, filter, maxDocsPerSearcherType).scoreDocs,

+				alreadyFound, fields, extraFields, maprelScoreCutoff , 

+				searcher.getIndexReader(), includeScore);

+	}

+

+	// ... more doSearchXXX() calls here ...

+

+	private void append(SolrDocumentList results, ScoreDoc[] more, 

+			Set<Integer> alreadyFound, Map<String,SchemaField> fields,

+			Map<String,Object> extraFields, float scoreCutoff, 

+			IndexReader reader, boolean includeScore) throws IOException {

+		for (ScoreDoc hit : more) {

+			if (alreadyFound.contains(hit.doc)) {

+				continue; // de-duplicate across sub-searches

+			}

+			Document doc = reader.document(hit.doc);

+			SolrDocument sdoc = new SolrDocument();

+			for (Map.Entry<String,SchemaField> field : fields.entrySet()) {

+				SchemaField sf = field.getValue(); // entrySet: one lookup, not two

+				if (sf.stored()) {

+					sdoc.addField(field.getKey(), doc.get(field.getKey()));

+				}

+			}

+			for (Map.Entry<String,Object> extra : extraFields.entrySet()) {

+				sdoc.addField(extra.getKey(), extra.getValue());

+			}

+			if (includeScore) {

+				sdoc.addField("score", hit.score);

+			}

+			results.add(sdoc);

+			alreadyFound.add(hit.doc);

+		}

+	}

 	public class PairComparable implements Comparator<Pair> {

 		// @Override

 		public int compare(Pair o1, Pair o2) {

@@ -177,3 +340,9 @@
 	}

 

 }

+

+/*

+ * 

+ * 

+ * http://localhost:8080/solr/syntgen/?q=add-style-to-your-every-day-fresh-design-iphone-cases&t1=Personalized+iPhone+Cases&d1=Add+style+to+your+every+day+with+a+custom+iPhone+case&t2=Personalized+iPhone+Cases&d2=Add+style+to+your+every+day+with+a+custom+iPhone+case&t3=Personalized+iPhone+Cases&d3=Add+style+to+your+every+day+with+a+custom+iPhone+case&t4=Personalized+iPhone+Cases&d4=add+style+to+your+every+day+with+a+custom+iPhone+case

+ * */

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/cgRequestForm.html b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/cgRequestForm.html
new file mode 100644
index 0000000..f2b4e63
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/cgRequestForm.html
@@ -0,0 +1,157 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"

+"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">

+ 

+<html xmlns='http://www.w3.org/1999/xhtml'>

+   <head >

+      <meta http-equiv='Content-Type' content='text/html; charset=utf-8'/>

+      <title >Submit Your Essay Writing request here</title>

+      <script src="http://ajax.googleapis.com/ajax/libs/jquery/1.10.2/jquery.min.js"></script>

+      

+      <link rel="stylesheet" href="http://netdna.bootstrapcdn.com/bootstrap/3.0.3/css/bootstrap.min.css">

+      <link rel="stylesheet" href="http://netdna.bootstrapcdn.com/bootstrap/3.0.3/css/bootstrap-theme.min.css">

+      <script src="http://netdna.bootstrapcdn.com/bootstrap/3.0.3/js/bootstrap.min.js"></script>

+      

+      <link rel="stylesheet" href="ladda/ladda.min.css">

+	<script src="ladda/spin.min.js"></script>

+	<script src="ladda/ladda.min.js"></script>

+	<script src="ladda/spin.min.js"></script>

+      

+      

+      

+      

+      <style type="text/css">

+    .bs-example{

+    	margin: 30px;

+    }

+    

+    .form-horizontal .form-group input#seed {

+    	width:350px;

+	}

+

+    .form-horizontal .form-group input {

+    	width:165px;

+	}

+

+    .form-horizontal .form-group select#lang {

+    width:140px;

+	}

+	

+    .form-horizontal .form-group input#searchResultsNum {

+    width:70px;

+	}

+	

+    .form-horizontal .form-group input#stepsNum {

+    width:70px;

+	}

+	

+	

+	

+body { 

+

+background: #ffffff; /* Old browsers */

+background: -moz-linear-gradient(45deg,  #ffffff 0%, #f6f6f6 47%, #ededed 100%); /* FF3.6+ */

+background: -webkit-gradient(linear, left bottom, right top, color-stop(0%,#ffffff), color-stop(47%,#f6f6f6), color-stop(100%,#ededed)); /* Chrome,Safari4+ */

+background: -webkit-linear-gradient(45deg,  #ffffff 0%,#f6f6f6 47%,#ededed 100%); /* Chrome10+,Safari5.1+ */

+background: -o-linear-gradient(45deg,  #ffffff 0%,#f6f6f6 47%,#ededed 100%); /* Opera 11.10+ */

+background: -ms-linear-gradient(45deg,  #ffffff 0%,#f6f6f6 47%,#ededed 100%); /* IE10+ */

+background: linear-gradient(45deg,  #ffffff 0%,#f6f6f6 47%,#ededed 100%); /* W3C */

+filter: progid:DXImageTransform.Microsoft.gradient( startColorstr='#ffffff', endColorstr='#ededed',GradientType=1 ); /* IE6-9 fallback on horizontal gradient */

+

+}

+

+</style>

+

+<script>

+ 

+

+ 

+</script>

+   </head>

+<body>

+<div class="bs-example">

+<h1>Submit Your Essay Writing request here / Envie su solicitud ensayo escrito aqui</h1>

+It takes 20-30 minutes to write a 40-60 page essay. So don't wait for this form to finish, just open another tab to proceed with browsing.

+<br>Your essay should arrive to your specified email address with a DOCX as attachment.

+Depending on your requirements, you might want to edit DOCX file to remove unwanted parts and make a smoother read.

+References to sources are included: notice how many are used. We compiled this from hundreds of sources on the web. It would probably take you a few days to write something of a similar size. Instead, using this tool it should take you a couple of hours to polish the draft.<br>

+Here are the docs <a href="http://173.255.254.250/written/">ordered by other users</a>

+

+

+

+<form class="form-horizontal" role="form"  id="essayForm"  method="post" action="http://173.255.254.250:8983/solr/contentgen/?resourceDir=/home/solr/solr-4.4.0/example/src/test/resources&workDir=/home/solr/solr-4.4.0/example/solr-webapp/webapp/WEB-INF/lib&relevanceThreshold=0.5&bingKey=e8ADxIjn9YyHx36EihdjH/tMqJJItUrrbPTUpKahiU0=">

+  <div class="form-group">

+    <label for="inputEmail1" class="col-lg-2 control-label">Topic for your essay/Tema de su ensayo: </label>

+    <div class="col-lg-10">

+      <input type="text" class="form-control" id="q" name="q" placeholder="Enter at least 2 words...">

+    </div>

+  </div>

+  <div class="form-group">

+    <label for="inputPassword1" class="col-lg-2 control-label">Email to receive your essay/para recibir su ensayo:</label>

+    <div class="col-lg-10">

+      <input type="email" name="email" class="form-control" id="email" placeholder="">

+    </div>

+  </div>

+  <div class="form-group">

+    <label for="inpuLanguage" class="col-lg-2 control-label">Select language/seleccionar el idioma:</label>

+    <div class="col-lg-10">

+     <select class="form-control" name="lang" id="lang" >

+   		<option value="en-US"> English</option>

+ 		<option value="es-US"> Espaniol</option>

+ 		<option value="de-DE"> German</option>

+	</select>

+    </div>

+  </div>

+ 

+

+  <div class="form-group">

+	   <label for="inputEmail1" class="col-lg-2 control-label">Number of plot elements (use 30 as default for a forty-page doc):  </label>

+	   <div class="col-lg-10">

+	     <input type="number" name="stepsNum" class="form-control" id="stepsNum" value = "30">

+	   </div>    

+   </div>  

+  <div class="form-group">

+	    <label for="inputEmail1" class="col-lg-2 control-label">Number of paragraphs in each plot (use 50 for a forty-page doc):  </label>

+	    <div class="col-lg-10">

+	      <input type="number" name="searchResultsNum" class="form-control" id="searchResultsNum" value = "50">

+	    </div>

+ </div>

+  

+  

+  <div class="form-group">

+    <div class="col-lg-offset-2 col-lg-10">

+	     <a href="#" id="form-submit" class="btn btn-primary btn-lg ladda-button" data-style="expand-right" data-size="l">

+	     	<span class="ladda-label">Submit</span>

+	     </a>

+	</div>

+  </div>

+</form>

+</div>

+<script>

+$(function() {

+	$('#form-submit').click(function(e){

+	 	e.preventDefault();

+	 	var l = Ladda.create(this);

+	 	l.start();

+	 	var data = $('#essayForm').serialize();

+	 	console.log("form-data", data);

+	 	

+	 	$.post("http://173.255.254.250:8983/solr/contentgen/?resourceDir=/home/solr/solr-4.4.0/example/src/test/resources&workDir=/home/solr/solr-4.4.0/example/solr-webapp/webapp/WEB-INF/lib&relevanceThreshold=0.5&bingKey=e8ADxIjn9YyHx36EihdjH/tMqJJItUrrbPTUpKahiU0=&wt=json", 

+	 	   data,

+	 	  function(response){

+	 	    console.log(response);

+	 	    var responseMsg = response.response;

+	 	    

+	 	    alert(responseMsg);

+	 	  }, "json")

+	 	.always(function() { l.stop(); });

+	 	return false;

+	});

+});

+  

+

+</script>	

+	

+	

+</body>

+

+</html>

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/solrconfig.xml b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/solrconfig.xml
new file mode 100644
index 0000000..3a9db3b
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/solrconfig.xml
@@ -0,0 +1,1814 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!-- 
+     For more details about configurations options that may appear in
+     this file, see http://wiki.apache.org/solr/SolrConfigXml. 
+-->
+<config>
+  <!-- In all configuration below, a prefix of "solr." for class names
+       is an alias that causes solr to search appropriate packages,
+       including org.apache.solr.(search|update|request|core|analysis)
+
+       You may also specify a fully qualified Java classname if you
+       have your own custom plugins.
+    -->
+
+  <!-- Controls what version of Lucene various components of Solr
+       adhere to.  Generally, you want to use the latest version to
+       get all bug fixes and improvements. It is highly recommended
+       that you fully re-index after changing this setting as it can
+       affect both how text is indexed and queried.
+  -->
+  <luceneMatchVersion>LUCENE_40</luceneMatchVersion>
+
+  <!-- lib directives can be used to instruct Solr to load an Jars
+       identified and use them to resolve any "plugins" specified in
+       your solrconfig.xml or schema.xml (ie: Analyzers, Request
+       Handlers, etc...).
+
+       All directories and paths are resolved relative to the
+       instanceDir.
+
+       If a "./lib" directory exists in your instanceDir, all files
+       found in it are included as if you had used the following
+       syntax...
+       
+              <lib dir="./lib" />
+    -->
+
+  <!-- A 'dir' option by itself adds any files found in the directory 
+       to the classpath, this is useful for including all jars in a
+       directory.
+    -->
+  <!--
+     <lib dir="../add-everything-found-in-this-dir-to-the-classpath" />
+  -->
+
+  <!-- When a 'regex' is specified in addition to a 'dir', only the
+       files in that directory which completely match the regex
+       (anchored on both ends) will be included.
+    -->
+  <lib dir="../../../dist/" regex="apache-solr-cell-\d.*\.jar" />
+  <lib dir="../../../contrib/extraction/lib" regex=".*\.jar" />
+
+  <lib dir="../../../dist/" regex="apache-solr-clustering-\d.*\.jar" />
+  <lib dir="../../../contrib/clustering/lib/" regex=".*\.jar" />
+
+  <lib dir="../../../dist/" regex="apache-solr-langid-\d.*\.jar" />
+  <lib dir="../../../contrib/langid/lib/" regex=".*\.jar" />
+
+  <lib dir="../../../dist/" regex="apache-solr-velocity-\d.*\.jar" />
+  <lib dir="../../../contrib/velocity/lib" regex=".*\.jar" />
+
+  <!-- If a 'dir' option (with or without a regex) is used and nothing
+       is found that matches, it will be ignored
+    -->
+  <lib dir="/total/crap/dir/ignored" /> 
+
+  <!-- an exact 'path' can be used instead of a 'dir' to specify a 
+       specific file.  This will cause a serious error to be logged if 
+       it can't be loaded.
+    -->
+  <!--
+     <lib path="../a-jar-that-does-not-exist.jar" /> 
+  -->
+  <lib path="/home/solr/solr-4.4.0/example/solr-webapp/webapp/WEB-INF/lib/solrRequestHandlers.jar" />
+  <lib path="/home/solr/solr-4.4.0/example/solr-webapp/webapp/WEB-INF/lib/pt.jar" />
+  
+  
+  
+  <!-- Data Directory
+
+       Used to specify an alternate directory to hold all index data
+       other than the default ./data under the Solr home.  If
+       replication is in use, this should match the replication
+       configuration.
+    -->
+  <dataDir>${solr.data.dir:}</dataDir>
+
+
+  <!-- The DirectoryFactory to use for indexes.
+       
+       solr.StandardDirectoryFactory is filesystem
+       based and tries to pick the best implementation for the current
+       JVM and platform.  solr.NRTCachingDirectoryFactory, the default,
+       wraps solr.StandardDirectoryFactory and caches small files in memory
+       for better NRT performance.
+
+       One can force a particular implementation via solr.MMapDirectoryFactory,
+       solr.NIOFSDirectoryFactory, or solr.SimpleFSDirectoryFactory.
+
+       solr.RAMDirectoryFactory is memory based, not
+       persistent, and doesn't work with replication.
+    -->
+  <directoryFactory name="DirectoryFactory" 
+                    class="${solr.directoryFactory:solr.NRTCachingDirectoryFactory}"/> 
+
+  <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+       Index Config - These settings control low-level behavior of indexing
+       Most example settings here show the default value, but are commented
+       out, to more easily see where customizations have been made.
+       
+       Note: This replaces <indexDefaults> and <mainIndex> from older versions
+       ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->
+  <indexConfig>
+    <!-- maxFieldLength was removed in 4.0. To get similar behavior, include a 
+         LimitTokenCountFilterFactory in your fieldType definition. E.g. 
+     <filter class="solr.LimitTokenCountFilterFactory" maxTokenCount="10000"/>
+    -->
+    <!-- Maximum time to wait for a write lock (ms) for an IndexWriter. Default: 1000 -->
+    <!-- <writeLockTimeout>1000</writeLockTimeout>  -->
+
+    <!-- Expert: Enabling compound file will use less files for the index, 
+         using fewer file descriptors on the expense of performance decrease. 
+         Default in Lucene is "true". Default in Solr is "false" (since 3.6) -->
+    <!-- <useCompoundFile>false</useCompoundFile> -->
+
+    <!-- ramBufferSizeMB sets the amount of RAM that may be used by Lucene
+         indexing for buffering added documents and deletions before they are
+         flushed to the Directory.
+         maxBufferedDocs sets a limit on the number of documents buffered
+         before flushing.
+         If both ramBufferSizeMB and maxBufferedDocs is set, then
+         Lucene will flush based on whichever limit is hit first.  -->
+    <!-- <ramBufferSizeMB>32</ramBufferSizeMB> -->
+    <!-- <maxBufferedDocs>1000</maxBufferedDocs> -->
+
+    <!-- Expert: Merge Policy 
+         The Merge Policy in Lucene controls how merging of segments is done.
+         The default since Solr/Lucene 3.3 is TieredMergePolicy.
+         The default since Lucene 2.3 was the LogByteSizeMergePolicy,
+         Even older versions of Lucene used LogDocMergePolicy.
+      -->
+    <!--
+        <mergePolicy class="org.apache.lucene.index.TieredMergePolicy">
+          <int name="maxMergeAtOnce">10</int>
+          <int name="segmentsPerTier">10</int>
+        </mergePolicy>
+      -->
+       
+    <!-- Merge Factor
+         The merge factor controls how many segments will get merged at a time.
+         For TieredMergePolicy, mergeFactor is a convenience parameter which
+         will set both MaxMergeAtOnce and SegmentsPerTier at once.
+         For LogByteSizeMergePolicy, mergeFactor decides how many new segments
+         will be allowed before they are merged into one.
+         Default is 10 for both merge policies.
+      -->
+    <!-- 
+    <mergeFactor>10</mergeFactor>
+      -->
+
+    <!-- Expert: Merge Scheduler
+         The Merge Scheduler in Lucene controls how merges are
+         performed.  The ConcurrentMergeScheduler (Lucene 2.3 default)
+         can perform merges in the background using separate threads.
+         The SerialMergeScheduler (Lucene 2.2 default) does not.
+     -->
+    <!-- 
+       <mergeScheduler class="org.apache.lucene.index.ConcurrentMergeScheduler"/>
+       -->
+
+    <!-- LockFactory 
+
+         This option specifies which Lucene LockFactory implementation
+         to use.
+      
+         single = SingleInstanceLockFactory - suggested for a
+                  read-only index or when there is no possibility of
+                  another process trying to modify the index.
+         native = NativeFSLockFactory - uses OS native file locking.
+                  Do not use when multiple solr webapps in the same
+                  JVM are attempting to share a single index.
+         simple = SimpleFSLockFactory  - uses a plain file for locking
+
+         Defaults: 'native' is default for Solr3.6 and later, otherwise
+                   'simple' is the default
+
+         More details on the nuances of each LockFactory...
+         http://wiki.apache.org/lucene-java/AvailableLockFactories
+    -->
+    <!-- <lockType>native</lockType> -->
+
+    <!-- Unlock On Startup
+
+         If true, unlock any held write or commit locks on startup.
+         This defeats the locking mechanism that allows multiple
+         processes to safely access a lucene index, and should be used
+         with care. Default is "false".
+
+         This is not needed if lock type is 'none' or 'single'
+     -->
+    <!--
+    <unlockOnStartup>false</unlockOnStartup>
+      -->
+    
+    <!-- Expert: Controls how often Lucene loads terms into memory
+         Default is 128 and is likely good for most everyone.
+      -->
+    <!-- <termIndexInterval>128</termIndexInterval> -->
+
+    <!-- If true, IndexReaders will be reopened (often more efficient)
+         instead of closed and then opened. Default: true
+      -->
+    <!-- 
+    <reopenReaders>true</reopenReaders>
+      -->
+
+    <!-- Commit Deletion Policy
+
+         Custom deletion policies can be specified here. The class must
+         implement org.apache.lucene.index.IndexDeletionPolicy.
+
+         http://lucene.apache.org/java/3_5_0/api/core/org/apache/lucene/index/IndexDeletionPolicy.html
+
+         The default Solr IndexDeletionPolicy implementation supports
+         deleting index commit points on number of commits, age of
+         commit point and optimized status.
+         
+         The latest commit point should always be preserved regardless
+         of the criteria.
+    -->
+    <!-- 
+    <deletionPolicy class="solr.SolrDeletionPolicy">
+    -->
+      <!-- The number of commit points to be kept -->
+      <!-- <str name="maxCommitsToKeep">1</str> -->
+      <!-- The number of optimized commit points to be kept -->
+      <!-- <str name="maxOptimizedCommitsToKeep">0</str> -->
+      <!--
+          Delete all commit points once they have reached the given age.
+          Supports DateMathParser syntax e.g.
+        -->
+      <!--
+         <str name="maxCommitAge">30MINUTES</str>
+         <str name="maxCommitAge">1DAY</str>
+      -->
+    <!-- 
+    </deletionPolicy>
+    -->
+
+    <!-- Lucene Infostream
+       
+         To aid in advanced debugging, Lucene provides an "InfoStream"
+         of detailed information when indexing.
+
+         Setting The value to true will instruct the underlying Lucene
+         IndexWriter to write its debugging info the specified file
+      -->
+     <!-- <infoStream file="INFOSTREAM.txt">false</infoStream> --> 
+  </indexConfig>
+
+
+  <!-- JMX
+       
+       This example enables JMX if and only if an existing MBeanServer
+       is found, use this if you want to configure JMX through JVM
+       parameters. Remove this to disable exposing Solr configuration
+       and statistics to JMX.
+
+       For more details see http://wiki.apache.org/solr/SolrJmx
+    -->
+  <jmx />
+  <!-- If you want to connect to a particular server, specify the
+       agentId 
+    -->
+  <!-- <jmx agentId="myAgent" /> -->
+  <!-- If you want to start a new MBeanServer, specify the serviceUrl -->
+  <!-- <jmx serviceUrl="service:jmx:rmi:///jndi/rmi://localhost:9999/solr"/>
+    -->
+
+  <!-- The default high-performance update handler -->
+  <updateHandler class="solr.DirectUpdateHandler2">
+
+    <!-- AutoCommit
+
+         Perform a hard commit automatically under certain conditions.
+         Instead of enabling autoCommit, consider using "commitWithin"
+         when adding documents. 
+
+         http://wiki.apache.org/solr/UpdateXmlMessages
+
+         maxDocs - Maximum number of documents to add since the last
+                   commit before automatically triggering a new commit.
+
+         maxTime - Maximum amount of time in ms that is allowed to pass
+                   since a document was added before automatically
+                   triggering a new commit. 
+         openSearcher - if false, the commit causes recent index changes
+         to be flushed to stable storage, but does not cause a new
+         searcher to be opened to make those changes visible.
+      -->
+     <autoCommit> 
+       <maxTime>15000</maxTime> 
+       <openSearcher>false</openSearcher> 
+     </autoCommit>
+
+    <!-- softAutoCommit is like autoCommit except it causes a
+         'soft' commit which only ensures that changes are visible
+         but does not ensure that data is synced to disk.  This is
+         faster and more near-realtime friendly than a hard commit.
+      -->
+     <!--
+       <autoSoftCommit> 
+         <maxTime>1000</maxTime> 
+       </autoSoftCommit>
+      -->
+
+    <!-- Update Related Event Listeners
+         
+         Various IndexWriter related events can trigger Listeners to
+         take actions.
+
+         postCommit - fired after every commit or optimize command
+         postOptimize - fired after every optimize command
+      -->
+    <!-- The RunExecutableListener executes an external command from a
+         hook such as postCommit or postOptimize.
+         
+         exe - the name of the executable to run
+         dir - dir to use as the current working directory. (default=".")
+         wait - the calling thread waits until the executable returns. 
+                (default="true")
+         args - the arguments to pass to the program.  (default is none)
+         env - environment variables to set.  (default is none)
+      -->
+    <!-- This example shows how RunExecutableListener could be used
+         with the script based replication...
+         http://wiki.apache.org/solr/CollectionDistribution
+      -->
+    <!--
+       <listener event="postCommit" class="solr.RunExecutableListener">
+         <str name="exe">solr/bin/snapshooter</str>
+         <str name="dir">.</str>
+         <bool name="wait">true</bool>
+         <arr name="args"> <str>arg1</str> <str>arg2</str> </arr>
+         <arr name="env"> <str>MYVAR=val1</str> </arr>
+       </listener>
+      -->
+
+    <!-- Enables a transaction log, currently used for real-time get.
+         "dir" - the target directory for transaction logs, defaults to the
+            solr data directory.  --> 
+    <updateLog>
+      <str name="dir">${solr.data.dir:}</str>
+    </updateLog>
+   
+
+  </updateHandler>
+  
+  <!-- IndexReaderFactory
+
+       Use the following format to specify a custom IndexReaderFactory,
+       which allows for alternate IndexReader implementations.
+
+       ** Experimental Feature **
+
+       Please note - Using a custom IndexReaderFactory may prevent
+       certain other features from working. The API to
+       IndexReaderFactory may change without warning or may even be
+       removed from future releases if the problems cannot be
+       resolved.
+
+
+       ** Features that may not work with custom IndexReaderFactory **
+
+       The ReplicationHandler assumes a disk-resident index. Using a
+       custom IndexReader implementation may cause incompatibility
+       with ReplicationHandler and may cause replication to not work
+       correctly. See SOLR-1366 for details.
+
+    -->
+  <!--
+  <indexReaderFactory name="IndexReaderFactory" class="package.class">
+    <str name="someArg">Some Value</str>
+  </indexReaderFactory >
+  -->
+  <!-- By explicitly declaring the Factory, the termIndexDivisor can
+       be specified.
+    -->
+  <!--
+     <indexReaderFactory name="IndexReaderFactory" 
+                         class="solr.StandardIndexReaderFactory">
+       <int name="setTermIndexDivisor">12</int>
+     </indexReaderFactory >
+    -->
+
+  <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+       Query section - these settings control query time things like caches
+       ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->
+  <query>
+    <!-- Max Boolean Clauses
+
+         Maximum number of clauses in each BooleanQuery,  an exception
+         is thrown if exceeded.
+
+         ** WARNING **
+         
+         This option actually modifies a global Lucene property that
+         will affect all SolrCores.  If multiple solrconfig.xml files
+         disagree on this property, the value at any given moment will
+         be based on the last SolrCore to be initialized.
+         
+      -->
+    <maxBooleanClauses>1024</maxBooleanClauses>
+
+
+    <!-- Solr Internal Query Caches
+
+         There are two implementations of cache available for Solr,
+         LRUCache, based on a synchronized LinkedHashMap, and
+         FastLRUCache, based on a ConcurrentHashMap.  
+
+         FastLRUCache has faster gets and slower puts in single
+         threaded operation and thus is generally faster than LRUCache
+         when the hit ratio of the cache is high (> 75%), and may be
+         faster under other scenarios on multi-cpu systems.
+    -->
+
+    <!-- Filter Cache
+
+         Cache used by SolrIndexSearcher for filters (DocSets),
+         unordered sets of *all* documents that match a query.  When a
+         new searcher is opened, its caches may be prepopulated or
+         "autowarmed" using data from caches in the old searcher.
+         autowarmCount is the number of items to prepopulate.  For
+         LRUCache, the autowarmed items will be the most recently
+         accessed items.
+
+         Parameters:
+           class - the SolrCache implementation LRUCache or
+               (LRUCache or FastLRUCache)
+           size - the maximum number of entries in the cache
+           initialSize - the initial capacity (number of entries) of
+               the cache.  (see java.util.HashMap)
+           autowarmCount - the number of entries to prepopulate from
+               and old cache.  
+      -->
+    <filterCache class="solr.FastLRUCache"
+                 size="512"
+                 initialSize="512"
+                 autowarmCount="0"/>
+
+    <!-- Query Result Cache
+         
+         Caches results of searches - ordered lists of document ids
+         (DocList) based on a query, a sort, and the range of documents requested.  
+      -->
+    <queryResultCache class="solr.LRUCache"
+                     size="512"
+                     initialSize="512"
+                     autowarmCount="0"/>
+   
+    <!-- Document Cache
+
+         Caches Lucene Document objects (the stored fields for each
+         document).  Since Lucene internal document ids are transient,
+         this cache will not be autowarmed.  
+      -->
+    <documentCache class="solr.LRUCache"
+                   size="512"
+                   initialSize="512"
+                   autowarmCount="0"/>
+    
+    <!-- Field Value Cache
+         
+         Cache used to hold field values that are quickly accessible
+         by document id.  The fieldValueCache is created by default
+         even if not configured here.
+      -->
+    <!--
+       <fieldValueCache class="solr.FastLRUCache"
+                        size="512"
+                        autowarmCount="128"
+                        showItems="32" />
+      -->
+
+    <!-- Custom Cache
+
+         Example of a generic cache.  These caches may be accessed by
+         name through SolrIndexSearcher.getCache(),cacheLookup(), and
+         cacheInsert().  The purpose is to enable easy caching of
+         user/application level data.  The regenerator argument should
+         be specified as an implementation of solr.CacheRegenerator 
+         if autowarming is desired.  
+      -->
+    <!--
+       <cache name="myUserCache"
+              class="solr.LRUCache"
+              size="4096"
+              initialSize="1024"
+              autowarmCount="1024"
+              regenerator="com.mycompany.MyRegenerator"
+              />
+      -->
+
+
+    <!-- Lazy Field Loading
+
+         If true, stored fields that are not requested will be loaded
+         lazily.  This can result in a significant speed improvement
+         if the usual case is to not load all stored fields,
+         especially if the skipped fields are large compressed text
+         fields.
+    -->
+    <enableLazyFieldLoading>true</enableLazyFieldLoading>
+
+   <!-- Use Filter For Sorted Query
+
+        A possible optimization that attempts to use a filter to
+        satisfy a search.  If the requested sort does not include
+        score, then the filterCache will be checked for a filter
+        matching the query. If found, the filter will be used as the
+        source of document ids, and then the sort will be applied to
+        that.
+
+        For most situations, this will not be useful unless you
+        frequently get the same search repeatedly with different sort
+        options, and none of them ever use "score"
+     -->
+   <!--
+      <useFilterForSortedQuery>true</useFilterForSortedQuery>
+     -->
+
+   <!-- Result Window Size
+
+        An optimization for use with the queryResultCache.  When a search
+        is requested, a superset of the requested number of document ids
+        are collected.  For example, if a search for a particular query
+        requests matching documents 10 through 19, and queryWindowSize is 50,
+        then documents 0 through 49 will be collected and cached.  Any further
+        requests in that range can be satisfied via the cache.  
+     -->
+   <queryResultWindowSize>20</queryResultWindowSize>
+
+   <!-- Maximum number of documents to cache for any entry in the
+        queryResultCache. 
+     -->
+   <queryResultMaxDocsCached>200</queryResultMaxDocsCached>
+
+   <!-- Query Related Event Listeners
+
+        Various IndexSearcher related events can trigger Listeners to
+        take actions.
+
+        newSearcher - fired whenever a new searcher is being prepared
+        and there is a current searcher handling requests (aka
+        registered).  It can be used to prime certain caches to
+        prevent long request times for certain requests.
+
+        firstSearcher - fired whenever a new searcher is being
+        prepared but there is no current registered searcher to handle
+        requests or to gain autowarming data from.
+
+        
+     -->
+    <!-- QuerySenderListener takes an array of NamedList and executes a
+         local query request for each NamedList in sequence. 
+      -->
+    <listener event="newSearcher" class="solr.QuerySenderListener">
+      <arr name="queries">
+        <!--
+           <lst><str name="q">solr</str><str name="sort">price asc</str></lst>
+           <lst><str name="q">rocks</str><str name="sort">weight asc</str></lst>
+          -->
+      </arr>
+    </listener>
+    <listener event="firstSearcher" class="solr.QuerySenderListener">
+      <arr name="queries">
+        <lst>
+          <str name="q">static firstSearcher warming in solrconfig.xml</str>
+        </lst>
+      </arr>
+    </listener>
+
+    <!-- Use Cold Searcher
+
+         If a search request comes in and there is no current
+         registered searcher, then immediately register the still
+         warming searcher and use it.  If "false" then all requests
+         will block until the first searcher is done warming.
+      -->
+    <useColdSearcher>false</useColdSearcher>
+
+    <!-- Max Warming Searchers
+         
+         Maximum number of searchers that may be warming in the
+         background concurrently.  An error is returned if this limit
+         is exceeded.
+
+         Recommend values of 1-2 for read-only slaves, higher for
+         masters w/o cache warming.
+      -->
+    <maxWarmingSearchers>2</maxWarmingSearchers>
+
+  </query>
+
+
+  <!-- Request Dispatcher
+
+       This section contains instructions for how the SolrDispatchFilter
+       should behave when processing requests for this SolrCore.
+
+       handleSelect is a legacy option that affects the behavior of requests
+       such as /select?qt=XXX
+
+       handleSelect="true" will cause the SolrDispatchFilter to process
+       the request and dispatch the query to a handler specified by the
+       "qt" param, assuming "/select" isn't already registered.
+
+       handleSelect="false" will cause the SolrDispatchFilter to
+       ignore "/select" requests, resulting in a 404 unless a handler
+       is explicitly registered with the name "/select"
+
+       handleSelect="true" is not recommended for new users, but is the default
+       for backwards compatibility
+    -->
+  <requestDispatcher handleSelect="false" >
+    <!-- Request Parsing
+
+         These settings indicate how Solr Requests may be parsed, and
+         what restrictions may be placed on the ContentStreams from
+         those requests
+
+         enableRemoteStreaming - enables use of the stream.file
+         and stream.url parameters for specifying remote streams.
+
+         multipartUploadLimitInKB - specifies the max size of
+         Multipart File Uploads that Solr will allow in a Request.
+         
+         *** WARNING ***
+         The settings below authorize Solr to fetch remote files, You
+         should make sure your system has some authentication before
+         using enableRemoteStreaming="true"
+
+      --> 
+    <requestParsers enableRemoteStreaming="true" 
+                    multipartUploadLimitInKB="2048000" />
+
+    <!-- HTTP Caching
+
+         Set HTTP caching related parameters (for proxy caches and clients).
+
+         The options below instruct Solr not to output any HTTP Caching
+         related headers
+      -->
+    <httpCaching never304="true" />
+    <!-- If you include a <cacheControl> directive, it will be used to
+         generate a Cache-Control header (as well as an Expires header
+         if the value contains "max-age=")
+         
+         By default, no Cache-Control header is generated.
+         
+         You can use the <cacheControl> option even if you have set
+         never304="true"
+      -->
+    <!--
+       <httpCaching never304="true" >
+         <cacheControl>max-age=30, public</cacheControl> 
+       </httpCaching>
+      -->
+    <!-- To enable Solr to respond with automatically generated HTTP
+         Caching headers, and to response to Cache Validation requests
+         correctly, set the value of never304="false"
+         
+         This will cause Solr to generate Last-Modified and ETag
+         headers based on the properties of the Index.
+
+         The following options can also be specified to affect the
+         values of these headers...
+
+         lastModFrom - the default value is "openTime" which means the
+         Last-Modified value (and validation against If-Modified-Since
+         requests) will all be relative to when the current Searcher
+         was opened.  You can change it to lastModFrom="dirLastMod" if
+         you want the value to exactly correspond to when the physical
+         index was last modified.
+
+         etagSeed="..." is an option you can change to force the ETag
+         header (and validation against If-None-Match requests) to be
+         different even if the index has not changed (ie: when making
+         significant changes to your config file)
+
+         (lastModifiedFrom and etagSeed are both ignored if you use
+         the never304="true" option)
+      -->
+    <!--
+       <httpCaching lastModifiedFrom="openTime"
+                    etagSeed="Solr">
+         <cacheControl>max-age=30, public</cacheControl> 
+       </httpCaching>
+      -->
+  </requestDispatcher>
+
+  <!-- Request Handlers 
+
+       http://wiki.apache.org/solr/SolrRequestHandler
+
+       Incoming queries will be dispatched to a specific handler by name
+       based on the path specified in the request.
+
+       Legacy behavior: If the request path uses "/select" but no Request
+       Handler has that name, and if handleSelect="true" has been specified in
+       the requestDispatcher, then the Request Handler is dispatched based on
+       the qt parameter.  Handlers without a leading '/' are accessed this way
+       like so: http://host/app/[core/]select?qt=name  If no qt is
+       given, then the requestHandler that declares default="true" will be
+       used or the one named "standard".
+       
+       If a Request Handler is declared with startup="lazy", then it will
+       not be initialized until the first request that uses it.
+
+    -->
+    <searchComponent name="iterative-query-component" class="opennlp.tools.similarity.apps.solr.IterativeQueryComponent" >
+
+  </searchComponent>
+      <requestHandler name="/syntgen" 
+      class="opennlp.tools.similarity.apps.solr.SyntGenRequestHandler">
+    <lst name="defaults">
+      <str name="echoParams">explicit</str>
+      <str name="fl">*,score</str>
+      <str name="wt">xml</str>
+    </lst>
+    <arr name="last-components">
+      <str>iterative-query-component</str>
+    </arr>
+    <lst name="search_params">
+      <str name="param1">value1</str>
+      <int name="param2">2</int>
+      <!-- ... more config items here ... -->
+    </lst>
+  </requestHandler>
+  <requestHandler name="/reranker" 
+      class="opennlp.tools.similarity.apps.solr.SearchResultsReRankerRequestHandler">
+    <lst name="defaults">
+      <str name="echoParams">explicit</str>
+      <str name="fl">*,score</str>
+      <str name="wt">xml</str>
+    </lst>
+    <arr name="last-components">
+      <str>iterative-query-component</str>
+    </arr>
+    <lst name="search_params">
+      <str name="param1">value1</str>
+      <int name="param2">2</int>
+      <!-- ... more config items here ... -->
+    </lst>
+  </requestHandler>
+  <requestHandler name="/contentgen" 
+      class="opennlp.tools.similarity.apps.solr.ContentGeneratorRequestHandler">
+    <lst name="defaults">
+      <str name="echoParams">explicit</str>
+      <str name="fl">*,score</str>
+      <str name="wt">xml</str>
+    </lst>
+    <arr name="last-components">
+      <str>iterative-query-component</str>
+    </arr>
+    <lst name="search_params">
+      <str name="param1">value1</str>
+      <int name="param2">2</int>
+      <!-- ... more config items here ... -->
+    </lst>
+  </requestHandler>
+        <requestHandler name="/iterative" 
+      class="opennlp.tools.similarity.apps.solr.IterativeSearchRequestHandler">
+    <lst name="defaults">
+      <str name="echoParams">explicit</str>
+      <str name="fl">*,score</str>
+      <str name="wt">xml</str>
+    </lst>
+    <lst name="search_params">
+      <str name="param1">value1</str>
+      <int name="param2">2</int>
+      <!-- ... more config items here ... -->
+    </lst>
+  </requestHandler>
+  
+  <!-- SearchHandler
+
+       http://wiki.apache.org/solr/SearchHandler
+
+       For processing Search Queries, the primary Request Handler
+       provided with Solr is "SearchHandler". It delegates to a sequence
+       of SearchComponents (see below) and supports distributed
+       queries across multiple shards
+    -->
+  <requestHandler name="/select" class="solr.SearchHandler">
+    <!-- default values for query parameters can be specified, these
+         will be overridden by parameters in the request
+      -->
+     <lst name="defaults">
+       <str name="echoParams">explicit</str>
+       <int name="rows">10</int>
+       <str name="df">text</str>
+     </lst>
+    <!-- In addition to defaults, "appends" params can be specified
+         to identify values which should be appended to the list of
+         multi-val params from the query (or the existing "defaults").
+      -->
+    <!-- In this example, the param "fq=instock:true" would be appended to
+         any query time fq params the user may specify, as a mechanism for
+         partitioning the index, independent of any user selected filtering
+         that may also be desired (perhaps as a result of faceted searching).
+
+         NOTE: there is *absolutely* nothing a client can do to prevent these
+         "appends" values from being used, so don't use this mechanism
+         unless you are sure you always want it.
+      -->
+    <!--
+       <lst name="appends">
+         <str name="fq">inStock:true</str>
+       </lst>
+      -->
+    <!-- "invariants" are a way of letting the Solr maintainer lock down
+         the options available to Solr clients.  Any params values
+         specified here are used regardless of what values may be specified
+         in either the query, the "defaults", or the "appends" params.
+
+         In this example, the facet.field and facet.query params would
+         be fixed, limiting the facets clients can use.  Faceting is
+         not turned on by default - but if the client does specify
+         facet=true in the request, these are the only facets they
+         will be able to see counts for; regardless of what other
+         facet.field or facet.query params they may specify.
+
+         NOTE: there is *absolutely* nothing a client can do to prevent these
+         "invariants" values from being used, so don't use this mechanism
+         unless you are sure you always want it.
+      -->
+    <!--
+       <lst name="invariants">
+         <str name="facet.field">cat</str>
+         <str name="facet.field">manu_exact</str>
+         <str name="facet.query">price:[* TO 500]</str>
+         <str name="facet.query">price:[500 TO *]</str>
+       </lst>
+      -->
+    <!-- If the default list of SearchComponents is not desired, that
+         list can either be overridden completely, or components can be
+         prepended or appended to the default list.  (see below)
+      -->
+    <!--
+       <arr name="components">
+         <str>nameOfCustomComponent1</str>
+         <str>nameOfCustomComponent2</str>
+       </arr>
+      -->
+    </requestHandler>
+
+  <!-- A request handler that returns indented JSON by default -->
+  <requestHandler name="/query" class="solr.SearchHandler">
+     <lst name="defaults">
+       <str name="echoParams">explicit</str>
+       <str name="wt">json</str>
+       <str name="indent">true</str>
+       <str name="df">text</str>
+     </lst>
+  </requestHandler>
+
+
+  <!-- realtime get handler, guaranteed to return the latest stored fields of
+       any document, without the need to commit or open a new searcher.  The
+       current implementation relies on the updateLog feature being enabled. -->
+  <requestHandler name="/get" class="solr.RealTimeGetHandler">
+     <lst name="defaults">
+       <str name="omitHeader">true</str>
+       <str name="wt">json</str>
+       <str name="indent">true</str>
+     </lst>
+  </requestHandler>
+
+ 
+  <!-- A Robust Example 
+       
+       This example SearchHandler declaration shows off usage of the
+       SearchHandler with many defaults declared
+
+       Note that multiple instances of the same Request Handler
+       (SearchHandler) can be registered multiple times with different
+       names (and different init parameters)
+    -->
+  <requestHandler name="/browse" class="solr.SearchHandler">
+     <lst name="defaults">
+       <str name="echoParams">explicit</str>
+
+       <!-- VelocityResponseWriter settings -->
+       <str name="wt">velocity</str>
+       <str name="v.template">browse</str>
+       <str name="v.layout">layout</str>
+       <str name="title">Solritas</str>
+
+       <!-- Query settings -->
+       <str name="defType">edismax</str>
+       <str name="qf">
+          text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4
+          title^10.0 description^5.0 keywords^5.0 author^2.0 resourcename^1.0
+       </str>
+       <str name="df">text</str>
+       <str name="mm">100%</str>
+       <str name="q.alt">*:*</str>
+       <str name="rows">10</str>
+       <str name="fl">*,score</str>
+
+       <str name="mlt.qf">
+         text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4
+         title^10.0 description^5.0 keywords^5.0 author^2.0 resourcename^1.0
+       </str>
+       <str name="mlt.fl">text,features,name,sku,id,manu,cat,title,description,keywords,author,resourcename</str>
+       <int name="mlt.count">3</int>
+
+       <!-- Faceting defaults -->
+       <str name="facet">on</str>
+       <str name="facet.field">cat</str>
+       <str name="facet.field">manu_exact</str>
+       <str name="facet.field">content_type</str>
+       <str name="facet.field">author_s</str>
+       <str name="facet.query">ipod</str>
+       <str name="facet.query">GB</str>
+       <str name="facet.mincount">1</str>
+       <str name="facet.pivot">cat,inStock</str>
+       <str name="facet.range.other">after</str>
+       <str name="facet.range">price</str>
+       <int name="f.price.facet.range.start">0</int>
+       <int name="f.price.facet.range.end">600</int>
+       <int name="f.price.facet.range.gap">50</int>
+       <str name="facet.range">popularity</str>
+       <int name="f.popularity.facet.range.start">0</int>
+       <int name="f.popularity.facet.range.end">10</int>
+       <int name="f.popularity.facet.range.gap">3</int>
+       <str name="facet.range">manufacturedate_dt</str>
+       <str name="f.manufacturedate_dt.facet.range.start">NOW/YEAR-10YEARS</str>
+       <str name="f.manufacturedate_dt.facet.range.end">NOW</str>
+       <str name="f.manufacturedate_dt.facet.range.gap">+1YEAR</str>
+       <str name="f.manufacturedate_dt.facet.range.other">before</str>
+       <str name="f.manufacturedate_dt.facet.range.other">after</str>
+
+       <!-- Highlighting defaults -->
+       <str name="hl">on</str>
+       <str name="hl.fl">content features title name</str>
+       <str name="hl.encoder">html</str>
+       <str name="hl.simple.pre">&lt;b&gt;</str>
+       <str name="hl.simple.post">&lt;/b&gt;</str>
+       <str name="f.title.hl.fragsize">0</str>
+       <str name="f.title.hl.alternateField">title</str>
+       <str name="f.name.hl.fragsize">0</str>
+       <str name="f.name.hl.alternateField">name</str>
+       <str name="f.content.hl.snippets">3</str>
+       <str name="f.content.hl.fragsize">200</str>
+       <str name="f.content.hl.alternateField">content</str>
+       <str name="f.content.hl.maxAlternateFieldLength">750</str>
+
+       <!-- Spell checking defaults -->
+       <str name="spellcheck">on</str>
+       <str name="spellcheck.extendedResults">false</str>       
+       <str name="spellcheck.count">5</str>
+       <str name="spellcheck.alternativeTermCount">2</str>
+       <str name="spellcheck.maxResultsForSuggest">5</str>       
+       <str name="spellcheck.collate">true</str>
+       <str name="spellcheck.collateExtendedResults">true</str>  
+       <str name="spellcheck.maxCollationTries">5</str>
+       <str name="spellcheck.maxCollations">3</str>           
+     </lst>
+
+     <!-- append spellchecking to our list of components -->
+     <arr name="last-components">
+       <str>spellcheck</str>
+     </arr>
+  </requestHandler>
+
+
+  <!-- Update Request Handler.  
+       
+       http://wiki.apache.org/solr/UpdateXmlMessages
+
+       The canonical Request Handler for Modifying the Index through
+       commands specified using XML, JSON, CSV, or JAVABIN
+
+       Note: Since solr1.1 requestHandlers requires a valid content
+       type header if posted in the body. For example, curl now
+       requires: -H 'Content-type:text/xml; charset=utf-8'
+       
+       To override the request content type and force a specific 
+       Content-type, use the request parameter: 
+         ?update.contentType=text/csv
+       
+       This handler will pick a response format to match the input
+       if the 'wt' parameter is not explicit
+    -->
+  <requestHandler name="/update" class="solr.UpdateRequestHandler">
+    <!-- See below for information on defining 
+         updateRequestProcessorChains that can be used by name 
+         on each Update Request
+      -->
+    <!--
+       <lst name="defaults">
+         <str name="update.chain">dedupe</str>
+       </lst>
+       -->
+  </requestHandler>
+  
+
+  <!-- Solr Cell Update Request Handler
+
+       http://wiki.apache.org/solr/ExtractingRequestHandler 
+
+    -->
+  <requestHandler name="/update/extract" 
+                  startup="lazy"
+                  class="solr.extraction.ExtractingRequestHandler" >
+    <lst name="defaults">
+      <str name="lowernames">true</str>
+      <str name="uprefix">ignored_</str>
+
+      <!-- capture link hrefs but ignore div attributes -->
+      <str name="captureAttr">true</str>
+      <str name="fmap.a">links</str>
+      <str name="fmap.div">ignored_</str>
+    </lst>
+  </requestHandler>
+
+
+  <!-- Field Analysis Request Handler
+
+       RequestHandler that provides much the same functionality as
+       analysis.jsp. Provides the ability to specify multiple field
+       types and field names in the same request and outputs
+       index-time and query-time analysis for each of them.
+
+       Request parameters are:
+       analysis.fieldname - field name whose analyzers are to be used
+
+       analysis.fieldtype - field type whose analyzers are to be used
+       analysis.fieldvalue - text for index-time analysis
+       q (or analysis.q) - text for query time analysis
+       analysis.showmatch (true|false) - When set to true and when
+           query analysis is performed, the produced tokens of the
+           field value analysis will be marked as "matched" for every
+           token that is produced by the query analysis
+   -->
+  <requestHandler name="/analysis/field" 
+                  startup="lazy"
+                  class="solr.FieldAnalysisRequestHandler" />
+
+
+  <!-- Document Analysis Handler
+
+       http://wiki.apache.org/solr/AnalysisRequestHandler
+
+       An analysis handler that provides a breakdown of the analysis
+       process of provided documents. This handler expects a (single)
+       content stream with the following format:
+
+       <docs>
+         <doc>
+           <field name="id">1</field>
+           <field name="name">The Name</field>
+           <field name="text">The Text Value</field>
+         </doc>
+         <doc>...</doc>
+         <doc>...</doc>
+         ...
+       </docs>
+
+    Note: Each document must contain a field which serves as the
+    unique key. This key is used in the returned response to associate
+    an analysis breakdown to the analyzed document.
+
+    Like the FieldAnalysisRequestHandler, this handler also supports
+    query analysis by sending either an "analysis.query" or "q"
+    request parameter that holds the query text to be analyzed. It
+    also supports the "analysis.showmatch" parameter which when set to
+    true, all field tokens that match the query tokens will be marked
+    as a "match". 
+  -->
+  <requestHandler name="/analysis/document" 
+                  class="solr.DocumentAnalysisRequestHandler" 
+                  startup="lazy" />
+
+  <!-- Admin Handlers
+
+       Admin Handlers - This will register all the standard admin
+       RequestHandlers.  
+    -->
+  <requestHandler name="/admin/" 
+                  class="solr.admin.AdminHandlers" />
+  <!-- This single handler is equivalent to the following... -->
+  <!--
+     <requestHandler name="/admin/luke"       class="solr.admin.LukeRequestHandler" />
+     <requestHandler name="/admin/system"     class="solr.admin.SystemInfoHandler" />
+     <requestHandler name="/admin/plugins"    class="solr.admin.PluginInfoHandler" />
+     <requestHandler name="/admin/threads"    class="solr.admin.ThreadDumpHandler" />
+     <requestHandler name="/admin/properties" class="solr.admin.PropertiesRequestHandler" />
+     <requestHandler name="/admin/file"       class="solr.admin.ShowFileRequestHandler" >
+    -->
+  <!-- If you wish to hide files under ${solr.home}/conf, explicitly
+       register the ShowFileRequestHandler using: 
+    -->
+  <!--
+     <requestHandler name="/admin/file" 
+                     class="solr.admin.ShowFileRequestHandler" >
+       <lst name="invariants">
+         <str name="hidden">synonyms.txt</str> 
+         <str name="hidden">anotherfile.txt</str> 
+       </lst>
+     </requestHandler>
+    -->
+
+  <!-- ping/healthcheck -->
+  <requestHandler name="/admin/ping" class="solr.PingRequestHandler">
+    <lst name="invariants">
+      <str name="q">solrpingquery</str>
+    </lst>
+    <lst name="defaults">
+      <str name="echoParams">all</str>
+    </lst>
+    <!-- An optional feature of the PingRequestHandler is to configure the 
+         handler with a "healthcheckFile" which can be used to enable/disable 
+         the PingRequestHandler.
+         relative paths are resolved against the data dir 
+      -->
+    <!-- <str name="healthcheckFile">server-enabled.txt</str> -->
+  </requestHandler>
+
+  <!-- Echo the request contents back to the client -->
+  <requestHandler name="/debug/dump" class="solr.DumpRequestHandler" >
+    <lst name="defaults">
+     <str name="echoParams">explicit</str> 
+     <str name="echoHandler">true</str>
+    </lst>
+  </requestHandler>
+  
+  <!-- Solr Replication
+
+       The SolrReplicationHandler supports replicating indexes from a
+       "master" used for indexing and "slaves" used for queries.
+
+       http://wiki.apache.org/solr/SolrReplication 
+
+       In the example below, remove the <lst name="master"> section if
+       this is just a slave and remove the <lst name="slave"> section
+       if this is just a master.
+    -->
+  <!--
+     <requestHandler name="/replication" class="solr.ReplicationHandler" >
+       <lst name="master">
+         <str name="replicateAfter">commit</str>
+         <str name="replicateAfter">startup</str>
+         <str name="confFiles">schema.xml,stopwords.txt</str>
+       </lst>
+       <lst name="slave">
+         <str name="masterUrl">http://localhost:8983/solr</str>
+         <str name="pollInterval">00:00:60</str>
+       </lst>
+     </requestHandler>
+    -->
+    
+    <!-- Solr Replication for SolrCloud Recovery
+    
+         This is the config needed for SolrCloud's recovery replication.
+    -->
+	<requestHandler name="/replication" class="solr.ReplicationHandler" startup="lazy" /> 
+
+
+  <!-- Search Components
+
+       Search components are registered to SolrCore and used by 
+       instances of SearchHandler (which can access them by name)
+       
+       By default, the following components are available:
+       
+       <searchComponent name="query"     class="solr.QueryComponent" />
+       <searchComponent name="facet"     class="solr.FacetComponent" />
+       <searchComponent name="mlt"       class="solr.MoreLikeThisComponent" />
+       <searchComponent name="highlight" class="solr.HighlightComponent" />
+       <searchComponent name="stats"     class="solr.StatsComponent" />
+       <searchComponent name="debug"     class="solr.DebugComponent" />
+   
+       Default configuration in a requestHandler would look like:
+
+       <arr name="components">
+         <str>query</str>
+         <str>facet</str>
+         <str>mlt</str>
+         <str>highlight</str>
+         <str>stats</str>
+         <str>debug</str>
+       </arr>
+
+       If you register a searchComponent to one of the standard names, 
+       that will be used instead of the default.
+
+       To insert components before or after the 'standard' components, use:
+    
+       <arr name="first-components">
+         <str>myFirstComponentName</str>
+       </arr>
+    
+       <arr name="last-components">
+         <str>myLastComponentName</str>
+       </arr>
+
+       NOTE: The component registered with the name "debug" will
+       always be executed after the "last-components" 
+       
+     -->
+  
+   <!-- Spell Check
+
+        The spell check component can return a list of alternative spelling
+        suggestions.  
+
+        http://wiki.apache.org/solr/SpellCheckComponent
+     -->
+  <searchComponent name="spellcheck" class="solr.SpellCheckComponent">
+
+    <str name="queryAnalyzerFieldType">textSpell</str>
+
+    <!-- Multiple "Spell Checkers" can be declared and used by this
+         component
+      -->
+
+    <!-- a spellchecker built from a field of the main index -->
+    <lst name="spellchecker">
+      <str name="name">default</str>
+      <str name="field">name</str>
+      <str name="classname">solr.DirectSolrSpellChecker</str>
+      <!-- the spellcheck distance measure used, the default is the internal levenshtein -->
+      <str name="distanceMeasure">internal</str>
+      <!-- minimum accuracy needed to be considered a valid spellcheck suggestion -->
+      <float name="accuracy">0.5</float>
+      <!-- the maximum #edits we consider when enumerating terms: can be 1 or 2 -->
+      <int name="maxEdits">2</int>
+      <!-- the minimum shared prefix when enumerating terms -->
+      <int name="minPrefix">1</int>
+      <!-- maximum number of inspections per result. -->
+      <int name="maxInspections">5</int>
+      <!-- minimum length of a query term to be considered for correction -->
+      <int name="minQueryLength">4</int>
+      <!-- maximum threshold of documents a query term can appear to be considered for correction -->
+      <float name="maxQueryFrequency">0.01</float>
+      <!-- uncomment this to require suggestions to occur in 1% of the documents
+      	<float name="thresholdTokenFrequency">.01</float>
+      -->
+    </lst>
+    
+    <!-- a spellchecker that can break or combine words.  See "/spell" handler below for usage -->
+    <lst name="spellchecker">
+      <str name="name">wordbreak</str>
+      <str name="classname">solr.WordBreakSolrSpellChecker</str>      
+      <str name="field">name</str>
+      <str name="combineWords">true</str>
+      <str name="breakWords">true</str>
+      <int name="maxChanges">10</int>
+    </lst>
+
+    <!-- a spellchecker that uses a different distance measure -->
+    <!--
+       <lst name="spellchecker">
+         <str name="name">jarowinkler</str>
+         <str name="field">spell</str>
+         <str name="classname">solr.DirectSolrSpellChecker</str>
+         <str name="distanceMeasure">
+           org.apache.lucene.search.spell.JaroWinklerDistance
+         </str>
+       </lst>
+     -->
+
+    <!-- a spellchecker that use an alternate comparator 
+
+         comparatorClass be one of:
+          1. score (default)
+          2. freq (Frequency first, then score)
+          3. A fully qualified class name
+      -->
+    <!--
+       <lst name="spellchecker">
+         <str name="name">freq</str>
+         <str name="field">lowerfilt</str>
+         <str name="classname">solr.DirectSolrSpellChecker</str>
+         <str name="comparatorClass">freq</str>
+      -->
+
+    <!-- A spellchecker that reads the list of words from a file -->
+    <!--
+       <lst name="spellchecker">
+         <str name="classname">solr.FileBasedSpellChecker</str>
+         <str name="name">file</str>
+         <str name="sourceLocation">spellings.txt</str>
+         <str name="characterEncoding">UTF-8</str>
+         <str name="spellcheckIndexDir">spellcheckerFile</str>
+       </lst>
+      -->
+  </searchComponent>
+
+  <!-- A request handler for demonstrating the spellcheck component.  
+
+       NOTE: This is purely as an example.  The whole purpose of the
+       SpellCheckComponent is to hook it into the request handler that
+       handles your normal user queries so that a separate request is
+       not needed to get suggestions.
+
+       IN OTHER WORDS, THERE IS A REALLY GOOD CHANCE THE SETUP BELOW IS
+       NOT WHAT YOU WANT FOR YOUR PRODUCTION SYSTEM!
+       
+       See http://wiki.apache.org/solr/SpellCheckComponent for details
+       on the request parameters.
+    -->
+  <requestHandler name="/spell" class="solr.SearchHandler" startup="lazy">
+    <lst name="defaults">
+      <str name="df">text</str>
+      <!-- Solr will use suggestions from both the 'default' spellchecker
+           and from the 'wordbreak' spellchecker and combine them.
+           collations (re-written queries) can include a combination of
+           corrections from both spellcheckers -->
+      <str name="spellcheck.dictionary">default</str>
+      <str name="spellcheck.dictionary">wordbreak</str>
+      <str name="spellcheck">on</str>
+      <str name="spellcheck.extendedResults">true</str>       
+      <str name="spellcheck.count">10</str>
+      <str name="spellcheck.alternativeTermCount">5</str>
+      <str name="spellcheck.maxResultsForSuggest">5</str>       
+      <str name="spellcheck.collate">true</str>
+      <str name="spellcheck.collateExtendedResults">true</str>  
+      <str name="spellcheck.maxCollationTries">10</str>
+      <str name="spellcheck.maxCollations">5</str>         
+    </lst>
+    <arr name="last-components">
+      <str>spellcheck</str>
+    </arr>
+  </requestHandler>
+
+  <!-- Term Vector Component
+
+       http://wiki.apache.org/solr/TermVectorComponent
+    -->
+  <searchComponent name="tvComponent" class="solr.TermVectorComponent"/>
+
+  <!-- A request handler for demonstrating the term vector component
+
+       This is purely as an example.
+
+       In reality you will likely want to add the component to your 
+       already specified request handlers. 
+    -->
+  <requestHandler name="/tvrh" class="solr.SearchHandler" startup="lazy">
+    <lst name="defaults">
+      <str name="df">text</str>
+      <bool name="tv">true</bool>
+    </lst>
+    <arr name="last-components">
+      <str>tvComponent</str>
+    </arr>
+  </requestHandler>
+
+  <!-- Clustering Component
+
+       http://wiki.apache.org/solr/ClusteringComponent
+
+       You'll need to set the solr.cluster.enabled system property
+       when running solr to run with clustering enabled:
+
+            java -Dsolr.clustering.enabled=true -jar start.jar
+
+    -->
+  <searchComponent name="clustering"
+                   enable="${solr.clustering.enabled:false}"
+                   class="solr.clustering.ClusteringComponent" >
+    <!-- Declare an engine -->
+    <lst name="engine">
+      <!-- The name, only one can be named "default" -->
+      <str name="name">default</str>
+
+      <!-- Class name of Carrot2 clustering algorithm.
+
+           Currently available algorithms are:
+           
+           * org.carrot2.clustering.lingo.LingoClusteringAlgorithm
+           * org.carrot2.clustering.stc.STCClusteringAlgorithm
+           * org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithm
+           
+           See http://project.carrot2.org/algorithms.html for the
+           algorithm's characteristics.
+        -->
+      <str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str>
+
+      <!-- Overriding values for Carrot2 default algorithm attributes.
+
+           For a description of all available attributes, see:
+           http://download.carrot2.org/stable/manual/#chapter.components.
+           Use attribute key as name attribute of str elements
+           below. These can be further overridden for individual
+           requests by specifying attribute key as request parameter
+           name and attribute value as parameter value.
+        -->
+      <str name="LingoClusteringAlgorithm.desiredClusterCountBase">20</str>
+
+      <!-- Location of Carrot2 lexical resources.
+
+           A directory from which to load Carrot2-specific stop words
+           and stop labels. Absolute or relative to Solr config directory.
+           If a specific resource (e.g. stopwords.en) is present in the
+           specified dir, it will completely override the corresponding
+           default one that ships with Carrot2.
+
+           For an overview of Carrot2 lexical resources, see:
+           http://download.carrot2.org/head/manual/#chapter.lexical-resources
+        -->
+      <str name="carrot.lexicalResourcesDir">clustering/carrot2</str>
+
+      <!-- The language to assume for the documents.
+
+           For a list of allowed values, see:
+           http://download.carrot2.org/stable/manual/#section.attribute.lingo.MultilingualClustering.defaultLanguage
+       -->
+      <str name="MultilingualClustering.defaultLanguage">ENGLISH</str>
+    </lst>
+    <lst name="engine">
+      <str name="name">stc</str>
+      <str name="carrot.algorithm">org.carrot2.clustering.stc.STCClusteringAlgorithm</str>
+    </lst>
+  </searchComponent>
+
+  <!-- A request handler for demonstrating the clustering component
+
+       This is purely as an example.
+
+       In reality you will likely want to add the component to your 
+       already specified request handlers. 
+    -->
+  <requestHandler name="/clustering"
+                  startup="lazy"
+                  enable="${solr.clustering.enabled:false}"
+                  class="solr.SearchHandler">
+    <lst name="defaults">
+      <bool name="clustering">true</bool>
+      <str name="clustering.engine">default</str>
+      <bool name="clustering.results">true</bool>
+      <!-- The title field -->
+      <str name="carrot.title">name</str>
+      <str name="carrot.url">id</str>
+      <!-- The field to cluster on -->
+       <str name="carrot.snippet">features</str>
+       <!-- produce summaries -->
+       <bool name="carrot.produceSummary">true</bool>
+       <!-- the maximum number of labels per cluster -->
+       <!--<int name="carrot.numDescriptions">5</int>-->
+       <!-- produce sub clusters -->
+       <bool name="carrot.outputSubClusters">false</bool>
+       
+       <str name="defType">edismax</str>
+       <str name="qf">
+         text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4
+       </str>
+       <str name="q.alt">*:*</str>
+       <str name="rows">10</str>
+       <str name="fl">*,score</str>
+    </lst>     
+    <arr name="last-components">
+      <str>clustering</str>
+    </arr>
+  </requestHandler>
+  
+  <!-- Terms Component
+
+       http://wiki.apache.org/solr/TermsComponent
+
+       A component to return terms and document frequency of those
+       terms
+    -->
+  <searchComponent name="terms" class="solr.TermsComponent"/>
+
+  <!-- A request handler for demonstrating the terms component -->
+  <requestHandler name="/terms" class="solr.SearchHandler" startup="lazy">
+     <lst name="defaults">
+      <bool name="terms">true</bool>
+    </lst>     
+    <arr name="components">
+      <str>terms</str>
+    </arr>
+  </requestHandler>
+
+
+  <!-- Query Elevation Component
+
+       http://wiki.apache.org/solr/QueryElevationComponent
+
+       a search component that enables you to configure the top
+       results for a given query regardless of the normal lucene
+       scoring.
+    -->
+  <searchComponent name="elevator" class="solr.QueryElevationComponent" >
+    <!-- pick a fieldType to analyze queries -->
+    <str name="queryFieldType">string</str>
+    <str name="config-file">elevate.xml</str>
+  </searchComponent>
+
+  <!-- A request handler for demonstrating the elevator component -->
+  <requestHandler name="/elevate" class="solr.SearchHandler" startup="lazy">
+    <lst name="defaults">
+      <str name="echoParams">explicit</str>
+      <str name="df">text</str>
+    </lst>
+    <arr name="last-components">
+      <str>elevator</str>
+    </arr>
+  </requestHandler>
+
+  <!-- Highlighting Component
+
+       http://wiki.apache.org/solr/HighlightingParameters
+    -->
+  <searchComponent class="solr.HighlightComponent" name="highlight">
+    <highlighting>
+      <!-- Configure the standard fragmenter -->
+      <!-- This could most likely be commented out in the "default" case -->
+      <fragmenter name="gap" 
+                  default="true"
+                  class="solr.highlight.GapFragmenter">
+        <lst name="defaults">
+          <int name="hl.fragsize">100</int>
+        </lst>
+      </fragmenter>
+
+      <!-- A regular-expression-based fragmenter 
+           (for sentence extraction) 
+        -->
+      <fragmenter name="regex" 
+                  class="solr.highlight.RegexFragmenter">
+        <lst name="defaults">
+          <!-- slightly smaller fragsizes work better because of slop -->
+          <int name="hl.fragsize">70</int>
+          <!-- allow 50% slop on fragment sizes -->
+          <float name="hl.regex.slop">0.5</float>
+          <!-- a basic sentence pattern -->
+          <str name="hl.regex.pattern">[-\w ,/\n\&quot;&apos;]{20,200}</str>
+        </lst>
+      </fragmenter>
+
+      <!-- Configure the standard formatter -->
+      <formatter name="html" 
+                 default="true"
+                 class="solr.highlight.HtmlFormatter">
+        <lst name="defaults">
+          <str name="hl.simple.pre"><![CDATA[<em>]]></str>
+          <str name="hl.simple.post"><![CDATA[</em>]]></str>
+        </lst>
+      </formatter>
+
+      <!-- Configure the standard encoder -->
+      <encoder name="html" 
+               class="solr.highlight.HtmlEncoder" />
+
+      <!-- Configure the standard fragListBuilder -->
+      <fragListBuilder name="simple" 
+                       class="solr.highlight.SimpleFragListBuilder"/>
+      
+      <!-- Configure the single fragListBuilder -->
+      <fragListBuilder name="single" 
+                       class="solr.highlight.SingleFragListBuilder"/>
+      
+      <!-- Configure the weighted fragListBuilder -->
+      <fragListBuilder name="weighted" 
+                       default="true"
+                       class="solr.highlight.WeightedFragListBuilder"/>
+      
+      <!-- default tag FragmentsBuilder -->
+      <fragmentsBuilder name="default" 
+                        default="true"
+                        class="solr.highlight.ScoreOrderFragmentsBuilder">
+        <!-- 
+        <lst name="defaults">
+          <str name="hl.multiValuedSeparatorChar">/</str>
+        </lst>
+        -->
+      </fragmentsBuilder>
+
+      <!-- multi-colored tag FragmentsBuilder -->
+      <fragmentsBuilder name="colored" 
+                        class="solr.highlight.ScoreOrderFragmentsBuilder">
+        <lst name="defaults">
+          <str name="hl.tag.pre"><![CDATA[
+               <b style="background:yellow">,<b style="background:lawgreen">,
+               <b style="background:aquamarine">,<b style="background:magenta">,
+               <b style="background:palegreen">,<b style="background:coral">,
+               <b style="background:wheat">,<b style="background:khaki">,
+               <b style="background:lime">,<b style="background:deepskyblue">]]></str>
+          <str name="hl.tag.post"><![CDATA[</b>]]></str>
+        </lst>
+      </fragmentsBuilder>
+      
+      <boundaryScanner name="default" 
+                       default="true"
+                       class="solr.highlight.SimpleBoundaryScanner">
+        <lst name="defaults">
+          <str name="hl.bs.maxScan">10</str>
+          <str name="hl.bs.chars">.,!? &#9;&#10;&#13;</str>
+        </lst>
+      </boundaryScanner>
+      
+      <boundaryScanner name="breakIterator" 
+                       class="solr.highlight.BreakIteratorBoundaryScanner">
+        <lst name="defaults">
+          <!-- type should be one of CHARACTER, WORD(default), LINE and SENTENCE -->
+          <str name="hl.bs.type">WORD</str>
+          <!-- language and country are used when constructing Locale object.  -->
+          <!-- And the Locale object will be used when getting instance of BreakIterator -->
+          <str name="hl.bs.language">en</str>
+          <str name="hl.bs.country">US</str>
+        </lst>
+      </boundaryScanner>
+    </highlighting>
+  </searchComponent>
+
+  <!-- Update Processors
+
+       Chains of Update Processor Factories for dealing with Update
+       Requests can be declared, and then used by name in Update
+       Request Processors
+
+       http://wiki.apache.org/solr/UpdateRequestProcessor
+
+    --> 
+  <!-- Deduplication
+
+       An example dedup update processor that creates the "id" field
+       on the fly based on the hash code of some other fields.  This
+       example has overwriteDupes set to false since we are using the
+       id field as the signatureField and Solr will maintain
+       uniqueness based on that anyway.  
+       
+    -->
+  <!--
+     <updateRequestProcessorChain name="dedupe">
+       <processor class="solr.processor.SignatureUpdateProcessorFactory">
+         <bool name="enabled">true</bool>
+         <str name="signatureField">id</str>
+         <bool name="overwriteDupes">false</bool>
+         <str name="fields">name,features,cat</str>
+         <str name="signatureClass">solr.processor.Lookup3Signature</str>
+       </processor>
+       <processor class="solr.LogUpdateProcessorFactory" />
+       <processor class="solr.RunUpdateProcessorFactory" />
+     </updateRequestProcessorChain>
+    -->
+  
+  <!-- Language identification
+
+       This example update chain identifies the language of the incoming
+       documents using the langid contrib. The detected language is
+       written to field language_s. No field name mapping is done.
+       The fields used for detection are text, title, subject and description,
+       making this example suitable for detecting languages from full-text
+       rich documents injected via ExtractingRequestHandler.
+       See more about langId at http://wiki.apache.org/solr/LanguageDetection
+    -->
+    <!--
+     <updateRequestProcessorChain name="langid">
+       <processor class="org.apache.solr.update.processor.TikaLanguageIdentifierUpdateProcessorFactory">
+         <str name="langid.fl">text,title,subject,description</str>
+         <str name="langid.langField">language_s</str>
+         <str name="langid.fallback">en</str>
+       </processor>
+       <processor class="solr.LogUpdateProcessorFactory" />
+       <processor class="solr.RunUpdateProcessorFactory" />
+     </updateRequestProcessorChain>
+    -->
+
+  <!-- Script update processor
+
+    This example hooks in an update processor implemented using JavaScript.
+
+    See more about the script update processor at http://wiki.apache.org/solr/ScriptUpdateProcessor
+  -->
+  <!--
+    <updateRequestProcessorChain name="script">
+      <processor class="solr.StatelessScriptUpdateProcessorFactory">
+        <str name="script">update-script.js</str>
+        <lst name="params">
+          <str name="config_param">example config parameter</str>
+        </lst>
+      </processor>
+      <processor class="solr.RunUpdateProcessorFactory" />
+    </updateRequestProcessorChain>
+  -->
+ 
+  <!-- Response Writers
+
+       http://wiki.apache.org/solr/QueryResponseWriter
+
+       Request responses will be written using the writer specified by
+       the 'wt' request parameter matching the name of a registered
+       writer.
+
+       The "default" writer is the default and will be used if 'wt' is
+       not specified in the request.
+    -->
+  <!-- The following response writers are implicitly configured unless
+       overridden...
+    -->
+  
+     <queryResponseWriter name="xml" 
+                          default="true"
+                          class="solr.XMLResponseWriter" />
+     <queryResponseWriter name="json" class="solr.JSONResponseWriter"/>
+
+     <queryResponseWriter name="csv" class="solr.CSVResponseWriter"/>
+    
+
+  <queryResponseWriter name="json" class="solr.JSONResponseWriter">
+     <!-- For the purposes of the tutorial, JSON responses are written as
+      plain text so that they are easy to read in *any* browser.
+      If you expect a MIME type of "application/json" just remove this override.
+     -->
+    <str name="content-type">text/plain; charset=UTF-8</str>
+  </queryResponseWriter>
+  
+  <!--
+     Custom response writers can be declared as needed...
+    -->
+    <queryResponseWriter name="velocity" class="solr.VelocityResponseWriter" startup="lazy"/>
+  
+
+  <!-- XSLT response writer transforms the XML output by any xslt file found
+       in Solr's conf/xslt directory.  Changes to xslt files are checked for
+       every xsltCacheLifetimeSeconds.  
+    -->
+  <queryResponseWriter name="xslt" class="solr.XSLTResponseWriter">
+    <int name="xsltCacheLifetimeSeconds">5</int>
+  </queryResponseWriter>
+
+  <!-- Query Parsers
+
+       http://wiki.apache.org/solr/SolrQuerySyntax
+
+       Multiple QParserPlugins can be registered by name, and then
+       used in either the "defType" param for the QueryComponent (used
+       by SearchHandler) or in LocalParams
+    -->
+  <!-- example of registering a query parser -->
+  <!--
+     <queryParser name="myparser" class="com.mycompany.MyQParserPlugin"/>
+    -->
+
+  <!-- Function Parsers
+
+       http://wiki.apache.org/solr/FunctionQuery
+
+       Multiple ValueSourceParsers can be registered by name, and then
+       used as function names when using the "func" QParser.
+    -->
+  <!-- example of registering a custom function parser  -->
+  <!--
+     <valueSourceParser name="myfunc" 
+                        class="com.mycompany.MyValueSourceParser" />
+    -->
+    
+  
+  <!-- Document Transformers
+       http://wiki.apache.org/solr/DocTransformers
+    -->
+  <!--
+     Could be something like:
+     <transformer name="db" class="com.mycompany.LoadFromDatabaseTransformer" >
+       <int name="connection">jdbc://....</int>
+     </transformer>
+     
+     To add a constant value to all docs, use:
+     <transformer name="mytrans2" class="org.apache.solr.response.transform.ValueAugmenterFactory" >
+       <int name="value">5</int>
+     </transformer>
+     
+     If you want the user to still be able to change it with _value:something_ use this:
+     <transformer name="mytrans3" class="org.apache.solr.response.transform.ValueAugmenterFactory" >
+       <double name="defaultValue">5</double>
+     </transformer>
+
+      If you are using the QueryElevationComponent, you may wish to mark documents that get boosted.  The
+      EditorialMarkerFactory will do exactly that:
+     <transformer name="qecBooster" class="org.apache.solr.response.transform.EditorialMarkerFactory" />
+    -->
+    
+
+  <!-- Legacy config for the admin interface -->
+  <admin>
+    <defaultQuery>*:*</defaultQuery>
+  </admin>
+
+</config>
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java
index bbdb707..84440bd 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java
@@ -23,8 +23,7 @@
 import java.util.Map;

 import java.util.logging.Logger;

 

-import opennlp.tools.similarity.apps.BingResponse;

-import opennlp.tools.similarity.apps.BingWebQueryRunner;

+import opennlp.tools.similarity.apps.BingQueryRunner;

 import opennlp.tools.similarity.apps.HitBase;

 import opennlp.tools.similarity.apps.utils.StringCleaner;

 import opennlp.tools.stemmer.PorterStemmer;

@@ -44,7 +43,7 @@
  * 

  */

 

-public class TaxonomyExtenderViaMebMining extends BingWebQueryRunner {

+public class TaxonomyExtenderViaMebMining extends BingQueryRunner {

   private static Logger LOG = Logger

       .getLogger("opennlp.tools.similarity.apps.taxo_builder.TaxonomyExtenderSearchResultFromYahoo");

   private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/PageFetcher.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/PageFetcher.java
index 2ee288b..4c01e39 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/PageFetcher.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/PageFetcher.java
@@ -100,11 +100,11 @@
       // LOG.error(e.getMessage(), e);

       // System.err.println("error fetching url " + url);

     }

-    try {

+/*    try {

       Thread.sleep(50); // do nothing 4 sec

     } catch (InterruptedException e) {

       e.printStackTrace();

-    }

+    } */

     return buf.toString();

   }

 

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java
index a4aa734..2726553 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java
@@ -109,8 +109,11 @@
       sentence_parseObject = new HashMap<String, String[][]>();
 
     try {
-    	if (MODEL_DIR==null) 
-    		MODEL_DIR = new File(".").getAbsolutePath().replace(".", "") + MODEL_DIR_REL;
+    	if (MODEL_DIR==null || MODEL_DIR.equals("/models")) {
+    		String absPath = new File(".").getAbsolutePath();
+    		absPath = absPath.substring(0, absPath.length()-1);
+    		MODEL_DIR = absPath + MODEL_DIR_REL;
+    	}
     	//get full path from constructor
     		
       initializeSentenceDetector();
@@ -587,13 +590,14 @@
       SentenceModel model = new SentenceModel(is);
       sentenceDetector = new SentenceDetectorME(model);
     } catch (IOException e) {
-      //e.printStackTrace();
+      e.printStackTrace();
     } finally {
       if (is != null) {
         try {
           is.close();
         } catch (IOException e) {
            // we swallow exception to support the cached run
+        	e.printStackTrace();
         }
       }
     }
@@ -692,7 +696,12 @@
     // check sentence node, the node contained in the top node
     if (type.equals(AbstractBottomUpParser.TOP_NODE)
         && childrenNodeList != null && childrenNodeList.size() > 0) {
-      PhraseNode rootNode = (PhraseNode) childrenNodeList.get(0);
+      PhraseNode rootNode;
+	try {
+		rootNode = (PhraseNode) childrenNodeList.get(0);
+	} catch (Exception e) {
+		return null;
+	}
       return new SentenceNode(text, rootNode.getChildren());
     }
 
@@ -780,3 +789,19 @@
     }
   }
 }
+
+/*
+ * 
+ * java.lang.ClassCastException: opennlp.tools.textsimilarity.chunker2matcher.WordNode cannot be cast to opennlp.tools.textsimilarity.chunker2matcher.PhraseNode
+	at opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor.toSyntacticTreeNode(ParserChunker2MatcherProcessor.java:699)
+	at opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor.sentenceToSentenceNode(ParserChunker2MatcherProcessor.java:525)
+	at opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor.parseSentenceNode(ParserChunker2MatcherProcessor.java:554)
+	at opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor.parseSentenceNode(ParserChunker2MatcherProcessor.java:548)
+	at opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor.parseChunkSentence(ParserChunker2MatcherProcessor.java:282)
+	at opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor.formGroupedPhrasesFromChunksForSentence(ParserChunker2MatcherProcessor.java:355)
+	at opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor.formGroupedPhrasesFromChunksForPara(ParserChunker2MatcherProcessor.java:250)
+	at opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor.assessRelevance(ParserChunker2MatcherProcessor.java:747)
+	at opennlp.tools.similarity.apps.RelatedSentenceFinder.augmentWithMinedSentencesAndVerifyRelevance(RelatedSentenceFinder.java:458)
+	at opennlp.tools.similarity.apps.RelatedSentenceFinder.generateContentAbout(RelatedSentenceFinder.java:156)
+	at 
+	*/
diff --git a/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/apps/MultiSentenceSearchResultsProcessorTest.java b/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/apps/MultiSentenceSearchResultsProcessorTest.java
new file mode 100644
index 0000000..0d2d58e
--- /dev/null
+++ b/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/apps/MultiSentenceSearchResultsProcessorTest.java
@@ -0,0 +1,65 @@
+/*

+ * Licensed to the Apache Software Foundation (ASF) under one or more

+ * contributor license agreements.  See the NOTICE file distributed with

+ * this work for additional information regarding copyright ownership.

+ * The ASF licenses this file to You under the Apache License, Version 2.0

+ * (the "License"); you may not use this file except in compliance with

+ * the License. You may obtain a copy of the License at

+ *

+ *     http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+package opennlp.tools.parse_thicket.apps;

+

+import java.util.List;

+

+import opennlp.tools.similarity.apps.HitBase;

+

+import junit.framework.TestCase;

+

+public class MultiSentenceSearchResultsProcessorTest extends TestCase {

+	MultiSentenceSearchResultsProcessor proc = new MultiSentenceSearchResultsProcessor();

+

+	public void testSearchOrder() {

+		List<HitBase> res; HitBase first = null;

+		String query ;

+		/*

+		query = "I am now living abroad and have health insurance from Russia. How can I avoid penalty for not having health insurance in US";

+		res = proc.runSearchViaAPI(query);

+		// we verify that top answers have high similarity score

+		System.out.println(res);

+		first = res.get(0);

+		assertTrue(first.getGenerWithQueryScore() > 2.0f);

+*/

+		

+		

+		query = "Furious about reports that the IRS was used to target conservative groups, President Obama said that acting IRS Director Steve T. Miller was asked to resign. "+

+				"IRS actions were inexcusable. Americans are right to be angry about it. Obama will not tolerate this type of behavior by IRS";

+		res = proc.runSearchViaAPI(query);

+		// we verify that top answers have high similarity score

+		System.out.println(res);

+		first = res.get(0);

+		assertTrue(first.getGenerWithQueryScore() > 000f);

+

+

+		query = " I see no meaningful distinction between complacency or complicity in the military's latest failure to uphold their own " +

+				"standards of conduct. Nor do I see a distinction between the service member who orchestrated this offense and the chain of " +

+				"command that was either oblivious to or tolerant of criminal behavior";

+		res = proc.runSearchViaAPI(query);

+		first = res.get(0);

+		assertTrue(first.getGenerWithQueryScore() > 1.69);

+		// assertTrue(second.getTitle().indexOf("living abroad")>-1);

+		proc.close();

+

+	}

+

+	/*public void testSimpleQuery(){

+		List<HitBase> res = proc.runSearchViaAPI("How can I pay tax on my income abroad");

+	}*/

+

+}

diff --git a/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/apps/RelatedSentenceFinderTest.java b/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/apps/RelatedSentenceFinderTest.java
new file mode 100644
index 0000000..c9e70ef
--- /dev/null
+++ b/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/apps/RelatedSentenceFinderTest.java
@@ -0,0 +1,142 @@
+/*

+ * Licensed to the Apache Software Foundation (ASF) under one or more

+ * contributor license agreements.  See the NOTICE file distributed with

+ * this work for additional information regarding copyright ownership.

+ * The ASF licenses this file to You under the Apache License, Version 2.0

+ * (the "License"); you may not use this file except in compliance with

+ * the License. You may obtain a copy of the License at

+ *

+ *     http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+package opennlp.tools.parse_thicket.apps;

+

+

+import java.util.ArrayList;

+import java.util.List;

+

+import opennlp.tools.similarity.apps.ContentGenerator;

+import opennlp.tools.similarity.apps.HitBase;

+import opennlp.tools.similarity.apps.RelatedSentenceFinder;

+import junit.framework.TestCase;

+

+

+public class RelatedSentenceFinderTest extends TestCase {

+	//RelatedSentenceFinder finder = new RelatedSentenceFinder();

+	ContentGenerator finder = new ContentGenerator();

+	

+	public void testAugmentWithMinedSentencesAndVerifyRelevanceTest(){

+		HitBase input = new HitBase();

+		input.setAbstractText("He is pictured here in the Swiss Patent Office where he did ...");

+		input.setUrl("http://apod.nasa.gov/apod/ap951219.html");

+		input.setTitle("Albert Einstein");

+		HitBase result = finder.//augmentWithMinedSentencesAndVerifyRelevance(input,

+				buildParagraphOfGeneratedText(input,

+				"Swiss Patent Office", new ArrayList<String>());

+		System.out.println(result.toString());

+		assertTrue(result.getOriginalSentences()!=null);

+		assertTrue(result.getOriginalSentences().size()>0);

+		assertTrue(result.getFragments().size()>0);

+		assertTrue(result.getFragments().get(0).getFragment().indexOf("Swiss Patent Office")>-1);

+	}

+	

+	

+	public void testBuildParagraphOfGeneratedTextTest(){

+		HitBase input = new HitBase();

+		input.setAbstractText("Albert Einstein was a German-born theoretical physicist who developed the general theory of relativity, one of the two pillars of modern physics (alongside ...");

+		input.setUrl("http://en.wikipedia.org/wiki/Albert_Einstein");

+		input.setTitle("Albert Einstein - Wikipedia, the free encyclopedia");

+		HitBase result = finder.buildParagraphOfGeneratedText(input,

+				"Albert Einstein", new ArrayList<String>());

+		System.out.println(result.toString());

+		assertTrue(result.getOriginalSentences()!=null);

+		assertTrue(result.getOriginalSentences().size()>0);

+		assertTrue(result.getFragments().size()>0);

+		assertTrue(result.getFragments().get(0).getFragment().indexOf("Albert Einstein")>-1);

+	} 

+

+	

+	public void testBuildParagraphOfGeneratedTextTestYearInTheEnd(){

+	    

+		HitBase input = new HitBase();

+		input.setAbstractText("Albert Einstein was born ... Germany, on March 14, 1879");

+		input.setUrl("http://www.nobelprize.org/nobel_prizes/physics/laureates/1921/einstein-bio.html");

+		input.setTitle("Albert Einstein - Biographical");

+		HitBase result = finder.buildParagraphOfGeneratedText(input,

+				"Albert Einstein", new ArrayList<String>());

+		System.out.println(result.toString());

+		assertTrue(result.getOriginalSentences()!=null);

+		assertTrue(result.getOriginalSentences().size()>0);

+		assertTrue(result.getFragments().size()>0);

+		assertTrue(result.getFragments().get(0).getFragment().indexOf("Albert Einstein")>-1);

+	} 

+	

+	public void testBuildParagraphOfGeneratedTextTestBio1(){

+		HitBase input = new HitBase();

+		input.setAbstractText("Today, the practical applications of Einstein’s theories ...");

+		input.setUrl("http://einstein.biz/biography.php");

+		input.setTitle("Biography");

+		HitBase result = finder.buildParagraphOfGeneratedText(input,

+				"applications of Einstein theories ", new ArrayList<String>());

+		System.out.println(result.toString());

+		assertTrue(result.getOriginalSentences()!=null);

+		assertTrue(result.getOriginalSentences().size()>0);

+		assertTrue(result.getFragments().size()>0);

+		assertTrue(result.getFragments().get(0).getFragment().indexOf("Einstein")>-1);

+	} 

+	

+	public void testBuildParagraphOfGeneratedTextTestBio2(){

+		HitBase input = new HitBase();

+		input.setAbstractText("The theory of relativity is a beautiful example of  ...");

+		input.setUrl("https://en.wikiquote.org/wiki/Albert_Einstein");

+		input.setTitle("Albert Einstein");

+		HitBase result = finder.buildParagraphOfGeneratedText(input,

+				"beautiful example of", new ArrayList<String>());

+		System.out.println(result.toString());

+		assertTrue(result.getOriginalSentences()!=null);

+		assertTrue(result.getOriginalSentences().size()>0);

+		assertTrue(result.getFragments().size()>0);

+		assertTrue(result.getFragments().get(0).getFragment().indexOf("relativity")>-1);

+	} 

+	

+	public void testBuildParagraphOfGeneratedTextTestBio3(){

+		HitBase input = new HitBase();

+		input.setAbstractText("I cannot conceive of a god who rewards and punishes his creatures or has a will of the kind that we experience  ...");

+		input.setUrl("http://www.ldolphin.org/einstein.html");

+		input.setTitle("Some Quotations of ALBERT EINSTEIN (1879-1955)");

+		HitBase result = finder.buildParagraphOfGeneratedText(input,

+				"cannot conceive a god", new ArrayList<String>());

+		System.out.println(result.toString());

+		assertTrue(result.getOriginalSentences()!=null);

+		assertTrue(result.getOriginalSentences().size()>0);

+		assertTrue(result.getFragments().size()>0);

+		assertTrue(result.getFragments().get(0).getFragment().indexOf("cannot conceive")>-1);

+	} 

+	

+

+	public void testBuildParagraphOfGeneratedTextTestBio4(){

+		HitBase input = new HitBase();

+		input.setAbstractText(" In 1905 our view of the world was changed dramatically and ...");

+		input.setUrl("http://philosophynow.org/issues/93/Albert_Einstein_1879-1955");

+		input.setTitle("ALBERT EINSTEIN (1879-1955)");

+		HitBase result = finder.buildParagraphOfGeneratedText(input,

+				"view of the world", new ArrayList<String>());

+		System.out.println(result.toString());

+		assertTrue(result.getOriginalSentences()!=null);

+		assertTrue(result.getOriginalSentences().size()>0);

+		assertTrue(result.getFragments().size()>0);

+		assertTrue(result.getFragments().get(0).getFragment().indexOf("view of the world")>-1);

+	} 

+	

+

+}

+

+

+//[Albert Einstein (/ælbrt anstan/; German. albt antan ( listen); 14 March 1879 18 April 1955) was a German-born theoretical physicist who developed the general theory of relativity, one of the two pillars of modern physics (alongside quantum mechanics). 2 3 While best known for his massenergy equivalence formula E = mc2 (which has been dubbed "the world's most famous equation"), 4 he received the 1921 Nobel Prize in Physics "for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect". 5 The latter was pivotal in establishing quantum theory. nullNear the beginning of his career, Einstein thought that Newtonian mechanics was no longer enough to reconcile the laws of classical mechanics with the laws of the electromagnetic field. This led to the development of his special theory of relativity.,

+

+//"Today, the practical applications of Einstein’s theories include the development of the television"
\ No newline at end of file
diff --git a/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/apps/SnippetToParagraphTest.java b/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/apps/SnippetToParagraphTest.java
new file mode 100644
index 0000000..fb6259b
--- /dev/null
+++ b/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/apps/SnippetToParagraphTest.java
@@ -0,0 +1,37 @@
+/*

+ * Licensed to the Apache Software Foundation (ASF) under one or more

+ * contributor license agreements.  See the NOTICE file distributed with

+ * this work for additional information regarding copyright ownership.

+ * The ASF licenses this file to You under the Apache License, Version 2.0

+ * (the "License"); you may not use this file except in compliance with

+ * the License. You may obtain a copy of the License at

+ *

+ *     http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+package opennlp.tools.parse_thicket.apps;

+

+

+import opennlp.tools.similarity.apps.HitBase;

+import junit.framework.TestCase;

+

+

+public class SnippetToParagraphTest extends TestCase {

+	SnippetToParagraph converter = new SnippetToParagraph();

+

+	public void testConversionTest(){

+		HitBase input = new HitBase();

+		input.setAbstractText("... complicity in the military's latest failure to uphold their own standards of conduct. Nor do I see a distinction between the service member who orchestrated this offense ...");

+		input.setUrl("http://armedservices.house.gov/index.cfm/press-releases?ContentRecord_id=b5d9aeab-6745-4eba-94ea-12295fd40e67");

+		input.setTitle("Press Releases - News - Armed Services Republicans");

+		HitBase result = converter.formTextFromOriginalPageGivenSnippet(input);

+		assertTrue(result.getOriginalSentences()!=null);

+		assertTrue(result.getOriginalSentences().size()>0);

+	}

+

+}

diff --git a/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/apps/StoryDiscourseNavigatorTest.java b/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/apps/StoryDiscourseNavigatorTest.java
new file mode 100644
index 0000000..f94d1a7
--- /dev/null
+++ b/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/apps/StoryDiscourseNavigatorTest.java
@@ -0,0 +1,47 @@
+/*

+ * Licensed to the Apache Software Foundation (ASF) under one or more

+ * contributor license agreements.  See the NOTICE file distributed with

+ * this work for additional information regarding copyright ownership.

+ * The ASF licenses this file to You under the Apache License, Version 2.0

+ * (the "License"); you may not use this file except in compliance with

+ * the License. You may obtain a copy of the License at

+ *

+ *     http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+package opennlp.tools.parse_thicket.apps;

+

+

+import java.util.ArrayList;

+import java.util.Arrays;

+import java.util.List;

+

+import opennlp.tools.similarity.apps.HitBase;

+import opennlp.tools.similarity.apps.RelatedSentenceFinder;

+import opennlp.tools.similarity.apps.StoryDiscourseNavigator;

+import junit.framework.TestCase;

+

+

+public class StoryDiscourseNavigatorTest extends TestCase {

+	RelatedSentenceFinder finder = new RelatedSentenceFinder();

+

+	

+	public void testGeneratedExtednsionKeywords(){

+		String[] res = new StoryDiscourseNavigator().obtainAdditionalKeywordsForAnEntity("Albert Einstein");

+		System.out.println(Arrays.asList(res));

+		assertTrue(res.length>0);

+		assertTrue(Arrays.asList(res).toString().indexOf("physics")>-1);

+		assertTrue(Arrays.asList(res).toString().indexOf("relativity")>-1);

+		

+		

+		

+	} 

+

+}

+

+//[Albert Einstein (/ælbrt anstan/; German. albt antan ( listen); 14 March 1879 18 April 1955) was a German-born theoretical physicist who developed the general theory of relativity, one of the two pillars of modern physics (alongside quantum mechanics). 2 3 While best known for his massenergy equivalence formula E = mc2 (which has been dubbed "the world's most famous equation"), 4 he received the 1921 Nobel Prize in Physics "for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect". 5 The latter was pivotal in establishing quantum theory. nullNear the beginning of his career, Einstein thought that Newtonian mechanics was no longer enough to reconcile the laws of classical mechanics with the laws of the electromagnetic field. This led to the development of his special theory of relativity.,

diff --git a/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/communicative_actions/CommunicativeActionsArcBuilderTest.java b/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/communicative_actions/CommunicativeActionsArcBuilderTest.java
new file mode 100644
index 0000000..bbce9e8
--- /dev/null
+++ b/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/communicative_actions/CommunicativeActionsArcBuilderTest.java
@@ -0,0 +1,67 @@
+/*

+ * Licensed to the Apache Software Foundation (ASF) under one or more

+ * contributor license agreements.  See the NOTICE file distributed with

+ * this work for additional information regarding copyright ownership.

+ * The ASF licenses this file to You under the Apache License, Version 2.0

+ * (the "License"); you may not use this file except in compliance with

+ * the License. You may obtain a copy of the License at

+ *

+ *     http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+package opennlp.tools.parse_thicket.communicative_actions;

+

+import java.util.ArrayList;

+import java.util.List;

+

+import opennlp.tools.parse_thicket.ParseThicket;

+import opennlp.tools.parse_thicket.WordWordInterSentenceRelationArc;

+import opennlp.tools.parse_thicket.matching.Matcher;

+import opennlp.tools.similarity.apps.HitBase;

+

+import junit.framework.TestCase;

+

+public class CommunicativeActionsArcBuilderTest extends TestCase {

+	Matcher matcher = new Matcher();

+	

+	public void testCommunicativeActionsArcBuilderTestQ(){

+		String text = "As a US citizen living abroad, I am concerned about the health reform regulation of 2014. "+

+				"I do not want to wait till I am sick to buy health insurance. "+

+				"Yet I am afraid I will end up being requested to pay the tax. "+

+				"Although I live abroad, I am worried about having to pay a fine for being reported as not having health insurance coverage. ";

+		ParseThicket pt = matcher.buildParseThicketFromTextWithRST(text);

+		List<WordWordInterSentenceRelationArc> results = new ArrayList<WordWordInterSentenceRelationArc>();

+		for(WordWordInterSentenceRelationArc arc: pt.getArcs()){

+			if(arc.getArcType().getType().startsWith("ca")){

+				results.add(arc);

+				System.out.println(arc);

+			}

+		}

+		assertTrue(results.size()>11);

+		

+	}

+	public void testCommunicativeActionsArcBuilderTestA(){

+		String text =	"People are worried about paying a fine for not carrying health insurance coverage, having been informed by IRS about new regulations. "+

+				"Yet hardly anyone is expected to pay the tax, when the health reform law takes full effect in 2014. "+

+				"The individual mandate confirms that people don’t wait until they are sick to buy health insurance. "+

+				"People are exempt from health insurance fine if they report they make too little money, or US citizens living abroad.";

+		ParseThicket pt = matcher.buildParseThicketFromTextWithRST(text);

+		List<WordWordInterSentenceRelationArc> results = new ArrayList<WordWordInterSentenceRelationArc>();

+		for(WordWordInterSentenceRelationArc arc: pt.getArcs()){

+			if(arc.getArcType().getType().startsWith("ca")){

+				results.add(arc);

+				System.out.println(arc);

+			}

+		}

+		assertTrue(results.size()>5);

+	}

+	

+

+	

+

+}

diff --git a/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/matching/PT2ThicketPhraseBuilderTest.java b/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/matching/PT2ThicketPhraseBuilderTest.java
new file mode 100644
index 0000000..12ae8ff
--- /dev/null
+++ b/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/matching/PT2ThicketPhraseBuilderTest.java
@@ -0,0 +1,23 @@
+package opennlp.tools.parse_thicket.matching;

+

+import java.util.List;

+

+import opennlp.tools.parse_thicket.ParseTreeNode;

+

+

+import junit.framework.TestCase;

+

+public class PT2ThicketPhraseBuilderTest extends TestCase { // unit test for parsing a penn-style bracketed phrase string into ParseTreeNode lists

+	private PT2ThicketPhraseBuilder builder = new PT2ThicketPhraseBuilder(); // builder under test

+	

+	public  void testParsePhrase(){ // parsePhrase should flatten a bracketed parse into its word/POS nodes

+		  String line = "(NP (NNP Iran)) (VP (VBZ refuses) (S (VP (TO to) (VP (VB accept) (S (NP (DT the) " +

+		  		"(NNP UN) (NN proposal)) (VP (TO to) (VP (VB end) (NP (PRP$ its) (NN dispute))))))))";

+		  

+		  List<ParseTreeNode> res = builder.parsePhrase("NP", line); // "NP" is the phrase-type label assigned to the extracted nodes

+		  System.out.println(res);

+		  assertTrue(res!=null); // smoke check only: non-null and non-empty result

+		  assertTrue(res.size()>0);

+				   

+	  }

+}

diff --git a/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/matching/PTMatcherTest.java b/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/matching/PTMatcherTest.java
new file mode 100644
index 0000000..9761bb2
--- /dev/null
+++ b/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/matching/PTMatcherTest.java
@@ -0,0 +1,106 @@
+package opennlp.tools.parse_thicket.matching;

+

+import java.util.ArrayList;

+import java.util.List;

+

+import opennlp.tools.parse_thicket.ParseThicket;

+import opennlp.tools.parse_thicket.WordWordInterSentenceRelationArc;

+import opennlp.tools.textsimilarity.ParseTreeChunk;

+import junit.framework.TestCase;

+

+public class PTMatcherTest extends TestCase { // regression tests for paragraph-level generalization via Matcher.assessRelevance

+	Matcher m = new Matcher(); // shared matcher; loads parsing models once for all tests in this class

+	

+	public void testMatchTwoParaTestReduced(){ // pins the exact generalization result (golden string) for a question/answer paragraph pair

+		String q = "I am a US citizen living abroad, and concerned about the health reform regulation of 2014. I do not want to wait till I am sick to buy health insurance. I am afraid I will end up paying the tax.";

+		String a = "People are worried about having to pay a fine for not carrying health insurance coverage got more guidance this week with some new federal regulations. "+

+				"Hardly anyone will end up paying the tax when the health reform law takes full effect in 2014. "+

+				"The individual mandate makes sure that people don’t wait until they are sick to buy health insurance. "+

+				"People are exempt from health insurance fine if they make too little money to file an income tax return, or US citizens living abroad."; 

+		List<List<ParseTreeChunk>> res = m.assessRelevance(q, a); // generalizes the two paragraphs into grouped phrase chunks

+		System.out.print(res);

+		assertTrue(res!=null);

+		assertTrue(res.size()>0);

+		assertEquals(res.toString(), "[[ [NNP-us NN-citizen VBG-living RB-abroad ],  [,-, CC-* ],  [DT-a NNP-* ],  [DT-the NN-* NN-health NN-reform NN-* CD-2014 ],  [NN-* IN-* CD-2014 ],  [NN-health NN-* NN-* IN-* ],  [NN-regulation ], " +

+				" [DT-the NN-health NN-reform NN-* ],  [CD-2014 ],  [NN-health NN-insurance ],  [DT-the NN-tax ],  [NN-tax ]], [ [VBP-* DT-a NNP-* NN-health NN-* NN-* NN-regulation ],  [NN-health NN-* NN-* NN-regulation ],  [NN-regulation ], " +

+				" [DT-the NN-* NN-health NN-reform NN-* CD-2014 ],  [NN-* IN-* CD-2014 ],  [IN-* NN-health NN-* ],  [NNP-us NN-citizen VBG-living RB-abroad ],  [,-, CC-* ],  [NN-health NN-* NN-* IN-* ], " +

+				" [IN-about NN-health NN-* NN-* NN-regulation ],  [VBG-living RB-abroad ],  [TO-to VB-* VB-wait IN-* PRP-* VBP-* JJ-sick TO-to VB-buy NN-health NN-insurance ],  [TO-to VB-* JJ-sick TO-to VB-buy NN-health NN-insurance ],  " +

+				"[TO-to VB-* NN-health NN-insurance ],  [TO-to VB-buy NN-health NN-insurance ],  [VB-* TO-to VB-* VB-* NN-health NN-insurance ],  [TO-to VB-* VB-* NN-health NN-insurance ],  [RB-not VB-* NN-health NN-insurance ],  [VBG-paying DT-* NN-* ],  " +

+				"[MD-will VB-end RP-up VBG-paying DT-the NN-tax ],  [VB-end RP-up VBG-paying DT-the NN-tax ],  [VBG-paying DT-the NN-tax ],  [VBP-do RB-* VB-* TO-* TO-to VB-* ],  [VB-* VB-wait IN-* PRP-* VBP-* JJ-sick TO-to VB-buy NN-health NN-insurance ], " +

+				" [VB-wait IN-* PRP-* VBP-* JJ-sick TO-to VB-buy NN-health NN-insurance ],  [VBP-* JJ-sick TO-to VB-buy NN-health NN-insurance ],  [TO-to VB-* VB-buy NN-health NN-insurance ],  [VB-buy NN-health NN-insurance ],  [NN-health NN-insurance NN-tax ],  " +

+				"[TO-to VB-* NN-tax ],  [NN-tax ],  [VB-* TO-to VB-* VB-wait IN-* PRP-* VBP-* JJ-sick TO-to VB-buy NN-health NN-insurance ],  [VB-* TO-to VB-* JJ-sick TO-to VB-buy NN-health NN-insurance ],  [VB-* NN-health NN-insurance ],  [VB-* VBG-paying DT-* NN-* ]]]");

+	

+	}

+

+	public void testMatchTwoParaTest1(){ // golden-string test: two paraphrased news paragraphs about the Iran/UN dispute

+		List<List<ParseTreeChunk>> res = m.assessRelevance("Iran refuses to accept the UN proposal to end its dispute over its work on nuclear weapons."+

+				"UN nuclear watchdog passes a resolution condemning Iran for developing its second uranium enrichment site in secret. " +

+				"A recent IAEA report presented diagrams that suggested Iran was secretly working on nuclear weapons. " +

+				"Iran envoy says its nuclear development is for peaceful purpose, and the material evidence against it has been fabricated by the US. "

+

+			, "Iran refuses the UN offer to end a conflict over its nuclear weapons."+

+					"UN passes a resolution prohibiting Iran from developing its uranium enrichment site. " +

+					"A recent UN report presented charts saying Iran was working on nuclear weapons. " +

+				"Iran envoy to UN states its nuclear development is for peaceful purpose, and the evidence against its claim is fabricated by the US. ");

+		System.out.print(res);

+		assertTrue(res!=null);

+		assertTrue(res.size()>0);

+		assertEquals(res.toString(), "[[ [DT-the NNP-un NN-* ],  [PRP$-its JJ-nuclear NNS-weapons ],  [NN-work IN-on JJ-nuclear NNS-weapons ],  [PRP$-its NN-* JJ-nuclear NNS-* ],  [PRP$-its JJ-nuclear NNS-* ],  [DT-a NN-* PRP$-its JJ-* NN-* ],  [DT-a NN-resolution VBG-* NNP-iran IN-* VBG-developing PRP$-its NN-uranium NN-enrichment NN-site ],  [NN-* VBG-* NNP-iran ],  [DT-a NN-resolution VBG-* NNP-* NNP-iran ],  [DT-a NN-resolution NNP-iran ],  [DT-a NNP-iran ],  [DT-a PRP$-its ],  [NNP-iran IN-* VBG-developing PRP$-its NN-uranium NN-enrichment NN-site ],  [IN-for ],  [VBG-* PRP$-its JJ-* NN-* ],  [PRP$-its NN-uranium NN-enrichment NN-site ],  [PRP$-its JJ-* NN-* ],  [VBD-* NNP-iran VBD-was VBG-working IN-on JJ-nuclear NNS-weapons ],  [VBG-* JJ-nuclear NNS-* ],  [JJ-nuclear NNS-weapons ],  [JJ-nuclear NNS-* ],  [NNP-iran NN-envoy ],  [NN-* IN-* PRP-it ],  [NN-* PRP-it ],  [DT-the NN-* NN-evidence IN-against PRP-it ],  [DT-the NN-* NN-* ],  [PRP-it ],  [DT-the NNP-us ],  [DT-the NNP-* ],  [DT-a NN-resolution DT-a JJ-recent NNP-* NN-report ],  [DT-a JJ-recent NNP-* NN-report ],  [NN-* PRP$-its JJ-nuclear NN-* ],  [PRP$-its JJ-nuclear NN-* ],  [VBZ-* PRP$-its ],  [NN-development ],  [PRP$-its JJ-nuclear NN-development ],  [JJ-peaceful NN-purpose ],  [NN-* VBZ-says ],  [NNP-un JJ-nuclear NN-* VBZ-* ],  [NN-* VBZ-* PRP$-its JJ-nuclear NN-development VBZ-is IN-for JJ-peaceful NN-purpose ],  [JJ-nuclear NN-* VBZ-* NN-development VBZ-is IN-for JJ-peaceful NN-purpose ],  [NNP-un NN-* PRP$-its ]], [ [VBZ-refuses TO-to VB-* DT-* NNP-* ],  [VB-* DT-the NNP-un NN-* TO-to VB-end PRP$-its ],  [NNP-un ],  [NNP-* NN-* TO-to ],  [TO-to VB-end PRP$-its ],  [VBZ-* DT-a NN-* PRP$-its JJ-* NN-* ],  [VBZ-passes DT-a NN-resolution VBG-* NNP-iran IN-* VBG-developing PRP$-its NN-uranium NN-enrichment NN-site ],  [NN-* VBG-* NNP-iran ],  [VBG-* NNP-iran IN-* VBG-developing PRP$-its NN-uranium NN-enrichment NN-site ],  [IN-for ],  [PRP$-its JJ-* NN-* ],  [VBG-developing PRP$-its NN-uranium 
NN-enrichment NN-site ],  [VBG-* PRP$-its JJ-* NN-* ],  [VBD-presented NNS-* NNP-iran VBD-was VBG-working IN-on JJ-nuclear NNS-weapons ],  [VBD-* NNP-iran VBD-was VBG-working IN-on JJ-nuclear NNS-weapons ],  [NNP-iran ],  [VBD-was VBG-working IN-on JJ-nuclear NNS-weapons ],  [JJ-nuclear NNS-weapons ],  [VBG-* JJ-nuclear NNS-* ],  [VBG-working IN-on JJ-nuclear NNS-weapons ],  [PRP$-its JJ-nuclear NN-* ],  [NN-development ],  [VBZ-says JJ-nuclear NN-* ],  [VBZ-* PRP$-its JJ-nuclear NN-development VBZ-is IN-for JJ-peaceful NN-purpose ],  [VBZ-* JJ-nuclear NN-* ],  [VBZ-is IN-for JJ-peaceful NN-purpose ],  [VBN-* VBN-fabricated IN-by DT-the NNP-us ],  [VBN-fabricated IN-by DT-the NNP-us ],  [TO-to VB-* DT-* NNP-* VB-end PRP$-its ],  [VB-end PRP$-its ],  [NN-* IN-over PRP$-its ],  [PRP$-its JJ-nuclear NNS-weapons ],  [DT-a ],  [TO-* VB-* PRP$-its NN-* ],  [VB-* PRP$-its NN-* ],  [VB-* PRP$-its JJ-nuclear NNS-* ],  [DT-the NNP-* ],  [TO-to NNP-un ],  [NN-work IN-on JJ-nuclear NNS-weapons ]]]");

+	}

+

+	public void testMatchTwoParaTest2(){ // golden-string test: health-insurance paragraphs with an extra fourth sentence vs. testMatchTwoParaTestReduced

+		List<List<ParseTreeChunk>> res = m.assessRelevance("I am a US citizen living abroad, and concerned about the health reform regulation of 2014. "+

+				"I do not want to wait till I am sick to buy health insurance. "+

+				"I am afraid I will end up paying the tax. "+

+				"I am worried about having to pay a fine for not having health insurance coverage. "

+				, 

+				"People are worried about having to pay a fine for not carrying health insurance coverage got more guidance this week with some new federal regulations. "+

+						"Hardly anyone will end up paying the tax when the health reform law takes full effect in 2014. "+

+						"The individual mandate makes sure that people don’t wait until they are sick to buy health insurance. "+

+				"People are exempt from health insurance fine if they make too little money to file an income tax return, or US citizens living abroad.");

+		System.out.print(res);

+		assertTrue(res!=null);

+		assertTrue(res.size()>0);

+		assertEquals(res.toString(), "[[ [NNP-us NN-citizen VBG-living RB-abroad ],  [,-, CC-* ],  [DT-a NNP-* ],  [DT-the NN-* NN-health NN-reform NN-* CD-2014 ],  " +

+				"[NN-* IN-* CD-2014 ],  [NN-health NN-* NN-* IN-* ],  [NN-regulation ],  [DT-the NN-health NN-reform NN-* ],  [CD-2014 ],  [DT-the NN-tax ],  [NN-tax ], " +

+				" [DT-a NN-fine ],  [NN-health NN-insurance NN-coverage ],  [TO-to VB-* DT-* NN-* ],  [NN-fine IN-* ],  [NN-health NN-insurance NN-* ]], " +

+				"[ [VBP-* DT-a NNP-* NN-health NN-* NN-* NN-regulation ],  [NN-health NN-* NN-* NN-regulation ],  [NN-regulation ],  [DT-the NN-* NN-health NN-reform NN-* CD-2014 ], " +

+				" [NN-* IN-* CD-2014 ],  [IN-* NN-health NN-* ],  [NNP-us NN-citizen VBG-living RB-abroad ],  [,-, CC-* ],  [NN-health NN-* NN-* IN-* ],  [IN-about NN-health NN-* NN-* NN-regulation ],  [VBG-living RB-abroad ],  [TO-to VB-* VB-wait IN-* PRP-* VBP-* JJ-sick TO-to VB-buy NN-health NN-insurance ],  [TO-to VB-* JJ-sick TO-to VB-buy NN-health NN-insurance ],  [TO-to VB-buy NN-health NN-insurance ],  [VBG-* VB-pay DT-* NN-* NN-health NN-* NN-* ],  [VB-pay DT-* NN-* NN-health NN-* NN-* ],  [RB-not VBG-* NN-health NN-insurance NN-coverage ],  [VBG-having NN-health NN-insurance NN-coverage ],  [NN-health NN-insurance NN-tax ],  [TO-to VB-* NN-tax ],  [VB-* TO-to VB-* VB-* NN-health NN-insurance ],  [TO-to VB-* VB-* NN-health NN-insurance ],  [TO-to VB-* VB-pay DT-a NN-fine IN-for RB-not VBG-* NN-health NN-insurance NN-coverage ],  [VB-pay DT-a NN-fine IN-for RB-not VBG-* NN-health NN-insurance NN-coverage ],  [RB-not VB-* NN-health NN-insurance NN-coverage ],  [VBP-do RB-* VB-* TO-* TO-to VB-* ],  [VB-* VB-wait IN-* PRP-* VBP-* JJ-sick TO-to VB-buy NN-health NN-insurance ],  [VB-wait IN-* PRP-* VBP-* JJ-sick TO-to VB-buy NN-health NN-insurance ],  [VBP-* JJ-sick TO-to VB-buy NN-health NN-insurance ],  [TO-to VB-* VB-buy NN-health NN-insurance ],  [VB-buy NN-health NN-insurance ],  [VB-* TO-to VB-* VB-wait IN-* PRP-* VBP-* JJ-sick TO-to VB-buy NN-health NN-insurance ],  [VB-* TO-to VB-* JJ-sick TO-to VB-buy NN-health NN-insurance ],  [VB-* TO-to VB-* VB-pay DT-a NN-fine IN-for RB-not VBG-* NN-health NN-insurance NN-coverage ],  [VB-* NN-health NN-insurance NN-coverage ],  [VBG-having TO-to VB-pay DT-a NN-fine IN-for RB-not VBG-* NN-health NN-insurance NN-coverage ],  [TO-to VB-pay DT-a NN-fine IN-for RB-not VBG-* NN-health NN-insurance NN-coverage ],  [VBG-paying DT-* NN-* DT-a NN-fine IN-for RB-not VBG-* NN-health NN-insurance NN-coverage ],  [VBG-* NN-health NN-insurance NN-coverage ],  [MD-will VB-end RP-up VBG-paying DT-the NN-tax ],  [VB-end RP-up VBG-paying DT-the 
NN-tax NN-health NN-* NN-* ],  [VBG-paying DT-the NN-tax NN-health NN-* NN-* ],  [TO-to VB-* NN-health NN-insurance ],  [NN-fine IN-* ],  [NN-health NN-insurance NN-* ],  [TO-to VB-* DT-* NN-* ],  [NN-tax ],  [VBP-* VBN-worried IN-about VBG-having TO-to VB-pay DT-a NN-fine IN-for RB-not VBG-* NN-health NN-insurance NN-coverage ],  [VB-* VBG-paying DT-* NN-* DT-a NN-fine IN-for RB-not VBG-* NN-health NN-insurance NN-coverage ], " +

+				" [VBN-worried IN-about VBG-having TO-to VB-pay DT-a NN-fine IN-for RB-not VBG-* NN-health NN-insurance NN-coverage ]]]");

+	}

+

+

+	public void testMatchTwoParaTestCA(){ // smoke test with communicative actions in both paragraphs; no golden string, only non-empty result

+		List<List<ParseTreeChunk>> res = m.assessRelevance("As a US citizen living abroad, I am concerned about the health reform regulation of 2014. "+

+				"I do not want to wait till I am sick to buy health insurance. "+

+				"Yet I am afraid I will end up paying the tax. "+

+				"Although I live abroad, I am worried about having to pay a fine for being reported as not having health insurance coverage. "

+				, 

+				"People are worried about paying a fine for not carrying health insurance coverage, having been informed by IRS about new regulations. "+

+						"Yet hardly anyone is expected to pay the tax, when the health reform law takes full effect in 2014. "+

+						"The individual mandate confirms that people don’t wait until they are sick to buy health insurance. "+

+				"People are exempt from health insurance fine if they report they make too little money, or US citizens living abroad.");

+		System.out.print(res);

+		assertTrue(res!=null);

+		assertTrue(res.size()>0);

+	}

+

+	public void testMatchTwoParaTestCA1(){ // variant of testMatchTwoParaTestCA with slightly reworded first paragraph

+		String text1 = "As a US citizen living abroad, I am concerned about the health reform regulation of 2014. "+

+				"I do not want to wait till I am sick to buy health insurance. "+

+				"Yet I am afraid I will end up being requested to pay the tax. "+

+				"Although I live abroad, I am worried about having to pay a fine for being reported as not having health insurance coverage. ";

+

+		String text2 =	"People are worried about paying a fine for not carrying health insurance coverage, having been informed by IRS about new regulations. "+

+				"Yet hardly anyone is expected to pay the tax, when the health reform law takes full effect in 2014. "+

+				"The individual mandate confirms that people don’t wait until they are sick to buy health insurance. "+

+				"People are exempt from health insurance fine if they report they make too little money, or US citizens living abroad.";

+		List<List<ParseTreeChunk>> res = m.assessRelevance(text1, text2); // smoke check only

+		System.out.print(res);

+		assertTrue(res!=null);

+		assertTrue(res.size()>0);

+	}

+

+}

+

+

diff --git a/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/matching/PTPhraseBuilderTest.java b/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/matching/PTPhraseBuilderTest.java
new file mode 100644
index 0000000..7233c46
--- /dev/null
+++ b/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/matching/PTPhraseBuilderTest.java
@@ -0,0 +1,76 @@
+package opennlp.tools.parse_thicket.matching;

+

+import java.util.ArrayList;

+import java.util.List;

+

+import edu.stanford.nlp.trees.Tree;

+

+import opennlp.tools.parse_thicket.ParseCorefsBuilder;

+import opennlp.tools.parse_thicket.ParseThicket;

+import opennlp.tools.parse_thicket.ParseTreeNode;

+import opennlp.tools.parse_thicket.WordWordInterSentenceRelationArc;

+import opennlp.tools.textsimilarity.ParseTreeChunk;

+import junit.framework.TestCase;

+

+public class PTPhraseBuilderTest extends TestCase { // tests phrase extraction from parse thickets, incl. the UCP (unlike coordinated phrase) case

+	private ParseCorefsBuilder ptBuilder = ParseCorefsBuilder.getInstance(); // singleton: builds thickets with coreference links

+	private PT2ThicketPhraseBuilder phraseBuilder = new PT2ThicketPhraseBuilder(); // converts thicket sentences into phrase node lists

+		

+	public void testBuildPhraseForUCP(){ // sentence 0 contains a UCP ("living abroad, and concerned about ..."); pins the extracted ADJP at index 7

+		String q = "I am a US citizen living abroad, and concerned about the health reform regulation of 2014. " +

+				"I do not want to wait till I am sick to buy health insurance. I am afraid I will end up paying the tax.";

+		

+		ParseThicket pt = ptBuilder.buildParseThicket(q);

+		List<ParseTreeNode> sentence = pt.getNodesThicket().get(0); // node list of the first sentence

+		Tree ptree = pt.getSentences().get(0);		

+		List<List<ParseTreeNode>> res = phraseBuilder.buildPT2ptPhrasesForASentence(ptree, sentence );	

+		assertTrue(res!=null);  

+		assertEquals(res.get(7).toString(), 

+				"[<10>ADJP'concerned':JJ, <11>ADJP'about':IN, <12>ADJP'the':DT, <13>ADJP'health':NN, <14>ADJP'reform':NN, <15>ADJP'regulation':NN, <16>ADJP'of':IN, <17>ADJP'2014':CD]");

+		

+		assertTrue(res.size()>12); // first sentence should yield more than 12 phrases

+		

+		sentence = pt.getNodesThicket().get(1); // repeat for the second sentence, smoke check only

+		ptree = pt.getSentences().get(1);		

+		ptree.pennPrint();

+		res = phraseBuilder.buildPT2ptPhrasesForASentence(ptree, sentence );		

+		assertTrue(res!=null);

+		assertTrue(res.size()>0);

+	

+	}

+	

+	public void testParsePhrase(){ // parsePhrase flattens a bracketed parse string; the given label prefixes every node

+		String line = "(NP (NNP Iran)) (VP (VBZ refuses) (S (VP (TO to) (VP (VB accept) (S (NP (DT the) " +

+				"(NNP UN) (NN proposal)) (VP (TO to) (VP (VB end) (NP (PRP$ its) (NN dispute))))))))";

+

+		List<ParseTreeNode> res = phraseBuilder. parsePhrase("NP", line);

+		System.out.println(res);

+		assertEquals(res.toString(), 

+				"[NP'Iran':NNP, NP'refuses':VBZ, NP'to':TO, NP'accept':VB, NP'the':DT, NP'UN':NNP, NP'proposal':NN, NP'to':TO, NP'end':VB, NP'its':PRP$, NP'dispute':NN]");

+

+

+		line = "(VP (VBP am) (NP (NP (DT a) (NNP US) (NN citizen)) (UCP (VP (VBG living) (ADVP (RB abroad))) (, ,) (CC and) (ADJP (JJ concerned) (PP (IN about) (NP (NP (DT the) (NN health) (NN reform) (NN regulation)) (PP (IN of) (NP (CD 2014)))))))))";

+		res = phraseBuilder. parsePhrase("VP", line); // same input sentence as testBuildPhraseForUCP, fed as raw bracketing

+		System.out.println(res);

+		assertEquals(res.toString(), "[VP'am':VBP, VP'a':DT, VP'US':NNP, VP'citizen':NN, VP'living':VBG, VP'abroad':RB, VP',':,, VP'and':CC, VP'concerned':JJ, VP'about':IN, VP'the':DT, VP'health':NN, VP'reform':NN, VP'regulation':NN, VP'of':IN, VP'2014':CD]");

+

+		

+		line = "(VP (TO to) (VP (VB wait) (SBAR (IN till) (S (NP (PRP I)) (VP (VBP am) (ADJP (JJ sick) (S (VP (TO to) (VP (VB buy) (NP (NN health) (NN insurance)))))))))))";

+		res = phraseBuilder. parsePhrase("VP", line);

+		assertEquals(res.toString(), "[VP'to':TO, VP'wait':VB, VP'till':IN, VP'I':PRP, VP'am':VBP, VP'sick':JJ, VP'to':TO, VP'buy':VB, VP'health':NN, VP'insurance':NN]");

+		System.out.println(res);

+	}

+	

+	public void testBuilderPTPhrase(){ // end-to-end: thicket for whole paragraph -> phrase lists; smoke check only

+		String q = "I am a US citizen living abroad, and concerned about the health reform regulation of 2014. " +

+				"I do not want to wait till I am sick to buy health insurance. I am afraid I will end up paying the tax.";

+			ParseThicket pt = ptBuilder.buildParseThicket(q);

+			List<List<ParseTreeNode>> res = phraseBuilder.buildPT2ptPhrases(pt);

+			assertTrue(res!=null);

+			assertTrue(res.size()>0);

+

+	}

+

+}

+

+

diff --git a/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/matching/PairwiseMatcherTest.java b/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/matching/PairwiseMatcherTest.java
new file mode 100644
index 0000000..a5eb09e
--- /dev/null
+++ b/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/matching/PairwiseMatcherTest.java
@@ -0,0 +1,33 @@
+package opennlp.tools.parse_thicket.matching;

+

+import java.util.ArrayList;

+import java.util.List;

+

+import opennlp.tools.parse_thicket.ParseThicket;

+import opennlp.tools.parse_thicket.WordWordInterSentenceRelationArc;

+import opennlp.tools.textsimilarity.ParseTreeChunk;

+import opennlp.tools.textsimilarity.SentencePairMatchResult;

+import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;

+import junit.framework.TestCase;

+

+public class PairwiseMatcherTest extends TestCase { // sentence-pair baseline matcher on the same texts used by PTMatcherTest.testMatchTwoParaTestReduced

+	public void testMatchTwoParaTestReduced(){ // smoke test: pairwise (non-thicket) matcher must produce a non-empty match result

+		String q = "I am a US citizen living abroad, and concerned about the health reform regulation of 2014. I do not want to wait till I am sick to buy health insurance. I am afraid I will end up paying the tax.";

+		String a = "People are worried about having to pay a fine for not carrying health insurance coverage got more guidance this week with some new federal regulations. "+

+				"Hardly anyone will end up paying the tax when the health reform law takes full effect in 2014. "+

+				"The individual mandate makes sure that people don’t wait until they are sick to buy health insurance. "+

+				"People are exempt from health insurance fine if they make too little money to file an income tax return, or US citizens living abroad."; 

+		ParserChunker2MatcherProcessor sm = ParserChunker2MatcherProcessor.getInstance(); // singleton sentence-level matcher

+		SentencePairMatchResult res1 = sm.assessRelevance(a, q); // NOTE(review): arguments are (a, q), reversed vs. PTMatcherTest's (q, a) — confirm intended

+		System.out.print(res1.getMatchResult());

+		System.out.print(res1);

+		assertTrue(res1!=null);

+		assertTrue(res1.getMatchResult().size()>0);

+	

+	}

+

+	

+

+}

+

+

diff --git a/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/pattern_structure/PhrasePatternStructureTest.java b/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/pattern_structure/PhrasePatternStructureTest.java
new file mode 100644
index 0000000..958910e
--- /dev/null
+++ b/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/pattern_structure/PhrasePatternStructureTest.java
@@ -0,0 +1,125 @@
+package opennlp.tools.parse_thicket.pattern_structure;
+
+import java.util.*;
+import java.io.*;
+
+import junit.framework.TestCase;
+
+import opennlp.tools.parse_thicket.ParseCorefsBuilder;
+import opennlp.tools.parse_thicket.ParseThicket;
+import opennlp.tools.parse_thicket.ParseTreeNode;
+import opennlp.tools.parse_thicket.matching.PT2ThicketPhraseBuilder;
+import opennlp.tools.textsimilarity.ParseTreeChunk;
+import opennlp.tools.textsimilarity.ParseTreeMatcherDeterministic;
+
+
+public class PhrasePatternStructureTest extends TestCase{ // builds a pattern-structure lattice from paragraphs (mojibake in test texts repaired: Mac-Roman Õ/Ñ/Ò/Ó/Ð -> ’ — “ ” –)
+	ParseTreeMatcherDeterministic md = new ParseTreeMatcherDeterministic(); // NOTE(review): md appears unused in this test class
+	ParseCorefsBuilder ptBuilder = ParseCorefsBuilder.getInstance();
+	PT2ThicketPhraseBuilder phraseBuilder = new PT2ThicketPhraseBuilder();
+	
+	public void testLeoTolstoyTest() { // four translations of the same War and Peace opening, added as lattice intents
+		PhrasePatternStructure lat = new PhrasePatternStructure(3,1);	
+		
+		String description;
+		ParseThicket pt1;
+		List<List<ParseTreeNode>> phrs1;
+		List<List<ParseTreeChunk>> sent1GrpLst;
+		//Example 1
+		description = "Eh bien, mon prince, so Genoa and Lucca are now no more than family estates of the Bonapartes. No, I warn you, if you don’t say that this means war, if you still permit yourself to condone all the infamies, all the atrocities, of this Antichrist—and that’s what I really believe he is—I will have nothing more to do with you, you are no longer my friend, my faithful slave, as you say. But how do you do, how do you do? I see that I am frightening you. Sit down and tell me all about it.";
+		pt1 = ptBuilder.buildParseThicket(description);	
+		phrs1 = phraseBuilder.buildPT2ptPhrases(pt1);
+		sent1GrpLst = lat.formGroupedPhrasesFromChunksForPara(phrs1);
+		lat.AddIntent(sent1GrpLst, 0); // insert this paragraph's grouped phrases into the lattice
+		
+		description = "Well, Prince, so Genoa and Lucca are now just family estates of the Buonapartes. But I warn you, if you don't tell me that this means war, if you still try to defend the infamies and horrors perpetrated by that Antichrist—I really believe he is Antichrist—I will have nothing more to do with you and you are no longer my friend, no longer my 'faithful slave,' as you call yourself! But how do you do? I see I have frightened you—sit down and tell me all the news";		
+		pt1 = ptBuilder.buildParseThicket(description);	
+		phrs1 = phraseBuilder.buildPT2ptPhrases(pt1);
+		sent1GrpLst = lat.formGroupedPhrasesFromChunksForPara(phrs1);
+		lat.AddIntent(sent1GrpLst, 0);
+		
+		
+		description = "Well, Prince, Genoa and Lucca are now nothing more than estates taken over by the Buonaparte family.1 No, I give you fair warning. If you won’t say this means war, if you will allow yourself to condone all the ghastly atrocities perpetrated by that Antichrist – yes, that’s what I think he is – I shall disown you. You’re no friend of mine – not the “faithful slave” you claim to be . . . But how are you? How are you keeping? I can see I’m intimidating you. Do sit down and talk to me.";
+		pt1 = ptBuilder.buildParseThicket(description);	
+		phrs1 = phraseBuilder.buildPT2ptPhrases(pt1);
+		sent1GrpLst = lat.formGroupedPhrasesFromChunksForPara(phrs1);
+		lat.AddIntent(sent1GrpLst, 0);
+		
+		description = "Well, prince, Genoa and Lucca are now nothing more than the apanages, than the private property of the Bonaparte family. I warn you that if you do not tell me we are going to have war, if you still allow yourself to condone all the infamies, all the atrocities of this Antichrist - on my word I believe he is Antichrist - that is the end of our acquaintance; you are no longer my friend, you are no longer my faithful slave, as you call yourself. Now, be of good courage, I see I frighten you. Come, sit down and tell me all about it.";
+		pt1 = ptBuilder.buildParseThicket(description);	
+		phrs1 = phraseBuilder.buildPT2ptPhrases(pt1);
+		sent1GrpLst = lat.formGroupedPhrasesFromChunksForPara(phrs1);
+		lat.AddIntent(sent1GrpLst, 0); 
+		
+		lat.printLattice(); // no assertions: this test only exercises lattice construction and prints the result
+		lat.printLatticeStats();
+	}
+		//Example 2
+	public void testNewsTest() { // three news paragraphs (two paraphrases + one unrelated) added as lattice intents
+		PhrasePatternStructure lat = new PhrasePatternStructure(3,4);	
+		
+		String description;
+		ParseThicket pt1;
+		List<List<ParseTreeNode>> phrs1;
+		List<List<ParseTreeChunk>> sent1GrpLst;
+		/*List<List<ParseTreeChunk>> res = m.assessRelevance("At least 9 people were killed and 43 others wounded in shootings and bomb attacks, including four car bombings, in central and western Iraq on Thursday, the police said. A car bomb parked near the entrance of the local government compound in Anbar's provincial capital of Ramadi, some 110 km west of Baghdad, detonated in the morning near a convoy of vehicles carrying the provincial governor Qassim al-Fahdawi, a provincial police source told Xinhua on condition of anonymity.",
+				"Officials say a car bomb in northeast Baghdad killed four people, while another bombing at a market in the central part of the capital killed at least two and wounded many more. Security officials also say at least two policemen were killed by a suicide car bomb attack in the northern city of Mosul. No group has claimed responsibility for the attacks, which occurred in both Sunni and Shi'ite neighborhoods."
+				);*/
+		description = "At least 9 people were killed and 43 others wounded in shootings and bomb attacks, including four car bombings, in central and western Iraq on Thursday, the police said. A car bomb parked near the entrance of the local government compound in Anbar's provincial capital of Ramadi, some 110 km west of Baghdad, detonated in the morning near a convoy of vehicles carrying the provincial governor Qassim al-Fahdawi, a provincial police source told Xinhua on condition of anonymity.";
+		pt1 = ptBuilder.buildParseThicket(description);	
+		phrs1 = phraseBuilder.buildPT2ptPhrases(pt1);
+		sent1GrpLst = lat.formGroupedPhrasesFromChunksForPara(phrs1);
+		lat.AddIntent(sent1GrpLst, 0);
+		
+		description = "Officials say a car bomb in northeast Baghdad killed four people, while another bombing at a market in the central part of the capital killed at least two and wounded many more. Security officials also say at least two policemen were killed by a suicide car bomb attack in the northern city of Mosul. No group has claimed responsibility for the attacks, which occurred in both Sunni and Shi'ite neighborhoods.";
+		pt1 = ptBuilder.buildParseThicket(description);	
+		phrs1 = phraseBuilder.buildPT2ptPhrases(pt1);
+		sent1GrpLst = lat.formGroupedPhrasesFromChunksForPara(phrs1);
+		lat.AddIntent(sent1GrpLst, 0);
+		
+		description = "Two car bombs killed at least four people and wounded dozens of others on Monday in one of the bloodiest attacks this year in Dagestan, a turbulent province in Russia's North Caucasus region where armed groups are waging an Islamist insurgency. Car bombs, suicide bombings and firefights are common in Dagestan, at the centre of an insurgency rooted in two post-Soviet wars against separatist rebels in neighbouring Chechnya. Such attacks are rare in other parts of Russia, but in a separate incident in a suburb of Moscow on Monday, security forces killed two suspected militants alleged to have been plotting an attack in the capital and arrested a third suspect after a gunbattle";
+	//	Description = "AMMAN, Jordan (AP) — A Syrian government official says a car bomb has exploded in a suburb of the capital Damascus, killing three people and wounding several others. The Britain-based Syrian Observatory for Human Rights confirmed the Sunday explosion in Jouber, which it said has seen heavy clashes recently between rebels and the Syrian army. It did not have any immediate word on casualties. It said the blast targeted a police station and was carried out by the Jabhat al-Nusra, a militant group linked to al-Qaida, did not elaborate.";
+	//	Description = "A car bombing in Damascus has killed at least nine security forces, with aid groups urging the evacuation of civilians trapped in the embattled Syrian town of Qusayr. The Syrian Observatory for Human Rights said on Sunday the explosion, in the east of the capital, appeared to have been carried out by the extremist Al-Nusra Front, which is allied to al-Qaeda, although there was no immediate confirmation. In Lebanon, security sources said two rockets fired from Syria landed in a border area, and Israeli war planes could be heard flying low over several parts of the country.";
+		pt1 = ptBuilder.buildParseThicket(description);	
+		phrs1 = phraseBuilder.buildPT2ptPhrases(pt1);
+		sent1GrpLst = lat.formGroupedPhrasesFromChunksForPara(phrs1);
+		lat.AddIntent(sent1GrpLst, 0);
+		
+		
+	
+		lat.printLattice(); // no assertions here either: exercises lattice construction and prints diagnostics
+		lat.printConceptByPosition(0);
+		/*
+		Set<Integer> intent = new HashSet<Integer>();
+		intent.add(0);
+		intent.add(1);
+		int gen = lat.GetMaximalConcept(intent,0);
+		System.out.println("generator: " + gen);
+		intent.clear();
+		intent.add(0);
+		intent.add(3);
+		
+		lat.AddIntent(intent, 0);
+		//System.out.println("after first addintent");
+		//lat.printConceptByPosition(0);
+		//lat.printConceptByPosition(1);
+		intent.clear();
+		intent.add(0);
+		intent.add(2);
+		lat.AddIntent(intent, 0);
+
+		intent.clear();
+		intent.add(1);
+		intent.add(2);
+
+		lat.AddIntent(intent, 0);
+		intent.clear();
+		intent.add(1);
+		intent.add(2);
+		intent.add(3);
+		lat.AddIntent(intent, 0);
+		lat.printLattice();
+		lat.printLatticeStats();
+		*/
+	}
+}
\ No newline at end of file
diff --git a/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/rhetoric_structure/RhetoricStructureMarkerTest.java b/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/rhetoric_structure/RhetoricStructureMarkerTest.java
new file mode 100644
index 0000000..4a39a7f
--- /dev/null
+++ b/opennlp-similarity/src/test/java/opennlp/tools/parse_thicket/rhetoric_structure/RhetoricStructureMarkerTest.java
@@ -0,0 +1,67 @@
+package opennlp.tools.parse_thicket.rhetoric_structure;

+

+import java.util.ArrayList;

+import java.util.Arrays;

+import java.util.List;

+

+import junit.framework.TestCase;

+

+import opennlp.tools.parse_thicket.IGeneralizer;

+import opennlp.tools.parse_thicket.Pair;

+import opennlp.tools.parse_thicket.ParseThicket;

+import opennlp.tools.parse_thicket.ParseTreeNode;

+import opennlp.tools.parse_thicket.matching.Matcher;

+import opennlp.tools.textsimilarity.ParseTreeChunk;

+

+

+public class RhetoricStructureMarkerTest extends TestCase  {

+	

+	private RhetoricStructureMarker rstMarker = new RhetoricStructureMarker();

+	private Matcher matcher = new Matcher();

+	

+	public  RhetoricStructureMarkerTest(){

+

+		

+	}

+

+	public void testRSTmarker(){

+		String text1 = "As a US citizen living abroad, I am concerned about the health reform regulation of 2014. "+

+				"I do not want to wait till I am sick to buy health insurance. "+

+				"Yet I am afraid I will end up being requested to pay the tax. "+

+				"Although I live abroad, I am worried about having to pay a fine for being reported as not having health insurance coverage. ";

+

+		String text2 =	"People are worried about paying a fine for not carrying health insurance coverage, having been informed by IRS about new regulations. "+

+				"Yet hardly anyone is expected to pay the tax, when the health reform law takes full effect in 2014. "+

+				"The individual mandate confirms that people don’t wait until they are sick to buy health insurance. "+

+				"People are exempt from health insurance fine as long as they report they make too little money, or US citizens living abroad.";

+		ParseThicket pt = matcher.buildParseThicketFromTextWithRST(text1);

+		for(List<ParseTreeNode> sent: pt.getNodesThicket()){

+			List<Pair<String, Integer[]>> res = rstMarker .extractRSTrelationInSentenceGetBoundarySpan(sent);

+			System.out.println(rstMarker.markerToString(res));

+		}

+		

+		//assertTrue(res.size()>1);

+		

+		

+		pt = matcher.buildParseThicketFromTextWithRST(text2);

+		for(List<ParseTreeNode> sent: pt.getNodesThicket()){

+			List<Pair<String, Integer[]>> res = rstMarker .extractRSTrelationInSentenceGetBoundarySpan(sent);

+			System.out.println(rstMarker.markerToString(res));

+		}

+		

+	}

+

+	public void testLocal(){

+		ParseTreeNode[] sent = 	

+		new ParseTreeNode[]{new ParseTreeNode("he","prn"), new ParseTreeNode("was","vbz"), new ParseTreeNode("more","jj"), 

+				new ParseTreeNode(",",","),  new ParseTreeNode("than",","), new ParseTreeNode("little","jj"), new ParseTreeNode("boy","nn"),

+				new ParseTreeNode(",",","), new ParseTreeNode("however","*"), new ParseTreeNode(",",","),

+				new ParseTreeNode("he","prp"), new ParseTreeNode("was","vbz"), new ParseTreeNode("adult","jj")

+		};

+		

+		List<Pair<String, Integer[]>> res = rstMarker.extractRSTrelationInSentenceGetBoundarySpan(Arrays.asList(sent));

+		assertTrue(res.size()>2);

+		assertTrue(res.get(0).getFirst().startsWith("contrast"));

+		System.out.println(rstMarker.markerToString(res));

+	}

+}

diff --git a/opennlp-similarity/src/test/resources/tree_kernel/tree_kernel.zip b/opennlp-similarity/src/test/resources/tree_kernel/tree_kernel.zip
new file mode 100644
index 0000000..87a5ddd
--- /dev/null
+++ b/opennlp-similarity/src/test/resources/tree_kernel/tree_kernel.zip
Binary files differ