opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/request_response_recognizer/TreeKernelBasedRecognizerOfRequest_Response.java - opennlp-sandbox - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License. You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package opennlp.tools.parse_thicket.request_response_recognizer;


 import java.util.ArrayList;
 import java.util.List;

 import opennlp.tools.parse_thicket.ParseThicket;
 import opennlp.tools.parse_thicket.VerbNetProcessor;
 import opennlp.tools.parse_thicket.external_rst.MatcherExternalRST;
 import opennlp.tools.parse_thicket.external_rst.ParseThicketWithDiscourseTree;
 import opennlp.tools.parse_thicket.kernel_interface.TreeKernelBasedClassifierMultiplePara;

 /**
  * Performs tree-kernel (TK) learning on parse thickets that include RST relations only,
  * as produced by the RST parser of Surdeanu et al. Sentence parsing and the NLP pipeline
  * come from Surdeanu's wrapper of Stanford NLP.
  */
 public class TreeKernelBasedRecognizerOfRequest_Response extends TreeKernelBasedClassifierMultiplePara{

 	/** Resource directory used by {@link #main(String[])} when no argument is supplied. */
 	private static final String DEFAULT_RESOURCE_DIR =
 			"/Users/bgalitsky/Documents/relevance-based-on-parse-trees/src/test/resources";

 	/** Builds parse thickets (with discourse trees) from raw text. */
 	private final MatcherExternalRST matcherRST = new MatcherExternalRST();

 	/**
 	 * Builds the tree-kernel input lines for a list of texts.
 	 *
 	 * <p>For every text a parse thicket is built with the external RST matcher, its
 	 * discourse-tree dumps are collected, and each dump is wrapped as
 	 * {@code flag + " |BT| " + dump + " |ET| "}. A text whose parse thicket is
 	 * {@code null}, is not a {@link ParseThicketWithDiscourseTree}, or whose processing
 	 * throws, is skipped; the remaining texts are still processed.
 	 *
 	 * <p>NOTE(review): this presumably overrides a method of
 	 * {@code TreeKernelBasedClassifierMultiplePara}; add {@code @Override} once confirmed.
 	 *
 	 * @param texts texts to convert; {@code null} yields an empty result
 	 * @param flag  text prepended to every produced line (presumably the class label;
 	 *              confirm against callers)
 	 * @return the wrapped dump lines of all texts that could be processed, in input order
 	 */
 	protected List<String> formTreeKernelStructuresMultiplePara(List<String> texts, String flag) {
 		//TODO (kept from the original): setShortRun() is inherited; its effect is not visible here
 		this.setShortRun();
 		List<String> extendedTreesDumpTotal = new ArrayList<String>();
 		if (texts == null) {
 			// nothing to iterate over; previously this surfaced as a caught NullPointerException
 			return extendedTreesDumpTotal;
 		}

 		for (String text : texts) {
 			// get the parses from original documents, and form the training dataset
 			try {
 				System.out.print("About to build pt with external rst from "+text + "\n...");
 				ParseThicket pt = matcherRST.buildParseThicketFromTextWithRST(text);
 				if (pt == null) {
 					continue;
 				}
 				if (!(pt instanceof ParseThicketWithDiscourseTree)) {
 					// check explicitly instead of relying on a ClassCastException being caught below
 					System.err.println("Skipping text: parse thicket is " + pt.getClass().getName()
 							+ ", expected ParseThicketWithDiscourseTree");
 					continue;
 				}
 				System.out.print("About to build extended forest with external rst...");
 				// use the direct option (bDirectDT = true)
 				List<String> extendedTreesDump =
 						buildReptresentationForDiscourseTreeAndExtensions((ParseThicketWithDiscourseTree) pt, true);
 				for (String line : extendedTreesDump) {
 					extendedTreesDumpTotal.add(flag + " |BT| "+line + " |ET| ");
 				}
 				System.out.println("DONE");
 			} catch (Exception e) {
 				// best effort: a failure on one text must not abort the whole data set
 				e.printStackTrace();
 			}
 		}
 		return extendedTreesDumpTotal;
 	}

 	/**
 	 * Collects the string representations of a parse thicket used as tree-kernel input.
 	 *
 	 * @param pt        parse thicket with its discourse tree
 	 * @param bDirectDT {@code true} to return the four discourse-tree dumps of {@code pt}
 	 *                  (plain, with POS, with embedded trees, with VerbNet);
 	 *                  {@code false} to return the forest built by the inherited
 	 *                  {@code treeExtender} for the RST arcs
 	 * @return the list of dumps for the selected option
 	 */
 	private List<String> buildReptresentationForDiscourseTreeAndExtensions(ParseThicketWithDiscourseTree pt, boolean bDirectDT){
 		if (!bDirectDT) {
 			// option 1: use RST relation for extended trees
 			return treeExtender.buildForestForRSTArcs(pt);
 		}
 		// option 2: use DT directly
 		List<String> extendedTreesDump = new ArrayList<String>();
 		extendedTreesDump.add(pt.getDtDump());
 		extendedTreesDump.add(pt.getDtDumpWithPOS());
 		extendedTreesDump.add(pt.getDtDumpWithEmbeddedTrees());
 		extendedTreesDump.add(pt.getDtDumpWithVerbNet());
 		return extendedTreesDump;
 	}

 	/**
 	 * Trains the classifier on the Yahoo Answers data set.
 	 *
 	 * @param args optional; {@code args[0]} overrides the resource directory, which
 	 *             otherwise defaults to {@link #DEFAULT_RESOURCE_DIR}
 	 */
 	public static void main(String[] args){
 		String resourceDir = (args != null && args.length > 0) ? args[0] : DEFAULT_RESOURCE_DIR;
 		// The returned instance is not used here; the call is kept because it presumably
 		// initializes the shared VerbNet processor before the recognizer is created (confirm).
 		VerbNetProcessor.getInstance(resourceDir);

 		TreeKernelBasedRecognizerOfRequest_Response proc = new TreeKernelBasedRecognizerOfRequest_Response();
 		proc.setKernelPath(resourceDir + "/tree_kernel/");
 		proc.trainClassifier(
 				YahooAnswersTrainingSetCreator.origFilesDir,
 				YahooAnswersTrainingSetCreator.origFilesDir.replace("/text", "/neg_text")
 				);
 	}

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package opennlp.tools.parse_thicket.request_response_recognizer;


	import java.util.ArrayList;
	import java.util.List;

	import opennlp.tools.parse_thicket.ParseThicket;
	import opennlp.tools.parse_thicket.VerbNetProcessor;
	import opennlp.tools.parse_thicket.external_rst.MatcherExternalRST;
	import opennlp.tools.parse_thicket.external_rst.ParseThicketWithDiscourseTree;
	import opennlp.tools.parse_thicket.kernel_interface.TreeKernelBasedClassifierMultiplePara;

	/*
	* This class performs TK learning based on parse thicket which includes RST relations only
	* based on Surdeanu at al RST parser. It does sentence parsing and NLP pipeline of
	* Surdeanu's wrapper of Stanford NLP
	*/
	public class TreeKernelBasedRecognizerOfRequest_Response extends TreeKernelBasedClassifierMultiplePara{

	private MatcherExternalRST matcherRST = new MatcherExternalRST();

	protected List<String> formTreeKernelStructuresMultiplePara(List<String> texts, String flag) {
	//TODO
	this.setShortRun();
	List<String> extendedTreesDumpTotal = new ArrayList<String>();
	try {

	for(String text: texts){
	// get the parses from original documents, and form the training dataset
	try {
	System.out.print("About to build pt with external rst from "+text + "\n...");
	ParseThicket pt = matcherRST.buildParseThicketFromTextWithRST(text);
	if (pt == null)
	continue;
	System.out.print("About to build extended forest with external rst...");
	List<String> extendedTreesDump = // use direct option (true
	buildReptresentationForDiscourseTreeAndExtensions((ParseThicketWithDiscourseTree)pt, true);
	for(String line: extendedTreesDump)
	extendedTreesDumpTotal.add(flag + " \|BT\| "+line + " \|ET\| ");
	System.out.println("DONE");
	} catch (Exception e) {
	e.printStackTrace();
	}
	}
	} catch (Exception e) {
	e.printStackTrace();
	}
	return extendedTreesDumpTotal;
	}

	private List<String> buildReptresentationForDiscourseTreeAndExtensions(ParseThicketWithDiscourseTree pt, boolean bDirectDT){
	List<String> extendedTreesDump = new ArrayList<String>();
	if (!bDirectDT)
	// option 1: use RST relation for extended trees
	extendedTreesDump = treeExtender.buildForestForRSTArcs(pt);
	else {
	// option 2: use DT directly
	extendedTreesDump.add(pt.getDtDump());
	extendedTreesDump.add(pt.getDtDumpWithPOS());
	extendedTreesDump.add(pt.getDtDumpWithEmbeddedTrees());
	extendedTreesDump.add(pt.getDtDumpWithVerbNet());
	}
	return extendedTreesDump;
	}

	public static void main(String[] args){
	VerbNetProcessor p = VerbNetProcessor.
	getInstance("/Users/bgalitsky/Documents/relevance-based-on-parse-trees/src/test/resources");

	TreeKernelBasedRecognizerOfRequest_Response proc = new TreeKernelBasedRecognizerOfRequest_Response();
	proc.setKernelPath("/Users/bgalitsky/Documents/relevance-based-on-parse-trees/src/test/resources/tree_kernel/");
	proc.trainClassifier(
	YahooAnswersTrainingSetCreator.origFilesDir,
	YahooAnswersTrainingSetCreator.origFilesDir.replace("/text", "/neg_text")
	);
	}

	}