/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package opennlp.tools.parse_thicket.kernel_interface;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.io.FileUtils;

import opennlp.tools.jsmlearning.ProfileReaderWriter;
import opennlp.tools.parse_thicket.ParseThicket;
import opennlp.tools.parse_thicket.VerbNetProcessor;
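/**
 * Tree-kernel based classifier that forms training and classification samples
 * from multiple long paragraphs per document (rather than a single paragraph)
 * and averages the per-paragraph SVM scores into a single decision per file.
 * The file queues, kernel runner, paths and file-name settings are inherited
 * from {@link TreeKernelBasedClassifier}.
 */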
public class TreeKernelBasedClassifierMultiplePara extends TreeKernelBasedClassifier {

    // when set, training is capped at about 3000 documents per class for a quick run
    boolean bShortRun = false;

    public void setShortRun() {
        bShortRun = true;
    }
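
    /**
     * Trains the tree-kernel SVM model from two directories of documents.
     * Long paragraphs are extracted from each file and converted into
     * tree-kernel lines labeled "1" (positive) or "-1" (negative), which are
     * written to the training file before the learner is run.
     *
     * @param posDirectory directory with positive training documents
     * @param negDirectory directory with negative training documents
     */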
    public void trainClassifier(String posDirectory, String negDirectory) {
        queuePos.clear();
        queueNeg.clear();
        addFilesPos(new File(posDirectory));
        addFilesNeg(new File(negDirectory));
        List<File> filesPos = new ArrayList<File>(queuePos), filesNeg = new ArrayList<File>(queueNeg);

        Collection<String> treeBankBuffer = new ArrayList<String>();
        int countPos = 0, countNeg = 0;
        for (File f : filesPos) {
            // extract the long (descriptive) paragraphs of text from the document
            List<String> texts = DescriptiveParagraphFromDocExtractor.getLongParagraphsFromFile(f);
            List<String> lines = formTreeKernelStructuresMultiplePara(texts, "1");
            treeBankBuffer.addAll(lines);
            if (bShortRun && countPos > 3000)
                break;
            countPos++;
        }
        for (File f : filesNeg) {
            // extract the long (descriptive) paragraphs of text from the document
            List<String> texts = DescriptiveParagraphFromDocExtractor.getLongParagraphsFromFile(f);
            List<String> lines = formTreeKernelStructuresMultiplePara(texts, "-1");
            treeBankBuffer.addAll(lines);
            if (bShortRun && countNeg > 3000)
                break;
            countNeg++;
        }
        // write the list of training samples to a file
        try {
            FileUtils.writeLines(new File(path + trainingFileName), null, treeBankBuffer);
        } catch (IOException e) {
            e.printStackTrace();
        }
        // ProfileReaderWriter.writeReport(treeBankBuffer, path + trainingFileName, ' ');
        // build the model
        tkRunner.runLearner(path, trainingFileName, modelFileName);
    }
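
    /**
     * Classifies every document in a directory. Each file contributes one
     * tree-kernel line per long paragraph; the SVM scores of those lines are
     * averaged per file, and a file is accepted when its average exceeds
     * MIN_SVM_SCORE_TOBE_IN.
     *
     * @param dirFilesToBeClassified directory with documents to classify
     * @return one row per file: file name, decision, averaged score, absolute
     *         path, item index, and object index
     */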
    public List<String[]> classifyFilesInDirectory(String dirFilesToBeClassified) {
        // maps the index of each tree-kernel line (item) to the index of its source file (object)
        Map<Integer, Integer> countObject = new HashMap<Integer, Integer>();
        int itemCount = 0, objectCount = 0;
        List<String> treeBankBuffer = new ArrayList<String>();
        queuePos.clear();
        addFilesPos(new File(dirFilesToBeClassified));
        List<File> filesUnkn = new ArrayList<File>(queuePos);
        for (File f : filesUnkn) {
            List<String> texts = DescriptiveParagraphFromDocExtractor.getLongParagraphsFromFile(f);
            List<String> lines = formTreeKernelStructuresMultiplePara(texts, "0");
            for (String l : lines) {
                countObject.put(itemCount, objectCount);
                itemCount++;
            }
            objectCount++;
            treeBankBuffer.addAll(lines);
        }
        // write the list of samples to a file
        try {
            FileUtils.writeLines(new File(path + unknownToBeClassified), null, treeBankBuffer);
        } catch (IOException e) {
            e.printStackTrace();
        }
        tkRunner.runClassifier(path, unknownToBeClassified, modelFileName, classifierOutput);
        // read classification results
        List<String[]> classifResults = ProfileReaderWriter.readProfiles(path + classifierOutput, ' ');
        // iterate through classification results, averaging per-paragraph scores into one row per file
        List<String[]> results = new ArrayList<String[]>();
        itemCount = 0;
        objectCount = 0;
        int currentItemCount = 0;
        float accum = 0;
        for (String[] line : classifResults) {
            float val = Float.parseFloat(line[0]);
            accum += val;
            // true for the last classification line (only referenced by the commented-out check below)
            boolean bLastLine = (itemCount == classifResults.size() - 1);
            if (objectCount == countObject.get(itemCount) /* && !bLastLine */) {
                // still inside the same file: keep accumulating its paragraph scores
                itemCount++;
                currentItemCount++;
                continue;
            } else {
                // emit placeholder rows for files that yielded no classifiable paragraphs
                while (objectCount != countObject.get(itemCount) - 1) {
                    objectCount++;
                    String[] rline = new String[] { filesUnkn.get(objectCount).getName(), "unknown", "0",
                            filesUnkn.get(objectCount).getAbsolutePath(), Integer.toString(itemCount),
                            Integer.toString(objectCount) };
                    results.add(rline);
                }
            }
            objectCount = countObject.get(itemCount);
            itemCount++;
            // average the accumulated SVM scores over this file's paragraphs
            float averaged = accum / (float) currentItemCount;
            currentItemCount = 0;
            boolean in = averaged > MIN_SVM_SCORE_TOBE_IN;
            String[] rline = new String[] { filesUnkn.get(objectCount).getName(), Boolean.toString(in),
                    Float.toString(averaged), filesUnkn.get(objectCount).getAbsolutePath(),
                    Integer.toString(itemCount), Integer.toString(objectCount) };
            results.add(rline);
            accum = 0;
        }
        return results;
    }
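
    /**
     * Converts a list of paragraph texts into tree-kernel lines. Each
     * paragraph is parsed into a parse thicket (with RST arcs), an extended
     * forest is built for its coreference arcs, and each resulting tree is
     * wrapped as 'flag |BT| tree |ET|' for the tree-kernel learner's input.
     *
     * @param texts paragraph texts extracted from one document
     * @param flag  class label: "1" (positive), "-1" (negative), or "0" (unknown)
     * @return the formatted lines for the tree-kernel learner
     */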
    protected List<String> formTreeKernelStructuresMultiplePara(List<String> texts, String flag) {
        List<String> extendedTreesDumpTotal = new ArrayList<String>();
        try {
            for (String text : texts) {
                // get the parses from the original document, and form the training dataset
                System.out.println("About to build pt from " + text);
                ParseThicket pt = matcher.buildParseThicketFromTextWithRST(text);
                System.out.print("About to build extended forest ");
                List<String> extendedTreesDump = treeExtender.buildForestForCorefArcs(pt);
                for (String line : extendedTreesDump)
                    extendedTreesDumpTotal.add(flag + " |BT| " + line + " |ET| ");
                System.out.println("DONE");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return extendedTreesDumpTotal;
    }
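
    /**
     * Manual smoke test: trains the classifier on local directories of
     * positive (ted) and negative (Tedi) style samples. The hardcoded paths
     * are developer-specific and must be adjusted to the local checkout.
     */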
    public static void main(String[] args) {
        VerbNetProcessor p = VerbNetProcessor
                .getInstance("/Users/bgalitsky/Documents/relevance-based-on-parse-trees/src/test/resources");
        TreeKernelBasedClassifierMultiplePara proc = new TreeKernelBasedClassifierMultiplePara();
        proc.setKernelPath("/Users/bgalitsky/Documents/relevance-based-on-parse-trees/src/test/resources/tree_kernel/");
        proc.trainClassifier(
                "/Users/bgalitsky/Documents/relevance-based-on-parse-trees/src/test/resources/style_recognizer/txt/ted",
                "/Users/bgalitsky/Documents/relevance-based-on-parse-trees/src/test/resources/style_recognizer/txt/Tedi");
        // List<String[]> res = proc.classifyFilesInDirectory(args[2]);
        // ProfileReaderWriter.writeReport(res, "svmDesignDocReport05plus.csv");
    }
}