opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/ClassifierTrainingSetIndexer.java - opennlp-sandbox - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License. You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package opennlp.tools.doc_classifier;

 import java.io.File;
 import java.io.FileReader;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;

 import opennlp.tools.jsmlearning.ProfileReaderWriter;

 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.document.TextField;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.util.Version;
 import org.apache.tika.Tika;

 /**
  * Builds a Lucene index from a directory tree of training documents.
  *
  * <p>Every indexed document carries three fields: {@code text} (the file
  * content), {@code path} (the file path, stored) and {@code class} (the
  * training category, stored). XML files take their category from the name of
  * the folder that directly contains them; all other files are parsed with
  * Tika and take their category from the first entry of {@link #domains} found
  * in their path.
  *
  * <p>The index is written to {@code resourceDir + INDEX_PATH} and is always
  * re-created from scratch ({@code OpenMode.CREATE}).
  */
 public class ClassifierTrainingSetIndexer {
     /**
      * Resource directory, resolved against the current working directory.
      * {@code new File("").getAbsolutePath()} yields the working directory
      * itself, so no textual clean-up of a trailing "/." is needed (the former
      * {@code replace("/.", "")} also mangled hidden folders in the path).
      */
     public static String resourceDir = new File("").getAbsolutePath() + "/src/main/resources";
     /** Location of the index, relative to {@link #resourceDir}. */
     public static String INDEX_PATH = "/classif";
     /** Default location of the training corpus, relative to {@link #resourceDir}. */
     public static String CLASSIF_TRAINING_CORPUS_PATH = "/training_corpus";
     /** Files collected by {@link #addFiles(File)} and not yet indexed. */
     protected ArrayList<File> queue = new ArrayList<File>();
     Tika tika = new Tika();

     /** Stays {@code null} when the index could not be opened. */
     IndexWriter indexWriter = null;
     /** Known training categories, in matching priority order. */
     protected static String[] domains = new String[] { "legal", "health",
             "computing", "engineering", "business" };
     /** Explicit corpus location; {@code null} selects the default corpus. */
     private String absolutePathTrainingSet = null;

     /**
      * Creates an indexer for the default corpus,
      * {@code resourceDir + CLASSIF_TRAINING_CORPUS_PATH}.
      */
     public ClassifierTrainingSetIndexer() {
         this(null);
     }

     /**
      * Creates an indexer for the given corpus. A failure to open the index is
      * printed, not thrown; the indexer then refuses to index.
      *
      * @param absolutePathTrainingSet path of the corpus file or directory, or
      *        {@code null} for the default corpus
      */
     public ClassifierTrainingSetIndexer(String absolutePathTrainingSet) {
         this.absolutePathTrainingSet = absolutePathTrainingSet;
         try {
             initIndexWriter(resourceDir);
         } catch (Exception e) {
             e.printStackTrace();
         }
     }

     /**
      * Indexes the configured corpus and commits the index. Failures are
      * printed rather than thrown.
      */
     public void indexTrainingSet() {
         if (indexWriter == null) {
             System.out.println("Index writer is not initialized, nothing was indexed.");
             return;
         }
         String corpusPath = (absolutePathTrainingSet == null)
                 ? resourceDir + CLASSIF_TRAINING_CORPUS_PATH
                 : absolutePathTrainingSet;
         try {
             indexFileOrDirectory(corpusPath);
         } catch (IOException e1) {
             e1.printStackTrace();
         }
         try {
             indexWriter.commit();
         } catch (IOException e) {
             e.printStackTrace();
         }
     }

     /**
      * Recursively collects regular files below {@code file} into
      * {@link #queue}. Entries whose name starts with "." are skipped, and so
      * are paths that do not exist.
      */
     private void addFiles(File file) {
         if (!file.exists()) {
             System.out.println(file + " does not exist.");
             return; // a missing file must not be queued for indexing
         }
         if (!file.isDirectory()) {
             queue.add(file);
             return;
         }
         File[] children = file.listFiles();
         if (children == null) { // listFiles() returns null on an I/O error
             System.out.println("Could not list " + file);
             return;
         }
         for (File f : children) {
             if (f.getName().startsWith(".")) {
                 continue;
             }
             addFiles(f);
             System.out.println(f.getName());
         }
     }

     /**
      * Indexes a single file, or every file below a directory. A file that
      * cannot be indexed is reported and skipped; the remaining files are
      * still processed.
      *
      * @param fileName path of the file or directory to index
      * @throws IOException kept for interface compatibility; per-file failures
      *         are handled inside this method
      */
     public void indexFileOrDirectory(String fileName) throws IOException {
         if (indexWriter == null) {
             System.out.println("Index writer is not initialized, skipping " + fileName);
             return;
         }
         addFiles(new File(fileName));

         // Work on a snapshot and empty the queue once, so that a later call
         // does not index the same files again.
         List<File> files = new ArrayList<File>(queue);
         queue.clear();

         for (File f : files) {
             if (f.getName().endsWith(".xml")) {
                 indexXmlFile(f);
             } else {
                 indexParsedFile(f);
             }
         }
     }

     /** Indexes a non-XML file: text extracted by Tika, category from the path. */
     private void indexParsedFile(File f) {
         String className = detectCategory(f);
         if (className == null) {
             // StringField rejects a null value; skip before Tika opens the file.
             System.out.println("Could not add: " + f + " (no known domain in its path)");
             return;
         }
         java.io.Reader text = null;
         try {
             Document doc = new Document();
             try {
                 text = tika.parse(f);
                 doc.add(new TextField("text", text));
             } catch (Exception e1) {
                 // As before, the document is still added with path and class only.
                 e1.printStackTrace();
             }
             doc.add(new StringField("path", f.getPath(), Field.Store.YES));
             doc.add(new StringField("class", className, Field.Store.YES));
             indexWriter.addDocument(doc);
         } catch (Exception e) {
             e.printStackTrace();
             System.out.println("Could not add: " + f);
         } finally {
             closeAndReport(text);
         }
     }

     /**
      * Indexes an XML file as raw text; its category is the name of the folder
      * that directly contains it.
      */
     private void indexXmlFile(File f) {
         File parent = f.getParentFile();
         if (parent == null) {
             System.out.println("Could not add: " + f + " (no parent folder to use as class)");
             return;
         }
         FileReader fr = null;
         try {
             fr = new FileReader(f);
             Document doc = new Document();
             doc.add(new TextField("text", fr));
             doc.add(new StringField("path", f.getPath(), Field.Store.YES));
             doc.add(new StringField("class", parent.getName(), Field.Store.YES));
             indexWriter.addDocument(doc);
         } catch (Exception e) {
             e.printStackTrace();
             System.out.println("Could not add: " + f);
         } finally {
             closeAndReport(fr);
         }
     }

     /**
      * Determines the category of a file. A folder named exactly like a domain
      * wins; otherwise the first domain occurring anywhere in the path is used,
      * which keeps folders such as "legal_docs" working.
      */
     private static String detectCategory(File f) {
         String path = f.getPath();
         String byFolder = getCategoryFromFilePath(path);
         if (byFolder != null) {
             return byFolder;
         }
         for (String d : domains) {
             if (path.indexOf(d) > -1) {
                 return d;
             }
         }
         return null;
     }

     /** Closes {@code resource} if it is non-null; a failure is only printed. */
     private static void closeAndReport(java.io.Closeable resource) {
         if (resource == null) {
             return;
         }
         try {
             resource.close();
         } catch (IOException e) {
             e.printStackTrace();
         }
     }

     /**
      * Returns the canonical working directory followed by {@link #INDEX_PATH}.
      *
      * <p>NOTE(review): the index itself is written below {@link #resourceDir},
      * not below the working directory - confirm which location callers expect.
      *
      * @return the path, or {@code null} if the working directory cannot be
      *         resolved
      */
     public static String getIndexDir() {
         try {
             return new File(".").getCanonicalPath() + INDEX_PATH;
         } catch (IOException e) {
             e.printStackTrace();
             return null;
         }
     }

     /**
      * Opens a fresh index at {@code dir + INDEX_PATH}.
      *
      * @throws Exception if the directory or the writer cannot be opened; the
      *         failure is propagated instead of continuing with a null directory
      */
     private void initIndexWriter(String dir) throws Exception {
         Directory indexDir = FSDirectory.open(new File(dir + INDEX_PATH));

         Version luceneVersion = Version.LUCENE_46;
         IndexWriterConfig luceneConfig = new IndexWriterConfig(luceneVersion,
                 new StandardAnalyzer(luceneVersion));
         luceneConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE);

         boolean opened = false;
         try {
             indexWriter = new IndexWriter(indexDir, luceneConfig);
             opened = true;
         } finally {
             if (!opened) {
                 closeAndReport(indexDir);
             }
         }
     }

     /**
      * Commits and closes the index writer. Does nothing when the writer was
      * never opened; the writer is closed even if the commit fails.
      */
     void close() {
         if (indexWriter == null) {
             return;
         }
         try {
             indexWriter.commit();
         } catch (IOException e) {
             e.printStackTrace();
         } finally {
             try {
                 indexWriter.close();
             } catch (IOException e) {
                 e.printStackTrace();
             }
         }
     }

     /**
      * Finds the first domain that appears as a complete folder name, i.e.
      * enclosed in separators, in {@code path}.
      *
      * @param path file path; the platform separator is accepted as well as "/"
      * @return the matching domain, or {@code null} if there is none or
      *         {@code path} is {@code null}
      */
     public static String getCategoryFromFilePath(String path) {
         if (path == null) {
             return null;
         }
         String normalized = path.replace(File.separatorChar, '/');
         for (String d : domains) {
             if (normalized.indexOf("/" + d + "/") > -1) {
                 return d;
             }
         }
         return null;
     }

     /**
      * Resolves the corpus argument of {@link #main(String[])}. An existing
      * absolute path is used as given; anything else is taken relative to
      * {@link #resourceDir}, such as "training_corpus".
      */
     private static String resolveTrainingCorpusPath(String dirArg) {
         File asGiven = new File(dirArg);
         if (asGiven.isAbsolute() && asGiven.exists()) {
             return dirArg;
         }
         String relative = dirArg.startsWith("/") ? dirArg : "/" + dirArg;
         return resourceDir + relative;
     }

     /**
      * Indexes the training corpus.
      *
      * @param args optionally one element: the corpus directory, relative to
      *        the resource directory; without it the default corpus is indexed
      */
     public static void main(String[] args) {
         ClassifierTrainingSetIndexer indexer;
         if (args != null && args.length == 1) {
             indexer = new ClassifierTrainingSetIndexer(resolveTrainingCorpusPath(args[0]));
         } else {
             // expect corpus in the default location, "/training_corpus" in the resource directory
             indexer = new ClassifierTrainingSetIndexer();
         }
         try {
             indexer.indexTrainingSet();
         } catch (Exception e) {
             e.printStackTrace();
         } finally {
             indexer.close();
         }
     }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package opennlp.tools.doc_classifier;

	import java.io.File;
	import java.io.FileReader;
	import java.io.IOException;
	import java.util.ArrayList;
	import java.util.Arrays;
	import java.util.List;

	import opennlp.tools.jsmlearning.ProfileReaderWriter;

	import org.apache.lucene.analysis.standard.StandardAnalyzer;
	import org.apache.lucene.document.Document;
	import org.apache.lucene.document.Field;
	import org.apache.lucene.document.StringField;
	import org.apache.lucene.document.TextField;
	import org.apache.lucene.index.IndexWriter;
	import org.apache.lucene.index.IndexWriterConfig;
	import org.apache.lucene.store.Directory;
	import org.apache.lucene.store.FSDirectory;
	import org.apache.lucene.util.Version;
	import org.apache.tika.Tika;

	/**
	 * Builds a Lucene index from a directory tree of training documents.
	 *
	 * <p>Every indexed document carries three fields: {@code text} (the file
	 * content), {@code path} (the file path, stored) and {@code class} (the
	 * training category, stored). XML files take their category from the name of
	 * the folder that directly contains them; all other files are parsed with
	 * Tika and take their category from the first entry of {@link #domains} found
	 * in their path.
	 *
	 * <p>The index is written to {@code resourceDir + INDEX_PATH} and is always
	 * re-created from scratch ({@code OpenMode.CREATE}).
	 */
	public class ClassifierTrainingSetIndexer {
	    /**
	     * Resource directory, resolved against the current working directory.
	     * {@code new File("").getAbsolutePath()} yields the working directory
	     * itself, so no textual clean-up of a trailing "/." is needed (the former
	     * {@code replace("/.", "")} also mangled hidden folders in the path).
	     */
	    public static String resourceDir = new File("").getAbsolutePath() + "/src/main/resources";
	    /** Location of the index, relative to {@link #resourceDir}. */
	    public static String INDEX_PATH = "/classif";
	    /** Default location of the training corpus, relative to {@link #resourceDir}. */
	    public static String CLASSIF_TRAINING_CORPUS_PATH = "/training_corpus";
	    /** Files collected by {@link #addFiles(File)} and not yet indexed. */
	    protected ArrayList<File> queue = new ArrayList<File>();
	    Tika tika = new Tika();

	    /** Stays {@code null} when the index could not be opened. */
	    IndexWriter indexWriter = null;
	    /** Known training categories, in matching priority order. */
	    protected static String[] domains = new String[] { "legal", "health",
	            "computing", "engineering", "business" };
	    /** Explicit corpus location; {@code null} selects the default corpus. */
	    private String absolutePathTrainingSet = null;

	    /**
	     * Creates an indexer for the default corpus,
	     * {@code resourceDir + CLASSIF_TRAINING_CORPUS_PATH}.
	     */
	    public ClassifierTrainingSetIndexer() {
	        this(null);
	    }

	    /**
	     * Creates an indexer for the given corpus. A failure to open the index is
	     * printed, not thrown; the indexer then refuses to index.
	     *
	     * @param absolutePathTrainingSet path of the corpus file or directory, or
	     *        {@code null} for the default corpus
	     */
	    public ClassifierTrainingSetIndexer(String absolutePathTrainingSet) {
	        this.absolutePathTrainingSet = absolutePathTrainingSet;
	        try {
	            initIndexWriter(resourceDir);
	        } catch (Exception e) {
	            e.printStackTrace();
	        }
	    }

	    /**
	     * Indexes the configured corpus and commits the index. Failures are
	     * printed rather than thrown.
	     */
	    public void indexTrainingSet() {
	        if (indexWriter == null) {
	            System.out.println("Index writer is not initialized, nothing was indexed.");
	            return;
	        }
	        String corpusPath = (absolutePathTrainingSet == null)
	                ? resourceDir + CLASSIF_TRAINING_CORPUS_PATH
	                : absolutePathTrainingSet;
	        try {
	            indexFileOrDirectory(corpusPath);
	        } catch (IOException e1) {
	            e1.printStackTrace();
	        }
	        try {
	            indexWriter.commit();
	        } catch (IOException e) {
	            e.printStackTrace();
	        }
	    }

	    /**
	     * Recursively collects regular files below {@code file} into
	     * {@link #queue}. Entries whose name starts with "." are skipped, and so
	     * are paths that do not exist.
	     */
	    private void addFiles(File file) {
	        if (!file.exists()) {
	            System.out.println(file + " does not exist.");
	            return; // a missing file must not be queued for indexing
	        }
	        if (!file.isDirectory()) {
	            queue.add(file);
	            return;
	        }
	        File[] children = file.listFiles();
	        if (children == null) { // listFiles() returns null on an I/O error
	            System.out.println("Could not list " + file);
	            return;
	        }
	        for (File f : children) {
	            if (f.getName().startsWith(".")) {
	                continue;
	            }
	            addFiles(f);
	            System.out.println(f.getName());
	        }
	    }

	    /**
	     * Indexes a single file, or every file below a directory. A file that
	     * cannot be indexed is reported and skipped; the remaining files are
	     * still processed.
	     *
	     * @param fileName path of the file or directory to index
	     * @throws IOException kept for interface compatibility; per-file failures
	     *         are handled inside this method
	     */
	    public void indexFileOrDirectory(String fileName) throws IOException {
	        if (indexWriter == null) {
	            System.out.println("Index writer is not initialized, skipping " + fileName);
	            return;
	        }
	        addFiles(new File(fileName));

	        // Work on a snapshot and empty the queue once, so that a later call
	        // does not index the same files again.
	        List<File> files = new ArrayList<File>(queue);
	        queue.clear();

	        for (File f : files) {
	            if (f.getName().endsWith(".xml")) {
	                indexXmlFile(f);
	            } else {
	                indexParsedFile(f);
	            }
	        }
	    }

	    /** Indexes a non-XML file: text extracted by Tika, category from the path. */
	    private void indexParsedFile(File f) {
	        String className = detectCategory(f);
	        if (className == null) {
	            // StringField rejects a null value; skip before Tika opens the file.
	            System.out.println("Could not add: " + f + " (no known domain in its path)");
	            return;
	        }
	        java.io.Reader text = null;
	        try {
	            Document doc = new Document();
	            try {
	                text = tika.parse(f);
	                doc.add(new TextField("text", text));
	            } catch (Exception e1) {
	                // As before, the document is still added with path and class only.
	                e1.printStackTrace();
	            }
	            doc.add(new StringField("path", f.getPath(), Field.Store.YES));
	            doc.add(new StringField("class", className, Field.Store.YES));
	            indexWriter.addDocument(doc);
	        } catch (Exception e) {
	            e.printStackTrace();
	            System.out.println("Could not add: " + f);
	        } finally {
	            closeAndReport(text);
	        }
	    }

	    /**
	     * Indexes an XML file as raw text; its category is the name of the folder
	     * that directly contains it.
	     */
	    private void indexXmlFile(File f) {
	        File parent = f.getParentFile();
	        if (parent == null) {
	            System.out.println("Could not add: " + f + " (no parent folder to use as class)");
	            return;
	        }
	        FileReader fr = null;
	        try {
	            fr = new FileReader(f);
	            Document doc = new Document();
	            doc.add(new TextField("text", fr));
	            doc.add(new StringField("path", f.getPath(), Field.Store.YES));
	            doc.add(new StringField("class", parent.getName(), Field.Store.YES));
	            indexWriter.addDocument(doc);
	        } catch (Exception e) {
	            e.printStackTrace();
	            System.out.println("Could not add: " + f);
	        } finally {
	            closeAndReport(fr);
	        }
	    }

	    /**
	     * Determines the category of a file. A folder named exactly like a domain
	     * wins; otherwise the first domain occurring anywhere in the path is used,
	     * which keeps folders such as "legal_docs" working.
	     */
	    private static String detectCategory(File f) {
	        String path = f.getPath();
	        String byFolder = getCategoryFromFilePath(path);
	        if (byFolder != null) {
	            return byFolder;
	        }
	        for (String d : domains) {
	            if (path.indexOf(d) > -1) {
	                return d;
	            }
	        }
	        return null;
	    }

	    /** Closes {@code resource} if it is non-null; a failure is only printed. */
	    private static void closeAndReport(java.io.Closeable resource) {
	        if (resource == null) {
	            return;
	        }
	        try {
	            resource.close();
	        } catch (IOException e) {
	            e.printStackTrace();
	        }
	    }

	    /**
	     * Returns the canonical working directory followed by {@link #INDEX_PATH}.
	     *
	     * <p>NOTE(review): the index itself is written below {@link #resourceDir},
	     * not below the working directory - confirm which location callers expect.
	     *
	     * @return the path, or {@code null} if the working directory cannot be
	     *         resolved
	     */
	    public static String getIndexDir() {
	        try {
	            return new File(".").getCanonicalPath() + INDEX_PATH;
	        } catch (IOException e) {
	            e.printStackTrace();
	            return null;
	        }
	    }

	    /**
	     * Opens a fresh index at {@code dir + INDEX_PATH}.
	     *
	     * @throws Exception if the directory or the writer cannot be opened; the
	     *         failure is propagated instead of continuing with a null directory
	     */
	    private void initIndexWriter(String dir) throws Exception {
	        Directory indexDir = FSDirectory.open(new File(dir + INDEX_PATH));

	        Version luceneVersion = Version.LUCENE_46;
	        IndexWriterConfig luceneConfig = new IndexWriterConfig(luceneVersion,
	                new StandardAnalyzer(luceneVersion));
	        luceneConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE);

	        boolean opened = false;
	        try {
	            indexWriter = new IndexWriter(indexDir, luceneConfig);
	            opened = true;
	        } finally {
	            if (!opened) {
	                closeAndReport(indexDir);
	            }
	        }
	    }

	    /**
	     * Commits and closes the index writer. Does nothing when the writer was
	     * never opened; the writer is closed even if the commit fails.
	     */
	    void close() {
	        if (indexWriter == null) {
	            return;
	        }
	        try {
	            indexWriter.commit();
	        } catch (IOException e) {
	            e.printStackTrace();
	        } finally {
	            try {
	                indexWriter.close();
	            } catch (IOException e) {
	                e.printStackTrace();
	            }
	        }
	    }

	    /**
	     * Finds the first domain that appears as a complete folder name, i.e.
	     * enclosed in separators, in {@code path}.
	     *
	     * @param path file path; the platform separator is accepted as well as "/"
	     * @return the matching domain, or {@code null} if there is none or
	     *         {@code path} is {@code null}
	     */
	    public static String getCategoryFromFilePath(String path) {
	        if (path == null) {
	            return null;
	        }
	        String normalized = path.replace(File.separatorChar, '/');
	        for (String d : domains) {
	            if (normalized.indexOf("/" + d + "/") > -1) {
	                return d;
	            }
	        }
	        return null;
	    }

	    /**
	     * Resolves the corpus argument of {@link #main(String[])}. An existing
	     * absolute path is used as given; anything else is taken relative to
	     * {@link #resourceDir}, such as "training_corpus".
	     */
	    private static String resolveTrainingCorpusPath(String dirArg) {
	        File asGiven = new File(dirArg);
	        if (asGiven.isAbsolute() && asGiven.exists()) {
	            return dirArg;
	        }
	        String relative = dirArg.startsWith("/") ? dirArg : "/" + dirArg;
	        return resourceDir + relative;
	    }

	    /**
	     * Indexes the training corpus.
	     *
	     * @param args optionally one element: the corpus directory, relative to
	     *        the resource directory; without it the default corpus is indexed
	     */
	    public static void main(String[] args) {
	        ClassifierTrainingSetIndexer indexer;
	        if (args != null && args.length == 1) {
	            indexer = new ClassifierTrainingSetIndexer(resolveTrainingCorpusPath(args[0]));
	        } else {
	            // expect corpus in the default location, "/training_corpus" in the resource directory
	            indexer = new ClassifierTrainingSetIndexer();
	        }
	        try {
	            indexer.indexTrainingSet();
	        } catch (Exception e) {
	            e.printStackTrace();
	        } finally {
	            indexer.close();
	        }
	    }

	}