blob: 6b8be6ed1b2abc3dfb2a58b542289fc2dcf01ca8 [file] [log] [blame]
package org.apache.lucene.wordnet;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.TieredMergePolicy;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
/**
* Convert the prolog file wn_s.pl from the <a href="http://www.cogsci.princeton.edu/2.0/WNprolog-2.0.tar.gz">WordNet prolog download</a>
* into a Lucene index suitable for looking up synonyms and performing query expansion ({@link SynExpand#expand SynExpand.expand(...)}).
*
* This has been tested with WordNet 2.0.
*
* The index has fields named "word" ({@link #F_WORD})
* and "syn" ({@link #F_SYN}).
* <p>
* The source word (such as 'big') can be looked up in the
* "word" field, and if present there will be fields named "syn"
* for every synonym. What's tricky here is that there could be <b>multiple</b>
* fields with the same name, in the general case for words that have multiple synonyms.
* </p>
* <p>
* While the WordNet file distinguishes groups of synonyms with
* related meanings we don't do that here.
* </p>
*
* This can take 4 minutes to execute and build an index on a "fast" system and the index takes up almost 3 MB.
*
* @see <a href="http://www.cogsci.princeton.edu/~wn/">WordNet home page</a>
* @see <a href="http://www.cogsci.princeton.edu/~wn/man/prologdb.5WN.html">prologdb man page</a>
* @see <a href="http://www.hostmon.com/rfc/advanced.jsp">sample site that uses it</a>
*/
public class Syns2Index
{
    /** Destination for normal status output. */
    private static final PrintStream o = System.out;

    /** Destination for error output. */
    private static final PrintStream err = System.err;

    /** Name of the index field that stores each synonym of a word. */
    public static final String F_SYN = "syn";

    /** Name of the index field that stores the looked-up word itself. */
    public static final String F_WORD = "word";

    /** Analyzer used when configuring the IndexWriter. */
    private static final Analyzer ana = new StandardAnalyzer(Version.LUCENE_CURRENT);

    /**
     * Command-line entry point: takes the name of the WordNet prolog file
     * ("wn_s.pl") and the directory in which to build the synonym index.
     * Exits with status 1 on bad usage, an unreadable prolog file, a
     * pre-existing index directory, or a malformed input line.
     */
    public static void main(String[] args)
        throws Throwable
    {
        // get command line arguments
        String prologFilename = null; // name of file "wn_s.pl"
        String indexDir = null;
        if (args.length == 2)
        {
            prologFilename = args[0];
            indexDir = args[1];
        }
        else
        {
            usage();
            System.exit(1);
        }

        // ensure that the prolog file is readable
        if (! (new File(prologFilename)).canRead())
        {
            err.println("Error: cannot read Prolog file: " + prologFilename);
            System.exit(1);
        }
        // exit if the target index directory already exists
        if ((new File(indexDir)).isDirectory())
        {
            err.println("Error: index directory already exists: " + indexDir);
            err.println("Please specify a name of a non-existent directory");
            System.exit(1);
        }

        o.println("Opening Prolog file " + prologFilename);
        final BufferedReader br = new BufferedReader(
            new InputStreamReader(new FileInputStream(prologFilename)));

        // maps a word to all the "groups" (synset numbers) it's in
        final Map<String,List<String>> word2Nums = new TreeMap<String,List<String>>();
        // maps a group (synset number) to all the words in it
        final Map<String,List<String>> num2Words = new TreeMap<String,List<String>>();
        // number of rejected words
        int ndecent = 0;

        // status output: progress is printed with exponential backoff
        int mod = 1;
        int row = 1;

        // parse prolog file; the finally ensures the reader (and its wrapped
        // FileInputStream) is closed even if readLine throws mid-parse
        try
        {
            o.println( "[1/2] Parsing " + prologFilename);
            String line;
            while ((line = br.readLine()) != null)
            {
                // occasional progress: periodically print out line we read in
                if ((++row) % mod == 0)
                {
                    mod *= 2;
                    o.println("\t" + row + " " + line + " " + word2Nums.size()
                        + " " + num2Words.size() + " ndecent=" + ndecent);
                }

                // syntax check: every synset line starts with "s("
                if (! line.startsWith("s("))
                {
                    err.println("OUCH: " + line);
                    System.exit(1);
                }

                // parse line: the synset number is everything up to the first comma
                line = line.substring(2);
                int comma = line.indexOf(',');
                String num = line.substring(0, comma);
                // the word is the single-quoted token; prolog escapes an
                // embedded quote by doubling it, hence the replace
                int q1 = line.indexOf('\'');
                line = line.substring(q1 + 1);
                int q2 = line.lastIndexOf('\'');
                String word = line.substring(0, q2).toLowerCase().replace("''", "'");

                // make sure is a normal word
                if (! isDecent(word))
                {
                    ndecent++;
                    continue; // don't store words w/ spaces
                }

                // 1/2: word2Nums map -- append to entry or add new one
                List<String> lis = word2Nums.get(word);
                if (lis == null)
                {
                    lis = new LinkedList<String>();
                    word2Nums.put(word, lis);
                }
                lis.add(num);

                // 2/2: num2Words map
                lis = num2Words.get(num);
                if (lis == null)
                {
                    lis = new LinkedList<String>();
                    num2Words.put(num, lis);
                }
                lis.add(word);
            }
        }
        finally
        {
            br.close(); // also closes the underlying FileInputStream
        }

        // create the index
        o.println( "[2/2] Building index to store synonyms, " +
            " map sizes are " + word2Nums.size() + " and " + num2Words.size());
        index(indexDir, word2Nums, num2Words);
    }

    /**
     * Checks to see if a word contains only alphabetic characters by
     * checking it one character at a time.
     *
     * @param s string to check
     * @return <code>true</code> if the string is decent (letters only;
     *         the empty string is trivially decent)
     */
    private static boolean isDecent(String s)
    {
        int len = s.length();
        for (int i = 0; i < len; i++)
        {
            if (!Character.isLetter(s.charAt(i)))
            {
                return false;
            }
        }
        return true;
    }

    /**
     * Forms a Lucene index based on the 2 maps.
     *
     * @param indexDir the directory where the index should be created
     * @param word2Nums map of each word to the synset numbers it belongs to
     * @param num2Words map of each synset number to the words it contains
     * @throws Throwable if index creation or writing fails
     */
    private static void index(String indexDir, Map<String,List<String>> word2Nums, Map<String,List<String>> num2Words)
        throws Throwable
    {
        int row = 0;
        int mod = 1;
        FSDirectory dir = FSDirectory.open(new File(indexDir));
        try {
            // override the specific index if it already exists
            IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(
                Version.LUCENE_CURRENT, ana).setOpenMode(OpenMode.CREATE));
            try {
                ((TieredMergePolicy) writer.getConfig().getMergePolicy()).setUseCompoundFile(true); // why?
                for (String g : word2Nums.keySet()) // for each word
                {
                    Document doc = new Document();
                    int n = index(word2Nums, num2Words, g, doc);
                    if (n > 0)
                    {
                        doc.add( new Field( F_WORD, StringField.TYPE_STORED, g));
                        // exponential-backoff progress report
                        if ((++row % mod) == 0)
                        {
                            o.println("\trow=" + row + "/" + word2Nums.size() + " doc= " + doc);
                            mod *= 2;
                        }
                        writer.addDocument(doc);
                    } // else degenerate: word has no acceptable synonyms
                }
                o.println( "Optimizing..");
                writer.optimize();
            } finally {
                // close the writer even when indexing throws, so the
                // directory lock is always released
                writer.close();
            }
        } finally {
            dir.close();
        }
    }

    /**
     * Given the 2 maps fills a document for 1 word.
     *
     * @param word2Nums map of each word to the synset numbers it belongs to
     * @param num2Words map of each synset number to the words it contains
     * @param g the word whose synonyms should be collected
     * @param doc the document to which one stored "syn" field per synonym is added
     * @return the number of synonym fields added to <code>doc</code>
     */
    private static int index(Map<String,List<String>> word2Nums, Map<String,List<String>> num2Words, String g, Document doc)
        throws Throwable
    {
        // pass 1: union of all words from every synset this word appears in,
        // kept sorted (TreeSet) so field order is deterministic
        Set<String> already = new TreeSet<String>();
        for (String key : word2Nums.get(g)) // for each synset number
        {
            already.addAll(num2Words.get(key)); // get list of words
        }
        already.remove(g); // of course a word is it's own syn

        // pass 2: add one stored field per acceptable synonym; the FieldType
        // is loop-invariant, so build it once instead of once per synonym
        FieldType ft = new FieldType();
        ft.setStored(true);
        int num = 0;
        for (String cur : already)
        {
            // don't store things like 'pit bull' -> 'american pit bull'
            if (!isDecent(cur))
            {
                continue;
            }
            num++;
            doc.add( new Field( F_SYN, ft, cur));
        }
        return num;
    }

    /** Prints the command-line usage message. */
    private static void usage()
    {
        o.println("\n\n" +
            "java org.apache.lucene.wordnet.Syns2Index <prolog file> <index dir>\n\n");
    }
}