lucene/suggest/src/java/org/apache/lucene/search/spell/SpellChecker.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.search.spell;

 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Comparator;
 import java.util.List;

 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.BoostQuery;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.store.AlreadyClosedException;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefIterator;

 /**
  * <p>
  *   Spell Checker class  (Main class).<br>
  *  (initially inspired by the David Spencer code).
  * </p>
  *
  * <p>Example Usage:
  *
  * <pre class="prettyprint">
  *  SpellChecker spellchecker = new SpellChecker(spellIndexDirectory);
  *  // To index a field of a user index:
  *  spellchecker.indexDictionary(new LuceneDictionary(my_lucene_reader, a_field));
  *  // To index a file containing words:
  *  spellchecker.indexDictionary(new PlainTextDictionary(new File("myfile.txt")));
  *  String[] suggestions = spellchecker.suggestSimilar("misspelt", 5);
  * </pre>
  *
  *
  */
 public class SpellChecker implements java.io.Closeable {

   /**
    * The default minimum score to use, if not specified by calling {@link #setAccuracy(float)} .
    */
   public static final float DEFAULT_ACCURACY = 0.5f;

   /**
    * Field name for each word in the ngram index.
    */
   public static final String F_WORD = "word";

   /**
    * the spell index
    */
   // don't modify the directory directly - see #swapSearcher()
   // TODO: why is this package private?
   Directory spellIndex;
   /**
    * Boost value for start and end grams
    */
   private float bStart = 2.0f;

   private float bEnd = 1.0f;
   // don't use this searcher directly - see #swapSearcher()

   private IndexSearcher searcher;
   /*
    * this locks all modifications to the current searcher.
    */

   private final Object searcherLock = new Object();
   /*
    * this lock synchronizes all possible modifications to the
    * current index directory. It should not be possible to try modifying
    * the same index concurrently. Note: Do not acquire the searcher lock
    * before acquiring this lock!
    */
   private final Object modifyCurrentIndexLock = new Object();

   private volatile boolean closed = false;
   // minimum score for hits generated by the spell checker query

   private float accuracy = DEFAULT_ACCURACY;

   private StringDistance sd;
   private Comparator<SuggestWord> comparator;

   /**
    * Use the given directory as a spell checker index. The directory
    * is created if it doesn't exist yet.
    * @param spellIndex the spell index directory
    * @param sd the {@link StringDistance} measurement to use
    * @throws IOException if Spellchecker can not open the directory
    */
   public SpellChecker(Directory spellIndex, StringDistance sd) throws IOException {
     this(spellIndex, sd, SuggestWordQueue.DEFAULT_COMPARATOR);
   }
   /**
    * Use the given directory as a spell checker index with a
    * {@link LevenshteinDistance} as the default {@link StringDistance}. The
    * directory is created if it doesn't exist yet.
    *
    * @param spellIndex
    *          the spell index directory
    * @throws IOException
    *           if spellchecker can not open the directory
    */
   public SpellChecker(Directory spellIndex) throws IOException {
     this(spellIndex, new LevenshteinDistance());
   }

   /**
    * Use the given directory as a spell checker index with the given {@link org.apache.lucene.search.spell.StringDistance} measure
    * and the given {@link java.util.Comparator} for sorting the results.
    * @param spellIndex The spelling index
    * @param sd The distance
    * @param comparator The comparator
    * @throws IOException if there is a problem opening the index
    */
   public SpellChecker(Directory spellIndex, StringDistance sd, Comparator<SuggestWord> comparator) throws IOException {
     setSpellIndex(spellIndex);
     setStringDistance(sd);
     this.comparator = comparator;
   }

   /**
    * Use a different index as the spell checker index or re-open
    * the existing index if <code>spellIndex</code> is the same value
    * as given in the constructor.
    * @param spellIndexDir the spell directory to use
    * @throws AlreadyClosedException if the Spellchecker is already closed
    * @throws  IOException if spellchecker can not open the directory
    */
   // TODO: we should make this final as it is called in the constructor
   public void setSpellIndex(Directory spellIndexDir) throws IOException {
     // this could be the same directory as the current spellIndex
     // modifications to the directory should be synchronized
     synchronized (modifyCurrentIndexLock) {
       ensureOpen();
       if (!DirectoryReader.indexExists(spellIndexDir)) {
           IndexWriter writer = new IndexWriter(spellIndexDir,
             new IndexWriterConfig(null));
           writer.close();
       }
       swapSearcher(spellIndexDir);
     }
   }

   /**
    * Sets the {@link java.util.Comparator} for the {@link SuggestWordQueue}.
    * @param comparator the comparator
    */
   public void setComparator(Comparator<SuggestWord> comparator) {
     this.comparator = comparator;
   }

   /**
    * Gets the comparator in use for ranking suggestions.
    * @see #setComparator(Comparator)
    */
   public Comparator<SuggestWord> getComparator() {
     return comparator;
   }

   /**
    * Sets the {@link StringDistance} implementation for this
    * {@link SpellChecker} instance.
    *
    * @param sd the {@link StringDistance} implementation for this
    * {@link SpellChecker} instance
    */
   public void setStringDistance(StringDistance sd) {
     this.sd = sd;
   }
   /**
    * Returns the {@link StringDistance} instance used by this
    * {@link SpellChecker} instance.
    *
    * @return the {@link StringDistance} instance used by this
    *         {@link SpellChecker} instance.
    */
   public StringDistance getStringDistance() {
     return sd;
   }

   /**
    * Sets the accuracy 0 &lt; minScore &lt; 1; default {@link #DEFAULT_ACCURACY}
    * @param acc The new accuracy
    */
   public void setAccuracy(float acc) {
     this.accuracy = acc;
   }

   /**
    * The accuracy (minimum score) to be used, unless overridden in {@link #suggestSimilar(String, int, IndexReader, String, SuggestMode, float)}, to
    * decide whether a suggestion is included or not.
    * @return The current accuracy setting
    */
   public float getAccuracy() {
     return accuracy;
   }

   /**
    * Suggest similar words.
    *
    * <p>As the Lucene similarity that is used to fetch the most relevant n-grammed terms
    * is not the same as the edit distance strategy used to calculate the best
    * matching spell-checked word from the hits that Lucene found, one usually has
    * to retrieve a couple of numSug's in order to get the true best match.
    *
    * <p>I.e. if numSug == 1, don't count on that suggestion being the best one.
    * Thus, you should set this value to <b>at least</b> 5 for a good suggestion.
    *
    * @param word the word you want a spell check done on
    * @param numSug the number of suggested words
    * @throws IOException if the underlying index throws an {@link IOException}
    * @throws AlreadyClosedException if the Spellchecker is already closed
    * @return String[]
    *
    * @see #suggestSimilar(String, int, IndexReader, String, SuggestMode, float)
    */
   public String[] suggestSimilar(String word, int numSug) throws IOException {
     return this.suggestSimilar(word, numSug, null, null, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
   }

   /**
    * Suggest similar words.
    *
    * <p>As the Lucene similarity that is used to fetch the most relevant n-grammed terms
    * is not the same as the edit distance strategy used to calculate the best
    * matching spell-checked word from the hits that Lucene found, one usually has
    * to retrieve a couple of numSug's in order to get the true best match.
    *
    * <p>I.e. if numSug == 1, don't count on that suggestion being the best one.
    * Thus, you should set this value to <b>at least</b> 5 for a good suggestion.
    *
    * @param word the word you want a spell check done on
    * @param numSug the number of suggested words
    * @param accuracy The minimum score a suggestion must have in order to qualify for inclusion in the results
    * @throws IOException if the underlying index throws an {@link IOException}
    * @throws AlreadyClosedException if the Spellchecker is already closed
    * @return String[]
    *
    * @see #suggestSimilar(String, int, IndexReader, String, SuggestMode, float)
    */
   public String[] suggestSimilar(String word, int numSug, float accuracy) throws IOException {
     return this.suggestSimilar(word, numSug, null, null, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, accuracy);
   }

   /**
    * Calls {@link #suggestSimilar(String, int, IndexReader, String, SuggestMode, float)
    *       suggestSimilar(word, numSug, ir, suggestMode, field, this.accuracy)}
    *
    */
   public String[] suggestSimilar(String word, int numSug, IndexReader ir,
       String field, SuggestMode suggestMode) throws IOException {
     return suggestSimilar(word, numSug, ir, field, suggestMode, this.accuracy);
   }

   /**
    * Suggest similar words (optionally restricted to a field of an index).
    *
    * <p>As the Lucene similarity that is used to fetch the most relevant n-grammed terms
    * is not the same as the edit distance strategy used to calculate the best
    * matching spell-checked word from the hits that Lucene found, one usually has
    * to retrieve a couple of numSug's in order to get the true best match.
    *
    * <p>I.e. if numSug == 1, don't count on that suggestion being the best one.
    * Thus, you should set this value to <b>at least</b> 5 for a good suggestion.
    *
    * @param word the word you want a spell check done on
    * @param numSug the number of suggested words
    * @param ir the indexReader of the user index (can be null see field param)
    * @param field the field of the user index: if field is not null, the suggested
    * words are restricted to the words present in this field.
    * @param suggestMode
    * (NOTE: if indexReader==null and/or field==null, then this is overridden with SuggestMode.SUGGEST_ALWAYS)
    * @param accuracy The minimum score a suggestion must have in order to qualify for inclusion in the results
    * @throws IOException if the underlying index throws an {@link IOException}
    * @throws AlreadyClosedException if the Spellchecker is already closed
    * @return String[] the sorted list of the suggest words with these 2 criteria:
    * first criteria: the edit distance, second criteria (only if restricted mode): the popularity
    * of the suggest words in the field of the user index
    *
    */
   public String[] suggestSimilar(String word, int numSug, IndexReader ir,
       String field, SuggestMode suggestMode, float accuracy) throws IOException {
     // obtainSearcher calls ensureOpen
     final IndexSearcher indexSearcher = obtainSearcher();
     try {
       if (ir == null || field == null) {
         suggestMode = SuggestMode.SUGGEST_ALWAYS;
       }
       if (suggestMode == SuggestMode.SUGGEST_ALWAYS) {
         ir = null;
         field = null;
       }

       final int lengthWord = word.length();

       final int freq = (ir != null && field != null) ? ir.docFreq(new Term(field, word)) : 0;
       final int goalFreq = suggestMode==SuggestMode.SUGGEST_MORE_POPULAR ? freq : 0;
       // if the word exists in the real index and we don't care for word frequency, return the word itself
       if (suggestMode==SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX && freq > 0) {
         return new String[] { word };
       }

       BooleanQuery.Builder query = new BooleanQuery.Builder();
       String[] grams;
       String key;

       for (int ng = getMin(lengthWord); ng <= getMax(lengthWord); ng++) {

         key = "gram" + ng; // form key

         grams = formGrams(word, ng); // form word into ngrams (allow dups too)

         if (grams.length == 0) {
           continue; // hmm
         }

         if (bStart > 0) { // should we boost prefixes?
           add(query, "start" + ng, grams[0], bStart); // matches start of word

         }
         if (bEnd > 0) { // should we boost suffixes
           add(query, "end" + ng, grams[grams.length - 1], bEnd); // matches end of word

         }
         for (int i = 0; i < grams.length; i++) {
           add(query, key, grams[i]);
         }
       }

       int maxHits = 10 * numSug;

   //    System.out.println("Q: " + query);
       ScoreDoc[] hits = indexSearcher.search(query.build(), maxHits).scoreDocs;
   //    System.out.println("HITS: " + hits.length());
       SuggestWordQueue sugQueue = new SuggestWordQueue(numSug, comparator);

       // go thru more than 'maxr' matches in case the distance filter triggers
       int stop = Math.min(hits.length, maxHits);
       SuggestWord sugWord = new SuggestWord();
       for (int i = 0; i < stop; i++) {

         sugWord.string = indexSearcher.doc(hits[i].doc).get(F_WORD); // get orig word

         // don't suggest a word for itself, that would be silly
         if (sugWord.string.equals(word)) {
           continue;
         }

         // edit distance
         sugWord.score = sd.getDistance(word,sugWord.string);
         if (sugWord.score < accuracy) {
           continue;
         }

         if (ir != null && field != null) { // use the user index
           sugWord.freq = ir.docFreq(new Term(field, sugWord.string)); // freq in the index
           // don't suggest a word that is not present in the field
           if ((suggestMode==SuggestMode.SUGGEST_MORE_POPULAR && goalFreq > sugWord.freq) || sugWord.freq < 1) {
             continue;
           }
         }
         sugQueue.insertWithOverflow(sugWord);
         if (sugQueue.size() == numSug) {
           // if queue full, maintain the minScore score
           accuracy = sugQueue.top().score;
         }
         sugWord = new SuggestWord();
       }

       // convert to array string
       String[] list = new String[sugQueue.size()];
       for (int i = sugQueue.size() - 1; i >= 0; i--) {
         list[i] = sugQueue.pop().string;
       }

       return list;
     } finally {
       releaseSearcher(indexSearcher);
     }
   }
   /**
    * Add a clause to a boolean query.
    */
   private static void add(BooleanQuery.Builder q, String name, String value, float boost) {
     Query tq = new TermQuery(new Term(name, value));
     q.add(new BooleanClause(new BoostQuery(tq, boost), BooleanClause.Occur.SHOULD));
   }

   /**
    * Add a clause to a boolean query.
    */
   private static void add(BooleanQuery.Builder q, String name, String value) {
     q.add(new BooleanClause(new TermQuery(new Term(name, value)), BooleanClause.Occur.SHOULD));
   }

   /**
    * Form all ngrams for a given word.
    * @param text the word to parse
    * @param ng the ngram length e.g. 3
    * @return an array of all ngrams in the word and note that duplicates are not removed
    */
   private static String[] formGrams(String text, int ng) {
     int len = text.length();
     String[] res = new String[len - ng + 1];
     for (int i = 0; i < len - ng + 1; i++) {
       res[i] = text.substring(i, i + ng);
     }
     return res;
   }

   /**
    * Removes all terms from the spell check index.
    * @throws IOException If there is a low-level I/O error.
    * @throws AlreadyClosedException if the Spellchecker is already closed
    */
   public void clearIndex() throws IOException {
     synchronized (modifyCurrentIndexLock) {
       ensureOpen();
       final Directory dir = this.spellIndex;
       final IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(null)
           .setOpenMode(OpenMode.CREATE));
       writer.close();
       swapSearcher(dir);
     }
   }

   /**
    * Check whether the word exists in the index.
    * @param word word to check
    * @throws IOException If there is a low-level I/O error.
    * @throws AlreadyClosedException if the Spellchecker is already closed
    * @return true if the word exists in the index
    */
   public boolean exist(String word) throws IOException {
     // obtainSearcher calls ensureOpen
     final IndexSearcher indexSearcher = obtainSearcher();
     try{
       // TODO: we should use ReaderUtil+seekExact, we dont care about the docFreq
       // this is just an existence check
       return indexSearcher.getIndexReader().docFreq(new Term(F_WORD, word)) > 0;
     } finally {
       releaseSearcher(indexSearcher);
     }
   }

   /**
    * Indexes the data from the given {@link Dictionary}.
    * @param dict Dictionary to index
    * @param config {@link IndexWriterConfig} to use
    * @param fullMerge whether or not the spellcheck index should be fully merged
    * @throws AlreadyClosedException if the Spellchecker is already closed
    * @throws IOException If there is a low-level I/O error.
    */
   public final void indexDictionary(Dictionary dict, IndexWriterConfig config, boolean fullMerge) throws IOException {
     synchronized (modifyCurrentIndexLock) {
       ensureOpen();
       final Directory dir = this.spellIndex;
       final IndexWriter writer = new IndexWriter(dir, config);
       IndexSearcher indexSearcher = obtainSearcher();
       final List<TermsEnum> termsEnums = new ArrayList<>();

       final IndexReader reader = searcher.getIndexReader();
       if (reader.maxDoc() > 0) {
         for (final LeafReaderContext ctx : reader.leaves()) {
           Terms terms = ctx.reader().terms(F_WORD);
           if (terms != null)
             termsEnums.add(terms.iterator());
         }
       }

       boolean isEmpty = termsEnums.isEmpty();

       try {
         BytesRefIterator iter = dict.getEntryIterator();
         BytesRef currentTerm;

         terms: while ((currentTerm = iter.next()) != null) {

           String word = currentTerm.utf8ToString();
           int len = word.length();
           if (len < 3) {
             continue; // too short we bail but "too long" is fine...
           }

           if (!isEmpty) {
             for (TermsEnum te : termsEnums) {
               if (te.seekExact(currentTerm)) {
                 continue terms;
               }
             }
           }

           // ok index the word
           Document doc = createDocument(word, getMin(len), getMax(len));
           writer.addDocument(doc);
         }
       } finally {
         releaseSearcher(indexSearcher);
       }
       if (fullMerge) {
         writer.forceMerge(1);
       }
       // close writer
       writer.close();
       // TODO: this isn't that great, maybe in the future SpellChecker should take
       // IWC in its ctor / keep its writer open?

       // also re-open the spell index to see our own changes when the next suggestion
       // is fetched:
       swapSearcher(dir);
     }
   }

   private static int getMin(int l) {
     if (l > 5) {
       return 3;
     }
     if (l == 5) {
       return 2;
     }
     return 1;
   }

   private static int getMax(int l) {
     if (l > 5) {
       return 4;
     }
     if (l == 5) {
       return 3;
     }
     return 2;
   }

   private static Document createDocument(String text, int ng1, int ng2) {
     Document doc = new Document();
     // the word field is never queried on... it's indexed so it can be quickly
     // checked for rebuild (and stored for retrieval). Doesn't need norms or TF/pos
     Field f = new StringField(F_WORD, text, Field.Store.YES);
     doc.add(f); // orig term
     addGram(text, doc, ng1, ng2);
     return doc;
   }

   private static void addGram(String text, Document doc, int ng1, int ng2) {
     int len = text.length();
     for (int ng = ng1; ng <= ng2; ng++) {
       String key = "gram" + ng;
       String end = null;
       for (int i = 0; i < len - ng + 1; i++) {
         String gram = text.substring(i, i + ng);
         FieldType ft = new FieldType(StringField.TYPE_NOT_STORED);
         ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
         Field ngramField = new Field(key, gram, ft);
         // spellchecker does not use positional queries, but we want freqs
         // for scoring these multivalued n-gram fields.
         doc.add(ngramField);
         if (i == 0) {
           // only one term possible in the startXXField, TF/pos and norms aren't needed.
           Field startField = new StringField("start" + ng, gram, Field.Store.NO);
           doc.add(startField);
         }
         end = gram;
       }
       if (end != null) { // may not be present if len==ng1
         // only one term possible in the endXXField, TF/pos and norms aren't needed.
         Field endField = new StringField("end" + ng, end, Field.Store.NO);
         doc.add(endField);
       }
     }
   }

   private IndexSearcher obtainSearcher() {
     synchronized (searcherLock) {
       ensureOpen();
       searcher.getIndexReader().incRef();
       return searcher;
     }
   }

   private void releaseSearcher(final IndexSearcher aSearcher) throws IOException{
       // don't check if open - always decRef
       // don't decrement the private searcher - could have been swapped
       aSearcher.getIndexReader().decRef();
   }

   private void ensureOpen() {
     if (closed) {
       throw new AlreadyClosedException("Spellchecker has been closed");
     }
   }

   /**
    * Close the IndexSearcher used by this SpellChecker
    * @throws IOException if the close operation causes an {@link IOException}
    * @throws AlreadyClosedException if the {@link SpellChecker} is already closed
    */
   @Override
   public void close() throws IOException {
     synchronized (searcherLock) {
       ensureOpen();
       closed = true;
       if (searcher != null) {
         searcher.getIndexReader().close();
       }
       searcher = null;
     }
   }

   private void swapSearcher(final Directory dir) throws IOException {
     /*
      * opening a searcher is possibly very expensive.
      * We rather close it again if the Spellchecker was closed during
      * this operation than block access to the current searcher while opening.
      */
     final IndexSearcher indexSearcher = createSearcher(dir);
     synchronized (searcherLock) {
       if(closed){
         indexSearcher.getIndexReader().close();
         throw new AlreadyClosedException("Spellchecker has been closed");
       }
       if (searcher != null) {
         searcher.getIndexReader().close();
       }
       // set the spellindex in the sync block - ensure consistency.
       searcher = indexSearcher;
       this.spellIndex = dir;
     }
   }

   /**
    * Creates a new read-only IndexSearcher
    * @param dir the directory used to open the searcher
    * @return a new read-only IndexSearcher
    * @throws IOException f there is a low-level IO error
    */
   // for testing purposes
   IndexSearcher createSearcher(final Directory dir) throws IOException{
     return new IndexSearcher(DirectoryReader.open(dir));
   }

   /**
    * Returns <code>true</code> if and only if the {@link SpellChecker} is
    * closed, otherwise <code>false</code>.
    *
    * @return <code>true</code> if and only if the {@link SpellChecker} is
    *         closed, otherwise <code>false</code>.
    */
   boolean isClosed(){
     return closed;
   }

 }