lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/FuzzyLikeThisQuery.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.sandbox.queries;

 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.Objects;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.MultiTerms;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermStates;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.BoostAttribute;
 import org.apache.lucene.search.BoostQuery;
 import org.apache.lucene.search.ConstantScoreQuery;
 import org.apache.lucene.search.FuzzyTermsEnum;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.QueryVisitor;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.similarities.ClassicSimilarity;
 import org.apache.lucene.search.similarities.TFIDFSimilarity;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.PriorityQueue;
 import org.apache.lucene.util.automaton.LevenshteinAutomata;

 /**
  * Fuzzifies ALL terms provided as strings and then picks the best n differentiating terms.
  * In effect this mixes the behaviour of FuzzyQuery and MoreLikeThis but with special consideration
  * of fuzzy scoring factors.
  * This generally produces good results for queries where users may provide details in a number of
  * fields and have no knowledge of boolean query syntax and also want a degree of fuzzy matching and
  * a fast query.
  * <p>
  * For each source term the fuzzy variants are held in a BooleanQuery with no coord factor (because
  * we are not looking for matches on multiple variants in any one doc). Additionally, a specialized
  * TermQuery is used for variants and does not use that variant term's IDF because this would favour rarer
  * terms eg misspellings. Instead, all variants use the same IDF ranking (the one for the source query
  * term) and this is factored into the variant's boost. If the source query term does not exist in the
  * index the IDF is computed from the average document frequency of the variants.
  */
 public class FuzzyLikeThisQuery extends Query
 {
   // TODO: generalize this query (at least it should not reuse this static sim!
   // a better way might be to convert this into multitermquery rewrite methods.
   // the rewrite method can 'average' the TermStates's term statistics (docfreq,totalTermFreq)
   // provided to TermQuery, so that the general idea is agnostic to any scoring system...
   static TFIDFSimilarity sim=new ClassicSimilarity();

   // One entry per addTerms(String, String, float, int) call, in call order.
   ArrayList<FieldVals> fieldVals=new ArrayList<>();

   // Tokenizes every registered query string when the query is rewritten.
   Analyzer analyzer;

   // Upper bound on the number of fuzzy variants kept for any single analyzed token.
   int MAX_VARIANTS_PER_TERM=50;

   // When true, the generated term clauses are wrapped in ConstantScoreQuery (see newTermQuery).
   boolean ignoreTF=false;

   // Capacity of the queue that ranks candidate terms across all fields in rewrite().
   private int maxNumTerms;

   @Override
   public int hashCode() {
     int prime = 31;
     int result = classHash();
     result = prime * result + Objects.hashCode(analyzer);
     result = prime * result + Objects.hashCode(fieldVals);
     result = prime * result + (ignoreTF ? 1231 : 1237);
     result = prime * result + maxNumTerms;
     return result;
   }

   @Override
   public boolean equals(Object other) {
     return sameClassAs(other) &&
       equalsTo(getClass().cast(other));
   }

   /** Field-by-field comparison over the same state that {@link #hashCode()} hashes. */
   private boolean equalsTo(FuzzyLikeThisQuery other) {
     return Objects.equals(analyzer, other.analyzer) &&
       Objects.equals(fieldVals, other.fieldVals) &&
       ignoreTF == other.ignoreTF &&
       maxNumTerms == other.maxNumTerms;
   }

   /**
    * Creates a query with no terms; register the text to fuzzify with
    * {@link #addTerms(String, String, float, int)}.
    *
    * @param maxNumTerms The total number of terms clauses that will appear once rewritten as a BooleanQuery
    * @param analyzer used to tokenize the registered query strings when the query is rewritten
    */
   public FuzzyLikeThisQuery(int maxNumTerms, Analyzer analyzer)
   {
     this.analyzer=analyzer;
     this.maxNumTerms = maxNumTerms;
   }

   /** Holds the arguments of one {@code addTerms} call: the text, its field and the fuzzy settings. */
   static class FieldVals
   {
     String queryString;
     String fieldName;
     int maxEdits;
     int prefixLength;

     public FieldVals(String name, int maxEdits, int length, String queryString)
     {
       fieldName = name;
       this.maxEdits = maxEdits;
       prefixLength = length;
       this.queryString = queryString;
     }

     @Override
     public int hashCode() {
       final int prime = 31;
       int result = 1;
       // Objects.hashCode yields 0 for null, matching the previous hand-written null checks.
       result = prime * result + Objects.hashCode(fieldName);
       result = prime * result + maxEdits;
       result = prime * result + prefixLength;
       result = prime * result + Objects.hashCode(queryString);
       return result;
     }

     @Override
     public boolean equals(Object obj) {
       if (this == obj) {
         return true;
       }
       if (obj == null || getClass() != obj.getClass()) {
         return false;
       }
       FieldVals other = (FieldVals) obj;
       return maxEdits == other.maxEdits
         && prefixLength == other.prefixLength
         && Objects.equals(fieldName, other.fieldName)
         && Objects.equals(queryString, other.queryString);
     }
   }

   /**
    * Adds user input for "fuzzification"
    * @param queryString The string which will be parsed by the analyzer and for which fuzzy variants will be generated
    * @param fieldName The field whose indexed terms are searched for variants
    * @param minSimilarity Despite its name, the maximum number of edits allowed for a variant; must be a
    *        whole number between 0 and {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}, inclusive
    *        (see FuzzyTermsEnum)
    * @param prefixLength Length of required common prefix on variant terms (see FuzzyTermsEnum)
    * @throws IllegalArgumentException if {@code minSimilarity} is not a whole number in the supported range
    */
   public void addTerms(String queryString, String fieldName,float minSimilarity, int prefixLength)
   {
     int maxEdits = (int) minSimilarity;
     // The first comparison rejects fractional values (and NaN), the others reject out-of-range ones.
     if (maxEdits != minSimilarity || maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
       throw new IllegalArgumentException("minSimilarity must be an integer value between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE + ", inclusive; got " + minSimilarity);
     }
     fieldVals.add(new FieldVals(fieldName,maxEdits,prefixLength,queryString));
   }


   /**
    * Analyzes one registered query string and offers the best fuzzy variants of each distinct token
    * to the global queue {@code q}. Does nothing when the query string is null or the reader has no
    * terms for the field.
    */
   private void addTerms(IndexReader reader, FieldVals f, ScoreTermQueue q) throws IOException {
     if (f.queryString == null) return;
     final Terms terms = MultiTerms.getTerms(reader, f.fieldName);
     if (terms == null) {
       return;
     }
     try (TokenStream ts = analyzer.tokenStream(f.fieldName, f.queryString)) {
       CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);

       int corpusNumDocs = reader.numDocs();
       HashSet<String> processedTerms = new HashSet<>();
       ts.reset();
       while (ts.incrementToken()) {
         String term = termAtt.toString();
         // Set.add returns false for a token that was already handled, so repeats are skipped.
         if (processedTerms.add(term)) {
           ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
           float minScore = 0;
           Term startTerm = new Term(f.fieldName, term);
           FuzzyTermsEnum fe = new FuzzyTermsEnum(terms, startTerm, f.maxEdits, f.prefixLength, true);
           //store the df so all variants use same idf
           int df = reader.docFreq(startTerm);
           int numVariants = 0;
           // Accumulated as long: the sum of many int document frequencies can exceed Integer.MAX_VALUE.
           long totalVariantDocFreqs = 0;
           BytesRef possibleMatch;
           BoostAttribute boostAtt =
             fe.attributes().addAttribute(BoostAttribute.class);
           while ((possibleMatch = fe.next()) != null) {
             numVariants++;
             totalVariantDocFreqs += fe.docFreq();
             float score = boostAtt.getBoost();
             if (variantsQ.size() < MAX_VARIANTS_PER_TERM || score > minScore) {
               // deep copy: the enum hands out a BytesRef that must not be retained as-is
               ScoreTerm st = new ScoreTerm(new Term(startTerm.field(), BytesRef.deepCopyOf(possibleMatch)), score, startTerm);
               variantsQ.insertWithOverflow(st);
               minScore = variantsQ.top().score; // maintain minScore
             }
             fe.setMaxNonCompetitiveBoost(variantsQ.size() >= MAX_VARIANTS_PER_TERM ? minScore : Float.NEGATIVE_INFINITY);
           }

           if (numVariants > 0) {
             // The mean of int values always fits in an int, so the narrowing cast is safe.
             int avgDf = (int) (totalVariantDocFreqs / numVariants);
             if (df == 0) { //no direct match we can use as df for all variants
               df = avgDf; //use avg df of all variants
             }

             // take the top variants (scored by edit distance) and reset the score
             // to include an IDF factor then add to the global queue for ranking
             // overall top query terms
             int size = variantsQ.size();
             for (int i = 0; i < size; i++) {
               ScoreTerm st = variantsQ.pop();
               st.score = (st.score * st.score) * sim.idf(df, corpusNumDocs);
               q.insertWithOverflow(st);
             }
           }
         }
       }
       ts.end();
     }
   }

   /**
    * Builds the clause for one selected variant: a constant-score term query when {@link #ignoreTF}
    * is set, otherwise a TermQuery backed by artificial term statistics.
    */
   private Query newTermQuery(IndexReader reader, Term term) throws IOException {
     if (ignoreTF) {
       return new ConstantScoreQuery(new TermQuery(term));
     } else {
       // we build an artificial TermStates that will give an overall df and ttf
       // equal to 1
       TermStates context = new TermStates(reader.getContext());
       for (LeafReaderContext leafContext : reader.leaves()) {
         Terms terms = leafContext.reader().terms(term.field());
         if (terms != null) {
           TermsEnum termsEnum = terms.iterator();
           if (termsEnum.seekExact(term.bytes())) {
             int freq = 1 - context.docFreq(); // we want the total df and ttf to be 1
             context.register(termsEnum.termState(), leafContext.ord, freq, freq);
           }
         }
       }
       return new TermQuery(term, context);
     }
   }

   @Override
   public void visit(QueryVisitor visitor) {
     visitor.visitLeaf(this);
   }

   /**
    * Expands the registered query strings into a BooleanQuery of boosted term clauses, grouping the
    * selected variants by the source term they were derived from.
    */
   @Override
   public Query rewrite(IndexReader reader) throws IOException
   {
     ScoreTermQueue q = new ScoreTermQueue(maxNumTerms);
     //load up the list of possible terms
     for (FieldVals f : fieldVals) {
       addTerms(reader, f, q);
     }

     BooleanQuery.Builder bq = new BooleanQuery.Builder();

     //create BooleanQueries to hold the variants for each token/field pair and ensure it
     // has no coord factor
     //Step 1: sort the termqueries by term/field
     HashMap<Term,ArrayList<ScoreTerm>> variantQueries=new HashMap<>();
     int size = q.size();
     for (int i = 0; i < size; i++) {
       ScoreTerm st = q.pop();
       ArrayList<ScoreTerm> l = variantQueries.get(st.fuzziedSourceTerm);
       if (l == null) {
         l = new ArrayList<>();
         variantQueries.put(st.fuzziedSourceTerm, l);
       }
       l.add(st);
     }
     //Step 2: Organize the sorted termqueries into zero-coord scoring boolean queries
     for (ArrayList<ScoreTerm> variants : variantQueries.values()) {
       if (variants.size() == 1) {
         //optimize where only one selected variant
         ScoreTerm st = variants.get(0);
         Query tq = newTermQuery(reader, st.term);
         // set the boost to a mix of IDF and score
         bq.add(new BoostQuery(tq, st.score), BooleanClause.Occur.SHOULD);
       } else {
         BooleanQuery.Builder termVariants = new BooleanQuery.Builder();
         for (ScoreTerm st : variants) {
           // found a match
           Query tq = newTermQuery(reader, st.term);
           // set the boost using the ScoreTerm's score
           termVariants.add(new BoostQuery(tq, st.score), BooleanClause.Occur.SHOULD); // add to query
         }
         bq.add(termVariants.build(), BooleanClause.Occur.SHOULD); // add to query
       }
     }
     //TODO possible alternative step 3 - organize above booleans into a new layer of field-based
     // booleans with a minimum-should-match of NumFields-1?
     return bq.build();
   }

   //Holds info for a fuzzy term variant - initially score is set to the boost reported by the
   // fuzzy terms enum (for ranking best term variants) then is reset with IDF for use in ranking
   // against all other terms/fields
   private static class ScoreTerm{
     public Term term;
     public float score;
     // the analyzed query token this variant was derived from
     Term fuzziedSourceTerm;

     public ScoreTerm(Term term, float score, Term fuzziedSourceTerm){
       this.term = term;
       this.score = score;
       this.fuzziedSourceTerm=fuzziedSourceTerm;
     }
   }

   /** Bounded queue of {@link ScoreTerm}s ordered by score, ties broken by term order. */
   private static class ScoreTermQueue extends PriorityQueue<ScoreTerm> {
     public ScoreTermQueue(int size){
       super(size);
     }

     /* (non-Javadoc)
      * @see org.apache.lucene.util.PriorityQueue#lessThan(java.lang.Object, java.lang.Object)
      */
     @Override
     protected boolean lessThan(ScoreTerm termA, ScoreTerm termB) {
       // On equal scores the term that compares greater counts as "less".
       if (termA.score== termB.score)
         return termA.term.compareTo(termB.term) > 0;
       else
         return termA.score < termB.score;
     }

   }

   /**
    * Describes this query by its settings and registered field/text pairs. The {@code field}
    * argument is not used: every entry is printed with its own field name.
    *
    * @see org.apache.lucene.search.Query#toString(java.lang.String)
    */
   @Override
   public String toString(String field)
   {
     StringBuilder buffer = new StringBuilder("FuzzyLikeThisQuery(maxNumTerms=");
     buffer.append(maxNumTerms).append(", ignoreTF=").append(ignoreTF).append(", terms=[");
     for (Iterator<FieldVals> it = fieldVals.iterator(); it.hasNext();) {
       FieldVals f = it.next();
       buffer.append(f.fieldName).append(':').append(f.queryString)
         .append('~').append(f.maxEdits)
         .append(" prefixLength=").append(f.prefixLength);
       if (it.hasNext()) {
         buffer.append(", ");
       }
     }
     return buffer.append("])").toString();
   }


   /** Returns whether generated term clauses are wrapped in a constant-score query. */
   public boolean isIgnoreTF()
   {
     return ignoreTF;
   }


   /**
    * Sets whether generated term clauses are wrapped in a constant-score query.
    *
    * @param ignoreTF true to wrap each term clause in a ConstantScoreQuery
    */
   public void setIgnoreTF(boolean ignoreTF)
   {
     this.ignoreTF = ignoreTF;
   }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.sandbox.queries;

	import java.io.IOException;
	import java.util.ArrayList;
	import java.util.HashMap;
	import java.util.HashSet;
	import java.util.Iterator;
	import java.util.Objects;

	import org.apache.lucene.analysis.Analyzer;
	import org.apache.lucene.analysis.TokenStream;
	import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
	import org.apache.lucene.index.IndexReader;
	import org.apache.lucene.index.LeafReaderContext;
	import org.apache.lucene.index.MultiTerms;
	import org.apache.lucene.index.Term;
	import org.apache.lucene.index.TermStates;
	import org.apache.lucene.index.Terms;
	import org.apache.lucene.index.TermsEnum;
	import org.apache.lucene.search.BooleanClause;
	import org.apache.lucene.search.BooleanQuery;
	import org.apache.lucene.search.BoostAttribute;
	import org.apache.lucene.search.BoostQuery;
	import org.apache.lucene.search.ConstantScoreQuery;
	import org.apache.lucene.search.FuzzyTermsEnum;
	import org.apache.lucene.search.Query;
	import org.apache.lucene.search.QueryVisitor;
	import org.apache.lucene.search.TermQuery;
	import org.apache.lucene.search.similarities.ClassicSimilarity;
	import org.apache.lucene.search.similarities.TFIDFSimilarity;
	import org.apache.lucene.util.BytesRef;
	import org.apache.lucene.util.PriorityQueue;
	import org.apache.lucene.util.automaton.LevenshteinAutomata;

	/**
	* Fuzzifies ALL terms provided as strings and then picks the best n differentiating terms.
	* In effect this mixes the behaviour of FuzzyQuery and MoreLikeThis but with special consideration
	* of fuzzy scoring factors.
	* This generally produces good results for queries where users may provide details in a number of
	* fields and have no knowledge of boolean query syntax and also want a degree of fuzzy matching and
	* a fast query.
	*
	* For each source term the fuzzy variants are held in a BooleanQuery with no coord factor (because
	* we are not looking for matches on multiple variants in any one doc). Additionally, a specialized
	* TermQuery is used for variants and does not use that variant term's IDF because this would favour rarer
	* terms eg misspellings. Instead, all variants use the same IDF ranking (the one for the source query
	* term) and this is factored into the variant's boost. If the source query term does not exist in the
	* index the average IDF of the variants is used.
	*/
	public class FuzzyLikeThisQuery extends Query
	{
	// TODO: generalize this query (at least it should not reuse this static sim!
	// a better way might be to convert this into multitermquery rewrite methods.
	// the rewrite method can 'average' the TermStates's term statistics (docfreq,totalTermFreq)
	// provided to TermQuery, so that the general idea is agnostic to any scoring system...
	static TFIDFSimilarity sim=new ClassicSimilarity();
	ArrayList<FieldVals> fieldVals=new ArrayList<>();
	Analyzer analyzer;

	int MAX_VARIANTS_PER_TERM=50;
	boolean ignoreTF=false;
	private int maxNumTerms;

	@Override
	public int hashCode() {
	int prime = 31;
	int result = classHash();
	result = prime * result + Objects.hashCode(analyzer);
	result = prime * result + Objects.hashCode(fieldVals);
	result = prime * result + (ignoreTF ? 1231 : 1237);
	result = prime * result + maxNumTerms;
	return result;
	}

	@Override
	public boolean equals(Object other) {
	return sameClassAs(other) &&
	equalsTo(getClass().cast(other));
	}

	private boolean equalsTo(FuzzyLikeThisQuery other) {
	return Objects.equals(analyzer, other.analyzer) &&
	Objects.equals(fieldVals, other.fieldVals) &&
	ignoreTF == other.ignoreTF &&
	maxNumTerms == other.maxNumTerms;
	}

	/**
	*
	* @param maxNumTerms The total number of terms clauses that will appear once rewritten as a BooleanQuery
	*/
	public FuzzyLikeThisQuery(int maxNumTerms, Analyzer analyzer)
	{
	this.analyzer=analyzer;
	this.maxNumTerms = maxNumTerms;
	}

	static class FieldVals
	{
	String queryString;
	String fieldName;
	int maxEdits;
	int prefixLength;
	public FieldVals(String name, int maxEdits, int length, String queryString)
	{
	fieldName = name;
	this.maxEdits = maxEdits;
	prefixLength = length;
	this.queryString = queryString;
	}

	@Override
	public int hashCode() {
	final int prime = 31;
	int result = 1;
	result = prime * result
	+ ((fieldName == null) ? 0 : fieldName.hashCode());
	result = prime * result + maxEdits;
	result = prime * result + prefixLength;
	result = prime * result
	+ ((queryString == null) ? 0 : queryString.hashCode());
	return result;
	}

	@Override
	public boolean equals(Object obj) {
	if (this == obj)
	return true;
	if (obj == null)
	return false;
	if (getClass() != obj.getClass())
	return false;
	FieldVals other = (FieldVals) obj;
	if (fieldName == null) {
	if (other.fieldName != null)
	return false;
	} else if (!fieldName.equals(other.fieldName))
	return false;
	if (maxEdits != other.maxEdits) {
	return false;
	}
	if (prefixLength != other.prefixLength)
	return false;
	if (queryString == null) {
	if (other.queryString != null)
	return false;
	} else if (!queryString.equals(other.queryString))
	return false;
	return true;
	}



	}

	/**
	* Adds user input for "fuzzification"
	* @param queryString The string which will be parsed by the analyzer and for which fuzzy variants will be parsed
	* @param minSimilarity The minimum similarity of the term variants; must be 0, 1 or 2 (see FuzzyTermsEnum)
	* @param prefixLength Length of required common prefix on variant terms (see FuzzyTermsEnum)
	*/
	public void addTerms(String queryString, String fieldName,float minSimilarity, int prefixLength)
	{
	int maxEdits = (int) minSimilarity;
	if (maxEdits != minSimilarity \|\| maxEdits < 0 \|\| maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
	throw new IllegalArgumentException("minSimilarity must integer value between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE + ", inclusive; got " + minSimilarity);
	}
	fieldVals.add(new FieldVals(fieldName,maxEdits,prefixLength,queryString));
	}


	private void addTerms(IndexReader reader, FieldVals f, ScoreTermQueue q) throws IOException {
	if (f.queryString == null) return;
	final Terms terms = MultiTerms.getTerms(reader, f.fieldName);
	if (terms == null) {
	return;
	}
	try (TokenStream ts = analyzer.tokenStream(f.fieldName, f.queryString)) {
	CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);

	int corpusNumDocs = reader.numDocs();
	HashSet<String> processedTerms = new HashSet<>();
	ts.reset();
	while (ts.incrementToken()) {
	String term = termAtt.toString();
	if (!processedTerms.contains(term)) {
	processedTerms.add(term);
	ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
	float minScore = 0;
	Term startTerm = new Term(f.fieldName, term);
	FuzzyTermsEnum fe = new FuzzyTermsEnum(terms, startTerm, f.maxEdits, f.prefixLength, true);
	//store the df so all variants use same idf
	int df = reader.docFreq(startTerm);
	int numVariants = 0;
	int totalVariantDocFreqs = 0;
	BytesRef possibleMatch;
	BoostAttribute boostAtt =
	fe.attributes().addAttribute(BoostAttribute.class);
	while ((possibleMatch = fe.next()) != null) {
	numVariants++;
	totalVariantDocFreqs += fe.docFreq();
	float score = boostAtt.getBoost();
	if (variantsQ.size() < MAX_VARIANTS_PER_TERM \|\| score > minScore) {
	ScoreTerm st = new ScoreTerm(new Term(startTerm.field(), BytesRef.deepCopyOf(possibleMatch)), score, startTerm);
	variantsQ.insertWithOverflow(st);
	minScore = variantsQ.top().score; // maintain minScore
	}
	fe.setMaxNonCompetitiveBoost(variantsQ.size() >= MAX_VARIANTS_PER_TERM ? minScore : Float.NEGATIVE_INFINITY);
	}

	if (numVariants > 0) {
	int avgDf = totalVariantDocFreqs / numVariants;
	if (df == 0)//no direct match we can use as df for all variants
	{
	df = avgDf; //use avg df of all variants
	}

	// take the top variants (scored by edit distance) and reset the score
	// to include an IDF factor then add to the global queue for ranking
	// overall top query terms
	int size = variantsQ.size();
	for (int i = 0; i < size; i++) {
	ScoreTerm st = variantsQ.pop();
	st.score = (st.score * st.score) * sim.idf(df, corpusNumDocs);
	q.insertWithOverflow(st);
	}
	}
	}
	}
	ts.end();
	}
	}

	private Query newTermQuery(IndexReader reader, Term term) throws IOException {
	if (ignoreTF) {
	return new ConstantScoreQuery(new TermQuery(term));
	} else {
	// we build an artificial TermStates that will give an overall df and ttf
	// equal to 1
	TermStates context = new TermStates(reader.getContext());
	for (LeafReaderContext leafContext : reader.leaves()) {
	Terms terms = leafContext.reader().terms(term.field());
	if (terms != null) {
	TermsEnum termsEnum = terms.iterator();
	if (termsEnum.seekExact(term.bytes())) {
	int freq = 1 - context.docFreq(); // we want the total df and ttf to be 1
	context.register(termsEnum.termState(), leafContext.ord, freq, freq);
	}
	}
	}
	return new TermQuery(term, context);
	}
	}

	@Override
	public void visit(QueryVisitor visitor) {
	visitor.visitLeaf(this);
	}

	@Override
	public Query rewrite(IndexReader reader) throws IOException
	{
	ScoreTermQueue q = new ScoreTermQueue(maxNumTerms);
	//load up the list of possible terms
	for (FieldVals f : fieldVals) {
	addTerms(reader, f, q);
	}

	BooleanQuery.Builder bq = new BooleanQuery.Builder();

	//create BooleanQueries to hold the variants for each token/field pair and ensure it
	// has no coord factor
	//Step 1: sort the termqueries by term/field
	HashMap<Term,ArrayList<ScoreTerm>> variantQueries=new HashMap<>();
	int size = q.size();
	for(int i = 0; i < size; i++)
	{
	ScoreTerm st = q.pop();
	ArrayList<ScoreTerm> l= variantQueries.get(st.fuzziedSourceTerm);
	if(l==null)
	{
	l=new ArrayList<>();
	variantQueries.put(st.fuzziedSourceTerm,l);
	}
	l.add(st);
	}
	//Step 2: Organize the sorted termqueries into zero-coord scoring boolean queries
	for (Iterator<ArrayList<ScoreTerm>> iter = variantQueries.values().iterator(); iter.hasNext();)
	{
	ArrayList<ScoreTerm> variants = iter.next();
	if(variants.size()==1)
	{
	//optimize where only one selected variant
	ScoreTerm st= variants.get(0);
	Query tq = newTermQuery(reader, st.term);
	// set the boost to a mix of IDF and score
	bq.add(new BoostQuery(tq, st.score), BooleanClause.Occur.SHOULD);
	}
	else
	{
	BooleanQuery.Builder termVariants=new BooleanQuery.Builder();
	for (Iterator<ScoreTerm> iterator2 = variants.iterator(); iterator2
	.hasNext();)
	{
	ScoreTerm st = iterator2.next();
	// found a match
	Query tq = newTermQuery(reader, st.term);
	// set the boost using the ScoreTerm's score
	termVariants.add(new BoostQuery(tq, st.score), BooleanClause.Occur.SHOULD); // add to query
	}
	bq.add(termVariants.build(), BooleanClause.Occur.SHOULD); // add to query
	}
	}
	//TODO possible alternative step 3 - organize above booleans into a new layer of field-based
	// booleans with a minimum-should-match of NumFields-1?
	return bq.build();
	}

	//Holds info for a fuzzy term variant - initially score is set to edit distance (for ranking best
	// term variants) then is reset with IDF for use in ranking against all other
	// terms/fields
	private static class ScoreTerm{
	public Term term;
	public float score;
	Term fuzziedSourceTerm;

	public ScoreTerm(Term term, float score, Term fuzziedSourceTerm){
	this.term = term;
	this.score = score;
	this.fuzziedSourceTerm=fuzziedSourceTerm;
	}
	}

	private static class ScoreTermQueue extends PriorityQueue<ScoreTerm> {
	public ScoreTermQueue(int size){
	super(size);
	}

	/* (non-Javadoc)
	* @see org.apache.lucene.util.PriorityQueue#lessThan(java.lang.Object, java.lang.Object)
	*/
	@Override
	protected boolean lessThan(ScoreTerm termA, ScoreTerm termB) {
	if (termA.score== termB.score)
	return termA.term.compareTo(termB.term) > 0;
	else
	return termA.score < termB.score;
	}

	}

	/* (non-Javadoc)
	* @see org.apache.lucene.search.Query#toString(java.lang.String)
	*/
	@Override
	public String toString(String field)
	{
	return null;
	}


	public boolean isIgnoreTF()
	{
	return ignoreTF;
	}


	public void setIgnoreTF(boolean ignoreTF)
	{
	this.ignoreTF = ignoreTF;
	}

	}