blob: 9a185cbefb4022ca3006a20b85e4351414bca0c6 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using System.Collections.Generic;
using Lucene.Net.Index;
using Lucene.Net.Search.Suggest;
using Lucene.Net.Util;
namespace Lucene.Net.Search.Spell
{
/// <summary>
/// HighFrequencyDictionary: terms taken from the given field
/// of a Lucene index, which appear in at least a given fraction
/// of the documents.
/// <para>
/// Threshold is a value in [0..1] representing the minimum
/// fraction of documents (of the total) in which a term must appear.
/// </para>
/// <para>
/// Based on LuceneDictionary.
/// </para>
/// </summary>
public class HighFrequencyDictionary : Dictionary
{
    // All three are assigned once in the constructor and only read afterwards.
    private readonly IndexReader reader;
    private readonly string field;
    private readonly float thresh;

    /// <summary>
    /// Creates a new Dictionary, pulling source terms from
    /// the specified <code>field</code> in the provided <code>reader</code>.
    /// <para>
    /// Terms appearing in less than the <code>thresh</code> fraction of documents
    /// will be excluded.
    /// </para>
    /// </summary>
    /// <param name="reader">Reader whose terms and document count are consulted.</param>
    /// <param name="field">Name of the field the terms are taken from.</param>
    /// <param name="thresh">
    /// Minimum fraction of documents a term must appear in; it is multiplied by the
    /// reader's document count to obtain the document-frequency cutoff.
    /// </param>
    public HighFrequencyDictionary(IndexReader reader, string field, float thresh)
    {
        this.reader = reader;
        this.field = field;
        this.thresh = thresh;
    }

    /// <summary>
    /// Gets a new iterator over the high-frequency terms. Every access
    /// constructs a fresh <see cref="HighFrequencyIterator"/>.
    /// </summary>
    public InputIterator EntryIterator
    {
        get
        {
            return new HighFrequencyIterator(this);
        }
    }

    /// <summary>
    /// Iterates the terms of the dictionary's field and yields only those whose
    /// document frequency reaches the cutoff computed at construction time.
    /// </summary>
    internal sealed class HighFrequencyIterator : InputIterator
    {
        private readonly HighFrequencyDictionary outerInstance;

        // Single buffer handed back by Next(); its content is overwritten on every hit.
        internal readonly BytesRef spare = new BytesRef();

        // Null when the field has no terms, in which case the iterator is empty.
        internal readonly TermsEnum termsEnum;

        // Smallest document frequency a term needs in order to be returned.
        internal int minNumDocs;

        // Document frequency of the term most recently returned by Next().
        internal long freq;

        /// <summary>
        /// Opens a terms enumeration over the dictionary's field and computes the
        /// document-frequency cutoff from the dictionary's threshold.
        /// </summary>
        /// <param name="outerInstance">Dictionary supplying the reader, field and threshold.</param>
        internal HighFrequencyIterator(HighFrequencyDictionary outerInstance)
        {
            this.outerInstance = outerInstance;

            Terms terms = MultiFields.GetTerms(outerInstance.reader, outerInstance.field);
            termsEnum = terms != null ? terms.Iterator(null) : null;

            // The product is computed in float and the cast truncates it toward zero.
            minNumDocs = (int)(outerInstance.thresh * (float)outerInstance.reader.NumDocs());
        }

        /// <summary>
        /// Tells whether a document frequency reaches the cutoff.
        /// </summary>
        /// <param name="freq">Document frequency to test.</param>
        /// <returns><c>true</c> when <paramref name="freq"/> is at least <c>minNumDocs</c>.</returns>
        internal bool IsFrequent(int freq)
        {
            return freq >= minNumDocs;
        }

        /// <summary>
        /// Gets the document frequency recorded for the term most recently
        /// returned by <see cref="Next"/>.
        /// </summary>
        public long Weight
        {
            get { return freq; }
        }

        /// <summary>
        /// Advances to the next term whose document frequency reaches the cutoff.
        /// </summary>
        /// <returns>
        /// The shared <c>spare</c> instance holding the term's bytes, or <c>null</c>
        /// when the field has no terms or the enumeration is exhausted. Because the
        /// same instance is reused, a later call overwrites an earlier result.
        /// </returns>
        public BytesRef Next()
        {
            if (termsEnum == null)
            {
                return null;
            }

            BytesRef next;
            while ((next = termsEnum.Next()) != null)
            {
                // Read the frequency once and use it for both the check and the weight.
                int docFreq = termsEnum.DocFreq();
                if (IsFrequent(docFreq))
                {
                    freq = docFreq;
                    spare.CopyBytes(next);
                    return spare;
                }
            }

            return null;
        }

        /// <summary>
        /// Gets the comparator of the underlying terms enumeration, or
        /// <c>null</c> when the field has no terms.
        /// </summary>
        public IComparer<BytesRef> Comparator
        {
            get
            {
                return termsEnum == null ? null : termsEnum.Comparator;
            }
        }

        /// <summary>
        /// Always <c>null</c>: this iterator carries no payloads.
        /// </summary>
        public BytesRef Payload
        {
            get { return null; }
        }

        /// <summary>
        /// Always <c>false</c>.
        /// </summary>
        public bool HasPayloads
        {
            get { return false; }
        }

        /// <summary>
        /// Always <c>null</c>: this iterator carries no contexts.
        /// </summary>
        public HashSet<BytesRef> Contexts
        {
            get { return null; }
        }

        /// <summary>
        /// Always <c>false</c>.
        /// </summary>
        public bool HasContexts
        {
            get { return false; }
        }
    }
}
}