// Lucene version compatibility level 4.8.1
using Lucene.Net.Index;
using Lucene.Net.Search;
using Lucene.Net.Support;
using Lucene.Net.Util;
using System;
using System.Collections.Generic;
using System.Text;
namespace Lucene.Net.Queries
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Constructs a filter for docs matching any of the terms added to this class.
/// Unlike a RangeFilter, this can be used for filtering on multiple terms that are not necessarily in
/// a sequence. An example might be a collection of primary keys from a database query result or perhaps
/// a choice of "category" labels picked by the end user. As a filter, this is much faster than the
/// equivalent query (a <see cref="BooleanQuery"/> with many "should" <see cref="TermQuery"/>s).
/// </summary>
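/// <example>
/// A minimal usage sketch (the <c>searcher</c> instance and the field values are illustrative):
/// <code>
/// var filter = new TermsFilter(new Term("id", "17"), new Term("id", "42"));
/// TopDocs hits = searcher.Search(new MatchAllDocsQuery(), filter, 10);
/// </code>
/// </example>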
public sealed class TermsFilter : Filter
{
/*
 * This class is often used for a large number of terms in a single field.
 * To optimize for this case, and to be filter-cache friendly, we
 * serialize all terms into a single byte array and store offsets
 * in a parallel array to keep the number of objects constant and to speed up
 * Equals() / GetHashCode().
 *
 * This adds quite a bit of complexity but allows large term filters to
 * be efficient for GC and cache lookups.
*/
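//
// For example (illustrative values): after adding the terms "bar" and "foo"
// in field "f", the parallel structures hold
//   termsBytes     = { 'b', 'a', 'r', 'f', 'o', 'o' }
//   offsets        = { 0, 3, 6 }
//   termsAndFields = { ("f", start: 0, end: 2) }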
private readonly int[] offsets;
private readonly byte[] termsBytes;
private readonly TermsAndField[] termsAndFields;
private readonly int hashCode; // cached hashcode for fast cache lookups
private const int PRIME = 31;
/// <summary>
/// Creates a new <see cref="TermsFilter"/> from the given list. The list
/// can contain duplicate terms and multiple fields.
/// </summary>
public TermsFilter(IList<Term> terms)
: this(new FieldAndTermEnumAnonymousClass(terms), terms.Count)
{
}
private class FieldAndTermEnumAnonymousClass : FieldAndTermEnum
{
public FieldAndTermEnumAnonymousClass(IList<Term> terms)
{
if (terms.Count == 0)
{
throw new ArgumentException("no terms provided");
}
// we need to sort for deduplication and to have a common cache key
terms.Sort();
iter = terms.GetEnumerator();
}
private readonly IEnumerator<Term> iter;
public override bool MoveNext()
{
if (iter.MoveNext())
{
var next = iter.Current;
Field = next.Field;
m_current = next.Bytes;
return true;
}
m_current = null;
return false;
}
}
/// <summary>
/// Creates a new <see cref="TermsFilter"/> from the given <see cref="BytesRef"/> list for
/// a single field.
/// </summary>
public TermsFilter(string field, IList<BytesRef> terms)
: this(new FieldAndTermEnumAnonymousClass2(field, terms), terms.Count)
{
}
private class FieldAndTermEnumAnonymousClass2 : FieldAndTermEnum
{
public FieldAndTermEnumAnonymousClass2(string field, IList<BytesRef> terms)
: base(field)
{
if (terms.Count == 0)
{
throw new ArgumentException("no terms provided");
}
// we need to sort for deduplication and to have a common cache key
terms.Sort();
iter = terms.GetEnumerator();
}
private readonly IEnumerator<BytesRef> iter;
public override bool MoveNext()
{
if (iter.MoveNext())
{
m_current = iter.Current;
return true;
}
m_current = null;
return false;
}
}
/// <summary>
/// Creates a new <see cref="TermsFilter"/> from the given <see cref="BytesRef"/> array for
/// a single field.
/// </summary>
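/// <example>
/// For example (field and values are illustrative):
/// <code>
/// var filter = new TermsFilter("category", new BytesRef("books"), new BytesRef("music"));
/// </code>
/// </example>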
public TermsFilter(string field, params BytesRef[] terms)
: this(field, (IList<BytesRef>)terms)
{
// this ctor prevents unnecessary Term creations
}
/// <summary>
/// Creates a new <see cref="TermsFilter"/> from the given array. The array can
/// contain duplicate terms and multiple fields.
/// </summary>
public TermsFilter(params Term[] terms)
: this((IList<Term>)terms)
{
}
private TermsFilter(FieldAndTermEnum iter, int length)
{
// TODO: maybe use oal.index.PrefixCodedTerms instead?
// If the number of terms is more than a few hundred it
// should be a win
// TODO: we also pack terms in FieldCache/DocValues
// ... maybe we can refactor to share that code
// TODO: yet another option is to build the union of the terms in
// an automaton and call intersect on the termsenum if the density is high
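// The single pass below walks the sorted (field, term) stream: exact
// duplicates are skipped, each unique term's bytes are appended to one
// growing buffer, and a TermsAndField slice is recorded whenever the
// field changes (and once more after the loop for the final field).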
int hash = 9;
var serializedTerms = Arrays.Empty<byte>();
this.offsets = new int[length + 1];
int lastEndOffset = 0;
int index = 0;
var termsAndFields = new List<TermsAndField>();
TermsAndField lastTermsAndField = null;
BytesRef previousTerm = null;
string previousField = null;
BytesRef currentTerm;
string currentField;
while (iter.MoveNext())
{
currentTerm = iter.Current;
currentField = iter.Field;
if (currentField == null)
{
throw new ArgumentException("Field must not be null");
}
if (previousField != null)
{
// deduplicate
if (previousField.Equals(currentField, StringComparison.Ordinal))
{
if (previousTerm.BytesEquals(currentTerm))
{
continue;
}
}
else
{
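// field changed: record the slice of term indices that belonged to the previous field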
int _start = lastTermsAndField == null ? 0 : lastTermsAndField.end;
lastTermsAndField = new TermsAndField(_start, index, previousField);
termsAndFields.Add(lastTermsAndField);
}
}
hash = PRIME * hash + currentField.GetHashCode();
hash = PRIME * hash + currentTerm.GetHashCode();
if (serializedTerms.Length < lastEndOffset + currentTerm.Length)
{
serializedTerms = ArrayUtil.Grow(serializedTerms, lastEndOffset + currentTerm.Length);
}
Array.Copy(currentTerm.Bytes, currentTerm.Offset, serializedTerms, lastEndOffset, currentTerm.Length);
offsets[index] = lastEndOffset;
lastEndOffset += currentTerm.Length;
index++;
previousTerm = currentTerm;
previousField = currentField;
}
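// the loop above only emits a TermsAndField entry on a field change, so the
// final end offset and the slice for the last field are flushed here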
offsets[index] = lastEndOffset;
int start = lastTermsAndField == null ? 0 : lastTermsAndField.end;
lastTermsAndField = new TermsAndField(start, index, previousField);
termsAndFields.Add(lastTermsAndField);
this.termsBytes = ArrayUtil.Shrink(serializedTerms, lastEndOffset);
this.termsAndFields = termsAndFields.ToArray();
this.hashCode = hash;
}
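/// <summary>
/// Returns a <see cref="DocIdSet"/> containing the documents that match any of this
/// filter's terms, or <c>null</c> when no document matches.
/// </summary>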
public override DocIdSet GetDocIdSet(AtomicReaderContext context, IBits acceptDocs)
{
AtomicReader reader = context.AtomicReader;
FixedBitSet result = null; // lazy init if needed - no need to create a big bitset ahead of time
Fields fields = reader.Fields;
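// reusable view over the shared serialized term bytes; Offset/Length are
// repositioned below to frame one term at a time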
BytesRef spare = new BytesRef(this.termsBytes);
if (fields == null)
{
return result;
}
Terms terms; // LUCENENET: IDE0059: Remove unnecessary value assignment
TermsEnum termsEnum = null;
DocsEnum docs = null;
foreach (TermsAndField termsAndField in this.termsAndFields)
{
if ((terms = fields.GetTerms(termsAndField.field)) != null)
{
termsEnum = terms.GetEnumerator(termsEnum); // this won't return null
for (int i = termsAndField.start; i < termsAndField.end; i++)
{
spare.Offset = offsets[i];
spare.Length = offsets[i + 1] - offsets[i];
if (termsEnum.SeekExact(spare))
{
docs = termsEnum.Docs(acceptDocs, docs, DocsFlags.NONE); // no freq since we don't need them
if (result == null)
{
if (docs.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
{
result = new FixedBitSet(reader.MaxDoc);
// lazy init but don't do it in the hot loop since we could read many docs
result.Set(docs.DocID);
}
}
while (docs.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
{
result.Set(docs.DocID);
}
}
}
}
}
return result;
}
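/// <summary>
/// Compares the cached hash codes first, then the per-field slices, offsets, and
/// raw term bytes; the sorting done at construction time keeps this comparison cheap.
/// </summary>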
public override bool Equals(object obj)
{
if (this == obj)
{
return true;
}
if ((obj == null) || (obj.GetType() != this.GetType()))
{
return false;
}
var test = (TermsFilter)obj;
// first check the fields before even comparing the bytes
if (test.hashCode == hashCode && Arrays.Equals(termsAndFields, test.termsAndFields))
{
int lastOffset = termsAndFields[termsAndFields.Length - 1].end;
// compare offsets; since we sort, they must be identical
if (ArrayUtil.Equals(offsets, 0, test.offsets, 0, lastOffset + 1))
{
// straight byte comparison; since we sort, they must be identical
return ArrayUtil.Equals(termsBytes, 0, test.termsBytes, 0, offsets[lastOffset]);
}
}
return false;
}
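/// <summary>
/// Returns the hash code precomputed in the constructor, keeping filter-cache
/// lookups fast.
/// </summary>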
public override int GetHashCode()
{
return hashCode;
}
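/// <summary>
/// Returns a space-separated list of <c>field:term</c> pairs, e.g. <c>id:17 id:42</c>.
/// </summary>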
public override string ToString()
{
var builder = new StringBuilder();
var spare = new BytesRef(termsBytes);
bool first = true;
for (int i = 0; i < termsAndFields.Length; i++)
{
TermsAndField current = termsAndFields[i];
for (int j = current.start; j < current.end; j++)
{
spare.Offset = offsets[j];
spare.Length = offsets[j + 1] - offsets[j];
if (!first)
{
builder.Append(' ');
}
first = false;
builder.Append(current.field).Append(':');
builder.Append(spare.Utf8ToString());
}
}
return builder.ToString();
}
private sealed class TermsAndField
{
internal readonly int start;
internal readonly int end;
internal readonly string field;
internal TermsAndField(int start, int end, string field)
: base()
{
this.start = start;
this.end = end;
this.field = field;
}
public override int GetHashCode()
{
const int prime = 31;
int result = 1;
result = prime * result + ((field == null) ? 0 : field.GetHashCode());
result = prime * result + end;
result = prime * result + start;
return result;
}
public override bool Equals(object obj)
{
if (this == obj)
{
return true;
}
if (obj == null)
{
return false;
}
if (this.GetType() != obj.GetType())
{
return false;
}
var other = (TermsAndField)obj;
if (field == null)
{
if (other.field != null)
{
return false;
}
}
else if (!field.Equals(other.field, StringComparison.Ordinal))
{
return false;
}
if (end != other.end)
{
return false;
}
if (start != other.start)
{
return false;
}
return true;
}
}
private abstract class FieldAndTermEnum
{
// LUCENENET specific - removed field and changed Field property to protected set
protected BytesRef m_current;
public BytesRef Current => m_current;
public abstract bool MoveNext();
protected FieldAndTermEnum()
{
}
protected FieldAndTermEnum(string field)
{
this.Field = field;
}
public virtual string Field { get; protected set; }
}
}
}