blob: dc0d5b3a0b4b98dde78894df9291e441f2377b8f [file] [log] [blame]
using J2N.Text;
using Lucene.Net.Diagnostics;
using Lucene.Net.Index;
using Lucene.Net.Util;
using System;
using System.Collections.Generic;
using Console = Lucene.Net.Util.SystemConsole;
using JCG = J2N.Collections.Generic;
namespace Lucene.Net.Codecs.Lucene3x
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using BytesRef = Lucene.Net.Util.BytesRef;
using Directory = Lucene.Net.Store.Directory;
using DocsAndPositionsEnum = Lucene.Net.Index.DocsAndPositionsEnum;
using DocsEnum = Lucene.Net.Index.DocsEnum;
using FieldInfo = Lucene.Net.Index.FieldInfo;
using FieldInfos = Lucene.Net.Index.FieldInfos;
using IBits = Lucene.Net.Util.IBits;
using IndexFileNames = Lucene.Net.Index.IndexFileNames;
using IndexInput = Lucene.Net.Store.IndexInput;
using IndexOptions = Lucene.Net.Index.IndexOptions;
using IOContext = Lucene.Net.Store.IOContext;
using IOUtils = Lucene.Net.Util.IOUtils;
using SegmentInfo = Lucene.Net.Index.SegmentInfo;
using Term = Lucene.Net.Index.Term;
using Terms = Lucene.Net.Index.Terms;
using TermsEnum = Lucene.Net.Index.TermsEnum;
using UnicodeUtil = Lucene.Net.Util.UnicodeUtil;
/// <summary>
/// Exposes flex API on a pre-flex index, as a codec.
/// <para/>
/// @lucene.experimental
/// </summary>
[Obsolete("(4.0)")]
internal class Lucene3xFields : FieldsProducer
{
#pragma warning disable CA1802 // Use literals where appropriate
// Compile-time switch: set to true to trace the surrogate dance on the console.
private static readonly bool DEBUG_SURROGATES = false;
#pragma warning restore CA1802 // Use literals where appropriate

// Terms dictionary reader backed by the terms index (normal case):
public TermInfosReader Tis { get; set; }
// Reader without a terms index (see constructor; that branch is currently unreachable there):
public TermInfosReader TisNoIndex { get; private set; }

// Open .frq / .prx inputs, handed to the doc/pos enums created below:
public IndexInput FreqStream { get; private set; }
public IndexInput ProxStream { get; private set; }

private readonly FieldInfos fieldInfos;
//private readonly SegmentInfo si; // LUCENENET: Never read

// Indexed fields by name, and their pre-flex Terms wrappers.
// LUCENENET specific: Use StringComparer.Ordinal to get the same ordering as Java
internal readonly IDictionary<string, FieldInfo> fields = new JCG.SortedDictionary<string, FieldInfo>(StringComparer.Ordinal);
internal readonly IDictionary<string, Terms> preTerms = new Dictionary<string, Terms>();
//private readonly Directory dir; // LUCENENET: Never read
//private readonly IOContext context; // LUCENENET: Never read
//private Directory cfsReader; // LUCENENET NOTE: cfsReader not used
/// <summary>
/// Opens the pre-flex postings for <paramref name="info"/>: the terms
/// dictionary (with index), the .frq stream and — when any field indexes
/// positions — the .prx stream, and builds the per-field <c>preTerms</c> map.
/// If anything fails part-way, everything opened so far is disposed.
/// </summary>
public Lucene3xFields(Directory dir, FieldInfos fieldInfos, SegmentInfo info, IOContext context, int indexDivisor)
{
    //si = info; // LUCENENET: Never read

    // NOTE: we must always load terms index, even for
    // "sequential" scan during merging, because what is
    // sequential to merger may not be to TermInfosReader
    // since we do the surrogates dance:
    if (indexDivisor < 0)
    {
        indexDivisor = -indexDivisor;
    }

    bool success = false;
    try
    {
        var r = new TermInfosReader(dir, info.Name, fieldInfos, context, indexDivisor);
        // NOTE(review): after the negation above, indexDivisor can never be -1
        // here (any negative value, including -1, was flipped positive), so
        // this branch is unreachable and TisNoIndex is never set. The same
        // dead branch exists in the upstream Java code, so it is kept as-is.
        if (indexDivisor == -1)
        {
            TisNoIndex = r;
        }
        else
        {
            TisNoIndex = null;
            Tis = r;
        }
        //this.context = context; // LUCENENET: Never read
        this.fieldInfos = fieldInfos;

        // make sure that all index files have been read or are kept open
        // so that if an index update removes them we'll still have them
        FreqStream = dir.OpenInput(IndexFileNames.SegmentFileName(info.Name, "", Lucene3xPostingsFormat.FREQ_EXTENSION), context);
        bool anyProx = false;
        foreach (FieldInfo fi in fieldInfos)
        {
            if (fi.IsIndexed)
            {
                fields[fi.Name] = fi;
                preTerms[fi.Name] = new PreTerms(this, fi);
                if (fi.IndexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
                {
                    anyProx = true;
                }
            }
        }

        // Only open the .prx stream when at least one field stores positions:
        if (anyProx)
        {
            ProxStream = dir.OpenInput(IndexFileNames.SegmentFileName(info.Name, "", Lucene3xPostingsFormat.PROX_EXTENSION), context);
        }
        else
        {
            ProxStream = null;
        }

        success = true;
    }
    finally
    {
        // With lock-less commits, it's entirely possible (and
        // fine) to hit a FileNotFound exception above. In
        // this case, we want to explicitly close any subset
        // of things that were opened so that we don't have to
        // wait for a GC to do so.
        if (!success)
        {
            Dispose();
        }
    }
    //this.dir = dir; // LUCENENET: Never read
}
// If this returns true, we do the surrogates dance so that the
// terms are sorted by unicode sort order. This should be
// true when segments are used for "normal" searching;
// it's only false during testing, to create a pre-flex
// index, using the test-only PreFlexRW.
protected virtual bool SortTermsByUnicode
{
    get { return true; }
}
/// <summary>Enumerates the indexed field names (Ordinal-sorted, matching Java).</summary>
public override IEnumerator<string> GetEnumerator()
{
    foreach (string fieldName in fields.Keys)
    {
        yield return fieldName;
    }
}
/// <summary>Returns the <see cref="Terms"/> for <paramref name="field"/>, or null if the field was not indexed.</summary>
public override Terms GetTerms(string field)
{
    return preTerms.TryGetValue(field, out Terms terms) ? terms : null;
}
/// <summary>Number of indexed fields in this segment.</summary>
public override int Count
{
    get
    {
        int fieldCount = fields.Count;
        // fields and preTerms are filled in lock-step by the constructor:
        if (Debugging.AssertsEnabled) Debugging.Assert(preTerms.Count == fieldCount);
        return fieldCount;
    }
}
/// <summary>Total number of terms across all fields, straight from the terms dictionary.</summary>
[Obsolete("iterate fields and add their Count instead.")]
public override long UniqueTermCount
{
    get { return TermsDict.Count; }
}
// The active terms dictionary reader: prefer the index-backed one,
// falling back to the index-less reader.
private TermInfosReader TermsDict
{
    get
    {
        lock (this)
        {
            return Tis ?? TisNoIndex;
        }
    }
}
/// <summary>Releases the terms dictionary readers and the open postings streams.</summary>
protected override void Dispose(bool disposing)
{
    if (!disposing)
    {
        return;
    }
    IOUtils.Dispose(Tis, TisNoIndex, /*cfsReader,*/ FreqStream, ProxStream); // LUCENENET NOTE: cfsReader not used
}
/// <summary>
/// <see cref="Terms"/> view over a single pre-flex field; each enumerator is a
/// fresh <see cref="PreTermsEnum"/> reset to this field.
/// </summary>
private class PreTerms : Terms
{
    private readonly Lucene3xFields outerInstance;
    internal readonly FieldInfo fieldInfo;

    internal PreTerms(Lucene3xFields outerInstance, FieldInfo fieldInfo)
    {
        this.outerInstance = outerInstance;
        this.fieldInfo = fieldInfo;
    }

    public override TermsEnum GetEnumerator()
    {
        var result = new PreTermsEnum(outerInstance);
        result.Reset(fieldInfo);
        return result;
    }

    // Pre-flex indexes always sorted in UTF16 order, but
    // we remap on-the-fly to unicode order
    public override IComparer<BytesRef> Comparer
        => outerInstance.SortTermsByUnicode
            ? BytesRef.UTF8SortedAsUnicodeComparer
            : BytesRef.UTF8SortedAsUTF16Comparer;

    // Pre-flex indexes don't record these statistics:
    public override long Count => -1;

    public override long SumTotalTermFreq => -1;

    public override long SumDocFreq => -1;

    public override int DocCount => -1;

    // LUCENENET specific - to avoid boxing, changed from CompareTo() to IndexOptionsComparer.Compare()
    public override bool HasFreqs => IndexOptionsComparer.Default.Compare(fieldInfo.IndexOptions, IndexOptions.DOCS_AND_FREQS) >= 0;

    public override bool HasOffsets
    {
        get
        {
            // preflex doesn't support this
            // LUCENENET specific - to avoid boxing, changed from CompareTo() to IndexOptionsComparer.Compare()
            if (Debugging.AssertsEnabled) Debugging.Assert(IndexOptionsComparer.Default.Compare(fieldInfo.IndexOptions, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) < 0);
            return false;
        }
    }

    // LUCENENET specific - to avoid boxing, changed from CompareTo() to IndexOptionsComparer.Compare()
    public override bool HasPositions => IndexOptionsComparer.Default.Compare(fieldInfo.IndexOptions, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;

    public override bool HasPayloads => fieldInfo.HasPayloads;
}
private class PreTermsEnum : TermsEnum
{
private readonly Lucene3xFields outerInstance;

public PreTermsEnum(Lucene3xFields outerInstance)
{
    this.outerInstance = outerInstance;
}

// Primary enum over the terms dictionary, positioned on the current term:
private SegmentTermEnum termEnum;
// Field whose terms are being enumerated; set by Reset():
private FieldInfo fieldInfo;
// Interned fieldInfo.Name; allows reference (==/!=) field comparisons below:
private string internedFieldName;
// True right after Reset(): the enum is already sitting on the first term:
private bool skipNext;
// Bytes of the current term, exposed via the Term property:
private BytesRef current;
// Secondary enum used for the probing seeks of the surrogate dance:
private SegmentTermEnum seekTermEnum;
// Lead byte of a 4-byte UTF8 sequence (bits 11110xxx): a codepoint outside
// the BMP, i.e. a surrogate pair in UTF16.
private const sbyte UTF8_NON_BMP_LEAD = unchecked((sbyte)0xf0);
// Lead byte pattern for BMP chars >= U+E000 (above the surrogate range).
private const sbyte UTF8_HIGH_BMP_LEAD = unchecked((sbyte)0xee);

// Returns true if the unicode char starting at idx is "after" the
// surrogates in UTF16, ie >= U+E000 and <= U+FFFF:
private static bool IsHighBMPChar(byte[] b, int idx)
{
    int lead = (sbyte)b[idx];
    return (lead & UTF8_HIGH_BMP_LEAD) == UTF8_HIGH_BMP_LEAD;
}

// Returns true if the unicode char in the UTF8 byte sequence starting at
// idx encodes a char outside of BMP (ie what would be a surrogate pair
// in UTF16):
private static bool IsNonBMPChar(byte[] b, int idx)
{
    int lead = (sbyte)b[idx];
    return (lead & UTF8_NON_BMP_LEAD) == UTF8_NON_BMP_LEAD;
}
// Scratch for saving/restoring the 3 bytes a probe seek overwrites:
private readonly sbyte[] scratch = new sbyte[4];
// Previous term's bytes, used to compute the changed suffix:
private readonly BytesRef prevTerm = new BytesRef();
// Working copy of the current term's bytes, mutated during the dance:
private readonly BytesRef scratchTerm = new BytesRef();
// Byte position where the current term's suffix diverges from prevTerm:
private int newSuffixStart;
// Swap in S, in place of E:
// At byte position pos, term holds a "high BMP" (E) sequence; temporarily
// overwrite it with the smallest non-BMP (S) sequence (U+10000 = f0 90 80 80),
// seek te there, and report whether a term with the identical prefix and a
// non-BMP char at pos exists. term is always restored before returning.
private bool SeekToNonBMP(SegmentTermEnum te, BytesRef term, int pos)
{
    int savLength = term.Length;

    if (Debugging.AssertsEnabled) Debugging.Assert(term.Offset == 0);

    // The 3 bytes starting at downTo make up 1
    // unicode character:
    if (Debugging.AssertsEnabled) Debugging.Assert(IsHighBMPChar(term.Bytes, pos));

    // NOTE: we cannot make this assert, because
    // AutomatonQuery legitimately sends us malformed UTF8
    // (eg the UTF8 bytes with just 0xee)
    // assert term.length >= pos + 3: "term.length=" + term.length + " pos+3=" + (pos+3) + " byte=" + Integer.toHexString(term.bytes[pos]) + " term=" + term.toString();

    // Save the bytes && length, since we need to
    // restore this if seek "back" finds no matching
    // terms
    if (term.Bytes.Length < 4 + pos)
    {
        term.Grow(4 + pos);
    }

    scratch[0] = (sbyte)term.Bytes[pos];
    scratch[1] = (sbyte)term.Bytes[pos + 1];
    scratch[2] = (sbyte)term.Bytes[pos + 2];

    // UTF8 for U+10000, the first codepoint after the BMP:
    term.Bytes[pos] = 0xf0;
    term.Bytes[pos + 1] = 0x90;
    term.Bytes[pos + 2] = 0x80;
    term.Bytes[pos + 3] = 0x80;
    term.Length = 4 + pos;

    if (DEBUG_SURROGATES)
    {
        Console.WriteLine(" try seek term=" + UnicodeUtil.ToHexString(term.Utf8ToString()));
    }

    // Seek "back":
    outerInstance.TermsDict.SeekEnum(te, new Term(fieldInfo.Name, term), true);

    // Test if the term we seek'd to in fact found a
    // surrogate pair at the same position as the E:
    Term t2 = te.Term();

    // Cannot be null (or move to next field) because at
    // "worst" it'd seek to the same term we are on now,
    // unless we are being called from seek
    if (t2 == null || t2.Field != internedFieldName)
    {
        return false;
    }

    if (DEBUG_SURROGATES)
    {
        Console.WriteLine(" got term=" + UnicodeUtil.ToHexString(t2.Text()));
    }

    // Now test if prefix is identical and we found
    // a non-BMP char at the same position:
    BytesRef b2 = t2.Bytes;
    if (Debugging.AssertsEnabled) Debugging.Assert(b2.Offset == 0);

    bool matches;
    if (b2.Length >= term.Length && IsNonBMPChar(b2.Bytes, pos))
    {
        matches = true;
        for (int i = 0; i < pos; i++)
        {
            if (term.Bytes[i] != b2.Bytes[i])
            {
                matches = false;
                break;
            }
        }
    }
    else
    {
        matches = false;
    }

    // Restore term:
    term.Length = savLength;
    term.Bytes[pos] = (byte)scratch[0];
    term.Bytes[pos + 1] = (byte)scratch[1];
    term.Bytes[pos + 2] = (byte)scratch[2];

    return matches;
}
// Seek type 2 "continue" (back to the start of the
// surrogates): scan the stripped suffix from the
// prior term, backwards. If there was an E in that
// part, then we try to seek back to S. If that
// seek finds a matching term, we go there.
// Returns true if termEnum was actually moved.
private bool DoContinue()
{
    if (DEBUG_SURROGATES)
    {
        Console.WriteLine(" try cont");
    }

    int downTo = prevTerm.Length - 1;

    bool didSeek = false;

    // Only the portion of prevTerm that was stripped can contain an E we
    // still owe a visit to; below this limit nothing changed:
    int limit = Math.Min(newSuffixStart, scratchTerm.Length - 1);

    while (downTo > limit)
    {
        if (IsHighBMPChar(prevTerm.Bytes, downTo))
        {
            if (DEBUG_SURROGATES)
            {
                Console.WriteLine(" found E pos=" + downTo + " vs len=" + prevTerm.Length);
            }

            if (SeekToNonBMP(seekTermEnum, prevTerm, downTo))
            {
                // TODO: more efficient seek?
                outerInstance.TermsDict.SeekEnum(termEnum, seekTermEnum.Term(), true);
                //newSuffixStart = downTo+4;
                newSuffixStart = downTo;
                scratchTerm.CopyBytes(termEnum.Term().Bytes);
                didSeek = true;
                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine(" seek!");
                }
                break;
            }
            else
            {
                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine(" no seek");
                }
            }
        }

        // Shorten prevTerm in place so that we don't redo
        // this loop if we come back here:
        // (only shorten at a UTF8 char boundary: lead byte 11xxxxxx or ASCII)
        if ((prevTerm.Bytes[downTo] & 0xc0) == 0xc0 || (prevTerm.Bytes[downTo] & 0x80) == 0)
        {
            prevTerm.Length = downTo;
        }

        downTo--;
    }

    return didSeek;
}
// Look for seek type 3 ("pop"): if the delta from
// prev -> current was replacing an S with an E,
// we must now seek to beyond that E. this seek
// "finishes" the dance at this character
// position.
// Returns true if a seek was performed (or the field's terms were exhausted).
private bool DoPop()
{
    if (DEBUG_SURROGATES)
    {
        Console.WriteLine(" try pop");
    }

    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(newSuffixStart <= prevTerm.Length);
        Debugging.Assert(newSuffixStart < scratchTerm.Length || newSuffixStart == 0);
    }

    if (prevTerm.Length > newSuffixStart && IsNonBMPChar(prevTerm.Bytes, newSuffixStart) && IsHighBMPChar(scratchTerm.Bytes, newSuffixStart))
    {
        // Seek type 2 -- put 0xFF at this position:
        // (0xFF is not a valid UTF8 byte, so this seeks just past every
        // term sharing the prefix up to newSuffixStart)
        scratchTerm.Bytes[newSuffixStart] = 0xff;
        scratchTerm.Length = newSuffixStart + 1;

        if (DEBUG_SURROGATES)
        {
            Console.WriteLine(" seek to term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString()) + " " + scratchTerm.ToString());
        }

        // TODO: more efficient seek? can we simply swap
        // the enums?
        outerInstance.TermsDict.SeekEnum(termEnum, new Term(fieldInfo.Name, scratchTerm), true);

        Term t2 = termEnum.Term();

        // We could hit EOF or different field since this
        // was a seek "forward":
        if (t2 != null && t2.Field == internedFieldName)
        {
            if (DEBUG_SURROGATES)
            {
                Console.WriteLine(" got term=" + UnicodeUtil.ToHexString(t2.Text()) + " " + t2.Bytes);
            }

            BytesRef b2 = t2.Bytes;
            if (Debugging.AssertsEnabled) Debugging.Assert(b2.Offset == 0);

            // Set newSuffixStart -- we can't use
            // termEnum's since the above seek may have
            // done no scanning (eg, term was precisely
            // and index term, or, was in the term seek
            // cache):
            scratchTerm.CopyBytes(b2);
            SetNewSuffixStart(prevTerm, scratchTerm);

            return true;
        }
        else if (newSuffixStart != 0 || scratchTerm.Length != 0)
        {
            if (DEBUG_SURROGATES)
            {
                Console.WriteLine(" got term=null (or next field)");
            }
            newSuffixStart = 0;
            scratchTerm.Length = 0;
            return true;
        }
    }

    return false;
}
// Pre-flex indices store terms in UTF16 sort order, but
// certain queries require Unicode codepoint order; this
// method carefully seeks around surrogates to handle
// this impedance mismatch.
// Called after the underlying enum advanced (in UTF16 order); may reseek
// termEnum so the visible "next" term follows unicode order instead.
private void SurrogateDance()
{
    // No remapping requested (test-only pre-flex writing): nothing to do.
    if (!unicodeSortOrder)
    {
        return;
    }

    // We are invoked after TIS.next() (by UTF16 order) to
    // possibly seek to a different "next" (by unicode
    // order) term.

    // We scan only the "delta" from the last term to the
    // current term, in UTF8 bytes. We look at 1) the bytes
    // stripped from the prior term, and then 2) the bytes
    // appended to that prior term's prefix.

    // We don't care about specific UTF8 sequences, just
    // the "category" of the UTF16 character. Category S
    // is a high/low surrogate pair (it non-BMP).
    // Category E is any BMP char > UNI_SUR_LOW_END (and <
    // U+FFFF). Category A is the rest (any unicode char
    // <= UNI_SUR_HIGH_START).

    // The core issue is that pre-flex indices sort the
    // characters as ASE, while flex must sort as AES. So
    // when scanning, when we hit S, we must 1) seek
    // forward to E and enum the terms there, then 2) seek
    // back to S and enum all terms there, then 3) seek to
    // after E. Three different seek points (1, 2, 3).

    // We can easily detect S in UTF8: if a byte has
    // prefix 11110 (0xf0), then that byte and the
    // following 3 bytes encode a single unicode codepoint
    // in S. Similarly, we can detect E: if a byte has
    // prefix 1110111 (0xee), then that byte and the
    // following 2 bytes encode a single unicode codepoint
    // in E.

    // Note that this is really a recursive process --
    // maybe the char at pos 2 needs to dance, but any
    // point in its dance, suddenly pos 4 needs to dance
    // so you must finish pos 4 before returning to pos
    // 2. But then during pos 4's dance maybe pos 7 needs
    // to dance, etc. However, despite being recursive,
    // we don't need to hold any state because the state
    // can always be derived by looking at prior term &
    // current term.

    // TODO: can we avoid this copy?
    if (termEnum.Term() == null || termEnum.Term().Field != internedFieldName)
    {
        scratchTerm.Length = 0;
    }
    else
    {
        scratchTerm.CopyBytes(termEnum.Term().Bytes);
    }

    if (DEBUG_SURROGATES)
    {
        Console.WriteLine(" dance");
        Console.WriteLine(" prev=" + UnicodeUtil.ToHexString(prevTerm.Utf8ToString()));
        Console.WriteLine(" " + prevTerm.ToString());
        Console.WriteLine(" term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString()));
        Console.WriteLine(" " + scratchTerm.ToString());
    }

    // this code assumes TermInfosReader/SegmentTermEnum
    // always use BytesRef.offset == 0
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(prevTerm.Offset == 0);
        Debugging.Assert(scratchTerm.Offset == 0);
    }

    // Need to loop here because we may need to do multiple
    // pops, and possibly a continue in the end, ie:
    //
    // cont
    // pop, cont
    // pop, pop, cont
    // <nothing>
    //
    while (true)
    {
        if (DoContinue())
        {
            break;
        }
        else
        {
            if (!DoPop())
            {
                break;
            }
        }
    }

    if (DEBUG_SURROGATES)
    {
        Console.WriteLine(" finish bmp ends");
    }

    DoPushes();
}
// Look for seek type 1 ("push"): if the newly added
// suffix contains any S, we must try to seek to the
// corresponding E. If we find a match, we go there;
// else we keep looking for additional S's in the new
// suffix. this "starts" the dance, at this character
// position:
private void DoPushes()
{
    int upTo = newSuffixStart;
    if (DEBUG_SURROGATES)
    {
        Console.WriteLine(" try push newSuffixStart=" + newSuffixStart + " scratchLen=" + scratchTerm.Length);
    }

    while (upTo < scratchTerm.Length)
    {
        // Dance only at an S that is "new" in this suffix: either past
        // newSuffixStart, or where prevTerm did not already have an S or E
        // at the same byte position:
        if (IsNonBMPChar(scratchTerm.Bytes, upTo) && (upTo > newSuffixStart || (upTo >= prevTerm.Length || (!IsNonBMPChar(prevTerm.Bytes, upTo) && !IsHighBMPChar(prevTerm.Bytes, upTo)))))
        {
            // A non-BMP char (4 bytes UTF8) starts here:
            if (Debugging.AssertsEnabled) Debugging.Assert(scratchTerm.Length >= upTo + 4);

            int savLength = scratchTerm.Length;
            scratch[0] = (sbyte)scratchTerm.Bytes[upTo];
            scratch[1] = (sbyte)scratchTerm.Bytes[upTo + 1];
            scratch[2] = (sbyte)scratchTerm.Bytes[upTo + 2];

            // Swap in the smallest E (U+E000 = ee 80 80) and truncate:
            scratchTerm.Bytes[upTo] = unchecked((byte)UTF8_HIGH_BMP_LEAD);
            scratchTerm.Bytes[upTo + 1] = 0x80;
            scratchTerm.Bytes[upTo + 2] = 0x80;
            scratchTerm.Length = upTo + 3;

            if (DEBUG_SURROGATES)
            {
                Console.WriteLine(" try seek 1 pos=" + upTo + " term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString()) + " " + scratchTerm.ToString() + " len=" + scratchTerm.Length);
            }

            // Seek "forward":
            // TODO: more efficient seek?
            outerInstance.TermsDict.SeekEnum(seekTermEnum, new Term(fieldInfo.Name, scratchTerm), true);

            // Restore the S bytes we overwrote above:
            scratchTerm.Bytes[upTo] = (byte)scratch[0];
            scratchTerm.Bytes[upTo + 1] = (byte)scratch[1];
            scratchTerm.Bytes[upTo + 2] = (byte)scratch[2];
            scratchTerm.Length = savLength;

            // Did we find a match?
            Term t2 = seekTermEnum.Term();

            if (DEBUG_SURROGATES)
            {
                if (t2 == null)
                {
                    Console.WriteLine(" hit term=null");
                }
                else
                {
                    Console.WriteLine($" hit term={UnicodeUtil.ToHexString(t2.Text())} {t2?.Bytes}");
                }
            }

            // Since this was a seek "forward", we could hit
            // EOF or a different field:
            bool matches;

            if (t2 != null && t2.Field == internedFieldName)
            {
                BytesRef b2 = t2.Bytes;
                if (Debugging.AssertsEnabled) Debugging.Assert(b2.Offset == 0);

                // Matches only when the found term has an E at the same
                // position and shares the entire prefix before it:
                if (b2.Length >= upTo + 3 && IsHighBMPChar(b2.Bytes, upTo))
                {
                    matches = true;
                    for (int i = 0; i < upTo; i++)
                    {
                        if (scratchTerm.Bytes[i] != b2.Bytes[i])
                        {
                            matches = false;
                            break;
                        }
                    }
                }
                else
                {
                    matches = false;
                }
            }
            else
            {
                matches = false;
            }

            if (matches)
            {
                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine(" matches!");
                }

                // OK seek "back"
                // TODO: more efficient seek?
                outerInstance.TermsDict.SeekEnum(termEnum, seekTermEnum.Term(), true);

                scratchTerm.CopyBytes(seekTermEnum.Term().Bytes);

                // +3 because we don't need to check the char
                // at upTo: we know it's > BMP
                upTo += 3;

                // NOTE: we keep iterating, now, since this
                // can easily "recurse". Ie, after seeking
                // forward at a certain char position, we may
                // find another surrogate in our [new] suffix
                // and must then do another seek (recurse)
            }
            else
            {
                upTo++;
            }
        }
        else
        {
            upTo++;
        }
    }
}
// True when terms must be surfaced in unicode (codepoint) order instead
// of the index's native UTF16 order; set from SortTermsByUnicode in Reset:
private bool unicodeSortOrder;

/// <summary>
/// (Re)positions this enum at the start of <paramref name="fieldInfo"/>'s terms.
/// </summary>
internal virtual void Reset(FieldInfo fieldInfo)
{
    //System.out.println("pff.reset te=" + termEnum);
    this.fieldInfo = fieldInfo;

    // Interned so that the Field != internedFieldName reference comparisons
    // used throughout this enum are valid:
    internedFieldName = fieldInfo.Name.Intern();

    Term term = new Term(internedFieldName);
    if (termEnum == null)
    {
        termEnum = outerInstance.TermsDict.Terms(term);
        seekTermEnum = outerInstance.TermsDict.Terms(term);
        //System.out.println(" term=" + termEnum.term());
    }
    else
    {
        outerInstance.TermsDict.SeekEnum(termEnum, term, true);
    }

    // The enum now already sits on the first term; MoveNext() must not
    // advance past it on its first call:
    skipNext = true;

    unicodeSortOrder = outerInstance.SortTermsByUnicode;

    Term t = termEnum.Term();
    if (t != null && t.Field == internedFieldName)
    {
        newSuffixStart = 0;
        prevTerm.Length = 0;
        SurrogateDance();
    }
}
// Pre-flex indexes always sorted in UTF16 order, but
// we remap on-the-fly to unicode order when unicodeSortOrder is set:
public override IComparer<BytesRef> Comparer
    => unicodeSortOrder
        ? BytesRef.UTF8SortedAsUnicodeComparer
        : BytesRef.UTF8SortedAsUTF16Comparer;
// Pre-flex terms dictionaries have no term ordinals:
public override void SeekExact(long ord)
    => throw new NotSupportedException();

public override long Ord => throw new NotSupportedException();
/// <summary>
/// Seeks to the smallest term >= <paramref name="term"/> in the active sort
/// order, running the surrogate dance when the match is not exact.
/// </summary>
public override SeekStatus SeekCeil(BytesRef term)
{
    if (DEBUG_SURROGATES)
    {
        Console.WriteLine("TE.seek target=" + UnicodeUtil.ToHexString(term.Utf8ToString()));
    }
    skipNext = false;
    TermInfosReader tis = outerInstance.TermsDict;
    Term t0 = new Term(fieldInfo.Name, term);

    if (Debugging.AssertsEnabled) Debugging.Assert(termEnum != null);

    tis.SeekEnum(termEnum, t0, false);

    Term t = termEnum.Term();

    if (t != null && t.Field == internedFieldName && term.BytesEquals(t.Bytes))
    {
        // If we found an exact match, no need to do the
        // surrogate dance
        if (DEBUG_SURROGATES)
        {
            Console.WriteLine(" seek exact match");
        }
        current = t.Bytes;
        return SeekStatus.FOUND;
    }
    else if (t == null || t.Field != internedFieldName)
    {
        // TODO: maybe we can handle this like the next()
        // into null? set term as prevTerm then dance?

        if (DEBUG_SURROGATES)
        {
            Console.WriteLine(" seek hit EOF");
        }

        // We hit EOF; try end-case surrogate dance: if we
        // find an E, try swapping in S, backwards:
        scratchTerm.CopyBytes(term);

        if (Debugging.AssertsEnabled) Debugging.Assert(scratchTerm.Offset == 0);

        for (int i = scratchTerm.Length - 1; i >= 0; i--)
        {
            if (IsHighBMPChar(scratchTerm.Bytes, i))
            {
                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine(" found E pos=" + i + "; try seek");
                }

                if (SeekToNonBMP(seekTermEnum, scratchTerm, i))
                {
                    scratchTerm.CopyBytes(seekTermEnum.Term().Bytes);
                    outerInstance.TermsDict.SeekEnum(termEnum, seekTermEnum.Term(), false);

                    newSuffixStart = 1 + i;

                    DoPushes();

                    // Found a match
                    // TODO: faster seek?
                    current = termEnum.Term().Bytes;
                    return SeekStatus.NOT_FOUND;
                }
            }
        }

        if (DEBUG_SURROGATES)
        {
            Console.WriteLine(" seek END");
        }

        current = null;
        return SeekStatus.END;
    }
    else
    {
        // We found a non-exact but non-null term; this one
        // is fun -- just treat it like next, by pretending
        // requested term was prev:
        prevTerm.CopyBytes(term);

        if (DEBUG_SURROGATES)
        {
            Console.WriteLine(" seek hit non-exact term=" + UnicodeUtil.ToHexString(t.Text()));
        }

        BytesRef br = t.Bytes;
        if (Debugging.AssertsEnabled) Debugging.Assert(br.Offset == 0);

        SetNewSuffixStart(term, br);

        SurrogateDance();

        Term t2 = termEnum.Term();
        if (t2 == null || t2.Field != internedFieldName)
        {
            // PreFlex codec interns field names; verify:
            if (Debugging.AssertsEnabled) Debugging.Assert(t2 == null || !t2.Field.Equals(internedFieldName, StringComparison.Ordinal));
            current = null;
            return SeekStatus.END;
        }
        else
        {
            current = t2.Bytes;
            if (Debugging.AssertsEnabled) Debugging.Assert(!unicodeSortOrder || term.CompareTo(current) < 0,"term={0} vs current={1}",
                // LUCENENET specific - use wrapper BytesRefFormatter struct to defer building the string unless string.Format() is called
                new BytesRefFormatter(term, BytesRefFormat.UTF8AsHex), new BytesRefFormatter(current, BytesRefFormat.UTF8AsHex));
            return SeekStatus.NOT_FOUND;
        }
    }
}
// Computes newSuffixStart: the start of the UTF8 character containing the
// first byte where br1 and br2 differ (or the shorter length when one is
// a prefix of the other).
private void SetNewSuffixStart(BytesRef br1, BytesRef br2)
{
    int limit = Math.Min(br1.Length, br2.Length);
    int suffixStart = limit;
    int charStart = 0;
    for (int i = 0; i < limit; i++)
    {
        byte b1 = br1.Bytes[br1.Offset + i];
        // Track the most recent UTF8 char boundary (lead byte 11xxxxxx or ASCII):
        if ((b1 & 0xc0) == 0xc0 || (b1 & 0x80) == 0)
        {
            charStart = i;
        }
        if (b1 != br2.Bytes[br2.Offset + i])
        {
            suffixStart = charStart;
            break;
        }
    }
    newSuffixStart = suffixStart;
    if (DEBUG_SURROGATES)
    {
        Console.WriteLine(" set newSuffixStart=" + newSuffixStart);
    }
}
/// <summary>
/// Advances to the next term (in the active sort order), updating the term
/// exposed by <see cref="Term"/>; returns false when this field is exhausted.
/// </summary>
public override bool MoveNext()
{
    if (DEBUG_SURROGATES)
    {
        Console.WriteLine("TE.MoveNext()");
    }

    // Right after Reset()/a rewind the enum is already positioned on the
    // first term, so consume it without advancing:
    if (skipNext)
    {
        if (DEBUG_SURROGATES)
        {
            Console.WriteLine(" skipNext=true");
        }
        skipNext = false;
        if (termEnum.Term() == null)
        {
            return false;
            // PreFlex codec interns field names:
        }
        else if (termEnum.Term().Field != internedFieldName)
        {
            return false;
        }
        else
        {
            current = termEnum.Term().Bytes;
            return true;
        }
    }

    // TODO: can we use STE's prevBuffer here?
    prevTerm.CopyBytes(termEnum.Term().Bytes);

    if (termEnum.Next() && termEnum.Term().Field == internedFieldName)
    {
        newSuffixStart = termEnum.newSuffixStart;
        if (DEBUG_SURROGATES)
        {
            Console.WriteLine(" newSuffixStart=" + newSuffixStart);
        }
        SurrogateDance();
        Term t = termEnum.Term();
        if (t == null || t.Field != internedFieldName)
        {
            // PreFlex codec interns field names; verify:
            if (Debugging.AssertsEnabled) Debugging.Assert(t == null || !t.Field.Equals(internedFieldName, StringComparison.Ordinal));
            current = null;
            return false;
        }
        else
        {
            current = t.Bytes;
            return true;
        }
    }
    else
    {
        // this field is exhausted, but we have to give
        // surrogateDance a chance to seek back:
        if (DEBUG_SURROGATES)
        {
            Console.WriteLine(" force cont");
        }
        //newSuffixStart = prevTerm.length;
        newSuffixStart = 0;
        SurrogateDance();

        Term t = termEnum.Term();
        if (t == null || t.Field != internedFieldName)
        {
            // PreFlex codec interns field names; verify:
            if (Debugging.AssertsEnabled) Debugging.Assert(t == null || !t.Field.Equals(internedFieldName, StringComparison.Ordinal));
            return false;
        }
        else
        {
            current = t.Bytes;
            return true;
        }
    }
}
/// <summary>Legacy iteration API; delegates to <see cref="MoveNext()"/>.</summary>
[Obsolete("Use MoveNext() and Term instead. This method will be removed in 4.8.0 release candidate."), System.ComponentModel.EditorBrowsable(System.ComponentModel.EditorBrowsableState.Never)]
public override BytesRef Next()
{
    return MoveNext() ? current : null;
}
// Bytes of the term this enum is currently positioned on:
public override BytesRef Term
{
    get { return current; }
}

public override int DocFreq
{
    get { return termEnum.DocFreq; }
}

// Pre-flex indexes don't record total term frequency:
public override long TotalTermFreq
{
    get { return -1; }
}
public override DocsEnum Docs(IBits liveDocs, DocsEnum reuse, DocsFlags flags)
{
    // Reuse the incoming enum only when it is one of ours and shares our
    // freq stream; otherwise allocate a fresh one.
    PreDocsEnum docsEnum = reuse as PreDocsEnum;
    if (docsEnum == null || docsEnum.FreqStream != outerInstance.FreqStream)
    {
        docsEnum = new PreDocsEnum(outerInstance);
    }
    return docsEnum.Reset(termEnum, liveDocs);
}
public override DocsAndPositionsEnum DocsAndPositions(IBits liveDocs, DocsAndPositionsEnum reuse, DocsAndPositionsFlags flags)
{
    // Positions are only available when the field indexed them:
    if (fieldInfo.IndexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    {
        return null;
    }
    // Reuse the incoming enum only when it is one of ours and shares our
    // freq stream; otherwise allocate a fresh one.
    PreDocsAndPositionsEnum docsPosEnum = reuse as PreDocsAndPositionsEnum;
    if (docsPosEnum == null || docsPosEnum.FreqStream != outerInstance.FreqStream)
    {
        docsPosEnum = new PreDocsAndPositionsEnum(outerInstance);
    }
    return docsPosEnum.Reset(termEnum, liveDocs);
}
}
/// <summary>Docs-only enum over a pre-flex term's postings (.frq).</summary>
private sealed class PreDocsEnum : DocsEnum
{
    private readonly Lucene3xFields outerInstance;
    internal readonly SegmentTermDocs docs;
    private int docID = -1;

    internal PreDocsEnum(Lucene3xFields outerInstance)
    {
        this.outerInstance = outerInstance;
        docs = new SegmentTermDocs(outerInstance.FreqStream, outerInstance.TermsDict, outerInstance.fieldInfos);
    }

    // Lets the enclosing enum check whether this instance can be reused:
    internal IndexInput FreqStream => outerInstance.FreqStream;

    public PreDocsEnum Reset(SegmentTermEnum termEnum, IBits liveDocs)
    {
        docs.LiveDocs = liveDocs;
        docs.Seek(termEnum);
        docs.freq = 1;
        docID = -1;
        return this;
    }

    public override int NextDoc()
        => docID = docs.Next() ? docs.Doc : NO_MORE_DOCS;

    public override int Advance(int target)
        => docID = docs.SkipTo(target) ? docs.Doc : NO_MORE_DOCS;

    public override int Freq => docs.Freq;

    public override int DocID => docID;

    public override long GetCost() => docs.m_df;
}
/// <summary>Docs + positions enum over a pre-flex term's postings (.frq/.prx).</summary>
private sealed class PreDocsAndPositionsEnum : DocsAndPositionsEnum
{
    private readonly Lucene3xFields outerInstance;
    private readonly SegmentTermPositions pos;
    private int docID = -1;

    internal PreDocsAndPositionsEnum(Lucene3xFields outerInstance)
    {
        this.outerInstance = outerInstance;
        pos = new SegmentTermPositions(outerInstance.FreqStream, outerInstance.ProxStream, outerInstance.TermsDict, outerInstance.fieldInfos);
    }

    // Lets the enclosing enum check whether this instance can be reused:
    internal IndexInput FreqStream => outerInstance.FreqStream;

    public DocsAndPositionsEnum Reset(SegmentTermEnum termEnum, IBits liveDocs)
    {
        pos.LiveDocs = liveDocs;
        pos.Seek(termEnum);
        docID = -1;
        return this;
    }

    public override int NextDoc()
        => docID = pos.Next() ? pos.Doc : NO_MORE_DOCS;

    public override int Advance(int target)
        => docID = pos.SkipTo(target) ? pos.Doc : NO_MORE_DOCS;

    public override int Freq => pos.Freq;

    public override int DocID => docID;

    public override int NextPosition()
    {
        // Only valid while positioned on a real doc:
        if (Debugging.AssertsEnabled) Debugging.Assert(docID != NO_MORE_DOCS);
        return pos.NextPosition();
    }

    // Pre-flex indexes never store offsets:
    public override int StartOffset => -1;

    public override int EndOffset => -1;

    public override BytesRef GetPayload() => pos.GetPayload();

    public override long GetCost() => pos.m_df;
}
public override long RamBytesUsed()
{
    // When there is no terms index, there is almost nothing loaded into RAM.
    return Tis?.RamBytesUsed() ?? 0L;
}
/// <summary>
/// No-op: the pre-flex (3.x) file formats read here carry no checksums,
/// so there is nothing to verify.
/// </summary>
public override void CheckIntegrity()
{
}
}
}