| using J2N.Text; |
| using Lucene.Net.Diagnostics; |
| using Lucene.Net.Index; |
| using Lucene.Net.Util; |
| using System; |
| using System.Collections.Generic; |
| using Console = Lucene.Net.Util.SystemConsole; |
| using JCG = J2N.Collections.Generic; |
| |
| namespace Lucene.Net.Codecs.Lucene3x |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| using BytesRef = Lucene.Net.Util.BytesRef; |
| using Directory = Lucene.Net.Store.Directory; |
| using DocsAndPositionsEnum = Lucene.Net.Index.DocsAndPositionsEnum; |
| using DocsEnum = Lucene.Net.Index.DocsEnum; |
| using FieldInfo = Lucene.Net.Index.FieldInfo; |
| using FieldInfos = Lucene.Net.Index.FieldInfos; |
| using IBits = Lucene.Net.Util.IBits; |
| using IndexFileNames = Lucene.Net.Index.IndexFileNames; |
| using IndexInput = Lucene.Net.Store.IndexInput; |
| using IndexOptions = Lucene.Net.Index.IndexOptions; |
| using IOContext = Lucene.Net.Store.IOContext; |
| using IOUtils = Lucene.Net.Util.IOUtils; |
| using SegmentInfo = Lucene.Net.Index.SegmentInfo; |
| using Term = Lucene.Net.Index.Term; |
| using Terms = Lucene.Net.Index.Terms; |
| using TermsEnum = Lucene.Net.Index.TermsEnum; |
| using UnicodeUtil = Lucene.Net.Util.UnicodeUtil; |
| |
| /// <summary> |
| /// Exposes flex API on a pre-flex index, as a codec. |
| /// <para/> |
| /// @lucene.experimental |
| /// </summary> |
| [Obsolete("(4.0)")] |
| internal class Lucene3xFields : FieldsProducer |
| { |
| #pragma warning disable CA1802 // Use literals where appropriate |
| private static readonly bool DEBUG_SURROGATES = false; |
| #pragma warning restore CA1802 // Use literals where appropriate |
| |
        /// <summary>Term dictionary reader with a loaded terms index. Setter is public for test hooks.</summary>
        public TermInfosReader Tis { get; set; }
        /// <summary>Term dictionary reader without a terms index.
        /// NOTE(review): never assigned a non-null value by the constructor (dead branch there) — mirrors upstream Java.</summary>
        public TermInfosReader TisNoIndex { get; private set; }

        /// <summary>Open input over the segment's .frq file.</summary>
        public IndexInput FreqStream { get; private set; }
        /// <summary>Open input over the segment's .prx file, or <c>null</c> when no field indexes positions.</summary>
        public IndexInput ProxStream { get; private set; }
        private readonly FieldInfos fieldInfos;
        //private readonly SegmentInfo si; // LUCENENET: Never read

        // LUCENENET specific: Use StringComparer.Ordinal to get the same ordering as Java
        internal readonly IDictionary<string, FieldInfo> fields = new JCG.SortedDictionary<string, FieldInfo>(StringComparer.Ordinal);
        // Lazily-built per-field Terms views; populated in lock-step with 'fields' in the constructor.
        internal readonly IDictionary<string, Terms> preTerms = new Dictionary<string, Terms>();
        //private readonly Directory dir; // LUCENENET: Never read
        //private readonly IOContext context; // LUCENENET: Never read
        //private Directory cfsReader; // LUCENENET NOTE: cfsReader not used
| |
        /// <summary>
        /// Sole constructor. Opens the term dictionary plus the frq stream (and the
        /// prx stream when at least one field indexes positions) for the given
        /// pre-4.0 segment, and records each indexed field in <see cref="fields"/>
        /// and <see cref="preTerms"/>.
        /// </summary>
        /// <param name="dir">Directory holding the segment's files.</param>
        /// <param name="fieldInfos">Field metadata for the segment.</param>
        /// <param name="info">The segment to open.</param>
        /// <param name="context">IO context used when opening the inputs.</param>
        /// <param name="indexDivisor">Terms index divisor; a negative value is negated below.</param>
        public Lucene3xFields(Directory dir, FieldInfos fieldInfos, SegmentInfo info, IOContext context, int indexDivisor)
        {
            //si = info; // LUCENENET: Never read

            // NOTE: we must always load terms index, even for
            // "sequential" scan during merging, because what is
            // sequential to merger may not be to TermInfosReader
            // since we do the surrogates dance:
            if (indexDivisor < 0)
            {
                indexDivisor = -indexDivisor;
            }

            bool success = false;
            try
            {
                var r = new TermInfosReader(dir, info.Name, fieldInfos, context, indexDivisor);
                // NOTE(review): after the negation above, indexDivisor can never be -1
                // here, so the TisNoIndex branch is dead code; the same dead branch
                // exists in the upstream Java Lucene3xFields, so it is kept as-is.
                if (indexDivisor == -1)
                {
                    TisNoIndex = r;
                }
                else
                {
                    TisNoIndex = null;
                    Tis = r;
                }
                //this.context = context; // LUCENENET: Never read
                this.fieldInfos = fieldInfos;

                // make sure that all index files have been read or are kept open
                // so that if an index update removes them we'll still have them
                FreqStream = dir.OpenInput(IndexFileNames.SegmentFileName(info.Name, "", Lucene3xPostingsFormat.FREQ_EXTENSION), context);
                bool anyProx = false;
                foreach (FieldInfo fi in fieldInfos)
                {
                    if (fi.IsIndexed)
                    {
                        fields[fi.Name] = fi;
                        preTerms[fi.Name] = new PreTerms(this, fi);
                        if (fi.IndexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
                        {
                            anyProx = true;
                        }
                    }
                }

                // Only open the prx stream if some field actually indexed positions:
                if (anyProx)
                {
                    ProxStream = dir.OpenInput(IndexFileNames.SegmentFileName(info.Name, "", Lucene3xPostingsFormat.PROX_EXTENSION), context);
                }
                else
                {
                    ProxStream = null;
                }
                success = true;
            }
            finally
            {
                // With lock-less commits, it's entirely possible (and
                // fine) to hit a FileNotFound exception above. In
                // this case, we want to explicitly close any subset
                // of things that were opened so that we don't have to
                // wait for a GC to do so.
                if (!success)
                {
                    Dispose();
                }
            }
            //this.dir = dir; // LUCENENET: Never read
        }
| |
        /// <summary>
        /// If this returns <c>true</c>, we do the surrogates dance so that the
        /// terms are sorted by unicode sort order. This should be
        /// <c>true</c> when segments are used for "normal" searching;
        /// it's only <c>false</c> during testing, to create a pre-flex
        /// index, using the test-only PreFlexRW.
        /// </summary>
        protected virtual bool SortTermsByUnicode => true;
| |
| public override IEnumerator<string> GetEnumerator() |
| { |
| return fields.Keys.GetEnumerator(); |
| } |
| |
| public override Terms GetTerms(string field) |
| { |
| preTerms.TryGetValue(field, out Terms result); |
| return result; |
| } |
| |
| public override int Count |
| { |
| get |
| { |
| if (Debugging.AssertsEnabled) Debugging.Assert(preTerms.Count == fields.Count); |
| return fields.Count; |
| } |
| } |
| |
        /// <summary>Total number of terms in the dictionary, across all fields.</summary>
        [Obsolete("iterate fields and add their Count instead.")]
        public override long UniqueTermCount => TermsDict.Count;
| |
| private TermInfosReader TermsDict |
| { |
| get |
| { |
| lock (this) |
| { |
| if (Tis != null) |
| { |
| return Tis; |
| } |
| else |
| { |
| return TisNoIndex; |
| } |
| } |
| } |
| } |
| |
        /// <summary>
        /// Closes both term dictionary readers and the frq/prx streams.
        /// <see cref="IOUtils.Dispose(IDisposable[])"/> tolerates <c>null</c> entries, so partially
        /// constructed instances (see the constructor's finally block) dispose safely.
        /// </summary>
        protected override void Dispose(bool disposing)
        {
            if (disposing)
            {
                IOUtils.Dispose(Tis, TisNoIndex, /*cfsReader,*/ FreqStream, ProxStream); // LUCENENET NOTE: cfsReader not used
            }
        }
| |
        /// <summary>
        /// <see cref="Terms"/> view over a single pre-flex field. Enumeration is
        /// delegated to <see cref="PreTermsEnum"/>, which performs the surrogate
        /// dance when unicode sort order is required.
        /// </summary>
        private class PreTerms : Terms
        {
            private readonly Lucene3xFields outerInstance;

            internal readonly FieldInfo fieldInfo;

            internal PreTerms(Lucene3xFields outerInstance, FieldInfo fieldInfo)
            {
                this.outerInstance = outerInstance;
                this.fieldInfo = fieldInfo;
            }

            /// <summary>Returns a fresh <see cref="PreTermsEnum"/> positioned at this field's first term.</summary>
            public override TermsEnum GetEnumerator()
            {
                var termsEnum = new PreTermsEnum(outerInstance);
                termsEnum.Reset(fieldInfo);
                return termsEnum;
            }

            public override IComparer<BytesRef> Comparer
            {
                get
                {
                    // Pre-flex indexes always sorted in UTF16 order, but
                    // we remap on-the-fly to unicode order
                    if (outerInstance.SortTermsByUnicode)
                    {
                        return BytesRef.UTF8SortedAsUnicodeComparer;
                    }
                    else
                    {
                        return BytesRef.UTF8SortedAsUTF16Comparer;
                    }
                }
            }

            // The 3.x format does not record these per-field statistics,
            // so -1 ("unknown") is reported for all of them:
            public override long Count => -1;

            public override long SumTotalTermFreq => -1;

            public override long SumDocFreq => -1;

            public override int DocCount => -1;

            // LUCENENET specific - to avoid boxing, changed from CompareTo() to IndexOptionsComparer.Compare()
            public override bool HasFreqs => IndexOptionsComparer.Default.Compare(fieldInfo.IndexOptions, IndexOptions.DOCS_AND_FREQS) >= 0;

            public override bool HasOffsets
            {
                get
                {
                    // preflex doesn't support this
                    // LUCENENET specific - to avoid boxing, changed from CompareTo() to IndexOptionsComparer.Compare()
                    if (Debugging.AssertsEnabled) Debugging.Assert(IndexOptionsComparer.Default.Compare(fieldInfo.IndexOptions, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) < 0);
                    return false;
                }
            }

            // LUCENENET specific - to avoid boxing, changed from CompareTo() to IndexOptionsComparer.Compare()
            public override bool HasPositions => IndexOptionsComparer.Default.Compare(fieldInfo.IndexOptions, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;

            public override bool HasPayloads => fieldInfo.HasPayloads;
        }
| |
| private class PreTermsEnum : TermsEnum |
| { |
            private readonly Lucene3xFields outerInstance;

            public PreTermsEnum(Lucene3xFields outerInstance)
            {
                this.outerInstance = outerInstance;
            }

            // Primary cursor over the term dictionary:
            private SegmentTermEnum termEnum;
            private FieldInfo fieldInfo;
            // Reference-interned copy of fieldInfo.Name; the pre-flex codec interns
            // field names, so field identity is compared with != / == below.
            private string internedFieldName;
            // When true, MoveNext() returns the term we are already positioned on
            // (set by Reset) instead of advancing:
            private bool skipNext;
            // Bytes of the term this enum is currently positioned on (null at EOF):
            private BytesRef current;

            // Secondary cursor used for probing seeks during the surrogate dance:
            private SegmentTermEnum seekTermEnum;

            // Lead byte of a 4-byte UTF8 sequence (a non-BMP codepoint, category S):
            private const sbyte UTF8_NON_BMP_LEAD = unchecked((sbyte)0xf0);
            // Lead-byte mask for U+E000..U+FFFF (category E):
            private const sbyte UTF8_HIGH_BMP_LEAD = unchecked((sbyte)0xee);

            // Returns true if the unicode char is "after" the
            // surrogates in UTF16, ie >= U+E000 and <= U+FFFF:
            private static bool IsHighBMPChar(byte[] b, int idx)
            {
                return (((sbyte)b[idx]) & UTF8_HIGH_BMP_LEAD) == UTF8_HIGH_BMP_LEAD;
            }

            // Returns true if the unicode char in the UTF8 byte
            // sequence starting at idx encodes a char outside of
            // BMP (ie what would be a surrogate pair in UTF16):
            private static bool IsNonBMPChar(byte[] b, int idx)
            {
                return (((sbyte)b[idx]) & UTF8_NON_BMP_LEAD) == UTF8_NON_BMP_LEAD;
            }

            // Saves 3 bytes that SeekToNonBMP/DoPushes temporarily overwrite:
            private readonly sbyte[] scratch = new sbyte[4];
            // Bytes of the previously returned term (the "prev" side of the dance):
            private readonly BytesRef prevTerm = new BytesRef();
            // Working copy of the current term's bytes during the dance:
            private readonly BytesRef scratchTerm = new BytesRef();
            // Byte index where the current term's suffix diverges from prevTerm:
            private int newSuffixStart;
| |
            // Swap in S, in place of E:
            /// <summary>
            /// Probes whether a term exists that matches <paramref name="term"/>'s
            /// prefix up to <paramref name="pos"/> but has a non-BMP character (S)
            /// where <paramref name="term"/> has a high-BMP character (E). Seeks
            /// <paramref name="te"/> there and returns <c>true</c> if so.
            /// <paramref name="term"/> is temporarily mutated and restored before returning.
            /// </summary>
            private bool SeekToNonBMP(SegmentTermEnum te, BytesRef term, int pos)
            {
                int savLength = term.Length;

                if (Debugging.AssertsEnabled) Debugging.Assert(term.Offset == 0);

                // The 3 bytes starting at downTo make up 1
                // unicode character:
                if (Debugging.AssertsEnabled) Debugging.Assert(IsHighBMPChar(term.Bytes, pos));

                // NOTE: we cannot make this assert, because
                // AutomatonQuery legitimately sends us malformed UTF8
                // (eg the UTF8 bytes with just 0xee)
                // assert term.length >= pos + 3: "term.length=" + term.length + " pos+3=" + (pos+3) + " byte=" + Integer.toHexString(term.bytes[pos]) + " term=" + term.toString();

                // Save the bytes && length, since we need to
                // restore this if seek "back" finds no matching
                // terms
                if (term.Bytes.Length < 4 + pos)
                {
                    term.Grow(4 + pos);
                }

                scratch[0] = (sbyte)term.Bytes[pos];
                scratch[1] = (sbyte)term.Bytes[pos + 1];
                scratch[2] = (sbyte)term.Bytes[pos + 2];

                // Overwrite with U+10000 (f0 90 80 80), the smallest non-BMP codepoint:
                term.Bytes[pos] = 0xf0;
                term.Bytes[pos + 1] = 0x90;
                term.Bytes[pos + 2] = 0x80;
                term.Bytes[pos + 3] = 0x80;
                term.Length = 4 + pos;

                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine(" try seek term=" + UnicodeUtil.ToHexString(term.Utf8ToString()));
                }

                // Seek "back":
                outerInstance.TermsDict.SeekEnum(te, new Term(fieldInfo.Name, term), true);

                // Test if the term we seek'd to in fact found a
                // surrogate pair at the same position as the E:
                Term t2 = te.Term();

                // Cannot be null (or move to next field) because at
                // "worst" it'd seek to the same term we are on now,
                // unless we are being called from seek
                if (t2 == null || t2.Field != internedFieldName)
                {
                    return false;
                }

                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine(" got term=" + UnicodeUtil.ToHexString(t2.Text()));
                }

                // Now test if prefix is identical and we found
                // a non-BMP char at the same position:
                BytesRef b2 = t2.Bytes;
                if (Debugging.AssertsEnabled) Debugging.Assert(b2.Offset == 0);

                bool matches;
                if (b2.Length >= term.Length && IsNonBMPChar(b2.Bytes, pos))
                {
                    matches = true;
                    for (int i = 0; i < pos; i++)
                    {
                        if (term.Bytes[i] != b2.Bytes[i])
                        {
                            matches = false;
                            break;
                        }
                    }
                }
                else
                {
                    matches = false;
                }

                // Restore term:
                term.Length = savLength;
                term.Bytes[pos] = (byte)scratch[0];
                term.Bytes[pos + 1] = (byte)scratch[1];
                term.Bytes[pos + 2] = (byte)scratch[2];

                return matches;
            }
| |
            // Seek type 2 "continue" (back to the start of the
            // surrogates): scan the stripped suffix from the
            // prior term, backwards. If there was an E in that
            // part, then we try to seek back to S. If that
            // seek finds a matching term, we go there.
            /// <summary>
            /// Returns <c>true</c> if a "continue" seek was performed (termEnum was
            /// repositioned); <see cref="prevTerm"/> may be shortened in place as a side effect.
            /// </summary>
            private bool DoContinue()
            {
                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine(" try cont");
                }

                int downTo = prevTerm.Length - 1;

                bool didSeek = false;

                int limit = Math.Min(newSuffixStart, scratchTerm.Length - 1);

                while (downTo > limit)
                {
                    if (IsHighBMPChar(prevTerm.Bytes, downTo))
                    {
                        if (DEBUG_SURROGATES)
                        {
                            Console.WriteLine(" found E pos=" + downTo + " vs len=" + prevTerm.Length);
                        }

                        if (SeekToNonBMP(seekTermEnum, prevTerm, downTo))
                        {
                            // TODO: more efficient seek?
                            outerInstance.TermsDict.SeekEnum(termEnum, seekTermEnum.Term(), true);
                            //newSuffixStart = downTo+4;
                            newSuffixStart = downTo;
                            scratchTerm.CopyBytes(termEnum.Term().Bytes);
                            didSeek = true;
                            if (DEBUG_SURROGATES)
                            {
                                Console.WriteLine(" seek!");
                            }
                            break;
                        }
                        else
                        {
                            if (DEBUG_SURROGATES)
                            {
                                Console.WriteLine(" no seek");
                            }
                        }
                    }

                    // Shorten prevTerm in place so that we don't redo
                    // this loop if we come back here:
                    // (lead byte 0b11xxxxxx or ASCII 0b0xxxxxxx starts a character)
                    if ((prevTerm.Bytes[downTo] & 0xc0) == 0xc0 || (prevTerm.Bytes[downTo] & 0x80) == 0)
                    {
                        prevTerm.Length = downTo;
                    }

                    downTo--;
                }

                return didSeek;
            }
| |
            // Look for seek type 3 ("pop"): if the delta from
            // prev -> current was replacing an S with an E,
            // we must now seek to beyond that E. this seek
            // "finishes" the dance at this character
            // position.
            /// <summary>
            /// Returns <c>true</c> if a "pop" seek was performed; updates
            /// <see cref="scratchTerm"/> and <see cref="newSuffixStart"/> to reflect
            /// the term seeked to (or clears them on end-of-field).
            /// </summary>
            private bool DoPop()
            {
                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine(" try pop");
                }

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(newSuffixStart <= prevTerm.Length);
                    Debugging.Assert(newSuffixStart < scratchTerm.Length || newSuffixStart == 0);
                }

                if (prevTerm.Length > newSuffixStart && IsNonBMPChar(prevTerm.Bytes, newSuffixStart) && IsHighBMPChar(scratchTerm.Bytes, newSuffixStart))
                {
                    // Seek type 2 -- put 0xFF at this position:
                    // (0xff sorts after any valid UTF8 lead byte, so this seeks past all
                    // terms sharing the prefix up to newSuffixStart)
                    scratchTerm.Bytes[newSuffixStart] = 0xff;
                    scratchTerm.Length = newSuffixStart + 1;

                    if (DEBUG_SURROGATES)
                    {
                        Console.WriteLine(" seek to term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString()) + " " + scratchTerm.ToString());
                    }

                    // TODO: more efficient seek? can we simply swap
                    // the enums?
                    outerInstance.TermsDict.SeekEnum(termEnum, new Term(fieldInfo.Name, scratchTerm), true);

                    Term t2 = termEnum.Term();

                    // We could hit EOF or different field since this
                    // was a seek "forward":
                    if (t2 != null && t2.Field == internedFieldName)
                    {
                        if (DEBUG_SURROGATES)
                        {
                            Console.WriteLine(" got term=" + UnicodeUtil.ToHexString(t2.Text()) + " " + t2.Bytes);
                        }

                        BytesRef b2 = t2.Bytes;
                        if (Debugging.AssertsEnabled) Debugging.Assert(b2.Offset == 0);

                        // Set newSuffixStart -- we can't use
                        // termEnum's since the above seek may have
                        // done no scanning (eg, term was precisely
                        // and index term, or, was in the term seek
                        // cache):
                        scratchTerm.CopyBytes(b2);
                        SetNewSuffixStart(prevTerm, scratchTerm);

                        return true;
                    }
                    else if (newSuffixStart != 0 || scratchTerm.Length != 0)
                    {
                        if (DEBUG_SURROGATES)
                        {
                            Console.WriteLine(" got term=null (or next field)");
                        }
                        newSuffixStart = 0;
                        scratchTerm.Length = 0;
                        return true;
                    }
                }

                return false;
            }
| |
            // Pre-flex indices store terms in UTF16 sort order, but
            // certain queries require Unicode codepoint order; this
            // method carefully seeks around surrogates to handle
            // this impedance mismatch

            /// <summary>
            /// Invoked after advancing <see cref="termEnum"/> (in UTF16 order) to
            /// possibly seek to a different "next" term in unicode order. No-op
            /// when <see cref="unicodeSortOrder"/> is <c>false</c>.
            /// </summary>
            private void SurrogateDance()
            {
                if (!unicodeSortOrder)
                {
                    return;
                }

                // We are invoked after TIS.next() (by UTF16 order) to
                // possibly seek to a different "next" (by unicode
                // order) term.

                // We scan only the "delta" from the last term to the
                // current term, in UTF8 bytes. We look at 1) the bytes
                // stripped from the prior term, and then 2) the bytes
                // appended to that prior term's prefix.

                // We don't care about specific UTF8 sequences, just
                // the "category" of the UTF16 character. Category S
                // is a high/low surrogate pair (it non-BMP). 
                // Category E is any BMP char > UNI_SUR_LOW_END (and <
                // U+FFFF). Category A is the rest (any unicode char
                // <= UNI_SUR_HIGH_START).

                // The core issue is that pre-flex indices sort the
                // characters as ASE, while flex must sort as AES. So
                // when scanning, when we hit S, we must 1) seek
                // forward to E and enum the terms there, then 2) seek
                // back to S and enum all terms there, then 3) seek to
                // after E. Three different seek points (1, 2, 3).

                // We can easily detect S in UTF8: if a byte has
                // prefix 11110 (0xf0), then that byte and the
                // following 3 bytes encode a single unicode codepoint
                // in S. Similarly, we can detect E: if a byte has
                // prefix 1110111 (0xee), then that byte and the
                // following 2 bytes encode a single unicode codepoint
                // in E.

                // Note that this is really a recursive process --
                // maybe the char at pos 2 needs to dance, but any
                // point in its dance, suddenly pos 4 needs to dance
                // so you must finish pos 4 before returning to pos
                // 2. But then during pos 4's dance maybe pos 7 needs
                // to dance, etc. However, despite being recursive,
                // we don't need to hold any state because the state
                // can always be derived by looking at prior term &
                // current term.

                // TODO: can we avoid this copy?
                if (termEnum.Term() == null || termEnum.Term().Field != internedFieldName)
                {
                    scratchTerm.Length = 0;
                }
                else
                {
                    scratchTerm.CopyBytes(termEnum.Term().Bytes);
                }

                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine(" dance");
                    Console.WriteLine(" prev=" + UnicodeUtil.ToHexString(prevTerm.Utf8ToString()));
                    Console.WriteLine(" " + prevTerm.ToString());
                    Console.WriteLine(" term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString()));
                    Console.WriteLine(" " + scratchTerm.ToString());
                }

                // this code assumes TermInfosReader/SegmentTermEnum
                // always use BytesRef.offset == 0
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(prevTerm.Offset == 0);
                    Debugging.Assert(scratchTerm.Offset == 0);
                }

                // Need to loop here because we may need to do multiple
                // pops, and possibly a continue in the end, ie:
                //
                //  cont
                //  pop, cont
                //  pop, pop, cont
                //  <nothing>
                //

                while (true)
                {
                    if (DoContinue())
                    {
                        break;
                    }
                    else
                    {
                        if (!DoPop())
                        {
                            break;
                        }
                    }
                }

                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine(" finish bmp ends");
                }

                DoPushes();
            }
| |
            // Look for seek type 1 ("push"): if the newly added
            // suffix contains any S, we must try to seek to the
            // corresponding E. If we find a match, we go there;
            // else we keep looking for additional S's in the new
            // suffix. this "starts" the dance, at this character
            // position:
            /// <summary>
            /// Scans the new suffix of <see cref="scratchTerm"/> (from
            /// <see cref="newSuffixStart"/>) for non-BMP chars and performs the
            /// corresponding forward-then-back "push" seeks. May reposition
            /// <see cref="termEnum"/> and update <see cref="scratchTerm"/>.
            /// </summary>
            private void DoPushes()
            {
                int upTo = newSuffixStart;
                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine(" try push newSuffixStart=" + newSuffixStart + " scratchLen=" + scratchTerm.Length);
                }

                while (upTo < scratchTerm.Length)
                {
                    if (IsNonBMPChar(scratchTerm.Bytes, upTo) && (upTo > newSuffixStart || (upTo >= prevTerm.Length || (!IsNonBMPChar(prevTerm.Bytes, upTo) && !IsHighBMPChar(prevTerm.Bytes, upTo)))))
                    {
                        // A non-BMP char (4 bytes UTF8) starts here:
                        if (Debugging.AssertsEnabled) Debugging.Assert(scratchTerm.Length >= upTo + 4);

                        int savLength = scratchTerm.Length;
                        scratch[0] = (sbyte)scratchTerm.Bytes[upTo];
                        scratch[1] = (sbyte)scratchTerm.Bytes[upTo + 1];
                        scratch[2] = (sbyte)scratchTerm.Bytes[upTo + 2];

                        // Temporarily replace the S with the smallest E (ee 80 80):
                        scratchTerm.Bytes[upTo] = unchecked((byte)UTF8_HIGH_BMP_LEAD);
                        scratchTerm.Bytes[upTo + 1] = 0x80;
                        scratchTerm.Bytes[upTo + 2] = 0x80;
                        scratchTerm.Length = upTo + 3;

                        if (DEBUG_SURROGATES)
                        {
                            Console.WriteLine(" try seek 1 pos=" + upTo + " term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString()) + " " + scratchTerm.ToString() + " len=" + scratchTerm.Length);
                        }

                        // Seek "forward":
                        // TODO: more efficient seek?
                        outerInstance.TermsDict.SeekEnum(seekTermEnum, new Term(fieldInfo.Name, scratchTerm), true);

                        // Restore the original bytes before testing the result:
                        scratchTerm.Bytes[upTo] = (byte)scratch[0];
                        scratchTerm.Bytes[upTo + 1] = (byte)scratch[1];
                        scratchTerm.Bytes[upTo + 2] = (byte)scratch[2];
                        scratchTerm.Length = savLength;

                        // Did we find a match?
                        Term t2 = seekTermEnum.Term();

                        if (DEBUG_SURROGATES)
                        {
                            if (t2 == null)
                            {
                                Console.WriteLine(" hit term=null");
                            }
                            else
                            {
                                Console.WriteLine($" hit term={UnicodeUtil.ToHexString(t2.Text())} {t2?.Bytes}");
                            }
                        }

                        // Since this was a seek "forward", we could hit
                        // EOF or a different field:
                        bool matches;

                        if (t2 != null && t2.Field == internedFieldName)
                        {
                            BytesRef b2 = t2.Bytes;
                            if (Debugging.AssertsEnabled) Debugging.Assert(b2.Offset == 0);
                            if (b2.Length >= upTo + 3 && IsHighBMPChar(b2.Bytes, upTo))
                            {
                                matches = true;
                                for (int i = 0; i < upTo; i++)
                                {
                                    if (scratchTerm.Bytes[i] != b2.Bytes[i])
                                    {
                                        matches = false;
                                        break;
                                    }
                                }
                            }
                            else
                            {
                                matches = false;
                            }
                        }
                        else
                        {
                            matches = false;
                        }

                        if (matches)
                        {
                            if (DEBUG_SURROGATES)
                            {
                                Console.WriteLine(" matches!");
                            }

                            // OK seek "back"
                            // TODO: more efficient seek?
                            outerInstance.TermsDict.SeekEnum(termEnum, seekTermEnum.Term(), true);

                            scratchTerm.CopyBytes(seekTermEnum.Term().Bytes);

                            // +3 because we don't need to check the char
                            // at upTo: we know it's > BMP
                            upTo += 3;

                            // NOTE: we keep iterating, now, since this
                            // can easily "recurse". Ie, after seeking
                            // forward at a certain char position, we may
                            // find another surrogate in our [new] suffix
                            // and must then do another seek (recurse)
                        }
                        else
                        {
                            upTo++;
                        }
                    }
                    else
                    {
                        upTo++;
                    }
                }
            }
| |
            // Snapshot of outerInstance.SortTermsByUnicode, taken in Reset():
            private bool unicodeSortOrder;

            /// <summary>
            /// (Re-)positions this enum at the first term of <paramref name="fieldInfo"/>,
            /// creating or reusing the underlying <see cref="SegmentTermEnum"/>s, and
            /// primes the surrogate dance for the first term.
            /// </summary>
            internal virtual void Reset(FieldInfo fieldInfo)
            {
                //System.out.println("pff.reset te=" + termEnum);
                this.fieldInfo = fieldInfo;

                internedFieldName = fieldInfo.Name.Intern();

                Term term = new Term(internedFieldName);
                if (termEnum == null)
                {
                    termEnum = outerInstance.TermsDict.Terms(term);
                    seekTermEnum = outerInstance.TermsDict.Terms(term);
                    //System.out.println(" term=" + termEnum.term());
                }
                else
                {
                    outerInstance.TermsDict.SeekEnum(termEnum, term, true);
                }
                // MoveNext() must return the term we are now positioned on, not advance:
                skipNext = true;

                unicodeSortOrder = outerInstance.SortTermsByUnicode;

                Term t = termEnum.Term();
                if (t != null && t.Field == internedFieldName)
                {
                    newSuffixStart = 0;
                    prevTerm.Length = 0;
                    SurrogateDance();
                }
            }
| |
| public override IComparer<BytesRef> Comparer |
| { |
| get |
| { |
| // Pre-flex indexes always sorted in UTF16 order, but |
| // we remap on-the-fly to unicode order |
| if (unicodeSortOrder) |
| { |
| return BytesRef.UTF8SortedAsUnicodeComparer; |
| } |
| else |
| { |
| return BytesRef.UTF8SortedAsUTF16Comparer; |
| } |
| } |
| } |
| |
            /// <summary>Ord-based seeking is not supported by the pre-flex format.</summary>
            public override void SeekExact(long ord)
            {
                throw new NotSupportedException();
            }

            /// <summary>Term ordinals are not supported by the pre-flex format.</summary>
            public override long Ord => throw new NotSupportedException();
| |
            /// <summary>
            /// Seeks to <paramref name="term"/> or, when absent, the next term in the
            /// configured sort order; performs the surrogate dance on non-exact hits
            /// so NOT_FOUND results respect unicode order.
            /// </summary>
            public override SeekStatus SeekCeil(BytesRef term)
            {
                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("TE.seek target=" + UnicodeUtil.ToHexString(term.Utf8ToString()));
                }
                skipNext = false;
                TermInfosReader tis = outerInstance.TermsDict;
                Term t0 = new Term(fieldInfo.Name, term);

                if (Debugging.AssertsEnabled) Debugging.Assert(termEnum != null);

                tis.SeekEnum(termEnum, t0, false);

                Term t = termEnum.Term();

                if (t != null && t.Field == internedFieldName && term.BytesEquals(t.Bytes))
                {
                    // If we found an exact match, no need to do the
                    // surrogate dance
                    if (DEBUG_SURROGATES)
                    {
                        Console.WriteLine(" seek exact match");
                    }
                    current = t.Bytes;
                    return SeekStatus.FOUND;
                }
                else if (t == null || t.Field != internedFieldName)
                {
                    // TODO: maybe we can handle this like the next()
                    // into null? set term as prevTerm then dance?

                    if (DEBUG_SURROGATES)
                    {
                        Console.WriteLine(" seek hit EOF");
                    }

                    // We hit EOF; try end-case surrogate dance: if we
                    // find an E, try swapping in S, backwards:
                    scratchTerm.CopyBytes(term);

                    if (Debugging.AssertsEnabled) Debugging.Assert(scratchTerm.Offset == 0);

                    // Scan backwards for a high-BMP char we might swap for a non-BMP char:
                    for (int i = scratchTerm.Length - 1; i >= 0; i--)
                    {
                        if (IsHighBMPChar(scratchTerm.Bytes, i))
                        {
                            if (DEBUG_SURROGATES)
                            {
                                Console.WriteLine(" found E pos=" + i + "; try seek");
                            }

                            if (SeekToNonBMP(seekTermEnum, scratchTerm, i))
                            {
                                scratchTerm.CopyBytes(seekTermEnum.Term().Bytes);
                                outerInstance.TermsDict.SeekEnum(termEnum, seekTermEnum.Term(), false);

                                newSuffixStart = 1 + i;

                                DoPushes();

                                // Found a match
                                // TODO: faster seek?
                                current = termEnum.Term().Bytes;
                                return SeekStatus.NOT_FOUND;
                            }
                        }
                    }

                    if (DEBUG_SURROGATES)
                    {
                        Console.WriteLine(" seek END");
                    }

                    current = null;
                    return SeekStatus.END;
                }
                else
                {
                    // We found a non-exact but non-null term; this one
                    // is fun -- just treat it like next, by pretending
                    // requested term was prev:
                    prevTerm.CopyBytes(term);

                    if (DEBUG_SURROGATES)
                    {
                        Console.WriteLine(" seek hit non-exact term=" + UnicodeUtil.ToHexString(t.Text()));
                    }

                    BytesRef br = t.Bytes;
                    if (Debugging.AssertsEnabled) Debugging.Assert(br.Offset == 0);

                    SetNewSuffixStart(term, br);

                    SurrogateDance();

                    Term t2 = termEnum.Term();
                    if (t2 == null || t2.Field != internedFieldName)
                    {
                        // PreFlex codec interns field names; verify:
                        if (Debugging.AssertsEnabled) Debugging.Assert(t2 == null || !t2.Field.Equals(internedFieldName, StringComparison.Ordinal));
                        current = null;
                        return SeekStatus.END;
                    }
                    else
                    {
                        current = t2.Bytes;
                        if (Debugging.AssertsEnabled) Debugging.Assert(!unicodeSortOrder || term.CompareTo(current) < 0,"term={0} vs current={1}",
                            // LUCENENET specific - use wrapper BytesRefFormatter struct to defer building the string unless string.Format() is called
                            new BytesRefFormatter(term, BytesRefFormat.UTF8AsHex), new BytesRefFormatter(current, BytesRefFormat.UTF8AsHex));
                        return SeekStatus.NOT_FOUND;
                    }
                }
            }
| |
| private void SetNewSuffixStart(BytesRef br1, BytesRef br2) |
| { |
| int limit = Math.Min(br1.Length, br2.Length); |
| int lastStart = 0; |
| for (int i = 0; i < limit; i++) |
| { |
| if ((br1.Bytes[br1.Offset + i] & 0xc0) == 0xc0 || (br1.Bytes[br1.Offset + i] & 0x80) == 0) |
| { |
| lastStart = i; |
| } |
| if (br1.Bytes[br1.Offset + i] != br2.Bytes[br2.Offset + i]) |
| { |
| newSuffixStart = lastStart; |
| if (DEBUG_SURROGATES) |
| { |
| Console.WriteLine(" set newSuffixStart=" + newSuffixStart); |
| } |
| return; |
| } |
| } |
| newSuffixStart = limit; |
| if (DEBUG_SURROGATES) |
| { |
| Console.WriteLine(" set newSuffixStart=" + newSuffixStart); |
| } |
| } |
| |
            /// <summary>
            /// Advances to the next term of the current field (in the configured sort
            /// order, performing the surrogate dance when needed) and sets
            /// <see cref="current"/>. Returns <c>false</c> when the field is exhausted.
            /// </summary>
            public override bool MoveNext()
            {
                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("TE.MoveNext()");
                }
                // Reset() leaves us positioned ON the first term; return it without advancing:
                if (skipNext)
                {
                    if (DEBUG_SURROGATES)
                    {
                        Console.WriteLine(" skipNext=true");
                    }
                    skipNext = false;
                    if (termEnum.Term() == null)
                    {
                        return false;
                        // PreFlex codec interns field names:
                    }
                    else if (termEnum.Term().Field != internedFieldName)
                    {
                        return false;
                    }
                    else
                    {
                        current = termEnum.Term().Bytes;
                        return true;
                    }
                }

                // TODO: can we use STE's prevBuffer here?
                prevTerm.CopyBytes(termEnum.Term().Bytes);

                if (termEnum.Next() && termEnum.Term().Field == internedFieldName)
                {
                    newSuffixStart = termEnum.newSuffixStart;
                    if (DEBUG_SURROGATES)
                    {
                        Console.WriteLine(" newSuffixStart=" + newSuffixStart);
                    }
                    SurrogateDance();
                    Term t = termEnum.Term();
                    if (t == null || t.Field != internedFieldName)
                    {
                        // PreFlex codec interns field names; verify:
                        if (Debugging.AssertsEnabled) Debugging.Assert(t == null || !t.Field.Equals(internedFieldName, StringComparison.Ordinal));
                        current = null;
                        return false;
                    }
                    else
                    {
                        current = t.Bytes;
                        return true;
                    }
                }
                else
                {
                    // this field is exhausted, but we have to give
                    // surrogateDance a chance to seek back:
                    if (DEBUG_SURROGATES)
                    {
                        Console.WriteLine(" force cont");
                    }
                    //newSuffixStart = prevTerm.length;
                    newSuffixStart = 0;
                    SurrogateDance();

                    Term t = termEnum.Term();
                    if (t == null || t.Field != internedFieldName)
                    {
                        // PreFlex codec interns field names; verify:
                        if (Debugging.AssertsEnabled) Debugging.Assert(t == null || !t.Field.Equals(internedFieldName, StringComparison.Ordinal));
                        return false;
                    }
                    else
                    {
                        current = t.Bytes;
                        return true;
                    }
                }
            }
| |
            /// <summary>Legacy iteration API; delegates to <see cref="MoveNext()"/>.</summary>
            [Obsolete("Use MoveNext() and Term instead. This method will be removed in 4.8.0 release candidate."), System.ComponentModel.EditorBrowsable(System.ComponentModel.EditorBrowsableState.Never)]
            public override BytesRef Next()
            {
                if (MoveNext())
                    return current;
                return null;
            }

            /// <summary>Bytes of the term this enum is positioned on (null at EOF).</summary>
            public override BytesRef Term => current;

            public override int DocFreq => termEnum.DocFreq;

            // Total term frequency is not recorded by the 3.x format:
            public override long TotalTermFreq => -1;
| |
| public override DocsEnum Docs(IBits liveDocs, DocsEnum reuse, DocsFlags flags) |
| { |
| if (reuse == null || !(reuse is PreDocsEnum docsEnum) || docsEnum.FreqStream != outerInstance.FreqStream) |
| docsEnum = new PreDocsEnum(outerInstance); |
| |
| return docsEnum.Reset(termEnum, liveDocs); |
| } |
| |
| public override DocsAndPositionsEnum DocsAndPositions(IBits liveDocs, DocsAndPositionsEnum reuse, DocsAndPositionsFlags flags) |
| { |
| if (fieldInfo.IndexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) |
| return null; |
| |
| if (reuse is null || !(reuse is PreDocsAndPositionsEnum docsPosEnum) || docsPosEnum.FreqStream != outerInstance.FreqStream) |
| docsPosEnum = new PreDocsAndPositionsEnum(outerInstance); |
| |
| return docsPosEnum.Reset(termEnum, liveDocs); |
| } |
| } |
| |
| private sealed class PreDocsEnum : DocsEnum |
| { |
| private readonly Lucene3xFields outerInstance; |
| |
| internal readonly SegmentTermDocs docs; |
| private int docID = -1; |
| |
| internal PreDocsEnum(Lucene3xFields outerInstance) |
| { |
| this.outerInstance = outerInstance; |
| docs = new SegmentTermDocs(outerInstance.FreqStream, outerInstance.TermsDict, outerInstance.fieldInfos); |
| } |
| |
| internal IndexInput FreqStream => outerInstance.FreqStream; |
| |
| public PreDocsEnum Reset(SegmentTermEnum termEnum, IBits liveDocs) |
| { |
| docs.LiveDocs = liveDocs; |
| docs.Seek(termEnum); |
| docs.freq = 1; |
| docID = -1; |
| return this; |
| } |
| |
| public override int NextDoc() |
| { |
| if (docs.Next()) |
| { |
| return docID = docs.Doc; |
| } |
| else |
| { |
| return docID = NO_MORE_DOCS; |
| } |
| } |
| |
| public override int Advance(int target) |
| { |
| if (docs.SkipTo(target)) |
| { |
| return docID = docs.Doc; |
| } |
| else |
| { |
| return docID = NO_MORE_DOCS; |
| } |
| } |
| |
| public override int Freq => docs.Freq; |
| |
| public override int DocID => docID; |
| |
| public override long GetCost() |
| { |
| return docs.m_df; |
| } |
| } |
| |
| private sealed class PreDocsAndPositionsEnum : DocsAndPositionsEnum |
| { |
| private readonly Lucene3xFields outerInstance; |
| |
| private readonly SegmentTermPositions pos; |
| private int docID = -1; |
| |
| internal PreDocsAndPositionsEnum(Lucene3xFields outerInstance) |
| { |
| this.outerInstance = outerInstance; |
| pos = new SegmentTermPositions(outerInstance.FreqStream, outerInstance.ProxStream, outerInstance.TermsDict, outerInstance.fieldInfos); |
| } |
| |
| internal IndexInput FreqStream => outerInstance.FreqStream; |
| |
| public DocsAndPositionsEnum Reset(SegmentTermEnum termEnum, IBits liveDocs) |
| { |
| pos.LiveDocs = liveDocs; |
| pos.Seek(termEnum); |
| docID = -1; |
| return this; |
| } |
| |
| public override int NextDoc() |
| { |
| if (pos.Next()) |
| { |
| return docID = pos.Doc; |
| } |
| else |
| { |
| return docID = NO_MORE_DOCS; |
| } |
| } |
| |
| public override int Advance(int target) |
| { |
| if (pos.SkipTo(target)) |
| { |
| return docID = pos.Doc; |
| } |
| else |
| { |
| return docID = NO_MORE_DOCS; |
| } |
| } |
| |
| public override int Freq => pos.Freq; |
| |
| public override int DocID => docID; |
| |
| public override int NextPosition() |
| { |
| if (Debugging.AssertsEnabled) Debugging.Assert(docID != NO_MORE_DOCS); |
| return pos.NextPosition(); |
| } |
| |
| public override int StartOffset => -1; |
| |
| public override int EndOffset => -1; |
| |
| public override BytesRef GetPayload() |
| { |
| return pos.GetPayload(); |
| } |
| |
| public override long GetCost() |
| { |
| return pos.m_df; |
| } |
| } |
| |
| public override long RamBytesUsed() |
| { |
| if (Tis != null) |
| { |
| return Tis.RamBytesUsed(); |
| } |
| else |
| { |
| // when there is no index, there is almost nothing loaded into RAM |
| return 0L; |
| } |
| } |
| |
        /// <summary>
        /// No-op: nothing is verified for this legacy format (no per-file
        /// checksum data is read by this producer).
        /// </summary>
        public override void CheckIntegrity()
        {
        }
| } |
| } |