| using System; |
| using System.Collections.Generic; |
| using System.Diagnostics; |
| using System.Linq; |
| |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| namespace Lucene.Net.Index |
| { |
| using Lucene.Net.Support; |
| using Bits = Lucene.Net.Util.Bits; |
| using BytesRef = Lucene.Net.Util.BytesRef; |
| using DocIdSetIterator = Lucene.Net.Search.DocIdSetIterator; |
| using PagedBytes = Lucene.Net.Util.PagedBytes; |
| using PostingsFormat = Lucene.Net.Codecs.PostingsFormat; // javadocs |
| using SeekStatus = Lucene.Net.Index.TermsEnum.SeekStatus; |
| using StringHelper = Lucene.Net.Util.StringHelper; |
| |
| /// <summary> |
| /// this class enables fast access to multiple term ords for |
| /// a specified field across all docIDs. |
| /// |
| /// Like FieldCache, it uninverts the index and holds a |
| /// packed data structure in RAM to enable fast access. |
| /// Unlike FieldCache, it can handle multi-valued fields, |
| /// and, it does not hold the term bytes in RAM. Rather, you |
| /// must obtain a TermsEnum from the <seealso cref="#getOrdTermsEnum"/> |
| /// method, and then seek-by-ord to get the term's bytes. |
| /// |
| /// While normally term ords are type long, in this API they are |
| /// int as the internal representation here cannot address |
| /// more than MAX_INT unique terms. Also, typically this |
| /// class is used on fields with relatively few unique terms |
| /// vs the number of documents. In addition, there is an |
| /// internal limit (16 MB) on how many bytes each chunk of |
| /// documents may consume. If you trip this limit you'll hit |
| /// an InvalidOperationException. |
| /// |
| /// Deleted documents are skipped during uninversion, and if |
| /// you look them up you'll get 0 ords. |
| /// |
| /// The returned per-document ords do not retain their |
| /// original order in the document. Instead they are returned |
| /// in sorted (by ord, ie term's BytesRef comparator) order. They |
| /// are also de-dup'd (ie if doc has same term more than once |
| /// in this field, you'll only get that ord back once). |
| /// |
| /// this class tests whether the provided reader is able to |
| /// retrieve terms by ord (ie, it's single segment, and it |
| /// uses an ord-capable terms index). If not, this class |
| /// will create its own term index internally, allowing to |
| /// create a wrapped TermsEnum that can handle ord. The |
| /// <seealso cref="#getOrdTermsEnum"/> method then provides this |
| /// wrapped enum, if necessary. |
| /// |
| /// The RAM consumption of this class can be high! |
| /// |
| /// @lucene.experimental |
| /// </summary> |
| |
| /* |
| * Final form of the un-inverted field: |
| * Each document points to a list of term numbers that are contained in that document. |
| * |
| * Term numbers are in sorted order, and are encoded as variable-length deltas from the |
| * previous term number. Real term numbers start at 2 since 0 and 1 are reserved. A |
| * term number of 0 signals the end of the termNumber list. |
| * |
| * There is a single int[maxDoc()] which either contains a pointer into a byte[] for |
| * the termNumber lists, or directly contains the termNumber list if it fits in the 4 |
| * bytes of an integer. If the first byte in the integer is 1, the next 3 bytes |
| * are a pointer into a byte[] where the termNumber list starts. |
| * |
| * There are actually 256 byte arrays, to compensate for the fact that the pointers |
| * into the byte arrays are only 3 bytes long. The correct byte array for a document |
| * is a function of it's id. |
| * |
| * To save space and speed up faceting, any term that matches enough documents will |
| * not be un-inverted... it will be skipped while building the un-inverted field structure, |
| * and will use a set intersection method during faceting. |
| * |
| * To further save memory, the terms (the actual string values) are not all stored in |
| * memory, but a TermIndex is used to convert term numbers to term values only |
| * for the terms needed after faceting has completed. Only every 128th term value |
| * is stored, along with it's corresponding term number, and this is used as an |
| * index to find the closest term and iterate until the desired number is hit (very |
| * much like Lucene's own internal term index). |
| * |
| */ |
| |
| public class DocTermOrds |
| { |
| // Term ords are shifted by this, internally, to reserve |
| // values 0 (end term) and 1 (index is a pointer into byte array) |
| private const int TNUM_OFFSET = 2; |
| |
| /// <summary> |
| /// Every 128th term is indexed, by default. </summary> |
| public const int DEFAULT_INDEX_INTERVAL_BITS = 7; // decrease to a low number like 2 for testing |
| |
| private int IndexIntervalBits; |
| private int IndexIntervalMask; |
| private int IndexInterval; |
| |
| /// <summary> |
| /// Don't uninvert terms that exceed this count. </summary> |
| protected internal readonly int MaxTermDocFreq; |
| |
| /// <summary> |
| /// Field we are uninverting. </summary> |
| protected internal readonly string Field; |
| |
| /// <summary> |
| /// Number of terms in the field. </summary> |
| protected internal int NumTermsInField; |
| |
| /// <summary> |
| /// Total number of references to term numbers. </summary> |
| protected internal long TermInstances; |
| |
| private long Memsz; |
| |
| /// <summary> |
| /// Total time to uninvert the field. </summary> |
| protected internal int Total_time; |
| |
| /// <summary> |
| /// Time for phase1 of the uninvert process. </summary> |
| protected internal int Phase1_time; |
| |
| /// <summary> |
| /// Holds the per-document ords or a pointer to the ords. </summary> |
| protected internal int[] Index; |
| |
| /// <summary> |
| /// Holds term ords for documents. </summary> |
| protected internal sbyte[][] Tnums = new sbyte[256][]; |
| |
| /// <summary> |
| /// Total bytes (sum of term lengths) for all indexed terms. </summary> |
| protected internal long SizeOfIndexedStrings; |
| |
| /// <summary> |
| /// Holds the indexed (by default every 128th) terms. </summary> |
| protected internal BytesRef[] IndexedTermsArray; |
| |
| /// <summary> |
| /// If non-null, only terms matching this prefix were |
| /// indexed. |
| /// </summary> |
| protected internal BytesRef Prefix; |
| |
| /// <summary> |
| /// Ordinal of the first term in the field, or 0 if the |
| /// <seealso cref="PostingsFormat"/> does not implement {@link |
| /// TermsEnum#ord}. |
| /// </summary> |
| protected internal int OrdBase; |
| |
| /// <summary> |
| /// Used while uninverting. </summary> |
| protected internal DocsEnum DocsEnum; |
| |
| /// <summary> |
| /// Returns total bytes used. </summary> |
| public virtual long RamUsedInBytes() |
| { |
| // can cache the mem size since it shouldn't change |
| if (Memsz != 0) |
| { |
| return Memsz; |
| } |
| long sz = 8 * 8 + 32; // local fields |
| if (Index != null) |
| { |
| sz += Index.Length * 4; |
| } |
| if (Tnums != null) |
| { |
| sz = Tnums.Where(arr => arr != null).Aggregate(sz, (current, arr) => current + arr.Length); |
| } |
| Memsz = sz; |
| return sz; |
| } |
| |
| /// <summary> |
| /// Inverts all terms </summary> |
| public DocTermOrds(AtomicReader reader, Bits liveDocs, string field) |
| : this(reader, liveDocs, field, null, int.MaxValue) |
| { |
| } |
| |
| /// <summary> |
| /// Inverts only terms starting w/ prefix </summary> |
| public DocTermOrds(AtomicReader reader, Bits liveDocs, string field, BytesRef termPrefix) |
| : this(reader, liveDocs, field, termPrefix, int.MaxValue) |
| { |
| } |
| |
| /// <summary> |
| /// Inverts only terms starting w/ prefix, and only terms |
| /// whose docFreq (not taking deletions into account) is |
| /// <= maxTermDocFreq |
| /// </summary> |
| public DocTermOrds(AtomicReader reader, Bits liveDocs, string field, BytesRef termPrefix, int maxTermDocFreq) |
| : this(reader, liveDocs, field, termPrefix, maxTermDocFreq, DEFAULT_INDEX_INTERVAL_BITS) |
| { |
| } |
| |
| /// <summary> |
| /// Inverts only terms starting w/ prefix, and only terms |
| /// whose docFreq (not taking deletions into account) is |
| /// <= maxTermDocFreq, with a custom indexing interval |
| /// (default is every 128nd term). |
| /// </summary> |
| public DocTermOrds(AtomicReader reader, Bits liveDocs, string field, BytesRef termPrefix, int maxTermDocFreq, int indexIntervalBits) |
| : this(field, maxTermDocFreq, indexIntervalBits) |
| { |
| Uninvert(reader, liveDocs, termPrefix); |
| } |
| |
| /// <summary> |
| /// Subclass inits w/ this, but be sure you then call |
| /// uninvert, only once |
| /// </summary> |
| protected internal DocTermOrds(string field, int maxTermDocFreq, int indexIntervalBits) |
| { |
| //System.out.println("DTO init field=" + field + " maxTDFreq=" + maxTermDocFreq); |
| this.Field = field; |
| this.MaxTermDocFreq = maxTermDocFreq; |
| this.IndexIntervalBits = indexIntervalBits; |
| IndexIntervalMask = (int)((uint)0xffffffff >> (32 - indexIntervalBits)); |
| IndexInterval = 1 << indexIntervalBits; |
| } |
| |
| /// <summary> |
| /// Returns a TermsEnum that implements ord. If the |
| /// provided reader supports ord, we just return its |
| /// TermsEnum; if it does not, we build a "private" terms |
| /// index internally (WARNING: consumes RAM) and use that |
| /// index to implement ord. this also enables ord on top |
| /// of a composite reader. The returned TermsEnum is |
| /// unpositioned. this returns null if there are no terms. |
| /// |
| /// <p><b>NOTE</b>: you must pass the same reader that was |
| /// used when creating this class |
| /// </summary> |
| public virtual TermsEnum GetOrdTermsEnum(AtomicReader reader) |
| { |
| if (IndexedTermsArray == null) |
| { |
| //System.out.println("GET normal enum"); |
| Fields fields = reader.Fields; |
| if (fields == null) |
| { |
| return null; |
| } |
| Terms terms = fields.Terms(Field); |
| if (terms == null) |
| { |
| return null; |
| } |
| else |
| { |
| return terms.Iterator(null); |
| } |
| } |
| else |
| { |
| //System.out.println("GET wrapped enum ordBase=" + ordBase); |
| return new OrdWrappedTermsEnum(this, reader); |
| } |
| } |
| |
| /// <summary> |
| /// Returns the number of terms in this field |
| /// </summary> |
| public virtual int NumTerms() |
| { |
| return NumTermsInField; |
| } |
| |
| /// <summary> |
| /// Returns {@code true} if no terms were indexed. |
| /// </summary> |
| public virtual bool Empty |
| { |
| get |
| { |
| return Index == null; |
| } |
| } |
| |
| /// <summary> |
| /// Subclass can override this </summary> |
| protected internal virtual void VisitTerm(TermsEnum te, int termNum) |
| { |
| } |
| |
| /// <summary> |
| /// Invoked during <seealso cref="#uninvert(AtomicReader,Bits,BytesRef)"/> |
| /// to record the document frequency for each uninverted |
| /// term. |
| /// </summary> |
| protected internal virtual void SetActualDocFreq(int termNum, int df) |
| { |
| } |
| |
| /// <summary> |
| /// Call this only once (if you subclass!) </summary> |
| protected internal virtual void Uninvert(AtomicReader reader, Bits liveDocs, BytesRef termPrefix) |
| { |
| FieldInfo info = reader.FieldInfos.FieldInfo(Field); |
| if (info != null && info.HasDocValues()) |
| { |
| throw new InvalidOperationException("Type mismatch: " + Field + " was indexed as " + info.DocValuesType); |
| } |
| //System.out.println("DTO uninvert field=" + field + " prefix=" + termPrefix); |
| long startTime = Environment.TickCount; |
| Prefix = termPrefix == null ? null : BytesRef.DeepCopyOf(termPrefix); |
| |
| int maxDoc = reader.MaxDoc; |
| int[] index = new int[maxDoc]; // immediate term numbers, or the index into the byte[] representing the last number |
| int[] lastTerm = new int[maxDoc]; // last term we saw for this document |
| var bytes = new sbyte[maxDoc][]; // list of term numbers for the doc (delta encoded vInts) |
| |
| Fields fields = reader.Fields; |
| if (fields == null) |
| { |
| // No terms |
| return; |
| } |
| Terms terms = fields.Terms(Field); |
| if (terms == null) |
| { |
| // No terms |
| return; |
| } |
| |
| TermsEnum te = terms.Iterator(null); |
| BytesRef seekStart = termPrefix != null ? termPrefix : new BytesRef(); |
| //System.out.println("seekStart=" + seekStart.utf8ToString()); |
| if (te.SeekCeil(seekStart) == TermsEnum.SeekStatus.END) |
| { |
| // No terms match |
| return; |
| } |
| |
| // If we need our "term index wrapper", these will be |
| // init'd below: |
| IList<BytesRef> indexedTerms = null; |
| PagedBytes indexedTermsBytes = null; |
| |
| bool testedOrd = false; |
| |
| // we need a minimum of 9 bytes, but round up to 12 since the space would |
| // be wasted with most allocators anyway. |
| var tempArr = new sbyte[12]; |
| |
| // |
| // enumerate all terms, and build an intermediate form of the un-inverted field. |
| // |
| // During this intermediate form, every document has a (potential) byte[] |
| // and the int[maxDoc()] array either contains the termNumber list directly |
| // or the *end* offset of the termNumber list in it's byte array (for faster |
| // appending and faster creation of the final form). |
| // |
| // idea... if things are too large while building, we could do a range of docs |
| // at a time (but it would be a fair amount slower to build) |
| // could also do ranges in parallel to take advantage of multiple CPUs |
| |
| // OPTIONAL: remap the largest df terms to the lowest 128 (single byte) |
| // values. this requires going over the field first to find the most |
| // frequent terms ahead of time. |
| |
| int termNum = 0; |
| DocsEnum = null; |
| |
| // Loop begins with te positioned to first term (we call |
| // seek above): |
| for (; ; ) |
| { |
| BytesRef t = te.Term(); |
| if (t == null || (termPrefix != null && !StringHelper.StartsWith(t, termPrefix))) |
| { |
| break; |
| } |
| //System.out.println("visit term=" + t.utf8ToString() + " " + t + " termNum=" + termNum); |
| |
| if (!testedOrd) |
| { |
| try |
| { |
| OrdBase = (int)te.Ord(); |
| //System.out.println("got ordBase=" + ordBase); |
| } |
| catch (System.NotSupportedException uoe) |
| { |
| // Reader cannot provide ord support, so we wrap |
| // our own support by creating our own terms index: |
| indexedTerms = new List<BytesRef>(); |
| indexedTermsBytes = new PagedBytes(15); |
| //System.out.println("NO ORDS"); |
| } |
| testedOrd = true; |
| } |
| |
| VisitTerm(te, termNum); |
| |
| if (indexedTerms != null && (termNum & IndexIntervalMask) == 0) |
| { |
| // Index this term |
| SizeOfIndexedStrings += t.Length; |
| BytesRef indexedTerm = new BytesRef(); |
| indexedTermsBytes.Copy(t, indexedTerm); |
| // TODO: really should 1) strip off useless suffix, |
| // and 2) use FST not array/PagedBytes |
| indexedTerms.Add(indexedTerm); |
| } |
| |
| int df = te.DocFreq(); |
| if (df <= MaxTermDocFreq) |
| { |
| DocsEnum = te.Docs(liveDocs, DocsEnum, DocsEnum.FLAG_NONE); |
| |
| // dF, but takes deletions into account |
| int actualDF = 0; |
| |
| for (; ; ) |
| { |
| int doc = DocsEnum.NextDoc(); |
| if (doc == DocIdSetIterator.NO_MORE_DOCS) |
| { |
| break; |
| } |
| //System.out.println(" chunk=" + chunk + " docs"); |
| |
| actualDF++; |
| TermInstances++; |
| |
| //System.out.println(" docID=" + doc); |
| // add TNUM_OFFSET to the term number to make room for special reserved values: |
| // 0 (end term) and 1 (index into byte array follows) |
| int delta = termNum - lastTerm[doc] + TNUM_OFFSET; |
| lastTerm[doc] = termNum; |
| int val = index[doc]; |
| |
| if ((val & 0xff) == 1) |
| { |
| // index into byte array (actually the end of |
| // the doc-specific byte[] when building) |
| int pos = (int)((uint)val >> 8); |
| int ilen = VIntSize(delta); |
| var arr = bytes[doc]; |
| int newend = pos + ilen; |
| if (newend > arr.Length) |
| { |
| // We avoid a doubling strategy to lower memory usage. |
| // this faceting method isn't for docs with many terms. |
| // In hotspot, objects have 2 words of overhead, then fields, rounded up to a 64-bit boundary. |
| // TODO: figure out what array lengths we can round up to w/o actually using more memory |
| // (how much space does a byte[] take up? Is data preceded by a 32 bit length only? |
| // It should be safe to round up to the nearest 32 bits in any case. |
| int newLen = (newend + 3) & unchecked((int)0xfffffffc); // 4 byte alignment |
| var newarr = new sbyte[newLen]; |
| Array.Copy(arr, 0, newarr, 0, pos); |
| arr = newarr; |
| bytes[doc] = newarr; |
| } |
| pos = WriteInt(delta, arr, pos); |
| index[doc] = (pos << 8) | 1; // update pointer to end index in byte[] |
| } |
| else |
| { |
| // OK, this int has data in it... find the end (a zero starting byte - not |
| // part of another number, hence not following a byte with the high bit set). |
| int ipos; |
| if (val == 0) |
| { |
| ipos = 0; |
| } |
| else if ((val & 0x0000ff80) == 0) |
| { |
| ipos = 1; |
| } |
| else if ((val & 0x00ff8000) == 0) |
| { |
| ipos = 2; |
| } |
| else if ((val & 0xff800000) == 0) |
| { |
| ipos = 3; |
| } |
| else |
| { |
| ipos = 4; |
| } |
| |
| //System.out.println(" ipos=" + ipos); |
| |
| int endPos = WriteInt(delta, tempArr, ipos); |
| //System.out.println(" endpos=" + endPos); |
| if (endPos <= 4) |
| { |
| //System.out.println(" fits!"); |
| // value will fit in the integer... move bytes back |
| for (int j = ipos; j < endPos; j++) |
| { |
| val |= (tempArr[j] & 0xff) << (j << 3); |
| } |
| index[doc] = val; |
| } |
| else |
| { |
| // value won't fit... move integer into byte[] |
| for (int j = 0; j < ipos; j++) |
| { |
| tempArr[j] = (sbyte)val; |
| val = (int)((uint)val >> 8); |
| } |
| // point at the end index in the byte[] |
| index[doc] = (endPos << 8) | 1; |
| bytes[doc] = tempArr; |
| tempArr = new sbyte[12]; |
| } |
| } |
| } |
| SetActualDocFreq(termNum, actualDF); |
| } |
| |
| termNum++; |
| if (te.Next() == null) |
| { |
| break; |
| } |
| } |
| |
| NumTermsInField = termNum; |
| |
| long midPoint = Environment.TickCount; |
| |
| if (TermInstances == 0) |
| { |
| // we didn't invert anything |
| // lower memory consumption. |
| Tnums = null; |
| } |
| else |
| { |
| this.Index = index; |
| |
| // |
| // transform intermediate form into the final form, building a single byte[] |
| // at a time, and releasing the intermediate byte[]s as we go to avoid |
| // increasing the memory footprint. |
| // |
| |
| for (int pass = 0; pass < 256; pass++) |
| { |
| var target = Tnums[pass]; |
| var pos = 0; // end in target; |
| if (target != null) |
| { |
| pos = target.Length; |
| } |
| else |
| { |
| target = new sbyte[4096]; |
| } |
| |
| // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx |
| // where pp is the pass (which array we are building), and xx is all values. |
| // each pass shares the same byte[] for termNumber lists. |
| for (int docbase = pass << 16; docbase < maxDoc; docbase += (1 << 24)) |
| { |
| int lim = Math.Min(docbase + (1 << 16), maxDoc); |
| for (int doc = docbase; doc < lim; doc++) |
| { |
| //System.out.println(" pass=" + pass + " process docID=" + doc); |
| int val = index[doc]; |
| if ((val & 0xff) == 1) |
| { |
| int len = (int)((uint)val >> 8); |
| //System.out.println(" ptr pos=" + pos); |
| index[doc] = (pos << 8) | 1; // change index to point to start of array |
| if ((pos & 0xff000000) != 0) |
| { |
| // we only have 24 bits for the array index |
| throw new InvalidOperationException("Too many values for UnInvertedField faceting on field " + Field); |
| } |
| var arr = bytes[doc]; |
| /* |
| for(byte b : arr) { |
| //System.out.println(" b=" + Integer.toHexString((int) b)); |
| } |
| */ |
| bytes[doc] = null; // IMPORTANT: allow GC to avoid OOM |
| if (target.Length <= pos + len) |
| { |
| int newlen = target.Length; |
| /// <summary> |
| ///* we don't have to worry about the array getting too large |
| /// since the "pos" param will overflow first (only 24 bits available) |
| /// if ((newlen<<1) <= 0) { |
| /// // overflow... |
| /// newlen = Integer.MAX_VALUE; |
| /// if (newlen <= pos + len) { |
| /// throw new SolrException(400,"Too many terms to uninvert field!"); |
| /// } |
| /// } else { |
| /// while (newlen <= pos + len) newlen<<=1; // doubling strategy |
| /// } |
| /// *** |
| /// </summary> |
| while (newlen <= pos + len) // doubling strategy |
| { |
| newlen <<= 1; |
| } |
| var newtarget = new sbyte[newlen]; |
| Array.Copy(target, 0, newtarget, 0, pos); |
| target = newtarget; |
| } |
| Array.Copy(arr, 0, target, pos, len); |
| pos += len + 1; // skip single byte at end and leave it 0 for terminator |
| } |
| } |
| } |
| |
| // shrink array |
| if (pos < target.Length) |
| { |
| var newtarget = new sbyte[pos]; |
| Array.Copy(target, 0, newtarget, 0, pos); |
| target = newtarget; |
| } |
| |
| Tnums[pass] = target; |
| |
| if ((pass << 16) > maxDoc) |
| { |
| break; |
| } |
| } |
| } |
| if (indexedTerms != null) |
| { |
| IndexedTermsArray = indexedTerms.ToArray(); |
| } |
| |
| long endTime = Environment.TickCount; |
| |
| Total_time = (int)(endTime - startTime); |
| Phase1_time = (int)(midPoint - startTime); |
| } |
| |
| /// <summary> |
| /// Number of bytes to represent an unsigned int as a vint. </summary> |
| private static int VIntSize(int x) |
| { |
| if ((x & (0xffffffff << (7 * 1))) == 0) |
| { |
| return 1; |
| } |
| if ((x & (0xffffffff << (7 * 2))) == 0) |
| { |
| return 2; |
| } |
| if ((x & (0xffffffff << (7 * 3))) == 0) |
| { |
| return 3; |
| } |
| if ((x & (0xffffffff << (7 * 4))) == 0) |
| { |
| return 4; |
| } |
| return 5; |
| } |
| |
| // todo: if we know the size of the vInt already, we could do |
| // a single switch on the size |
| private static int WriteInt(int x, sbyte[] arr, int pos) |
| { |
| var a = ((int)((uint)x >> (7 * 4))); |
| if (a != 0) |
| { |
| arr[pos++] = unchecked((sbyte)(a | 0x80)); |
| } |
| a = ((int)((uint)x >> (7 * 3))); |
| if (a != 0) |
| { |
| arr[pos++] = unchecked((sbyte)(a | 0x80)); |
| } |
| a = ((int)((uint)x >> (7 * 2))); |
| if (a != 0) |
| { |
| arr[pos++] = unchecked((sbyte)(a | 0x80)); |
| } |
| a = ((int)((uint)x >> (7 * 1))); |
| if (a != 0) |
| { |
| arr[pos++] = unchecked((sbyte)(a | 0x80)); |
| } |
| arr[pos++] = (sbyte)(x & 0x7f); |
| return pos; |
| } |
| |
| /* Only used if original IndexReader doesn't implement |
| * ord; in this case we "wrap" our own terms index |
| * around it. */ |
| |
| private sealed class OrdWrappedTermsEnum : TermsEnum |
| { |
| internal bool InstanceFieldsInitialized = false; |
| |
| internal void InitializeInstanceFields() |
| { |
| Ord_Renamed = -OuterInstance.IndexInterval - 1; |
| } |
| |
| private readonly DocTermOrds OuterInstance; |
| |
| internal readonly TermsEnum TermsEnum; |
| internal BytesRef Term_Renamed; |
| internal long Ord_Renamed; // force "real" seek |
| |
| public OrdWrappedTermsEnum(DocTermOrds outerInstance, AtomicReader reader) |
| { |
| this.OuterInstance = outerInstance; |
| |
| if (!InstanceFieldsInitialized) |
| { |
| InitializeInstanceFields(); |
| InstanceFieldsInitialized = true; |
| } |
| Debug.Assert(outerInstance.IndexedTermsArray != null); |
| TermsEnum = reader.Fields.Terms(outerInstance.Field).Iterator(null); |
| } |
| |
| public override IComparer<BytesRef> Comparator |
| { |
| get |
| { |
| return TermsEnum.Comparator; |
| } |
| } |
| |
| public override DocsEnum Docs(Bits liveDocs, DocsEnum reuse, int flags) |
| { |
| return TermsEnum.Docs(liveDocs, reuse, flags); |
| } |
| |
| public override DocsAndPositionsEnum DocsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) |
| { |
| return TermsEnum.DocsAndPositions(liveDocs, reuse, flags); |
| } |
| |
| public override BytesRef Term() |
| { |
| return Term_Renamed; |
| } |
| |
| public override BytesRef Next() |
| { |
| if (++Ord_Renamed < 0) |
| { |
| Ord_Renamed = 0; |
| } |
| if (TermsEnum.Next() == null) |
| { |
| Term_Renamed = null; |
| return null; |
| } |
| return SetTerm(); // this is extra work if we know we are in bounds... |
| } |
| |
| public override int DocFreq() |
| { |
| return TermsEnum.DocFreq(); |
| } |
| |
| public override long TotalTermFreq() |
| { |
| return TermsEnum.TotalTermFreq(); |
| } |
| |
| public override long Ord() |
| { |
| return OuterInstance.OrdBase + Ord_Renamed; |
| } |
| |
| public override SeekStatus SeekCeil(BytesRef target) |
| { |
| // already here |
| if (Term_Renamed != null && Term_Renamed.Equals(target)) |
| { |
| return SeekStatus.FOUND; |
| } |
| |
| int startIdx = OuterInstance.IndexedTermsArray.ToList().BinarySearch(target); |
| |
| if (startIdx >= 0) |
| { |
| // we hit the term exactly... lucky us! |
| TermsEnum.SeekStatus seekStatus = TermsEnum.SeekCeil(target); |
| Debug.Assert(seekStatus == TermsEnum.SeekStatus.FOUND); |
| Ord_Renamed = startIdx << OuterInstance.IndexIntervalBits; |
| SetTerm(); |
| Debug.Assert(Term_Renamed != null); |
| return SeekStatus.FOUND; |
| } |
| |
| // we didn't hit the term exactly |
| startIdx = -startIdx - 1; |
| |
| if (startIdx == 0) |
| { |
| // our target occurs *before* the first term |
| TermsEnum.SeekStatus seekStatus = TermsEnum.SeekCeil(target); |
| Debug.Assert(seekStatus == TermsEnum.SeekStatus.NOT_FOUND); |
| Ord_Renamed = 0; |
| SetTerm(); |
| Debug.Assert(Term_Renamed != null); |
| return SeekStatus.NOT_FOUND; |
| } |
| |
| // back up to the start of the block |
| startIdx--; |
| |
| if ((Ord_Renamed >> OuterInstance.IndexIntervalBits) == startIdx && Term_Renamed != null && Term_Renamed.CompareTo(target) <= 0) |
| { |
| // we are already in the right block and the current term is before the term we want, |
| // so we don't need to seek. |
| } |
| else |
| { |
| // seek to the right block |
| TermsEnum.SeekStatus seekStatus = TermsEnum.SeekCeil(OuterInstance.IndexedTermsArray[startIdx]); |
| Debug.Assert(seekStatus == TermsEnum.SeekStatus.FOUND); |
| Ord_Renamed = startIdx << OuterInstance.IndexIntervalBits; |
| SetTerm(); |
| Debug.Assert(Term_Renamed != null); // should be non-null since it's in the index |
| } |
| |
| while (Term_Renamed != null && Term_Renamed.CompareTo(target) < 0) |
| { |
| Next(); |
| } |
| |
| if (Term_Renamed == null) |
| { |
| return SeekStatus.END; |
| } |
| else if (Term_Renamed.CompareTo(target) == 0) |
| { |
| return SeekStatus.FOUND; |
| } |
| else |
| { |
| return SeekStatus.NOT_FOUND; |
| } |
| } |
| |
| public override void SeekExact(long targetOrd) |
| { |
| int delta = (int)(targetOrd - OuterInstance.OrdBase - Ord_Renamed); |
| //System.out.println(" seek(ord) targetOrd=" + targetOrd + " delta=" + delta + " ord=" + ord + " ii=" + indexInterval); |
| if (delta < 0 || delta > OuterInstance.IndexInterval) |
| { |
| int idx = (int)((long)((ulong)targetOrd >> OuterInstance.IndexIntervalBits)); |
| BytesRef @base = OuterInstance.IndexedTermsArray[idx]; |
| //System.out.println(" do seek term=" + base.utf8ToString()); |
| Ord_Renamed = idx << OuterInstance.IndexIntervalBits; |
| delta = (int)(targetOrd - Ord_Renamed); |
| TermsEnum.SeekStatus seekStatus = TermsEnum.SeekCeil(@base); |
| Debug.Assert(seekStatus == TermsEnum.SeekStatus.FOUND); |
| } |
| else |
| { |
| //System.out.println("seek w/in block"); |
| } |
| |
| while (--delta >= 0) |
| { |
| BytesRef br = TermsEnum.Next(); |
| if (br == null) |
| { |
| Debug.Assert(false); |
| return; |
| } |
| Ord_Renamed++; |
| } |
| |
| SetTerm(); |
| Debug.Assert(Term_Renamed != null); |
| } |
| |
| internal BytesRef SetTerm() |
| { |
| Term_Renamed = TermsEnum.Term(); |
| //System.out.println(" setTerm() term=" + term.utf8ToString() + " vs prefix=" + (prefix == null ? "null" : prefix.utf8ToString())); |
| if (OuterInstance.Prefix != null && !StringHelper.StartsWith(Term_Renamed, OuterInstance.Prefix)) |
| { |
| Term_Renamed = null; |
| } |
| return Term_Renamed; |
| } |
| } |
| |
| /// <summary> |
| /// Returns the term (<seealso cref="BytesRef"/>) corresponding to |
| /// the provided ordinal. |
| /// </summary> |
| public virtual BytesRef LookupTerm(TermsEnum termsEnum, int ord) |
| { |
| termsEnum.SeekExact(ord); |
| return termsEnum.Term(); |
| } |
| |
| /// <summary> |
| /// Returns a SortedSetDocValues view of this instance </summary> |
| public virtual SortedSetDocValues GetIterator(AtomicReader reader) |
| { |
| if (Empty) |
| { |
| return DocValues.EMPTY_SORTED_SET; |
| } |
| else |
| { |
| return new Iterator(this, reader); |
| } |
| } |
| |
| private class Iterator : SortedSetDocValues |
| { |
| private readonly DocTermOrds OuterInstance; |
| |
| private readonly AtomicReader Reader; |
| private readonly TermsEnum Te; // used internally for lookupOrd() and lookupTerm() |
| |
| // currently we read 5 at a time (using the logic of the old iterator) |
| private readonly int[] Buffer = new int[5]; |
| |
| private int BufferUpto; |
| private int BufferLength; |
| |
| private int Tnum; |
| private int Upto; |
| private sbyte[] Arr; |
| |
| internal Iterator(DocTermOrds outerInstance, AtomicReader reader) |
| { |
| this.OuterInstance = outerInstance; |
| this.Reader = reader; |
| this.Te = TermsEnum(); |
| } |
| |
| public override long NextOrd() |
| { |
| while (BufferUpto == BufferLength) |
| { |
| if (BufferLength < Buffer.Length) |
| { |
| return NO_MORE_ORDS; |
| } |
| else |
| { |
| BufferLength = Read(Buffer); |
| BufferUpto = 0; |
| } |
| } |
| return Buffer[BufferUpto++]; |
| } |
| |
| /// <summary> |
| /// Buffer must be at least 5 ints long. Returns number |
| /// of term ords placed into buffer; if this count is |
| /// less than buffer.length then that is the end. |
| /// </summary> |
| internal virtual int Read(int[] buffer) |
| { |
| int bufferUpto = 0; |
| if (Arr == null) |
| { |
| // code is inlined into upto |
| //System.out.println("inlined"); |
| int code = Upto; |
| int delta = 0; |
| for (; ; ) |
| { |
| delta = (delta << 7) | (code & 0x7f); |
| if ((code & 0x80) == 0) |
| { |
| if (delta == 0) |
| { |
| break; |
| } |
| Tnum += delta - TNUM_OFFSET; |
| buffer[bufferUpto++] = OuterInstance.OrdBase + Tnum; |
| //System.out.println(" tnum=" + tnum); |
| delta = 0; |
| } |
| code = (int)((uint)code >> 8); |
| } |
| } |
| else |
| { |
| // code is a pointer |
| for (; ; ) |
| { |
| int delta = 0; |
| for (; ; ) |
| { |
| sbyte b = Arr[Upto++]; |
| delta = (delta << 7) | (b & 0x7f); |
| //System.out.println(" cycle: upto=" + upto + " delta=" + delta + " b=" + b); |
| if ((b & 0x80) == 0) |
| { |
| break; |
| } |
| } |
| //System.out.println(" delta=" + delta); |
| if (delta == 0) |
| { |
| break; |
| } |
| Tnum += delta - TNUM_OFFSET; |
| //System.out.println(" tnum=" + tnum); |
| buffer[bufferUpto++] = OuterInstance.OrdBase + Tnum; |
| if (bufferUpto == buffer.Length) |
| { |
| break; |
| } |
| } |
| } |
| |
| return bufferUpto; |
| } |
| |
| public override int Document |
| { |
| set |
| { |
| Tnum = 0; |
| int code = OuterInstance.Index[value]; |
| if ((code & 0xff) == 1) |
| { |
| // a pointer |
| Upto = (int)((uint)code >> 8); |
| //System.out.println(" pointer! upto=" + upto); |
| int whichArray = ((int)((uint)value >> 16)) & 0xff; |
| Arr = OuterInstance.Tnums[whichArray]; |
| } |
| else |
| { |
| //System.out.println(" inline!"); |
| Arr = null; |
| Upto = code; |
| } |
| BufferUpto = 0; |
| BufferLength = Read(Buffer); |
| } |
| } |
| |
| public override void LookupOrd(long ord, BytesRef result) |
| { |
| BytesRef @ref = null; |
| try |
| { |
| @ref = OuterInstance.LookupTerm(Te, (int)ord); |
| } |
| catch (System.IO.IOException e) |
| { |
| throw new Exception(e.Message, e); |
| } |
| result.Bytes = @ref.Bytes; |
| result.Offset = @ref.Offset; |
| result.Length = @ref.Length; |
| } |
| |
| public override long ValueCount |
| { |
| get |
| { |
| return OuterInstance.NumTerms(); |
| } |
| } |
| |
| public override long LookupTerm(BytesRef key) |
| { |
| try |
| { |
| if (Te.SeekCeil(key) == SeekStatus.FOUND) |
| { |
| return Te.Ord(); |
| } |
| else |
| { |
| return -Te.Ord() - 1; |
| } |
| } |
| catch (System.IO.IOException e) |
| { |
| throw new Exception(e.Message, e); |
| } |
| } |
| |
| public override TermsEnum TermsEnum() |
| { |
| try |
| { |
| return OuterInstance.GetOrdTermsEnum(Reader); |
| } |
| catch (System.IO.IOException e) |
| { |
| throw new Exception(); |
| } |
| } |
| } |
| } |
| } |