using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Index;
using Lucene.Net.Index.Extensions;
using Lucene.Net.Store;
using J2N.Threading.Atomic;
using Lucene.Net.Support;
using Lucene.Net.Util;
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.IO;
using System.Reflection;
namespace Lucene.Net.Facet.Taxonomy.Directory
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using AtomicReader = Lucene.Net.Index.AtomicReader;
using AtomicReaderContext = Lucene.Net.Index.AtomicReaderContext;
using LockObtainFailedException = Lucene.Net.Store.LockObtainFailedException; // javadocs
using BytesRef = Lucene.Net.Util.BytesRef;
using Cl2oTaxonomyWriterCache = Lucene.Net.Facet.Taxonomy.WriterCache.Cl2oTaxonomyWriterCache;
using Directory = Lucene.Net.Store.Directory;
using CorruptIndexException = Lucene.Net.Index.CorruptIndexException; // javadocs
using DirectoryReader = Lucene.Net.Index.DirectoryReader;
using DocsEnum = Lucene.Net.Index.DocsEnum;
using Document = Lucene.Net.Documents.Document;
using Field = Lucene.Net.Documents.Field;
using FieldType = Lucene.Net.Documents.FieldType;
using IndexWriter = Lucene.Net.Index.IndexWriter;
using IndexWriterConfig = Lucene.Net.Index.IndexWriterConfig;
using LogByteSizeMergePolicy = Lucene.Net.Index.LogByteSizeMergePolicy;
using OpenMode = Lucene.Net.Index.OpenMode;
using ReaderManager = Lucene.Net.Index.ReaderManager;
using SegmentInfos = Lucene.Net.Index.SegmentInfos;
using StringField = Lucene.Net.Documents.StringField;
using ITaxonomyWriterCache = Lucene.Net.Facet.Taxonomy.WriterCache.ITaxonomyWriterCache;
using Terms = Lucene.Net.Index.Terms;
using TermsEnum = Lucene.Net.Index.TermsEnum;
using TextField = Lucene.Net.Documents.TextField;
using TieredMergePolicy = Lucene.Net.Index.TieredMergePolicy;
using TokenStream = Lucene.Net.Analysis.TokenStream;
/// <summary>
/// <see cref="ITaxonomyWriter"/> which uses a <see cref="Store.Directory"/> to store the taxonomy
/// information on disk, and keeps an additional in-memory cache of some or all
/// categories.
/// <para>
/// In addition to the permanently-stored information in the <see cref="Store.Directory"/>,
/// efficiency dictates that we also keep an in-memory cache of <b>recently
/// seen</b> or <b>all</b> categories, so that we do not need to go back to disk
/// for every category addition to see which ordinal this category already has,
/// if any. A <see cref="ITaxonomyWriterCache"/> object determines the specific caching
/// algorithm used.
/// </para>
/// <para>
/// This class offers some hooks for extending classes to control the
/// <see cref="IndexWriter"/> instance that is used. See <see cref="OpenIndexWriter"/>.
///
/// @lucene.experimental
/// </para>
/// </summary>
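/// <example>
/// A minimal usage sketch (the directory type and the category are
/// illustrative; any <see cref="Store.Directory"/> implementation will do):
/// <code>
/// var taxoDir = new RAMDirectory();
/// using (var taxoWriter = new DirectoryTaxonomyWriter(taxoDir, OpenMode.CREATE))
/// {
///     int ordinal = taxoWriter.AddCategory(new FacetLabel("Author", "Mark Twain"));
///     taxoWriter.Commit(); // Dispose() also commits; shown here for clarity
/// }
/// </code>
/// </example>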
public class DirectoryTaxonomyWriter : ITaxonomyWriter
{
/// <summary>
/// Property name of user commit data that contains the index epoch. The epoch
/// changes whenever the taxonomy is recreated (i.e. opened with
/// <see cref="OpenMode.CREATE"/>.
/// <para>
/// Applications should not use this property in their commit data because it
/// will be overridden by this taxonomy writer.
/// </para>
/// </summary>
public const string INDEX_EPOCH = "index.epoch";
private readonly Directory dir;
private readonly IndexWriter indexWriter;
private readonly ITaxonomyWriterCache cache;
private readonly AtomicInt32 cacheMisses = new AtomicInt32(0);
// Records the taxonomy index epoch, updated on replaceTaxonomy as well.
private long indexEpoch;
private SinglePositionTokenStream parentStream = new SinglePositionTokenStream(Consts.PAYLOAD_PARENT);
private Field parentStreamField;
private Field fullPathField;
private int cacheMissesUntilFill = 11;
private bool shouldFillCache = true;
// Even though lazily initialized, this is not volatile so that access to it
// is faster; we keep a volatile boolean (initializedReaderManager) instead.
private ReaderManager readerManager;
private volatile bool initializedReaderManager = false;
private volatile bool shouldRefreshReaderManager;
/// <summary>
/// We call the cache "complete" if we know that every category in our
/// taxonomy is in the cache. When the cache is <b>not</b> complete, and
/// we can't find a category in the cache, we still need to look for it
/// in the on-disk index; therefore, when the cache is not complete, we
/// need to open a "reader" to the taxonomy index.
/// The cache becomes incomplete if it was never filled with the existing
/// categories, or if a Put() to the cache ever returned true (meaning
/// that some of the cached data was cleared).
/// </summary>
private volatile bool cacheIsComplete;
private volatile bool isClosed = false;
private volatile TaxonomyIndexArrays taxoArrays;
private volatile int nextID;
/// <summary>
/// Reads the commit data from a <see cref="Store.Directory"/>. </summary>
private static IDictionary<string, string> ReadCommitData(Directory dir)
{
SegmentInfos infos = new SegmentInfos();
infos.Read(dir);
return infos.UserData;
}
/// <summary>
/// Forcibly unlocks the taxonomy in the named directory.
/// <para/>
/// Caution: this should only be used by failure recovery code, when it is
/// known that no other process nor thread is in fact currently accessing
/// this taxonomy.
/// <para/>
/// This method is unnecessary if your <see cref="Store.Directory"/> uses a
/// <see cref="NativeFSLockFactory"/> instead of the default
/// <see cref="SimpleFSLockFactory"/>. When the "native" lock is used, a lock
/// does not stay behind forever when the process using it dies.
/// </summary>
public static void Unlock(Directory directory)
{
IndexWriter.Unlock(directory);
}
/// <summary>
/// Construct a Taxonomy writer.
/// </summary>
/// <param name="directory">
/// The <see cref="Store.Directory"/> in which to store the taxonomy. Note that
/// the taxonomy is written directly to that directory (not to a
/// subdirectory of it). </param>
/// <param name="openMode">
/// Specifies how to open a taxonomy for writing: <see cref="OpenMode.APPEND"/>
/// means open an existing index for append (failing if the index does
/// not yet exist). <see cref="OpenMode.CREATE"/> means create a new index (first
/// deleting the old one if it already existed).
/// <see cref="OpenMode.CREATE_OR_APPEND"/> appends to an existing index if there
/// is one, otherwise it creates a new index. </param>
/// <param name="cache">
/// A <see cref="ITaxonomyWriterCache"/> implementation which determines
/// the in-memory caching policy. See for example
/// <see cref="WriterCache.LruTaxonomyWriterCache"/> and <see cref="Cl2oTaxonomyWriterCache"/>.
/// If null or missing, <see cref="DefaultTaxonomyWriterCache()"/> is used. </param>
/// <exception cref="CorruptIndexException">
/// if the taxonomy is corrupted. </exception>
/// <exception cref="LockObtainFailedException">
/// if the taxonomy is locked by another writer. If it is known
/// that no other concurrent writer is active, the lock might
/// have been left around by an old dead process, and should be
/// removed using <see cref="Unlock(Directory)"/>. </exception>
/// <exception cref="IOException">
/// if another error occurred. </exception>
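/// <example>
/// A sketch of supplying a non-default cache (the LRU capacity of 4096 is
/// illustrative):
/// <code>
/// var cache = new LruTaxonomyWriterCache(4096);
/// var writer = new DirectoryTaxonomyWriter(taxoDir, OpenMode.CREATE_OR_APPEND, cache);
/// </code>
/// </example>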
public DirectoryTaxonomyWriter(Directory directory, OpenMode openMode,
ITaxonomyWriterCache cache)
{
dir = directory;
IndexWriterConfig config = CreateIndexWriterConfig(openMode);
indexWriter = OpenIndexWriter(dir, config);
// verify (to some extent) that merge policy in effect would preserve category docids
if (indexWriter != null)
{
Debug.Assert(!(indexWriter.Config.MergePolicy is TieredMergePolicy), "for preserving category docids, merging non-adjacent segments is not allowed");
}
// after we opened the writer, and the index is locked, it's safe to check
// the commit data and read the index epoch
openMode = config.OpenMode;
if (!DirectoryReader.IndexExists(directory))
{
indexEpoch = 1;
}
else
{
string epochStr = null;
IDictionary<string, string> commitData = ReadCommitData(directory);
if (commitData != null && commitData.TryGetValue(INDEX_EPOCH, out string value))
{
epochStr = value;
}
// no commit data, or no epoch in it means an old taxonomy, so set its epoch to 1, for lack
// of a better value.
indexEpoch = epochStr == null ? 1 : Convert.ToInt64(epochStr, 16);
}
if (openMode == OpenMode.CREATE)
{
++indexEpoch;
}
FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
ft.OmitNorms = true;
parentStreamField = new Field(Consts.FIELD_PAYLOADS, parentStream, ft);
fullPathField = new StringField(Consts.FULL, "", Field.Store.YES);
if (indexWriter == null)
return;
nextID = indexWriter.MaxDoc;
if (cache == null)
{
cache = DefaultTaxonomyWriterCache();
}
this.cache = cache;
if (nextID == 0)
{
cacheIsComplete = true;
// Make sure that the taxonomy always contains the root category
// with category id 0.
AddCategory(new FacetLabel());
}
else
{
// There are some categories on the disk, which we have not yet
// read into the cache, and therefore the cache is incomplete.
// We choose not to read all the categories into the cache now,
// to avoid terrible performance when a taxonomy index is opened
// to add just a single category. We will do it later, after we
// notice a few cache misses.
cacheIsComplete = false;
}
}
/// <summary>
/// Open internal index writer, which contains the taxonomy data.
/// <para/>
/// Extensions may provide their own <see cref="IndexWriter"/> implementation or instance.
/// <para/>
/// <b>NOTE:</b> the instance this method returns will be disposed when
/// <see cref="Dispose()"/> is called.
/// <para/>
/// <b>NOTE:</b> the merge policy in effect must not merge non-adjacent segments. See
/// comment in <see cref="CreateIndexWriterConfig(OpenMode)"/> for the logic behind this.
/// </summary>
/// <seealso cref="CreateIndexWriterConfig(OpenMode)"/>
/// <param name="directory">
/// the <see cref="Store.Directory"/> on top of which an <see cref="IndexWriter"/>
/// should be opened. </param>
/// <param name="config">
/// configuration for the internal index writer. </param>
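/// <example>
/// A sketch of overriding this hook in a subclass (the body shown simply
/// delegates; a real extension might wrap or instrument the writer):
/// <code>
/// protected override IndexWriter OpenIndexWriter(Directory directory, IndexWriterConfig config)
/// {
///     // inspect or adjust the config here before the writer is created
///     return base.OpenIndexWriter(directory, config);
/// }
/// </code>
/// </example>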
protected virtual IndexWriter OpenIndexWriter(Directory directory, IndexWriterConfig config)
{
return new IndexWriter(directory, config);
}
/// <summary>
/// Create the <see cref="IndexWriterConfig"/> that would be used for opening the internal index writer.
/// <para/>
/// Extensions can configure the <see cref="IndexWriter"/> as they see fit,
/// including setting a <see cref="Index.MergeScheduler"/>, or
/// <see cref="Index.IndexDeletionPolicy"/>, different RAM size
/// etc.
/// <para/>
/// <b>NOTE:</b> internal docids of the configured index must not be altered.
/// To that end, categories are never deleted from the taxonomy index.
/// In addition, the merge policy in effect must not merge non-adjacent segments.
/// </summary>
/// <seealso cref="OpenIndexWriter(Directory, IndexWriterConfig)"/>
/// <param name="openMode"> see <see cref="OpenMode"/> </param>
protected virtual IndexWriterConfig CreateIndexWriterConfig(OpenMode openMode)
{
// TODO: should we use a more optimized Codec, e.g. Pulsing (or write custom)?
// The taxonomy has a unique structure, where each term is associated with one document
// :Post-Release-Update-Version.LUCENE_XY:
// Make sure we use a MergePolicy which always merges adjacent segments and thus
// keeps the doc IDs ordered as well (this is crucial for the taxonomy index).
return (new IndexWriterConfig(LuceneVersion.LUCENE_48, null)).SetOpenMode(openMode).SetMergePolicy(new LogByteSizeMergePolicy());
}
/// <summary>
/// Opens a <see cref="ReaderManager"/> from the internal <see cref="IndexWriter"/>.
/// </summary>
private void InitReaderManager()
{
if (!initializedReaderManager)
{
lock (this)
{
// verify that the taxo-writer hasn't been closed on us.
EnsureOpen();
if (!initializedReaderManager)
{
readerManager = new ReaderManager(indexWriter, false);
shouldRefreshReaderManager = false;
initializedReaderManager = true;
}
}
}
}
/// <summary>
/// Creates a new instance with a default cache as defined by
/// <see cref="DefaultTaxonomyWriterCache()"/>.
/// </summary>
public DirectoryTaxonomyWriter(Directory directory, OpenMode openMode = OpenMode.CREATE_OR_APPEND)
: this(directory, openMode, DefaultTaxonomyWriterCache())
{
}
/// <summary>
/// Defines the default <see cref="ITaxonomyWriterCache"/> to use in constructors
/// which do not specify one.
/// <para>
/// The current default is <see cref="Cl2oTaxonomyWriterCache"/> constructed
/// with the parameters (1024, 0.15f, 3), i.e., the entire taxonomy is
/// cached in memory while building it.
/// </para>
/// </summary>
public static ITaxonomyWriterCache DefaultTaxonomyWriterCache()
{
return new Cl2oTaxonomyWriterCache(1024, 0.15f, 3);
}
/// <summary>
/// Frees used resources as well as closes the underlying <see cref="IndexWriter"/>,
/// which commits whatever changes made to it to the underlying
/// <see cref="Store.Directory"/>.
/// </summary>
public void Dispose()
{
lock (this)
{
if (!isClosed)
{
Commit();
DoClose();
}
}
}
private void DoClose()
{
indexWriter.Dispose();
isClosed = true;
CloseResources();
}
/// <summary>
/// A hook for extending classes to close additional resources that were used.
/// The default implementation closes the <see cref="Index.IndexReader"/> as well as the
/// <see cref="ITaxonomyWriterCache"/> instances that were used.
/// <para>
/// <b>NOTE:</b> if you override this method, you should include a
/// <c>base.CloseResources()</c> call in your implementation.
/// </para>
/// </summary>
protected virtual void CloseResources()
{
lock (this)
{
if (initializedReaderManager)
{
readerManager.Dispose();
readerManager = null;
initializedReaderManager = false;
}
if (cache != null)
{
cache.Dispose();
}
}
}
/// <summary>
/// Look up the given category in the cache and/or the on-disk storage,
/// returning the category's ordinal, or a negative number in case the
/// category does not yet exist in the taxonomy.
/// </summary>
protected virtual int FindCategory(FacetLabel categoryPath)
{
lock (this)
{
// If we can find the category in the cache, or we know the cache is
// complete, we can return the response directly from it
int res = cache.Get(categoryPath);
if (res >= 0 || cacheIsComplete)
{
return res;
}
cacheMisses.IncrementAndGet();
// After a few cache misses, it makes sense to read all the categories
// from disk and into the cache. The reason not to do this on the first
// cache miss (or even when opening the writer) is that it will
// significantly slow down the case when a taxonomy is opened just to
// add one category. The idea of only spending a long time on reading
// after enough time was spent on cache misses is known as an "online
// algorithm".
PerhapsFillCache();
res = cache.Get(categoryPath);
if (res >= 0 || cacheIsComplete)
{
// if after filling the cache from the info on disk, the category is in it
// or the cache is complete, return whatever cache.get returned.
return res;
}
// if we get here, it means the category is not in the cache, and it is not
// complete, and therefore we must look for the category on disk.
// We need to get an answer from the on-disk index.
InitReaderManager();
int doc = -1;
DirectoryReader reader = readerManager.Acquire();
try
{
BytesRef catTerm = new BytesRef(FacetsConfig.PathToString(categoryPath.Components, categoryPath.Length));
TermsEnum termsEnum = null; // reuse
DocsEnum docs = null; // reuse
foreach (AtomicReaderContext ctx in reader.Leaves)
{
Terms terms = ctx.AtomicReader.GetTerms(Consts.FULL);
if (terms != null)
{
termsEnum = terms.GetIterator(termsEnum);
if (termsEnum.SeekExact(catTerm))
{
// liveDocs=null because the taxonomy has no deletes
docs = termsEnum.Docs(null, docs, 0); // freqs not required
// if the term was found, we know it has exactly one document.
doc = docs.NextDoc() + ctx.DocBase;
break;
}
}
}
}
finally
{
readerManager.Release(reader);
}
if (doc > 0)
{
AddToCache(categoryPath, doc);
}
return doc;
}
}
public virtual int AddCategory(FacetLabel categoryPath)
{
EnsureOpen();
// check the cache outside the synchronized block. this results in better
// concurrency when categories are there.
int res = cache.Get(categoryPath);
if (res < 0)
{
// the category is not in the cache - following code cannot be executed in parallel.
lock (this)
{
res = FindCategory(categoryPath);
if (res < 0)
{
// This is a new category, and we need to insert it into the index
// (and the cache). Actually, we might also need to add some of
// the category's ancestors before we can add the category itself
// (while keeping the invariant that a parent is always added to
// the taxonomy before its child). internalAddCategory() does all
// this recursively
res = InternalAddCategory(categoryPath);
}
}
}
return res;
}
/// <summary>
/// Add a new category into the index (and the cache), and return its new
/// ordinal.
/// <para>
/// Actually, we might also need to add some of the category's ancestors
/// before we can add the category itself (while keeping the invariant that a
/// parent is always added to the taxonomy before its child). We do this by
/// recursion.
/// </para>
/// </summary>
private int InternalAddCategory(FacetLabel cp)
{
// Find our parent's ordinal (recursively adding the parent category
// to the taxonomy if it's not already there). Then add the parent
// ordinal as payloads (rather than a stored field; payloads can be
// more efficiently read into memory in bulk by LuceneTaxonomyReader)
int parent;
if (cp.Length > 1)
{
FacetLabel parentPath = cp.Subpath(cp.Length - 1);
parent = FindCategory(parentPath);
if (parent < 0)
{
parent = InternalAddCategory(parentPath);
}
}
else if (cp.Length == 1)
{
parent = TaxonomyReader.ROOT_ORDINAL;
}
else
{
parent = TaxonomyReader.INVALID_ORDINAL;
}
int id = AddCategoryDocument(cp, parent);
return id;
}
/// <summary>
/// Verifies that this instance wasn't closed, or throws
/// <see cref="ObjectDisposedException"/> if it is.
/// </summary>
protected internal void EnsureOpen()
{
if (isClosed)
{
throw new ObjectDisposedException(this.GetType().GetTypeInfo().FullName, "The taxonomy writer has already been closed");
}
}
/// <summary>
/// Note that the methods calling <see cref="AddCategoryDocument"/> are synchornized, so
/// this method is effectively synchronized as well.
/// </summary>
private int AddCategoryDocument(FacetLabel categoryPath, int parent)
{
// Before Lucene 2.9, position increments >=0 were supported, so we
// added 1 to parent to allow the parent -1 (the parent of the root).
// Unfortunately, starting with Lucene 2.9, after LUCENE-1542, this is
// no longer enough, since 0 is not encoded consistently either (see
// comment in SinglePositionTokenStream). But because we must be
// backward-compatible with existing indexes, we can't just fix what
// we write here (e.g., to write parent+2), and need to do a workaround
// in the reader (which knows that anyway only category 0 has a parent
// -1).
parentStream.Set(Math.Max(parent + 1, 1));
Document d = new Document();
d.Add(parentStreamField);
fullPathField.SetStringValue(FacetsConfig.PathToString(categoryPath.Components, categoryPath.Length));
d.Add(fullPathField);
// Note that we do not pass an Analyzer here because the fields that are
// added to the Document are untokenized or contain their own TokenStream.
// Therefore the IndexWriter's Analyzer has no effect.
indexWriter.AddDocument(d);
int id = nextID++;
// added a category document, mark that ReaderManager is not up-to-date
shouldRefreshReaderManager = true;
// also add to the parent array
taxoArrays = GetTaxoArrays().Add(id, parent);
// NOTE: this line must be executed last, or else the cache gets updated
// before the parents array (LUCENE-4596)
AddToCache(categoryPath, id);
return id;
}
private class SinglePositionTokenStream : TokenStream
{
private ICharTermAttribute termAtt;
private IPositionIncrementAttribute posIncrAtt;
private bool returned;
private int val;
private readonly string word;
public SinglePositionTokenStream(string word)
{
termAtt = AddAttribute<ICharTermAttribute>();
posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
this.word = word;
returned = true;
}
/// <summary>
/// Set the value we want to keep, as the position increment.
/// Note that when TermPositions.NextPosition() is later used to
/// retrieve this value, val-1 will be returned, not val.
/// <para>
/// IMPORTANT NOTE: Before Lucene 2.9, val>=0 were safe (for val==0,
/// the retrieved position would be -1). But starting with Lucene 2.9,
/// this unfortunately changed, and only val>0 are safe. val=0 can
/// still be used, but don't count on the value you retrieve later
/// (it could be 0 or -1, depending on circumstances or versions).
/// This change is described in Lucene's JIRA: LUCENE-1542.
/// </para>
/// </summary>
public virtual void Set(int val)
{
this.val = val;
returned = false;
}
public sealed override bool IncrementToken()
{
if (returned)
{
return false;
}
ClearAttributes();
posIncrAtt.PositionIncrement = val;
termAtt.SetEmpty();
termAtt.Append(word);
returned = true;
return true;
}
}
private void AddToCache(FacetLabel categoryPath, int id)
{
if (cache.Put(categoryPath, id))
{
// If cache.put() returned true, it means the cache was limited in
// size, became full, and parts of it had to be evicted. It is
// possible that a relatively-new category that isn't yet visible
// to our 'reader' was evicted, and therefore we must now refresh
// the reader.
RefreshReaderManager();
cacheIsComplete = false;
}
}
private void RefreshReaderManager()
{
lock (this)
{
// this method is synchronized since it cannot happen concurrently with
// addCategoryDocument -- when this method returns, we must know that the
// reader manager's state is current. also, it sets shouldRefresh to false,
// and this cannot overlap with addCatDoc too.
// NOTE: since this method is sync'ed, it can call maybeRefresh, instead of
// maybeRefreshBlocking. If ever this is changed, make sure to change the
// call too.
if (shouldRefreshReaderManager && initializedReaderManager)
{
readerManager.MaybeRefresh();
shouldRefreshReaderManager = false;
}
}
}
public virtual void Commit()
{
lock (this)
{
EnsureOpen();
// LUCENE-4972: if we always call setCommitData, we create empty commits
string epochStr = null;
indexWriter.CommitData.TryGetValue(INDEX_EPOCH, out epochStr);
if (epochStr == null || Convert.ToInt64(epochStr, 16) != indexEpoch)
{
indexWriter.SetCommitData(CombinedCommitData(indexWriter.CommitData));
}
indexWriter.Commit();
}
}
/// <summary>
/// Combine original user data with the taxonomy epoch.
/// </summary>
private IDictionary<string, string> CombinedCommitData(IDictionary<string, string> commitData)
{
IDictionary<string, string> m = new Dictionary<string, string>();
if (commitData != null)
{
m.PutAll(commitData);
}
m[INDEX_EPOCH] = Convert.ToString(indexEpoch, 16);
return m;
}
public virtual void SetCommitData(IDictionary<string, string> commitUserData)
{
indexWriter.SetCommitData(CombinedCommitData(commitUserData));
}
public virtual IDictionary<string, string> CommitData
{
get
{
return CombinedCommitData(indexWriter.CommitData);
}
}
/// <summary>
/// Prepares most of the work needed for a two-phase commit.
/// See <see cref="IndexWriter.PrepareCommit"/>.
/// </summary>
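/// <example>
/// A sketch of a two-phase commit across a search index and its taxonomy
/// (writer names are illustrative); both writers prepare before either commits:
/// <code>
/// indexWriter.PrepareCommit();
/// taxoWriter.PrepareCommit();
/// indexWriter.Commit();
/// taxoWriter.Commit();
/// </code>
/// </example>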
public virtual void PrepareCommit()
{
lock (this)
{
EnsureOpen();
// LUCENE-4972: if we always call setCommitData, we create empty commits
string epochStr;
if (!indexWriter.CommitData.TryGetValue(INDEX_EPOCH, out epochStr)
|| epochStr == null
|| Convert.ToInt64(epochStr, 16) != indexEpoch)
{
indexWriter.SetCommitData(CombinedCommitData(indexWriter.CommitData));
}
indexWriter.PrepareCommit();
}
}
public virtual int Count
{
get
{
EnsureOpen();
return nextID;
}
}
/// <summary>
/// Set the number of cache misses before an attempt is made to read the entire
/// taxonomy into the in-memory cache.
/// <para>
/// This taxonomy writer holds an in-memory cache of recently seen categories
/// to speed up operation. On each cache-miss, the on-disk index needs to be
/// consulted. When an existing taxonomy is opened, a lot of slow disk reads
/// like that are needed until the cache is filled, so it is more efficient to
/// read the entire taxonomy into memory at once. We do this complete read
/// after a certain number (defined by this method) of cache misses.
/// </para>
/// <para>
/// If the number is set to <c>0</c>, the entire taxonomy is read into the
/// cache on first use, without fetching individual categories first.
/// </para>
/// <para>
/// NOTE: it is assumed that this method is called immediately after the
/// taxonomy writer has been created.
/// </para>
/// </summary>
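/// <example>
/// A sketch (per the note above, call this right after construction):
/// <code>
/// var writer = new DirectoryTaxonomyWriter(taxoDir);
/// writer.SetCacheMissesUntilFill(0); // read the entire taxonomy on first use
/// </code>
/// </example>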
public virtual void SetCacheMissesUntilFill(int i)
{
EnsureOpen();
cacheMissesUntilFill = i;
}
// we need to guarantee that if several threads call this concurrently, only
// one executes it, and after it returns, the cache is updated and is either
// complete or not.
private void PerhapsFillCache()
{
lock (this)
{
if (cacheMisses < cacheMissesUntilFill)
{
return;
}
if (!shouldFillCache)
{
// we already filled the cache once, there's no need to re-fill it
return;
}
shouldFillCache = false;
InitReaderManager();
bool aborted = false;
DirectoryReader reader = readerManager.Acquire();
try
{
TermsEnum termsEnum = null;
DocsEnum docsEnum = null;
foreach (AtomicReaderContext ctx in reader.Leaves)
{
Terms terms = ctx.AtomicReader.GetTerms(Consts.FULL);
if (terms != null) // cannot really happen, but be on the safe side
{
termsEnum = terms.GetIterator(termsEnum);
while (termsEnum.Next() != null)
{
if (!cache.IsFull)
{
BytesRef t = termsEnum.Term;
// Since we guarantee uniqueness of categories, each term has exactly
// one document. Also, since we do not allow removing categories (and
// hence documents), there are no deletions in the index. Therefore, it
// is sufficient to call next(), and then doc(), exactly once with no
// 'validation' checks.
FacetLabel cp = new FacetLabel(FacetsConfig.StringToPath(t.Utf8ToString()));
docsEnum = termsEnum.Docs(null, docsEnum, DocsFlags.NONE);
bool res = cache.Put(cp, docsEnum.NextDoc() + ctx.DocBase);
Debug.Assert(!res, "entries should not have been evicted from the cache");
}
else
{
// the cache is full and the next put() will evict entries from it, therefore abort the iteration.
aborted = true;
break;
}
}
}
if (aborted)
{
break;
}
}
}
finally
{
readerManager.Release(reader);
}
cacheIsComplete = !aborted;
if (cacheIsComplete)
{
lock (this)
{
// everything is in the cache, so no need to keep readerManager open.
// this block is executed in a sync block so that it works well with
// initReaderManager called in parallel.
readerManager.Dispose();
readerManager = null;
initializedReaderManager = false;
}
}
}
}
private TaxonomyIndexArrays GetTaxoArrays()
{
if (taxoArrays == null)
{
lock (this)
{
if (taxoArrays == null)
{
InitReaderManager();
DirectoryReader reader = readerManager.Acquire();
try
{
// according to Java Concurrency, this might perform better on some
// JVMs, since the object initialization doesn't happen on the
// volatile member.
TaxonomyIndexArrays tmpArrays = new TaxonomyIndexArrays(reader);
taxoArrays = tmpArrays;
}
finally
{
readerManager.Release(reader);
}
}
}
}
return taxoArrays;
}
public virtual int GetParent(int ordinal)
{
EnsureOpen();
// Note: the following if() just enforces that a user can never ask
// for the parent of a nonexistent category - even if the parent array
// was allocated bigger than it really needs to be.
if (ordinal >= nextID)
{
throw new System.IndexOutOfRangeException("requested ordinal is bigger than the largest ordinal in the taxonomy");
}
int[] parents = GetTaxoArrays().Parents;
Debug.Assert(ordinal < parents.Length, "requested ordinal (" + ordinal + "); parents.length (" + parents.Length + ") !");
return parents[ordinal];
}
/// <summary>
/// Takes the categories from the given taxonomy directory, and adds the
/// missing ones to this taxonomy. Additionally, it fills the given
/// <see cref="IOrdinalMap"/> with a mapping from the original ordinal to the new
/// ordinal.
/// </summary>
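/// <example>
/// A sketch of merging another taxonomy into this one, keeping the ordinal
/// remapping in memory (<c>otherTaxoDir</c> is illustrative):
/// <code>
/// var map = new MemoryOrdinalMap();
/// writer.AddTaxonomy(otherTaxoDir, map);
/// int[] oldToNew = map.GetMap(); // oldToNew[origOrdinal] == ordinal in this taxonomy
/// </code>
/// </example>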
public virtual void AddTaxonomy(Directory taxoDir, IOrdinalMap map)
{
EnsureOpen();
DirectoryReader r = DirectoryReader.Open(taxoDir);
try
{
int size = r.NumDocs;
IOrdinalMap ordinalMap = map;
ordinalMap.SetSize(size);
int @base = 0;
TermsEnum te = null;
DocsEnum docs = null;
foreach (AtomicReaderContext ctx in r.Leaves)
{
AtomicReader ar = ctx.AtomicReader;
Terms terms = ar.GetTerms(Consts.FULL);
te = terms.GetIterator(te);
while (te.Next() != null)
{
FacetLabel cp = new FacetLabel(FacetsConfig.StringToPath(te.Term.Utf8ToString()));
int ordinal = AddCategory(cp);
docs = te.Docs(null, docs, DocsFlags.NONE);
ordinalMap.AddMapping(docs.NextDoc() + @base, ordinal);
}
@base += ar.MaxDoc; // no deletions, so we're ok
}
ordinalMap.AddDone();
}
finally
{
r.Dispose();
}
}
/// <summary>
/// Mapping from old ordinals to new ordinals, used when merging indexes
/// with separate taxonomies.
/// <para/>
/// <see cref="AddMapping"/> merges one or more taxonomies into the given taxonomy
/// (this). An <see cref="IOrdinalMap"/> is filled for each of the added taxonomies,
/// containing the new ordinal (in the merged taxonomy) of each of the
/// categories in the old taxonomy.
/// <para/>
/// There exist two implementations of <see cref="IOrdinalMap"/>: <see cref="MemoryOrdinalMap"/> and
/// <see cref="DiskOrdinalMap"/>. As their names suggest, the former keeps the map in
/// memory and the latter in a temporary disk file. Because these maps will
/// later be needed one by one (to remap the counting lists), not all at the
/// same time, it is recommended to put the first taxonomy's map in memory,
/// and all the rest on disk (later to be automatically read into memory one
/// by one, when needed).
/// </summary>
public interface IOrdinalMap
{
/// <summary>
/// Set the size of the map. This MUST be called before <see cref="AddMapping"/>.
/// It is assumed (but not verified) that <see cref="AddMapping"/> will then be
/// called exactly 'size' times, with different <c>origOrdinals</c> between 0
/// and size - 1.
/// </summary>
void SetSize(int taxonomySize);
/// <summary>
/// Record a mapping. </summary>
void AddMapping(int origOrdinal, int newOrdinal);
/// <summary>
/// Call <see cref="AddDone()"/> to say that all <see cref="AddMapping"/> have been done.
/// In some implementations this might free some resources.
/// </summary>
void AddDone();
/// <summary>
/// Return the map from the taxonomy's original (consecutive) ordinals
/// to the new taxonomy's ordinals. If the map has to be read from disk
/// and ordered appropriately, this is done when <see cref="GetMap()"/> is called.
/// <see cref="GetMap()"/> should only be called once, and only when the map is actually
/// needed. Calling it will also free all resources that the map might
/// be holding (such as temporary disk space), other than the returned int[].
/// </summary>
int[] GetMap();
}
/// <summary>
/// <see cref="IOrdinalMap"/> maintained in memory
/// </summary>
public sealed class MemoryOrdinalMap : IOrdinalMap
{
internal int[] map;
/// <summary>
/// Sole constructor.
/// </summary>
public MemoryOrdinalMap()
{
map = new int[] { };
}
public void SetSize(int taxonomySize)
{
map = new int[taxonomySize];
}
public void AddMapping(int origOrdinal, int newOrdinal)
{
if (map.Length - 1 >= origOrdinal)
{
map[origOrdinal] = newOrdinal;
}
else
{
Array.Resize(ref map, origOrdinal + 1);
map[origOrdinal] = newOrdinal;
}
}
public void AddDone() // nothing to do
{
}
public int[] GetMap()
{
return (int[])map.Clone(); // LUCENENET specific: Since this is clearly not meant to be written to, we are cloning the array https://msdn.microsoft.com/en-us/library/0fss9skc.aspx
}
}
/// <summary>
/// <see cref="IOrdinalMap"/> maintained on file system
/// </summary>
public sealed class DiskOrdinalMap : IOrdinalMap
{
internal string tmpfile;
internal OutputStreamDataOutput @out;
/// <summary>
/// Sole constructor.
/// </summary>
public DiskOrdinalMap(string tmpfile)
{
this.tmpfile = tmpfile;
var outfs = new FileStream(tmpfile, FileMode.OpenOrCreate, FileAccess.Write);
@out = new OutputStreamDataOutput(outfs);
}
public void AddMapping(int origOrdinal, int newOrdinal)
{
@out.WriteInt32(origOrdinal);
@out.WriteInt32(newOrdinal);
}
public void SetSize(int taxonomySize)
{
@out.WriteInt32(taxonomySize);
}
public void AddDone()
{
if (@out != null)
{
@out.Dispose();
@out = null;
}
}
int[] map = null;
public int[] GetMap()
{
if (map != null)
{
return map;
}
AddDone(); // in case this wasn't previously called
var ifs = new FileStream(tmpfile, FileMode.OpenOrCreate, FileAccess.Read);
var @in = new InputStreamDataInput(ifs);
map = new int[@in.ReadInt32()];
// NOTE: The current code assumes here that the map is complete,
// i.e., every ordinal gets one and only one value. Otherwise,
// we may run into an EOF here, or conversely, not read everything.
for (int i = 0; i < map.Length; i++)
{
int origordinal = @in.ReadInt32();
int newordinal = @in.ReadInt32();
map[origordinal] = newordinal;
}
@in.Dispose();
// Delete the temporary file, which is no longer needed.
if (File.Exists(tmpfile))
{
File.Delete(tmpfile);
}
return map;
}
}
/// <summary>
/// Rolls back changes to the taxonomy writer and closes the instance. Following
/// this method the instance becomes unusable (calling any of its API methods
/// will yield an <see cref="ObjectDisposedException"/>).
/// </summary>
public virtual void Rollback()
{
lock (this)
{
EnsureOpen();
indexWriter.Rollback();
DoClose();
}
}
/// <summary>
/// Replaces the current taxonomy with the given one. This method should
/// generally be called in conjunction with
/// <see cref="IndexWriter.AddIndexes(Directory[])"/> to replace both the taxonomy
/// as well as the search index content.
/// </summary>
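/// <example>
/// A sketch of replacing both the search index and its taxonomy (writer and
/// directory names are illustrative, as is the exact ordering):
/// <code>
/// indexWriter.DeleteAll();
/// indexWriter.AddIndexes(newContentDir);
/// taxoWriter.ReplaceTaxonomy(newTaxoDir);
/// indexWriter.Commit();
/// taxoWriter.Commit();
/// </code>
/// </example>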
public virtual void ReplaceTaxonomy(Directory taxoDir)
{
lock (this)
{
// replace the taxonomy by doing IW optimized operations
indexWriter.DeleteAll();
indexWriter.AddIndexes(taxoDir);
shouldRefreshReaderManager = true;
InitReaderManager(); // ensure that it's initialized
RefreshReaderManager();
nextID = indexWriter.MaxDoc;
taxoArrays = null; // must nullify so that it's re-computed next time it's needed
// need to clear the cache, so that addCategory won't accidentally return
// old categories that are in the cache.
cache.Clear();
cacheIsComplete = false;
shouldFillCache = true;
cacheMisses.Value = 0;
// update indexEpoch, as replacing the taxonomy is just like recreating it
++indexEpoch;
}
}
/// <summary>
/// Returns the <see cref="Store.Directory"/> of this taxonomy writer.
/// </summary>
public virtual Directory Directory
{
get
{
return dir;
}
}
/// <summary>
/// Used by <see cref="DirectoryTaxonomyReader"/> to support NRT.
/// <para>
/// <b>NOTE:</b> you should not use the obtained <see cref="IndexWriter"/> in any
/// way other than opening an <see cref="Index.IndexReader"/> on it; otherwise, the
/// taxonomy index may become corrupt!
/// </para>
/// </summary>
internal IndexWriter InternalIndexWriter
{
get
{
return indexWriter;
}
}
/// <summary>
/// Expert: returns current index epoch, if this is a
/// near-real-time reader. Used by
/// <see cref="DirectoryTaxonomyReader"/> to support NRT.
///
/// @lucene.internal
/// </summary>
public long TaxonomyEpoch
{
get
{
return indexEpoch;
}
}
}
}