using J2N.Collections.Generic.Extensions;
using Lucene.Net.Diagnostics;
using Lucene.Net.Store;
using Lucene.Net.Util;
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Globalization;
using System.IO;
using Console = Lucene.Net.Util.SystemConsole;
namespace Lucene.Net.Index
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// This tool splits the input index into multiple equal parts. The method employed
/// here uses <see cref="IndexWriter.AddIndexes(IndexReader[])"/>, where the input data
/// comes from the input index with deletes artificially applied to the document
/// IDs that fall outside the selected partition.
/// <para>Note 1: Deletes are only applied to a buffered list of deleted docs and
/// don't affect the source index - this tool also works with read-only indexes.
/// </para>
/// <para>Note 2: the disadvantage of this tool is that the source index needs to be
/// read as many times as there are parts to be created; hence the "multi-pass" in
/// the name of this tool.
///
/// </para>
/// <para><b>NOTE</b>: this tool is unaware of documents added
/// atomically via <see cref="IndexWriter.AddDocuments(IEnumerable{IEnumerable{IIndexableField}}, Analysis.Analyzer)"/> or
/// <see cref="IndexWriter.UpdateDocuments(Term, IEnumerable{IEnumerable{IIndexableField}}, Analysis.Analyzer)"/>, which means it can easily
/// break up such document groups.
/// </para>
/// </summary>
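/// <example>
/// A minimal usage sketch (the index paths here are illustrative placeholders):
/// <code>
/// using (IndexReader reader = DirectoryReader.Open(FSDirectory.Open(new DirectoryInfo("source-index"))))
/// {
///     Store.Directory[] outputs = new Store.Directory[]
///     {
///         FSDirectory.Open(new DirectoryInfo("part-0")),
///         FSDirectory.Open(new DirectoryInfo("part-1"))
///     };
///     try
///     {
///         new MultiPassIndexSplitter().Split(LuceneVersion.LUCENE_48, reader, outputs, seq: true);
///     }
///     finally
///     {
///         IOUtils.Dispose(outputs);
///     }
/// }
/// </code>
/// </example>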
public class MultiPassIndexSplitter
{
/// <summary>
/// Splits the source index into multiple parts. </summary>
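/// <remarks>
/// For example (illustrative numbers), splitting an index with <c>MaxDoc = 10</c> into
/// 3 parts assigns documents 0-2, 3-5, and 6-9 to the parts when <paramref name="seq"/>
/// is <c>true</c> (the last part absorbs the integer-division remainder), and documents
/// {0,3,6,9}, {1,4,7}, and {2,5,8} when it is <c>false</c>.
/// </remarks>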
/// <param name="version">lucene compatibility version</param>
/// <param name="in"> source index, can have deletions, can have
/// multiple segments (or multiple readers). </param>
/// <param name="outputs"> list of directories where the output parts will be stored. </param>
/// <param name="seq"> if true, then the source index will be split into equal
/// increasing ranges of document id-s. If false, source document id-s will be
/// assigned in a deterministic round-robin fashion to one of the output splits. </param>
/// <exception cref="IOException"> If there is a low-level I/O error </exception>
public virtual void Split(LuceneVersion version, IndexReader @in, Store.Directory[] outputs, bool seq)
{
if (outputs == null || outputs.Length < 2)
{
throw new IOException("Invalid number of outputs.");
}
if (@in == null || @in.NumDocs < 2)
{
throw new IOException("Not enough documents for splitting");
}
int numParts = outputs.Length;
// wrap a potentially read-only input
// this way we don't have to preserve original deletions, because neither
// DeleteDocument(int) nor UndeleteAll() is applied to the wrapped input index.
FakeDeleteIndexReader input = new FakeDeleteIndexReader(@in);
int maxDoc = input.MaxDoc;
int partLen = maxDoc / numParts;
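// integer division: the remainder (maxDoc % numParts) is swept into the last part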
for (int i = 0; i < numParts; i++)
{
input.UndeleteAll();
if (seq) // sequential range
{
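// part i keeps the document range [lo, hi) = [i * partLen, (i + 1) * partLen);
// the last part also keeps everything above hi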
int lo = partLen * i;
int hi = lo + partLen;
// below range
for (int j = 0; j < lo; j++)
{
input.DeleteDocument(j);
}
// above range - the last part collects all IDs that remain due to
// integer division truncation
if (i < numParts - 1)
{
for (int j = hi; j < maxDoc; j++)
{
input.DeleteDocument(j);
}
}
}
else
{
// round-robin
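// part i keeps exactly the documents with j % numParts == i; adding numParts
// before taking % keeps the left operand non-negative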
for (int j = 0; j < maxDoc; j++)
{
if ((j + numParts - i) % numParts != 0)
{
input.DeleteDocument(j);
}
}
}
using (IndexWriter w = new IndexWriter(outputs[i],
new IndexWriterConfig(version, null) { OpenMode = OpenMode.CREATE }))
{
Console.Error.WriteLine("Writing part " + (i + 1) + " ...");
// pass the subreaders directly, as our wrapper's NumDocs/HasDeletions are not up-to-date
IList<IndexReader> sr = input.GetSequentialSubReaders();
w.AddIndexes(sr.ToArray()); // TODO: maybe take List<IR> here?
}
}
Console.Error.WriteLine("Done.");
}
public static void Main(string[] args)
{
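// An illustrative invocation (paths are placeholders):
//   MultiPassIndexSplitter -out /tmp/parts -num 3 -seq /path/to/index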
if (args.Length < 5)
{
// LUCENENET specific - our wrapper console shows the correct usage
throw new ArgumentException();
//Console.Error.WriteLine("Usage: MultiPassIndexSplitter -out <outputDir> -num <numParts> [-seq] <inputIndex1> [<inputIndex2 ...]");
//Console.Error.WriteLine("\tinputIndex\tpath to input index, multiple values are ok");
//Console.Error.WriteLine("\t-out ouputDir\tpath to output directory to contain partial indexes");
//Console.Error.WriteLine("\t-num numParts\tnumber of parts to produce");
//Console.Error.WriteLine("\t-seq\tsequential docid-range split (default is round-robin)");
//Environment.Exit(-1);
}
List<IndexReader> indexes = new List<IndexReader>();
try
{
string outDir = null;
int numParts = -1;
bool seq = false;
for (int i = 0; i < args.Length; i++)
{
if (args[i].Equals("-out", StringComparison.Ordinal))
{
outDir = args[++i];
}
else if (args[i].Equals("-num", StringComparison.Ordinal))
{
numParts = Convert.ToInt32(args[++i], CultureInfo.InvariantCulture);
}
else if (args[i].Equals("-seq", StringComparison.Ordinal))
{
seq = true;
}
else
{
DirectoryInfo file = new DirectoryInfo(args[i]);
if (!file.Exists)
{
Console.Error.WriteLine("Invalid input path - skipping: " + file);
continue;
}
using (Store.Directory dir = FSDirectory.Open(new DirectoryInfo(args[i])))
{
try
{
if (!DirectoryReader.IndexExists(dir))
{
Console.Error.WriteLine("Invalid input index - skipping: " + file);
continue;
}
}
catch (Exception)
{
Console.Error.WriteLine("Invalid input index - skipping: " + file);
continue;
}
indexes.Add(DirectoryReader.Open(dir));
}
}
}
if (outDir == null)
{
throw new Exception("Required argument missing: -out outputDir");
}
if (numParts < 2)
{
throw new Exception("Invalid value of required argument: -num numParts");
}
if (indexes.Count == 0)
{
throw new Exception("No input indexes to process");
}
DirectoryInfo @out = new DirectoryInfo(outDir);
@out.Create();
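// re-check with a fresh DirectoryInfo: the Exists property is cached at construction time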
if (!new DirectoryInfo(outDir).Exists)
{
throw new Exception("Can't create output directory: " + @out);
}
Store.Directory[] dirs = new Store.Directory[numParts];
try
{
for (int i = 0; i < numParts; i++)
{
dirs[i] = FSDirectory.Open(new DirectoryInfo(Path.Combine(@out.FullName, "part-" + i)));
}
MultiPassIndexSplitter splitter = new MultiPassIndexSplitter();
IndexReader input;
if (indexes.Count == 1)
{
input = indexes[0];
}
else
{
input = new MultiReader(indexes.ToArray());
}
#pragma warning disable 612, 618
splitter.Split(LuceneVersion.LUCENE_CURRENT, input, dirs, seq);
#pragma warning restore 612, 618
}
finally
{
// LUCENENET specific - properly dispose directories to prevent resource leaks
IOUtils.Dispose(dirs);
}
}
finally
{
// LUCENENET specific - properly dispose index readers to prevent resource leaks
IOUtils.Dispose(indexes);
}
}
/// <summary>
/// This class emulates deletions on the underlying index by wrapping each
/// subreader in a <see cref="FakeDeleteAtomicIndexReader"/>.
/// </summary>
private sealed class FakeDeleteIndexReader : BaseCompositeReader<FakeDeleteAtomicIndexReader>
{
public FakeDeleteIndexReader(IndexReader reader)
: base(InitSubReaders(reader))
{
}
internal static FakeDeleteAtomicIndexReader[] InitSubReaders(IndexReader reader)
{
IList<AtomicReaderContext> leaves = reader.Leaves;
FakeDeleteAtomicIndexReader[] subs = new FakeDeleteAtomicIndexReader[leaves.Count];
int i = 0;
foreach (AtomicReaderContext ctx in leaves)
{
subs[i++] = new FakeDeleteAtomicIndexReader(ctx.AtomicReader);
}
return subs;
}
public void DeleteDocument(int docID)
{
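// translate the composite-wide docID into a subreader ordinal plus a subreader-local docID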
int i = ReaderIndex(docID);
((FakeDeleteAtomicIndexReader)GetSequentialSubReaders()[i]).DeleteDocument(docID - ReaderBase(i));
}
public void UndeleteAll()
{
foreach (FakeDeleteAtomicIndexReader r in GetSequentialSubReaders())
{
r.UndeleteAll();
}
}
protected internal override void DoClose()
{
}
// no need to override NumDocs/HasDeletions,
// as we pass the subreaders directly to IndexWriter.AddIndexes().
}
private sealed class FakeDeleteAtomicIndexReader : FilterAtomicReader
{
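// bitset of live (non-deleted) documents; rebuilt by UndeleteAll() and cleared per-doc by DeleteDocument()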
internal FixedBitSet liveDocs;
public FakeDeleteAtomicIndexReader(AtomicReader reader) : base(reader)
{
UndeleteAll(); // initialize main bitset
}
public override int NumDocs => liveDocs.Cardinality();
public void UndeleteAll()
{
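// rebuild liveDocs so it mirrors the wrapped reader's current deletions (or marks all docs live if there are none)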
int maxDoc = m_input.MaxDoc;
liveDocs = new FixedBitSet(m_input.MaxDoc);
if (m_input.HasDeletions)
{
IBits oldLiveDocs = m_input.LiveDocs;
if (Debugging.AssertsEnabled) Debugging.Assert(oldLiveDocs != null);
// this loop is a little inefficient, as IBits has no NextSetBit():
for (int i = 0; i < maxDoc; i++)
{
if (oldLiveDocs.Get(i))
{
liveDocs.Set(i);
}
}
}
else
{
// mark all docs as valid
liveDocs.Set(0, maxDoc);
}
}
public void DeleteDocument(int n)
{
liveDocs.Clear(n);
}
public override IBits LiveDocs => liveDocs;
}
}
}