| using J2N.Collections.Generic.Extensions; |
| using Lucene.Net.Diagnostics; |
| using Lucene.Net.Store; |
| using Lucene.Net.Util; |
| using System; |
| using System.Collections.Generic; |
| using System.Diagnostics; |
| using System.Globalization; |
| using System.IO; |
| using Console = Lucene.Net.Util.SystemConsole; |
| |
| namespace Lucene.Net.Index |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| /// <summary> |
| /// This tool splits input index into multiple equal parts. The method employed |
| /// here uses <see cref="IndexWriter.AddIndexes(IndexReader[])"/> where the input data |
| /// comes from the input index with artificially applied deletes to the document |
| /// id-s that fall outside the selected partition. |
| /// <para>Note 1: Deletes are only applied to a buffered list of deleted docs and |
| /// don't affect the source index - this tool works also with read-only indexes. |
| /// </para> |
| /// <para>Note 2: the disadvantage of this tool is that source index needs to be |
| /// read as many times as there are parts to be created, hence the name of this |
| /// tool. |
| /// |
| /// </para> |
| /// <para><b>NOTE</b>: this tool is unaware of documents added |
| /// atomically via <see cref="IndexWriter.AddDocuments(IEnumerable{IEnumerable{IIndexableField}}, Analysis.Analyzer)"/> or |
| /// <see cref="IndexWriter.UpdateDocuments(Term, IEnumerable{IEnumerable{IIndexableField}}, Analysis.Analyzer)"/>, which means it can easily |
| /// break up such document groups. |
| /// </para> |
| /// </summary> |
| public class MultiPassIndexSplitter |
| { |
| |
| /// <summary> |
| /// Split source index into multiple parts. </summary> |
| /// <param name="version">lucene compatibility version</param> |
| /// <param name="in"> source index, can have deletions, can have |
| /// multiple segments (or multiple readers). </param> |
| /// <param name="outputs"> list of directories where the output parts will be stored. </param> |
| /// <param name="seq"> if true, then the source index will be split into equal |
| /// increasing ranges of document id-s. If false, source document id-s will be |
| /// assigned in a deterministic round-robin fashion to one of the output splits. </param> |
| /// <exception cref="IOException"> If there is a low-level I/O error </exception> |
| public virtual void Split(LuceneVersion version, IndexReader @in, Store.Directory[] outputs, bool seq) |
| { |
| if (outputs == null || outputs.Length < 2) |
| { |
| throw new IOException("Invalid number of outputs."); |
| } |
| if (@in == null || @in.NumDocs < 2) |
| { |
| throw new IOException("Not enough documents for splitting"); |
| } |
| int numParts = outputs.Length; |
| // wrap a potentially read-only input |
| // this way we don't have to preserve original deletions because neither |
| // deleteDocument(int) or undeleteAll() is applied to the wrapped input index. |
| FakeDeleteIndexReader input = new FakeDeleteIndexReader(@in); |
| int maxDoc = input.MaxDoc; |
| int partLen = maxDoc / numParts; |
| for (int i = 0; i < numParts; i++) |
| { |
| input.UndeleteAll(); |
| if (seq) // sequential range |
| { |
| int lo = partLen * i; |
| int hi = lo + partLen; |
| // below range |
| for (int j = 0; j < lo; j++) |
| { |
| input.DeleteDocument(j); |
| } |
| // above range - last part collects all id-s that remained due to |
| // integer rounding errors |
| if (i < numParts - 1) |
| { |
| for (int j = hi; j < maxDoc; j++) |
| { |
| input.DeleteDocument(j); |
| } |
| } |
| } |
| else |
| { |
| // round-robin |
| for (int j = 0; j < maxDoc; j++) |
| { |
| if ((j + numParts - i) % numParts != 0) |
| { |
| input.DeleteDocument(j); |
| } |
| } |
| } |
| using IndexWriter w = new IndexWriter(outputs[i], |
| new IndexWriterConfig(version, null) { OpenMode = OpenMode.CREATE }); |
| Console.Error.WriteLine("Writing part " + (i + 1) + " ..."); |
| // pass the subreaders directly, as our wrapper's numDocs/hasDeletetions are not up-to-date |
| IList<IndexReader> sr = input.GetSequentialSubReaders(); |
| w.AddIndexes(sr.ToArray()); // TODO: maybe take List<IR> here? |
| } |
| Console.Error.WriteLine("Done."); |
| } |
| |
| public static void Main(string[] args) |
| { |
| if (args.Length < 5) |
| { |
| // LUCENENET specific - our wrapper console shows the correct usage |
| throw new ArgumentException(); |
| //Console.Error.WriteLine("Usage: MultiPassIndexSplitter -out <outputDir> -num <numParts> [-seq] <inputIndex1> [<inputIndex2 ...]"); |
| //Console.Error.WriteLine("\tinputIndex\tpath to input index, multiple values are ok"); |
| //Console.Error.WriteLine("\t-out ouputDir\tpath to output directory to contain partial indexes"); |
| //Console.Error.WriteLine("\t-num numParts\tnumber of parts to produce"); |
| //Console.Error.WriteLine("\t-seq\tsequential docid-range split (default is round-robin)"); |
| //Environment.Exit(-1); |
| } |
| List<IndexReader> indexes = new List<IndexReader>(); |
| try |
| { |
| string outDir = null; |
| int numParts = -1; |
| bool seq = false; |
| for (int i = 0; i < args.Length; i++) |
| { |
| if (args[i].Equals("-out", StringComparison.Ordinal)) |
| { |
| outDir = args[++i]; |
| } |
| else if (args[i].Equals("-num", StringComparison.Ordinal)) |
| { |
| numParts = Convert.ToInt32(args[++i], CultureInfo.InvariantCulture); |
| } |
| else if (args[i].Equals("-seq", StringComparison.Ordinal)) |
| { |
| seq = true; |
| } |
| else |
| { |
| DirectoryInfo file = new DirectoryInfo(args[i]); |
| if (!file.Exists) |
| { |
| Console.Error.WriteLine("Invalid input path - skipping: " + file); |
| continue; |
| } |
| using Store.Directory dir = FSDirectory.Open(new DirectoryInfo(args[i])); |
| try |
| { |
| if (!DirectoryReader.IndexExists(dir)) |
| { |
| Console.Error.WriteLine("Invalid input index - skipping: " + file); |
| continue; |
| } |
| } |
| catch (Exception) |
| { |
| Console.Error.WriteLine("Invalid input index - skipping: " + file); |
| continue; |
| } |
| indexes.Add(DirectoryReader.Open(dir)); |
| } |
| } |
| if (outDir == null) |
| { |
| throw new Exception("Required argument missing: -out outputDir"); |
| } |
| if (numParts < 2) |
| { |
| throw new Exception("Invalid value of required argument: -num numParts"); |
| } |
| if (indexes.Count == 0) |
| { |
| throw new Exception("No input indexes to process"); |
| } |
| DirectoryInfo @out = new DirectoryInfo(outDir); |
| @out.Create(); |
| if (!new DirectoryInfo(outDir).Exists) |
| { |
| throw new Exception("Can't create output directory: " + @out); |
| } |
| Store.Directory[] dirs = new Store.Directory[numParts]; |
| try |
| { |
| for (int i = 0; i < numParts; i++) |
| { |
| dirs[i] = FSDirectory.Open(new DirectoryInfo(Path.Combine(@out.FullName, "part-" + i))); |
| } |
| MultiPassIndexSplitter splitter = new MultiPassIndexSplitter(); |
| IndexReader input; |
| if (indexes.Count == 1) |
| { |
| input = indexes[0]; |
| } |
| else |
| { |
| input = new MultiReader(indexes.ToArray()); |
| } |
| #pragma warning disable 612, 618 |
| splitter.Split(LuceneVersion.LUCENE_CURRENT, input, dirs, seq); |
| #pragma warning restore 612, 618 |
| } |
| finally |
| { |
| // LUCENENET specific - properly dispose directories to prevent resource leaks |
| IOUtils.Dispose(dirs); |
| } |
| } |
| finally |
| { |
| // LUCENENET specific - properly dispose index readers to prevent resource leaks |
| IOUtils.Dispose(indexes); |
| } |
| } |
| |
| /// <summary> |
| /// This class emulates deletions on the underlying index. |
| /// </summary> |
| private sealed class FakeDeleteIndexReader : BaseCompositeReader<FakeDeleteAtomicIndexReader> |
| { |
| |
| public FakeDeleteIndexReader(IndexReader reader) |
| : base(InitSubReaders(reader)) |
| { |
| } |
| |
| internal static FakeDeleteAtomicIndexReader[] InitSubReaders(IndexReader reader) |
| { |
| IList<AtomicReaderContext> leaves = reader.Leaves; |
| FakeDeleteAtomicIndexReader[] subs = new FakeDeleteAtomicIndexReader[leaves.Count]; |
| int i = 0; |
| foreach (AtomicReaderContext ctx in leaves) |
| { |
| subs[i++] = new FakeDeleteAtomicIndexReader(ctx.AtomicReader); |
| } |
| return subs; |
| } |
| |
| public void DeleteDocument(int docID) |
| { |
| int i = ReaderIndex(docID); |
| ((FakeDeleteAtomicIndexReader)GetSequentialSubReaders()[i]).DeleteDocument(docID - ReaderBase(i)); |
| } |
| |
| public void UndeleteAll() |
| { |
| foreach (FakeDeleteAtomicIndexReader r in GetSequentialSubReaders()) |
| { |
| r.UndeleteAll(); |
| } |
| } |
| |
| protected internal override void DoClose() |
| { |
| } |
| |
| // no need to override numDocs/hasDeletions, |
| // as we pass the subreaders directly to IW.addIndexes(). |
| } |
| |
| private sealed class FakeDeleteAtomicIndexReader : FilterAtomicReader |
| { |
| internal FixedBitSet liveDocs; |
| |
| public FakeDeleteAtomicIndexReader(AtomicReader reader) : base(reader) |
| { |
| UndeleteAll(); // initialize main bitset |
| } |
| |
| public override int NumDocs => liveDocs.Cardinality(); |
| |
| public void UndeleteAll() |
| { |
| int maxDoc = m_input.MaxDoc; |
| liveDocs = new FixedBitSet(m_input.MaxDoc); |
| if (m_input.HasDeletions) |
| { |
| IBits oldLiveDocs = m_input.LiveDocs; |
| if (Debugging.AssertsEnabled) Debugging.Assert(oldLiveDocs != null); |
| // this loop is a little bit ineffective, as Bits has no nextSetBit(): |
| for (int i = 0; i < maxDoc; i++) |
| { |
| if (oldLiveDocs.Get(i)) |
| { |
| liveDocs.Set(i); |
| } |
| } |
| } |
| else |
| { |
| // mark all docs as valid |
| liveDocs.Set(0, maxDoc); |
| } |
| } |
| |
| public void DeleteDocument(int n) |
| { |
| liveDocs.Clear(n); |
| } |
| |
| public override IBits LiveDocs => liveDocs; |
| } |
| } |
| } |