using J2N.Text;
using Lucene.Net.Support;
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Text;
using JCG = J2N.Collections.Generic;
namespace Lucene.Net.Index
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Merges segments of approximately equal size, subject to
/// an allowed number of segments per tier. This is similar
/// to <see cref="LogByteSizeMergePolicy"/>, except this merge
/// policy is able to merge non-adjacent segments, and
/// separates how many segments are merged at once (<see cref="MaxMergeAtOnce"/>)
/// from how many segments are allowed
/// per tier (<see cref="SegmentsPerTier"/>). This merge
/// policy also does not over-merge (i.e. cascade merges).
///
/// <para/>For normal merging, this policy first computes a
/// "budget" of how many segments are allowed to be in the
/// index. If the index is over-budget, then the policy
/// sorts segments by decreasing size (pro-rating by percent
/// deletes), and then finds the least-cost merge. Merge
/// cost is measured by a combination of the "skew" of the
/// merge (size of largest segment divided by smallest segment),
/// total merge size and percent deletes reclaimed,
/// so that merges with lower skew, smaller size,
/// and more reclaimed deletes are favored.
///
/// <para/>If a merge will produce a segment that's larger than
/// <see cref="MaxMergedSegmentMB"/>, then the policy will
/// merge fewer segments (down to 1 at once, if that one has
/// deletions) to keep the segment size under budget.
///
/// <para/><b>NOTE</b>: This policy freely merges non-adjacent
/// segments; if this is a problem, use <see cref="LogMergePolicy"/>.
///
/// <para/><b>NOTE</b>: This policy always merges by byte size
/// of the segments, always pro-rates by percent deletes,
/// and does not apply any maximum segment size during
/// forceMerge (unlike <see cref="LogByteSizeMergePolicy"/>).
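///
/// <para/>A minimal configuration sketch (assumes an existing <c>analyzer</c> and
/// <c>directory</c>; the values shown simply restate the defaults described above):
/// <code>
/// var mergePolicy = new TieredMergePolicy
/// {
///     MaxMergeAtOnce = 10,           // segments merged at once during normal merging
///     SegmentsPerTier = 10.0,        // allowed segments per tier
///     MaxMergedSegmentMB = 5 * 1024  // keep merged segments under ~5 GB
/// };
/// var config = new IndexWriterConfig(LuceneVersion.LUCENE_48, analyzer)
/// {
///     MergePolicy = mergePolicy
/// };
/// using (var writer = new IndexWriter(directory, config))
/// {
///     // add or update documents as usual; merges follow this policy
/// }
/// </code>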
/// <para/>
/// @lucene.experimental
/// </summary>
// TODO
// - we could try to take into account whether a large
// merge is already running (under CMS) and then bias
// ourselves towards picking smaller merges if so (or,
// maybe CMS should do so)
public class TieredMergePolicy : MergePolicy
{
/// <summary>
/// Default noCFSRatio. If a merge's size is >= 10% of
/// the index, then we disable compound file for it.
/// </summary>
/// <seealso cref="MergePolicy.NoCFSRatio"/>
public new static readonly double DEFAULT_NO_CFS_RATIO = 0.1;
private int maxMergeAtOnce = 10;
private long maxMergedSegmentBytes = 5 * 1024 * 1024 * 1024L;
private int maxMergeAtOnceExplicit = 30;
private long floorSegmentBytes = 2 * 1024 * 1024L;
private double segsPerTier = 10.0;
private double forceMergeDeletesPctAllowed = 10.0;
private double reclaimDeletesWeight = 2.0;
/// <summary>
/// Sole constructor, setting all settings to their
/// defaults.
/// </summary>
public TieredMergePolicy()
: base(DEFAULT_NO_CFS_RATIO, MergePolicy.DEFAULT_MAX_CFS_SEGMENT_SIZE)
{
}
/// <summary>
/// Gets or sets maximum number of segments to be merged at a time
/// during "normal" merging. For explicit merging (eg,
/// <see cref="IndexWriter.ForceMerge(int)"/> or
/// <see cref="IndexWriter.ForceMergeDeletes()"/> was called), see
/// <see cref="MaxMergeAtOnceExplicit"/>. Default is 10.
/// </summary>
public virtual int MaxMergeAtOnce
{
get
{
return maxMergeAtOnce;
}
set
{
if (value < 2)
{
throw new System.ArgumentException("maxMergeAtOnce must be > 1 (got " + value + ")");
}
maxMergeAtOnce = value;
}
}
// TODO: should addIndexes do explicit merging, too? And,
// if user calls IW.maybeMerge "explicitly"
/// <summary>
/// Gets or sets maximum number of segments to be merged at a time,
/// during <see cref="IndexWriter.ForceMerge(int)"/> or
/// <see cref="IndexWriter.ForceMergeDeletes()"/>. Default is 30.
/// </summary>
public virtual int MaxMergeAtOnceExplicit
{
get
{
return maxMergeAtOnceExplicit;
}
set
{
if (value < 2)
{
throw new System.ArgumentException("maxMergeAtOnceExplicit must be > 1 (got " + value + ")");
}
maxMergeAtOnceExplicit = value;
}
}
/// <summary>
/// Gets or sets maximum sized segment to produce during
/// normal merging. This setting is approximate: the
/// estimate of the merged segment size is made by summing
/// sizes of to-be-merged segments (compensating for
/// percent deleted docs). Default is 5 GB.
/// </summary>
public virtual double MaxMergedSegmentMB
{
get
{
return maxMergedSegmentBytes / 1024 / 1024.0;
}
set
{
if (value < 0.0)
{
throw new System.ArgumentException("maxMergedSegmentMB must be >=0 (got " + value.ToString("0.0") + ")");
}
value *= 1024 * 1024;
maxMergedSegmentBytes = (value > long.MaxValue) ? long.MaxValue : (long)value;
}
}
/// <summary>
/// Controls how aggressively merges that reclaim more
/// deletions are favored. Higher values will more
/// aggressively target merges that reclaim deletions, but
/// be careful not to go so high that way too much merging
/// takes place; a value of 3.0 is probably nearly too
/// high. A value of 0.0 means deletions don't impact
/// merge selection.
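/// <para/>Internally, <see cref="Score(IList{SegmentCommitInfo}, bool, long)"/> applies this
/// weight as <c>mergeScore *= Math.Pow(nonDelRatio, ReclaimDeletesWeight)</c>, where
/// <c>nonDelRatio</c> is the estimated fraction of non-deleted bytes in the candidate merge.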
/// </summary>
public virtual double ReclaimDeletesWeight
{
get
{
return reclaimDeletesWeight;
}
set
{
if (value < 0.0)
{
throw new System.ArgumentException("reclaimDeletesWeight must be >= 0.0 (got " + value.ToString("0.0") + ")");
}
reclaimDeletesWeight = value;
}
}
/// <summary>
/// Segments smaller than this are "rounded up" to this
/// size, i.e. treated as equal (floor) size for merge
/// selection. This is to prevent frequent flushing of
/// tiny segments from allowing a long tail in the index.
/// Default is 2 MB.
/// </summary>
public virtual double FloorSegmentMB
{
get
{
return floorSegmentBytes / (1024 * 1024.0);
}
set
{
if (value <= 0.0)
{
throw new System.ArgumentException("floorSegmentMB must be > 0.0 (got " + value.ToString("0.0") + ")");
}
value *= 1024 * 1024;
floorSegmentBytes = (value > long.MaxValue) ? long.MaxValue : (long)value;
}
}
/// <summary>
/// When forceMergeDeletes is called, we only merge away a
/// segment if its delete percentage is over this
/// threshold. Default is 10%.
/// </summary>
public virtual double ForceMergeDeletesPctAllowed
{
get
{
return forceMergeDeletesPctAllowed;
}
set
{
if (value < 0.0 || value > 100.0)
{
throw new System.ArgumentException("forceMergeDeletesPctAllowed must be between 0.0 and 100.0 inclusive (got " + value.ToString("0.0") + ")");
}
forceMergeDeletesPctAllowed = value;
}
}
/// <summary>
/// Gets or sets the allowed number of segments per tier. Smaller
/// values mean more merging but fewer segments.
///
/// <para/><b>NOTE</b>: this value should be >= the
/// <see cref="MaxMergeAtOnce"/> otherwise you'll force too much
/// merging to occur.
///
/// <para/>Default is 10.0.
/// </summary>
public virtual double SegmentsPerTier
{
get
{
return segsPerTier;
}
set
{
if (value < 2.0)
{
throw new System.ArgumentException("segmentsPerTier must be >= 2.0 (got " + value.ToString("0.0") + ")");
}
segsPerTier = value;
}
}
private class SegmentByteSizeDescending : IComparer<SegmentCommitInfo>
{
private readonly TieredMergePolicy outerInstance;
public SegmentByteSizeDescending(TieredMergePolicy outerInstance)
{
this.outerInstance = outerInstance;
}
public virtual int Compare(SegmentCommitInfo o1, SegmentCommitInfo o2)
{
try
{
long sz1 = outerInstance.Size(o1);
long sz2 = outerInstance.Size(o2);
if (sz1 > sz2)
{
return -1;
}
else if (sz2 > sz1)
{
return 1;
}
else
{
return o1.Info.Name.CompareToOrdinal(o2.Info.Name);
}
}
catch (System.IO.IOException ioe)
{
throw new Exception(ioe.ToString(), ioe);
}
}
}
/// <summary>
/// Holds score and explanation for a single candidate
/// merge.
/// </summary>
protected abstract class MergeScore
{
/// <summary>
/// Sole constructor. (For invocation by subclass
/// constructors, typically implicit.)
/// </summary>
protected MergeScore()
{
}
/// <summary>
/// Returns the score for this merge candidate; lower
/// scores are better.
/// </summary>
public abstract double Score { get; }
/// <summary>
/// Human readable explanation of how the merge got this
/// score.
/// </summary>
public abstract string Explanation { get; }
}
public override MergeSpecification FindMerges(MergeTrigger mergeTrigger, SegmentInfos infos)
{
if (Verbose())
{
Message("findMerges: " + infos.Count + " segments");
}
if (infos.Count == 0)
{
return null;
}
ICollection<SegmentCommitInfo> merging = m_writer.Get().MergingSegments;
ICollection<SegmentCommitInfo> toBeMerged = new JCG.HashSet<SegmentCommitInfo>();
List<SegmentCommitInfo> infosSorted = new List<SegmentCommitInfo>(infos.AsList());
infosSorted.Sort(new SegmentByteSizeDescending(this));
// Compute total index bytes & print details about the index
long totIndexBytes = 0;
long minSegmentBytes = long.MaxValue;
foreach (SegmentCommitInfo info in infosSorted)
{
long segBytes = Size(info);
if (Verbose())
{
string extra = merging.Contains(info) ? " [merging]" : "";
if (segBytes >= maxMergedSegmentBytes / 2.0)
{
extra += " [skip: too large]";
}
else if (segBytes < floorSegmentBytes)
{
extra += " [floored]";
}
Message(" seg=" + m_writer.Get().SegString(info) + " size=" + string.Format("{0:0.000}", segBytes / 1024 / 1024.0) + " MB" + extra);
}
minSegmentBytes = Math.Min(segBytes, minSegmentBytes);
// Accum total byte size
totIndexBytes += segBytes;
}
// If we have too-large segments, grace them out
// of the maxSegmentCount:
int tooBigCount = 0;
while (tooBigCount < infosSorted.Count && Size(infosSorted[tooBigCount]) >= maxMergedSegmentBytes / 2.0)
{
totIndexBytes -= Size(infosSorted[tooBigCount]);
tooBigCount++;
}
minSegmentBytes = FloorSize(minSegmentBytes);
// Compute max allowed segs in the index
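// Working upward from the smallest (floored) segment size, each tier may hold
// segsPerTier segments, and each successive tier holds segments maxMergeAtOnce
// times larger than the previous one. For example, with the defaults (2 MB
// floor, 10 segments per tier, merge factor 10) the budget is roughly 10
// segments of 2 MB, plus 10 of 20 MB, plus 10 of 200 MB, and so on until the
// total index size is accounted for.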
long levelSize = minSegmentBytes;
long bytesLeft = totIndexBytes;
double allowedSegCount = 0;
while (true)
{
double segCountLevel = bytesLeft / (double)levelSize;
if (segCountLevel < segsPerTier)
{
allowedSegCount += Math.Ceiling(segCountLevel);
break;
}
allowedSegCount += segsPerTier;
bytesLeft -= (long)(segsPerTier * levelSize);
levelSize *= maxMergeAtOnce;
}
int allowedSegCountInt = (int)allowedSegCount;
MergeSpecification spec = null;
// Cycle to possibly select more than one merge:
while (true)
{
long mergingBytes = 0;
// Gather eligible segments for merging, ie segments
// not already being merged and not already picked (by
// prior iteration of this loop) for merging:
IList<SegmentCommitInfo> eligible = new List<SegmentCommitInfo>();
for (int idx = tooBigCount; idx < infosSorted.Count; idx++)
{
SegmentCommitInfo info = infosSorted[idx];
if (merging.Contains(info))
{
mergingBytes += info.GetSizeInBytes();
}
else if (!toBeMerged.Contains(info))
{
eligible.Add(info);
}
}
bool maxMergeIsRunning = mergingBytes >= maxMergedSegmentBytes;
if (Verbose())
{
Message(" allowedSegmentCount=" + allowedSegCountInt + " vs count=" + infosSorted.Count + " (eligible count=" + eligible.Count + ") tooBigCount=" + tooBigCount);
}
if (eligible.Count == 0)
{
return spec;
}
if (eligible.Count >= allowedSegCountInt)
{
// OK we are over budget -- find best merge!
MergeScore bestScore = null;
IList<SegmentCommitInfo> best = null;
bool bestTooLarge = false;
long bestMergeBytes = 0;
// Consider all merge starts:
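// Each candidate is a window of up to maxMergeAtOnce consecutive entries from
// the size-sorted eligible list; segments that would push the merge past
// maxMergedSegmentBytes are skipped below so smaller ones can still be packed in.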
for (int startIdx = 0; startIdx <= eligible.Count - maxMergeAtOnce; startIdx++)
{
long totAfterMergeBytes = 0;
IList<SegmentCommitInfo> candidate = new List<SegmentCommitInfo>();
bool hitTooLarge = false;
for (int idx = startIdx; idx < eligible.Count && candidate.Count < maxMergeAtOnce; idx++)
{
SegmentCommitInfo info = eligible[idx];
long segBytes = Size(info);
if (totAfterMergeBytes + segBytes > maxMergedSegmentBytes)
{
hitTooLarge = true;
// NOTE: we continue, so that we can try
// "packing" smaller segments into this merge
// to see if we can get closer to the max
// size; this in general is not perfect since
// this is really "bin packing" and we'd have
// to try different permutations.
continue;
}
candidate.Add(info);
totAfterMergeBytes += segBytes;
}
MergeScore score = Score(candidate, hitTooLarge, mergingBytes);
if (Verbose())
{
Message(" maybe=" + m_writer.Get().SegString(candidate) + " score=" + score.Score + " " + score.Explanation + " tooLarge=" + hitTooLarge + " size=" + string.Format("{0:0.000} MB", totAfterMergeBytes / 1024.0 / 1024.0));
}
// If we are already running a max sized merge
// (maxMergeIsRunning), don't allow another max
// sized merge to kick off:
if ((bestScore == null || score.Score < bestScore.Score) && (!hitTooLarge || !maxMergeIsRunning))
{
best = candidate;
bestScore = score;
bestTooLarge = hitTooLarge;
bestMergeBytes = totAfterMergeBytes;
}
}
if (best != null)
{
if (spec == null)
{
spec = new MergeSpecification();
}
OneMerge merge = new OneMerge(best);
spec.Add(merge);
foreach (SegmentCommitInfo info in merge.Segments)
{
toBeMerged.Add(info);
}
if (Verbose())
{
Message(" add merge=" + m_writer.Get().SegString(merge.Segments) + " size=" + string.Format("{0:0.000} MB", bestMergeBytes / 1024.0 / 1024.0) + " score=" + string.Format("{0:0.000}", bestScore.Score) + " " + bestScore.Explanation + (bestTooLarge ? " [max merge]" : ""));
}
}
else
{
return spec;
}
}
else
{
return spec;
}
}
}
/// <summary>
/// Expert: scores one merge; subclasses can override. </summary>
protected virtual MergeScore Score(IList<SegmentCommitInfo> candidate, bool hitTooLarge, long mergingBytes)
{
long totBeforeMergeBytes = 0;
long totAfterMergeBytes = 0;
long totAfterMergeBytesFloored = 0;
foreach (SegmentCommitInfo info in candidate)
{
long segBytes = Size(info);
totAfterMergeBytes += segBytes;
totAfterMergeBytesFloored += FloorSize(segBytes);
totBeforeMergeBytes += info.GetSizeInBytes();
}
// Roughly measure "skew" of the merge, i.e. how
// "balanced" the merge is (whether the segments are
// about the same size), which can range from
// 1.0/numSegsBeingMerged (good) to 1.0 (poor). Heavily
// lopsided merges (skew near 1.0) are no good; they mean
// O(N^2) merge cost over time:
double skew;
if (hitTooLarge)
{
// Pretend the merge has perfect skew; skew doesn't
// matter in this case because this merge will not
// "cascade" and so it cannot lead to N^2 merge cost
// over time:
skew = 1.0 / maxMergeAtOnce;
}
else
{
skew = ((double)FloorSize(Size(candidate[0]))) / totAfterMergeBytesFloored;
}
// Strongly favor merges with less skew (smaller
// mergeScore is better):
double mergeScore = skew;
// Gently favor smaller merges over bigger ones. We
// don't want to make this exponent too large else we
// can end up doing poor merges of small segments in
// order to avoid the large merges:
mergeScore *= Math.Pow(totAfterMergeBytes, 0.05);
// Strongly favor merges that reclaim deletes:
double nonDelRatio = ((double)totAfterMergeBytes) / totBeforeMergeBytes;
mergeScore *= Math.Pow(nonDelRatio, reclaimDeletesWeight);
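// Combined: mergeScore = skew * totAfterMergeBytes^0.05 * nonDelRatio^reclaimDeletesWeight;
// lower skew, smaller total size and more reclaimed deletes all lower (improve) the score.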
double finalMergeScore = mergeScore;
return new MergeScoreAnonymousInnerClassHelper(this, skew, nonDelRatio, finalMergeScore);
}
private class MergeScoreAnonymousInnerClassHelper : MergeScore
{
private readonly TieredMergePolicy outerInstance;
private double skew;
private double nonDelRatio;
private double finalMergeScore;
public MergeScoreAnonymousInnerClassHelper(TieredMergePolicy outerInstance, double skew, double nonDelRatio, double finalMergeScore)
{
this.outerInstance = outerInstance;
this.skew = skew;
this.nonDelRatio = nonDelRatio;
this.finalMergeScore = finalMergeScore;
}
public override double Score
{
get
{
return finalMergeScore;
}
}
public override string Explanation
{
get
{
return "skew=" + string.Format(CultureInfo.InvariantCulture, "{0:F3}", skew) + " nonDelRatio=" + string.Format(CultureInfo.InvariantCulture, "{0:F3}", nonDelRatio);
}
}
}
public override MergeSpecification FindForcedMerges(SegmentInfos infos, int maxSegmentCount, IDictionary<SegmentCommitInfo, bool?> segmentsToMerge)
{
if (Verbose())
{
Message("FindForcedMerges maxSegmentCount=" + maxSegmentCount +
" infos=" + m_writer.Get().SegString(infos.Segments) +
" segmentsToMerge=" + string.Format(J2N.Text.StringFormatter.InvariantCulture, "{0}", segmentsToMerge));
}
List<SegmentCommitInfo> eligible = new List<SegmentCommitInfo>();
bool forceMergeRunning = false;
ICollection<SegmentCommitInfo> merging = m_writer.Get().MergingSegments;
bool? segmentIsOriginal = false;
foreach (SegmentCommitInfo info in infos.Segments)
{
bool? isOriginal;
if (segmentsToMerge.TryGetValue(info, out isOriginal))
{
segmentIsOriginal = isOriginal;
if (!merging.Contains(info))
{
eligible.Add(info);
}
else
{
forceMergeRunning = true;
}
}
}
if (eligible.Count == 0)
{
return null;
}
if ((maxSegmentCount > 1 && eligible.Count <= maxSegmentCount) || (maxSegmentCount == 1 && eligible.Count == 1 && (segmentIsOriginal == false || IsMerged(infos, eligible[0]))))
{
if (Verbose())
{
Message("already merged");
}
return null;
}
eligible.Sort(new SegmentByteSizeDescending(this));
if (Verbose())
{
Message("eligible=" + string.Format(J2N.Text.StringFormatter.InvariantCulture, "{0}", eligible));
Message("forceMergeRunning=" + forceMergeRunning);
}
int end = eligible.Count;
MergeSpecification spec = null;
// Do full merges, first, backwards:
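// eligible is sorted by descending size, so taking sublists from the end of the
// list packs the smallest segments into full maxMergeAtOnceExplicit-sized merges first.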
while (end >= maxMergeAtOnceExplicit + maxSegmentCount - 1)
{
if (spec == null)
{
spec = new MergeSpecification();
}
OneMerge merge = new OneMerge(eligible.SubList(end - maxMergeAtOnceExplicit, end));
if (Verbose())
{
Message("add merge=" + m_writer.Get().SegString(merge.Segments));
}
spec.Add(merge);
end -= maxMergeAtOnceExplicit;
}
if (spec == null && !forceMergeRunning)
{
// Do final merge
int numToMerge = end - maxSegmentCount + 1;
OneMerge merge = new OneMerge(eligible.SubList(end - numToMerge, end));
if (Verbose())
{
Message("add final merge=" + merge.SegString(m_writer.Get().Directory));
}
spec = new MergeSpecification();
spec.Add(merge);
}
return spec;
}
public override MergeSpecification FindForcedDeletesMerges(SegmentInfos infos)
{
if (Verbose())
{
Message("findForcedDeletesMerges infos=" + m_writer.Get().SegString(infos.Segments) + " forceMergeDeletesPctAllowed=" + forceMergeDeletesPctAllowed);
}
List<SegmentCommitInfo> eligible = new List<SegmentCommitInfo>();
ICollection<SegmentCommitInfo> merging = m_writer.Get().MergingSegments;
foreach (SegmentCommitInfo info in infos.Segments)
{
double pctDeletes = 100.0 * ((double)m_writer.Get().NumDeletedDocs(info)) / info.Info.DocCount;
if (pctDeletes > forceMergeDeletesPctAllowed && !merging.Contains(info))
{
eligible.Add(info);
}
}
if (eligible.Count == 0)
{
return null;
}
eligible.Sort(new SegmentByteSizeDescending(this));
if (Verbose())
{
Message("eligible=" + string.Format(J2N.Text.StringFormatter.InvariantCulture, "{0}", eligible));
}
int start = 0;
MergeSpecification spec = null;
while (start < eligible.Count)
{
// Don't enforce max merged size here: app is explicitly
// calling forceMergeDeletes, and knows this may take a
// long time / produce big segments (like forceMerge):
int end = Math.Min(start + maxMergeAtOnceExplicit, eligible.Count);
if (spec == null)
{
spec = new MergeSpecification();
}
OneMerge merge = new OneMerge(eligible.SubList(start, end));
if (Verbose())
{
Message("add merge=" + m_writer.Get().SegString(merge.Segments));
}
spec.Add(merge);
start = end;
}
return spec;
}
protected override void Dispose(bool disposing)
{
}
private long FloorSize(long bytes)
{
return Math.Max(floorSegmentBytes, bytes);
}
private bool Verbose()
{
IndexWriter w = m_writer.Get();
return w != null && w.infoStream.IsEnabled("TMP");
}
private void Message(string message)
{
m_writer.Get().infoStream.Message("TMP", message);
}
public override string ToString()
{
StringBuilder sb = new StringBuilder("[" + this.GetType().Name + ": ");
sb.Append("maxMergeAtOnce=").Append(maxMergeAtOnce).Append(", ");
sb.Append("maxMergeAtOnceExplicit=").Append(maxMergeAtOnceExplicit).Append(", ");
sb.Append("maxMergedSegmentMB=").Append(maxMergedSegmentBytes / 1024 / 1024.0).Append(", ");
sb.Append("floorSegmentMB=").Append(floorSegmentBytes / 1024 / 1024.0).Append(", ");
sb.Append("forceMergeDeletesPctAllowed=").Append(forceMergeDeletesPctAllowed).Append(", ");
sb.Append("segmentsPerTier=").Append(segsPerTier).Append(", ");
sb.Append("maxCFSSegmentSizeMB=").Append(MaxCFSSegmentSizeMB).Append(", ");
sb.Append("noCFSRatio=").Append(m_noCFSRatio);
return sb.ToString();
}
}
}