blob: ecd392f87ca0d2330964817a27565a253bee4df6 [file] [log] [blame]
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Globalization;
using JCG = J2N.Collections.Generic;
namespace Lucene.Net.Codecs.PerField
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using FieldInfo = Lucene.Net.Index.FieldInfo;
using IOUtils = Lucene.Net.Util.IOUtils;
using RamUsageEstimator = Lucene.Net.Util.RamUsageEstimator;
using SegmentReadState = Lucene.Net.Index.SegmentReadState;
using SegmentWriteState = Lucene.Net.Index.SegmentWriteState;
using Terms = Lucene.Net.Index.Terms;
/// <summary>
/// Enables per field postings support.
/// <para/>
/// Note, when extending this class, the name (<see cref="PostingsFormat.Name"/>) is
/// written into the index. In order for the field to be read, the
/// name must resolve to your implementation via <see cref="PostingsFormat.ForName(string)"/>.
/// This method uses <see cref="IPostingsFormatFactory.GetPostingsFormat(string)"/> to resolve format names.
/// See <see cref="DefaultPostingsFormatFactory"/> for information about how to implement your own <see cref="PostingsFormat"/>.
/// <para/>
/// Files written by each posting format have an additional suffix containing the
/// format name. For example, in a per-field configuration instead of <c>_1.prx</c>
/// filenames would look like <c>_1_Lucene40_0.prx</c>.
/// <para/>
/// @lucene.experimental
/// </summary>
/// <seealso cref="IPostingsFormatFactory"/>
/// <seealso cref="DefaultPostingsFormatFactory"/>
[PostingsFormatName("PerField40")] // LUCENENET specific - using PostingsFormatName attribute to ensure the default name passed from subclasses is the same as this class name
public abstract class PerFieldPostingsFormat : PostingsFormat
{
// LUCENENET specific - removed this static variable because our name is determined by the PostingsFormatNameAttribute
///// <summary>
///// Name of this <seealso cref="PostingsFormat"/>. </summary>
//public static readonly string PER_FIELD_NAME = "PerField40";
/// <summary>
/// <see cref="FieldInfo"/> attribute name used to store the
/// format name for each field.
/// </summary>
public static readonly string PER_FIELD_FORMAT_KEY = typeof(PerFieldPostingsFormat).Name + ".format";
/// <summary>
/// <see cref="FieldInfo"/> attribute name used to store the
/// segment suffix name for each field.
/// </summary>
public static readonly string PER_FIELD_SUFFIX_KEY = typeof(PerFieldPostingsFormat).Name + ".suffix";
/// <summary>
/// Sole constructor. </summary>
public PerFieldPostingsFormat()
: base()
{
}
public override sealed FieldsConsumer FieldsConsumer(SegmentWriteState state)
{
return new FieldsWriter(this, state);
}
internal class FieldsConsumerAndSuffix : IDisposable
{
internal FieldsConsumer Consumer { get; set; }
internal int Suffix { get; set; }
public void Dispose()
{
Consumer.Dispose();
}
}
private class FieldsWriter : FieldsConsumer
{
private readonly PerFieldPostingsFormat outerInstance;
internal readonly IDictionary<PostingsFormat, FieldsConsumerAndSuffix> formats = new Dictionary<PostingsFormat, FieldsConsumerAndSuffix>();
internal readonly IDictionary<string, int?> suffixes = new Dictionary<string, int?>();
internal readonly SegmentWriteState segmentWriteState;
public FieldsWriter(PerFieldPostingsFormat outerInstance, SegmentWriteState state)
{
this.outerInstance = outerInstance;
segmentWriteState = state;
}
public override TermsConsumer AddField(FieldInfo field)
{
PostingsFormat format = outerInstance.GetPostingsFormatForField(field.Name);
if (format == null)
{
throw new InvalidOperationException("invalid null PostingsFormat for field=\"" + field.Name + "\"");
}
string formatName = format.Name;
string previousValue = field.PutAttribute(PER_FIELD_FORMAT_KEY, formatName);
Debug.Assert(previousValue == null);
int? suffix;
FieldsConsumerAndSuffix consumer;
if (!formats.TryGetValue(format, out consumer) || consumer == null)
{
// First time we are seeing this format; create a new instance
// bump the suffix
if (!suffixes.TryGetValue(formatName, out suffix) || suffix == null)
{
suffix = 0;
}
else
{
suffix = suffix + 1;
}
suffixes[formatName] = suffix;
string segmentSuffix = GetFullSegmentSuffix(field.Name,
segmentWriteState.SegmentSuffix,
GetSuffix(formatName, Convert.ToString(suffix, CultureInfo.InvariantCulture)));
consumer = new FieldsConsumerAndSuffix();
consumer.Consumer = format.FieldsConsumer(new SegmentWriteState(segmentWriteState, segmentSuffix));
consumer.Suffix = suffix.Value; // LUCENENET NOTE: At this point suffix cannot be null
formats[format] = consumer;
}
else
{
// we've already seen this format, so just grab its suffix
Debug.Assert(suffixes.ContainsKey(formatName));
suffix = consumer.Suffix;
}
previousValue = field.PutAttribute(PER_FIELD_SUFFIX_KEY, Convert.ToString(suffix, CultureInfo.InvariantCulture));
Debug.Assert(previousValue == null);
// TODO: we should only provide the "slice" of FIS
// that this PF actually sees ... then stuff like
// .hasProx could work correctly?
// NOTE: .hasProx is already broken in the same way for the non-perfield case,
// if there is a fieldinfo with prox that has no postings, you get a 0 byte file.
return consumer.Consumer.AddField(field);
}
protected override void Dispose(bool disposing)
{
if (disposing)
{
// Close all subs
IOUtils.Dispose(formats.Values);
}
}
}
internal static string GetSuffix(string formatName, string suffix)
{
return formatName + "_" + suffix;
}
internal static string GetFullSegmentSuffix(string fieldName, string outerSegmentSuffix, string segmentSuffix)
{
if (outerSegmentSuffix.Length == 0)
{
return segmentSuffix;
}
else
{
// TODO: support embedding; I think it should work but
// we need a test confirm to confirm
// return outerSegmentSuffix + "_" + segmentSuffix;
throw new InvalidOperationException("cannot embed PerFieldPostingsFormat inside itself (field \"" + fieldName + "\" returned PerFieldPostingsFormat)");
}
}
private class FieldsReader : FieldsProducer
{
private readonly PerFieldPostingsFormat outerInstance;
// LUCENENET specific: Use StringComparer.Ordinal to get the same ordering as Java
internal readonly IDictionary<string, FieldsProducer> fields = new JCG.SortedDictionary<string, FieldsProducer>(StringComparer.Ordinal);
internal readonly IDictionary<string, FieldsProducer> formats = new Dictionary<string, FieldsProducer>();
public FieldsReader(PerFieldPostingsFormat outerInstance, SegmentReadState readState)
{
this.outerInstance = outerInstance;
// Read _X.per and init each format:
bool success = false;
try
{
// Read field name -> format name
foreach (FieldInfo fi in readState.FieldInfos)
{
if (fi.IsIndexed)
{
string fieldName = fi.Name;
string formatName = fi.GetAttribute(PER_FIELD_FORMAT_KEY);
if (formatName != null)
{
// null formatName means the field is in fieldInfos, but has no postings!
string suffix = fi.GetAttribute(PER_FIELD_SUFFIX_KEY);
Debug.Assert(suffix != null);
PostingsFormat format = PostingsFormat.ForName(formatName);
string segmentSuffix = GetSuffix(formatName, suffix);
// LUCENENET: Eliminated extra lookup by using TryGetValue instead of ContainsKey
if (!formats.TryGetValue(segmentSuffix, out Codecs.FieldsProducer field))
{
formats[segmentSuffix] = field = format.FieldsProducer(new SegmentReadState(readState, segmentSuffix));
}
fields[fieldName] = field;
}
}
}
success = true;
}
finally
{
if (!success)
{
IOUtils.DisposeWhileHandlingException(formats.Values);
}
}
}
public override IEnumerator<string> GetEnumerator()
{
return fields.Keys.GetEnumerator(); // LUCENENET NOTE: enumerators are not writable in .NET
}
public override Terms GetTerms(string field)
{
FieldsProducer fieldsProducer;
if (fields.TryGetValue(field, out fieldsProducer) && fieldsProducer != null)
{
return fieldsProducer.GetTerms(field);
}
return null;
}
public override int Count
{
get { return fields.Count; }
}
protected override void Dispose(bool disposing)
{
if (disposing)
{
IOUtils.Dispose(formats.Values);
}
}
public override long RamBytesUsed()
{
long sizeInBytes = 0;
foreach (KeyValuePair<string, FieldsProducer> entry in formats)
{
sizeInBytes += entry.Key.Length * RamUsageEstimator.NUM_BYTES_CHAR;
sizeInBytes += entry.Value.RamBytesUsed();
}
return sizeInBytes;
}
public override void CheckIntegrity()
{
foreach (FieldsProducer producer in formats.Values)
{
producer.CheckIntegrity();
}
}
}
public override sealed FieldsProducer FieldsProducer(SegmentReadState state)
{
return new FieldsReader(this, state);
}
/// <summary>
/// Returns the postings format that should be used for writing
/// new segments of <paramref name="field"/>.
/// <para/>
/// The field to format mapping is written to the index, so
/// this method is only invoked when writing, not when reading.
/// </summary>
public abstract PostingsFormat GetPostingsFormatForField(string field);
}
}