blob: d739016f594fc5578a08f35f0ef98388b358d846 [file] [log] [blame]
using Lucene.Net.Search.Spell;
using Lucene.Net.Util;
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Text;
namespace Lucene.Net.Search.Suggest
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Dictionary represented by a text file.
///
/// <para>Format allowed: 1 entry per line:</para>
/// <para>An entry can be: </para>
/// <list type="number">
/// <item><description>suggestion</description></item>
/// <item><description>suggestion <see cref="fieldDelimiter"/> weight</description></item>
/// <item><description>suggestion <see cref="fieldDelimiter"/> weight <see cref="fieldDelimiter"/> payload</description></item>
/// </list>
/// where the default <see cref="fieldDelimiter"/> is <see cref="DEFAULT_FIELD_DELIMITER"/> (a tab)
/// <para>
/// <b>NOTE:</b>
/// <list type="number">
/// <item><description>In order to have payload enabled, the first entry has to have a payload</description></item>
/// <item><description>If the weight for an entry is not specified then a value of 1 is used</description></item>
/// <item><description>A payload cannot be specified without having the weight specified for an entry</description></item>
/// <item><description>If the payload for an entry is not specified (assuming payload is enabled)
/// then an empty payload is returned</description></item>
/// <item><description>An entry cannot have more than two <see cref="fieldDelimiter"/>s</description></item>
/// </list>
/// </para>
/// <c>Example:</c><para/>
/// word1 word2 TAB 100 TAB payload1<para/>
/// word3 TAB 101<para/>
/// word4 word3 TAB 102<para/>
/// </summary>
public class FileDictionary : IDictionary
{
    /// <summary>
    /// Tab-delimited fields are most common thus the default, but one can override this via the constructor
    /// </summary>
    public const string DEFAULT_FIELD_DELIMITER = "\t";

    private readonly TextReader @in; // LUCENENET: marked readonly

    // Most recently read line and end-of-input flag. Both live on the dictionary (not on the
    // enumerator), so every enumerator created from this instance shares them and the reader.
    private string line;
    private bool done = false;

    private readonly string fieldDelimiter;

    /// <summary>
    /// Creates a dictionary based on an inputstream.
    /// Using <see cref="DEFAULT_FIELD_DELIMITER"/> as the
    /// field separator in a line.
    /// <para>
    /// NOTE: content is treated as UTF-8
    /// </para>
    /// </summary>
    /// <param name="dictFile">Stream containing one entry per line.</param>
    public FileDictionary(Stream dictFile)
        : this(dictFile, DEFAULT_FIELD_DELIMITER)
    {
    }

    /// <summary>
    /// Creates a dictionary based on a reader.
    /// Using <see cref="DEFAULT_FIELD_DELIMITER"/> as the
    /// field separator in a line.
    /// </summary>
    /// <param name="reader">Reader supplying one entry per line.</param>
    /// <exception cref="ArgumentNullException"><paramref name="reader"/> is <c>null</c>.</exception>
    public FileDictionary(TextReader reader)
        : this(reader, DEFAULT_FIELD_DELIMITER)
    {
    }

    /// <summary>
    /// Creates a dictionary based on a reader.
    /// Using <paramref name="fieldDelimiter"/> to separate out the
    /// fields in a line.
    /// </summary>
    /// <param name="reader">Reader supplying one entry per line.</param>
    /// <param name="fieldDelimiter">String that separates term, weight and payload within a line.</param>
    /// <exception cref="ArgumentNullException"><paramref name="reader"/> is <c>null</c>.</exception>
    public FileDictionary(TextReader reader, string fieldDelimiter)
    {
        // Fail fast here rather than with a NullReferenceException on the first read.
        if (reader == null)
        {
            throw new ArgumentNullException(nameof(reader));
        }

        @in = reader;
        this.fieldDelimiter = fieldDelimiter;
    }

    /// <summary>
    /// Creates a dictionary based on an inputstream.
    /// Using <paramref name="fieldDelimiter"/> to separate out the
    /// fields in a line.
    /// <para>
    /// NOTE: content is treated as UTF-8
    /// </para>
    /// </summary>
    /// <param name="dictFile">Stream containing one entry per line.</param>
    /// <param name="fieldDelimiter">String that separates term, weight and payload within a line.</param>
    public FileDictionary(Stream dictFile, string fieldDelimiter)
    {
        @in = IOUtils.GetDecodingReader(dictFile, Encoding.UTF8);
        this.fieldDelimiter = fieldDelimiter;
    }

    /// <summary>
    /// Returns an enumerator over the entries of this dictionary. Creating the enumerator
    /// already reads the first line from the underlying reader.
    /// </summary>
    /// <exception cref="Exception">Wraps any <see cref="IOException"/> raised while reading the first line.</exception>
    public virtual IInputEnumerator GetEntryEnumerator()
    {
        try
        {
            return new FileEnumerator(this);
        }
        catch (IOException e)
        {
            throw new Exception(e.ToString(), e);
        }
    }

    /// <summary>
    /// Line-by-line enumerator over the owning <see cref="FileDictionary"/>'s reader.
    /// The first line is parsed in the constructor (it decides whether payloads are enabled)
    /// and is handed out by the first call to <see cref="MoveNext()"/>.
    /// </summary>
    internal sealed class FileEnumerator : IInputEnumerator
    {
        private readonly FileDictionary outerInstance;

        // Built once per enumerator instead of once per line.
        private readonly string[] separators;

        internal long curWeight;
        internal readonly BytesRef spare = new BytesRef();
        internal BytesRef curPayload = new BytesRef();
        internal bool isFirstLine = true;
        internal bool hasPayloads = false;
        private BytesRef current;

        /// <summary>
        /// Reads and parses the first line. If the input is already exhausted the dictionary
        /// is marked as done and its reader is disposed.
        /// </summary>
        /// <exception cref="ArgumentException">The first line has more than 3 fields.</exception>
        /// <exception cref="FormatException">The weight of the first line cannot be parsed.</exception>
        internal FileEnumerator(FileDictionary outerInstance)
        {
            this.outerInstance = outerInstance;
            separators = new string[] { outerInstance.fieldDelimiter };

            outerInstance.line = outerInstance.@in.ReadLine();
            if (outerInstance.line == null)
            {
                outerInstance.done = true;
                IOUtils.Dispose(outerInstance.@in);
                return;
            }

            string[] fields = SplitFields(outerInstance.line);

            // Payloads are enabled for the whole enumeration only when the first entry carries one.
            hasPayloads = fields.Length == 3;
            ReadEntry(fields);
        }

        /// <summary>Weight of the current entry; 1 when the line did not specify one.</summary>
        public long Weight => curWeight;

        /// <summary>Term of the current entry, or <c>null</c> once the input is exhausted.</summary>
        public BytesRef Current => current;

        /// <summary>
        /// Advances to the next entry.
        /// </summary>
        /// <returns><c>true</c> if an entry is available; <c>false</c> once the input is exhausted,
        /// at which point the underlying reader has been disposed.</returns>
        /// <exception cref="ArgumentException">The line has more than 3 fields.</exception>
        /// <exception cref="FormatException">The weight of the line cannot be parsed.</exception>
        public bool MoveNext()
        {
            if (outerInstance.done)
            {
                current = null;
                return false;
            }

            // The first line was already consumed and parsed by the constructor.
            if (isFirstLine)
            {
                isFirstLine = false;
                current = spare;
                return true;
            }

            outerInstance.line = outerInstance.@in.ReadLine();
            if (outerInstance.line == null)
            {
                outerInstance.done = true;
                IOUtils.Dispose(outerInstance.@in);
                current = null;
                return false;
            }

            ReadEntry(SplitFields(outerInstance.line));
            current = spare;
            return true;
        }

        /// <summary>No comparer is supplied by this enumerator.</summary>
        public IComparer<BytesRef> Comparer => null;

        /// <summary>Payload of the current entry, or <c>null</c> when payloads are not enabled.</summary>
        public BytesRef Payload
            => (hasPayloads) ? curPayload : null;

        /// <summary><c>true</c> when the first entry of the input carried a payload.</summary>
        public bool HasPayloads => hasPayloads;

        /// <summary>
        /// Parses <paramref name="weight"/> into <see cref="curWeight"/> using the invariant culture.
        /// Integer text is tried first; otherwise the text is parsed as a floating point number
        /// and converted to <see cref="long"/> by a cast (dropping any fractional part).
        /// </summary>
        /// <exception cref="FormatException"><paramref name="weight"/> is neither an integer nor a floating point number.</exception>
        internal void ReadWeight(string weight)
        {
            // LUCENENET specific - don't use exception, use TryParse
            if (!long.TryParse(weight, NumberStyles.Integer, CultureInfo.InvariantCulture, out curWeight))
            {
                try
                {
                    // keep reading floats for bw compat
                    curWeight = (long)double.Parse(weight, NumberStyles.Float, CultureInfo.InvariantCulture);
                }
                catch (FormatException e)
                {
                    // LUCENENET TODO: This is just so we can see what string and what culture was being tested when parsing failed,
                    // to try to reproduce the conditions of the failure.
                    throw new FormatException($"Weight '{weight}' could not be parsed to long or double in culture '{CultureInfo.CurrentCulture.Name}'.", e);
                }
            }
        }

        /// <summary>Contexts are not provided by this enumerator; always <c>null</c>.</summary>
        public ICollection<BytesRef> Contexts => null;

        /// <summary>Always <c>false</c>.</summary>
        public bool HasContexts => false;

        /// <summary>
        /// Splits a raw line on the dictionary's field delimiter, dropping empty fields.
        /// The result can be empty for a blank line or a line made only of delimiters.
        /// </summary>
        private string[] SplitFields(string entry)
        {
            return entry.Split(separators, StringSplitOptions.RemoveEmptyEntries);
        }

        /// <summary>
        /// Loads term, weight and payload of one already-split line into
        /// <see cref="spare"/>, <see cref="curWeight"/> and <see cref="curPayload"/>.
        /// </summary>
        /// <exception cref="ArgumentException"><paramref name="fields"/> has more than 3 elements.</exception>
        private void ReadEntry(string[] fields)
        {
            if (fields.Length > 3)
            {
                throw new ArgumentException("More than 3 fields in one line");
            }

            // A blank (or delimiter-only) line produces zero fields because empty entries are
            // removed by the split. Treat it as an empty term with the default weight instead
            // of indexing past the end of the array.
            spare.CopyChars(fields.Length > 0 ? fields[0] : string.Empty);

            if (fields.Length >= 2)
            {
                ReadWeight(fields[1]);
            }
            else
            {
                curWeight = 1; // weight not specified
            }

            // A payload present on a line is ignored unless the first line enabled payloads.
            if (hasPayloads)
            {
                if (fields.Length == 3)
                {
                    curPayload.CopyChars(fields[2]);
                }
                else
                {
                    curPayload = new BytesRef(); // have an empty payload
                }
            }
        }
    }
}
}