// Source blob: e3d2f0f7f9b12d25ed3eca21e6b9571be5edd265
using J2N.Text;
using Lucene.Net.Benchmarks.ByTask.Tasks;
using Lucene.Net.Benchmarks.ByTask.Utils;
using Lucene.Net.Support;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
namespace Lucene.Net.Benchmarks.ByTask.Feeds
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// A <see cref="ContentSource"/> reading one line at a time as a
/// <see cref="Documents.Document"/> from a single file. This saves IO
/// cost (over DirContentSource) of recursing through a directory and opening a
/// new file for every document.
/// </summary>
/// <remarks>
/// The expected format of each line is (arguments are separated by &lt;TAB&gt;):
/// <i>title, date, body</i>. If a line is read in a different format, a
/// <see cref="Exception"/> will be thrown. In general, you should use this
/// content source for files that were created with <see cref="WriteLineDocTask"/>.
/// </remarks>
public class LineDocSource : ContentSource
{
    // LUCENENET specific - de-nested LineParser, SimpleLineParser, HeaderLineParser

    // Input file, taken from the "docs.file" configuration property in SetConfig.
    private FileInfo file;

    // Reader over the currently open input; replaced by OpenFile on every (re)open.
    private TextReader reader;

    // Number of documents handed out so far; its value becomes the next document ID.
    private int readCount;

    // Parser applied to every document line; created from the first line ever read.
    private LineParser docDataLineReader = null;

    // Set once a header line was detected, so that it is skipped again on every reopen.
    private bool skipHeaderLine = false;

    /// <summary>
    /// Disposes the current reader (if any) and opens <c>file</c> from its beginning,
    /// skipping the first line when a header line was detected earlier.
    /// </summary>
    /// <exception cref="Exception">Wraps any <see cref="IOException"/> raised while opening or skipping.</exception>
    private void OpenFile()
    {
        try
        {
            if (reader != null)
            {
                reader.Dispose();
            }
            Stream @is = StreamUtils.GetInputStream(file);
            reader = new StreamReader(@is, m_encoding);
            if (skipHeaderLine)
            {
                reader.ReadLine(); // skip one line - the header line - already handled that info
            }
        }
        catch (IOException e)
        {
            throw new Exception(e.ToString(), e);
        }
    }

    /// <summary>
    /// Releases the underlying reader when called with <paramref name="disposing"/> set.
    /// </summary>
    /// <param name="disposing"><c>true</c> to dispose the reader; <c>false</c> leaves it untouched.</param>
    protected override void Dispose(bool disposing)
    {
        if (disposing && reader != null)
        {
            reader.Dispose();
            reader = null;
        }
    }

    /// <summary>
    /// Reads the next document line from the file and parses it into <paramref name="docData"/>.
    /// </summary>
    /// <param name="docData">Instance that is cleared and then filled with the parsed line.</param>
    /// <returns>The same <paramref name="docData"/> instance, now populated.</returns>
    /// <exception cref="NoMoreDataException">
    /// The end of the file was reached and either <c>m_forever</c> is not set, or the file
    /// was reopened and still yielded no document line.
    /// </exception>
    public override DocData GetNextDocData(DocData docData)
    {
        string line;
        int myID;

        lock (this)
        {
            // Remembers that the input was already rewound during this call. A second end
            // of input right after a reopen means the file holds no document line at all,
            // so retrying would never terminate.
            bool reopened = false;

            while (true)
            {
                line = reader.ReadLine();
                if (line == null)
                {
                    if (!m_forever || reopened)
                    {
                        throw new NoMoreDataException();
                    }
                    // Reset the file
                    OpenFile();
                    reopened = true;
                    continue;
                }
                if (docDataLineReader == null)
                { // first line ever, one time initialization,
                    docDataLineReader = CreateDocDataLineReader(line);
                    if (skipHeaderLine)
                    {
                        // The line just read was the header, not a document: read the next one.
                        continue;
                    }
                }
                break;
            }

            // increment IDS only once...
            myID = readCount++;
        }

        // The date String was written in the format of DateTools.dateToString.
        docData.Clear();
        docData.ID = myID;
        docDataLineReader.ParseLine(docData, line);
        return docData;
    }

    /// <summary>
    /// Chooses the <see cref="LineParser"/> for this input based on its first line.
    /// </summary>
    /// <remarks>
    /// When the line starts with the header indicator followed by the separator, the field
    /// names are taken from it and <c>skipHeaderLine</c> is set; otherwise the default
    /// fields are assumed. A parser class named by the "line.parser" configuration
    /// property takes precedence over the built-in parsers.
    /// </remarks>
    /// <param name="line">The first line ever read from the input file.</param>
    /// <returns>The parser to use for all document lines.</returns>
    /// <exception cref="Exception">The configured parser class could not be instantiated.</exception>
    private LineParser CreateDocDataLineReader(string line)
    {
        string[] header;
        string headIndicator = WriteLineDocTask.FIELDS_HEADER_INDICATOR + WriteLineDocTask.SEP;

        if (line.StartsWith(headIndicator, StringComparison.Ordinal))
        {
            header = line.Substring(headIndicator.Length).Split(WriteLineDocTask.SEP).TrimEnd();
            skipHeaderLine = true; // mark to skip the header line when input file is reopened
        }
        else
        {
            header = WriteLineDocTask.DEFAULT_FIELDS;
        }

        // if a specific DocDataLineReader was configured, must respect it
        string docDataLineReaderClassName = Config.Get("line.parser", null);
        if (docDataLineReaderClassName != null)
        {
            try
            {
                Type clazz = Type.GetType(docDataLineReaderClassName);
                // The cast keeps the header array as ONE constructor argument instead of
                // letting it be expanded into the params list.
                return (LineParser)Activator.CreateInstance(clazz, (object)header);
            }
            catch (Exception e)
            {
                throw new Exception("Failed to instantiate " + docDataLineReaderClassName, e);
            }
        }

        // if this is the simple case, use the cheaper parser
        if (Arrays.Equals(header, WriteLineDocTask.DEFAULT_FIELDS))
        {
            return new SimpleLineParser(header);
        }
        return new HeaderLineParser(header);
    }

    /// <summary>
    /// Resets the base class inputs and reopens the file from its beginning.
    /// </summary>
    public override void ResetInputs()
    {
        base.ResetInputs();
        OpenFile();
    }

    /// <summary>
    /// Applies the configuration: resolves the input file from the "docs.file" property
    /// and falls back to UTF-8 when no encoding has been set.
    /// </summary>
    /// <param name="config">Configuration to read the settings from.</param>
    /// <exception cref="ArgumentException">The "docs.file" property is not set.</exception>
    public override void SetConfig(Config config)
    {
        base.SetConfig(config);
        string fileName = config.Get("docs.file", null);
        if (fileName == null)
        {
            throw new ArgumentException("docs.file must be set");
        }
        file = new FileInfo(fileName);
        if (m_encoding == null)
        {
            m_encoding = Encoding.UTF8;
        }
    }
}
/// <summary>Reader of a single input line into <see cref="DocData"/>.</summary>
public abstract class LineParser
{
    /// <summary>
    /// The header given to the constructor, available to subclasses.
    /// </summary>
    protected readonly string[] m_header;

    /// <summary>
    /// Initializes the parser with the header of the input file.
    /// </summary>
    /// <param name="header">header line found in the input file, or <c>null</c> if none.</param>
    protected LineParser(string[] header) // LUCENENET: CA1012: Abstract types should not have constructors (marked protected)
    {
        m_header = header;
    }

    /// <summary>
    /// Parses a single input line and fills the given doc data from it.
    /// </summary>
    /// <param name="docData">Doc data to populate.</param>
    /// <param name="line">Input line to parse.</param>
    public abstract void ParseLine(DocData docData, string line);
}
/// <summary>
/// <see cref="LineParser"/> which ignores the header passed to its constructor
/// and assumes simply that field names and their order are the same
/// as in <see cref="WriteLineDocTask.DEFAULT_FIELDS"/>.
/// </summary>
public class SimpleLineParser : LineParser
{
    /// <summary>
    /// Creates the parser; the header is only passed on to the base class.
    /// </summary>
    /// <param name="header">Header of the input file.</param>
    public SimpleLineParser(string[] header)
        : base(header)
    {
    }

    /// <summary>
    /// Splits <paramref name="line"/> into exactly three separator-delimited parts and
    /// stores them as title, date and body, in that order.
    /// </summary>
    /// <param name="docData">Doc data receiving the title, date and body.</param>
    /// <param name="line">Input line to parse.</param>
    /// <exception cref="Exception">The line has fewer or more than two separators.</exception>
    public override void ParseLine(DocData docData, string line)
    {
        var separator = WriteLineDocTask.SEP;

        // Title: everything up to the first separator.
        int titleEnd = line.IndexOf(separator, 0);
        if (titleEnd < 0)
        {
            throw new Exception("line: [" + line + "] is in an invalid format (missing: separator title::date)!");
        }
        docData.Title = line.Substring(0, titleEnd);

        // Date: between the first and the second separator.
        int dateStart = titleEnd + 1;
        int dateEnd = line.IndexOf(separator, dateStart);
        if (dateEnd < 0)
        {
            throw new Exception("line: [" + line + "] is in an invalid format (missing: separator date::body)!");
        }
        docData.SetDate(line.Substring(dateStart, dateEnd - dateStart));

        // Body: the remainder, which must not contain another separator.
        int bodyStart = dateEnd + 1;
        if (line.IndexOf(separator, bodyStart) >= 0)
        {
            throw new Exception("line: [" + line + "] is in an invalid format (too many separators)!");
        }
        docData.Body = line.Substring(bodyStart);
    }
}
/// <summary>
/// <see cref="LineParser"/> which sets field names and order by
/// the header - any header - of the lines file.
/// It is less efficient than <see cref="SimpleLineParser"/> but more powerful.
/// </summary>
public class HeaderLineParser : LineParser
{
    private enum FieldName { NAME, TITLE, DATE, BODY, PROP }

    // For every header position, the kind of doc data member that column feeds.
    private readonly FieldName[] posToF;

    /// <summary>
    /// Creates the parser and records, per header position, which doc data member
    /// the corresponding column is stored in.
    /// </summary>
    /// <param name="header">Field names, in the order the fields appear on each line.</param>
    public HeaderLineParser(string[] header)
        : base(header)
    {
        int fieldCount = header.Length;
        posToF = new FieldName[fieldCount];
        for (int pos = 0; pos < fieldCount; pos++)
        {
            posToF[pos] = Classify(header[pos]);
        }
    }

    /// <summary>
    /// Maps a header field name to its kind; names matching none of the
    /// <see cref="DocMaker"/> name/title/date/body fields are treated as properties.
    /// </summary>
    private static FieldName Classify(string fieldName)
    {
        if (DocMaker.NAME_FIELD.Equals(fieldName, StringComparison.Ordinal))
        {
            return FieldName.NAME;
        }
        if (DocMaker.TITLE_FIELD.Equals(fieldName, StringComparison.Ordinal))
        {
            return FieldName.TITLE;
        }
        if (DocMaker.DATE_FIELD.Equals(fieldName, StringComparison.Ordinal))
        {
            return FieldName.DATE;
        }
        if (DocMaker.BODY_FIELD.Equals(fieldName, StringComparison.Ordinal))
        {
            return FieldName.BODY;
        }
        return FieldName.PROP;
    }

    /// <summary>
    /// Splits <paramref name="line"/> on the separator and stores each part in the doc
    /// data member selected by the header position of that part.
    /// </summary>
    /// <param name="docData">Doc data to populate.</param>
    /// <param name="line">Input line to parse.</param>
    /// <exception cref="Exception">The number of fields on the line differs from the header length.</exception>
    public override void ParseLine(DocData docData, string line)
    {
        var separator = WriteLineDocTask.SEP;
        int fieldIndex = 0;
        int start = 0;

        // Every separator found closes one field; the text after the last one is handled below.
        for (int end = line.IndexOf(separator, start); end >= 0; end = line.IndexOf(separator, start))
        {
            if (fieldIndex >= m_header.Length)
            {
                throw InvalidFormat(fieldIndex, line);
            }
            SetDocDataField(docData, fieldIndex, line.Substring(start, end - start));
            fieldIndex++;
            start = end + 1;
        }

        if (fieldIndex != m_header.Length - 1)
        {
            throw InvalidFormat(fieldIndex, line);
        }

        // last one
        SetDocDataField(docData, fieldIndex, line.Substring(start));
    }

    /// <summary>
    /// Builds the exception reporting a mismatch between the fields on a line and the header.
    /// </summary>
    private Exception InvalidFormat(int closedFields, string line)
    {
        return new Exception("input line has invalid format: " + (closedFields + 1) + " fields instead of " + m_header.Length + " :: [" + line + "]");
    }

    /// <summary>
    /// Stores <paramref name="text"/> in the doc data member that belongs to the given header position.
    /// </summary>
    private void SetDocDataField(DocData docData, int position, string text)
    {
        switch (posToF[position])
        {
            case FieldName.NAME:
                docData.Name = text;
                return;
            case FieldName.TITLE:
                docData.Title = text;
                return;
            case FieldName.DATE:
                docData.SetDate(text);
                return;
            case FieldName.BODY:
                docData.Body = text;
                return;
            case FieldName.PROP:
                StoreProperty(docData, m_header[position], text);
                return;
        }
    }

    /// <summary>
    /// Puts a key/value pair into the doc data properties, creating the dictionary on first use.
    /// </summary>
    private static void StoreProperty(DocData docData, string key, string text)
    {
        var props = docData.Props;
        if (props == null)
        {
            props = new Dictionary<string, string>();
            docData.Props = props;
        }
        props[key] = text;
    }
}
}