src/Lucene.Net.Benchmark/ByTask/Feeds/LineDocSource.cs - lucenenet - Git at Google

 using J2N.Text;
 using Lucene.Net.Benchmarks.ByTask.Tasks;
 using Lucene.Net.Benchmarks.ByTask.Utils;
 using Lucene.Net.Support;
 using System;
 using System.Collections.Generic;
 using System.IO;
 using System.Text;

 namespace Lucene.Net.Benchmarks.ByTask.Feeds
 {
     /*
      * Licensed to the Apache Software Foundation (ASF) under one or more
      * contributor license agreements.  See the NOTICE file distributed with
      * this work for additional information regarding copyright ownership.
      * The ASF licenses this file to You under the Apache License, Version 2.0
      * (the "License"); you may not use this file except in compliance with
      * the License.  You may obtain a copy of the License at
      *
      *     http://www.apache.org/licenses/LICENSE-2.0
      *
      * Unless required by applicable law or agreed to in writing, software
      * distributed under the License is distributed on an "AS IS" BASIS,
      * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      * See the License for the specific language governing permissions and
      * limitations under the License.
      */

     /// <summary>
     /// A <see cref="ContentSource"/> reading one line at a time as a
     /// <see cref="Documents.Document"/> from a single file. This saves IO
     /// cost (over DirContentSource) of recursing through a directory and opening a
     /// new file for every document.
     /// </summary>
     /// <remarks>
     /// The expected format of each line is (arguments are separated by &lt;TAB&gt;):
     /// <i>title, date, body</i>. If a line is read in a different format, a
     /// <see cref="Exception"/> will be thrown. In general, you should use this
     /// content source for files that were created with <see cref="WriteLineDocTask"/>.
     /// </remarks>
     public class LineDocSource : ContentSource
     {
         // LUCENENET specific - de-nested LineParser, SimpleLineParser, HeaderLineParser

         private FileInfo file;
         private TextReader reader;
         private int readCount;

         private LineParser docDataLineReader = null;
         private bool skipHeaderLine = false;

         private void OpenFile()
         {
             try
             {
                 if (reader != null)
                 {
                     reader.Dispose();
                 }
                 Stream @is = StreamUtils.GetInputStream(file);
                 reader = new StreamReader(@is, m_encoding);
                 if (skipHeaderLine)
                 {
                     reader.ReadLine(); // skip one line - the header line - already handled that info
                 }
             }
             catch (IOException e)
             {
                 throw new Exception(e.ToString(), e);
             }
         }

         protected override void Dispose(bool disposing)
         {
             if (disposing && reader != null)
             {
                 reader.Dispose();
                 reader = null;
             }
         }

         public override DocData GetNextDocData(DocData docData)
         {
             string line;
             int myID;


             lock (this)
             {
                 line = reader.ReadLine();
                 if (line == null)
                 {
                     if (!m_forever)
                     {
                         throw new NoMoreDataException();
                     }
                     // Reset the file
                     OpenFile();
                     return GetNextDocData(docData);
                 }
                 if (docDataLineReader == null)
                 { // first line ever, one time initialization,
                     docDataLineReader = CreateDocDataLineReader(line);
                     if (skipHeaderLine)
                     {
                         return GetNextDocData(docData);
                     }
                 }
                 // increment IDS only once...
                 myID = readCount++;
             }

             // The date String was written in the format of DateTools.dateToString.
             docData.Clear();
             docData.ID = myID;
             docDataLineReader.ParseLine(docData, line);
             return docData;
         }

         private LineParser CreateDocDataLineReader(string line)
         {
             string[] header;
             string headIndicator = WriteLineDocTask.FIELDS_HEADER_INDICATOR + WriteLineDocTask.SEP;

             if (line.StartsWith(headIndicator, StringComparison.Ordinal))
             {
                 header = line.Substring(headIndicator.Length).Split(WriteLineDocTask.SEP).TrimEnd();
                 skipHeaderLine = true; // mark to skip the header line when input file is reopened
             }
             else
             {
                 header = WriteLineDocTask.DEFAULT_FIELDS;
             }

             // if a specific DocDataLineReader was configured, must respect it
             string docDataLineReaderClassName = Config.Get("line.parser", null);
             if (docDataLineReaderClassName != null)
             {
                 try
                 {
                     Type clazz = Type.GetType(docDataLineReaderClassName);
                     return (LineParser)Activator.CreateInstance(clazz, (object)header);
                 }
                 catch (Exception e)
                 {
                     throw new Exception("Failed to instantiate " + docDataLineReaderClassName, e);
                 }
             }

             // if this the simple case,
             if (Arrays.Equals(header, WriteLineDocTask.DEFAULT_FIELDS))
             {
                 return new SimpleLineParser(header);
             }
             return new HeaderLineParser(header);
         }

         public override void ResetInputs()
         {
             base.ResetInputs();
             OpenFile();
         }

         public override void SetConfig(Config config)
         {
             base.SetConfig(config);
             string fileName = config.Get("docs.file", null);
             if (fileName == null)
             {
                 throw new ArgumentException("docs.file must be set");
             }
             file = new FileInfo(fileName);
             if (m_encoding == null)
             {
                 m_encoding = Encoding.UTF8;
             }
         }
     }

     /// <summary>Reader of a single input line into <see cref="DocData"/>.</summary>
     public abstract class LineParser
     {
         protected readonly string[] m_header;

         /// <summary>
         /// Construct with the header
         /// </summary>
         /// <param name="header">header line found in the input file, or <c>null</c> if none.</param>
         protected LineParser(string[] header) // LUCENENET: CA1012: Abstract types should not have constructors (marked protected)
         {
             this.m_header = header;
         }

         /// <summary>
         /// parse an input line and fill doc data appropriately
         /// </summary>
         public abstract void ParseLine(DocData docData, string line);
     }

     /// <summary>
     /// <see cref="LineParser"/> which ignores the header passed to its constructor
     /// and assumes simply that field names and their order are the same
     /// as in <see cref="WriteLineDocTask.DEFAULT_FIELDS"/>.
     /// </summary>
     public class SimpleLineParser : LineParser
     {
         public SimpleLineParser(string[] header)
             : base(header)
         {
         }

         public override void ParseLine(DocData docData, string line)
         {
             int k1 = 0;
             int k2 = line.IndexOf(WriteLineDocTask.SEP, k1);
             if (k2 < 0)
             {
                 throw new Exception("line: [" + line + "] is in an invalid format (missing: separator title::date)!");
             }
             docData.Title = line.Substring(k1, k2 - k1);
             k1 = k2 + 1;
             k2 = line.IndexOf(WriteLineDocTask.SEP, k1);
             if (k2 < 0)
             {
                 throw new Exception("line: [" + line + "] is in an invalid format (missing: separator date::body)!");
             }
             docData.SetDate(line.Substring(k1, k2 - k1));
             k1 = k2 + 1;
             k2 = line.IndexOf(WriteLineDocTask.SEP, k1);
             if (k2 >= 0)
             {
                 throw new Exception("line: [" + line + "] is in an invalid format (too many separators)!");
             }
             // last one
             docData.Body = line.Substring(k1);
         }
     }

     /// <summary>
     /// <see cref="LineParser"/> which sets field names and order by
     /// the header - any header - of the lines file.
     /// It is less efficient than <see cref="SimpleLineParser"/> but more powerful.
     /// </summary>
     public class HeaderLineParser : LineParser
     {
         private enum FieldName { NAME, TITLE, DATE, BODY, PROP }
         private readonly FieldName[] posToF;
         public HeaderLineParser(string[] header)
             : base(header)
         {
             posToF = new FieldName[header.Length];
             for (int i = 0; i < header.Length; i++)
             {
                 String f = header[i];
                 if (DocMaker.NAME_FIELD.Equals(f, StringComparison.Ordinal))
                 {
                     posToF[i] = FieldName.NAME;
                 }
                 else if (DocMaker.TITLE_FIELD.Equals(f, StringComparison.Ordinal))
                 {
                     posToF[i] = FieldName.TITLE;
                 }
                 else if (DocMaker.DATE_FIELD.Equals(f, StringComparison.Ordinal))
                 {
                     posToF[i] = FieldName.DATE;
                 }
                 else if (DocMaker.BODY_FIELD.Equals(f, StringComparison.Ordinal))
                 {
                     posToF[i] = FieldName.BODY;
                 }
                 else
                 {
                     posToF[i] = FieldName.PROP;
                 }
             }
         }

         public override void ParseLine(DocData docData, string line)
         {
             int n = 0;
             int k1 = 0;
             int k2;
             while ((k2 = line.IndexOf(WriteLineDocTask.SEP, k1)) >= 0)
             {
                 if (n >= m_header.Length)
                 {
                     throw new Exception("input line has invalid format: " + (n + 1) + " fields instead of " + m_header.Length + " :: [" + line + "]");
                 }
                 SetDocDataField(docData, n, line.Substring(k1, k2 - k1));
                 ++n;
                 k1 = k2 + 1;
             }
             if (n != m_header.Length - 1)
             {
                 throw new Exception("input line has invalid format: " + (n + 1) + " fields instead of " + m_header.Length + " :: [" + line + "]");
             }
             // last one
             SetDocDataField(docData, n, line.Substring(k1));
         }

         private void SetDocDataField(DocData docData, int position, string text)
         {
             switch (posToF[position])
             {
                 case FieldName.NAME:
                     docData.Name = text;
                     break;
                 case FieldName.TITLE:
                     docData.Title = text;
                     break;
                 case FieldName.DATE:
                     docData.SetDate(text);
                     break;
                 case FieldName.BODY:
                     docData.Body = text;
                     break;
                 case FieldName.PROP:
                     var p = docData.Props;
                     if (p == null)
                     {
                         p = new Dictionary<string, string>();
                         docData.Props = p;
                     }
                     p[m_header[position]] = text;
                     break;
             }
         }
     }
 }
	using J2N.Text;
	using Lucene.Net.Benchmarks.ByTask.Tasks;
	using Lucene.Net.Benchmarks.ByTask.Utils;
	using Lucene.Net.Support;
	using System;
	using System.Collections.Generic;
	using System.IO;
	using System.Text;

	namespace Lucene.Net.Benchmarks.ByTask.Feeds
	{
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/// <summary>
	/// A <see cref="ContentSource"/> reading one line at a time as a
	/// <see cref="Documents.Document"/> from a single file. This saves IO
	/// cost (over DirContentSource) of recursing through a directory and opening a
	/// new file for every document.
	/// </summary>
	/// <remarks>
	/// The expected format of each line is (arguments are separated by <TAB>):
	/// <i>title, date, body</i>. If a line is read in a different format, a
	/// <see cref="Exception"/> will be thrown. In general, you should use this
	/// content source for files that were created with <see cref="WriteLineDocTask"/>.
	/// </remarks>
	public class LineDocSource : ContentSource
	{
	// LUCENENET specific - de-nested LineParser, SimpleLineParser, HeaderLineParser

	private FileInfo file;
	private TextReader reader;
	private int readCount;

	private LineParser docDataLineReader = null;
	private bool skipHeaderLine = false;

	private void OpenFile()
	{
	try
	{
	if (reader != null)
	{
	reader.Dispose();
	}
	Stream @is = StreamUtils.GetInputStream(file);
	reader = new StreamReader(@is, m_encoding);
	if (skipHeaderLine)
	{
	reader.ReadLine(); // skip one line - the header line - already handled that info
	}
	}
	catch (IOException e)
	{
	throw new Exception(e.ToString(), e);
	}
	}

	protected override void Dispose(bool disposing)
	{
	if (disposing && reader != null)
	{
	reader.Dispose();
	reader = null;
	}
	}

	public override DocData GetNextDocData(DocData docData)
	{
	string line;
	int myID;


	lock (this)
	{
	line = reader.ReadLine();
	if (line == null)
	{
	if (!m_forever)
	{
	throw new NoMoreDataException();
	}
	// Reset the file
	OpenFile();
	return GetNextDocData(docData);
	}
	if (docDataLineReader == null)
	{ // first line ever, one time initialization,
	docDataLineReader = CreateDocDataLineReader(line);
	if (skipHeaderLine)
	{
	return GetNextDocData(docData);
	}
	}
	// increment IDS only once...
	myID = readCount++;
	}

	// The date String was written in the format of DateTools.dateToString.
	docData.Clear();
	docData.ID = myID;
	docDataLineReader.ParseLine(docData, line);
	return docData;
	}

	private LineParser CreateDocDataLineReader(string line)
	{
	string[] header;
	string headIndicator = WriteLineDocTask.FIELDS_HEADER_INDICATOR + WriteLineDocTask.SEP;

	if (line.StartsWith(headIndicator, StringComparison.Ordinal))
	{
	header = line.Substring(headIndicator.Length).Split(WriteLineDocTask.SEP).TrimEnd();
	skipHeaderLine = true; // mark to skip the header line when input file is reopened
	}
	else
	{
	header = WriteLineDocTask.DEFAULT_FIELDS;
	}

	// if a specific DocDataLineReader was configured, must respect it
	string docDataLineReaderClassName = Config.Get("line.parser", null);
	if (docDataLineReaderClassName != null)
	{
	try
	{
	Type clazz = Type.GetType(docDataLineReaderClassName);
	return (LineParser)Activator.CreateInstance(clazz, (object)header);
	}
	catch (Exception e)
	{
	throw new Exception("Failed to instantiate " + docDataLineReaderClassName, e);
	}
	}

	// if this the simple case,
	if (Arrays.Equals(header, WriteLineDocTask.DEFAULT_FIELDS))
	{
	return new SimpleLineParser(header);
	}
	return new HeaderLineParser(header);
	}

	public override void ResetInputs()
	{
	base.ResetInputs();
	OpenFile();
	}

	public override void SetConfig(Config config)
	{
	base.SetConfig(config);
	string fileName = config.Get("docs.file", null);
	if (fileName == null)
	{
	throw new ArgumentException("docs.file must be set");
	}
	file = new FileInfo(fileName);
	if (m_encoding == null)
	{
	m_encoding = Encoding.UTF8;
	}
	}
	}

	/// <summary>Reader of a single input line into <see cref="DocData"/>.</summary>
	public abstract class LineParser
	{
	protected readonly string[] m_header;

	/// <summary>
	/// Construct with the header
	/// </summary>
	/// <param name="header">header line found in the input file, or <c>null</c> if none.</param>
	protected LineParser(string[] header) // LUCENENET: CA1012: Abstract types should not have constructors (marked protected)
	{
	this.m_header = header;
	}

	/// <summary>
	/// parse an input line and fill doc data appropriately
	/// </summary>
	public abstract void ParseLine(DocData docData, string line);
	}

	/// <summary>
	/// <see cref="LineParser"/> which ignores the header passed to its constructor
	/// and assumes simply that field names and their order are the same
	/// as in <see cref="WriteLineDocTask.DEFAULT_FIELDS"/>.
	/// </summary>
	public class SimpleLineParser : LineParser
	{
	public SimpleLineParser(string[] header)
	: base(header)
	{
	}

	public override void ParseLine(DocData docData, string line)
	{
	int k1 = 0;
	int k2 = line.IndexOf(WriteLineDocTask.SEP, k1);
	if (k2 < 0)
	{
	throw new Exception("line: [" + line + "] is in an invalid format (missing: separator title::date)!");
	}
	docData.Title = line.Substring(k1, k2 - k1);
	k1 = k2 + 1;
	k2 = line.IndexOf(WriteLineDocTask.SEP, k1);
	if (k2 < 0)
	{
	throw new Exception("line: [" + line + "] is in an invalid format (missing: separator date::body)!");
	}
	docData.SetDate(line.Substring(k1, k2 - k1));
	k1 = k2 + 1;
	k2 = line.IndexOf(WriteLineDocTask.SEP, k1);
	if (k2 >= 0)
	{
	throw new Exception("line: [" + line + "] is in an invalid format (too many separators)!");
	}
	// last one
	docData.Body = line.Substring(k1);
	}
	}

	/// <summary>
	/// <see cref="LineParser"/> which sets field names and order by
	/// the header - any header - of the lines file.
	/// It is less efficient than <see cref="SimpleLineParser"/> but more powerful.
	/// </summary>
	public class HeaderLineParser : LineParser
	{
	private enum FieldName { NAME, TITLE, DATE, BODY, PROP }
	private readonly FieldName[] posToF;
	public HeaderLineParser(string[] header)
	: base(header)
	{
	posToF = new FieldName[header.Length];
	for (int i = 0; i < header.Length; i++)
	{
	String f = header[i];
	if (DocMaker.NAME_FIELD.Equals(f, StringComparison.Ordinal))
	{
	posToF[i] = FieldName.NAME;
	}
	else if (DocMaker.TITLE_FIELD.Equals(f, StringComparison.Ordinal))
	{
	posToF[i] = FieldName.TITLE;
	}
	else if (DocMaker.DATE_FIELD.Equals(f, StringComparison.Ordinal))
	{
	posToF[i] = FieldName.DATE;
	}
	else if (DocMaker.BODY_FIELD.Equals(f, StringComparison.Ordinal))
	{
	posToF[i] = FieldName.BODY;
	}
	else
	{
	posToF[i] = FieldName.PROP;
	}
	}
	}

	public override void ParseLine(DocData docData, string line)
	{
	int n = 0;
	int k1 = 0;
	int k2;
	while ((k2 = line.IndexOf(WriteLineDocTask.SEP, k1)) >= 0)
	{
	if (n >= m_header.Length)
	{
	throw new Exception("input line has invalid format: " + (n + 1) + " fields instead of " + m_header.Length + " :: [" + line + "]");
	}
	SetDocDataField(docData, n, line.Substring(k1, k2 - k1));
	++n;
	k1 = k2 + 1;
	}
	if (n != m_header.Length - 1)
	{
	throw new Exception("input line has invalid format: " + (n + 1) + " fields instead of " + m_header.Length + " :: [" + line + "]");
	}
	// last one
	SetDocDataField(docData, n, line.Substring(k1));
	}

	private void SetDocDataField(DocData docData, int position, string text)
	{
	switch (posToF[position])
	{
	case FieldName.NAME:
	docData.Name = text;
	break;
	case FieldName.TITLE:
	docData.Title = text;
	break;
	case FieldName.DATE:
	docData.SetDate(text);
	break;
	case FieldName.BODY:
	docData.Body = text;
	break;
	case FieldName.PROP:
	var p = docData.Props;
	if (p == null)
	{
	p = new Dictionary<string, string>();
	docData.Props = p;
	}
	p[m_header[position]] = text;
	break;
	}
	}
	}
	}