src/Lucene.Net.Benchmark/ByTask/Feeds/TrecContentSource.cs - lucenenet - Git at Google

 using J2N.Text;
 using Lucene.Net.Benchmarks.ByTask.Utils;
 using System;
 using System.Collections.Generic;
 using System.Globalization;
 using System.IO;
 using System.Text;
 using System.Threading;
 using Console = Lucene.Net.Support.SystemConsole;

 namespace Lucene.Net.Benchmarks.ByTask.Feeds
 {
     /*
      * Licensed to the Apache Software Foundation (ASF) under one or more
      * contributor license agreements.  See the NOTICE file distributed with
      * this work for additional information regarding copyright ownership.
      * The ASF licenses this file to You under the Apache License, Version 2.0
      * (the "License"); you may not use this file except in compliance with
      * the License.  You may obtain a copy of the License at
      *
      *     http://www.apache.org/licenses/LICENSE-2.0
      *
      * Unless required by applicable law or agreed to in writing, software
      * distributed under the License is distributed on an "AS IS" BASIS,
      * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      * See the License for the specific language governing permissions and
      * limitations under the License.
      */

     /// <summary>
     /// Implements a <see cref="ContentSource"/> over the TREC collection.
     /// </summary>
     /// <remarks>
     /// Supports the following configuration parameters (on top of
     /// <see cref="ContentSource"/>):
     /// <list type="bullet">
     ///     <item><term>work.dir</term><description>specifies the working directory. Required if "docs.dir"
     ///         denotes a relative path (<b>default=work</b>).</description></item>
     ///     <item><term>docs.dir</term><description>specifies the directory where the TREC files reside.
     ///         Can be set to a relative path if "work.dir" is also specified
     ///         (<b>default=trec</b>).
     ///     </description></item>
     ///     <item><term>trec.doc.parser</term><description>specifies the <see cref="TrecDocParser"/> class to use for
     ///         parsing the TREC documents content (<b>default=TrecGov2Parser</b>).
     ///     </description></item>
     ///     <item><term>html.parser</term><description>specifies the <see cref="IHTMLParser"/> class to use for
     ///         parsing the HTML parts of the TREC documents content (<b>default=DemoHTMLParser</b>).
     ///     </description></item>
     ///     <item><term>content.source.encoding</term><description>if not specified, ISO-8859-1 is used.</description></item>
     ///     <item><term>content.source.excludeIteration</term><description>if <c>true</c>, do not append iteration number to docname</description></item>
     /// </list>
     /// </remarks>
     public class TrecContentSource : ContentSource
     {
         // LUCENENET specific - DateFormatInfo not used

         /// <summary>Line prefix of the document-number line; <see cref="GetNextDocData"/> reads up to the line that starts with it.</summary>
         public static readonly string DOCNO = "<DOCNO>";
         /// <summary>Closing marker searched for after <see cref="DOCNO"/> to delimit the document name.</summary>
         public static readonly string TERMINATING_DOCNO = "</DOCNO>";
         /// <summary>Line prefix that marks the start of a document.</summary>
         public static readonly string DOC = "<DOC>";
         /// <summary>Line prefix that marks the end of a document's content.</summary>
         public static readonly string TERMINATING_DOC = "</DOC>";

         /// <summary>
         /// Separator inserted between consecutive lines collected into the document buffer;
         /// taken from <see cref="Environment.NewLine"/>.
         /// </summary>
         public static readonly string NEW_LINE = Environment.NewLine;

         /// <summary>
         /// Exact date formats handed to <see cref="DateTime.TryParseExact(string, string[], IFormatProvider, DateTimeStyles, out DateTime)"/>
         /// by <see cref="ParseDate(string)"/>.
         /// </summary>
         private static readonly string[] DATE_FORMATS = {
             // LUCENENET specific: in JAVA, they don't care if it is an abbreviated or a full month name when parsing
             // so we provide definitions for both ways.
             //
             // The time of day uses "HH" (24-hour clock, 00-23). The sample values below
             // (22:39:08, 16:45:08) cannot be matched by "hh", which only accepts 01-12 and
             // reads 12 as midnight when no AM/PM designator is present.
             //
             // NOTE(review): "K" is the .NET time-zone specifier; whether it accepts zone
             // abbreviations such as GMT/EST shown in the samples should be confirmed.
             "ddd, dd MMM yyyy HH:mm:ss K",   // Tue, 09 Dec 2003 22:39:08 GMT
             "ddd, dd MMMM yyyy HH:mm:ss K",  // Tue, 09 December 2003 22:39:08 GMT
             "ddd MMM dd HH:mm:ss yyyy K",    // Tue Dec 09 16:45:08 2003 EST
             "ddd MMMM dd HH:mm:ss yyyy K",   // Tue December 09 16:45:08 2003 EST
             "ddd, dd-MMM-':'y HH:mm:ss K",   // Tue, 09 Dec 2003 22:39:08 GMT
             "ddd, dd-MMMM-':'y HH:mm:ss K",  // Tue, 09 December 2003 22:39:08 GMT
             "ddd, dd-MMM-yyy HH:mm:ss K",    // Tue, 09 Dec 2003 22:39:08 GMT
             "ddd, dd-MMMM-yyy HH:mm:ss K",   // Tue, 09 December 2003 22:39:08 GMT
             "ddd MMM dd HH:mm:ss yyyy",      // Tue Dec 09 16:45:08 2003
             "ddd MMMM dd HH:mm:ss yyyy",     // Tue December 09 16:45:08 2003
             "dd MMM yyyy",                   // 1 Mar 1994
             "dd MMMM yyyy",                  // 1 March 1994
             "MMM dd, yyyy",                  // Feb 3, 1994
             "MMMM dd, yyyy",                 // February 3, 1994
             "yyMMdd",                        // 910513
             "hhmm K.K.K. MMM dd, yyyy",      // 0901 u.t.c. Apr 28, 1994
             "hhmm K.K.K. MMMM dd, yyyy",     // 0901 u.t.c. April 28, 1994
         };

         // Per-thread scratch buffer; handed out (and lazily created) by GetDocBuffer().
         private readonly ThreadLocal<StringBuilder> trecDocBuffer = new ThreadLocal<StringBuilder>();
         // Directory the input files are collected from; assigned in SetConfig().
         private DirectoryInfo dataDir = null;
         // Input files filled in by SetConfig(); the list instance itself is never replaced.
         private readonly List<FileInfo> inputFiles = new List<FileInfo>();
         // Index into inputFiles of the file OpenNextFile() will try next.
         private int nextFile = 0;
         // Use to synchronize threads on reading from the TREC documents.
         // readonly: a monitor object must not be replaceable while threads lock on it.
         private readonly object @lock = new object();

         // Required for test
         internal TextReader reader;
         internal int iteration = 0;
         internal IHTMLParser htmlParser;

         // When true, GetNextDocData() does not append "_<iteration>" to the document name.
         private bool excludeDocnameIteration;
         private TrecDocParser trecDocParser = new TrecGov2Parser(); // default
         internal TrecDocParser.ParsePathType currPathType; // not private for tests

         /// <summary>
         /// Returns the calling thread's document buffer, creating and caching it
         /// in <c>trecDocBuffer</c> the first time the thread asks for it.
         /// </summary>
         /// <returns>The <see cref="StringBuilder"/> stored for the current thread.</returns>
         private StringBuilder GetDocBuffer()
         {
             StringBuilder buffer = trecDocBuffer.Value;
             if (buffer != null)
             {
                 return buffer;
             }

             // first request on this thread: create the buffer and remember it
             buffer = new StringBuilder();
             trecDocBuffer.Value = buffer;
             return buffer;
         }

         /// <summary>
         /// Gets the HTML parser currently held in the <c>htmlParser</c> field.
         /// </summary>
         internal IHTMLParser HtmlParser
         {
             get
             {
                 return htmlParser;
             }
         }

         /// <summary>
         /// Reads lines from the current reader until one starts with <paramref name="lineStart"/>,
         /// switching to the next input file whenever the current one is exhausted.
         /// </summary>
         /// <param name="buf">Buffer for collecting the data if so specified.</param>
         /// <param name="lineStart">Line start to look for (ordinal comparison). If <c>null</c>, no line
         /// ever matches and reading only stops when <see cref="OpenNextFile"/> throws.</param>
         /// <param name="collectMatchLine">Whether to collect the matching line into <paramref name="buf"/>.</param>
         /// <param name="collectAll">Whether to collect all non-matching lines into <paramref name="buf"/>.</param>
         /// <exception cref="IOException">If there is a low-level I/O error.</exception>
         /// <exception cref="NoMoreDataException">If the source is exhausted.</exception>
         private void Read(StringBuilder buf, string lineStart,
             bool collectMatchLine, bool collectAll)
         {
             // empty before the first collected line, NEW_LINE afterwards, so lines are joined without a leading separator
             string sep = "";
             while (true)
             {
                 string line = reader.ReadLine();

                 if (line == null)
                 {
                     // end of the current file: continue with the next one
                     OpenNextFile();
                     continue;
                 }

                 if (lineStart != null && line.StartsWith(lineStart, StringComparison.Ordinal))
                 {
                     if (collectMatchLine)
                     {
                         buf.Append(sep).Append(line);
                         sep = NEW_LINE;
                     }
                     return;
                 }

                 if (collectAll)
                 {
                     buf.Append(sep).Append(line);
                     sep = NEW_LINE;
                 }
             }
         }

         /// <summary>
         /// Disposes the current reader and opens the next input file, wrapping around to the
         /// first file (and incrementing <c>iteration</c>) when the list is exhausted and
         /// <c>m_forever</c> is set.
         /// </summary>
         /// <remarks>
         /// <c>reader</c> and <c>currPathType</c> are only updated after every step of opening a
         /// file has succeeded; on failure the partially opened stream is disposed so that no
         /// file handle is left behind and <c>reader</c> never refers to a rejected file.
         /// </remarks>
         /// <exception cref="NoMoreDataException">
         /// If all files were consumed and <c>m_forever</c> is <c>false</c>, or if a file cannot
         /// be opened while <c>m_verbose</c> is <c>false</c>.
         /// </exception>
         internal virtual void OpenNextFile()
         {
             Dispose();
             while (true)
             {
                 if (nextFile >= inputFiles.Count)
                 {
                     // exhausted files, start a new round, unless forever set to false.
                     if (!m_forever)
                     {
                         throw new NoMoreDataException();
                     }
                     nextFile = 0;
                     iteration++;
                 }
                 FileInfo f = inputFiles[nextFile++];
                 if (m_verbose)
                 {
                     Console.WriteLine("opening: " + f + " length: " + f.Length);
                 }

                 Stream inputStream = null;
                 try
                 {
                     inputStream = StreamUtils.GetInputStream(f); // support either gzip, bzip2, or regular text file, by extension
                     TextReader opened = new StreamReader(inputStream, m_encoding);
                     TrecDocParser.ParsePathType pathType = TrecDocParser.PathType(f);

                     // publish only once nothing can fail any more
                     reader = opened;
                     currPathType = pathType;
                     return;
                 }
                 catch (Exception e)
                 {
                     // release the stream of the rejected file (disposing it also covers a reader built on top of it)
                     if (inputStream != null)
                     {
                         inputStream.Dispose();
                     }

                     if (m_verbose)
                     {
                         Console.WriteLine("Skipping 'bad' file " + f.FullName + " due to " + e.Message);
                         continue;
                     }
                     throw new NoMoreDataException();
                 }
             }
         }

         /// <summary>
         /// Parses a date string, first against the exact formats in <c>DATE_FORMATS</c> and then
         /// with <see cref="DateTime.TryParse(string, IFormatProvider, DateTimeStyles, out DateTime)"/>,
         /// both using the invariant culture.
         /// </summary>
         /// <param name="dateStr">Text to parse; leading and trailing whitespace is ignored. May be <c>null</c>.</param>
         /// <returns>
         /// The parsed date, or <c>null</c> if <paramref name="dateStr"/> is <c>null</c> or could not be parsed.
         /// </returns>
         public virtual DateTime? ParseDate(string dateStr)
         {
             // a missing date is treated like an unparsable one instead of failing on Trim()
             if (dateStr != null)
             {
                 dateStr = dateStr.Trim();
                 DateTime parsed;
                 if (DateTime.TryParseExact(dateStr, DATE_FORMATS, CultureInfo.InvariantCulture, DateTimeStyles.None, out parsed)
                     || DateTime.TryParse(dateStr, CultureInfo.InvariantCulture, DateTimeStyles.None, out parsed))
                 {
                     return parsed;
                 }
             }

             // do not fail test just because a date could not be parsed
             if (m_verbose)
             {
                 // this method itself returns null; any substitute value is up to the caller
                 Console.WriteLine("failed to parse date (assigning 'now') for: " + dateStr);
             }
             return null;
         }

         /// <summary>
         /// Disposes the current reader, if there is one, and clears the <c>reader</c> field.
         /// An <see cref="IOException"/> raised while disposing is not propagated; it is
         /// reported on the console only when <c>m_verbose</c> is set.
         /// </summary>
         /// <param name="disposing">Not consulted by this implementation.</param>
         protected override void Dispose(bool disposing)
         {
             if (reader != null)
             {
                 try
                 {
                     reader.Dispose();
                 }
                 catch (IOException ioe)
                 {
                     if (m_verbose)
                     {
                         Console.WriteLine("failed to dispose reader !");
                         Console.WriteLine(ioe.ToString());
                     }
                 }

                 reader = null;
             }
         }

         /// <summary>
         /// Reads the next document from the input files and hands its raw text to the
         /// configured <see cref="TrecDocParser"/>.
         /// </summary>
         /// <param name="docData">Instance passed through to the parser.</param>
         /// <returns>The value returned by the parser.</returns>
         public override DocData GetNextDocData(DocData docData)
         {
             string docName = null;
             StringBuilder buffer = GetDocBuffer();
             TrecDocParser.ParsePathType pathTypeSnapshot;

             // Only the file reading is serialized between threads; parsing further down
             // works on thread-private data and runs outside the lock.
             lock (@lock)
             {
                 if (reader == null)
                 {
                     OpenNextFile();
                 }

                 // 1. skip until doc start - required for all TREC formats
                 buffer.Length = 0;
                 Read(buffer, DOC, false, false);

                 // remember the path type of the file this document came from, since
                 // another thread may open a different file once the lock is released
                 pathTypeSnapshot = currPathType;

                 // 2. name - required for all TREC formats
                 buffer.Length = 0;
                 Read(buffer, DOCNO, true, false);
                 int nameStart = DOCNO.Length;
                 int closingTagAt = buffer.IndexOf(TERMINATING_DOCNO, nameStart, StringComparison.Ordinal);
                 docName = buffer.ToString(nameStart, closingTagAt - nameStart).Trim();

                 if (!excludeDocnameIteration)
                 {
                     docName = docName + "_" + iteration;
                 }

                 // 3. read all until end of doc
                 buffer.Length = 0;
                 Read(buffer, TERMINATING_DOC, false, true);
             }

             // count char length of text to be parsed (may be larger than the resulted plain doc body text).
             AddBytes(buffer.Length);

             // This code segment relies on HtmlParser being thread safe. When we get
             // here, everything else is already private to that thread, so we're safe.
             DocData result = trecDocParser.Parse(docData, docName, this, buffer, pathTypeSnapshot);
             AddItem();

             return result;
         }

         /// <summary>
         /// Resets the source to its initial reading position: runs the base reset, disposes the
         /// current reader and sets the file index and iteration counter back to zero, all while
         /// holding the read lock.
         /// </summary>
         public override void ResetInputs()
         {
             lock (@lock)
             {
                 base.ResetInputs();
                 Dispose();

                 // start again from the first file of the first round
                 iteration = 0;
                 nextFile = 0;
             }
         }

         /// <summary>
         /// Applies the configuration: locates the data directory, collects the input files and
         /// instantiates the TREC document parser and the HTML parser.
         /// </summary>
         /// <remarks>
         /// A relative "docs.dir" is resolved against "work.dir". For compatibility with setups
         /// that relied on resolution against the current directory, that location is still
         /// used when it exists and the "work.dir"-based one does not.
         /// </remarks>
         /// <param name="config">Configuration to read the settings from.</param>
         /// <exception cref="ArgumentException">If no files were collected from the data directory.</exception>
         /// <exception cref="Exception">If one of the configured parser types cannot be resolved or instantiated.</exception>
         public override void SetConfig(Config config)
         {
             base.SetConfig(config);
             // dirs
             DirectoryInfo workDir = new DirectoryInfo(config.Get("work.dir", "work"));
             string d = config.Get("docs.dir", "trec");
             dataDir = new DirectoryInfo(d);
             if (!Path.IsPathRooted(d))
             {
                 DirectoryInfo underWorkDir = new DirectoryInfo(Path.Combine(workDir.FullName, d));
                 // prefer the documented location; keep the current-directory one only if it is the sole match
                 if (underWorkDir.Exists || !dataDir.Exists)
                 {
                     dataDir = underWorkDir;
                 }
             }
             // files
             CollectFiles(dataDir, inputFiles);
             if (inputFiles.Count == 0)
             {
                 throw new ArgumentException("No files in dataDir: " + dataDir);
             }
             // trec doc parser
             try
             {
                 string trecDocParserClassName = config.Get("trec.doc.parser", "Lucene.Net.Benchmarks.ByTask.Feeds.TrecGov2Parser, Lucene.Net.Benchmark");
                 // throwOnError: an unresolvable name is reported with the offending type name
                 trecDocParser = (TrecDocParser)Activator.CreateInstance(Type.GetType(trecDocParserClassName, true));
             }
             catch (Exception e)
             {
                 // Should not get here. Throw runtime exception.
                 throw new Exception(e.ToString(), e);
             }
             // html parser
             try
             {
                 string htmlParserClassName = config.Get("html.parser",
                     "Lucene.Net.Benchmarks.ByTask.Feeds.DemoHTMLParser, Lucene.Net.Benchmark");
                 // throwOnError: an unresolvable name is reported with the offending type name
                 htmlParser = (IHTMLParser)Activator.CreateInstance(Type.GetType(htmlParserClassName, true));
             }
             catch (Exception e)
             {
                 // Should not get here. Throw runtime exception.
                 throw new Exception(e.ToString(), e);
             }
             // encoding
             if (m_encoding == null)
             {
                 m_encoding = Encoding.GetEncoding("iso-8859-1"); //StandardCharsets.ISO_8859_1.name();
             }
             // iteration exclusion in doc name
             excludeDocnameIteration = config.Get("content.source.excludeIteration", false);
         }
     }
 }
	using J2N.Text;
	using Lucene.Net.Benchmarks.ByTask.Utils;
	using System;
	using System.Collections.Generic;
	using System.Globalization;
	using System.IO;
	using System.Text;
	using System.Threading;
	using Console = Lucene.Net.Support.SystemConsole;

	namespace Lucene.Net.Benchmarks.ByTask.Feeds
	{
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/// <summary>
	/// Implements a <see cref="ContentSource"/> over the TREC collection.
	/// </summary>
	/// <remarks>
	/// Supports the following configuration parameters (on top of
	/// <see cref="ContentSource"/>):
	/// <list type="bullet">
	/// <item><term>work.dir</term><description>specifies the working directory. Required if "docs.dir"
	/// denotes a relative path (<b>default=work</b>).</description></item>
	/// <item><term>docs.dir</term><description>specifies the directory where the TREC files reside.
	/// Can be set to a relative path if "work.dir" is also specified
	/// (<b>default=trec</b>).
	/// </description></item>
	/// <item><term>trec.doc.parser</term><description>specifies the <see cref="TrecDocParser"/> class to use for
	/// parsing the TREC documents content (<b>default=TrecGov2Parser</b>).
	/// </description></item>
	/// <item><term>html.parser</term><description>specifies the <see cref="IHTMLParser"/> class to use for
	/// parsing the HTML parts of the TREC documents content (<b>default=DemoHTMLParser</b>).
	/// </description></item>
	/// <item><term>content.source.encoding</term><description>if not specified, ISO-8859-1 is used.</description></item>
	/// <item><term>content.source.excludeIteration</term><description>if <c>true</c>, do not append iteration number to docname</description></item>
	/// </list>
	/// </remarks>
	public class TrecContentSource : ContentSource
	{
	// LUCENENET specific - DateFormatInfo not used

	/// <summary>Line prefix of the document-number line; <see cref="GetNextDocData"/> reads up to the line that starts with it.</summary>
	public static readonly string DOCNO = "<DOCNO>";
	/// <summary>Closing marker searched for after <see cref="DOCNO"/> to delimit the document name.</summary>
	public static readonly string TERMINATING_DOCNO = "</DOCNO>";
	/// <summary>Line prefix that marks the start of a document.</summary>
	public static readonly string DOC = "<DOC>";
	/// <summary>Line prefix that marks the end of a document's content.</summary>
	public static readonly string TERMINATING_DOC = "</DOC>";

	/// <summary>
	/// Separator inserted between consecutive lines collected into the document buffer;
	/// taken from <see cref="Environment.NewLine"/>.
	/// </summary>
	public static readonly string NEW_LINE = Environment.NewLine;

	/// <summary>
	/// Exact date formats handed to <see cref="DateTime.TryParseExact(string, string[], IFormatProvider, DateTimeStyles, out DateTime)"/>
	/// by <see cref="ParseDate(string)"/>.
	/// </summary>
	private static readonly string[] DATE_FORMATS = {
		// LUCENENET specific: in JAVA, they don't care if it is an abbreviated or a full month name when parsing
		// so we provide definitions for both ways.
		//
		// The time of day uses "HH" (24-hour clock, 00-23). The sample values below
		// (22:39:08, 16:45:08) cannot be matched by "hh", which only accepts 01-12 and
		// reads 12 as midnight when no AM/PM designator is present.
		//
		// NOTE(review): "K" is the .NET time-zone specifier; whether it accepts zone
		// abbreviations such as GMT/EST shown in the samples should be confirmed.
		"ddd, dd MMM yyyy HH:mm:ss K", // Tue, 09 Dec 2003 22:39:08 GMT
		"ddd, dd MMMM yyyy HH:mm:ss K", // Tue, 09 December 2003 22:39:08 GMT
		"ddd MMM dd HH:mm:ss yyyy K", // Tue Dec 09 16:45:08 2003 EST
		"ddd MMMM dd HH:mm:ss yyyy K", // Tue December 09 16:45:08 2003 EST
		"ddd, dd-MMM-':'y HH:mm:ss K", // Tue, 09 Dec 2003 22:39:08 GMT
		"ddd, dd-MMMM-':'y HH:mm:ss K", // Tue, 09 December 2003 22:39:08 GMT
		"ddd, dd-MMM-yyy HH:mm:ss K", // Tue, 09 Dec 2003 22:39:08 GMT
		"ddd, dd-MMMM-yyy HH:mm:ss K", // Tue, 09 December 2003 22:39:08 GMT
		"ddd MMM dd HH:mm:ss yyyy", // Tue Dec 09 16:45:08 2003
		"ddd MMMM dd HH:mm:ss yyyy", // Tue December 09 16:45:08 2003
		"dd MMM yyyy", // 1 Mar 1994
		"dd MMMM yyyy", // 1 March 1994
		"MMM dd, yyyy", // Feb 3, 1994
		"MMMM dd, yyyy", // February 3, 1994
		"yyMMdd", // 910513
		"hhmm K.K.K. MMM dd, yyyy", // 0901 u.t.c. Apr 28, 1994
		"hhmm K.K.K. MMMM dd, yyyy", // 0901 u.t.c. April 28, 1994
	};

	// Per-thread scratch buffer; handed out (and lazily created) by GetDocBuffer().
	private readonly ThreadLocal<StringBuilder> trecDocBuffer = new ThreadLocal<StringBuilder>();
	// Directory the input files are collected from; assigned in SetConfig().
	private DirectoryInfo dataDir = null;
	// Input files filled in by SetConfig(); the list instance itself is never replaced.
	private readonly List<FileInfo> inputFiles = new List<FileInfo>();
	// Index into inputFiles of the file OpenNextFile() will try next.
	private int nextFile = 0;
	// Use to synchronize threads on reading from the TREC documents.
	// readonly: a monitor object must not be replaceable while threads lock on it.
	private readonly object @lock = new object();

	// Required for test
	internal TextReader reader;
	internal int iteration = 0;
	internal IHTMLParser htmlParser;

	// When true, GetNextDocData() does not append "_<iteration>" to the document name.
	private bool excludeDocnameIteration;
	private TrecDocParser trecDocParser = new TrecGov2Parser(); // default
	internal TrecDocParser.ParsePathType currPathType; // not private for tests

	/// <summary>
	/// Returns the calling thread's document buffer, creating and caching it
	/// in <c>trecDocBuffer</c> the first time the thread asks for it.
	/// </summary>
	/// <returns>The <see cref="StringBuilder"/> stored for the current thread.</returns>
	private StringBuilder GetDocBuffer()
	{
		StringBuilder buffer = trecDocBuffer.Value;
		if (buffer != null)
		{
			return buffer;
		}

		// first request on this thread: create the buffer and remember it
		buffer = new StringBuilder();
		trecDocBuffer.Value = buffer;
		return buffer;
	}

	/// <summary>
	/// Gets the HTML parser currently held in the <c>htmlParser</c> field.
	/// </summary>
	internal IHTMLParser HtmlParser
	{
		get
		{
			return htmlParser;
		}
	}

	/// <summary>
	/// Reads lines from the current reader until one starts with <paramref name="lineStart"/>,
	/// switching to the next input file whenever the current one is exhausted.
	/// </summary>
	/// <param name="buf">Buffer for collecting the data if so specified.</param>
	/// <param name="lineStart">Line start to look for (ordinal comparison). If <c>null</c>, no line
	/// ever matches and reading only stops when <see cref="OpenNextFile"/> throws.</param>
	/// <param name="collectMatchLine">Whether to collect the matching line into <paramref name="buf"/>.</param>
	/// <param name="collectAll">Whether to collect all non-matching lines into <paramref name="buf"/>.</param>
	/// <exception cref="IOException">If there is a low-level I/O error.</exception>
	/// <exception cref="NoMoreDataException">If the source is exhausted.</exception>
	private void Read(StringBuilder buf, string lineStart,
		bool collectMatchLine, bool collectAll)
	{
		// empty before the first collected line, NEW_LINE afterwards, so lines are joined without a leading separator
		string sep = "";
		while (true)
		{
			string line = reader.ReadLine();

			if (line == null)
			{
				// end of the current file: continue with the next one
				OpenNextFile();
				continue;
			}

			if (lineStart != null && line.StartsWith(lineStart, StringComparison.Ordinal))
			{
				if (collectMatchLine)
				{
					buf.Append(sep).Append(line);
					sep = NEW_LINE;
				}
				return;
			}

			if (collectAll)
			{
				buf.Append(sep).Append(line);
				sep = NEW_LINE;
			}
		}
	}

	/// <summary>
	/// Disposes the current reader and opens the next input file, wrapping around to the
	/// first file (and incrementing <c>iteration</c>) when the list is exhausted and
	/// <c>m_forever</c> is set.
	/// </summary>
	/// <remarks>
	/// <c>reader</c> and <c>currPathType</c> are only updated after every step of opening a
	/// file has succeeded; on failure the partially opened stream is disposed so that no
	/// file handle is left behind and <c>reader</c> never refers to a rejected file.
	/// </remarks>
	/// <exception cref="NoMoreDataException">
	/// If all files were consumed and <c>m_forever</c> is <c>false</c>, or if a file cannot
	/// be opened while <c>m_verbose</c> is <c>false</c>.
	/// </exception>
	internal virtual void OpenNextFile()
	{
		Dispose();
		while (true)
		{
			if (nextFile >= inputFiles.Count)
			{
				// exhausted files, start a new round, unless forever set to false.
				if (!m_forever)
				{
					throw new NoMoreDataException();
				}
				nextFile = 0;
				iteration++;
			}
			FileInfo f = inputFiles[nextFile++];
			if (m_verbose)
			{
				Console.WriteLine("opening: " + f + " length: " + f.Length);
			}

			Stream inputStream = null;
			try
			{
				inputStream = StreamUtils.GetInputStream(f); // support either gzip, bzip2, or regular text file, by extension
				TextReader opened = new StreamReader(inputStream, m_encoding);
				TrecDocParser.ParsePathType pathType = TrecDocParser.PathType(f);

				// publish only once nothing can fail any more
				reader = opened;
				currPathType = pathType;
				return;
			}
			catch (Exception e)
			{
				// release the stream of the rejected file (disposing it also covers a reader built on top of it)
				if (inputStream != null)
				{
					inputStream.Dispose();
				}

				if (m_verbose)
				{
					Console.WriteLine("Skipping 'bad' file " + f.FullName + " due to " + e.Message);
					continue;
				}
				throw new NoMoreDataException();
			}
		}
	}

	/// <summary>
	/// Parses a date string, first against the exact formats in <c>DATE_FORMATS</c> and then
	/// with <see cref="DateTime.TryParse(string, IFormatProvider, DateTimeStyles, out DateTime)"/>,
	/// both using the invariant culture.
	/// </summary>
	/// <param name="dateStr">Text to parse; leading and trailing whitespace is ignored. May be <c>null</c>.</param>
	/// <returns>
	/// The parsed date, or <c>null</c> if <paramref name="dateStr"/> is <c>null</c> or could not be parsed.
	/// </returns>
	public virtual DateTime? ParseDate(string dateStr)
	{
		// a missing date is treated like an unparsable one instead of failing on Trim()
		if (dateStr != null)
		{
			dateStr = dateStr.Trim();
			DateTime parsed;
			if (DateTime.TryParseExact(dateStr, DATE_FORMATS, CultureInfo.InvariantCulture, DateTimeStyles.None, out parsed)
				|| DateTime.TryParse(dateStr, CultureInfo.InvariantCulture, DateTimeStyles.None, out parsed))
			{
				return parsed;
			}
		}

		// do not fail test just because a date could not be parsed
		if (m_verbose)
		{
			// this method itself returns null; any substitute value is up to the caller
			Console.WriteLine("failed to parse date (assigning 'now') for: " + dateStr);
		}
		return null;
	}

	/// <summary>
	/// Disposes the current reader, if there is one, and clears the <c>reader</c> field.
	/// An <see cref="IOException"/> raised while disposing is not propagated; it is
	/// reported on the console only when <c>m_verbose</c> is set.
	/// </summary>
	/// <param name="disposing">Not consulted by this implementation.</param>
	protected override void Dispose(bool disposing)
	{
		if (reader != null)
		{
			try
			{
				reader.Dispose();
			}
			catch (IOException ioe)
			{
				if (m_verbose)
				{
					Console.WriteLine("failed to dispose reader !");
					Console.WriteLine(ioe.ToString());
				}
			}

			reader = null;
		}
	}

	/// <summary>
	/// Reads the next document from the input files and hands its raw text to the
	/// configured <see cref="TrecDocParser"/>.
	/// </summary>
	/// <param name="docData">Instance passed through to the parser.</param>
	/// <returns>The value returned by the parser.</returns>
	public override DocData GetNextDocData(DocData docData)
	{
		string docName = null;
		StringBuilder buffer = GetDocBuffer();
		TrecDocParser.ParsePathType pathTypeSnapshot;

		// Only the file reading is serialized between threads; parsing further down
		// works on thread-private data and runs outside the lock.
		lock (@lock)
		{
			if (reader == null)
			{
				OpenNextFile();
			}

			// 1. skip until doc start - required for all TREC formats
			buffer.Length = 0;
			Read(buffer, DOC, false, false);

			// remember the path type of the file this document came from, since
			// another thread may open a different file once the lock is released
			pathTypeSnapshot = currPathType;

			// 2. name - required for all TREC formats
			buffer.Length = 0;
			Read(buffer, DOCNO, true, false);
			int nameStart = DOCNO.Length;
			int closingTagAt = buffer.IndexOf(TERMINATING_DOCNO, nameStart, StringComparison.Ordinal);
			docName = buffer.ToString(nameStart, closingTagAt - nameStart).Trim();

			if (!excludeDocnameIteration)
			{
				docName = docName + "_" + iteration;
			}

			// 3. read all until end of doc
			buffer.Length = 0;
			Read(buffer, TERMINATING_DOC, false, true);
		}

		// count char length of text to be parsed (may be larger than the resulted plain doc body text).
		AddBytes(buffer.Length);

		// This code segment relies on HtmlParser being thread safe. When we get
		// here, everything else is already private to that thread, so we're safe.
		DocData result = trecDocParser.Parse(docData, docName, this, buffer, pathTypeSnapshot);
		AddItem();

		return result;
	}

	/// <summary>
	/// Resets the source to its initial reading position: runs the base reset, disposes the
	/// current reader and sets the file index and iteration counter back to zero, all while
	/// holding the read lock.
	/// </summary>
	public override void ResetInputs()
	{
		lock (@lock)
		{
			base.ResetInputs();
			Dispose();

			// start again from the first file of the first round
			iteration = 0;
			nextFile = 0;
		}
	}

	/// <summary>
	/// Applies the configuration: locates the data directory, collects the input files and
	/// instantiates the TREC document parser and the HTML parser.
	/// </summary>
	/// <remarks>
	/// A relative "docs.dir" is resolved against "work.dir". For compatibility with setups
	/// that relied on resolution against the current directory, that location is still
	/// used when it exists and the "work.dir"-based one does not.
	/// </remarks>
	/// <param name="config">Configuration to read the settings from.</param>
	/// <exception cref="ArgumentException">If no files were collected from the data directory.</exception>
	/// <exception cref="Exception">If one of the configured parser types cannot be resolved or instantiated.</exception>
	public override void SetConfig(Config config)
	{
		base.SetConfig(config);
		// dirs
		DirectoryInfo workDir = new DirectoryInfo(config.Get("work.dir", "work"));
		string d = config.Get("docs.dir", "trec");
		dataDir = new DirectoryInfo(d);
		if (!Path.IsPathRooted(d))
		{
			DirectoryInfo underWorkDir = new DirectoryInfo(Path.Combine(workDir.FullName, d));
			// prefer the documented location; keep the current-directory one only if it is the sole match
			if (underWorkDir.Exists || !dataDir.Exists)
			{
				dataDir = underWorkDir;
			}
		}
		// files
		CollectFiles(dataDir, inputFiles);
		if (inputFiles.Count == 0)
		{
			throw new ArgumentException("No files in dataDir: " + dataDir);
		}
		// trec doc parser
		try
		{
			string trecDocParserClassName = config.Get("trec.doc.parser", "Lucene.Net.Benchmarks.ByTask.Feeds.TrecGov2Parser, Lucene.Net.Benchmark");
			// throwOnError: an unresolvable name is reported with the offending type name
			trecDocParser = (TrecDocParser)Activator.CreateInstance(Type.GetType(trecDocParserClassName, true));
		}
		catch (Exception e)
		{
			// Should not get here. Throw runtime exception.
			throw new Exception(e.ToString(), e);
		}
		// html parser
		try
		{
			string htmlParserClassName = config.Get("html.parser",
				"Lucene.Net.Benchmarks.ByTask.Feeds.DemoHTMLParser, Lucene.Net.Benchmark");
			// throwOnError: an unresolvable name is reported with the offending type name
			htmlParser = (IHTMLParser)Activator.CreateInstance(Type.GetType(htmlParserClassName, true));
		}
		catch (Exception e)
		{
			// Should not get here. Throw runtime exception.
			throw new Exception(e.ToString(), e);
		}
		// encoding
		if (m_encoding == null)
		{
			m_encoding = Encoding.GetEncoding("iso-8859-1"); //StandardCharsets.ISO_8859_1.name();
		}
		// iteration exclusion in doc name
		excludeDocnameIteration = config.Get("content.source.excludeIteration", false);
	}
	}
	}