// blob: e7f38f73f6898d88ba76d0382bead45ac4c9866f [file] [log] [blame]
using Lucene.Net.Benchmarks.ByTask.Utils;
using Lucene.Net.Support.IO;
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Text;
namespace Lucene.Net.Benchmarks.ByTask.Feeds
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// A <see cref="ContentSource"/> reading from the Reuters collection.
/// <para/>
/// Every input file holds one document: the first line is the date, the third
/// line is the title and all lines from the fifth one on form the body.
/// <para/>
/// Config properties:
/// <list type="bullet">
/// <item><term><b>work.dir</b></term><description>path to the root of docs and indexes dirs (default <b>work</b>).</description></item>
/// <item><term><b>docs.dir</b></term><description>path to the docs dir (default <b>reuters-out</b>).</description></item>
/// </list>
/// </summary>
public class ReutersContentSource : ContentSource
{
    // LUCENENET specific: DateFormatInfo not used

    /// <summary>
    /// Exact layout tried first by <see cref="ParseDate(string)"/>.
    /// <c>HH</c> is the 24-hour clock (<c>hh</c> would reject hours 13-23 and 00) and
    /// <c>FFF</c> accepts zero to three fractional-second digits instead of exactly three.
    /// </summary>
    private const string ExactDateFormat = "dd-MMM-yyyy HH:mm:ss.FFF";

    private DirectoryInfo dataDir = null;
    private readonly List<FileInfo> inputFiles = new List<FileInfo>(); // LUCENENET: marked readonly
    private int nextFile = 0;   // index into inputFiles of the next file to hand out
    private int iteration = 0;  // number of completed passes over inputFiles

    /// <summary>
    /// Reads <c>work.dir</c> and <c>docs.dir</c> from <paramref name="config"/>, resolves the
    /// data directory as <c>work.dir/docs.dir</c> and rebuilds the list of input files.
    /// </summary>
    /// <param name="config">Benchmark configuration to read the directory settings from.</param>
    /// <exception cref="Exception">No input file was collected from the data directory.</exception>
    public override void SetConfig(Config config)
    {
        base.SetConfig(config);
        DirectoryInfo workDir = new DirectoryInfo(config.Get("work.dir", "work"));
        string d = config.Get("docs.dir", "reuters-out");
        dataDir = new DirectoryInfo(Path.Combine(workDir.FullName, d));
        inputFiles.Clear();
        // CollectFiles is inherited; which files it selects is not visible here
        // (presumably *.txt, judging by the error message below - confirm in ContentSource).
        CollectFiles(dataDir, inputFiles);
        if (inputFiles.Count == 0)
        {
            throw new Exception("No txt files in dataDir: " + dataDir.FullName);
        }
    }

    // LUCENENET specific: DateFormatInfo not used
    /// <summary>
    /// Parses a document date, first with <see cref="ExactDateFormat"/> and then with the
    /// general culture-invariant parser.
    /// </summary>
    /// <param name="dateStr">Date text; may be <c>null</c> or empty.</param>
    /// <returns>The parsed date, or <c>null</c> when the text is missing or not recognized.</returns>
    private static DateTime? ParseDate(string dateStr)
    {
        if (string.IsNullOrEmpty(dateStr))
        {
            return null;
        }

        if (DateTime.TryParseExact(dateStr, ExactDateFormat, CultureInfo.InvariantCulture, DateTimeStyles.None, out DateTime temp))
        {
            return temp;
        }

        if (DateTime.TryParse(dateStr, CultureInfo.InvariantCulture, DateTimeStyles.None, out temp))
        {
            return temp;
        }

        return null;
    }

    /// <summary>
    /// Intentionally empty: this class keeps no open resources between calls, since
    /// <see cref="GetNextDocData(DocData)"/> opens and closes each file itself.
    /// </summary>
    protected override void Dispose(bool disposing)
    {
    }

    /// <summary>
    /// Reads the next input file and fills <paramref name="docData"/> with its name,
    /// date, title and body. After the last file the list is restarted (and the
    /// iteration counter incremented) unless <c>m_forever</c> is <c>false</c>.
    /// </summary>
    /// <param name="docData">Instance to clear and populate.</param>
    /// <returns>The same <paramref name="docData"/> instance.</returns>
    /// <exception cref="NoMoreDataException">All files were consumed and <c>m_forever</c> is <c>false</c>.</exception>
    public override DocData GetNextDocData(DocData docData)
    {
        FileInfo f;
        string name;
        // NOTE(review): locking on 'this' is kept as-is; the base class is not visible
        // here and may synchronize on the same monitor.
        lock (this)
        {
            if (nextFile >= inputFiles.Count)
            {
                // exhausted files, start a new round, unless forever set to false.
                if (!m_forever)
                {
                    throw new NoMoreDataException();
                }
                nextFile = 0;
                iteration++;
            }
            f = inputFiles[nextFile++];
            name = f.GetCanonicalPath() + "_" + iteration;
        }

        string dateStr;
        string title;
        StringBuilder bodyBuf = new StringBuilder(1024);

        // Scoped using: the file is closed exactly once, as soon as reading is done.
        using (TextReader reader = new StreamReader(new FileStream(f.FullName, FileMode.Open, FileAccess.Read), Encoding.UTF8))
        {
            // First line is the date, 3rd is the title, rest is body
            dateStr = reader.ReadLine();
            reader.ReadLine(); // skip an empty line
            title = reader.ReadLine();
            reader.ReadLine(); // skip an empty line

            string line;
            while ((line = reader.ReadLine()) != null)
            {
                // Line breaks are replaced by a single space.
                bodyBuf.Append(line).Append(' ');
            }
        }

        AddBytes(f.Length);

        // ReadLine returns null for an empty file; tolerate that instead of throwing
        // NullReferenceException - the document then simply has no date.
        DateTime? date = ParseDate(dateStr?.Trim());

        docData.Clear();
        docData.Name = name;
        docData.Body = bodyBuf.ToString();
        docData.Title = title;
        docData.SetDate(date);
        return docData;
    }

    /// <summary>
    /// Resets the base class inputs and rewinds this source to the first file of
    /// iteration 0.
    /// </summary>
    public override void ResetInputs()
    {
        lock (this)
        {
            base.ResetInputs();
            nextFile = 0;
            iteration = 0;
        }
    }
}
}