blob: 3868ba3d24bea91d6848b30bc9a3f2a83453683d [file] [log] [blame]
using J2N.Text;
using Lucene.Net.Benchmarks.ByTask.Utils;
using Lucene.Net.Support.IO;
using System;
using System.Collections;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Text;
// LUCENENET TODO: This had to be refactored significantly. We need tests to confirm it works.
namespace Lucene.Net.Benchmarks.ByTask.Feeds
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/// <summary>
/// A <see cref="ContentSource"/> using the Dir collection for its input. Supports
/// the following configuration parameters (on top of <see cref="ContentSource"/>):
/// <list type="bullet">
/// <item><term>work.dir</term><description>specifies the working directory. Required if "docs.dir" denotes a relative path (<b>default=work</b>).</description></item>
/// <item><term>docs.dir</term><description>specifies the directory of the Dir collection. Can be set to a relative path if "work.dir" is also specified (<b>default=dir-out</b>).</description></item>
/// </list>
/// </summary>
public class DirContentSource : ContentSource
/// <summary>
/// Iterator over the files in the directory.
/// </summary>
public class Enumerator : IEnumerator<FileInfo>
/// <summary>
/// Orders <see cref="FileInfo"/> instances by their string form after
/// left-padding the shorter string with '0' characters to the length of the
/// longer one (so "9.txt" is compared as "09.txt" against "10.txt").
/// The result is deliberately reversed: the sorted files are pushed onto a
/// stack, which reverses the order once more when they are popped.
/// </summary>
private class Comparer : IComparer<FileInfo>
{
    /// <summary>
    /// Compares two files using the zero-padded, reversed ordinal ordering.
    /// </summary>
    /// <param name="a">The first file.</param>
    /// <param name="b">The second file.</param>
    /// <returns>
    /// A negative value when <paramref name="b"/> sorts before <paramref name="a"/>
    /// in padded ordinal order, zero when they are equal, otherwise a positive value.
    /// </returns>
    public int Compare(FileInfo a, FileInfo b)
    {
        string left = a.ToString();
        string right = b.ToString();

        // Pad both to the common width in one step; PadLeft is a no-op for the
        // longer string and avoids building the padding one character at a time.
        int width = Math.Max(left.Length, right.Length);
        left = left.PadLeft(width, '0');
        right = right.PadLeft(width, '0');

        /* note it's reversed because we're going to push,
           which reverses again */
        return string.CompareOrdinal(right, left);
    }
}
// Backing field for the Count property.
internal int count = 0;
// Pending files; MoveNext pops the next file from the top of this stack.
internal Stack<FileInfo> stack = new Stack<FileInfo>();
/* this seems silly ... there must be a better way ...
not that this is good, but can it matter? */
// Comparer handed to Array.Sort when an array of files is pushed.
private readonly Comparer c = new Comparer(); // LUCENENET: marked readonly
// The file most recently popped by MoveNext, or null once the stack is exhausted.
private FileInfo current;
/// <summary>
/// Creates an enumerator for the directory <paramref name="f"/>.
/// </summary>
/// <param name="f">The directory to enumerate.</param>
// NOTE(review): the constructor body is not visible in this view; presumably it
// seeds the stack by pushing f - confirm against the full file.
public Enumerator(DirectoryInfo f)
/// <summary>
/// Pushes the contents of directory <paramref name="f"/>, visiting each of its
/// immediate subdirectories as returned by <see cref="DirectoryInfo.GetDirectories()"/>.
/// </summary>
/// <param name="f">The directory to walk.</param>
// NOTE(review): the loop body and any statements after the loop are not visible in
// this view; presumably each subdirectory is pushed recursively and the directory's
// own files are pushed afterwards - confirm against the full file.
internal void Push(DirectoryInfo f)
foreach (var dir in f.GetDirectories())
/// <summary>
/// Sorts <paramref name="files"/> in place with the comparer <c>c</c> and then
/// walks the sorted array from first to last.
/// </summary>
/// <param name="files">The files to process; the array is reordered by the sort.</param>
// NOTE(review): apart from the commented-out debug print, the loop body is not
// visible in this view; presumably each file is pushed onto the stack - confirm.
internal void Push(FileInfo[] files)
Array.Sort(files, c);
for (int i = 0; i < files.Length; i++)
// System.err.println("push " + files[i]);
/// <summary>
/// Gets the current value of the <c>count</c> field.
/// </summary>
public virtual int Count
{
    get { return count; }
}
/// <summary>
/// Advances to the next file by popping it off the stack.
/// </summary>
/// <returns>
/// <c>true</c> if a file was popped and is now available through <see cref="Current"/>;
/// <c>false</c> if the stack was empty, in which case the current file is cleared to <c>null</c>.
/// </returns>
public virtual bool MoveNext()
{
    bool hasMore = stack.Count != 0;
    current = hasMore ? stack.Pop() : null;
    return hasMore;
}
/// <summary>
/// Gets the file most recently produced by <see cref="MoveNext"/>, or <c>null</c>
/// if the last call to <see cref="MoveNext"/> found nothing left.
/// </summary>
public virtual FileInfo Current
{
    get { return current; }
}

/// <summary>
/// Non-generic view of the current file.
/// </summary>
object IEnumerator.Current
{
    get { return current; }
}
/// <summary>
/// Parameterless dispose entry point (<see cref="IEnumerator{T}"/> requires <see cref="IDisposable"/>).
/// </summary>
// NOTE(review): body not visible in this view; presumably forwards to Dispose(true)
// and suppresses finalization - confirm against the full file.
public void Dispose()
/// <summary>
/// Overridable dispose hook for derived enumerators.
/// </summary>
/// <param name="disposing">NOTE(review): presumably <c>true</c> when called from <see cref="Dispose()"/> - confirm.</param>
// NOTE(review): body not visible in this view.
protected virtual void Dispose(bool disposing)
/// <summary>
/// <see cref="IEnumerator.Reset"/> implementation.
/// </summary>
// NOTE(review): body not visible in this view; confirm whether it rewinds the
// enumerator or is unsupported.
public virtual void Reset()
// Directory holding the documents; assigned in SetConfig from "docs.dir" (and "work.dir").
private DirectoryInfo dataDir = null;
// Appended to each document name after an underscore; set back to 0 by ResetInputs.
private int iteration = 0;
// Enumerator over the files under dataDir; created in SetConfig and ResetInputs.
private Enumerator inputFiles = null;
/// <summary>
/// Parses the date line of a document.
/// </summary>
/// <remarks>
/// The exact format <c>dd-MMM-yyyy HH:mm:ss.fff</c> is tried first, then a general
/// invariant-culture parse as a fallback.
/// </remarks>
/// <param name="dateStr">The raw date text; may be <c>null</c>.</param>
/// <returns>The parsed date, or <c>null</c> if <paramref name="dateStr"/> could not be parsed.</returns>
private static DateTime? ParseDate(string dateStr)
{
    // "HH" is the 24-hour clock. The 12-hour "hh" specifier without an AM/PM
    // designator is treated as AM, which turns an input hour of 12 into 00.
    if (DateTime.TryParseExact(dateStr, "dd-MMM-yyyy HH:mm:ss.fff", CultureInfo.InvariantCulture, DateTimeStyles.None, out DateTime exact))
    {
        return exact;
    }

    if (DateTime.TryParse(dateStr, CultureInfo.InvariantCulture, DateTimeStyles.None, out DateTime lenient))
    {
        return lenient;
    }

    return null;
}
/// <summary>
/// Frees the resources held by this <see cref="DirContentSource"/>. Derived classes
/// may override this to release additional (including unmanaged) resources.
/// </summary>
/// <param name="disposing"><c>true</c> when managed resources should be released as well
/// as unmanaged ones; <c>false</c> when only unmanaged resources should be released.</param>
protected override void Dispose(bool disposing)
{
    if (!disposing)
    {
        return;
    }

    // LUCENENET specific - dispose inputFiles
    if (inputFiles != null)
    {
        inputFiles.Dispose();
    }
    inputFiles = null;
}
/// <summary>
/// Reads the next file from the directory collection into <paramref name="docData"/>.
/// </summary>
/// <remarks>
/// Each file is expected to hold the date on its first line and the title on its
/// third line; lines 2 and 4 are skipped, and every remaining line is appended to
/// the body followed by a single space.
/// </remarks>
/// <param name="docData">The instance to populate; the same instance is returned.</param>
/// <returns><paramref name="docData"/> with name, title and body set.</returns>
/// <exception cref="NoMoreDataException">
/// The files are exhausted and <c>m_forever</c> is <c>false</c>, or a new round was
/// started but the directory yielded no files.
/// </exception>
public override DocData GetNextDocData(DocData docData)
{
    FileInfo f;
    string name;
    lock (this) // same monitor that ResetInputs uses
    {
        if (!inputFiles.MoveNext())
        {
            // exhausted files, start a new round, unless forever set to false.
            if (!m_forever)
            {
                throw new NoMoreDataException();
            }
            inputFiles = new Enumerator(dataDir);
            iteration++;

            // A fresh enumerator has no current element until MoveNext is called;
            // without this call Current is null and the dereference below fails.
            if (!inputFiles.MoveNext())
            {
                throw new NoMoreDataException();
            }
        }
        f = inputFiles.Current;
        // System.err.println(f);
        name = f.GetCanonicalPath() + "_" + iteration;
    }

    string dateStr;
    string title;
    StringBuilder bodyBuf = new StringBuilder(1024);
    using (TextReader reader = new StreamReader(new FileStream(f.FullName, FileMode.Open, FileAccess.Read), Encoding.UTF8))
    {
        // First line is the date, 3rd is the title, rest is body
        dateStr = reader.ReadLine();
        reader.ReadLine(); // skip an empty line
        title = reader.ReadLine();
        reader.ReadLine(); // skip an empty line

        string line;
        while ((line = reader.ReadLine()) != null)
        {
            bodyBuf.Append(line).Append(' ');
        }
    }

    // NOTE(review): the parsed date is not stored on docData in the visible code;
    // confirm whether DocData should receive it.
    DateTime? date = ParseDate(dateStr);

    docData.Name = name;
    docData.Body = bodyBuf.ToString();
    docData.Title = title;
    return docData;
}
/// <summary>
/// Starts over: builds a fresh enumerator for <c>dataDir</c> and sets the
/// iteration counter back to zero, both under the instance lock.
/// </summary>
public override void ResetInputs()
{
    lock (this)
    {
        Enumerator fresh = new Enumerator(dataDir);
        inputFiles = fresh;
        iteration = 0;
    }
}
/// <summary>
/// Reads "work.dir" (default "work") and "docs.dir" (default "dir-out") from
/// <paramref name="config"/>, resolves the data directory and creates the file
/// enumerator over it.
/// </summary>
/// <param name="config">The benchmark configuration.</param>
/// <exception cref="InvalidOperationException">The enumerator over the data directory reports no files.</exception>
// NOTE(review): the visible code does not call base.SetConfig(config); confirm whether
// the base class also needs to see the configuration (m_forever is read elsewhere in
// this class but never assigned here).
public override void SetConfig(Config config)
{
    DirectoryInfo workDir = new DirectoryInfo(config.Get("work.dir", "work"));
    string d = config.Get("docs.dir", "dir-out");

    // A rooted "docs.dir" is used as-is; a relative one is resolved against "work.dir".
    dataDir = new DirectoryInfo(Path.IsPathRooted(d) ? d : Path.Combine(workDir.FullName, d));

    inputFiles = new Enumerator(dataDir);

    // A freshly constructed enumerator is never null, so a null test can never
    // detect an empty collection; ask the enumerator how many files it holds.
    if (inputFiles.Count == 0)
    {
        throw new InvalidOperationException("No txt files in dataDir: " + dataDir.FullName);
    }
}