blob: b4f4209b6a98bd7b3634a7cf8c0152f908e0c193 [file] [log] [blame]
using J2N.Text;
using Lucene.Net.Benchmarks.ByTask.Feeds;
using Lucene.Net.Benchmarks.ByTask.Utils;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Util;
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
using JCG = J2N.Collections.Generic;
namespace Lucene.Net.Benchmarks.ByTask.Tasks
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// A task which writes documents, one line per document. Each line is in the
/// following format: title &lt;TAB&gt; date &lt;TAB&gt; body. The output of this
/// task can be consumed by <see cref="LineDocSource"/> and is intended
/// to save the IO overhead of opening a file per document to be indexed.
/// </summary>
/// <remarks>
/// The format of the output is set according to the output file extension.
/// Compression is recommended when the output file is expected to be large.
/// See info on file extensions in <see cref="FileType"/>.
/// <para/>
/// Supports the following parameters:
/// <list type="bullet">
/// <item><term>line.file.out</term><description>the name of the file to write the output to. That parameter is mandatory. <b>NOTE:</b> the file is re-created.</description></item>
/// <item><term>line.fields</term><description>which fields should be written in each line. (optional, default: <see cref="DEFAULT_FIELDS"/>).</description></item>
/// <item><term>sufficient.fields</term><description>
/// list of field names, separated by comma, which,
/// if all of them are missing, the document will be skipped. For example, to require
/// that at least one of f1,f2 is not empty, specify: "f1,f2" in this field. To specify
/// that no field is required, i.e. that even empty docs should be emitted, specify <b>","</b>
/// (optional, default: <see cref="DEFAULT_SUFFICIENT_FIELDS"/>).
/// </description></item>
/// </list>
/// <para/>
/// <b>NOTE:</b> this class is not thread-safe and if used by multiple threads the
/// output is unspecified (as all will write to the same output file in a
/// non-synchronized way).
/// </remarks>
public class WriteLineDocTask : PerfTask
{
public const string FIELDS_HEADER_INDICATOR = "FIELDS_HEADER_INDICATOR###";
public const char SEP = '\t';
/// <summary>
/// Fields to be written by default
/// </summary>
public static readonly string[] DEFAULT_FIELDS = new string[] {
DocMaker.TITLE_FIELD,
DocMaker.DATE_FIELD,
DocMaker.BODY_FIELD,
};
/// <summary>
/// Default fields which at least one of them is required to not skip the doc.
/// </summary>
public static readonly string DEFAULT_SUFFICIENT_FIELDS = DocMaker.TITLE_FIELD + ',' + DocMaker.BODY_FIELD;
private int docSize = 0;
protected readonly string m_fname;
private readonly TextWriter lineFileOut;
private readonly DocMaker docMaker;
private readonly DisposableThreadLocal<StringBuilder> threadBuffer = new DisposableThreadLocal<StringBuilder>();
private readonly DisposableThreadLocal<Regex> threadNormalizer = new DisposableThreadLocal<Regex>();
private readonly string[] fieldsToWrite;
private readonly bool[] sufficientFields;
private readonly bool checkSufficientFields;
private readonly object lineFileLock = new object(); // LUCENENET specific - lock to ensure writes don't collide for this instance
public WriteLineDocTask(PerfRunData runData)
: base(runData)
{
Config config = runData.Config;
m_fname = config.Get("line.file.out", null);
if (m_fname == null)
{
throw new ArgumentException("line.file.out must be set");
}
Stream @out = StreamUtils.GetOutputStream(new FileInfo(m_fname));
lineFileOut = new StreamWriter(@out, Encoding.UTF8);
docMaker = runData.DocMaker;
// init fields
string f2r = config.Get("line.fields", null);
if (f2r == null)
{
fieldsToWrite = DEFAULT_FIELDS;
}
else
{
if (f2r.IndexOf(SEP) >= 0)
{
throw new ArgumentException("line.fields " + f2r + " should not contain the separator char: " + SEP);
}
fieldsToWrite = f2r.Split(',').TrimEnd();
}
// init sufficient fields
sufficientFields = new bool[fieldsToWrite.Length];
string suff = config.Get("sufficient.fields", DEFAULT_SUFFICIENT_FIELDS);
if (",".Equals(suff, StringComparison.Ordinal))
{
checkSufficientFields = false;
}
else
{
checkSufficientFields = true;
ISet<string> sf = new JCG.HashSet<string>(suff.Split(',').TrimEnd());
for (int i = 0; i < fieldsToWrite.Length; i++)
{
if (sf.Contains(fieldsToWrite[i]))
{
sufficientFields[i] = true;
}
}
}
WriteHeader(lineFileOut);
}
/// <summary>
/// Write header to the lines file - indicating how to read the file later.
/// </summary>
protected virtual void WriteHeader(TextWriter @out)
{
StringBuilder sb = threadBuffer.Value;
if (sb == null)
{
sb = new StringBuilder();
threadBuffer.Value = sb;
}
sb.Length = 0;
sb.Append(FIELDS_HEADER_INDICATOR);
foreach (string f in fieldsToWrite)
{
sb.Append(SEP).Append(f);
}
lock (lineFileLock) // LUCENENET specific - lock to ensure writes don't collide for this instance
{
@out.WriteLine(sb.ToString());
}
}
protected override string GetLogMessage(int recsCount)
{
return "Wrote " + recsCount + " line docs";
}
public override int DoLogic()
{
Document doc = docSize > 0 ? docMaker.MakeDocument(docSize) : docMaker.MakeDocument();
Regex matcher = threadNormalizer.Value;
if (matcher == null)
{
matcher = new Regex("[\t\r\n]+", RegexOptions.Compiled);
threadNormalizer.Value = matcher;
}
StringBuilder sb = threadBuffer.Value;
if (sb == null)
{
sb = new StringBuilder();
threadBuffer.Value = sb;
}
sb.Length = 0;
bool sufficient = !checkSufficientFields;
for (int i = 0; i < fieldsToWrite.Length; i++)
{
IIndexableField f = doc.GetField(fieldsToWrite[i]);
string text = f == null ? "" : matcher.Replace(f.GetStringValue(), " ").Trim();
sb.Append(text).Append(SEP);
sufficient |= text.Length > 0 && sufficientFields[i];
}
if (sufficient)
{
sb.Length--; // remove redundant last separator
// lineFileOut is a PrintWriter, which synchronizes internally in println.
lock (lineFileLock) // LUCENENET specific - lock to ensure writes don't collide for this instance
{
LineFileOut(doc).WriteLine(sb.ToString());
}
}
return 1;
}
/// <summary>
/// Selects output line file by written doc.
/// Default: original output line file.
/// </summary>
protected virtual TextWriter LineFileOut(Document doc)
{
return lineFileOut;
}
protected override void Dispose(bool disposing)
{
if (disposing)
{
threadBuffer.Dispose(); // LUCENENET specific: ThreadLocal is disposable
threadNormalizer.Dispose(); // LUCENENET specific: ThreadLocal is disposable
lineFileOut.Dispose();
}
base.Dispose(disposing);
}
/// <summary>
/// Set the params (docSize only)
/// </summary>
/// <param name="params">docSize, or 0 for no limit.</param>
public override void SetParams(string @params)
{
base.SetParams(@params);
docSize = (int)float.Parse(@params, CultureInfo.InvariantCulture);
}
public override bool SupportsParams => true;
}
}