// LUCENENET TODO: Use HTML Agility pack instead of SAX ?
using J2N.Threading;
using Lucene.Net.Benchmarks.ByTask.Utils;
using Lucene.Net.Util;
using Sax;
using Sax.Helpers;
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
namespace Lucene.Net.Benchmarks.ByTask.Feeds
/// <summary>
/// A <see cref="ContentSource"/> which reads the English Wikipedia dump. You can read
/// the <c>.bz2</c> file directly (it will be decompressed on the fly). Config
/// properties:
/// <list type="bullet">
/// <item><term>keep.image.only.docs</term><description>false|true (default <b>true</b>).</description></item>
/// <item><term>docs.file</term><description>&lt;path to the file&gt;</description></item>
/// </list>
/// </summary>
public class EnwikiContentSource : ContentSource
private class Parser : DefaultHandler//, IRunnable
// Thread job that executes Run(); created on demand by Next() and set back to
// null there once the end-of-data exception has been handed to the caller.
private ThreadJob t;
// Set to true at the end of Run(); Next() stops waiting when it sees this.
private bool threadDone;
// Set by Stop(); checked by the wait loops in Next() and EndElement() and by
// the main loop of Run().
private bool stopped = false;
// Single-slot hand-off between EndElement() (which fills it) and Next()
// (which empties it). Indexed by the TITLE/DATE/BODY/ID constants.
private string[] tuple;
// Assigned in Run() when the parser is stopped or the source is not set to
// repeat; Next() rethrows it to the caller.
private NoMoreDataException nmde;
// Collects the character data reported through Characters(); emptied in
// StartElement() for the elements the parser tracks.
private readonly StringBuilder contents = new StringBuilder();
// Values captured for the page currently being parsed; reset to null in
// StartElement() when a new page element begins.
private string title;
private string body;
private string time;
private string id;
// The content source that owns this parser; supplies the input stream and
// the keepImages / m_forever settings.
private readonly EnwikiContentSource outerInstance;
/// <summary>
/// Creates a parser bound to the given content source.
/// </summary>
/// <param name="outerInstance">The content source whose stream and settings this parser reads.</param>
public Parser(EnwikiContentSource outerInstance) => this.outerInstance = outerInstance;
/// <summary>
/// Returns the next parsed page as a tuple indexed by the TITLE, DATE, BODY
/// and ID constants, starting the parser thread first if none is running.
/// Blocks until the parser thread hands over a tuple, reports the end of the
/// data, finishes, or is stopped.
/// </summary>
/// <returns>The tuple produced by the parser thread.</returns>
/// <exception cref="NoMoreDataException">
/// The parser reported the end of the data, or the parser thread finished
/// (or was stopped) without producing another tuple.
/// </exception>
internal string[] Next()
{
    if (t == null)
    {
        threadDone = false;
        t = new ThreadJob(Run);
        t.IsBackground = true;
        // The job must actually be started; otherwise nothing ever fills the
        // slot and the wait below can never be satisfied.
        t.Start();
    }

    lock (this)
    {
        // Monitor.Wait releases the lock while blocked, which lets the parser
        // thread enter its own lock(this) sections to publish a tuple.
        while (tuple == null && nmde == null && !threadDone && !stopped)
        {
            Monitor.Wait(this);
        }

        if (tuple != null)
        {
            string[] result = tuple;
            tuple = null;
            // Wake every waiter: if several threads call Next() concurrently,
            // a single Pulse could wake another consumer instead of the
            // parser thread that is waiting for the slot to become free.
            Monitor.PulseAll(this);
            return result;
        }

        if (nmde != null)
        {
            // Set to null so we will re-start thread in case
            // we are re-used:
            t = null;
            throw nmde;
        }

        // The thread has exited yet did not hit end of
        // data, so this means it hit an exception. We
        // throw NoMoreDataException here to force
        // benchmark to stop the current alg:
        throw new NoMoreDataException();
    }
}
/// <summary>
/// Rearranges a timestamp by fixed character positions: year at 0-3, month
/// number at 5-6, day at 8-9 and time of day at 11-18.
/// </summary>
/// <param name="original">The timestamp text; must be at least 19 characters long.</param>
/// <returns>
/// The day, the upper-case month abbreviation and the year concatenated,
/// followed by a space and the time of day.
/// </returns>
// NOTE(review): day, month and year are joined without separators - confirm
// this is the format the consumer of the DATE value expects.
internal string Time(string original)
{
    string day = original.Substring(8, 2);
    int monthNumber = int.Parse(original.Substring(5, 2), CultureInfo.InvariantCulture);
    string monthName = months[monthNumber - 1];
    string year = original.Substring(0, 4);
    string timeOfDay = original.Substring(11, 8);

    return string.Concat(day, monthName, year, " ", timeOfDay);
}
/// <summary>
/// Appends the reported slice of character data to the buffer for the
/// element currently being read.
/// </summary>
public override void Characters(char[] ch, int start, int length)
    => contents.Append(ch, start, length);
/// <summary>
/// Handles the end of an element. For the tracked elements the buffered
/// character data is stored as the page's body, date, title or id. At the end
/// of a page the collected values are published as a tuple for Next().
/// </summary>
/// <param name="namespace">Not used.</param>
/// <param name="simple">Not used.</param>
/// <param name="qualified">Qualified element name; selects the element type.</param>
public override void EndElement(string @namespace, string simple, string qualified)
{
    switch (GetElementType(qualified))
    {
        case PAGE:
            // the body must NOT be null (redirect bodies are nulled out below) and
            // we either are keeping image docs or the title does not start with Image:
            if (body != null && (outerInstance.keepImages || !title.StartsWith("Image:", StringComparison.Ordinal)))
            {
                string[] tmpTuple = new string[LENGTH];
                tmpTuple[TITLE] = title.Replace('\t', ' ');
                tmpTuple[DATE] = time.Replace('\t', ' ');
                // Tabs and newlines each become a single space.
                tmpTuple[BODY] = body.Replace('\t', ' ').Replace('\n', ' ');
                tmpTuple[ID] = id;
                lock (this)
                {
                    // Wait until the consumer has taken the previous tuple
                    // (or the parser has been stopped) before overwriting the slot.
                    while (tuple != null && !stopped)
                    {
                        Monitor.Wait(this); //wait();
                    }
                    tuple = tmpTuple;
                    Monitor.Pulse(this); //notify();
                }
            }
            break;
        case BODY:
            body = contents.ToString();
            // Redirect pages carry no content of their own, so drop their body;
            // the PAGE case above then skips the page.
            if (body.StartsWith("#redirect", StringComparison.OrdinalIgnoreCase))
            {
                body = null;
            }
            break;
        case DATE:
            time = Time(contents.ToString());
            break;
        case TITLE:
            title = contents.ToString();
            break;
        case ID:
            //the doc id is the first one in the page. All other ids after that one can be ignored according to the schema
            if (id == null)
            {
                id = contents.ToString();
            }
            break;
        default:
            // this element should be discarded.
            break;
    }
}
/// <summary>
/// Parses the content source's input stream with a TagSoup SAX reader, using
/// this object as both content handler and error handler. Loops until the
/// parser is stopped, and finally marks the thread as done and signals waiters.
/// </summary>
// NOTE(review): the try / else / finally keywords and the braces that the
// catch clauses and branches below belong to are not visible in this view of
// the file; confirm the exact nesting against the full source before editing.
public void Run()
Sax.IXMLReader reader = new TagSoup.Parser(); //XMLReaderFactory.createXMLReader();
// Content callbacks (StartElement / Characters / EndElement) and error
// callbacks are both delivered to this object.
reader.ContentHandler = this;
reader.ErrorHandler = this;
while (!stopped)
// Take a local snapshot of the stream so it can later be compared with the
// field to detect that the stream was replaced or cleared meanwhile.
Stream localFileIS = outerInstance.@is;
if (localFileIS != null)
{ // null means fileIS was closed on us
// To work around a bug in XERCES (XERCESJ-1257), we assume the XML is always UTF8, so we simply provide reader.
reader.Parse(new InputSource(IOUtils.GetDecodingReader(localFileIS, Encoding.UTF8)));
catch (IOException /*ioe*/)
// Compare under the content source's lock, the same lock Dispose takes
// while clearing the stream field.
lock (outerInstance)
if (localFileIS != outerInstance.@is)
// fileIS was closed on us, so, just fall through
// Exception is real
throw; // LUCENENET: CA2200: Rethrow to preserve stack details
lock (this)
// When stopped, or when the source is not set to repeat (m_forever is
// false), record the end-of-data exception for Next() and wake a waiter.
if (stopped || !outerInstance.m_forever)
nmde = new NoMoreDataException();
Monitor.Pulse(this); //notify();
else if (localFileIS == outerInstance.@is)
// If file is not already re-opened then re-open it now
outerInstance.@is = outerInstance.OpenInputStream();
// Parser and I/O failures are rethrown wrapped in a plain Exception, keeping
// the original exception as the inner exception.
catch (SAXException sae)
throw new Exception(sae.ToString(), sae);
catch (IOException ioe)
throw new Exception(ioe.ToString(), ioe);
// Mark the thread as finished and wake a waiter so Next() does not block
// on a parser that is no longer running.
lock (this)
threadDone = true;
Monitor.Pulse(this); //Notify();
/// <summary>
/// Handles the start of an element. A new page resets the values captured for
/// the previous page; every tracked element (including the page itself)
/// empties the character buffer so only its own text is collected.
/// </summary>
/// <param name="namespace">Not used.</param>
/// <param name="simple">Not used.</param>
/// <param name="qualified">Qualified element name; selects the element type.</param>
/// <param name="attributes">Not used.</param>
public override void StartElement(string @namespace, string simple, string qualified,
    IAttributes attributes)
{
    switch (GetElementType(qualified))
    {
        case PAGE:
            title = null;
            body = null;
            time = null;
            id = null;
            // intentional fall-through: C# has no implicit fall-through
            // between non-empty sections, so jump explicitly.
            goto case BODY;
        case BODY:
        case DATE:
        case TITLE:
        case ID:
            contents.Length = 0;
            break;
        default:
            // this element should be discarded.
            break;
    }
}
/// <summary>
/// Asks the parser to stop: sets the stopped flag, discards any tuple that
/// has not been consumed yet, and wakes every thread waiting on this parser
/// so each one re-checks the flag.
/// </summary>
internal void Stop()
{
    lock (this)
    {
        stopped = true;
        tuple = null;
        // Always signal, and signal everyone: both the producer (waiting for a
        // free slot in EndElement) and consumers (waiting for data in Next)
        // test `stopped` in their wait conditions and must get a chance to
        // observe it, whether or not a tuple was pending.
        Monitor.PulseAll(this); //NotifyAll();
    }
}
// Maps the XML element names the parser tracks to their element-type
// constants; GetElementType reports -1 for any name not listed here.
// LUCENENET: Avoid static constructors
private static readonly IDictionary<string, int?> ELEMENTS = new Dictionary<string, int?>
{
    ["page"] = PAGE,
    ["text"] = BODY,
    ["timestamp"] = DATE,
    ["title"] = TITLE,
    ["id"] = ID
};
// Slot indices of the string[] tuple handed from the parser to the consumer.
// The same values are used as element-type ids in ELEMENTS / GetElementType.
private const int TITLE = 0;
private const int DATE = TITLE + 1;
private const int BODY = DATE + 1;
private const int ID = BODY + 1;
// Number of slots in a tuple (one for each constant above).
private const int LENGTH = ID + 1;
// LENGTH is used as the size of the tuple, so whatever constants we need that
// should not be part of the tuple, we should define them after LENGTH.
private const int PAGE = LENGTH + 1;
// Upper-case three-letter month abbreviations, indexed by month number minus
// one (see Time).
private static readonly string[] months = {"JAN", "FEB", "MAR", "APR",
"MAY", "JUN", "JUL", "AUG",
"SEP", "OCT", "NOV", "DEC"};
/// <summary>
/// Creates the content source together with the parser that will read its
/// input stream.
/// </summary>
public EnwikiContentSource() => parser = new Parser(this);
/// <summary>
/// Looks up the element-type constant registered for an element name, so
/// that StartElement and EndElement can switch on an integer instead of
/// comparing the qualified name again and again.
/// </summary>
/// <param name="elem">The qualified element name.</param>
/// <returns>The registered element type, or -1 when the name is not registered.</returns>
private static int GetElementType(string elem)
{
    if (ELEMENTS.TryGetValue(elem, out int? type) && type.HasValue)
    {
        return type.Value;
    }

    return -1;
}
// File to read; set in SetConfig from the "docs.file" config property.
private FileInfo file;
// Whether pages whose title starts with "Image:" are kept (checked in
// Parser.EndElement); true unless SetConfig changes it.
private bool keepImages = true;
// The currently open input stream. Read by the parser thread; assigned in
// ResetInputs and by the parser when it re-opens the file; cleared in Dispose.
private Stream @is;
// The parser instance, created once in the constructor.
private readonly Parser parser;
/// <summary>
/// Releases the resources held by this content source: asks the parser to
/// stop, then disposes and clears the input stream.
/// </summary>
/// <param name="disposing"><c>true</c> when called from Dispose(); nothing is done otherwise.</param>
protected override void Dispose(bool disposing)
{
    if (disposing)
    {
        // Same lock the parser thread takes when it checks whether the
        // stream was closed underneath it.
        lock (this)
        {
            // Tell the parser to stop so it does not keep reading (or block
            // forever waiting for a consumer) once the stream is gone.
            parser.Stop();
            if (@is != null)
            {
                Thread.Sleep(1); // LUCENENET: Allow parser to stop before Dispose() is called
                // Release the underlying stream instead of merely dropping
                // the reference to it.
                @is.Dispose();
                @is = null;
            }
        }
    }
}
/// <summary>
/// Fetches the next parsed page from the parser and copies its id, body and
/// title into <paramref name="docData"/>.
/// </summary>
/// <param name="docData">The instance to fill.</param>
/// <returns>The same <paramref name="docData"/> instance that was passed in.</returns>
// NOTE(review): the DATE slot of the tuple is not copied to docData here -
// confirm whether that is intended.
public override DocData GetNextDocData(DocData docData)
{
    string[] page = parser.Next();

    docData.Name = page[ID];
    docData.Body = page[BODY];
    docData.Title = page[TITLE];

    return docData;
}
/// <summary>
/// Resets this source so it reads from the beginning again: lets the base
/// class reset its own state, then opens a fresh input stream.
/// </summary>
public override void ResetInputs()
{
    // Chain to the base implementation so whatever state the base class
    // tracks is reset together with the stream.
    base.ResetInputs();
    @is = OpenInputStream();
}
/// <summary>
/// Opens the input stream for the configured file.
/// </summary>
/// <returns>The stream obtained from <c>StreamUtils.GetInputStream</c> for the configured file.</returns>
protected virtual Stream OpenInputStream() => StreamUtils.GetInputStream(file);
/// <summary>
/// Reads this source's settings from the benchmark configuration:
/// <c>keep.image.only.docs</c> (default <c>true</c>) and <c>docs.file</c>
/// (the path of the file to read).
/// </summary>
/// <param name="config">The benchmark configuration.</param>
public override void SetConfig(Config config)
{
    // Let the base class read its own settings first; this class relies on
    // base-class state such as m_forever that it never assigns itself.
    base.SetConfig(config);
    // The property must be looked up by name; an empty key can never match a
    // configured property, so the default would always win.
    keepImages = config.Get("keep.image.only.docs", true);
    string fileName = config.Get("docs.file", null);
    if (fileName != null)
    {
        file = new FileInfo(fileName);
    }
}