| using J2N.Text; |
| using Lucene.Net.Benchmarks.ByTask.Utils; |
| using System; |
| using System.Collections.Generic; |
| using System.Globalization; |
| using System.IO; |
| using System.Text; |
| using System.Threading; |
| using Console = Lucene.Net.Support.SystemConsole; |
| |
| namespace Lucene.Net.Benchmarks.ByTask.Feeds |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| /// <summary> |
| /// Implements a <see cref="ContentSource"/> over the TREC collection. |
| /// </summary> |
| /// <remarks> |
| /// Supports the following configuration parameters (on top of |
| /// <see cref="ContentSource"/>): |
| /// <list type="bullet"> |
| /// <item><term>work.dir</term><description>specifies the working directory. Required if "docs.dir" |
| /// denotes a relative path (<b>default=work</b>).</description></item> |
| /// <item><term>docs.dir</term><description>specifies the directory where the TREC files reside. |
| /// Can be set to a relative path if "work.dir" is also specified |
| /// (<b>default=trec</b>). |
| /// </description></item> |
| /// <item><term>trec.doc.parser</term><description>specifies the <see cref="TrecDocParser"/> class to use for |
| /// parsing the TREC documents content (<b>default=TrecGov2Parser</b>). |
| /// </description></item> |
| /// <item><term>html.parser</term><description>specifies the <see cref="IHTMLParser"/> class to use for |
| /// parsing the HTML parts of the TREC documents content (<b>default=DemoHTMLParser</b>). |
| /// </description></item> |
| /// <item><term>content.source.encoding</term><description>if not specified, ISO-8859-1 is used.</description></item> |
| /// <item>content.source.excludeIteration<term></term><description>if <c>true</c>, do not append iteration number to docname</description></item> |
| /// </list> |
| /// </remarks> |
| public class TrecContentSource : ContentSource |
| { |
| // LUCENENET specific - DateFormatInfo not used |
| |
| public static readonly string DOCNO = "<DOCNO>"; |
| public static readonly string TERMINATING_DOCNO = "</DOCNO>"; |
| public static readonly string DOC = "<DOC>"; |
| public static readonly string TERMINATING_DOC = "</DOC>"; |
| |
| /// <summary>separator between lines in the buffer</summary> |
| public static readonly string NEW_LINE = Environment.NewLine; |
| |
| private static readonly string[] DATE_FORMATS = { |
| // LUCENENET specific: in JAVA, they don't care if it is an abbreviated or a full month name when parsing |
| // so we provide definitions for both ways. |
| "ddd, dd MMM yyyy hh:mm:ss K", // Tue, 09 Dec 2003 22:39:08 GMT |
| "ddd, dd MMMM yyyy hh:mm:ss K", // Tue, 09 December 2003 22:39:08 GMT |
| "ddd MMM dd hh:mm:ss yyyy K", // Tue Dec 09 16:45:08 2003 EST |
| "ddd MMMM dd hh:mm:ss yyyy K", // Tue December 09 16:45:08 2003 EST |
| "ddd, dd-MMM-':'y hh:mm:ss K", // Tue, 09 Dec 2003 22:39:08 GMT |
| "ddd, dd-MMMM-':'y hh:mm:ss K", // Tue, 09 December 2003 22:39:08 GMT |
| "ddd, dd-MMM-yyy hh:mm:ss K", // Tue, 09 Dec 2003 22:39:08 GMT |
| "ddd, dd-MMMM-yyy hh:mm:ss K", // Tue, 09 December 2003 22:39:08 GMT |
| "ddd MMM dd hh:mm:ss yyyy", // Tue Dec 09 16:45:08 2003 |
| "ddd MMMM dd hh:mm:ss yyyy", // Tue December 09 16:45:08 2003 |
| "dd MMM yyyy", // 1 Mar 1994 |
| "dd MMMM yyyy", // 1 March 1994 |
| "MMM dd, yyyy", // Feb 3, 1994 |
| "MMMM dd, yyyy", // February 3, 1994 |
| "yyMMdd", // 910513 |
| "hhmm K.K.K. MMM dd, yyyy", // 0901 u.t.c. Apr 28, 1994 |
| "hhmm K.K.K. MMMM dd, yyyy", // 0901 u.t.c. April 28, 1994 |
| }; |
| |
| private ThreadLocal<StringBuilder> trecDocBuffer = new ThreadLocal<StringBuilder>(); |
| private DirectoryInfo dataDir = null; |
| private List<FileInfo> inputFiles = new List<FileInfo>(); |
| private int nextFile = 0; |
| // Use to synchronize threads on reading from the TREC documents. |
| private object @lock = new object(); |
| |
| // Required for test |
| internal TextReader reader; |
| internal int iteration = 0; |
| internal IHTMLParser htmlParser; |
| |
| private bool excludeDocnameIteration; |
| private TrecDocParser trecDocParser = new TrecGov2Parser(); // default |
| internal TrecDocParser.ParsePathType currPathType; // not private for tests |
| |
| private StringBuilder GetDocBuffer() |
| { |
| StringBuilder sb = trecDocBuffer.Value; |
| if (sb == null) |
| { |
| sb = new StringBuilder(); |
| trecDocBuffer.Value = sb; |
| } |
| return sb; |
| } |
| |
| internal IHTMLParser HtmlParser |
| { |
| get { return htmlParser; } |
| } |
| |
| /// <summary> |
| /// Read until a line starting with the specified <paramref name="lineStart"/>. |
| /// </summary> |
| /// <param name="buf">Buffer for collecting the data if so specified.</param> |
| /// <param name="lineStart">Line start to look for, must not be <c>null</c>.</param> |
| /// <param name="collectMatchLine">Whether to collect the matching line into <c>buffer</c>.</param> |
| /// <param name="collectAll">Whether to collect all lines into <c>buffer</c>.</param> |
| /// <exception cref="IOException">If there is a low-level I/O error.</exception> |
| /// <exception cref="NoMoreDataException">If the source is exhausted.</exception> |
| private void Read(StringBuilder buf, string lineStart, |
| bool collectMatchLine, bool collectAll) |
| { |
| string sep = ""; |
| while (true) |
| { |
| string line = reader.ReadLine(); |
| |
| if (line == null) |
| { |
| OpenNextFile(); |
| continue; |
| } |
| |
| var _ = line.Length; |
| |
| if (lineStart != null && line.StartsWith(lineStart, StringComparison.Ordinal)) |
| { |
| if (collectMatchLine) |
| { |
| buf.Append(sep).Append(line); |
| sep = NEW_LINE; |
| } |
| return; |
| } |
| |
| if (collectAll) |
| { |
| buf.Append(sep).Append(line); |
| sep = NEW_LINE; |
| } |
| } |
| } |
| |
| internal virtual void OpenNextFile() |
| { |
| Dispose(); |
| //currPathType = null; |
| while (true) |
| { |
| if (nextFile >= inputFiles.Count) |
| { |
| // exhausted files, start a new round, unless forever set to false. |
| if (!m_forever) |
| { |
| throw new NoMoreDataException(); |
| } |
| nextFile = 0; |
| iteration++; |
| } |
| FileInfo f = inputFiles[nextFile++]; |
| if (m_verbose) |
| { |
| Console.WriteLine("opening: " + f + " length: " + f.Length); |
| } |
| try |
| { |
| Stream inputStream = StreamUtils.GetInputStream(f); // support either gzip, bzip2, or regular text file, by extension |
| reader = new StreamReader(inputStream, m_encoding); |
| currPathType = TrecDocParser.PathType(f); |
| return; |
| } |
| catch (Exception e) |
| { |
| if (m_verbose) |
| { |
| Console.WriteLine("Skipping 'bad' file " + f.FullName + " due to " + e.Message); |
| continue; |
| } |
| throw new NoMoreDataException(); |
| } |
| } |
| } |
| |
| public virtual DateTime? ParseDate(string dateStr) |
| { |
| dateStr = dateStr.Trim(); |
| DateTime d; |
| if (DateTime.TryParseExact(dateStr, DATE_FORMATS, CultureInfo.InvariantCulture, DateTimeStyles.None, out d)) |
| { |
| return d; |
| } |
| else if (DateTime.TryParse(dateStr, CultureInfo.InvariantCulture, DateTimeStyles.None, out d)) |
| { |
| return d; |
| } |
| |
| // do not fail test just because a date could not be parsed |
| if (m_verbose) |
| { |
| Console.WriteLine("failed to parse date (assigning 'now') for: " + dateStr); |
| } |
| return null; |
| } |
| |
| protected override void Dispose(bool disposing) |
| { |
| if (reader == null) |
| { |
| return; |
| } |
| |
| try |
| { |
| reader.Dispose(); |
| } |
| catch (IOException e) |
| { |
| if (m_verbose) |
| { |
| Console.WriteLine("failed to dispose reader !"); |
| Console.WriteLine(e.ToString()); |
| } |
| } |
| reader = null; |
| } |
| |
| public override DocData GetNextDocData(DocData docData) |
| { |
| string name = null; |
| StringBuilder docBuf = GetDocBuffer(); |
| TrecDocParser.ParsePathType parsedPathType; |
| |
| // protect reading from the TREC files by multiple threads. The rest of the |
| // method, i.e., parsing the content and returning the DocData can run unprotected. |
| lock (@lock) |
| { |
| if (reader == null) |
| { |
| OpenNextFile(); |
| } |
| |
| // 1. skip until doc start - required for all TREC formats |
| docBuf.Length = 0; |
| Read(docBuf, DOC, false, false); |
| |
| // save parsedFile for passing trecDataParser after the sync block, in |
| // case another thread will open another file in between. |
| parsedPathType = currPathType; |
| |
| // 2. name - required for all TREC formats |
| docBuf.Length = 0; |
| Read(docBuf, DOCNO, true, false); |
| name = docBuf.ToString(DOCNO.Length, docBuf.IndexOf(TERMINATING_DOCNO, |
| DOCNO.Length, StringComparison.Ordinal) - DOCNO.Length).Trim(); |
| |
| if (!excludeDocnameIteration) |
| { |
| name = name + "_" + iteration; |
| } |
| |
| // 3. read all until end of doc |
| docBuf.Length = 0; |
| Read(docBuf, TERMINATING_DOC, false, true); |
| } |
| |
| // count char length of text to be parsed (may be larger than the resulted plain doc body text). |
| AddBytes(docBuf.Length); |
| |
| // This code segment relies on HtmlParser being thread safe. When we get |
| // here, everything else is already private to that thread, so we're safe. |
| docData = trecDocParser.Parse(docData, name, this, docBuf, parsedPathType); |
| AddItem(); |
| |
| return docData; |
| } |
| |
| public override void ResetInputs() |
| { |
| lock (@lock) |
| { |
| base.ResetInputs(); |
| Dispose(); |
| nextFile = 0; |
| iteration = 0; |
| } |
| } |
| |
| public override void SetConfig(Config config) |
| { |
| base.SetConfig(config); |
| // dirs |
| DirectoryInfo workDir = new DirectoryInfo(config.Get("work.dir", "work")); |
| string d = config.Get("docs.dir", "trec"); |
| dataDir = new DirectoryInfo(d); |
| // files |
| CollectFiles(dataDir, inputFiles); |
| if (inputFiles.Count == 0) |
| { |
| throw new ArgumentException("No files in dataDir: " + dataDir); |
| } |
| // trec doc parser |
| try |
| { |
| string trecDocParserClassName = config.Get("trec.doc.parser", "Lucene.Net.Benchmarks.ByTask.Feeds.TrecGov2Parser, Lucene.Net.Benchmark"); |
| trecDocParser = (TrecDocParser)Activator.CreateInstance(Type.GetType(trecDocParserClassName)); |
| } |
| catch (Exception e) |
| { |
| // Should not get here. Throw runtime exception. |
| throw new Exception(e.ToString(), e); |
| } |
| // html parser |
| try |
| { |
| string htmlParserClassName = config.Get("html.parser", |
| "Lucene.Net.Benchmarks.ByTask.Feeds.DemoHTMLParser, Lucene.Net.Benchmark"); |
| htmlParser = (IHTMLParser)Activator.CreateInstance(Type.GetType(htmlParserClassName)); |
| } |
| catch (Exception e) |
| { |
| // Should not get here. Throw runtime exception. |
| throw new Exception(e.ToString(), e); |
| } |
| // encoding |
| if (m_encoding == null) |
| { |
| m_encoding = Encoding.GetEncoding("iso-8859-1"); //StandardCharsets.ISO_8859_1.name(); |
| } |
| // iteration exclusion in doc name |
| excludeDocnameIteration = config.Get("content.source.excludeIteration", false); |
| } |
| } |
| } |