| Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocMaker.java
|
| ===================================================================
|
| --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocMaker.java (revision 729833)
|
| +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocMaker.java (working copy)
|
| @@ -31,6 +31,10 @@
|
| import java.util.Locale; |
| import java.util.zip.GZIPInputStream; |
| |
| +import org.apache.lucene.benchmark.byTask.feeds.BasicDocMaker; |
| +import org.apache.lucene.benchmark.byTask.feeds.DocData; |
| +import org.apache.lucene.benchmark.byTask.feeds.HTMLParser; |
| +import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException; |
| import org.apache.lucene.benchmark.byTask.utils.Config; |
| |
| |
| @@ -44,7 +48,14 @@
|
| */ |
| public class TrecDocMaker extends BasicDocMaker { |
| |
| - private static final String newline = System.getProperty("line.separator"); |
| + private static final String DATE = "Date: "; |
| + private static final String DOCHDR = "<DOCHDR>"; |
| + private static final String TERM_DOCHDR = "</DOCHDR>"; |
| + private static final String TERM_DOCNO = "</DOCNO>"; |
| + private static final String DOCNO = "<DOCNO>"; |
| + private static final String TERM_DOC = "</DOC>"; |
| + private static final String DOC = "<DOC>"; |
| + private static final String NEW_LINE = System.getProperty("line.separator"); |
| |
| protected ThreadLocal dateFormat = new ThreadLocal(); |
| protected File dataDir = null; |
| @@ -135,25 +146,37 @@
|
| } |
| |
| // read until finding a line that starts with the specified prefix |
| - protected StringBuffer read (String prefix, StringBuffer sb, boolean collectMatchLine, boolean collectAll) throws Exception { |
| + protected StringBuffer read(String prefix, StringBuffer sb, |
| + boolean collectMatchLine, boolean collectAll, |
| + String terminatingTag) throws Exception { |
| sb = (sb==null ? new StringBuffer() : sb); |
| String sep = ""; |
| while (true) { |
| String line = reader.readLine(); |
| - if (line==null) { |
| + if (line == null) { |
| openNextFile(); |
| continue; |
| } |
| if (line.startsWith(prefix)) { |
| if (collectMatchLine) { |
| - sb.append(sep+line); |
| - sep = newline; |
| + sb.append(sep).append(line); |
| + sep = NEW_LINE; |
| } |
| break; |
| } |
| + |
| + if (terminatingTag != null && line.startsWith(terminatingTag)) { |
| + // The requested prefix was not found before the terminating |
| + // tag. Clear the buffer so that the caller sees an empty |
| + // result, which signals that no match was found. |
| + sb.setLength(0); |
| + break; |
| + } |
| + |
| + |
| if (collectAll) { |
| - sb.append(sep+line); |
| - sep = newline; |
| + sb.append(sep).append(line); |
| + sep = NEW_LINE; |
| } |
| } |
| //System.out.println("read: "+sb); |
| @@ -165,22 +188,31 @@
|
| openNextFile(); |
| } |
| // 1. skip until doc start |
| - read("<DOC>",null,false,false); |
| + read(DOC,null,false,false,null); |
| // 2. name |
| - StringBuffer sb = read("<DOCNO>",null,true,false); |
| - String name = sb.substring("<DOCNO>".length()); |
| - name = name.substring(0,name.indexOf("</DOCNO>"))+"_"+iteration; |
| + StringBuffer sb = read(DOCNO,null,true,false,null); |
| + String name = sb.substring(DOCNO.length(), sb.indexOf(TERM_DOCNO, DOCNO.length())); |
| + name = name + "_" + iteration; |
| // 3. skip until doc header |
| - read("<DOCHDR>",null,false,false); |
| + read(DOCHDR,null,false,false,null); |
| + boolean findTerminatingDocHdr = false; |
| // 4. date |
| - sb = read("Date: ",null,true,false); |
| - String dateStr = sb.substring("Date: ".length()); |
| + sb = read(DATE,null,true,false,TERM_DOCHDR); |
| + String dateStr = null; |
| + if (sb.length() != 0) { |
| + // Date found. |
| + dateStr = sb.substring(DATE.length()); |
| + findTerminatingDocHdr = true; |
| + } |
| + |
| // 5. skip until end of doc header |
| - read("</DOCHDR>",null,false,false); |
| + if (findTerminatingDocHdr) { |
| + read(TERM_DOCHDR,null,false,false,null); |
| + } |
| // 6. collect until end of doc |
| - sb = read("</DOC>",null,false,true); |
| + sb = read(TERM_DOC,null,false,true,null); |
| // this is the next document, so parse it |
| - Date date = parseDate(dateStr); |
| + Date date = dateStr != null ? parseDate(dateStr) : new Date(); |
| HTMLParser p = getHtmlParser(); |
| DocData docData = p.parse(name, date, sb, getDateFormat(0)); |
| addBytes(sb.length()); // count char length of parsed html text (larger than the plain doc body text). |