docs/attachments/LUCENE-2768/LUCENE-2768.patch - lucene-jira-archive - Git at Google

 Index: lucene/common-build.xml
 --- lucene/common-build.xml	Thu Dec 09 13:13:27 2010 -0500
 +++ lucene/common-build.xml	Thu Dec 09 13:14:33 2010 -0500
 @@ -68,6 +68,7 @@
    <property name="tests.locale" value="random" />
    <property name="tests.timezone" value="random" />
    <property name="tests.directory" value="random" />
 +  <property name="tests.linedocsfile" value="europarl.lines.txt.gz" />
    <property name="tests.iter" value="1" />
    <property name="tests.seed" value="random" />
    <property name="tests.userdir" value="."/>
 @@ -459,6 +460,8 @@
  	      <sysproperty key="tests.timezone" value="${tests.timezone}"/>
                <!-- set the directory tests should run with -->
                <sysproperty key="tests.directory" value="${tests.directory}"/>
 +              <!-- set the line file source for oal.util.LineFileDocs -->
 +              <sysproperty key="tests.linedocsfile" value="${tests.linedocsfile}"/>
                <!-- set the number of times tests should run -->
                <sysproperty key="tests.iter" value="${tests.iter}"/>
                <!-- set the test seed -->
 Index: lucene/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.py
 --- lucene/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.py	Thu Dec 09 13:13:27 2010 -0500
 +++ lucene/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.py	Thu Dec 09 13:14:33 2010 -0500
 @@ -1,3 +1,18 @@
 +# Licensed to the Apache Software Foundation (ASF) under one or more
 +# contributor license agreements.  See the NOTICE file distributed with
 +# this work for additional information regarding copyright ownership.
 +# The ASF licenses this file to You under the Apache License, Version 2.0
 +# (the "License"); you may not use this file except in compliance with
 +# the License.  You may obtain a copy of the License at
 +#
 +#     http://www.apache.org/licenses/LICENSE-2.0
 +#
 +# Unless required by applicable law or agreed to in writing, software
 +# distributed under the License is distributed on an "AS IS" BASIS,
 +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 +# See the License for the specific language governing permissions and
 +# limitations under the License.
 +
  import types
  import os
  import sys
 Index: lucene/src/test/org/apache/lucene/index/TestNRTThreads.java
 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
 +++ lucene/src/test/org/apache/lucene/index/TestNRTThreads.java	Thu Dec 09 13:14:33 2010 -0500
 @@ -0,0 +1,336 @@
 +package org.apache.lucene.index;
 +
 +/**
 + * Licensed to the Apache Software Foundation (ASF) under one or more
 + * contributor license agreements.  See the NOTICE file distributed with
 + * this work for additional information regarding copyright ownership.
 + * The ASF licenses this file to You under the Apache License, Version 2.0
 + * (the "License"); you may not use this file except in compliance with
 + * the License.  You may obtain a copy of the License at
 + *
 + *     http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing, software
 + * distributed under the License is distributed on an "AS IS" BASIS,
 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 + * See the License for the specific language governing permissions and
 + * limitations under the License.
 + */
 +
 +import java.io.File;
 +import java.io.IOException;
 +import java.util.ArrayList;
 +import java.util.List;
 +import java.util.Set;
 +import java.util.concurrent.atomic.AtomicBoolean;
 +import java.util.concurrent.atomic.AtomicInteger;
 +
 +import org.apache.lucene.analysis.MockAnalyzer;
 +import org.apache.lucene.document.Document;
 +import org.apache.lucene.index.codecs.CodecProvider;
 +import org.apache.lucene.search.IndexSearcher;
 +import org.apache.lucene.search.PhraseQuery;
 +import org.apache.lucene.search.Query;
 +import org.apache.lucene.search.Sort;
 +import org.apache.lucene.search.SortField;
 +import org.apache.lucene.search.TermQuery;
 +import org.apache.lucene.store.FSDirectory;
 +import org.apache.lucene.store.MockDirectoryWrapper;
 +import org.apache.lucene.util.Bits;
 +import org.apache.lucene.util.BytesRef;
 +import org.apache.lucene.util.LineFileDocs;
 +import org.apache.lucene.util.LuceneTestCase;
 +import org.apache.lucene.util._TestUtil;
 +import org.junit.Test;
 +
 +import static org.junit.Assert.*;
 +import static org.junit.Assume.*;
 +
 +// TODO
 +//   - mix in optimize, addIndexes
 +
 +public class TestNRTThreads extends LuceneTestCase {
 +
 +  @Test
 +  public void testNRTThreads() throws Exception {
 +
 +    final long t0 = System.currentTimeMillis();
 +
 +    if (CodecProvider.getDefault().getDefaultFieldCodec().equals("SimpleText")) {
 +      // no
 +      CodecProvider.getDefault().setDefaultFieldCodec("Standard");
 +    }
 +
 +    final LineFileDocs docs = new LineFileDocs(true);
 +    final File tempDir = _TestUtil.getTempDir("nrtopenfiles");
 +    final MockDirectoryWrapper dir = new MockDirectoryWrapper(random, FSDirectory.open(tempDir));
 +    final IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer());
 +    conf.setMergedSegmentWarmer(new IndexWriter.IndexReaderWarmer() {
 +      @Override
 +      public void warm(IndexReader reader) throws IOException {
 +        if (VERBOSE) {
 +          System.out.println("TEST: now warm merged reader=" + reader);
 +        }
 +        final int maxDoc = reader.maxDoc();
 +        final Bits delDocs = reader.getDeletedDocs();
 +        int sum = 0;
 +        final int inc = Math.max(1, maxDoc/50);
 +        for(int docID=0;docID<maxDoc;docID += inc) {
 +          if (delDocs == null || !delDocs.get(docID)) {
 +            final Document doc = reader.document(docID);
 +            sum += doc.getFields().size();
 +          }
 +        }
 +
 +        sum += new IndexSearcher(reader).search(new TermQuery(new Term("body", "united")), 10).totalHits;
 +
 +        if (VERBOSE) {
 +          System.out.println("TEST: warm visited " + sum + " fields");
 +        }
 +      }
 +      });
 +
 +    final IndexWriter writer = new IndexWriter(dir, conf);
 +    if (VERBOSE) {
 +      writer.setInfoStream(System.out);
 +    }
 +    MergeScheduler ms = writer.getConfig().getMergeScheduler();
 +    if (ms instanceof ConcurrentMergeScheduler) {
 +      // try to keep max file open count down
 +      ((ConcurrentMergeScheduler) ms).setMaxThreadCount(1);
 +      ((ConcurrentMergeScheduler) ms).setMaxMergeCount(1);
 +    }
 +    LogMergePolicy lmp = (LogMergePolicy) writer.getConfig().getMergePolicy();
 +    if (lmp.getMergeFactor() > 5) {
 +      lmp.setMergeFactor(5);
 +    }
 +
 +    final int NUM_INDEX_THREADS = 2;
 +    final int NUM_SEARCH_THREADS = 3;
 +    final int RUN_TIME_SEC = LuceneTestCase.TEST_NIGHTLY ? 300 : 5;
 +
 +    final AtomicBoolean failed = new AtomicBoolean();
 +    final AtomicInteger addCount = new AtomicInteger();
 +    final AtomicInteger delCount = new AtomicInteger();
 +    final long stopTime = System.currentTimeMillis() + RUN_TIME_SEC*1000;
 +    Thread[] threads = new Thread[NUM_INDEX_THREADS];
 +    for(int thread=0;thread<NUM_INDEX_THREADS;thread++) {
 +      threads[thread] = new Thread() {
 +          @Override
 +          public void run() {
 +            final List<String> toDeleteIDs = new ArrayList<String>();
 +            while(System.currentTimeMillis() < stopTime && !failed.get()) {
 +              try {
 +                Document doc = docs.nextDoc();
 +                if (doc == null) {
 +                  break;
 +                }
 +                if (random.nextBoolean()) {
 +                  if (VERBOSE) {
 +                    //System.out.println(Thread.currentThread().getName() + ": add doc id:" + doc.get("id"));
 +                  }
 +                  writer.addDocument(doc);
 +                } else {
 +                  // we use update but it never replaces a
 +                  // prior doc
 +                  if (VERBOSE) {
 +                    //System.out.println(Thread.currentThread().getName() + ": update doc id:" + doc.get("id"));
 +                  }
 +                  writer.updateDocument(new Term("id", doc.get("id")), doc);
 +                }
 +                if (random.nextInt(5) == 3) {
 +                  if (VERBOSE) {
 +                    //System.out.println(Thread.currentThread().getName() + ": buffer del id:" + doc.get("id"));
 +                  }
 +                  toDeleteIDs.add(doc.get("id"));
 +                }
 +                if (random.nextInt(50) == 17) {
 +                  if (VERBOSE) {
 +                    System.out.println(Thread.currentThread().getName() + ": apply " + toDeleteIDs.size() + " deletes");
 +                  }
 +                  for(String id : toDeleteIDs) {
 +                    writer.deleteDocuments(new Term("id", id));
 +                  }
 +                  delCount.addAndGet(toDeleteIDs.size());
 +                  toDeleteIDs.clear();
 +                }
 +                addCount.getAndIncrement();
 +              } catch (Exception exc) {
 +                System.out.println(Thread.currentThread().getName() + ": hit exc");
 +                exc.printStackTrace();
 +                failed.set(true);
 +                throw new RuntimeException(exc);
 +              }
 +            }
 +          }
 +        };
 +      threads[thread].setDaemon(true);
 +      threads[thread].start();
 +    }
 +
 +    if (VERBOSE) {
 +      System.out.println("TEST: DONE start indexing threads [" + (System.currentTimeMillis()-t0) + " ms]");
 +    }
 +
 +    // let index build up a bit
 +    Thread.sleep(100);
 +
 +    IndexReader r = IndexReader.open(writer);
 +    boolean any = false;
 +
 +    // silly starting guess:
 +    final AtomicInteger totTermCount = new AtomicInteger(100);
 +
 +    while(System.currentTimeMillis() < stopTime && !failed.get()) {
 +      if (random.nextBoolean()) {
 +        if (VERBOSE) {
 +          System.out.println("TEST: now reopen r=" + r);
 +        }
 +        final IndexReader r2 = r.reopen();
 +        if (r != r2) {
 +          r.close();
 +          r = r2;
 +        }
 +      } else {
 +        if (VERBOSE) {
 +          System.out.println("TEST: now close reader=" + r);
 +        }
 +        r.close();
 +        writer.commit();
 +        final Set<String> openDeletedFiles = dir.getOpenDeletedFiles();
 +        if (openDeletedFiles.size() > 0) {
 +          System.out.println("OBD files: " + openDeletedFiles);
 +        }
 +        any |= openDeletedFiles.size() > 0;
 +        //assertEquals("open but deleted: " + openDeletedFiles, 0, openDeletedFiles.size());
 +        if (VERBOSE) {
 +          System.out.println("TEST: now open");
 +        }
 +        r = IndexReader.open(writer);
 +      }
 +      if (VERBOSE) {
 +        System.out.println("TEST: got new reader=" + r);
 +      }
 +      //System.out.println("numDocs=" + r.numDocs() + "
 +      //openDelFileCount=" + dir.openDeleteFileCount());
 +
 +      smokeTestReader(r);
 +
 +      final IndexSearcher s = new IndexSearcher(r);
 +
 +      // run search threads
 +      final long searchStopTime = System.currentTimeMillis() + 500;
 +      final Thread[] searchThreads = new Thread[NUM_SEARCH_THREADS];
 +      final AtomicInteger totHits = new AtomicInteger();
 +      for(int thread=0;thread<NUM_SEARCH_THREADS;thread++) {
 +        searchThreads[thread] = new Thread() {
 +          @Override
 +          public void run() {
 +            try {
 +              TermsEnum termsEnum = MultiFields.getTerms(s.getIndexReader(), "body").iterator();
 +              int seenTermCount = 0;
 +              int shift;
 +              int trigger;
 +              if (totTermCount.get() == 0) {
 +                shift = 0;
 +                trigger = 1;
 +              } else {
 +                shift = random.nextInt(totTermCount.get()/10);
 +                trigger = totTermCount.get()/10;
 +              }
 +              while(System.currentTimeMillis() < searchStopTime) {
 +                BytesRef term = termsEnum.next();
 +                if (term == null) {
 +                  totTermCount.set(seenTermCount);
 +                  seenTermCount = 0;
 +                  trigger = totTermCount.get()/10;
 +                  //System.out.println("trigger " + trigger);
 +                  shift = random.nextInt(totTermCount.get()/10);
 +                  termsEnum.seek(new BytesRef(""));
 +                  continue;
 +                }
 +                seenTermCount++;
 +                // search 10 terms
 +                if (trigger == 0) {
 +                  trigger = 1;
 +                }
 +                if ((seenTermCount + shift) % trigger == 0) {
 +                  //if (VERBOSE) {
 +                  //System.out.println(Thread.currentThread().getName() + " now search body:" + term.utf8ToString());
 +                  //}
 +                  totHits.addAndGet(runQuery(s, new TermQuery(new Term("body", term))));
 +                }
 +              }
 +              if (VERBOSE) {
 +                System.out.println(Thread.currentThread().getName() + ": search done");
 +              }
 +            } catch (Throwable t) {
 +              failed.set(true);
 +              t.printStackTrace(System.out);
 +              throw new RuntimeException(t);
 +            }
 +          }
 +          };
 +        searchThreads[thread].setDaemon(true);
 +        searchThreads[thread].start();
 +      }
 +
 +      for(int thread=0;thread<NUM_SEARCH_THREADS;thread++) {
 +        searchThreads[thread].join();
 +      }
 +
 +      if (VERBOSE) {
 +        System.out.println("TEST: DONE search: totHits=" + totHits);
 +      }
 +    }
 +
 +    if (VERBOSE) {
 +      System.out.println("TEST: all searching done [" + (System.currentTimeMillis()-t0) + " ms]");
 +    }
 +
 +    //System.out.println("numDocs=" + r.numDocs() + " openDelFileCount=" + dir.openDeleteFileCount());
 +    r.close();
 +    final Set<String> openDeletedFiles = dir.getOpenDeletedFiles();
 +    if (openDeletedFiles.size() > 0) {
 +      System.out.println("OBD files: " + openDeletedFiles);
 +    }
 +    any |= openDeletedFiles.size() > 0;
 +
 +    assertFalse("saw non-zero open-but-deleted count", any);
 +    if (VERBOSE) {
 +      System.out.println("TEST: now join");
 +    }
 +    for(int thread=0;thread<NUM_INDEX_THREADS;thread++) {
 +      threads[thread].join();
 +    }
 +    if (VERBOSE) {
 +      System.out.println("TEST: done join [" + (System.currentTimeMillis()-t0) + " ms]; addCount=" + addCount + " delCount=" + delCount);
 +    }
 +    writer.commit();
 +    assertEquals(addCount.get() - delCount.get(), writer.numDocs());
 +
 +    writer.close(false);
 +    dir.close();
 +    _TestUtil.rmDir(tempDir);
 +    docs.close();
 +    if (VERBOSE) {
 +      System.out.println("TEST: done [" + (System.currentTimeMillis()-t0) + " ms]");
 +    }
 +  }
 +
 +  private int runQuery(IndexSearcher s, Query q) throws Exception {
 +    s.search(q, 10);
 +    return s.search(q, null, 10, new Sort(new SortField("title", SortField.STRING))).totalHits;
 +  }
 +
 +  private void smokeTestReader(IndexReader r) throws Exception {
 +    IndexSearcher s = new IndexSearcher(r);
 +    runQuery(s, new TermQuery(new Term("body", "united")));
 +    runQuery(s, new TermQuery(new Term("titleTokenized", "states")));
 +    PhraseQuery pq = new PhraseQuery();
 +    pq.add(new Term("body", "united"));
 +    pq.add(new Term("body", "states"));
 +    runQuery(s, pq);
 +    s.close();
 +  }
 +}
 Index: lucene/src/test/org/apache/lucene/util/LineFileDocs.java
 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
 +++ lucene/src/test/org/apache/lucene/util/LineFileDocs.java	Thu Dec 09 13:14:33 2010 -0500
 @@ -0,0 +1,155 @@
 +package org.apache.lucene.util;
 +
 +/**
 + * Licensed to the Apache Software Foundation (ASF) under one or more
 + * contributor license agreements.  See the NOTICE file distributed with
 + * this work for additional information regarding copyright ownership.
 + * The ASF licenses this file to You under the Apache License, Version 2.0
 + * (the "License"); you may not use this file except in compliance with
 + * the License.  You may obtain a copy of the License at
 + *
 + *     http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing, software
 + * distributed under the License is distributed on an "AS IS" BASIS,
 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 + * See the License for the specific language governing permissions and
 + * limitations under the License.
 + */
 +
 +import java.io.Closeable;
 +import java.io.FileNotFoundException;
 +import java.io.IOException;
 +import java.io.BufferedReader;
 +import java.io.InputStreamReader;
 +import java.io.InputStream;
 +import java.io.BufferedInputStream;
 +import java.util.concurrent.atomic.AtomicInteger;
 +import java.util.zip.GZIPInputStream;
 +
 +import org.apache.lucene.document.Document;
 +import org.apache.lucene.document.Field;
 +
 +// Minimal port of contrib/benchmark's LneDocSource +
 +// DocMaker, so tests can enum docs from a line file created
 +// by contrib/benchmark's WriteLineDoc task
 +public class LineFileDocs implements Closeable {
 +
 +  private BufferedReader reader;
 +  private final boolean forever;
 +  private final static int BUFFER_SIZE = 1 << 16;     // 64K
 +  private final AtomicInteger id = new AtomicInteger();
 +  private final String path;
 +
 +  // If forever is true, we rewind the file at EOF (repeat
 +  // the docs over and over)
 +  public LineFileDocs(String path, boolean forever) throws IOException {
 +    this.path = path;
 +    this.forever = forever;
 +    open();
 +  }
 +
 +  public LineFileDocs(boolean forever) throws IOException {
 +    this(LuceneTestCase.TEST_LINE_DOCS_FILE, forever);
 +  }
 +
 +  public synchronized void close() throws IOException {
 +    if (reader != null) {
 +      reader.close();
 +      reader = null;
 +    }
 +  }
 +
 +  private synchronized void open() throws IOException {
 +    System.out.println("PATH=" + path);
 +    InputStream is = getClass().getResourceAsStream(path);
 +    if (is == null) {
 +      throw new FileNotFoundException("cannot find line docs resource \"" + path + "\"");
 +    }
 +    if (path.toString().endsWith(".gz")) {
 +      is = new GZIPInputStream(is);
 +    }
 +    final InputStream in = new BufferedInputStream(is, BUFFER_SIZE);
 +    reader = new BufferedReader(new InputStreamReader(in, "UTF-8"), BUFFER_SIZE);
 +  }
 +
 +  public synchronized void reset() throws IOException {
 +    close();
 +    open();
 +    id.set(0);
 +  }
 +
 +  private final static char SEP = '\t';
 +
 +  private static final class DocState {
 +    final Document doc;
 +    final Field titleTokenized;
 +    final Field title;
 +    final Field body;
 +    final Field id;
 +    final Field date;
 +
 +    public DocState() {
 +      doc = new Document();
 +
 +      title = new Field("title", "", Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS);
 +      doc.add(title);
 +
 +      titleTokenized = new Field("titleTokenized", "", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
 +      doc.add(titleTokenized);
 +
 +      body = new Field("body", "", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
 +      doc.add(body);
 +
 +      id = new Field("id", "", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
 +      doc.add(id);
 +
 +      date = new Field("date", "", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
 +      doc.add(date);
 +    }
 +  }
 +
 +  private final ThreadLocal<DocState> threadDocs = new ThreadLocal<DocState>();
 +
 +  // Document instance is re-used per-thread
 +  public Document nextDoc() throws IOException {
 +    String line;
 +    synchronized(this) {
 +      line = reader.readLine();
 +      if (line == null) {
 +        if (forever) {
 +          if (LuceneTestCase.VERBOSE) {
 +            System.out.println("TEST: LineFileDocs: now rewind file...");
 +          }
 +          close();
 +          open();
 +          line = reader.readLine();
 +        }
 +        return null;
 +      }
 +    }
 +
 +    DocState docState = threadDocs.get();
 +    if (docState == null) {
 +      docState = new DocState();
 +      threadDocs.set(docState);
 +    }
 +
 +    int spot = line.indexOf(SEP);
 +    if (spot == -1) {
 +      throw new RuntimeException("line: [" + line + "] is in an invalid format !");
 +    }
 +    int spot2 = line.indexOf(SEP, 1 + spot);
 +    if (spot2 == -1) {
 +      throw new RuntimeException("line: [" + line + "] is in an invalid format !");
 +    }
 +
 +    docState.body.setValue(line.substring(1+spot2, line.length()));
 +    final String title = line.substring(0, spot);
 +    docState.title.setValue(title);
 +    docState.titleTokenized.setValue(title);
 +    docState.date.setValue(line.substring(1+spot, spot2));
 +    docState.id.setValue(Integer.toString(id.getAndIncrement()));
 +    return docState.doc;
 +  }
 +}
 Index: lucene/src/test/org/apache/lucene/util/LuceneTestCase.java
 --- lucene/src/test/org/apache/lucene/util/LuceneTestCase.java	Thu Dec 09 13:13:27 2010 -0500
 +++ lucene/src/test/org/apache/lucene/util/LuceneTestCase.java	Thu Dec 09 13:14:33 2010 -0500
 @@ -128,19 +128,21 @@
    // each test case (non-J4 tests) and each test class (J4
    // tests)
    /** Gets the codec to run tests with. */
 -  static final String TEST_CODEC = System.getProperty("tests.codec", "randomPerField");
 +  public static final String TEST_CODEC = System.getProperty("tests.codec", "randomPerField");
    /** Gets the locale to run tests with */
 -  static final String TEST_LOCALE = System.getProperty("tests.locale", "random");
 +  public static final String TEST_LOCALE = System.getProperty("tests.locale", "random");
    /** Gets the timezone to run tests with */
 -  static final String TEST_TIMEZONE = System.getProperty("tests.timezone", "random");
 +  public static final String TEST_TIMEZONE = System.getProperty("tests.timezone", "random");
    /** Gets the directory to run tests with */
 -  static final String TEST_DIRECTORY = System.getProperty("tests.directory", "random");
 +  public static final String TEST_DIRECTORY = System.getProperty("tests.directory", "random");
    /** Get the number of times to run tests */
 -  static final int TEST_ITER = Integer.parseInt(System.getProperty("tests.iter", "1"));
 +  public static final int TEST_ITER = Integer.parseInt(System.getProperty("tests.iter", "1"));
    /** Get the random seed for tests */
 -  static final String TEST_SEED = System.getProperty("tests.seed", "random");
 +  public static final String TEST_SEED = System.getProperty("tests.seed", "random");
    /** whether or not nightly tests should run */
 -  static final boolean TEST_NIGHTLY = Boolean.parseBoolean(System.getProperty("tests.nightly", "false"));
 +  public static final boolean TEST_NIGHTLY = Boolean.parseBoolean(System.getProperty("tests.nightly", "false"));
 +  /** the line file used by LineFileDocs */
 +  public static final String TEST_LINE_DOCS_FILE = System.getProperty("tests.linedocsfile", "europarl.lines.txt.gz");

    private static final Pattern codecWithParam = Pattern.compile("(.*)\\(\\s*(\\d+)\\s*\\)");

 Index: lucene/src/test/org/apache/lucene/util/europarl.lines.txt.gz
 Binary file lucene/src/test/org/apache/lucene/util/europarl.lines.txt.gz has changed
 Index: lucene/src/test/org/apache/lucene/util/makeEuroparlLineFile.py
 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
 +++ lucene/src/test/org/apache/lucene/util/makeEuroparlLineFile.py	Thu Dec 09 13:14:33 2010 -0500
 @@ -0,0 +1,137 @@
 +# Licensed to the Apache Software Foundation (ASF) under one or more
 +# contributor license agreements.  See the NOTICE file distributed with
 +# this work for additional information regarding copyright ownership.
 +# The ASF licenses this file to You under the Apache License, Version 2.0
 +# (the "License"); you may not use this file except in compliance with
 +# the License.  You may obtain a copy of the License at
 +#
 +#     http://www.apache.org/licenses/LICENSE-2.0
 +#
 +# Unless required by applicable law or agreed to in writing, software
 +# distributed under the License is distributed on an "AS IS" BASIS,
 +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 +# See the License for the specific language governing permissions and
 +# limitations under the License.
 +
 +import sys
 +import glob
 +import datetime
 +import tarfile
 +import re
 +
 +try:
 +  sys.argv.remove('-verbose')
 +  VERBOSE = True
 +except ValueError:
 +  VERBOSE = False
 +
 +try:
 +  sys.argv.remove('-docPerParagraph')
 +  docPerParagraph = True
 +except ValueError:
 +  docPerParagraph = False
 +
 +reChapterOnly = re.compile('^<CHAPTER ID=.*?>$')
 +reTagOnly = re.compile('^<.*?>$')
 +reNumberOnly = re.compile(r'^\d+\.?$')
 +
 +docCount = 0
 +didEnglish = False
 +
 +def write(date, title, pending, fOut):
 +  global docCount
 +  body = ' '.join(pending).replace('\t', ' ').strip()
 +  if len(body) > 0:
 +    line = '%s\t%s\t%s\n' % (title, date, body)
 +    fOut.write(line)
 +    docCount += 1
 +    del pending[:]
 +    if VERBOSE:
 +      print len(body)
 +
 +def processTar(fileName, fOut):
 +
 +  global didEnglish
 +
 +  t = tarfile.open(fileName, 'r:gz')
 +  for ti in t:
 +    if ti.isfile() and (not didEnglish or ti.name.find('/en/') == -1):
 +
 +      tup = ti.name.split('/')
 +      lang = tup[1]
 +      year = int(tup[2][3:5])
 +      if year < 20:
 +        year += 2000
 +      else:
 +        year += 1900
 +
 +      month = int(tup[2][6:8])
 +      day = int(tup[2][9:11])
 +      date = datetime.date(year=year, month=month, day=day)
 +
 +      if VERBOSE:
 +        print
 +        print '%s: %s' % (ti.name, date)
 +      nextIsTitle = False
 +      title = None
 +      pending = []
 +      for line in t.extractfile(ti).readlines():
 +        line = line.strip()
 +        if reChapterOnly.match(line) is not None:
 +          if title is not None:
 +            write(date, title, pending, fOut)
 +          nextIsTitle = True
 +          continue
 +        if nextIsTitle:
 +          if not reNumberOnly.match(line) and not reTagOnly.match(line):
 +            title = line
 +            nextIsTitle = False
 +            if VERBOSE:
 +              print '  title %s' % line
 +          continue
 +        if line.lower() == '<p>':
 +          if docPerParagraph:
 +            write(date, title, pending, fOut)
 +          else:
 +            pending.append('PARSEP')
 +        elif not reTagOnly.match(line):
 +          pending.append(line)
 +      if title is not None and len(pending) > 0:
 +        write(date, title, pending, fOut)
 +
 +  didEnglish = True
 +
 +# '/x/lucene/data/europarl/all.lines.txt'
 +dirIn = sys.argv[1]
 +fileOut = sys.argv[2]
 +
 +fOut = open(fileOut, 'wb')
 +
 +for fileName in glob.glob('%s/??-??.tgz' % dirIn):
 +  if fileName.endswith('.tgz'):
 +    print 'process %s; %d docs so far...' % (fileName, docCount)
 +    processTar(fileName, fOut)
 +
 +print 'TOTAL: %s' % docCount
 +
 +#run something like this:
 +"""
 +
 +# Europarl V5 makes 76,917 docs, avg 38.6 KB per
 +python -u europarl.py /x/lucene/data/europarl /x/lucene/data/europarl/tmp.lines.txt
 +shuf /x/lucene/data/europarl/tmp.lines.txt > /x/lucene/data/europarl/full.lines.txt
 +rm /x/lucene/data/europarl/tmp.lines.txt
 +
 +# Run again, this time each paragraph is a doc:
 +# Europarl V5 makes 5,607,746 paragraphs (one paragraph per line), avg 620 bytes per:
 +python -u europarl.py /x/lucene/data/europarl /x/lucene/data/europarl/tmp.lines.txt -docPerParagraph
 +shuf /x/lucene/data/europarl/tmp.lines.txt > /x/lucene/data/europarl/para.lines.txt
 +rm /x/lucene/data/europarl/tmp.lines.txt
 +
 +# ~5.5 MB gzip'd:
 +head -200 /x/lucene/data/europarl/full.lines.txt > tmp.txt
 +head -10000 /x/lucene/data/europarl/para.lines.txt >> tmp.txt
 +shuf tmp.txt > europarl.subset.txt
 +rm -f tmp.txt
 +gzip --best europarl.subset.txt
 +"""