blob: 0f77c99de00c9197b3fffffd935762e313b39a79 [file] [log] [blame]
Index: lucene/common-build.xml
--- lucene/common-build.xml Thu Dec 09 13:13:27 2010 -0500
+++ lucene/common-build.xml Thu Dec 09 13:14:33 2010 -0500
@@ -68,6 +68,7 @@
<property name="tests.locale" value="random" />
<property name="tests.timezone" value="random" />
<property name="tests.directory" value="random" />
+ <property name="tests.linedocsfile" value="europarl.lines.txt.gz" />
<property name="tests.iter" value="1" />
<property name="tests.seed" value="random" />
<property name="tests.userdir" value="."/>
@@ -459,6 +460,8 @@
<sysproperty key="tests.timezone" value="${tests.timezone}"/>
<!-- set the directory tests should run with -->
<sysproperty key="tests.directory" value="${tests.directory}"/>
+ <!-- set the line file source for oal.util.LineFileDocs -->
+ <sysproperty key="tests.linedocsfile" value="${tests.linedocsfile}"/>
<!-- set the number of times tests should run -->
<sysproperty key="tests.iter" value="${tests.iter}"/>
<!-- set the test seed -->
Index: lucene/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.py
--- lucene/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.py Thu Dec 09 13:13:27 2010 -0500
+++ lucene/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.py Thu Dec 09 13:14:33 2010 -0500
@@ -1,3 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import types
import os
import sys
Index: lucene/src/test/org/apache/lucene/index/TestNRTThreads.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ lucene/src/test/org/apache/lucene/index/TestNRTThreads.java Thu Dec 09 13:14:33 2010 -0500
@@ -0,0 +1,336 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.codecs.CodecProvider;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.SortField;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.store.MockDirectoryWrapper;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LineFileDocs;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
+import org.junit.Test;
+
+import static org.junit.Assert.*;
+import static org.junit.Assume.*;
+
+// TODO
+// - mix in optimize, addIndexes
+
+public class TestNRTThreads extends LuceneTestCase {
+
+ @Test
+ public void testNRTThreads() throws Exception {
+
+ final long t0 = System.currentTimeMillis();
+
+ if (CodecProvider.getDefault().getDefaultFieldCodec().equals("SimpleText")) {
+ // no
+ CodecProvider.getDefault().setDefaultFieldCodec("Standard");
+ }
+
+ final LineFileDocs docs = new LineFileDocs(true);
+ final File tempDir = _TestUtil.getTempDir("nrtopenfiles");
+ final MockDirectoryWrapper dir = new MockDirectoryWrapper(random, FSDirectory.open(tempDir));
+ final IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer());
+ conf.setMergedSegmentWarmer(new IndexWriter.IndexReaderWarmer() {
+ @Override
+ public void warm(IndexReader reader) throws IOException {
+ if (VERBOSE) {
+ System.out.println("TEST: now warm merged reader=" + reader);
+ }
+ final int maxDoc = reader.maxDoc();
+ final Bits delDocs = reader.getDeletedDocs();
+ int sum = 0;
+ final int inc = Math.max(1, maxDoc/50);
+ for(int docID=0;docID<maxDoc;docID += inc) {
+ if (delDocs == null || !delDocs.get(docID)) {
+ final Document doc = reader.document(docID);
+ sum += doc.getFields().size();
+ }
+ }
+
+ sum += new IndexSearcher(reader).search(new TermQuery(new Term("body", "united")), 10).totalHits;
+
+ if (VERBOSE) {
+ System.out.println("TEST: warm visited " + sum + " fields");
+ }
+ }
+ });
+
+ final IndexWriter writer = new IndexWriter(dir, conf);
+ if (VERBOSE) {
+ writer.setInfoStream(System.out);
+ }
+ MergeScheduler ms = writer.getConfig().getMergeScheduler();
+ if (ms instanceof ConcurrentMergeScheduler) {
+ // try to keep max file open count down
+ ((ConcurrentMergeScheduler) ms).setMaxThreadCount(1);
+ ((ConcurrentMergeScheduler) ms).setMaxMergeCount(1);
+ }
+ LogMergePolicy lmp = (LogMergePolicy) writer.getConfig().getMergePolicy();
+ if (lmp.getMergeFactor() > 5) {
+ lmp.setMergeFactor(5);
+ }
+
+ final int NUM_INDEX_THREADS = 2;
+ final int NUM_SEARCH_THREADS = 3;
+ final int RUN_TIME_SEC = LuceneTestCase.TEST_NIGHTLY ? 300 : 5;
+
+ final AtomicBoolean failed = new AtomicBoolean();
+ final AtomicInteger addCount = new AtomicInteger();
+ final AtomicInteger delCount = new AtomicInteger();
+ final long stopTime = System.currentTimeMillis() + RUN_TIME_SEC*1000;
+ Thread[] threads = new Thread[NUM_INDEX_THREADS];
+ for(int thread=0;thread<NUM_INDEX_THREADS;thread++) {
+ threads[thread] = new Thread() {
+ @Override
+ public void run() {
+ final List<String> toDeleteIDs = new ArrayList<String>();
+ while(System.currentTimeMillis() < stopTime && !failed.get()) {
+ try {
+ Document doc = docs.nextDoc();
+ if (doc == null) {
+ break;
+ }
+ if (random.nextBoolean()) {
+ if (VERBOSE) {
+ //System.out.println(Thread.currentThread().getName() + ": add doc id:" + doc.get("id"));
+ }
+ writer.addDocument(doc);
+ } else {
+ // we use update but it never replaces a
+ // prior doc
+ if (VERBOSE) {
+ //System.out.println(Thread.currentThread().getName() + ": update doc id:" + doc.get("id"));
+ }
+ writer.updateDocument(new Term("id", doc.get("id")), doc);
+ }
+ if (random.nextInt(5) == 3) {
+ if (VERBOSE) {
+ //System.out.println(Thread.currentThread().getName() + ": buffer del id:" + doc.get("id"));
+ }
+ toDeleteIDs.add(doc.get("id"));
+ }
+ if (random.nextInt(50) == 17) {
+ if (VERBOSE) {
+ System.out.println(Thread.currentThread().getName() + ": apply " + toDeleteIDs.size() + " deletes");
+ }
+ for(String id : toDeleteIDs) {
+ writer.deleteDocuments(new Term("id", id));
+ }
+ delCount.addAndGet(toDeleteIDs.size());
+ toDeleteIDs.clear();
+ }
+ addCount.getAndIncrement();
+ } catch (Exception exc) {
+ System.out.println(Thread.currentThread().getName() + ": hit exc");
+ exc.printStackTrace();
+ failed.set(true);
+ throw new RuntimeException(exc);
+ }
+ }
+ }
+ };
+ threads[thread].setDaemon(true);
+ threads[thread].start();
+ }
+
+ if (VERBOSE) {
+ System.out.println("TEST: DONE start indexing threads [" + (System.currentTimeMillis()-t0) + " ms]");
+ }
+
+ // let index build up a bit
+ Thread.sleep(100);
+
+ IndexReader r = IndexReader.open(writer);
+ boolean any = false;
+
+ // silly starting guess:
+ final AtomicInteger totTermCount = new AtomicInteger(100);
+
+ while(System.currentTimeMillis() < stopTime && !failed.get()) {
+ if (random.nextBoolean()) {
+ if (VERBOSE) {
+ System.out.println("TEST: now reopen r=" + r);
+ }
+ final IndexReader r2 = r.reopen();
+ if (r != r2) {
+ r.close();
+ r = r2;
+ }
+ } else {
+ if (VERBOSE) {
+ System.out.println("TEST: now close reader=" + r);
+ }
+ r.close();
+ writer.commit();
+ final Set<String> openDeletedFiles = dir.getOpenDeletedFiles();
+ if (openDeletedFiles.size() > 0) {
+ System.out.println("OBD files: " + openDeletedFiles);
+ }
+ any |= openDeletedFiles.size() > 0;
+ //assertEquals("open but deleted: " + openDeletedFiles, 0, openDeletedFiles.size());
+ if (VERBOSE) {
+ System.out.println("TEST: now open");
+ }
+ r = IndexReader.open(writer);
+ }
+ if (VERBOSE) {
+ System.out.println("TEST: got new reader=" + r);
+ }
+ //System.out.println("numDocs=" + r.numDocs() + "
+ //openDelFileCount=" + dir.openDeleteFileCount());
+
+ smokeTestReader(r);
+
+ final IndexSearcher s = new IndexSearcher(r);
+
+ // run search threads
+ final long searchStopTime = System.currentTimeMillis() + 500;
+ final Thread[] searchThreads = new Thread[NUM_SEARCH_THREADS];
+ final AtomicInteger totHits = new AtomicInteger();
+ for(int thread=0;thread<NUM_SEARCH_THREADS;thread++) {
+ searchThreads[thread] = new Thread() {
+ @Override
+ public void run() {
+   // Search worker: walks the "body" terms of the current reader until
+   // searchStopTime, running a query for roughly every (termCount/10)-th
+   // term and adding the hit counts to totHits. Any failure sets the shared
+   // "failed" flag and is rethrown so the thread dies visibly.
+   try {
+     // NOTE(review): getTerms() is presumably null if no doc has a "body" field yet -- confirm.
+     TermsEnum termsEnum = MultiFields.getTerms(s.getIndexReader(), "body").iterator();
+     int seenTermCount = 0;
+     // Read the shared estimate once (other search threads update it) and
+     // clamp the stride to >= 1 up front: Random.nextInt(0) throws
+     // IllegalArgumentException when fewer than 10 terms were counted.
+     int trigger = Math.max(1, totTermCount.get()/10);
+     int shift = random.nextInt(trigger);
+     while(System.currentTimeMillis() < searchStopTime) {
+       BytesRef term = termsEnum.next();
+       if (term == null) {
+         // Exhausted the enum: publish the real term count, then rewind.
+         totTermCount.set(seenTermCount);
+         trigger = Math.max(1, seenTermCount/10);
+         seenTermCount = 0;
+         //System.out.println("trigger " + trigger);
+         shift = random.nextInt(trigger);
+         termsEnum.seek(new BytesRef(""));
+         continue;
+       }
+       seenTermCount++;
+       // search ~10 terms per pass over the enum
+       if ((seenTermCount + shift) % trigger == 0) {
+         //if (VERBOSE) {
+         //System.out.println(Thread.currentThread().getName() + " now search body:" + term.utf8ToString());
+         //}
+         totHits.addAndGet(runQuery(s, new TermQuery(new Term("body", term))));
+       }
+     }
+     if (VERBOSE) {
+       System.out.println(Thread.currentThread().getName() + ": search done");
+     }
+   } catch (Throwable t) {
+     // Flag the failure so the main loop and the indexing threads stop too.
+     failed.set(true);
+     t.printStackTrace(System.out);
+     throw new RuntimeException(t);
+   }
+ }
+ };
+ searchThreads[thread].setDaemon(true);
+ searchThreads[thread].start();
+ }
+
+ for(int thread=0;thread<NUM_SEARCH_THREADS;thread++) {
+ searchThreads[thread].join();
+ }
+
+ if (VERBOSE) {
+ System.out.println("TEST: DONE search: totHits=" + totHits);
+ }
+ }
+
+ if (VERBOSE) {
+ System.out.println("TEST: all searching done [" + (System.currentTimeMillis()-t0) + " ms]");
+ }
+
+ //System.out.println("numDocs=" + r.numDocs() + " openDelFileCount=" + dir.openDeleteFileCount());
+ r.close();
+ final Set<String> openDeletedFiles = dir.getOpenDeletedFiles();
+ if (openDeletedFiles.size() > 0) {
+ System.out.println("OBD files: " + openDeletedFiles);
+ }
+ any |= openDeletedFiles.size() > 0;
+
+ assertFalse("saw non-zero open-but-deleted count", any);
+ if (VERBOSE) {
+ System.out.println("TEST: now join");
+ }
+ for(int thread=0;thread<NUM_INDEX_THREADS;thread++) {
+ threads[thread].join();
+ }
+ if (VERBOSE) {
+ System.out.println("TEST: done join [" + (System.currentTimeMillis()-t0) + " ms]; addCount=" + addCount + " delCount=" + delCount);
+ }
+ writer.commit();
+ assertEquals(addCount.get() - delCount.get(), writer.numDocs());
+
+ writer.close(false);
+ dir.close();
+ _TestUtil.rmDir(tempDir);
+ docs.close();
+ if (VERBOSE) {
+ System.out.println("TEST: done [" + (System.currentTimeMillis()-t0) + " ms]");
+ }
+ }
+
+ private int runQuery(IndexSearcher s, Query q) throws Exception {
+ s.search(q, 10);
+ return s.search(q, null, 10, new Sort(new SortField("title", SortField.STRING))).totalHits;
+ }
+
+ private void smokeTestReader(IndexReader r) throws Exception {
+ IndexSearcher s = new IndexSearcher(r);
+ runQuery(s, new TermQuery(new Term("body", "united")));
+ runQuery(s, new TermQuery(new Term("titleTokenized", "states")));
+ PhraseQuery pq = new PhraseQuery();
+ pq.add(new Term("body", "united"));
+ pq.add(new Term("body", "states"));
+ runQuery(s, pq);
+ s.close();
+ }
+}
Index: lucene/src/test/org/apache/lucene/util/LineFileDocs.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ lucene/src/test/org/apache/lucene/util/LineFileDocs.java Thu Dec 09 13:14:33 2010 -0500
@@ -0,0 +1,155 @@
+package org.apache.lucene.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Closeable;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.io.InputStream;
+import java.io.BufferedInputStream;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.zip.GZIPInputStream;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+
+// Minimal port of contrib/benchmark's LineDocSource +
+// DocMaker, so tests can enum docs from a line file created
+// by contrib/benchmark's WriteLineDoc task
+public class LineFileDocs implements Closeable {
+
+ private BufferedReader reader;
+ private final boolean forever;
+ private final static int BUFFER_SIZE = 1 << 16; // 64K
+ private final AtomicInteger id = new AtomicInteger();
+ private final String path;
+
+ // If forever is true, we rewind the file at EOF (repeat
+ // the docs over and over)
+ public LineFileDocs(String path, boolean forever) throws IOException {
+ this.path = path;
+ this.forever = forever;
+ open();
+ }
+
+ public LineFileDocs(boolean forever) throws IOException {
+ this(LuceneTestCase.TEST_LINE_DOCS_FILE, forever);
+ }
+
+ public synchronized void close() throws IOException {
+ if (reader != null) {
+ reader.close();
+ reader = null;
+ }
+ }
+
+ // Opens the resource named by path (gunzipping names ending in ".gz") as a UTF-8 reader.
+ private synchronized void open() throws IOException {
+   InputStream is = getClass().getResourceAsStream(path);
+   if (is == null) {
+     throw new FileNotFoundException("cannot find line docs resource \"" + path + "\"");
+   }
+   if (path.endsWith(".gz")) {
+     is = new GZIPInputStream(is);
+   }
+   final InputStream in = new BufferedInputStream(is, BUFFER_SIZE);
+   reader = new BufferedReader(new InputStreamReader(in, "UTF-8"), BUFFER_SIZE);
+ }
+
+ public synchronized void reset() throws IOException {
+ close();
+ open();
+ id.set(0);
+ }
+
+ private final static char SEP = '\t';
+
+ private static final class DocState {
+ final Document doc;
+ final Field titleTokenized;
+ final Field title;
+ final Field body;
+ final Field id;
+ final Field date;
+
+ public DocState() {
+ doc = new Document();
+
+ title = new Field("title", "", Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS);
+ doc.add(title);
+
+ titleTokenized = new Field("titleTokenized", "", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+ doc.add(titleTokenized);
+
+ body = new Field("body", "", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
+ doc.add(body);
+
+ id = new Field("id", "", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
+ doc.add(id);
+
+ date = new Field("date", "", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
+ doc.add(date);
+ }
+ }
+
+ private final ThreadLocal<DocState> threadDocs = new ThreadLocal<DocState>();
+
+ // Returns the next line parsed into a Document ("title<TAB>date<TAB>body"), or
+ // null at EOF when forever is false. The Document instance is re-used per-thread.
+ public Document nextDoc() throws IOException {
+   String line;
+   synchronized(this) {
+     line = reader.readLine();
+     if (line == null && forever) {
+       if (LuceneTestCase.VERBOSE) {
+         System.out.println("TEST: LineFileDocs: now rewind file...");
+       }
+       // Rewind and hand out the first line again; ids keep counting up.
+       close();
+       open();
+       line = reader.readLine();
+     }
+     if (line == null) {
+       // EOF without forever, or the file is empty even after the rewind.
+       return null;
+     }
+   }
+
+   DocState docState = threadDocs.get();
+   if (docState == null) {
+     docState = new DocState();
+     threadDocs.set(docState);
+   }
+
+   final int spot = line.indexOf(SEP);
+   final int spot2 = spot == -1 ? -1 : line.indexOf(SEP, 1 + spot);
+   if (spot2 == -1) {
+     throw new RuntimeException("line: [" + line + "] is in an invalid format !");
+   }
+
+   docState.body.setValue(line.substring(1+spot2, line.length()));
+   final String title = line.substring(0, spot);
+   docState.title.setValue(title);
+   docState.titleTokenized.setValue(title);
+   docState.date.setValue(line.substring(1+spot, spot2));
+   docState.id.setValue(Integer.toString(id.getAndIncrement()));
+   return docState.doc;
+ }
+}
Index: lucene/src/test/org/apache/lucene/util/LuceneTestCase.java
--- lucene/src/test/org/apache/lucene/util/LuceneTestCase.java Thu Dec 09 13:13:27 2010 -0500
+++ lucene/src/test/org/apache/lucene/util/LuceneTestCase.java Thu Dec 09 13:14:33 2010 -0500
@@ -128,19 +128,21 @@
// each test case (non-J4 tests) and each test class (J4
// tests)
/** Gets the codec to run tests with. */
- static final String TEST_CODEC = System.getProperty("tests.codec", "randomPerField");
+ public static final String TEST_CODEC = System.getProperty("tests.codec", "randomPerField");
/** Gets the locale to run tests with */
- static final String TEST_LOCALE = System.getProperty("tests.locale", "random");
+ public static final String TEST_LOCALE = System.getProperty("tests.locale", "random");
/** Gets the timezone to run tests with */
- static final String TEST_TIMEZONE = System.getProperty("tests.timezone", "random");
+ public static final String TEST_TIMEZONE = System.getProperty("tests.timezone", "random");
/** Gets the directory to run tests with */
- static final String TEST_DIRECTORY = System.getProperty("tests.directory", "random");
+ public static final String TEST_DIRECTORY = System.getProperty("tests.directory", "random");
/** Get the number of times to run tests */
- static final int TEST_ITER = Integer.parseInt(System.getProperty("tests.iter", "1"));
+ public static final int TEST_ITER = Integer.parseInt(System.getProperty("tests.iter", "1"));
/** Get the random seed for tests */
- static final String TEST_SEED = System.getProperty("tests.seed", "random");
+ public static final String TEST_SEED = System.getProperty("tests.seed", "random");
/** whether or not nightly tests should run */
- static final boolean TEST_NIGHTLY = Boolean.parseBoolean(System.getProperty("tests.nightly", "false"));
+ public static final boolean TEST_NIGHTLY = Boolean.parseBoolean(System.getProperty("tests.nightly", "false"));
+ /** the line file used by LineFileDocs */
+ public static final String TEST_LINE_DOCS_FILE = System.getProperty("tests.linedocsfile", "europarl.lines.txt.gz");
private static final Pattern codecWithParam = Pattern.compile("(.*)\\(\\s*(\\d+)\\s*\\)");
Index: lucene/src/test/org/apache/lucene/util/europarl.lines.txt.gz
Binary file lucene/src/test/org/apache/lucene/util/europarl.lines.txt.gz has changed
Index: lucene/src/test/org/apache/lucene/util/makeEuroparlLineFile.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ lucene/src/test/org/apache/lucene/util/makeEuroparlLineFile.py Thu Dec 09 13:14:33 2010 -0500
@@ -0,0 +1,137 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import glob
+import datetime
+import tarfile
+import re
+
+try:
+ sys.argv.remove('-verbose')
+ VERBOSE = True
+except ValueError:
+ VERBOSE = False
+
+try:
+ sys.argv.remove('-docPerParagraph')
+ docPerParagraph = True
+except ValueError:
+ docPerParagraph = False
+
+reChapterOnly = re.compile('^<CHAPTER ID=.*?>$')
+reTagOnly = re.compile('^<.*?>$')
+reNumberOnly = re.compile(r'^\d+\.?$')
+
+docCount = 0
+didEnglish = False
+
+def write(date, title, pending, fOut):
+ global docCount
+ body = ' '.join(pending).replace('\t', ' ').strip()
+ if len(body) > 0:
+ line = '%s\t%s\t%s\n' % (title, date, body)
+ fOut.write(line)
+ docCount += 1
+ del pending[:]
+ if VERBOSE:
+ print len(body)
+
+def processTar(fileName, fOut):
+
+ global didEnglish
+
+ t = tarfile.open(fileName, 'r:gz')
+ for ti in t:
+ if ti.isfile() and (not didEnglish or ti.name.find('/en/') == -1):
+
+ tup = ti.name.split('/')
+ lang = tup[1]
+ year = int(tup[2][3:5])
+ if year < 20:
+ year += 2000
+ else:
+ year += 1900
+
+ month = int(tup[2][6:8])
+ day = int(tup[2][9:11])
+ date = datetime.date(year=year, month=month, day=day)
+
+ if VERBOSE:
+ print
+ print '%s: %s' % (ti.name, date)
+ nextIsTitle = False
+ title = None
+ pending = []
+ for line in t.extractfile(ti).readlines():
+ line = line.strip()
+ if reChapterOnly.match(line) is not None:
+ if title is not None:
+ write(date, title, pending, fOut)
+ nextIsTitle = True
+ continue
+ if nextIsTitle:
+ if not reNumberOnly.match(line) and not reTagOnly.match(line):
+ title = line
+ nextIsTitle = False
+ if VERBOSE:
+ print ' title %s' % line
+ continue
+ if line.lower() == '<p>':
+ if docPerParagraph:
+ write(date, title, pending, fOut)
+ else:
+ pending.append('PARSEP')
+ elif not reTagOnly.match(line):
+ pending.append(line)
+ if title is not None and len(pending) > 0:
+ write(date, title, pending, fOut)
+
+ didEnglish = True
+
+# '/x/lucene/data/europarl/all.lines.txt'
+dirIn = sys.argv[1]
+fileOut = sys.argv[2]
+
+fOut = open(fileOut, 'wb')
+
+for fileName in glob.glob('%s/??-??.tgz' % dirIn):
+ if fileName.endswith('.tgz'):
+ print 'process %s; %d docs so far...' % (fileName, docCount)
+ processTar(fileName, fOut)
+
+print 'TOTAL: %s' % docCount
+
+#run something like this:
+"""
+
+# Europarl V5 makes 76,917 docs, avg 38.6 KB per
+python -u makeEuroparlLineFile.py /x/lucene/data/europarl /x/lucene/data/europarl/tmp.lines.txt
+shuf /x/lucene/data/europarl/tmp.lines.txt > /x/lucene/data/europarl/full.lines.txt
+rm /x/lucene/data/europarl/tmp.lines.txt
+
+# Run again, this time each paragraph is a doc:
+# Europarl V5 makes 5,607,746 paragraphs (one paragraph per line), avg 620 bytes per:
+python -u makeEuroparlLineFile.py /x/lucene/data/europarl /x/lucene/data/europarl/tmp.lines.txt -docPerParagraph
+shuf /x/lucene/data/europarl/tmp.lines.txt > /x/lucene/data/europarl/para.lines.txt
+rm /x/lucene/data/europarl/tmp.lines.txt
+
+# ~5.5 MB gzip'd:
+head -200 /x/lucene/data/europarl/full.lines.txt > tmp.txt
+head -10000 /x/lucene/data/europarl/para.lines.txt >> tmp.txt
+shuf tmp.txt > europarl.subset.txt
+rm -f tmp.txt
+gzip --best europarl.subset.txt
+"""