| Index: lucene/common-build.xml |
| --- lucene/common-build.xml Thu Dec 09 13:13:27 2010 -0500 |
| +++ lucene/common-build.xml Thu Dec 09 13:14:33 2010 -0500 |
| @@ -68,6 +68,7 @@ |
| <property name="tests.locale" value="random" /> |
| <property name="tests.timezone" value="random" /> |
| <property name="tests.directory" value="random" /> |
| + <property name="tests.linedocsfile" value="europarl.lines.txt.gz" /> |
| <property name="tests.iter" value="1" /> |
| <property name="tests.seed" value="random" /> |
| <property name="tests.userdir" value="."/> |
| @@ -459,6 +460,8 @@ |
| <sysproperty key="tests.timezone" value="${tests.timezone}"/> |
| <!-- set the directory tests should run with --> |
| <sysproperty key="tests.directory" value="${tests.directory}"/> |
| + <!-- set the line file source for oal.util.LineFileDocs --> |
| + <sysproperty key="tests.linedocsfile" value="${tests.linedocsfile}"/> |
| <!-- set the number of times tests should run --> |
| <sysproperty key="tests.iter" value="${tests.iter}"/> |
| <!-- set the test seed --> |
| Index: lucene/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.py |
| --- lucene/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.py Thu Dec 09 13:13:27 2010 -0500 |
| +++ lucene/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.py Thu Dec 09 13:14:33 2010 -0500 |
| @@ -1,3 +1,18 @@ |
| +# Licensed to the Apache Software Foundation (ASF) under one or more |
| +# contributor license agreements. See the NOTICE file distributed with |
| +# this work for additional information regarding copyright ownership. |
| +# The ASF licenses this file to You under the Apache License, Version 2.0 |
| +# (the "License"); you may not use this file except in compliance with |
| +# the License. You may obtain a copy of the License at |
| +# |
| +# http://www.apache.org/licenses/LICENSE-2.0 |
| +# |
| +# Unless required by applicable law or agreed to in writing, software |
| +# distributed under the License is distributed on an "AS IS" BASIS, |
| +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| +# See the License for the specific language governing permissions and |
| +# limitations under the License. |
| + |
| import types |
| import os |
| import sys |
| Index: lucene/src/test/org/apache/lucene/index/TestNRTThreads.java |
| --- /dev/null Thu Jan 01 00:00:00 1970 +0000 |
| +++ lucene/src/test/org/apache/lucene/index/TestNRTThreads.java Thu Dec 09 13:14:33 2010 -0500 |
| @@ -0,0 +1,336 @@ |
| +package org.apache.lucene.index; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.io.File; |
| +import java.io.IOException; |
| +import java.util.ArrayList; |
| +import java.util.List; |
| +import java.util.Set; |
| +import java.util.concurrent.atomic.AtomicBoolean; |
| +import java.util.concurrent.atomic.AtomicInteger; |
| + |
| +import org.apache.lucene.analysis.MockAnalyzer; |
| +import org.apache.lucene.document.Document; |
| +import org.apache.lucene.index.codecs.CodecProvider; |
| +import org.apache.lucene.search.IndexSearcher; |
| +import org.apache.lucene.search.PhraseQuery; |
| +import org.apache.lucene.search.Query; |
| +import org.apache.lucene.search.Sort; |
| +import org.apache.lucene.search.SortField; |
| +import org.apache.lucene.search.TermQuery; |
| +import org.apache.lucene.store.FSDirectory; |
| +import org.apache.lucene.store.MockDirectoryWrapper; |
| +import org.apache.lucene.util.Bits; |
| +import org.apache.lucene.util.BytesRef; |
| +import org.apache.lucene.util.LineFileDocs; |
| +import org.apache.lucene.util.LuceneTestCase; |
| +import org.apache.lucene.util._TestUtil; |
| +import org.junit.Test; |
| + |
| +import static org.junit.Assert.*; |
| +import static org.junit.Assume.*; |
| + |
| +// TODO |
| +// - mix in optimize, addIndexes |
| + |
| +public class TestNRTThreads extends LuceneTestCase { |
| + |
| + @Test |
| + public void testNRTThreads() throws Exception { |
| + |
| + final long t0 = System.currentTimeMillis(); |
| + |
| + if (CodecProvider.getDefault().getDefaultFieldCodec().equals("SimpleText")) { |
| + // no |
| + CodecProvider.getDefault().setDefaultFieldCodec("Standard"); |
| + } |
| + |
| + final LineFileDocs docs = new LineFileDocs(true); |
| + final File tempDir = _TestUtil.getTempDir("nrtopenfiles"); |
| + final MockDirectoryWrapper dir = new MockDirectoryWrapper(random, FSDirectory.open(tempDir)); |
| + final IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()); |
| + conf.setMergedSegmentWarmer(new IndexWriter.IndexReaderWarmer() { |
| + @Override |
| + public void warm(IndexReader reader) throws IOException { |
| + if (VERBOSE) { |
| + System.out.println("TEST: now warm merged reader=" + reader); |
| + } |
| + final int maxDoc = reader.maxDoc(); |
| + final Bits delDocs = reader.getDeletedDocs(); |
| + int sum = 0; |
| + final int inc = Math.max(1, maxDoc/50); |
| + for(int docID=0;docID<maxDoc;docID += inc) { |
| + if (delDocs == null || !delDocs.get(docID)) { |
| + final Document doc = reader.document(docID); |
| + sum += doc.getFields().size(); |
| + } |
| + } |
| + |
| + sum += new IndexSearcher(reader).search(new TermQuery(new Term("body", "united")), 10).totalHits; |
| + |
| + if (VERBOSE) { |
| + System.out.println("TEST: warm visited " + sum + " fields"); |
| + } |
| + } |
| + }); |
| + |
| + final IndexWriter writer = new IndexWriter(dir, conf); |
| + if (VERBOSE) { |
| + writer.setInfoStream(System.out); |
| + } |
| + MergeScheduler ms = writer.getConfig().getMergeScheduler(); |
| + if (ms instanceof ConcurrentMergeScheduler) { |
| + // try to keep max file open count down |
| + ((ConcurrentMergeScheduler) ms).setMaxThreadCount(1); |
| + ((ConcurrentMergeScheduler) ms).setMaxMergeCount(1); |
| + } |
| + LogMergePolicy lmp = (LogMergePolicy) writer.getConfig().getMergePolicy(); |
| + if (lmp.getMergeFactor() > 5) { |
| + lmp.setMergeFactor(5); |
| + } |
| + |
| + final int NUM_INDEX_THREADS = 2; |
| + final int NUM_SEARCH_THREADS = 3; |
| + final int RUN_TIME_SEC = LuceneTestCase.TEST_NIGHTLY ? 300 : 5; |
| + |
| + final AtomicBoolean failed = new AtomicBoolean(); |
| + final AtomicInteger addCount = new AtomicInteger(); |
| + final AtomicInteger delCount = new AtomicInteger(); |
| + final long stopTime = System.currentTimeMillis() + RUN_TIME_SEC*1000; |
| + Thread[] threads = new Thread[NUM_INDEX_THREADS]; |
| + for(int thread=0;thread<NUM_INDEX_THREADS;thread++) { |
| + threads[thread] = new Thread() { |
| + @Override |
| + public void run() { |
| + final List<String> toDeleteIDs = new ArrayList<String>(); |
| + while(System.currentTimeMillis() < stopTime && !failed.get()) { |
| + try { |
| + Document doc = docs.nextDoc(); |
| + if (doc == null) { |
| + break; |
| + } |
| + if (random.nextBoolean()) { |
| + if (VERBOSE) { |
| + //System.out.println(Thread.currentThread().getName() + ": add doc id:" + doc.get("id")); |
| + } |
| + writer.addDocument(doc); |
| + } else { |
| + // we use update but it never replaces a |
| + // prior doc |
| + if (VERBOSE) { |
| + //System.out.println(Thread.currentThread().getName() + ": update doc id:" + doc.get("id")); |
| + } |
| + writer.updateDocument(new Term("id", doc.get("id")), doc); |
| + } |
| + if (random.nextInt(5) == 3) { |
| + if (VERBOSE) { |
| + //System.out.println(Thread.currentThread().getName() + ": buffer del id:" + doc.get("id")); |
| + } |
| + toDeleteIDs.add(doc.get("id")); |
| + } |
| + if (random.nextInt(50) == 17) { |
| + if (VERBOSE) { |
| + System.out.println(Thread.currentThread().getName() + ": apply " + toDeleteIDs.size() + " deletes"); |
| + } |
| + for(String id : toDeleteIDs) { |
| + writer.deleteDocuments(new Term("id", id)); |
| + } |
| + delCount.addAndGet(toDeleteIDs.size()); |
| + toDeleteIDs.clear(); |
| + } |
| + addCount.getAndIncrement(); |
| + } catch (Exception exc) { |
| + System.out.println(Thread.currentThread().getName() + ": hit exc"); |
| + exc.printStackTrace(); |
| + failed.set(true); |
| + throw new RuntimeException(exc); |
| + } |
| + } |
| + } |
| + }; |
| + threads[thread].setDaemon(true); |
| + threads[thread].start(); |
| + } |
| + |
| + if (VERBOSE) { |
| + System.out.println("TEST: DONE start indexing threads [" + (System.currentTimeMillis()-t0) + " ms]"); |
| + } |
| + |
| + // let index build up a bit |
| + Thread.sleep(100); |
| + |
| + IndexReader r = IndexReader.open(writer); |
| + boolean any = false; |
| + |
| + // silly starting guess: |
| + final AtomicInteger totTermCount = new AtomicInteger(100); |
| + |
| + while(System.currentTimeMillis() < stopTime && !failed.get()) { |
| + if (random.nextBoolean()) { |
| + if (VERBOSE) { |
| + System.out.println("TEST: now reopen r=" + r); |
| + } |
| + final IndexReader r2 = r.reopen(); |
| + if (r != r2) { |
| + r.close(); |
| + r = r2; |
| + } |
| + } else { |
| + if (VERBOSE) { |
| + System.out.println("TEST: now close reader=" + r); |
| + } |
| + r.close(); |
| + writer.commit(); |
| + final Set<String> openDeletedFiles = dir.getOpenDeletedFiles(); |
| + if (openDeletedFiles.size() > 0) { |
| + System.out.println("OBD files: " + openDeletedFiles); |
| + } |
| + any |= openDeletedFiles.size() > 0; |
| + //assertEquals("open but deleted: " + openDeletedFiles, 0, openDeletedFiles.size()); |
| + if (VERBOSE) { |
| + System.out.println("TEST: now open"); |
| + } |
| + r = IndexReader.open(writer); |
| + } |
| + if (VERBOSE) { |
| + System.out.println("TEST: got new reader=" + r); |
| + } |
| + //System.out.println("numDocs=" + r.numDocs() + " |
| + //openDelFileCount=" + dir.openDeleteFileCount()); |
| + |
| + smokeTestReader(r); |
| + |
| + final IndexSearcher s = new IndexSearcher(r); |
| + |
| + // run search threads |
| + final long searchStopTime = System.currentTimeMillis() + 500; |
| + final Thread[] searchThreads = new Thread[NUM_SEARCH_THREADS]; |
| + final AtomicInteger totHits = new AtomicInteger(); |
| + for(int thread=0;thread<NUM_SEARCH_THREADS;thread++) { |
| + searchThreads[thread] = new Thread() { |
| + @Override |
| + public void run() { |
| + try { |
| + TermsEnum termsEnum = MultiFields.getTerms(s.getIndexReader(), "body").iterator(); |
| + int seenTermCount = 0; |
| + int shift; |
| + int trigger; |
| + if (totTermCount.get() == 0) { |
| + shift = 0; |
| + trigger = 1; |
| + } else { |
| + shift = random.nextInt(totTermCount.get()/10); |
| + trigger = totTermCount.get()/10; |
| + } |
| + while(System.currentTimeMillis() < searchStopTime) { |
| + BytesRef term = termsEnum.next(); |
| + if (term == null) { |
| + totTermCount.set(seenTermCount); |
| + seenTermCount = 0; |
| + trigger = totTermCount.get()/10; |
| + //System.out.println("trigger " + trigger); |
| + shift = random.nextInt(totTermCount.get()/10); |
| + termsEnum.seek(new BytesRef("")); |
| + continue; |
| + } |
| + seenTermCount++; |
| + // search 10 terms |
| + if (trigger == 0) { |
| + trigger = 1; |
| + } |
| + if ((seenTermCount + shift) % trigger == 0) { |
| + //if (VERBOSE) { |
| + //System.out.println(Thread.currentThread().getName() + " now search body:" + term.utf8ToString()); |
| + //} |
| + totHits.addAndGet(runQuery(s, new TermQuery(new Term("body", term)))); |
| + } |
| + } |
| + if (VERBOSE) { |
| + System.out.println(Thread.currentThread().getName() + ": search done"); |
| + } |
| + } catch (Throwable t) { |
| + failed.set(true); |
| + t.printStackTrace(System.out); |
| + throw new RuntimeException(t); |
| + } |
| + } |
| + }; |
| + searchThreads[thread].setDaemon(true); |
| + searchThreads[thread].start(); |
| + } |
| + |
| + for(int thread=0;thread<NUM_SEARCH_THREADS;thread++) { |
| + searchThreads[thread].join(); |
| + } |
| + |
| + if (VERBOSE) { |
| + System.out.println("TEST: DONE search: totHits=" + totHits); |
| + } |
| + } |
| + |
| + if (VERBOSE) { |
| + System.out.println("TEST: all searching done [" + (System.currentTimeMillis()-t0) + " ms]"); |
| + } |
| + |
| + //System.out.println("numDocs=" + r.numDocs() + " openDelFileCount=" + dir.openDeleteFileCount()); |
| + r.close(); |
| + final Set<String> openDeletedFiles = dir.getOpenDeletedFiles(); |
| + if (openDeletedFiles.size() > 0) { |
| + System.out.println("OBD files: " + openDeletedFiles); |
| + } |
| + any |= openDeletedFiles.size() > 0; |
| + |
| + assertFalse("saw non-zero open-but-deleted count", any); |
| + if (VERBOSE) { |
| + System.out.println("TEST: now join"); |
| + } |
| + for(int thread=0;thread<NUM_INDEX_THREADS;thread++) { |
| + threads[thread].join(); |
| + } |
| + if (VERBOSE) { |
| + System.out.println("TEST: done join [" + (System.currentTimeMillis()-t0) + " ms]; addCount=" + addCount + " delCount=" + delCount); |
| + } |
| + writer.commit(); |
| + assertEquals(addCount.get() - delCount.get(), writer.numDocs()); |
| + |
| + writer.close(false); |
| + dir.close(); |
| + _TestUtil.rmDir(tempDir); |
| + docs.close(); |
| + if (VERBOSE) { |
| + System.out.println("TEST: done [" + (System.currentTimeMillis()-t0) + " ms]"); |
| + } |
| + } |
| + |
| + private int runQuery(IndexSearcher s, Query q) throws Exception { |
| + s.search(q, 10); |
| + return s.search(q, null, 10, new Sort(new SortField("title", SortField.STRING))).totalHits; |
| + } |
| + |
| + private void smokeTestReader(IndexReader r) throws Exception { |
| + IndexSearcher s = new IndexSearcher(r); |
| + runQuery(s, new TermQuery(new Term("body", "united"))); |
| + runQuery(s, new TermQuery(new Term("titleTokenized", "states"))); |
| + PhraseQuery pq = new PhraseQuery(); |
| + pq.add(new Term("body", "united")); |
| + pq.add(new Term("body", "states")); |
| + runQuery(s, pq); |
| + s.close(); |
| + } |
| +} |
| Index: lucene/src/test/org/apache/lucene/util/LineFileDocs.java |
| --- /dev/null Thu Jan 01 00:00:00 1970 +0000 |
| +++ lucene/src/test/org/apache/lucene/util/LineFileDocs.java Thu Dec 09 13:14:33 2010 -0500 |
| @@ -0,0 +1,155 @@ |
| +package org.apache.lucene.util; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.io.Closeable; |
| +import java.io.FileNotFoundException; |
| +import java.io.IOException; |
| +import java.io.BufferedReader; |
| +import java.io.InputStreamReader; |
| +import java.io.InputStream; |
| +import java.io.BufferedInputStream; |
| +import java.util.concurrent.atomic.AtomicInteger; |
| +import java.util.zip.GZIPInputStream; |
| + |
| +import org.apache.lucene.document.Document; |
| +import org.apache.lucene.document.Field; |
| + |
| +// Minimal port of contrib/benchmark's LneDocSource + |
| +// DocMaker, so tests can enum docs from a line file created |
| +// by contrib/benchmark's WriteLineDoc task |
| +public class LineFileDocs implements Closeable { |
| + |
| + private BufferedReader reader; |
| + private final boolean forever; |
| + private final static int BUFFER_SIZE = 1 << 16; // 64K |
| + private final AtomicInteger id = new AtomicInteger(); |
| + private final String path; |
| + |
| + // If forever is true, we rewind the file at EOF (repeat |
| + // the docs over and over) |
| + public LineFileDocs(String path, boolean forever) throws IOException { |
| + this.path = path; |
| + this.forever = forever; |
| + open(); |
| + } |
| + |
| + public LineFileDocs(boolean forever) throws IOException { |
| + this(LuceneTestCase.TEST_LINE_DOCS_FILE, forever); |
| + } |
| + |
| + public synchronized void close() throws IOException { |
| + if (reader != null) { |
| + reader.close(); |
| + reader = null; |
| + } |
| + } |
| + |
| + private synchronized void open() throws IOException { |
| + System.out.println("PATH=" + path); |
| + InputStream is = getClass().getResourceAsStream(path); |
| + if (is == null) { |
| + throw new FileNotFoundException("cannot find line docs resource \"" + path + "\""); |
| + } |
| + if (path.toString().endsWith(".gz")) { |
| + is = new GZIPInputStream(is); |
| + } |
| + final InputStream in = new BufferedInputStream(is, BUFFER_SIZE); |
| + reader = new BufferedReader(new InputStreamReader(in, "UTF-8"), BUFFER_SIZE); |
| + } |
| + |
| + public synchronized void reset() throws IOException { |
| + close(); |
| + open(); |
| + id.set(0); |
| + } |
| + |
| + private final static char SEP = '\t'; |
| + |
| + private static final class DocState { |
| + final Document doc; |
| + final Field titleTokenized; |
| + final Field title; |
| + final Field body; |
| + final Field id; |
| + final Field date; |
| + |
| + public DocState() { |
| + doc = new Document(); |
| + |
| + title = new Field("title", "", Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS); |
| + doc.add(title); |
| + |
| + titleTokenized = new Field("titleTokenized", "", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS); |
| + doc.add(titleTokenized); |
| + |
| + body = new Field("body", "", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS); |
| + doc.add(body); |
| + |
| + id = new Field("id", "", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS); |
| + doc.add(id); |
| + |
| + date = new Field("date", "", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS); |
| + doc.add(date); |
| + } |
| + } |
| + |
| + private final ThreadLocal<DocState> threadDocs = new ThreadLocal<DocState>(); |
| + |
| + // Document instance is re-used per-thread |
| + public Document nextDoc() throws IOException { |
| + String line; |
| + synchronized(this) { |
| + line = reader.readLine(); |
| + if (line == null) { |
| + if (forever) { |
| + if (LuceneTestCase.VERBOSE) { |
| + System.out.println("TEST: LineFileDocs: now rewind file..."); |
| + } |
| + close(); |
| + open(); |
| + line = reader.readLine(); |
| + } |
| + return null; |
| + } |
| + } |
| + |
| + DocState docState = threadDocs.get(); |
| + if (docState == null) { |
| + docState = new DocState(); |
| + threadDocs.set(docState); |
| + } |
| + |
| + int spot = line.indexOf(SEP); |
| + if (spot == -1) { |
| + throw new RuntimeException("line: [" + line + "] is in an invalid format !"); |
| + } |
| + int spot2 = line.indexOf(SEP, 1 + spot); |
| + if (spot2 == -1) { |
| + throw new RuntimeException("line: [" + line + "] is in an invalid format !"); |
| + } |
| + |
| + docState.body.setValue(line.substring(1+spot2, line.length())); |
| + final String title = line.substring(0, spot); |
| + docState.title.setValue(title); |
| + docState.titleTokenized.setValue(title); |
| + docState.date.setValue(line.substring(1+spot, spot2)); |
| + docState.id.setValue(Integer.toString(id.getAndIncrement())); |
| + return docState.doc; |
| + } |
| +} |
| Index: lucene/src/test/org/apache/lucene/util/LuceneTestCase.java |
| --- lucene/src/test/org/apache/lucene/util/LuceneTestCase.java Thu Dec 09 13:13:27 2010 -0500 |
| +++ lucene/src/test/org/apache/lucene/util/LuceneTestCase.java Thu Dec 09 13:14:33 2010 -0500 |
| @@ -128,19 +128,21 @@ |
| // each test case (non-J4 tests) and each test class (J4 |
| // tests) |
| /** Gets the codec to run tests with. */ |
| - static final String TEST_CODEC = System.getProperty("tests.codec", "randomPerField"); |
| + public static final String TEST_CODEC = System.getProperty("tests.codec", "randomPerField"); |
| /** Gets the locale to run tests with */ |
| - static final String TEST_LOCALE = System.getProperty("tests.locale", "random"); |
| + public static final String TEST_LOCALE = System.getProperty("tests.locale", "random"); |
| /** Gets the timezone to run tests with */ |
| - static final String TEST_TIMEZONE = System.getProperty("tests.timezone", "random"); |
| + public static final String TEST_TIMEZONE = System.getProperty("tests.timezone", "random"); |
| /** Gets the directory to run tests with */ |
| - static final String TEST_DIRECTORY = System.getProperty("tests.directory", "random"); |
| + public static final String TEST_DIRECTORY = System.getProperty("tests.directory", "random"); |
| /** Get the number of times to run tests */ |
| - static final int TEST_ITER = Integer.parseInt(System.getProperty("tests.iter", "1")); |
| + public static final int TEST_ITER = Integer.parseInt(System.getProperty("tests.iter", "1")); |
| /** Get the random seed for tests */ |
| - static final String TEST_SEED = System.getProperty("tests.seed", "random"); |
| + public static final String TEST_SEED = System.getProperty("tests.seed", "random"); |
| /** whether or not nightly tests should run */ |
| - static final boolean TEST_NIGHTLY = Boolean.parseBoolean(System.getProperty("tests.nightly", "false")); |
| + public static final boolean TEST_NIGHTLY = Boolean.parseBoolean(System.getProperty("tests.nightly", "false")); |
| + /** the line file used by LineFileDocs */ |
| + public static final String TEST_LINE_DOCS_FILE = System.getProperty("tests.linedocsfile", "europarl.lines.txt.gz"); |
| |
| private static final Pattern codecWithParam = Pattern.compile("(.*)\\(\\s*(\\d+)\\s*\\)"); |
| |
| Index: lucene/src/test/org/apache/lucene/util/europarl.lines.txt.gz |
| Binary file lucene/src/test/org/apache/lucene/util/europarl.lines.txt.gz has changed |
| Index: lucene/src/test/org/apache/lucene/util/makeEuroparlLineFile.py |
| --- /dev/null Thu Jan 01 00:00:00 1970 +0000 |
| +++ lucene/src/test/org/apache/lucene/util/makeEuroparlLineFile.py Thu Dec 09 13:14:33 2010 -0500 |
| @@ -0,0 +1,137 @@ |
| +# Licensed to the Apache Software Foundation (ASF) under one or more |
| +# contributor license agreements. See the NOTICE file distributed with |
| +# this work for additional information regarding copyright ownership. |
| +# The ASF licenses this file to You under the Apache License, Version 2.0 |
| +# (the "License"); you may not use this file except in compliance with |
| +# the License. You may obtain a copy of the License at |
| +# |
| +# http://www.apache.org/licenses/LICENSE-2.0 |
| +# |
| +# Unless required by applicable law or agreed to in writing, software |
| +# distributed under the License is distributed on an "AS IS" BASIS, |
| +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| +# See the License for the specific language governing permissions and |
| +# limitations under the License. |
| + |
| +import sys |
| +import glob |
| +import datetime |
| +import tarfile |
| +import re |
| + |
| +try: |
| + sys.argv.remove('-verbose') |
| + VERBOSE = True |
| +except ValueError: |
| + VERBOSE = False |
| + |
| +try: |
| + sys.argv.remove('-docPerParagraph') |
| + docPerParagraph = True |
| +except ValueError: |
| + docPerParagraph = False |
| + |
| +reChapterOnly = re.compile('^<CHAPTER ID=.*?>$') |
| +reTagOnly = re.compile('^<.*?>$') |
| +reNumberOnly = re.compile(r'^\d+\.?$') |
| + |
| +docCount = 0 |
| +didEnglish = False |
| + |
| +def write(date, title, pending, fOut): |
| + global docCount |
| + body = ' '.join(pending).replace('\t', ' ').strip() |
| + if len(body) > 0: |
| + line = '%s\t%s\t%s\n' % (title, date, body) |
| + fOut.write(line) |
| + docCount += 1 |
| + del pending[:] |
| + if VERBOSE: |
| + print len(body) |
| + |
| +def processTar(fileName, fOut): |
| + |
| + global didEnglish |
| + |
| + t = tarfile.open(fileName, 'r:gz') |
| + for ti in t: |
| + if ti.isfile() and (not didEnglish or ti.name.find('/en/') == -1): |
| + |
| + tup = ti.name.split('/') |
| + lang = tup[1] |
| + year = int(tup[2][3:5]) |
| + if year < 20: |
| + year += 2000 |
| + else: |
| + year += 1900 |
| + |
| + month = int(tup[2][6:8]) |
| + day = int(tup[2][9:11]) |
| + date = datetime.date(year=year, month=month, day=day) |
| + |
| + if VERBOSE: |
| + print |
| + print '%s: %s' % (ti.name, date) |
| + nextIsTitle = False |
| + title = None |
| + pending = [] |
| + for line in t.extractfile(ti).readlines(): |
| + line = line.strip() |
| + if reChapterOnly.match(line) is not None: |
| + if title is not None: |
| + write(date, title, pending, fOut) |
| + nextIsTitle = True |
| + continue |
| + if nextIsTitle: |
| + if not reNumberOnly.match(line) and not reTagOnly.match(line): |
| + title = line |
| + nextIsTitle = False |
| + if VERBOSE: |
| + print ' title %s' % line |
| + continue |
| + if line.lower() == '<p>': |
| + if docPerParagraph: |
| + write(date, title, pending, fOut) |
| + else: |
| + pending.append('PARSEP') |
| + elif not reTagOnly.match(line): |
| + pending.append(line) |
| + if title is not None and len(pending) > 0: |
| + write(date, title, pending, fOut) |
| + |
| + didEnglish = True |
| + |
| +# '/x/lucene/data/europarl/all.lines.txt' |
| +dirIn = sys.argv[1] |
| +fileOut = sys.argv[2] |
| + |
| +fOut = open(fileOut, 'wb') |
| + |
| +for fileName in glob.glob('%s/??-??.tgz' % dirIn): |
| + if fileName.endswith('.tgz'): |
| + print 'process %s; %d docs so far...' % (fileName, docCount) |
| + processTar(fileName, fOut) |
| + |
| +print 'TOTAL: %s' % docCount |
| + |
| +#run something like this: |
| +""" |
| + |
| +# Europarl V5 makes 76,917 docs, avg 38.6 KB per |
| +python -u europarl.py /x/lucene/data/europarl /x/lucene/data/europarl/tmp.lines.txt |
| +shuf /x/lucene/data/europarl/tmp.lines.txt > /x/lucene/data/europarl/full.lines.txt |
| +rm /x/lucene/data/europarl/tmp.lines.txt |
| + |
| +# Run again, this time each paragraph is a doc: |
| +# Europarl V5 makes 5,607,746 paragraphs (one paragraph per line), avg 620 bytes per: |
| +python -u europarl.py /x/lucene/data/europarl /x/lucene/data/europarl/tmp.lines.txt -docPerParagraph |
| +shuf /x/lucene/data/europarl/tmp.lines.txt > /x/lucene/data/europarl/para.lines.txt |
| +rm /x/lucene/data/europarl/tmp.lines.txt |
| + |
| +# ~5.5 MB gzip'd: |
| +head -200 /x/lucene/data/europarl/full.lines.txt > tmp.txt |
| +head -10000 /x/lucene/data/europarl/para.lines.txt >> tmp.txt |
| +shuf tmp.txt > europarl.subset.txt |
| +rm -f tmp.txt |
| +gzip --best europarl.subset.txt |
| +""" |