| package org.apache.lucene.index; |
| |
| /** |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.Collections; |
| import java.util.Comparator; |
| import java.util.HashMap; |
| import java.util.Iterator; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Random; |
| |
| import junit.framework.Assert; |
| |
| import org.apache.lucene.analysis.WhitespaceAnalyzer; |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.document.Field; |
| import org.apache.lucene.document.Fieldable; |
| import org.apache.lucene.index.IndexWriterConfig.OpenMode; |
| import org.apache.lucene.search.TermQuery; |
| import org.apache.lucene.store.Directory; |
| import org.apache.lucene.store.MockRAMDirectory; |
| import org.apache.lucene.util.LuceneTestCase; |
| import org.apache.lucene.util.StringHelper; |
| import org.apache.lucene.util._TestUtil; |
| |
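/**
 * Stress test: multiple threads index random documents (adds, updates and
 * deletes) through a single IndexWriter with no external synchronization,
 * and the resulting index is verified against a serially built one.
 */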
| public class TestStressIndexing2 extends LuceneTestCase { |
  // Defaults; testMultiConfig randomizes several of these per iteration.
  static int maxFields = 4;
  static int bigFieldSize = 10;
  static boolean sameFieldOrder = false;
  static int mergeFactor = 3;
  static int maxBufferedDocs = 3;
  static int seed = 0;

  Random r; // per-test source of randomness, set via newRandom()
| |
| public class MockIndexWriter extends IndexWriter { |
| |
| public MockIndexWriter(Directory dir, IndexWriterConfig conf) throws IOException { |
| super(dir, conf); |
| } |
| |
    @Override
    boolean testPoint(String name) {
      // Randomly yield at IndexWriter's internal test points to encourage
      // different thread interleavings and shake out concurrency bugs.
      if (r.nextInt(4) == 2)
        Thread.yield();
      return true;
    }
| } |
| |
| public void testRandomIWReader() throws Throwable { |
| r = newRandom(); |
| Directory dir = new MockRAMDirectory(); |
| |
    // Verify equivalence using the near-real-time reader from IW.getReader.
    DocsAndWriter dw = indexRandomIWReader(10, 10, 100, dir);
    IndexReader reader = dw.writer.getReader();   // don't shadow the Random field r
    dw.writer.commit();
    verifyEquals(reader, dir, "id");
    reader.close();
    dw.writer.close();
| dir.close(); |
| } |
| |
| public void testRandom() throws Throwable { |
| r = newRandom(); |
    Directory dir1 = new MockRAMDirectory();
    Directory dir2 = new MockRAMDirectory();

    // Index the same docs concurrently into dir1 and serially into dir2,
    // then verify the two indexes are equivalent.
    int maxThreadStates = 1+r.nextInt(10);
    Map<String,Document> docs = indexRandom(10, 10, 100, dir1, maxThreadStates);
    indexSerial(docs, dir2);

    verifyEquals(dir1, dir2, "id");
    dir1.close();
    dir2.close();
| } |
| |
| public void testMultiConfig() throws Throwable { |
| // test lots of smaller different params together |
| r = newRandom(); |
| for (int i=0; i<20; i++) { // increase iterations for better testing |
| sameFieldOrder=r.nextBoolean(); |
| mergeFactor=r.nextInt(3)+2; |
| maxBufferedDocs=r.nextInt(3)+2; |
| int maxThreadStates = 1+r.nextInt(10); |
| seed++; |
| |
| int nThreads=r.nextInt(5)+1; |
| int iter=r.nextInt(10)+1; |
| int range=r.nextInt(20)+1; |
| Directory dir1 = new MockRAMDirectory(); |
| Directory dir2 = new MockRAMDirectory(); |
| Map<String,Document> docs = indexRandom(nThreads, iter, range, dir1, maxThreadStates); |
| indexSerial(docs, dir2); |
      verifyEquals(dir1, dir2, "id");
      dir1.close();
      dir2.close();
| } |
| } |
| |
| |
  static Term idTerm = new Term("id", ""); // every doc is keyed on this field
| IndexingThread[] threads; |
| static Comparator<Fieldable> fieldNameComparator = new Comparator<Fieldable>() { |
| public int compare(Fieldable o1, Fieldable o2) { |
| return o1.name().compareTo(o2.name()); |
| } |
| }; |
| |
| // This test avoids using any extra synchronization in the multiple |
| // indexing threads to test that IndexWriter does correctly synchronize |
| // everything. |
| |
| public static class DocsAndWriter { |
| Map<String,Document> docs; |
| IndexWriter writer; |
| } |
| |
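  /**
   * Indexes random docs from nThreads concurrent threads and returns both the
   * surviving docs and the still-open writer, so the caller can verify the
   * near-real-time reader obtained from it.
   */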
| public DocsAndWriter indexRandomIWReader(int nThreads, int iterations, int range, Directory dir) throws IOException, InterruptedException { |
| Map<String,Document> docs = new HashMap<String,Document>(); |
| IndexWriter w = new MockIndexWriter(dir, new IndexWriterConfig( |
| TEST_VERSION_CURRENT, new WhitespaceAnalyzer(TEST_VERSION_CURRENT)).setOpenMode(OpenMode.CREATE).setRAMBufferSizeMB( |
| 0.1).setMaxBufferedDocs(maxBufferedDocs)); |
| LogMergePolicy lmp = (LogMergePolicy) w.getConfig().getMergePolicy(); |
| lmp.setUseCompoundFile(false); |
| lmp.setUseCompoundDocStore(false); |
| lmp.setMergeFactor(mergeFactor); |
| /*** |
| w.setMaxMergeDocs(Integer.MAX_VALUE); |
| w.setMaxFieldLength(10000); |
| w.setRAMBufferSizeMB(1); |
| w.setMergeFactor(10); |
| ***/ |
| |
| threads = new IndexingThread[nThreads]; |
| for (int i=0; i<threads.length; i++) { |
| IndexingThread th = new IndexingThread(); |
| th.w = w; |
| th.base = 1000000*i; |
| th.range = range; |
| th.iterations = iterations; |
| threads[i] = th; |
| } |
| |
| for (int i=0; i<threads.length; i++) { |
| threads[i].start(); |
| } |
| for (int i=0; i<threads.length; i++) { |
| threads[i].join(); |
| } |
| |
    // Intentionally leave the writer open; the caller verifies against its
    // near-real-time reader before closing it.
| |
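    // Collect each thread's docs under its lock; this pairs with the
    // synchronized block at the end of IndexingThread.run() to guarantee
    // visibility of each thread's final map.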
| for (int i=0; i<threads.length; i++) { |
| IndexingThread th = threads[i]; |
| synchronized(th) { |
| docs.putAll(th.docs); |
| } |
| } |
| |
| _TestUtil.checkIndex(dir); |
| DocsAndWriter dw = new DocsAndWriter(); |
| dw.docs = docs; |
| dw.writer = w; |
| return dw; |
| } |
| |
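  /**
   * Builds the index three times from scratch (OpenMode.CREATE) with nThreads
   * concurrent threads. Thread seeds depend only on base, range and seed,
   * none of which change across the passes, so every pass applies the same
   * operations and the returned docs map matches the final index.
   */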
| public Map<String,Document> indexRandom(int nThreads, int iterations, int range, Directory dir, int maxThreadStates) throws IOException, InterruptedException { |
| Map<String,Document> docs = new HashMap<String,Document>(); |
| for(int iter=0;iter<3;iter++) { |
| IndexWriter w = new MockIndexWriter(dir, new IndexWriterConfig( |
| TEST_VERSION_CURRENT, new WhitespaceAnalyzer(TEST_VERSION_CURRENT)).setOpenMode(OpenMode.CREATE) |
| .setRAMBufferSizeMB(0.1).setMaxBufferedDocs(maxBufferedDocs).setMaxThreadStates(maxThreadStates)); |
| LogMergePolicy lmp = (LogMergePolicy) w.getConfig().getMergePolicy(); |
| lmp.setUseCompoundFile(false); |
| lmp.setUseCompoundDocStore(false); |
| lmp.setMergeFactor(mergeFactor); |
| |
| threads = new IndexingThread[nThreads]; |
| for (int i=0; i<threads.length; i++) { |
| IndexingThread th = new IndexingThread(); |
| th.w = w; |
| th.base = 1000000*i; |
| th.range = range; |
| th.iterations = iterations; |
| threads[i] = th; |
| } |
| |
| for (int i=0; i<threads.length; i++) { |
| threads[i].start(); |
| } |
| for (int i=0; i<threads.length; i++) { |
| threads[i].join(); |
| } |
| |
      w.close();
| |
| for (int i=0; i<threads.length; i++) { |
| IndexingThread th = threads[i]; |
| synchronized(th) { |
| docs.putAll(th.docs); |
| } |
| } |
| } |
| |
| _TestUtil.checkIndex(dir); |
| |
| return docs; |
| } |
| |
| |
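  /** Indexes the given docs in a single thread, adding fields in a deterministic (name-sorted) order. */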
| public static void indexSerial(Map<String,Document> docs, Directory dir) throws IOException { |
| IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new WhitespaceAnalyzer(TEST_VERSION_CURRENT))); |
| |
| // index all docs in a single thread |
    for (Document d : docs.values()) {
      ArrayList<Fieldable> fields = new ArrayList<Fieldable>(d.getFields());
      // put fields in the same order each time
      Collections.sort(fields, fieldNameComparator);

      Document d1 = new Document();
      d1.setBoost(d.getBoost());
      for (Fieldable field : fields) {
        d1.add(field);
      }
      w.addDocument(d1);
    }
| |
| w.close(); |
| } |
| |
| public static void verifyEquals(IndexReader r1, Directory dir2, String idField) throws Throwable { |
| IndexReader r2 = IndexReader.open(dir2, true); |
| verifyEquals(r1, r2, idField); |
| r2.close(); |
| } |
| |
| public static void verifyEquals(Directory dir1, Directory dir2, String idField) throws Throwable { |
| IndexReader r1 = IndexReader.open(dir1, true); |
| IndexReader r2 = IndexReader.open(dir2, true); |
| verifyEquals(r1, r2, idField); |
| r1.close(); |
| r2.close(); |
| } |
| |
| |
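  /**
   * Verifies the two readers contain equivalent indexes: the id field is used
   * to build a docid mapping from r2 to r1, stored fields and term vectors
   * are compared per id, and then every term's postings are compared after
   * remapping r2 docids into r1 space.
   */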
| public static void verifyEquals(IndexReader r1, IndexReader r2, String idField) throws Throwable { |
| assertEquals(r1.numDocs(), r2.numDocs()); |
| boolean hasDeletes = !(r1.maxDoc()==r2.maxDoc() && r1.numDocs()==r1.maxDoc()); |
| |
| int[] r2r1 = new int[r2.maxDoc()]; // r2 id to r1 id mapping |
| |
| TermDocs termDocs1 = r1.termDocs(); |
| TermDocs termDocs2 = r2.termDocs(); |
| |
    // create mapping from id2 space to id1 space based on idField
    idField = StringHelper.intern(idField);
    TermEnum termEnum = r1.terms (new Term (idField, ""));
    do {
      Term term = termEnum.term();
      // field names are interned, so reference comparison is safe here
      if (term==null || term.field() != idField) break;
| |
| termDocs1.seek (termEnum); |
| if (!termDocs1.next()) { |
| // This doc is deleted and wasn't replaced |
| termDocs2.seek(termEnum); |
| assertFalse(termDocs2.next()); |
| continue; |
| } |
| |
| int id1 = termDocs1.doc(); |
| assertFalse(termDocs1.next()); |
| |
| termDocs2.seek(termEnum); |
| assertTrue(termDocs2.next()); |
| int id2 = termDocs2.doc(); |
| assertFalse(termDocs2.next()); |
| |
| r2r1[id2] = id1; |
| |
| // verify stored fields are equivalent |
| try { |
| verifyEquals(r1.document(id1), r2.document(id2)); |
| } catch (Throwable t) { |
        System.out.println("FAILED id=" + term + " id1=" + id1 + " id2=" + id2);
| System.out.println(" d1=" + r1.document(id1)); |
| System.out.println(" d2=" + r2.document(id2)); |
| throw t; |
| } |
| |
| try { |
| // verify term vectors are equivalent |
| verifyEquals(r1.getTermFreqVectors(id1), r2.getTermFreqVectors(id2)); |
| } catch (Throwable e) { |
| System.out.println("FAILED id=" + term + " id1=" + id1 + " id2=" + id2); |
| TermFreqVector[] tv1 = r1.getTermFreqVectors(id1); |
| System.out.println(" d1=" + tv1); |
| if (tv1 != null) |
| for(int i=0;i<tv1.length;i++) |
| System.out.println(" " + i + ": " + tv1[i]); |
| |
| TermFreqVector[] tv2 = r2.getTermFreqVectors(id2); |
| System.out.println(" d2=" + tv2); |
| if (tv2 != null) |
| for(int i=0;i<tv2.length;i++) |
| System.out.println(" " + i + ": " + tv2[i]); |
| |
| throw e; |
| } |
| |
| } while (termEnum.next()); |
| |
| termEnum.close(); |
| |
| // Verify postings |
| TermEnum termEnum1 = r1.terms (new Term ("", "")); |
| TermEnum termEnum2 = r2.terms (new Term ("", "")); |
| |
| // pack both doc and freq into single element for easy sorting |
| long[] info1 = new long[r1.numDocs()]; |
| long[] info2 = new long[r2.numDocs()]; |
| |
| for(;;) { |
| Term term1,term2; |
| |
| // iterate until we get some docs |
| int len1; |
| for(;;) { |
| len1=0; |
| term1 = termEnum1.term(); |
| if (term1==null) break; |
| termDocs1.seek(termEnum1); |
| while (termDocs1.next()) { |
| int d1 = termDocs1.doc(); |
| int f1 = termDocs1.freq(); |
| info1[len1] = (((long)d1)<<32) | f1; |
| len1++; |
| } |
| if (len1>0) break; |
| if (!termEnum1.next()) break; |
| } |
| |
| // iterate until we get some docs |
| int len2; |
| for(;;) { |
| len2=0; |
| term2 = termEnum2.term(); |
| if (term2==null) break; |
| termDocs2.seek(termEnum2); |
| while (termDocs2.next()) { |
| int d2 = termDocs2.doc(); |
| int f2 = termDocs2.freq(); |
| info2[len2] = (((long)r2r1[d2])<<32) | f2; |
| len2++; |
| } |
| if (len2>0) break; |
| if (!termEnum2.next()) break; |
| } |
| |
| if (!hasDeletes) |
| assertEquals(termEnum1.docFreq(), termEnum2.docFreq()); |
| |
| assertEquals(len1, len2); |
| if (len1==0) break; // no more terms |
| |
| assertEquals(term1, term2); |
| |
      // sort info2 to get it into ascending order by remapped (r1) docid
| Arrays.sort(info2, 0, len2); |
| |
| // now compare |
| for (int i=0; i<len1; i++) { |
| assertEquals(info1[i], info2[i]); |
| } |
| |
| termEnum1.next(); |
| termEnum2.next(); |
| } |
| } |
| |
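  /**
   * Verifies two documents carry the same fields; fields are compared in
   * name-sorted order, and only string values are checked (binary fields are
   * just verified to be binary on both sides).
   */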
| public static void verifyEquals(Document d1, Document d2) { |
| List<Fieldable> ff1 = d1.getFields(); |
| List<Fieldable> ff2 = d2.getFields(); |
| |
| Collections.sort(ff1, fieldNameComparator); |
| Collections.sort(ff2, fieldNameComparator); |
| |
| assertEquals(ff1 + " : " + ff2, ff1.size(), ff2.size()); |
| |
| for (int i=0; i<ff1.size(); i++) { |
| Fieldable f1 = ff1.get(i); |
| Fieldable f2 = ff2.get(i); |
      if (f1.isBinary()) {
        assertTrue(f2.isBinary()); // use a real assertion, not the assert keyword
| } else { |
| String s1 = f1.stringValue(); |
| String s2 = f2.stringValue(); |
| assertEquals(ff1 + " : " + ff2, s1,s2); |
| } |
| } |
| } |
| |
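  /**
   * Verifies two term-vector arrays are equivalent: same terms and
   * frequencies, and, for position vectors, the same positions and offsets.
   */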
| public static void verifyEquals(TermFreqVector[] d1, TermFreqVector[] d2) { |
    if (d1 == null) {
      assertNull(d2);
      return;
    }
    assertNotNull(d2);
| |
| assertEquals(d1.length, d2.length); |
| for(int i=0;i<d1.length;i++) { |
| TermFreqVector v1 = d1[i]; |
| TermFreqVector v2 = d2[i]; |
      if (v1 == null || v2 == null) {
        System.out.println("v1=" + v1 + " v2=" + v2 + " i=" + i + " of " + d1.length);
      }
      assertNotNull(v1); // fail cleanly instead of hitting an NPE below
      assertNotNull(v2);
      assertEquals(v1.size(), v2.size());
| int numTerms = v1.size(); |
| String[] terms1 = v1.getTerms(); |
| String[] terms2 = v2.getTerms(); |
| int[] freq1 = v1.getTermFrequencies(); |
| int[] freq2 = v2.getTermFrequencies(); |
| for(int j=0;j<numTerms;j++) { |
        assertEquals(terms1[j], terms2[j]);
| assertEquals(freq1[j], freq2[j]); |
| } |
| if (v1 instanceof TermPositionVector) { |
| assertTrue(v2 instanceof TermPositionVector); |
| TermPositionVector tpv1 = (TermPositionVector) v1; |
| TermPositionVector tpv2 = (TermPositionVector) v2; |
| for(int j=0;j<numTerms;j++) { |
| int[] pos1 = tpv1.getTermPositions(j); |
| int[] pos2 = tpv2.getTermPositions(j); |
| assertEquals(pos1.length, pos2.length); |
| TermVectorOffsetInfo[] offsets1 = tpv1.getOffsets(j); |
| TermVectorOffsetInfo[] offsets2 = tpv2.getOffsets(j); |
          if (offsets1 == null)
            assertNull(offsets2);
          else
            assertNotNull(offsets2);
| for(int k=0;k<pos1.length;k++) { |
| assertEquals(pos1[k], pos2[k]); |
| if (offsets1 != null) { |
| assertEquals(offsets1[k].getStartOffset(), |
| offsets2[k].getStartOffset()); |
| assertEquals(offsets1[k].getEndOffset(), |
| offsets2[k].getEndOffset()); |
| } |
| } |
| } |
| } |
| } |
| } |
| |
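  /**
   * A thread that performs a deterministic (seeded) sequence of random
   * adds/updates (~90%) and deletes by term or by query (~10%), tracking the
   * surviving docs in its own map.
   */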
| private static class IndexingThread extends Thread { |
| IndexWriter w; |
| int base; |
| int range; |
| int iterations; |
| Map<String,Document> docs = new HashMap<String,Document>(); |
| Random r; |
| |
| public int nextInt(int lim) { |
| return r.nextInt(lim); |
| } |
| |
| // start is inclusive and end is exclusive |
| public int nextInt(int start, int end) { |
| return start + r.nextInt(end-start); |
| } |
| |
| char[] buffer = new char[100]; |
| |
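    // Appends one random whitespace-terminated token (up to 19 chars) to
    // buffer starting at start. Characters are drawn from several Unicode
    // ranges, including valid surrogate pairs and deliberately illegal
    // unpaired surrogates; returns the index just past the trailing space.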
| private int addUTF8Token(int start) { |
| final int end = start + nextInt(20); |
| if (buffer.length < 1+end) { |
| char[] newBuffer = new char[(int) ((1+end)*1.25)]; |
| System.arraycopy(buffer, 0, newBuffer, 0, buffer.length); |
| buffer = newBuffer; |
| } |
| |
| for(int i=start;i<end;i++) { |
| int t = nextInt(6); |
| if (0 == t && i < end-1) { |
| // Make a surrogate pair |
| // High surrogate |
| buffer[i++] = (char) nextInt(0xd800, 0xdc00); |
| // Low surrogate |
| buffer[i] = (char) nextInt(0xdc00, 0xe000); |
| } else if (t <= 1) |
| buffer[i] = (char) nextInt(0x80); |
| else if (2 == t) |
| buffer[i] = (char) nextInt(0x80, 0x800); |
| else if (3 == t) |
| buffer[i] = (char) nextInt(0x800, 0xd800); |
| else if (4 == t) |
| buffer[i] = (char) nextInt(0xe000, 0xffff); |
| else if (5 == t) { |
| // Illegal unpaired surrogate |
| if (r.nextBoolean()) |
| buffer[i] = (char) nextInt(0xd800, 0xdc00); |
| else |
| buffer[i] = (char) nextInt(0xdc00, 0xe000); |
| } |
| } |
| buffer[end] = ' '; |
| return 1+end; |
| } |
| |
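    // Returns nTokens whitespace-separated tokens (1-4 tokens if nTokens is
    // 0): half the time a random UTF-8-exercising string, otherwise simple
    // single-letter ('A'-'J') tokens.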
| public String getString(int nTokens) { |
| nTokens = nTokens!=0 ? nTokens : r.nextInt(4)+1; |
| |
| // Half the time make a random UTF8 string |
| if (r.nextBoolean()) |
| return getUTF8String(nTokens); |
| |
| // avoid StringBuffer because it adds extra synchronization. |
| char[] arr = new char[nTokens*2]; |
| for (int i=0; i<nTokens; i++) { |
| arr[i*2] = (char)('A' + r.nextInt(10)); |
| arr[i*2+1] = ' '; |
| } |
| return new String(arr); |
| } |
| |
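    // Builds a string of nTokens random tokens via addUTF8Token, reusing the
    // shared per-thread buffer.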
| public String getUTF8String(int nTokens) { |
| int upto = 0; |
| Arrays.fill(buffer, (char) 0); |
| for(int i=0;i<nTokens;i++) |
| upto = addUTF8Token(upto); |
| return new String(buffer, 0, upto); |
| } |
| |
| public String getIdString() { |
| return Integer.toString(base + nextInt(range)); |
| } |
| |
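    // Builds a document with an id field plus up to maxFields-1 random fields
    // (random store/index/term-vector options), then adds it via
    // updateDocument keyed on the id term, so re-used ids replace older docs.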
| public void indexDoc() throws IOException { |
| Document d = new Document(); |
| |
| ArrayList<Field> fields = new ArrayList<Field>(); |
| String idString = getIdString(); |
| Field idField = new Field(idTerm.field(), idString, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS); |
| fields.add(idField); |
| |
| int nFields = nextInt(maxFields); |
| for (int i=0; i<nFields; i++) { |
| |
| Field.TermVector tvVal = Field.TermVector.NO; |
| switch (nextInt(4)) { |
| case 0: |
| tvVal = Field.TermVector.NO; |
| break; |
| case 1: |
| tvVal = Field.TermVector.YES; |
| break; |
| case 2: |
| tvVal = Field.TermVector.WITH_POSITIONS; |
| break; |
| case 3: |
| tvVal = Field.TermVector.WITH_POSITIONS_OFFSETS; |
| break; |
| } |
| |
| switch (nextInt(4)) { |
| case 0: |
| fields.add(new Field("f" + nextInt(100), getString(1), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS, tvVal)); |
| break; |
| case 1: |
| fields.add(new Field("f" + nextInt(100), getString(0), Field.Store.NO, Field.Index.ANALYZED, tvVal)); |
| break; |
| case 2: |
| fields.add(new Field("f" + nextInt(100), getString(0), Field.Store.YES, Field.Index.NO, Field.TermVector.NO)); |
| break; |
| case 3: |
| fields.add(new Field("f" + nextInt(100), getString(bigFieldSize), Field.Store.YES, Field.Index.ANALYZED, tvVal)); |
| break; |
| } |
| } |
| |
| if (sameFieldOrder) { |
| Collections.sort(fields, fieldNameComparator); |
| } else { |
| // random placement of id field also |
| Collections.swap(fields,nextInt(fields.size()), 0); |
| } |
| |
| for (int i=0; i<fields.size(); i++) { |
| d.add(fields.get(i)); |
| } |
| w.updateDocument(idTerm.createTerm(idString), d); |
| // System.out.println("indexing "+d); |
| docs.put(idString, d); |
| } |
| |
| public void deleteDoc() throws IOException { |
| String idString = getIdString(); |
| w.deleteDocuments(idTerm.createTerm(idString)); |
| docs.remove(idString); |
| } |
| |
| public void deleteByQuery() throws IOException { |
| String idString = getIdString(); |
| w.deleteDocuments(new TermQuery(idTerm.createTerm(idString))); |
| docs.remove(idString); |
| } |
| |
| @Override |
| public void run() { |
| try { |
| r = new Random(base+range+seed); |
| for (int i=0; i<iterations; i++) { |
| int what = nextInt(100); |
| if (what < 5) { |
| deleteDoc(); |
| } else if (what < 10) { |
| deleteByQuery(); |
| } else { |
| indexDoc(); |
| } |
| } |
| } catch (Throwable e) { |
| e.printStackTrace(); |
| Assert.fail(e.toString()); |
| } |
| |
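      // Touch docs under this thread's lock; the main thread reads the map
      // while holding the same lock, which guarantees it sees the final state.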
| synchronized (this) { |
| docs.size(); |
| } |
| } |
| } |
| |
| } |