lucene/core/src/test/org/apache/lucene/index/TestBagOfPostings.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.index;


 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
 import java.util.concurrent.ConcurrentLinkedQueue;
 import java.util.concurrent.CountDownLatch;

 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.TestUtil;

 /**
  * Simple test that adds numeric terms, where each term has the
  * docFreq of its integer value, and checks that the docFreq is correct.
  */
 @SuppressCodecs({"Direct"}) // at night this makes like 200k/300k docs and will make Direct's heart beat!
 public class TestBagOfPostings extends LuceneTestCase {
   public void test() throws Exception {
     List<String> postingsList = new ArrayList<>();
     int numTerms = atLeast(300);
     final int maxTermsPerDoc = TestUtil.nextInt(random(), 10, 20);

     boolean isSimpleText = "SimpleText".equals(TestUtil.getPostingsFormat("field"));

     IndexWriterConfig iwc = newIndexWriterConfig(random(), new MockAnalyzer(random()));

     if ((isSimpleText || iwc.getMergePolicy() instanceof MockRandomMergePolicy) && (TEST_NIGHTLY || RANDOM_MULTIPLIER > 1)) {
       // Otherwise test can take way too long (> 2 hours)
       numTerms /= 2;
     }

     if (VERBOSE) {
       System.out.println("maxTermsPerDoc=" + maxTermsPerDoc);
       System.out.println("numTerms=" + numTerms);
     }

     for (int i = 0; i < numTerms; i++) {
       String term = Integer.toString(i);
       for (int j = 0; j < i; j++) {
         postingsList.add(term);
       }
     }
     Collections.shuffle(postingsList, random());

     final ConcurrentLinkedQueue<String> postings = new ConcurrentLinkedQueue<>(postingsList);

     Directory dir = newFSDirectory(createTempDir("bagofpostings"));
     final RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);

     int threadCount = TestUtil.nextInt(random(), 1, 5);
     if (VERBOSE) {
       System.out.println("config: " + iw.w.getConfig());
       System.out.println("threadCount=" + threadCount);
     }

     Thread[] threads = new Thread[threadCount];
     final CountDownLatch startingGun = new CountDownLatch(1);

     for(int threadID=0;threadID<threadCount;threadID++) {
       threads[threadID] = new Thread() {
           @Override
           public void run() {
             try {
               Document document = new Document();
               Field field = newTextField("field", "", Field.Store.NO);
               document.add(field);
               startingGun.await();
               while (!postings.isEmpty()) {
                 StringBuilder text = new StringBuilder();
                 Set<String> visited = new HashSet<>();
                 for (int i = 0; i < maxTermsPerDoc; i++) {
                   String token = postings.poll();
                   if (token == null) {
                     break;
                   }
                   if (visited.contains(token)) {
                     // Put it back:
                     postings.add(token);
                     break;
                   }
                   text.append(' ');
                   text.append(token);
                   visited.add(token);
                 }
                 field.setStringValue(text.toString());
                 iw.addDocument(document);
               }
             } catch (Exception e) {
               throw new RuntimeException(e);
             }
           }
         };
       threads[threadID].start();
     }
     startingGun.countDown();
     for(Thread t : threads) {
       t.join();
     }

     iw.forceMerge(1);
     DirectoryReader ir = iw.getReader();
     assertEquals(1, ir.leaves().size());
     LeafReader air = ir.leaves().get(0).reader();
     Terms terms = air.terms("field");
     // numTerms-1 because there cannot be a term 0 with 0 postings:
     assertEquals(numTerms-1, terms.size());
     TermsEnum termsEnum = terms.iterator();
     BytesRef term;
     while ((term = termsEnum.next()) != null) {
       int value = Integer.parseInt(term.utf8ToString());
       assertEquals(value, termsEnum.docFreq());
       // don't really need to check more than this, as CheckIndex
       // will verify that docFreq == actual number of documents seen
       // from a postingsEnum.
     }
     ir.close();
     iw.close();
     dir.close();
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.index;


	import java.util.ArrayList;
	import java.util.Collections;
	import java.util.HashSet;
	import java.util.List;
	import java.util.Set;
	import java.util.concurrent.ConcurrentLinkedQueue;
	import java.util.concurrent.CountDownLatch;

	import org.apache.lucene.analysis.MockAnalyzer;
	import org.apache.lucene.document.Document;
	import org.apache.lucene.document.Field;
	import org.apache.lucene.store.Directory;
	import org.apache.lucene.util.BytesRef;
	import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
	import org.apache.lucene.util.LuceneTestCase;
	import org.apache.lucene.util.TestUtil;

	/**
	* Simple test that adds numeric terms, where each term has the
	* docFreq of its integer value, and checks that the docFreq is correct.
	*/
	@SuppressCodecs({"Direct"}) // at night this makes like 200k/300k docs and will make Direct's heart beat!
	public class TestBagOfPostings extends LuceneTestCase {
	public void test() throws Exception {
	List<String> postingsList = new ArrayList<>();
	int numTerms = atLeast(300);
	final int maxTermsPerDoc = TestUtil.nextInt(random(), 10, 20);

	boolean isSimpleText = "SimpleText".equals(TestUtil.getPostingsFormat("field"));

	IndexWriterConfig iwc = newIndexWriterConfig(random(), new MockAnalyzer(random()));

	if ((isSimpleText \|\| iwc.getMergePolicy() instanceof MockRandomMergePolicy) && (TEST_NIGHTLY \|\| RANDOM_MULTIPLIER > 1)) {
	// Otherwise test can take way too long (> 2 hours)
	numTerms /= 2;
	}

	if (VERBOSE) {
	System.out.println("maxTermsPerDoc=" + maxTermsPerDoc);
	System.out.println("numTerms=" + numTerms);
	}

	for (int i = 0; i < numTerms; i++) {
	String term = Integer.toString(i);
	for (int j = 0; j < i; j++) {
	postingsList.add(term);
	}
	}
	Collections.shuffle(postingsList, random());

	final ConcurrentLinkedQueue<String> postings = new ConcurrentLinkedQueue<>(postingsList);

	Directory dir = newFSDirectory(createTempDir("bagofpostings"));
	final RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);

	int threadCount = TestUtil.nextInt(random(), 1, 5);
	if (VERBOSE) {
	System.out.println("config: " + iw.w.getConfig());
	System.out.println("threadCount=" + threadCount);
	}

	Thread[] threads = new Thread[threadCount];
	final CountDownLatch startingGun = new CountDownLatch(1);

	for(int threadID=0;threadID<threadCount;threadID++) {
	threads[threadID] = new Thread() {
	@Override
	public void run() {
	try {
	Document document = new Document();
	Field field = newTextField("field", "", Field.Store.NO);
	document.add(field);
	startingGun.await();
	while (!postings.isEmpty()) {
	StringBuilder text = new StringBuilder();
	Set<String> visited = new HashSet<>();
	for (int i = 0; i < maxTermsPerDoc; i++) {
	String token = postings.poll();
	if (token == null) {
	break;
	}
	if (visited.contains(token)) {
	// Put it back:
	postings.add(token);
	break;
	}
	text.append(' ');
	text.append(token);
	visited.add(token);
	}
	field.setStringValue(text.toString());
	iw.addDocument(document);
	}
	} catch (Exception e) {
	throw new RuntimeException(e);
	}
	}
	};
	threads[threadID].start();
	}
	startingGun.countDown();
	for(Thread t : threads) {
	t.join();
	}

	iw.forceMerge(1);
	DirectoryReader ir = iw.getReader();
	assertEquals(1, ir.leaves().size());
	LeafReader air = ir.leaves().get(0).reader();
	Terms terms = air.terms("field");
	// numTerms-1 because there cannot be a term 0 with 0 postings:
	assertEquals(numTerms-1, terms.size());
	TermsEnum termsEnum = terms.iterator();
	BytesRef term;
	while ((term = termsEnum.next()) != null) {
	int value = Integer.parseInt(term.utf8ToString());
	assertEquals(value, termsEnum.docFreq());
	// don't really need to check more than this, as CheckIndex
	// will verify that docFreq == actual number of documents seen
	// from a postingsEnum.
	}
	ir.close();
	iw.close();
	dir.close();
	}
	}