oak-benchmarks/src/main/java/org/apache/jackrabbit/oak/benchmark/FullTextSearchTest.java - jackrabbit-oak - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */
 package org.apache.jackrabbit.oak.benchmark;

 import static com.google.common.collect.Lists.newArrayList;
 import static com.google.common.collect.Sets.newHashSet;

 import java.io.File;
 import java.io.IOException;
 import java.util.List;
 import java.util.Random;
 import java.util.Set;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.TimeUnit;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;

 import javax.jcr.Node;
 import javax.jcr.Repository;
 import javax.jcr.Session;
 import javax.jcr.query.Query;
 import javax.jcr.query.QueryManager;
 import javax.jcr.query.QueryResult;
 import javax.jcr.query.RowIterator;

 import org.apache.commons.io.FileUtils;
 import org.apache.jackrabbit.oak.Oak;
 import org.apache.jackrabbit.oak.benchmark.wikipedia.WikipediaImport;
 import org.apache.jackrabbit.oak.fixture.JcrCreator;
 import org.apache.jackrabbit.oak.fixture.OakRepositoryFixture;
 import org.apache.jackrabbit.oak.fixture.RepositoryFixture;
 import org.apache.jackrabbit.oak.jcr.Jcr;
 import org.apache.jackrabbit.oak.plugins.index.lucene.IndexCopier;
 import org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexEditorProvider;
 import org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexProvider;
 import org.apache.jackrabbit.oak.plugins.index.lucene.util.LuceneInitializerHelper;
 import org.apache.jackrabbit.oak.spi.commit.Observer;
 import org.apache.jackrabbit.oak.spi.query.QueryIndexProvider;

 /**
  * Benchmark that imports a Wikipedia dump into the repository and then
  * measures full-text queries ({@code jcr:contains} on the {@code text}
  * property) for words sampled from the imported pages.
  */
 public class FullTextSearchTest extends AbstractTest<FullTextSearchTest.TestContext> {

     /**
      * Pattern used to find words and other searchable tokens within the
      * imported Wikipedia pages.
      */
     private static final Pattern WORD_PATTERN =
             Pattern.compile("\\p{LD}{3,}");

     /** Upper bound on the number of distinct sample words collected during import. */
     private final int maxSampleSize = 100;

     /** Read from the {@code disableCopyOnRead} system property; when set, no {@link IndexCopier} is used. */
     private final boolean disableCopyOnRead = Boolean.getBoolean("disableCopyOnRead");

     private final WikipediaImport importer;

     /** Words picked from imported pages; the query terms are drawn from this set. */
     private final Set<String> sampleSet = newHashSet();

     private final Random random = new Random(42); //fixed seed

     /** Number of pages reported by the importer so far. */
     private int count = 0;

     /** Maximum rows read per query; overridable via the {@code maxRowsToFetch} system property. */
     private final int maxRowsToFetch = Integer.getInteger("maxRowsToFetch", 100);

     /** Context used by the single-threaded {@link #runTest()} entry point. */
     private TestContext defaultContext;

     /**
      * Passed as the second argument to {@link LuceneInitializerHelper}.
      * null means true; true means true
      */
     protected Boolean storageEnabled;

     /** Executor handed to the {@link IndexCopier}; shut down in {@link #afterSuite()}. */
     private final ExecutorService executorService = Executors.newFixedThreadPool(2);

     /** Temporary directory handed to the {@link IndexCopier}; deleted in {@link #afterSuite()}. */
     private final File indexCopierDir;

     /**
      * Creates the benchmark.
      *
      * @param dump           forwarded to {@link WikipediaImport}
      * @param flat           forwarded to {@link WikipediaImport}
      * @param doReport       forwarded to {@link WikipediaImport}
      * @param storageEnabled stored in {@link #storageEnabled}
      */
     public FullTextSearchTest(File dump, boolean flat, boolean doReport, Boolean storageEnabled) {
         this.importer = new WikipediaImport(dump, flat, doReport) {
             @Override
             protected void pageAdded(String title, String text) {
                 count++;
                 // Sample only every 100th page, and only until enough words are collected.
                 if (count % 100 == 0
                         && sampleSet.size() < maxSampleSize
                         && text != null) {
                     List<String> words = newArrayList();

                     Matcher matcher = WORD_PATTERN.matcher(text);
                     while (matcher.find()) {
                         words.add(matcher.group());
                     }

                     if (!words.isEmpty()) {
                         // Take the token in the middle of the page text.
                         sampleSet.add(words.get(words.size() / 2));
                     }
                 }
             }
         };
         this.storageEnabled = storageEnabled;
         this.indexCopierDir = createTemporaryFolder(null);
     }

     /**
      * Resets the sampling state, imports the Wikipedia dump and prepares the
      * default execution context.
      */
     @Override
     public void beforeSuite() throws Exception {
         random.setSeed(42);
         sampleSet.clear();
         count = 0;

         importer.importWikipedia(loginWriter());
         Thread.sleep(10); // allow some time for the indexer to catch up

         defaultContext = new TestContext();
     }

     /**
      * Shuts down the executor and removes the index copier directory. The
      * directory is removed even when waiting for the executor fails.
      */
     @Override
     protected void afterSuite() throws Exception {
         try {
             executorService.shutdown();
             executorService.awaitTermination(1, TimeUnit.MINUTES);
         } finally {
             FileUtils.deleteDirectory(indexCopierDir);
         }
     }

     @Override
     protected TestContext prepareThreadExecutionContext() {
         return new TestContext();
     }

     @Override
     protected void runTest() throws Exception {
         runTest(defaultContext);
     }

     /**
      * Runs one full-text query per word of the given context and reads the
      * {@code text} and {@code title} properties of at most
      * {@link #maxRowsToFetch} result rows per query.
      *
      * @param ec the context supplying the session and the query words
      */
     @SuppressWarnings("deprecation")
     @Override
     protected void runTest(TestContext ec)  throws Exception {
         QueryManager qm = ec.session.getWorkspace().getQueryManager();
         // TODO verify why "order by jcr:score()" accounts for what looks
         // like > 20% of the perf lost in Collections.sort
         for (String word : ec.words) {
             Query q = qm.createQuery("//*[jcr:contains(@text, '" + word + "')] ", Query.XPATH);
             QueryResult r = q.execute();
             RowIterator it = r.getRows();
             for (int rows = 0; it.hasNext() && rows < maxRowsToFetch; rows++) {
                 Node n = it.nextRow().getNode();
                 ec.hash += n.getProperty("text").getString().hashCode();
                 ec.hash += n.getProperty("title").getString().hashCode();
             }
         }
     }

     /** Per-execution state: a session, the words to query for and a result accumulator. */
     class TestContext {
         final Session session = loginWriter();
         final String[] words = getRandomWords();
         int hash = 0; // summary variable to prevent JIT compiler tricks
     }

     /**
      * Draws 100 words (with repetition) from the sample set using the
      * seeded random generator.
      *
      * @return the words to query for
      * @throws IllegalStateException if no sample words have been collected
      */
     private String[] getRandomWords() {
         List<String> samples = newArrayList(sampleSet);
         if (samples.isEmpty()) {
             // Without this guard Random.nextInt(0) fails with an unhelpful message.
             throw new IllegalStateException(
                     "No sample words were collected during the Wikipedia import; cannot pick query terms");
         }
         String[] words = new String[100];
         for (int i = 0; i < words.length; i++) {
             words[i] = samples.get(random.nextInt(samples.size()));
         }
         return words;
     }

     /**
      * For Oak fixtures, sets up a single-node cluster whose repository has the
      * Lucene index provider, editor provider and initializer registered;
      * other fixtures are delegated to the superclass.
      */
     @Override
     protected Repository[] createRepository(RepositoryFixture fixture) throws Exception {
         if (fixture instanceof OakRepositoryFixture) {
             return ((OakRepositoryFixture) fixture).setUpCluster(1, new JcrCreator() {
                 @Override
                 public Jcr customize(Oak oak) {
                     LuceneIndexProvider provider = createLuceneIndexProvider();
                     // The same provider instance is registered in both roles.
                     oak.with((QueryIndexProvider) provider)
                             .with((Observer) provider)
                             .with(new LuceneIndexEditorProvider())
                             .with(new LuceneInitializerHelper("luceneGlobal", storageEnabled));
                     return new Jcr(oak);
                 }
             });
         }
         return super.createRepository(fixture);
     }

     /**
      * Creates the Lucene index provider, backed by an {@link IndexCopier}
      * unless the {@code disableCopyOnRead} system property is set.
      *
      * @return the provider to register with Oak
      */
     private LuceneIndexProvider createLuceneIndexProvider() {
         if (!disableCopyOnRead) {
             try {
                 IndexCopier copier = new IndexCopier(executorService, indexCopierDir, true);
                 return new LuceneIndexProvider(copier);
             } catch (IOException e) {
                 throw new RuntimeException(e);
             }
         }
         return new LuceneIndexProvider();
     }

     /**
      * Creates a uniquely named temporary directory.
      *
      * @param parentFolder directory to create the folder in, or {@code null}
      *                     for the default temporary-file directory
      * @return the newly created directory
      * @throws RuntimeException wrapping an {@link IOException} if the
      *                          directory cannot be created
      */
     private File createTemporaryFolder(File parentFolder){
         try {
             File createdFolder = File.createTempFile("oak", "", parentFolder);
             // createTempFile yields a regular file; replace it with a
             // directory of the same name and fail loudly if that is not possible.
             if (!createdFolder.delete()) {
                 throw new IOException("Unable to delete temporary file " + createdFolder);
             }
             if (!createdFolder.mkdir()) {
                 throw new IOException("Unable to create temporary folder " + createdFolder);
             }
             return createdFolder;
         } catch (IOException e) {
             throw new RuntimeException(e);
         }
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/
	package org.apache.jackrabbit.oak.benchmark;

	import static com.google.common.collect.Lists.newArrayList;
	import static com.google.common.collect.Sets.newHashSet;

	import java.io.File;
	import java.io.IOException;
	import java.util.List;
	import java.util.Random;
	import java.util.Set;
	import java.util.concurrent.ExecutorService;
	import java.util.concurrent.Executors;
	import java.util.concurrent.TimeUnit;
	import java.util.regex.Matcher;
	import java.util.regex.Pattern;

	import javax.jcr.Node;
	import javax.jcr.Repository;
	import javax.jcr.Session;
	import javax.jcr.query.Query;
	import javax.jcr.query.QueryManager;
	import javax.jcr.query.QueryResult;
	import javax.jcr.query.RowIterator;

	import org.apache.commons.io.FileUtils;
	import org.apache.jackrabbit.oak.Oak;
	import org.apache.jackrabbit.oak.benchmark.wikipedia.WikipediaImport;
	import org.apache.jackrabbit.oak.fixture.JcrCreator;
	import org.apache.jackrabbit.oak.fixture.OakRepositoryFixture;
	import org.apache.jackrabbit.oak.fixture.RepositoryFixture;
	import org.apache.jackrabbit.oak.jcr.Jcr;
	import org.apache.jackrabbit.oak.plugins.index.lucene.IndexCopier;
	import org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexEditorProvider;
	import org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexProvider;
	import org.apache.jackrabbit.oak.plugins.index.lucene.util.LuceneInitializerHelper;
	import org.apache.jackrabbit.oak.spi.commit.Observer;
	import org.apache.jackrabbit.oak.spi.query.QueryIndexProvider;

	/**
	 * Benchmark that imports a Wikipedia dump into the repository and then
	 * measures full-text queries ({@code jcr:contains} on the {@code text}
	 * property) for words sampled from the imported pages.
	 */
	public class FullTextSearchTest extends AbstractTest<FullTextSearchTest.TestContext> {

	    /**
	     * Pattern used to find words and other searchable tokens within the
	     * imported Wikipedia pages.
	     */
	    private static final Pattern WORD_PATTERN =
	            Pattern.compile("\\p{LD}{3,}");

	    /** Upper bound on the number of distinct sample words collected during import. */
	    private final int maxSampleSize = 100;

	    /** Read from the {@code disableCopyOnRead} system property; when set, no {@link IndexCopier} is used. */
	    private final boolean disableCopyOnRead = Boolean.getBoolean("disableCopyOnRead");

	    private final WikipediaImport importer;

	    /** Words picked from imported pages; the query terms are drawn from this set. */
	    private final Set<String> sampleSet = newHashSet();

	    private final Random random = new Random(42); //fixed seed

	    /** Number of pages reported by the importer so far. */
	    private int count = 0;

	    /** Maximum rows read per query; overridable via the {@code maxRowsToFetch} system property. */
	    private final int maxRowsToFetch = Integer.getInteger("maxRowsToFetch", 100);

	    /** Context used by the single-threaded {@link #runTest()} entry point. */
	    private TestContext defaultContext;

	    /**
	     * Passed as the second argument to {@link LuceneInitializerHelper}.
	     * null means true; true means true
	     */
	    protected Boolean storageEnabled;

	    /** Executor handed to the {@link IndexCopier}; shut down in {@link #afterSuite()}. */
	    private final ExecutorService executorService = Executors.newFixedThreadPool(2);

	    /** Temporary directory handed to the {@link IndexCopier}; deleted in {@link #afterSuite()}. */
	    private final File indexCopierDir;

	    /**
	     * Creates the benchmark.
	     *
	     * @param dump           forwarded to {@link WikipediaImport}
	     * @param flat           forwarded to {@link WikipediaImport}
	     * @param doReport       forwarded to {@link WikipediaImport}
	     * @param storageEnabled stored in {@link #storageEnabled}
	     */
	    public FullTextSearchTest(File dump, boolean flat, boolean doReport, Boolean storageEnabled) {
	        this.importer = new WikipediaImport(dump, flat, doReport) {
	            @Override
	            protected void pageAdded(String title, String text) {
	                count++;
	                // Sample only every 100th page, and only until enough words are collected.
	                if (count % 100 == 0
	                        && sampleSet.size() < maxSampleSize
	                        && text != null) {
	                    List<String> words = newArrayList();

	                    Matcher matcher = WORD_PATTERN.matcher(text);
	                    while (matcher.find()) {
	                        words.add(matcher.group());
	                    }

	                    if (!words.isEmpty()) {
	                        // Take the token in the middle of the page text.
	                        sampleSet.add(words.get(words.size() / 2));
	                    }
	                }
	            }
	        };
	        this.storageEnabled = storageEnabled;
	        this.indexCopierDir = createTemporaryFolder(null);
	    }

	    /**
	     * Resets the sampling state, imports the Wikipedia dump and prepares the
	     * default execution context.
	     */
	    @Override
	    public void beforeSuite() throws Exception {
	        random.setSeed(42);
	        sampleSet.clear();
	        count = 0;

	        importer.importWikipedia(loginWriter());
	        Thread.sleep(10); // allow some time for the indexer to catch up

	        defaultContext = new TestContext();
	    }

	    /**
	     * Shuts down the executor and removes the index copier directory. The
	     * directory is removed even when waiting for the executor fails.
	     */
	    @Override
	    protected void afterSuite() throws Exception {
	        try {
	            executorService.shutdown();
	            executorService.awaitTermination(1, TimeUnit.MINUTES);
	        } finally {
	            FileUtils.deleteDirectory(indexCopierDir);
	        }
	    }

	    @Override
	    protected TestContext prepareThreadExecutionContext() {
	        return new TestContext();
	    }

	    @Override
	    protected void runTest() throws Exception {
	        runTest(defaultContext);
	    }

	    /**
	     * Runs one full-text query per word of the given context and reads the
	     * {@code text} and {@code title} properties of at most
	     * {@link #maxRowsToFetch} result rows per query.
	     *
	     * @param ec the context supplying the session and the query words
	     */
	    @SuppressWarnings("deprecation")
	    @Override
	    protected void runTest(TestContext ec) throws Exception {
	        QueryManager qm = ec.session.getWorkspace().getQueryManager();
	        // TODO verify why "order by jcr:score()" accounts for what looks
	        // like > 20% of the perf lost in Collections.sort
	        for (String word : ec.words) {
	            Query q = qm.createQuery("//*[jcr:contains(@text, '" + word + "')] ", Query.XPATH);
	            QueryResult r = q.execute();
	            RowIterator it = r.getRows();
	            for (int rows = 0; it.hasNext() && rows < maxRowsToFetch; rows++) {
	                Node n = it.nextRow().getNode();
	                ec.hash += n.getProperty("text").getString().hashCode();
	                ec.hash += n.getProperty("title").getString().hashCode();
	            }
	        }
	    }

	    /** Per-execution state: a session, the words to query for and a result accumulator. */
	    class TestContext {
	        final Session session = loginWriter();
	        final String[] words = getRandomWords();
	        int hash = 0; // summary variable to prevent JIT compiler tricks
	    }

	    /**
	     * Draws 100 words (with repetition) from the sample set using the
	     * seeded random generator.
	     *
	     * @return the words to query for
	     * @throws IllegalStateException if no sample words have been collected
	     */
	    private String[] getRandomWords() {
	        List<String> samples = newArrayList(sampleSet);
	        if (samples.isEmpty()) {
	            // Without this guard Random.nextInt(0) fails with an unhelpful message.
	            throw new IllegalStateException(
	                    "No sample words were collected during the Wikipedia import; cannot pick query terms");
	        }
	        String[] words = new String[100];
	        for (int i = 0; i < words.length; i++) {
	            words[i] = samples.get(random.nextInt(samples.size()));
	        }
	        return words;
	    }

	    /**
	     * For Oak fixtures, sets up a single-node cluster whose repository has the
	     * Lucene index provider, editor provider and initializer registered;
	     * other fixtures are delegated to the superclass.
	     */
	    @Override
	    protected Repository[] createRepository(RepositoryFixture fixture) throws Exception {
	        if (fixture instanceof OakRepositoryFixture) {
	            return ((OakRepositoryFixture) fixture).setUpCluster(1, new JcrCreator() {
	                @Override
	                public Jcr customize(Oak oak) {
	                    LuceneIndexProvider provider = createLuceneIndexProvider();
	                    // The same provider instance is registered in both roles.
	                    oak.with((QueryIndexProvider) provider)
	                            .with((Observer) provider)
	                            .with(new LuceneIndexEditorProvider())
	                            .with(new LuceneInitializerHelper("luceneGlobal", storageEnabled));
	                    return new Jcr(oak);
	                }
	            });
	        }
	        return super.createRepository(fixture);
	    }

	    /**
	     * Creates the Lucene index provider, backed by an {@link IndexCopier}
	     * unless the {@code disableCopyOnRead} system property is set.
	     *
	     * @return the provider to register with Oak
	     */
	    private LuceneIndexProvider createLuceneIndexProvider() {
	        if (!disableCopyOnRead) {
	            try {
	                IndexCopier copier = new IndexCopier(executorService, indexCopierDir, true);
	                return new LuceneIndexProvider(copier);
	            } catch (IOException e) {
	                throw new RuntimeException(e);
	            }
	        }
	        return new LuceneIndexProvider();
	    }

	    /**
	     * Creates a uniquely named temporary directory.
	     *
	     * @param parentFolder directory to create the folder in, or {@code null}
	     *                     for the default temporary-file directory
	     * @return the newly created directory
	     * @throws RuntimeException wrapping an {@link IOException} if the
	     *                          directory cannot be created
	     */
	    private File createTemporaryFolder(File parentFolder){
	        try {
	            File createdFolder = File.createTempFile("oak", "", parentFolder);
	            // createTempFile yields a regular file; replace it with a
	            // directory of the same name and fail loudly if that is not possible.
	            if (!createdFolder.delete()) {
	                throw new IOException("Unable to delete temporary file " + createdFolder);
	            }
	            if (!createdFolder.mkdir()) {
	                throw new IOException("Unable to create temporary folder " + createdFolder);
	            }
	            return createdFolder;
	        } catch (IOException e) {
	            throw new RuntimeException(e);
	        }
	    }
	}