blob: 7e20c79782ab5d18b1a87414f5f7b1d2bfdc1f9a [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.search;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Future;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.lucene.util.Constants;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.solr.common.util.Utils;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.update.UpdateHandler;
import org.apache.solr.update.UpdateLog;
import org.apache.solr.update.VersionInfo;
import org.apache.solr.util.TestHarness;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import static org.apache.solr.core.SolrCore.verbose;
import static org.apache.solr.update.processor.DistributingUpdateProcessorFactory.DISTRIB_UPDATE_PARAM;
@LuceneTestCase.AwaitsFix(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 6-Sep-2018
// can fail due to NPE uncaught exception in stress thread, probably because of null core
public class TestStressRecovery extends TestRTGBase {
@Before
public void beforeClass() throws Exception {
randomizeUpdateLogImpl();
initCore("solrconfig-tlog.xml","schema15.xml");
}
@After
public void afterClass() {
deleteCore();
}
// This points to the live model when state is ACTIVE, but a snapshot of the
// past when recovering.
volatile ConcurrentHashMap<Integer,DocInfo> visibleModel;
// This version simulates updates coming from the leader and sometimes being reordered
// and tests the ability to buffer updates and apply them later
@Test
// 12-Jun-2018 @BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 04-May-2018
// commented out on: 24-Dec-2018 @BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 6-Sep-2018
public void testStressRecovery() throws Exception {
assumeFalse("FIXME: This test is horribly slow sometimes on Windows!", Constants.WINDOWS);
final int commitPercent = 5 + random().nextInt(10);
final int softCommitPercent = 30+random().nextInt(75); // what percent of the commits are soft
final int deletePercent = 4+random().nextInt(25);
final int deleteByQueryPercent = random().nextInt(5);
final int ndocs = 5 + (random().nextBoolean() ? random().nextInt(25) : random().nextInt(200));
int nWriteThreads = 2 + random().nextInt(10); // fewer write threads to give recovery thread more of a chance
final int maxConcurrentCommits = nWriteThreads;
// query variables
final int percentRealtimeQuery = 75;
final int percentGetLatestVersions = random().nextInt(4);
final AtomicLong operations = new AtomicLong(atLeast(35)); // number of recovery loops to perform
int nReadThreads = 2 + random().nextInt(10); // fewer read threads to give writers more of a chance
initModel(ndocs);
final AtomicInteger numCommitting = new AtomicInteger();
List<Thread> threads = new ArrayList<>();
final AtomicLong testVersion = new AtomicLong(0);
final UpdateHandler uHandler = h.getCore().getUpdateHandler();
final UpdateLog uLog = uHandler.getUpdateLog();
final VersionInfo vInfo = uLog.getVersionInfo();
final Object stateChangeLock = new Object();
this.visibleModel = model;
final Semaphore[] writePermissions = new Semaphore[nWriteThreads];
for (int i=0; i<nWriteThreads; i++) writePermissions[i] = new Semaphore(Integer.MAX_VALUE, false);
final Semaphore readPermission = new Semaphore(Integer.MAX_VALUE, false);
for (int i=0; i<nWriteThreads; i++) {
final int threadNum = i;
Thread thread = new Thread("WRITER"+i) {
Random rand = new Random(random().nextInt());
Semaphore writePermission = writePermissions[threadNum];
@Override
public void run() {
try {
while (operations.get() > 0) {
writePermission.acquire();
int oper = rand.nextInt(10);
if (oper < commitPercent) {
if (numCommitting.incrementAndGet() <= maxConcurrentCommits) {
Map<Integer,DocInfo> newCommittedModel;
long version;
synchronized(globalLock) {
newCommittedModel = new HashMap<>(model); // take a snapshot
version = snapshotCount++;
}
synchronized (stateChangeLock) {
// These commits won't take affect if we are in recovery mode,
// so change the version to -1 so we won't update our model.
if (uLog.getState() != UpdateLog.State.ACTIVE) version = -1;
if (rand.nextInt(100) < softCommitPercent) {
verbose("softCommit start");
assertU(TestHarness.commit("softCommit","true"));
verbose("softCommit end");
} else {
verbose("hardCommit start");
assertU(commit());
verbose("hardCommit end");
}
}
synchronized(globalLock) {
// install this model snapshot only if it's newer than the current one
// install this model only if we are not in recovery mode.
if (version >= committedModelClock) {
if (VERBOSE) {
verbose("installing new committedModel version="+committedModelClock);
}
committedModel = newCommittedModel;
committedModelClock = version;
}
}
}
numCommitting.decrementAndGet();
continue;
}
int id;
if (rand.nextBoolean()) {
id = rand.nextInt(ndocs);
} else {
id = lastId; // reuse the last ID half of the time to force more race conditions
}
// set the lastId before we actually change it sometimes to try and
// uncover more race conditions between writing and reading
boolean before = rand.nextBoolean();
if (before) {
lastId = id;
}
DocInfo info = model.get(id);
long val = info.val;
long nextVal = Math.abs(val)+1;
// the version we set on the update should determine who wins
// These versions are not derived from the actual leader update handler hand hence this
// test may need to change depending on how we handle version numbers.
long version = testVersion.incrementAndGet();
// yield after getting the next version to increase the odds of updates happening out of order
if (rand.nextBoolean()) Thread.yield();
if (oper < commitPercent + deletePercent) {
verbose("deleting id",id,"val=",nextVal,"version",version);
Long returnedVersion = deleteAndGetVersion(Integer.toString(id), params("_version_",Long.toString(-version), DISTRIB_UPDATE_PARAM,FROM_LEADER));
// TODO: returning versions for these types of updates is redundant
// but if we do return, they had better be equal
if (returnedVersion != null) {
assertEquals(-version, returnedVersion.longValue());
}
// only update model if the version is newer
synchronized (model) {
DocInfo currInfo = model.get(id);
if (Math.abs(version) > Math.abs(currInfo.version)) {
model.put(id, new DocInfo(version, -nextVal));
}
}
verbose("deleting id", id, "val=",nextVal,"version",version,"DONE");
} else if (oper < commitPercent + deletePercent + deleteByQueryPercent) {
verbose("deleteByQuery id",id,"val=",nextVal,"version",version);
Long returnedVersion = deleteByQueryAndGetVersion("id:"+Integer.toString(id), params("_version_",Long.toString(-version), DISTRIB_UPDATE_PARAM,FROM_LEADER));
// TODO: returning versions for these types of updates is redundant
// but if we do return, they had better be equal
if (returnedVersion != null) {
assertEquals(-version, returnedVersion.longValue());
}
// only update model if the version is newer
synchronized (model) {
DocInfo currInfo = model.get(id);
if (Math.abs(version) > Math.abs(currInfo.version)) {
model.put(id, new DocInfo(version, -nextVal));
}
}
verbose("deleteByQuery id", id, "val=",nextVal,"version",version,"DONE");
} else {
verbose("adding id", id, "val=", nextVal,"version",version);
Long returnedVersion = addAndGetVersion(sdoc("id", Integer.toString(id), FIELD, Long.toString(nextVal), "_version_",Long.toString(version)), params(DISTRIB_UPDATE_PARAM,FROM_LEADER));
if (returnedVersion != null) {
assertEquals(version, returnedVersion.longValue());
}
// only update model if the version is newer
synchronized (model) {
DocInfo currInfo = model.get(id);
if (version > currInfo.version) {
model.put(id, new DocInfo(version, nextVal));
}
}
if (VERBOSE) {
verbose("adding id", id, "val=", nextVal,"version",version,"DONE");
}
}
// } // end sync
if (!before) {
lastId = id;
}
}
} catch (Throwable e) {
operations.set(-1L);
throw new RuntimeException(e);
}
}
};
threads.add(thread);
}
for (int i=0; i<nReadThreads; i++) {
Thread thread = new Thread("READER"+i) {
Random rand = new Random(random().nextInt());
@Override
public void run() {
try {
while (operations.get() > 0) {
// throttle reads (don't completely stop)
readPermission.tryAcquire(10, TimeUnit.MILLISECONDS);
// bias toward a recently changed doc
int id = rand.nextInt(100) < 25 ? lastId : rand.nextInt(ndocs);
// when indexing, we update the index, then the model
// so when querying, we should first check the model, and then the index
boolean realTime = rand.nextInt(100) < percentRealtimeQuery;
DocInfo info;
if (realTime) {
info = visibleModel.get(id);
} else {
synchronized(globalLock) {
info = committedModel.get(id);
}
}
if (VERBOSE) {
verbose("querying id", id);
}
SolrQueryRequest sreq;
if (realTime) {
sreq = req("wt","json", "qt","/get", "ids",Integer.toString(id));
} else {
sreq = req("wt","json", "q","id:"+Integer.toString(id), "omitHeader","true");
}
String response = h.query(sreq);
@SuppressWarnings({"rawtypes"})
Map rsp = (Map) Utils.fromJSONString(response);
@SuppressWarnings({"rawtypes"})
List doclist = (List)(((Map)rsp.get("response")).get("docs"));
if (doclist.size() == 0) {
// there's no info we can get back with a delete, so not much we can check without further synchronization
} else {
assertEquals(1, doclist.size());
long foundVal = (Long)(((Map)doclist.get(0)).get(FIELD));
long foundVer = (Long)(((Map)doclist.get(0)).get("_version_"));
if (foundVer < Math.abs(info.version)
|| (foundVer == info.version && foundVal != info.val) ) { // if the version matches, the val must
verbose("ERROR, id=", id, "found=",response,"model",info);
assertTrue(false);
}
}
}
if (rand.nextInt(100) < percentGetLatestVersions) {
getLatestVersions();
// TODO: some sort of validation that the latest version is >= to the latest version we added?
}
} catch (Throwable e) {
operations.set(-1L);
throw new RuntimeException(e);
}
}
};
threads.add(thread);
}
for (Thread thread : threads) {
thread.start();
}
int bufferedAddsApplied = 0;
do {
assertTrue(uLog.getState() == UpdateLog.State.ACTIVE);
// before we start buffering updates, we want to point
// visibleModel away from the live model.
visibleModel = new ConcurrentHashMap<>(model);
synchronized (stateChangeLock) {
uLog.bufferUpdates();
}
assertTrue(uLog.getState() == UpdateLog.State.BUFFERING);
// sometimes wait for a second to allow time for writers to write something
if (random().nextBoolean()) Thread.sleep(random().nextInt(10)+1);
Future<UpdateLog.RecoveryInfo> recoveryInfoF = uLog.applyBufferedUpdates();
if (recoveryInfoF != null) {
UpdateLog.RecoveryInfo recInfo = null;
int writeThreadNumber = 0;
int cnt = 5000;
while (recInfo == null) {
try {
// wait a short period of time for recovery to complete (and to give a chance for more writers to concurrently add docs)
cnt--;
recInfo = recoveryInfoF.get(random().nextInt(100/nWriteThreads), TimeUnit.MILLISECONDS);
} catch (TimeoutException e) {
// idle one more write thread
verbose("Operation",operations.get(),"Draining permits for write thread",writeThreadNumber);
writePermissions[writeThreadNumber++].drainPermits();
if (writeThreadNumber >= nWriteThreads) {
// if we hit the end, back up and give a few write permits
writeThreadNumber--;
writePermissions[writeThreadNumber].release(random().nextInt(2) + 1);
}
// throttle readers so they don't steal too much CPU from the recovery thread
readPermission.drainPermits();
}
if (cnt == 0) {
break;
}
}
if (recInfo != null) {
bufferedAddsApplied += recInfo.adds;
}
}
// put all writers back at full blast
for (Semaphore writePerm : writePermissions) {
// I don't think semaphores check for overflow, so we need to check mow many remain
int neededPermits = Integer.MAX_VALUE - writePerm.availablePermits();
if (neededPermits > 0) writePerm.release( neededPermits );
}
// put back readers at full blast and point back to live model
visibleModel = model;
int neededPermits = Integer.MAX_VALUE - readPermission.availablePermits();
if (neededPermits > 0) readPermission.release( neededPermits );
verbose("ROUND=",operations.get());
} while (operations.decrementAndGet() > 0);
verbose("bufferedAddsApplied=",bufferedAddsApplied);
for (Thread thread : threads) {
thread.join();
}
}
}