blob: dfd76158025c631b58e00f58073bede7ab1c2729 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.cloud;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.common.cloud.SolrZkClient;
import org.apache.zookeeper.KeeperException;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Restarts the nodes that were given the {@code overseer} role one at a time, several times over,
 * and checks after every stop and every start that the overseer leader reported by
 * {@link OverseerCollectionConfigSetProcessor#getLeaderNode} is one of those designated nodes.
 */
public class RollingRestartTest extends AbstractFullDistribZkTestBase {
  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  /** Overall cap, in nanoseconds, on a single wait for a designate to become leader. */
  private static final long MAX_WAIT_TIME = TimeUnit.NANOSECONDS.convert(300, TimeUnit.SECONDS);

  /** Rolling window, in nanoseconds, that is restarted every time a new leader is observed. */
  private static final long LEADER_CHANGE_WINDOW_NANOS = TimeUnit.NANOSECONDS.convert(60, TimeUnit.SECONDS);

  /** Pause, in milliseconds, between polls while the current leader is not a designate. */
  private static final long POLL_INTERVAL_MS = 500;

  /** Pause, in milliseconds, after the first sighting of a designate leader before re-checking it. */
  private static final long STABLE_CHECK_DELAY_MS = 2000;

  public RollingRestartTest() {
    sliceCount = 2;
    fixShardCount(TEST_NIGHTLY ? 16 : 2);
  }

  @Override
  public void distribSetUp() throws Exception {
    super.distribSetUp();
    useFactory("solr.StandardDirectoryFactory");
  }

  @Test
  //commented 2-Aug-2018 @LuceneTestCase.BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 2018-06-18
  public void test() throws Exception {
    waitForRecoveriesToFinish(false);
    restartWithRolesTest();
    waitForRecoveriesToFinish(false);
  }

  /**
   * Assigns the {@code overseer} role to randomly chosen nodes (the same node may be picked more
   * than once), waits for one of them to become leader, then performs a random number of rolling
   * restarts over the chosen nodes. Fails as soon as no designate is leader after a stop (when at
   * least one designate is still live) or after a start.
   *
   * @throws Exception if any cluster, ZooKeeper or Jetty operation fails
   */
  public void restartWithRolesTest() throws Exception {
    String leader = OverseerCollectionConfigSetProcessor.getLeaderNode(zkClient());
    assertNotNull(leader);
    log.info("Current overseer leader = {}", leader);

    zkClient().printLayoutToStdOut();

    int numDesignateOverseers = TEST_NIGHTLY ? 16 : 2;
    numDesignateOverseers = Math.max(getShardCount(), numDesignateOverseers);
    List<String> designates = new ArrayList<>();
    List<CloudJettyRunner> designateJettys = new ArrayList<>();
    for (int i = 0; i < numDesignateOverseers; i++) {
      int n = random().nextInt(getShardCount());
      String nodeName = cloudJettys.get(n).nodeName;
      log.info("Chose {} as overseer designate", nodeName);
      CollectionAdminRequest.addRole(nodeName,"overseer").process(cloudClient);
      designates.add(nodeName);
      designateJettys.add(cloudJettys.get(n));
    }

    waitUntilOverseerDesignateIsLeader(zkClient(), designates, MAX_WAIT_TIME);

    zkClient().printLayoutToStdOut();

    boolean sawLiveDesignate = false;
    int numRestarts = 1 + random().nextInt(TEST_NIGHTLY ? 12 : 2);
    for (int i = 0; i < numRestarts; i++) {
      log.info("Rolling restart #{}", i + 1); // nowarn
      for (CloudJettyRunner cloudJetty : designateJettys) {
        log.info("Restarting {}", cloudJetty);
        chaosMonkey.stopJetty(cloudJetty);

        // With this node down, a designate can only be leader if another designate is still live.
        if (isAnyDesignateLive(designates)) {
          sawLiveDesignate = true;
          boolean success = waitUntilOverseerDesignateIsLeader(zkClient(), designates, MAX_WAIT_TIME);
          if (!success) {
            leader = OverseerCollectionConfigSetProcessor.getLeaderNode(zkClient());
            if (leader == null) {
              log.error("NOOVERSEER election queue is : {}"
                  , OverseerCollectionConfigSetProcessor.getSortedElectionNodes(zkClient(),
                      "/overseer_elect/election"));
            }
            fail("No overseer designate as leader found after restart #" + (i + 1) + ": " + leader);
          }
        }

        cloudJetty.jetty.start();

        boolean success = waitUntilOverseerDesignateIsLeader(zkClient(), designates, MAX_WAIT_TIME);
        if (!success) {
          leader = OverseerCollectionConfigSetProcessor.getLeaderNode(zkClient());
          if (leader == null) {
            log.error("NOOVERSEER election queue is :{}"
                , OverseerCollectionConfigSetProcessor.getSortedElectionNodes(zkClient(),
                    "/overseer_elect/election"));
          }
          fail("No overseer leader found after restart #" + (i + 1) + ": " + leader);
        }

        // Accumulate instead of overwriting: the final assertion is about whether a live designate
        // was EVER observed, so an earlier positive sighting must not be discarded.
        sawLiveDesignate |= isAnyDesignateLive(designates);
      }
    }

    assertTrue("Test may not be working if we never saw a live designate", sawLiveDesignate);

    leader = OverseerCollectionConfigSetProcessor.getLeaderNode(zkClient());
    assertNotNull(leader);
    log.info("Current overseer leader (after restart) = {}", leader);

    zkClient().printLayoutToStdOut();
  }

  /** Returns the ZooKeeper client of the test's cloud client, looked up afresh on every call. */
  private SolrZkClient zkClient() {
    return cloudClient.getZkStateReader().getZkClient();
  }

  /**
   * Refreshes the live-node list and reports whether at least one of {@code designates} is in it.
   */
  private boolean isAnyDesignateLive(List<String> designates) throws Exception {
    cloudClient.getZkStateReader().updateLiveNodes();
    return !CollectionUtils.intersection(
        cloudClient.getZkStateReader().getClusterState().getLiveNodes(), designates).isEmpty();
  }

  /**
   * Polls the overseer leader until it is one of {@code overseerDesignates}.
   *
   * <p>The first time a designate is seen as leader the method pauses for
   * {@link #STABLE_CHECK_DELAY_MS} and looks again; it returns {@code true} on the next poll that
   * finds a designate as leader. Polling is bounded by two deadlines: a rolling window of
   * {@link #LEADER_CHANGE_WINDOW_NANOS} that is restarted whenever a different non-null leader is
   * observed, and the overall {@code timeoutInNanos}.
   *
   * @param testZkClient ZooKeeper client used to read the current leader
   * @param overseerDesignates node names that are acceptable as leader
   * @param timeoutInNanos overall maximum wait, in nanoseconds
   * @return {@code true} if a designate was confirmed as leader, {@code false} if either deadline
   *     passed first
   * @throws KeeperException if reading the leader fails
   * @throws InterruptedException if interrupted while sleeping or reading the leader
   */
  static boolean waitUntilOverseerDesignateIsLeader(SolrZkClient testZkClient, List<String> overseerDesignates, long timeoutInNanos) throws KeeperException, InterruptedException {
    final long start = System.nanoTime();
    final long maxTimeout = start + timeoutInNanos; // the maximum amount of time we're willing to wait to see the designate as leader
    long timeout = start + LEADER_CHANGE_WINDOW_NANOS;
    boolean firstTime = true;
    String oldleader = null;
    // nanoTime values may only be compared through their difference (see System.nanoTime javadoc).
    while (System.nanoTime() - timeout < 0 && System.nanoTime() - maxTimeout < 0) {
      String newLeader = OverseerCollectionConfigSetProcessor.getLeaderNode(testZkClient);
      if (newLeader != null && !newLeader.equals(oldleader)) {
        // the leaders have changed, let's move the timeout further
        timeout = System.nanoTime() + LEADER_CHANGE_WINDOW_NANOS;
        log.info("oldLeader={} newLeader={} - Advancing timeout to: {}", oldleader, newLeader, timeout);
        oldleader = newLeader;
      }
      if (!overseerDesignates.contains(newLeader)) {
        Thread.sleep(POLL_INTERVAL_MS);
      } else if (firstTime) {
        // Give the election a moment to settle, then confirm on a later poll.
        firstTime = false;
        Thread.sleep(STABLE_CHECK_DELAY_MS);
      } else {
        return true;
      }
    }
    if (System.nanoTime() - maxTimeout >= 0) {
      log.error("Max wait time exceeded");
    } else {
      log.error("No overseer designate became leader before the leader-change window expired; last observed leader={}", oldleader);
    }
    return false;
  }
}