// blob: a641e9bf74b7d0891665c758a04107def3574c7c [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.cloud;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.embedded.JettySolrRunner;
import org.apache.solr.client.solrj.impl.BaseHttpSolrClient.RemoteSolrException;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.client.solrj.request.QueryRequest;
import org.apache.solr.client.solrj.request.UpdateRequest;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.client.solrj.response.UpdateResponse;
import org.apache.solr.common.cloud.ClusterStateUtil;
import org.apache.solr.common.cloud.DocCollection;
import org.apache.solr.common.cloud.Replica;
import org.apache.solr.common.cloud.Slice;
import org.apache.solr.util.TestInjection;
import org.hamcrest.MatcherAssert;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import static org.hamcrest.CoreMatchers.anyOf;
import static org.hamcrest.CoreMatchers.is;
/**
 * Cloud tests for how a shard behaves after its leader is hit by a failure injected through
 * {@link TestInjection#leaderTragedy}.
 *
 * <p>Every test method runs against its own freshly configured two-node cluster and its own
 * collection, which is named after the running test.
 */
public class LeaderTragicEventTest extends SolrCloudTestCase {
  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  /** Name of the collection used by the running test; assigned in {@link #setUp()}. */
  private String collection;

  /**
   * Configures a two-node cluster with the "config" configset and makes {@link #collection} the
   * default collection of the cluster client.
   */
  @Before
  public void setUp() throws Exception {
    super.setUp();
    collection = getSaferTestName();
    // TODO Investigate why using same cluster for all tests in this class led to sporadic failure
    configureCluster(2)
        .addConfig("config", TEST_PATH().resolve("configsets").resolve("cloud-minimal").resolve("conf"))
        .configure();
    cluster.getSolrClient().setDefaultCollection(collection);
  }

  /**
   * Deletes the test collection and shuts the cluster down.
   *
   * <p>The shutdown and the superclass tear-down run in {@code finally} blocks so that a failing
   * collection delete (or a failing shutdown) cannot leave the cluster running.
   */
  @After
  public void tearDown() throws Exception {
    try {
      CollectionAdminRequest.deleteCollection(collection).process(cluster.getSolrClient());
    } finally {
      try {
        cluster.shutdown();
      } finally {
        super.tearDown();
      }
    }
  }

  /**
   * With two replicas, corrupting the leader must hand leadership to the other replica, the old
   * leader must come back as a follower, and indexing must keep working.
   */
  @Test
  public void testLeaderFailsOver() throws Exception {
    CollectionAdminRequest
        .createCollection(collection, "config", 1, 2)
        .process(cluster.getSolrClient());
    cluster.waitForActiveCollection(collection, 1, 2);

    UpdateResponse updateResponse = new UpdateRequest().add("id", "1").commit(cluster.getSolrClient(), null);
    assertEquals(0, updateResponse.getStatus());

    Replica oldLeader = corruptLeader(collection);

    waitForState("Now waiting for new replica to become leader", collection, (liveNodes, collectionState) -> {
      Slice slice = collectionState.getSlice("shard1");
      Replica currentLeader = slice.getLeader();
      return slice.getReplicas().size() == 2
          && currentLeader != null
          && !currentLeader.getName().equals(oldLeader.getName());
    });

    // The wait reports a timeout through its return value; fail right here instead of letting a
    // later assertion fail for a less obvious reason.
    boolean allActive = ClusterStateUtil.waitForAllActiveAndLiveReplicas(
        cluster.getSolrClient().getZkStateReader(), collection, 120000);
    if (!allActive) {
      fail("Timed out waiting for all replicas of " + collection + " to become active and live");
    }

    Slice shard = getCollectionState(collection).getSlice("shard1");
    assertNotEquals("Old leader should not be leader again", oldLeader.getNodeName(), shard.getLeader().getNodeName());
    assertEquals("Old leader should be a follower", oldLeader.getNodeName(), getNonLeader(shard).getNodeName());

    // Check that we can continue indexing after this
    updateResponse = new UpdateRequest().add("id", "2").commit(cluster.getSolrClient(), null);
    assertEquals(0, updateResponse.getStatus());

    // Query the old leader's core directly: it must see both successfully indexed documents.
    try (SolrClient followerClient = new HttpSolrClient.Builder(oldLeader.getCoreUrl()).build()) {
      QueryResponse queryResponse = new QueryRequest(new SolrQuery("*:*")).process(followerClient);
      assertEquals(queryResponse.getResults().toString(), 2, queryResponse.getResults().getNumFound());
    }
  }

  /**
   * Enables leader-tragedy injection, sends one update with a commit straight to the current
   * leader core of "shard1" and expects that request to fail.
   *
   * <p>The injection setting is always reset to {@code null} before returning.
   *
   * @param collection name of the collection whose "shard1" leader is targeted
   * @return the replica that was leader when the failing update was sent
   * @throws IOException if the client request fails with an I/O error
   * @throws SolrServerException if the client request fails on the Solr side
   */
  private Replica corruptLeader(String collection) throws IOException, SolrServerException {
    try {
      // NOTE(review): "true:100" presumably means "enabled, 100% of the time" - confirm against TestInjection
      TestInjection.leaderTragedy = "true:100";
      DocCollection dc = getCollectionState(collection);
      Replica oldLeader = dc.getLeader("shard1");
      log.info("Will crash leader : {}", oldLeader);
      // Reuse the replica that was just logged so the request is guaranteed to hit that core.
      try (HttpSolrClient solrClient = new HttpSolrClient.Builder(oldLeader.getCoreUrl()).build()) {
        new UpdateRequest().add("id", "99").commit(solrClient, null);
        fail("Should have injected tragedy");
      } catch (RemoteSolrException e) {
        // solrClient.add would throw RemoteSolrException with code 500
        // or 404 if the bad replica has already been deleted
        MatcherAssert.assertThat(e.code(), anyOf(is(500), is(404)));
      } catch (AlreadyClosedException ignored) {
        // If giving up leadership, might be already closed/closing
      }
      return oldLeader;
    } finally {
      TestInjection.leaderTragedy = null;
    }
  }

  /**
   * Picks a replica of the slice that is not its current leader.
   *
   * @param slice the shard to inspect
   * @return the first replica whose name differs from the leader's name, or {@code null} when the
   *     slice has at most one replica
   */
  private Replica getNonLeader(Slice slice) {
    if (slice.getReplicas().size() <= 1) return null;
    // Resolve the leader name once instead of once per replica inside the predicate.
    String leaderName = slice.getLeader().getName();
    return slice.getReplicas(rep -> !rep.getName().equals(leaderName)).get(0);
  }

  /**
   * When no other replica of the shard is active (either the leader is the only replica, or the
   * second replica's node has been stopped), corrupting the leader must not change leadership.
   */
  @Test
  public void testOtherReplicasAreNotActive() throws Exception {
    // Randomly run with a lone leader (1) or with a second replica that gets stopped below (2).
    int numReplicas = random().nextInt(2) + 1;
    // won't do anything if leader is the only one active replica in the shard
    CollectionAdminRequest
        .createCollection(collection, "config", 1, numReplicas)
        .process(cluster.getSolrClient());
    cluster.waitForActiveCollection(collection, 1, numReplicas);

    JettySolrRunner otherReplicaJetty = null;
    if (numReplicas == 2) {
      Slice shard = getCollectionState(collection).getSlice("shard1");
      otherReplicaJetty = cluster.getReplicaJetty(getNonLeader(shard));
      if (log.isInfoEnabled()) {
        log.info("Stop jetty node : {} state:{}", otherReplicaJetty.getBaseUrl(), getCollectionState(collection));
      }
      otherReplicaJetty.stop();
      cluster.waitForJettyToStop(otherReplicaJetty);
      waitForState("Timeout waiting for replica get down", collection, (liveNodes, collectionState) -> getNonLeader(collectionState.getSlice("shard1")).getState() != Replica.State.ACTIVE);
    }

    Replica oldLeader = corruptLeader(collection);

    // Bring the stopped node back before checking who the leader is.
    if (otherReplicaJetty != null) {
      otherReplicaJetty.start();
      cluster.waitForNode(otherReplicaJetty, 30);
    }

    Replica leader = getCollectionState(collection).getSlice("shard1").getLeader();
    // Expected value first so a failure report reads correctly.
    assertEquals("Leader should not have changed", oldLeader.getName(), leader.getName());
  }
}