SOLR-3939: Consider a sync attempt from leader to replica that fails due to 404 a success.
SOLR-3940: Rejoining the leader election incorrectly triggers the code path for a fresh cluster start rather than fail over.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1397665 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index cb4852e..62a0fad 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -42,9 +42,7 @@
values of a multiValued field in their original order when highlighting.
(Joel Bernstein via yonik)
-* SOLR-3929
-support configuring IndexWriter max thread count in solrconfig
-
+* SOLR-3929: Support configuring IndexWriter max thread count in solrconfig.
(phunt via Mark Miller)
Optimizations
@@ -73,6 +71,12 @@
* SOLR-3917: Partial State on Schema-Browser UI is not defined for Dynamic
Fields & Types (steffkes)
+
+* SOLR-3939: Consider a sync attempt from leader to replica that fails due
+ to 404 a success. (Mark Miller, Joel Bernstein)
+
+* SOLR-3940: Rejoining the leader election incorrectly triggers the code path
+ for a fresh cluster start rather than fail over. (Mark Miller)
Other Changes
----------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java b/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java
index 7baa465..77417e9 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java
@@ -324,7 +324,7 @@
SolrException.log(log, "Error trying to start recovery", t);
}
- leaderElector.joinElection(this);
+ leaderElector.joinElection(this, true);
}
private boolean shouldIBeLeader(ZkNodeProps leaderProps, SolrCore core) {
diff --git a/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java b/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java
index 4d3a016..07caa55 100644
--- a/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java
+++ b/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java
@@ -18,7 +18,6 @@
*/
import java.io.IOException;
-import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
@@ -43,7 +42,7 @@
* Leader Election process. This class contains the logic by which a
* leader is chosen. First call * {@link #setup(ElectionContext)} to ensure
* the election process is init'd. Next call
- * {@link #joinElection(ElectionContext)} to start the leader election.
+ * {@link #joinElection(ElectionContext, boolean)} to start the leader election.
*
* The implementation follows the classic ZooKeeper recipe of creating an
* ephemeral, sequential node for each candidate and then looking at the set
@@ -203,7 +202,7 @@
*
* @return sequential node number
*/
- public int joinElection(ElectionContext context) throws KeeperException, InterruptedException, IOException {
+ public int joinElection(ElectionContext context, boolean replacement) throws KeeperException, InterruptedException, IOException {
final String shardsElectZkPath = context.electionPath + LeaderElector.ELECTION_NODE;
long sessionId = zkClient.getSolrZooKeeper().getSessionId();
@@ -259,7 +258,7 @@
}
}
int seq = getSeq(leaderSeqPath);
- checkIfIamLeader(seq, context, false);
+ checkIfIamLeader(seq, context, replacement);
return seq;
}
diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
index 0ccab0e..db869ff 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
@@ -191,7 +191,7 @@
ZkController.this.overseer = new Overseer(shardHandler, adminPath, zkStateReader);
ElectionContext context = new OverseerElectionContext(zkClient, overseer, getNodeName());
- overseerElector.joinElection(context);
+ overseerElector.joinElection(context, true);
zkStateReader.createClusterStateWatchersAndUpdate();
// cc.newCmdDistribExecutor();
@@ -422,7 +422,7 @@
this.overseer = new Overseer(shardHandler, adminPath, zkStateReader);
ElectionContext context = new OverseerElectionContext(zkClient, overseer, getNodeName());
overseerElector.setup(context);
- overseerElector.joinElection(context);
+ overseerElector.joinElection(context, false);
zkStateReader.createClusterStateWatchersAndUpdate();
} catch (IOException e) {
@@ -730,7 +730,7 @@
leaderElector.setup(context);
electionContexts.put(coreZkNodeName, context);
- leaderElector.joinElection(context);
+ leaderElector.joinElection(context, false);
}
diff --git a/solr/core/src/java/org/apache/solr/update/PeerSync.java b/solr/core/src/java/org/apache/solr/update/PeerSync.java
index a98917e..0466864 100644
--- a/solr/core/src/java/org/apache/solr/update/PeerSync.java
+++ b/solr/core/src/java/org/apache/solr/update/PeerSync.java
@@ -312,6 +312,11 @@
log.warn(msg() + " got a 503 from " + srsp.getShardAddress() + ", counting as success");
return true;
}
+
+ if (cantReachIsSuccess && sreq.purpose == 1 && srsp.getException() instanceof SolrException && ((SolrException) srsp.getException()).code() == 404) {
+ log.warn(msg() + " got a 404 from " + srsp.getShardAddress() + ", counting as success");
+ return true;
+ }
// TODO: at least log???
// srsp.getException().printStackTrace(System.out);
diff --git a/solr/core/src/test/org/apache/solr/cloud/BasicDistributedZkTest.java b/solr/core/src/test/org/apache/solr/cloud/BasicDistributedZkTest.java
index dcf3963..ab55358 100644
--- a/solr/core/src/test/org/apache/solr/cloud/BasicDistributedZkTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/BasicDistributedZkTest.java
@@ -49,6 +49,7 @@
import org.apache.solr.client.solrj.request.ContentStreamUpdateRequest;
import org.apache.solr.client.solrj.request.CoreAdminRequest;
import org.apache.solr.client.solrj.request.CoreAdminRequest.Create;
+import org.apache.solr.client.solrj.request.CoreAdminRequest.Unload;
import org.apache.solr.client.solrj.request.QueryRequest;
import org.apache.solr.client.solrj.response.CoreAdminResponse;
import org.apache.solr.client.solrj.response.QueryResponse;
@@ -742,10 +743,10 @@
0,
((HttpSolrServer) client).getBaseURL().length()
- DEFAULT_COLLECTION.length() - 1);
- createCollection(oneInstanceCollection2, collectionClients, baseUrl, 1, "slice1");
- createCollection(oneInstanceCollection2, collectionClients, baseUrl, 2, "slice2");
- createCollection(oneInstanceCollection2, collectionClients, baseUrl, 3, "slice2");
- createCollection(oneInstanceCollection2, collectionClients, baseUrl, 4, "slice1");
+ createSolrCore(oneInstanceCollection2, collectionClients, baseUrl, 1, "slice1");
+ createSolrCore(oneInstanceCollection2, collectionClients, baseUrl, 2, "slice2");
+ createSolrCore(oneInstanceCollection2, collectionClients, baseUrl, 3, "slice2");
+ createSolrCore(oneInstanceCollection2, collectionClients, baseUrl, 4, "slice1");
while (pending != null && pending.size() > 0) {
@@ -764,7 +765,7 @@
assertAllActive(oneInstanceCollection2, solrj.getZkStateReader());
- printLayout();
+ //printLayout();
// TODO: enable when we don't falsely get slice1...
// solrj.getZkStateReader().getLeaderUrl(oneInstanceCollection2, "slice1", 30000);
@@ -803,6 +804,27 @@
assertNotNull(slices);
String roles = slices.get("slice1").getReplicasMap().values().iterator().next().getStr(ZkStateReader.ROLES_PROP);
assertEquals("none", roles);
+
+
+ ZkCoreNodeProps props = new ZkCoreNodeProps(solrj.getZkStateReader().getClusterState().getLeader(oneInstanceCollection2, "slice1"));
+
+ // now test that unloading a core gets us a new leader
+ HttpSolrServer server = new HttpSolrServer(baseUrl);
+ Unload unloadCmd = new Unload(true);
+ unloadCmd.setCoreName(props.getCoreName());
+
+ String leader = props.getCoreUrl();
+
+ server.request(unloadCmd);
+
+ int tries = 50;
+ while (leader.equals(zkStateReader.getLeaderUrl(oneInstanceCollection2, "slice1", 10000))) {
+ Thread.sleep(100);
+ if (tries-- == 0) {
+ fail("Leader never changed");
+ }
+ }
+
}
private void testSearchByCollectionName() throws SolrServerException {
@@ -875,10 +897,10 @@
private void createCollection(String collection,
List<SolrServer> collectionClients, String baseUrl, int num) {
- createCollection(collection, collectionClients, baseUrl, num, null);
+ createSolrCore(collection, collectionClients, baseUrl, num, null);
}
- private void createCollection(final String collection,
+ private void createSolrCore(final String collection,
List<SolrServer> collectionClients, final String baseUrl, final int num,
final String shardId) {
Callable call = new Callable() {
diff --git a/solr/core/src/test/org/apache/solr/cloud/LeaderElectionTest.java b/solr/core/src/test/org/apache/solr/cloud/LeaderElectionTest.java
index 1b12c65..a4f69fc 100644
--- a/solr/core/src/test/org/apache/solr/cloud/LeaderElectionTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/LeaderElectionTest.java
@@ -40,7 +40,6 @@
import org.apache.zookeeper.KeeperException.NoNodeException;
import org.junit.AfterClass;
import org.junit.BeforeClass;
-import org.junit.Ignore;
import org.junit.Test;
@Slow
@@ -114,7 +113,7 @@
elector, "shard1", "collection1", Integer.toString(nodeNumber),
props, zkStateReader);
elector.setup(context);
- seq = elector.joinElection(context);
+ seq = elector.joinElection(context, false);
electionDone = true;
seqToThread.put(seq, this);
}
@@ -175,7 +174,7 @@
ElectionContext context = new ShardLeaderElectionContextBase(elector,
"shard2", "collection1", "dummynode1", props, zkStateReader);
elector.setup(context);
- elector.joinElection(context);
+ elector.joinElection(context, false);
assertEquals("http://127.0.0.1/solr/",
getLeaderUrl("collection1", "shard2"));
}
@@ -188,7 +187,7 @@
ElectionContext firstContext = new ShardLeaderElectionContextBase(first,
"slice1", "collection2", "dummynode1", props, zkStateReader);
first.setup(firstContext);
- first.joinElection(firstContext);
+ first.joinElection(firstContext, false);
Thread.sleep(1000);
assertEquals("original leader was not registered", "http://127.0.0.1/solr/1/", getLeaderUrl("collection2", "slice1"));
@@ -199,7 +198,7 @@
ElectionContext context = new ShardLeaderElectionContextBase(second,
"slice1", "collection2", "dummynode1", props, zkStateReader);
second.setup(context);
- second.joinElection(context);
+ second.joinElection(context, false);
Thread.sleep(1000);
assertEquals("original leader should have stayed leader", "http://127.0.0.1/solr/1/", getLeaderUrl("collection2", "slice1"));
firstContext.cancelElection();
diff --git a/solr/core/src/test/org/apache/solr/cloud/OverseerTest.java b/solr/core/src/test/org/apache/solr/cloud/OverseerTest.java
index 6520c6b..59071c7 100644
--- a/solr/core/src/test/org/apache/solr/cloud/OverseerTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/OverseerTest.java
@@ -139,7 +139,7 @@
ShardLeaderElectionContextBase ctx = new ShardLeaderElectionContextBase(
elector, shardId, collection, nodeName + "_" + coreName, props,
zkStateReader);
- elector.joinElection(ctx);
+ elector.joinElection(ctx, false);
return shardId;
}
Thread.sleep(500);
@@ -876,7 +876,7 @@
new HttpShardHandlerFactory().getShardHandler(), "/admin/cores", reader);
ElectionContext ec = new OverseerElectionContext(zkClient, overseer, address.replaceAll("/", "_"));
overseerElector.setup(ec);
- overseerElector.joinElection(ec);
+ overseerElector.joinElection(ec, false);
return zkClient;
}