GEODE-7861: Improve error reporting in GMSJoinLeave.join() (#5839)
* GEODE-7861: Improve error reporting in GMSJoinLeave.join()
* Fix LocatorDUnitTest.testNoLocator
* Changes after the code review
* Fix typo
diff --git a/geode-core/src/distributedTest/java/org/apache/geode/distributed/LocatorDUnitTest.java b/geode-core/src/distributedTest/java/org/apache/geode/distributed/LocatorDUnitTest.java
index 55cf064..6eff0ba 100644
--- a/geode-core/src/distributedTest/java/org/apache/geode/distributed/LocatorDUnitTest.java
+++ b/geode-core/src/distributedTest/java/org/apache/geode/distributed/LocatorDUnitTest.java
@@ -1002,7 +1002,8 @@
// I guess it can throw this too...
} catch (GemFireConfigException ex) {
- assertThat(ex.getCause().getMessage().contains("Locator does not exist")).isTrue();
+ assertThat(ex.getCause().getMessage().contains("Could not contact any of the locators"))
+ .isTrue();
}
}
diff --git a/geode-membership/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java b/geode-membership/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
index aa53bdf..247a0e6 100644
--- a/geode-membership/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
+++ b/geode-membership/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
@@ -22,12 +22,14 @@
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
+import static org.mockito.ArgumentMatchers.any;
import static org.mockito.ArgumentMatchers.anyBoolean;
import static org.mockito.ArgumentMatchers.anyInt;
import static org.mockito.ArgumentMatchers.eq;
import static org.mockito.Matchers.isA;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.never;
+import static org.mockito.Mockito.spy;
import static org.mockito.Mockito.timeout;
import static org.mockito.Mockito.times;
import static org.mockito.Mockito.verify;
@@ -115,11 +117,18 @@
public void initMocks(boolean enableNetworkPartition, boolean useTestGMSJoinLeave)
throws Exception {
+ String locator = "localhost[12345]";
+ initMocks(enableNetworkPartition, useTestGMSJoinLeave, locator, locator);
+ }
+
+ public void initMocks(boolean enableNetworkPartition, boolean useTestGMSJoinLeave,
+ String locators, String startLocator)
+ throws Exception {
mockConfig = mock(MembershipConfig.class);
when(mockConfig.isNetworkPartitionDetectionEnabled()).thenReturn(enableNetworkPartition);
when(mockConfig.getSecurityUDPDHAlgo()).thenReturn("");
- when(mockConfig.getStartLocator()).thenReturn("localhost[12345]");
- when(mockConfig.getLocators()).thenReturn("localhost[12345]");
+ when(mockConfig.getStartLocator()).thenReturn(startLocator);
+ when(mockConfig.getLocators()).thenReturn(locators);
when(mockConfig.getMcastPort()).thenReturn(0);
when(mockConfig.getMemberTimeout()).thenReturn(2000L);
@@ -1423,14 +1432,7 @@
@Test
public void testCoordinatorFindRequestSuccess() throws Exception {
initMocks(false);
- HashSet<MemberIdentifier> registrants = new HashSet<>();
- registrants.add(mockMembers[0]);
- FindCoordinatorResponse fcr = new FindCoordinatorResponse(mockMembers[0], mockMembers[0], false,
- null, registrants, false, true, null);
-
- when(locatorClient.requestToServer(isA(HostAndPort.class),
- isA(FindCoordinatorRequest.class), anyInt(), anyBoolean()))
- .thenReturn(fcr);
+ mockRequestToServer(isA(HostAndPort.class));
boolean foundCoordinator = gmsJoinLeave.findCoordinator();
assertTrue(gmsJoinLeave.searchState.toString(), foundCoordinator);
@@ -1441,24 +1443,82 @@
public void testCoordinatorFindRequestFailure() throws Exception {
try {
initMocks(false);
- HashSet<MemberIdentifier> registrants = new HashSet<>();
- registrants.add(mockMembers[0]);
- FindCoordinatorResponse fcr = new FindCoordinatorResponse(mockMembers[0], mockMembers[0],
- false, null, registrants, false, true, null);
+ mockRequestToServer(eq(new HostAndPort("localhost", 12346)));
GMSMembershipView view = createView();
JoinResponseMessage jrm = new JoinResponseMessage(mockMembers[0], view, 0);
gmsJoinLeave.setJoinResponseMessage(jrm);
- when(locatorClient.requestToServer(eq(new HostAndPort("localhost", 12346)),
- isA(FindCoordinatorRequest.class), anyInt(), anyBoolean()))
- .thenReturn(fcr);
-
- assertFalse("Should not be able to join ", gmsJoinLeave.join());
+ assertThatThrownBy(gmsJoinLeave::join)
+ .isInstanceOf(MembershipConfigurationException.class);
} finally {
-
}
}
+ @Test
+ public void testJoinFailureWhenSleepInterrupted() throws Exception {
+ initMocks(false);
+ mockRequestToServer(isA(HostAndPort.class));
+
+ when(mockConfig.getMemberTimeout()).thenReturn(100L);
+ when(mockConfig.getJoinTimeout()).thenReturn(1000L);
+
+ GMSJoinLeave spyGmsJoinLeave = spy(gmsJoinLeave);
+ when(spyGmsJoinLeave.pauseIfThereIsNoCoordinator(-1, GMSJoinLeave.JOIN_RETRY_SLEEP))
+ .thenThrow(new InterruptedException());
+
+ assertThatThrownBy(spyGmsJoinLeave::join)
+ .isInstanceOf(MembershipConfigurationException.class)
+ .hasMessageContaining("Retry sleep interrupted");
+ }
+
+ @Test
+ public void testJoinFailureWhenTimeout() throws Exception {
+ initMocks(false);
+ mockRequestToServer(isA(HostAndPort.class));
+
+ assertThatThrownBy(() -> gmsJoinLeave.join())
+ .isInstanceOf(MembershipConfigurationException.class)
+ .hasMessageContaining("Operation timed out");
+ }
+
+ @Test
+ public void testPauseIfThereIsNoCoordinator() throws InterruptedException {
+ locatorClient = mock(TcpClient.class);
+ gmsJoinLeave = new GMSJoinLeave(locatorClient);
+ assertThat(gmsJoinLeave.pauseIfThereIsNoCoordinator(-1, GMSJoinLeave.JOIN_RETRY_SLEEP))
+ .isFalse();
+ assertThat(gmsJoinLeave.pauseIfThereIsNoCoordinator(1, GMSJoinLeave.JOIN_RETRY_SLEEP)).isTrue();
+ }
+
+ @Test
+ public void testJoinFailureWhenNoLocator() throws Exception {
+ final String locator1 = "locator1[12345]";
+ final String locator2 = "locator2[54321]";
+ locatorClient = mock(TcpClient.class);
+
+ initMocks(false, false, locator1 + ',' + locator2, locator1);
+ when(locatorClient.requestToServer(any(), any(), anyInt(), anyBoolean()))
+ .thenThrow(IOException.class);
+
+ assertThatThrownBy(gmsJoinLeave::join)
+ .isInstanceOf(MembershipConfigurationException.class)
+ .hasMessageContaining(
+ "Could not contact any of the locators: [HostAndPort[locator1:12345], HostAndPort[locator2:54321]]")
+ .hasCauseInstanceOf(IOException.class);
+ }
+
+ private void mockRequestToServer(HostAndPort hostAndPort)
+ throws IOException, ClassNotFoundException {
+ HashSet<MemberIdentifier> registrants = new HashSet<>();
+ registrants.add(mockMembers[0]);
+
+ FindCoordinatorResponse fcr = new FindCoordinatorResponse(mockMembers[0], mockMembers[0], false,
+ null, registrants, false, true, null);
+ when(locatorClient.requestToServer(hostAndPort,
+ isA(FindCoordinatorRequest.class), anyInt(), anyBoolean()))
+ .thenReturn(fcr);
+ }
+
private void waitForViewAndFinalCheckInProgress(int viewId) throws InterruptedException {
// wait for the view processing thread to collect and process the requests
int sleeps = 0;
diff --git a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMembership.java b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMembership.java
index c8135a9..3d13e1b 100644
--- a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMembership.java
+++ b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMembership.java
@@ -565,12 +565,7 @@
this.isJoining = true; // added for bug #44373
// connect
- boolean ok = services.getJoinLeave().join();
-
- if (!ok) {
- throw new MembershipConfigurationException("Unable to join the distributed system. "
- + "Operation either timed out, was stopped or Locator does not exist.");
- }
+ services.getJoinLeave().join();
MembershipView<ID> initialView = createGeodeView(services.getJoinLeave().getView());
latestView = new MembershipView<>(initialView, initialView.getViewId());
diff --git a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/interfaces/JoinLeave.java b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/interfaces/JoinLeave.java
index 7228162..1880a26 100755
--- a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/interfaces/JoinLeave.java
+++ b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/interfaces/JoinLeave.java
@@ -16,6 +16,7 @@
import org.apache.geode.distributed.internal.membership.api.MemberIdentifier;
import org.apache.geode.distributed.internal.membership.api.MemberStartupException;
+import org.apache.geode.distributed.internal.membership.api.MembershipConfigurationException;
import org.apache.geode.distributed.internal.membership.gms.GMSMembershipView;
/**
@@ -26,10 +27,15 @@
public interface JoinLeave<ID extends MemberIdentifier> extends Service<ID> {
/**
- * joins the distributed system and returns true if successful, false if not. Throws
- * MemberStartupException and MemberConfigurationException
+ * joins the distributed system.
+ *
+ * @throws MemberStartupException if there was a problem joining the cluster after membership
+ * configuration has
+ * completed.
+ * @throws MembershipConfigurationException if operation either timed out, was stopped or locator
+ * does not exist.
*/
- boolean join() throws MemberStartupException;
+ void join() throws MemberStartupException;
/**
* leaves the distributed system. Should be invoked before stop()
diff --git a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
index 8a75c81..281e328 100644
--- a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
+++ b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
@@ -273,11 +273,13 @@
int lastFindCoordinatorInViewId = -1000;
final Set<FindCoordinatorResponse<ID>> responses = new HashSet<>();
public int responsesExpected;
+ Exception lastLocatorException;
void cleanup() {
alreadyTried.clear();
possibleCoordinator = null;
view = null;
+ lastLocatorException = null;
synchronized (responses) {
responses.clear();
}
@@ -315,14 +317,14 @@
* @return true if successful, false if not
*/
@Override
- public boolean join() throws MemberStartupException {
+ public void join() throws MemberStartupException {
try {
if (Boolean.getBoolean(BYPASS_DISCOVERY_PROPERTY)) {
synchronized (viewInstallationLock) {
becomeCoordinator();
}
- return true;
+ return;
}
SearchState<ID> state = searchState;
@@ -355,11 +357,11 @@
synchronized (viewInstallationLock) {
becomeCoordinator();
}
- return true;
+ return;
}
} else {
if (attemptToJoin()) {
- return true;
+ return;
}
if (this.isStopping) {
break;
@@ -383,40 +385,45 @@
break;
}
}
- try {
- if (found && !state.hasContactedAJoinedLocator) {
- // if locators are restarting they may be handing out IDs from a stale view that
- // we should go through quickly. Otherwise we should sleep a bit to let failure
- // detection select a new coordinator
- if (state.possibleCoordinator.getVmViewId() < 0) {
- logger.debug("sleeping for {} before making another attempt to find the coordinator",
- retrySleep);
- Thread.sleep(retrySleep);
- } else {
+ if (found && !state.hasContactedAJoinedLocator) {
+ try {
+ if (pauseIfThereIsNoCoordinator(state.possibleCoordinator.getVmViewId(), retrySleep)) {
// since we were given a coordinator that couldn't be used we should keep trying
tries = 0;
giveupTime = System.currentTimeMillis() + timeout;
}
+ } catch (InterruptedException e) {
+ Thread.currentThread().interrupt();
+ throw new MembershipConfigurationException(
+ "Retry sleep interrupted. Giving up on joining the distributed system.");
}
- } catch (InterruptedException e) {
- logger.debug("retry sleep interrupted - giving up on joining the distributed system");
- return false;
}
} // for
if (!this.isJoined) {
logger.debug("giving up attempting to join the distributed system after "
+ (System.currentTimeMillis() - startTime) + "ms");
- }
- // to preserve old behavior we need to throw a MemberStartupException if
- // unable to contact any of the locators
- if (!this.isJoined && state.hasContactedAJoinedLocator) {
- throw new MemberStartupException("Unable to join the distributed system in "
- + (System.currentTimeMillis() - startTime) + "ms");
- }
+ // to preserve old behavior we need to throw a MemberStartupException if
+ // unable to contact any of the locators
+ if (state.hasContactedAJoinedLocator) {
+ throw new MemberStartupException("Unable to join the distributed system in "
+ + (System.currentTimeMillis() - startTime) + "ms");
+ }
- return this.isJoined;
+ if (state.locatorsContacted == 0) {
+ throw new MembershipConfigurationException(
+ "Unable to join the distributed system. Could not contact any of the locators: "
+ + locators,
+ state.lastLocatorException);
+ }
+
+ if (System.currentTimeMillis() > giveupTime) {
+ throw new MembershipConfigurationException(
+ "Unable to join the distributed system. Operation timed out");
+ }
+ }
+ return;
} finally {
// notify anyone waiting on the address to be completed
if (this.isJoined) {
@@ -428,6 +435,24 @@
}
}
+ boolean pauseIfThereIsNoCoordinator(int viewId, long retrySleep)
+ throws InterruptedException {
+ // if locators are restarting they may be handing out IDs from a stale view that
+ // we should go through quickly. Otherwise we should sleep a bit to let failure
+ // detection select a new coordinator
+ if (viewId < 0) {
+ // the process hasn't finished joining the cluster.
+ logger.debug("sleeping for {} before making another attempt to find the coordinator",
+ retrySleep);
+ Thread.sleep(retrySleep);
+ } else {
+ // the member has joined the cluster.
+ return true;
+ }
+
+ return false;
+ }
+
/**
* send a join request and wait for a reply. Process the reply. This may throw a
* MemberStartupException or an exception from the authenticator, if present.
@@ -1199,6 +1224,7 @@
} catch (IOException | ClassNotFoundException problem) {
logger.info("Unable to contact locator " + laddr + ": " + problem);
logger.debug("Exception thrown when contacting a locator", problem);
+ state.lastLocatorException = problem;
if (state.locatorsContacted == 0 && System.currentTimeMillis() < giveUpTime) {
try {
Thread.sleep(FIND_LOCATOR_RETRY_SLEEP);