Add option to override the FatClient timeout for Bootstrapping nodes

Patch by Raymond Huffman; reviewed by brandonwilliams and dcapwell for
CASSANDRA-15439
diff --git a/CHANGES.txt b/CHANGES.txt
index a506c9e..2d56a56 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,4 +1,5 @@
 4.0.14
+ * Add timeout specifically for bootstrapping nodes (CASSANDRA-15439)
  * Bring Redhat packge dirs/ownership/perms in line with Debian package (CASSANDRA-19565)
 
 
diff --git a/conf/jvm-server.options b/conf/jvm-server.options
index e89cf73..d529a2b 100644
--- a/conf/jvm-server.options
+++ b/conf/jvm-server.options
@@ -74,6 +74,10 @@
 # before joining the ring.
 #-Dcassandra.ring_delay_ms=ms
 
+# Allows overriding the timeout after which an unresponsive bootstrapping node is considered failed
+# and is removed from gossip state and bootstrapTokens. (Default: cassandra.ring_delay_ms * 2; -1 disables the timeout)
+#-Dcassandra.failed_bootstrap_timeout_ms=ms
+
 # Set the SSL port for encrypted communication. (Default: 7001)
 #-Dcassandra.ssl_storage_port=port
 
diff --git a/src/java/org/apache/cassandra/config/CassandraRelevantProperties.java b/src/java/org/apache/cassandra/config/CassandraRelevantProperties.java
index 4de333e..0377bc4 100644
--- a/src/java/org/apache/cassandra/config/CassandraRelevantProperties.java
+++ b/src/java/org/apache/cassandra/config/CassandraRelevantProperties.java
@@ -151,6 +151,10 @@
     /** mx4jport */
     MX4JPORT ("mx4jport"),
 
+    RING_DELAY("cassandra.ring_delay_ms"),
+
+    FAILED_BOOTSTRAP_TIMEOUT("cassandra.failed_bootstrap_timeout_ms"),
+
     /**
      * When bootstraping we wait for all schema versions found in gossip to be seen, and if not seen in time we fail
      * the bootstrap; this property will avoid failing and allow bootstrap to continue if set to true.
diff --git a/src/java/org/apache/cassandra/gms/Gossiper.java b/src/java/org/apache/cassandra/gms/Gossiper.java
index 63ff515..009e6b2 100644
--- a/src/java/org/apache/cassandra/gms/Gossiper.java
+++ b/src/java/org/apache/cassandra/gms/Gossiper.java
@@ -107,6 +107,7 @@
         SILENT_SHUTDOWN_STATES.add(VersionedValue.STATUS_BOOTSTRAPPING);
         SILENT_SHUTDOWN_STATES.add(VersionedValue.STATUS_BOOTSTRAPPING_REPLACE);
     }
+
     private static final List<String> ADMINISTRATIVELY_INACTIVE_STATES = Arrays.asList(VersionedValue.HIBERNATE,
                                                                                        VersionedValue.REMOVED_TOKEN,
                                                                                        VersionedValue.STATUS_LEFT);
@@ -126,7 +127,10 @@
 
     // Maximimum difference between generation value and local time we are willing to accept about a peer
     static final int MAX_GENERATION_DIFFERENCE = 86400 * 365;
-    private final long fatClientTimeout;
+
+    // half of QUARANTINE_DELAY, to ensure justRemovedEndpoints has enough leeway to prevent re-gossip
+    private static final long FAT_CLIENT_TIMEOUT = (QUARANTINE_DELAY / 2);
+    private static final long FAILED_BOOTSTRAP_TIMEOUT = getFailedBootstrapTimeout();
     private final Random random = new Random();
 
     /* subscribers for interest in EndpointState change */
@@ -254,6 +258,25 @@
         return 259200 * 1000; // 3 days
     }
 
+    /**
+     * Resolves the failed-bootstrap timeout (ms) from cassandra.failed_bootstrap_timeout_ms.
+     * Defaults to twice FAT_CLIENT_TIMEOUT when unset; a negative value (e.g. -1) disables expiry.
+     */
+    private static long getFailedBootstrapTimeout()
+    {
+        String override = CassandraRelevantProperties.FAILED_BOOTSTRAP_TIMEOUT.getString();
+        if (override == null)
+            return FAT_CLIENT_TIMEOUT * 2;
+
+        // A malformed value fails fast here with NumberFormatException, as before.
+        long parsed = Long.parseLong(override.trim());
+        // A negative timeout would make the elapsed-time check pass for every gossip-only
+        // bootstrapping node, so treat any negative value like the -1 sentinel: never time out.
+        long timeoutMs = parsed < 0 ? Long.MAX_VALUE : parsed;
+        logger.info("Overriding FAILED_BOOTSTRAP_TIMEOUT to {}ms", timeoutMs);
+        return timeoutMs;
+    }
+
     private static boolean isInGossipStage()
     {
         return ((JMXEnabledSingleThreadExecutor) Stage.GOSSIP.executor()).isExecutedBy(Thread.currentThread());
@@ -344,8 +367,6 @@
     @VisibleForTesting
     public Gossiper(boolean registerJmx)
     {
-        // half of QUARATINE_DELAY, to ensure justRemovedEndpoints has enough leeway to prevent re-gossip
-        fatClientTimeout = (QUARANTINE_DELAY / 2);
         /* register with the Failure Detector for receiving Failure detector events */
         FailureDetector.instance.registerFailureDetectionEventListener(this);
 
@@ -1048,6 +1069,7 @@
             {
                 // check if this is a fat client. fat clients are removed automatically from
                 // gossip after FatClientTimeout.  Do not remove dead states here.
+                long fatClientTimeout = getFatClientTimeoutForEndpoint(epState);
                 if (isGossipOnlyMember(endpoint)
                     && !justRemovedEndpoints.containsKey(endpoint)
                     && TimeUnit.NANOSECONDS.toMillis(nowNano - epState.getUpdateTimestamp()) > fatClientTimeout)
@@ -1095,6 +1117,24 @@
         }
     }
 
+    // Picks the gossip-only expiry for an endpoint: bootstrapping nodes get their own timeout.
+    private static long getFatClientTimeoutForEndpoint(EndpointState epState)
+    {
+        boolean bootstrapping = isBootstrappingState(epState);
+        return bootstrapping ? FAILED_BOOTSTRAP_TIMEOUT : FAT_CLIENT_TIMEOUT;
+    }
+
+    /**
+     * Reports whether the endpoint's gossip status is one of the bootstrapping states.
+     */
+    private static boolean isBootstrappingState(EndpointState epState)
+    {
+        String gossipStatus = getGossipStatus(epState);
+        // an empty status is reported as "not bootstrapping" without consulting the set
+        return !gossipStatus.isEmpty()
+               && VersionedValue.BOOTSTRAPPING_STATUS.contains(gossipStatus);
+    }
+
     protected long getExpireTimeForEndpoint(InetAddressAndPort endpoint)
     {
         /* default expireTime is aVeryLongTime */
diff --git a/src/java/org/apache/cassandra/gms/VersionedValue.java b/src/java/org/apache/cassandra/gms/VersionedValue.java
index 880cb98..f7b7c18 100644
--- a/src/java/org/apache/cassandra/gms/VersionedValue.java
+++ b/src/java/org/apache/cassandra/gms/VersionedValue.java
@@ -27,6 +27,7 @@
 import static java.nio.charset.StandardCharsets.ISO_8859_1;
 
 import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.ImmutableSet;
 import com.google.common.collect.Iterables;
 
 import org.apache.cassandra.db.TypeSizes;
@@ -83,6 +84,7 @@
 
     // values for ApplicationState.REMOVAL_COORDINATOR
     public final static String REMOVAL_COORDINATOR = "REMOVER";
+    public final static Set<String> BOOTSTRAPPING_STATUS = ImmutableSet.of(STATUS_BOOTSTRAPPING, STATUS_BOOTSTRAPPING_REPLACE);
 
     public final int version;
     public final String value;