Fix flaky test_pending_range

patch by Andrés de la Peña; reviewed by Berenguer Blasi for CASSANDRA-16614

Co-authored-by: Andrés de la Peña <a.penya.garcia@gmail.com>
Co-authored-by: Bereng <berenguerblasi@gmail.com>

commit: 2032cb8503d9a3e90822de72458a09dd07d30b7e [log] [tgz]
author: Andrés de la Peña <a.penya.garcia@gmail.com> Wed Apr 21 16:53:39 2021 +0100
committer: Andrés de la Peña <a.penya.garcia@gmail.com> Mon Apr 26 12:22:49 2021 +0100
tree: 4324f3bd3b19ee6648475dcd27ea1917cbfb1b52
parent: a082ee1cac951e0d41503cae54454e81198d37ef [diff]
diff --git a/README.md b/README.md
index b7efc05..efefb21 100644
--- a/README.md
+++ b/README.md

@@ -78,6 +78,19 @@
 The tests will use this directory by default, avoiding the need for any
 environment variable (that still will have precedence if given though).
 
+To run a specific test file, class or individual test, you only have to 
+pass its path as an argument:
+
+    pytest --cassandra-dir=~/path/to/cassandra pending_range_test.py
+    pytest --cassandra-dir=~/path/to/cassandra pending_range_test.py::TestPendingRangeMovements
+    pytest --cassandra-dir=~/path/to/cassandra pending_range_test.py::TestPendingRangeMovements::test_pending_range
+    
+When adding a new test or modifying an existing one, it's always a good idea to
+run it several times to make sure it is stable. This can easily be done with the
+``--count`` option of pytest-repeat. For example, to run a test file 10 times:
+
+    pytest --count=10 --cassandra-dir=~/path/to/cassandra pending_range_test.py
+
 Existing tests are probably the best place to start to look at how to write
 tests.
 

diff --git a/pending_range_test.py b/pending_range_test.py
index 6371312..e643ad7 100644
--- a/pending_range_test.py
+++ b/pending_range_test.py

@@ -1,6 +1,7 @@
 import logging
 import pytest
 import re
+import threading
 
 from cassandra.query import SimpleStatement
 
@@ -23,7 +24,8 @@
             cluster.set_log_level('DEBUG')
 
         # Create 5 node cluster
-        cluster.populate(5).start()
+        ring_delay_ms = 3_600_000  # 1 hour
+        cluster.populate(5).start(jvm_args=['-Dcassandra.ring_delay_ms={}'.format(ring_delay_ms)])
         node1, node2 = cluster.nodelist()[0:2]
 
         # Set up RF=3 keyspace
@@ -46,27 +48,30 @@
 
         mark = node1.mark_log()
 
-        # Move a node
-        node1.nodetool('move {}'.format(token))
+        # Move a node without waiting for the response of nodetool, so we don't have to wait for ring_delay
+        threading.Thread(target=(lambda: node1.nodetool('move {}'.format(token)))).start()
 
         # Watch the log so we know when the node is moving
         node1.watch_log_for('Moving .* to {}'.format(token), timeout=10, from_mark=mark)
-        node1.watch_log_for('Sleeping 30000 ms before start streaming/fetching ranges', timeout=10, from_mark=mark)
+        node1.watch_log_for('Sleeping {} ms before start streaming/fetching ranges'.format(ring_delay_ms),
+                            timeout=10, from_mark=mark)
 
-        if cluster.version() >= '2.2':
-            if cluster.version() >= '4.0':
-                node2.watch_log_for('127.0.0.1:7000 state MOVING', timeout=10, filename='debug.log')
+        # Watch the logs so we know when all the nodes see the status update to MOVING
+        for node in cluster.nodelist():
+            if cluster.version() >= '2.2':
+                if cluster.version() >= '4.0':
+                    node.watch_log_for('127.0.0.1:7000 state MOVING', timeout=10, filename='debug.log')
+                else:
+                    node.watch_log_for('127.0.0.1 state moving', timeout=10, filename='debug.log')
             else:
-                node2.watch_log_for('127.0.0.1 state moving', timeout=10, filename='debug.log')
-        else:
-            # 2.1 doesn't have debug.log, so we are logging at trace, and look
-            # in the system.log file
-            node2.watch_log_for('127.0.0.1 state moving', timeout=10, filename='system.log')
+                # 2.1 doesn't have debug.log, so we are logging at trace, and look
+                # in the system.log file
+                node.watch_log_for('127.0.0.1 state moving', timeout=10, filename='system.log')
 
         # Once the node is MOVING, kill it immediately, let the other nodes notice
         node1.stop(gently=False, wait_other_notice=True)
 
-        # Verify other nodes believe this is Down/Moving
+        # Verify other nodes believe that the killed node is Down/Moving
         out, _, _ = node2.nodetool('ring')
         logger.debug("Nodetool Ring output: {}".format(out))
         assert re.search('127\.0\.0\.1.*?Down.*?Moving', out) is not None

diff --git a/requirements.txt b/requirements.txt
index 8e7ac0a..cf618d5 100644
--- a/requirements.txt
+++ b/requirements.txt

@@ -14,6 +14,7 @@
 mock
 pytest==3.6.4
 pytest-timeout
+pytest-repeat
 parse
 pycodestyle
 psutil
commit	2032cb8503d9a3e90822de72458a09dd07d30b7e	[log] [tgz]
author	Andrés de la Peña <a.penya.garcia@gmail.com>	Wed Apr 21 16:53:39 2021 +0100
committer	Andrés de la Peña <a.penya.garcia@gmail.com>	Mon Apr 26 12:22:49 2021 +0100
tree	4324f3bd3b19ee6648475dcd27ea1917cbfb1b52
parent	a082ee1cac951e0d41503cae54454e81198d37ef [diff]