import datetime
import logging
import random

from dtest import Tester, create_ks
from tools.assertions import assert_length_equal
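
# The doubled single quote in "I''m" below is deliberate: these messages are
# spliced into single-quoted CQL string literals, where '' escapes a quote.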
status_messages = (
    "I''m going to the Cassandra Summit in June!",
    "C* is awesome!",
    "All your sstables are belong to us.",
    "Just turned on another 50 C* nodes at <insert tech startup here>, scales beautifully.",
    "Oh, look! Cats, on reddit!",
    "Netflix recommendations are really good, wonder why?",
    "Spotify playlists are always giving me good tunes, wonder why?",
)

clients = (
    "Android",
    "iThing",
    "Chromium",
    "Mozilla",
    "Emacs",
)

logger = logging.getLogger(__name__)


class TestWideRows(Tester):
    def test_wide_rows(self):
        self.write_wide_rows()

    def write_wide_rows(self):
        cluster = self.cluster
        cluster.populate(1).start()
        node1 = cluster.nodelist()[0]
        session = self.patient_cql_connection(node1)
        start_time = datetime.datetime.now()
        create_ks(session, 'wide_rows', 1)

        # Simple timeline: user -> {date: value, ...}
        logger.debug('Creating table...')
        session.execute('CREATE TABLE user_events (userid text, event timestamp, value text, PRIMARY KEY (userid, event));')
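
        # userid is the partition key and event the clustering column, so each
        # user below accumulates a single wide partition of 5,000 rows.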
        date = datetime.datetime.now()

        # Create a large timeline for each of a group of users:
        for user in ('ryan', 'cathy', 'mallen', 'joaquin', 'erin', 'ham'):
            logger.debug("Writing values for: %s", user)
            for day in range(5000):
                date_str = (date + datetime.timedelta(day)).strftime("%Y-%m-%d")
                client = random.choice(clients)
                msg = random.choice(status_messages)
                query = "UPDATE user_events SET value = '{msg:%s, client:%s}' WHERE userid='%s' AND event='%s';" % (msg, client, user, date_str)
                # logger.debug(query)
                session.execute(query)

        logger.debug('Duration of test: %s', datetime.datetime.now() - start_time)

        # Pick out an update for a specific date:
        query = "SELECT value FROM user_events WHERE userid='ryan' AND event='%s'" % \
                (date + datetime.timedelta(10)).strftime("%Y-%m-%d")
        rows = session.execute(query)
        for value in rows:
            logger.debug(value)
            assert len(value[0]) > 0

    def test_column_index_stress(self):
        """Write a large number of columns to a single row and set
        'column_index_size_in_kb' to a sufficiently low value to force
        the creation of a column index. The test will then randomly
        read columns from that row and ensure that all data is
        returned. See CASSANDRA-5225.
        """
        cluster = self.cluster
        cluster.populate(1)
        # Reduce this value to force column index creation; it must be set
        # before startup to take effect on the node.
        cluster.set_configuration_options(values={'column_index_size_in_kb': 1})
        cluster.start()
        (node1,) = cluster.nodelist()
        session = self.patient_cql_connection(node1)
        create_ks(session, 'wide_rows', 1)
        create_table_query = 'CREATE TABLE test_table (row varchar, name varchar, value int, PRIMARY KEY (row, name));'
        session.execute(create_table_query)

        # Now insert 100,000 columns into row 'row0':
        insert_column_query = "UPDATE test_table SET value = {value} WHERE row = '{row}' AND name = '{name}';"
        row = 'row0'
        for i in range(100000):
            name = 'val' + str(i)
            session.execute(insert_column_query.format(value=i, row=row, name=name))
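
        # These random lookups land at arbitrary positions within the wide row,
        # so each read can seek via the column index instead of scanning the
        # whole partition.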
        # Now randomly fetch columns, 1 to 3 at a time:
        select_column_query = "SELECT value FROM test_table WHERE row='row0' AND name IN ('{name1}', '{name2}', '{name3}');"
        for _ in range(10000):
            values2fetch = [str(random.randint(0, 99999)) for _ in range(3)]
            # values2fetch holds random values, so they are not necessarily
            # unique. Rather than complicating the query template, don't expect
            # the query to always return 3 rows; instead assert against the
            # number of unique values in values2fetch.
            expected_rows = len(set(values2fetch))
            rows = list(session.execute(select_column_query.format(name1="val" + values2fetch[0],
                                                                   name2="val" + values2fetch[1],
                                                                   name3="val" + values2fetch[2])))
            assert_length_equal(rows, expected_rows)