tests/custom_cluster/test_process_failures.py - impala - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.

 import pytest

 from beeswaxd.BeeswaxService import QueryState
 from tests.common.custom_cluster_test_suite import (
     DEFAULT_CLUSTER_SIZE,
     CustomClusterTestSuite)

 # Query executed by the tests in this file. The exact query doesn't matter much; it
 # just needs to touch data on all nodes.
 QUERY = "select count(l_comment) from tpch.lineitem"

 # The default number of statestore subscribers: the default cluster size (one
 # subscriber per impalad) plus one for the catalogd.
 DEFAULT_NUM_SUBSCRIBERS = DEFAULT_CLUSTER_SIZE + 1


 class TestProcessFailures(CustomClusterTestSuite):
   """Validates killing and restarting cluster processes between query executions.

   Each test kills or restarts one or more processes (impalad, statestored or catalogd)
   and then verifies that QUERY can still be executed. The whole class is skipped
   unless the exploration strategy is 'exhaustive'.
   """

   @classmethod
   def setup_class(cls):
     """Skips the class unless the exploration strategy is 'exhaustive'."""
     if cls.exploration_strategy() != 'exhaustive':
       pytest.skip('These tests only run in exhaustive')
     super(TestProcessFailures, cls).setup_class()

   @pytest.mark.execute_serially
   def test_restart_coordinator(self):
     """Restarts the coordinator between queries."""
     impalad = self.cluster.get_any_impalad()
     client = impalad.service.create_beeswax_client()

     self.execute_query_expect_success(client, QUERY)

     statestored = self.cluster.statestored
     impalad.restart()
     statestored.service.wait_for_live_subscribers(DEFAULT_NUM_SUBSCRIBERS, timeout=60)

     # Reconnect with a fresh client now that the impalad has been restarted.
     client = impalad.service.create_beeswax_client()
     impalad.service.wait_for_metric_value('catalog.ready', True, timeout=60)
     self.execute_query_expect_success(client, QUERY)

   @pytest.mark.execute_serially
   @CustomClusterTestSuite.with_args(num_exclusive_coordinators=1,
       impalad_args="--status_report_max_retry_s=600 --status_report_interval_ms=1000")
   def test_kill_coordinator(self):
     """Tests that when a coordinator running multiple queries is killed, all
     running fragments on executors are cancelled."""
     impalad = self.cluster.impalads[0]
     client = impalad.service.create_beeswax_client()
     assert client is not None
     # A query which is cancelable and takes long time to execute
     query = "select * from tpch.lineitem t1, tpch.lineitem t2, tpch.lineitem t3 " \
         "where t1.l_orderkey = t2.l_orderkey and t1.l_orderkey = t3.l_orderkey and " \
         "t3.l_orderkey = t2.l_orderkey order by t1.l_orderkey, t2.l_orderkey, " \
         "t3.l_orderkey limit 300"
     num_concurrent_queries = 3

     # Run num_concurrent_queries asynchronously
     handles = [client.execute_async(query) for _ in range(num_concurrent_queries)]

     # Wait for the queries to start running
     for handle in handles:
       self.wait_for_state(handle, QueryState.RUNNING, 1000, client=client)

     # Kill the coordinator
     impalad.kill()

     # Assert that all executors have 0 in-flight fragments. impalads[0] is the
     # coordinator that was just killed, so it is skipped.
     for executor in self.cluster.impalads[1:]:
       executor.service.wait_for_metric_value(
           "impala-server.num-fragments-in-flight", 0, timeout=30)

   @pytest.mark.execute_serially
   @CustomClusterTestSuite.with_args(
       impalad_args='--use_local_catalog',
       catalogd_args='--catalog_topic_mode=minimal')
   def test_restart_statestore(self):
     """Tests the cluster still functions when the statestore dies."""
     impalad = self.cluster.get_any_impalad()
     client = impalad.service.create_beeswax_client()
     statestored = self.cluster.statestored
     statestored.kill()
     impalad.service.wait_for_metric_value(
         'statestore-subscriber.connected', False, timeout=60)

     # impalad should still see the same number of live backends
     assert impalad.service.get_num_known_live_backends() == DEFAULT_CLUSTER_SIZE

     # Queries should still succeed while the statestore is down.
     self.execute_query_expect_success(client, QUERY)

     # Bring the statestore back up and wait for the impalad to reconnect to it.
     statestored.start()

     impalad.service.wait_for_metric_value(
         'statestore-subscriber.connected', True, timeout=60)
     statestored.service.wait_for_live_subscribers(DEFAULT_NUM_SUBSCRIBERS, timeout=60)

     # Wait for the number of live backends to reach the cluster size. Even though
     # all backends have subscribed to the statestore, this impalad may not have
     # received the update yet.
     impalad.service.wait_for_num_known_live_backends(DEFAULT_CLUSTER_SIZE, timeout=60)

     self.execute_query_expect_success(client, QUERY)

   @pytest.mark.execute_serially
   def test_kill_restart_worker(self):
     """Kills a worker while a query is in flight, then restarts the worker.

     Verifies that the in-flight query fails with a status naming the killed worker,
     that queries succeed while the worker is down, and that they succeed again after
     the worker has been restarted.
     """
     impalad = self.cluster.get_any_impalad()
     client = impalad.service.create_beeswax_client()
     self.execute_query_expect_success(client, QUERY)

     # Select a different impalad to act as the worker that is killed and restarted.
     worker_impalad = self.cluster.get_different_impalad(impalad)

     # Start executing a query. It will be cancelled due to a killed worker.
     handle = client.execute_async(QUERY)

     statestored = self.cluster.statestored
     worker_impalad.kill()

     # First wait until the statestore realizes the impalad has gone down.
     statestored.service.wait_for_live_subscribers(DEFAULT_NUM_SUBSCRIBERS - 1, timeout=60)
     # Wait until the impalad registers another instance went down.
     impalad.service.wait_for_num_known_live_backends(DEFAULT_CLUSTER_SIZE - 1, timeout=60)

     # Wait until the in-flight query has been cancelled.
     # The in-flight query should have been cancelled, reporting a failed worker as the
     # cause. The query may have been cancelled because the state store detected a failed
     # node, or because a stream sender failed to establish a thrift connection. It is
     # non-deterministic which of those paths will initiate cancellation, but in either
     # case the query status should include the failed (or unreachable) worker.
     query_id = handle.get_handle().id
     error_state = "Failed due to unreachable impalad"
     assert impalad.service.wait_for_query_status(client, query_id, error_state)

     # Assert that the query status on the query profile web page contains the expected
     # failed hostport.
     failed_hostport = "%s:%s" % (worker_impalad.service.hostname,
                                  worker_impalad.service.be_port)
     query_profile_page = impalad.service.read_query_profile_page(query_id)
     assert failed_hostport in query_profile_page, (
         "Query status did not contain expected hostport %s\n\n%s"
         % (failed_hostport, query_profile_page))

     # Should work fine even if a worker is down.
     self.execute_query_expect_success(client, QUERY)

     # Bring the worker back online and validate queries still work.
     worker_impalad.start()
     statestored.service.wait_for_live_subscribers(DEFAULT_NUM_SUBSCRIBERS, timeout=60)
     worker_impalad.service.wait_for_metric_value('catalog.ready', True, timeout=60)
     self.execute_query_expect_success(client, QUERY)

   @pytest.mark.execute_serially
   @CustomClusterTestSuite.with_args(
     impalad_args='--use_local_catalog',
     catalogd_args='--catalog_topic_mode=minimal')
   @pytest.mark.xfail(run=False, reason="IMPALA-9848")
   def test_restart_catalogd(self):
     """Tests that queries succeed while the catalogd is down and after it restarts.

     Currently marked xfail with run=False (see IMPALA-9848), so it is not executed.
     """
     # Choose a random impalad and verify a query can run against it.
     impalad = self.cluster.get_any_impalad()
     client = impalad.service.create_beeswax_client()
     self.execute_query_expect_success(client, QUERY)

     # Kill the catalogd.
     catalogd = self.cluster.catalogd
     catalogd.kill()

     # The statestore should detect the catalog service has gone down.
     statestored = self.cluster.statestored
     statestored.service.wait_for_live_subscribers(DEFAULT_NUM_SUBSCRIBERS - 1, timeout=60)

     # We should still be able to execute queries using the impalad.
     self.execute_query_expect_success(client, QUERY)

     # Start the catalog service back up.
     catalogd.start()
     statestored.service.wait_for_live_subscribers(DEFAULT_NUM_SUBSCRIBERS, timeout=60)

     # Execute a query against the catalog service.
     impalad.service.wait_for_metric_value('catalog.ready', True, timeout=60)
     self.execute_query_expect_success(client, QUERY)

   @pytest.mark.execute_serially
   def test_restart_all_impalad(self):
     """Restarts all the impalads and runs a query"""
     impalad = self.cluster.get_any_impalad()
     client = impalad.service.create_beeswax_client()
     self.execute_query_expect_success(client, QUERY)

     # Kill each impalad and wait for the statestore to register the failures.
     for impalad_proc in self.cluster.impalads:
       impalad_proc.kill()
     statestored = self.cluster.statestored

     # There should be 1 remaining subscriber, the catalogd
     statestored.service.wait_for_live_subscribers(1, timeout=60)

     # Start each impalad back up and wait for the statestore to see them.
     for impalad_proc in self.cluster.impalads:
       impalad_proc.start()

     # The impalads should re-register with the statestore on restart at which point they
     # can execute queries.
     statestored.service.wait_for_live_subscribers(DEFAULT_NUM_SUBSCRIBERS, timeout=60)
     for impalad_proc in self.cluster.impalads:
       impalad_proc.service.wait_for_num_known_live_backends(
           DEFAULT_CLUSTER_SIZE, timeout=60)
       impalad_proc.service.wait_for_metric_value('catalog.ready', True, timeout=60)
       # Each restarted impalad needs its own freshly created client.
       client = impalad_proc.service.create_beeswax_client()
       self.execute_query_expect_success(client, QUERY)
       # Make sure the catalog service is actually back up by executing an operation
       # against it.
       self.execute_query_expect_success(client, QUERY)
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.

	import pytest

	from beeswaxd.BeeswaxService import QueryState
	from tests.common.custom_cluster_test_suite import (
	DEFAULT_CLUSTER_SIZE,
	CustomClusterTestSuite)

	# Query executed by the tests in this file. The exact query doesn't matter much; it
	# just needs to touch data on all nodes.
	QUERY = "select count(l_comment) from tpch.lineitem"

	# The default number of statestore subscribers: the default cluster size (one
	# subscriber per impalad) plus one for the catalogd.
	DEFAULT_NUM_SUBSCRIBERS = DEFAULT_CLUSTER_SIZE + 1


	class TestProcessFailures(CustomClusterTestSuite):
	  """Validates killing and restarting cluster processes between query executions.

	  Each test kills or restarts one or more processes (impalad, statestored or catalogd)
	  and then verifies that QUERY can still be executed. The whole class is skipped
	  unless the exploration strategy is 'exhaustive'.
	  """

	  @classmethod
	  def setup_class(cls):
	    """Skips the class unless the exploration strategy is 'exhaustive'."""
	    if cls.exploration_strategy() != 'exhaustive':
	      pytest.skip('These tests only run in exhaustive')
	    super(TestProcessFailures, cls).setup_class()

	  @pytest.mark.execute_serially
	  def test_restart_coordinator(self):
	    """Restarts the coordinator between queries."""
	    impalad = self.cluster.get_any_impalad()
	    client = impalad.service.create_beeswax_client()

	    self.execute_query_expect_success(client, QUERY)

	    statestored = self.cluster.statestored
	    impalad.restart()
	    statestored.service.wait_for_live_subscribers(DEFAULT_NUM_SUBSCRIBERS, timeout=60)

	    # Reconnect with a fresh client now that the impalad has been restarted.
	    client = impalad.service.create_beeswax_client()
	    impalad.service.wait_for_metric_value('catalog.ready', True, timeout=60)
	    self.execute_query_expect_success(client, QUERY)

	  @pytest.mark.execute_serially
	  @CustomClusterTestSuite.with_args(num_exclusive_coordinators=1,
	      impalad_args="--status_report_max_retry_s=600 --status_report_interval_ms=1000")
	  def test_kill_coordinator(self):
	    """Tests that when a coordinator running multiple queries is killed, all
	    running fragments on executors are cancelled."""
	    impalad = self.cluster.impalads[0]
	    client = impalad.service.create_beeswax_client()
	    assert client is not None
	    # A query which is cancelable and takes long time to execute
	    query = "select * from tpch.lineitem t1, tpch.lineitem t2, tpch.lineitem t3 " \
	        "where t1.l_orderkey = t2.l_orderkey and t1.l_orderkey = t3.l_orderkey and " \
	        "t3.l_orderkey = t2.l_orderkey order by t1.l_orderkey, t2.l_orderkey, " \
	        "t3.l_orderkey limit 300"
	    num_concurrent_queries = 3

	    # Run num_concurrent_queries asynchronously
	    handles = [client.execute_async(query) for _ in range(num_concurrent_queries)]

	    # Wait for the queries to start running
	    for handle in handles:
	      self.wait_for_state(handle, QueryState.RUNNING, 1000, client=client)

	    # Kill the coordinator
	    impalad.kill()

	    # Assert that all executors have 0 in-flight fragments. impalads[0] is the
	    # coordinator that was just killed, so it is skipped.
	    for executor in self.cluster.impalads[1:]:
	      executor.service.wait_for_metric_value(
	          "impala-server.num-fragments-in-flight", 0, timeout=30)

	  @pytest.mark.execute_serially
	  @CustomClusterTestSuite.with_args(
	      impalad_args='--use_local_catalog',
	      catalogd_args='--catalog_topic_mode=minimal')
	  def test_restart_statestore(self):
	    """Tests the cluster still functions when the statestore dies."""
	    impalad = self.cluster.get_any_impalad()
	    client = impalad.service.create_beeswax_client()
	    statestored = self.cluster.statestored
	    statestored.kill()
	    impalad.service.wait_for_metric_value(
	        'statestore-subscriber.connected', False, timeout=60)

	    # impalad should still see the same number of live backends
	    assert impalad.service.get_num_known_live_backends() == DEFAULT_CLUSTER_SIZE

	    # Queries should still succeed while the statestore is down.
	    self.execute_query_expect_success(client, QUERY)

	    # Bring the statestore back up and wait for the impalad to reconnect to it.
	    statestored.start()

	    impalad.service.wait_for_metric_value(
	        'statestore-subscriber.connected', True, timeout=60)
	    statestored.service.wait_for_live_subscribers(DEFAULT_NUM_SUBSCRIBERS, timeout=60)

	    # Wait for the number of live backends to reach the cluster size. Even though
	    # all backends have subscribed to the statestore, this impalad may not have
	    # received the update yet.
	    impalad.service.wait_for_num_known_live_backends(DEFAULT_CLUSTER_SIZE, timeout=60)

	    self.execute_query_expect_success(client, QUERY)

	  @pytest.mark.execute_serially
	  def test_kill_restart_worker(self):
	    """Kills a worker while a query is in flight, then restarts the worker.

	    Verifies that the in-flight query fails with a status naming the killed worker,
	    that queries succeed while the worker is down, and that they succeed again after
	    the worker has been restarted.
	    """
	    impalad = self.cluster.get_any_impalad()
	    client = impalad.service.create_beeswax_client()
	    self.execute_query_expect_success(client, QUERY)

	    # Select a different impalad to act as the worker that is killed and restarted.
	    worker_impalad = self.cluster.get_different_impalad(impalad)

	    # Start executing a query. It will be cancelled due to a killed worker.
	    handle = client.execute_async(QUERY)

	    statestored = self.cluster.statestored
	    worker_impalad.kill()

	    # First wait until the statestore realizes the impalad has gone down.
	    statestored.service.wait_for_live_subscribers(DEFAULT_NUM_SUBSCRIBERS - 1, timeout=60)
	    # Wait until the impalad registers another instance went down.
	    impalad.service.wait_for_num_known_live_backends(DEFAULT_CLUSTER_SIZE - 1, timeout=60)

	    # Wait until the in-flight query has been cancelled.
	    # The in-flight query should have been cancelled, reporting a failed worker as the
	    # cause. The query may have been cancelled because the state store detected a failed
	    # node, or because a stream sender failed to establish a thrift connection. It is
	    # non-deterministic which of those paths will initiate cancellation, but in either
	    # case the query status should include the failed (or unreachable) worker.
	    query_id = handle.get_handle().id
	    error_state = "Failed due to unreachable impalad"
	    assert impalad.service.wait_for_query_status(client, query_id, error_state)

	    # Assert that the query status on the query profile web page contains the expected
	    # failed hostport.
	    failed_hostport = "%s:%s" % (worker_impalad.service.hostname,
	                                 worker_impalad.service.be_port)
	    query_profile_page = impalad.service.read_query_profile_page(query_id)
	    assert failed_hostport in query_profile_page, (
	        "Query status did not contain expected hostport %s\n\n%s"
	        % (failed_hostport, query_profile_page))

	    # Should work fine even if a worker is down.
	    self.execute_query_expect_success(client, QUERY)

	    # Bring the worker back online and validate queries still work.
	    worker_impalad.start()
	    statestored.service.wait_for_live_subscribers(DEFAULT_NUM_SUBSCRIBERS, timeout=60)
	    worker_impalad.service.wait_for_metric_value('catalog.ready', True, timeout=60)
	    self.execute_query_expect_success(client, QUERY)

	  @pytest.mark.execute_serially
	  @CustomClusterTestSuite.with_args(
	    impalad_args='--use_local_catalog',
	    catalogd_args='--catalog_topic_mode=minimal')
	  @pytest.mark.xfail(run=False, reason="IMPALA-9848")
	  def test_restart_catalogd(self):
	    """Tests that queries succeed while the catalogd is down and after it restarts.

	    Currently marked xfail with run=False (see IMPALA-9848), so it is not executed.
	    """
	    # Choose a random impalad and verify a query can run against it.
	    impalad = self.cluster.get_any_impalad()
	    client = impalad.service.create_beeswax_client()
	    self.execute_query_expect_success(client, QUERY)

	    # Kill the catalogd.
	    catalogd = self.cluster.catalogd
	    catalogd.kill()

	    # The statestore should detect the catalog service has gone down.
	    statestored = self.cluster.statestored
	    statestored.service.wait_for_live_subscribers(DEFAULT_NUM_SUBSCRIBERS - 1, timeout=60)

	    # We should still be able to execute queries using the impalad.
	    self.execute_query_expect_success(client, QUERY)

	    # Start the catalog service back up.
	    catalogd.start()
	    statestored.service.wait_for_live_subscribers(DEFAULT_NUM_SUBSCRIBERS, timeout=60)

	    # Execute a query against the catalog service.
	    impalad.service.wait_for_metric_value('catalog.ready', True, timeout=60)
	    self.execute_query_expect_success(client, QUERY)

	  @pytest.mark.execute_serially
	  def test_restart_all_impalad(self):
	    """Restarts all the impalads and runs a query"""
	    impalad = self.cluster.get_any_impalad()
	    client = impalad.service.create_beeswax_client()
	    self.execute_query_expect_success(client, QUERY)

	    # Kill each impalad and wait for the statestore to register the failures.
	    for impalad_proc in self.cluster.impalads:
	      impalad_proc.kill()
	    statestored = self.cluster.statestored

	    # There should be 1 remaining subscriber, the catalogd
	    statestored.service.wait_for_live_subscribers(1, timeout=60)

	    # Start each impalad back up and wait for the statestore to see them.
	    for impalad_proc in self.cluster.impalads:
	      impalad_proc.start()

	    # The impalads should re-register with the statestore on restart at which point they
	    # can execute queries.
	    statestored.service.wait_for_live_subscribers(DEFAULT_NUM_SUBSCRIBERS, timeout=60)
	    for impalad_proc in self.cluster.impalads:
	      impalad_proc.service.wait_for_num_known_live_backends(
	          DEFAULT_CLUSTER_SIZE, timeout=60)
	      impalad_proc.service.wait_for_metric_value('catalog.ready', True, timeout=60)
	      # Each restarted impalad needs its own freshly created client.
	      client = impalad_proc.service.create_beeswax_client()
	      self.execute_query_expect_success(client, QUERY)
	      # Make sure the catalog service is actually back up by executing an operation
	      # against it.
	      self.execute_query_expect_success(client, QUERY)