blob: 0d8541ef173a525a0e164691d1875befd50d188a [file] [log] [blame]
#!/usr/bin/env bash
################################################################################
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################
source "$(dirname "$0")"/common_kubernetes.sh
CLUSTER_ROLE_BINDING="flink-role-binding-default"
CLUSTER_ID="flink-native-k8s-application-ha-1"
FLINK_IMAGE_NAME="test_kubernetes_application_ha"
LOCAL_LOGS_PATH="${TEST_DATA_DIR}/log"
IMAGE_BUILD_RETRIES=3
IMAGE_BUILD_BACKOFF=2
function internal_cleanup {
if [[ "${OS_TYPE}" != "linux" ]]; then
minikube ssh "sudo rm -rf /tmp/${CLUSTER_ID}"
else
sudo rm -rf /tmp/${CLUSTER_ID}
fi
kubectl delete deployment ${CLUSTER_ID}
kubectl delete cm --selector="app=${CLUSTER_ID}"
kubectl delete clusterrolebinding ${CLUSTER_ROLE_BINDING}
}
start_kubernetes
if ! retry_times $IMAGE_BUILD_RETRIES $IMAGE_BUILD_BACKOFF "build_image ${FLINK_IMAGE_NAME} $(get_host_machine_address)"; then
echo "ERROR: Could not build image. Aborting..."
exit 1
fi
kubectl create clusterrolebinding ${CLUSTER_ROLE_BINDING} --clusterrole=edit --serviceaccount=default:default --namespace=default
mkdir -p "$LOCAL_LOGS_PATH"
# Set the memory and cpu smaller than default, so that the jobmanager and taskmanager pods could be allocated in minikube.
"$FLINK_DIR"/bin/flink run-application -t kubernetes-application \
-Dkubernetes.cluster-id=${CLUSTER_ID} \
-Dkubernetes.container.image=${FLINK_IMAGE_NAME} \
-Djobmanager.memory.process.size=1088m \
-Dkubernetes.jobmanager.cpu=0.5 \
-Dkubernetes.taskmanager.cpu=0.5 \
-Dkubernetes.rest-service.exposed.type=NodePort \
-Dkubernetes.pod-template-file=${CONTAINER_SCRIPTS}/kubernetes-pod-template.yaml \
-Dhigh-availability=org.apache.flink.kubernetes.highavailability.KubernetesHaServicesFactory \
-Dhigh-availability.storageDir=file:///flink-ha \
-Drestart-strategy=fixed-delay \
-Drestart-strategy.fixed-delay.attempts=10 \
local:///opt/flink/examples/streaming/StateMachineExample.jar
kubectl wait --for=condition=Available --timeout=30s deploy/${CLUSTER_ID} || exit 1
jm_pod_name=$(kubectl get pods --selector="app=${CLUSTER_ID},component=jobmanager" -o jsonpath='{..metadata.name}')
wait_num_checkpoints $jm_pod_name 3
job_id=$(kubectl logs $jm_pod_name | grep -E -o 'Job [a-z0-9]+ is submitted' | awk '{print $2}')
# Kill the JobManager
kubectl exec $jm_pod_name -- /bin/sh -c "kill 1"
# Check the new JobManager recovering from latest successful checkpoint
wait_for_logs $jm_pod_name "Restoring job $job_id from Checkpoint"
wait_num_checkpoints $jm_pod_name 1
"$FLINK_DIR"/bin/flink cancel -t kubernetes-application -Dkubernetes.cluster-id=${CLUSTER_ID} $job_id
kubectl wait --for=delete configmap --timeout=60s --selector="app=${CLUSTER_ID}"
kubectl wait --for=delete pod --timeout=60s --selector="app=${CLUSTER_ID}"