blob: f95daacc054e9b501cb8f28e17ad392499f543f7 [file] [log] [blame]
#!/usr/bin/env python
"""
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import os
import logging
from resource_management.libraries.functions import file_system
from resource_management.libraries.functions import dfs_datanode_helper
RESULT_STATE_OK = 'OK'
RESULT_STATE_WARNING = 'WARNING'
RESULT_STATE_CRITICAL = 'CRITICAL'
RESULT_STATE_UNKNOWN = 'UNKNOWN'
DFS_DATA_DIR = '{{hdfs-site/dfs.datanode.data.dir}}'
DATA_DIR_MOUNT_FILE = "/var/lib/ambari-agent/data/datanode/dfs_data_dir_mount.hist"
logger = logging.getLogger()
def get_tokens():
"""
Returns a tuple of tokens in the format {{site/property}} that will be used
to build the dictionary passed into execute
"""
return (DFS_DATA_DIR, DATA_DIR_MOUNT_FILE)
def execute(configurations={}, parameters={}, host_name=None):
"""
Returns a tuple containing the result code and a pre-formatted result label
Keyword arguments:
configurations (dictionary): a mapping of configuration key to value
parameters (dictionary): a mapping of script parameter key to value
host_name (string): the name of this host where the alert is running
"""
warnings = []
errors = []
if configurations is None:
return (RESULT_STATE_UNKNOWN, ['There were no configurations supplied to the script.'])
# Check required properties
if DFS_DATA_DIR not in configurations:
return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script'.format(DFS_DATA_DIR)])
dfs_data_dir = configurations[DFS_DATA_DIR]
if dfs_data_dir is None:
return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script and the value is null'.format(DFS_DATA_DIR)])
data_dir_mount_file_exists = True
# This follows symlinks and will return False for a broken link (even in the middle of the linked list)
if not os.path.exists(DATA_DIR_MOUNT_FILE):
data_dir_mount_file_exists = False
warnings.append("File not found, {0} .".format(DATA_DIR_MOUNT_FILE))
valid_data_dirs = set() # data dirs that have been normalized
data_dirs_not_exist = set() # data dirs that do not exist
data_dirs_unknown = set() # data dirs for which could not determine mount
data_dirs_on_root = set() # set of data dirs that are on root mount
data_dirs_on_mount = set() # set of data dirs that are mounted on a device
data_dirs_unmounted = [] # list of data dirs that are known to have become unmounted
for data_dir in dfs_data_dir.split(","):
if data_dir is None or data_dir.strip() == "":
continue
data_dir = data_dir.strip()
valid_data_dirs.add(data_dir)
# Sort the data dirs, which is needed for deterministic behavior when running the unit tests.
valid_data_dirs = sorted(valid_data_dirs)
for data_dir in valid_data_dirs:
# This follows symlinks and will return False for a broken link (even in the middle of the linked list)
if os.path.isdir(data_dir):
curr_mount_point = file_system.get_mount_point_for_dir(data_dir)
curr_mount_point = curr_mount_point.strip() if curr_mount_point else curr_mount_point
if curr_mount_point is not None and curr_mount_point != "":
if curr_mount_point == "/":
data_dirs_on_root.add(data_dir)
else:
data_dirs_on_mount.add(data_dir)
else:
data_dirs_unknown.add(data_dir)
else:
data_dirs_not_exist.add(data_dir)
# To keep the messages consistent for all hosts, sort the sets into lists
valid_data_dirs = sorted(valid_data_dirs)
data_dirs_not_exist = sorted(data_dirs_not_exist)
data_dirs_unknown = sorted(data_dirs_unknown)
data_dirs_on_root = sorted(data_dirs_on_root)
if data_dirs_not_exist:
errors.append("Data dir(s) not found: {0} .".format(", ".join(data_dirs_not_exist)))
if data_dirs_unknown:
errors.append("Cannot find mount point for data dir(s): {0} .".format(", ".join(data_dirs_unknown)))
if data_dir_mount_file_exists:
# Make a precise determination on which data dirs have become unmounted.
class Params:
def __init__(self, mount_file):
self.data_dir_mount_file = mount_file
params = Params(DATA_DIR_MOUNT_FILE)
# This dictionary contains the expected values of <data_dir, mount_point>
# Hence, we only need to analyze the data dirs that are currently on the root partition
# and report an error if they were expected to be on a mount.
#
# If one of the data dirs is not present in the file, it means that DataNode has not been restarted after
# the configuration was changed on the server, so we cannot make any assertions about it.
expected_data_dir_to_mount = dfs_datanode_helper.get_data_dir_to_mount_from_file(params)
for data_dir in data_dirs_on_root:
if data_dir in expected_data_dir_to_mount and expected_data_dir_to_mount[data_dir] != "/":
data_dirs_unmounted.append(data_dir)
if len(data_dirs_unmounted) > 0:
errors.append("Detected data dir(s) that became unmounted and are now writing to the root partition: {0} .".format(", ".join(data_dirs_unmounted)))
else:
# Couldn't make guarantees about the expected value of mount points, so rely on this strategy that is likely to work.
# It will report false positives (aka false alarms) if the user actually intended to have
# 1+ data dirs on a mount and 1+ data dirs on the root partition.
if len(data_dirs_on_mount) >= 1 and len(data_dirs_on_root) >= 1:
errors.append("Detected at least one data dir on a mount point, but these are writing to the root partition: {0} .".format(", ".join(data_dirs_on_root)))
# Determine the status based on warnings and errors.
if len(errors) == 0:
status = RESULT_STATE_OK
messages = []
# Check for warnings
if len(warnings) > 0:
status = RESULT_STATE_WARNING
messages += warnings
if len(valid_data_dirs) > 0:
messages.append("Data dir(s) are fine, {0} .".format(", ".join(valid_data_dirs)))
else:
messages.append("No data dirs to analyze.")
return (status, ["\n".join(messages)])
else:
# Report errors
return (RESULT_STATE_CRITICAL, ["\n".join(errors)])