YARN-11703. Validate accessibility of Node Manager working directories (#6903)
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
index 7747d4c..9503d47 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
@@ -2158,15 +2158,18 @@
NM_DISK_HEALTH_CHECK_PREFIX + "min-free-space-per-disk-mb";
/**
+ * By default, all the disk can be used before it is marked as offline.
+ */
+ public static final long DEFAULT_NM_MIN_PER_DISK_FREE_SPACE_MB = 0;
+
+ /**
* Enable/Disable the minimum disk free
* space threshold for disk health checker.
*/
public static final String NM_DISK_FREE_SPACE_THRESHOLD_ENABLED =
- NM_DISK_HEALTH_CHECK_PREFIX +
- "disk-free-space-threshold.enabled";
+ NM_DISK_HEALTH_CHECK_PREFIX + "disk-free-space-threshold.enabled";
- public static final boolean
- DEFAULT_NM_DISK_FREE_SPACE_THRESHOLD_ENABLED = true;
+ public static final boolean DEFAULT_NM_DISK_FREE_SPACE_THRESHOLD_ENABLED = true;
/**
* The minimum space that must be available on an offline
@@ -2180,9 +2183,13 @@
NM_DISK_HEALTH_CHECK_PREFIX +
"min-free-space-per-disk-watermark-high-mb";
/**
- * By default, all of the disk can be used before it is marked as offline.
+ * Validate content of the node manager directories can be accessed.
*/
- public static final long DEFAULT_NM_MIN_PER_DISK_FREE_SPACE_MB = 0;
+ public static final String NM_WORKING_DIR_CONTENT_ACCESSIBILITY_VALIDATION_ENABLED =
+ NM_DISK_HEALTH_CHECK_PREFIX + "working-dir-content-accessibility-validation.enabled";
+
+ public static final boolean DEFAULT_NM_WORKING_DIR_CONTENT_ACCESSIBILITY_VALIDATION_ENABLED =
+ true;
/** The health checker scripts. */
public static final String NM_HEALTH_CHECK_SCRIPTS =
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
index 927d0c1..ac976b7 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
@@ -1996,6 +1996,12 @@
</property>
<property>
+ <description>Validate content of the node manager directories can be accessed</description>
+ <name>yarn.nodemanager.disk-health-checker.working-dir-content-accessibility-validation.enabled</name>
+ <value>true</value>
+ </property>
+
+ <property>
<description>The maximum percentage of disk space utilization allowed after
which a disk is marked as bad. Values can range from 0.0 to 100.0.
If the value is greater than or equal to 100, the nodemanager will check
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DirectoryCollection.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DirectoryCollection.java
index 8ecaa6d..a5657ab 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DirectoryCollection.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DirectoryCollection.java
@@ -21,6 +21,8 @@
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
+import java.io.UncheckedIOException;
+import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
@@ -28,22 +30,27 @@
import java.util.HashSet;
import java.util.List;
import java.util.Map;
+import java.util.Objects;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock;
import java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileAlreadyExistsException;
import org.apache.hadoop.fs.FileContext;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
+import org.apache.hadoop.util.DiskChecker;
import org.apache.hadoop.util.DiskValidator;
import org.apache.hadoop.util.DiskValidatorFactory;
-import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
@@ -62,6 +69,7 @@
private boolean diskUtilizationThresholdEnabled;
private boolean diskFreeSpaceThresholdEnabled;
+ private boolean subAccessibilityValidationEnabled;
/**
* The enum defines disk failure type.
*/
@@ -242,16 +250,15 @@
throw new YarnRuntimeException(e);
}
- diskUtilizationThresholdEnabled = conf.
- getBoolean(YarnConfiguration.
- NM_DISK_UTILIZATION_THRESHOLD_ENABLED,
- YarnConfiguration.
- DEFAULT_NM_DISK_UTILIZATION_THRESHOLD_ENABLED);
- diskFreeSpaceThresholdEnabled = conf.
- getBoolean(YarnConfiguration.
- NM_DISK_FREE_SPACE_THRESHOLD_ENABLED,
- YarnConfiguration.
- DEFAULT_NM_DISK_FREE_SPACE_THRESHOLD_ENABLED);
+ diskUtilizationThresholdEnabled = conf.getBoolean(
+ YarnConfiguration.NM_DISK_UTILIZATION_THRESHOLD_ENABLED,
+ YarnConfiguration.DEFAULT_NM_DISK_UTILIZATION_THRESHOLD_ENABLED);
+ diskFreeSpaceThresholdEnabled = conf.getBoolean(
+ YarnConfiguration.NM_DISK_FREE_SPACE_THRESHOLD_ENABLED,
+ YarnConfiguration.DEFAULT_NM_DISK_FREE_SPACE_THRESHOLD_ENABLED);
+ subAccessibilityValidationEnabled = conf.getBoolean(
+ YarnConfiguration.NM_WORKING_DIR_CONTENT_ACCESSIBILITY_VALIDATION_ENABLED,
+ YarnConfiguration.DEFAULT_NM_WORKING_DIR_CONTENT_ACCESSIBILITY_VALIDATION_ENABLED);
localDirs = new ArrayList<>(Arrays.asList(dirs));
errorDirs = new ArrayList<>();
@@ -448,8 +455,7 @@
// move testDirs out of any lock as it could wait for very long time in
// case of busy IO
- Map<String, DiskErrorInformation> dirsFailedCheck = testDirs(allLocalDirs,
- preCheckGoodDirs);
+ Map<String, DiskErrorInformation> dirsFailedCheck = testDirs(allLocalDirs, preCheckGoodDirs);
this.writeLock.lock();
try {
@@ -521,60 +527,89 @@
}
}
- Map<String, DiskErrorInformation> testDirs(List<String> dirs,
- Set<String> goodDirs) {
- HashMap<String, DiskErrorInformation> ret =
- new HashMap<String, DiskErrorInformation>();
- for (final String dir : dirs) {
- String msg;
- try {
- File testDir = new File(dir);
- diskValidator.checkStatus(testDir);
- float diskUtilizationPercentageCutoff = goodDirs.contains(dir) ?
- diskUtilizationPercentageCutoffHigh : diskUtilizationPercentageCutoffLow;
- long diskFreeSpaceCutoff = goodDirs.contains(dir) ?
- diskFreeSpaceCutoffLow : diskFreeSpaceCutoffHigh;
-
- if (diskUtilizationThresholdEnabled
- && isDiskUsageOverPercentageLimit(testDir,
- diskUtilizationPercentageCutoff)) {
- msg =
- "used space above threshold of "
- + diskUtilizationPercentageCutoff
- + "%";
- ret.put(dir,
- new DiskErrorInformation(DiskErrorCause.DISK_FULL, msg));
- continue;
- } else if (diskFreeSpaceThresholdEnabled
- && isDiskFreeSpaceUnderLimit(testDir, diskFreeSpaceCutoff)) {
- msg =
- "free space below limit of " + diskFreeSpaceCutoff
- + "MB";
- ret.put(dir,
- new DiskErrorInformation(DiskErrorCause.DISK_FULL, msg));
- continue;
- }
- } catch (IOException ie) {
- ret.put(dir,
- new DiskErrorInformation(DiskErrorCause.OTHER, ie.getMessage()));
- }
+ Map<String, DiskErrorInformation> testDirs(List<String> dirs, Set<String> goodDirs) {
+ final Map<String, DiskErrorInformation> ret = new HashMap<>(0);
+ for (String dir : dirs) {
+ LOG.debug("Start testing dir accessibility: {}", dir);
+ File testDir = new File(dir);
+ boolean goodDir = goodDirs.contains(dir);
+ Stream.of(
+ validateDisk(testDir),
+ validateUsageOverPercentageLimit(testDir, goodDir),
+ validateDiskFreeSpaceUnderLimit(testDir, goodDir),
+ validateSubsAccessibility(testDir)
+ )
+ .filter(Objects::nonNull)
+ .findFirst()
+ .ifPresent(diskErrorInformation -> ret.put(dir, diskErrorInformation));
}
return ret;
}
- private boolean isDiskUsageOverPercentageLimit(File dir,
- float diskUtilizationPercentageCutoff) {
- float freePercentage =
- 100 * (dir.getUsableSpace() / (float) dir.getTotalSpace());
- float usedPercentage = 100.0F - freePercentage;
- return (usedPercentage > diskUtilizationPercentageCutoff
- || usedPercentage >= 100.0F);
+ private DiskErrorInformation validateDisk(File dir) {
+ try {
+ diskValidator.checkStatus(dir);
+ LOG.debug("Dir {} pass throw the disk validation", dir);
+ return null;
+ } catch (IOException | UncheckedIOException | SecurityException e) {
+ return new DiskErrorInformation(DiskErrorCause.OTHER, e.getMessage());
+ }
}
- private boolean isDiskFreeSpaceUnderLimit(File dir,
- long freeSpaceCutoff) {
+ private DiskErrorInformation validateUsageOverPercentageLimit(File dir, boolean isGoodDir) {
+ if (!diskUtilizationThresholdEnabled) {
+ return null;
+ }
+ float diskUtilizationPercentageCutoff = isGoodDir
+ ? diskUtilizationPercentageCutoffHigh
+ : diskUtilizationPercentageCutoffLow;
+ float freePercentage = 100 * (dir.getUsableSpace() / (float) dir.getTotalSpace());
+ float usedPercentage = 100.0F - freePercentage;
+ if (usedPercentage > diskUtilizationPercentageCutoff || usedPercentage >= 100.0F) {
+ return new DiskErrorInformation(DiskErrorCause.DISK_FULL,
+ "used space above threshold of " + diskUtilizationPercentageCutoff + "%");
+ } else {
+ LOG.debug("Dir {} pass throw the usage over percentage validation", dir);
+ return null;
+ }
+ }
+
+ private DiskErrorInformation validateDiskFreeSpaceUnderLimit(File dir, boolean isGoodDir) {
+ if (!diskFreeSpaceThresholdEnabled) {
+ return null;
+ }
+ long freeSpaceCutoff = isGoodDir ? diskFreeSpaceCutoffLow : diskFreeSpaceCutoffHigh;
long freeSpace = dir.getUsableSpace() / (1024 * 1024);
- return freeSpace < freeSpaceCutoff;
+ if (freeSpace < freeSpaceCutoff) {
+ return new DiskErrorInformation(DiskErrorCause.DISK_FULL,
+ "free space below limit of " + freeSpaceCutoff + "MB");
+ } else {
+ LOG.debug("Dir {} pass throw the free space validation", dir);
+ return null;
+ }
+ }
+
+ private DiskErrorInformation validateSubsAccessibility(File dir) {
+ if (!subAccessibilityValidationEnabled) {
+ return null;
+ }
+ try (Stream<java.nio.file.Path> walk = Files.walk(dir.toPath())) {
+ List<File> subs = walk
+ .map(java.nio.file.Path::toFile)
+ .collect(Collectors.toList());
+ for (File sub : subs) {
+ if (sub.isDirectory()) {
+ DiskChecker.checkDir(sub);
+ } else if (!Files.isReadable(sub.toPath())) {
+ return new DiskErrorInformation(DiskErrorCause.OTHER, "Can not read " + sub);
+ } else {
+ LOG.debug("{} under {} is accessible", sub, dir);
+ }
+ }
+ } catch (IOException | UncheckedIOException | SecurityException e) {
+ return new DiskErrorInformation(DiskErrorCause.OTHER, e.getMessage());
+ }
+ return null;
}
private void createDir(FileContext localFs, Path dir, FsPermission perm)
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDirectoryCollection.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDirectoryCollection.java
index 33bd4d9..0193f84 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDirectoryCollection.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDirectoryCollection.java
@@ -20,8 +20,17 @@
import java.io.File;
import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.attribute.PosixFilePermissions;
+import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
+import java.util.Map;
+
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.CommonConfigurationKeys;
@@ -32,16 +41,11 @@
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.server.nodemanager.DirectoryCollection.DirsChangeListener;
-import org.junit.After;
-import org.junit.Assert;
-import org.junit.Before;
-import org.junit.Test;
public class TestDirectoryCollection {
- private static final File testDir = new File("target",
- TestDirectoryCollection.class.getName()).getAbsoluteFile();
- private static final File testFile = new File(testDir, "testfile");
+ private File testDir;
+ private File testFile;
private Configuration conf;
private FileContext localFs;
@@ -50,7 +54,8 @@
public void setupForTests() throws IOException {
conf = new Configuration();
localFs = FileContext.getLocalFSFileContext(conf);
- testDir.mkdirs();
+ testDir = Files.createTempDirectory(TestDirectoryCollection.class.getName()).toFile();
+ testFile = new File(testDir, "testfile");
testFile.createNewFile();
}
@@ -516,6 +521,20 @@
Assert.assertEquals(listener3.num, 1);
}
+ @Test
+ public void testNonAccessibleSub() throws IOException {
+ Files.setPosixFilePermissions(testDir.toPath(),
+ PosixFilePermissions.fromString("rwx------"));
+ Files.setPosixFilePermissions(testFile.toPath(),
+ PosixFilePermissions.fromString("-w--w--w-"));
+ DirectoryCollection dc = new DirectoryCollection(new String[]{testDir.toString()});
+ Map<String, DirectoryCollection.DiskErrorInformation> diskErrorInformationMap =
+ dc.testDirs(Collections.singletonList(testDir.toString()), Collections.emptySet());
+ Assert.assertEquals(1, diskErrorInformationMap.size());
+ Assert.assertTrue(diskErrorInformationMap.values().iterator().next()
+ .message.contains(testFile.getName()));
+ }
+
static class DirsChangeListenerTest implements DirsChangeListener {
public int num = 0;
public DirsChangeListenerTest() {