HDDS-5058. Make getScmInfo retry for a duration.
diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfigKeys.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfigKeys.java
index d96eb50..8f47756 100644
--- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfigKeys.java
+++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfigKeys.java
@@ -510,6 +510,11 @@
"hdds.scm.ha.security.enable";
public static final boolean OZONE_SCM_HA_SECURITY_SUPPORTED_DEFAULT = false;
+ public static final String OZONE_SCM_INFO_WAIT_DURATION =
+ "ozone.scm.info.wait.duration";
+ public static final long OZONE_SCM_INFO_WAIT_DURATION_DEFAULT =
+ 10 * 60;
+
/**
* Never constructed.
*/
diff --git a/hadoop-hdds/common/src/main/resources/ozone-default.xml b/hadoop-hdds/common/src/main/resources/ozone-default.xml
index 98684b8..8a5ebb5 100644
--- a/hadoop-hdds/common/src/main/resources/ozone-default.xml
+++ b/hadoop-hdds/common/src/main/resources/ozone-default.xml
@@ -2751,4 +2751,13 @@
filesystem semantics.
</description>
</property>
+
+ <property>
+ <name>ozone.scm.info.wait.duration</name>
+ <tag>OZONE, SCM, OM</tag>
+ <value>10m</value>
+ <description> Maximum duration that OM/SCM waits to get SCM info
+ during OzoneManager init/SCM bootstrap.
+ </description>
+ </property>
</configuration>
diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMClientConfig.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMClientConfig.java
index 99dc446..65acfae 100644
--- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMClientConfig.java
+++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/proxy/SCMClientConfig.java
@@ -68,7 +68,7 @@
tags = {OZONE, SCM, CLIENT},
timeUnit = TimeUnit.MILLISECONDS,
description = "SCM Client timeout on waiting for the next connection " +
- "retry to other SCM IP. The default value is set to 2 minutes. "
+ "retry to other SCM IP. The default value is set to 2 seconds. "
)
private long retryInterval = 2 * 1000;
diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HAUtils.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HAUtils.java
index db129f4..f9f88ef 100644
--- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HAUtils.java
+++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HAUtils.java
@@ -33,6 +33,7 @@
import org.apache.hadoop.hdds.scm.protocolPB.ScmBlockLocationProtocolClientSideTranslatorPB;
import org.apache.hadoop.hdds.scm.protocolPB.StorageContainerLocationProtocolClientSideTranslatorPB;
import org.apache.hadoop.hdds.scm.proxy.SCMBlockLocationFailoverProxyProvider;
+import org.apache.hadoop.hdds.scm.proxy.SCMClientConfig;
import org.apache.hadoop.hdds.scm.proxy.SCMContainerLocationFailoverProxyProvider;
import org.apache.hadoop.hdds.security.exception.SCMSecurityException;
import org.apache.hadoop.hdds.security.x509.certificate.client.CertificateClient;
@@ -63,7 +64,10 @@
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
+import java.util.concurrent.TimeUnit;
+import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_INFO_WAIT_DURATION;
+import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_INFO_WAIT_DURATION_DEFAULT;
import static org.apache.hadoop.hdds.server.ServerUtils.getOzoneMetaDirPath;
import static org.apache.hadoop.ozone.OzoneConsts.DB_TRANSIENT_MARKER;
import static org.apache.hadoop.ozone.OzoneConsts.TRANSACTION_INFO_KEY;
@@ -79,8 +83,23 @@
public static ScmInfo getScmInfo(OzoneConfiguration conf)
throws IOException {
+ OzoneConfiguration configuration = new OzoneConfiguration(conf);
try {
- return getScmBlockClient(conf).getScmInfo();
+ long duration = conf.getTimeDuration(OZONE_SCM_INFO_WAIT_DURATION,
+ OZONE_SCM_INFO_WAIT_DURATION_DEFAULT, TimeUnit.SECONDS);
+ SCMClientConfig scmClientConfig =
+ configuration.getObject(SCMClientConfig.class);
+ int retryCount =
+ (int) (duration / (scmClientConfig.getRetryInterval()/1000));
+
+ // Only ever raise the retry count: if the wait duration yields fewer
+ // retries than the configured retry count, keep the configured count.
+ if (retryCount > scmClientConfig.getRetryCount()) {
+ scmClientConfig.setRetryCount(retryCount);
+ configuration.setFromObject(scmClientConfig);
+ }
+
+ return getScmBlockClient(configuration).getScmInfo();
} catch (IOException e) {
throw e;
} catch (Exception e) {
diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java
index f0fbd22..d56499b 100644
--- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java
+++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java
@@ -403,7 +403,7 @@
// For testing purpose only, not hit scm from om as Hadoop UGI can't login
// two principals in the same JVM.
if (!testSecureOmFlag) {
- ScmInfo scmInfo = getScmInfo(configuration);
+ ScmInfo scmInfo = HAUtils.getScmInfo(configuration);
if (!(scmInfo.getClusterId().equals(omStorage.getClusterID()) && scmInfo
.getScmId().equals(omStorage.getScmId()))) {
logVersionMismatch(conf, scmInfo);
@@ -930,7 +930,7 @@
StorageState state = omStorage.getState();
if (state != StorageState.INITIALIZED) {
try {
- ScmInfo scmInfo = getScmInfo(conf);
+ ScmInfo scmInfo = HAUtils.getScmInfo(conf);
String clusterId = scmInfo.getClusterId();
String scmId = scmInfo.getScmId();
if (clusterId == null || clusterId.isEmpty()) {
@@ -1008,11 +1008,6 @@
}
}
- private static ScmInfo getScmInfo(OzoneConfiguration conf)
- throws IOException {
- return HAUtils.getScmInfo(conf);
- }
-
/**
* Builds a message for logging startup information about an RPC server.
*