HBASE-28215 CreateTableProcedure and DeleteTableProcedure should sleep a while before retrying (#5502)

Signed-off-by: Duo Zhang <zhangduo@apache.org>
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/CreateTableProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/CreateTableProcedure.java
index 17998fe..23ad3b4 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/CreateTableProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/CreateTableProcedure.java
@@ -35,12 +35,15 @@
 import org.apache.hadoop.hbase.master.MasterCoprocessorHost;
 import org.apache.hadoop.hbase.master.MasterFileSystem;
 import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer;
+import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
+import org.apache.hadoop.hbase.procedure2.ProcedureUtil;
 import org.apache.hadoop.hbase.regionserver.storefiletracker.StoreFileTrackerFactory;
 import org.apache.hadoop.hbase.regionserver.storefiletracker.StoreFileTrackerValidationUtils;
 import org.apache.hadoop.hbase.rsgroup.RSGroupInfo;
 import org.apache.hadoop.hbase.util.CommonFSUtils;
 import org.apache.hadoop.hbase.util.FSTableDescriptors;
 import org.apache.hadoop.hbase.util.ModifyRegionUtils;
+import org.apache.hadoop.hbase.util.RetryCounter;
 import org.apache.yetus.audience.InterfaceAudience;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -51,6 +54,7 @@
 import org.apache.hadoop.hbase.shaded.protobuf.generated.HBaseProtos;
 import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos;
 import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.CreateTableState;
+import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos;
 
 @InterfaceAudience.Private
 public class CreateTableProcedure extends AbstractStateMachineTableProcedure<CreateTableState> {
@@ -60,6 +64,7 @@
 
   private TableDescriptor tableDescriptor;
   private List<RegionInfo> newRegions;
+  private RetryCounter retryCounter;
 
   public CreateTableProcedure() {
     // Required by the Procedure framework to create the procedure on replay
@@ -80,7 +85,7 @@
 
   @Override
   protected Flow executeFromState(final MasterProcedureEnv env, final CreateTableState state)
-    throws InterruptedException {
+    throws InterruptedException, ProcedureSuspendedException {
     LOG.info("{} execute state={}", this, state);
     try {
       switch (state) {
@@ -131,6 +136,7 @@
           break;
         case CREATE_TABLE_POST_OPERATION:
           postCreate(env);
+          retryCounter = null;
           return Flow.NO_MORE_STATE;
         default:
           throw new UnsupportedOperationException("unhandled state=" + state);
@@ -139,13 +145,27 @@
       if (isRollbackSupported(state)) {
         setFailure("master-create-table", e);
       } else {
-        LOG.warn("Retriable error trying to create table=" + getTableName() + " state=" + state, e);
+        if (retryCounter == null) {
+          retryCounter = ProcedureUtil.createRetryCounter(env.getMasterConfiguration());
+        }
+        long backoff = retryCounter.getBackoffTimeAndIncrementAttempts();
+        LOG.warn("Retriable error trying to create table={},state={},suspend {}secs.",
+          getTableName(), state, backoff / 1000, e);
+        throw suspend(Math.toIntExact(backoff), true);
       }
     }
+    retryCounter = null;
     return Flow.HAS_MORE_STATE;
   }
 
   @Override
+  protected synchronized boolean setTimeoutFailure(MasterProcedureEnv env) {
+    setState(ProcedureProtos.ProcedureState.RUNNABLE);
+    env.getProcedureScheduler().addFront(this);
+    return false;
+  }
+
+  @Override
   protected void rollbackState(final MasterProcedureEnv env, final CreateTableState state)
     throws IOException {
     if (state == CreateTableState.CREATE_TABLE_PRE_OPERATION) {
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/DeleteTableProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/DeleteTableProcedure.java
index 80fb5d0..8c2f106 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/DeleteTableProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/DeleteTableProcedure.java
@@ -41,9 +41,12 @@
 import org.apache.hadoop.hbase.mob.MobConstants;
 import org.apache.hadoop.hbase.mob.MobUtils;
 import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer;
+import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
+import org.apache.hadoop.hbase.procedure2.ProcedureUtil;
 import org.apache.hadoop.hbase.util.CommonFSUtils;
 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
 import org.apache.hadoop.hbase.util.FSUtils;
+import org.apache.hadoop.hbase.util.RetryCounter;
 import org.apache.yetus.audience.InterfaceAudience;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -54,6 +57,7 @@
 import org.apache.hadoop.hbase.shaded.protobuf.generated.HBaseProtos;
 import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos;
 import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.DeleteTableState;
+import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos;
 
 @InterfaceAudience.Private
 public class DeleteTableProcedure extends AbstractStateMachineTableProcedure<DeleteTableState> {
@@ -61,6 +65,7 @@
 
   private List<RegionInfo> regions;
   private TableName tableName;
+  private RetryCounter retryCounter;
 
   public DeleteTableProcedure() {
     // Required by the Procedure framework to create the procedure on replay
@@ -79,7 +84,7 @@
 
   @Override
   protected Flow executeFromState(final MasterProcedureEnv env, DeleteTableState state)
-    throws InterruptedException {
+    throws InterruptedException, ProcedureSuspendedException {
     if (LOG.isTraceEnabled()) {
       LOG.trace(this + " execute state=" + state);
     }
@@ -124,6 +129,7 @@
           break;
         case DELETE_TABLE_POST_OPERATION:
           postDelete(env);
+          retryCounter = null;
           LOG.debug("Finished {}", this);
           return Flow.NO_MORE_STATE;
         default:
@@ -133,13 +139,27 @@
       if (isRollbackSupported(state)) {
         setFailure("master-delete-table", e);
       } else {
-        LOG.warn("Retriable error trying to delete table=" + getTableName() + " state=" + state, e);
+        if (retryCounter == null) {
+          retryCounter = ProcedureUtil.createRetryCounter(env.getMasterConfiguration());
+        }
+        long backoff = retryCounter.getBackoffTimeAndIncrementAttempts();
+        LOG.warn("Retriable error trying to delete table={},state={},suspend {}secs.",
+          getTableName(), state, backoff / 1000, e);
+        throw suspend(Math.toIntExact(backoff), true);
       }
     }
+    retryCounter = null;
     return Flow.HAS_MORE_STATE;
   }
 
   @Override
+  protected synchronized boolean setTimeoutFailure(MasterProcedureEnv env) {
+    setState(ProcedureProtos.ProcedureState.RUNNABLE);
+    env.getProcedureScheduler().addFront(this);
+    return false;
+  }
+
+  @Override
   protected boolean abort(MasterProcedureEnv env) {
     // TODO: Current behavior is: with no rollback and no abort support, procedure may get stuck
     // looping in retrying failing a step forever. Default behavior of abort is changed to support
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/coprocessor/BadMasterObserverForCreateDeleteTable.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/coprocessor/BadMasterObserverForCreateDeleteTable.java
new file mode 100644
index 0000000..454a24e
--- /dev/null
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/coprocessor/BadMasterObserverForCreateDeleteTable.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.coprocessor;
+
+import java.io.IOException;
+import java.util.Optional;
+import org.apache.hadoop.hbase.TableName;
+import org.apache.hadoop.hbase.client.RegionInfo;
+import org.apache.hadoop.hbase.client.TableDescriptor;
+
+/**
+ * A bad Master Observer to prevent user to create/delete table once.
+ */
+public class BadMasterObserverForCreateDeleteTable implements MasterObserver, MasterCoprocessor {
+  private boolean createFailedOnce = false;
+  private boolean deleteFailedOnce = false;
+
+  @Override
+  public void postCompletedCreateTableAction(ObserverContext<MasterCoprocessorEnvironment> ctx,
+    TableDescriptor desc, RegionInfo[] regions) throws IOException {
+    if (!createFailedOnce && !desc.getTableName().isSystemTable()) {
+      createFailedOnce = true;
+      throw new IOException("execute postCompletedCreateTableAction failed once.");
+    }
+  }
+
+  @Override
+  public void postCompletedDeleteTableAction(ObserverContext<MasterCoprocessorEnvironment> ctx,
+    TableName tableName) throws IOException {
+    if (!deleteFailedOnce && !tableName.isSystemTable()) {
+      deleteFailedOnce = true;
+      throw new IOException("execute postCompletedDeleteTableAction failed once.");
+    }
+  }
+
+  @Override
+  public Optional<MasterObserver> getMasterObserver() {
+    return Optional.of(this);
+  }
+}
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestCreateDeleteTableProcedureWithRetry.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestCreateDeleteTableProcedureWithRetry.java
new file mode 100644
index 0000000..3491aa6
--- /dev/null
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestCreateDeleteTableProcedureWithRetry.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.master.procedure;
+
+import java.io.IOException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.HBaseClassTestRule;
+import org.apache.hadoop.hbase.HBaseTestingUtil;
+import org.apache.hadoop.hbase.TableName;
+import org.apache.hadoop.hbase.client.RegionInfo;
+import org.apache.hadoop.hbase.client.TableDescriptor;
+import org.apache.hadoop.hbase.coprocessor.BadMasterObserverForCreateDeleteTable;
+import org.apache.hadoop.hbase.coprocessor.CoprocessorHost;
+import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
+import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility;
+import org.apache.hadoop.hbase.testclassification.MasterTests;
+import org.apache.hadoop.hbase.testclassification.MediumTests;
+import org.apache.hadoop.hbase.util.ModifyRegionUtils;
+import org.junit.AfterClass;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.ClassRule;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+@Category({ MasterTests.class, MediumTests.class })
+public class TestCreateDeleteTableProcedureWithRetry {
+  @ClassRule
+  public static final HBaseClassTestRule CLASS_RULE =
+    HBaseClassTestRule.forClass(TestCreateDeleteTableProcedureWithRetry.class);
+
+  private static final HBaseTestingUtil UTIL = new HBaseTestingUtil();
+
+  private static final TableName TABLE_NAME =
+    TableName.valueOf(TestCreateDeleteTableProcedureWithRetry.class.getSimpleName());
+
+  private static final String CF = "cf";
+
+  @BeforeClass
+  public static void setUp() throws Exception {
+    Configuration conf = UTIL.getConfiguration();
+    conf.set(CoprocessorHost.MASTER_COPROCESSOR_CONF_KEY,
+      BadMasterObserverForCreateDeleteTable.class.getName());
+    UTIL.startMiniCluster(1);
+  }
+
+  @AfterClass
+  public static void tearDown() throws Exception {
+    UTIL.shutdownMiniCluster();
+  }
+
+  @Test
+  public void testCreateDeleteTableRetry() throws IOException {
+    ProcedureExecutor<MasterProcedureEnv> procExec =
+      UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor();
+    TableDescriptor htd = MasterProcedureTestingUtility.createHTD(TABLE_NAME, CF);
+    RegionInfo[] regions = ModifyRegionUtils.createRegionInfos(htd, null);
+    CreateTableProcedure createProc =
+      new CreateTableProcedure(procExec.getEnvironment(), htd, regions);
+    ProcedureTestingUtility.submitAndWait(procExec, createProc);
+    Assert.assertTrue(UTIL.getAdmin().tableExists(TABLE_NAME));
+    MasterProcedureTestingUtility.validateTableCreation(UTIL.getMiniHBaseCluster().getMaster(),
+      TABLE_NAME, regions, CF);
+
+    UTIL.getAdmin().disableTable(TABLE_NAME);
+    DeleteTableProcedure deleteProc =
+      new DeleteTableProcedure(procExec.getEnvironment(), TABLE_NAME);
+    ProcedureTestingUtility.submitAndWait(procExec, deleteProc);
+    Assert.assertFalse(UTIL.getAdmin().tableExists(TABLE_NAME));
+    MasterProcedureTestingUtility.validateTableDeletion(UTIL.getMiniHBaseCluster().getMaster(),
+      TABLE_NAME);
+  }
+}
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestCreateTableProcedure.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestCreateTableProcedure.java
index bed41f4..0bb54cc 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestCreateTableProcedure.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestCreateTableProcedure.java
@@ -39,6 +39,7 @@
 import org.apache.hadoop.hbase.master.MasterFileSystem;
 import org.apache.hadoop.hbase.procedure2.Procedure;
 import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
+import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
 import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility;
 import org.apache.hadoop.hbase.regionserver.storefiletracker.StoreFileTrackerFactory;
 import org.apache.hadoop.hbase.regionserver.storefiletracker.StoreFileTrackerForTest;
@@ -244,7 +245,8 @@
 
     @Override
     protected Flow executeFromState(MasterProcedureEnv env,
-      MasterProcedureProtos.CreateTableState state) throws InterruptedException {
+      MasterProcedureProtos.CreateTableState state)
+      throws InterruptedException, ProcedureSuspendedException {
 
       if (
         !failOnce && state == MasterProcedureProtos.CreateTableState.CREATE_TABLE_WRITE_FS_LAYOUT