Introduce /bkbot Command to Control CI Workflow Runs via PR Comments (#4673)
* Introduce /bkbot Command to Control CI Workflow Runs via PR Comments
diff --git a/.github/workflows/ci-bkbot.yaml b/.github/workflows/ci-bkbot.yaml
new file mode 100644
index 0000000..961f6b2
--- /dev/null
+++ b/.github/workflows/ci-bkbot.yaml
@@ -0,0 +1,298 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# Description: This GitHub Actions workflow enables rerunning or cancelling CI via PR comments using the /bkbot command (comments on plain issues are ignored).
+# Supported commands: /bkbot rerun [keyword], /bkbot stop, /bkbot cancel
+# - /bkbot rerun           => Rerun the latest run of each workflow under the same head SHA, limited to completed runs with a conclusion of failure/cancelled/timed_out/skipped (entire run).
+# - /bkbot rerun <keyword> => Regardless of workflow/job status, fetch all jobs in the latest runs, match them by name, and rerun each matching job.
+# - /bkbot stop | cancel   => Cancel the latest run of each workflow under the same head SHA that has not completed yet.
+# Logging instructions:
+# - Jobs that are failed/cancelled/timed_out/skipped are scanned from all the latest workflow runs (including those in progress), so jobs that fail or are skipped while a run is still in progress can be captured.
+# Triggering condition: a new comment containing /bkbot is created.
+
+name: BookKeeper Bot
+on:
+  issue_comment:  # the only trigger; the script below ignores comments that are not on a pull request
+    types: [created]
+
+permissions:
+  actions: write  # the script reruns and cancels workflow runs
+  contents: read
+
+jobs:
+  bkbot:
+    runs-on: ubuntu-24.04
+    timeout-minutes: 10
+    if: github.event_name == 'issue_comment' && contains(github.event.comment.body, '/bkbot')  # coarse pre-filter; the script re-checks the comment itself
+    steps:
+      - name: Execute bkbot command
+        uses: actions/github-script@v7
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          script: |
+            // Supported commands:
+            // - /bkbot rerun
+            //   Reruns all completed workflows with conclusions of failure/timed_out/skipped/cancelled
+            //   If workflow is still running, cannot rerun whole workflow, just suggest using "/bkbot rerun jobname"
+            // - /bkbot rerun jobname
+            //   Matches job.name by keyword, reruns matching jobs (regardless of current state, failures are logged)
+            // - /bkbot stop or /bkbot cancel
+            //   Cancels all still running (queued/in_progress) workflow runs associated with the current PR
+
+            const commentBody = context.payload.comment.body.trim();
+            const prefix = '/bkbot';
+            if (!commentBody.startsWith(prefix)) {
+              console.log('Not a bkbot command, skipping ...');
+              return;
+            }
+
+            if (!context.payload.issue || !context.payload.issue.pull_request) {
+              console.error('This comment is not on a Pull Request. bkbot only works on PRs.');
+              return;
+            }
+
+            const parts = commentBody.split(/\s+/);
+            const sub = (parts[1] || '').toLowerCase();
+            const arg = parts.length > 2 ? parts.slice(2).join(' ') : '';
+
+            const supported = ['rerun', 'stop', 'cancel'];
+            if (!supported.includes(sub)) {
+              console.log(`Unsupported command '${sub}'. Supported: '/bkbot rerun [jobName?]', '/bkbot stop', '/bkbot cancel'.`);
+              return;
+            }
+
+            const prNum = context.payload.issue.number;
+
+            // Get PR info
+            let pr;
+            try {
+              ({ data: pr } = await github.rest.pulls.get({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                pull_number: prNum
+              }));
+            } catch (e) {
+              console.error(`Failed to fetch PR #${prNum}: ${e.message}`);
+              return;
+            }
+
+            const headSha = pr.head.sha;
+            const prBranch = pr.head.ref;
+            const prUser = (pr.head && pr.head.user && pr.head.user.login) ? pr.head.user.login : pr.user.login;
+            const prUrl = pr.html_url;
+
+            console.log(`bkbot handling PR #${prNum} ${prUrl}`);
+            console.log(`PR branch='${prBranch}', headSha='${headSha}', author='${prUser}'`);
+            console.log(`Command parsed => sub='${sub}', arg='${arg || ''}'`);
+
+            // Fetch workflow runs in this repo triggered by this user on this branch, then filter by headSha
+            let page = 1;
+            const allRunsRaw = [];
+            while (true) {
+              const { data } = await github.rest.actions.listWorkflowRunsForRepo({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                actor: prUser,
+                branch: prBranch,
+                per_page: 100,
+                page
+              });
+              const wr = data.workflow_runs || [];
+              if (wr.length === 0) break;
+              allRunsRaw.push(...wr);
+              if (wr.length < 100) break;
+              page++;
+            }
+
+            const runsAtHead = allRunsRaw.filter(r => r.head_sha === headSha);
+            if (runsAtHead.length === 0) {
+              console.error(`No workflow runs found for head SHA ${headSha} on branch ${prBranch}.`);
+              return;
+            }
+
+            // Only keep the latest run for each workflow_id
+            runsAtHead.sort((a, b) => {
+              if (a.workflow_id !== b.workflow_id) return a.workflow_id - b.workflow_id;
+              return new Date(b.created_at) - new Date(a.created_at);
+            });
+            const latestRuns = [];
+            const seen = new Set();
+            for (const r of runsAtHead) {
+              if (!seen.has(r.workflow_id)) {
+                seen.add(r.workflow_id);
+                latestRuns.push(r);
+              }
+            }
+
+            function runKey(r) {
+              return `[run_id=${r.id}] ${r.name || '(unnamed)'} | status=${r.status} | conclusion=${r.conclusion || '-'} | ${r.html_url}`;
+            }
+
+            console.log('--- Latest workflow runs for this PR headSHA (one per workflow) ---');
+            latestRuns.forEach((run) => console.log('- ' + runKey(run)));
+
+            // Utility: list all jobs in a run
+            async function listAllJobs(runId) {
+              let jobs = [];
+              let p = 1;
+              while (true) {
+                const { data } = await github.rest.actions.listJobsForWorkflowRun({
+                  owner: context.repo.owner,
+                  repo: context.repo.repo,
+                  run_id: runId,
+                  per_page: 100,
+                  page: p
+                });
+                const js = data.jobs || [];
+                if (js.length === 0) break;
+                jobs.push(...js);
+                if (js.length < 100) break;
+                p++;
+              }
+              return jobs;
+            }
+
+            // Utility: rerun a single job
+            async function rerunJob(job, run) {
+              try {
+                if (github.rest.actions.reRunJobForWorkflowRun) {
+                  await github.rest.actions.reRunJobForWorkflowRun({
+                    owner: context.repo.owner,
+                    repo: context.repo.repo,
+                    job_id: job.id
+                  });
+                } else {
+                  await github.request('POST /repos/{owner}/{repo}/actions/jobs/{job_id}/rerun', {
+                    owner: context.repo.owner,
+                    repo: context.repo.repo,
+                    job_id: job.id
+                  });
+                }
+                console.log(`Re-ran job '${job.name}' (job_id=${job.id}) in run '${run.name}' | ${run.html_url}`);
+                return true;
+              } catch (e) {
+                console.log(`Failed to re-run job '${job.name}' (job_id=${job.id}) in run '${run.name}': ${e.message}`);
+                return false;
+              }
+            }
+
+            // Command 1: /bkbot rerun
+            if (sub === 'rerun' && !arg) {
+              const targetConclusions = new Set(['failure', 'timed_out', 'cancelled', 'skipped']);
+              let fullRerunCount = 0;
+              let skippedRunning = 0;
+              let skippedConclusion = 0;
+
+              console.log('Mode: full workflow re-run for completed runs with conclusions in [failure,timed_out,cancelled,skipped].');
+              for (const r of latestRuns) {
+                if (r.status !== 'completed') {
+                  console.log(`Skip (still running) ${runKey(r)}. Cannot re-run whole workflow. Consider '/bkbot rerun <jobName>' for single job.`);
+                  skippedRunning++;
+                  continue;
+                }
+                if (!targetConclusions.has(r.conclusion)) {
+                  console.log(`Skip (conclusion not eligible) ${runKey(r)}`);
+                  skippedConclusion++;
+                  continue;
+                }
+                try {
+                  await github.rest.actions.reRunWorkflow({
+                    owner: context.repo.owner,
+                    repo: context.repo.repo,
+                    run_id: r.id
+                  });
+                  console.log(`Triggered full re-run for ${runKey(r)}`);
+                  fullRerunCount++;
+                } catch (e) {
+                  console.log(`Failed to trigger full re-run for ${runKey(r)}: ${e.message}`);
+                }
+              }
+
+              if (fullRerunCount === 0) {
+                console.error(`No eligible workflow runs to re-run. Skipped running=${skippedRunning}, skipped by conclusion=${skippedConclusion}.`);
+              } else {
+                console.log(`Finished. Triggered full re-run for ${fullRerunCount} workflow run(s). Skipped running=${skippedRunning}, skipped by conclusion=${skippedConclusion}.`);
+              }
+              return;
+            }
+
+            // Command 2: /bkbot rerun jobname
+            if (sub === 'rerun' && arg) {
+              const keyword = arg.trim();
+              console.log(`Mode: job-level re-run. keyword='${keyword}'`);
+
+              let matchedJobs = 0;
+              let successJobs = 0;
+
+              for (const r of latestRuns) {
+                let jobs = [];
+                try {
+                  jobs = await listAllJobs(r.id);
+                } catch (e) {
+                  console.log(`Failed to list jobs for ${runKey(r)}: ${e.message}`);
+                  continue;
+                }
+                for (const j of jobs) {
+                  if (j.name && j.name.includes(keyword)) {
+                    matchedJobs++;
+                    const ok = await rerunJob(j, r);
+                    if (ok) successJobs++;
+                  }
+                }
+              }
+
+              if (matchedJobs === 0) {
+                console.error(`No jobs matched keyword '${keyword}' among latest runs for this PR head.`);
+              } else {
+                console.log(`Finished. Matched ${matchedJobs} job(s); successfully requested re-run for ${successJobs} job(s).`);
+              }
+              return;
+            }
+
+            // Command 3: /bkbot stop or /bkbot cancel
+            if (sub === 'stop' || sub === 'cancel') {
+              console.log('Mode: cancel running workflow runs (queued/in_progress).');
+
+              let cancelCount = 0;
+              let alreadyCompleted = 0;
+
+              for (const r of latestRuns) {
+                if (r.status === 'completed') {
+                  console.log(`Skip (already completed) ${runKey(r)}`);
+                  alreadyCompleted++;
+                  continue;
+                }
+                try {
+                  await github.rest.actions.cancelWorkflowRun({
+                    owner: context.repo.owner,
+                    repo: context.repo.repo,
+                    run_id: r.id
+                  });
+                  console.log(`Cancel requested for ${runKey(r)}`);
+                  cancelCount++;
+                } catch (e) {
+                  console.log(`Failed to cancel ${runKey(r)}: ${e.message}`);
+                }
+              }
+
+              if (cancelCount === 0) {
+                console.error(`No running workflow runs to cancel. Already completed: ${alreadyCompleted}.`);
+              } else {
+                console.log(`Finished. Requested cancel for ${cancelCount} running workflow run(s). Already completed: ${alreadyCompleted}.`);
+              }
+              return;
+            }