From 35523d560a97152cad3e83cebfe86c49512b87ac Mon Sep 17 00:00:00 2001
From: Chris Evich
Date: Thu, 20 Oct 2022 13:21:04 -0400
Subject: [PATCH] GHA: Auto. re-run failed cirrus-cron builds once

With a seemingly ever growing list of cirrus-cron jobs running on
release branches, there are bound to be some hiccups.  Sometimes a lot
of them.  Normally any failures require a human to eyeball the logs
and/or manually re-run the job to see if it was simply a flake.  This
doesn't take long, but can be distracting and compounds over time.

Attempt to alleviate some maintainer burden by using a new github
action workflow to perform **one** automatic re-run on any failed
builds.  This task is scheduled an hour prior to a second failure
check, and generation of notification e-mail for review.

Note: If there are no failures, due to the auto. re-run or luck, no
e-mail is generated.

If this proves useful in this repo, I intend to re-use this workflow
for other repo's cirrus-cron jobs.

Signed-off-by: Chris Evich
---
 .../check_cirrus_cron/cron_failures.sh        |  69 +++-------
 .github/actions/check_cirrus_cron/lib.sh      |  70 ++++++++++-
 .../check_cirrus_cron/make_email_body.sh      |   3 +-
 .../check_cirrus_cron/rerun_failed_tasks.sh   | 113 ++++++++++++++++++
 .github/workflows/rerun_cirrus_cron.yml       |  66 ++++++++++
 5 files changed, 270 insertions(+), 51 deletions(-)
 create mode 100755 .github/actions/check_cirrus_cron/rerun_failed_tasks.sh
 create mode 100644 .github/workflows/rerun_cirrus_cron.yml

diff --git a/.github/actions/check_cirrus_cron/cron_failures.sh b/.github/actions/check_cirrus_cron/cron_failures.sh
index 1efe57c145..0e669c19c1 100755
--- a/.github/actions/check_cirrus_cron/cron_failures.sh
+++ b/.github/actions/check_cirrus_cron/cron_failures.sh
@@ -8,31 +8,25 @@ set -eo pipefail
 source $(dirname "${BASH_SOURCE[0]}")/lib.sh
 
 _errfmt="Expecting %s value to not be empty"
-if [[ -z "$GITHUB_REPOSITORY" ]]; then
+if [[ -z "$GITHUB_REPOSITORY" ]]; then  # OWNER/REPO
     err $(printf "$_errfmt" "\$GITHUB_REPOSITORY")
-elif [[ -z "$NAME_ID_FILEPATH" ]]; then
+elif [[ -z "$NAME_ID_FILEPATH" ]]; then  # output filepath
     err $(printf "$_errfmt" "\$NAME_ID_FILEPATH")
 fi
 
 mkdir -p artifacts
 cat > ./artifacts/query_raw.json << "EOF"
-{"query":"
-  query CronNameStatus($owner: String!, $repo: String!) {
-    ownerRepository(platform: \"LINUX\", owner: $owner, name: $repo) {
-      cronSettings {
-        name
-        lastInvocationBuild {
-          id
-          status
-        }
+query {
+  ownerRepository(platform: "LINUX", owner: "@@OWNER@@", name: "@@REPO@@") {
+    cronSettings {
+      name
+      lastInvocationBuild {
+        id
+        status
       }
     }
   }
-",
-"variables":"{
-  \"owner\": \"@@OWNER@@\",
-  \"repo\": \"@@REPO@@\"
-}"}
+}
 EOF
 # Makes for easier copy/pasting query to/from
 # https://cirrus-ci.com/explorer
@@ -40,29 +34,20 @@ owner=$(cut -d '/' -f 1 <<<"$GITHUB_REPOSITORY")
 repo=$(cut -d '/' -f 2 <<<"$GITHUB_REPOSITORY")
 sed -i -r -e "s/@@OWNER@@/$owner/g" -e "s/@@REPO@@/$repo/g" ./artifacts/query_raw.json
 
-echo "::group::Posting GraphQL Query"
 # Easier to debug in error-reply when query is compacted
-tr -d '\n' < ./artifacts/query_raw.json | tr -s ' ' | tee ./artifacts/query.json | \
-    jq --indent 4 --color-output .
+# N/B: The query is now GraphQL (not JSON), it must not be piped through jq
+tr -d '\n' < ./artifacts/query_raw.json | tr -s ' ' > ./artifacts/query.json
 
 if grep -q '@@' ./artifacts/query.json; then
     err "Found unreplaced substitution token in raw query JSON"
 fi
-curl \
-  --request POST \
-  --silent \
-  --location \
-  --header 'content-type: application/json' \
-  --url 'https://api.cirrus-ci.com/graphql' \
-  --data @./artifacts/query.json \
-  --output ./artifacts/reply.json
-echo "::endgroup::"
 
-echo "::group::Received GraphQL Reply"
-jq --indent 4 --color-output . <./artifacts/reply.json || \
-    cat ./artifacts/reply.json
-echo "::endgroup::"
+# The query should never ever return an empty-list, unless there are no cirrus-cron
+# jobs defined for the repository.  In that case, this monitoring script shouldn't
+# be running anyway.
+filt_head='.data.ownerRepository.cronSettings'
 
+gql "$(<./artifacts/query.json)" "$filt_head" > ./artifacts/reply.json
 # e.x. reply.json
 # {
 #   "data": {
@@ -87,22 +72,8 @@ echo "::endgroup::"
 #           "lastInvocationBuild": {
 #             "id": "5003065549914112",
 #             "status": "FAILED"
-#           }
 #         }
-#       ]
-#     }
-#   }
-# }
-
-# This should never ever return an empty-list, unless there are no cirrus-cron
-# jobs defined for the repository.  In that case, this monitoring script shouldn't
-# be running anyway.
-filt_head='.data.ownerRepository.cronSettings'
-if ! jq -e "$filt_head" ./artifacts/reply.json &> /dev/null
-then
-    # Actual colorized JSON reply was printed above
-    err "Null/empty result filtering reply with '$filt_head'"
-fi
+# ...
 
 filt="$filt_head | map(select(.lastInvocationBuild.status==\"FAILED\") | { name:.name, id:.lastInvocationBuild.id} | join(\" \")) | join(\"\n\")"
 jq --raw-output "$filt" ./artifacts/reply.json > "$NAME_ID_FILEPATH"
@@ -114,5 +85,7 @@ cat "$NAME_ID_FILEPATH"
 records=$(wc --words "$NAME_ID_FILEPATH" | cut -d ' ' -f 1)
 # Always two words per record
 failures=$((records/2))
-echo "failures::$failures" >> $GITHUB_OUTPUT
+# Set the output of this step (the output file requires 'name=value' lines).
+# Ref: https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#setting-an-output-parameter
+echo "failures=$failures" >> "$GITHUB_OUTPUT"
 echo "Total failed Cirrus-CI cron builds: $failures"
diff --git a/.github/actions/check_cirrus_cron/lib.sh b/.github/actions/check_cirrus_cron/lib.sh
index 1838798dd1..70f08f8099 100644
--- a/.github/actions/check_cirrus_cron/lib.sh
+++ b/.github/actions/check_cirrus_cron/lib.sh
@@ -1,7 +1,75 @@
 
+
+# Send text to stderr
+msg() {
+    echo "$@" > /dev/stderr
+}
+
 # Must be called from top-level of script, not another function.
 err() {
     # Ref: https://docs.github.com/en/free-pro-team@latest/actions/reference/workflow-commands-for-github-actions
-    echo "::error file=${BASH_SOURCE[1]},line=${BASH_LINENO[1]}::${1:-No error message given}"
+    msg "::error file=${BASH_SOURCE[1]},line=${BASH_LINENO[0]}::$@"
     exit 1
 }
+
+# Using python3 here is a compromise for readability and
+# properly handling quote, control and unicode character encoding.
+escape_query() {
+    local json_string
+    # Assume it's okay to squash repeated whitespaces inside the query
+    json_string=$(printf '%s' "$1" | \
+                  tr --delete '\r\n' | \
+                  tr --squeeze-repeats '[:space:]' | \
+                  python3 -c 'import sys,json; print(json.dumps(sys.stdin.read()))')
+    # The $json_string in message is already quoted
+    echo -n "$json_string"
+}
+
+# Given a GraphQL query/mutation, fire it at the API.
+# and return the output on stdout.  The optional
+# second parameter may contain a jq filter-string.
+# When provided, if the GQL result is empty, null,
+# fails to parse, or does not match the filter-string,
+# non-zero will be returned.
+gql() {
+    local e_query query
+    e_query=$(escape_query "$1")
+    query="{\"query\": $e_query}"
+    local filter
+    filter="$2"
+    local output
+    local filtered
+    msg "::group::Posting GraphQL Query and checking result"
+    msg "query: "
+    if ! jq -e . <<<"$query" > /dev/stderr; then
+        msg "::error file=${BASH_SOURCE[1]},line=${BASH_LINENO[0]}::Invalid query JSON: $query"
+        return 1
+    fi
+    if output=$(curl \
+                --request POST \
+                --silent \
+                --show-error \
+                --location \
+                --header 'content-type: application/json' \
+                --header "Authorization: Bearer $SECRET_CIRRUS_API_KEY" \
+                --url 'https://api.cirrus-ci.com/graphql' \
+                --data "$query") && [[ -n "$output" ]]; then
+
+        if filtered=$(jq -e "$filter" <<<"$output") && [[ -n "$filtered" ]]; then
+            msg "result:"
+            # Make debugging easier w/ formatted output
+            # to stderr for display, stdout for consumption by caller
+            jq --indent 2 . <<<"$output" | tee /dev/stderr
+            msg "::endgroup::"
+            return 0
+        fi
+
+        msg "::error file=${BASH_SOURCE[1]},line=${BASH_LINENO[0]}::Query result did not pass filter '$2': '$output'"
+        msg "::endgroup::"
+        return 2
+    fi
+
+    msg "::error file=${BASH_SOURCE[1]},line=${BASH_LINENO[0]}::Query failed or result empty: '$output'"
+    msg "::endgroup::"
+    return 3
+}
diff --git a/.github/actions/check_cirrus_cron/make_email_body.sh b/.github/actions/check_cirrus_cron/make_email_body.sh
index f88803da9f..ab5a717eb0 100755
--- a/.github/actions/check_cirrus_cron/make_email_body.sh
+++ b/.github/actions/check_cirrus_cron/make_email_body.sh
@@ -14,8 +14,7 @@ if [[ -z "$GITHUB_REPOSITORY" ]]; then
 elif [[ -z "$GITHUB_WORKFLOW" ]]; then
     err $(printf "$_errfmt" "\$GITHUB_WORKFLOW")
 elif [[ ! -r "$NAME_ID_FILEPATH" ]]; then
-    _errfmt="Expecting %s value to be a readable file"
-    err $(printf "$_errfmt" "\$NAME_ID_FILEPATH")
+    err "Expecting \$NAME_ID_FILEPATH value ($NAME_ID_FILEPATH) to be a readable file"
 fi
 
 mkdir -p artifacts
diff --git a/.github/actions/check_cirrus_cron/rerun_failed_tasks.sh b/.github/actions/check_cirrus_cron/rerun_failed_tasks.sh
new file mode 100755
index 0000000000..8432815d03
--- /dev/null
+++ b/.github/actions/check_cirrus_cron/rerun_failed_tasks.sh
@@ -0,0 +1,113 @@
+#!/bin/bash
+
+set -eo pipefail
+
+# Intended to be executed from a github action workflow step.
+# Input: File listing space separated failed cron build names and IDs
+# Output: None (scratch/debugging files are written under ./artifacts/)
+#
+# HOW TO TEST: This script may be manually tested assuming you have
+# access to the github containers-org. Cirrus API key.  With that in-hand,
+# this script may be manually run by:
+# 1. export SECRET_CIRRUS_API_KEY=YOUR_API_KEY
+# 2. Find an old podman build that failed on `main` or another **branch**.
+#    For example, from https://cirrus-ci.com/github/containers/podman/main
+#    (pick an old one from the bottom, since re-running it won't affect anybody)
+# 3. Create a temp. file, like /tmp/fail with a single line, of the form:
+#    CRON_NAME BUILD_ID
+# 4. export NAME_ID_FILEPATH=/tmp/fail
+# 5. execute this script, and refresh the build in the WebUI, all unsuccessful
+#    tasks should change status to running or scheduled.  Note: some later
+#    tasks may remain red as they wait for dependencies to run and pass.
+# 6. After each run, cleanup with 'rm -rf ./artifacts'
+#    (unless you want to examine them)
+
+source $(dirname "${BASH_SOURCE[0]}")/lib.sh
+
+_errfmt="Expecting %s value to not be empty"
+if [[ -z "$SECRET_CIRRUS_API_KEY" ]]; then
+    err $(printf "$_errfmt" "\$SECRET_CIRRUS_API_KEY")
+elif [[ ! -r "$NAME_ID_FILEPATH" ]]; then  # output from cron_failures.sh
+    err $(printf "Expecting %s value to be a readable file" "\$NAME_ID_FILEPATH")
+fi
+
+mkdir -p artifacts
+
+cat "$NAME_ID_FILEPATH" | \
+    while read -r NAME BID; do
+        if [[ -z "$NAME" ]]; then
+            err $(printf "$_errfmt" "\$NAME")
+        elif [[ -z "$BID" ]]; then
+            err $(printf "$_errfmt" "\$BID")
+        fi
+
+        id_status_q="
+            query {
+              build(id: \"$BID\") {
+                tasks {
+                  id,
+                  status
+                }
+              }
+            }
+        "
+        task_id_status=$(gql "$id_status_q" '.data.build.tasks[0]')
+        # Expected query result like:
+        # {
+        #   "data": {
+        #     "build": {
+        #       "tasks": [
+        #         {
+        #           "id": "6321184690667520",
+        #           "status": "COMPLETED"
+        #         },
+        #         ...
+        # Start every build with an empty list, so task IDs of a previously
+        # processed build are never re-submitted, and reading it cannot fail.
+        truncate -s 0 ./artifacts/rerun_tids.txt
+        msg "::group::Selecting failed/aborted tasks to re-run"
+        jq -r -e '.data.build.tasks[] | join(" ")' <<<"$task_id_status" | \
+            while read -r TID STATUS; do
+                if [[ -z "$TID" ]] || [[ -z "$STATUS" ]]; then
+                    # assume empty line and/or end of file
+                    msg "Skipping TID '$TID' with status '$STATUS'"
+                    continue
+                # Failed task dependencies will have 'aborted' status
+                elif [[ "$STATUS" == "FAILED" ]] || [[ "$STATUS" == "ABORTED" ]]; then
+                    msg "Rerunning build $BID task $TID"
+                    # Must send result through a file into rerun_tasks array
+                    # because this section is executing in a child-shell
+                    echo "$TID" >> ./artifacts/rerun_tids.txt
+                fi
+            done
+        declare -a rerun_tasks
+        mapfile -t rerun_tasks <./artifacts/rerun_tids.txt
+        msg "::endgroup::"
+
+        if [[ "${#rerun_tasks[*]}" -eq 0 ]]; then
+            msg "No tasks to re-run for build $BID"
+            continue;
+        fi
+
+        msg "::warning::Rerunning ${#rerun_tasks[*]} tasks for build $BID"
+        # Check-value returned if the gql call was successful
+        canary=$(uuidgen)
+        # Ensure the trailing ',' is stripped from the end (would be invalid JSON)
+        task_ids=$(printf '[%s]' $(printf '"%s",' ${rerun_tasks[@]} | head -c -1))
+        rerun_m="
+            mutation {
+              batchReRun(input: {
+                clientMutationId: \"$canary\",
+                taskIds: $task_ids
+                }
+              ) {
+                clientMutationId
+              }
+            }
+        "
+        filter='.data.batchReRun.clientMutationId'
+        result=$(gql "$rerun_m" "$filter")
+        if [[ $(jq -r -e "$filter"<<<"$result") != "$canary" ]]; then
+            err "Attempt to re-run tasks for build $BID failed: ${rerun_tasks[@]}"
+        fi
+    done
diff --git a/.github/workflows/rerun_cirrus_cron.yml b/.github/workflows/rerun_cirrus_cron.yml
new file mode 100644
index 0000000000..7fd01b071d
--- /dev/null
+++ b/.github/workflows/rerun_cirrus_cron.yml
@@ -0,0 +1,66 @@
+---
+
+# Format Ref: https://docs.github.com/en/free-pro-team@latest/actions/reference/workflow-syntax-for-github-actions
+
+# Required to un-FUBAR default ${{github.workflow}} value
+name: rerun_cirrus_cron
+
+on:
+  # Note: This only applies to the main branch.
+  schedule:
+    # N/B: This should fire about an hour prior to check_cirrus_cron
+    # so the re-runs have a chance to complete.
+    - cron: '59 22 * * 1-5'
+  # Debug: Allow triggering job manually in github-actions WebUI
+  workflow_dispatch: {}
+
+env:
+  # Debug-mode can reveal secrets, only enable by a secret value.
+  # Ref: https://help.github.com/en/actions/configuring-and-managing-workflows/managing-a-workflow-run#enabling-step-debug-logging
+  ACTIONS_STEP_DEBUG: '${{ secrets.ACTIONS_STEP_DEBUG }}'
+  # CSV listing of e-mail addresses for delivery failure or error notices
+  RCPTCSV: rh.container.bot@gmail.com,podman-monitor@lists.podman.io
+  # Filename for table of cron-name to build-id data
+  # (must be in $GITHUB_WORKSPACE/artifacts/)
+  NAME_ID_FILEPATH: './artifacts/name_id.txt'
+
+permissions:
+  contents: read
+
+jobs:
+  cron_failures:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@629c2de402a417ea7690ca6ce3f33229e27606a5  # v2
+        with:
+          persist-credentials: false
+
+      - name: Get failed cron names and Build IDs
+        id: cron
+        run: './.github/actions/check_cirrus_cron/cron_failures.sh'
+
+      - if: steps.cron.outputs.failures > 0
+        shell: bash
+        env:
+          # Required by rerun_failed_tasks.sh.  N/B: Assumes a repository
+          # secret with this name has been defined - confirm before merging.
+          SECRET_CIRRUS_API_KEY: ${{ secrets.SECRET_CIRRUS_API_KEY }}
+        run: './.github/actions/check_cirrus_cron/rerun_failed_tasks.sh'
+
+      - uses: actions/upload-artifact@82c141cc518b40d92cc801eee768e7aafc9c2fa2  # v2
+        with:
+          name: ${{ github.job }}_artifacts
+          path: artifacts/*
+
+      - if: failure()
+        name: Send error notification e-mail
+        uses: dawidd6/action-send-mail@a80d851dc950256421f1d1d735a2dc1ef314ac8f  # v2.2.2
+        with:
+          server_address: ${{secrets.ACTION_MAIL_SERVER}}
+          server_port: 465
+          username: ${{secrets.ACTION_MAIL_USERNAME}}
+          password: ${{secrets.ACTION_MAIL_PASSWORD}}
+          subject: Github workflow error on ${{github.repository}}
+          to: ${{env.RCPTCSV}}
+          from: ${{secrets.ACTION_MAIL_SENDER}}
+          body: "Job failed: https://github.com/${{github.repository}}/actions/runs/${{github.run_id}}"