diff --git a/etc/evergreen_yml_components/configuration.yml b/etc/evergreen_yml_components/configuration.yml index 6a10dd7eb93..277301a86d3 100644 --- a/etc/evergreen_yml_components/configuration.yml +++ b/etc/evergreen_yml_components/configuration.yml @@ -148,6 +148,7 @@ post: - func: "generate hang analyzer tasks" - func: "attach bazel invocation text" - func: "save failed tests" + - func: "save bazel jvm dump" - func: "save hang analyzer debugger files" - func: "save disk statistics" - func: "save system resource information" @@ -180,6 +181,8 @@ post: # Timeout steps timeout: - func: "f_expansions_write" + - func: "signal bazel quit" + - func: "save bazel jvm dump" - func: "run hang analyzer" - func: "wait for resmoke to shutdown" - func: "save bazel run logs" diff --git a/etc/evergreen_yml_components/definitions.yml b/etc/evergreen_yml_components/definitions.yml index 8e6a146c745..963095b4029 100644 --- a/etc/evergreen_yml_components/definitions.yml +++ b/etc/evergreen_yml_components/definitions.yml @@ -3325,6 +3325,16 @@ functions: display_name: Bazel Run Logs optional: true + "signal bazel quit": + - *f_expansions_write + - command: subprocess.exec + display_name: "signal bazel quit" + params: + binary: bash + args: + - "src/evergreen/collect_bazel_jvm_dump.sh" + - "--signal-bazel-quit" + "archive hang analyzer debugger files": &archive_hang_analyzer_debugger_files command: s3.put display_name: "archive hang analyzer debugger files" @@ -3420,18 +3430,25 @@ functions: display_name: Bazel Header List "save bazel jvm dump": - command: s3.put - display_name: "attach bazel jvm dump" - params: - optional: true - aws_key: ${aws_key} - aws_secret: ${aws_secret} - local_file: src/jvm.out.tar.gz - remote_file: ${project}/${build_variant}/${revision}/jvm.out.${task_id}-${execution}.tar.gz - bucket: mciuploads - permissions: public-read - content_type: application/gzip - display_name: Bazel JVM dump + - *f_expansions_write + - command: subprocess.exec + display_name: 
"collect bazel jvm dump" + params: + binary: bash + args: + - "src/evergreen/collect_bazel_jvm_dump.sh" + - command: s3.put + display_name: "attach bazel jvm dump" + params: + optional: true + aws_key: ${aws_key} + aws_secret: ${aws_secret} + local_file: src/jvm.out.tar.gz + remote_file: ${project}/${build_variant}/${revision}/jvm.out.${task_id}-${execution}.tar.gz + bucket: mciuploads + permissions: public-read + content_type: application/gzip + display_name: Bazel JVM dump "save bazel exec logs": command: s3.put diff --git a/etc/evergreen_yml_components/tasks/compile_tasks.yml b/etc/evergreen_yml_components/tasks/compile_tasks.yml index b303ac5064b..d44afedb63b 100644 --- a/etc/evergreen_yml_components/tasks/compile_tasks.yml +++ b/etc/evergreen_yml_components/tasks/compile_tasks.yml @@ -102,6 +102,8 @@ variables: - func: "cleanup environment" timeout: - func: "f_expansions_write" + - func: "signal bazel quit" + - func: "save bazel jvm dump" - func: "run hang analyzer" - func: "wait for resmoke to shutdown" - func: "save bazel run logs" diff --git a/etc/evergreen_yml_components/tasks/compile_tasks_shared.yml b/etc/evergreen_yml_components/tasks/compile_tasks_shared.yml index 53bb5c49559..e88bdb3c1ee 100644 --- a/etc/evergreen_yml_components/tasks/compile_tasks_shared.yml +++ b/etc/evergreen_yml_components/tasks/compile_tasks_shared.yml @@ -76,6 +76,8 @@ variables: - func: "cleanup environment" timeout: - func: "f_expansions_write" + - func: "signal bazel quit" + - func: "save bazel jvm dump" - func: "run hang analyzer" - func: "wait for resmoke to shutdown" - func: "save bazel run logs" diff --git a/etc/evergreen_yml_components/tasks/resmoke/non_server_teams/tasks.yml b/etc/evergreen_yml_components/tasks/resmoke/non_server_teams/tasks.yml index f8a69658f7e..82c39c75461 100644 --- a/etc/evergreen_yml_components/tasks/resmoke/non_server_teams/tasks.yml +++ b/etc/evergreen_yml_components/tasks/resmoke/non_server_teams/tasks.yml @@ -1166,6 +1166,7 @@ task_groups: 
teardown_task: - func: "s3.put bazel build events" - func: "collect bazel debug logs" + - func: "save bazel jvm dump" - func: "debug full disk" - func: "attach bazel invocation" - func: "save failed tests" diff --git a/etc/evergreen_yml_components/tasks/resmoke/server_divisions/clusters_and_integrations/tasks.yml b/etc/evergreen_yml_components/tasks/resmoke/server_divisions/clusters_and_integrations/tasks.yml index 59a290f5ef8..a78ac230980 100644 --- a/etc/evergreen_yml_components/tasks/resmoke/server_divisions/clusters_and_integrations/tasks.yml +++ b/etc/evergreen_yml_components/tasks/resmoke/server_divisions/clusters_and_integrations/tasks.yml @@ -145,6 +145,7 @@ variables: - func: "generate hang analyzer tasks" - func: "attach bazel invocation text" - func: "save failed tests" + - func: "save bazel jvm dump" - func: "save hang analyzer debugger files" - func: "save disk statistics" - func: "save system resource information" @@ -191,6 +192,8 @@ variables: - func: "cleanup environment" timeout: - func: "f_expansions_write" + - func: "signal bazel quit" + - func: "save bazel jvm dump" - func: "run hang analyzer" - func: "wait for resmoke to shutdown" - func: "save bazel run logs" diff --git a/evergreen/bazel_evergreen_shutils.sh b/evergreen/bazel_evergreen_shutils.sh index fccb6e1db46..884469825da 100644 --- a/evergreen/bazel_evergreen_shutils.sh +++ b/evergreen/bazel_evergreen_shutils.sh @@ -176,9 +176,9 @@ bazel_evergreen_shutils::timeout_prefix() { # Produce the prefix if we have a binary if [[ -n "$timeout_bin" ]]; then if [[ "$need_timeout" == "explicit" ]]; then - echo "$timeout_bin ${build_timeout_seconds}" + echo "$timeout_bin -s QUIT ${build_timeout_seconds}" elif [[ "$need_timeout" == "fallback" ]]; then - echo "$timeout_bin 3600" + echo "$timeout_bin -s QUIT 3600" else echo "" fi @@ -191,7 +191,7 @@ bazel_evergreen_shutils::timeout_prefix() { bazel_evergreen_shutils::bazel_output_base() { local BAZEL_BINARY="$1" - "$BAZEL_BINARY" info output_base 
2>/dev/null + "$BAZEL_BINARY" --batch info output_base 2>/dev/null || "$BAZEL_BINARY" info output_base 2>/dev/null } bazel_evergreen_shutils::bazel_pidfile_path() { @@ -201,12 +201,20 @@ bazel_evergreen_shutils::bazel_pidfile_path() { echo "${ob}/server/server.pid.txt" } -bazel_evergreen_shutils::is_bazel_server_running() { +bazel_evergreen_shutils::bazel_server_pid() { local BAZEL_BINARY="$1" local pf pid pf="$(bazel_evergreen_shutils::bazel_pidfile_path "$BAZEL_BINARY")" || return 1 [[ -f "$pf" ]] || return 1 pid="$(cat "$pf" 2>/dev/null || true)" + [[ "$pid" =~ ^[0-9]+$ ]] || return 1 + echo "$pid" +} + +bazel_evergreen_shutils::is_bazel_server_running() { + local BAZEL_BINARY="$1" + local pid + pid="$(bazel_evergreen_shutils::bazel_server_pid "$BAZEL_BINARY" 2>/dev/null || true)" [[ -n "$pid" ]] || return 1 if kill -0 "$pid" 2>/dev/null; then return 0 @@ -230,10 +238,259 @@ bazel_evergreen_shutils::print_bazel_server_pid() { fi } +bazel_evergreen_shutils::fast_bazel_server_pids() { + local pid + local -a live_pids=() + local -A seen_pids=() + + while IFS= read -r pid; do + if [[ ! 
"$pid" =~ ^[0-9]+$ ]] || [[ -n "${seen_pids[$pid]:-}" ]]; then + continue + fi + seen_pids["$pid"]=1 + if kill -0 "$pid" 2>/dev/null; then + live_pids+=("$pid") + fi + done < <(pgrep -f "java.*bazel" 2>/dev/null || true) + + if [[ ${#live_pids[@]} -eq 0 ]]; then + return 1 + fi + + printf '%s\n' "${live_pids[@]}" +} + +bazel_evergreen_shutils::bazel_server_pids() { + local BAZEL_BINARY="$1" + local pf pid + local -a candidate_pids=() + local -a live_pids=() + local -A seen_pids=() + + pf="$(bazel_evergreen_shutils::bazel_pidfile_path "$BAZEL_BINARY" 2>/dev/null)" || true + if [[ -f "$pf" ]]; then + pid="$(cat "$pf" 2>/dev/null || true)" + if [[ "$pid" =~ ^[0-9]+$ ]]; then + candidate_pids+=("$pid") + fi + fi + + while IFS= read -r pid; do + if [[ "$pid" =~ ^[0-9]+$ ]]; then + candidate_pids+=("$pid") + fi + done < <(pgrep -f "java.*bazel" 2>/dev/null || true) + + for pid in "${candidate_pids[@]}"; do + if [[ -n "${seen_pids[$pid]:-}" ]]; then + continue + fi + seen_pids["$pid"]=1 + if kill -0 "$pid" 2>/dev/null; then + live_pids+=("$pid") + fi + done + + if [[ ${#live_pids[@]} -eq 0 ]]; then + return 1 + fi + + printf '%s\n' "${live_pids[@]}" +} + +bazel_evergreen_shutils::bazel_cache_pidfiles() { + local pattern pidfile + local -a patterns=( + "${HOME}/.cache/bazel/_bazel_*/*/server/server.pid.txt" + "/private/var/tmp/_bazel_*/*/server/server.pid.txt" + "/var/tmp/_bazel_*/*/server/server.pid.txt" + ) + + shopt -s nullglob + for pattern in "${patterns[@]}"; do + for pidfile in $pattern; do + [[ -f "$pidfile" ]] && echo "$pidfile" + done + done + shopt -u nullglob +} + +bazel_evergreen_shutils::bazel_pidfile_path_for_pid() { + local server_pid="$1" + local pidfile candidate_pid + + [[ "$server_pid" =~ ^[0-9]+$ ]] || return 1 + + while IFS= read -r pidfile; do + candidate_pid="$(cat "$pidfile" 2>/dev/null || true)" + if [[ "$candidate_pid" == "$server_pid" ]]; then + echo "$pidfile" + return 0 + fi + done < <(bazel_evergreen_shutils::bazel_cache_pidfiles) + + return 
1 +} + +bazel_evergreen_shutils::request_bazel_jvm_dump() { + local BAZEL_BINARY="$1" + local pid + local signaled_pid=0 + + echo "Scanning for bazel server processes to signal." >&2 + + while IFS= read -r pid; do + [[ -z "$pid" ]] && continue + echo "Sending SIGQUIT to bazel process ${pid}" >&2 + if kill -QUIT "$pid" 2>/dev/null; then + signaled_pid=1 + else + echo "Failed to send SIGQUIT to bazel process ${pid}" >&2 + fi + done < <(bazel_evergreen_shutils::fast_bazel_server_pids || true) + + if [[ "$signaled_pid" -eq 0 ]]; then + echo "No bazel process found to signal." >&2 + return 1 + fi + + # Bazel's JVM writes thread dumps asynchronously after SIGQUIT. + sleep 5 +} + +bazel_evergreen_shutils::bazel_jvm_out_snapshot_dir() { + echo "bazel_jvm_outs" +} + +bazel_evergreen_shutils::bazel_jvm_out_path_for_pid() { + local server_pid="$1" + local pidfile output_base candidate + + pidfile="$(bazel_evergreen_shutils::bazel_pidfile_path_for_pid "$server_pid")" || return 1 + output_base="$(dirname "$(dirname "$pidfile")")" + + for candidate in "${output_base}/server/jvm.out" "${output_base}/jvm.out"; do + if [[ -f "$candidate" ]]; then + echo "$candidate" + return 0 + fi + done + + echo "No bazel jvm.out file found for pid ${server_pid} under ${output_base}" >&2 + return 1 +} + +bazel_evergreen_shutils::bazel_jvm_out_path() { + local BAZEL_BINARY="$1" + local output_base jvm_out_path="" + local candidate + + output_base="$(bazel_evergreen_shutils::bazel_output_base "$BAZEL_BINARY")" || { + echo "Unable to determine bazel output_base" >&2 + return 1 + } + + for candidate in "${output_base}/server/jvm.out" "${output_base}/jvm.out"; do + if [[ -f "$candidate" ]]; then + jvm_out_path="$candidate" + break + fi + done + + if [[ -z "$jvm_out_path" ]]; then + echo "No bazel jvm.out file found under ${output_base}" >&2 + return 1 + fi + + echo "$jvm_out_path" +} + +bazel_evergreen_shutils::capture_bazel_jvm_out() { + local BAZEL_BINARY="$1" + local server_pid="${2:-}" + local 
jvm_out_path snapshot_dir timestamp output_prefix output_file + local capture_index=1 + + if [[ -z "$server_pid" ]]; then + server_pid="$(bazel_evergreen_shutils::bazel_server_pid "$BAZEL_BINARY" 2>/dev/null || true)" + fi + if [[ -z "$server_pid" ]]; then + while IFS= read -r server_pid; do + [[ -n "$server_pid" ]] && break + done < <(bazel_evergreen_shutils::fast_bazel_server_pids || true) + fi + + if [[ -n "$server_pid" ]]; then + jvm_out_path="$(bazel_evergreen_shutils::bazel_jvm_out_path_for_pid "$server_pid" 2>/dev/null || true)" + fi + if [[ -z "$jvm_out_path" ]]; then + jvm_out_path="$(bazel_evergreen_shutils::bazel_jvm_out_path "$BAZEL_BINARY" 2>/dev/null || true)" + fi + [[ -n "$jvm_out_path" ]] || return 1 + + snapshot_dir="$(bazel_evergreen_shutils::bazel_jvm_out_snapshot_dir)" + mkdir -p "$snapshot_dir" + + timestamp=$(date +%Y%m%d_%H%M%S) + if [[ -n "$server_pid" ]]; then + output_prefix="${snapshot_dir}/bazel_jvm_out_pid${server_pid}_${timestamp}" + else + output_prefix="${snapshot_dir}/bazel_jvm_out_pidunknown_${timestamp}" + fi + + output_file="${output_prefix}.txt" + while [[ -e "$output_file" ]]; do + output_file="${output_prefix}_${capture_index}.txt" + ((capture_index++)) + done + + cp "$jvm_out_path" "$output_file" + echo "Captured bazel jvm.out from ${jvm_out_path} to $(pwd)/${output_file}" >&2 + echo "$output_file" +} + +bazel_evergreen_shutils::package_bazel_jvm_out() { + local BAZEL_BINARY="$1" + local archive_path="${2:-jvm.out.tar.gz}" + local snapshot_dir live_server_pid="" + local -a snapshots=() + + snapshot_dir="$(bazel_evergreen_shutils::bazel_jvm_out_snapshot_dir)" + mkdir -p "$snapshot_dir" + + shopt -s nullglob + snapshots=("${snapshot_dir}"/*) + shopt -u nullglob + + while IFS= read -r live_server_pid; do + [[ -n "$live_server_pid" ]] && break + done < <(bazel_evergreen_shutils::fast_bazel_server_pids || true) + + if [[ -n "$live_server_pid" || ${#snapshots[@]} -eq 0 ]]; then + bazel_evergreen_shutils::capture_bazel_jvm_out 
"$BAZEL_BINARY" "$live_server_pid" >/dev/null || { + if [[ ${#snapshots[@]} -eq 0 ]]; then + return 1 + fi + } + shopt -s nullglob + snapshots=("${snapshot_dir}"/*) + shopt -u nullglob + fi + + if [[ ${#snapshots[@]} -eq 0 ]]; then + echo "No captured bazel jvm.out files found under $(pwd)/${snapshot_dir}" >&2 + return 1 + fi + + rm -f "$archive_path" + tar -czf "$archive_path" -C "$(dirname "$snapshot_dir")" "$(basename "$snapshot_dir")" + echo "Archived ${#snapshots[@]} bazel jvm dump file(s) from $(pwd)/${snapshot_dir} to $(pwd)/${archive_path}" >&2 +} + bazel_evergreen_shutils::jstack_bazel() { # Find all bazel processes (Java processes with "bazel" in command line) local pids - pids=$(pgrep -f "java.*bazel" || true) + pids=$(bazel_evergreen_shutils::fast_bazel_server_pids || true) if [[ -z "$pids" ]]; then return 1 fi @@ -310,6 +567,9 @@ bazel_evergreen_shutils::retry_bazel_cmd() { bazel_evergreen_shutils::print_bazel_server_pid "$BAZEL_BINARY" >&2 fi + local attempt_bazel_server_pid="" + attempt_bazel_server_pid="$(bazel_evergreen_shutils::bazel_server_pid "$BAZEL_BINARY" 2>/dev/null || true)" + # Reassemble the caller’s words into a single command string for eval. # We deliberately do *not* try to be clever here—this restores legacy behavior # where quoted pieces inside variables (e.g., --base_dir="..") are honored by the shell. @@ -375,6 +635,8 @@ bazel_evergreen_shutils::retry_bazel_cmd() { bazel_evergreen_shutils::print_bazel_server_pid "$BAZEL_BINARY" >&2 elif [[ $RET -eq 124 ]]; then echo "Bazel timed out." 
>&2 + bazel_evergreen_shutils::request_bazel_jvm_dump "$BAZEL_BINARY" || true + bazel_evergreen_shutils::capture_bazel_jvm_out "$BAZEL_BINARY" "$attempt_bazel_server_pid" >/dev/null || true "$BAZEL_BINARY" shutdown || true else if [[ ${RETRY_ON_FAIL:-0} -eq 1 ]]; then diff --git a/evergreen/collect_bazel_jvm_dump.sh b/evergreen/collect_bazel_jvm_dump.sh new file mode 100644 index 00000000000..433afe27472 --- /dev/null +++ b/evergreen/collect_bazel_jvm_dump.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash + +# Packages bazel jvm.out snapshots into a task-local tarball. When invoked +# with --signal-bazel-quit it only sends SIGQUIT to live bazel server processes +# to request a fresh dump and exits; run it again without the flag to package. +DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" +. "$DIR/prelude.sh" +. "$DIR/bazel_evergreen_shutils.sh" + +set -o errexit +set -o pipefail + +signal_bazel_quit=false +if [[ "${1:-}" == "--signal-bazel-quit" ]]; then + signal_bazel_quit=true + shift +fi + +if [[ "$#" -ne 0 ]]; then + echo "Usage: $0 [--signal-bazel-quit]" >&2 + exit 1 +fi + +cd src + +BAZEL_BINARY="$(bazel_evergreen_shutils::bazel_get_binary_path)" +ARCHIVE_PATH="jvm.out.tar.gz" + +if $signal_bazel_quit; then + bazel_evergreen_shutils::request_bazel_jvm_dump "$BAZEL_BINARY" || true + exit 0 +fi + +rm -f "$ARCHIVE_PATH" +bazel_evergreen_shutils::package_bazel_jvm_out "$BAZEL_BINARY" "$ARCHIVE_PATH" || true