SERVER-123671 add bazel jvm dumper (#51601)

GitOrigin-RevId: d509f7aaf828f49d3a61afb47eb723818494ad68
This commit is contained in:
Daniel Moody 2026-04-13 11:16:06 -05:00 committed by MongoDB Bot
parent 3b350b159c
commit 74626bfa4a
8 changed files with 342 additions and 17 deletions

View File

@ -148,6 +148,7 @@ post:
- func: "generate hang analyzer tasks"
- func: "attach bazel invocation text"
- func: "save failed tests"
- func: "save bazel jvm dump"
- func: "save hang analyzer debugger files"
- func: "save disk statistics"
- func: "save system resource information"
@ -180,6 +181,8 @@ post:
# Timeout steps
timeout:
- func: "f_expansions_write"
- func: "signal bazel quit"
- func: "save bazel jvm dump"
- func: "run hang analyzer"
- func: "wait for resmoke to shutdown"
- func: "save bazel run logs"

View File

@ -3325,6 +3325,16 @@ functions:
display_name: Bazel Run Logs
optional: true
"signal bazel quit":
- *f_expansions_write
- command: subprocess.exec
display_name: "signal bazel quit"
params:
binary: bash
args:
- "src/evergreen/collect_bazel_jvm_dump.sh"
- "--signal-bazel-quit"
"archive hang analyzer debugger files": &archive_hang_analyzer_debugger_files
command: s3.put
display_name: "archive hang analyzer debugger files"
@ -3420,18 +3430,25 @@ functions:
display_name: Bazel Header List
"save bazel jvm dump":
command: s3.put
display_name: "attach bazel jvm dump"
params:
optional: true
aws_key: ${aws_key}
aws_secret: ${aws_secret}
local_file: src/jvm.out.tar.gz
remote_file: ${project}/${build_variant}/${revision}/jvm.out.${task_id}-${execution}.tar.gz
bucket: mciuploads
permissions: public-read
content_type: application/gzip
display_name: Bazel JVM dump
- *f_expansions_write
- command: subprocess.exec
display_name: "collect bazel jvm dump"
params:
binary: bash
args:
- "src/evergreen/collect_bazel_jvm_dump.sh"
- command: s3.put
display_name: "attach bazel jvm dump"
params:
optional: true
aws_key: ${aws_key}
aws_secret: ${aws_secret}
local_file: src/jvm.out.tar.gz
remote_file: ${project}/${build_variant}/${revision}/jvm.out.${task_id}-${execution}.tar.gz
bucket: mciuploads
permissions: public-read
content_type: application/gzip
display_name: Bazel JVM dump
"save bazel exec logs":
command: s3.put

View File

@ -102,6 +102,8 @@ variables:
- func: "cleanup environment"
timeout:
- func: "f_expansions_write"
- func: "signal bazel quit"
- func: "save bazel jvm dump"
- func: "run hang analyzer"
- func: "wait for resmoke to shutdown"
- func: "save bazel run logs"

View File

@ -76,6 +76,8 @@ variables:
- func: "cleanup environment"
timeout:
- func: "f_expansions_write"
- func: "signal bazel quit"
- func: "save bazel jvm dump"
- func: "run hang analyzer"
- func: "wait for resmoke to shutdown"
- func: "save bazel run logs"

View File

@ -1166,6 +1166,7 @@ task_groups:
teardown_task:
- func: "s3.put bazel build events"
- func: "collect bazel debug logs"
- func: "save bazel jvm dump"
- func: "debug full disk"
- func: "attach bazel invocation"
- func: "save failed tests"

View File

@ -145,6 +145,7 @@ variables:
- func: "generate hang analyzer tasks"
- func: "attach bazel invocation text"
- func: "save failed tests"
- func: "save bazel jvm dump"
- func: "save hang analyzer debugger files"
- func: "save disk statistics"
- func: "save system resource information"
@ -191,6 +192,8 @@ variables:
- func: "cleanup environment"
timeout:
- func: "f_expansions_write"
- func: "signal bazel quit"
- func: "save bazel jvm dump"
- func: "run hang analyzer"
- func: "wait for resmoke to shutdown"
- func: "save bazel run logs"

View File

@ -176,9 +176,9 @@ bazel_evergreen_shutils::timeout_prefix() {
# Produce the prefix if we have a binary
if [[ -n "$timeout_bin" ]]; then
if [[ "$need_timeout" == "explicit" ]]; then
echo "$timeout_bin ${build_timeout_seconds}"
echo "$timeout_bin -s QUIT ${build_timeout_seconds}"
elif [[ "$need_timeout" == "fallback" ]]; then
echo "$timeout_bin 3600"
echo "$timeout_bin -s QUIT 3600"
else
echo ""
fi
@ -191,7 +191,7 @@ bazel_evergreen_shutils::timeout_prefix() {
bazel_evergreen_shutils::bazel_output_base() {
local BAZEL_BINARY="$1"
"$BAZEL_BINARY" info output_base 2>/dev/null
"$BAZEL_BINARY" --batch info output_base 2>/dev/null || "$BAZEL_BINARY" info output_base 2>/dev/null
}
bazel_evergreen_shutils::bazel_pidfile_path() {
@ -201,12 +201,20 @@ bazel_evergreen_shutils::bazel_pidfile_path() {
echo "${ob}/server/server.pid.txt"
}
bazel_evergreen_shutils::is_bazel_server_running() {
# Print the PID recorded in the Bazel server pidfile.
# Arguments: $1 - bazel binary, forwarded to bazel_pidfile_path
# Outputs:   the PID on stdout
# Returns:   1 when the pidfile path cannot be resolved, the pidfile does not
#            exist, or its content is not a plain decimal number
bazel_evergreen_shutils::bazel_server_pid() {
    local BAZEL_BINARY="$1"
    local pidfile recorded_pid

    if ! pidfile="$(bazel_evergreen_shutils::bazel_pidfile_path "$BAZEL_BINARY")"; then
        return 1
    fi
    if [[ ! -f "$pidfile" ]]; then
        return 1
    fi

    # A read failure yields an empty string, which the numeric check rejects.
    recorded_pid="$(cat "$pidfile" 2>/dev/null || true)"
    if [[ ! "$recorded_pid" =~ ^[0-9]+$ ]]; then
        return 1
    fi

    echo "$recorded_pid"
}
bazel_evergreen_shutils::is_bazel_server_running() {
local BAZEL_BINARY="$1"
local pid
pid="$(bazel_evergreen_shutils::bazel_server_pid "$BAZEL_BINARY" 2>/dev/null || true)"
[[ -n "$pid" ]] || return 1
if kill -0 "$pid" 2>/dev/null; then
return 0
@ -230,10 +238,259 @@ bazel_evergreen_shutils::print_bazel_server_pid() {
fi
}
# List live processes whose command line matches "java.*bazel", using only
# pgrep (no bazel client invocation).
# Outputs: one PID per line on stdout, de-duplicated, in pgrep order
# Returns: 1 when no matching process is alive
bazel_evergreen_shutils::fast_bazel_server_pids() {
    local candidate
    local -a alive=()
    local -A already_checked=()

    while IFS= read -r candidate; do
        # Ignore anything that is not a bare PID, and PIDs we already probed.
        [[ "$candidate" =~ ^[0-9]+$ ]] || continue
        [[ -z "${already_checked[$candidate]:-}" ]] || continue
        already_checked["$candidate"]=1

        # kill -0 only tests that the process exists and can be signalled.
        kill -0 "$candidate" 2>/dev/null || continue
        alive+=("$candidate")
    done < <(pgrep -f "java.*bazel" 2>/dev/null || true)

    if ((${#alive[@]} == 0)); then
        return 1
    fi
    printf '%s\n' "${alive[@]}"
}
# List live bazel server PIDs, combining the PID from the server pidfile with
# every process matching "java.*bazel".
# Arguments: $1 - bazel binary, forwarded to bazel_pidfile_path
# Outputs:   one PID per line on stdout, de-duplicated, pidfile PID first
# Returns:   1 when none of the candidates is alive
bazel_evergreen_shutils::bazel_server_pids() {
    local BAZEL_BINARY="$1"
    local pidfile candidate
    local -a candidates=()
    local -a alive=()
    local -A visited=()

    # Candidate 1: the PID recorded in the pidfile, if it is readable.
    pidfile="$(bazel_evergreen_shutils::bazel_pidfile_path "$BAZEL_BINARY" 2>/dev/null)" || true
    if [[ -f "$pidfile" ]]; then
        candidate="$(cat "$pidfile" 2>/dev/null || true)"
        if [[ "$candidate" =~ ^[0-9]+$ ]]; then
            candidates+=("$candidate")
        fi
    fi

    # Remaining candidates: whatever pgrep reports.
    while IFS= read -r candidate; do
        [[ "$candidate" =~ ^[0-9]+$ ]] || continue
        candidates+=("$candidate")
    done < <(pgrep -f "java.*bazel" 2>/dev/null || true)

    # Keep each PID once, and only if the process still exists.
    for candidate in "${candidates[@]}"; do
        [[ -z "${visited[$candidate]:-}" ]] || continue
        visited["$candidate"]=1
        kill -0 "$candidate" 2>/dev/null || continue
        alive+=("$candidate")
    done

    if ((${#alive[@]} == 0)); then
        return 1
    fi
    printf '%s\n' "${alive[@]}"
}
# Print every bazel server pidfile found under the output-root locations this
# function knows about: ${HOME}/.cache/bazel, /private/var/tmp and /var/tmp.
# Outputs: one pidfile path per line on stdout (nothing when none exist)
# Returns: 0
bazel_evergreen_shutils::bazel_cache_pidfiles() {
    local pidfile
    local enabled_nullglob=0

    # Turn nullglob on only if the caller did not already have it, so we can
    # put the option back exactly as we found it.
    if ! shopt -q nullglob; then
        shopt -s nullglob
        enabled_nullglob=1
    fi

    # The globs are written inline with the variable part quoted: expanding a
    # pattern stored in a variable would word-split a HOME containing
    # whitespace and silently match nothing.
    for pidfile in \
        "${HOME}"/.cache/bazel/_bazel_*/*/server/server.pid.txt \
        /private/var/tmp/_bazel_*/*/server/server.pid.txt \
        /var/tmp/_bazel_*/*/server/server.pid.txt; do
        if [[ -f "$pidfile" ]]; then
            echo "$pidfile"
        fi
    done

    if ((enabled_nullglob)); then
        shopt -u nullglob
    fi
    return 0
}
# Find the bazel server pidfile whose content equals the given PID.
# Arguments: $1 - server PID (decimal digits only)
# Outputs:   path of the first matching pidfile on stdout
# Returns:   1 when $1 is not numeric or no pidfile records that PID
bazel_evergreen_shutils::bazel_pidfile_path_for_pid() {
    local server_pid="$1"
    local candidate_file recorded_pid

    if [[ ! "$server_pid" =~ ^[0-9]+$ ]]; then
        return 1
    fi

    while IFS= read -r candidate_file; do
        # An unreadable pidfile yields an empty string and never matches.
        recorded_pid="$(cat "$candidate_file" 2>/dev/null || true)"
        [[ "$recorded_pid" == "$server_pid" ]] || continue
        echo "$candidate_file"
        return 0
    done < <(bazel_evergreen_shutils::bazel_cache_pidfiles)

    return 1
}
# Send SIGQUIT to every process reported by fast_bazel_server_pids, then wait
# briefly so the dump can be written.
# Arguments: $1 - bazel binary (accepted for interface symmetry; not used here)
# Outputs:   progress messages on stderr
# Returns:   1 when no process could be signalled
bazel_evergreen_shutils::request_bazel_jvm_dump() {
    local BAZEL_BINARY="$1"
    local target_pid
    local signaled_pid=0

    echo "Scanning for bazel server processes to signal." >&2

    while IFS= read -r target_pid; do
        if [[ -z "$target_pid" ]]; then
            continue
        fi

        echo "Sending SIGQUIT to bazel process ${target_pid}" >&2
        if ! kill -QUIT "$target_pid" 2>/dev/null; then
            echo "Failed to send SIGQUIT to bazel process ${target_pid}" >&2
            continue
        fi
        signaled_pid=1
    done < <(bazel_evergreen_shutils::fast_bazel_server_pids || true)

    if ((signaled_pid == 0)); then
        echo "No bazel process found to signal." >&2
        return 1
    fi

    # Bazel's JVM writes thread dumps asynchronously after SIGQUIT.
    sleep 5
}
# Print the relative directory name in which jvm.out snapshots are collected.
# Outputs: the directory name on stdout
bazel_evergreen_shutils::bazel_jvm_out_snapshot_dir() {
    printf '%s\n' "bazel_jvm_outs"
}
# Locate the jvm.out file belonging to the bazel server with the given PID.
# The output base is taken to be two directory levels above the matching
# pidfile; <output_base>/server/jvm.out is preferred over <output_base>/jvm.out.
# Arguments: $1 - server PID
# Outputs:   path of the jvm.out file on stdout; a diagnostic on stderr when
#            the output base has no jvm.out
# Returns:   1 when no pidfile matches the PID or no jvm.out file exists
bazel_evergreen_shutils::bazel_jvm_out_path_for_pid() {
    local server_pid="$1"
    local pidfile server_dir output_base jvm_out

    if ! pidfile="$(bazel_evergreen_shutils::bazel_pidfile_path_for_pid "$server_pid")"; then
        return 1
    fi
    server_dir="$(dirname "$pidfile")"
    output_base="$(dirname "$server_dir")"

    jvm_out="${output_base}/server/jvm.out"
    if [[ ! -f "$jvm_out" ]]; then
        jvm_out="${output_base}/jvm.out"
    fi

    if [[ -f "$jvm_out" ]]; then
        echo "$jvm_out"
        return 0
    fi

    echo "No bazel jvm.out file found for pid ${server_pid} under ${output_base}" >&2
    return 1
}
# Locate jvm.out under the output base reported by bazel_output_base.
# <output_base>/server/jvm.out is preferred over <output_base>/jvm.out.
# Arguments: $1 - bazel binary, forwarded to bazel_output_base
# Outputs:   path of the jvm.out file on stdout; diagnostics on stderr
# Returns:   1 when the output base cannot be determined or holds no jvm.out
bazel_evergreen_shutils::bazel_jvm_out_path() {
    local BAZEL_BINARY="$1"
    local output_base probe

    if ! output_base="$(bazel_evergreen_shutils::bazel_output_base "$BAZEL_BINARY")"; then
        echo "Unable to determine bazel output_base" >&2
        return 1
    fi

    for probe in "${output_base}/server/jvm.out" "${output_base}/jvm.out"; do
        if [[ -f "$probe" ]]; then
            echo "$probe"
            return 0
        fi
    done

    echo "No bazel jvm.out file found under ${output_base}" >&2
    return 1
}
# Copy the current bazel jvm.out into the snapshot directory under a unique,
# timestamped name.
#
# The server PID is taken from $2, else from the server pidfile, else from the
# first PID reported by fast_bazel_server_pids. jvm.out is looked up by PID
# first and via the bazel output base second.
#
# Arguments: $1 - bazel binary
#            $2 - (optional) PID of the bazel server to capture
# Outputs:   path of the snapshot (relative to the current directory) on
#            stdout; progress and error messages on stderr
# Returns:   1 when no jvm.out can be located or the snapshot cannot be written
bazel_evergreen_shutils::capture_bazel_jvm_out() {
    local BAZEL_BINARY="$1"
    local server_pid="${2:-}"
    # jvm_out_path starts out empty: it is tested below even when no PID was
    # found, and reading an unset local is fatal under `set -u`.
    local jvm_out_path="" snapshot_dir timestamp output_prefix output_file
    local capture_index=1

    if [[ -z "$server_pid" ]]; then
        server_pid="$(bazel_evergreen_shutils::bazel_server_pid "$BAZEL_BINARY" 2>/dev/null || true)"
    fi
    if [[ -z "$server_pid" ]]; then
        # Take the first live PID; server_pid ends up empty when there is none.
        while IFS= read -r server_pid; do
            if [[ -n "$server_pid" ]]; then
                break
            fi
        done < <(bazel_evergreen_shutils::fast_bazel_server_pids || true)
    fi

    if [[ -n "$server_pid" ]]; then
        jvm_out_path="$(bazel_evergreen_shutils::bazel_jvm_out_path_for_pid "$server_pid" 2>/dev/null || true)"
    fi
    if [[ -z "$jvm_out_path" ]]; then
        jvm_out_path="$(bazel_evergreen_shutils::bazel_jvm_out_path "$BAZEL_BINARY" 2>/dev/null || true)"
    fi
    [[ -n "$jvm_out_path" ]] || return 1

    snapshot_dir="$(bazel_evergreen_shutils::bazel_jvm_out_snapshot_dir)"
    # Callers invoke this function under `|| true`, where errexit is disabled,
    # so failures must be propagated explicitly.
    mkdir -p "$snapshot_dir" || return 1

    timestamp="$(date +%Y%m%d_%H%M%S)"
    output_prefix="${snapshot_dir}/bazel_jvm_out_pid${server_pid:-unknown}_${timestamp}"

    # Several captures can land within the same second; add a numeric suffix
    # rather than overwrite an earlier snapshot.
    output_file="${output_prefix}.txt"
    while [[ -e "$output_file" ]]; do
        output_file="${output_prefix}_${capture_index}.txt"
        capture_index=$((capture_index + 1))
    done

    if ! cp -- "$jvm_out_path" "$output_file"; then
        echo "Failed to copy bazel jvm.out from ${jvm_out_path} to $(pwd)/${output_file}" >&2
        return 1
    fi

    echo "Captured bazel jvm.out from ${jvm_out_path} to $(pwd)/${output_file}" >&2
    echo "$output_file"
}
# Print the entries of a snapshot directory, one per line.
# Uses an existence check instead of toggling nullglob so the caller's shell
# options are left untouched.
bazel_evergreen_shutils::_list_bazel_jvm_out_snapshots() {
    local dir="$1"
    local entry
    for entry in "$dir"/*; do
        # Without nullglob an unmatched glob stays literal; skip it.
        [[ -e "$entry" ]] || continue
        printf '%s\n' "$entry"
    done
    return 0
}

# Bundle all captured jvm.out snapshots into a gzip-compressed tarball.
#
# A fresh snapshot is captured first when a live bazel process is found or when
# no snapshot exists yet. A failed capture is tolerated as long as earlier
# snapshots are available.
#
# Arguments: $1 - bazel binary, forwarded to capture_bazel_jvm_out
#            $2 - (optional) archive path, default "jvm.out.tar.gz"
# Outputs:   progress and error messages on stderr
# Returns:   1 when there is nothing to archive or the archive cannot be
#            created (no partial archive is left behind)
bazel_evergreen_shutils::package_bazel_jvm_out() {
    local BAZEL_BINARY="$1"
    local archive_path="${2:-jvm.out.tar.gz}"
    local snapshot_dir live_server_pid=""
    local -a snapshots=()

    snapshot_dir="$(bazel_evergreen_shutils::bazel_jvm_out_snapshot_dir)"
    mkdir -p "$snapshot_dir" || return 1
    mapfile -t snapshots < <(bazel_evergreen_shutils::_list_bazel_jvm_out_snapshots "$snapshot_dir")

    # Take the first live PID; live_server_pid ends up empty when there is none.
    while IFS= read -r live_server_pid; do
        if [[ -n "$live_server_pid" ]]; then
            break
        fi
    done < <(bazel_evergreen_shutils::fast_bazel_server_pids || true)

    if [[ -n "$live_server_pid" || ${#snapshots[@]} -eq 0 ]]; then
        if ! bazel_evergreen_shutils::capture_bazel_jvm_out "$BAZEL_BINARY" "$live_server_pid" >/dev/null; then
            if [[ ${#snapshots[@]} -eq 0 ]]; then
                return 1
            fi
        fi
        mapfile -t snapshots < <(bazel_evergreen_shutils::_list_bazel_jvm_out_snapshots "$snapshot_dir")
    fi

    if ((${#snapshots[@]} == 0)); then
        echo "No captured bazel jvm.out files found under $(pwd)/${snapshot_dir}" >&2
        return 1
    fi

    rm -f -- "$archive_path"
    # Callers invoke this function under `|| true`, where errexit is disabled,
    # so a tar failure has to be detected explicitly. Remove any truncated
    # archive so a later upload step does not publish a corrupt file.
    if ! tar -czf "$archive_path" -C "$(dirname "$snapshot_dir")" "$(basename "$snapshot_dir")"; then
        echo "Failed to archive bazel jvm dump file(s) from $(pwd)/${snapshot_dir} to $(pwd)/${archive_path}" >&2
        rm -f -- "$archive_path"
        return 1
    fi
    echo "Archived ${#snapshots[@]} bazel jvm dump file(s) from $(pwd)/${snapshot_dir} to $(pwd)/${archive_path}" >&2
}
bazel_evergreen_shutils::jstack_bazel() {
# Find all bazel processes (Java processes with "bazel" in command line)
local pids
pids=$(pgrep -f "java.*bazel" || true)
pids=$(bazel_evergreen_shutils::fast_bazel_server_pids || true)
if [[ -z "$pids" ]]; then
return 1
fi
@ -310,6 +567,9 @@ bazel_evergreen_shutils::retry_bazel_cmd() {
bazel_evergreen_shutils::print_bazel_server_pid "$BAZEL_BINARY" >&2
fi
local attempt_bazel_server_pid=""
attempt_bazel_server_pid="$(bazel_evergreen_shutils::bazel_server_pid "$BAZEL_BINARY" 2>/dev/null || true)"
# Reassemble the caller's words into a single command string for eval.
# We deliberately do *not* try to be clever here—this restores legacy behavior
# where quoted pieces inside variables (e.g., --base_dir="..") are honored by the shell.
@ -375,6 +635,8 @@ bazel_evergreen_shutils::retry_bazel_cmd() {
bazel_evergreen_shutils::print_bazel_server_pid "$BAZEL_BINARY" >&2
elif [[ $RET -eq 124 ]]; then
echo "Bazel timed out." >&2
bazel_evergreen_shutils::request_bazel_jvm_dump "$BAZEL_BINARY" || true
bazel_evergreen_shutils::capture_bazel_jvm_out "$BAZEL_BINARY" "$attempt_bazel_server_pid" >/dev/null || true
"$BAZEL_BINARY" shutdown || true
else
if [[ ${RETRY_ON_FAIL:-0} -eq 1 ]]; then

View File

@ -0,0 +1,35 @@
#!/usr/bin/env bash
# Bazel jvm.out collector.
#
# Usage: collect_bazel_jvm_dump.sh [--signal-bazel-quit]
#
#   (no arguments)       Package the available bazel jvm.out snapshots into
#                        src/jvm.out.tar.gz.
#   --signal-bazel-quit  Only ask live bazel server processes for a fresh dump
#                        (SIGQUIT) and exit 0; nothing is packaged.
#
# Must be started from the directory that contains src/. Signalling and
# packaging are best effort: their failures do not fail the script.

DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
. "$DIR/prelude.sh"
. "$DIR/bazel_evergreen_shutils.sh"

set -o errexit
set -o pipefail

signal_bazel_quit=false
case "${1:-}" in
--signal-bazel-quit)
    signal_bazel_quit=true
    shift
    ;;
esac

# Anything left over is an argument we do not understand.
if (($# != 0)); then
    echo "Usage: $0 [--signal-bazel-quit]" >&2
    exit 1
fi

cd src

BAZEL_BINARY="$(bazel_evergreen_shutils::bazel_get_binary_path)"
ARCHIVE_PATH="jvm.out.tar.gz"

if [[ "$signal_bazel_quit" == true ]]; then
    bazel_evergreen_shutils::request_bazel_jvm_dump "$BAZEL_BINARY" || true
    exit 0
fi

# Drop any stale archive so a failed packaging run cannot re-upload old data.
rm -f "$ARCHIVE_PATH"
bazel_evergreen_shutils::package_bazel_jvm_out "$BAZEL_BINARY" "$ARCHIVE_PATH" || true