SERVER-117045 Trigger core analysis tasks from bazel resmoke tasks (#47599)

GitOrigin-RevId: 3d697ff268a027bf991f42a9a066bc731e330a25
This commit is contained in:
Sean Lyons 2026-02-05 16:05:33 -05:00 committed by MongoDB Bot
parent 11d409d41b
commit 8ea21ae1d3
28 changed files with 1670 additions and 255 deletions

2
.github/CODEOWNERS vendored
View File

@ -98,6 +98,8 @@ WORKSPACE.bazel @10gen/devprod-build @svc-auto-approve-bot
# The following patterns are parsed from ./buildscripts/bazel_testbuilds/OWNERS.yml
/buildscripts/bazel_testbuilds/ @10gen/devprod-build @svc-auto-approve-bot
/buildscripts/bazel_testbuilds/jstest_timeout* @10gen/devprod-correctness @svc-auto-approve-bot
/buildscripts/bazel_testbuilds/verify_resmoke_coredump_test.sh @10gen/devprod-correctness @svc-auto-approve-bot
# The following patterns are parsed from ./buildscripts/cost_model/OWNERS.yml
/buildscripts/cost_model/**/* @10gen/query-optimization @svc-auto-approve-bot

View File

@ -34,7 +34,7 @@ fi
main_pid=$!
echo "Process-under-test started with PID: ${main_pid}"
# This is mocked out in buildscripts/bazel_testbuilds/verify_coredump_test.sh, make sure
# This is mocked out in buildscripts/bazel_testbuilds/verify_unittest_coredump_test.sh, make sure
# to update the test if this is changed.
timeout_seconds=600

View File

@ -481,6 +481,7 @@ py_binary(
srcs = [
"create_rbe_sysroot.py",
],
visibility = ["//visibility:public"],
deps = [
"local_rbe_container_url",
],

View File

@ -143,6 +143,7 @@ def main(outfile: Annotated[str, typer.Option()], build_events: str = "build_eve
"content_type": "text/plain",
},
),
FunctionCall("generate result task hang analyzer"),
],
teardown_group=[
FunctionCall("kill processes"),

View File

@ -1,6 +1,7 @@
# Bazel build definitions used to test bazel features during build system development.
load("//bazel:mongo_src_rules.bzl", "mongo_cc_binary", "mongo_cc_unit_test")
load("//bazel/resmoke:resmoke.bzl", "resmoke_suite_test")
package(default_visibility = ["//visibility:public"])
@ -42,3 +43,25 @@ mongo_cc_unit_test(
"manual",
],
)
resmoke_suite_test(
name = "jstest_timeout",
srcs = [
"jstest_timeout.js",
],
config = "jstest_timeout.yml",
data = [
"//buildscripts/resmokeconfig:common_jstest_data",
],
resmoke_args = [
"--testTimeout=10", # The test sleeps for 30 seconds, so it should get killed.
],
tags = [
# Manual tag to prevent this from running in normal test suites
# since it's designed to fail.
"manual",
],
deps = [
"//src/mongo/shell:mongo",
],
)

View File

@ -3,3 +3,9 @@ filters:
- "*":
approvers:
- 10gen/devprod-build
- "jstest_timeout*":
approvers:
- 10gen/devprod-correctness
- "verify_resmoke_coredump_test.sh":
approvers:
- 10gen/devprod-correctness

View File

@ -0,0 +1 @@
// Sleep for 30 seconds so that the jstest_timeout suite, which runs with --testTimeout=10,
// times out and this test gets killed.
sleep(30000);

View File

@ -0,0 +1,6 @@
test_kind: js_test
executor:
config:
shell_options:
nodb: ""

View File

@ -0,0 +1,122 @@
#!/bin/bash
#
# Test script to verify that test timeouts in resmoke_suite_test generate coredumps and that those
# coredumps are picked up for core analysis tasks.
#
# This script:
#   1. Runs a `bazel test` on a resmoke suite that is expected to fail and generate a core.
#   2. Runs evergreen/fetch_remote_test_results.sh, which downloads test outputs for the remotely executed test.
#   3. Runs the gen_hang_analyzer_tasks script that generates an Evergreen task config for core analysis.
#
# Usage:
#   ./buildscripts/bazel_testbuilds/verify_resmoke_coredump_test.sh
#
# Exit codes:
#   0 - Success (coredump and generated task config were created)
#   1 - Failure

set -o pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"

# Change to repo root for bazel commands
cd "$REPO_ROOT" || exit 1

TEST_TARGET="//buildscripts/bazel_testbuilds:jstest_timeout"

# Cleanup function to remove temp directories
TEMP_DIR=""
cleanup() {
    # Clean up temp directory if it was created
    if [[ -n "${TEMP_DIR}" && -d "${TEMP_DIR}" ]]; then
        rm -rf "${TEMP_DIR}" 2>/dev/null || true
    fi
    bazel shutdown
}
trap cleanup EXIT

echo "=== Coredump Generation Verification Test ==="
echo "Test target: ${TEST_TARGET}"
echo "Repository root: ${REPO_ROOT}"
echo ""

# Run the bazel test (expected to fail) with the remote_test config.
# --curses=no and --color=no prevent interactive output that might cause hangs in CI.
# The command is built once so that the echoed command always matches what is executed.
BAZEL_TEST_CMD=(
    bazel test
    --config=remote_test
    --zip_undeclared_test_outputs
    --build_event_json_file=build_events.json
    --curses=no
    --color=no
    "${TEST_TARGET}"
)
echo "Running bazel test (this test is expected to fail)..."
echo "Command: ${BAZEL_TEST_CMD[*]}"
echo ""
"${BAZEL_TEST_CMD[@]}" 2>&1 && BAZEL_EXIT_CODE=0 || BAZEL_EXIT_CODE=$?
echo ""
echo "Bazel test exit code: ${BAZEL_EXIT_CODE}"
echo ""

# The test should fail (exit code != 0)
if [[ "${BAZEL_EXIT_CODE}" -eq 0 ]]; then
    echo "ERROR: Test unexpectedly passed. The timeout mechanism may not have triggered."
    exit 1
fi
echo "Test failed as expected. Now fetching remote test results..."
echo ""

# Fetch the remote test results. In reality this would be run on a different host than the one
# that ran `bazel test`.
TEMP_DIR=$(mktemp -d)
# The EngFlow credential paths are resolved from the original workdir, before it is overridden below.
export ENGFLOW_KEY="${workdir}/src/engflow.key"
export ENGFLOW_CERT="${workdir}/src/engflow.cert"
export workdir="$TEMP_DIR" # Change workdir so the script downloads outputs to the temporary dir, rather than task workdir.
export test_label="$TEST_TARGET"
bash ./evergreen/fetch_remote_test_results.sh
echo ""
unset workdir # Unset workdir, it's a default Evergreen expansion that might confuse a later script.

OUTPUTS_DIR="${TEMP_DIR}"/results/buildscripts/bazel_testbuilds/jstest_timeout/shard_1/test.outputs

# List all files in the test output directory for debugging.
if [[ -d "${OUTPUTS_DIR}" ]]; then
    echo "Contents of ${OUTPUTS_DIR}:"
    find "${OUTPUTS_DIR}" -type f 2>/dev/null | head -50
    echo ""
else
    echo "FAILED: Test output directory not found: ${OUTPUTS_DIR}"
    exit 1
fi

# Look for the expected core file.
echo "Searching for coredump files in ${OUTPUTS_DIR}..."
CORE_FILES=$(find "${OUTPUTS_DIR}" -type f \( -name "*.core" -o -name "*.core.gz" -o -name "dump_*.core*" \) 2>/dev/null)
if [[ -n "${CORE_FILES}" ]]; then
    echo "SUCCESS: Coredump file(s) found:"
    echo "${CORE_FILES}"
else
    echo "FAILED: No coredump files found."
    exit 1
fi

# Create an expansions file that is like what will exist in the test tasks.
EXPANSIONS_FILE="${TEMP_DIR}/expansions.yml"
cat <<EOF >"${EXPANSIONS_FILE}"
core_analyzer_distro_name: amazon2023-arm64-atlas-latest-m8g-2xlarge
task_name: "${TEST_TARGET}"
task_id: task_id_123
execution: 0
build_variant: build_variant_123
core_analyzer_results_url: https://core_analyzer_results_url
workdir: "${TEMP_DIR}"
EOF

GENERATED_TASK_FILE="${TEMP_DIR}/generated_tasks.json"
bazel run //buildscripts/resmokelib/hang_analyzer:gen_hang_analyzer_tasks --config=remote_test -- --expansions-file="${EXPANSIONS_FILE}" --output-file="${GENERATED_TASK_FILE}" --tests-use-bazel --use-mock-tasks

# Success is determined by the presence of the generated task config, not the bazel exit code.
if [[ -f "${GENERATED_TASK_FILE}" ]]; then
    echo "SUCCESS: Created the Evergreen task config ${GENERATED_TASK_FILE}"
    cat "${GENERATED_TASK_FILE}"
    echo ""
else
    echo "FAILED: Did not generate an Evergreen task config at ${GENERATED_TASK_FILE}"
    exit 1
fi

View File

@ -7,7 +7,7 @@
# 2. Verifies that a coredump file is created in the test outputs
#
# Usage:
# ./buildscripts/bazel_testbuilds/verify_coredump_test.sh
# ./buildscripts/bazel_testbuilds/verify_unittest_coredump_test.sh
#
# Exit codes:
# 0 - Success (coredump was created)

View File

@ -12,32 +12,38 @@ if __name__ == "__main__" and __package__ is None:
from buildscripts.local_rbe_container_url import calculate_local_rbe_container_url
def main():
os.chdir(os.environ.get("BUILD_WORKSPACE_DIRECTORY", "."))
def create_rbe_sysroot(dir) -> bool:
container_url = calculate_local_rbe_container_url()
if container_url == "UNKNOWN":
print("Could not determine local RBE container URL, cannot create rbe sysroot")
return 1
return False
print(f"Using local RBE container URL: {container_url}")
container_cli = shutil.which("docker") or shutil.which("podman")
if not container_cli:
print("Error: Neither docker nor podman is installed.", file=sys.stderr)
sys.exit(1)
return False
cid = subprocess.check_output([container_cli, "create", container_url]).decode().strip()
os.makedirs("./rbe_sysroot", exist_ok=True)
os.makedirs(dir, exist_ok=True)
subprocess.run(["sudo", container_cli, "cp", f"{cid}:/", "./rbe_sysroot/"], check=True)
subprocess.run(["sudo", container_cli, "cp", f"{cid}:/", dir], check=True)
user = getpass.getuser()
subprocess.run(["sudo", "chown", "-R", f"{user}:{user}", "./rbe_sysroot"], check=True)
subprocess.run(["sudo", "chown", "-R", f"{user}:{user}", dir], check=True)
subprocess.run([container_cli, "rm", cid], check=True)
return 0
return True
def main():
os.chdir(os.environ.get("BUILD_WORKSPACE_DIRECTORY", "."))
success = create_rbe_sysroot("./rbe_sysroot")
return not success
if __name__ == "__main__":
exit(main())
sys.exit(main())

View File

@ -30,8 +30,7 @@ mongo_js_library(
resmoke_suite_test(
name = "core",
srcs = [
"//jstests/core:all_subpackage_javascript_files",
"//jstests/core_standalone:all_subpackage_javascript_files",
"//jstests/core/timeseries/pbt:timeseries_cache_usage_pbt.js",
],
config = ":suites/core.yml",
data = [
@ -58,7 +57,7 @@ resmoke_suite_test(
resmoke_args = [
"--storageEngineCacheSizeGB=1",
],
shard_count = 24,
shard_count = 1,
tags = [
"ci-development-critical-single-variant",
],

View File

@ -17,6 +17,7 @@ py_library(
],
visibility = ["//visibility:public"],
deps = [
"//buildscripts:create_rbe_sysroot",
"//buildscripts:simple_report",
"//buildscripts/resmokelib/run",
"//buildscripts/resmokelib/symbolizer",
@ -42,3 +43,14 @@ py_library(
),
],
)
py_binary(
name = "gen_hang_analyzer_tasks",
srcs = ["gen_hang_analyzer_tasks.py"],
main = "gen_hang_analyzer_tasks.py",
deps = [
"//buildscripts/resmokelib",
"//buildscripts/resmokelib/core",
"//buildscripts/resmokelib/hang_analyzer",
],
)

View File

@ -12,7 +12,6 @@ sys.path.append(mongo_path)
from buildscripts.resmokelib.hang_analyzer.gen_hang_analyzer_tasks import (
GENERATED_TASK_PREFIX,
RANDOM_STRING_LENGTH,
should_activate_core_analysis_task,
)
from buildscripts.resmokelib.utils import evergreen_conn
from buildscripts.util.read_config import read_config_file
@ -49,7 +48,9 @@ def maybe_attach_core_analyzer_task(
search_tasks = build.tasks
# The task id uses underscores instead of hyphens
task_id_search_term = f"{GENERATED_TASK_PREFIX}_{current_task_name.replace('-', '_')}"
task_id_search_term = (
f"{GENERATED_TASK_PREFIX}_{current_task_name.replace('-', '_').replace('/', '_')}"
)
matching_task = None
matching_execution = None
@ -99,16 +100,11 @@ def maybe_attach_core_analyzer_task(
if not gen_from_cur_execution:
return
if should_activate_core_analysis_task(current_task):
first_line = "Core analysis is in progress."
else:
first_line = "Core analysis was not scheduled because only archived fails were detected."
file_lines = [
first_line,
"This file will be overwritten with the results when core analysis is finished.",
"You can view the core analysis task at this here:",
core_analysis_task_url,
"If it was not scheduled, it can be manually activated.",
]
with open(results_output_file, "w") as file:

View File

@ -10,7 +10,10 @@ from opentelemetry.trace.status import StatusCode
from buildscripts.resmokelib import configure_resmoke
from buildscripts.resmokelib.hang_analyzer import dumper
from buildscripts.resmokelib.hang_analyzer.extractor import download_task_artifacts
from buildscripts.resmokelib.hang_analyzer.extractor import (
download_bazel_task_artifacts,
download_task_artifacts,
)
from buildscripts.resmokelib.plugin import PluginInterface, Subcommand
from buildscripts.resmokelib.utils.otel_utils import get_default_current_span
@ -29,6 +32,7 @@ class CoreAnalyzer(Subcommand):
self.boring_core_dump_pids = set(pid for pid in boring_pids_str.split(",") if pid)
else:
self.boring_core_dump_pids = set()
self.is_bazel_task = options.get("is_bazel_task", False)
self.root_logger = self.setup_logging(logger)
self.extra_otel_options = {}
for option in options["otel_extra_data"]:
@ -38,6 +42,7 @@ class CoreAnalyzer(Subcommand):
@TRACER.start_as_current_span("core_analyzer.execute")
def execute(self):
base_dir = self.options["working_dir"]
sysroot_dir = None
current_span = get_default_current_span(
{"failed_task_id": self.task_id} | self.extra_otel_options
)
@ -55,24 +60,51 @@ class CoreAnalyzer(Subcommand):
)
multiversion_dir = os.path.join(base_dir, "multiversion")
if not skip_download and not download_task_artifacts(
self.root_logger,
self.task_id,
base_dir,
dumpers.dbg,
multiversion_dir,
self.execution,
):
self.root_logger.error("Artifacts were not found.")
current_span.set_attributes(
{
"core_analyzer_execute_error": "Artifacts were not found.",
}
)
current_span.set_status(StatusCode.ERROR, description="Artifacts were not found.")
raise RuntimeError(
"Artifacts were not found for specified task. Could not analyze cores."
)
if self.is_bazel_task:
if not skip_download:
all_downloaded, sysroot_dir = download_bazel_task_artifacts(
self.root_logger,
self.task_id,
base_dir,
multiversion_dir,
self.execution,
)
if not all_downloaded:
self.root_logger.error("Artifacts were not found.")
current_span.set_attributes(
{
"core_analyzer_execute_error": "Artifacts were not found.",
}
)
current_span.set_status(
StatusCode.ERROR, description="Bazel artifacts were not found."
)
raise RuntimeError(
"Artifacts were not found for specified tasks. Could not analyze cores."
)
else:
if not skip_download and not download_task_artifacts(
self.root_logger,
self.task_id,
base_dir,
dumpers.dbg,
multiversion_dir,
self.execution,
):
self.root_logger.error("Artifacts were not found.")
current_span.set_attributes(
{
"core_analyzer_execute_error": "Artifacts were not found.",
}
)
current_span.set_status(
StatusCode.ERROR, description="Artifacts were not found."
)
raise RuntimeError(
"Artifacts were not found for specified task. Could not analyze cores."
)
with open(task_id_file, "w") as file:
file.write(self.task_id)
@ -92,6 +124,7 @@ class CoreAnalyzer(Subcommand):
install_dir,
analysis_dir,
multiversion_dir,
sysroot_dir,
self.gdb_index_cache,
self.boring_core_dump_pids,
)
@ -146,6 +179,13 @@ class CoreAnalyzerPlugin(PluginInterface):
help="Fetch corresponding core dumps and binaries for a given task id.",
)
parser.add_argument(
"--is-bazel-task",
action="store_true",
default=False,
help="Indicates that this is a bazel task and should use bazel-specific artifact download.",
)
parser.add_argument(
"--execution",
"-e",
@ -162,7 +202,7 @@ class CoreAnalyzerPlugin(PluginInterface):
action="store",
type=str,
default=None,
help="Directory that contains binaires and debugsymbols.",
help="Directory that contains binaries and debugsymbols.",
)
parser.add_argument(

View File

@ -15,7 +15,7 @@ from abc import ABCMeta, abstractmethod
from collections import namedtuple
from datetime import datetime, timedelta
from io import StringIO
from typing import List, Tuple
from typing import List, Optional, Tuple
import psutil
from opentelemetry import trace
@ -739,6 +739,7 @@ class GDBDumper(Dumper):
install_dir: str,
analysis_dir: str,
multiversion_dir: str,
sysroot_dir: Optional[str],
gdb_index_cache: str,
boring_core_dump_pids: set = None,
max_core_dumps: int = 10,
@ -783,6 +784,7 @@ class GDBDumper(Dumper):
exit_code, status = self.analyze_core(
core_file_path=core_file_path,
install_dir=install_dir,
sysroot_dir=sysroot_dir,
analysis_dir=analysis_dir,
tmp_dir=tmp_dir,
logger=logger,
@ -841,6 +843,7 @@ class GDBDumper(Dumper):
self,
core_file_path: str,
install_dir: str,
sysroot_dir: Optional[str],
analysis_dir: str,
tmp_dir: str,
multiversion_dir: str,
@ -881,6 +884,9 @@ class GDBDumper(Dumper):
logging_dir = os.path.join(analysis_dir, basename)
os.makedirs(logging_dir, exist_ok=True)
if sysroot_dir:
cmds.append(f"set sysroot {sysroot_dir}")
cmds += [
f"set solib-search-path {lib_dir}",
f"set index-cache directory {tmp_dir}",

View File

@ -12,6 +12,7 @@ import sys
import tarfile
import time
import urllib.request
import zipfile
from logging import Logger
from pathlib import Path
from typing import Callable, Optional
@ -20,8 +21,12 @@ from opentelemetry import trace
from opentelemetry.trace.status import StatusCode
from retry import retry
from buildscripts.create_rbe_sysroot import create_rbe_sysroot
from buildscripts.resmokelib.hang_analyzer.dumper import Dumper
from buildscripts.resmokelib.setup_multiversion.download import DownloadError
from buildscripts.resmokelib.setup_multiversion.download import (
DownloadError,
download_from_s3_with_requests,
)
from buildscripts.resmokelib.setup_multiversion.setup_multiversion import (
SetupMultiversion,
_DownloadOptions,
@ -441,10 +446,12 @@ def post_install_gdb_optimization(download_dir: str, root_looger: Logger):
root_looger.debug("Finished recalculating the debuglink for %s", file_path)
dist_dir = os.path.join(download_dir, "install", "dist-test")
bin_dir = os.path.join(dist_dir, "bin")
install_dir = os.path.join(download_dir, "install")
if os.path.exists(os.path.join(install_dir, "dist-test")):
install_dir = os.path.join(install_dir, "dist-test")
bin_dir = os.path.join(install_dir, "bin")
bin_files = [os.path.join(bin_dir, file_path) for file_path in os.listdir(bin_dir)]
lib_dir = os.path.join(dist_dir, "lib")
lib_dir = os.path.join(install_dir, "lib")
lib_files = []
if os.path.exists(lib_dir):
lib_files = [os.path.join(lib_dir, file_path) for file_path in os.listdir(lib_dir)]
@ -737,3 +744,261 @@ def _get_symbol_files():
for needle in glob.glob(haystack):
out.append((needle, os.path.join(os.getcwd(), os.path.basename(needle))))
return out
def check_manifest_for_cores(
    root_logger: Logger, manifest_url: str, timeout_secs: float = 60
) -> bool:
    """Check if a test.outputs manifest contains core dumps.

    :param root_logger: Logger used to report what the manifest indicates.
    :param manifest_url: URL of the manifest listing the contents of a test.outputs archive.
    :param timeout_secs: Maximum time to wait on the manifest request, so a stalled
        connection cannot block core analysis indefinitely.
    :return: True if the manifest text mentions a `.core` or `.mdmp` file, or if the
        manifest could not be read (the caller then downloads the archive to check);
        False if the manifest was read and mentions neither.
    """
    try:
        with urllib.request.urlopen(manifest_url, timeout=timeout_secs) as response:
            manifest_content = response.read().decode("utf-8")
    except Exception as ex:
        # Deliberately broad: any failure to read the manifest falls back to downloading.
        root_logger.warning(
            f"Could not read manifest: {ex}. Will download entire test outputs and check for cores."
        )
        return True  # If we can't read the manifest, assume cores might be present

    has_cores = ".core" in manifest_content or ".mdmp" in manifest_content
    if has_cores:
        root_logger.info("Manifest indicates core dumps are present")
    else:
        root_logger.info("Manifest indicates no core dumps")
    return has_cores
@TRACER.start_as_current_span("core_analyzer.download_bazel_result_task_cores")
def download_bazel_result_task_cores(root_logger: Logger, task_id: str, download_dir: str) -> bool:
    """Download core dumps from the test.outputs archives attached to a task.

    Each `test.outputs` zip artifact of the task is downloaded (unless its manifest
    shows no cores), and every `.core` / `.mdmp` member is extracted into
    `<download_dir>/core-dumps` under its base name.

    :param root_logger: Logger for progress and error reporting.
    :param task_id: Evergreen task id whose artifacts are inspected.
    :param download_dir: Directory used for temporary archives and the `core-dumps` output.
    :return: True if at least one core dump was extracted, False otherwise.
    """
    root_logger.info(f"Downloading cores from task {task_id}")
    current_span = get_default_current_span({"download_task_id": task_id})

    evg_api = evergreen_conn.get_evergreen_api()
    task_info = evg_api.task_by_id(task_id)

    core_dumps_dir = os.path.join(download_dir, "core-dumps")
    os.makedirs(core_dumps_dir, exist_ok=True)

    outputs_artifacts = []
    manifest_map = {}  # Map of outputs zip archive to its manifest
    for artifact in task_info.artifacts:
        if "test.outputs" in artifact.name and artifact.name.endswith(".zip"):
            outputs_artifacts.append(artifact)
        elif "_manifest__MANIFEST" in artifact.name:
            # Key the manifest by the name of the archive it describes.
            archive_name = artifact.name.replace("_manifest__MANIFEST", "__outputs.zip")
            manifest_map[archive_name] = artifact.url

    if not outputs_artifacts:
        root_logger.warning("No test.outputs artifacts found in result task")
        return False

    core_dumps_found = 0
    for artifact in outputs_artifacts:
        root_logger.info(f"Processing artifact: {artifact.name}")

        # Check manifest first to see if cores are present
        manifest_url = manifest_map.get(artifact.name)
        if manifest_url:
            if not check_manifest_for_cores(root_logger, manifest_url):
                root_logger.info(f"Skipping {artifact.name} - no cores in manifest")
                continue
        else:
            root_logger.warning(f"No manifest found for {artifact.name}, will download anyway")

        file_name = artifact.name
        zip_path = os.path.join(download_dir, file_name)
        try:

            @retry(tries=3, delay=5)
            def download_outputs_zip():
                root_logger.info(f"Downloading {file_name}")
                if os.path.exists(zip_path):
                    os.remove(zip_path)
                download_from_s3_with_requests(artifact.url, zip_path)

            # Called immediately, so the closure sees this iteration's artifact and path.
            download_outputs_zip()

            with zipfile.ZipFile(zip_path, "r") as zip_ref:
                for member in zip_ref.namelist():
                    if not member.endswith((".core", ".mdmp")):
                        continue
                    core_name = os.path.basename(member)
                    extract_path = os.path.join(core_dumps_dir, core_name)
                    root_logger.info(f"Extracting core dump: {core_name}")
                    with zip_ref.open(member) as source, open(extract_path, "wb") as target:
                        shutil.copyfileobj(source, target)
                    core_dumps_found += 1
        except Exception as ex:
            # Best effort: a broken archive must not prevent the remaining ones from being processed.
            root_logger.error(f"Error processing artifact {artifact.name}: {ex}")
            current_span.set_status(
                StatusCode.ERROR, f"Failed to download artifact {artifact.name}"
            )
            current_span.set_attribute("download_error", str(ex))
        finally:
            # Remove the archive even when the download or extraction failed, so a
            # partial zip is never left behind in the download directory.
            try:
                os.remove(zip_path)
            except FileNotFoundError:
                pass
            except OSError as ex:
                root_logger.warning(f"Could not remove {zip_path}: {ex}")

    root_logger.info(f"Downloaded {core_dumps_found} core dump(s)")
    if core_dumps_found == 0:
        root_logger.error("No core dumps found in test.outputs")
        current_span.set_status(StatusCode.ERROR, "No core dumps found")
        return False

    current_span.set_attribute("core_dumps_found", core_dumps_found)
    return True
def find_test_task_with_binaries(evg_api, results_task_id: str):
    """
    Find the resmoke test task in the same build as the given results task.

    For tasks whose display name contains `_burn_in_`, the task whose display name
    starts with `resmoke_tests_burn_in` is selected; otherwise the task named exactly
    `resmoke_tests` is selected.

    :param evg_api: Evergreen API client
    :param results_task_id: Id of the task whose build is searched.
    :return: The matching task object (not just its id), or None if the lookup failed
        or no unique matching task exists.
    """
    try:
        task = evg_api.task_by_id(results_task_id)
        tasks = evg_api.tasks_by_build(task.build_id)

        if "_burn_in_" in task.display_name:
            resmoke_tests_task = [
                t for t in tasks if t.display_name.startswith("resmoke_tests_burn_in")
            ]
        else:
            resmoke_tests_task = [t for t in tasks if t.display_name == "resmoke_tests"]

        # Explicit check rather than `assert`, so it is still enforced under `python -O`.
        if len(resmoke_tests_task) != 1:
            raise RuntimeError(
                f"Could not find a unique resmoke test task in this variant {task.build_variant_display_name}"
            )
        return resmoke_tests_task[0]
    except Exception as ex:
        # Any failure (API error or non-unique match) is reported and mapped to None.
        print(f"ERROR: Failed to query Evergreen for test task: {ex}")
        return None
@TRACER.start_as_current_span("core_analyzer.download_bazel_test_task_binaries")
def download_bazel_test_task_binaries(root_logger: Logger, task_id: str, download_dir: str) -> bool:
    """Download and extract the test binaries that belong to a bazel results task.

    The binaries are taken from the "Test binaries and libraries" artifact(s) of the
    resmoke test task in the same build, and extracted into `<download_dir>/install`
    with the archive's top-level `dist-tests` directory renamed to `dist-test`.

    :param root_logger: Logger for progress and error reporting.
    :param task_id: Evergreen id of the results task.
    :param download_dir: Directory used for the temporary archive and the `install` output.
    :return: True if the binaries were downloaded and extracted, False otherwise.
    """
    evg_api = evergreen_conn.get_evergreen_api()
    resmoke_task = find_test_task_with_binaries(evg_api, task_id)
    if resmoke_task is None:
        # The lookup reports its own error and returns None; fail cleanly instead of
        # raising AttributeError on `resmoke_task.task_id` below.
        root_logger.error(f"Could not find the resmoke test task with binaries for task {task_id}")
        return False

    root_logger.info(f"Downloading binaries from task {resmoke_task.task_id}")
    dist_tests_artifacts = [
        a for a in resmoke_task.artifacts if "Test binaries and libraries" in a.name
    ]

    if not dist_tests_artifacts:
        root_logger.error("No binary archive found in the resmoke_test task")
        return False

    install_dir = os.path.join(download_dir, "install")
    os.makedirs(install_dir, exist_ok=True)

    for artifact in dist_tests_artifacts:
        file_name = "resmoke_tests.tgz"
        download_path = os.path.join(download_dir, file_name)
        try:

            @retry(tries=3, delay=5)
            def download_binary_artifact():
                root_logger.info(f"Downloading {file_name}")
                if os.path.exists(download_path):
                    os.remove(download_path)
                download_from_s3_with_requests(artifact.url, download_path)

            # Called immediately, so the closure sees this iteration's artifact.
            download_binary_artifact()

            root_logger.info(f"Extracting {file_name}")
            with tarfile.open(download_path, "r:gz") as tar:
                # Extract members, mapping dist-tests -> dist-test
                for member in tar.getmembers():
                    if member.name == "dist-tests":
                        # The directory entry itself carries no trailing slash.
                        member.name = "dist-test"
                    elif member.name.startswith("dist-tests/"):
                        member.name = member.name.replace("dist-tests/", "dist-test/", 1)
                    tar.extract(member, install_dir)
            os.remove(download_path)
        except Exception as ex:
            root_logger.error(f"Error downloading/extracting {file_name}: {ex}")
            return False

    root_logger.info("Successfully downloaded and extracted binaries")
    return True
@TRACER.start_as_current_span("core_analyzer.download_bazel_task_artifacts")
def download_bazel_task_artifacts(
    root_logger: Logger,
    task_id: str,
    download_dir: str,
    multiversion_dir: Optional[str] = None,
    execution: Optional[int] = None,
    retry_secs: int = 10,
    download_timeout_secs: int = 30 * 60,
) -> tuple[bool, Optional[str]]:
    """Download core dumps and test binaries for a bazel task and prepare them for analysis.

    Any existing `download_dir` is deleted and recreated. Core dumps and binaries
    are then fetched in parallel. On Linux, when both downloads succeed, an RBE
    sysroot is created under `<download_dir>/rbe_sysroot` and the gdb
    post-install optimization is run.

    :param root_logger: Logger for progress and error reporting.
    :param task_id: Evergreen id of the bazel results task.
    :param download_dir: Directory to (re)create and download everything into.
    :param multiversion_dir: Accepted so the positional call made by the core analyzer
        (logger, task id, dir, multiversion dir, execution) binds correctly instead of
        landing in `retry_secs`. NOTE(review): currently unused by the bazel download path.
    :param execution: Task execution number; accepted for the same reason as
        `multiversion_dir`. NOTE(review): currently unused - confirm whether the
        artifact lookups should be execution-specific.
    :param retry_secs: Seconds to wait between download retries.
    :param download_timeout_secs: Overall timeout for each download.
    :return: `(all_downloaded, sysroot_dir)`, where `sysroot_dir` is the created sysroot
        path, or None when no sysroot was created.
    :raises RuntimeError: If `download_dir` looks like a git repository.
    """
    if os.path.exists(download_dir):
        # quick sanity check to ensure we don't delete a repo
        if os.path.exists(os.path.join(download_dir, ".git")):
            raise RuntimeError(f"Input dir cannot be a git repo: {download_dir}")

        shutil.rmtree(download_dir)
        root_logger.info(f"Deleted existing dir at {download_dir}")

    os.mkdir(download_dir)
    current_span = get_default_current_span({"task_id": task_id})

    # Download cores (from the result task) and binaries (from the test task) in parallel
    with OtelThreadPoolExecutor() as executor:
        futures = [
            executor.submit(
                run_with_retries,
                root_logger=root_logger,
                func=download_func,
                timeout_secs=download_timeout_secs,
                retry_secs=retry_secs,
                task_id=task_id,
                download_dir=download_dir,
            )
            for download_func in (
                download_bazel_result_task_cores,
                download_bazel_test_task_binaries,
            )
        ]

        all_downloaded = True
        for future in concurrent.futures.as_completed(futures):
            if not future.result():
                current_span.set_status(
                    StatusCode.ERROR, "Errors occurred while fetching artifacts"
                )
                current_span.set_attribute(
                    "download_bazel_task_artifacts_error",
                    "Errors occurred while fetching artifacts",
                )
                root_logger.error("Errors occurred while fetching bazel artifacts")
                all_downloaded = False
                break

    # Defined on every path so the return below cannot raise UnboundLocalError.
    sysroot = None
    if all_downloaded and sys.platform.startswith("linux"):
        sysroot = os.path.join(download_dir, "rbe_sysroot")
        if not create_rbe_sysroot(sysroot):
            # Without a populated sysroot, pointing gdb at it would hide the system libraries.
            root_logger.warning(
                "Could not create the RBE sysroot, core analysis will use the host's libraries"
            )
            sysroot = None
        post_install_gdb_optimization(download_dir, root_logger)

    return all_downloaded, sysroot

View File

@ -2,6 +2,7 @@
"""Generate a task to run core analysis on uploaded core dumps in evergreen."""
import argparse
import glob
import json
import os
import pathlib
@ -9,14 +10,15 @@ import random
import re
import string
import sys
from typing import List
from abc import ABC, abstractmethod
from typing import List, NamedTuple, Optional
from unittest.mock import MagicMock
from shrub.v2 import BuildVariant, FunctionCall, ShrubProject, Task, TaskDependency
from shrub.v2.command import BuiltInCommand
from buildscripts.resmokelib.core.process import BORING_CORE_DUMP_PIDS_FILE
from buildscripts.resmokelib.hang_analyzer import dumper
from evergreen.api import RetryingEvergreenApi
from evergreen.task import Task as EvgTask
mongo_path = pathlib.Path(__file__).parents[3]
@ -32,6 +34,288 @@ LOCAL_BIN_DIR = os.path.join("dist-test", "bin")
MULTIVERSION_BIN_DIR = os.path.normpath("/data/multiversion")
class CoreInfo(NamedTuple):
    """Description of a single core dump discovered for the current task."""

    # Core dump file name as discovered by the task generator.
    path: str
    # Name of the binary that produced the core dump.
    binary_name: str
    # Process id associated with the core dump, kept as a string.
    pid: str
    # True when the pid was listed among the "boring" core dump pids.
    marked_boring: bool
class CoreAnalysisTaskGenerator(ABC):
    """Base class that generates an Evergreen core analysis task for the current task.

    Subclasses define how core dumps are located (`find_cores`) and which commands
    the generated task runs (`get_core_analyzer_commands`); `generate` assembles
    the resulting shrub project configuration.
    """

    def __init__(
        self,
        expansions_file: str = "../expansions.yml",
        use_mock_tasks: bool = False,
    ):
        """Read the task expansions and set up the Evergreen API client.

        :param expansions_file: Path to the Evergreen expansions file of the current task.
        :param use_mock_tasks: Use a `MagicMock` Evergreen API that exposes a single
            `resmoke_tests` task instead of a real API client.
        """
        self.expansions = read_config_file(expansions_file)
        # Stays None when no API client could be created; generate() then skips generation
        # instead of failing on a missing attribute.
        self.evg_api = None

        if use_mock_tasks:
            task = MagicMock()
            task.display_name = "resmoke_tests"
            task.id = "resmoke_tests_task_id_123"
            build = MagicMock()
            build.get_tasks.return_value = [task]
            self.evg_api = MagicMock()
            self.evg_api.build_by_id.return_value = build
            return

        try:
            self.evg_api = evergreen_conn.get_evergreen_api()
        except RuntimeError:
            print(
                "WARNING: Cannot generate core analysis because the evergreen api file could not be found.",
                file=sys.stderr,
            )
            print(
                "This is probably not an error, if you want core analysis to run on this task make sure",
                file=sys.stderr,
            )
            print(
                "the evergreen function 'configure evergreen api credentials' is called before this task",
                file=sys.stderr,
            )

    @abstractmethod
    def get_core_analyzer_commands(
        self,
        task_id: str,
        execution: str,
        core_analyzer_results_url: str,
        gdb_index_cache: str,
        has_interesting_core_dumps: bool,
        boring_core_dump_pids: set,
    ) -> List[FunctionCall]:
        """Return the commands the generated core analysis task should run."""

    @abstractmethod
    def find_cores(self) -> list[CoreInfo]:
        """Return the core dumps found for the current task."""

    def get_core_analysis_task_dependencies(self, compile_variant: str) -> set[TaskDependency]:
        """Return the dependencies of the generated task; none by default."""
        return set()

    def generate(self) -> Optional[dict]:
        """Build the Evergreen configuration for the core analysis task.

        :return: The generated project configuration as a dict, or None when generation
            is skipped (unsupported platform, no Evergreen API client, skipped build
            variant, the current task is itself a generated task, or no core dumps).
        """
        if not sys.platform.startswith("linux"):
            print("This platform is not supported, skipping core analysis task generation.")
            return None

        if self.evg_api is None:
            # The reason was already printed to stderr by __init__.
            return None

        # gather information from the current task being run
        distro = None
        for distro_expansion in ["core_analyzer_distro_name", "large_distro_name", "distro_id"]:
            if distro := self.expansions.get(distro_expansion, None):
                break
        assert distro is not None

        current_task_name = self.expansions.get("task_name")
        task_id = self.expansions.get("task_id")
        execution = self.expansions.get("execution")
        gdb_index_cache = (
            "off" if self.expansions.get("core_analyzer_gdb_index_cache") == "off" else "on"
        )
        build_variant_name = self.expansions.get("build_variant")
        core_analyzer_results_url = self.expansions.get("core_analyzer_results_url")
        compile_variant = self.expansions.get("compile_variant")

        task_info = self.evg_api.task_by_id(task_id)
        skip_variants = ["commit-queue"]
        if task_info.build_variant in skip_variants:
            print(f"Skipping core analysis task generation for variant: {task_info.build_variant}")
            return None

        # make sure we are not creating an infinite loop by generating a task from another generated task
        if current_task_name.startswith(GENERATED_TASK_PREFIX):
            print(
                f"Skipping task generation because {current_task_name} starts with {GENERATED_TASK_PREFIX}"
            )
            return None

        cores = self.find_cores()
        if not cores:
            print("No core dumps found.")
            return None

        boring_cores = [core for core in cores if core.marked_boring]
        interesting_cores = [core for core in cores if not core.marked_boring]
        boring_core_dump_pids = {core.pid for core in boring_cores}

        print(f"Detected core dumps: {[core.path for core in cores]}")
        print(f"Core dumps marked as boring by resmoke: {[core.path for core in boring_cores]}")
        if not interesting_cores:
            print(
                "No interesting core dumps were found. Not activating the core analysis task. It is still generated, but must be manually activated."
            )

        should_activate = len(interesting_cores) > 0 and not self._should_skip_task(task_info)

        build_variant = BuildVariant(name=build_variant_name)
        commands = self.get_core_analyzer_commands(
            task_id,
            execution,
            core_analyzer_results_url,
            gdb_index_cache,
            should_activate,
            boring_core_dump_pids,
        )

        deps = self.get_core_analysis_task_dependencies(compile_variant)
        sub_tasks = {Task(get_generated_task_name(current_task_name, execution), commands, deps)}
        build_variant.add_tasks(sub_tasks, distros=[distro], activate=should_activate)
        shrub_project = ShrubProject.empty()
        shrub_project.add_build_variant(build_variant)

        # shrub.py currently does not support adding task deps that override the variant deps
        output_dict = shrub_project.as_dict()
        deps_list = [dep.as_dict() for dep in deps]
        for variant in output_dict["buildvariants"]:
            for task in variant["tasks"]:
                task["depends_on"] = deps_list

        return output_dict

    def _should_skip_task(self, task: EvgTask) -> bool:
        """Return True if the core analysis task must not be activated for this task.

        The check uses the display name of the parent task when the task has one.
        """
        # We hardcode some task names where the core analysis is extending the long pole
        # of required patch builds by 100 mins and the BFs are taking too long to fix.
        # This list is a quick fix to improve development velocity.
        # TODO(SERVER-118661): Remove disagg suites from skip list.
        skip_tasks = [
            "disagg_repl_jscore_passthrough",
            "disagg_repl_jscore_passthrough_secondary_reads",
            "disagg_sharded_colls_jscore_passthrough_secondary_reads_with_balancer",
            "disagg_two_nodes_repl_jscore_passthrough",
            "no_passthrough_disagg_override",
        ]

        current_task_name = task.display_name
        if task.parent_task_id:
            parent_task = self.evg_api.task_by_id(task.parent_task_id)
            current_task_name = parent_task.display_name
        if current_task_name in skip_tasks:
            print(f"Not activating core analysis task for task: {current_task_name}")
            return True
        return False
class ResmokeCoreAnalysisTaskGenerator(CoreAnalysisTaskGenerator):
    """Core analysis task generator for tasks whose core dumps are uploaded as task artifacts."""

    def get_core_analyzer_commands(
        self,
        task_id: str,
        execution: str,
        core_analyzer_results_url: str,
        gdb_index_cache: str,
        has_interesting_core_dumps: bool,
        boring_core_dump_pids: set,
    ) -> List[FunctionCall]:
        """Return the core analyzer commands, built by the shared module-level helper."""
        analyzer_args = (
            task_id,
            execution,
            core_analyzer_results_url,
            gdb_index_cache,
            has_interesting_core_dumps,
            boring_core_dump_pids,
        )
        return _get_core_analyzer_commands(*analyzer_args)

    def get_core_analysis_task_dependencies(self, compile_variant: str) -> set[TaskDependency]:
        """Return the debug-symbol archive task that the generated task depends on."""
        # TODO SERVER-92571 add archive_jstestshell_debug dep for variants that have it.
        debug_symbols_dep = TaskDependency("archive_dist_test_debug", compile_variant)
        return {debug_symbols_dep}

    def find_cores(self) -> list[CoreInfo]:
        """Collect the core dumps listed in the task's artifacts whose binary is available.

        A core dump is skipped when the binary that produced it is not present in the
        directory it is expected in (the multiversion directory when the core's name
        carries a binary version, the local bin directory otherwise).
        """
        found = []

        # LOCAL_BIN_DIR does not exist on non-resmoke tasks, so return early as there is no work to be done.
        if not os.path.exists(LOCAL_BIN_DIR):
            print(f"Skipping task generation because binary directory not found: {LOCAL_BIN_DIR}")
            return found

        # Pids that were recorded as boring; cores from them are flagged accordingly.
        boring_pids = set()
        if os.path.exists(BORING_CORE_DUMP_PIDS_FILE):
            with open(BORING_CORE_DUMP_PIDS_FILE, "r") as pids_file:
                boring_pids = set(pids_file.read().split())

        task_info = self.evg_api.task_by_id(self.expansions.get("task_id"))
        dumpers = dumper.get_dumpers(None, None)

        for artifact in task_info.artifacts:
            name_match = re.search(r"Core Dump [0-9]+ \((.*)\.gz\)", artifact.name)
            if name_match is None:
                continue

            core_file = name_match.group(1)
            binary_name, bin_version = dumpers.dbg.get_binary_from_core_dump(core_file)
            search_dir = MULTIVERSION_BIN_DIR if bin_version else LOCAL_BIN_DIR
            if binary_name not in os.listdir(search_dir):
                print(f"{core_file} was generated by {binary_name} but the binary was not found.")
                continue

            pid = get_core_pid(core_file)
            found.append(
                CoreInfo(
                    path=core_file,
                    binary_name=binary_name,
                    marked_boring=pid in boring_pids,
                    pid=pid,
                )
            )

        return found
class BazelCoreAnalysisTaskGenerator(CoreAnalysisTaskGenerator):
    """Core analysis task generator for bazel result tasks.

    Core dumps are discovered on disk under ``<workdir>/results/**/test.outputs``
    rather than from task artifacts.
    """

    def get_core_analyzer_commands(
        self,
        task_id: str,
        execution: str,
        core_analyzer_results_url: str,
        gdb_index_cache: str,
        has_interesting_core_dumps: bool,
        boring_core_dump_pids: set,
    ) -> List[FunctionCall]:
        """Return the commands of the generated task, built with ``is_bazel_task=True``."""
        return _get_core_analyzer_commands(
            task_id,
            execution,
            core_analyzer_results_url,
            gdb_index_cache,
            has_interesting_core_dumps,
            boring_core_dump_pids,
            is_bazel_task=True,
        )

    def find_cores(self) -> list[CoreInfo]:
        """Return a CoreInfo for every ``*.core`` / ``*.mdmp`` file in the test output directories.

        Each ``test.outputs`` directory may carry its own boring-PID file; a
        core is marked boring only against the file found next to it.
        ``binary_name`` is left empty for every returned core.

        Raises:
            AssertionError: if a core file name does not match the format
                expected by ``get_core_pid``.
        """
        cores = []
        results_dir = os.path.join(self.expansions.get("workdir"), "results")
        if not os.path.exists(results_dir):
            return cores

        # Search for core files in results/**/test.outputs/ directories
        output_dirs = glob.glob(os.path.join(results_dir, "**", "test.outputs"), recursive=True)
        for output_dir in output_dirs:
            boring_dump_file = os.path.join(output_dir, BORING_CORE_DUMP_PIDS_FILE)

            # Always a set, so the type does not depend on whether the file exists.
            boring_core_dump_pids = set()
            if os.path.exists(boring_dump_file):
                # Open the same per-directory path that was just checked, not the
                # bare file name, which would resolve against the working directory.
                with open(boring_dump_file, "r") as file:
                    boring_core_dump_pids = set(file.read().split())

            for extension in ("*.core", "*.mdmp"):
                for core in glob.glob(os.path.join(output_dir, extension)):
                    # Check if resmoke reported this core dump as a "boring one".
                    pid = get_core_pid(os.path.basename(core))
                    boring = pid in boring_core_dump_pids
                    cores.append(CoreInfo(path=core, binary_name="", marked_boring=boring, pid=pid))
        return cores
def get_generated_task_name(current_task_name: str, execution: str) -> str:
# random string so we do not define the same task name for multiple variants which causes issues
random_string = "".join(
@ -42,73 +326,23 @@ def get_generated_task_name(current_task_name: str, execution: str) -> str:
return f"{GENERATED_TASK_PREFIX}_{current_task_name}{execution}_{random_string}"
def should_activate_core_analysis_task(task: EvgTask, evg_api: RetryingEvergreenApi) -> bool:
    """Decide whether the generated core analysis task should be activated.

    Returns False for tasks on the hardcoded skip list and for tasks whose
    uploaded core dumps are all listed in the boring-PID file; returns True
    when at least one uploaded core dump PID is not marked boring.
    """
    # Core analysis for these suites extends the long pole of required patch
    # builds by roughly 100 minutes and the underlying BFs are slow to fix, so
    # they are hardcoded here as a stopgap for development velocity.
    # TODO(SERVER-118661): Remove disagg suites from skip list.
    skip_tasks = (
        "disagg_repl_jscore_passthrough",
        "disagg_repl_jscore_passthrough_secondary_reads",
        "disagg_sharded_colls_jscore_passthrough_secondary_reads_with_balancer",
        "disagg_two_nodes_repl_jscore_passthrough",
        "no_passthrough_disagg_override",
    )

    current_task_name = task.display_name
    if task.parent_task_id:
        # Sub-tasks are matched by the name of the task that spawned them.
        current_task_name = evg_api.task_by_id(task.parent_task_id).display_name

    if current_task_name in skip_tasks:
        print(f"Skipping core analysis task generation for task: {current_task_name}")
        return False

    core_dump_pids = set()
    for artifact in task.artifacts:
        # Matches "Core Dump 2 (dump_mongo.670872.core.gz)", capturing "dump_mongo.670872.core"
        match = re.search(r"Core Dump [0-9]+ \((.*)\.gz\)", artifact.name)
        if match is None:
            continue

        # Expected format is like dump_mongod.429814.core or dump_mongod-8.2.429814.core,
        # where 429814 is the PID.
        name_parts = match.group(1).split(".")
        assert len(name_parts) >= 3, "Unknown core dump file name format"
        assert name_parts[-2].isdigit(), "PID not in expected location of core dump file name"
        core_dump_pids.add(name_parts[-2])

    boring_core_dump_pids = set()
    if os.path.exists(BORING_CORE_DUMP_PIDS_FILE):
        with open(BORING_CORE_DUMP_PIDS_FILE, "r") as pid_file:
            boring_core_dump_pids = set(pid_file.read().split())

    print(f"detected core dump pids: {core_dump_pids}")
    print(f"boring core dump pids: {boring_core_dump_pids}")

    interesting_core_dumps = core_dump_pids - boring_core_dump_pids
    if not interesting_core_dumps:
        print("No interesting core dumps were found. Not activating core analysis task.")
        return False

    print(f"The following interesting core dump pids were found: {interesting_core_dumps}")
    print("Activating core analysis task.")
    return True
def get_core_pid(core_file_name: str) -> str:
    """Return the PID embedded in a core dump file name, as a string of digits.

    The expected format is like ``dump_mongod.429814.core`` or
    ``dump_mongod-8.2.429814.core``, where ``429814`` is the PID: the PID is
    the second-to-last dot-separated component.

    Raises:
        AssertionError: if the name has fewer than three dot-separated
            components or the PID component is not all digits.
    """
    parts = core_file_name.split(".")
    # Raised explicitly (rather than via `assert`) so the validation still runs
    # under `python -O`, while callers keep seeing the same exception type.
    if len(parts) < 3:
        raise AssertionError("Unknown core dump file name format")
    pid = parts[-2]
    if not pid.isdigit():
        raise AssertionError("PID not in expected location of core dump file name")
    return pid
def get_core_analyzer_commands(
def _get_core_analyzer_commands(
task_id: str,
execution: str,
core_analyzer_results_url: str,
gdb_index_cache: str,
has_interesting_core_dumps: bool,
boring_core_dump_pids: set,
is_bazel_task: bool = False,
) -> List[FunctionCall]:
"""Return setup commands."""
return [
@ -134,9 +368,9 @@ def get_core_analyzer_commands(
f"--gdb-index-cache={gdb_index_cache}",
f"--boring-core-dump-pids={','.join(boring_core_dump_pids)}",
"--generate-report",
"--otel-extra-data",
f"has_interesting_core_dumps={str(has_interesting_core_dumps).lower()}",
],
f"--otel-extra-data=has_interesting_core_dumps={str(has_interesting_core_dumps).lower()}",
]
+ ["--is-bazel-task" if is_bazel_task else None],
"env": {
"OTEL_TRACE_ID": "${otel_trace_id}",
"OTEL_PARENT_ID": "${otel_parent_id}",
@ -178,123 +412,6 @@ def get_core_analyzer_commands(
]
def generate(
    expansions_file: str = "../expansions.yml", output_file: str = "hang_analyzer_task.json"
) -> None:
    """Write the generated core analysis task configuration for the current task.

    Reads the task's expansions from ``expansions_file`` and, when at least one
    uploaded core dump was produced by a binary that is present locally,
    writes a JSON task configuration to ``output_file``. Returns without
    writing anything on non-linux platforms, when the Evergreen API cannot be
    set up, when the current task is itself a generated task, when the local
    binary directory is missing, or when no core dump has a known binary.
    """
    if not sys.platform.startswith("linux"):
        print("This platform is not supported, skipping core analysis task generation.")
        return

    # gather information from the current task being run
    expansions = read_config_file(expansions_file)

    # The first of these expansions with a truthy value names the distro to run on.
    distro = None
    for distro_expansion in ("core_analyzer_distro_name", "large_distro_name", "distro_id"):
        if distro := expansions.get(distro_expansion, None):
            break
    assert distro is not None

    current_task_name = expansions.get("task_name")
    task_id = expansions.get("task_id")
    execution = expansions.get("execution")
    build_variant_name = expansions.get("build_variant")
    core_analyzer_results_url = expansions.get("core_analyzer_results_url")
    compile_variant = expansions.get("compile_variant")

    # Anything other than an explicit "off" leaves the gdb index cache on.
    if expansions.get("core_analyzer_gdb_index_cache") == "off":
        gdb_index_cache = "off"
    else:
        gdb_index_cache = "on"

    try:
        evg_api = evergreen_conn.get_evergreen_api()
    except RuntimeError:
        for warning_line in (
            "WARNING: Cannot generate core analysis because the evergreen api file could not be found.",
            "This is probably not an error, if you want core analysis to run on this task make sure",
            "the evergreen function 'configure evergreen api credentials' is called before this task",
        ):
            print(warning_line, file=sys.stderr)
        return

    task_info = evg_api.task_by_id(task_id)

    # make sure we are not creating an infinite loop by generating a task from another generated task
    if current_task_name.startswith(GENERATED_TASK_PREFIX):
        print(
            f"Skipping task generation because {current_task_name} starts with {GENERATED_TASK_PREFIX}"
        )
        return

    # LOCAL_BIN_DIR does not exist on non-resmoke tasks, so there is nothing to do there.
    if not os.path.exists(LOCAL_BIN_DIR):
        print(f"Skipping task generation because binary directory not found: {LOCAL_BIN_DIR}")
        return

    # See if any core dumps were uploaded for this task; one with a known binary is enough.
    dumpers = dumper.get_dumpers(None, None)
    for artifact in task_info.artifacts:
        match = re.search(r"Core Dump [0-9]+ \((.*)\.gz\)", artifact.name)
        if match is None:
            continue

        core_file = match.group(1)
        binary_name, bin_version = dumpers.dbg.get_binary_from_core_dump(core_file)
        dir_to_check = LOCAL_BIN_DIR
        if bin_version:
            dir_to_check = MULTIVERSION_BIN_DIR

        if binary_name in os.listdir(dir_to_check):
            break
        print(f"{core_file} was generated by {binary_name} but the binary was not found.")
    else:
        # The loop finished without finding a core dump whose binary is available.
        print(
            "No core dumps with known binaries found for this task, skipping core analysis task generation."
        )
        return

    should_activate = should_activate_core_analysis_task(task_info, evg_api)

    # Get boring core dump PIDs to pass to the analyzer
    boring_core_dump_pids = set()
    if os.path.exists(BORING_CORE_DUMP_PIDS_FILE):
        with open(BORING_CORE_DUMP_PIDS_FILE, "r") as pid_file:
            boring_core_dump_pids = set(pid_file.read().split())

    # Make the evergreen variant that will be generated
    build_variant = BuildVariant(name=build_variant_name)
    commands = get_core_analyzer_commands(
        task_id,
        execution,
        core_analyzer_results_url,
        gdb_index_cache,
        should_activate,
        boring_core_dump_pids,
    )

    # TODO SERVER-92571 add archive_jstestshell_debug dep for variants that have it.
    deps = {TaskDependency("archive_dist_test_debug", compile_variant)}
    sub_tasks = {Task(get_generated_task_name(current_task_name, execution), commands, deps)}
    build_variant.add_tasks(sub_tasks, distros=[distro], activate=should_activate)

    shrub_project = ShrubProject.empty()
    shrub_project.add_build_variant(build_variant)

    # shrub.py currently does not support adding task deps that override the variant deps,
    # so they are patched into the serialized project directly.
    output_dict = shrub_project.as_dict()
    deps_list = [dep.as_dict() for dep in deps]
    for variant in output_dict["buildvariants"]:
        for task in variant["tasks"]:
            task["depends_on"] = deps_list

    write_file(output_file, json.dumps(output_dict))
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
@ -307,7 +424,23 @@ if __name__ == "__main__":
help="Name of output file to write the generated task config to.",
default="hang_analyzer_task.json",
)
parser.add_argument(
"--tests-use-bazel",
action="store_true",
help="Generate for bazel result task (look for cores in results/*/test.outputs/)",
)
parser.add_argument(
"--use-mock-tasks",
action="store_true",
help=argparse.SUPPRESS, # Use mock Evergreen tasks and skip Evergreen API calls, for unit testing this script.
)
args = parser.parse_args()
expansions_file = args.expansions_file
output_file = args.output_file
generate(expansions_file, output_file)
if args.tests_use_bazel:
generator = BazelCoreAnalysisTaskGenerator(args.expansions_file, args.use_mock_tasks)
else:
generator = ResmokeCoreAnalysisTaskGenerator(args.expansions_file, args.use_mock_tasks)
task_config = generator.generate()
if task_config:
write_file(args.output_file, json.dumps(task_config, indent=4))

View File

@ -52,3 +52,14 @@ py_test(
"//buildscripts/resmokelib/hang_analyzer",
],
)
# Unit tests for the hang_analyzer gen_hang_analyzer_tasks module.
py_test(
    name = "test_gen_hang_analyzer_tasks",
    srcs = [
        "test_gen_hang_analyzer_tasks.py",
    ],
    deps = [
        "//buildscripts/resmokelib",
        "//buildscripts/resmokelib/hang_analyzer",
    ],
)

View File

@ -53,6 +53,7 @@ class TestCoreDumpFiltering(unittest.TestCase):
install_dir,
analysis_dir,
multiversion_dir,
None,
"on",
boring_pids,
)
@ -88,7 +89,13 @@ class TestCoreDumpFiltering(unittest.TestCase):
# Pass empty set of boring PIDs
report = self.dumper.analyze_cores(
tmpdir, "/mock/install", tmpdir, "/mock/multiversion", "on", set()
tmpdir,
"/mock/install",
tmpdir,
"/mock/multiversion",
None,
"on",
set(),
)
# Should analyze all cores
@ -112,7 +119,13 @@ class TestCoreDumpFiltering(unittest.TestCase):
# Pass None for boring PIDs
report = self.dumper.analyze_cores(
tmpdir, "/mock/install", tmpdir, "/mock/multiversion", "on", None
tmpdir,
"/mock/install",
tmpdir,
"/mock/multiversion",
None,
"on",
None,
)
# Should analyze all cores
@ -137,7 +150,13 @@ class TestCoreDumpFiltering(unittest.TestCase):
# Should cap at 10 by default
report = self.dumper.analyze_cores(
tmpdir, "/mock/install", tmpdir, "/mock/multiversion", "on", None
tmpdir,
"/mock/install",
tmpdir,
"/mock/multiversion",
None,
"on",
None,
)
# Should only analyze 10 cores (default max)
@ -166,6 +185,7 @@ class TestCoreDumpFiltering(unittest.TestCase):
"/mock/install",
tmpdir,
"/mock/multiversion",
None,
"on",
None,
max_core_dumps=10,
@ -202,6 +222,7 @@ class TestCoreDumpFiltering(unittest.TestCase):
"/mock/install",
tmpdir,
"/mock/multiversion",
None,
"on",
boring_pids,
max_core_dumps=20,
@ -234,7 +255,13 @@ class TestCoreDumpFiltering(unittest.TestCase):
boring_pids = {"12345"}
self.dumper.analyze_cores(
tmpdir, "/mock/install", tmpdir, "/mock/multiversion", "on", boring_pids
tmpdir,
"/mock/install",
tmpdir,
"/mock/multiversion",
None,
"on",
boring_pids,
)
# Should analyze 2 cores (the unparseable ones are treated as interesting)
@ -271,7 +298,13 @@ class TestCoreDumpFiltering(unittest.TestCase):
boring_pids = {"12345"}
self.dumper.analyze_cores(
tmpdir, "/mock/install", tmpdir, "/mock/multiversion", "on", boring_pids
tmpdir,
"/mock/install",
tmpdir,
"/mock/multiversion",
None,
"on",
boring_pids,
)
# Should analyze 2 interesting cores

View File

@ -0,0 +1,585 @@
"""Unit tests for the buildscripts.resmokelib.hang_analyzer.gen_hang_analyzer_tasks module."""
import sys
import unittest
from unittest.mock import MagicMock, patch
from buildscripts.resmokelib.hang_analyzer.gen_hang_analyzer_tasks import (
GENERATED_TASK_PREFIX,
BazelCoreAnalysisTaskGenerator,
CoreInfo,
ResmokeCoreAnalysisTaskGenerator,
_get_core_analyzer_commands,
get_core_pid,
)
class TestCorePidExtraction(unittest.TestCase):
    """Unit tests for extracting the PID from a core dump file name with get_core_pid."""

    def test_standard_core_dump_format(self):
        """Test that the PID is read from a plain <binary>.<pid>.core name."""
        self.assertEqual(get_core_pid("dump_mongod.429814.core"), "429814")

    def test_multiversion_core_dump_format(self):
        """Test that a dotted version in the binary name does not confuse PID extraction."""
        self.assertEqual(get_core_pid("dump_mongod-8.2.429814.core"), "429814")

    def test_with_path(self):
        """Test that leading directories in the file name are tolerated."""
        self.assertEqual(get_core_pid("/path/to/dump_mongod.789012.core"), "789012")

    def test_invalid_format_non_digit_pid(self):
        """Test that non-digit PID raises an assertion."""
        self.assertRaises(AssertionError, get_core_pid, "dump_mongod.notanumber.core")
@unittest.skipIf(
    not sys.platform.startswith("linux"),
    reason="Core analysis is only support on linux",
)
class TestGetCoreAnalyzerCommands(unittest.TestCase):
    """Unit tests for the _get_core_analyzer_commands function."""

    def _find_command(self, commands, marker):
        """Return the first command whose dict form mentions marker, failing the test if none does."""
        found = next(
            (cmd for cmd in commands if hasattr(cmd, "as_dict") and marker in str(cmd.as_dict())),
            None,
        )
        # Fail with a readable message instead of an AttributeError on None further down.
        self.assertIsNotNone(found, f"no generated command contains {marker!r}")
        return found

    def _analyzer_args(self, *args, **kwargs):
        """Generate the commands and return the args of the subprocess.exec command."""
        commands = _get_core_analyzer_commands(*args, **kwargs)
        return self._find_command(commands, "subprocess.exec").as_dict()["params"]["args"]

    @staticmethod
    def _boring_pids_arg(args):
        """Return the --boring-core-dump-pids argument from args, or None when absent."""
        # `arg and ...` skips None entries that may be present in the argument list.
        return next(
            (arg for arg in args if arg and arg.startswith("--boring-core-dump-pids=")),
            None,
        )

    def test_returns_list_of_function_calls(self):
        """Test that function returns a list."""
        commands = _get_core_analyzer_commands("task123", "0", "s3://results", "on", True, set())
        self.assertIsInstance(commands, list)
        self.assertGreater(len(commands), 0)

    def test_includes_task_id_in_subprocess_command(self):
        """Test that task ID is included in subprocess command."""
        task_id = "task_abc_123"
        args = self._analyzer_args(task_id, "0", "s3://results", "on", True, set())
        self.assertIn(f"--task-id={task_id}", args)

    def test_includes_execution_in_subprocess_command(self):
        """Test that execution is included in subprocess command."""
        execution = "3"
        args = self._analyzer_args("task123", execution, "s3://results", "on", True, set())
        self.assertIn(f"--execution={execution}", args)

    def test_includes_gdb_index_cache_setting(self):
        """Test that gdb index cache setting is included."""
        for cache_setting in ["on", "off"]:
            with self.subTest(cache_setting=cache_setting):
                args = self._analyzer_args(
                    "task123", "0", "s3://results", cache_setting, True, set()
                )
                self.assertIn(f"--gdb-index-cache={cache_setting}", args)

    def test_includes_boring_core_dump_pids(self):
        """Test that boring core dump PIDs are included."""
        boring_pids = {"12345", "67890", "11111"}
        args = self._analyzer_args("task123", "0", "s3://results", "on", True, boring_pids)

        boring_arg = self._boring_pids_arg(args)
        self.assertIsNotNone(boring_arg)
        # Set iteration order is arbitrary, so check each PID rather than the whole string.
        for pid in boring_pids:
            self.assertIn(pid, boring_arg)

    def test_empty_boring_pids(self):
        """Test handling of empty boring PIDs set."""
        args = self._analyzer_args("task123", "0", "s3://results", "on", True, set())

        boring_arg = self._boring_pids_arg(args)
        self.assertIsNotNone(boring_arg)
        self.assertEqual(boring_arg, "--boring-core-dump-pids=")

    def test_bazel_task_flag(self):
        """Test that is_bazel_task flag is passed correctly."""
        args = self._analyzer_args(
            "task123", "0", "s3://results", "on", True, set(), is_bazel_task=True
        )
        self.assertIn("--is-bazel-task", args)

    def test_non_bazel_task_no_flag(self):
        """Test that non-bazel tasks don't include the bazel flag."""
        args = self._analyzer_args(
            "task123", "0", "s3://results", "on", True, set(), is_bazel_task=False
        )
        # Filter out None values
        args = [arg for arg in args if arg is not None]
        self.assertNotIn("--is-bazel-task", args)

    def test_includes_s3_put_with_results_url(self):
        """Test that S3 put command includes correct results URL."""
        results_url = "s3://bucket/path/to/results.tgz"
        commands = _get_core_analyzer_commands("task123", "0", results_url, "on", True, set())

        s3_cmd = self._find_command(commands, "s3.put")
        self.assertEqual(s3_cmd.as_dict()["params"]["remote_file"], results_url)

    def test_includes_otel_extra_data(self):
        """Test that OTEL extra data includes has_interesting_core_dumps flag."""
        for has_interesting in [True, False]:
            with self.subTest(has_interesting=has_interesting):
                args = self._analyzer_args(
                    "task123", "0", "s3://results", "on", has_interesting, set()
                )
                expected_str = (
                    f"--otel-extra-data=has_interesting_core_dumps={str(has_interesting).lower()}"
                )
                self.assertIn(expected_str, args)
@unittest.skipIf(
    not sys.platform.startswith("linux"),
    reason="Core analysis is only support on linux",
)
class TestCoreAnalysisTaskGenerator(unittest.TestCase):
    """Unit tests for generator behavior exercised through ResmokeCoreAnalysisTaskGenerator."""

    _READ_CONFIG = "buildscripts.resmokelib.hang_analyzer.gen_hang_analyzer_tasks.read_config_file"

    def setUp(self):
        """Set up test fixtures."""
        self.expansions_file = "test_expansions.yml"
        self.mock_expansions = dict(
            task_name="resmoke_test",
            task_id="test_task_123",
            execution="0",
            build_variant="ubuntu2204",
            distro_id="ubuntu2204-large",
            core_analyzer_results_url="s3://bucket/results.tgz",
            compile_variant="ubuntu2204-compile",
            workdir="/data/mci",
        )

    @staticmethod
    def _two_cores(marked_boring):
        """Return one mongod and one mongos CoreInfo sharing the given boring flag."""
        return [
            CoreInfo(
                path=f"/tmp/dump_{binary}.{pid}.core",
                binary_name=binary,
                pid=pid,
                marked_boring=marked_boring,
            )
            for binary, pid in (("mongod", "123"), ("mongos", "456"))
        ]

    def _new_generator(self):
        """Build a generator that uses mock tasks."""
        return ResmokeCoreAnalysisTaskGenerator(self.expansions_file, use_mock_tasks=True)

    def _generate_with_cores(self, cores):
        """Run generate() with find_cores stubbed to return cores."""
        with patch.object(ResmokeCoreAnalysisTaskGenerator, "find_cores", return_value=cores):
            return self._new_generator().generate()

    @patch(_READ_CONFIG)
    def test_generate_creates_task_config_with_interesting_cores(self, mock_read_config):
        """Test that generate creates proper task config when interesting cores are found."""
        mock_read_config.return_value = self.mock_expansions

        result = self._generate_with_cores(self._two_cores(marked_boring=False))

        self.assertIsNotNone(result)
        self.assertIn("buildvariants", result)
        variants = result["buildvariants"]
        self.assertEqual(len(variants), 1)
        self.assertEqual(variants[0]["name"], "ubuntu2204")

        tasks = variants[0]["tasks"]
        self.assertEqual(len(tasks), 1)
        self.assertTrue(tasks[0]["activate"])
        self.assertTrue(tasks[0]["name"].startswith(GENERATED_TASK_PREFIX))

    @patch(_READ_CONFIG)
    def test_generate_does_not_activate_with_only_boring_cores(self, mock_read_config):
        """Test that task is not activated when only boring cores are found."""
        mock_read_config.return_value = self.mock_expansions

        result = self._generate_with_cores(self._two_cores(marked_boring=True))

        self.assertIsNotNone(result)
        generated_task = result["buildvariants"][0]["tasks"][0]
        self.assertFalse(generated_task["activate"])

    @patch(_READ_CONFIG)
    def test_should_skip_task_for_hardcoded_task_names(self, mock_read):
        """Test that hardcoded task names are skipped."""
        mock_read.return_value = self.mock_expansions
        generator = self._new_generator()

        # Test skipped task names
        for task_name in ("no_passthrough_disagg_override", "disagg_repl_jscore_passthrough"):
            mock_task = MagicMock(
                display_name=task_name, parent_task_id=None, build_variant="ubuntu2204"
            )
            self.assertTrue(generator._should_skip_task(mock_task))

    @patch(_READ_CONFIG)
    def test_should_not_skip_normal_task(self, mock_read):
        """Test that normal tasks are not skipped."""
        mock_read.return_value = self.mock_expansions
        generator = self._new_generator()

        mock_task = MagicMock(
            display_name="normal_task", parent_task_id=None, build_variant="ubuntu2204"
        )
        self.assertFalse(generator._should_skip_task(mock_task))
@unittest.skipIf(
    not sys.platform.startswith("linux"),
    reason="Core analysis is only support on linux",
)
class TestResmokeCoreAnalysisTaskGenerator(unittest.TestCase):
    """Unit tests for core discovery in ResmokeCoreAnalysisTaskGenerator."""

    _MODULE = "buildscripts.resmokelib.hang_analyzer.gen_hang_analyzer_tasks"

    def setUp(self):
        """Set up test fixtures."""
        self.expansions_file = "test_expansions.yml"
        self.mock_expansions = dict(
            task_name="resmoke_test",
            task_id="test_task_123",
            execution="0",
            build_variant="ubuntu2204",
            distro_id="ubuntu2204-large",
            core_analyzer_results_url="s3://bucket/results.tgz",
            compile_variant="ubuntu2204-compile",
            workdir="/data/mci",
        )

    @staticmethod
    def _task_with_artifacts(*artifact_names):
        """Return a mock task whose artifacts carry the given names."""
        artifacts = []
        for artifact_name in artifact_names:
            # `name` must be assigned after construction; as a constructor
            # argument it would name the mock instead of setting the attribute.
            artifact = MagicMock()
            artifact.name = artifact_name
            artifacts.append(artifact)
        task = MagicMock()
        task.artifacts = artifacts
        return task

    def _find_cores(self, task, boring_file_contents):
        """Run find_cores against task with the boring-PID file contents stubbed."""
        with patch("builtins.open", unittest.mock.mock_open(read_data=boring_file_contents)):
            generator = ResmokeCoreAnalysisTaskGenerator(self.expansions_file, use_mock_tasks=True)
            generator.evg_api.task_by_id.return_value = task
            return generator.find_cores()

    @patch(f"{_MODULE}.read_config_file")
    @patch(f"{_MODULE}.os.path.exists")
    @patch(f"{_MODULE}.os.listdir")
    @patch(f"{_MODULE}.dumper.get_dumpers")
    def test_find_cores_discovers_cores_from_artifacts(
        self, mock_get_dumpers, mock_listdir, mock_exists, mock_read_config
    ):
        """Test that find_cores discovers cores from task artifacts."""
        mock_read_config.return_value = self.mock_expansions

        # Only the binary directory and the boring-PID file exist.
        mock_exists.side_effect = lambda path: (
            "dist-test/bin" in path or "boring_core_dumps.txt" in path
        )
        mock_listdir.return_value = ["mongod", "mongos"]

        mock_task = self._task_with_artifacts(
            "Core Dump 1 (dump_mongod.12345.core.gz)",
            "Core Dump 2 (dump_mongos.67890.core.gz)",
        )

        # The debugger reports unversioned binaries for both dumps.
        mock_get_dumpers.return_value.dbg.get_binary_from_core_dump.side_effect = [
            ("mongod", None),
            ("mongos", None),
        ]

        cores = self._find_cores(mock_task, "")

        self.assertEqual(
            [(core.path, core.binary_name, core.pid) for core in cores],
            [
                ("dump_mongod.12345.core", "mongod", "12345"),
                ("dump_mongos.67890.core", "mongos", "67890"),
            ],
        )

    @patch(f"{_MODULE}.read_config_file")
    @patch(f"{_MODULE}.os.path.exists")
    @patch(f"{_MODULE}.os.listdir")
    @patch(f"{_MODULE}.dumper.get_dumpers")
    def test_find_cores_marks_boring_cores(
        self, mock_get_dumpers, mock_listdir, mock_exists, mock_read_config
    ):
        """Test that find_cores correctly marks boring cores."""
        mock_read_config.return_value = self.mock_expansions
        mock_exists.return_value = True
        mock_listdir.return_value = ["mongod"]

        mock_task = self._task_with_artifacts("Core Dump 1 (dump_mongod.12345.core.gz)")
        mock_get_dumpers.return_value.dbg.get_binary_from_core_dump.return_value = (
            "mongod",
            None,
        )

        # The boring-PID file lists PID 12345, which matches the only artifact.
        cores = self._find_cores(mock_task, "12345\n67890\n")

        self.assertEqual(len(cores), 1)
        self.assertTrue(cores[0].marked_boring)
@unittest.skipIf(
    not sys.platform.startswith("linux"),
    reason="Core analysis is only support on linux",
)
class TestBazelCoreAnalysisTaskGenerator(unittest.TestCase):
    """Unit tests for BazelCoreAnalysisTaskGenerator."""

    def setUp(self):
        """Set up test fixtures."""
        self.expansions_file = "test_expansions.yml"
        self.mock_expansions = {
            "task_name": "bazel_test",
            "task_id": "test_task_123",
            "execution": "0",
            "build_variant": "ubuntu2204",
            "distro_id": "ubuntu2204-large",
            "core_analyzer_results_url": "s3://bucket/results.tgz",
            "compile_variant": "ubuntu2204-compile",
            "workdir": "/data/mci",
        }

    @patch("buildscripts.resmokelib.hang_analyzer.gen_hang_analyzer_tasks.read_config_file")
    @patch("buildscripts.resmokelib.hang_analyzer.gen_hang_analyzer_tasks.os.path.exists")
    @patch("buildscripts.resmokelib.hang_analyzer.gen_hang_analyzer_tasks.glob.glob")
    def test_find_cores_discovers_cores_in_test_outputs(
        self, mock_glob, mock_exists, mock_read_config
    ):
        """Test that find_cores discovers cores in test.outputs directories."""
        mock_read_config.return_value = self.mock_expansions

        def exists_side_effect(path):
            # The results tree exists, but no boring-PID file does.
            return "results" in path and "boring_core_dumps.txt" not in path

        mock_exists.side_effect = exists_side_effect

        # Mock glob to return test.outputs directories
        def glob_side_effect(pattern, **kwargs):
            if ".core" in pattern:
                if "test1" in pattern:
                    return ["/data/mci/results/test1/test.outputs/dump_mongod.12345.core"]
                elif "test2" in pattern:
                    return ["/data/mci/results/test2/test.outputs/dump_mongos.67890.core"]
            elif ".mdmp" in pattern:
                return []
            elif "test.outputs" in pattern and "recursive" in kwargs:
                return [
                    "/data/mci/results/test1/test.outputs",
                    "/data/mci/results/test2/test.outputs",
                ]
            return []

        mock_glob.side_effect = glob_side_effect

        generator = BazelCoreAnalysisTaskGenerator(self.expansions_file, use_mock_tasks=True)
        cores = generator.find_cores()

        self.assertEqual(len(cores), 2)
        self.assertIn("dump_mongod.12345.core", cores[0].path)
        self.assertEqual(cores[0].pid, "12345")
        self.assertIn("dump_mongos.67890.core", cores[1].path)
        self.assertEqual(cores[1].pid, "67890")

    @patch("buildscripts.resmokelib.hang_analyzer.gen_hang_analyzer_tasks.read_config_file")
    @patch("buildscripts.resmokelib.hang_analyzer.gen_hang_analyzer_tasks.os.path.exists")
    @patch("buildscripts.resmokelib.hang_analyzer.gen_hang_analyzer_tasks.glob.glob")
    def test_find_cores_marks_boring_cores_from_test_outputs(
        self, mock_glob, mock_exists, mock_read_config
    ):
        """Test that find_cores marks boring cores based on boring_core_dumps.txt."""
        mock_read_config.return_value = self.mock_expansions

        def exists_side_effect(path):
            # The results directory itself exists, and so does the boring-PID file.
            if "results" in path and "test.outputs" not in path:
                return True
            return "boring_core_dumps.txt" in path

        mock_exists.side_effect = exists_side_effect

        def glob_side_effect(pattern, **kwargs):
            if ".core" in pattern:
                return ["/data/mci/results/test1/test.outputs/dump_mongod.12345.core"]
            elif ".mdmp" in pattern:
                return []
            elif "test.outputs" in pattern and "recursive" in kwargs:
                return ["/data/mci/results/test1/test.outputs"]
            return []

        mock_glob.side_effect = glob_side_effect

        # Mock boring PIDs file
        with patch("builtins.open", unittest.mock.mock_open(read_data="12345\n")):
            generator = BazelCoreAnalysisTaskGenerator(self.expansions_file, use_mock_tasks=True)
            cores = generator.find_cores()

        self.assertEqual(len(cores), 1)
        self.assertTrue(cores[0].marked_boring)

    @patch("buildscripts.resmokelib.hang_analyzer.gen_hang_analyzer_tasks.read_config_file")
    def test_get_core_analyzer_commands_includes_bazel_flag(self, mock_read_config):
        """Test that get_core_analyzer_commands includes bazel flag."""
        mock_read_config.return_value = self.mock_expansions

        generator = BazelCoreAnalysisTaskGenerator(self.expansions_file, use_mock_tasks=True)
        commands = generator.get_core_analyzer_commands(
            "task123", "0", "s3://results", "on", True, set()
        )

        # Find subprocess command and verify it has --is-bazel-task flag
        subprocess_cmd = next(
            (
                cmd
                for cmd in commands
                if hasattr(cmd, "as_dict") and "subprocess.exec" in str(cmd.as_dict())
            ),
            None,
        )
        self.assertIsNotNone(subprocess_cmd)

        args = subprocess_cmd.as_dict()["params"]["args"]
        self.assertIn("--is-bazel-task", args)
# Run the unittest runner when this test module is executed directly as a script.
if __name__ == "__main__":
unittest.main()

View File

@ -1302,12 +1302,18 @@ functions:
- *execute_resmoke_tests_via_bazel_sh
# Runs fetch_remote_test_results.sh, then picks up the test_failures_exist
# expansion that the script writes when it sees a failed test.
"fetch remote test results":
- *f_expansions_write
- command: subprocess.exec
params:
binary: bash
add_expansions_to_env: true
args:
- "./src/evergreen/fetch_remote_test_results.sh"
# Load expansions from the YAML file written by fetch_remote_test_results.sh
# (contains "test_failures_exist: true"). The file is only written on failure,
# so a missing file is ignored.
- command: expansions.update
params:
file: "${workdir}/results/test_failures_exist.yml"
ignore_missing_file: true
# Write the expansions out again so the update above is included.
- *f_expansions_write
"assume ECR role": &assume_ecr_role
command: ec2.assume_role
@ -2967,6 +2973,65 @@ functions:
display_name: Core Analyzer Results
optional: true
# Generates a core analysis task for a bazel resmoke results task: runs
# generate_core_analysis_task.sh, uploads and submits the generated task
# config, then attaches and uploads the core analyzer results.
"generate result task hang analyzer":
# Define the upload location for the core analyzer results and make sure the
# parent_task_id expansion is set before the expansions are written out.
- command: expansions.update
params:
updates:
- key: core_analyzer_results_url
value: ${project}/${build_variant}/${task_id}/${execution}/core-analyzer-results.tgz
- key: parent_task_id
value: ${parent_task_id}
- *f_expansions_write
# The script exits early unless test_failures_exist is "true" and core dumps
# are present under ${workdir}/results; both values are passed via the env.
- command: subprocess.exec
display_name: "Generate core analysis task"
params:
binary: bash
include_expansions_in_env:
- workdir
- test_failures_exist
args:
- "src/evergreen/generate_core_analysis_task.sh"
- "--tests-use-bazel"
# Upload the generated task config for reference.
# NOTE(review): "optional: true" presumably tolerates the file being absent
# when the script above skipped generation - confirm against Evergreen docs.
- command: s3.put
params:
aws_key: ${aws_key}
aws_secret: ${aws_secret}
local_file: src/hang_analyzer_task.json
remote_file: ${project}/${build_variant}/${revision}/hang_analyzer_tasks/${task_id}.json
bucket: mciuploads
permissions: public-read
content_type: application/json
display_name: Generated Hang Analyzer Task Config - Execution ${execution}
optional: true
# Submit the generated task definition to Evergreen.
- command: generate.tasks
params:
optional: true
files:
- src/hang_analyzer_task.json
# Run attach_core_analyzer_task.py through the shared Python wrapper script.
- command: subprocess.exec
params:
binary: bash
args:
- "src/evergreen/run_python_script.sh"
- "buildscripts/resmokelib/hang_analyzer/attach_core_analyzer_task.py"
# Attach the artifact listing from src/core_analyzer_artifact.json.
- command: attach.artifacts
params:
optional: true
exact_file_names: true
files:
- src/core_analyzer_artifact.json
# Upload the core analyzer results to the URL defined at the top of this function.
- command: s3.put
params:
aws_key: ${aws_key}
aws_secret: ${aws_secret}
local_file: src/core_analyzer_results.txt
remote_file: ${core_analyzer_results_url}
bucket: mciuploads
permissions: public-read
content_type: text/plain
display_name: Core Analyzer Results
optional: true
"save unsymbolized stacktraces and local invocation":
- command: s3.put
params:

View File

@ -2359,4 +2359,26 @@ tasks:
params:
binary: bash
args:
- "./src/buildscripts/bazel_testbuilds/verify_coredump_test.sh"
- "./src/buildscripts/bazel_testbuilds/verify_unittest_coredump_test.sh"
# Auxiliary task that runs verify_resmoke_coredump_test.sh to check resmoke
# coredump generation.
- name: verify_resmoke_coredump
tags: ["assigned_to_jira_team_devprod_correctness", "auxiliary"]
exec_timeout_secs: 1800 # 30 minute timeout
commands:
# Environment preparation: source checkout, expansions, cleanup, venv and EngFlow credentials.
- command: manifest.load
- func: "git get project and add git tag"
- func: "f_expansions_write"
- func: "kill processes"
- func: "cleanup environment"
- func: "set up venv"
- func: "get engflow creds"
# The verification itself; only the workdir expansion is exported to the script.
- command: subprocess.exec
display_name: "Verify resmoke coredump generation"
type: test
timeout_secs: 1800 # 30 minutes
params:
binary: bash
include_expansions_in_env:
- workdir
args:
- "./src/buildscripts/bazel_testbuilds/verify_resmoke_coredump_test.sh"

View File

@ -93,6 +93,7 @@ buildvariants:
build_mongot: true
download_mongot_release: true
compile_variant: *amazon-linux2023-arm64-static-compile
core_analyzer_distro_name: amazon2023-arm64-latest-xlarge
evergreen_remote_exec: on
skip_debug_link: true
remote_link: true

View File

@ -36,6 +36,9 @@ buildvariants:
- name: verify_ci_wrapper_coredump
distros:
- amazon2023-arm64-latest-m8g-2xlarge
- name: verify_resmoke_coredump
distros:
- amazon2023-arm64-latest-m8g-2xlarge
# Experimental variant running bazel targets for integration tests. To be removed with SERVER-103537.
- name: bazel-integration-tests

View File

@ -4,9 +4,8 @@
# bash fetch_remote_test_results.sh
#
# Assumes the following files exist:
# ./"build_events.json" Build events JSON containing the records of remote test executions
# "${workdir}/src/engflow.cert" EngFlow cert
# "${workdir}/src/engflow.key" EngFlow key
# ./"build_events.json" Build events JSON containing the records of remote test executions
# engflow.cert and engflow.key located in ${workdir}/src, or engflow.crt and engflow.key located in ${HOME}/.engflow/creds
#
# Required environment variables:
# * ${test_label} - The resmoke bazel target to get results for, like //buildscripts/resmokeconfig:core
@ -180,15 +179,48 @@ function write_bazel_invocation() {
sed "s/\S*\$/${test_label_escaped}/" ${workdir}/resmoke-tests-bazel-invocation.txt | tail -n 1 >"${workdir}/bazel-invocation.txt"
}
# Records that at least one test failed by writing the expansion file
# ${workdir}/results/test_failures_exist.yml, creating the directory if needed.
function write_test_failures_expansion() {
    local results_dir="${workdir}/results"
    mkdir -p "$results_dir"
    printf '%s\n' "test_failures_exist: true" >"${results_dir}/test_failures_exist.yml"
}
# Dump every regular file matching *test.log under ${workdir}/results to stdout.
function print_executor_logs() {
    local results_root="${workdir}/results"
    echo "Executor logs for all failed shards:"
    find "$results_root" -type f -name '*test.log' -exec cat {} +
}
# Resolves a file path from a list of candidate locations.
#
# Arguments:
#   $1 - NAME of an array variable holding candidate paths, in priority order.
# Outputs:
#   Prints the first candidate that is an existing regular file.
# Returns:
#   0 if a candidate was found; 1 otherwise (nothing is printed).
function resolve_file() {
    # Nameref to the caller's array. The function-specific name avoids a
    # circular reference when the caller's array is itself called "paths".
    local -n __resolve_file_candidates=$1
    # Keep the loop variable local so it does not leak into the caller's scope.
    local candidate
    for candidate in "${__resolve_file_candidates[@]}"; do
        if [ -f "$candidate" ]; then
            echo "$candidate"
            return 0
        fi
    done
    return 1
}
# Build events file, looked up relative to the current working directory.
BEP_FILE='build_events.json'
# Default EngFlow credential locations inside the source checkout.
ENGFLOW_CERT="${workdir}/src/engflow.cert"
ENGFLOW_KEY="${workdir}/src/engflow.key"
# Fall back to the first existing candidate when the default cert is missing.
# Note that the cert under ${HOME}/.engflow/creds is named engflow.crt, not engflow.cert.
# NOTE(review): resolve_file prints nothing and returns 1 when no candidate
# exists, which leaves ENGFLOW_CERT empty - confirm how later code handles that.
if ! [ -f "$ENGFLOW_CERT" ]; then
cert_candidates=(
"${workdir}/src/engflow.cert"
"${HOME}/.engflow/creds/engflow.crt"
)
ENGFLOW_CERT=$(resolve_file cert_candidates)
fi
# Same fallback for the private key.
if ! [ -f "$ENGFLOW_KEY" ]; then
key_candidates=(
"${workdir}/src/engflow.key"
"${HOME}/.engflow/creds/engflow.key"
)
ENGFLOW_KEY=$(resolve_file key_candidates)
fi
if [ ! -f "$BEP_FILE" ]; then
echo "Error: File '$BEP_FILE' not found" >&2
@ -217,6 +249,7 @@ while IFS= read -r test_result; do
if is_failure "$test_result"; then
is_failure_flag=1
fail_task=1
write_test_failures_expansion
fi
download_outputs "$test_result" "$is_failure_flag"
@ -239,6 +272,7 @@ if [[ "$failures" == 'No report.json files found' ]]; then
if [[ "$fail_task" -eq 1 ]]; then
echo 'No report/test logs were found, but the bazel test failed. Check the test executor logs below.'
fi
write_test_failures_expansion
print_executor_logs
exit $fail_task
else

View File

@ -0,0 +1,53 @@
# Conditionally runs buildscripts/resmokelib/hang_analyzer/gen_hang_analyzer_tasks.py, if core dumps are present.
#
# Reads ${workdir} and ${test_failures_exist} from the environment. All command-line
# arguments are forwarded unchanged to gen_hang_analyzer_tasks.py.
# Exits 0 without doing anything when no test failures were recorded, the results
# directory is missing, or no core dumps are found.
set -o errexit
set -o verbose
# Check if test failures exist before proceeding. This expansion is created in fetch_remote_test_results.sh.
# We should only trigger core analysis if there are test failures.
if [ "${test_failures_exist}" != "true" ]; then
echo "No test failures detected (test_failures_exist: ${test_failures_exist}). Skipping core analysis task generation."
exit 0
fi
# Check if there are any core dumps present before proceeding. Presence of a core dump here
# does not necessarily mean a core analysis task will be generated, just that the Python
# script will run. It has more conditional logic within it. This check is implemented here
# to avoid needing to set up the Python virtual environment for every results task.
# Search for core files in ${workdir}/results/**/test.outputs/ directories
results_dir="${workdir}/results"
if [ ! -d "$results_dir" ]; then
echo "No results directory found at $results_dir. Skipping core analysis task generation."
exit 0
fi
# Look for *.core or *.mdmp files in results/**/test.outputs/ directories.
# Only files directly inside a test.outputs directory are matched; the loop stops at the first hit.
core_dumps_found=false
while IFS= read -r -d '' test_outputs_dir; do
if compgen -G "${test_outputs_dir}/*.core" >/dev/null || compgen -G "${test_outputs_dir}/*.mdmp" >/dev/null; then
core_dumps_found=true
break
fi
done < <(find "$results_dir" -type d -name "test.outputs" -print0)
if [ "$core_dumps_found" = false ]; then
echo "No core dumps found in $results_dir. Skipping core analysis task generation."
exit 0
fi
echo "Core dumps found. Proceeding with core analysis task generation."
# Virtual environment setup is performed here, so that results tasks remain fast in the
# common case where there are no core dumps.
bash "${workdir}/src/evergreen/functions/venv_setup.sh"
# Source prelude.sh from the directory containing this script.
# NOTE(review): activate_venv and $python used below presumably come from prelude.sh - confirm.
DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
. "$DIR/prelude.sh"
bash "${workdir}/src/evergreen/functions/evergreen_api_credentials_configure.sh"
# The relative "cd src" depends on the current working directory containing the src checkout.
cd src
activate_venv
# Log the interpreter being used, then run the generator with the forwarded arguments.
echo $python
$python buildscripts/resmokelib/hang_analyzer/gen_hang_analyzer_tasks.py "$@"

View File

@ -5,6 +5,7 @@ from urllib.request import urlretrieve
sys.path.append(os.path.join(os.path.dirname(__file__), "."))
from download_archive_dist_test_debug import get_task_id
from buildscripts.resmokelib.hang_analyzer.extractor import find_test_task_with_binaries
from buildscripts.resmokelib.utils import evergreen_conn
@ -13,19 +14,7 @@ def main():
evg_api = evergreen_conn.get_evergreen_api(evergreen_config=evg_config)
task_id = get_task_id(evg_api)
task = evg_api.task_by_id(task_id)
tasks_in_variant = evg_api.tasks_by_build(task.build_id)
if "_burn_in_" in task.display_name:
resmoke_tests_task = list(
filter(lambda t: t.display_name.startswith("resmoke_tests_burn_in"), tasks_in_variant)
)
else:
resmoke_tests_task = list(
filter(lambda t: t.display_name == "resmoke_tests", tasks_in_variant)
)
assert len(resmoke_tests_task) == 1, "Could not find a unique resmoke test task"
resmoke_tests_task = resmoke_tests_task[0]
resmoke_tests_task = find_test_task_with_binaries(evg_api, task_id)
output_dir = "/data/mci/artifacts-resmoke_tests"
os.mkdir(output_dir)