From 55f60db3068b6b25a9b2ba57fec1b7daad805d94 Mon Sep 17 00:00:00 2001 From: Sean Lyons Date: Fri, 8 May 2026 10:13:19 -0400 Subject: [PATCH] SERVER-123971 Add Evergreen parameter for disabling RBE for resmoke_tests (#53027) GitOrigin-RevId: cb70bf9c76902e66c16f2008d9de1b4276d996e9 --- bazel/resmoke/resmoke.bzl | 11 +- bazel/resmoke/resmoke_shim.py | 14 +- buildscripts/bazel_burn_in.py | 26 ++- buildscripts/generate_result_tasks.py | 111 ++++++++--- evergreen/bazel_test_results_shutils.sh | 240 +++++++++++++++++++++++ evergreen/fetch_remote_test_results.sh | 230 ++-------------------- evergreen/gather_local_test_results.sh | 130 ++++++++++++ evergreen/resmoke_tests_execute_bazel.sh | 37 +++- 8 files changed, 536 insertions(+), 263 deletions(-) create mode 100644 evergreen/bazel_test_results_shutils.sh create mode 100644 evergreen/gather_local_test_results.sh diff --git a/bazel/resmoke/resmoke.bzl b/bazel/resmoke/resmoke.bzl index 56d6f10aa0d..573bd33dd97 100644 --- a/bazel/resmoke/resmoke.bzl +++ b/bazel/resmoke/resmoke.bzl @@ -212,11 +212,8 @@ def resmoke_suite_test( }) + select({ "//bazel/resmoke:installed_dist_test_enabled": [ "--installDir=dist-test/bin", - "--mongoVersionFile=$(location //:.resmoke_mongo_version.yml)", - ], - "//conditions:default": [ - "--mongoVersionFile=$(location //bazel/resmoke:resmoke_mongo_version)", ], + "//conditions:default": [], }) deps_path = ":".join(["$(location %s)" % dep for dep in deps]) @@ -224,6 +221,7 @@ def resmoke_suite_test( default_data = [ generated_config, python_imports_target, + "//bazel/resmoke:resmoke_mongo_version", "//bazel/resmoke:on_feature_flags", "//bazel/resmoke:off_feature_flags", "//bazel/resmoke:unreleased_ifr_flags", @@ -264,8 +262,8 @@ def resmoke_suite_test( name = name, srcs = [resmoke_shim], data = merged_data + select({ - "//bazel/resmoke:installed_dist_test_enabled": ["//:installed-dist-test", "//:.resmoke_mongo_version.yml"], - "//conditions:default": ["//bazel/resmoke:resmoke_mongo_version"], + 
"//bazel/resmoke:installed_dist_test_enabled": ["//:installed-dist-test"], + "//conditions:default": [], }), deps = [ resmoke, @@ -284,6 +282,7 @@ def resmoke_suite_test( "--archiveLimitMb=500", "--testTimeout=$(RESMOKE_TEST_TIMEOUT)", "--historicTestRuntimes=$(location :%s)" % historic_runtimes, + "--mongoVersionFile=$(location //bazel/resmoke:resmoke_mongo_version)", ] + [ "--multiversionDir=$(location %s)" % native.package_relative_label(dep) for dep in multiversion_deps diff --git a/bazel/resmoke/resmoke_shim.py b/bazel/resmoke/resmoke_shim.py index 510e5d5f477..cf27d766cd1 100644 --- a/bazel/resmoke/resmoke_shim.py +++ b/bazel/resmoke/resmoke_shim.py @@ -122,20 +122,23 @@ class ResmokeShimContext: self.resource_monitor = None def create_short_symlinks(self): - """Create short symlinks in the original tmpdir to avoid long path issues.""" - original_tmpdir = tempfile.gettempdir() + """Create short symlinks in /tmp to avoid long path issues.""" + if os.path.isdir("/tmp") and os.access("/tmp", os.W_OK): + short_root = "/tmp" + else: + short_root = tempfile.gettempdir() # Create a short symlink to TEST_TMPDIR test_tempdir = os.environ.get("TEST_TMPDIR") if test_tempdir: - self.tmpdir_symlink = os.path.join(original_tmpdir, f"resmoke_tmp_{uuid.uuid1()}") + self.tmpdir_symlink = os.path.join(short_root, f"resmoke_tmp_{uuid.uuid1()}") os.symlink(test_tempdir, self.tmpdir_symlink) self.links.append(self.tmpdir_symlink) # Create a short symlink to TEST_UNDECLARED_OUTPUTS_DIR undeclared_outputs_dir = os.environ.get("TEST_UNDECLARED_OUTPUTS_DIR") if undeclared_outputs_dir: - self.outputs_symlink = os.path.join(original_tmpdir, f"resmoke_out_{uuid.uuid1()}") + self.outputs_symlink = os.path.join(short_root, f"resmoke_out_{uuid.uuid1()}") os.symlink(undeclared_outputs_dir, self.outputs_symlink) self.links.append(self.outputs_symlink) @@ -246,6 +249,9 @@ if __name__ == "__main__": outputs_dir = ctx.outputs_symlink if ctx.outputs_symlink else undeclared_output_dir + otel_dir 
= os.path.join(outputs_dir, "build", "metrics") + os.makedirs(otel_dir, exist_ok=True) + resmoke_args.append(f"--otelCollectorDir={otel_dir}") resmoke_args.append(f"--taskWorkDir={outputs_dir}") resmoke_args.append(f"--reportFile={os.path.join(outputs_dir, 'report.json')}") os.chdir(outputs_dir) diff --git a/buildscripts/bazel_burn_in.py b/buildscripts/bazel_burn_in.py index b8781fd5d11..a52e89e7d1d 100644 --- a/buildscripts/bazel_burn_in.py +++ b/buildscripts/bazel_burn_in.py @@ -48,6 +48,7 @@ from buildscripts.burn_in_tests import ( from buildscripts.ciconfig.evergreen import parse_evergreen_file from buildscripts.generate_result_tasks import make_results_task, make_task_group from buildscripts.util import buildozer_utils as buildozer +from buildscripts.util.read_config import read_config_file BAZEL_BURN_IN_TESTS = r"resmoke_tests_burn_in_*" @@ -298,6 +299,9 @@ def generate_tasks( ): os.chdir(os.environ.get("BUILD_WORKSPACE_DIRECTORY", ".")) + expansions = read_config_file("../expansions.yml") + resmoke_disable_rbe = expansions.get("resmoke_disable_rbe", "") == "true" + targets = query_targets_to_burn_in(origin_rev, test_changed_files) evg_conf = parse_evergreen_file("etc/evergreen.yml") @@ -340,6 +344,7 @@ def generate_tasks( variant.name, targets, f"resmoke_tests_burn_in_{variant.name}", + resmoke_disable_rbe=resmoke_disable_rbe, ) result_tasks[results_task_group.name] = burn_in_targets_to_run build_variant.add_task_group(results_task_group) @@ -354,9 +359,12 @@ def generate_tasks( shrub_project.add_build_variant(build_variant) project = shrub_project.as_dict() - tasks = [make_results_task(target) for target in targets_all] + [ - task.as_dict() for task in resmoke_tests_tasks - ] + tasks = [ + make_results_task( + target, resmoke_disable_rbe=resmoke_disable_rbe, generate_burn_in_targets=True + ) + for target in targets_all + ] + [task.as_dict() for task in resmoke_tests_tasks] project["tasks"] = tasks for variant in project.get("buildvariants", []): @@ -367,9 
+375,15 @@ def generate_tasks( # these are not a dependency for the `resmoke_tests` task or the results tasks added here. # Set an explicitly depends_on in the task group's reference to override it. Remove with SERVER-119809. if task["name"] in result_tasks: - task["depends_on"] = { - "name": f"resmoke_tests_burn_in_{variant['name']}", - } + depends_on = [{"name": f"resmoke_tests_burn_in_{variant['name']}"}] + if resmoke_disable_rbe: + # archive_dist_test may live on a separate compile variant; resolve it + # per-variant here because Evergreen does not expand ${compile_variant} + # in depends_on.variant. + evg_variant = evg_conf.get_variant(variant["name"]) + compile_variant = evg_variant.expansion("compile_variant") or variant["name"] + depends_on.append({"name": "archive_dist_test", "variant": compile_variant}) + task["depends_on"] = depends_on else: task["depends_on"] = { "name": "version_burn_in_gen", diff --git a/buildscripts/generate_result_tasks.py b/buildscripts/generate_result_tasks.py index 1581b6814eb..8a77395d81e 100644 --- a/buildscripts/generate_result_tasks.py +++ b/buildscripts/generate_result_tasks.py @@ -52,10 +52,21 @@ def _bazel_binary() -> str: return os.environ.get("BAZEL_BINARY", "bazel") -def make_results_task(target: str) -> Task: +def make_results_task( + target: str, + resmoke_disable_rbe: bool = False, + generate_burn_in_targets: bool = False, +) -> Task: + if resmoke_disable_rbe: + results_func = "gather local test results" + else: + results_func = "fetch remote test results" + execute_params: dict = {"targets": target, "result_task": True} + if generate_burn_in_targets: + execute_params["generate_burn_in_targets"] = True commands = [ - FunctionCall("execute resmoke tests via bazel", {"targets": target, "result_task": True}), - FunctionCall("fetch remote test results", {"test_label": target}), + FunctionCall("execute resmoke tests via bazel", execute_params), + FunctionCall(results_func, {"test_label": target}), ] task = Task(target, 
commands).as_dict() @@ -67,25 +78,39 @@ def make_results_task(target: str) -> Task: return task -def make_task_group( - name: str, - variant: str, - targets, - resmoke_task: Optional[str] = "resmoke_tests", -) -> TaskGroup: - task_group = TaskGroup( - name=f"{name}_results_{variant}", - tasks=[], - max_hosts=len(targets), - setup_group_can_fail_task=True, - setup_group=[ - FunctionCall("git get project and add git tag"), - FunctionCall("set task expansion macros"), - FunctionCall("f_expansions_write"), - FunctionCall("set up venv"), - FunctionCall("configure evergreen api credentials"), - FunctionCall("set up credentials"), - FunctionCall("get engflow creds"), +def _make_setup_group(resmoke_task: str, resmoke_disable_rbe: bool) -> list: + common = [ + FunctionCall("git get project and add git tag"), + FunctionCall("set task expansion macros"), + FunctionCall("f_expansions_write"), + FunctionCall("set up venv"), + FunctionCall("configure evergreen api credentials"), + FunctionCall("set up credentials"), + FunctionCall("get engflow creds"), + ] + if resmoke_disable_rbe: + # Download and extract the pre-built dist-test binaries into src/ so that + # //bazel/resmoke:installed_dist_test_enabled can glob dist-test/** from the workspace root. 
+ return common + [ + BuiltInCommand( + "s3.get", + { + "aws_key": "${aws_key_new}", + "aws_secret": "${aws_secret}", + "remote_file": "${mongo_binaries}", + "bucket": "mciuploads", + "local_file": "mongo-binaries.tgz", + }, + ), + BuiltInCommand( + "shell.exec", + { + "script": "tar -xf mongo-binaries.tgz -C src", + }, + ), + ] + else: + return common + [ BuiltInCommand( "s3.get", { @@ -110,7 +135,22 @@ def make_task_group( "optional": True, }, ), - ], + ] + + +def make_task_group( + name: str, + variant: str, + targets, + resmoke_task: Optional[str] = "resmoke_tests", + resmoke_disable_rbe: bool = False, +) -> TaskGroup: + task_group = TaskGroup( + name=f"{name}_results_{variant}", + tasks=[], + max_hosts=len(targets), + setup_group_can_fail_task=True, + setup_group=_make_setup_group(resmoke_task, resmoke_disable_rbe), # Between tasks, remove the test logs and outputs. The tasks share hosts and leaving them # can cause the task to include test logs from other bazel targets. setup_task=[BuiltInCommand("shell.exec", {"script": "rm -rf build/ results/ report.json"})], @@ -533,6 +573,7 @@ def main(outfile: Annotated[str, typer.Option()]): expansions = read_config_file("../expansions.yml") project_name = expansions.get("project", MASTER_PROJECT_NAME) evg_config_path = get_evergreen_config_path(project_name) + resmoke_disable_rbe = expansions.get("resmoke_disable_rbe", "") == "true" print(f"Parsing Evergreen configuration from {evg_config_path}...", file=sys.stderr) # Pre-warm the @cache-decorated resolvers so their bazel-run + YAML costs @@ -563,7 +604,9 @@ def main(outfile: Annotated[str, typer.Option()]): continue targets_all.update(targets) - task_group = make_task_group("resmoke_tests", variant.name, targets).as_dict() + task_group = make_task_group( + "resmoke_tests", variant.name, targets, resmoke_disable_rbe=resmoke_disable_rbe + ).as_dict() task_group["tasks"] = targets project["task_groups"].append(task_group) @@ -573,18 +616,28 @@ def main(outfile: 
Annotated[str, typer.Option()]): # Set an explicitly depends_on in the task group's reference to override it. # The task that generated the task is used as a no-op dependency, as a workaround for not # being able to set an empty depends_on. Remove with SERVER-119809. - build_variant["tasks"] = { - "name": task_group["name"], - "activate": False, - "depends_on": { + depends_on = [ + { "name": "bazel_result_tasks_gen", "variant": "generate-tasks-for-version", "omit_generated_tasks": True, }, + ] + if resmoke_disable_rbe: + # archive_dist_test may live on a separate compile variant; resolve it per-variant here + # because Evergreen does not expand ${compile_variant} in depends_on.variant. + compile_variant = variant.expansion("compile_variant") or variant.name + depends_on.append({"name": "archive_dist_test", "variant": compile_variant}) + build_variant["tasks"] = { + "name": task_group["name"], + "activate": False, + "depends_on": depends_on, } project["buildvariants"].append(build_variant) - project["tasks"] = [make_results_task(target) for target in targets_all] + project["tasks"] = [ + make_results_task(target, resmoke_disable_rbe=resmoke_disable_rbe) for target in targets_all + ] with open(outfile, "w") as f: f.write(json.dumps(project, indent=4)) diff --git a/evergreen/bazel_test_results_shutils.sh b/evergreen/bazel_test_results_shutils.sh new file mode 100644 index 00000000000..013b149d0eb --- /dev/null +++ b/evergreen/bazel_test_results_shutils.sh @@ -0,0 +1,240 @@ +# Shared bash helpers for processing bazel resmoke test outputs in Evergreen result tasks. +# +# Required environment variables (set by callers): +# * ${workdir} - The Evergreen workdir. +# * ${test_label} - The bazel test target label (e.g. //buildscripts/resmokeconfig:core). + +# Converts a bazel test label into the path-prefix convention used in ${workdir}/results. 
+# Example: //buildscripts/resmokeconfig:core -> buildscripts/resmokeconfig/core +function bazel_test_results::label_to_prefix() { + local label="$1" + label="${label#//}" + label="${label//:/\/}" + echo "${label}" +} + +# Symlinks test logs from a per-shard test.outputs/build/TestLogs directory into Evergreen's +# log ingestion folder. Must be invoked with the per-shard results directory as cwd. +function bazel_test_results::symlink_test_logs() { + local -r build_dir='test.outputs/build/TestLogs' + + if [[ ! -d "${build_dir}" ]]; then + echo "No test logs directory found at ${build_dir}, skipping symlink." + return + fi + + find "${build_dir}" -type f | while read -r file; do + rel_path="${file#${build_dir}/}" + target_path="${workdir}/build/TestLogs/${rel_path}" + target_dir=$(dirname "${target_path}") + + mkdir -p "${target_dir}" + + abs_file=$(realpath "${file}") + ln -sf "${abs_file}" "${target_path}" + done +} + +# Combines all resmoke OTel telemetry under ${workdir}/results into batched files under +# ${workdir}/build/OTelTraces. Evergreen processes slowly when there are many small files. +# However, files are kept under 4MB since that is the maximum size that +# Evergreen will send to the trace collector without re-batching them. 
+function bazel_test_results::combine_metrics() { + local -r output_dir="${workdir}/build/OTelTraces" + mkdir -p "${output_dir}" + + local max_size=$((4 * 1024 * 1024)) + local file_counter=0 + local current_size=0 + local current_output="${output_dir}/metrics.json" + + >"${current_output}" + + find "${workdir}/results" -wholename '*metrics/metrics*.json' -type f -print0 | while IFS= read -r -d '' file; do + local file_size=$(stat -c%s "${file}") + local newline_size=1 + + if ((current_size + file_size + newline_size > max_size && current_size > 0)); then + ((file_counter++)) + current_output="${output_dir}/metrics_${file_counter}.json" + current_size=0 + >"${current_output}" + fi + + cat "${file}" >>"${current_output}" + echo "" >>"${current_output}" + + current_size=$((current_size + file_size + newline_size)) + done +} + +# Combines all resmoke report JSONs into a single ${workdir}/report.json for attach.results. +function bazel_test_results::combine_reports() { + local -r report_files=$(find "${workdir}" -name 'report*.json' -type f 2>/dev/null) + + if [[ -z "${report_files}" ]]; then + echo 'No report.json files found' + return + fi + + local -r combined_report=$(echo "${report_files}" | xargs jq -s ' + { + results: map(.results // []) | add, + failures: (map(.results // []) | add | map(select(.status == "fail" or .status == "timeout")) | length) + } + ') + + local -r combined_report_file="${workdir}/report.json" + echo "${combined_report}" >"${combined_report_file}" + + local -r total_tests=$(echo "$combined_report" | jq '.results | length') + local -r failures=$(echo "$combined_report" | jq '.failures') + + echo "" + echo "Combined Report: ${total_tests} tests, ${failures} failures" + echo "Report written to: ${combined_report_file}" +} + +# Writes a YAML file indicating that test failures exist (consumed by expansions.update). 
+function bazel_test_results::write_test_failures_expansion() { + local -r output_file="${workdir}/results/test_failures_exist.yml" + mkdir -p "$(dirname "${output_file}")" + echo "test_failures_exist: true" >"${output_file}" +} + +# Prints all *test.log files with per-shard headers, ordered by shard number. +function bazel_test_results::print_executor_logs() { + local -r log_files=$(find "${workdir}/results" -name '*test.log' -type f 2>/dev/null) + + if [[ -z "${log_files}" ]]; then + return + fi + + local -r sorted_log_files=$(echo "${log_files}" | while IFS= read -r log_file; do + local shard_num=$(echo "${log_file}" | grep -oP 'shard_\K\d+(?=/)') + echo "${shard_num} ${log_file}" + done | sort -n | cut -d' ' -f2-) + + while IFS= read -r log_file; do + local shard_path=$(echo "${log_file}" | sed "s|${workdir}/results/||" | sed 's|/[^/]*$||') + + echo "================================================================================" + echo "Shard ${shard_path} log:" + echo "================================================================================" + cat "${log_file}" + echo "" + echo "================================================================================" + echo "" + done <<<"${sorted_log_files}" +} + +# Displays a formatted summary of test results. Caller passes parallel arrays by name. 
+# Usage: bazel_test_results::display_test_summary shard_names_var shard_statuses_var shard_test_counts_var +function bazel_test_results::display_test_summary() { + local -n _names="${1}" + local -n _statuses="${2}" + local -n _counts="${3}" + + echo "================================================================================" + echo "Test Results Summary" + echo "================================================================================" + echo "Target: ${test_label}" + echo "Total Shards: ${#_names[@]}" + echo "--------------------------------------------------------------------------------" + + local sorted_indices=() + for i in "${!_names[@]}"; do + sorted_indices+=("$i") + done + + IFS=$'\n' sorted_indices=($( + for i in "${sorted_indices[@]}"; do + local shard_num=$(echo "${_names[$i]}" | grep -oP 'shard_\K\d+$') + echo "${shard_num} ${i}" + done | sort -n | cut -d' ' -f2 + )) + + for i in "${sorted_indices[@]}"; do + local shard="${_names[$i]}" + local status="${_statuses[$i]}" + local test_counts="${_counts[$i]}" + + case "${status}" in + "PASSED") + echo " ✓ ${shard}: PASSED (${test_counts} tests passed)" + ;; + "FAILED") + if [[ "${test_counts}" == "0/0" ]]; then + echo " ✗ ${shard}: FAILED (no report generated)" + else + echo " ✗ ${shard}: FAILED (${test_counts} tests passed)" + fi + ;; + "TIMEOUT") + echo " ⏱ ${shard}: TIMEOUT" + ;; + "NO_REPORT") + echo " ✗ ${shard}: NO REPORT (no tests may have been run)" + ;; + esac + done + + echo "================================================================================" + echo "" +} + +# Reads a single test result's report.json and appends the corresponding entries to the +# parallel summary arrays passed by name. Must be invoked with the shard's results dir as cwd. +# Usage: bazel_test_results::record_shard_status shard_path is_failure_flag is_timeout_flag \ +# names_var statuses_var counts_var +# Returns 0 if the shard had a report.json, 1 otherwise. 
+function bazel_test_results::record_shard_status() { + local -r shard_path="$1" + local -r is_failure_flag="$2" + local -r is_timeout_flag="$3" + local -n _names="${4}" + local -n _statuses="${5}" + local -n _counts="${6}" + + _names+=("${shard_path}") + + local report_file + report_file=$(compgen -G "test.outputs/report*.json" | head -n 1) + if [[ -n "${report_file}" ]]; then + local total_tests failed_tests passed_tests + IFS=$'\t' read -r total_tests failed_tests passed_tests < <( + jq -r '[ + (.results | length), + (.results | map(select(.status == "fail" or .status == "timeout")) | length), + (.results | map(select(.status == "pass")) | length) + ] | @tsv' "${report_file}" 2>/dev/null || printf "0\t0\t0\n" + ) + total_tests=${total_tests:-0} + failed_tests=${failed_tests:-0} + passed_tests=${passed_tests:-0} + + _counts+=("${passed_tests}/${total_tests}") + + if [[ "${is_timeout_flag}" -eq 1 ]]; then + _statuses+=("TIMEOUT") + elif [[ "${is_failure_flag}" -eq 1 || "${failed_tests}" -gt 0 ]]; then + if [[ "${total_tests}" -eq 0 ]]; then + _statuses+=("NO_REPORT") + else + _statuses+=("FAILED") + fi + else + _statuses+=("PASSED") + fi + return 0 + else + if [[ "${is_timeout_flag}" -eq 1 ]]; then + _statuses+=("TIMEOUT") + _counts+=("0/0") + else + _statuses+=("NO_REPORT") + _counts+=("0/0") + fi + return 1 + fi +} diff --git a/evergreen/fetch_remote_test_results.sh b/evergreen/fetch_remote_test_results.sh index 64c8d9af64a..2ac57c15f78 100644 --- a/evergreen/fetch_remote_test_results.sh +++ b/evergreen/fetch_remote_test_results.sh @@ -11,6 +11,9 @@ # * ${test_label} - The resmoke bazel target to get results for, like //buildscripts/resmokeconfig:core # * ${workdir} - The Evergreen workdir to use for test log and OTel trace ingestion. +DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" +. "$DIR/bazel_test_results_shutils.sh" + # Enumerates test results for each execution of ${test_label}. 
Shards/retries are individual executions with their own results. function enumerate_test_results() { jq --raw-output --compact-output --arg test_label "${test_label}" 'select(.testResult.testActionOutput != null) | @@ -91,144 +94,6 @@ function unzip_outputs() { fi } -# Symlinks test logs from a test result into Evergreen's log ingestion folder. -function symlink_test_logs() { - local build_dir='test.outputs/build/TestLogs' - - if [[ ! -d "$build_dir" ]]; then - return - fi - - find "$build_dir" -type f | while read -r file; do - # Get the relative path from the build directory - rel_path="${file#$build_dir/}" - target_path="${workdir}/build/TestLogs/${rel_path}" - target_dir=$(dirname "$target_path") - - mkdir -p "$target_dir" - - abs_file=$(realpath "$file") - ln -sf "$abs_file" "$target_path" - done -} - -# Displays a formatted summary of test results. -function display_test_summary() { - echo "================================================================================" - echo "Test Results Summary" - echo "================================================================================" - echo "Target: ${test_label}" - echo "Total Shards: ${#shard_names[@]}" - echo "--------------------------------------------------------------------------------" - - # Create a sorted list of indices based on shard names - local sorted_indices=() - for i in "${!shard_names[@]}"; do - sorted_indices+=("$i") - done - - # Sort indices by extracting and comparing shard numbers - IFS=$'\n' sorted_indices=($( - for i in "${sorted_indices[@]}"; do - local shard_num=$(echo "${shard_names[$i]}" | grep -oP 'shard_\K\d+$') - echo "$shard_num $i" - done | sort -n | cut -d' ' -f2 - )) - - for i in "${sorted_indices[@]}"; do - local shard="${shard_names[$i]}" - local status="${shard_statuses[$i]}" - local test_counts="${shard_test_counts[$i]}" - - # Format status with color indicators - case "$status" in - "PASSED") - echo " ✓ $shard: PASSED ($test_counts tests passed)" - ;; - "FAILED") 
- if [[ "$test_counts" == "0/0" ]]; then - echo " ✗ $shard: FAILED (no report generated)" - else - echo " ✗ $shard: FAILED ($test_counts tests passed)" - fi - ;; - "TIMEOUT") - echo " ⏱ $shard: TIMEOUT" - ;; - "NO_REPORT") - echo " ✗ $shard: NO REPORT (no tests may have been run)" - ;; - esac - done - - echo "================================================================================" - echo "" -} - -# Combine all resmoke telemetry and place it where Evergreen expects it: ${workdir}/build/OTelTraces. -# Metrics are batched into line-separated JSON files no greater than 4MB each. Evergreen processes -# fewer files faster, but hits message size limitations if they are too large. -function combine_metrics() { - local output_dir="${workdir}/build/OTelTraces" - mkdir -p "$output_dir" - - local max_size=$((4 * 1024 * 1024)) # 4MB in bytes - local file_counter=0 - local current_size=0 - local current_output="${output_dir}/metrics.json" - - # Create initial empty file - >"$current_output" - - find "${workdir}/results" -wholename '*metrics/metrics*.json' -type f -print0 | while IFS= read -r -d '' file; do - local file_size=$(stat -c%s "$file") - local newline_size=1 - - # Check if adding this file would exceed the limit - if ((current_size + file_size + newline_size > max_size && current_size > 0)); then - # Start a new file - ((file_counter++)) - current_output="${output_dir}/metrics_${file_counter}.json" - current_size=0 - >"$current_output" - fi - - # Append the file content - cat "$file" >>"$current_output" - echo "" >>"$current_output" # Adds a single newline after each file's content - - # Update current size - current_size=$((current_size + file_size + newline_size)) - done -} - -# Combines all Resmoke test report JSONs into a single JSON. 
-function combine_reports() { - local report_files=$(find "${workdir}" -name 'report*.json' -type f 2>/dev/null) - - if [[ -z "$report_files" ]]; then - echo 'No report.json files found' - return - fi - - local combined_report=$(echo "$report_files" | xargs jq -s ' - { - results: map(.results // []) | add, - failures: (map(.results // []) | add | map(select(.status == "fail" or .status == "timeout")) | length) - } - ') - - local combined_report_file="${workdir}/report.json" - echo "$combined_report" >"$combined_report_file" - - local total_tests=$(echo "$combined_report" | jq '.results | length') - local failures=$(echo "$combined_report" | jq '.failures') - - echo "" - echo "Combined Report: ${total_tests} tests, ${failures} failures" - echo "Report written to: $combined_report_file" -} - # Writes a user-friendly bazel invocation for re-running this test target. function write_bazel_invocation() { # Escape special characters in the label for the second sed expression. @@ -237,42 +102,6 @@ function write_bazel_invocation() { sed "s/\S*\$/${test_label_escaped}/" ${workdir}/resmoke-tests-bazel-invocation.txt | tail -n 1 >"${workdir}/bazel-invocation.txt" } -# Writes a YAML file indicating that test failures exist. -function write_test_failures_expansion() { - local output_file="${workdir}/results/test_failures_exist.yml" - mkdir -p "$(dirname "$output_file")" - echo "test_failures_exist: true" >"$output_file" -} - -# Print the contents of all *test.log files with headers per shard. 
-function print_executor_logs() { - local log_files=$(find "${workdir}/results" -name '*test.log' -type f 2>/dev/null) - - if [[ -z "$log_files" ]]; then - return - fi - - # Sort log files by shard number - local sorted_log_files=$(echo "$log_files" | while IFS= read -r log_file; do - # Extract shard number from path (e.g., /workdir/results/foo/bar/shard_1/test.log -> 1) - local shard_num=$(echo "$log_file" | grep -oP 'shard_\K\d+(?=/)') - echo "$shard_num $log_file" - done | sort -n | cut -d' ' -f2-) - - while IFS= read -r log_file; do - # Extract shard name from path (e.g., /workdir/results/foo/bar/shard_1/test.log -> foo/bar/shard_1) - local shard_path=$(echo "$log_file" | sed "s|${workdir}/results/||" | sed 's|/[^/]*$||') - - echo "================================================================================" - echo "Shard $shard_path log:" - echo "================================================================================" - cat "$log_file" - echo "" - echo "================================================================================" - echo "" - done <<<"$sorted_log_files" -} - # Resolves a file path from a list of candidate locations. Returns the first existing file path found. 
function resolve_file() { local -n paths=$1 @@ -340,48 +169,21 @@ while IFS= read -r test_result; do is_timeout_flag=1 is_failure_flag=1 fail_task=1 - write_test_failures_expansion + bazel_test_results::write_test_failures_expansion elif is_failure "$test_result"; then is_failure_flag=1 fail_task=1 - write_test_failures_expansion + bazel_test_results::write_test_failures_expansion fi download_outputs "$test_result" "$is_failure_flag" unzip_outputs "$is_failure_flag" - symlink_test_logs + bazel_test_results::symlink_test_logs - # Record shard information - shard_names+=("$target_prefix") - # Check if any report*.json files exist - if compgen -G "test.outputs/report*.json" >/dev/null; then - # Extract test counts from the report - report_file=$(compgen -G "test.outputs/report*.json" | head -n 1) - total_tests=$(jq '.results | length' "$report_file" 2>/dev/null || echo "0") - failed_tests=$(jq '.results | map(select(.status == "fail" or .status == "timeout")) | length' "$report_file" 2>/dev/null || echo "0") - passed_tests=$(jq '.results | map(select(.status == "pass")) | length' "$report_file" 2>/dev/null || echo "0") - - shard_test_counts+=("$passed_tests/$total_tests") - - if [[ "$is_timeout_flag" -eq 1 ]]; then - shard_statuses+=("TIMEOUT") - elif [[ "$is_failure_flag" -eq 1 ]]; then - if [[ "$total_tests" -eq 0 ]]; then - shard_statuses+=("NO_REPORT") - else - shard_statuses+=("FAILED") - fi - else - shard_statuses+=("PASSED") - fi - else - # No report file found - check if we have bazel-level status information - if [[ "$is_timeout_flag" -eq 1 ]]; then - shard_statuses+=("TIMEOUT") - shard_test_counts+=("0/0") - else - shard_statuses+=("NO_REPORT") - shard_test_counts+=("0/0") + if ! 
bazel_test_results::record_shard_status \ + "$target_prefix" "$is_failure_flag" "$is_timeout_flag" \ + shard_names shard_statuses shard_test_counts; then + if [[ "$is_timeout_flag" -ne 1 ]]; then missing_report=1 fi fi @@ -396,13 +198,13 @@ if [[ "$result_count" -eq 0 ]]; then exit 1 fi -print_executor_logs +bazel_test_results::print_executor_logs -display_test_summary +bazel_test_results::display_test_summary shard_names shard_statuses shard_test_counts -combine_metrics +bazel_test_results::combine_metrics -failures=$(combine_reports) +failures=$(bazel_test_results::combine_reports) write_bazel_invocation @@ -410,7 +212,7 @@ write_bazel_invocation for status in "${shard_statuses[@]}"; do if [[ "$status" == "TIMEOUT" || "$status" == "NO_REPORT" ]]; then echo "Error: One or more shards had TIMEOUT or NO_REPORT status. Not all tests ran or were reported." >&2 - write_test_failures_expansion + bazel_test_results::write_test_failures_expansion exit 1 fi done @@ -426,7 +228,7 @@ for status in "${shard_statuses[@]}"; do done if [[ "$has_test_failures" -eq 1 ]]; then - write_test_failures_expansion + bazel_test_results::write_test_failures_expansion fi exit 0 diff --git a/evergreen/gather_local_test_results.sh b/evergreen/gather_local_test_results.sh new file mode 100644 index 00000000000..a6d88ee8748 --- /dev/null +++ b/evergreen/gather_local_test_results.sh @@ -0,0 +1,130 @@ +# Gathers locally-executed bazel test results from bazel-testlogs/ and prepares them +# for Evergreen ingestion in the same layout as fetch_remote_test_results.sh. +# +# Usage: +# bash gather_local_test_results.sh +# +# Required environment variables: +# * ${test_label} - The bazel test target, e.g. //buildscripts/resmokeconfig:core +# * ${workdir} - The Evergreen workdir. + +DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" +. 
"$DIR/bazel_test_results_shutils.sh" + +readonly target_prefix=$(bazel_test_results::label_to_prefix "${test_label}") + +readonly bazel_testlogs="${workdir}/src/bazel-testlogs" +readonly target_outputs="${bazel_testlogs}/${target_prefix}" + +if [ ! -d "${target_outputs}" ]; then + echo "Error: No bazel test outputs found at ${target_outputs}" >&2 + echo "The test may have failed to build. Check the logs from the runner task." >&2 + exit 1 +fi + +# Collect shard directories. Sharded tests use shard__of_/; single-shard tests put files +# directly under the target's directory and are treated as shard 0. +declare -a shard_paths=() +declare -a shard_nums=() +if compgen -G "${target_outputs}/shard_*_of_*" >/dev/null; then + while IFS= read -r shard_dir; do + shard_num=$(basename "${shard_dir}" | sed 's/shard_\([0-9]\+\)_of_.*/\1/') + shard_paths+=("${shard_dir}") + shard_nums+=("${shard_num}") + done < <(find "${target_outputs}" -maxdepth 1 -type d -name 'shard_*_of_*' | sort) +else + shard_paths+=("${target_outputs}") + shard_nums+=("0") +fi + +shard_names=() +shard_statuses=() +shard_test_counts=() +fail_task=0 + +for i in "${!shard_paths[@]}"; do + shard_dir="${shard_paths[$i]}" + shard_num="${shard_nums[$i]}" + shard_path="${target_prefix}/shard_${shard_num}" + target_dir="${workdir}/results/${shard_path}" + mkdir -p "${target_dir}" + + is_failure_flag=0 + is_timeout_flag=0 + + # Copy test.log so it ends up in results//shard_/shard__test.log, + # matching the naming convention used by fetch_remote_test_results.sh and picked up + # by the teardown S3 put filter "**/*test.log". + if [ -f "${shard_dir}/test.log" ]; then + cp "${shard_dir}/test.log" "${target_dir}/shard_${shard_num}_test.log" + fi + + # Locate the undeclared outputs zip produced by --zip_undeclared_test_outputs. 
+ output_zip="" + for candidate in \ + "${shard_dir}/test.outputs/outputs.zip" \ + "${shard_dir}/outputs.zip"; do + if [ -f "${candidate}" ]; then + output_zip="${candidate}" + break + fi + done + + if [ -n "${output_zip}" ]; then + mkdir -p "${target_dir}/test.outputs" + unzip -o -q "${output_zip}" -d "${target_dir}/test.outputs" + fi + + pushd "${target_dir}" >/dev/null + bazel_test_results::symlink_test_logs + + # Determine pass/fail from the extracted report.json. record_shard_status appends to the + # parallel summary arrays and returns non-zero if no report was found. + if compgen -G "test.outputs/report*.json" >/dev/null; then + report_file=$(compgen -G "test.outputs/report*.json" | head -n 1) + failed_tests=$(jq '.results | map(select(.status == "fail" or .status == "timeout")) | length' "${report_file}" 2>/dev/null || echo 0) + if [[ "${failed_tests}" -gt 0 ]]; then + is_failure_flag=1 + fail_task=1 + fi + fi + bazel_test_results::record_shard_status \ + "$shard_path" "$is_failure_flag" "$is_timeout_flag" \ + shard_names shard_statuses shard_test_counts || true + + # If this shard failed and produced an outputs zip, keep a copy alongside results/ so + # the teardown S3 put filter "**/*outputs.zip" attaches it to the task. + if [[ "${is_failure_flag}" -eq 1 && -n "${output_zip}" ]]; then + cp "${output_zip}" "shard_${shard_num}_test.outputs.zip" + fi + popd >/dev/null +done + +# Surface bazel's saved invocation (written by save_invocation in resmoke_tests_execute_bazel.sh) +# at ${workdir}/bazel-invocation.txt for the teardown S3 put. 
+if [ -f "${workdir}/src/bazel-invocation.txt" ]; then + cp "${workdir}/src/bazel-invocation.txt" "${workdir}/bazel-invocation.txt" +fi + +bazel_test_results::print_executor_logs + +bazel_test_results::display_test_summary shard_names shard_statuses shard_test_counts + +bazel_test_results::combine_metrics + +bazel_test_results::combine_reports + +# Check for system-level failures (TIMEOUT or NO_REPORT) +for status in "${shard_statuses[@]}"; do + if [[ "${status}" == "TIMEOUT" || "${status}" == "NO_REPORT" ]]; then + echo "Error: One or more shards had TIMEOUT or NO_REPORT status. Not all tests ran or were reported." >&2 + bazel_test_results::write_test_failures_expansion + exit 1 + fi +done + +if [[ "${fail_task}" -eq 1 ]]; then + bazel_test_results::write_test_failures_expansion +fi + +exit 0 diff --git a/evergreen/resmoke_tests_execute_bazel.sh b/evergreen/resmoke_tests_execute_bazel.sh index 444373c9d42..139d6826ea7 100644 --- a/evergreen/resmoke_tests_execute_bazel.sh +++ b/evergreen/resmoke_tests_execute_bazel.sh @@ -13,6 +13,9 @@ DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" # Result tasks re-invoke this script to conditionally re-execute the test. The test should # execute unless the task was activated by the resmoke_tests task that already ran all tests. exit_early_if_result_task() { + if [[ "${resmoke_disable_rbe}" == "true" ]]; then + return # Local exec: result tasks must always run bazel test themselves. + fi if [[ -f "src/build_events.json" && "$activated_by" == "mongodb-mongo-ci-user" ]]; then echo "Tests were executed by the resmoke_tests task, test results will be fetched from their remote execution." 
exit 0 @@ -56,10 +59,14 @@ build_ci_flags() { export compile_variant="${compile_variant}" export version_id="${version_id}" - if [[ "${evergreen_remote_exec}" == "on" ]]; then + if [[ "${evergreen_remote_exec}" == "on" && "${resmoke_disable_rbe}" != "true" ]]; then ci_flags="--config=remote_test ${ci_flags}" fi + if [[ "${resmoke_disable_rbe}" == "true" ]]; then + ci_flags+=" --//bazel/resmoke:installed_dist_test" + fi + if [ "${should_shuffle}" = true ]; then ci_flags+=" --test_arg=--shuffle" elif [ "${should_shuffle}" = false ]; then @@ -112,8 +119,20 @@ maybe_generate_burn_in_targets() { # Fetches then tests with retries. Leaves the result in the global RET. run_fetch_and_test() { + local fetch_attempts=3 + local test_attempts=2 + if [[ "${resmoke_disable_rbe}" == "true" ]]; then + # Local exec runs a full suite serially on a single host; retrying would just + # repeat hours of work. Cap to a single attempt and extend the bazel-level + # timeout well beyond the remote-exec default so the run can finish. + fetch_attempts=1 + test_attempts=1 + build_timeout_seconds=14400 + export build_timeout_seconds + fi + export RETRY_ON_FAIL=1 - bazel_evergreen_shutils::retry_bazel_cmd 3 "$BAZEL_BINARY" \ + bazel_evergreen_shutils::retry_bazel_cmd $fetch_attempts "$BAZEL_BINARY" \ fetch ${ci_flags} ${bazel_args} ${bazel_compile_flags} ${task_compile_flags} ${patch_compile_flags} ${targets} RET=$? @@ -122,7 +141,7 @@ run_fetch_and_test() { fi export RETRY_ON_FAIL=0 - bazel_evergreen_shutils::retry_bazel_cmd 2 "$BAZEL_BINARY" \ + bazel_evergreen_shutils::retry_bazel_cmd $test_attempts "$BAZEL_BINARY" \ test ${ci_flags} ${bazel_args} ${bazel_compile_flags} ${task_compile_flags} ${patch_compile_flags} --build_event_json_file=build_events.json ${targets} RET=$? @@ -152,7 +171,11 @@ activate_result_tasks() { return fi echo "Activating result task group..." 
- python buildscripts/evergreen_activate_result_tasks.py --expansion-file ../expansions.yml --build-events-file build_events.json + local extra_args="" + if [[ "${resmoke_disable_rbe}" != "true" ]]; then + extra_args="--build-events-file build_events.json" + fi + python buildscripts/evergreen_activate_result_tasks.py --expansion-file ../expansions.yml ${extra_args} } main() { @@ -178,6 +201,12 @@ main() { maybe_generate_burn_in_targets + if [[ "${resmoke_disable_rbe}" == "true" && -z "$result_task" ]]; then + # Local exec runner: skip bazel entirely; each result task will run its own bazel test. + activate_result_tasks + exit 0 + fi + set +o errexit run_fetch_and_test bazel_evergreen_shutils::write_last_engflow_link