SERVER-113067 Add timeout for resmoke hooks when hang analyzer is set (#44262)

GitOrigin-RevId: fe1283f6df85b800d34517708a976a2bb80a61c6
This commit is contained in:
Sean Lyons 2025-11-24 09:14:58 -05:00 committed by MongoDB Bot
parent 2bed090acf
commit 473af93c3d
11 changed files with 153 additions and 15 deletions

View File

@ -158,6 +158,7 @@ DEFAULTS = {
"validate_selector_paths": True,
# Internal testing options.
"internal_params": [],
"hang_analyzer_hook_timeout": 180.0, # seconds
# Evergreen options.
"evergreen_url": "evergreen.mongodb.com",
"build_id": None,
@ -507,6 +508,9 @@ INCLUDE_TAGS = None
# not be set by the user.
INTERNAL_PARAMS = []
# The timeout (in seconds) that hooks are allowed to run after the hang analyzer has signaled Resmoke.
HANG_ANALYZER_HOOK_TIMEOUT = None
# If set, then resmoke.py starts the specified number of Job instances to run tests.
JOBS = None

View File

@ -787,6 +787,7 @@ flags in common: {common_set}
_config.PAUSE_AFTER_POPULATE = config.pop("pause_after_populate")
_config.LOAD_ALL_EXTENSIONS = config.pop("load_all_extensions")
_config.NO_HOOKS = config.pop("no_hooks")
_config.HANG_ANALYZER_HOOK_TIMEOUT = config.pop("hang_analyzer_hook_timeout")
# Internal testing options.
_config.INTERNAL_PARAMS = config.pop("internal_params")

View File

@ -13,6 +13,7 @@ py_library(
"plugin.py",
"process.py",
"process_list.py",
"timeout_for_hang_analyzer.py",
],
visibility = ["//visibility:public"],
deps = [

View File

@ -0,0 +1,36 @@
import threading
from buildscripts.resmokelib.flags import HANG_ANALYZER_CALLED
class TimeoutForHangAnalyzer:
    """Run a function on a worker thread, bounding the wait once the hang analyzer has fired.

    ``run()`` executes ``func(*args)`` on a separate thread and waits for it in
    intervals of ``timeout`` seconds. At the end of each interval, if
    ``HANG_ANALYZER_CALLED`` is set and the function is still running, a
    ``TimeoutError`` is raised. The function is not interrupted; it keeps
    running in the background. While the flag is not set, ``run()`` keeps
    waiting for as long as the function takes.

    The flag is only examined when a wait interval ends, so after the flag is
    set the function gets at most ``timeout`` more seconds, and possibly less.
    """

    def __init__(self, timeout, func, args=()):
        """Record the call to make; nothing is started until ``run()``.

        Args:
            timeout: Length in seconds of each wait interval (handed to
                ``Thread.join``). ``None`` makes every wait unbounded, which
                means the timeout is never enforced.
            func: Callable to execute on the worker thread.
            args: Positional arguments passed to ``func``.
        """
        self.func = func
        self.args = args
        self.result = None
        self.exception = None
        self.timeout = timeout

    def _worker(self):
        """Thread target: call the function and stash its result or exception."""
        try:
            self.result = self.func(*self.args)
        except Exception as e:
            # Kept so run() can re-raise it on the calling thread. Only
            # ``Exception`` subclasses are captured here.
            self.exception = e

    def run(self):
        """Execute the function and return its result.

        Returns:
            Whatever ``func(*args)`` returned.

        Raises:
            TimeoutError: The hang analyzer flag is set and the function was
                still running when a wait interval ended.
            Exception: Any exception raised by ``func`` is re-raised here.
        """
        # Daemon thread: once we give up on a hung function, the abandoned
        # worker must not keep the interpreter alive at exit. On the normal
        # path the thread is always joined before run() returns, so this only
        # matters after a TimeoutError.
        thread = threading.Thread(target=self._worker, daemon=True)
        thread.start()

        while True:
            thread.join(self.timeout)

            if not thread.is_alive():
                # Compare against None explicitly so an exception object that
                # happens to evaluate falsy is still propagated.
                if self.exception is not None:
                    raise self.exception
                return self.result

            if HANG_ANALYZER_CALLED.is_set():
                raise TimeoutError(
                    f"Function {self.func} execution exceeded the time limit of {self.timeout} seconds."
                )

View File

@ -783,7 +783,7 @@ class TestRunner(Subcommand):
try:
proc.kill()
except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess) as exc:
proc_msg += f" - target escaped: {type(exc).__name__ }"
proc_msg += f" - target escaped: {type(exc).__name__}"
else:
proc_msg += " - target destroyed\n"
print(proc_msg)
@ -1719,7 +1719,7 @@ class RunPlugin(PluginInterface):
dest="linear_chain",
choices=("on", "off"),
metavar="ON|OFF",
help="Enable or disable linear chaining for tests using " "ReplicaSetFixture.",
help="Enable or disable linear chaining for tests using ReplicaSetFixture.",
)
parser.add_argument(
@ -1965,7 +1965,7 @@ class RunPlugin(PluginInterface):
dest="majority_read_concern",
choices=("on", "off"),
metavar="ON|OFF",
help=("Enable or disable majority read concern support." " Defaults to %(default)s."),
help=("Enable or disable majority read concern support. Defaults to %(default)s."),
)
mongodb_server_options.add_argument(
@ -1979,7 +1979,7 @@ class RunPlugin(PluginInterface):
"--storageEngineCacheSizeGB",
dest="storage_engine_cache_size_gb",
metavar="CONFIG",
help="Sets the storage engine cache size configuration" " setting for all mongod's.",
help="Sets the storage engine cache size configuration setting for all mongod's.",
)
mongodb_server_options.add_argument(
@ -2153,6 +2153,14 @@ class RunPlugin(PluginInterface):
"--internalParam", action="append", dest="internal_params", help=argparse.SUPPRESS
)
internal_options.add_argument(
"--hangAnalyzerHookTimeout",
type=float,
dest="hang_analyzer_hook_timeout",
help="The time (in seconds) that hooks are allowed to run after the hang"
" analyzer has signaled Resmoke.",
)
internal_options.add_argument(
"--cedarReportFile",
dest="cedar_report_file",
@ -2272,7 +2280,7 @@ class RunPlugin(PluginInterface):
"--distroId",
dest="distro_id",
metavar="DISTRO_ID",
help=("Sets the identifier for the Evergreen distro running the" " tests."),
help=("Sets the identifier for the Evergreen distro running the tests."),
)
evergreen_options.add_argument(
@ -2280,14 +2288,14 @@ class RunPlugin(PluginInterface):
type=int,
dest="execution_number",
metavar="EXECUTION_NUMBER",
help=("Sets the number for the Evergreen execution running the" " tests."),
help=("Sets the number for the Evergreen execution running the tests."),
)
evergreen_options.add_argument(
"--gitRevision",
dest="git_revision",
metavar="GIT_REVISION",
help=("Sets the git revision for the Evergreen task running the" " tests."),
help=("Sets the git revision for the Evergreen task running the tests."),
)
# We intentionally avoid adding a new command line option that starts with --suite so it doesn't
@ -2307,7 +2315,7 @@ class RunPlugin(PluginInterface):
"--patchBuild",
action="store_true",
dest="patch_build",
help=("Indicates that the Evergreen task running the tests is a" " patch build."),
help=("Indicates that the Evergreen task running the tests is a patch build."),
)
evergreen_options.add_argument(
@ -2342,7 +2350,7 @@ class RunPlugin(PluginInterface):
"--variantName",
dest="variant_name",
metavar="VARIANT_NAME",
help=("Sets the name of the Evergreen build variant running the" " tests."),
help=("Sets the name of the Evergreen build variant running the tests."),
)
evergreen_options.add_argument(
@ -2390,7 +2398,7 @@ class RunPlugin(PluginInterface):
dest="benchmark_list_tests",
action="store_true",
# metavar="BENCHMARK_LIST_TESTS",
help=("Lists all Google benchmark test configurations in each" " test file."),
help=("Lists all Google benchmark test configurations in each test file."),
)
benchmark_min_time_help = (

View File

@ -12,6 +12,9 @@ from opentelemetry.context.context import Context
from opentelemetry.trace.status import StatusCode
from buildscripts.resmokelib import config, errors
from buildscripts.resmokelib.hang_analyzer.timeout_for_hang_analyzer import (
TimeoutForHangAnalyzer,
)
from buildscripts.resmokelib.testing import testcases
from buildscripts.resmokelib.testing.fixtures import shardedcluster
from buildscripts.resmokelib.testing.fixtures.interface import Fixture, create_fixture_table
@ -318,8 +321,16 @@ class Job(object):
"""Provide helper to run hook and archival."""
try:
success = False
hook_function(test, self.report)
TimeoutForHangAnalyzer(
timeout=config.HANG_ANALYZER_HOOK_TIMEOUT,
func=hook_function,
args=(test, self.report),
).run()
success = True
except TimeoutError:
self.logger.error(
f"The '{hook_function.__name__}' of hook {hook.__class__.__name__} did not complete in {config.HANG_ANALYZER_HOOK_TIMEOUT} seconds. The hook may continue to run in the background. When the hang analyzer is called, this timeout is enforced to ensure Resmoke can complete a graceful shutdown."
)
finally:
if not success and hook_failure_flag is not None:
hook_failure_flag.set()
@ -353,8 +364,16 @@ class Job(object):
hooks_failed = True
try:
for hook in self.hooks:
hook.after_suite(self.report, teardown_flag)
TimeoutForHangAnalyzer(
timeout=config.HANG_ANALYZER_HOOK_TIMEOUT,
func=hook.after_suite,
args=(self.report, teardown_flag),
).run()
hooks_failed = False
except TimeoutError:
self.logger.error(
f"The 'after_suite' of hook {hook.__class__.__name__} did not complete in {config.HANG_ANALYZER_HOOK_TIMEOUT} seconds. The hook may continue to run in the background. When the hang analyzer is called, this timeout is enforced to ensure Resmoke can complete a graceful shutdown."
)
finally:
if hooks_failed and hook_failure_flag is not None:
hook_failure_flag.set()

View File

@ -0,0 +1,21 @@
import threading
import time
from datetime import timedelta
from buildscripts.resmokelib.testing.hooks import interface
class SleepingHook(interface.Hook):
    """Hook whose ``after_test`` blocks until a background sleep has finished.

    ``before_test`` starts a thread that sleeps for ``sleep_time``;
    ``after_test`` joins that thread, so it cannot return before the sleep is
    over.
    """

    IS_BACKGROUND = True
    REGISTERED_NAME = "SleepingHook"

    def __init__(self, hook_logger, fixture, sleep_time=timedelta(seconds=120)):
        """Remember how long to sleep.

        Args:
            hook_logger: Accepted but not used by this hook.
            fixture: Accepted but not used by this hook.
            sleep_time: ``timedelta`` giving the duration of the background sleep.
        """
        # NOTE(review): the base-class initializer is not invoked here;
        # confirm that is intentional for this test-only hook.
        self._thread = None
        self._sleep_time = sleep_time

    def before_test(self, test, test_report):
        """Start the background sleeper thread."""
        seconds = self._sleep_time.total_seconds()
        sleeper = threading.Thread(target=time.sleep, args=(seconds,))
        self._thread = sleeper
        sleeper.start()

    def after_test(self, test, test_report):
        """Block until the sleeper thread started in ``before_test`` exits."""
        sleeper = self._thread
        sleeper.join()

View File

@ -2,7 +2,7 @@ test_kind: js_test
selector:
roots:
- jstests/resmoke_selftest/end2end/timeout/fixture/*.js
- jstests/resmoke_selftest/end2end/timeout/fixture/timeout0.js
executor:
archive:

View File

@ -0,0 +1,19 @@
# Resmoke self-test suite: runs the timeout1.js fixture test on a two-node
# ReplicaSetFixture with the SleepingHook attached.
test_kind: js_test
selector:
roots:
- jstests/resmoke_selftest/end2end/timeout/fixture/timeout1.js
executor:
archive:
tests: true
# NOTE(review): two `fixture:` keys follow (NoOpFixture, then ReplicaSetFixture).
# Their nesting is not recoverable from this view - confirm which parent each
# one belongs to.
fixture:
class: NoOpFixture
fixture:
class: ReplicaSetFixture
mongod_options:
set_parameters:
enableTestCommands: 1
num_nodes: 2
hooks:
# SleepingHook is the hook whose after_test blocks on a background sleep.
- class: SleepingHook

View File

@ -157,9 +157,16 @@ class TestTimeout(_ResmokeSelftest):
signal_resmoke_process.start()
# Wait for resmoke_process to be killed by 'run-timeout' so this doesn't hang.
self.resmoke_process.wait()
try:
self.resmoke_process.wait(60)
return_code = signal_resmoke_process.wait(60)
except subprocess.TimeoutExpired:
self.resmoke_process.stop()
signal_resmoke_process.stop()
self.fail(
"Resmoke or the hang analyzer process did not terminate within 60 seconds of starting the hang analyzer."
)
return_code = signal_resmoke_process.wait()
if return_code != 0:
self.resmoke_process.stop()
self.assertEqual(return_code, 0)
@ -253,6 +260,25 @@ class TestTimeout(_ResmokeSelftest):
analysis_pids_to_expect = 6 # 2 tests * (2 mongod + 1 mongo)
self.assert_dir_file_count(self.test_dir, self.analysis_file, analysis_pids_to_expect)
def test_timeout_in_python_hook(self):
    """Run the sleeping-hook suite with a 1-second hook timeout and check the artifacts.

    Resmoke is launched on ``sleeping_hook_timeout.yml`` with
    ``--hangAnalyzerHookTimeout=1``. Afterwards the test directory must hold
    two archival entries and two analysis entries, one per mongod node.
    """
    suite_args = [
        "--resmokeModulesPath=buildscripts/tests/resmoke_end2end/test_resmoke_modules.yml",
        "--suites=buildscripts/tests/resmoke_end2end/suites/sleeping_hook_timeout.yml",
        "--taskId=123",
        "--originSuite=resmoke_end2end_tests",
        "--hangAnalyzerHookTimeout=1",
        "--archiveMode=test_archival",
        "--internalParam=test_analysis",
    ]
    self.execute_resmoke(suite_args, sentinel_file="timeout0")

    # One archive per mongod node.
    expected_archives = 2
    self.assert_dir_file_count(self.test_dir, self.archival_file, expected_archives)

    # One analyzed pid per mongod node.
    expected_analyses = 2
    self.assert_dir_file_count(self.test_dir, self.analysis_file, expected_analyses)
class TestTestTimeout(_ResmokeSelftest):
def test_individual_test_timeout(self):

View File

@ -0,0 +1,3 @@
// Signal that the test has started running by deleting its sentinel file.
// The sentinel lives in the first temp directory found among $TMPDIR and
// $TMP_DIR, falling back to /tmp.
const tmpDir = _getEnv("TMPDIR") || _getEnv("TMP_DIR") || "/tmp";
const sentinelPath = tmpDir + "/timeout0.js.sentinel";
removeFile(sentinelPath);