SERVER-113067 Add timeout for resmoke hooks when hang analyzer is set (#44262)
GitOrigin-RevId: fe1283f6df85b800d34517708a976a2bb80a61c6
This commit is contained in:
parent
2bed090acf
commit
473af93c3d
@ -158,6 +158,7 @@ DEFAULTS = {
|
||||
"validate_selector_paths": True,
|
||||
# Internal testing options.
|
||||
"internal_params": [],
|
||||
"hang_analyzer_hook_timeout": 180.0, # seconds
|
||||
# Evergreen options.
|
||||
"evergreen_url": "evergreen.mongodb.com",
|
||||
"build_id": None,
|
||||
@ -507,6 +508,9 @@ INCLUDE_TAGS = None
|
||||
# not be set by the user.
|
||||
INTERNAL_PARAMS = []
|
||||
|
||||
# The timeout (in seconds) that hooks are allowed to run after the hang analyzer has signaled Resmoke.
|
||||
HANG_ANALYZER_HOOK_TIMEOUT = None
|
||||
|
||||
# If set, then resmoke.py starts the specified number of Job instances to run tests.
|
||||
JOBS = None
|
||||
|
||||
|
||||
@ -787,6 +787,7 @@ flags in common: {common_set}
|
||||
_config.PAUSE_AFTER_POPULATE = config.pop("pause_after_populate")
|
||||
_config.LOAD_ALL_EXTENSIONS = config.pop("load_all_extensions")
|
||||
_config.NO_HOOKS = config.pop("no_hooks")
|
||||
_config.HANG_ANALYZER_HOOK_TIMEOUT = config.pop("hang_analyzer_hook_timeout")
|
||||
|
||||
# Internal testing options.
|
||||
_config.INTERNAL_PARAMS = config.pop("internal_params")
|
||||
|
||||
@ -13,6 +13,7 @@ py_library(
|
||||
"plugin.py",
|
||||
"process.py",
|
||||
"process_list.py",
|
||||
"timeout_for_hang_analyzer.py",
|
||||
],
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [
|
||||
|
||||
@ -0,0 +1,36 @@
|
||||
import threading
|
||||
|
||||
from buildscripts.resmokelib.flags import HANG_ANALYZER_CALLED
|
||||
|
||||
|
||||
class TimeoutForHangAnalyzer:
    """Run a function in a worker thread, bounded once the hang analyzer fires.

    While ``HANG_ANALYZER_CALLED`` is not set, ``run()`` waits for the function
    for as long as it takes. Once the flag is observed set, the function is
    given ``timeout`` more seconds to finish; if it is still running after
    that, ``run()`` raises ``TimeoutError`` and abandons the worker thread.
    The function itself is not interrupted and keeps running in the
    background; the worker is a daemon thread so an abandoned function cannot
    keep the interpreter alive at exit.
    """

    # Upper bound (seconds) on how long run() waits between checks of
    # HANG_ANALYZER_CALLED while the function is still running.
    _POLL_INTERVAL_SECS = 1.0

    def __init__(self, timeout, func, args=()):
        """Store the call to make and the grace period to enforce.

        Args:
            timeout: seconds the function may keep running after the hang
                analyzer flag is observed; ``None`` means wait indefinitely.
            func: the callable to execute in the worker thread.
            args: positional arguments passed to ``func``.
        """
        self.func = func
        self.args = args
        self.result = None
        self.exception = None
        self.timeout = timeout

    def _worker(self):
        """Thread target: call ``func`` and record its result or exception."""
        try:
            self.result = self.func(*self.args)
        except Exception as e:
            # Stored so run() can re-raise it in the calling thread.
            self.exception = e

    def run(self):
        """Execute the function and return its result.

        Returns:
            Whatever ``func(*args)`` returned.

        Raises:
            TimeoutError: the hang analyzer flag is set and the function did
                not finish within ``timeout`` seconds of that being observed.
            Exception: any exception raised by ``func`` is re-raised here.
        """
        # Daemon thread: if we give up on the function below, it must not
        # block process exit.
        thread = threading.Thread(target=self._worker, daemon=True)
        thread.start()

        # Poll at least as often as the grace period so short timeouts are
        # enforced promptly. join() returns as soon as the thread finishes, so
        # polling adds no latency to a function that completes normally.
        poll_interval = self._POLL_INTERVAL_SECS
        if self.timeout is not None and 0 < self.timeout < poll_interval:
            poll_interval = self.timeout

        while thread.is_alive():
            if HANG_ANALYZER_CALLED.is_set():
                # Start the grace period only now, so the function always gets
                # the full timeout after the signal rather than whatever was
                # left of an earlier wait.
                thread.join(self.timeout)
                if thread.is_alive():
                    raise TimeoutError(
                        f"Function {self.func} execution exceeded the time limit of {self.timeout} seconds."
                    )
                break
            thread.join(poll_interval)

        # Compare against None: an exception object may define a falsy
        # __bool__/__len__ and must still be propagated.
        if self.exception is not None:
            raise self.exception
        return self.result
|
||||
@ -783,7 +783,7 @@ class TestRunner(Subcommand):
|
||||
try:
|
||||
proc.kill()
|
||||
except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess) as exc:
|
||||
proc_msg += f" - target escaped: {type(exc).__name__ }"
|
||||
proc_msg += f" - target escaped: {type(exc).__name__}"
|
||||
else:
|
||||
proc_msg += " - target destroyed\n"
|
||||
print(proc_msg)
|
||||
@ -1719,7 +1719,7 @@ class RunPlugin(PluginInterface):
|
||||
dest="linear_chain",
|
||||
choices=("on", "off"),
|
||||
metavar="ON|OFF",
|
||||
help="Enable or disable linear chaining for tests using " "ReplicaSetFixture.",
|
||||
help="Enable or disable linear chaining for tests using ReplicaSetFixture.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
@ -1965,7 +1965,7 @@ class RunPlugin(PluginInterface):
|
||||
dest="majority_read_concern",
|
||||
choices=("on", "off"),
|
||||
metavar="ON|OFF",
|
||||
help=("Enable or disable majority read concern support." " Defaults to %(default)s."),
|
||||
help=("Enable or disable majority read concern support. Defaults to %(default)s."),
|
||||
)
|
||||
|
||||
mongodb_server_options.add_argument(
|
||||
@ -1979,7 +1979,7 @@ class RunPlugin(PluginInterface):
|
||||
"--storageEngineCacheSizeGB",
|
||||
dest="storage_engine_cache_size_gb",
|
||||
metavar="CONFIG",
|
||||
help="Sets the storage engine cache size configuration" " setting for all mongod's.",
|
||||
help="Sets the storage engine cache size configuration setting for all mongod's.",
|
||||
)
|
||||
|
||||
mongodb_server_options.add_argument(
|
||||
@ -2153,6 +2153,14 @@ class RunPlugin(PluginInterface):
|
||||
"--internalParam", action="append", dest="internal_params", help=argparse.SUPPRESS
|
||||
)
|
||||
|
||||
internal_options.add_argument(
|
||||
"--hangAnalyzerHookTimeout",
|
||||
type=float,
|
||||
dest="hang_analyzer_hook_timeout",
|
||||
help="The time (in seconds) that hooks are allowed to run after the hang"
|
||||
" analyzer has signaled Resmoke.",
|
||||
)
|
||||
|
||||
internal_options.add_argument(
|
||||
"--cedarReportFile",
|
||||
dest="cedar_report_file",
|
||||
@ -2272,7 +2280,7 @@ class RunPlugin(PluginInterface):
|
||||
"--distroId",
|
||||
dest="distro_id",
|
||||
metavar="DISTRO_ID",
|
||||
help=("Sets the identifier for the Evergreen distro running the" " tests."),
|
||||
help=("Sets the identifier for the Evergreen distro running the tests."),
|
||||
)
|
||||
|
||||
evergreen_options.add_argument(
|
||||
@ -2280,14 +2288,14 @@ class RunPlugin(PluginInterface):
|
||||
type=int,
|
||||
dest="execution_number",
|
||||
metavar="EXECUTION_NUMBER",
|
||||
help=("Sets the number for the Evergreen execution running the" " tests."),
|
||||
help=("Sets the number for the Evergreen execution running the tests."),
|
||||
)
|
||||
|
||||
evergreen_options.add_argument(
|
||||
"--gitRevision",
|
||||
dest="git_revision",
|
||||
metavar="GIT_REVISION",
|
||||
help=("Sets the git revision for the Evergreen task running the" " tests."),
|
||||
help=("Sets the git revision for the Evergreen task running the tests."),
|
||||
)
|
||||
|
||||
# We intentionally avoid adding a new command line option that starts with --suite so it doesn't
|
||||
@ -2307,7 +2315,7 @@ class RunPlugin(PluginInterface):
|
||||
"--patchBuild",
|
||||
action="store_true",
|
||||
dest="patch_build",
|
||||
help=("Indicates that the Evergreen task running the tests is a" " patch build."),
|
||||
help=("Indicates that the Evergreen task running the tests is a patch build."),
|
||||
)
|
||||
|
||||
evergreen_options.add_argument(
|
||||
@ -2342,7 +2350,7 @@ class RunPlugin(PluginInterface):
|
||||
"--variantName",
|
||||
dest="variant_name",
|
||||
metavar="VARIANT_NAME",
|
||||
help=("Sets the name of the Evergreen build variant running the" " tests."),
|
||||
help=("Sets the name of the Evergreen build variant running the tests."),
|
||||
)
|
||||
|
||||
evergreen_options.add_argument(
|
||||
@ -2390,7 +2398,7 @@ class RunPlugin(PluginInterface):
|
||||
dest="benchmark_list_tests",
|
||||
action="store_true",
|
||||
# metavar="BENCHMARK_LIST_TESTS",
|
||||
help=("Lists all Google benchmark test configurations in each" " test file."),
|
||||
help=("Lists all Google benchmark test configurations in each test file."),
|
||||
)
|
||||
|
||||
benchmark_min_time_help = (
|
||||
|
||||
@ -12,6 +12,9 @@ from opentelemetry.context.context import Context
|
||||
from opentelemetry.trace.status import StatusCode
|
||||
|
||||
from buildscripts.resmokelib import config, errors
|
||||
from buildscripts.resmokelib.hang_analyzer.timeout_for_hang_analyzer import (
|
||||
TimeoutForHangAnalyzer,
|
||||
)
|
||||
from buildscripts.resmokelib.testing import testcases
|
||||
from buildscripts.resmokelib.testing.fixtures import shardedcluster
|
||||
from buildscripts.resmokelib.testing.fixtures.interface import Fixture, create_fixture_table
|
||||
@ -318,8 +321,16 @@ class Job(object):
|
||||
"""Provide helper to run hook and archival."""
|
||||
try:
|
||||
success = False
|
||||
hook_function(test, self.report)
|
||||
TimeoutForHangAnalyzer(
|
||||
timeout=config.HANG_ANALYZER_HOOK_TIMEOUT,
|
||||
func=hook_function,
|
||||
args=(test, self.report),
|
||||
).run()
|
||||
success = True
|
||||
except TimeoutError:
|
||||
self.logger.error(
|
||||
f"The '{hook_function.__name__}' of hook {hook.__class__.__name__} did not complete in {config.HANG_ANALYZER_HOOK_TIMEOUT} seconds. The hook may continue to run in the background. When the hang analyzer is called, this timeout is enforced to ensure Resmoke can complete a graceful shutdown."
|
||||
)
|
||||
finally:
|
||||
if not success and hook_failure_flag is not None:
|
||||
hook_failure_flag.set()
|
||||
@ -353,8 +364,16 @@ class Job(object):
|
||||
hooks_failed = True
|
||||
try:
|
||||
for hook in self.hooks:
|
||||
hook.after_suite(self.report, teardown_flag)
|
||||
TimeoutForHangAnalyzer(
|
||||
timeout=config.HANG_ANALYZER_HOOK_TIMEOUT,
|
||||
func=hook.after_suite,
|
||||
args=(self.report, teardown_flag),
|
||||
).run()
|
||||
hooks_failed = False
|
||||
except TimeoutError:
|
||||
self.logger.error(
|
||||
f"The 'after_suite' of hook {hook.__class__.__name__} did not complete in {config.HANG_ANALYZER_HOOK_TIMEOUT} seconds. The hook may continue to run in the background. When the hang analyzer is called, this timeout is enforced to ensure Resmoke can complete a graceful shutdown."
|
||||
)
|
||||
finally:
|
||||
if hooks_failed and hook_failure_flag is not None:
|
||||
hook_failure_flag.set()
|
||||
|
||||
21
buildscripts/tests/resmoke_end2end/hooks/sleeping_hook.py
Normal file
21
buildscripts/tests/resmoke_end2end/hooks/sleeping_hook.py
Normal file
@ -0,0 +1,21 @@
|
||||
import threading
|
||||
import time
|
||||
from datetime import timedelta
|
||||
|
||||
from buildscripts.resmokelib.testing.hooks import interface
|
||||
|
||||
|
||||
class SleepingHook(interface.Hook):
    """Test-only hook whose ``after_test`` blocks on a sleeping thread.

    ``before_test`` starts a thread that sleeps for ``sleep_time`` and
    ``after_test`` joins it, so ``after_test`` does not return until the sleep
    has elapsed.
    """

    IS_BACKGROUND = True
    REGISTERED_NAME = "SleepingHook"

    def __init__(self, hook_logger, fixture, sleep_time=timedelta(seconds=120)):
        """Remember how long the background thread should sleep.

        Args:
            hook_logger: accepted for the hook constructor signature; unused here.
            fixture: accepted for the hook constructor signature; unused here.
            sleep_time: ``timedelta`` the background thread sleeps for.
        """
        # NOTE(review): the base class initializer is not invoked and
        # hook_logger/fixture are dropped -- confirm interface.Hook does not
        # rely on state set up in its own __init__.
        self._sleep_time = sleep_time
        self._thread = None

    def before_test(self, test, test_report):
        """Start the sleeping thread for this test."""
        self._thread = threading.Thread(target=time.sleep, args=(self._sleep_time.total_seconds(),))
        self._thread.start()

    def after_test(self, test, test_report):
        """Block until the sleeping thread started by ``before_test`` finishes."""
        # Nothing to wait for if before_test never ran for this hook instance.
        if self._thread is None:
            return
        self._thread.join()
|
||||
@ -2,7 +2,7 @@ test_kind: js_test
|
||||
|
||||
selector:
|
||||
roots:
|
||||
- jstests/resmoke_selftest/end2end/timeout/fixture/*.js
|
||||
- jstests/resmoke_selftest/end2end/timeout/fixture/timeout0.js
|
||||
|
||||
executor:
|
||||
archive:
|
||||
|
||||
@ -0,0 +1,19 @@
|
||||
test_kind: js_test
|
||||
|
||||
selector:
|
||||
roots:
|
||||
- jstests/resmoke_selftest/end2end/timeout/fixture/timeout1.js
|
||||
|
||||
executor:
|
||||
archive:
|
||||
tests: true
|
||||
fixture:
|
||||
class: NoOpFixture
|
||||
fixture:
|
||||
class: ReplicaSetFixture
|
||||
mongod_options:
|
||||
set_parameters:
|
||||
enableTestCommands: 1
|
||||
num_nodes: 2
|
||||
hooks:
|
||||
- class: SleepingHook
|
||||
@ -157,9 +157,16 @@ class TestTimeout(_ResmokeSelftest):
|
||||
signal_resmoke_process.start()
|
||||
|
||||
# Wait for resmoke_process to be killed by 'run-timeout' so this doesn't hang.
|
||||
self.resmoke_process.wait()
|
||||
try:
|
||||
self.resmoke_process.wait(60)
|
||||
return_code = signal_resmoke_process.wait(60)
|
||||
except subprocess.TimeoutExpired:
|
||||
self.resmoke_process.stop()
|
||||
signal_resmoke_process.stop()
|
||||
self.fail(
|
||||
"Resmoke or the hang analyzer process did not terminate within 60 seconds of starting the hang analyzer."
|
||||
)
|
||||
|
||||
return_code = signal_resmoke_process.wait()
|
||||
if return_code != 0:
|
||||
self.resmoke_process.stop()
|
||||
self.assertEqual(return_code, 0)
|
||||
@ -253,6 +260,25 @@ class TestTimeout(_ResmokeSelftest):
|
||||
analysis_pids_to_expect = 6 # 2 tests * (2 mongod + 1 mongo)
|
||||
self.assert_dir_file_count(self.test_dir, self.analysis_file, analysis_pids_to_expect)
|
||||
|
||||
def test_timeout_in_python_hook(self):
    """Run the sleeping-hook suite with a 1-second hang-analyzer hook timeout.

    Resmoke is executed against ``sleeping_hook_timeout.yml`` with test
    archival and the ``test_analysis`` internal parameter enabled, using the
    ``timeout0`` sentinel file. Afterwards the archival and analysis file
    counts in ``self.test_dir`` are checked via ``assert_dir_file_count``.
    """
    self.execute_resmoke(
        [
            "--resmokeModulesPath=buildscripts/tests/resmoke_end2end/test_resmoke_modules.yml",
            "--suites=buildscripts/tests/resmoke_end2end/suites/sleeping_hook_timeout.yml",
            "--taskId=123",
            "--originSuite=resmoke_end2end_tests",
            "--hangAnalyzerHookTimeout=1",
            "--archiveMode=test_archival",
            "--internalParam=test_analysis",
        ],
        sentinel_file="timeout0",
    )

    # One archival directory is expected per mongod node (2 nodes).
    expected_archival_dirs = 2
    self.assert_dir_file_count(self.test_dir, self.archival_file, expected_archival_dirs)

    # One analysis entry is expected per mongod process (2 mongod).
    expected_analysis_pids = 2
    self.assert_dir_file_count(self.test_dir, self.analysis_file, expected_analysis_pids)
|
||||
|
||||
|
||||
class TestTestTimeout(_ResmokeSelftest):
|
||||
def test_individual_test_timeout(self):
|
||||
|
||||
@ -0,0 +1,3 @@
|
||||
// Signal that the test has started running by deleting its sentinel file.
// The directory is taken from TMPDIR, then TMP_DIR, falling back to /tmp.
const sentinelDir = _getEnv("TMPDIR") || _getEnv("TMP_DIR") || "/tmp";
let sentinelPath = sentinelDir + "/timeout0.js.sentinel";
removeFile(sentinelPath);
|
||||
Loading…
Reference in New Issue
Block a user