mongo/buildscripts/resmokelib/sighandler.py
Steve McClure d584641e47 SERVER-127547: Fix resmoke e2e flake (#54391)
Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
GitOrigin-RevId: 85fb3a165f9f6fc0d3ae8517c260fd238281d43b
2026-05-26 20:00:44 +00:00

187 lines
6.9 KiB
Python

"""Utility to support asynchronously signaling the current process."""
import atexit
import os
import signal
import sys
import threading
import time
import traceback
import psutil
from buildscripts.resmokelib import config, parser, reportfile, testing
from buildscripts.resmokelib.flags import HANG_ANALYZER_CALLED
from buildscripts.resmokelib.utils.self_test_fakes import test_analysis
# True when sys.platform reports "win32". It gates the pywin32 imports below and is
# checked again in register() to choose between the event-object and SIGUSR1 mechanisms.
_IS_WINDOWS = sys.platform == "win32"
if _IS_WINDOWS:
    # pywin32 modules are imported only on Windows, where register() uses them.
    import win32api
    import win32event
def register(logger, suites, start_time):
    """Register an event object to wait for signal, or a signal handler for SIGUSR1.

    When _IS_WINDOWS is set, a named event object is created and a daemon thread waits on
    it; otherwise a SIGUSR1 handler is installed. Either trigger dumps the stacks of all
    threads, writes the report file, logs the suite summaries and, when child PIDs were
    collected, runs the hang analyzer against them.

    :param logger: logger used for the stack dump, the summaries and error reporting.
    :param suites: suites handed to reportfile.write() and Suite.log_summaries().
    :param start_time: timestamp on the time.time() clock; the elapsed time reported in the
        summaries is measured from it.
    """

    def _snapshot_and_dump(header_msg):
        """Collect child PIDs, set HANG_ANALYZER_CALLED, then dump stacks and log."""
        # Snapshot subprocesses before setting HANG_ANALYZER_CALLED. Other threads observe
        # that flag and may tear down fixtures (killing mongod/mongo processes). Capturing
        # here ensures we see the full process set before any teardown races can occur.
        if "is_inner_level" in config.INTERNAL_PARAMS:
            pids_to_analyze = []
        else:
            pids_to_analyze = _get_pids()
        HANG_ANALYZER_CALLED.set()
        _dump_and_log(header_msg, pids_to_analyze)

    def _handle_sigusr1(signum, frame):
        """Signal handler for SIGUSR1.

        The handler will dump the stacks of all threads and write out the report file and
        log suite summaries.
        """
        _snapshot_and_dump("Dumping stacks due to SIGUSR1 signal")

    def _handle_set_event(event_handle):
        """Event object handler for Windows.

        The handler will dump the stacks of all threads and write out the report file and
        log suite summaries each time the event is set. It returns, ending the waiting
        thread, as soon as a wait on the event fails.
        """
        while True:
            try:
                # Wait for task time out to dump stacks.
                ret = win32event.WaitForSingleObject(event_handle, win32event.INFINITE)
            except win32event.error as err:
                logger.error("Exception from win32event.WaitForSingleObject with error: %s", err)
                # Stop instead of looping: retrying immediately on the same handle would
                # spin and log the same error over and over.
                return
            if ret != win32event.WAIT_OBJECT_0:
                logger.error("_handle_set_event WaitForSingleObject failed: %d", ret)
                return
            _snapshot_and_dump("Dumping stacks due to signal from win32event.SetEvent")

    def _dump_and_log(header_msg, pids_to_analyze):
        """Dump the stacks of all threads, write report file, and log suite summaries."""
        _dump_stacks(logger, header_msg)
        reportfile.write(suites)
        testing.suite.Suite.log_summaries(logger, suites, time.time() - start_time)
        # An empty PID list (inner-level resmoke, or no matching children) skips analysis.
        if pids_to_analyze:
            _analyze_pids(logger, pids_to_analyze)

    # On Windows spawn a thread to wait on an event object for signal to dump stacks. For Cygwin
    # platforms, we use a signal handler since it supports POSIX signals.
    if _IS_WINDOWS:
        # Create unique event_name.
        event_name = "Global\\Mongo_Python_" + str(os.getpid())

        try:
            security_attributes = None
            manual_reset = False
            initial_state = False
            task_timeout_handle = win32event.CreateEvent(
                security_attributes, manual_reset, initial_state, event_name
            )
        except win32event.error as err:
            logger.error("Exception from win32event.CreateEvent with error: %s", err)
            return

        # Register to close event object handle on exit.
        atexit.register(win32api.CloseHandle, task_timeout_handle)

        # Create the waiting thread as a daemon so it never keeps the process alive.
        event_handler_thread = threading.Thread(
            target=_handle_set_event,
            kwargs={"event_handle": task_timeout_handle},
            name="windows_event_handler_thread",
            daemon=True,
        )
        event_handler_thread.start()
    else:
        # Otherwise register a signal handler
        signal.signal(signal.SIGUSR1, _handle_sigusr1)
def _dump_stacks(logger, header_msg):
    """Log a single message containing the header and every thread's current stack."""
    thread_frames = sys._current_frames()

    # Header, thread count, then a blank separator line before the per-thread sections.
    report = [header_msg, "Total threads: %d" % (len(thread_frames)), ""]

    for ident, top_frame in thread_frames.items():
        report.append("Thread %d:" % (ident))
        report.append("".join(traceback.format_stack(top_frame)))

    logger.info("\n".join(report))
def _get_pids():
    """Return all PIDs spawned by the current resmoke process and their child PIDs.

    Children whose process name contains "python" are left out, and so are children that
    disappear while the list is being built.
    """
    pids = []  # Gather fixture PIDs + any PIDs spawned by the fixtures.
    parent = psutil.Process()  # current process
    for child in parent.children(recursive=True):
        try:
            child_name = child.name()
        except psutil.NoSuchProcess:
            # The child exited (or turned into a zombie) after being enumerated. Skip it so
            # the caller, which runs inside the SIGUSR1 handler, is not aborted.
            continue
        # Don't signal python threads. They have already been signalled in the evergreen timeout
        # section.
        if "python" not in child_name.lower():
            pids.append(child.pid)
    return pids
def _analyze_pids(logger, pids):
    """Run the resmoke hang analyzer on the PIDs spawned by the current resmoke process."""
    # If 'test_analysis' is specified, we will just write the pids out to a file and kill them
    # instead of running analysis. This option will only be specified in resmoke selftests.
    if "test_analysis" in config.INTERNAL_PARAMS:
        test_analysis(logger, pids)
        return

    # See hang-analyzer argument options here:
    # https://github.com/10gen/mongo/blob/8636ede10bd70b32ff4b6cd115132ab0f22b89c7/buildscripts/resmokelib/hang_analyzer/hang_analyzer.py#L245
    analyzer = parser.parse_command_line(
        [
            "hang-analyzer",
            "-c",
            "-o",
            "file",
            "-o",
            "stdout",
            "-k",
            "-d",
            ",".join(map(str, pids)),
        ],
        logger=logger,
    )

    # Evergreen has a 15 minute timeout for task timeout commands
    # Limit the hang analyzer to 12 minutes so there is time for other tasks.
    # Outside Evergreen the join below waits without a limit.
    join_timeout_secs = None
    if config.EVERGREEN_TASK_ID:
        join_timeout_secs = 60 * 12
        logger.info(
            "Limit the resmoke invoked hang analyzer to 12 minutes so there is time for resmoke to finish up."
        )

    # Run the analyzer on a daemon thread so that the wait on it can be bounded.
    worker = threading.Thread(target=analyzer.execute, daemon=True)
    worker.start()
    worker.join(join_timeout_secs)

    if not worker.is_alive():
        logger.info("Done running resmoke invoked hang analyzer thread.")
        return

    # The join timed out: the analyzer thread is still running in the background.
    logger.warning(
        "Resmoke invoked hang analyzer thread did not finish, but will continue running in the background. The thread may be disruputed and may show extraneous output."
    )
    logger.warning("Cleaning up resmoke child processes so that resmoke can fail gracefully.")
    analyzer.kill_rogue_processes()