SERVER-27874 Display locks and generate digraph for threads using LockManager locks and/or pthread_mutexes
This commit is contained in:
parent
38a2af996a
commit
5fe822f53e
4
.gdbinit
4
.gdbinit
@ -2,5 +2,7 @@
|
||||
set python print-stack full
|
||||
|
||||
# Load the mongodb pretty printers
|
||||
#
|
||||
source buildscripts/gdb/mongo.py
|
||||
|
||||
# Load the mongodb lock analysis
|
||||
source buildscripts/gdb/mongo_lock.py
|
||||
|
||||
@ -4,6 +4,7 @@ from __future__ import print_function
|
||||
|
||||
import gdb.printing
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
|
||||
try:
|
||||
@ -29,6 +30,27 @@ def get_process_name():
|
||||
return os.path.splitext(os.path.basename(main_binary_name))[0]
|
||||
|
||||
|
||||
def get_thread_id():
|
||||
"""Returns the thread_id of the current GDB thread"""
|
||||
# GDB thread example:
|
||||
# RHEL
|
||||
# [Current thread is 1 (Thread 0x7f072426cca0 (LWP 12867))]
|
||||
thread_info = gdb.execute("thread", from_tty=False, to_string=True)
|
||||
|
||||
if sys.platform.startswith("linux"):
|
||||
match = re.search(r"Thread (?P<pthread_id>0x[0-9a-f]+)", thread_info)
|
||||
if match:
|
||||
return int(match.group("pthread_id"), 16)
|
||||
elif sys.platform.startswith("sunos"):
|
||||
match = re.search(r"Thread (?P<pthread_id>[0-9]+)", thread_info)
|
||||
if match:
|
||||
return int(match.group("pthread_id"), 10)
|
||||
lwpid = gdb.selected_thread().ptid[1]
|
||||
if lwpid != 0:
|
||||
return lwpid
|
||||
raise ValueError("Failed to find thread id in {}".format(thread_info))
|
||||
|
||||
|
||||
###################################################################################################
|
||||
#
|
||||
# Pretty-Printers
|
||||
@ -336,35 +358,34 @@ class DumpGlobalServiceContext(gdb.Command):
|
||||
DumpGlobalServiceContext()
|
||||
|
||||
|
||||
class MongoDBAnalyze(gdb.Command):
|
||||
"""Analyze MongoDB process"""
|
||||
class MongoDBDumpLocks(gdb.Command):
|
||||
"""Dump locks in mongod process"""
|
||||
|
||||
def __init__(self):
|
||||
register_mongo_command(self, "mongodb-analyze", gdb.COMMAND_STATUS)
|
||||
register_mongo_command(self, "mongodb-dump-locks", gdb.COMMAND_DATA)
|
||||
|
||||
def invoke(self, arg, _from_tty):
|
||||
print("Running Hang Analyzer Supplement")
|
||||
|
||||
main_binary_name = get_process_name()
|
||||
if main_binary_name.endswith('mongod'):
|
||||
self.analyze_mongod()
|
||||
if main_binary_name == 'mongod':
|
||||
self.dump_mongod_locks()
|
||||
else:
|
||||
print("No process specific analysis done for: %s" % (main_binary_name))
|
||||
print("Not invoking mongod lock dump for: %s" % (main_binary_name))
|
||||
|
||||
def analyze_mongod(self):
|
||||
def dump_mongod_locks(self):
|
||||
"""GDB in-process python supplement"""
|
||||
|
||||
try:
|
||||
|
||||
# Call into mongod, and dump the state of lock manager
|
||||
# Note that output will go to mongod's standard output, not the debugger output window
|
||||
gdb.execute("call ('mongo::(anonymous namespace)::globalLockManager').dump()",
|
||||
from_tty=False, to_string=False)
|
||||
except gdb.error as gdberr:
|
||||
print("Ignoring error '%s'" % str(gdberr))
|
||||
print("Ignoring error '%s' in dump_mongod_locks" % str(gdberr))
|
||||
|
||||
# Register command
|
||||
MongoDBAnalyze()
|
||||
MongoDBDumpLocks()
|
||||
|
||||
|
||||
class MongoDBUniqueStack(gdb.Command):
|
||||
@ -394,7 +415,7 @@ class MongoDBUniqueStack(gdb.Command):
|
||||
thread_info = {} # thread dict to hold per thread data
|
||||
thread_info['frames'] = [] # the complete backtrace per thread from gdb
|
||||
thread_info['functions'] = [] # list of function names from frames
|
||||
thread_info['pthread'] = hex(int(gdb.parse_and_eval("(uint64_t)pthread_self()")))
|
||||
thread_info['pthread'] = hex(get_thread_id())
|
||||
|
||||
frame = gdb.newest_frame()
|
||||
while frame:
|
||||
@ -480,7 +501,7 @@ class MongoDBHelp(gdb.Command):
|
||||
"""Dump list of mongodb commands"""
|
||||
|
||||
def __init__(self):
|
||||
gdb.Command.__init__(self, "mongodb-help", gdb.COMMAND_DATA)
|
||||
gdb.Command.__init__(self, "mongodb-help", gdb.COMMAND_SUPPORT)
|
||||
|
||||
def invoke(self, arg, _from_tty):
|
||||
print("Command - Description")
|
||||
|
||||
311
buildscripts/gdb/mongo_lock.py
Normal file
311
buildscripts/gdb/mongo_lock.py
Normal file
@ -0,0 +1,311 @@
|
||||
from __future__ import print_function
|
||||
|
||||
import gdb
|
||||
import gdb.printing
|
||||
import re
|
||||
import sys
|
||||
|
||||
if sys.version_info[0] >= 3:
|
||||
# GDB only permits converting a gdb.Value instance to its numerical address when using the
|
||||
# long() constructor in Python 2 and not when using the int() constructor. We define the
|
||||
# 'long' class as an alias for the 'int' class in Python 3 for compatibility.
|
||||
long = int
|
||||
|
||||
|
||||
class Thread(object):
|
||||
def __init__(self, thread_id, lwpid):
|
||||
self.thread_id = thread_id
|
||||
self.lwpid = lwpid
|
||||
|
||||
def __eq__(self, other):
|
||||
if isinstance(other, Thread):
|
||||
return self.thread_id == other.thread_id
|
||||
return NotImplemented
|
||||
|
||||
def __ne__(self, other):
|
||||
return not self == other
|
||||
|
||||
def __str__(self):
|
||||
return "Thread 0x{:012x} (LWP {})".format(self.thread_id, self.lwpid)
|
||||
|
||||
def key(self):
|
||||
return "Thread 0x{:012x}".format(self.thread_id)
|
||||
|
||||
|
||||
class Lock(object):
|
||||
def __init__(self, addr, resource):
|
||||
self.addr = addr
|
||||
self.resource = resource
|
||||
|
||||
def __eq__(self, other):
|
||||
if isinstance(other, Lock):
|
||||
return self.addr == other.addr
|
||||
return NotImplemented
|
||||
|
||||
def __ne__(self, other):
|
||||
return not self == other
|
||||
|
||||
def __str__(self):
|
||||
return "Lock 0x{:012x} ({})".format(self.addr, self.resource)
|
||||
|
||||
def key(self):
|
||||
return "Lock 0x{:012x}".format(self.addr)
|
||||
|
||||
|
||||
class Graph(object):
|
||||
# The Graph is a dict with the following structure:
|
||||
# {'node_key': {'node': {id: val}, 'next_nodes': [node_key_1, ...]}}
|
||||
# Example graph:
|
||||
# {
|
||||
# 'Lock 1': {'node': {1: 'MongoDB lock'}, 'next_nodes': ['Thread 1']},
|
||||
# 'Lock 2': {'node': {1: 'MongoDB lock'}, 'next_nodes': ['Thread 2']},
|
||||
# 'Thread 1': {'node': {1: 123}, 'next_nodes': ['Lock 2']},
|
||||
# 'Thread 2': {'node': {2: 456}, 'next_nodes': ['Lock 1']}
|
||||
# }
|
||||
def __init__(self):
|
||||
self.nodes = {}
|
||||
|
||||
def add_node(self, node):
|
||||
if not self.find_node(node):
|
||||
self.nodes[node.key()] = {'node': node, 'next_nodes': []}
|
||||
|
||||
def find_node(self, node):
|
||||
if node.key() in self.nodes:
|
||||
return self.nodes[node.key()]
|
||||
return None
|
||||
|
||||
def find_from_node(self, from_node):
|
||||
for node_key in self.nodes:
|
||||
node = self.nodes[node_key]
|
||||
for next_node in node['next_nodes']:
|
||||
if next_node == from_node['node'].key():
|
||||
return node
|
||||
return None
|
||||
|
||||
def remove_nodes_without_edge(self):
|
||||
# Rebuild graph by removing any nodes which do not have any incoming or outgoing any edges.
|
||||
temp_nodes = {}
|
||||
for node_key in self.nodes:
|
||||
node = self.nodes[node_key]
|
||||
if node['next_nodes'] or self.find_from_node(node) is not None:
|
||||
temp_nodes[node_key] = self.nodes[node_key]
|
||||
self.nodes = temp_nodes
|
||||
|
||||
def add_edge(self, from_node, to_node):
|
||||
f = self.find_node(from_node)
|
||||
if f is None:
|
||||
self.add_node(from_node)
|
||||
f = self.nodes[from_node.key()]
|
||||
|
||||
t = self.find_node(to_node)
|
||||
if t is None:
|
||||
self.add_node(to_node)
|
||||
t = self.nodes[to_node.key()]
|
||||
|
||||
for n in f['next_nodes']:
|
||||
if n == to_node.key():
|
||||
return
|
||||
self.nodes[from_node.key()]['next_nodes'].append(to_node.key())
|
||||
|
||||
def print(self):
|
||||
for node_key in self.nodes:
|
||||
print("Node", self.nodes[node_key]['node'])
|
||||
for to in self.nodes[node_key]['next_nodes']:
|
||||
print(" ->", to)
|
||||
|
||||
def to_graph(self):
|
||||
sb = []
|
||||
sb.append('digraph "mongod+lock-status" {')
|
||||
for node_key in self.nodes:
|
||||
for next_node_key in self.nodes[node_key]['next_nodes']:
|
||||
sb.append(' "%s" -> "%s";' % (node_key, next_node_key))
|
||||
for node_key in self.nodes:
|
||||
sb.append(' "%s" [label="%s"]' % (node_key, self.nodes[node_key]['node']))
|
||||
sb.append("}")
|
||||
return "\n".join(sb)
|
||||
|
||||
|
||||
def find_lwpid(thread_dict, search_thread_id):
|
||||
for (lwpid, thread_id) in thread_dict.items():
|
||||
if thread_id == search_thread_id:
|
||||
return lwpid
|
||||
return None
|
||||
|
||||
|
||||
def find_func_block(block):
|
||||
while block:
|
||||
if block.function:
|
||||
return block
|
||||
block = block.superblock
|
||||
return None
|
||||
|
||||
|
||||
def find_frame(function_name_pattern):
|
||||
frame = gdb.newest_frame()
|
||||
while frame:
|
||||
block = None
|
||||
try:
|
||||
block = frame.block()
|
||||
except RuntimeError as err:
|
||||
if err.args[0] != "Cannot locate block for frame.":
|
||||
raise
|
||||
|
||||
block = find_func_block(block)
|
||||
if block and re.match(function_name_pattern, block.function.name):
|
||||
return frame
|
||||
frame = frame.older()
|
||||
return None
|
||||
|
||||
|
||||
def find_mutex_holder(graph, thread_dict, show):
|
||||
frame = find_frame(r'std::mutex::lock\(\)')
|
||||
if frame is None:
|
||||
return
|
||||
|
||||
frame.select()
|
||||
|
||||
# Waiting for mutex locking!
|
||||
mutex_this, _ = gdb.lookup_symbol("this", frame.block())
|
||||
mutex_value = mutex_this.value(frame)
|
||||
# The mutex holder is a LWPID
|
||||
mutex_holder = int(mutex_value["_M_mutex"]["__data"]["__owner"])
|
||||
mutex_holder_id = thread_dict[mutex_holder]
|
||||
|
||||
(_, mutex_waiter_lwpid, _) = gdb.selected_thread().ptid
|
||||
mutex_waiter_id = thread_dict[mutex_waiter_lwpid]
|
||||
if show:
|
||||
print("Mutex at {} held by thread 0x{:x} (LWP {}) "
|
||||
" waited on by thread 0x{:x} (LWP {})".format(mutex_value,
|
||||
mutex_holder_id,
|
||||
mutex_holder,
|
||||
mutex_waiter_id,
|
||||
mutex_waiter_lwpid))
|
||||
if graph:
|
||||
graph.add_edge(Thread(mutex_waiter_id, mutex_waiter_lwpid),
|
||||
Lock(long(mutex_value), "Mutex"))
|
||||
graph.add_edge(Lock(long(mutex_value), "Mutex"), Thread(mutex_holder_id, mutex_holder))
|
||||
|
||||
|
||||
def find_lock_manager_holders(graph, thread_dict, show):
|
||||
frame = find_frame(r'mongo::LockerImpl\<.*\>::')
|
||||
if not frame:
|
||||
return
|
||||
|
||||
frame.select()
|
||||
|
||||
(_, lwpid, _) = gdb.selected_thread().ptid
|
||||
|
||||
locker_ptr_type = gdb.lookup_type("mongo::LockerImpl<false>").pointer()
|
||||
lock_head = gdb.parse_and_eval(
|
||||
"mongo::getGlobalLockManager()->_getBucket(resId)->findOrInsert(resId)")
|
||||
|
||||
grantedList = lock_head.dereference()["grantedList"]
|
||||
lock_request_ptr = grantedList["_front"]
|
||||
while lock_request_ptr:
|
||||
lock_request = lock_request_ptr.dereference()
|
||||
locker_ptr = lock_request["locker"]
|
||||
locker_ptr = locker_ptr.cast(locker_ptr_type)
|
||||
locker = locker_ptr.dereference()
|
||||
lock_thread_id = int(locker["_threadId"]["_M_thread"])
|
||||
lock_thread_lwpid = find_lwpid(thread_dict, lock_thread_id)
|
||||
if show:
|
||||
print("MongoDB Lock at {} ({}) held by thread id 0x{:x} (LWP {})".format(
|
||||
lock_head, lock_request["mode"], lock_thread_id, lock_thread_lwpid) +
|
||||
" waited on by thread 0x{:x} (LWP {})".format(thread_dict[lwpid], lwpid))
|
||||
if graph:
|
||||
graph.add_edge(Thread(thread_dict[lwpid], lwpid), Lock(long(lock_head), "MongoDB lock"))
|
||||
graph.add_edge(Lock(long(lock_head), "MongoDB lock"),
|
||||
Thread(lock_thread_id, lock_thread_lwpid))
|
||||
lock_request_ptr = lock_request["next"]
|
||||
|
||||
|
||||
def get_locks(graph, thread_dict, show=False):
|
||||
for thread in gdb.selected_inferior().threads():
|
||||
try:
|
||||
if not thread.is_valid():
|
||||
continue
|
||||
thread.switch()
|
||||
find_mutex_holder(graph, thread_dict, show)
|
||||
find_lock_manager_holders(graph, thread_dict, show)
|
||||
except gdb.error as err:
|
||||
print("Ignoring GDB error '%s' in get_locks" % str(err))
|
||||
|
||||
|
||||
def get_threads_info(graph=None):
|
||||
thread_dict = {}
|
||||
for thread in gdb.selected_inferior().threads():
|
||||
try:
|
||||
if not thread.is_valid():
|
||||
continue
|
||||
thread.switch()
|
||||
# PTID is a tuple: Process ID (PID), Lightweight Process ID (LWPID), Thread ID (TID)
|
||||
(_, lwpid, _) = thread.ptid
|
||||
thread_num = thread.num
|
||||
thread_id = get_thread_id()
|
||||
if not thread_id:
|
||||
print("Unable to retrieve thread_info for thread %d" % thread_num)
|
||||
continue
|
||||
thread_dict[lwpid] = thread_id
|
||||
except gdb.error as err:
|
||||
print("Ignoring GDB error '%s' in get_threads_info" % str(err))
|
||||
|
||||
return thread_dict
|
||||
|
||||
|
||||
class MongoDBShowLocks(gdb.Command):
|
||||
"""Show MongoDB locks"""
|
||||
def __init__(self):
|
||||
register_mongo_command(self, "mongodb-show-locks", gdb.COMMAND_DATA)
|
||||
|
||||
def invoke(self, arg, _from_tty):
|
||||
main_binary_name = get_process_name()
|
||||
if main_binary_name.endswith('mongod') or main_binary_name.endswith('mongo'):
|
||||
self.mongodb_show_locks()
|
||||
else:
|
||||
print("No show MongoDB locks done for: %s" % (main_binary_name))
|
||||
|
||||
def mongodb_show_locks(self):
|
||||
"""GDB in-process python supplement"""
|
||||
try:
|
||||
thread_dict = get_threads_info()
|
||||
get_locks(graph=None, thread_dict=thread_dict, show=True)
|
||||
except gdb.error as err:
|
||||
print("Ignoring GDB error '%s' in mongodb_show_locks" % str(err))
|
||||
|
||||
MongoDBShowLocks()
|
||||
|
||||
|
||||
class MongoDBWaitsForGraph(gdb.Command):
|
||||
"""Create MongoDB WaitsFor graph [graph_file]"""
|
||||
def __init__(self):
|
||||
register_mongo_command(self, "mongodb-waitsfor-graph", gdb.COMMAND_DATA)
|
||||
|
||||
def invoke(self, arg, _from_tty):
|
||||
main_binary_name = get_process_name()
|
||||
if main_binary_name.endswith('mongod') or main_binary_name.endswith('mongo'):
|
||||
self.mongodb_waitsfor_graph(arg)
|
||||
else:
|
||||
print("No create MongoDB WaitsFor graph done for: %s" % (main_binary_name))
|
||||
|
||||
def mongodb_waitsfor_graph(self, file=None):
|
||||
"""GDB in-process python supplement"""
|
||||
|
||||
graph = Graph()
|
||||
try:
|
||||
thread_dict = get_threads_info(graph=graph)
|
||||
get_locks(graph=graph, thread_dict=thread_dict, show=False)
|
||||
graph.remove_nodes_without_edge()
|
||||
if file:
|
||||
print("Saving digraph to %s" % file)
|
||||
with open(file, 'w') as f:
|
||||
f.write(graph.to_graph())
|
||||
else:
|
||||
print(graph.to_graph())
|
||||
|
||||
except gdb.error as err:
|
||||
print("Ignoring GDB error '%s' in mongod_deadlock_graph" % str(err))
|
||||
|
||||
|
||||
MongoDBWaitsForGraph()
|
||||
|
||||
print("MongoDB Lock analysis commands loaded")
|
||||
@ -301,8 +301,12 @@ class GDBDumper(object):
|
||||
root_logger.info("dir %s" % script_dir)
|
||||
gdb_dir = os.path.join(script_dir, "gdb")
|
||||
printers_script = os.path.join(gdb_dir, "mongo.py")
|
||||
mongo_lock_script = os.path.join(gdb_dir, "mongo_lock.py")
|
||||
|
||||
bt_command = "mongodb-uniqstack bt"
|
||||
mongodb_show_locks = "mongodb-show-locks"
|
||||
mongodb_deadlock = "mongodb-waitsfor-graph debugger_deadlock_%s_%d.gv" % \
|
||||
(process_name, pid)
|
||||
if sys.platform.startswith("sunos"):
|
||||
'''
|
||||
On Solaris, currently calling mongo-uniqstack leads to an error:
|
||||
@ -318,20 +322,29 @@ class GDBDumper(object):
|
||||
When the function is done executing, GDB will silently stop.
|
||||
'''
|
||||
bt_command = "thread apply all bt"
|
||||
mongodb_show_locks = ""
|
||||
mongodb_deadlock = ""
|
||||
|
||||
cmds = [
|
||||
"set pagination off",
|
||||
"set interactive-mode off",
|
||||
"set print thread-events off", # Python calls to gdb.parse_and_eval may cause threads
|
||||
# to start and finish. This suppresses those messages
|
||||
# from appearing in the return output.
|
||||
"file %s" % process_name, # Solaris must load the process to read the symbols.
|
||||
"attach %d" % pid,
|
||||
"info sharedlibrary",
|
||||
"info threads", # Dump a simple list of commands to get the thread name
|
||||
"set python print-stack full",
|
||||
"source " + printers_script,
|
||||
"source %s" % printers_script,
|
||||
"source %s" % mongo_lock_script,
|
||||
bt_command,
|
||||
dump_command,
|
||||
"mongodb-analyze",
|
||||
"mongodb-dump-locks",
|
||||
mongodb_show_locks,
|
||||
mongodb_deadlock,
|
||||
"mongodb-javascript-stack", # The mongodb-javascript-stack command executes code in
|
||||
# order to dump JavaScript backtraces and should therefore
|
||||
# should be one of the last analysis commands.
|
||||
# be one of the last analysis commands.
|
||||
"set confirm off",
|
||||
"quit",
|
||||
]
|
||||
|
||||
@ -1116,7 +1116,7 @@ post:
|
||||
target: "src/mongo-hanganalyzer.tgz"
|
||||
source_dir: "src"
|
||||
include:
|
||||
- "./debugger*.log"
|
||||
- "./debugger*.*"
|
||||
- command: s3.put
|
||||
params:
|
||||
aws_key: ${aws_key}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user