SERVER-27874 Display locks and generate digraph for threads using LockManager locks and/or pthread_mutexes

This commit is contained in:
Jonathan Abrahams 2017-03-16 14:39:09 -04:00
parent 38a2af996a
commit 5fe822f53e
5 changed files with 365 additions and 18 deletions

View File

@ -2,5 +2,7 @@
set python print-stack full
# Load the mongodb pretty printers
#
source buildscripts/gdb/mongo.py
# Load the mongodb lock analysis
source buildscripts/gdb/mongo_lock.py

View File

@ -4,6 +4,7 @@ from __future__ import print_function
import gdb.printing
import os
import re
import sys
try:
@ -29,6 +30,27 @@ def get_process_name():
return os.path.splitext(os.path.basename(main_binary_name))[0]
def get_thread_id():
"""Returns the thread_id of the current GDB thread"""
# GDB thread example:
# RHEL
# [Current thread is 1 (Thread 0x7f072426cca0 (LWP 12867))]
thread_info = gdb.execute("thread", from_tty=False, to_string=True)
if sys.platform.startswith("linux"):
match = re.search(r"Thread (?P<pthread_id>0x[0-9a-f]+)", thread_info)
if match:
return int(match.group("pthread_id"), 16)
elif sys.platform.startswith("sunos"):
match = re.search(r"Thread (?P<pthread_id>[0-9]+)", thread_info)
if match:
return int(match.group("pthread_id"), 10)
lwpid = gdb.selected_thread().ptid[1]
if lwpid != 0:
return lwpid
raise ValueError("Failed to find thread id in {}".format(thread_info))
###################################################################################################
#
# Pretty-Printers
@ -336,35 +358,34 @@ class DumpGlobalServiceContext(gdb.Command):
DumpGlobalServiceContext()
class MongoDBAnalyze(gdb.Command):
"""Analyze MongoDB process"""
class MongoDBDumpLocks(gdb.Command):
"""Dump locks in mongod process"""
def __init__(self):
register_mongo_command(self, "mongodb-analyze", gdb.COMMAND_STATUS)
register_mongo_command(self, "mongodb-dump-locks", gdb.COMMAND_DATA)
def invoke(self, arg, _from_tty):
print("Running Hang Analyzer Supplement")
main_binary_name = get_process_name()
if main_binary_name.endswith('mongod'):
self.analyze_mongod()
if main_binary_name == 'mongod':
self.dump_mongod_locks()
else:
print("No process specific analysis done for: %s" % (main_binary_name))
print("Not invoking mongod lock dump for: %s" % (main_binary_name))
def analyze_mongod(self):
def dump_mongod_locks(self):
"""GDB in-process python supplement"""
try:
# Call into mongod, and dump the state of lock manager
# Note that output will go to mongod's standard output, not the debugger output window
gdb.execute("call ('mongo::(anonymous namespace)::globalLockManager').dump()",
from_tty=False, to_string=False)
except gdb.error as gdberr:
print("Ignoring error '%s'" % str(gdberr))
print("Ignoring error '%s' in dump_mongod_locks" % str(gdberr))
# Register command
MongoDBAnalyze()
MongoDBDumpLocks()
class MongoDBUniqueStack(gdb.Command):
@ -394,7 +415,7 @@ class MongoDBUniqueStack(gdb.Command):
thread_info = {} # thread dict to hold per thread data
thread_info['frames'] = [] # the complete backtrace per thread from gdb
thread_info['functions'] = [] # list of function names from frames
thread_info['pthread'] = hex(int(gdb.parse_and_eval("(uint64_t)pthread_self()")))
thread_info['pthread'] = hex(get_thread_id())
frame = gdb.newest_frame()
while frame:
@ -480,7 +501,7 @@ class MongoDBHelp(gdb.Command):
"""Dump list of mongodb commands"""
def __init__(self):
gdb.Command.__init__(self, "mongodb-help", gdb.COMMAND_DATA)
gdb.Command.__init__(self, "mongodb-help", gdb.COMMAND_SUPPORT)
def invoke(self, arg, _from_tty):
print("Command - Description")

View File

@ -0,0 +1,311 @@
from __future__ import print_function
import gdb
import gdb.printing
import re
import sys
if sys.version_info[0] >= 3:
# GDB only permits converting a gdb.Value instance to its numerical address when using the
# long() constructor in Python 2 and not when using the int() constructor. We define the
# 'long' class as an alias for the 'int' class in Python 3 for compatibility.
long = int
class Thread(object):
def __init__(self, thread_id, lwpid):
self.thread_id = thread_id
self.lwpid = lwpid
def __eq__(self, other):
if isinstance(other, Thread):
return self.thread_id == other.thread_id
return NotImplemented
def __ne__(self, other):
return not self == other
def __str__(self):
return "Thread 0x{:012x} (LWP {})".format(self.thread_id, self.lwpid)
def key(self):
return "Thread 0x{:012x}".format(self.thread_id)
class Lock(object):
def __init__(self, addr, resource):
self.addr = addr
self.resource = resource
def __eq__(self, other):
if isinstance(other, Lock):
return self.addr == other.addr
return NotImplemented
def __ne__(self, other):
return not self == other
def __str__(self):
return "Lock 0x{:012x} ({})".format(self.addr, self.resource)
def key(self):
return "Lock 0x{:012x}".format(self.addr)
class Graph(object):
# The Graph is a dict with the following structure:
# {'node_key': {'node': {id: val}, 'next_nodes': [node_key_1, ...]}}
# Example graph:
# {
# 'Lock 1': {'node': {1: 'MongoDB lock'}, 'next_nodes': ['Thread 1']},
# 'Lock 2': {'node': {1: 'MongoDB lock'}, 'next_nodes': ['Thread 2']},
# 'Thread 1': {'node': {1: 123}, 'next_nodes': ['Lock 2']},
# 'Thread 2': {'node': {2: 456}, 'next_nodes': ['Lock 1']}
# }
def __init__(self):
self.nodes = {}
def add_node(self, node):
if not self.find_node(node):
self.nodes[node.key()] = {'node': node, 'next_nodes': []}
def find_node(self, node):
if node.key() in self.nodes:
return self.nodes[node.key()]
return None
def find_from_node(self, from_node):
for node_key in self.nodes:
node = self.nodes[node_key]
for next_node in node['next_nodes']:
if next_node == from_node['node'].key():
return node
return None
def remove_nodes_without_edge(self):
# Rebuild graph by removing any nodes which do not have any incoming or outgoing any edges.
temp_nodes = {}
for node_key in self.nodes:
node = self.nodes[node_key]
if node['next_nodes'] or self.find_from_node(node) is not None:
temp_nodes[node_key] = self.nodes[node_key]
self.nodes = temp_nodes
def add_edge(self, from_node, to_node):
f = self.find_node(from_node)
if f is None:
self.add_node(from_node)
f = self.nodes[from_node.key()]
t = self.find_node(to_node)
if t is None:
self.add_node(to_node)
t = self.nodes[to_node.key()]
for n in f['next_nodes']:
if n == to_node.key():
return
self.nodes[from_node.key()]['next_nodes'].append(to_node.key())
def print(self):
for node_key in self.nodes:
print("Node", self.nodes[node_key]['node'])
for to in self.nodes[node_key]['next_nodes']:
print(" ->", to)
def to_graph(self):
sb = []
sb.append('digraph "mongod+lock-status" {')
for node_key in self.nodes:
for next_node_key in self.nodes[node_key]['next_nodes']:
sb.append(' "%s" -> "%s";' % (node_key, next_node_key))
for node_key in self.nodes:
sb.append(' "%s" [label="%s"]' % (node_key, self.nodes[node_key]['node']))
sb.append("}")
return "\n".join(sb)
def find_lwpid(thread_dict, search_thread_id):
for (lwpid, thread_id) in thread_dict.items():
if thread_id == search_thread_id:
return lwpid
return None
def find_func_block(block):
while block:
if block.function:
return block
block = block.superblock
return None
def find_frame(function_name_pattern):
frame = gdb.newest_frame()
while frame:
block = None
try:
block = frame.block()
except RuntimeError as err:
if err.args[0] != "Cannot locate block for frame.":
raise
block = find_func_block(block)
if block and re.match(function_name_pattern, block.function.name):
return frame
frame = frame.older()
return None
def find_mutex_holder(graph, thread_dict, show):
frame = find_frame(r'std::mutex::lock\(\)')
if frame is None:
return
frame.select()
# Waiting for mutex locking!
mutex_this, _ = gdb.lookup_symbol("this", frame.block())
mutex_value = mutex_this.value(frame)
# The mutex holder is a LWPID
mutex_holder = int(mutex_value["_M_mutex"]["__data"]["__owner"])
mutex_holder_id = thread_dict[mutex_holder]
(_, mutex_waiter_lwpid, _) = gdb.selected_thread().ptid
mutex_waiter_id = thread_dict[mutex_waiter_lwpid]
if show:
print("Mutex at {} held by thread 0x{:x} (LWP {}) "
" waited on by thread 0x{:x} (LWP {})".format(mutex_value,
mutex_holder_id,
mutex_holder,
mutex_waiter_id,
mutex_waiter_lwpid))
if graph:
graph.add_edge(Thread(mutex_waiter_id, mutex_waiter_lwpid),
Lock(long(mutex_value), "Mutex"))
graph.add_edge(Lock(long(mutex_value), "Mutex"), Thread(mutex_holder_id, mutex_holder))
def find_lock_manager_holders(graph, thread_dict, show):
frame = find_frame(r'mongo::LockerImpl\<.*\>::')
if not frame:
return
frame.select()
(_, lwpid, _) = gdb.selected_thread().ptid
locker_ptr_type = gdb.lookup_type("mongo::LockerImpl<false>").pointer()
lock_head = gdb.parse_and_eval(
"mongo::getGlobalLockManager()->_getBucket(resId)->findOrInsert(resId)")
grantedList = lock_head.dereference()["grantedList"]
lock_request_ptr = grantedList["_front"]
while lock_request_ptr:
lock_request = lock_request_ptr.dereference()
locker_ptr = lock_request["locker"]
locker_ptr = locker_ptr.cast(locker_ptr_type)
locker = locker_ptr.dereference()
lock_thread_id = int(locker["_threadId"]["_M_thread"])
lock_thread_lwpid = find_lwpid(thread_dict, lock_thread_id)
if show:
print("MongoDB Lock at {} ({}) held by thread id 0x{:x} (LWP {})".format(
lock_head, lock_request["mode"], lock_thread_id, lock_thread_lwpid) +
" waited on by thread 0x{:x} (LWP {})".format(thread_dict[lwpid], lwpid))
if graph:
graph.add_edge(Thread(thread_dict[lwpid], lwpid), Lock(long(lock_head), "MongoDB lock"))
graph.add_edge(Lock(long(lock_head), "MongoDB lock"),
Thread(lock_thread_id, lock_thread_lwpid))
lock_request_ptr = lock_request["next"]
def get_locks(graph, thread_dict, show=False):
for thread in gdb.selected_inferior().threads():
try:
if not thread.is_valid():
continue
thread.switch()
find_mutex_holder(graph, thread_dict, show)
find_lock_manager_holders(graph, thread_dict, show)
except gdb.error as err:
print("Ignoring GDB error '%s' in get_locks" % str(err))
def get_threads_info(graph=None):
thread_dict = {}
for thread in gdb.selected_inferior().threads():
try:
if not thread.is_valid():
continue
thread.switch()
# PTID is a tuple: Process ID (PID), Lightweight Process ID (LWPID), Thread ID (TID)
(_, lwpid, _) = thread.ptid
thread_num = thread.num
thread_id = get_thread_id()
if not thread_id:
print("Unable to retrieve thread_info for thread %d" % thread_num)
continue
thread_dict[lwpid] = thread_id
except gdb.error as err:
print("Ignoring GDB error '%s' in get_threads_info" % str(err))
return thread_dict
class MongoDBShowLocks(gdb.Command):
"""Show MongoDB locks"""
def __init__(self):
register_mongo_command(self, "mongodb-show-locks", gdb.COMMAND_DATA)
def invoke(self, arg, _from_tty):
main_binary_name = get_process_name()
if main_binary_name.endswith('mongod') or main_binary_name.endswith('mongo'):
self.mongodb_show_locks()
else:
print("No show MongoDB locks done for: %s" % (main_binary_name))
def mongodb_show_locks(self):
"""GDB in-process python supplement"""
try:
thread_dict = get_threads_info()
get_locks(graph=None, thread_dict=thread_dict, show=True)
except gdb.error as err:
print("Ignoring GDB error '%s' in mongodb_show_locks" % str(err))
MongoDBShowLocks()
class MongoDBWaitsForGraph(gdb.Command):
"""Create MongoDB WaitsFor graph [graph_file]"""
def __init__(self):
register_mongo_command(self, "mongodb-waitsfor-graph", gdb.COMMAND_DATA)
def invoke(self, arg, _from_tty):
main_binary_name = get_process_name()
if main_binary_name.endswith('mongod') or main_binary_name.endswith('mongo'):
self.mongodb_waitsfor_graph(arg)
else:
print("No create MongoDB WaitsFor graph done for: %s" % (main_binary_name))
def mongodb_waitsfor_graph(self, file=None):
"""GDB in-process python supplement"""
graph = Graph()
try:
thread_dict = get_threads_info(graph=graph)
get_locks(graph=graph, thread_dict=thread_dict, show=False)
graph.remove_nodes_without_edge()
if file:
print("Saving digraph to %s" % file)
with open(file, 'w') as f:
f.write(graph.to_graph())
else:
print(graph.to_graph())
except gdb.error as err:
print("Ignoring GDB error '%s' in mongod_deadlock_graph" % str(err))
MongoDBWaitsForGraph()
print("MongoDB Lock analysis commands loaded")

View File

@ -301,8 +301,12 @@ class GDBDumper(object):
root_logger.info("dir %s" % script_dir)
gdb_dir = os.path.join(script_dir, "gdb")
printers_script = os.path.join(gdb_dir, "mongo.py")
mongo_lock_script = os.path.join(gdb_dir, "mongo_lock.py")
bt_command = "mongodb-uniqstack bt"
mongodb_show_locks = "mongodb-show-locks"
mongodb_deadlock = "mongodb-waitsfor-graph debugger_deadlock_%s_%d.gv" % \
(process_name, pid)
if sys.platform.startswith("sunos"):
'''
On Solaris, currently calling mongo-uniqstack leads to an error:
@ -318,20 +322,29 @@ class GDBDumper(object):
When the function is done executing, GDB will silently stop.
'''
bt_command = "thread apply all bt"
mongodb_show_locks = ""
mongodb_deadlock = ""
cmds = [
"set pagination off",
"set interactive-mode off",
"set print thread-events off", # Python calls to gdb.parse_and_eval may cause threads
# to start and finish. This suppresses those messages
# from appearing in the return output.
"file %s" % process_name, # Solaris must load the process to read the symbols.
"attach %d" % pid,
"info sharedlibrary",
"info threads", # Dump a simple list of commands to get the thread name
"set python print-stack full",
"source " + printers_script,
"source %s" % printers_script,
"source %s" % mongo_lock_script,
bt_command,
dump_command,
"mongodb-analyze",
"mongodb-dump-locks",
mongodb_show_locks,
mongodb_deadlock,
"mongodb-javascript-stack", # The mongodb-javascript-stack command executes code in
# order to dump JavaScript backtraces and should therefore
# should be one of the last analysis commands.
# be one of the last analysis commands.
"set confirm off",
"quit",
]

View File

@ -1116,7 +1116,7 @@ post:
target: "src/mongo-hanganalyzer.tgz"
source_dir: "src"
include:
- "./debugger*.log"
- "./debugger*.*"
- command: s3.put
params:
aws_key: ${aws_key}