mongo-python-driver/test/high_availability/ha_tools.py
2015-03-25 11:55:02 -07:00

510 lines
14 KiB
Python

# Copyright 2009-2015 MongoDB, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tools for testing high availability in PyMongo."""
import os
import random
import shutil
import signal
import socket
import subprocess
import sys
import time
from stat import S_IRUSR
import pymongo
import pymongo.errors
from pymongo.read_preferences import ReadPreference
from test.utils import connected
# Base directory for the default data/log locations. Fall back to the
# platform's idea of the user's home directory when HOME is unset (e.g. on
# Windows), because os.path.join() cannot accept None.
home = os.environ.get('HOME') or os.path.expanduser('~')

# Directory holding mongod data files; override with DBPATH.
default_dbpath = os.path.join(home, 'data', 'pymongo_high_availability')
dbpath = os.path.expanduser(os.environ.get('DBPATH', default_dbpath))

# Directory holding mongod/mongos log files; override with LOGPATH.
default_logpath = os.path.join(home, 'log', 'pymongo_high_availability')
logpath = os.path.expanduser(os.environ.get('LOGPATH', default_logpath))

# Host name and first port used when building 'host:port' member names.
hostname = os.environ.get('HOSTNAME', 'localhost')
port = int(os.environ.get('DBPORT', 27017))

# Executables to launch; override with MONGOD / MONGOS.
mongod = os.path.expanduser(os.environ.get('MONGOD', 'mongod'))
mongos = os.path.expanduser(os.environ.get('MONGOS', 'mongos'))

replica_set_name = os.environ.get('SETNAME', 'repl0')

# Any non-empty HA_TOOLS_DEBUG value turns on progress printing.
ha_tools_debug = bool(os.environ.get('HA_TOOLS_DEBUG'))

# Registries of started processes, keyed by 'host:port'. Each value is a dict
# with 'proc' and 'cmd' entries (plus 'dbpath' for mongod nodes).
nodes = {}
routers = {}

# Next port to hand out to a newly started process.
cur_port = port

# Path of the shared key file; set by start_replica_set(auth=True).
key_file = None

try:
    from subprocess import DEVNULL  # Python 3.
except ImportError:
    DEVNULL = open(os.devnull, 'wb')
def kill_members(members, sig, hosts=nodes):
    """Signal the process behind each named member, in sorted name order.

    members is an iterable of keys into hosts; hosts maps each key to a dict
    whose 'proc' entry is the process object. On Jython the wrapped Java
    process is destroyed, on win32/cygwin CTRL_C_EVENT is sent, and elsewhere
    sig is delivered with os.kill(). An OSError from any of these is treated
    as "already dead" and ignored (reported only in debug mode).
    """
    platform = sys.platform
    for name in sorted(members):
        try:
            if ha_tools_debug:
                print('killing %s' % (name,))

            process = hosts[name]['proc']
            if 'java' in platform:
                # _process is a wrapped java.lang.UNIXProcess.
                process._process.destroy()
            elif platform in ('win32', 'cygwin'):
                # Not sure if cygwin makes sense here...
                os.kill(process.pid, signal.CTRL_C_EVENT)
            else:
                os.kill(process.pid, sig)
        except OSError:
            if ha_tools_debug:
                print('%s already dead?' % (name,))
def kill_all_members():
    """Send signal 2 to every registered node, then every registered router."""
    for registry in (nodes, routers):
        kill_members(registry.keys(), 2, registry)
def wait_for(proc, port_num):
    """Wait until proc accepts TCP connections on port_num.

    Makes up to 160 connection attempts, sleeping 0.25 seconds after each
    failure, for as long as proc is still running. Returns True on the first
    successful connect. If proc exits or the attempts run out, every
    registered member is killed and False is returned.
    """
    attempts = 0
    while proc.poll() is None and attempts < 160:
        attempts += 1
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            sock.connect((hostname, port_num))
        except (IOError, socket.error):
            time.sleep(0.25)
        else:
            return True
        finally:
            # Always release the probe socket, on success or failure.
            sock.close()

    kill_all_members()
    return False
def start_subprocess(cmd):
    """Launch cmd (a list of strings) and return its Popen instance.

    Both stdout and stderr of the child are sent to DEVNULL.
    """
    sink = DEVNULL
    return subprocess.Popen(cmd, stdout=sink, stderr=sink)
def start_replica_set(members, auth=False, fresh=True):
    """Start one mongod per entry in members and initiate the replica set.

    Each dict in members is updated in place with its '_id' (its index) and
    'host'. When fresh is true the data directory is removed and recreated
    first. When auth is true a key file is created under dbpath (if missing)
    and passed to every mongod with --keyFile.

    Returns a (first member's host, replica_set_name) tuple, or None as soon
    as one mongod fails to start listening. Raises Exception, after killing
    all members, if the expected primary/secondary/arbiter counts are not
    observed by the end of the wait loop.
    """
    global cur_port
    global key_file

    if fresh:
        if os.path.exists(dbpath):
            try:
                shutil.rmtree(dbpath)
            except OSError:
                pass

        try:
            os.makedirs(dbpath)
        except OSError as exc:
            print(exc)
            print("\tWhile creating %s" % (dbpath,))

    if auth:
        key_file = os.path.join(dbpath, 'key.txt')
        if not os.path.exists(key_file):
            with open(key_file, 'w') as f:
                f.write("my super secret system password")
            os.chmod(key_file, S_IRUSR)

    for index, spec in enumerate(members):
        host = '%s:%d' % (hostname, cur_port)
        spec.update({'_id': index, 'host': host})

        path = os.path.join(dbpath, 'db' + str(index))
        if not os.path.exists(path):
            os.makedirs(path)

        member_logpath = os.path.join(logpath, 'db' + str(index) + '.log')
        log_dir = os.path.dirname(member_logpath)
        if not os.path.exists(log_dir):
            os.makedirs(log_dir)

        cmd = [mongod,
               '--dbpath', path,
               '--port', str(cur_port),
               '--replSet', replica_set_name,
               '--nojournal', '--oplogSize', '64',
               '--logappend', '--logpath', member_logpath]
        if auth:
            cmd += ['--keyFile', key_file]

        if ha_tools_debug:
            print('starting %s' % (' '.join(cmd),))

        proc = start_subprocess(cmd)
        nodes[host] = {'proc': proc, 'cmd': cmd, 'dbpath': path}
        started = wait_for(proc, cur_port)

        # The port is consumed whether or not the mongod came up.
        cur_port += 1

        if not started:
            return None

    config = {'_id': replica_set_name, 'members': members}
    primary = members[0]['host']
    c = pymongo.MongoClient(primary)
    try:
        if ha_tools_debug:
            print('rs.initiate(%s)' % (config,))

        c.admin.command('replSetInitiate', config)
    except pymongo.errors.OperationFailure as exc:
        # Already initialized from a previous run?
        if ha_tools_debug:
            print(exc)

    expected_arbiters = sum(1 for spec in members if spec.get('arbiterOnly'))
    expected_secondaries = len(members) - expected_arbiters - 1

    # Wait a minute for replica set to come up, polling every two seconds.
    patience = 1
    for i in range(int(patience * 60 / 2)):
        time.sleep(2)
        try:
            ready = (get_primary() and
                     len(get_secondaries()) == expected_secondaries and
                     len(get_arbiters()) == expected_arbiters)
            if ready:
                break
        except pymongo.errors.ConnectionFailure:
            # Keep waiting
            pass

        if ha_tools_debug:
            print('waiting for RS %s' % (i,))
    else:
        kill_all_members()
        raise Exception(
            "Replica set still not initalized after %s minutes" % patience)

    return primary, replica_set_name
def create_sharded_cluster(num_routers=3):
    """Start a config server, one shard mongod and num_routers mongos.

    Processes are given consecutive ports starting at the module-level
    cur_port, which is advanced past every port handed out so that later
    calls never reuse one. The shard is registered through the last mongos
    started; an OperationFailure from 'addshard' is taken to mean it is
    already configured.

    Returns the comma-separated mongos seed list, or None if any process
    fails to start listening. Raises ValueError if num_routers is less
    than 1, since a mongos is needed to add the shard.
    """
    global cur_port

    if num_routers < 1:
        raise ValueError('num_routers must be at least 1')

    if not os.path.exists(logpath):
        os.makedirs(logpath)

    # Start a config server
    configdb_host = '%s:%d' % (hostname, cur_port)
    path = os.path.join(dbpath, 'configdb')
    if not os.path.exists(path):
        os.makedirs(path)
    configdb_logpath = os.path.join(logpath, 'configdb.log')
    cmd = [mongod,
           '--dbpath', path,
           '--port', str(cur_port),
           '--nojournal', '--logappend',
           '--logpath', configdb_logpath]
    proc = start_subprocess(cmd)
    nodes[configdb_host] = {'proc': proc, 'cmd': cmd, 'dbpath': path}
    res = wait_for(proc, cur_port)
    cur_port += 1
    if not res:
        return None

    # ...and a shard server
    shard_host = '%s:%d' % (hostname, cur_port)
    path = os.path.join(dbpath, 'shard1')
    if not os.path.exists(path):
        os.makedirs(path)
    db_logpath = os.path.join(logpath, 'shard1.log')
    cmd = [mongod,
           '--dbpath', path,
           '--port', str(cur_port),
           '--nojournal', '--logappend',
           '--logpath', db_logpath]
    proc = start_subprocess(cmd)
    nodes[shard_host] = {'proc': proc, 'cmd': cmd, 'dbpath': path}
    res = wait_for(proc, cur_port)
    cur_port += 1
    if not res:
        return None

    # ...and a few mongos instances
    for i in range(num_routers):
        host = '%s:%d' % (hostname, cur_port)
        mongos_logpath = os.path.join(logpath, 'mongos' + str(i) + '.log')
        cmd = [mongos,
               '--port', str(cur_port),
               '--logappend',
               '--logpath', mongos_logpath,
               '--configdb', configdb_host]
        proc = start_subprocess(cmd)
        routers[host] = {'proc': proc, 'cmd': cmd}
        res = wait_for(proc, cur_port)
        # Step by exactly one port per router (not by the loop index) and
        # leave cur_port pointing at a free port for whoever starts next.
        cur_port += 1
        if not res:
            return None

    # Add the shard
    client = pymongo.MongoClient(host)
    try:
        client.admin.command({'addshard': shard_host})
    except pymongo.errors.OperationFailure:
        # Already configured.
        pass

    return get_mongos_seed_list()
def get_client():
    """Return a connected client for the first registered node that responds.

    Nodes are tried in the iteration order of the nodes registry, each with a
    direct connection. Using a non-PRIMARY read preference allows us to use
    the node even if it's a secondary. The ConnectionFailure from the last
    node is re-raised if none of them can be reached.
    """
    final_index = len(nodes) - 1
    for index, address in enumerate(nodes):
        try:
            client = pymongo.MongoClient(
                address,
                read_preference=ReadPreference.PRIMARY_PREFERRED)
            return connected(client)
        except pymongo.errors.ConnectionFailure:
            if index == final_index:
                raise
def get_mongos_seed_list():
    """Return the registered mongos 'host:port' names joined by commas."""
    return ','.join(routers)
def kill_mongos(host):
    """Send signal 2 to the mongos registered as host and return host."""
    targets = [host]
    kill_members(targets, 2, hosts=routers)
    return host
def restart_mongos(host):
    """Relaunch the mongos registered as host using its recorded command."""
    restart_members([host], router=True)
def get_members_in_state(state):
    """Return the names of members whose replSetGetStatus state equals state."""
    status = get_client().admin.command('replSetGetStatus')
    matching = []
    for entry in status['members']:
        if entry['state'] == state:
            matching.append(entry['name'])
    return matching
def get_primary():
    """Return the name of the member in state 1, or None.

    None is also returned when the status query fails with a
    ConnectionFailure or OperationFailure. Asserts that at most one member
    reports state 1.
    """
    try:
        found = get_members_in_state(1)
    except (pymongo.errors.ConnectionFailure, pymongo.errors.OperationFailure):
        return None

    assert len(found) <= 1
    return found[0] if found else None
def wait_for_primary():
    """Poll once a second, up to 30 times, until get_primary() finds a primary.

    Raises AssertionError if no primary appears within those attempts.
    """
    remaining = 30
    while remaining:
        remaining -= 1
        time.sleep(1)
        if get_primary():
            return

    raise AssertionError("Primary didn't come back up")
def get_random_secondary():
    """Return a randomly chosen member in state 2, or None if there are none."""
    candidates = get_members_in_state(2)
    return random.choice(candidates) if candidates else None
def get_secondaries():
    """Return the names of all members currently in state 2."""
    secondary_state = 2
    return get_members_in_state(secondary_state)
def get_arbiters():
    """Return the names of all members currently in state 7."""
    arbiter_state = 7
    return get_members_in_state(arbiter_state)
def get_recovering():
    """Return the names of all members currently in state 3."""
    recovering_state = 3
    return get_members_in_state(recovering_state)
def get_passives():
    """Return the 'passives' list from an ismaster reply, or [] if absent."""
    reply = get_client().admin.command('ismaster')
    return reply.get('passives', [])
def get_hosts():
    """Return the 'hosts' list from an ismaster reply, or [] if absent."""
    reply = get_client().admin.command('ismaster')
    return reply.get('hosts', [])
def get_hidden_members():
    """Return secondaries that ismaster lists in neither 'hosts' nor 'passives'.

    Covers both 'hidden' and 'slaveDelay' members. Entries of hosts/passives
    that are not secondaries (such as the primary) are simply ignored, without
    a catch-all except clause that could also hide unrelated errors.
    """
    secondaries = get_secondaries()
    advertised = set(get_hosts() + get_passives())
    return [name for name in secondaries if name not in advertised]
def get_tags(member):
    """Return the 'tags' dict configured for member, or {} if it has none.

    The configuration is read from local.system.replset. Raises Exception if
    no configured member has member as its 'host'.
    """
    config = get_client().local.system.replset.find_one()
    matches = [entry for entry in config['members']
               if entry['host'] == member]
    if not matches:
        raise Exception('member %s not in config' % repr(member))

    # The first matching entry wins.
    return matches[0].get('tags', {})
def kill_primary(sig=2):
    """Send sig to whatever get_primary() reports and return that name."""
    victim = get_primary()
    kill_members([victim], sig)
    return victim
def kill_secondary(sig=2):
    """Send sig to a randomly chosen secondary and return its name."""
    victim = get_random_secondary()
    kill_members([victim], sig)
    return victim
def kill_all_secondaries(sig=2):
    """Send sig to every current secondary and return the list of their names."""
    victims = get_secondaries()
    kill_members(victims, sig)
    return victims
# TODO: refactor w/ start_replica_set
def add_member(auth=False):
    """Start a new mongod and add it to the running replica set.

    The new member gets the next free '_id' and the module-level cur_port,
    a freshly emptied data directory, and (when auth is true) the shared key
    file. Once it is listening, the set is reconfigured through the primary.

    Returns the new member's 'host:port', or None if the mongod failed to
    start listening (in which case no reconfig is attempted, because
    wait_for() has already killed every member). Raises AssertionError if
    there is no primary.
    """
    global cur_port

    host = '%s:%d' % (hostname, cur_port)
    primary = get_primary()
    if not primary:
        # Raised explicitly so the check is not stripped under "python -O".
        raise AssertionError('no primary available to add a member through')

    c = pymongo.MongoClient(primary)
    config = c.local.system.replset.find_one()
    _id = max(m['_id'] for m in config['members']) + 1
    member = {'_id': _id, 'host': host}

    path = os.path.join(dbpath, 'db' + str(_id))
    if os.path.exists(path):
        shutil.rmtree(path)
    os.makedirs(path)

    member_logpath = os.path.join(logpath, 'db' + str(_id) + '.log')
    log_dir = os.path.dirname(member_logpath)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    cmd = [mongod,
           '--dbpath', path,
           '--port', str(cur_port),
           '--replSet', replica_set_name,
           '--nojournal', '--oplogSize', '64',
           '--logappend', '--logpath', member_logpath]
    if auth:
        cmd += ['--keyFile', key_file]

    if ha_tools_debug:
        print('starting %s' % ' '.join(cmd))

    # Use the shared launcher: output goes to DEVNULL instead of a pipe that
    # nothing ever reads.
    proc = start_subprocess(cmd)
    nodes[host] = {'proc': proc, 'cmd': cmd, 'dbpath': path}
    res = wait_for(proc, cur_port)

    # The port is consumed whether or not the mongod came up.
    cur_port += 1

    if not res:
        return None

    config['members'].append(member)
    config['version'] += 1

    if ha_tools_debug:
        print({'replSetReconfig': config})

    response = c.admin.command({'replSetReconfig': config})
    if ha_tools_debug:
        print(response)

    return host
def stepdown_primary():
    """Ask the current primary to step down, retrying up to 10 times.

    Does nothing (beyond a debug message) when there is no primary. An
    OperationFailure triggers a retry after one second. The attempt counts as
    successful either when the command returns normally or when it fails with
    ConnectionFailure, which is what happens when mongod closes all
    connections on stepdown. Raises AssertionError if all 10 attempts end in
    OperationFailure.
    """
    primary = get_primary()
    if not primary:
        if ha_tools_debug:
            print('stepdown_primary() found no primary')
        return

    if ha_tools_debug:
        print('stepping down primary: %s' % (primary,))

    c = pymongo.MongoClient(primary)
    for _ in range(10):
        try:
            c.admin.command('replSetStepDown', 20)
        except pymongo.errors.OperationFailure as exc:
            if ha_tools_debug:
                print('Code %s from replSetStepDown: %s' % (exc.code, exc))
                print('Trying again in one second....')

            time.sleep(1)
        except pymongo.errors.ConnectionFailure as exc:
            # replSetStepDown causes mongod to close all connections.
            if ha_tools_debug:
                print('Exception from replSetStepDown: %s' % exc)

            # Seems to have succeeded.
            break
        else:
            # The command was acknowledged with the connection left open;
            # stop here instead of re-sending it to a node that has already
            # stepped down.
            break
    else:
        raise AssertionError("Couldn't complete replSetStepDown")

    if ha_tools_debug:
        print('\tcalled replSetStepDown')
def set_maintenance(member, value):
    """Put a member into RECOVERING state if value is True, else normal state.

    Sends replSetMaintenance directly to member, then polls every 0.25
    seconds until the member's presence in get_recovering() matches value.
    Asserts that this takes no longer than 10 seconds.
    """
    client = pymongo.MongoClient(member)
    client.admin.command('replSetMaintenance', value)

    began = time.time()
    while True:
        in_recovery = member in get_recovering()
        if value == in_recovery:
            break

        elapsed = time.time() - began
        assert elapsed <= 10, (
            "Member %s never switched state" % member)
        time.sleep(0.25)
def restart_members(members, router=False):
    """Relaunch each named member using the command recorded at first start.

    With router true the names are looked up in routers; otherwise in nodes,
    and any mongod.lock file left in the member's dbpath is removed before
    launching. The registry's 'proc' entry is replaced with the new process.

    Returns the list of members that started listening again.
    """
    registry = routers if router else nodes
    restarted = []
    for name in members:
        entry = registry[name]
        cmd = entry['cmd']

        if not router:
            lockfile_path = os.path.join(entry['dbpath'], 'mongod.lock')
            if os.path.exists(lockfile_path):
                os.remove(lockfile_path)

        proc = start_subprocess(cmd)
        entry['proc'] = proc

        # Member names have the form 'host:port'.
        port_num = int(name.split(':')[1])
        if wait_for(proc, port_num):
            restarted.append(name)

    return restarted