NewsBlur/vendor/mms-agent/blockingStats.py
2020-06-17 03:56:44 -04:00

304 lines
12 KiB
Python
Executable file

"""
(C) Copyright 2011, 10gen
This is a label on a mattress. Do not modify this file!
"""
import threading, time, datetime, pymongo, traceback, socket, warnings
warnings.simplefilter( 'ignore', DeprecationWarning )
blockingStatsAgentVersion = "1.5.7"
class BlockingMongoStatsThread( threading.Thread ):
""" Pull the blocking data from the various hosts. """
def __init__( self, hostKey, mmsAgent):
""" Initialize the object """
self.hostKey = hostKey
self.mmsAgent = mmsAgent
self.logger = mmsAgent.logger
self.slowDbStats = False
self.lastDbStatsCheck = time.time()
self.running = True
self.host = mmsAgent.extractHostname( hostKey )
self.port = mmsAgent.extractPort( hostKey )
threading.Thread.__init__( self )
def stopThread( self ):
""" Stop the thread """
self.running = False
def run( self ):
""" Pull the data from the various hosts. """
self.logger.info( 'starting blocking stats monitoring: ' + self.hostKey )
sleepTime = ( self.mmsAgent.collectionInterval / 2 ) - 1
if ( sleepTime < 1 ):
sleepTime = 1
monitorConn = None
passes = 0
while not self.mmsAgent.done and self.running:
try:
time.sleep( sleepTime )
passes = passes + 1
if passes % 60 == 0:
monitorConn = self.mmsAgent.closeDbConnection( self.hostKey, monitorConn )
if not monitorConn:
monitorConn = self.mmsAgent.getDbConnection( self.hostKey )
if not self._collectBlockingStats( passes, monitorConn ):
monitorConn = self.mmsAgent.closeDbConnection( self.hostKey, monitorConn )
except Exception as e:
monitorConn = self.mmsAgent.closeDbConnection( self.hostKey, monitorConn )
self.logger.error( 'Problem collecting blocking data from: ' + self.hostKey + " - exception: " + traceback.format_exc( e ) )
self.logger.info( 'stopping blocking stats monitoring: ' + self.hostKey )
self.mmsAgent.closeDbConnection( self.hostKey, monitorConn )
def _collectBlockingStats( self, passes, monitorConn ):
""" Collect the blocking stats from the hosts """
try:
if not self.mmsAgent.haveHostDef( self.hostKey ):
return False
# Close the connection once per hour
# Verify the connection.
if not self.mmsAgent.isValidMonitorConn( self.hostKey, monitorConn ):
return False
stats = self._collectStats( passes, monitorConn )
try:
stats['hostIpAddr'] = socket.gethostbyname(self.hostKey[0 : self.hostKey.find( ':' )])
except Exception as e:
self.mmsAgent.handleOperationFailure( self.hostKey, 'hostIpAddr', e )
stats['host'] = self.host
stats['port'] = self.port
# Make sure we ended up with the same connection.
if not self.mmsAgent.isValidMonitorConn( self.hostKey, monitorConn ):
return False
self.mmsAgent.setHostState( self.hostKey, 'mongoBlocking', stats )
return True
except Exception as e:
self.logger.error( 'Problem collecting blocking data from (check if it is up and DNS): ' + self.hostKey + " - exception: " + traceback.format_exc( e ) )
return False
def _collectStats( self, passes, monitorConn ):
""" Make the call to mongo host and collect the blocking data """
root = {}
# Set the agent version and hostname.
root['agentVersion'] = self.mmsAgent.agentVersion
root['agentHostname'] = self.mmsAgent.agentHostname
isMaster = monitorConn.admin.command( 'ismaster' )
root['isMaster'] = isMaster
isMongos = ( 'msg' in isMaster and isMaster['msg'] == 'isdbgrid' )
# Check to see if this is a mongod host
try:
if isMaster['ismaster'] == True and isMongos:
# Look at the shards
root['shards'] = list( monitorConn.config.shards.find() )
# Pull from config.locks
try:
root['locks'] = list( monitorConn.config.locks.find( limit=200, sort=[ ( "$natural" , pymongo.DESCENDING ) ]) )
except Exception as e:
self.mmsAgent.handleOperationFailure( self.hostKey, 'config.locks.find', e )
# Pull from config.collections if enabled
try:
if self.mmsAgent.settings.configCollectionsEnabled:
root['configCollections'] = list( monitorConn.config.collections.find( limit=200, sort=[ ( "$natural" , pymongo.DESCENDING ) ] ) )
except Exception as e:
self.mmsAgent.handleOperationFailure( self.hostKey, 'config.collections.find', e )
# Pull from config.databases if enabled
try:
if self.mmsAgent.settings.configDatabasesEnabled:
root['configDatabases'] = list( monitorConn.config.databases.find( limit=200, sort=[ ( "$natural" , pymongo.DESCENDING ) ] ) )
except Exception as e:
self.mmsAgent.handleOperationFailure( self.hostKey, 'config.databases.find', e )
try:
root['configLockpings'] = list( monitorConn.config.lockpings.find( limit=200, sort=[ ( "$natural" , pymongo.DESCENDING ) ] ) )
except Exception as e:
self.mmsAgent.handleOperationFailure( self.hostKey, 'config.lockpings.find', e )
# Look at the mongos instances - only pull hosts that have a ping time
# updated in the last twenty minutes (and max 1k).
queryTime = datetime.datetime.utcnow() - datetime.timedelta( seconds=1200 )
root['mongoses'] = list( monitorConn.config.mongos.find( { 'ping' : { '$gte' : queryTime } } ).limit( 1000 ) )
# Get the shard chunk counts.
shardChunkCounts = []
positions = { }
counter = 0
if passes % 10 == 0:
for chunk in monitorConn.config.chunks.find():
key = chunk['ns'] + chunk['shard']
if key not in positions:
count = {}
positions[key] = counter
shardChunkCounts.append( count )
counter = counter + 1
count['count'] = 0
count['ns'] = chunk['ns']
count['shard'] = chunk['shard']
shardChunkCounts[positions[key]]['count'] = shardChunkCounts[positions[key]]['count'] + 1
root['shardChunkCounts'] = shardChunkCounts
except pymongo.errors.OperationFailure:
pass
root['serverStatus'] = monitorConn.admin.command( 'serverStatus' )
isReplSet = False
isArbiter = False
if 'repl' in root['serverStatus']:
# Check to see if this is a replica set
try:
root['replStatus'] = monitorConn.admin.command( 'replSetGetStatus' )
if root['replStatus']['myState'] == 7:
isArbiter = True
isReplSet = True
except pymongo.errors.OperationFailure:
pass
if isReplSet:
oplog = "oplog.rs"
else:
oplog = "oplog.$main"
try:
root['localSystemReplSet'] = monitorConn.local.system.replset.find_one()
except pymongo.errors.OperationFailure as e:
self.mmsAgent.handleOperationFailure( self.hostKey, 'local.system.replset.findOne', e )
localConn = monitorConn.local
oplogStats = {}
# Get oplog status
if isArbiter:
# Do nothing for the time being
pass
elif isMaster['ismaster'] == True or isReplSet:
try:
oplogStats["start"] = localConn[oplog].find( limit=1, sort=[ ( "$natural" , pymongo.ASCENDING ) ], fields={ 'ts' : 1 } )[0]["ts"]
oplogStats["end"] = localConn[oplog].find( limit=1, sort=[ ( "$natural" , pymongo.DESCENDING ) ], fields={ 'ts' : 1} )[0]["ts"]
oplogStats['rsStats'] = localConn.command( {'collstats' : 'oplog.rs' } )
except pymongo.errors.OperationFailure as e:
self.mmsAgent.handleOperationFailure( self.hostKey, 'local.' + oplog + '.find', e )
else:
# Slave
try:
oplogStats["sources"] = {}
for s in localConn.sources.find():
oplogStats["sources"][s["host"]] = s
except pymongo.errors.OperationFailure as e:
self.mmsAgent.handleOperationFailure( self.hostKey, 'local.sources.find', e )
root["oplog"] = oplogStats
# Load the config.gettings collection (balancer info etc.)
if not isArbiter:
try:
root['configSettings'] = list( monitorConn.config.settings.find() )
except Exception as e:
self.mmsAgent.handleOperationFailure( self.hostKey, 'config.settings.find', e )
# per db info - mongos doesn't allow calls to local
root['databases'] = { }
root['dbProfiling'] = { }
root['dbProfileData'] = { }
profilerEnabled = self.mmsAgent.hostDefValue( self.hostKey, 'profiler' )
if ( passes % 20 == 0 or profilerEnabled ) and not isArbiter and not isMongos:
count = 0
for x in monitorConn.database_names():
try:
if passes % 20 == 0:
if self.slowDbStats and ( ( time.time() - self.lastDbStatsCheck ) < 7200 ):
continue
if not self.mmsAgent.disableDbstats:
count += 1
if count > 100:
break
startTime = time.time()
temp = monitorConn[x].command( 'dbstats' )
# work around Python 2.4 and older bug
for f in temp:
# this is super hacky b/c of Python 2.4
if isinstance( temp[f] , (int, float, complex)) and str(temp[f]) == "-inf":
temp[f] = 0
root['databases'][x] = temp
if ( time.time() - startTime ) > 6:
self.slowDbStats = True
else:
self.slowDbStats = False
self.lastDbStatsCheck = time.time()
# If the profiler is enabled in MMS, collect data
if profilerEnabled and not isArbiter and not self.mmsAgent.settings.disableProfileDataCollection:
try:
# Get the most recent entries.
profileData = list( monitorConn[x].system.profile.find( spec=None, fields=None, skip=0, limit=20, sort=[ ( "$natural", pymongo.DESCENDING ) ] ) )
if len( profileData ) > 0:
root['dbProfileData'][x] = profileData
except Exception as e:
self.mmsAgent.handleOperationFailure( self.hostKey, 'system.profile.find-' + x, e )
# Check to see if the profiler is enabled
try:
profiling = monitorConn[x].command( { 'profile' : -1 } )
if profiling is not None and 'ok' in profiling:
del profiling['ok']
root['dbProfiling'][x] = profiling
except Exception:
pass
except:
continue
if 'serverStatus' in root:
del root['serverStatus']
return root