lib/arborist/monitor/snmp/cpu.rb
changeset 8 e0b7c95a154f
parent 4 e6eb11b1e00d
child 14 d5cb8bd33170
equal deleted inserted replaced
7:4548e58c8c66 8:e0b7c95a154f
       
     1 # -*- ruby -*-
       
     2 # vim: set noet nosta sw=4 ts=4 :
       
     3 
       
     4 require 'arborist/monitor/snmp' unless defined?( Arborist::Monitor::SNMP )
       
     5 
       
     6 # Machine load/cpu checks.
       
     7 #
       
     8 # Sets current 1, 5, and 15 minute loads under the 'load' attribute,
       
     9 # and calculates/warns on cpu overutilization.
       
    10 #
       
    11 class Arborist::Monitor::SNMP::CPU
       
    12 	include Arborist::Monitor::SNMP
       
    13 
       
    14 	extend Configurability, Loggability
       
    15 	log_to :arborist_snmp
       
    16 
       
    17 	# OIDS for discovering system load.
       
    18 	#
       
    19 	OIDS = {
       
    20 		load: '1.3.6.1.4.1.2021.10.1.3',
       
    21 		cpu:  '1.3.6.1.2.1.25.3.3.1.2'
       
    22 	}
       
    23 
       
    24 	# When walking load OIDS, the iterator count matches
       
    25 	# these labels.
       
    26 	#
       
    27 	LOADKEYS = {
       
    28 		1 => :load1,
       
    29 		2 => :load5,
       
    30 		3 => :load15
       
    31 	}
       
    32 
       
    33 
       
    34 	# Global defaults for instances of this monitor
       
    35 	#
       
    36 	configurability( 'arborist.snmp.cpu' ) do
       
    37 		# What overutilization percentage qualifies as a warning
       
    38 		setting :warn_at, default: 80
       
    39 	end
       
    40 
       
    41 
       
    42 	### Return the properties used by this monitor.
       
    43 	###
       
    44 	def self::node_properties
       
    45 		return USED_PROPERTIES
       
    46 	end
       
    47 
       
    48 
       
    49 	### Class #run creates a new instance and immediately runs it.
       
    50 	###
       
    51 	def self::run( nodes )
       
    52 		return new.run( nodes )
       
    53 	end
       
    54 
       
    55 
       
    56 	### Perform the monitoring checks.
       
    57 	###
       
    58 	def run( nodes )
       
    59 		super do |host, snmp|
       
    60 			self.find_load( host, snmp )
       
    61 		end
       
    62 	end
       
    63 
       
    64 
       
    65 	#########
       
    66 	protected
       
    67 	#########
       
    68 
       
    69 	### Return system CPU data.
       
    70 	###
       
    71 	def cpu( snmp )
       
    72 		return snmp.walk( OIDS[:cpu] )
       
    73 	end
       
    74 
       
    75 
       
    76 	### Find load data, add additional niceties for reporting.
       
    77 	###
       
    78 	def format_load( snmp )
       
    79 		info = { cpu: {}, load: {} }
       
    80 		cpus = self.cpu( snmp )
       
    81 
       
    82 		info[ :cpu ][ :count ] = cpus.size
       
    83 
       
    84 		# Windows SNMP doesn't have a concept of "load" over time,
       
    85 		# so we have to just use the current averaged CPU usage.
       
    86 		#
       
    87 		# This means that windows machines will very likely want to
       
    88 		# adjust the default "overutilization" number, considering
       
    89 		# it's really just how much of the CPU is used at the time of
       
    90 		# the monitor run, along with liberal use of the Observer "only
       
    91 		# alert after X events" pragmas.
       
    92 		#
       
    93 		if self.system =~ /windows\s+/i
       
    94 			info[ :cpu ][ :usage ] = cpus.values.inject( :+ ).to_f / cpus.size
       
    95 			info[ :message ] = "System is %0.1f%% in use." % [ info[ :cpu ][ :usage ] ]
       
    96 
       
    97 		# UCDavis stuff is better for alerting only after there has been
       
    98 		# an extended load event.  Use the 5 minute average to avoid
       
    99 		# state changes on transient spikes.
       
   100 		#
       
   101 		else
       
   102 			snmp.walk( OIDS[:load] ).each_with_index do |(_, value), idx|
       
   103 				next unless LOADKEYS[ idx + 1 ]
       
   104 				info[ :load ][ LOADKEYS[idx + 1] ] = value.to_f
       
   105 			end
       
   106 
       
   107 			percentage = (( ( info[:load][ :load5 ] / cpus.size ) - 1 ) * 100 ).round( 1 )
       
   108 
       
   109 			if percentage < 0
       
   110 				info[ :message ] = "System is %0.1f%% idle." % [ percentage.abs ]
       
   111 				info[ :cpu ][ :usage ] = percentage + 100
       
   112 			else
       
   113 				info[ :message ] = "System is %0.1f%% overloaded." % [ percentage ]
       
   114 				info[ :cpu ][ :usage ] = percentage
       
   115 			end
       
   116 		end
       
   117 
       
   118 		return info
       
   119 	end
       
   120 
       
   121 
       
   122 	### Collect the load information for +host+ from an existing
       
   123 	### (and open) +snmp+ connection.
       
   124 	###
       
   125 	def find_load( host, snmp )
       
   126 		info = self.format_load( snmp )
       
   127 
       
   128 		config  = identifiers[ host ].last || {}
       
   129 		warn_at = config[ 'warn_at' ] || self.class.warn_at
       
   130 		usage   = info.dig( :cpu, :usage ) || 0
       
   131 
       
   132 		if usage >= warn_at
       
   133 			info[ :warning ] = "%0.1f utilization exceeds %0.1f percent" % [ usage, warn_at ]
       
   134 		end
       
   135 
       
   136 		self.results[ host ] = info
       
   137 	end
       
   138 
       
   139 end # class Arborist::Monitor::SNMP::CPU
       
   140