lib/arborist/monitor/snmp/cpu.rb
changeset 8 e0b7c95a154f
parent 4 e6eb11b1e00d
child 14 d5cb8bd33170
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/arborist/monitor/snmp/cpu.rb	Wed Apr 04 11:00:35 2018 -0700
@@ -0,0 +1,140 @@
+# -*- ruby -*-
+# vim: set noet nosta sw=4 ts=4 :
+
+require 'arborist/monitor/snmp' unless defined?( Arborist::Monitor::SNMP )
+
+# Machine load/cpu checks.
+#
+# Sets current 1, 5, and 15 minute loads under the 'load' attribute,
+# and calculates/warns on cpu overutilization.
+#
+class Arborist::Monitor::SNMP::CPU
+	include Arborist::Monitor::SNMP
+
+	extend Configurability, Loggability
+	log_to :arborist_snmp
+
+	# OIDS for discovering system load.
+	#
+	OIDS = {
+		load: '1.3.6.1.4.1.2021.10.1.3',
+		cpu:  '1.3.6.1.2.1.25.3.3.1.2'
+	}
+
+	# When walking load OIDS, the iterator count matches
+	# these labels.
+	#
+	LOADKEYS = {
+		1 => :load1,
+		2 => :load5,
+		3 => :load15
+	}
+
+
+	# Global defaults for instances of this monitor
+	#
+	configurability( 'arborist.snmp.cpu' ) do
+		# What overutilization percentage qualifies as a warning
+		setting :warn_at, default: 80
+	end
+
+
+	### Return the properties used by this monitor.
+	###
+	def self::node_properties
+		return USED_PROPERTIES
+	end
+
+
+	### Class #run creates a new instance and immediately runs it.
+	###
+	def self::run( nodes )
+		return new.run( nodes )
+	end
+
+
+	### Perform the monitoring checks.
+	###
+	def run( nodes )
+		super do |host, snmp|
+			self.find_load( host, snmp )
+		end
+	end
+
+
+	#########
+	protected
+	#########
+
+	### Return system CPU data.
+	###
+	def cpu( snmp )
+		return snmp.walk( OIDS[:cpu] )
+	end
+
+
+	### Find load data, add additional niceties for reporting.
+	###
+	def format_load( snmp )
+		info = { cpu: {}, load: {} }
+		cpus = self.cpu( snmp )
+
+		info[ :cpu ][ :count ] = cpus.size
+
+		# Windows SNMP doesn't have a concept of "load" over time,
+		# so we have to just use the current averaged CPU usage.
+		#
+		# This means that windows machines will very likely want to
+		# adjust the default "overutilization" number, considering
+		# it's really just how much of the CPU is used at the time of
+		# the monitor run, along with liberal use of the Observer "only
+		# alert after X events" pragmas.
+		#
+		if self.system =~ /windows\s+/i
+			info[ :cpu ][ :usage ] = cpus.values.inject( :+ ).to_f / cpus.size
+			info[ :message ] = "System is %0.1f%% in use." % [ info[ :cpu ][ :usage ] ]
+
+		# UCDavis stuff is better for alerting only after there has been
+		# an extended load event.  Use the 5 minute average to avoid
+		# state changes on transient spikes.
+		#
+		else
+			snmp.walk( OIDS[:load] ).each_with_index do |(_, value), idx|
+				next unless LOADKEYS[ idx + 1 ]
+				info[ :load ][ LOADKEYS[idx + 1] ] = value.to_f
+			end
+
+			percentage = (( ( info[:load][ :load5 ] / cpus.size ) - 1 ) * 100 ).round( 1 )
+
+			if percentage < 0
+				info[ :message ] = "System is %0.1f%% idle." % [ percentage.abs ]
+				info[ :cpu ][ :usage ] = percentage + 100
+			else
+				info[ :message ] = "System is %0.1f%% overloaded." % [ percentage ]
+				info[ :cpu ][ :usage ] = percentage
+			end
+		end
+
+		return info
+	end
+
+
+	### Collect the load information for +host+ from an existing
+	### (and open) +snmp+ connection.
+	###
+	def find_load( host, snmp )
+		info = self.format_load( snmp )
+
+		config  = identifiers[ host ].last || {}
+		warn_at = config[ 'warn_at' ] || self.class.warn_at
+		usage   = info.dig( :cpu, :usage ) || 0
+
+		if usage >= warn_at
+			info[ :warning ] = "%0.1f utilization exceeds %0.1f percent" % [ usage, warn_at ]
+		end
+
+		self.results[ host ] = info
+	end
+
+end # class Arborist::Monitor::SNMP::CPU
+