# lib/arborist/monitor/snmp/cpu.rb
# author Mahlon E. Smith <mahlon@martini.nu>
# Tue, 28 Apr 2020 10:10:19 -0700
# changeset 26 54f2f57cc0b0
# parent 19 77084121952b
# permissions -rw-r--r--
# Automatically clear stale mounts from node properties during disk checks.

# -*- ruby -*-
# vim: set noet nosta sw=4 ts=4 :

require 'arborist/monitor/snmp' unless defined?( Arborist::Monitor::SNMP )

# Machine load/cpu checks.
#
# Sets current 1, 5, and 15 minute loads under the 'load' attribute,
# and calculates/warns on cpu overutilization.
#
class Arborist::Monitor::SNMP::CPU
	include Arborist::Monitor::SNMP

	extend Configurability, Loggability
	log_to :arborist_snmp

	# OIDS for discovering system load.
	#
	# +load+ is the UCD-SNMP load average table (laLoad); +cpu+ is the
	# HOST-RESOURCES per-processor load table (hrProcessorLoad).  Every
	# row returned when walking +cpu+ is counted as one CPU.
	#
	OIDS = {
		load: '1.3.6.1.4.1.2021.10.1.3',
		cpu:  '1.3.6.1.2.1.25.3.3.1.2'
	}.freeze

	# When walking load OIDS, the iterator count matches
	# these labels.
	#
	LOADKEYS = %i[ load1 load5 load15 ].freeze


	# Global defaults for instances of this monitor
	#
	configurability( 'arborist.snmp.cpu' ) do
		# The CPU utilization percentage at (or above) which a warning
		# is attached to the results.  100 means "fully utilized"; for
		# load average based checks, values above 100 mean the 5 minute
		# load exceeds the number of CPUs.
		setting :warn_at, default: 80
	end


	### Return the properties used by this monitor.
	###
	### NOTE(review): USED_PROPERTIES is not defined in this class; it is
	### presumably provided by the included Arborist::Monitor::SNMP mixin.
	###
	def self::node_properties
		return USED_PROPERTIES
	end


	### Class #run creates a new instance and immediately runs it
	### against +nodes+, returning whatever the instance #run returns.
	###
	def self::run( nodes )
		return new.run( nodes )
	end


	### Perform the monitoring checks.
	###
	### Delegates to the mixin's #run, supplying a block that collects
	### load information for each yielded +host+ and +snmp+ connection.
	###
	def run( nodes )
		super do |host, snmp|
			self.find_load( host, snmp )
		end
	end


	#########
	protected
	#########

	### Find load data, add additional niceties for reporting.
	###
	### Walks the CPU and (for non-Windows systems) load average OIDs
	### using +snmp+, and returns a Hash of the form:
	###
	###   {
	###     cpu:     { count: <number of CPU rows>, usage: <percent> },
	###     load:    { load1: Float, load5: Float, load15: Float },
	###     message: "System is ..."
	###   }
	###
	### The +load+ Hash is left empty for Windows systems.  +usage+ is
	### always a utilization percentage, where 100 is "fully utilized".
	###
	def format_load( snmp )
		info = { cpu: {}, load: {} }
		cpus = snmp.walk( oid: OIDS[:cpu] ).map {|_, value| value }

		info[ :cpu ][ :count ] = cpus.size

		# Windows SNMP doesn't have a concept of "load" over time,
		# so we have to just use the current averaged CPU usage.
		#
		# This means that windows machines will very likely want to
		# adjust the default "overutilization" number, considering
		# it's really just how much of the CPU is used at the time of
		# the monitor run, along with liberal use of the Observer "only
		# alert after X events" pragmas.
		#
		# NOTE(review): if the CPU walk returns no rows, the average
		# below evaluates to NaN (0.0 / 0).
		#
		if self.system =~ /windows\s+/i
			info[ :cpu ][ :usage ] = cpus.inject( :+ ).to_f / cpus.size
			info[ :message ] = "System is %0.1f%% in use." % [ info[ :cpu ][ :usage ] ]


		# UCDavis stuff is better for alerting only after there has been
		# an extended load event.  Use the 5 minute average to avoid
		# state changes on transient spikes.
		#
		else
			snmp.walk( oid: OIDS[:load] ).each_with_index do |(_, value), idx|
				next unless LOADKEYS[ idx ]
				info[ :load ][ LOADKEYS[idx] ] = value.to_f
			end

			# Distance from saturation, as a percentage: negative while
			# there is idle capacity, positive once the 5 minute load
			# exceeds the CPU count.
			#
			# NOTE(review): a load walk with fewer than two rows leaves
			# :load5 nil and this raises NoMethodError; a CPU walk with
			# no rows divides by zero and yields a non-finite value.
			percentage = (( ( info[:load][ :load5 ] / cpus.size) - 1 ) * 100 ).round( 1 )

			# Always report usage on the same scale (100 == saturated).
			# Recording only the overload amount once past saturation
			# made usage drop back towards zero, so a machine at 150%
			# utilization (usage 50) would not warn while one at 90% did.
			info[ :cpu ][ :usage ] = percentage + 100

			if percentage < 0
				info[ :message ] = "System is %0.1f%% idle." % [ percentage.abs ]
			else
				info[ :message ] = "System is %0.1f%% overloaded." % [ percentage ]
			end
		end

		return info
	end


	### Collect the load information for +host+ from an existing
	### (and open) +snmp+ connection.
	###
	### The warning threshold is taken from the 'warn_at' key of the
	### host's 'config' when present, falling back to the class-wide
	### +warn_at+ setting.  A :warning is added to the collected info
	### when usage meets or exceeds the threshold, and the info is
	### stored in the results under +host+.
	###
	def find_load( host, snmp )
		info = self.format_load( snmp )

		config  = self.identifiers[ host ].last['config'] || {}
		warn_at = config[ 'warn_at' ] || self.class.warn_at
		usage   = info.dig( :cpu, :usage ) || 0

		if usage >= warn_at
			info[ :warning ] = "%0.1f utilization exceeds %0.1f percent" % [ usage, warn_at ]
		end

		self.results[ host ] = info
	end

end # class Arborist::Monitor::SNMP::CPU