# HG changeset patch # User Mahlon E. Smith # Date 1522864835 25200 # Node ID e0b7c95a154facc5aa66f5cb3057b13f1d00287c # Parent 4548e58c8c66d2b36d3a517439cd585b0348d402 Refactor for real world usage and latest Arborist behaviors. diff -r 4548e58c8c66 -r e0b7c95a154f .gems --- a/.gems Wed Aug 30 13:55:02 2017 -0700 +++ b/.gems Wed Apr 04 11:00:35 2018 -0700 @@ -1,2 +1,2 @@ -snmp -v1.2.0 +net-snmp2 -v0.3.1 diff -r 4548e58c8c66 -r e0b7c95a154f README.md --- a/README.md Wed Aug 30 13:55:02 2017 -0700 +++ b/README.md Wed Apr 04 11:00:35 2018 -0700 @@ -1,4 +1,6 @@ -# Arborist-SNMP + +Arborist-SNMP +============= home : http://bitbucket.org/mahlon/Arborist-SNMP @@ -7,79 +9,234 @@ : http://code.martini.nu/Arborist-SNMP -## Description +Description +----------- Arborist is a monitoring toolkit that follows the UNIX philosophy of small parts and loose coupling for stability, reliability, and customizability. -This adds SNMP support to Arborist's monitoring, for things such as: +This adds various SNMP support to Arborist's monitoring, specifically +for OIDS involving: - Disk space capacity - - System load - - Free memory - - Swap in use + - System CPU utilization + - Memory and swap usage - Running process checks - -## Prerequisites - -* Ruby 2.2 or better +It tries to provide sane defaults, while allowing fine grained settings +per resource node. Both Windows and UCD-SNMP systems are supported. -## Installation +Prerequisites +------------- + + * Ruby 2.3 or better + * Net-SNMP libraries + + +Installation +------------ $ gem install arborist-snmp -## Usage +Configuration +------------- + +Global configuration overrides can be added to the Arborist config file, +under the `snmp` key. 
+ +The defaults are as follows: -In this example, we've created a resource node under an existing host, like so: + arborist: + snmp: + timeout: 2 + retries: 1 + community: public + version: 2c + port: 161 + batchsize: 25 + cpu: + warn_at: 80 + disk: + warn_at: 90 + include: ~ + exclude: + - "^/dev(/.+)?$" + - "^/net(/.+)?$" + - "^/proc$" + - "^/run$" + - "^/sys/" + memory: + physical_warn_at: ~ + swap_warn_at: 60 + processes: + check: [] + + +The `warn_at` keys imply usage capacity as a percentage. ie: "Warn me +when a disk mount point is at 90 percent utilization." + + +### Library Options + + * **timeout**: How long to wait for an SNMP response, in seconds? + * **retries**: If an error occurs during SNMP communication, try again this many times before giving up. + * **community**: The SNMP community name for reading data. + * **version**: The SNMP protocol version. 1 and 2c are supported. + * **port**: The UDP port SNMP is listening on. + * **batchsize**: How many hosts to gather SNMP data on simultaneously. + + +### Category Options and Behavior + +#### CPU + + * **warn_at**: Set the node to a `warning` state when utilization is at or over this percentage. + +Utilization takes into account CPU core counts, and uses the 5 minute +load average to calculate a percentage of current CPU use. + +2 properties are set on the node. `cpu` contains the detected CPU count +and current utilization. `load` contains the 1, 5, and 15 minute load +averages of the machine. - Arborist::Host( 'example' ) do - description "Example host" - address '10.6.0.169' - resource 'load', description: 'machine load' - resource 'disk' do - include: [ '/', '/mnt' ] - end + +#### Disk + + * **warn_at**: Set the node to a `warning` state when disk capacity is at or over this amount. + You can also set this to a Hash, keyed on mount name, if you want differing + warning values per mount point. 
A mount point that is at 100% capacity will + be explicitly set to `down`, as the resource it represents has been exhausted. + * **include**: String or Array of Strings. If present, only matching mount points are + considered while performing checks. These are treated as regular expressions. + * **exclude**: String or Array of Strings. If present, matching mount points are removed + from evaluation. These are treated as regular expressions. + + +#### Memory + + * **physical_warn_at**: Set the node to a `warning` state when RAM utilization is at or over this percentage. + * **swap_warn_at**: Set the node to a `warning` state when swap utilization is at or over this percentage. + +Warnings are only set for swap by default, since that is usually a +better indication of an impending problem. + + +#### Processes + + * **check**: String or Array of Strings. A list of processes to check if running. These are + treated as regular expressions, and include process arguments. + +If any process in the list is not found in the process table, the +resource is set to a `down` state. + + +Examples +-------- + +In the simplest form, using default behaviors and settings, here's an +example Monitor configuration: + + require 'arborist/snmp' + + Arborist::Monitor 'cpu load check', :cpu do + every 1.minute + match type: 'resource', category: 'cpu' + exec( Arborist::Monitor::SNMP::CPU ) + end + + Arborist::Monitor 'partition capacity', :disk do + every 1.minute + match type: 'resource', category: 'disk' + exec( Arborist::Monitor::SNMP::Disk ) + end + + Arborist::Monitor 'process checks', :proc do + every 1.minute + match type: 'resource', category: 'process' + exec( Arborist::Monitor::SNMP::Process ) + end + + Arborist::Monitor 'memory', :memory do + every 1.minute + match type: 'resource', category: 'memory' + exec( Arborist::Monitor::SNMP::Memory ) end -From a monitor file, require this library, and create an snmp instance. 
-You can reuse a single instance, or create individual ones per monitor. +Additionally, if you'd like these SNMP monitors to rely on the SNMP +service itself, you can add a UDP check for that. - require 'arborist/monitor/snmp' - - Arborist::Monitor '5 minute load average check' do + Arborist::Monitor 'udp service checks', :udp do every 30.seconds - match type: 'resource', category: 'load' - include_down true - use :addresses - - snmp = Arborist::Monitor::SNMP::Load( error_at: 10 ) - exec( snmp ) + match type: 'service', protocol: 'udp' + exec( Arborist::Monitor::Socket::UDP ) end - Arborist::Monitor 'mount capacity check' do - every 30.seconds - match type: 'resource', category: 'disk' - include_down true - use :addresses, :config + +And a default node declaration: - exec( Arborist::Monitor::SNMP::Disk ) + Arborist::Host 'example' do + description 'An example host' + address 'demo.example.com' + + resource 'cpu' + resource 'memory' + resource 'disk' end -Please see the rdoc for all the mode types and error_at options. Per -node "config" vars override global defaults when instantiating the -monitor. + +All configuration can be overridden from the defaults using the `config` +pragma, per node. Here's a more elaborate example that performs the following: + + * All SNMP monitored resources are quieted if the SNMP service itself is unavailable. + * Only monitor specific disk partitions, warning at different capacities. + * Ensure the 'important' process is running with the '--production' flag. + * Warn at 95% memory utilization OR 10% swap. 
+ + + Arborist::Host 'example' do + description 'An example host' + address 'demo.example.com' + + service 'snmp', protocol: 'udp' + + resource 'cpu', description: 'machine cpu load' do + depends_on 'example-snmp' + end + + resource 'memory', description: 'machine ram and swap' do + depends_on 'example-snmp' + config physical_warn_at: 95, swap_warn_at: 10 + end + + resource 'disk', description: 'partition capacity' do + depends_on 'example-snmp' + config \ + include: [ + '^/tmp', + '^/var' + ], + warn_at: { + '/tmp' => 50, + '/var' => 80 + } + end + + resource 'process' do + depends_on 'example-snmp' + config check: 'important --production' + end + end ## License -Copyright (c) 2016, Michael Granger and Mahlon E. Smith +Copyright (c) 2016-2018 Michael Granger and Mahlon E. Smith All rights reserved. Redistribution and use in source and binary forms, with or without diff -r 4548e58c8c66 -r e0b7c95a154f Rakefile --- a/Rakefile Wed Aug 30 13:55:02 2017 -0700 +++ b/Rakefile Wed Apr 04 11:00:35 2018 -0700 @@ -13,7 +13,7 @@ end # parse the current library version -$version = ( LIBDIR + 'arborist' + 'monitor' + "#{PROJECT}.rb" ).read.split(/\n/). +$version = ( LIBDIR + 'arborist' + "#{PROJECT}.rb" ).read.split(/\n/). select{|line| line =~ /VERSION =/}.first.match(/([\d|.]+)/)[1] task :default => [ :spec, :docs, :package ] @@ -30,7 +30,7 @@ s.homepage = 'http://bitbucket.org/mahlon/Arborist-SNMP' s.authors = [ 'Mahlon E. Smith ', 'Michael Granger ' ] s.platform = Gem::Platform::RUBY - s.summary = "Common SNMP support for Arborist" + s.summary = "SNMP support for Arborist monitors" s.name = 'arborist-' + PROJECT s.version = $version s.license = 'BSD-3-Clause' @@ -40,12 +40,12 @@ s.files = File.read( __FILE__ ).split( /^__END__/, 2 ).last.split # s.executables = %w[] s.description = <<-EOF - This library adds common SNMP support to Arborist monitors. + This library adds common SNMP resource support to Arborist monitors. 
EOF s.required_ruby_version = '>= 2' s.add_dependency 'arborist', "~> 0.1" - s.add_dependency 'snmp', "~> 1.2" + s.add_dependency 'net-snmp2', "~> 0.3" end Gem::PackageTask.new( spec ) do |pkg| @@ -109,9 +109,10 @@ ### M A N I F E S T ######################################################################## __END__ +lib/arborist/snmp.rb lib/arborist/monitor/snmp.rb lib/arborist/monitor/snmp/swap.rb lib/arborist/monitor/snmp/disk.rb lib/arborist/monitor/snmp/process.rb lib/arborist/monitor/snmp/memory.rb -lib/arborist/monitor/snmp/load.rb +lib/arborist/monitor/snmp/cpu.rb diff -r 4548e58c8c66 -r e0b7c95a154f lib/arborist/monitor/snmp.rb --- a/lib/arborist/monitor/snmp.rb Wed Aug 30 13:55:02 2017 -0700 +++ b/lib/arborist/monitor/snmp.rb Wed Apr 04 11:00:35 2018 -0700 @@ -1,109 +1,133 @@ # -*- ruby -*- # vim: set noet nosta sw=4 ts=4 : #encoding: utf-8 -# + +require 'arborist/monitor' unless defined?( Arborist::Monitor ) +require 'net-snmp2' + # SNMP checks for Arborist. Requires an SNMP agent to be installed -# on target machine, and the various "pieces" enabled. For your platform. +# on target machine, and the various "pieces" enabled for your platform. # # For example, for disk monitoring with Net-SNMP, you'll want to set # 'includeAllDisks' in the snmpd.conf. bsnmpd on FreeBSD benefits from # the 'bsnmp-ucd' package. Etc. # - -require 'loggability' -require 'arborist/monitor' unless defined?( Arborist::Monitor ) -require 'snmp' - -using Arborist::TimeRefinements +module Arborist::Monitor::SNMP + using Arborist::TimeRefinements + extend Configurability, Loggability -# Shared SNMP monitor logic. -# -module Arborist::Monitor::SNMP - extend Loggability - log_to :arborist - - # The version of this library. - VERSION = '0.3.1' - - # Global defaults for instances of this monitor - # - DEFAULT_OPTIONS = { - timeout: 2, - retries: 1, - community: 'public', - port: 161 - } + # Loggability API + log_to :arborist_snmp # Always request the node addresses and any config. 
USED_PROPERTIES = [ :addresses, :config ].freeze - ### Return the properties used by this monitor. - def self::node_properties - return USED_PROPERTIES + # The OID that returns the system environment. + IDENTIFICATION_OID = '1.3.6.1.2.1.1.1.0' + + # Global defaults for instances of this monitor + # + configurability( 'arborist.snmp' ) do + setting :timeout, default: 2 + setting :retries, default: 1 + setting :community, default: 'public' + setting :version, default: '2c' + setting :port, default: 161 + + # How many hosts to check simultaneously + setting :batchsize, default: 25 end + # Indicate to FFI that we're using threads. + Net::SNMP.thread_safe = true + + + # The system type, as advertised. + attr_reader :system + + # The mapping of addresses back to node identifiers. + attr_reader :identifiers + + # The results hash that is sent back to the manager. + attr_reader :results + ### Connect to the SNMP daemon and yield. ### def run( nodes ) - self.log.debug "Got nodes to SNMP check: %p" % [ nodes ] - opts = Arborist::Monitor::SNMP::DEFAULT_OPTIONS # Create mapping of addresses back to node identifiers, # and retain any custom (overrides) config per node. # @identifiers = {} @results = {} - nodes.each_pair do |(identifier, props)| next unless props.key?( 'addresses' ) address = props[ 'addresses' ].first - @identifiers[ address ] = [ identifier, props['config'] ] + self.identifiers[ address ] = [ identifier, props['config'] ] end # Perform the work! 
# - threads = [] - @identifiers.keys.each do |host| - thr = Thread.new do - Thread.current.abort_on_exception = true + mainstart = Time.now + threads = ThreadGroup.new + batchcount = nodes.size / Arborist::Monitor::SNMP.batchsize + self.log.debug "Starting SNMP run for %d nodes" % [ nodes.size ] - config = @identifiers[host].last || {} - opts = { - host: host, - port: config[ 'port' ] || opts[ :port ], - community: config[ 'community' ] || opts[ :community ], - timeout: config[ 'timeout' ] || opts[ :timeout ], - retries: config[ 'retries' ] || opts[ :retries ] - } + self.identifiers.keys.each_slice( Arborist::Monitor::SNMP.batchsize ).each_with_index do |slice, batch| + slicestart = Time.now + self.log.debug " %d hosts (batch %d of %d)" % [ + slice.size, + batch + 1, + batchcount + 1 + ] + + slice.each do |host| + thr = Thread.new do + config = self.identifiers[ host ].last || {} + opts = { + peername: host, + port: config[ 'port' ] || Arborist::Monitor::SNMP.port, + version: config[ 'version' ] || Arborist::Monitor::SNMP.version, + community: config[ 'community' ] || Arborist::Monitor::SNMP.community, + timeout: config[ 'timeout' ] || Arborist::Monitor::SNMP.timeout, + retries: config[ 'retries' ] || Arborist::Monitor::SNMP.retries + } - begin - SNMP::Manager.open( opts ) do |snmp| - yield( snmp, host ) + snmp = Net::SNMP::Session.open( opts ) + begin + @system = snmp.get( IDENTIFICATION_OID ).varbinds.first.value + yield( host, snmp ) + + rescue Net::SNMP::TimeoutError, Net::SNMP::Error => err + self.log.error "%s: %s %s" % [ host, err.message, snmp.error_message ] + self.results[ host ] = { + error: "%s" % [ snmp.error_message ] + } + rescue => err + self.results[ host ] = { + error: "Uncaught exception. (%s: %s)" % [ err.class.name, err.message ] + } + ensure + snmp.close end - rescue SNMP::RequestTimeout - @results[ host ] = { - error: "Host is not responding to SNMP requests." 
- } - rescue StandardError => err - @results[ host ] = { - error: "Network is not accessible. (%s: %s)" % [ err.class.name, err.message ] - } end + + threads.add( thr ) end - threads << thr + + # Wait for thread completions + threads.list.map( &:join ) + self.log.debug " finished after %0.1f seconds." % [ Time.now - slicestart ] end - - # Wait for thread completion - threads.map( &:join ) + self.log.debug "Completed SNMP run for %d nodes after %0.1f seconds." % [ nodes.size, Time.now - mainstart ] # Map everything back to identifier -> attribute(s), and send to the manager. # - reply = @results.each_with_object({}) do |(address, results), hash| - identifier = @identifiers[ address ] or next + reply = self.results.each_with_object({}) do |(address, results), hash| + identifier = self.identifiers[ address ] or next hash[ identifier.first ] = results end - self.log.debug "Sending to manager: %p" % [ reply ] return reply ensure @@ -113,9 +137,8 @@ end # Arborist::Monitor::SNMP +require 'arborist/monitor/snmp/cpu' require 'arborist/monitor/snmp/disk' -require 'arborist/monitor/snmp/load' +require 'arborist/monitor/snmp/process' require 'arborist/monitor/snmp/memory' -require 'arborist/monitor/snmp/process' -require 'arborist/monitor/snmp/swap' diff -r 4548e58c8c66 -r e0b7c95a154f lib/arborist/monitor/snmp/cpu.rb --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lib/arborist/monitor/snmp/cpu.rb Wed Apr 04 11:00:35 2018 -0700 @@ -0,0 +1,140 @@ +# -*- ruby -*- +# vim: set noet nosta sw=4 ts=4 : + +require 'arborist/monitor/snmp' unless defined?( Arborist::Monitor::SNMP ) + +# Machine load/cpu checks. +# +# Sets current 1, 5, and 15 minute loads under the 'load' attribute, +# and calculates/warns on cpu overutilization. +# +class Arborist::Monitor::SNMP::CPU + include Arborist::Monitor::SNMP + + extend Configurability, Loggability + log_to :arborist_snmp + + # OIDS for discovering system load. 
+ # + OIDS = { + load: '1.3.6.1.4.1.2021.10.1.3', + cpu: '1.3.6.1.2.1.25.3.3.1.2' + } + + # When walking load OIDS, the iterator count matches + # these labels. + # + LOADKEYS = { + 1 => :load1, + 2 => :load5, + 3 => :load15 + } + + + # Global defaults for instances of this monitor + # + configurability( 'arborist.snmp.cpu' ) do + # What overutilization percentage qualifies as a warning + setting :warn_at, default: 80 + end + + + ### Return the properties used by this monitor. + ### + def self::node_properties + return USED_PROPERTIES + end + + + ### Class #run creates a new instance and immediately runs it. + ### + def self::run( nodes ) + return new.run( nodes ) + end + + + ### Perform the monitoring checks. + ### + def run( nodes ) + super do |host, snmp| + self.find_load( host, snmp ) + end + end + + + ######### + protected + ######### + + ### Return system CPU data. + ### + def cpu( snmp ) + return snmp.walk( OIDS[:cpu] ) + end + + + ### Find load data, add additional niceties for reporting. + ### + def format_load( snmp ) + info = { cpu: {}, load: {} } + cpus = self.cpu( snmp ) + + info[ :cpu ][ :count ] = cpus.size + + # Windows SNMP doesn't have a concept of "load" over time, + # so we have to just use the current averaged CPU usage. + # + # This means that windows machines will very likely want to + # adjust the default "overutilization" number, considering + # it's really just how much of the CPU is used at the time of + # the monitor run, along with liberal use of the Observer "only + # alert after X events" pragmas. + # + if self.system =~ /windows\s+/i + info[ :cpu ][ :usage ] = cpus.values.inject( :+ ).to_f / cpus.size + info[ :message ] = "System is %0.1f%% in use." % [ info[ :cpu ][ :usage ] ] + + # UCDavis stuff is better for alerting only after there has been + # an extended load event. Use the 5 minute average to avoid + # state changes on transient spikes. 
+ # + else + snmp.walk( OIDS[:load] ).each_with_index do |(_, value), idx| + next unless LOADKEYS[ idx + 1 ] + info[ :load ][ LOADKEYS[idx + 1] ] = value.to_f + end + + percentage = (( ( info[:load][ :load5 ] / cpus.size ) - 1 ) * 100 ).round( 1 ) + + if percentage < 0 + info[ :message ] = "System is %0.1f%% idle." % [ percentage.abs ] + info[ :cpu ][ :usage ] = percentage + 100 + else + info[ :message ] = "System is %0.1f%% overloaded." % [ percentage ] + info[ :cpu ][ :usage ] = percentage + end + end + + return info + end + + + ### Collect the load information for +host+ from an existing + ### (and open) +snmp+ connection. + ### + def find_load( host, snmp ) + info = self.format_load( snmp ) + + config = identifiers[ host ].last || {} + warn_at = config[ 'warn_at' ] || self.class.warn_at + usage = info.dig( :cpu, :usage ) || 0 + + if usage >= warn_at + info[ :warning ] = "%0.1f utilization exceeds %0.1f percent" % [ usage, warn_at ] + end + + self.results[ host ] = info + end + +end # class Arborist::Monitor::SNMP::CPU + diff -r 4548e58c8c66 -r e0b7c95a154f lib/arborist/monitor/snmp/disk.rb --- a/lib/arborist/monitor/snmp/disk.rb Wed Aug 30 13:55:02 2017 -0700 +++ b/lib/arborist/monitor/snmp/disk.rb Wed Apr 04 11:00:35 2018 -0700 @@ -3,89 +3,89 @@ require 'arborist/monitor/snmp' unless defined?( Arborist::Monitor::SNMP ) -# SNMP Disk capacity checks. -# Returns all mounts with their current usage percentage in a "mount" attribute. +# Disk capacity checks. +# +# Sets all configured mounts with their current usage percentage +# in an attribute named "mounts". # class Arborist::Monitor::SNMP::Disk include Arborist::Monitor::SNMP - extend Loggability - log_to :arborist - - # The OID that returns the system environment. - IDENTIFICATION_OID = '1.3.6.1.2.1.1.1.0' - - # For net-snmp systems, ignore mount types that match - # this regular expression. This includes null/union mounts - # and NFS, currently. 
- STORAGE_IGNORE = %r{25.3.9.(?:2|14)$} - - # The OID that matches a local windows hard disk. Anything else - # is a remote (SMB) mount. - WINDOWS_DEVICE = '1.3.6.1.2.1.25.2.1.4' + extend Configurability, Loggability + log_to :arborist_snmp # OIDS required to pull disk information from net-snmp. # - STORAGE_NET_SNMP = [ - '1.3.6.1.4.1.2021.9.1.2', # paths - '1.3.6.1.2.1.25.3.8.1.4', # types - '1.3.6.1.4.1.2021.9.1.9' # percents + STORAGE_NET_SNMP = { + path: '1.3.6.1.4.1.2021.9.1.2', + percent: '1.3.6.1.4.1.2021.9.1.9', + type: '1.3.6.1.2.1.25.3.8.1.4' + } + + # The OID that matches a local windows hard disk. + # + WINDOWS_DEVICES = [ + '1.3.6.1.2.1.25.2.1.4', # local disk + '1.3.6.1.2.1.25.2.1.7' # removables, but we have to include them for iscsi mounts ] # OIDS required to pull disk information from Windows. # - STORAGE_WINDOWS = [ - '1.3.6.1.2.1.25.2.3.1.2', # types - '1.3.6.1.2.1.25.2.3.1.3', # paths - '1.3.6.1.2.1.25.2.3.1.5', # totalsize - '1.3.6.1.2.1.25.2.3.1.6' # usedsize - ] - - # Global defaults for instances of this monitor - # - DEFAULT_OPTIONS = { - error_at: 95, # in percent full - include: [], # if non-empty, only these paths are included in checks - exclude: [] # paths to exclude from checks + STORAGE_WINDOWS = { + type: '1.3.6.1.2.1.25.2.3.1.2', + path: '1.3.6.1.2.1.25.2.3.1.3', + total: '1.3.6.1.2.1.25.2.3.1.5', + used: '1.3.6.1.2.1.25.2.3.1.6' } + # The fallback warning capacity. + WARN_AT = 90 - ### This monitor is complex enough to require creating an instance from the caller. - ### Provide a friendlier error message the class was provided to exec() directly. + + # Configurability API + # + configurability( 'arborist.snmp.disk' ) do + # What percentage qualifies as a warning + setting :warn_at, default: WARN_AT + + # If non-empty, only these paths are included in checks. 
+ # + setting :include do |val| + if val + mounts = Array( val ).map{|m| Regexp.new(m) } + Regexp.union( mounts ) + end + end + + # Paths to exclude from checks + # + setting :exclude, + default: [ '^/dev(/.+)?$', '^/net(/.+)?$', '^/proc$', '^/run$', '^/sys/' ] do |val| + mounts = Array( val ).map{|m| Regexp.new(m) } + Regexp.union( mounts ) + end + end + + + ### Return the properties used by this monitor. + ### + def self::node_properties + return USED_PROPERTIES + end + + + ### Class #run creates a new instance and immediately runs it. ### def self::run( nodes ) return new.run( nodes ) end - ### Create a new instance of this monitor. - ### - def initialize( options=DEFAULT_OPTIONS ) - options = DEFAULT_OPTIONS.merge( options || {} ) - %i[ include exclude ].each do |opt| - options[ opt ] = Array( options[opt] ) - end - - options.each do |name, value| - self.public_send( "#{name.to_s}=", value ) - end - end - - # Set an error if mount points are above this percentage. - attr_accessor :error_at - - # Only check these specific mount points. - attr_accessor :include - - # Exclude these mount points (array of paths) from checks. - attr_accessor :exclude - - ### Perform the monitoring checks. ### def run( nodes ) - super do |snmp, host| - self.gather_disks( snmp, host ) + super do |host, snmp| + self.gather_disks( host, snmp ) end end @@ -95,73 +95,89 @@ ######### ### Collect mount point usage for +host+ from an existing (and open) - #### +snmp+ connection. + ### +snmp+ connection. ### - def gather_disks( snmp, host ) - self.log.debug "Getting disk information for %s" % [ host ] - errors = [] - results = {} - mounts = self.get_disk_percentages( snmp ) - config = @identifiers[ host ].last || {} + def gather_disks( host, snmp ) + mounts = self.system =~ /windows\s+/i ? 
self.windows_disks( snmp ) : self.unix_disks( snmp ) + config = self.identifiers[ host ].last || {} + warn_at = config[ 'warn_at' ] || self.class.warn_at + + includes = self.format_mounts( config, 'include' ) || self.class.include + excludes = self.format_mounts( config, 'exclude' ) || self.class.exclude + + mounts.reject! do |path, percentage| + excludes.match( path ) || ( includes && ! includes.match( path ) ) + end - includes = config[ 'include' ] || self.include - excludes = config[ 'exclude' ] || self.exclude - + errors = [] + warnings = [] mounts.each_pair do |path, percentage| - next if excludes.include?( path ) - next if ! includes.empty? && ! includes.include?( path ) - if percentage >= ( config[ 'error_at' ] || self.error_at ) - errors << "%s at %d%% capacity" % [ path, percentage ] + + warn = begin + if warn_at.is_a?( Hash ) + warn_at[ path ] || WARN_AT + else + warn_at + end + end + + self.log.debug "%s:%s -> at %d, warn at %d" % [ host, path, percentage, warn ] + + if percentage >= warn.to_i + if percentage >= 100 + errors << "%s at %d%% capacity" % [ path, percentage ] + else + warnings << "%s at %d%% capacity" % [ path, percentage ] + end end end - results[ :mounts ] = mounts - results[ :error ] = errors.join( ', ' ) unless errors.empty? - - @results[ host ] = results + self.results[ host ] = { mounts: mounts } + self.results[ host ][ :error ] = errors.join(', ') unless errors.empty? + self.results[ host ][ :warning ] = warnings.join(', ') unless warnings.empty? end - ### Given a SNMP object, return a hash of: - ### - ### device path => percentage full + ### Return a single regexp for the 'include' or 'exclude' section of + ### resource node's +config+, or nil if nonexistent. ### - def get_disk_percentages( snmp ) + def format_mounts( config, section ) + list = config[ section ] || return + mounts = Array( list ).map{|m| Regexp.new(m) } + return Regexp.union( mounts ) + end - # Does this look like a windows system, or a net-snmp based one? 
- system_type = snmp.get( SNMP::ObjectId.new( IDENTIFICATION_OID ) ).varbind_list.first.value - disks = {} - # Windows has it's own MIBs. - # - if system_type =~ /windows/i - snmp.walk( STORAGE_WINDOWS ) do |list| - next unless list[0].value.to_s == WINDOWS_DEVICE - disks[ list[1].value.to_s ] = ( list[3].value.to_f / list[2].value.to_f ) * 100 - end - return disks - end + ### Fetch information for Windows systems. + ### + def windows_disks( snmp ) + raw = snmp.get_bulk([ + STORAGE_WINDOWS[:path], + STORAGE_WINDOWS[:type], + STORAGE_WINDOWS[:total], + STORAGE_WINDOWS[:used] + ]).varbinds.map( &:value ) - # Everything else. - # - snmp.walk( STORAGE_NET_SNMP ) do |list| - mount = list[0].value.to_s - next if mount == 'noSuchInstance' - - next if list[2].value.to_s == 'noSuchInstance' - used = list[2].value.to_i - - unless list[1].value.to_s == 'noSuchInstance' - typeoid = list[1].value.join('.').to_s - next if typeoid =~ STORAGE_IGNORE - end - next if mount =~ /\/(?:dev|proc)$/ - - disks[ mount ] = used + disks = {} + raw.each_slice( 4 ) do |device| + next unless device[1].respond_to?( :oid ) && WINDOWS_DEVICES.include?( device[1].oid ) + next if device[2].zero? + disks[ device[0] ] = (( device[3].to_f / device[2] ) * 100).round( 1 ) end return disks end + + ### Fetch information for Unix/MacOS systems. + ### + def unix_disks( snmp ) + raw = snmp.get_bulk([ + STORAGE_NET_SNMP[:path], + STORAGE_NET_SNMP[:percent] ]).varbinds.map( &:value ) + + return Hash[ *raw ] + end + end # class Arborist::Monitor::SNMP::Disk diff -r 4548e58c8c66 -r e0b7c95a154f lib/arborist/monitor/snmp/load.rb --- a/lib/arborist/monitor/snmp/load.rb Wed Aug 30 13:55:02 2017 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,82 +0,0 @@ -# -*- ruby -*- -# vim: set noet nosta sw=4 ts=4 : - -require 'arborist/monitor/snmp' unless defined?( Arborist::Monitor::SNMP ) - -# SNMP 5 minute load checks. -# Sets current 5 minute load as a 'load5' attribute. 
-# -class Arborist::Monitor::SNMP::Load - include Arborist::Monitor::SNMP - - extend Loggability - log_to :arborist - - # OIDS for discovering system load. - # - LOAD = { - five_min: '1.3.6.1.4.1.2021.10.1.3.2' - } - - # Global defaults for instances of this monitor - # - DEFAULT_OPTIONS = { - error_at: 7 - } - - - ### Class #run creates a new instance. - ### - def self::run( nodes ) - return new.run( nodes ) - end - - - ### Create a new instance of this monitor. - ### - def initialize( options=DEFAULT_OPTIONS ) - options = DEFAULT_OPTIONS.merge( options || {} ) - options.each do |name, value| - self.public_send( "#{name.to_s}=", value ) - end - end - - # Set an error if mount points are above this percentage. - attr_accessor :error_at - - - ### Perform the monitoring checks. - ### - def run( nodes ) - super do |snmp, host| - self.gather_load( snmp, host ) - end - end - - - ######### - protected - ######### - - ### Collect the load information for +host+ from an existing - ### (and open) +snmp+ connection. - ### - def gather_load( snmp, host ) - self.log.debug "Getting system load for: %s" % [ host ] - load5 = snmp.get( SNMP::ObjectId.new( LOAD[:five_min] ) ).varbind_list.first.value.to_f - self.log.debug " Load on %s: %0.2f" % [ host, load5 ] - - config = @identifiers[ host ].last || {} - error_at = config[ 'error_at' ] || self.error_at - if load5 >= error_at - @results[ host ] = { - error: "Load has exceeded %0.2f over a 5 minute average" % [ error_at ], - load5: load5 - } - else - @results[ host ] = { load5: load5 } - end - end - -end # class Arborist::Monitor::SNMP::Load - diff -r 4548e58c8c66 -r e0b7c95a154f lib/arborist/monitor/snmp/memory.rb --- a/lib/arborist/monitor/snmp/memory.rb Wed Aug 30 13:55:02 2017 -0700 +++ b/lib/arborist/monitor/snmp/memory.rb Wed Apr 04 11:00:35 2018 -0700 @@ -3,54 +3,73 @@ require 'arborist/monitor/snmp' unless defined?( Arborist::Monitor::SNMP ) -# SNMP memory availability checks. 
-# Returns total available memory in Kb to the 'available_memory' attribute. +# SNMP memory and swap utilization checks. +# +# Set 'usage' and 'available' keys as properties, in percentage/MBs, +# respectively. +# +# By default, doesn't warn on memory usage, only swap, since +# that's more indicative of a problem. You can still set the +# 'physical_warn_at' key to force warnings on ram usage, for embedded +# systems or other similar things without virtual memory. # class Arborist::Monitor::SNMP::Memory include Arborist::Monitor::SNMP - extend Loggability - log_to :arborist + extend Configurability, Loggability + log_to :arborist_snmp # OIDS for discovering memory usage. # MEMORY = { - mem_avail: '1.3.6.1.4.1.2021.4.6.0' + total: '1.3.6.1.4.1.2021.4.5.0', + avail: '1.3.6.1.4.1.2021.4.6.0', + windows: { + label: '1.3.6.1.2.1.25.2.3.1.3', + units: '1.3.6.1.2.1.25.2.3.1.4', + total: '1.3.6.1.2.1.25.2.3.1.5', + used: '1.3.6.1.2.1.25.2.3.1.6' + } + } + + # OIDS for discovering swap usage. + # + SWAP = { + total: '1.3.6.1.4.1.2021.4.3.0', + avail: '1.3.6.1.4.1.2021.4.4.0' } # Global defaults for instances of this monitor # - DEFAULT_OPTIONS = { - error_at: 95, # in percent full - } + configurability( 'arborist.snmp.memory' ) do + # What memory usage percentage qualifies as a warning + setting :physical_warn_at, default: nil + + # What swap usage percentage qualifies as a warning + setting :swap_warn_at, default: 60 + end - ### This monitor is complex enough to require creating an instance from the caller. - ### Provide a friendlier error message the class was provided to exec() directly. + ### Return the properties used by this monitor. + ### + def self::node_properties + return USED_PROPERTIES + end + + + + ### Class #run creates a new instance and immediately runs it. ### def self::run( nodes ) return new.run( nodes ) end - ### Create a new instance of this monitor. 
- ### - def initialize( options=DEFAULT_OPTIONS ) - options = DEFAULT_OPTIONS.merge( options || {} ) - options.each do |name, value| - self.public_send( "#{name.to_s}=", value ) - end - end - - # Set an error if memory used is below this many kilobytes. - attr_accessor :error_at - - ### Perform the monitoring checks. ### def run( nodes ) - super do |snmp, host| - self.gather_free_memory( snmp, host ) + super do |host, snmp| + self.gather_memory( host, snmp ) end end @@ -62,22 +81,100 @@ ### Collect available memory information for +host+ from an existing ### (and open) +snmp+ connection. ### - def gather_free_memory( snmp, host ) - self.log.debug "Getting available memory for: %s" % [ host ] - mem_avail = snmp.get( SNMP::ObjectId.new( MEMORY[:mem_avail] ) ).varbind_list.first.value.to_f - self.log.debug " Available memory on %s: %0.2f" % [ host, mem_avail ] + def gather_memory( host, snmp ) + info = self.system =~ /windows\s+/i ? self.get_windows( snmp ) : self.get_mem( snmp ) + + config = identifiers[ host ].last || {} + physical_warn_at = config[ 'physical_warn_at' ] || self.class.physical_warn_at + swap_warn_at = config[ 'swap_warn_at' ] || self.class.swap_warn_at + + self.log.debug "Memory data on %s: %p" % [ host, info ] + memory, swap = info[:memory], info[:swap] + self.results[ host ] = { memory: memory, swap: swap } - config = @identifiers[ host ].last || {} - error_at = config['error_at'] || self.error_at - if mem_avail <= error_at - @results[ host ] = { - error: "Available memory is under %0.1fMB" % [ error_at.to_f / 1024 ], - available_memory: mem_avail - } - else - @results[ host ] = { available_memory: mem_avail } + memusage = memory[ :usage ].to_i + if physical_warn_at && memusage >= physical_warn_at + self.results[ host ][ :warning ] = "%0.1f memory utilization exceeds %0.1f percent" % [ + memusage, + physical_warn_at + ] + end + + swapusage = swap[ :usage ].to_i + if swapusage >= swap_warn_at + self.results[ host ][ :warning ] = "%0.1f swap 
utilization exceeds %0.1f percent" % [ + swapusage, + swap_warn_at + ] end end + + ### Return a hash of usage percentage in use, and free mem in + ### megs. + ### + def get_mem( snmp ) + info = {} + info[ :memory ] = self.calc_memory( snmp, MEMORY ) + info[ :swap ] = self.calc_memory( snmp, SWAP ) + + return info + end + + + ### Windows appends virtual and physical memory onto the last two items + ### of the storage iterator, because that made sense in someone's mind. + ### Walk the whole oid tree, and get the values we're after, return + ### a hash of usage percentage in use and free mem in megs. + ### + def get_windows( snmp ) + info = { memory: {}, swap: {} } + mem_idx, swap_idx = nil + + snmp.walk( MEMORY[:windows][:label] ).each_with_index do |(_, val), i| + mem_idx = i + 1 if val =~ /physical memory/i + swap_idx = i + 1 if val =~ /virtual memory/i + end + return info unless mem_idx + + info[ :memory ] = self.calc_windows_memory( snmp, mem_idx ) + info[ :swap ] = self.calc_windows_memory( snmp, swap_idx ) + + return info + end + + + ### Format usage and available amount, given an OID hash. + ### + def calc_memory( snmp, oids ) + info = { usage: 0, available: 0 } + avail = snmp.get( oids[:avail] ).varbinds.first.value.to_f + total = snmp.get( oids[:total] ).varbinds.first.value.to_f + used = total - avail + + return info if avail.zero? + + info[ :usage ] = (( used / total ) * 100 ).round( 2 ) + info[ :available ] = (( total - used ) / 1024 ).round( 2 ) + return info + end + + + ### Format usage and available amount for windows. 
+ ### + def calc_windows_memory( snmp, idx) + info = { usage: 0, available: 0 } + return info unless idx + + units = snmp.get( MEMORY[:windows][:units] + ".#{idx}" ).varbinds.first.value + total = snmp.get( MEMORY[:windows][:total] + ".#{idx}" ).varbinds.first.value.to_f * units + used = snmp.get( MEMORY[:windows][:used] + ".#{idx}" ).varbinds.first.value.to_f * units + + info[ :usage ] = (( used / total ) * 100 ).round( 2 ) + info[ :available ] = (( total - used ) / 1024 / 1024 ).round( 2 ) + return info + end + + end # class Arborist::Monitor::SNMP::Memory diff -r 4548e58c8c66 -r e0b7c95a154f lib/arborist/monitor/snmp/process.rb --- a/lib/arborist/monitor/snmp/process.rb Wed Aug 30 13:55:02 2017 -0700 +++ b/lib/arborist/monitor/snmp/process.rb Wed Apr 04 11:00:35 2018 -0700 @@ -5,56 +5,60 @@ # SNMP running process checks. # +# This only checks running userland processes. +# class Arborist::Monitor::SNMP::Process include Arborist::Monitor::SNMP - extend Loggability - log_to :arborist + extend Configurability, Loggability + log_to :arborist_snmp + # OIDS for discovering running processes. + # Of course, Windows does it slightly differently. # PROCESS = { - list: '1.3.6.1.2.1.25.4.2.1.4', - args: '1.3.6.1.2.1.25.4.2.1.5' + netsnmp: { + list: '1.3.6.1.2.1.25.4.2.1.4', + args: '1.3.6.1.2.1.25.4.2.1.5' + }, + windows: { + list: '1.3.6.1.2.1.25.4.2.1.2', + path: '1.3.6.1.2.1.25.4.2.1.4', + args: '1.3.6.1.2.1.25.4.2.1.5' + } } + # Global defaults for instances of this monitor # - DEFAULT_OPTIONS = { - processes: [] # list of procs to match - } + configurability( 'arborist.snmp.processes' ) do + # Default list of processes to check for + setting :check, default: [] do |val| + Array( val ) + end + end - ### This monitor is complex enough to require creating an instance from the caller. - ### Provide a friendlier error message the class was provided to exec() directly. + ### Return the properties used by this monitor. 
+ ### + def self::node_properties + return USED_PROPERTIES + end + + + ### Class #run creates a new instance and immediately runs it. ### def self::run( nodes ) return new.run( nodes ) end - ### Create a new instance of this monitor. - ### - def initialize( options=DEFAULT_OPTIONS ) - options = DEFAULT_OPTIONS.merge( options || {} ) - %i[ processes ].each do |opt| - options[ opt ] = Array( options[opt] ) - end - - options.each do |name, value| - self.public_send( "#{name.to_s}=", value ) - end - end - - # Set an error if processes in this array aren't running. - attr_accessor :processes - - ### Perform the monitoring checks. ### def run( nodes ) - super do |snmp, host| - self.gather_processlist( snmp, host ) + super do |host, snmp| + self.gather_processlist( host, snmp ) end end @@ -66,32 +70,52 @@ ### Collect running processes on +host+ from an existing (and open) #### +snmp+ connection. ### - def gather_processlist( snmp, host ) - self.log.debug "Getting running process list for %s" % [ host ] - config = @identifiers[ host ].last || {} - procs = [] + def gather_processlist( host, snmp ) + config = self.identifiers[ host ].last || {} errors = [] + procs = self.system =~ /windows\s+/i ? self.get_windows( snmp ) : self.get_procs( snmp ) - snmp.walk([ PROCESS[:list], PROCESS[:args] ]) do |list| - process = list[0].value.to_s - args = list[1].value.to_s - procs << "%s %s " % [ process, args ] + self.log.debug "Running processes for host: %s: %p" % [ host, procs ] + self.results[ host ] = { count: procs.size } + + # Check against what is running. + # + Array( config['processes'] || self.class.check ).each do |process| + process_r = Regexp.new( process ) + found = procs.find{|p| p.match(process_r) } + errors << "'%s' is not running" % [ process ] unless found end - # Check against the running stuff, setting an error if - # one isn't found. 
- # - Array( config['processes'] || self.processes ).each do |process| - process_r = Regexp.new( process ) - found = procs.find{|p| p.match(process_r) } - errors << "Process '%s' is not running" % [ process, host ] unless found + self.results[ host ][ :error ] = errors.join( ', ' ) unless errors.empty? + end + + + ### Parse OIDS and return an Array of running processes. + ### Windows specific behaviors. + ### + def get_windows( snmp ) + oids = [ PROCESS[:windows][:path], PROCESS[:windows][:list], PROCESS[:windows][:args] ] + return snmp.walk( oids ).each_slice( 3 ). each_with_object( [] ) do |vals, acc| + path, process, args = vals[0][1], vals[1][1], vals[2][1] + next if path.empty? + + process = "%s%s" % [ path, process ] + process << " %s" % [ args ] unless args.empty? + acc << process end + end - self.log.debug " %d running processes" % [ procs.length ] - if errors.empty? - @results[ host ] = {} - else - @results[ host ] = { error: errors.join( ', ' ) } + + ### Parse OIDS and return an Array of running processes. + ### + def get_procs( snmp ) + oids = [ PROCESS[:netsnmp][:list], PROCESS[:netsnmp][:args] ] + return snmp.walk( oids ).each_slice( 2 ).each_with_object( [] ) do |vals, acc| + process, args = vals[0][1], vals[1][1] + next if process.empty? + + process << " %s" % [ args ] unless args.empty? + acc << process end end diff -r 4548e58c8c66 -r e0b7c95a154f lib/arborist/snmp.rb --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lib/arborist/snmp.rb Wed Apr 04 11:00:35 2018 -0700 @@ -0,0 +1,35 @@ +# -*- ruby -*- +#encoding: utf-8 + +require 'loggability' +require 'arborist' + + +# Various monitoring checks using SNMP, for the Arborist monitoring toolkit. 
+module Arborist::SNMP + extend Loggability + + # Loggability API -- set up a log host for this library + log_as :arborist_snmp + + + # Package version + VERSION = '0.4.0' + + # Version control revision + REVISION = %q$Revision$ + + + ### Return the name of the library with the version, and optionally the build ID if + ### +include_build+ is true. + def self::version_string( include_build: false ) + str = "%p v%s" % [ self, VERSION ] + str << ' (' << REVISION.strip << ')' if include_build + return str + end + + + require 'arborist/monitor/snmp' + +end # module Arborist::SNMP +