lib/arborist/monitor/snmp.rb
changeset 0 8547a1ce445e
child 1 8446f55f7e58
equal deleted inserted replaced
-1:000000000000 0:8547a1ce445e
       
     1 # -*- ruby -*-
       
     2 # vim: set noet nosta sw=4 ts=4 :
       
     3 #encoding: utf-8
       
     4 #
       
     5 # SNMP checks for Arborist.  Requires an SNMP agent to be installed
       
     6 # on the target machine, with the relevant "pieces" enabled for your platform.
       
     7 #
       
     8 # For example, for disk monitoring with Net-SNMP, you'll want to set
       
     9 # 'includeAllDisks' in the snmpd.conf. bsnmpd on FreeBSD benefits from
       
    10 # the 'bsnmp-ucd' package.  Etc.
       
    11 #
       
    12 
       
    13 require 'loggability'
       
    14 require 'arborist/monitor' unless defined?( Arborist::Monitor )
       
    15 require 'snmp'
       
    16 
       
    17 using Arborist::TimeRefinements
       
    18 
       
    19 # SNMP specific monitors and monitor logic.
       
    20 #
       
    21 class Arborist::Monitor::SNMP
       
    22 	extend Loggability
       
    23 	log_to :arborist
       
    24 
       
    25 	# The version of this library.
       
    26 	VERSION = '0.1.0'
       
    27 
       
    28 	# "Modes" that this monitor understands.
       
    29 	VALID_MODES = %i[ disk load memory swap process ]
       
    30 
       
    31 	# The OID that returns the system environment.
       
    32 	IDENTIFICATION_OID = '1.3.6.1.2.1.1.1.0'
       
    33 
       
    34 	# For net-snmp systems, ignore mount types that match
       
    35 	# this regular expression.  This includes null/union mounts
       
    36 	# and NFS, currently.
       
    37 	STORAGE_IGNORE = %r{25.3.9.(?:2|14)$}
       
    38 
       
    39 	# The OID that matches a local windows hard disk.  Anything else
       
    40 	# is a remote (SMB) mount.
       
    41 	WINDOWS_DEVICE = '1.3.6.1.2.1.25.2.1.4'
       
    42 
       
    43 	# OIDS required to pull disk information from net-snmp.
       
    44 	#
       
    45 	STORAGE_NET_SNMP = [
       
    46 		'1.3.6.1.4.1.2021.9.1.2', # paths
       
    47 		'1.3.6.1.2.1.25.3.8.1.4', # types
       
    48 		'1.3.6.1.4.1.2021.9.1.9'  # percents
       
    49 	]
       
    50 
       
    51 	# OIDS required to pull disk information from Windows.
       
    52 	#
       
    53 	STORAGE_WINDOWS = [
       
    54 		'1.3.6.1.2.1.25.2.3.1.2', # types
       
    55 		'1.3.6.1.2.1.25.2.3.1.3', # paths
       
    56 		'1.3.6.1.2.1.25.2.3.1.5', # totalsize
       
    57 		'1.3.6.1.2.1.25.2.3.1.6'  # usedsize
       
    58 	]
       
    59 
       
    60 	# OIDS for discovering memory usage.
       
    61 	#
       
    62 	MEMORY = {
       
    63 		swap_total: '1.3.6.1.4.1.2021.4.3.0',
       
    64 		swap_avail: '1.3.6.1.4.1.2021.4.4.0',
       
    65 		mem_avail:  '1.3.6.1.4.1.2021.4.6.0'
       
    66 	}
       
    67 
       
    68 	# OIDS for discovering system load.
       
    69 	#
       
    70 	LOAD = {
       
    71 		five_min: '1.3.6.1.4.1.2021.10.1.3.2'
       
    72 	}
       
    73 
       
    74 	# OIDS for discovering running processes.
       
    75 	#
       
    76 	PROCESS = {
       
    77 		 list: '1.3.6.1.2.1.25.4.2.1.4',
       
    78 		 args: '1.3.6.1.2.1.25.4.2.1.5'
       
    79 	}
       
    80 
       
    81 
       
    82 	# Defaults for instances of this monitor
       
    83 	#
       
    84 	DEFAULT_OPTIONS = {
       
    85 		timeout:          2,
       
    86 		retries:          1,
       
    87 		community:        'public',
       
    88 		port:             161,
       
    89 		storage_error_at: 95,    # in percent full
       
    90 		load_error_at:    7,
       
    91 		swap_error_at:    25,    # in percent remaining
       
    92 		mem_error_at:     51200, # in kilobytes
       
    93 		processes:        []     # list of procs to match
       
    94 	}
       
    95 
       
    96 
       
    97 	### This monitor is complex enough to require creating an instance from the caller.
       
    98 	### Provide a friendlier error message the class was provided to exec() directly.
       
    99 	###
       
   100 	def self::run( nodes )
       
   101 		self.log.error "Please use %s via an instance." % [ self.name ]
       
   102 		return {}
       
   103 	end
       
   104 
       
   105 
       
   106 	### Create a new instance of this monitor.
       
   107 	###
       
   108 	def initialize( options=DEFAULT_OPTIONS )
       
   109 		options = DEFAULT_OPTIONS.merge( options || {} )
       
   110 
       
   111 		options.each do |name, value|
       
   112 			self.public_send( "#{name}=", value )
       
   113 		end
       
   114 	end
       
   115 
       
   116 
       
   117 	# The mode (section) that this SMMP instance should check.
       
   118 	# Must be a +VALID_MODES+ mode.
       
   119 	attr_reader :mode
       
   120 
       
   121 	# Mapping of node addresses back to the node identifier.
       
   122 	attr_reader :identifiers
       
   123 
       
   124 	# The results from the SNMP daemons, keyed by address.
       
   125 	attr_reader :results
       
   126 
       
   127 	# A timeout in seconds if the SNMP server isn't responding.
       
   128 	attr_accessor :timeout
       
   129 
       
   130 	# Retry with the timeout this many times.  Defaults to 1.
       
   131 	attr_accessor :retries
       
   132 
       
   133 	# The SNMP UDP port, if running on non default.
       
   134 	attr_accessor :port
       
   135 
       
   136 	# The community string to connect with.
       
   137 	attr_accessor :community
       
   138 
       
   139 	# Set an error if mount points are above this percentage.
       
   140 	attr_accessor :storage_error_at
       
   141 
       
   142 	# Set an error if the 5 minute load average exceeds this.
       
   143 	attr_accessor :load_error_at
       
   144 
       
   145 	# Set an error if used swap exceeds this percentage.
       
   146 	attr_accessor :swap_error_at
       
   147 
       
   148 	# Set an error if memory used is below this many kilobytes.
       
   149 	attr_accessor :mem_error_at
       
   150 
       
   151 	# Set an error if processes in this array aren't running.
       
   152 	attr_accessor :processes
       
   153 
       
   154 
       
   155 	### Set the SNMP mode, after validation.
       
   156 	###
       
   157 	def mode=( mode )
       
   158 		unless VALID_MODES.include?( mode.to_sym )
       
   159 			self.log.error "Unknown SNMP mode: %s" % [ mode ]
       
   160 			return nil
       
   161 		end
       
   162 
       
   163 		@mode    = mode.to_sym
       
   164 		@results = {}
       
   165 	end
       
   166 
       
   167 
       
   168 	### Perform the monitoring checks.
       
   169 	###
       
   170 	def run( nodes )
       
   171 		self.log.debug "Got nodes to SNMP check: %p" % [ nodes ]
       
   172 
       
   173 		# Sanity check.
       
   174 		#
       
   175 		unless self.mode
       
   176 			self.log.error "You must set the 'mode' for the SNMP monitor. (%s)" % [ VALID_MODES.join( ', ' ) ]
       
   177 			return {}
       
   178 		end
       
   179 
       
   180 		# Create mapping of addresses back to node identifiers.
       
   181 		#
       
   182 		@identifiers = nodes.each_with_object({}) do |(identifier, props), hash|
       
   183 			next unless props.key?( 'addresses' )
       
   184 			address = props[ 'addresses' ].first
       
   185 			hash[ address ] = identifier
       
   186 		end
       
   187 
       
   188 		# Perform the work!
       
   189 		#
       
   190 		threads = []
       
   191 		self.identifiers.keys.each do |host|
       
   192 			thr = Thread.new do
       
   193 				Thread.current.abort_on_exception = true
       
   194 				opts = {
       
   195 					host:      host,
       
   196 					port:      self.port,
       
   197 					community: self.community,
       
   198 					timeout:   self.timeout,
       
   199 					retries:   self.retries
       
   200 				}
       
   201 
       
   202 				begin
       
   203 					SNMP::Manager.open( opts ) do |snmp|
       
   204 						case self.mode
       
   205 						when :disk
       
   206 							self.gather_disks( snmp, host )
       
   207 						when :load
       
   208 							self.gather_load( snmp, host )
       
   209 						when :memory
       
   210 							self.gather_free_memory( snmp, host )
       
   211 						when :swap
       
   212 							self.gather_swap( snmp, host )
       
   213 						when :process
       
   214 							self.gather_processlist( snmp, host )
       
   215 						end
       
   216 					end
       
   217 				rescue SNMP::RequestTimeout
       
   218 					self.results[ host ] = {
       
   219 						error: "Host is not responding to SNMP requests."
       
   220 					}
       
   221 				rescue StandardError => err
       
   222 					self.results[ host ] = {
       
   223 						error: "Network is not accessible. (%s: %s)" % [ err.class.name, err.message ]
       
   224 					}
       
   225 				end
       
   226 			end
       
   227 			threads << thr
       
   228 		end
       
   229 
       
   230 		# Wait for thread completion
       
   231 		threads.map( &:join )
       
   232 
       
   233 		# Map everything back to identifier -> attribute(s), and send to the manager.
       
   234 		#
       
   235 		reply = self.results.each_with_object({}) do |(address, results), hash|
       
   236 			identifier = self.identifiers[ address ] or next
       
   237 			hash[ identifier ] = results
       
   238 		end
       
   239 		self.log.debug "Sending to manager: %p" % [ reply ]
       
   240 		return reply
       
   241 	end
       
   242 
       
   243 
       
   244 	#########
       
   245 	protected
       
   246 	#########
       
   247 
       
   248 	### Collect the load information for +host+ from an existing
       
   249 	### (and open) +snmp+ connection.
       
   250 	###
       
   251 	def gather_load( snmp, host )
       
   252 		self.log.debug "Getting system load for: %s" % [ host ]
       
   253 		load5 = snmp.get( SNMP::ObjectId.new( LOAD[:five_min] ) ).varbind_list.first.value.to_f
       
   254 		self.log.debug "  Load on %s: %0.2f" % [ host, load5 ]
       
   255 
       
   256 		if load5 >= self.load_error_at
       
   257 			self.results[ host ] = {
       
   258 				error: "Load has exceeded %0.2f over a 5 minute average" % [ self.load_error_at ],
       
   259 				load5: load5
       
   260 			}
       
   261 		else
       
   262 			self.results[ host ] = { load5: load5 }
       
   263 		end
       
   264 	end
       
   265 
       
   266 
       
   267 	### Collect available memory information for +host+ from an existing
       
   268 	### (and open) +snmp+ connection.
       
   269 	###
       
   270 	def gather_free_memory( snmp, host )
       
   271 		self.log.debug "Getting available memory for: %s" % [ host ]
       
   272 		mem_avail = snmp.get( SNMP::ObjectId.new( MEMORY[:mem_avail] ) ).varbind_list.first.value.to_f
       
   273 		self.log.debug "  Available memory on %s: %0.2f" % [ host, mem_avail ]
       
   274 
       
   275 		if mem_avail <= self.mem_error_at
       
   276 			self.results[ host ] = {
       
   277 				error: "Available memory is under %0.1fMB" % [ self.mem_error_at.to_f / 1024 ],
       
   278 				available_memory: mem_avail
       
   279 			}
       
   280 		else
       
   281 			self.results[ host ] = { available_memory: mem_avail }
       
   282 		end
       
   283 	end
       
   284 
       
   285 
       
   286 	### Collect used swap information for +host+ from an existing (and
       
   287 	### open) +snmp+ connection.
       
   288 	###
       
   289 	def gather_swap( snmp, host )
       
   290 		self.log.debug "Getting used swap for: %s" % [ host ]
       
   291 
       
   292 		swap_total = snmp.get( SNMP::ObjectId.new(MEMORY[:swap_total]) ).varbind_list.first.value.to_f
       
   293 		swap_avail = snmp.get( SNMP::ObjectId.new(MEMORY[:swap_avail]) ).varbind_list.first.value.to_f
       
   294 		swap_used  = ( "%0.2f" % ((swap_avail / swap_total.to_f * 100 ) - 100).abs ).to_f
       
   295 		self.log.debug "  Swap in use on %s: %0.2f" % [ host, swap_used ]
       
   296 
       
   297 		if swap_used >= self.swap_error_at
       
   298 			self.results[ host ] = {
       
   299 				error: "%0.2f%% swap in use" % [ swap_used ],
       
   300 				swap_used: swap_used
       
   301 			}
       
   302 		else
       
   303 			self.results[ host ] = { swap_used: swap_used }
       
   304 		end
       
   305 	end
       
   306 
       
   307 
       
   308 	### Collect mount point usage for +host+ from an existing (and open)
       
   309 	#### +snmp+ connection.
       
   310 	###
       
   311 	def gather_disks( snmp, host )
       
   312 		self.log.debug "Getting disk information for %s" % [ host ]
       
   313 		errors  = []
       
   314 		results = {}
       
   315 		mounts  = self.get_disk_percentages( snmp )
       
   316 
       
   317 		mounts.each_pair do |path, percentage|
       
   318 			if percentage >= self.storage_error_at
       
   319 				errors << "Mount %s at %d%% capacity" % [ path, percentage ]
       
   320 			end
       
   321 		end
       
   322 
       
   323 		results[ :mounts ] = mounts
       
   324 		results[ :error ] = errors.join( ', ' ) unless errors.empty?
       
   325 
       
   326 		self.results[ host ] = results
       
   327 	end
       
   328 
       
   329 
       
   330 	### Collect running processes on +host+ from an existing (and open)
       
   331 	#### +snmp+ connection.
       
   332 	###
       
   333 	def gather_processlist( snmp, host )
       
   334 		self.log.debug "Getting running process list for %s" % [ host ]
       
   335 		procs = []
       
   336 
       
   337 		snmp.walk([ PROCESS[:list], PROCESS[:args] ]) do |list|
       
   338 			process = list[0].value.to_s
       
   339 			args    = list[1].value.to_s
       
   340 			procs << "%s %s " % [ process, args ]
       
   341 		end
       
   342 
       
   343 		# Check against the running stuff, setting an error if
       
   344 		# one isn't found.
       
   345 		#
       
   346 		errors = []
       
   347 		Array( self.processes ).each do |process|
       
   348 			process_r = Regexp.new( process )
       
   349 			found = procs.find{|p| p.match(process_r) }
       
   350 			errors << "Process '%s' is not running" % [ process, host ] unless found
       
   351 		end
       
   352 
       
   353 		self.log.debug "  %d running processes" % [ procs.length ]
       
   354 		if errors.empty?
       
   355 			self.results[ host ] = {}
       
   356 		else
       
   357 			self.results[ host ] = { error: errors.join( ', ' ) }
       
   358 		end
       
   359 	end
       
   360 
       
   361 
       
   362 	### Given a SNMP object, return a hash of:
       
   363 	###
       
   364 	###    device path => percentage full
       
   365 	###
       
   366 	def get_disk_percentages( snmp )
       
   367 
       
   368 		# Does this look like a windows system, or a net-snmp based one?
       
   369 		system_type = snmp.get( SNMP::ObjectId.new( IDENTIFICATION_OID ) ).varbind_list.first.value
       
   370 		disks = {}
       
   371 
       
   372 		# Windows has it's own MIBs.
       
   373 		#
       
   374 		if system_type =~ /windows/i
       
   375 			snmp.walk( STORAGE_WINDOWS ) do |list|
       
   376 				next unless list[0].value.to_s == WINDOWS_DEVICE
       
   377 				disks[ list[1].value.to_s ] = ( list[3].value.to_f / list[2].value.to_f ) * 100
       
   378 			end
       
   379 			return disks
       
   380 		end
       
   381 
       
   382 		# Everything else.
       
   383 		#
       
   384 		snmp.walk( STORAGE_NET_SNMP ) do |list|
       
   385 			mount   = list[0].value.to_s
       
   386 			next if mount == 'noSuchInstance'
       
   387 
       
   388 			next if list[2].value.to_s == 'noSuchInstance'
       
   389 			used    = list[2].value.to_i
       
   390 
       
   391 			typeoid = list[1].value.join('.').to_s
       
   392 			next if typeoid =~ STORAGE_IGNORE
       
   393 			next if mount =~ /\/(?:dev|proc)$/
       
   394 
       
   395 			self.log.debug "   %s -> %s -> %s" % [ mount, typeoid, used ]
       
   396 			disks[ mount ] = used
       
   397 		end
       
   398 
       
   399 		return disks
       
   400 	end
       
   401 end # class Arborist::Monitor::SNMP
       
   402