lib/arborist/monitor/snmp.rb
changeset 4 e6eb11b1e00d
parent 3 d46ca2b52efe
child 7 4548e58c8c66
equal deleted inserted replaced
3:d46ca2b52efe 4:e6eb11b1e00d
    14 require 'arborist/monitor' unless defined?( Arborist::Monitor )
    14 require 'arborist/monitor' unless defined?( Arborist::Monitor )
    15 require 'snmp'
    15 require 'snmp'
    16 
    16 
    17 using Arborist::TimeRefinements
    17 using Arborist::TimeRefinements
    18 
    18 
    19 # SNMP specific monitors and monitor logic.
    19 # Shared SNMP monitor logic.
    20 #
    20 #
    21 class Arborist::Monitor::SNMP
    21 module Arborist::Monitor::SNMP
    22 	extend Loggability
    22 	extend Loggability
    23 	log_to :arborist
    23 	log_to :arborist
    24 
    24 
    25 	# The version of this library.
    25 	# The version of this library.
    26 	VERSION = '0.2.1'
    26 	VERSION = '0.3.0'
    27 
       
    28 	# "Modes" that this monitor understands.
       
    29 	VALID_MODES = %i[ disk load memory swap process ]
       
    30 
       
    31 	# The OID that returns the system environment.
       
    32 	IDENTIFICATION_OID = '1.3.6.1.2.1.1.1.0'
       
    33 
       
    34 	# For net-snmp systems, ignore mount types that match
       
    35 	# this regular expression.  This includes null/union mounts
       
    36 	# and NFS, currently.
       
    37 	STORAGE_IGNORE = %r{25.3.9.(?:2|14)$}
       
    38 
       
    39 	# The OID that matches a local windows hard disk.  Anything else
       
    40 	# is a remote (SMB) mount.
       
    41 	WINDOWS_DEVICE = '1.3.6.1.2.1.25.2.1.4'
       
    42 
       
    43 	# OIDS required to pull disk information from net-snmp.
       
    44 	#
       
    45 	STORAGE_NET_SNMP = [
       
    46 		'1.3.6.1.4.1.2021.9.1.2', # paths
       
    47 		'1.3.6.1.2.1.25.3.8.1.4', # types
       
    48 		'1.3.6.1.4.1.2021.9.1.9'  # percents
       
    49 	]
       
    50 
       
    51 	# OIDS required to pull disk information from Windows.
       
    52 	#
       
    53 	STORAGE_WINDOWS = [
       
    54 		'1.3.6.1.2.1.25.2.3.1.2', # types
       
    55 		'1.3.6.1.2.1.25.2.3.1.3', # paths
       
    56 		'1.3.6.1.2.1.25.2.3.1.5', # totalsize
       
    57 		'1.3.6.1.2.1.25.2.3.1.6'  # usedsize
       
    58 	]
       
    59 
       
    60 	# OIDS for discovering memory usage.
       
    61 	#
       
    62 	MEMORY = {
       
    63 		swap_total: '1.3.6.1.4.1.2021.4.3.0',
       
    64 		swap_avail: '1.3.6.1.4.1.2021.4.4.0',
       
    65 		mem_avail:  '1.3.6.1.4.1.2021.4.6.0'
       
    66 	}
       
    67 
       
    68 	# OIDS for discovering system load.
       
    69 	#
       
    70 	LOAD = {
       
    71 		five_min: '1.3.6.1.4.1.2021.10.1.3.2'
       
    72 	}
       
    73 
       
    74 	# OIDS for discovering running processes.
       
    75 	#
       
    76 	PROCESS = {
       
    77 		 list: '1.3.6.1.2.1.25.4.2.1.4',
       
    78 		 args: '1.3.6.1.2.1.25.4.2.1.5'
       
    79 	}
       
    80 
       
    81 
    27 
    82 	# Global defaults for instances of this monitor
    28 	# Global defaults for instances of this monitor
    83 	#
    29 	#
    84 	DEFAULT_OPTIONS = {
    30 	DEFAULT_OPTIONS = {
    85 		timeout:          2,
    31 		timeout:   2,
    86 		retries:          1,
    32 		retries:   1,
    87 		community:        'public',
    33 		community: 'public',
    88 		port:             161,
    34 		port:      161
    89 		storage_error_at: 95,    # in percent full
       
    90 		storage_include:  [],    # if non-empty, only these paths are included in checks
       
    91 		storage_exclude:  [],    # paths to exclude from checks
       
    92 		load_error_at:    7,
       
    93 		swap_error_at:    25,    # in percent remaining
       
    94 		mem_error_at:     51200, # in kilobytes
       
    95 		processes:        []     # list of procs to match
       
    96 	}
    35 	}
    97 
    36 
    98 
    37 
    99 	### This monitor is complex enough to require creating an instance from the caller.
    38 	### Connect to the SNMP daemon and yield.
   100 	### Provide a friendlier error message if the class was provided to exec() directly.
       
   101 	###
       
   102 	def self::run( nodes )
       
   103 		self.log.error "Please use %s via an instance." % [ self.name ]
       
   104 		return {}
       
   105 	end
       
   106 
       
   107 
       
   108 	### Create a new instance of this monitor.
       
   109 	###
       
   110 	def initialize( options=DEFAULT_OPTIONS )
       
   111 		options = DEFAULT_OPTIONS.merge( options || {} )
       
   112 		%i[ storage_include storage_exclude processes ].each do |opt|
       
   113 			options[ opt ] = Array( options[opt] )
       
   114 		end
       
   115 
       
   116 		options.each do |name, value|
       
   117 			self.public_send( "#{name.to_s}=", value )
       
   118 		end
       
   119 	end
       
   120 
       
   121 
       
   122 	# The mode (section) that this SNMP instance should check.
       
   123 	# Must be a +VALID_MODES+ mode.
       
   124 	attr_reader :mode
       
   125 
       
   126 	# Mapping of node addresses back to the node identifier.
       
   127 	attr_reader :identifiers
       
   128 
       
   129 	# The results from the SNMP daemons, keyed by address.
       
   130 	attr_reader :results
       
   131 
       
   132 	# A timeout in seconds if the SNMP server isn't responding.
       
   133 	attr_accessor :timeout
       
   134 
       
   135 	# Retry with the timeout this many times.  Defaults to 1.
       
   136 	attr_accessor :retries
       
   137 
       
   138 	# The SNMP UDP port, if running on non default.
       
   139 	attr_accessor :port
       
   140 
       
   141 	# The community string to connect with.
       
   142 	attr_accessor :community
       
   143 
       
   144 	# Set an error if mount points are above this percentage.
       
   145 	attr_accessor :storage_error_at
       
   146 
       
   147 	# Only check these specific mount points.
       
   148 	attr_accessor :storage_include
       
   149 
       
   150 	# Exclude these mount points (array of paths) from checks.
       
   151 	attr_accessor :storage_exclude
       
   152 
       
   153 	# Set an error if the 5 minute load average exceeds this.
       
   154 	attr_accessor :load_error_at
       
   155 
       
   156 	# Set an error if used swap exceeds this percentage.
       
   157 	attr_accessor :swap_error_at
       
   158 
       
   159 	# Set an error if available memory is below this many kilobytes.
       
   160 	attr_accessor :mem_error_at
       
   161 
       
   162 	# Set an error if processes in this array aren't running.
       
   163 	attr_accessor :processes
       
   164 
       
   165 
       
   166 	### Set the SNMP mode, after validation.
       
   167 	###
       
   168 	def mode=( mode )
       
   169 		unless VALID_MODES.include?( mode.to_sym )
       
   170 			self.log.error "Unknown SNMP mode: %s" % [ mode ]
       
   171 			return nil
       
   172 		end
       
   173 
       
   174 		@mode    = mode.to_sym
       
   175 		@results = {}
       
   176 	end
       
   177 
       
   178 
       
   179 	### Perform the monitoring checks.
       
   180 	###
    39 	###
   181 	def run( nodes )
    40 	def run( nodes )
   182 		self.log.debug "Got nodes to SNMP check: %p" % [ nodes ]
    41 		self.log.debug "Got nodes to SNMP check: %p" % [ nodes ]
   183 
    42 		opts = Arborist::Monitor::SNMP::DEFAULT_OPTIONS
   184 		# Sanity check.
       
   185 		#
       
   186 		unless self.mode
       
   187 			self.log.error "You must set the 'mode' for the SNMP monitor. (%s)" % [ VALID_MODES.join( ', ' ) ]
       
   188 			return {}
       
   189 		end
       
   190 
    43 
   191 		# Create mapping of addresses back to node identifiers,
    44 		# Create mapping of addresses back to node identifiers,
   192 		# and retain any custom configs per node.
    45 		# and retain any custom (overrides) config per node.
   193 		#
    46 		#
   194 		@identifiers = {}
    47 		@identifiers = {}
       
    48 		@results     = {}
       
    49 
   195 		nodes.each_pair do |(identifier, props)|
    50 		nodes.each_pair do |(identifier, props)|
   196 			next unless props.key?( 'addresses' )
    51 			next unless props.key?( 'addresses' )
   197 			address = props[ 'addresses' ].first
    52 			address = props[ 'addresses' ].first
   198 			self.identifiers[ address ] = [ identifier, props['config'] ]
    53 			@identifiers[ address ] = [ identifier, props['config'] ]
   199 		end
    54 		end
   200 
    55 
   201 		# Perform the work!
    56 		# Perform the work!
   202 		#
    57 		#
   203 		threads = []
    58 		threads = []
   204 		self.identifiers.keys.each do |host|
    59 		@identifiers.keys.each do |host|
   205 			thr = Thread.new do
    60 			thr = Thread.new do
   206 				Thread.current.abort_on_exception = true
    61 				Thread.current.abort_on_exception = true
   207 
    62 
   208 				config = self.identifiers[host].last || {}
    63 				config = @identifiers[host].last || {}
   209 				opts = {
    64 				opts = {
   210 					host:      host,
    65 					host:      host,
   211 					port:      config[ 'port' ] || self.port,
    66 					port:      config[ 'port' ]      || opts[ :port ],
   212 					community: config[ 'community' ] || self.community,
    67 					community: config[ 'community' ] || opts[ :community ],
   213 					timeout:   config[ 'timeout' ] || self.timeout,
    68 					timeout:   config[ 'timeout' ]   || opts[ :timeout ],
   214 					retries:   config[ 'retries' ] || self.retries
    69 					retries:   config[ 'retries' ]   || opts[ :retries ]
   215 				}
    70 				}
   216 
    71 
   217 				begin
    72 				begin
   218 					SNMP::Manager.open( opts ) do |snmp|
    73 					SNMP::Manager.open( opts ) do |snmp|
   219 						case self.mode
    74 						yield( snmp, host )
   220 						when :disk
       
   221 							self.gather_disks( snmp, host )
       
   222 						when :load
       
   223 							self.gather_load( snmp, host )
       
   224 						when :memory
       
   225 							self.gather_free_memory( snmp, host )
       
   226 						when :swap
       
   227 							self.gather_swap( snmp, host )
       
   228 						when :process
       
   229 							self.gather_processlist( snmp, host )
       
   230 						end
       
   231 					end
    75 					end
   232 				rescue SNMP::RequestTimeout
    76 				rescue SNMP::RequestTimeout
   233 					self.results[ host ] = {
    77 					@results[ host ] = {
   234 						error: "Host is not responding to SNMP requests."
    78 						error: "Host is not responding to SNMP requests."
   235 					}
    79 					}
   236 				rescue StandardError => err
    80 				rescue StandardError => err
   237 					self.results[ host ] = {
    81 					@results[ host ] = {
   238 						error: "Network is not accessible. (%s: %s)" % [ err.class.name, err.message ]
    82 						error: "Network is not accessible. (%s: %s)" % [ err.class.name, err.message ]
   239 					}
    83 					}
   240 				end
    84 				end
   241 			end
    85 			end
   242 			threads << thr
    86 			threads << thr
   245 		# Wait for thread completion
    89 		# Wait for thread completion
   246 		threads.map( &:join )
    90 		threads.map( &:join )
   247 
    91 
   248 		# Map everything back to identifier -> attribute(s), and send to the manager.
    92 		# Map everything back to identifier -> attribute(s), and send to the manager.
   249 		#
    93 		#
   250 		reply = self.results.each_with_object({}) do |(address, results), hash|
    94 		reply = @results.each_with_object({}) do |(address, results), hash|
   251 			identifier = self.identifiers[ address ] or next
    95 			identifier = @identifiers[ address ] or next
   252 			hash[ identifier.first ] = results
    96 			hash[ identifier.first ] = results
   253 		end
    97 		end
   254 		self.log.debug "Sending to manager: %p" % [ reply ]
    98 		self.log.debug "Sending to manager: %p" % [ reply ]
   255 		return reply
    99 		return reply
       
   100 
       
   101 	ensure
       
   102 		@identifiers = {}
       
   103 		@results     = {}
   256 	end
   104 	end
   257 
   105 
       
   106 end # Arborist::Monitor::SNMP
   258 
   107 
   259 	#########
   108 require 'arborist/monitor/snmp/disk'
   260 	protected
   109 require 'arborist/monitor/snmp/load'
   261 	#########
   110 require 'arborist/monitor/snmp/memory'
       
   111 require 'arborist/monitor/snmp/process'
       
   112 require 'arborist/monitor/snmp/swap'
   262 
   113 
   263 	### Collect the load information for +host+ from an existing
       
   264 	### (and open) +snmp+ connection.
       
   265 	###
       
   266 	def gather_load( snmp, host )
       
   267 		self.log.debug "Getting system load for: %s" % [ host ]
       
   268 		load5 = snmp.get( SNMP::ObjectId.new( LOAD[:five_min] ) ).varbind_list.first.value.to_f
       
   269 		self.log.debug "  Load on %s: %0.2f" % [ host, load5 ]
       
   270 
       
   271 		config = self.identifiers[ host ].last || {}
       
   272 		error_at = config[ 'load_error_at' ] || self.load_error_at
       
   273 		if load5 >= error_at
       
   274 			self.results[ host ] = {
       
   275 				error: "Load has exceeded %0.2f over a 5 minute average" % [ error_at ],
       
   276 				load5: load5
       
   277 			}
       
   278 		else
       
   279 			self.results[ host ] = { load5: load5 }
       
   280 		end
       
   281 	end
       
   282 
       
   283 
       
   284 	### Collect available memory information for +host+ from an existing
       
   285 	### (and open) +snmp+ connection.
       
   286 	###
       
   287 	def gather_free_memory( snmp, host )
       
   288 		self.log.debug "Getting available memory for: %s" % [ host ]
       
   289 		mem_avail = snmp.get( SNMP::ObjectId.new( MEMORY[:mem_avail] ) ).varbind_list.first.value.to_f
       
   290 		self.log.debug "  Available memory on %s: %0.2f" % [ host, mem_avail ]
       
   291 
       
   292 		config = self.identifiers[ host ].last || {}
       
   293 		error_at = config['mem_error_at'] || self.mem_error_at
       
   294 		if mem_avail <= error_at
       
   295 			self.results[ host ] = {
       
   296 				error: "Available memory is under %0.1fMB" % [ error_at.to_f / 1024 ],
       
   297 				available_memory: mem_avail
       
   298 			}
       
   299 		else
       
   300 			self.results[ host ] = { available_memory: mem_avail }
       
   301 		end
       
   302 	end
       
   303 
       
   304 
       
   305 	### Collect used swap information for +host+ from an existing (and
       
   306 	### open) +snmp+ connection.
       
   307 	###
       
   308 	def gather_swap( snmp, host )
       
   309 		self.log.debug "Getting used swap for: %s" % [ host ]
       
   310 
       
   311 		swap_total = snmp.get( SNMP::ObjectId.new(MEMORY[:swap_total]) ).varbind_list.first.value.to_f
       
   312 		swap_avail = snmp.get( SNMP::ObjectId.new(MEMORY[:swap_avail]) ).varbind_list.first.value.to_f
       
   313 		swap_used  = ( "%0.2f" % ((swap_avail / swap_total.to_f * 100 ) - 100).abs ).to_f
       
   314 		self.log.debug "  Swap in use on %s: %0.2f" % [ host, swap_used ]
       
   315 
       
   316 		config = self.identifiers[ host ].last || {}
       
   317 		if swap_used >= ( config['swap_error_at'] || self.swap_error_at )
       
   318 			self.results[ host ] = {
       
   319 				error: "%0.2f%% swap in use" % [ swap_used ],
       
   320 				swap_used: swap_used
       
   321 			}
       
   322 		else
       
   323 			self.results[ host ] = { swap_used: swap_used }
       
   324 		end
       
   325 	end
       
   326 
       
   327 
       
   328 	### Collect mount point usage for +host+ from an existing (and open)
       
   329 	### +snmp+ connection.
       
   330 	###
       
   331 	def gather_disks( snmp, host )
       
   332 		self.log.debug "Getting disk information for %s" % [ host ]
       
   333 		errors  = []
       
   334 		results = {}
       
   335 		mounts  = self.get_disk_percentages( snmp )
       
   336 		config  = self.identifiers[ host ].last || {}
       
   337 
       
   338 		includes = config[ 'storage_include' ] || self.storage_include
       
   339 		excludes = config[ 'storage_exclude' ] || self.storage_exclude
       
   340 
       
   341 		mounts.each_pair do |path, percentage|
       
   342 			next if excludes.include?( path )
       
   343 			next if ! includes.empty? && ! includes.include?( path )
       
   344 			if percentage >= ( config[ 'storage_error_at' ] || self.storage_error_at )
       
   345 				errors << "%s at %d%% capacity" % [ path, percentage ]
       
   346 			end
       
   347 		end
       
   348 
       
   349 		results[ :mounts ] = mounts
       
   350 		results[ :error ] = errors.join( ', ' ) unless errors.empty?
       
   351 
       
   352 		self.results[ host ] = results
       
   353 	end
       
   354 
       
   355 
       
   356 	### Collect running processes on +host+ from an existing (and open)
       
   357 	### +snmp+ connection.
       
   358 	###
       
   359 	def gather_processlist( snmp, host )
       
   360 		self.log.debug "Getting running process list for %s" % [ host ]
       
   361 		config = self.identifiers[ host ].last || {}
       
   362 		procs  = []
       
   363 		errors = []
       
   364 
       
   365 		snmp.walk([ PROCESS[:list], PROCESS[:args] ]) do |list|
       
   366 			process = list[0].value.to_s
       
   367 			args    = list[1].value.to_s
       
   368 			procs << "%s %s " % [ process, args ]
       
   369 		end
       
   370 
       
   371 		# Check against the running stuff, setting an error if
       
   372 		# one isn't found.
       
   373 		#
       
   374 		Array( config['processes'] || self.processes ).each do |process|
       
   375 			process_r = Regexp.new( process )
       
   376 			found = procs.find{|p| p.match(process_r) }
       
   377 			errors << "Process '%s' is not running" % [ process, host ] unless found
       
   378 		end
       
   379 
       
   380 		self.log.debug "  %d running processes" % [ procs.length ]
       
   381 		if errors.empty?
       
   382 			self.results[ host ] = {}
       
   383 		else
       
   384 			self.results[ host ] = { error: errors.join( ', ' ) }
       
   385 		end
       
   386 	end
       
   387 
       
   388 
       
   389 	### Given a SNMP object, return a hash of:
       
   390 	###
       
   391 	###    device path => percentage full
       
   392 	###
       
   393 	def get_disk_percentages( snmp )
       
   394 
       
   395 		# Does this look like a windows system, or a net-snmp based one?
       
   396 		system_type = snmp.get( SNMP::ObjectId.new( IDENTIFICATION_OID ) ).varbind_list.first.value
       
   397 		disks = {}
       
   398 
       
   399 		# Windows has its own MIBs.
       
   400 		#
       
   401 		if system_type =~ /windows/i
       
   402 			snmp.walk( STORAGE_WINDOWS ) do |list|
       
   403 				next unless list[0].value.to_s == WINDOWS_DEVICE
       
   404 				disks[ list[1].value.to_s ] = ( list[3].value.to_f / list[2].value.to_f ) * 100
       
   405 			end
       
   406 			return disks
       
   407 		end
       
   408 
       
   409 		# Everything else.
       
   410 		#
       
   411 		snmp.walk( STORAGE_NET_SNMP ) do |list|
       
   412 			mount = list[0].value.to_s
       
   413 			next if mount == 'noSuchInstance'
       
   414 
       
   415 			next if list[2].value.to_s == 'noSuchInstance'
       
   416 			used = list[2].value.to_i
       
   417 
       
   418 			unless list[1].value.to_s == 'noSuchInstance'
       
   419 				typeoid = list[1].value.join('.').to_s
       
   420 				next if typeoid =~ STORAGE_IGNORE
       
   421 			end
       
   422 			next if mount =~ /\/(?:dev|proc)$/
       
   423 
       
   424 			disks[ mount ] = used
       
   425 		end
       
   426 
       
   427 		return disks
       
   428 	end
       
   429 end # class Arborist::Monitor::SNMP
       
   430