lib/arborist/monitor/snmp.rb
changeset 1 8446f55f7e58
parent 0 8547a1ce445e
child 3 d46ca2b52efe
equal deleted inserted replaced
0:8547a1ce445e 1:8446f55f7e58
    21 class Arborist::Monitor::SNMP
    21 class Arborist::Monitor::SNMP
    22 	extend Loggability
    22 	extend Loggability
    23 	log_to :arborist
    23 	log_to :arborist
    24 
    24 
    25 	# The version of this library.
    25 	# The version of this library.
    26 	VERSION = '0.1.0'
    26 	VERSION = '0.2.0'
    27 
    27 
    28 	# "Modes" that this monitor understands.
    28 	# "Modes" that this monitor understands.
    29 	VALID_MODES = %i[ disk load memory swap process ]
    29 	VALID_MODES = %i[ disk load memory swap process ]
    30 
    30 
    31 	# The OID that returns the system environment.
    31 	# The OID that returns the system environment.
    77 		 list: '1.3.6.1.2.1.25.4.2.1.4',
    77 		 list: '1.3.6.1.2.1.25.4.2.1.4',
    78 		 args: '1.3.6.1.2.1.25.4.2.1.5'
    78 		 args: '1.3.6.1.2.1.25.4.2.1.5'
    79 	}
    79 	}
    80 
    80 
    81 
    81 
    82 	# Defaults for instances of this monitor
    82 	# Global defaults for instances of this monitor
    83 	#
    83 	#
    84 	DEFAULT_OPTIONS = {
    84 	DEFAULT_OPTIONS = {
    85 		timeout:          2,
    85 		timeout:          2,
    86 		retries:          1,
    86 		retries:          1,
    87 		community:        'public',
    87 		community:        'public',
    88 		port:             161,
    88 		port:             161,
    89 		storage_error_at: 95,    # in percent full
    89 		storage_error_at: 95,    # in percent full
       
    90 		storage_include:  [],    # if non-empty, only these paths are included in checks
       
    91 		storage_exclude:  [],    # paths to exclude from checks
    90 		load_error_at:    7,
    92 		load_error_at:    7,
    91 		swap_error_at:    25,    # in percent used
    93 		swap_error_at:    25,    # in percent used
    92 		mem_error_at:     51200, # in kilobytes
    94 		mem_error_at:     51200, # in kilobytes
    93 		processes:        []     # list of procs to match
    95 		processes:        []     # list of procs to match
    94 	}
    96 	}
   105 
   107 
   106 	### Create a new instance of this monitor.
   108 	### Create a new instance of this monitor.
   107 	###
   109 	###
   108 	def initialize( options=DEFAULT_OPTIONS )
   110 	def initialize( options=DEFAULT_OPTIONS )
   109 		options = DEFAULT_OPTIONS.merge( options || {} )
   111 		options = DEFAULT_OPTIONS.merge( options || {} )
       
   112 		%i[ storage_include storage_exclude processes ].each do |opt|
       
   113 			options[ opt ] = Array( options[opt] )
       
   114 		end
   110 
   115 
   111 		options.each do |name, value|
   116 		options.each do |name, value|
   112 			self.public_send( "#{name}=", value )
   117 			self.public_send( "#{name.to_s}=", value )
   113 		end
   118 		end
   114 	end
   119 	end
   115 
   120 
   116 
   121 
   117 	# The mode (section) that this SNMP instance should check.
   122 	# The mode (section) that this SNMP instance should check.
   136 	# The community string to connect with.
   141 	# The community string to connect with.
   137 	attr_accessor :community
   142 	attr_accessor :community
   138 
   143 
   139 	# Set an error if mount points are above this percentage.
   144 	# Set an error if mount points are above this percentage.
   140 	attr_accessor :storage_error_at
   145 	attr_accessor :storage_error_at
       
   146 
       
   147 	# Only check these specific mount points.
       
   148 	attr_accessor :storage_include
       
   149 
       
   150 	# Exclude these mount points (array of paths) from checks.
       
   151 	attr_accessor :storage_exclude
   141 
   152 
   142 	# Set an error if the 5 minute load average exceeds this.
   153 	# Set an error if the 5 minute load average exceeds this.
   143 	attr_accessor :load_error_at
   154 	attr_accessor :load_error_at
   144 
   155 
   145 	# Set an error if used swap exceeds this percentage.
   156 	# Set an error if used swap exceeds this percentage.
   175 		unless self.mode
   186 		unless self.mode
   176 			self.log.error "You must set the 'mode' for the SNMP monitor. (%s)" % [ VALID_MODES.join( ', ' ) ]
   187 			self.log.error "You must set the 'mode' for the SNMP monitor. (%s)" % [ VALID_MODES.join( ', ' ) ]
   177 			return {}
   188 			return {}
   178 		end
   189 		end
   179 
   190 
   180 		# Create mapping of addresses back to node identifiers.
   191 		# Create mapping of addresses back to node identifiers,
   181 		#
   192 		# and retain any custom configs per node.
   182 		@identifiers = nodes.each_with_object({}) do |(identifier, props), hash|
   193 		#
       
   194 		@identifiers = {}
       
   195 		nodes.each_pair do |(identifier, props)|
   183 			next unless props.key?( 'addresses' )
   196 			next unless props.key?( 'addresses' )
   184 			address = props[ 'addresses' ].first
   197 			address = props[ 'addresses' ].first
   185 			hash[ address ] = identifier
   198 			self.identifiers[ address ] = [ identifier, props['config'] ]
   186 		end
   199 		end
   187 
   200 
   188 		# Perform the work!
   201 		# Perform the work!
   189 		#
   202 		#
   190 		threads = []
   203 		threads = []
   191 		self.identifiers.keys.each do |host|
   204 		self.identifiers.keys.each do |host|
   192 			thr = Thread.new do
   205 			thr = Thread.new do
   193 				Thread.current.abort_on_exception = true
   206 				Thread.current.abort_on_exception = true
       
   207 
       
   208 				config = self.identifiers[host].last || {}
   194 				opts = {
   209 				opts = {
   195 					host:      host,
   210 					host:      host,
   196 					port:      self.port,
   211 					port:      config[ 'port' ] || self.port,
   197 					community: self.community,
   212 					community: config[ 'community' ] || self.community,
   198 					timeout:   self.timeout,
   213 					timeout:   config[ 'timeout' ] || self.timeout,
   199 					retries:   self.retries
   214 					retries:   config[ 'retries' ] || self.retries
   200 				}
   215 				}
   201 
   216 
   202 				begin
   217 				begin
   203 					SNMP::Manager.open( opts ) do |snmp|
   218 					SNMP::Manager.open( opts ) do |snmp|
   204 						case self.mode
   219 						case self.mode
   231 		threads.map( &:join )
   246 		threads.map( &:join )
   232 
   247 
   233 		# Map everything back to identifier -> attribute(s), and send to the manager.
   248 		# Map everything back to identifier -> attribute(s), and send to the manager.
   234 		#
   249 		#
   235 		reply = self.results.each_with_object({}) do |(address, results), hash|
   250 		reply = self.results.each_with_object({}) do |(address, results), hash|
   236 			identifier = self.identifiers[ address ] or next
   251 			identifier = self.identifiers[ address ].first
   237 			hash[ identifier ] = results
   252 			hash[ identifier ] = results
   238 		end
   253 		end
   239 		self.log.debug "Sending to manager: %p" % [ reply ]
   254 		self.log.debug "Sending to manager: %p" % [ reply ]
   240 		return reply
   255 		return reply
   241 	end
   256 	end
   251 	def gather_load( snmp, host )
   266 	def gather_load( snmp, host )
   252 		self.log.debug "Getting system load for: %s" % [ host ]
   267 		self.log.debug "Getting system load for: %s" % [ host ]
   253 		load5 = snmp.get( SNMP::ObjectId.new( LOAD[:five_min] ) ).varbind_list.first.value.to_f
   268 		load5 = snmp.get( SNMP::ObjectId.new( LOAD[:five_min] ) ).varbind_list.first.value.to_f
   254 		self.log.debug "  Load on %s: %0.2f" % [ host, load5 ]
   269 		self.log.debug "  Load on %s: %0.2f" % [ host, load5 ]
   255 
   270 
   256 		if load5 >= self.load_error_at
   271 		config = self.identifiers[ host ].last || {}
       
   272 		error_at = config[ 'load_error_at' ] || self.load_error_at
       
   273 		if load5 >= error_at
   257 			self.results[ host ] = {
   274 			self.results[ host ] = {
   258 				error: "Load has exceeded %0.2f over a 5 minute average" % [ self.load_error_at ],
   275 				error: "Load has exceeded %0.2f over a 5 minute average" % [ error_at ],
   259 				load5: load5
   276 				load5: load5
   260 			}
   277 			}
   261 		else
   278 		else
   262 			self.results[ host ] = { load5: load5 }
   279 			self.results[ host ] = { load5: load5 }
   263 		end
   280 		end
   270 	def gather_free_memory( snmp, host )
   287 	def gather_free_memory( snmp, host )
   271 		self.log.debug "Getting available memory for: %s" % [ host ]
   288 		self.log.debug "Getting available memory for: %s" % [ host ]
   272 		mem_avail = snmp.get( SNMP::ObjectId.new( MEMORY[:mem_avail] ) ).varbind_list.first.value.to_f
   289 		mem_avail = snmp.get( SNMP::ObjectId.new( MEMORY[:mem_avail] ) ).varbind_list.first.value.to_f
   273 		self.log.debug "  Available memory on %s: %0.2f" % [ host, mem_avail ]
   290 		self.log.debug "  Available memory on %s: %0.2f" % [ host, mem_avail ]
   274 
   291 
   275 		if mem_avail <= self.mem_error_at
   292 		config = self.identifiers[ host ].last || {}
       
   293 		error_at = config['mem_error_at'] || self.mem_error_at
       
   294 		if mem_avail <= error_at
   276 			self.results[ host ] = {
   295 			self.results[ host ] = {
   277 				error: "Available memory is under %0.1fMB" % [ self.mem_error_at.to_f / 1024 ],
   296 				error: "Available memory is under %0.1fMB" % [ error_at.to_f / 1024 ],
   278 				available_memory: mem_avail
   297 				available_memory: mem_avail
   279 			}
   298 			}
   280 		else
   299 		else
   281 			self.results[ host ] = { available_memory: mem_avail }
   300 			self.results[ host ] = { available_memory: mem_avail }
   282 		end
   301 		end
   292 		swap_total = snmp.get( SNMP::ObjectId.new(MEMORY[:swap_total]) ).varbind_list.first.value.to_f
   311 		swap_total = snmp.get( SNMP::ObjectId.new(MEMORY[:swap_total]) ).varbind_list.first.value.to_f
   293 		swap_avail = snmp.get( SNMP::ObjectId.new(MEMORY[:swap_avail]) ).varbind_list.first.value.to_f
   312 		swap_avail = snmp.get( SNMP::ObjectId.new(MEMORY[:swap_avail]) ).varbind_list.first.value.to_f
   294 		swap_used  = ( "%0.2f" % ((swap_avail / swap_total.to_f * 100 ) - 100).abs ).to_f
   313 		swap_used  = ( "%0.2f" % ((swap_avail / swap_total.to_f * 100 ) - 100).abs ).to_f
   295 		self.log.debug "  Swap in use on %s: %0.2f" % [ host, swap_used ]
   314 		self.log.debug "  Swap in use on %s: %0.2f" % [ host, swap_used ]
   296 
   315 
   297 		if swap_used >= self.swap_error_at
   316 		config = self.identifiers[ host ].last || {}
       
   317 		if swap_used >= ( config['swap_error_at'] || self.swap_error_at )
   298 			self.results[ host ] = {
   318 			self.results[ host ] = {
   299 				error: "%0.2f%% swap in use" % [ swap_used ],
   319 				error: "%0.2f%% swap in use" % [ swap_used ],
   300 				swap_used: swap_used
   320 				swap_used: swap_used
   301 			}
   321 			}
   302 		else
   322 		else
   311 	def gather_disks( snmp, host )
   331 	def gather_disks( snmp, host )
   312 		self.log.debug "Getting disk information for %s" % [ host ]
   332 		self.log.debug "Getting disk information for %s" % [ host ]
   313 		errors  = []
   333 		errors  = []
   314 		results = {}
   334 		results = {}
   315 		mounts  = self.get_disk_percentages( snmp )
   335 		mounts  = self.get_disk_percentages( snmp )
       
   336 		config  = self.identifiers[ host ].last || {}
       
   337 
       
   338 		includes = config[ 'storage_include' ] || self.storage_include
       
   339 		excludes = config[ 'storage_exclude' ] || self.storage_exclude
   316 
   340 
   317 		mounts.each_pair do |path, percentage|
   341 		mounts.each_pair do |path, percentage|
   318 			if percentage >= self.storage_error_at
   342 			next if excludes.include?( path )
   319 				errors << "Mount %s at %d%% capacity" % [ path, percentage ]
   343 			next if ! includes.empty? && ! includes.include?( path )
       
   344 			if percentage >= ( config[ 'storage_error_at' ] || self.storage_error_at )
       
   345 				errors << "%s at %d%% capacity" % [ path, percentage ]
   320 			end
   346 			end
   321 		end
   347 		end
   322 
   348 
   323 		results[ :mounts ] = mounts
   349 		results[ :mounts ] = mounts
   324 		results[ :error ] = errors.join( ', ' ) unless errors.empty?
   350 		results[ :error ] = errors.join( ', ' ) unless errors.empty?
   330 	### Collect running processes on +host+ from an existing (and open)
   356 	### Collect running processes on +host+ from an existing (and open)
   331 	### +snmp+ connection.
   357 	### +snmp+ connection.
   332 	###
   358 	###
   333 	def gather_processlist( snmp, host )
   359 	def gather_processlist( snmp, host )
   334 		self.log.debug "Getting running process list for %s" % [ host ]
   360 		self.log.debug "Getting running process list for %s" % [ host ]
   335 		procs = []
   361 		config = self.identifiers[ host ].last || {}
       
   362 		procs  = []
       
   363 		errors = []
   336 
   364 
   337 		snmp.walk([ PROCESS[:list], PROCESS[:args] ]) do |list|
   365 		snmp.walk([ PROCESS[:list], PROCESS[:args] ]) do |list|
   338 			process = list[0].value.to_s
   366 			process = list[0].value.to_s
   339 			args    = list[1].value.to_s
   367 			args    = list[1].value.to_s
   340 			procs << "%s %s " % [ process, args ]
   368 			procs << "%s %s " % [ process, args ]
   341 		end
   369 		end
   342 
   370 
   343 		# Check against the running stuff, setting an error if
   371 		# Check against the running stuff, setting an error if
   344 		# one isn't found.
   372 		# one isn't found.
   345 		#
   373 		#
   346 		errors = []
   374 		Array( config['processes'] || self.processes ).each do |process|
   347 		Array( self.processes ).each do |process|
       
   348 			process_r = Regexp.new( process )
   375 			process_r = Regexp.new( process )
   349 			found = procs.find{|p| p.match(process_r) }
   376 			found = procs.find{|p| p.match(process_r) }
   350 			errors << "Process '%s' is not running" % [ process, host ] unless found
   377 			errors << "Process '%s' is not running" % [ process, host ] unless found
   351 		end
   378 		end
   352 
   379 
   380 		end
   407 		end
   381 
   408 
   382 		# Everything else.
   409 		# Everything else.
   383 		#
   410 		#
   384 		snmp.walk( STORAGE_NET_SNMP ) do |list|
   411 		snmp.walk( STORAGE_NET_SNMP ) do |list|
   385 			mount   = list[0].value.to_s
   412 			mount = list[0].value.to_s
   386 			next if mount == 'noSuchInstance'
   413 			next if mount == 'noSuchInstance'
   387 
   414 
   388 			next if list[2].value.to_s == 'noSuchInstance'
   415 			next if list[2].value.to_s == 'noSuchInstance'
   389 			used    = list[2].value.to_i
   416 			used = list[2].value.to_i
   390 
   417 
   391 			typeoid = list[1].value.join('.').to_s
   418 			unless list[1].value.to_s == 'noSuchInstance'
   392 			next if typeoid =~ STORAGE_IGNORE
   419 				typeoid = list[1].value.join('.').to_s
       
   420 				next if typeoid =~ STORAGE_IGNORE
       
   421 			end
   393 			next if mount =~ /\/(?:dev|proc)$/
   422 			next if mount =~ /\/(?:dev|proc)$/
   394 
   423 
   395 			self.log.debug "   %s -> %s -> %s" % [ mount, typeoid, used ]
       
   396 			disks[ mount ] = used
   424 			disks[ mount ] = used
   397 		end
   425 		end
   398 
   426 
   399 		return disks
   427 		return disks
   400 	end
   428 	end