lib/arborist/monitor/snmp.rb
changeset 1 8446f55f7e58
parent 0 8547a1ce445e
child 3 d46ca2b52efe
--- a/lib/arborist/monitor/snmp.rb	Thu Jun 02 11:50:54 2016 -0700
+++ b/lib/arborist/monitor/snmp.rb	Mon Sep 05 20:58:54 2016 -0700
@@ -23,7 +23,7 @@
 	log_to :arborist
 
 	# The version of this library.
-	VERSION = '0.1.0'
+	VERSION = '0.2.0'
 
 	# "Modes" that this monitor understands.
 	VALID_MODES = %i[ disk load memory swap process ]
@@ -79,7 +79,7 @@
 	}
 
 
-	# Defaults for instances of this monitor
+	# Global defaults for instances of this monitor
 	#
 	DEFAULT_OPTIONS = {
 		timeout:          2,
@@ -87,6 +87,8 @@
 		community:        'public',
 		port:             161,
 		storage_error_at: 95,    # in percent full
+		storage_include:  [],    # if non-empty, only these paths are included in checks
+		storage_exclude:  [],    # paths to exclude from checks
 		load_error_at:    7,
 		swap_error_at:    25,    # in percent remaining
 		mem_error_at:     51200, # in kilobytes
@@ -107,9 +109,12 @@
 	###
 	def initialize( options=DEFAULT_OPTIONS )
 		options = DEFAULT_OPTIONS.merge( options || {} )
+		%i[ storage_include storage_exclude processes ].each do |opt|
+			options[ opt ] = Array( options[opt] )
+		end
 
 		options.each do |name, value|
-			self.public_send( "#{name}=", value )
+			self.public_send( "#{name.to_s}=", value )
 		end
 	end
 
@@ -139,6 +144,12 @@
 	# Set an error if mount points are above this percentage.
 	attr_accessor :storage_error_at
 
+	# Only check these specific mount points.
+	attr_accessor :storage_include
+
+	# Exclude these mount points (array of paths) from checks.
+	attr_accessor :storage_exclude
+
 	# Set an error if the 5 minute load average exceeds this.
 	attr_accessor :load_error_at
 
@@ -177,12 +188,14 @@
 			return {}
 		end
 
-		# Create mapping of addresses back to node identifiers.
+		# Create mapping of addresses back to node identifiers,
+		# and retain any custom configs per node.
 		#
-		@identifiers = nodes.each_with_object({}) do |(identifier, props), hash|
+		@identifiers = {}
+		nodes.each_pair do |(identifier, props)|
 			next unless props.key?( 'addresses' )
 			address = props[ 'addresses' ].first
-			hash[ address ] = identifier
+			self.identifiers[ address ] = [ identifier, props['config'] ]
 		end
 
 		# Perform the work!
@@ -191,12 +204,14 @@
 		self.identifiers.keys.each do |host|
 			thr = Thread.new do
 				Thread.current.abort_on_exception = true
+
+				config = self.identifiers[host].last || {}
 				opts = {
 					host:      host,
-					port:      self.port,
-					community: self.community,
-					timeout:   self.timeout,
-					retries:   self.retries
+					port:      config[ 'port' ] || self.port,
+					community: config[ 'community' ] || self.community,
+					timeout:   config[ 'timeout' ] || self.timeout,
+					retries:   config[ 'retries' ] || self.retries
 				}
 
 				begin
@@ -233,7 +248,7 @@
 		# Map everything back to identifier -> attribute(s), and send to the manager.
 		#
 		reply = self.results.each_with_object({}) do |(address, results), hash|
-			identifier = self.identifiers[ address ] or next
+			identifier = self.identifiers[ address ].first
 			hash[ identifier ] = results
 		end
 		self.log.debug "Sending to manager: %p" % [ reply ]
@@ -253,9 +268,11 @@
 		load5 = snmp.get( SNMP::ObjectId.new( LOAD[:five_min] ) ).varbind_list.first.value.to_f
 		self.log.debug "  Load on %s: %0.2f" % [ host, load5 ]
 
-		if load5 >= self.load_error_at
+		config = self.identifiers[ host ].last || {}
+		error_at = config[ 'load_error_at' ] || self.load_error_at
+		if load5 >= error_at
 			self.results[ host ] = {
-				error: "Load has exceeded %0.2f over a 5 minute average" % [ self.load_error_at ],
+				error: "Load has exceeded %0.2f over a 5 minute average" % [ error_at ],
 				load5: load5
 			}
 		else
@@ -272,9 +289,11 @@
 		mem_avail = snmp.get( SNMP::ObjectId.new( MEMORY[:mem_avail] ) ).varbind_list.first.value.to_f
 		self.log.debug "  Available memory on %s: %0.2f" % [ host, mem_avail ]
 
-		if mem_avail <= self.mem_error_at
+		config = self.identifiers[ host ].last || {}
+		error_at = config['mem_error_at'] || self.mem_error_at
+		if mem_avail <= error_at
 			self.results[ host ] = {
-				error: "Available memory is under %0.1fMB" % [ self.mem_error_at.to_f / 1024 ],
+				error: "Available memory is under %0.1fMB" % [ error_at.to_f / 1024 ],
 				available_memory: mem_avail
 			}
 		else
@@ -294,7 +313,8 @@
 		swap_used  = ( "%0.2f" % ((swap_avail / swap_total.to_f * 100 ) - 100).abs ).to_f
 		self.log.debug "  Swap in use on %s: %0.2f" % [ host, swap_used ]
 
-		if swap_used >= self.swap_error_at
+		config = self.identifiers[ host ].last || {}
+		if swap_used >= ( config['swap_error_at'] || self.swap_error_at )
 			self.results[ host ] = {
 				error: "%0.2f%% swap in use" % [ swap_used ],
 				swap_used: swap_used
@@ -313,10 +333,16 @@
 		errors  = []
 		results = {}
 		mounts  = self.get_disk_percentages( snmp )
+		config  = self.identifiers[ host ].last || {}
+
+		includes = config[ 'storage_include' ] || self.storage_include
+		excludes = config[ 'storage_exclude' ] || self.storage_exclude
 
 		mounts.each_pair do |path, percentage|
-			if percentage >= self.storage_error_at
-				errors << "Mount %s at %d%% capacity" % [ path, percentage ]
+			next if excludes.include?( path )
+			next if ! includes.empty? && ! includes.include?( path )
+			if percentage >= ( config[ 'storage_error_at' ] || self.storage_error_at )
+				errors << "%s at %d%% capacity" % [ path, percentage ]
 			end
 		end
 
@@ -332,7 +358,9 @@
 	###
 	def gather_processlist( snmp, host )
 		self.log.debug "Getting running process list for %s" % [ host ]
-		procs = []
+		config = self.identifiers[ host ].last || {}
+		procs  = []
+		errors = []
 
 		snmp.walk([ PROCESS[:list], PROCESS[:args] ]) do |list|
 			process = list[0].value.to_s
@@ -343,8 +371,7 @@
 		# Check against the running stuff, setting an error if
 		# one isn't found.
 		#
-		errors = []
-		Array( self.processes ).each do |process|
+		Array( config['processes'] || self.processes ).each do |process|
 			process_r = Regexp.new( process )
 			found = procs.find{|p| p.match(process_r) }
 			errors << "Process '%s' is not running" % [ process, host ] unless found
@@ -382,17 +409,18 @@
 		# Everything else.
 		#
 		snmp.walk( STORAGE_NET_SNMP ) do |list|
-			mount   = list[0].value.to_s
+			mount = list[0].value.to_s
 			next if mount == 'noSuchInstance'
 
 			next if list[2].value.to_s == 'noSuchInstance'
-			used    = list[2].value.to_i
+			used = list[2].value.to_i
 
-			typeoid = list[1].value.join('.').to_s
-			next if typeoid =~ STORAGE_IGNORE
+			unless list[1].value.to_s == 'noSuchInstance'
+				typeoid = list[1].value.join('.').to_s
+				next if typeoid =~ STORAGE_IGNORE
+			end
 			next if mount =~ /\/(?:dev|proc)$/
 
-			self.log.debug "   %s -> %s -> %s" % [ mount, typeoid, used ]
 			disks[ mount ] = used
 		end