21 class Arborist::Monitor::SNMP |
21 class Arborist::Monitor::SNMP |
22 extend Loggability |
22 extend Loggability |
23 log_to :arborist |
23 log_to :arborist |
24 |
24 |
25 # The version of this library. |
25 # The version of this library. |
26 VERSION = '0.1.0' |
26 VERSION = '0.2.0' |
27 |
27 |
28 # "Modes" that this monitor understands. |
28 # "Modes" that this monitor understands. |
29 VALID_MODES = %i[ disk load memory swap process ] |
29 VALID_MODES = %i[ disk load memory swap process ] |
30 |
30 |
31 # The OID that returns the system environment. |
31 # The OID that returns the system environment. |
77 list: '1.3.6.1.2.1.25.4.2.1.4', |
77 list: '1.3.6.1.2.1.25.4.2.1.4', |
78 args: '1.3.6.1.2.1.25.4.2.1.5' |
78 args: '1.3.6.1.2.1.25.4.2.1.5' |
79 } |
79 } |
80 |
80 |
81 |
81 |
82 # Defaults for instances of this monitor |
82 # Global defaults for instances of this monitor |
83 # |
83 # |
84 DEFAULT_OPTIONS = { |
84 DEFAULT_OPTIONS = { |
85 timeout: 2, |
85 timeout: 2, |
86 retries: 1, |
86 retries: 1, |
87 community: 'public', |
87 community: 'public', |
88 port: 161, |
88 port: 161, |
89 storage_error_at: 95, # in percent full |
89 storage_error_at: 95, # in percent full |
|
90 storage_include: [], # if non-empty, only these paths are included in checks |
|
91 storage_exclude: [], # paths to exclude from checks |
90 load_error_at: 7, |
92 load_error_at: 7, |
91 swap_error_at: 25, # in percent used |
93 swap_error_at: 25, # in percent used |
92 mem_error_at: 51200, # in kilobytes |
94 mem_error_at: 51200, # in kilobytes |
93 processes: [] # list of procs to match |
95 processes: [] # list of procs to match |
94 } |
96 } |
105 |
107 |
106 ### Create a new instance of this monitor. |
108 ### Create a new instance of this monitor. |
107 ### |
109 ### |
108 def initialize( options=DEFAULT_OPTIONS ) |
110 def initialize( options=DEFAULT_OPTIONS ) |
109 options = DEFAULT_OPTIONS.merge( options || {} ) |
111 options = DEFAULT_OPTIONS.merge( options || {} ) |
|
112 %i[ storage_include storage_exclude processes ].each do |opt| |
|
113 options[ opt ] = Array( options[opt] ) |
|
114 end |
110 |
115 |
111 options.each do |name, value| |
116 options.each do |name, value| |
112 self.public_send( "#{name}=", value ) |
117 self.public_send( "#{name.to_s}=", value ) |
113 end |
118 end |
114 end |
119 end |
115 |
120 |
116 |
121 |
117 # The mode (section) that this SNMP instance should check. |
122 # The mode (section) that this SNMP instance should check. |
136 # The community string to connect with. |
141 # The community string to connect with. |
137 attr_accessor :community |
142 attr_accessor :community |
138 |
143 |
139 # Set an error if mount points are above this percentage. |
144 # Set an error if mount points are above this percentage. |
140 attr_accessor :storage_error_at |
145 attr_accessor :storage_error_at |
|
146 |
|
147 # Only check these specific mount points. |
|
148 attr_accessor :storage_include |
|
149 |
|
150 # Exclude these mount points (array of paths) from checks. |
|
151 attr_accessor :storage_exclude |
141 |
152 |
142 # Set an error if the 5 minute load average exceeds this. |
153 # Set an error if the 5 minute load average exceeds this. |
143 attr_accessor :load_error_at |
154 attr_accessor :load_error_at |
144 |
155 |
145 # Set an error if used swap exceeds this percentage. |
156 # Set an error if used swap exceeds this percentage. |
175 unless self.mode |
186 unless self.mode |
176 self.log.error "You must set the 'mode' for the SNMP monitor. (%s)" % [ VALID_MODES.join( ', ' ) ] |
187 self.log.error "You must set the 'mode' for the SNMP monitor. (%s)" % [ VALID_MODES.join( ', ' ) ] |
177 return {} |
188 return {} |
178 end |
189 end |
179 |
190 |
180 # Create mapping of addresses back to node identifiers. |
191 # Create mapping of addresses back to node identifiers, |
181 # |
192 # and retain any custom configs per node. |
182 @identifiers = nodes.each_with_object({}) do |(identifier, props), hash| |
193 # |
|
194 @identifiers = {} |
|
195 nodes.each_pair do |(identifier, props)| |
183 next unless props.key?( 'addresses' ) |
196 next unless props.key?( 'addresses' ) |
184 address = props[ 'addresses' ].first |
197 address = props[ 'addresses' ].first |
185 hash[ address ] = identifier |
198 self.identifiers[ address ] = [ identifier, props['config'] ] |
186 end |
199 end |
187 |
200 |
188 # Perform the work! |
201 # Perform the work! |
189 # |
202 # |
190 threads = [] |
203 threads = [] |
191 self.identifiers.keys.each do |host| |
204 self.identifiers.keys.each do |host| |
192 thr = Thread.new do |
205 thr = Thread.new do |
193 Thread.current.abort_on_exception = true |
206 Thread.current.abort_on_exception = true |
|
207 |
|
208 config = self.identifiers[host].last || {} |
194 opts = { |
209 opts = { |
195 host: host, |
210 host: host, |
196 port: self.port, |
211 port: config[ 'port' ] || self.port, |
197 community: self.community, |
212 community: config[ 'community' ] || self.community, |
198 timeout: self.timeout, |
213 timeout: config[ 'timeout' ] || self.timeout, |
199 retries: self.retries |
214 retries: config[ 'retries' ] || self.retries |
200 } |
215 } |
201 |
216 |
202 begin |
217 begin |
203 SNMP::Manager.open( opts ) do |snmp| |
218 SNMP::Manager.open( opts ) do |snmp| |
204 case self.mode |
219 case self.mode |
231 threads.map( &:join ) |
246 threads.map( &:join ) |
232 |
247 |
233 # Map everything back to identifier -> attribute(s), and send to the manager. |
248 # Map everything back to identifier -> attribute(s), and send to the manager. |
234 # |
249 # |
235 reply = self.results.each_with_object({}) do |(address, results), hash| |
250 reply = self.results.each_with_object({}) do |(address, results), hash| |
236 identifier = self.identifiers[ address ] or next |
251 identifier = self.identifiers[ address ].first |
237 hash[ identifier ] = results |
252 hash[ identifier ] = results |
238 end |
253 end |
239 self.log.debug "Sending to manager: %p" % [ reply ] |
254 self.log.debug "Sending to manager: %p" % [ reply ] |
240 return reply |
255 return reply |
241 end |
256 end |
251 def gather_load( snmp, host ) |
266 def gather_load( snmp, host ) |
252 self.log.debug "Getting system load for: %s" % [ host ] |
267 self.log.debug "Getting system load for: %s" % [ host ] |
253 load5 = snmp.get( SNMP::ObjectId.new( LOAD[:five_min] ) ).varbind_list.first.value.to_f |
268 load5 = snmp.get( SNMP::ObjectId.new( LOAD[:five_min] ) ).varbind_list.first.value.to_f |
254 self.log.debug " Load on %s: %0.2f" % [ host, load5 ] |
269 self.log.debug " Load on %s: %0.2f" % [ host, load5 ] |
255 |
270 |
256 if load5 >= self.load_error_at |
271 config = self.identifiers[ host ].last || {} |
|
272 error_at = config[ 'load_error_at' ] || self.load_error_at |
|
273 if load5 >= error_at |
257 self.results[ host ] = { |
274 self.results[ host ] = { |
258 error: "Load has exceeded %0.2f over a 5 minute average" % [ self.load_error_at ], |
275 error: "Load has exceeded %0.2f over a 5 minute average" % [ error_at ], |
259 load5: load5 |
276 load5: load5 |
260 } |
277 } |
261 else |
278 else |
262 self.results[ host ] = { load5: load5 } |
279 self.results[ host ] = { load5: load5 } |
263 end |
280 end |
270 def gather_free_memory( snmp, host ) |
287 def gather_free_memory( snmp, host ) |
271 self.log.debug "Getting available memory for: %s" % [ host ] |
288 self.log.debug "Getting available memory for: %s" % [ host ] |
272 mem_avail = snmp.get( SNMP::ObjectId.new( MEMORY[:mem_avail] ) ).varbind_list.first.value.to_f |
289 mem_avail = snmp.get( SNMP::ObjectId.new( MEMORY[:mem_avail] ) ).varbind_list.first.value.to_f |
273 self.log.debug " Available memory on %s: %0.2f" % [ host, mem_avail ] |
290 self.log.debug " Available memory on %s: %0.2f" % [ host, mem_avail ] |
274 |
291 |
275 if mem_avail <= self.mem_error_at |
292 config = self.identifiers[ host ].last || {} |
|
293 error_at = config['mem_error_at'] || self.mem_error_at |
|
294 if mem_avail <= error_at |
276 self.results[ host ] = { |
295 self.results[ host ] = { |
277 error: "Available memory is under %0.1fMB" % [ self.mem_error_at.to_f / 1024 ], |
296 error: "Available memory is under %0.1fMB" % [ error_at.to_f / 1024 ], |
278 available_memory: mem_avail |
297 available_memory: mem_avail |
279 } |
298 } |
280 else |
299 else |
281 self.results[ host ] = { available_memory: mem_avail } |
300 self.results[ host ] = { available_memory: mem_avail } |
282 end |
301 end |
292 swap_total = snmp.get( SNMP::ObjectId.new(MEMORY[:swap_total]) ).varbind_list.first.value.to_f |
311 swap_total = snmp.get( SNMP::ObjectId.new(MEMORY[:swap_total]) ).varbind_list.first.value.to_f |
293 swap_avail = snmp.get( SNMP::ObjectId.new(MEMORY[:swap_avail]) ).varbind_list.first.value.to_f |
312 swap_avail = snmp.get( SNMP::ObjectId.new(MEMORY[:swap_avail]) ).varbind_list.first.value.to_f |
294 swap_used = ( "%0.2f" % ((swap_avail / swap_total.to_f * 100 ) - 100).abs ).to_f |
313 swap_used = ( "%0.2f" % ((swap_avail / swap_total.to_f * 100 ) - 100).abs ).to_f |
295 self.log.debug " Swap in use on %s: %0.2f" % [ host, swap_used ] |
314 self.log.debug " Swap in use on %s: %0.2f" % [ host, swap_used ] |
296 |
315 |
297 if swap_used >= self.swap_error_at |
316 config = self.identifiers[ host ].last || {} |
|
317 if swap_used >= ( config['swap_error_at'] || self.swap_error_at ) |
298 self.results[ host ] = { |
318 self.results[ host ] = { |
299 error: "%0.2f%% swap in use" % [ swap_used ], |
319 error: "%0.2f%% swap in use" % [ swap_used ], |
300 swap_used: swap_used |
320 swap_used: swap_used |
301 } |
321 } |
302 else |
322 else |
311 def gather_disks( snmp, host ) |
331 def gather_disks( snmp, host ) |
312 self.log.debug "Getting disk information for %s" % [ host ] |
332 self.log.debug "Getting disk information for %s" % [ host ] |
313 errors = [] |
333 errors = [] |
314 results = {} |
334 results = {} |
315 mounts = self.get_disk_percentages( snmp ) |
335 mounts = self.get_disk_percentages( snmp ) |
|
336 config = self.identifiers[ host ].last || {} |
|
337 |
|
338 includes = config[ 'storage_include' ] || self.storage_include |
|
339 excludes = config[ 'storage_exclude' ] || self.storage_exclude |
316 |
340 |
317 mounts.each_pair do |path, percentage| |
341 mounts.each_pair do |path, percentage| |
318 if percentage >= self.storage_error_at |
342 next if excludes.include?( path ) |
319 errors << "Mount %s at %d%% capacity" % [ path, percentage ] |
343 next if ! includes.empty? && ! includes.include?( path ) |
|
344 if percentage >= ( config[ 'storage_error_at' ] || self.storage_error_at ) |
|
345 errors << "%s at %d%% capacity" % [ path, percentage ] |
320 end |
346 end |
321 end |
347 end |
322 |
348 |
323 results[ :mounts ] = mounts |
349 results[ :mounts ] = mounts |
324 results[ :error ] = errors.join( ', ' ) unless errors.empty? |
350 results[ :error ] = errors.join( ', ' ) unless errors.empty? |
330 ### Collect running processes on +host+ from an existing (and open) |
356 ### Collect running processes on +host+ from an existing (and open) |
331 ### +snmp+ connection. |
357 ### +snmp+ connection. |
332 ### |
358 ### |
333 def gather_processlist( snmp, host ) |
359 def gather_processlist( snmp, host ) |
334 self.log.debug "Getting running process list for %s" % [ host ] |
360 self.log.debug "Getting running process list for %s" % [ host ] |
335 procs = [] |
361 config = self.identifiers[ host ].last || {} |
|
362 procs = [] |
|
363 errors = [] |
336 |
364 |
337 snmp.walk([ PROCESS[:list], PROCESS[:args] ]) do |list| |
365 snmp.walk([ PROCESS[:list], PROCESS[:args] ]) do |list| |
338 process = list[0].value.to_s |
366 process = list[0].value.to_s |
339 args = list[1].value.to_s |
367 args = list[1].value.to_s |
340 procs << "%s %s " % [ process, args ] |
368 procs << "%s %s " % [ process, args ] |
341 end |
369 end |
342 |
370 |
343 # Check against the running stuff, setting an error if |
371 # Check against the running stuff, setting an error if |
344 # one isn't found. |
372 # one isn't found. |
345 # |
373 # |
346 errors = [] |
374 Array( config['processes'] || self.processes ).each do |process| |
347 Array( self.processes ).each do |process| |
|
348 process_r = Regexp.new( process ) |
375 process_r = Regexp.new( process ) |
349 found = procs.find{|p| p.match(process_r) } |
376 found = procs.find{|p| p.match(process_r) } |
350 errors << "Process '%s' is not running" % [ process, host ] unless found |
377 errors << "Process '%s' is not running" % [ process, host ] unless found |
351 end |
378 end |
352 |
379 |
380 end |
407 end |
381 |
408 |
382 # Everything else. |
409 # Everything else. |
383 # |
410 # |
384 snmp.walk( STORAGE_NET_SNMP ) do |list| |
411 snmp.walk( STORAGE_NET_SNMP ) do |list| |
385 mount = list[0].value.to_s |
412 mount = list[0].value.to_s |
386 next if mount == 'noSuchInstance' |
413 next if mount == 'noSuchInstance' |
387 |
414 |
388 next if list[2].value.to_s == 'noSuchInstance' |
415 next if list[2].value.to_s == 'noSuchInstance' |
389 used = list[2].value.to_i |
416 used = list[2].value.to_i |
390 |
417 |
391 typeoid = list[1].value.join('.').to_s |
418 unless list[1].value.to_s == 'noSuchInstance' |
392 next if typeoid =~ STORAGE_IGNORE |
419 typeoid = list[1].value.join('.').to_s |
|
420 next if typeoid =~ STORAGE_IGNORE |
|
421 end |
393 next if mount =~ /\/(?:dev|proc)$/ |
422 next if mount =~ /\/(?:dev|proc)$/ |
394 |
423 |
395 self.log.debug " %s -> %s -> %s" % [ mount, typeoid, used ] |
|
396 disks[ mount ] = used |
424 disks[ mount ] = used |
397 end |
425 end |
398 |
426 |
399 return disks |
427 return disks |
400 end |
428 end |