14 require 'arborist/monitor' unless defined?( Arborist::Monitor ) |
14 require 'arborist/monitor' unless defined?( Arborist::Monitor ) |
15 require 'snmp' |
15 require 'snmp' |
16 |
16 |
17 using Arborist::TimeRefinements |
17 using Arborist::TimeRefinements |
18 |
18 |
19 # SNMP specific monitors and monitor logic. |
19 # Shared SNMP monitor logic. |
20 # |
20 # |
21 class Arborist::Monitor::SNMP |
21 module Arborist::Monitor::SNMP |
22 extend Loggability |
22 extend Loggability |
23 log_to :arborist |
23 log_to :arborist |
24 |
24 |
25 # The version of this library. |
25 # The version of this library. |
26 VERSION = '0.2.1' |
26 VERSION = '0.3.0' |
27 |
|
# "Modes" that this monitor understands.
VALID_MODES = %i[ disk load memory swap process ].freeze

# The OID that returns the system environment.
IDENTIFICATION_OID = '1.3.6.1.2.1.1.1.0'

# For net-snmp systems, ignore mount types that match
# this regular expression. This includes null/union mounts
# and NFS, currently. The dots are escaped so they only match
# literal OID separators.
STORAGE_IGNORE = %r{25\.3\.9\.(?:2|14)$}

# The OID that matches a local windows hard disk. Anything else
# is a remote (SMB) mount.
WINDOWS_DEVICE = '1.3.6.1.2.1.25.2.1.4'

# OIDS required to pull disk information from net-snmp.
#
STORAGE_NET_SNMP = [
  '1.3.6.1.4.1.2021.9.1.2', # paths
  '1.3.6.1.2.1.25.3.8.1.4', # types
  '1.3.6.1.4.1.2021.9.1.9'  # percents
].freeze

# OIDS required to pull disk information from Windows.
#
STORAGE_WINDOWS = [
  '1.3.6.1.2.1.25.2.3.1.2', # types
  '1.3.6.1.2.1.25.2.3.1.3', # paths
  '1.3.6.1.2.1.25.2.3.1.5', # totalsize
  '1.3.6.1.2.1.25.2.3.1.6'  # usedsize
].freeze

# OIDS for discovering memory usage.
#
MEMORY = {
  swap_total: '1.3.6.1.4.1.2021.4.3.0',
  swap_avail: '1.3.6.1.4.1.2021.4.4.0',
  mem_avail:  '1.3.6.1.4.1.2021.4.6.0'
}.freeze

# OIDS for discovering system load.
#
LOAD = {
  five_min: '1.3.6.1.4.1.2021.10.1.3.2'
}.freeze

# OIDS for discovering running processes.
#
PROCESS = {
  list: '1.3.6.1.2.1.25.4.2.1.4',
  args: '1.3.6.1.2.1.25.4.2.1.5'
}.freeze
|
80 |
|
81 |
27 |
82 # Global defaults for instances of this monitor |
28 # Global defaults for instances of this monitor |
83 # |
29 # |
# timeout is in seconds and retries is a retry count (per the accessor comments);
# run() uses timeout, retries, community and port as fallbacks when a node's
# own config does not override them.
84 DEFAULT_OPTIONS = { |
30 DEFAULT_OPTIONS = { |
85 timeout: 2, |
31 timeout: 2, |
86 retries: 1, |
32 retries: 1, |
87 community: 'public', |
33 community: 'public', |
88 port: 161, |
34 port: 161 |
89 storage_error_at: 95, # in percent full |

90 storage_include: [], # if non-empty, only these paths are included in checks |

91 storage_exclude: [], # paths to exclude from checks |

92 load_error_at: 7, |

93 swap_error_at: 25, # in percent of swap in use |

94 mem_error_at: 51200, # in kilobytes |

95 processes: [] # list of procs to match |

96 } |
35 } |
97 |
36 |
98 |
37 |
99 ### This monitor is complex enough to require creating an instance from the caller. |
38 ### Connect to the SNMP daemon and yield. |
100 ### Provide a friendlier error message if the class was provided to exec() directly. |
|
101 ### |
|
def self::run( nodes )
  # Running the class itself is a usage mistake: say so, and hand back
  # an empty result set rather than raising.
  complaint = "Please use %s via an instance." % [ self.name ]
  self.log.error( complaint )
  {}
end
|
106 |
|
107 |
|
### Build a monitor from +options+ layered over +DEFAULT_OPTIONS+.
### Every resulting key is applied through its writer method of the
### same name.
###
def initialize( options=DEFAULT_OPTIONS )
  settings = DEFAULT_OPTIONS.merge( options || {} )

  # These three are always handled as lists, even when a single value is given.
  [ :storage_include, :storage_exclude, :processes ].each do |key|
    settings[ key ] = Array( settings[key] )
  end

  settings.each_pair do |key, val|
    self.public_send( "#{key}=", val )
  end
end
|
120 |
|
121 |
|
# Read-only state.
#
#   mode        - the mode (section) that this SNMP instance should check;
#                 must be a +VALID_MODES+ mode
#   identifiers - mapping of node addresses back to the node identifier
#   results     - the results from the SNMP daemons, keyed by address
attr_reader :mode, :identifiers, :results

# Connection settings.
#
#   timeout   - a timeout in seconds if the SNMP server isn't responding
#   retries   - retry with the timeout this many times; defaults to 1
#   port      - the SNMP UDP port, if running on non default
#   community - the community string to connect with
attr_accessor :timeout, :retries, :port, :community

# Disk checks.
#
#   storage_error_at - set an error if mount points are at or above this percentage
#   storage_include  - only check these specific mount points
#   storage_exclude  - exclude these mount points (array of paths) from checks
attr_accessor :storage_error_at, :storage_include, :storage_exclude

# Remaining thresholds.
#
#   load_error_at - set an error if the 5 minute load average reaches this
#   swap_error_at - set an error if used swap reaches this percentage
#   mem_error_at  - set an error if available memory is at or below this
#                   many kilobytes
#   processes     - set an error if processes in this array aren't running
attr_accessor :load_error_at, :swap_error_at, :mem_error_at, :processes
|
164 |
|
165 |
|
### Set the SNMP mode, after validation.
###
### +mode+ may be a Symbol or String naming one of +VALID_MODES+. Any
### other value -- including +nil+ or an object that cannot be turned
### into a Symbol -- is logged as an error and leaves the current mode
### untouched. A successful change also clears the stored results.
###
def mode=( mode )
  # Guard the conversion so that bad input is reported through the log
  # instead of surfacing as a NoMethodError.
  candidate = mode.respond_to?( :to_sym ) ? mode.to_sym : nil

  unless VALID_MODES.include?( candidate )
    self.log.error "Unknown SNMP mode: %s" % [ mode ]
    return nil
  end

  @mode    = candidate
  @results = {}
end
|
177 |
|
178 |
|
# run( nodes ): maps each node's first address to [ identifier, config ], opens one
# SNMP::Manager connection per address in its own thread, waits for every thread,
# and returns a hash of identifier => result. A request timeout or any other
# StandardError is recorded as an :error entry for that host. One revision
# dispatches on self.mode to the gather_* methods; the other yields the open
# connection and host to the caller's block.
179 ### Perform the monitoring checks. |

180 ### |
39 ### |
181 def run( nodes ) |
40 def run( nodes ) |
182 self.log.debug "Got nodes to SNMP check: %p" % [ nodes ] |
41 self.log.debug "Got nodes to SNMP check: %p" % [ nodes ] |
183 |
42 opts = Arborist::Monitor::SNMP::DEFAULT_OPTIONS |
184 # Sanity check. |

185 # |

186 unless self.mode |

187 self.log.error "You must set the 'mode' for the SNMP monitor. (%s)" % [ VALID_MODES.join( ', ' ) ] |

188 return {} |

189 end |

190 |
43 |
191 # Create mapping of addresses back to node identifiers, |
44 # Create mapping of addresses back to node identifiers, |
192 # and retain any custom configs per node. |
45 # and retain any custom (overrides) config per node. |
193 # |
46 # |
194 @identifiers = {} |
47 @identifiers = {} |

48 @results = {} |

49 |
195 nodes.each_pair do |(identifier, props)| |
50 nodes.each_pair do |(identifier, props)| |
196 next unless props.key?( 'addresses' ) |
51 next unless props.key?( 'addresses' ) |
197 address = props[ 'addresses' ].first |
52 address = props[ 'addresses' ].first |
198 self.identifiers[ address ] = [ identifier, props['config'] ] |
53 @identifiers[ address ] = [ identifier, props['config'] ] |
199 end |
54 end |
200 |
55 |
201 # Perform the work! |
56 # Perform the work! |
202 # |
57 # |
203 threads = [] |
58 threads = [] |
204 self.identifiers.keys.each do |host| |
59 @identifiers.keys.each do |host| |
205 thr = Thread.new do |
60 thr = Thread.new do |
206 Thread.current.abort_on_exception = true |
61 Thread.current.abort_on_exception = true |
207 |
62 |
208 config = self.identifiers[host].last || {} |
63 config = @identifiers[host].last || {} |
# NOTE(review): in the revision that assigns `opts` at the top of the method,
# the assignment below rebinds that same captured local from inside the thread
# block. All threads then share one `opts`: later hosts read their fallbacks
# from an earlier host's hash, and SNMP::Manager.open may see another thread's
# value. A block-local name would avoid this -- confirm and fix.
209 opts = { |
64 opts = { |
210 host: host, |
65 host: host, |
211 port: config[ 'port' ] || self.port, |
66 port: config[ 'port' ] || opts[ :port ], |
212 community: config[ 'community' ] || self.community, |
67 community: config[ 'community' ] || opts[ :community ], |
213 timeout: config[ 'timeout' ] || self.timeout, |
68 timeout: config[ 'timeout' ] || opts[ :timeout ], |
214 retries: config[ 'retries' ] || self.retries |
69 retries: config[ 'retries' ] || opts[ :retries ] |
215 } |
70 } |
216 |
71 |
217 begin |
72 begin |
218 SNMP::Manager.open( opts ) do |snmp| |
73 SNMP::Manager.open( opts ) do |snmp| |
219 case self.mode |
74 yield( snmp, host ) |
220 when :disk |

221 self.gather_disks( snmp, host ) |

222 when :load |

223 self.gather_load( snmp, host ) |

224 when :memory |

225 self.gather_free_memory( snmp, host ) |

226 when :swap |

227 self.gather_swap( snmp, host ) |

228 when :process |

229 self.gather_processlist( snmp, host ) |

230 end |

231 end |
75 end |
232 rescue SNMP::RequestTimeout |
76 rescue SNMP::RequestTimeout |
233 self.results[ host ] = { |
77 @results[ host ] = { |
234 error: "Host is not responding to SNMP requests." |
78 error: "Host is not responding to SNMP requests." |
235 } |
79 } |
236 rescue StandardError => err |
80 rescue StandardError => err |
237 self.results[ host ] = { |
81 @results[ host ] = { |
238 error: "Network is not accessible. (%s: %s)" % [ err.class.name, err.message ] |
82 error: "Network is not accessible. (%s: %s)" % [ err.class.name, err.message ] |
239 } |
83 } |
240 end |
84 end |
241 end |
85 end |
242 threads << thr |
86 threads << thr |
245 # Wait for thread completion |
89 # Wait for thread completion |
246 threads.map( &:join ) |
90 threads.map( &:join ) |
247 |
91 |
248 # Map everything back to identifier -> attribute(s), and send to the manager. |
92 # Map everything back to identifier -> attribute(s), and send to the manager. |
249 # |
93 # |
250 reply = self.results.each_with_object({}) do |(address, results), hash| |
94 reply = @results.each_with_object({}) do |(address, results), hash| |
251 identifier = self.identifiers[ address ] or next |
95 identifier = @identifiers[ address ] or next |
252 hash[ identifier.first ] = results |
96 hash[ identifier.first ] = results |
253 end |
97 end |
254 self.log.debug "Sending to manager: %p" % [ reply ] |
98 self.log.debug "Sending to manager: %p" % [ reply ] |
255 return reply |
99 return reply |

100 |

101 ensure |

102 @identifiers = {} |

103 @results = {} |
256 end |
104 end |
257 |
105 |
|
106 end # Arborist::Monitor::SNMP |
258 |
107 |
259 ######### |
108 require 'arborist/monitor/snmp/disk' |
260 protected |
109 require 'arborist/monitor/snmp/load' |
261 ######### |
110 require 'arborist/monitor/snmp/memory' |
|
111 require 'arborist/monitor/snmp/process' |
|
112 require 'arborist/monitor/snmp/swap' |
262 |
113 |
### Read the five minute load average for +host+ over an existing (and
### open) +snmp+ connection and store it in the results. An error is
### attached when the load reaches the node's 'load_error_at' override,
### or the monitor-wide value if the node has none.
###
def gather_load( snmp, host )
  self.log.debug "Getting system load for: %s" % [ host ]

  oid      = SNMP::ObjectId.new( LOAD[:five_min] )
  response = snmp.get( oid )
  load5    = response.varbind_list.first.value.to_f
  self.log.debug " Load on %s: %0.2f" % [ host, load5 ]

  overrides = self.identifiers[ host ].last || {}
  limit     = overrides[ 'load_error_at' ] || self.load_error_at

  report = {}
  report[ :error ] = "Load has exceeded %0.2f over a 5 minute average" % [ limit ] if load5 >= limit
  report[ :load5 ] = load5

  self.results[ host ] = report
end
|
282 |
|
283 |
|
### Read the available memory for +host+ over an existing (and open)
### +snmp+ connection and store it in the results. An error is attached
### when the value is at or below the node's 'mem_error_at' override, or
### the monitor-wide value if the node has none.
###
def gather_free_memory( snmp, host )
  self.log.debug "Getting available memory for: %s" % [ host ]

  oid       = SNMP::ObjectId.new( MEMORY[:mem_avail] )
  mem_avail = snmp.get( oid ).varbind_list.first.value.to_f
  self.log.debug " Available memory on %s: %0.2f" % [ host, mem_avail ]

  overrides = self.identifiers[ host ].last || {}
  floor     = overrides[ 'mem_error_at' ] || self.mem_error_at

  report = {}
  if mem_avail <= floor
    # The threshold is divided by 1024 for the megabyte figure in the message.
    report[ :error ] = "Available memory is under %0.1fMB" % [ floor.to_f / 1024 ]
  end
  report[ :available_memory ] = mem_avail

  self.results[ host ] = report
end
|
303 |
|
304 |
|
### Work out the percentage of swap in use on +host+ over an existing
### (and open) +snmp+ connection and store it in the results. An error
### is attached when the percentage reaches the node's 'swap_error_at'
### override, or the monitor-wide value if the node has none.
###
def gather_swap( snmp, host )
  self.log.debug "Getting used swap for: %s" % [ host ]

  # Total first, then available -- one request each.
  total, avail = [ :swap_total, :swap_avail ].map do |key|
    snmp.get( SNMP::ObjectId.new(MEMORY[key]) ).varbind_list.first.value.to_f
  end

  # Distance of the available percentage from 100, trimmed to two
  # decimal places by a round trip through a formatted string.
  used_pct  = ( (avail / total * 100) - 100 ).abs
  swap_used = ( "%0.2f" % used_pct ).to_f
  self.log.debug " Swap in use on %s: %0.2f" % [ host, swap_used ]

  overrides = self.identifiers[ host ].last || {}
  limit     = overrides[ 'swap_error_at' ] || self.swap_error_at

  report = {}
  report[ :error ] = "%0.2f%% swap in use" % [ swap_used ] if swap_used >= limit
  report[ :swap_used ] = swap_used

  self.results[ host ] = report
end
|
326 |
|
327 |
|
### Collect mount point usage for +host+ from an existing (and open)
### +snmp+ connection.
###
### Stores a hash under the host's results with a :mounts entry (every
### mount returned by #get_disk_percentages) and, when any checked mount
### is at or above the threshold, an :error entry listing them.
###
### The node's 'storage_include', 'storage_exclude' and 'storage_error_at'
### config values take precedence over the monitor-wide settings. Include
### and exclude values are coerced to arrays, so a single path given as a
### plain string is compared as a whole path rather than as a substring.
###
def gather_disks( snmp, host )
  self.log.debug "Getting disk information for %s" % [ host ]
  mounts = self.get_disk_percentages( snmp )
  config = self.identifiers[ host ].last || {}

  includes  = Array( config['storage_include'] || self.storage_include )
  excludes  = Array( config['storage_exclude'] || self.storage_exclude )
  threshold = config[ 'storage_error_at' ] || self.storage_error_at

  errors = mounts.each_with_object( [] ) do |(path, percentage), problems|
    next if excludes.include?( path )
    next unless includes.empty? || includes.include?( path )
    problems << "%s at %d%% capacity" % [ path, percentage ] if percentage >= threshold
  end

  results = { mounts: mounts }
  results[ :error ] = errors.join( ', ' ) unless errors.empty?

  self.results[ host ] = results
end
|
354 |
|
355 |
|
### Collect running processes on +host+ from an existing (and open)
### +snmp+ connection.
###
### Each entry of the node's 'processes' config (or the monitor-wide
### list when the node has none) is compiled to a regular expression
### and matched against every "name args " string built from the
### process table walk. The host's result is an empty hash when all of
### them match, or a hash with an :error entry naming the ones that
### were not found.
###
def gather_processlist( snmp, host )
  self.log.debug "Getting running process list for %s" % [ host ]
  config = self.identifiers[ host ].last || {}

  procs = []
  snmp.walk([ PROCESS[:list], PROCESS[:args] ]) do |list|
    procs << "%s %s " % [ list[0].value.to_s, list[1].value.to_s ]
  end

  # Check against the running stuff, setting an error if
  # one isn't found.
  #
  missing = Array( config['processes'] || self.processes ).reject do |process|
    pattern = Regexp.new( process )
    procs.any? {|running| running =~ pattern }
  end

  # One placeholder, one argument: a surplus argument makes String#%
  # raise ArgumentError when Ruby runs with $DEBUG enabled.
  errors = missing.map {|process| "Process '%s' is not running" % [ process ] }

  self.log.debug " %d running processes" % [ procs.length ]
  self.results[ host ] = errors.empty? ? {} : { error: errors.join( ', ' ) }
end
|
387 |
|
388 |
|
### Given a SNMP object, return a hash of:
###
###    device path => percentage full
###
### Hosts whose system description mentions "windows" are read from the
### +STORAGE_WINDOWS+ OIDs, where the percentage is computed from the
### used and total sizes; entries that are not +WINDOWS_DEVICE+ storage,
### or that report a total size of zero, are left out. Every other host
### is read from the +STORAGE_NET_SNMP+ OIDs, which report the
### percentage directly.
###
def get_disk_percentages( snmp )

  # Does this look like a windows system, or a net-snmp based one?
  # The value is stringified first so a non-String reply can't break the match.
  system_type = snmp.get( SNMP::ObjectId.new( IDENTIFICATION_OID ) ).varbind_list.first.value
  disks = {}

  # Windows has its own MIBs.
  #
  if system_type.to_s =~ /windows/i
    snmp.walk( STORAGE_WINDOWS ) do |list|
      next unless list[0].value.to_s == WINDOWS_DEVICE

      # A zero total would make the division below produce NaN or
      # Infinity instead of a usable percentage.
      total = list[2].value.to_f
      next if total.zero?

      disks[ list[1].value.to_s ] = ( list[3].value.to_f / total ) * 100
    end
    return disks
  end

  # Everything else.
  #
  snmp.walk( STORAGE_NET_SNMP ) do |list|
    mount = list[0].value.to_s
    next if mount == 'noSuchInstance'

    next if list[2].value.to_s == 'noSuchInstance'
    used = list[2].value.to_i

    # The filesystem type is optional; when present, skip ignored types.
    unless list[1].value.to_s == 'noSuchInstance'
      typeoid = list[1].value.join('.').to_s
      next if typeoid =~ STORAGE_IGNORE
    end
    next if mount =~ /\/(?:dev|proc)$/

    disks[ mount ] = used
  end

  return disks
end
|
429 end # class Arborist::Monitor::SNMP |
|
430 |
|