|
1 # -*- ruby -*- |
|
2 # vim: set noet nosta sw=4 ts=4 : |
|
3 #encoding: utf-8 |
|
#
# SNMP checks for Arborist. Requires an SNMP agent to be installed on
# the target machine, with the relevant "pieces" enabled for your platform.
#
# For example, for disk monitoring with Net-SNMP, you'll want to set
# 'includeAllDisks' in the snmpd.conf. bsnmpd on FreeBSD benefits from
# the 'bsnmp-ucd' package. Etc.
#
|
12 |
|
13 require 'loggability' |
|
14 require 'arborist/monitor' unless defined?( Arborist::Monitor ) |
|
15 require 'snmp' |
|
16 |
|
17 using Arborist::TimeRefinements |
|
18 |
|
19 # SNMP specific monitors and monitor logic. |
|
20 # |
|
21 class Arborist::Monitor::SNMP |
|
22 extend Loggability |
|
23 log_to :arborist |
|
24 |
|
# The version of this library.
VERSION = '0.1.0'

# "Modes" that this monitor understands.
VALID_MODES = %i[ disk load memory swap process ].freeze

# The OID that returns the system environment.
IDENTIFICATION_OID = '1.3.6.1.2.1.1.1.0'

# For net-snmp systems, ignore mount types whose type OID matches
# this regular expression. This includes null/union mounts
# and NFS, currently. The dots are escaped so they only match the
# literal OID separators.
STORAGE_IGNORE = %r{25\.3\.9\.(?:2|14)$}

# The OID that matches a local windows hard disk. Anything else
# is a remote (SMB) mount.
WINDOWS_DEVICE = '1.3.6.1.2.1.25.2.1.4'

# OIDs required to pull disk information from net-snmp.
# The order matters: consumers index the walked rows positionally.
#
STORAGE_NET_SNMP = [
	'1.3.6.1.4.1.2021.9.1.2', # paths
	'1.3.6.1.2.1.25.3.8.1.4', # types
	'1.3.6.1.4.1.2021.9.1.9'  # percents
].freeze

# OIDs required to pull disk information from Windows.
# The order matters: consumers index the walked rows positionally.
#
STORAGE_WINDOWS = [
	'1.3.6.1.2.1.25.2.3.1.2', # types
	'1.3.6.1.2.1.25.2.3.1.3', # paths
	'1.3.6.1.2.1.25.2.3.1.5', # totalsize
	'1.3.6.1.2.1.25.2.3.1.6'  # usedsize
].freeze

# OIDs for discovering memory usage.
#
MEMORY = {
	swap_total: '1.3.6.1.4.1.2021.4.3.0',
	swap_avail: '1.3.6.1.4.1.2021.4.4.0',
	mem_avail:  '1.3.6.1.4.1.2021.4.6.0'
}.freeze

# OIDs for discovering system load.
#
LOAD = {
	five_min: '1.3.6.1.4.1.2021.10.1.3.2'
}.freeze

# OIDs for discovering running processes.
#
PROCESS = {
	list: '1.3.6.1.2.1.25.4.2.1.4',
	args: '1.3.6.1.2.1.25.4.2.1.5'
}.freeze


# Defaults for instances of this monitor. The hash itself is frozen;
# use #merge to derive a customized copy.
#
DEFAULT_OPTIONS = {
	timeout:          2,
	retries:          1,
	community:        'public',
	port:             161,
	storage_error_at: 95,    # in percent full
	load_error_at:    7,
	swap_error_at:    25,    # in percent used
	mem_error_at:     51200, # in kilobytes
	processes:        []     # list of procs to match
}.freeze
|
95 |
|
96 |
|
### Class-level guard. This monitor needs per-instance configuration, so
### handing the class itself to exec() cannot work: log a friendlier error
### explaining that an instance is required, and report nothing for +nodes+.
###
def self::run( nodes )
	complaint = "Please use %s via an instance." % [ self.name ]
	self.log.error( complaint )
	{}
end
|
104 |
|
105 |
|
### Create a new instance of this monitor.
###
### +options+ is a Hash of settings layered over DEFAULT_OPTIONS (+nil+ is
### treated as "no overrides"). Every resulting key is applied through the
### matching writer method (e.g. +timeout=+, +mode=+), so a key without a
### writer raises NoMethodError.
###
def initialize( options=DEFAULT_OPTIONS )
	options = DEFAULT_OPTIONS.merge( options || {} )

	options.each do |name, value|
		# Mutable defaults (the 'processes' Array, the 'community' String)
		# are copied so that changing them on one instance can't leak into
		# DEFAULT_OPTIONS and every instance created afterwards. Values
		# supplied by the caller are passed through untouched.
		if value.equal?( DEFAULT_OPTIONS[name] )
			case value
			when String, Array, Hash
				value = value.dup
			end
		end

		self.public_send( "#{name}=", value )
	end
end
|
115 |
|
116 |
|
# The mode (section) that this SNMP instance should check, as a Symbol.
# Assigned through #mode=, which only accepts a +VALID_MODES+ entry.
attr_reader :mode

# Mapping of node addresses back to the node identifier; rebuilt by
# each call to #run.
attr_reader :identifiers

# The results from the SNMP daemons, keyed by address. Reset to an
# empty Hash whenever a valid mode is assigned.
attr_reader :results


# A timeout in seconds if the SNMP server isn't responding.
attr_accessor :timeout

# Retry with the timeout this many times. Defaults to 1.
attr_accessor :retries

# The SNMP UDP port, if running on non default.
attr_accessor :port

# The community string to connect with.
attr_accessor :community


# Set an error if a mount point is at or above this percentage full.
attr_accessor :storage_error_at

# Set an error if the 5 minute load average reaches or exceeds this.
attr_accessor :load_error_at

# Set an error if the percentage of swap in use reaches or exceeds this.
attr_accessor :swap_error_at

# Set an error if available memory is at or below this many kilobytes.
attr_accessor :mem_error_at

# Set an error if processes in this array aren't running. Each entry is
# used as a regular expression against the "name arguments" of every
# running process.
attr_accessor :processes
|
153 |
|
154 |
|
### Set the SNMP mode, after validation.
###
### +mode+ may be anything that responds to #to_sym (String or Symbol) and
### must name one of the +VALID_MODES+. Anything else -- including +nil+ and
### objects that can't be symbolized -- is logged as an error and leaves the
### current mode and results untouched. A successful assignment also resets
### the collected results.
###
def mode=( mode )
	# nil, Integers, etc. have no #to_sym; treat them as unknown modes
	# instead of letting a NoMethodError escape from a setter.
	candidate = mode.respond_to?( :to_sym ) ? mode.to_sym : nil

	unless VALID_MODES.include?( candidate )
		self.log.error "Unknown SNMP mode: %s" % [ mode ]
		return nil
	end

	@mode    = candidate
	@results = {}
end
|
166 |
|
167 |
|
### Perform the monitoring checks.
###
### +nodes+ is a Hash of node identifier => node properties. Only nodes
### whose properties carry a non-empty 'addresses' list are checked, and
### only the first address of each is contacted. Every host is queried in
### its own thread using the configured mode.
###
### Returns a Hash of node identifier => result Hash (containing an
### +:error+ key when the check failed), or an empty Hash if no mode has
### been set.
###
def run( nodes )
	self.log.debug "Got nodes to SNMP check: %p" % [ nodes ]

	# Sanity check.
	#
	unless self.mode
		self.log.error "You must set the 'mode' for the SNMP monitor. (%s)" % [ VALID_MODES.join( ', ' ) ]
		return {}
	end

	# Create mapping of addresses back to node identifiers. Nodes with a
	# missing, nil, or empty address list are skipped, so a nil host is
	# never handed to the SNMP manager.
	#
	@identifiers = nodes.each_with_object({}) do |(identifier, props), hash|
		address = Array( props['addresses'] ).first or next
		hash[ address ] = identifier
	end

	# Perform the work!
	#
	threads = self.identifiers.keys.map do |host|
		Thread.new do
			Thread.current.abort_on_exception = true
			opts = {
				host:      host,
				port:      self.port,
				community: self.community,
				timeout:   self.timeout,
				retries:   self.retries
			}

			begin
				SNMP::Manager.open( opts ) do |snmp|
					case self.mode
					when :disk
						self.gather_disks( snmp, host )
					when :load
						self.gather_load( snmp, host )
					when :memory
						self.gather_free_memory( snmp, host )
					when :swap
						self.gather_swap( snmp, host )
					when :process
						self.gather_processlist( snmp, host )
					end
				end
			rescue SNMP::RequestTimeout
				self.results[ host ] = {
					error: "Host is not responding to SNMP requests."
				}
			rescue StandardError => err
				# Any other failure for this host is reported as an error
				# result rather than being allowed to kill the thread.
				self.results[ host ] = {
					error: "Network is not accessible. (%s: %s)" % [ err.class.name, err.message ]
				}
			end
		end
	end

	# Wait for thread completion
	threads.each( &:join )

	# Map everything back to identifier -> attribute(s), and send to the manager.
	#
	reply = self.results.each_with_object({}) do |(address, results), hash|
		identifier = self.identifiers[ address ] or next
		hash[ identifier ] = results
	end
	self.log.debug "Sending to manager: %p" % [ reply ]
	return reply
end
|
242 |
|
243 |
|
244 ######### |
|
245 protected |
|
246 ######### |
|
247 |
|
### Fetch the five minute load average of +host+ over the already opened
### +snmp+ connection and record it in the results under +:load5+. An
### +:error+ entry is added when the load is at or above #load_error_at.
###
def gather_load( snmp, host )
	self.log.debug "Getting system load for: %s" % [ host ]

	oid      = SNMP::ObjectId.new( LOAD[:five_min] )
	response = snmp.get( oid )
	load5    = response.varbind_list.first.value.to_f
	self.log.debug " Load on %s: %0.2f" % [ host, load5 ]

	status = {}
	if load5 >= self.load_error_at
		status[ :error ] = "Load has exceeded %0.2f over a 5 minute average" % [ self.load_error_at ]
	end
	status[ :load5 ] = load5

	self.results[ host ] = status
end
|
265 |
|
266 |
|
### Fetch the available memory of +host+ over the already opened +snmp+
### connection and record it in the results under +:available_memory+.
### An +:error+ entry is added when the value is at or below #mem_error_at.
###
def gather_free_memory( snmp, host )
	self.log.debug "Getting available memory for: %s" % [ host ]

	varbind   = snmp.get( SNMP::ObjectId.new(MEMORY[:mem_avail]) ).varbind_list.first
	mem_avail = varbind.value.to_f
	self.log.debug " Available memory on %s: %0.2f" % [ host, mem_avail ]

	report = { available_memory: mem_avail }
	if mem_avail <= self.mem_error_at
		# The threshold is divided by 1024 for the human readable (MB) message.
		warning = "Available memory is under %0.1fMB" % [ self.mem_error_at.to_f / 1024 ]
		report  = { error: warning }.merge( report )
	end

	self.results[ host ] = report
end
|
284 |
|
285 |
|
### Collect used swap information for +host+ from an existing (and
### open) +snmp+ connection.
###
### The percentage of swap in use (rounded to two decimals) is recorded in
### the results under +:swap_used+, along with an +:error+ entry when it is
### at or above #swap_error_at. A host that reports no swap at all is
### recorded as 0% used.
###
def gather_swap( snmp, host )
	self.log.debug "Getting used swap for: %s" % [ host ]

	swap_total = snmp.get( SNMP::ObjectId.new(MEMORY[:swap_total]) ).varbind_list.first.value.to_f
	swap_avail = snmp.get( SNMP::ObjectId.new(MEMORY[:swap_avail]) ).varbind_list.first.value.to_f

	# Without any swap configured the ratio would be NaN/Infinity; be
	# explicit about that case instead of relying on string parsing to
	# turn it back into zero.
	swap_used = if swap_total.zero?
			0.0
		else
			( 100.0 - (swap_avail / swap_total * 100) ).abs.round( 2 )
		end
	self.log.debug " Swap in use on %s: %0.2f" % [ host, swap_used ]

	if swap_used >= self.swap_error_at
		self.results[ host ] = {
			error: "%0.2f%% swap in use" % [ swap_used ],
			swap_used: swap_used
		}
	else
		self.results[ host ] = { swap_used: swap_used }
	end
end
|
306 |
|
307 |
|
### Look up the mount point usage of +host+ over the already opened +snmp+
### connection. The full path => percentage map is recorded in the results
### under +:mounts+; every mount at or above #storage_error_at contributes
### to a comma separated +:error+ entry.
###
def gather_disks( snmp, host )
	self.log.debug "Getting disk information for %s" % [ host ]
	mounts = self.get_disk_percentages( snmp )

	overfull = mounts.select {|_path, percentage| percentage >= self.storage_error_at }
	errors   = overfull.map do |path, percentage|
		"Mount %s at %d%% capacity" % [ path, percentage ]
	end

	summary = { mounts: mounts }
	summary[ :error ] = errors.join( ', ' ) unless errors.empty?

	self.results[ host ] = summary
end
|
328 |
|
329 |
|
### Collect running processes on +host+ from an existing (and open)
### +snmp+ connection.
###
### Every entry of #processes is compiled into a regular expression and
### matched against the "name arguments " string of each running process.
### The result for +host+ is an empty Hash when everything was found,
### otherwise it carries an +:error+ entry naming each missing process.
###
def gather_processlist( snmp, host )
	self.log.debug "Getting running process list for %s" % [ host ]
	procs = []

	snmp.walk([ PROCESS[:list], PROCESS[:args] ]) do |list|
		process = list[0].value.to_s
		args    = list[1].value.to_s
		procs << "%s %s " % [ process, args ]
	end

	# Check against the running stuff, setting an error if
	# one isn't found.
	#
	missing = Array( self.processes ).reject do |process|
		pattern = Regexp.new( process )
		procs.any? {|running| running =~ pattern }
	end

	# One placeholder, one argument: passing extra arguments to the format
	# string raises an ArgumentError when Ruby runs in debug mode.
	errors = missing.map {|process| "Process '%s' is not running" % [ process ] }

	self.log.debug " %d running processes" % [ procs.length ]
	if errors.empty?
		self.results[ host ] = {}
	else
		self.results[ host ] = { error: errors.join( ', ' ) }
	end
end
|
360 |
|
361 |
|
### Given a SNMP object, return a hash of:
###
###    device path => percentage full
###
### The system description decides which set of OIDs is walked: hosts
### identifying themselves as Windows report used/total sizes for local
### disks only, everything else is expected to provide the net-snmp
### percentage column. Volumes without a usable size, ignored filesystem
### types (STORAGE_IGNORE) and mounts ending in /dev or /proc are left out.
###
def get_disk_percentages( snmp )

	# Does this look like a windows system, or a net-snmp based one?
	# (#to_s guards against a non-String value in the response.)
	system_type = snmp.get( SNMP::ObjectId.new( IDENTIFICATION_OID ) ).varbind_list.first.value
	disks = {}

	# Windows has its own MIBs.
	#
	if system_type.to_s =~ /windows/i
		snmp.walk( STORAGE_WINDOWS ) do |list|
			next unless list[0].value.to_s == WINDOWS_DEVICE

			# A volume reporting a total size of zero would produce a
			# NaN/Infinity percentage, which can't be compared or formatted
			# sensibly later on -- skip it.
			total = list[2].value.to_f
			next unless total > 0

			disks[ list[1].value.to_s ] = ( list[3].value.to_f / total ) * 100
		end
		return disks
	end

	# Everything else.
	#
	snmp.walk( STORAGE_NET_SNMP ) do |list|
		mount = list[0].value.to_s
		next if mount == 'noSuchInstance'

		next if list[2].value.to_s == 'noSuchInstance'
		used = list[2].value.to_i

		# The type is normally an OID (an Array of sub-identifiers), but
		# don't blow up the whole host if the agent returned something else.
		type    = list[1].value
		typeoid = type.respond_to?( :join ) ? type.join( '.' ) : type.to_s
		next if typeoid =~ STORAGE_IGNORE
		next if mount =~ /\/(?:dev|proc)$/

		self.log.debug " %s -> %s -> %s" % [ mount, typeoid, used ]
		disks[ mount ] = used
	end

	return disks
end
|
401 end # class Arborist::Monitor::SNMP |
|
402 |