0
|
1 |
# -*- ruby -*-
|
|
2 |
# vim: set noet nosta sw=4 ts=4 :
|
|
3 |
#encoding: utf-8
|
|
4 |
#
|
|
5 |
# SNMP checks for Arborist. Requires an SNMP agent to be installed
# on the target machine, with the "pieces" relevant to your platform enabled.
|
|
7 |
#
|
|
8 |
# For example, for disk monitoring with Net-SNMP, you'll want to set
|
|
9 |
# 'includeAllDisks' in the snmpd.conf. bsnmpd on FreeBSD benefits from
|
|
10 |
# the 'bsnmp-ucd' package. Etc.
|
|
11 |
#
|
|
12 |
|
|
13 |
require 'loggability'
|
|
14 |
require 'arborist/monitor' unless defined?( Arborist::Monitor )
|
|
15 |
require 'snmp'
|
|
16 |
|
|
17 |
using Arborist::TimeRefinements
|
|
18 |
|
|
19 |
# SNMP specific monitors and monitor logic.
|
|
20 |
#
|
|
21 |
class Arborist::Monitor::SNMP
|
|
22 |
extend Loggability
|
|
23 |
log_to :arborist
|
|
24 |
|
|
25 |
# The version of this library.
VERSION = '0.1.0'

# "Modes" that this monitor understands.
VALID_MODES = %i[ disk load memory swap process ].freeze

# The OID that returns the system environment.
IDENTIFICATION_OID = '1.3.6.1.2.1.1.1.0'

# For net-snmp systems, ignore mount types whose type OID matches
# this regular expression. This includes null/union mounts
# and NFS, currently. The dots are escaped so that they only ever
# match the literal separators of a dotted OID.
STORAGE_IGNORE = %r{25\.3\.9\.(?:2|14)$}

# The OID that matches a local windows hard disk. Anything else
# is a remote (SMB) mount.
WINDOWS_DEVICE = '1.3.6.1.2.1.25.2.1.4'

# OIDS required to pull disk information from net-snmp.
#
STORAGE_NET_SNMP = [
	'1.3.6.1.4.1.2021.9.1.2', # paths
	'1.3.6.1.2.1.25.3.8.1.4', # types
	'1.3.6.1.4.1.2021.9.1.9'  # percents
].freeze

# OIDS required to pull disk information from Windows.
#
STORAGE_WINDOWS = [
	'1.3.6.1.2.1.25.2.3.1.2', # types
	'1.3.6.1.2.1.25.2.3.1.3', # paths
	'1.3.6.1.2.1.25.2.3.1.5', # totalsize
	'1.3.6.1.2.1.25.2.3.1.6'  # usedsize
].freeze

# OIDS for discovering memory usage.
#
MEMORY = {
	swap_total: '1.3.6.1.4.1.2021.4.3.0',
	swap_avail: '1.3.6.1.4.1.2021.4.4.0',
	mem_avail:  '1.3.6.1.4.1.2021.4.6.0'
}.freeze

# OIDS for discovering system load.
#
LOAD = {
	five_min: '1.3.6.1.4.1.2021.10.1.3.2'
}.freeze

# OIDS for discovering running processes.
#
PROCESS = {
	list: '1.3.6.1.2.1.25.4.2.1.4',
	args: '1.3.6.1.2.1.25.4.2.1.5'
}.freeze


# Defaults for instances of this monitor. Any of these may be overridden
# by the options hash handed to the constructor.
#
DEFAULT_OPTIONS = {
	timeout:          2,
	retries:          1,
	community:        'public',
	port:             161,
	storage_error_at: 95,    # in percent full
	load_error_at:    7,     # 5 minute load average
	swap_error_at:    25,    # in percent of swap in use
	mem_error_at:     51200, # in kilobytes of available memory
	processes:        []     # list of procs to match
}.freeze
|
|
95 |
|
|
96 |
|
|
97 |
### Class-level entry point. This monitor is complex enough that callers
### have to configure an instance first, so when the class itself is handed
### to exec() directly we only log a friendlier hint and report nothing.
###
def self::run( nodes )
	hint = "Please use %s via an instance." % [ self.name ]
	self.log.error( hint )
	{}
end
|
|
104 |
|
|
105 |
|
|
106 |
### Create a new instance of this monitor.
###
### +options+ is a hash of settings layered on top of +DEFAULT_OPTIONS+
### (+nil+ is treated as "no overrides"). Every resulting key is applied
### through the matching writer method, so an unknown key raises a
### NoMethodError.
###
def initialize( options=DEFAULT_OPTIONS )
	settings = DEFAULT_OPTIONS.merge( options || {} )

	settings.each do |name, value|
		# Hand every instance its own copy of mutable values. Without this,
		# all monitors share the single default 'processes' array (and the
		# default community string), and changing one in place would
		# silently change the defaults of every other instance.
		case value
		when Array, Hash, String
			value = value.dup
		end

		self.public_send( "#{name}=", value )
	end
end
|
|
115 |
|
|
116 |
|
|
117 |
# Read-only state:
#
#   mode        - the mode (section) this SNMP instance should check;
#                 always one of the +VALID_MODES+.
#   identifiers - mapping of node addresses back to the node identifier.
#   results     - the results from the SNMP daemons, keyed by address.
attr_reader :mode, :identifiers, :results

# Connection settings:
#
#   timeout   - seconds to wait if the SNMP server isn't responding.
#   retries   - retry with the timeout this many times. Defaults to 1.
#   port      - the SNMP UDP port, if running on non default.
#   community - the community string to connect with.
attr_accessor :timeout, :retries, :port, :community

# Error thresholds:
#
#   storage_error_at - set an error if mount points are at or above this
#                      percentage.
#   load_error_at    - set an error if the 5 minute load average reaches this.
#   swap_error_at    - set an error if the percentage of swap in use
#                      reaches this.
#   mem_error_at     - set an error if available memory is at or below this
#                      many kilobytes.
#   processes        - set an error if processes in this array aren't running.
attr_accessor :storage_error_at, :load_error_at, :swap_error_at, :mem_error_at, :processes
|
|
153 |
|
|
154 |
|
|
155 |
### Set the SNMP mode, after validation.
###
### +mode+ may be a Symbol or a String naming one of the +VALID_MODES+.
### Anything else (including +nil+ and values that can't be turned into a
### Symbol) is logged as an error and leaves the current mode untouched.
### A successful change also clears any previously collected results.
###
def mode=( mode )
	# Values without #to_sym (nil, numbers, ...) can never be valid modes;
	# route them to the same error path instead of raising NoMethodError.
	candidate = mode.respond_to?( :to_sym ) ? mode.to_sym : nil

	unless VALID_MODES.include?( candidate )
		self.log.error "Unknown SNMP mode: %s" % [ mode ]
		return nil
	end

	@mode    = candidate
	@results = {}
end
|
|
166 |
|
|
167 |
|
|
168 |
### Perform the monitoring checks.
###
### +nodes+ is a hash of node identifier => node properties. The first entry
### of each node's 'addresses' property is polled over SNMP, one thread per
### host, using the check selected by #mode. Returns a hash of
### node identifier => result hash; a result carries an :error key whenever
### a threshold was crossed or the host could not be queried. Returns an
### empty hash if no mode has been set.
###
def run( nodes )
	self.log.debug "Got nodes to SNMP check: %p" % [ nodes ]

	# Sanity check.
	#
	unless self.mode
		self.log.error "You must set the 'mode' for the SNMP monitor. (%s)" % [ VALID_MODES.join( ', ' ) ]
		return {}
	end

	# Start every run from a clean slate, so hosts that were dropped since
	# the previous run don't pile up in the results.
	#
	@results = {}

	# Create mapping of addresses back to node identifiers. Nodes without
	# any address can't be polled, so they are left out instead of being
	# registered under a nil host.
	#
	@identifiers = nodes.each_with_object({}) do |(identifier, props), hash|
		address = Array( props['addresses'] ).first or next
		hash[ address ] = identifier
	end

	# Perform the work! Each host gets its own thread; every thread only
	# writes its own host's key in the results.
	#
	threads = self.identifiers.keys.map do |host|
		Thread.new do
			Thread.current.abort_on_exception = true
			opts = {
				host:      host,
				port:      self.port,
				community: self.community,
				timeout:   self.timeout,
				retries:   self.retries
			}

			begin
				SNMP::Manager.open( opts ) do |snmp|
					case self.mode
					when :disk
						self.gather_disks( snmp, host )
					when :load
						self.gather_load( snmp, host )
					when :memory
						self.gather_free_memory( snmp, host )
					when :swap
						self.gather_swap( snmp, host )
					when :process
						self.gather_processlist( snmp, host )
					end
				end
			rescue SNMP::RequestTimeout
				self.results[ host ] = {
					error: "Host is not responding to SNMP requests."
				}
			rescue StandardError => err
				self.results[ host ] = {
					error: "Network is not accessible. (%s: %s)" % [ err.class.name, err.message ]
				}
			end
		end
	end

	# Wait for thread completion
	threads.each( &:join )

	# Map everything back to identifier -> attribute(s), and send to the manager.
	#
	reply = self.results.each_with_object({}) do |(address, results), hash|
		identifier = self.identifiers[ address ] or next
		hash[ identifier ] = results
	end
	self.log.debug "Sending to manager: %p" % [ reply ]
	return reply
end
|
|
242 |
|
|
243 |
|
|
244 |
#########
|
|
245 |
protected
|
|
246 |
#########
|
|
247 |
|
|
248 |
### Fetch the 5 minute load average of +host+ over an existing (and open)
### +snmp+ connection and record it in the results, together with an error
### message when it has reached the configured limit.
###
def gather_load( snmp, host )
	self.log.debug "Getting system load for: %s" % [ host ]

	oid      = SNMP::ObjectId.new( LOAD[:five_min] )
	response = snmp.get( oid )
	load5    = response.varbind_list.first.value.to_f
	self.log.debug " Load on %s: %0.2f" % [ host, load5 ]

	status = {}
	if load5 >= self.load_error_at
		status[ :error ] = "Load has exceeded %0.2f over a 5 minute average" % [ self.load_error_at ]
	end
	status[ :load5 ] = load5

	self.results[ host ] = status
end
|
|
265 |
|
|
266 |
|
|
267 |
### Fetch the amount of available memory on +host+ over an existing (and
### open) +snmp+ connection and record it in the results, together with an
### error message when it is at or below the configured minimum.
###
def gather_free_memory( snmp, host )
	self.log.debug "Getting available memory for: %s" % [ host ]

	varbind   = snmp.get( SNMP::ObjectId.new(MEMORY[:mem_avail]) ).varbind_list.first
	mem_avail = varbind.value.to_f
	self.log.debug " Available memory on %s: %0.2f" % [ host, mem_avail ]

	report =
		if mem_avail <= self.mem_error_at
			{
				error: "Available memory is under %0.1fMB" % [ self.mem_error_at.to_f / 1024 ],
				available_memory: mem_avail
			}
		else
			{ available_memory: mem_avail }
		end

	self.results[ host ] = report
end
|
|
284 |
|
|
285 |
|
|
286 |
### Collect used swap information for +host+ from an existing (and
### open) +snmp+ connection.
###
### The percentage of swap in use (rounded to two decimals) is stored in the
### results under :swap_used, with an :error message added when it reaches
### the configured +swap_error_at+ percentage.
###
def gather_swap( snmp, host )
	self.log.debug "Getting used swap for: %s" % [ host ]

	swap_total = snmp.get( SNMP::ObjectId.new(MEMORY[:swap_total]) ).varbind_list.first.value.to_f
	swap_avail = snmp.get( SNMP::ObjectId.new(MEMORY[:swap_avail]) ).varbind_list.first.value.to_f

	# A host that reports a swap total of zero has nothing in use. Say so
	# explicitly rather than relying on a NaN/Infinity from the division
	# happening to collapse to zero further down the line.
	swap_used =
		if swap_total.zero?
			0.0
		else
			( (swap_avail / swap_total * 100) - 100 ).abs.round( 2 )
		end
	self.log.debug " Swap in use on %s: %0.2f" % [ host, swap_used ]

	if swap_used >= self.swap_error_at
		self.results[ host ] = {
			error: "%0.2f%% swap in use" % [ swap_used ],
			swap_used: swap_used
		}
	else
		self.results[ host ] = { swap_used: swap_used }
	end
end
|
|
306 |
|
|
307 |
|
|
308 |
### Look up how full each mount point on +host+ is over an existing (and
### open) +snmp+ connection. The results get the complete mount => percentage
### table, plus an error listing every mount that has reached the configured
### capacity limit.
###
def gather_disks( snmp, host )
	self.log.debug "Getting disk information for %s" % [ host ]
	mounts = self.get_disk_percentages( snmp )

	over_limit = mounts.select {|_path, percentage| percentage >= self.storage_error_at }
	complaints = over_limit.map do |path, percentage|
		"Mount %s at %d%% capacity" % [ path, percentage ]
	end

	report = { mounts: mounts }
	report[ :error ] = complaints.join( ', ' ) unless complaints.empty?

	self.results[ host ] = report
end
|
|
328 |
|
|
329 |
|
|
330 |
### Collect running processes on +host+ from an existing (and open)
### +snmp+ connection.
###
### Every entry of #processes is used as a regular expression and matched
### against the "name arguments" line of each running process. The result
### for +host+ is an empty hash when all of them are found, or a hash with
### an :error naming each one that isn't.
###
def gather_processlist( snmp, host )
	self.log.debug "Getting running process list for %s" % [ host ]
	procs = []

	snmp.walk([ PROCESS[:list], PROCESS[:args] ]) do |list|
		process = list[0].value.to_s
		args    = list[1].value.to_s
		procs << "%s %s " % [ process, args ]
	end

	# Check against the running stuff, setting an error for each
	# expected process that isn't found.
	#
	missing = Array( self.processes ).reject do |process|
		pattern = Regexp.new( process )
		procs.any? {|running| running.match?(pattern) }
	end

	# One placeholder, one argument: a surplus argument makes the format
	# call raise an ArgumentError when Ruby runs with $DEBUG enabled.
	errors = missing.map {|process| "Process '%s' is not running" % [ process ] }

	self.log.debug " %d running processes" % [ procs.length ]
	if errors.empty?
		self.results[ host ] = {}
	else
		self.results[ host ] = { error: errors.join( ', ' ) }
	end
end
|
|
360 |
|
|
361 |
|
|
362 |
### Given a SNMP object, return a hash of:
###
###    device path => percentage full
###
### The system description decides which set of OIDs is walked: hosts that
### identify themselves as Windows use +STORAGE_WINDOWS+ (local disks only,
### percentage computed from used/total size), everything else uses
### +STORAGE_NET_SNMP+ (percentage as reported by the agent).
###
def get_disk_percentages( snmp )

	# Does this look like a windows system, or a net-snmp based one?
	system_type = snmp.get( SNMP::ObjectId.new( IDENTIFICATION_OID ) ).varbind_list.first.value
	disks = {}

	# Windows has its own MIBs.
	#
	if system_type.to_s.match?( /windows/i )
		snmp.walk( STORAGE_WINDOWS ) do |list|
			next unless list[0].value.to_s == WINDOWS_DEVICE

			# Storage reporting a total size of zero has no meaningful
			# fill level; dividing by it would put NaN/Infinity in the results.
			total = list[2].value.to_f
			next if total.zero?

			disks[ list[1].value.to_s ] = ( list[3].value.to_f / total ) * 100
		end
		return disks
	end

	# Everything else.
	#
	snmp.walk( STORAGE_NET_SNMP ) do |list|
		mount = list[0].value.to_s
		next if mount == 'noSuchInstance'

		next if list[2].value.to_s == 'noSuchInstance'
		used = list[2].value.to_i

		# The type is normally an OID (joinable into dotted form). If the
		# agent returned something else for it, keep the mount and just use
		# the value's string form rather than failing the whole host.
		type    = list[1].value
		typeoid = type.respond_to?( :join ) ? type.join( '.' ) : type.to_s
		next if STORAGE_IGNORE.match?( typeoid )
		next if mount.match?( /\/(?:dev|proc)$/ )

		self.log.debug " %s -> %s -> %s" % [ mount, typeoid, used ]
		disks[ mount ] = used
	end

	return disks
end
|
|
401 |
end # class Arborist::Monitor::SNMP
|
|
402 |
|