Utoljára aktív 1 month ago

Erreur32's Avatar Erreur32 gist felülvizsgálása 2 years ago. Revízióhoz ugrás

1 file changed, 553 insertions

munin_docker_(fájl létrehozva)

@@ -0,0 +1,553 @@
1 + #!/usr/bin/env python3
2 + """
3 + =head1 NAME
4 +
5 + docker_ - Docker wildcard-plugin to monitor a L<Docker|https://www.docker.com> host.
6 +
7 + This wildcard plugin provides series C<containers>, C<images>, C<status>,
8 + C<volumes>, C<cpu>, C<memory> and C<network> as separate graphs. It also
9 + supports a C<multi> suffix that provides all of those as a multigraph.
10 +
11 + =head1 INSTALLATION
12 +
13 + - Copy this plugin in your munin plugins directory
14 + - Install Python3 "docker" package
15 +
16 + =over 2
17 +
18 + If you want all the graphs as a multigraph, create a single multi symlink.
19 +
20 + ln -s /usr/share/munin/plugins/docker_ /etc/munin/plugins/docker_multi
21 +
22 + Or choose a subset of those you want.
23 +
24 + ln -s /usr/share/munin/plugins/docker_ /etc/munin/plugins/docker_containers
25 + ln -s /usr/share/munin/plugins/docker_ /etc/munin/plugins/docker_cpu
26 + ln -s /usr/share/munin/plugins/docker_ /etc/munin/plugins/docker_images
27 + ln -s /usr/share/munin/plugins/docker_ /etc/munin/plugins/docker_memory
28 + ln -s /usr/share/munin/plugins/docker_ /etc/munin/plugins/docker_network
29 + ln -s /usr/share/munin/plugins/docker_ /etc/munin/plugins/docker_status
30 + ln -s /usr/share/munin/plugins/docker_ /etc/munin/plugins/docker_volumes
31 +
32 + =back
33 +
34 + After the installation you need to restart your munin-node:
35 +
36 + =over 2
37 +
38 + systemctl restart munin-node
39 +
40 + =back
41 +
42 + =head1 CONFIGURATION
43 +
44 + This plugin needs to run as root. Create a file named docker in the
45 + directory /etc/munin/plugin-conf.d/ with the following config (you can also use
46 + Docker environment variables here as described in
47 + https://docs.docker.com/compose/reference/envvars/):
48 +
49 + You can use the EXCLUDE_CONTAINER_NAME environment variable to specify a regular expression;
50 + containers whose name matches it are excluded from all container graphs.
51 +
52 + For example
53 +
54 + env.EXCLUDE_CONTAINER_NAME runner
55 +
56 + Would exclude all containers with the word "runner" in the name.
57 +
58 +
59 + =over 2
60 +
61 + [docker_*]
62 + group docker
63 + env.DOCKER_HOST unix://run/docker.sock
64 + env.EXCLUDE_CONTAINER_NAME regexp
65 +
66 + =back
67 +
68 + You may need to pick a different group depending on the name schema of your
69 + distribution. Or maybe use "user root", if nothing else works.
70 +
71 + =head1 AUTHORS
72 +
73 + This section has been reverse-engineered from git logs
74 +
75 + Codimp <[email protected]>: original rewrite
76 +
77 + Rowan Wookey <[email protected]>: performance improvement
78 +
79 + Olivier Mehani <[email protected]>: Network support, ClientWrapper, general cleanup, multigraph
80 +
81 + =head1 MAGIC MARKERS
82 +
83 + #%# family=auto
84 + #%# capabilities=autoconf suggest multigraph
85 +
86 + =cut
87 + """
88 +
89 + import os
90 + import sys
91 + import re
92 + try:
93 + from functools import cached_property
94 + except ImportError:
95 + # If cached_property is not available,
96 + # just use the property decorator, without caching
97 + # This is for backward compatibility with Python<3.8
98 + cached_property = property
99 + from multiprocessing import Process, Queue
100 +
101 +
def sorted_by_creation_date(func):
    """
    Decorator: return the result of ``func`` as a list sorted by creation date.

    Each item is ordered by its ``attrs['CreatedAt']`` value when that key
    exists, and by ``attrs['Created']`` otherwise.
    """
    def creation_date(item):
        # Some objects expose 'CreatedAt', the others only 'Created'.
        if 'CreatedAt' in item.attrs:
            return item.attrs['CreatedAt']
        return item.attrs['Created']

    def sorted_func(*args, **kwargs):
        unsorted = func(*args, **kwargs)
        return sorted(unsorted, key=creation_date)

    return sorted_func
113 +
114 +
def clean_fieldname(text):
    """
    Turn ``text`` into a string usable as a munin field name.

    A leading character that is not a letter or underscore, and any other
    character that is not alphanumeric or underscore, becomes "_".
    """
    # "root" is a magic (forbidden) word, so it gets a leading underscore
    if text != "root":
        pattern = re.compile(r"(^[^A-Za-z_]|[^A-Za-z0-9_])")
        return pattern.sub("_", text)
    return "_root"
121 +
122 +
class ClientWrapper:
    """
    Thin facade over the docker client which gathers the listing and
    filtering logic in one place; each listing is computed once and then
    cached when ``functools.cached_property`` is available.

    When the exclude_re parameter is given, any container whose name is
    matched by that regular expression is left out of ``all_containers``.
    """
    client = None
    exclude = None

    def __init__(self, client, exclude_re=None):
        if exclude_re:
            self.exclude = re.compile(exclude_re)
        self.client = client

    @property
    def api(self):
        # Low-level API object of the wrapped client.
        return self.client.api

    @cached_property
    @sorted_by_creation_date
    def all_containers(self):
        # Despite the name, only running containers that are not matched by
        # the exclusion pattern are kept.
        selected = []
        for candidate in self.client.containers.list(all=True):
            if candidate.status != 'running':
                continue
            if self.exclude and self.exclude.search(candidate.name):
                continue
            selected.append(candidate)
        return selected

    @cached_property
    @sorted_by_creation_date
    def intermediate_images(self):
        # Everything listed with all=True, minus the regular (non-dangling) images.
        everything = set(self.all_images)
        regular = set(self.images)
        dangling = set(self.dangling_images)
        return list(everything - (regular - dangling))

    @cached_property
    @sorted_by_creation_date
    def all_images(self):
        return self.client.images.list(all=True)

    @cached_property
    @sorted_by_creation_date
    def images(self):
        # Default image listing with the dangling images removed.
        listed = self.client.images.list()
        return list(set(listed) - set(self.dangling_images))

    @cached_property
    @sorted_by_creation_date
    def dangling_images(self):
        return self.client.images.list(filters={'dangling': True})

    @cached_property
    @sorted_by_creation_date
    def volumes(self):
        return self.client.volumes.list()
188 +
189 +
def container_summary(container, *args):
    """
    Return the container name, followed by its attributes in parentheses
    when there are any.

    Extra positional arguments are forwarded to ``container_attributes``.
    """
    name = container.name
    details = container_attributes(container, *args)
    if not details:
        return name
    return name + f' ({details})'
196 +
197 +
def container_attributes(container, *args):
    """
    Return a comma-separated description of ``container``.

    The description is made of the tags of the container's image, the
    container's ``attrs['Created']`` value and any extra strings passed
    in ``args``, in that order.
    """
    # Work on a copy: appending to the list returned by ``image.tags`` would
    # alter the image object's own tag list if that list is shared.
    attributes = list(container.image.tags)
    attributes.append(container.attrs['Created'])
    attributes.extend(args)
    return ', '.join(attributes)
202 +
203 +
def print_containers_status(client):
    """
    Print, for each reported state, the number of containers in that state
    (``<state>.value``) and a summary of those containers (``<state>.extinfo``).

    Running containers are inspected through the API; those whose health
    status is 'unhealthy' are counted as unhealthy instead of running.
    Containers whose status is not one of the reported ones are ignored.
    """
    reported = ('running', 'unhealthy', 'paused', 'created',
                'restarting', 'removing', 'exited', 'dead')
    # States taken verbatim from container.status; 'unhealthy' is only ever
    # derived from the health check of a running container.
    direct = frozenset(reported) - {'running', 'unhealthy'}
    groups = {state: [] for state in reported}

    for container in client.all_containers:
        current = container.status
        if current == 'running':
            details = client.api.inspect_container(container.name)['State']
            health = details.get('Health', {}).get('Status')
            target = 'unhealthy' if health == 'unhealthy' else 'running'
            groups[target].append(container)
        elif current in direct:
            groups[current].append(container)

    for state in reported:
        members = groups[state]
        print(f'{state}.value', len(members))
        print(f'{state}.extinfo', ', '.join(container_summary(c) for c in members))
248 +
249 +
def image_summary(image):
    """
    Return a one-line description of ``image``.

    The result has the form ``<short_id> (<tags...>, <created>, <size> MiB)``
    where the size is ``attrs['Size']`` converted to MiB and rounded to two
    decimals.
    """
    # Work on a copy: appending to the list returned by ``image.tags`` would
    # alter the image object's own tag list if that list is shared, and make
    # repeated summaries of the same image grow.
    attributes = list(image.tags)
    attributes.append(image.attrs['Created'])
    attributes.append(f"{round(image.attrs['Size']/1024**2, 2)} MiB")
    return f"{image.short_id} ({', '.join(attributes)})"
255 +
256 +
def print_images_count(client):
    """
    Print the number of intermediate, regular and dangling images, each
    followed by an ``extinfo`` line summarising the images counted.
    """
    regular = client.images
    intermediate = client.intermediate_images
    dangling = client.dangling_images

    report = (
        ('intermediate_quantity', intermediate),
        ('images_quantity', regular),
        ('dangling_quantity', dangling),
    )
    for field, group in report:
        print(f'{field}.value', len(group))
        print(f'{field}.extinfo', ', '.join([image_summary(i) for i in group]))
268 +
269 +
def get_container_stats(container, q):
    """Fetch one non-streamed stats sample of ``container`` and put it on queue ``q``."""
    sample = container.stats(stream=False)
    q.put(sample)
272 +
273 +
def parallel_container_stats(client):
    """
    Collect one stats sample per container, fetching them in parallel.

    One child process is started per container in ``client.all_containers``;
    each child puts its sample on a dedicated queue.

    Returns the ``(container, stats)`` pairs as a dict items view.
    """
    workers = []
    for container in client.all_containers:
        q = Queue()
        p = Process(target=get_container_stats, args=(container, q))
        p.start()
        workers.append((container, p, q))

    stats = {}
    for container, p, q in workers:
        # Read the result BEFORE joining: a child that has put data on a
        # queue does not terminate until that data has been flushed to the
        # pipe, so joining first can deadlock on a large enough sample.
        stats[container] = q.get()
        p.join()
    return stats.items()
286 +
287 +
def _cpu_percent(stats):
    """
    Compute the CPU usage percentage from one stats sample.

    The usage is the container CPU time delta divided by the system CPU time
    delta, times 100, scaled by the number of CPUs. Returns 0.0 when the
    sample lacks the needed counters or when the system delta is not positive.
    """
    cpu_stats = stats.get("cpu_stats") or {}
    precpu_stats = stats.get("precpu_stats") or {}
    try:
        cpu_delta = (float(cpu_stats["cpu_usage"]["total_usage"])
                     - float(precpu_stats["cpu_usage"]["total_usage"]))
        system_delta = (float(cpu_stats["system_cpu_usage"])
                        - float(precpu_stats["system_cpu_usage"]))
    except (KeyError, TypeError):
        # Incomplete sample (some counters are missing or null):
        # report no usage instead of crashing the whole plugin run.
        return 0.0
    if system_delta <= 0.0:
        return 0.0
    # Prefer the CPU count reported with the sample, which is correct even
    # when the Docker daemon is remote; fall back to the per-CPU usage list
    # and finally to the CPU count of the host running this plugin.
    online_cpus = (
        cpu_stats.get("online_cpus")
        or len(cpu_stats["cpu_usage"].get("percpu_usage") or [])
        or os.cpu_count()
        or 1
    )
    return cpu_delta / system_delta * 100.0 * online_cpus


def print_containers_cpu(client):
    """
    Print the CPU usage percentage (``.value``) and the attributes
    (``.extinfo``) of every container, using one stats sample each.
    """
    for container, stats in parallel_container_stats(client):
        cpu_percent = _cpu_percent(stats)
        clean_container_name = clean_fieldname(container.name)
        print(clean_container_name + '.value', cpu_percent)
        print(clean_container_name + '.extinfo', container_attributes(container))
300 +
301 +
def print_containers_memory(client):
    """
    Print the memory usage (``.value``) and the attributes (``.extinfo``)
    of every container, using one stats sample each.

    The ``total_rss`` counter is preferred when the sample provides it;
    otherwise the overall ``usage`` counter is reported. When the sample
    carries no memory figures at all, the munin "unknown" value ``U`` is
    printed instead of failing.
    """
    for container, stats in parallel_container_stats(client):
        memory_stats = stats.get('memory_stats') or {}
        detailed = memory_stats.get('stats') or {}
        if 'total_rss' in detailed:  # cgroupv1 only?
            memory_usage = detailed['total_rss']
            extinfo = 'Resident Set Size'
        else:
            # 'U' tells munin that no value is available for this field.
            memory_usage = memory_stats.get('usage', 'U')
            extinfo = 'Total memory usage'
        clean_container_name = clean_fieldname(container.name)
        print(clean_container_name + '.value', memory_usage)
        print(clean_container_name + '.extinfo', container_attributes(container, extinfo))
313 +
314 +
def print_containers_network(client):
    """
    Print, for every container, the bytes sent (``_up.value``) and received
    (``_down.value``) summed over all its networks, plus the container
    attributes as ``_up.extinfo``.

    A sample without a "networks" entry counts as zero traffic.
    """
    for container, stats in parallel_container_stats(client):
        interfaces = stats['networks'].values() if "networks" in stats else ()
        sent = 0
        received = 0
        for counters in interfaces:
            sent += counters['tx_bytes']
            received += counters['rx_bytes']
        prefix = clean_fieldname(container.name)
        print(f'{prefix}_up.value', sent)
        print(f'{prefix}_down.value', received)
        print(f'{prefix}_up.extinfo', container_attributes(container))
327 +
328 +
def volume_summary(volume):
    """
    Return the short id of ``volume``, followed by its labels in
    parentheses when ``attrs['Labels']`` is not empty.
    """
    ident = f"{volume.short_id}"
    if not volume.attrs['Labels']:
        return ident
    labels = ', '.join(volume.attrs['Labels'])
    return ident + f" ({labels})"
334 +
335 +
def status(client, mode):
    """
    Status graph: print its munin configuration when ``mode`` is "config",
    otherwise print the current number of containers per state.
    """
    if mode != "config":
        print_containers_status(client)
        return

    config = (
        "graph_title Docker status",
        "graph_vlabel containers",
        "graph_category virtualization",
        "graph_total All containers",
        "running.label RUNNING",
        "running.draw AREASTACK",
        "running.info Running containers can be manipulated with "
        "`docker container [attach|kill|logs|pause|restart|stop] <NAME>` or "
        "commands run in them with `docker container exec "
        "[--detach|--interactive,--privileged,--tty] <NAME> <COMMAND>`",
        "unhealthy.label UNHEALTHY",
        "unhealthy.draw AREASTACK",
        "unhealthy.warning 1",
        "unhealthy.info Unhealthy containers can be restarted with "
        "`docker container restart <NAME>`",
        "paused.label PAUSED",
        "paused.draw AREASTACK",
        "paused.info Paused containers can be resumed with "
        "`docker container unpause <NAME>`",
        "created.label CREATED",
        "created.draw AREASTACK",
        "created.info New containers can be created with "
        "`docker container create --name <NAME> <IMAGE_ID >` or "
        "`docker container run --name <NAME> <IMAGE_ID> <COMMAND>`",
        "restarting.label RESTARTING",
        "restarting.draw AREASTACK",
        "restarting.info Containers can be restarted with "
        "`docker container restart <NAME>`",
        "removing.label REMOVING",
        "removing.draw AREASTACK",
        "removing.info Containers can be removed with "
        "`docker container rm <NAME>`",
        "exited.label EXITED",
        "exited.draw AREASTACK",
        "exited.info Exited containers can be started with "
        "`docker container start [--attach] <NAME>`",
        "dead.label DEAD",
        "dead.draw AREASTACK",
        "dead.warning 1",
        "dead.info Dead containers can be started with "
        "`docker container start <NAME>`",
    )
    # One write for the whole configuration; same bytes as one print per line.
    print("\n".join(config))
382 +
383 +
def containers(client, mode):
    """
    Containers graph: print its munin configuration when ``mode`` is
    "config", otherwise print the number of entries in ``client.all_containers``.
    """
    if mode != "config":
        print('containers_quantity.value', len(client.all_containers))
        return

    for line in (
        "graph_title Docker containers",
        "graph_vlabel containers",
        "graph_category virtualization",
        "containers_quantity.label Containers",
    ):
        print(line)
392 +
393 +
def images(client, mode):
    """
    Images graph: print its munin configuration when ``mode`` is "config",
    otherwise print the current image counts.
    """
    if mode == "config":
        print("graph_title Docker images")
        print("graph_vlabel images")
        print("graph_category virtualization")
        print("graph_total All images")
        print("intermediate_quantity.label Intermediate images")
        print("intermediate_quantity.draw AREASTACK")
        print("intermediate_quantity.info All unused images can be deleted with "
              "`docker image prune --all`")
        print("images_quantity.label Images")
        print("images_quantity.draw AREASTACK")
        print("images_quantity.info Images can be used in containers with "
              "`docker container create --name <NAME> <IMAGE_ID >` or "
              "`docker container run --name <NAME> <IMAGE_ID> <COMMAND>`")
        print("dangling_quantity.label Dangling images")
        print("dangling_quantity.draw AREASTACK")
        # The separating space before "or" was missing, which glued the two
        # commands together in the rendered help text.
        print("dangling_quantity.info Dangling images can be deleted with "
              "`docker image prune` "
              "or tagged with `docker image tag <IMAGE_ID> <NAME>`")
        print("dangling_quantity.warning 10")
    else:
        print_images_count(client)
417 +
418 +
def volumes(client, mode):
    """
    Volumes graph: print its munin configuration when ``mode`` is "config",
    otherwise print the number of volumes and a summary of each of them.
    """
    if mode != "config":
        print('volumes_quantity.value', len(client.volumes))
        summaries = [volume_summary(v) for v in client.volumes]
        print('volumes_quantity.extinfo', ', '.join(summaries))
        return

    config = (
        "graph_title Docker volumes",
        "graph_vlabel volumes",
        "graph_category virtualization",
        "volumes_quantity.label Volumes",
        "volumes_quantity.draw AREASTACK",
        "volumes_quantity.info Unused volumes can be deleted with "
        "`docker volume prune`",
    )
    print("\n".join(config))
431 +
432 +
def cpu(client, mode):
    """
    CPU graph: print its munin configuration (one stacked field per
    container) when ``mode`` is "config", otherwise print the current CPU
    usage of every container.
    """
    if mode != "config":
        print_containers_cpu(client)
        return

    # Upper limit of the graph: 100 (percent) per CPU reported by os.cpu_count().
    graphlimit = str(os.cpu_count() * 100)
    header = (
        "graph_title Docker containers CPU usage",
        "graph_args --base 1000 -r --lower-limit 0 --upper-limit " + graphlimit,
        "graph_scale no",
        "graph_period second",
        "graph_vlabel CPU usage (%)",
        "graph_category virtualization",
        "graph_info This graph shows docker container CPU usage.",
        "graph_total Total CPU usage",
    )
    for line in header:
        print(line)

    for container in client.all_containers:
        fieldname = clean_fieldname(container.name)
        print(f"{fieldname}.label {container.name}")
        print(f"{fieldname}.draw AREASTACK")
        print(f"{fieldname}.info {container_attributes(container)}")
451 +
452 +
def network(client, mode):
    """
    Network graph: print its munin configuration (an up/down field pair per
    container) when ``mode`` is "config", otherwise print the current
    traffic counters of every container.
    """
    if mode != "config":
        print_containers_network(client)
        return

    header = (
        "graph_title Docker containers network usage",
        "graph_args --base 1024 -l 0",
        "graph_vlabel bits in (-) / out (+) per ${graph_period}",
        "graph_category virtualization",
        "graph_info This graph shows docker container network usage.",
        "graph_total Total network usage",
    )
    for line in header:
        print(line)

    for container in client.all_containers:
        fieldname = clean_fieldname(container.name)
        down = f"{fieldname}_down"
        up = f"{fieldname}_up"
        # The "down" series is hidden and drawn as the negative of "up".
        print(f"{down}.label {container.name}_received")
        print(f"{down}.type DERIVE")
        print(f"{down}.min 0")
        print(f"{down}.graph no")
        print(f"{down}.cdef {down},8,*")
        print(f"{up}.label {container.name}")
        print(f"{up}.draw LINESTACK1")
        print(f"{up}.type DERIVE")
        print(f"{up}.min 0")
        print(f"{up}.negative {down}")
        print(f"{up}.cdef {up},8,*")
        print(f"{up}.info {container_attributes(container)}")
477 +
478 +
def memory(client, mode):
    """
    Memory graph: print its munin configuration (one stacked field per
    container) when ``mode`` is "config", otherwise print the current memory
    usage of every container.
    """
    if mode != "config":
        print_containers_memory(client)
        return

    header = (
        "graph_title Docker containers memory usage",
        "graph_args --base 1024 -l 0",
        "graph_vlabel Bytes",
        "graph_category virtualization",
        "graph_info This graph shows docker container memory usage.",
        "graph_total Total memory usage",
    )
    for line in header:
        print(line)

    for container in client.all_containers:
        fieldname = clean_fieldname(container.name)
        print(f"{fieldname}.label {container.name}")
        print(f"{fieldname}.draw AREASTACK")
        print(f"{fieldname}.info {container_attributes(container)}")
494 +
495 +
def main():
    """
    Entry point of the plugin.

    The mode is the first command line argument ("autoconf", "suggest",
    "config" or nothing to fetch values). The series to report is taken from
    the name the plugin was called as: the part following "docker_" in the
    file name, e.g. "cpu" for a symlink named "docker_cpu" or "multi" for
    all the series as a multigraph.

    Exits with status 1 when the docker client cannot be set up (0 in
    autoconf mode) or when the series is unknown.
    """
    try:
        mode = sys.argv[1]
    except IndexError:
        mode = ""
    # Only the file name matters: a directory containing "docker_" must not
    # be mistaken for the series, and a name without "docker_" yields an
    # empty series (reported as unknown below) instead of an IndexError.
    plugin_name = os.path.basename(sys.argv[0])
    wildcard = plugin_name.partition("docker_")[2].split("_")[0]

    try:
        import docker
        client = docker.from_env()
        if mode == "autoconf":
            client.ping()
            print('yes')
            sys.exit(0)
    except Exception as e:
        print(f'no ({e})')
        if mode == "autoconf":
            # autoconf must succeed even when the answer is "no"
            sys.exit(0)
        sys.exit(1)

    if mode == "suggest":
        # The multigraph covers all other graphs,
        # so we only need to suggest one
        print("multi")
        sys.exit(0)

    client = ClientWrapper(client,
                           exclude_re=os.getenv('EXCLUDE_CONTAINER_NAME'))

    # Series name -> function printing that series; the insertion order is
    # the order in which the multigraph reports them.
    series = {
        'containers': containers,
        'cpu': cpu,
        'images': images,
        'memory': memory,
        'network': network,
        'status': status,
        'volumes': volumes,
    }

    if wildcard in series:
        series[wildcard](client, mode)
    elif wildcard == 'multi':
        for name, graph in series.items():
            print(f'multigraph docker_{name}')
            graph(client, mode)
    else:
        print(f'unknown series ({wildcard})', file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()
553 +
Újabb Régebbi