Monitoring

This commit is contained in:
2025-11-14 00:47:26 +01:00
parent 50437d95d9
commit 97133df5ce
18 changed files with 426 additions and 452 deletions

View File

@@ -0,0 +1,18 @@
# Alertmanager configuration: every alert is routed to a single e-mail
# receiver, grouped by alert name.
global:
  # How long to wait before declaring an alert resolved once it stops
  # being reported.
  resolve_timeout: 5m

route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'email-notifications'

receivers:
  - name: 'email-notifications'
    email_configs:
      - to: 'admin@delmar.bzh'
        from: 'noreply@delmar.bzh'
        # Port 587 is the SMTP submission port (STARTTLS); the previous
        # value `58` is not an SMTP port and looks like a truncated 587.
        smarthost: 'pro1.mail.ovh.net:587'
        auth_username: 'admin@delmar.bzh'
        # NOTE(review): this credential is committed in plain text. Rotate it
        # and consider `auth_password_file` pointing at a file kept out of
        # version control.
        auth_password: 'sxS4GA8rBfmFkCFL'

1
monitoring/compose.env Normal file
View File

@@ -0,0 +1 @@
# Grafana admin password; the compose file reads it as ${GRAFANA_PASSWORD}
# for GF_SECURITY_ADMIN_PASSWORD.
# NOTE(review): this secret is committed in plain text — consider rotating it
# and keeping this file out of version control.
GRAFANA_PASSWORD="XbJ6do@xT8478c"

View File

@@ -0,0 +1,96 @@
# bob (mon.delmar.bzh)
# Monitoring stack: Prometheus, node-exporter, cAdvisor, Grafana and
# Alertmanager, all attached to the `monitoring` bridge network.
---
name: monitoring

volumes:
  prometheus_data: {}
  grafana_data: {}

networks:
  monitoring:
    driver: bridge

services:
  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    volumes:
      # Configuration and rule files come from the repository checkout.
      - ./prometheus:/etc/prometheus
      # Time-series data lives in a named volume.
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
      - '--web.console.templates=/usr/share/prometheus/consoles'
      - '--storage.tsdb.retention.time=15d'
      - '--storage.tsdb.wal-compression'
      - '--web.enable-lifecycle'
    ports:
      - "9090:9090"
    networks:
      - monitoring
    restart: unless-stopped

  node-exporter:
    image: prom/node-exporter:latest
    container_name: node-exporter
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      # Point the exporter at the host root mounted above so filesystem
      # metrics describe the host rather than the container.
      - '--path.rootfs=/rootfs'
      # `mount-points-exclude` replaces the deprecated `ignored-mount-points`
      # flag. `$$` is Compose's escape for a literal `$`, so the regex anchor
      # does not depend on how a bare `$` is handled by interpolation.
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    ports:
      - "9100:9100"
    networks:
      - monitoring
    restart: unless-stopped

  cadvisor:
    image: gcr.io/cadvisor/cadvisor:latest
    container_name: cadvisor
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
      - /dev/disk/:/dev/disk:ro
    ports:
      - "8080:8080"
    networks:
      - monitoring
    restart: unless-stopped

  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    user: "1000:1000"
    volumes:
      - grafana_data:/var/lib/grafana
      - ./grafana/provisioning:/etc/grafana/provisioning
      - ./grafana/dashboards:/var/lib/grafana/dashboards
    environment:
      - GF_SECURITY_ADMIN_USER=admin
      # GRAFANA_PASSWORD must be present in the environment Compose uses for
      # interpolation (presumably supplied via compose.env — confirm).
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_PASSWORD}
      - GF_USERS_ALLOW_SIGN_UP=false
    ports:
      # Grafana listens on 3000 in the container; published on host port 11000.
      - "11000:3000"
    networks:
      - monitoring
    restart: unless-stopped

  alertmanager:
    image: prom/alertmanager:latest
    container_name: alertmanager
    volumes:
      - ./alertmanager:/etc/alertmanager
    command:
      - '--config.file=/etc/alertmanager/config.yml'
      - '--storage.path=/alertmanager'
    ports:
      - "9093:9093"
    networks:
      - monitoring
    restart: unless-stopped

View File

@@ -0,0 +1,96 @@
# bob (mon.delmar.bzh)
# Monitoring stack: Prometheus, node-exporter, cAdvisor, Grafana and
# Alertmanager, all attached to the `monitoring` bridge network.
---
name: monitoring

volumes:
  prometheus_data: {}
  grafana_data: {}

networks:
  monitoring:
    driver: bridge

services:
  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    volumes:
      # Configuration and rule files come from the repository checkout.
      - ./prometheus:/etc/prometheus
      # Time-series data lives in a named volume.
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
      - '--web.console.templates=/usr/share/prometheus/consoles'
      - '--storage.tsdb.retention.time=15d'
      - '--storage.tsdb.wal-compression'
      - '--web.enable-lifecycle'
    ports:
      - "9090:9090"
    networks:
      - monitoring
    restart: unless-stopped

  node-exporter:
    image: prom/node-exporter:latest
    container_name: node-exporter
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      # Point the exporter at the host root mounted above so filesystem
      # metrics describe the host rather than the container.
      - '--path.rootfs=/rootfs'
      # `mount-points-exclude` replaces the deprecated `ignored-mount-points`
      # flag. `$$` is Compose's escape for a literal `$`, so the regex anchor
      # does not depend on how a bare `$` is handled by interpolation.
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    ports:
      - "9100:9100"
    networks:
      - monitoring
    restart: unless-stopped

  cadvisor:
    image: gcr.io/cadvisor/cadvisor:latest
    container_name: cadvisor
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
      - /dev/disk/:/dev/disk:ro
    ports:
      - "8080:8080"
    networks:
      - monitoring
    restart: unless-stopped

  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    user: "1000:1000"
    volumes:
      - grafana_data:/var/lib/grafana
      - ./grafana/provisioning:/etc/grafana/provisioning
      - ./grafana/dashboards:/var/lib/grafana/dashboards
    environment:
      - GF_SECURITY_ADMIN_USER=admin
      # GRAFANA_PASSWORD must be present in the environment Compose uses for
      # interpolation (presumably supplied via compose.env — confirm).
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_PASSWORD}
      - GF_USERS_ALLOW_SIGN_UP=false
    ports:
      # Grafana listens on 3000 in the container; published on host port 11000.
      - "11000:3000"
    networks:
      - monitoring
    restart: unless-stopped

  alertmanager:
    image: prom/alertmanager:latest
    container_name: alertmanager
    volumes:
      - ./alertmanager:/etc/alertmanager
    command:
      - '--config.file=/etc/alertmanager/config.yml'
      - '--storage.path=/alertmanager'
    ports:
      - "9093:9093"
    networks:
      - monitoring
    restart: unless-stopped

View File

@@ -0,0 +1,37 @@
{
  "title": "System Overview",
  "uid": "system-overview",
  "version": 1,
  "panels": [
    {
      "title": "CPU Usage",
      "type": "gauge",
      "gridPos": {"h": 8, "w": 6, "x": 0, "y": 0},
      "targets": [{"expr": "node:cpu_usage:avg5m"}]
    },
    {
      "title": "Memory Usage",
      "type": "gauge",
      "gridPos": {"h": 8, "w": 6, "x": 6, "y": 0},
      "targets": [{"expr": "(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100"}]
    },
    {
      "title": "Disk Usage",
      "type": "gauge",
      "gridPos": {"h": 8, "w": 6, "x": 12, "y": 0},
      "targets": [{"expr": "(node_filesystem_size_bytes{mountpoint=\"/\"} - node_filesystem_free_bytes{mountpoint=\"/\"}) / node_filesystem_size_bytes{mountpoint=\"/\"} * 100"}]
    },
    {
      "title": "Container CPU Usage",
      "type": "timeseries",
      "gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
      "targets": [{"expr": "sum by(name) (rate(container_cpu_usage_seconds_total{name!=\"\"}[5m])) * 100"}]
    },
    {
      "title": "Container Memory Usage",
      "type": "timeseries",
      "gridPos": {"h": 8, "w": 12, "x": 12, "y": 8},
      "targets": [{"expr": "sum by(name) (container_memory_usage_bytes{name!=\"\"})"}]
    }
  ]
}

View File

@@ -0,0 +1,12 @@
# Grafana dashboard provisioning: one file-based provider that loads
# dashboards from the directory given under `options.path`.
apiVersion: 1

providers:
  - name: "Default"
    orgId: 1
    # Empty folder name, as in the original provider definition.
    folder: ""
    type: file
    disableDeletion: false
    # Interval, in seconds, at which the dashboard files are re-read.
    updateIntervalSeconds: 10
    allowUiUpdates: true
    options:
      # Same container path the compose file mounts ./grafana/dashboards to.
      path: /var/lib/grafana/dashboards

View File

@@ -0,0 +1,8 @@
# Grafana datasource provisioning: registers Prometheus as the default
# datasource.
apiVersion: 1

datasources:
  - name: "Prometheus"
    type: "prometheus"
    access: "proxy"
    # `prometheus` is the service name from the compose file; 9090 is the
    # port that service publishes.
    url: "http://prometheus:9090"
    isDefault: true

View File

@@ -0,0 +1,34 @@
# Prometheus server configuration for the monitoring stack.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

# Load rules once and periodically evaluate them
rule_files:
  - "rules/*.yml"

# Scrape configurations.
# Targets are addressed by compose service name and must use the port each
# container listens on (9100, 8080, 9090 in the compose file), not an
# arbitrary host port.
scrape_configs:
  # System metrics change frequently, scrape more often
  - job_name: 'node-exporter'
    scrape_interval: 10s
    static_configs:
      - targets: ['node-exporter:9100']

  # Container metrics are also volatile
  - job_name: 'cadvisor'
    scrape_interval: 10s
    static_configs:
      - targets: ['cadvisor:8080']

  # Prometheus itself changes slowly, scrape less frequently
  - job_name: 'prometheus'
    scrape_interval: 30s
    static_configs:
      - targets: ['localhost:9090']

View File

@@ -0,0 +1,29 @@
# Alerting rules for container metrics (the `name!=""` selector keeps only
# series that carry a container name).
groups:
  - name: container_alerts
    rules:
      # Fires when a container's start time changed within the last 15 minutes.
      - alert: ContainerRestarting
        expr: delta(container_start_time_seconds{name!=""}[15m]) > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container restarting ({{ $labels.name }})"
          description: "Container {{ $labels.name }} has restarted in the last 15 minutes"

      # Memory usage as a percentage of the configured limit.
      # The `> 0` filter drops containers whose limit is reported as 0;
      # dividing by 0 yields +Inf, which would trip the > 80 threshold for
      # every such container.
      - alert: ContainerHighMemoryUsage
        expr: (container_memory_usage_bytes{name!=""} / (container_spec_memory_limit_bytes{name!=""} > 0) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container high memory usage ({{ $labels.name }})"
          description: "Container {{ $labels.name }} memory usage is {{ $value }}%"

      # Share of CFS periods in which the container was throttled (ratio 0-1).
      - alert: ContainerCPUThrottling
        expr: rate(container_cpu_cfs_throttled_periods_total{name!=""}[5m]) / rate(container_cpu_cfs_periods_total{name!=""}[5m]) > 0.25
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container CPU throttling ({{ $labels.name }})"
          description: "Container {{ $labels.name }} is being throttled {{ $value | humanizePercentage }}"

View File

@@ -0,0 +1,38 @@
# Alerting rules for host-level metrics.
groups:
  - name: node_alerts
    rules:
      # CPU busy percentage per instance, derived from idle time.
      # `rate` averages over the whole 5m window; `irate` only uses the last
      # two samples, which makes an alert flap on short spikes.
      - alert: HighCPULoad
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU load (instance {{ $labels.instance }})"
          description: "CPU load is > 80%\n VALUE = {{ $value }}%\n LABELS: {{ $labels }}"

      - alert: HighMemoryLoad
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory load (instance {{ $labels.instance }})"
          description: "Memory load is > 80%\n VALUE = {{ $value }}%\n LABELS: {{ $labels }}"

      # Only ext4 and xfs filesystems are considered.
      - alert: HighDiskUsage
        expr: (node_filesystem_size_bytes{fstype=~"ext4|xfs"} - node_filesystem_free_bytes{fstype=~"ext4|xfs"}) / node_filesystem_size_bytes{fstype=~"ext4|xfs"} * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High disk usage (instance {{ $labels.instance }})"
          description: "Disk usage is > 85%\n VALUE = {{ $value }}%\n LABELS: {{ $labels }}"

      # `deriv` returns a per-second slope, so the 10 MiB-per-minute
      # threshold named in the description is divided by 60 here.
      - alert: UnusualMemoryGrowth
        expr: deriv(node_memory_MemAvailable_bytes[30m]) < -10 * 1024 * 1024 / 60
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Unusual memory consumption rate (instance {{ $labels.instance }})"
          description: "Memory is being consumed at a rate of more than 10MB/min\n VALUE = {{ $value | humanize }}B/s"

View File

@@ -0,0 +1,12 @@
# Recording rules that precompute frequently used expressions; evaluated
# once per minute.
groups:
  - name: recording_rules
    interval: 1m
    rules:
      # CPU busy percentage per instance over 5 minutes.
      # `rate` is used so the series really is a 5m average, as its name
      # says; `irate` would only reflect the last two samples.
      - record: node:cpu_usage:avg5m
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)

      # Used memory as a percentage of total memory.
      - record: node:memory_usage:percent
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100

      # Per-container CPU usage (percent of one core) over 5 minutes.
      - record: container:cpu_usage:avg5m
        expr: sum by(name) (rate(container_cpu_usage_seconds_total{name!=""}[5m])) * 100