Monitoring
This commit is contained in:
18
monitoring/alertmanager/config.yml
Normal file
18
monitoring/alertmanager/config.yml
Normal file
@@ -0,0 +1,18 @@
|
||||
global:
|
||||
resolve_timeout: 5m
|
||||
|
||||
route:
|
||||
group_by: ['alertname']
|
||||
group_wait: 10s
|
||||
group_interval: 10s
|
||||
repeat_interval: 1h
|
||||
receiver: 'email-notifications'
|
||||
|
||||
receivers:
|
||||
- name: 'email-notifications'
|
||||
email_configs:
|
||||
- to: 'admin@delmar.bzh'
|
||||
from: 'noreply@delmar.bzh'
|
||||
# SMTP submission (STARTTLS) port is 587; ":58" was a truncated port number.
smarthost: 'pro1.mail.ovh.net:587'
|
||||
auth_username: 'admin@delmar.bzh'
|
||||
# NOTE(review): plaintext SMTP credential committed to version control —
# rotate it and load it from a secret instead (e.g. auth_password_file).
auth_password: 'sxS4GA8rBfmFkCFL'
|
||||
1
monitoring/compose.env
Normal file
1
monitoring/compose.env
Normal file
@@ -0,0 +1 @@
|
||||
# NOTE(review): plaintext Grafana admin password committed to version control —
# rotate it and keep this file out of the repository.
# Presumably passed with `--env-file compose.env` (Compose only auto-loads `.env`) — confirm.
GRAFANA_PASSWORD="XbJ6do@xT8478c"
|
||||
96
monitoring/docker-compose-node.yaml
Normal file
96
monitoring/docker-compose-node.yaml
Normal file
@@ -0,0 +1,96 @@
|
||||
# bob (mon.delmar.bzh)
|
||||
---
|
||||
name: monitoring
|
||||
|
||||
volumes:
|
||||
prometheus_data: {}
|
||||
grafana_data: {}
|
||||
|
||||
networks:
|
||||
monitoring:
|
||||
driver: bridge
|
||||
|
||||
services:
|
||||
prometheus:
|
||||
image: prom/prometheus:latest
|
||||
container_name: prometheus
|
||||
volumes:
|
||||
- ./prometheus:/etc/prometheus
|
||||
- prometheus_data:/prometheus
|
||||
command:
|
||||
- '--config.file=/etc/prometheus/prometheus.yml'
|
||||
- '--storage.tsdb.path=/prometheus'
|
||||
- '--web.console.libraries=/usr/share/prometheus/console_libraries'
|
||||
- '--web.console.templates=/usr/share/prometheus/consoles'
|
||||
- '--storage.tsdb.retention.time=15d'
|
||||
- '--storage.tsdb.wal-compression'
|
||||
- '--web.enable-lifecycle'
|
||||
ports:
|
||||
- "9090:9090"
|
||||
networks:
|
||||
- monitoring
|
||||
restart: unless-stopped
|
||||
|
||||
node-exporter:
|
||||
image: prom/node-exporter:latest
|
||||
container_name: node-exporter
|
||||
volumes:
|
||||
- /proc:/host/proc:ro
|
||||
- /sys:/host/sys:ro
|
||||
- /:/rootfs:ro
|
||||
command:
|
||||
- '--path.procfs=/host/proc'
|
||||
- '--path.sysfs=/host/sys'
|
||||
# A literal "$" must be written "$$", otherwise Compose parses "$|" as an
# (invalid) variable interpolation. "mount-points-exclude" is the current name
# of the deprecated "ignored-mount-points" flag.
- '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
|
||||
ports:
|
||||
- "9100:9100"
|
||||
networks:
|
||||
- monitoring
|
||||
restart: unless-stopped
|
||||
|
||||
cadvisor:
|
||||
image: gcr.io/cadvisor/cadvisor:latest
|
||||
container_name: cadvisor
|
||||
volumes:
|
||||
- /:/rootfs:ro
|
||||
- /var/run:/var/run:rw
|
||||
- /sys:/sys:ro
|
||||
- /var/lib/docker/:/var/lib/docker:ro
|
||||
- /dev/disk/:/dev/disk:ro
|
||||
ports:
|
||||
- "8080:8080"
|
||||
networks:
|
||||
- monitoring
|
||||
restart: unless-stopped
|
||||
|
||||
grafana:
|
||||
image: grafana/grafana:latest
|
||||
container_name: grafana
|
||||
user: "1000:1000"
|
||||
volumes:
|
||||
- grafana_data:/var/lib/grafana
|
||||
- ./grafana/provisioning:/etc/grafana/provisioning
|
||||
- ./grafana/dashboards:/var/lib/grafana/dashboards
|
||||
environment:
|
||||
- GF_SECURITY_ADMIN_USER=admin
|
||||
- GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_PASSWORD}
|
||||
- GF_USERS_ALLOW_SIGN_UP=false
|
||||
ports:
|
||||
- "11000:3000"
|
||||
networks:
|
||||
- monitoring
|
||||
restart: unless-stopped
|
||||
|
||||
alertmanager:
|
||||
image: prom/alertmanager:latest
|
||||
container_name: alertmanager
|
||||
volumes:
|
||||
- ./alertmanager:/etc/alertmanager
|
||||
command:
|
||||
- '--config.file=/etc/alertmanager/config.yml'
|
||||
- '--storage.path=/alertmanager'
|
||||
ports:
|
||||
- "9093:9093"
|
||||
networks:
|
||||
- monitoring
|
||||
restart: unless-stopped
|
||||
96
monitoring/docker-compose.yaml
Normal file
96
monitoring/docker-compose.yaml
Normal file
@@ -0,0 +1,96 @@
|
||||
# bob (mon.delmar.bzh)
|
||||
---
|
||||
name: monitoring
|
||||
|
||||
volumes:
|
||||
prometheus_data: {}
|
||||
grafana_data: {}
|
||||
|
||||
networks:
|
||||
monitoring:
|
||||
driver: bridge
|
||||
|
||||
services:
|
||||
prometheus:
|
||||
image: prom/prometheus:latest
|
||||
container_name: prometheus
|
||||
volumes:
|
||||
- ./prometheus:/etc/prometheus
|
||||
- prometheus_data:/prometheus
|
||||
command:
|
||||
- '--config.file=/etc/prometheus/prometheus.yml'
|
||||
- '--storage.tsdb.path=/prometheus'
|
||||
- '--web.console.libraries=/usr/share/prometheus/console_libraries'
|
||||
- '--web.console.templates=/usr/share/prometheus/consoles'
|
||||
- '--storage.tsdb.retention.time=15d'
|
||||
- '--storage.tsdb.wal-compression'
|
||||
- '--web.enable-lifecycle'
|
||||
ports:
|
||||
- "9090:9090"
|
||||
networks:
|
||||
- monitoring
|
||||
restart: unless-stopped
|
||||
|
||||
node-exporter:
|
||||
image: prom/node-exporter:latest
|
||||
container_name: node-exporter
|
||||
volumes:
|
||||
- /proc:/host/proc:ro
|
||||
- /sys:/host/sys:ro
|
||||
- /:/rootfs:ro
|
||||
command:
|
||||
- '--path.procfs=/host/proc'
|
||||
- '--path.sysfs=/host/sys'
|
||||
# A literal "$" must be written "$$", otherwise Compose parses "$|" as an
# (invalid) variable interpolation. "mount-points-exclude" is the current name
# of the deprecated "ignored-mount-points" flag.
- '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
|
||||
ports:
|
||||
- "9100:9100"
|
||||
networks:
|
||||
- monitoring
|
||||
restart: unless-stopped
|
||||
|
||||
cadvisor:
|
||||
image: gcr.io/cadvisor/cadvisor:latest
|
||||
container_name: cadvisor
|
||||
volumes:
|
||||
- /:/rootfs:ro
|
||||
- /var/run:/var/run:rw
|
||||
- /sys:/sys:ro
|
||||
- /var/lib/docker/:/var/lib/docker:ro
|
||||
- /dev/disk/:/dev/disk:ro
|
||||
ports:
|
||||
- "8080:8080"
|
||||
networks:
|
||||
- monitoring
|
||||
restart: unless-stopped
|
||||
|
||||
grafana:
|
||||
image: grafana/grafana:latest
|
||||
container_name: grafana
|
||||
user: "1000:1000"
|
||||
volumes:
|
||||
- grafana_data:/var/lib/grafana
|
||||
- ./grafana/provisioning:/etc/grafana/provisioning
|
||||
- ./grafana/dashboards:/var/lib/grafana/dashboards
|
||||
environment:
|
||||
- GF_SECURITY_ADMIN_USER=admin
|
||||
- GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_PASSWORD}
|
||||
- GF_USERS_ALLOW_SIGN_UP=false
|
||||
ports:
|
||||
- "11000:3000"
|
||||
networks:
|
||||
- monitoring
|
||||
restart: unless-stopped
|
||||
|
||||
alertmanager:
|
||||
image: prom/alertmanager:latest
|
||||
container_name: alertmanager
|
||||
volumes:
|
||||
- ./alertmanager:/etc/alertmanager
|
||||
command:
|
||||
- '--config.file=/etc/alertmanager/config.yml'
|
||||
- '--storage.path=/alertmanager'
|
||||
ports:
|
||||
- "9093:9093"
|
||||
networks:
|
||||
- monitoring
|
||||
restart: unless-stopped
|
||||
37
monitoring/grafana/dashboards/system-overview.json
Normal file
37
monitoring/grafana/dashboards/system-overview.json
Normal file
@@ -0,0 +1,37 @@
|
||||
{
|
||||
"title": "System Overview",
|
||||
"uid": "system-overview",
|
||||
"version": 1,
|
||||
"panels": [
|
||||
{
|
||||
"title": "CPU Usage",
|
||||
"type": "gauge",
|
||||
"gridPos": {"h": 8, "w": 6, "x": 0, "y": 0},
|
||||
"targets": [{"expr": "node:cpu_usage:avg5m"}]
|
||||
},
|
||||
{
|
||||
"title": "Memory Usage",
|
||||
"type": "gauge",
|
||||
"gridPos": {"h": 8, "w": 6, "x": 6, "y": 0},
|
||||
"targets": [{"expr": "(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100"}]
|
||||
},
|
||||
{
|
||||
"title": "Disk Usage",
|
||||
"type": "gauge",
|
||||
"gridPos": {"h": 8, "w": 6, "x": 12, "y": 0},
|
||||
"targets": [{"expr": "(node_filesystem_size_bytes{mountpoint=\"/\"} - node_filesystem_free_bytes{mountpoint=\"/\"}) / node_filesystem_size_bytes{mountpoint=\"/\"} * 100"}]
|
||||
},
|
||||
{
|
||||
"title": "Container CPU Usage",
|
||||
"type": "graph",
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
|
||||
"targets": [{"expr": "sum by(name) (rate(container_cpu_usage_seconds_total{name!=\"\"}[5m])) * 100"}]
|
||||
},
|
||||
{
|
||||
"title": "Container Memory Usage",
|
||||
"type": "graph",
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 8},
|
||||
"targets": [{"expr": "sum by(name) (container_memory_usage_bytes{name!=\"\"})"}]
|
||||
}
|
||||
]
|
||||
}
|
||||
12
monitoring/grafana/provisioning/dashboards/dashboards.yml
Normal file
12
monitoring/grafana/provisioning/dashboards/dashboards.yml
Normal file
@@ -0,0 +1,12 @@
|
||||
apiVersion: 1
|
||||
|
||||
providers:
|
||||
- name: 'Default'
|
||||
orgId: 1
|
||||
folder: ''
|
||||
type: file
|
||||
disableDeletion: false
|
||||
updateIntervalSeconds: 10
|
||||
allowUiUpdates: true
|
||||
options:
|
||||
path: /var/lib/grafana/dashboards
|
||||
@@ -0,0 +1,8 @@
|
||||
apiVersion: 1
|
||||
|
||||
datasources:
|
||||
- name: Prometheus
|
||||
type: prometheus
|
||||
access: proxy
|
||||
url: http://prometheus:9090
|
||||
isDefault: true
|
||||
34
monitoring/prometheus/prometheus.yml
Normal file
34
monitoring/prometheus/prometheus.yml
Normal file
@@ -0,0 +1,34 @@
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
evaluation_interval: 15s
|
||||
|
||||
# Alertmanager configuration
|
||||
alerting:
|
||||
alertmanagers:
|
||||
- static_configs:
|
||||
- targets:
|
||||
- alertmanager:9093
|
||||
|
||||
# Load rules once and periodically evaluate them
|
||||
rule_files:
|
||||
- "rules/*.yml"
|
||||
|
||||
# Scrape configurations
|
||||
scrape_configs:
|
||||
# System metrics change frequently, scrape more often
|
||||
- job_name: 'node-exporter'
|
||||
scrape_interval: 10s
|
||||
static_configs:
|
||||
# Scraped over the Compose network by service name, so use the port the
# container listens on (9100), not a host-published port.
- targets: ['node-exporter:9100']
|
||||
|
||||
# Container metrics are also volatile
|
||||
- job_name: 'cadvisor'
|
||||
scrape_interval: 10s
|
||||
static_configs:
|
||||
# Scraped over the Compose network by service name, so use the port the
# container listens on (8080), not a host-published port.
- targets: ['cadvisor:8080']
|
||||
|
||||
# Prometheus itself changes slowly, scrape less frequently
|
||||
- job_name: 'prometheus'
|
||||
scrape_interval: 30s
|
||||
static_configs:
|
||||
# "localhost" is the Prometheus container itself, which listens on 9090.
- targets: ['localhost:9090']
|
||||
29
monitoring/prometheus/rules/container_alerts.yml
Normal file
29
monitoring/prometheus/rules/container_alerts.yml
Normal file
@@ -0,0 +1,29 @@
|
||||
groups:
|
||||
- name: container_alerts
|
||||
rules:
|
||||
- alert: ContainerRestarting
|
||||
expr: delta(container_start_time_seconds{name!=""}[15m]) > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Container restarting ({{ $labels.name }})"
|
||||
description: "Container {{ $labels.name }} has restarted in the last 15 minutes"
|
||||
|
||||
- alert: ContainerHighMemoryUsage
|
||||
# "> 0" drops containers that have no memory limit (limit reported as 0);
# without it the division yields +Inf and the alert fires for all of them.
expr: (container_memory_usage_bytes{name!=""} / (container_spec_memory_limit_bytes{name!=""} > 0) * 100) > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Container high memory usage ({{ $labels.name }})"
|
||||
description: "Container {{ $labels.name }} memory usage is {{ $value }}%"
|
||||
|
||||
- alert: ContainerCPUThrottling
|
||||
expr: rate(container_cpu_cfs_throttled_periods_total{name!=""}[5m]) / rate(container_cpu_cfs_periods_total{name!=""}[5m]) > 0.25
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Container CPU throttling ({{ $labels.name }})"
|
||||
description: "Container {{ $labels.name }} is being throttled {{ $value | humanizePercentage }}"
|
||||
38
monitoring/prometheus/rules/node_alerts.yml
Normal file
38
monitoring/prometheus/rules/node_alerts.yml
Normal file
@@ -0,0 +1,38 @@
|
||||
groups:
|
||||
- name: node_alerts
|
||||
rules:
|
||||
- alert: HighCPULoad
|
||||
# rate() averages over the whole 5m window; irate() only uses the last two
# samples, which is too spiky for an alert that must hold for 5m.
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High CPU load (instance {{ $labels.instance }})"
|
||||
description: "CPU load is > 80%\n VALUE = {{ $value }}%\n LABELS: {{ $labels }}"
|
||||
|
||||
- alert: HighMemoryLoad
|
||||
expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High memory load (instance {{ $labels.instance }})"
|
||||
description: "Memory load is > 80%\n VALUE = {{ $value }}%\n LABELS: {{ $labels }}"
|
||||
|
||||
- alert: HighDiskUsage
|
||||
expr: (node_filesystem_size_bytes{fstype=~"ext4|xfs"} - node_filesystem_free_bytes{fstype=~"ext4|xfs"}) / node_filesystem_size_bytes{fstype=~"ext4|xfs"} * 100 > 85
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High disk usage (instance {{ $labels.instance }})"
|
||||
description: "Disk usage is > 85%\n VALUE = {{ $value }}%\n LABELS: {{ $labels }}"
|
||||
|
||||
- alert: UnusualMemoryGrowth
|
||||
# deriv() returns bytes per second, so the documented 10 MiB/min threshold
# is 10 * 1024 * 1024 / 60 bytes per second.
expr: deriv(node_memory_MemAvailable_bytes[30m]) < -10 * 1024 * 1024 / 60
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Unusual memory consumption rate (instance {{ $labels.instance }})"
|
||||
description: "Memory is being consumed at a rate of more than 10MB/min\n VALUE = {{ $value | humanize }}B/s"
|
||||
12
monitoring/prometheus/rules/recording_rules.yml
Normal file
12
monitoring/prometheus/rules/recording_rules.yml
Normal file
@@ -0,0 +1,12 @@
|
||||
groups:
|
||||
- name: recording_rules
|
||||
interval: 1m
|
||||
rules:
|
||||
- record: node:cpu_usage:avg5m
|
||||
# The rule is named "avg5m": rate() gives the 5-minute average, whereas
# irate() would only reflect the last two samples in the window.
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
|
||||
|
||||
- record: node:memory_usage:percent
|
||||
expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100
|
||||
|
||||
- record: container:cpu_usage:avg5m
|
||||
expr: sum by(name) (rate(container_cpu_usage_seconds_total{name!=""}[5m])) * 100
|
||||
Reference in New Issue
Block a user