Monitoring

This commit is contained in:
2025-11-14 00:47:26 +01:00
parent 50437d95d9
commit 97133df5ce
18 changed files with 426 additions and 452 deletions

View File

@@ -0,0 +1,29 @@
# Alerting rules for containers. Every selector filters on name!="" so only
# series that carry a non-empty `name` label are evaluated.
groups:
  - name: container_alerts
    rules:
      # Fires when container_start_time_seconds increased within the last
      # 15 minutes (presumably a restart) and that stayed true for 5 minutes.
      - alert: ContainerRestarting
        expr: delta(container_start_time_seconds{name!=""}[15m]) > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container restarting ({{ $labels.name }})"
          description: "Container {{ $labels.name }} has restarted in the last 15 minutes"
      # Memory usage as a percentage of the configured memory limit.
      # The `> 0` filter on the denominator drops series whose limit is 0;
      # without it the division yields +Inf, which always exceeds 80 and makes
      # the alert fire permanently for such containers.
      # NOTE(review): a limit of 0 presumably means "no limit configured" in
      # the exporter in use - confirm against its metric documentation.
      - alert: ContainerHighMemoryUsage
        expr: (container_memory_usage_bytes{name!=""} / (container_spec_memory_limit_bytes{name!=""} > 0) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container high memory usage ({{ $labels.name }})"
          description: "Container {{ $labels.name }} memory usage is {{ $value }}%"
      # Ratio of throttled CFS periods to all CFS periods over 5 minutes;
      # fires when more than 25% of periods were throttled.
      - alert: ContainerCPUThrottling
        expr: rate(container_cpu_cfs_throttled_periods_total{name!=""}[5m]) / rate(container_cpu_cfs_periods_total{name!=""}[5m]) > 0.25
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container CPU throttling ({{ $labels.name }})"
          description: "Container {{ $labels.name }} is being throttled {{ $value | humanizePercentage }}"

View File

@@ -0,0 +1,38 @@
# Host-level alerting rules built on node_* metrics.
groups:
  - name: node_alerts
    rules:
      # Non-idle CPU percentage per instance, averaged across the idle-mode
      # series of that instance. rate() is used instead of irate(): irate()
      # only looks at the last two samples, so a single quiet scrape could
      # reset the 5m `for` timer.
      - alert: HighCPULoad
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU load (instance {{ $labels.instance }})"
          description: "CPU load is > 80%\n VALUE = {{ $value }}%\n LABELS: {{ $labels }}"
      # Share of total memory that is not reported as available, in percent.
      - alert: HighMemoryLoad
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory load (instance {{ $labels.instance }})"
          description: "Memory load is > 80%\n VALUE = {{ $value }}%\n LABELS: {{ $labels }}"
      # Used fraction (size minus free) of ext4/xfs filesystems, in percent.
      - alert: HighDiskUsage
        expr: (node_filesystem_size_bytes{fstype=~"ext4|xfs"} - node_filesystem_free_bytes{fstype=~"ext4|xfs"}) / node_filesystem_size_bytes{fstype=~"ext4|xfs"} * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High disk usage (instance {{ $labels.instance }})"
          description: "Disk usage is > 85%\n VALUE = {{ $value }}%\n LABELS: {{ $labels }}"
      # deriv() returns a per-second slope, so the 10 MiB-per-minute threshold
      # named in the description is divided by 60 to convert it to bytes per
      # second. Without the division the rule would require a sustained loss
      # of 10 MiB every second.
      - alert: UnusualMemoryGrowth
        expr: deriv(node_memory_MemAvailable_bytes[30m]) < -10 * 1024 * 1024 / 60
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Unusual memory consumption rate (instance {{ $labels.instance }})"
          description: "Memory is being consumed at a rate of more than 10MB/min\n VALUE = {{ $value | humanize }}B/s"

View File

@@ -0,0 +1,12 @@
# Pre-computed series, evaluated once per minute (see `interval`).
groups:
  - name: recording_rules
    interval: 1m
    rules:
      # Non-idle CPU percentage per instance. rate() averages over the whole
      # 5m window, matching the `avg5m` suffix of the record name; irate()
      # would only reflect the last two samples in the window.
      - record: node:cpu_usage:avg5m
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
      # Share of total memory that is not reported as available, in percent.
      - record: node:memory_usage:percent
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100
      # CPU seconds consumed per second, summed per container name and scaled
      # by 100 (100 corresponds to one fully used core).
      # NOTE(review): grouping by `name` alone merges same-named containers
      # from different instances - confirm this is intended.
      - record: container:cpu_usage:avg5m
        expr: sum by(name) (rate(container_cpu_usage_seconds_total{name!=""}[5m])) * 100