Switched from DockMon to Beszel

This commit is contained in:
2025-10-31 17:13:00 +01:00
parent cc6454cef9
commit f4a4142799
75 changed files with 24313 additions and 122 deletions

View File

@@ -0,0 +1,268 @@
"""
Blackout Window Management for DockMon
Handles alert suppression during maintenance windows
"""
import asyncio
import logging
from datetime import datetime, time, timedelta, timezone
from typing import Dict, List, Optional, Tuple
from database import DatabaseManager
logger = logging.getLogger(__name__)
class BlackoutManager:
    """Manages blackout (maintenance) windows and deferred alerts.

    Responsibilities:
      - Decide whether "now" falls inside any configured blackout window.
      - After a window ends, sweep all hosts for containers stuck in a
        problematic state and re-raise alerts that were suppressed.
      - Run a background task that watches for window transitions and
        broadcasts status changes over WebSocket.
    """

    def __init__(self, db: "DatabaseManager"):
        # db provides get_settings() (blackout_windows, timezone_offset)
        # and get_alert_rules().
        self.db = db
        self._check_task: Optional[asyncio.Task] = None
        self._last_check: Optional[datetime] = None
        self._connection_manager = None  # Will be set when monitoring starts

    def is_in_blackout_window(self) -> Tuple[bool, Optional[str]]:
        """
        Check if current time is within any enabled blackout window.

        Times in settings are interpreted in the user's local timezone,
        derived from the UTC clock plus ``timezone_offset`` minutes.

        Returns: (is_blackout, window_name) — window_name is None when
        no window is active.
        """
        try:
            settings = self.db.get_settings()
            if not settings or not settings.blackout_windows:
                return False, None

            # Timezone offset from settings (minutes); default 0 (UTC).
            timezone_offset = getattr(settings, 'timezone_offset', 0)

            # Current time in UTC, shifted into the user's timezone.
            now_local = datetime.now(timezone.utc) + timedelta(minutes=timezone_offset)
            current_time = now_local.time()
            current_weekday = now_local.weekday()  # 0=Monday, 6=Sunday

            for window in settings.blackout_windows:
                if not window.get('enabled', True):
                    continue
                if self._window_is_active(window, current_time, current_weekday):
                    start_str = window.get('start', '00:00')
                    end_str = window.get('end', '00:00')
                    window_name = window.get('name', f"{start_str}-{end_str}")
                    return True, window_name

            return False, None
        except Exception as e:
            # Fail open: a broken settings record must not suppress alerts.
            logger.error(f"Error checking blackout window: {e}")
            return False, None

    @staticmethod
    def _window_is_active(window: Dict, current_time: time, current_weekday: int) -> bool:
        """Return True if (current_time, current_weekday) falls in *window*.

        Handles overnight windows (e.g. 23:00-02:00): the pre-midnight part
        matches on the window's own day, the post-midnight part matches when
        the PREVIOUS day is listed in the window's days.
        """
        days = window.get('days', [])
        start_time = datetime.strptime(window.get('start', '00:00'), '%H:%M').time()
        end_time = datetime.strptime(window.get('end', '00:00'), '%H:%M').time()

        if start_time > end_time:
            # Overnight window split across midnight.
            if current_time >= start_time:
                # Late-night part — today must be a configured day.
                return current_weekday in days
            if current_time < end_time:
                # Early-morning part — YESTERDAY must be a configured day.
                return ((current_weekday - 1) % 7) in days
            return False

        # Regular same-day window; end is exclusive.
        return current_weekday in days and start_time <= current_time < end_time

    def get_last_window_end_time(self) -> Optional[datetime]:
        """Get when the last blackout window ended (for tracking)."""
        return getattr(self, '_last_window_end', None)

    def set_last_window_end_time(self, end_time: datetime):
        """Set when the last blackout window ended."""
        self._last_window_end = end_time

    async def check_container_states_after_blackout(self, notification_service, monitor) -> Dict:
        """
        Check all container states after a blackout window ends and alert
        on any containers found in a problematic state.

        Args:
            notification_service: The notification service instance
            monitor: The DockerMonitor instance (reused, not created)

        Returns:
            Summary dict: containers_down (list of container info dicts),
            total_checked (int), window_name (always None here; reserved
            for callers that know which window just ended).
        """
        summary = {
            'containers_down': [],
            'total_checked': 0,
            'window_name': None
        }

        try:
            problematic_states = ['exited', 'dead', 'paused', 'removing']

            # Sweep every connected host; per-host failures are logged and
            # skipped so one bad host doesn't abort the whole check.
            for host_id, host in monitor.hosts.items():
                if not host.client:
                    continue
                try:
                    containers = host.client.containers.list(all=True)
                    summary['total_checked'] += len(containers)

                    for container in containers:
                        if container.status not in problematic_states:
                            continue

                        # Exit code is only meaningful for 'exited'.
                        exit_code = None
                        if container.status == 'exited':
                            try:
                                exit_code = container.attrs.get('State', {}).get('ExitCode')
                            except (AttributeError, KeyError, TypeError) as e:
                                logger.debug(f"Could not get exit code for container {container.id[:12]}: {e}")

                        summary['containers_down'].append({
                            'id': container.id[:12],
                            'name': container.name,
                            'host_id': host_id,
                            'host_name': host.name,
                            'state': container.status,
                            'exit_code': exit_code,
                            'image': container.image.tags[0] if container.image.tags else 'unknown'
                        })
                except Exception as e:
                    logger.error(f"Error checking containers on host {host.name}: {e}")

            # Send alert if any containers are down
            if summary['containers_down'] and notification_service:
                await self._send_post_blackout_alert(notification_service, summary)

        except Exception as e:
            logger.error(f"Error checking container states after blackout: {e}")

        return summary

    async def _send_post_blackout_alert(self, notification_service, summary: Dict):
        """Send alerts for containers found in a problematic state after
        a blackout window, routed through every matching alert rule."""
        try:
            containers_down = summary['containers_down']

            # All configured alert rules; filtered per container below.
            alert_rules = self.db.get_alert_rules()

            for container_info in containers_down:
                # Find enabled rules that monitor this container's state.
                matching_rules = []
                for rule in alert_rules:
                    if not rule.enabled:
                        continue
                    if rule.trigger_states and container_info['state'] in rule.trigger_states:
                        if self._container_matches_rule(container_info, rule):
                            matching_rules.append(rule)

                if not matching_rules:
                    continue

                # Local import mirrors the original module's lazy import
                # (avoids a circular dependency with notifications).
                from notifications import AlertEvent

                event = AlertEvent(
                    container_id=container_info['id'],
                    container_name=container_info['name'],
                    host_id=container_info['host_id'],
                    host_name=container_info['host_name'],
                    old_state='unknown_during_blackout',
                    new_state=container_info['state'],
                    exit_code=container_info.get('exit_code'),
                    # Timezone-aware, consistent with is_in_blackout_window()
                    # (the original used a naive datetime.now() here).
                    timestamp=datetime.now(timezone.utc),
                    image=container_info['image'],
                    triggered_by='post_blackout_check'
                )
                # The note is identical for every rule; set it once.
                event.notes = f"Container found in {container_info['state']} state after maintenance window ended"

                # Send through each matching rule's channels; one failing
                # channel must not block the others.
                for rule in matching_rules:
                    try:
                        await notification_service.send_alert(event, rule)
                    except Exception as e:
                        logger.error(f"Failed to send post-blackout alert for {container_info['name']}: {e}")

        except Exception as e:
            logger.error(f"Error sending post-blackout alerts: {e}")

    def _container_matches_rule(self, container_info: Dict, rule) -> bool:
        """Check if container matches an alert rule's container criteria.

        A rule with an explicit ``containers`` list matches only exact
        (container_name, host_id) pairs; a rule without one matches all
        containers. Errors fail closed (no match).
        """
        try:
            if hasattr(rule, 'containers') and rule.containers:
                for container_spec in rule.containers:
                    if (container_spec.container_name == container_info['name'] and
                        container_spec.host_id == container_info['host_id']):
                        return True
                return False
            # No explicit container list: rule applies to all containers.
            return True
        except Exception as e:
            logger.error(f"Error matching container to rule: {e}")
            return False

    async def start_monitoring(self, notification_service, monitor, connection_manager=None):
        """Start the background task watching blackout window transitions.

        Args:
            notification_service: The notification service instance
            monitor: The DockerMonitor instance (reused, not created)
            connection_manager: Optional WebSocket connection manager
        """
        self._connection_manager = connection_manager
        self._monitor = monitor  # Store monitor reference

        async def monitor_loop():
            was_in_blackout = False
            while True:
                try:
                    is_blackout, window_name = self.is_in_blackout_window()

                    if was_in_blackout != is_blackout:
                        # Broadcast status change to all WebSocket clients.
                        if self._connection_manager:
                            await self._connection_manager.broadcast({
                                'type': 'blackout_status_changed',
                                'data': {
                                    'is_blackout': is_blackout,
                                    'window_name': window_name
                                }
                            })

                        # Leaving blackout: flush alerts suppressed during it.
                        if was_in_blackout and not is_blackout:
                            logger.info("Blackout window ended. Processing suppressed alerts...")
                            await notification_service.process_suppressed_alerts(self._monitor)

                    was_in_blackout = is_blackout

                    # Check every 15 seconds for responsive status updates.
                    await asyncio.sleep(15)
                except Exception as e:
                    logger.error(f"Error in blackout monitoring: {e}")
                    await asyncio.sleep(15)

        self._check_task = asyncio.create_task(monitor_loop())

    def stop_monitoring(self):
        """Cancel and clear the background monitoring task, if running."""
        if self._check_task:
            self._check_task.cancel()
            self._check_task = None