Switched from DockMon to Beszel
This commit is contained in:
268
dockmon/backend/blackout_manager.py
Normal file
268
dockmon/backend/blackout_manager.py
Normal file
@@ -0,0 +1,268 @@
|
||||
"""
|
||||
Blackout Window Management for DockMon
|
||||
Handles alert suppression during maintenance windows
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
from datetime import datetime, time, timedelta, timezone
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
from database import DatabaseManager
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class BlackoutManager:
    """Manages blackout windows and deferred alerts.

    Evaluates the configured maintenance ("blackout") windows against the
    current local time, runs a background task that watches for window
    transitions, and — once a window ends — re-checks container states so
    that problems masked during maintenance are still reported.
    """
|
||||
|
||||
def __init__(self, db: DatabaseManager):
|
||||
self.db = db
|
||||
self._check_task: Optional[asyncio.Task] = None
|
||||
self._last_check: Optional[datetime] = None
|
||||
self._connection_manager = None # Will be set when monitoring starts
|
||||
|
||||
def is_in_blackout_window(self) -> Tuple[bool, Optional[str]]:
|
||||
"""
|
||||
Check if current time is within any blackout window
|
||||
Returns: (is_blackout, window_name)
|
||||
"""
|
||||
try:
|
||||
settings = self.db.get_settings()
|
||||
if not settings or not settings.blackout_windows:
|
||||
return False, None
|
||||
|
||||
# Get timezone offset from settings (in minutes), default to 0 (UTC)
|
||||
timezone_offset = getattr(settings, 'timezone_offset', 0)
|
||||
|
||||
# Get current time in UTC and convert to user's timezone
|
||||
now_utc = datetime.now(timezone.utc)
|
||||
now_local = now_utc + timedelta(minutes=timezone_offset)
|
||||
current_time = now_local.time()
|
||||
current_weekday = now_local.weekday() # 0=Monday, 6=Sunday
|
||||
|
||||
for window in settings.blackout_windows:
|
||||
if not window.get('enabled', True):
|
||||
continue
|
||||
|
||||
days = window.get('days', [])
|
||||
start_str = window.get('start', '00:00')
|
||||
end_str = window.get('end', '00:00')
|
||||
|
||||
start_time = datetime.strptime(start_str, '%H:%M').time()
|
||||
end_time = datetime.strptime(end_str, '%H:%M').time()
|
||||
|
||||
# Handle overnight windows (e.g., 23:00 to 02:00)
|
||||
if start_time > end_time:
|
||||
# For overnight windows, check if we're in the late night part (before midnight)
|
||||
# or the early morning part (after midnight)
|
||||
if current_time >= start_time:
|
||||
# Late night part - check if today is in the window
|
||||
if current_weekday in days:
|
||||
window_name = window.get('name', f"{start_str}-{end_str}")
|
||||
return True, window_name
|
||||
elif current_time < end_time:
|
||||
# Early morning part - check if YESTERDAY was in the window
|
||||
prev_day = (current_weekday - 1) % 7
|
||||
if prev_day in days:
|
||||
window_name = window.get('name', f"{start_str}-{end_str}")
|
||||
return True, window_name
|
||||
else:
|
||||
# Regular same-day window
|
||||
if current_weekday in days and start_time <= current_time < end_time:
|
||||
window_name = window.get('name', f"{start_str}-{end_str}")
|
||||
return True, window_name
|
||||
|
||||
return False, None
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error checking blackout window: {e}")
|
||||
return False, None
|
||||
|
||||
def get_last_window_end_time(self) -> Optional[datetime]:
|
||||
"""Get when the last blackout window ended (for tracking)"""
|
||||
return getattr(self, '_last_window_end', None)
|
||||
|
||||
def set_last_window_end_time(self, end_time: datetime):
|
||||
"""Set when the last blackout window ended"""
|
||||
self._last_window_end = end_time
|
||||
|
||||
async def check_container_states_after_blackout(self, notification_service, monitor) -> Dict:
|
||||
"""
|
||||
Check all container states after blackout window ends.
|
||||
Alert if any containers are in problematic states.
|
||||
Returns summary of what was found.
|
||||
|
||||
Args:
|
||||
notification_service: The notification service instance
|
||||
monitor: The DockerMonitor instance (reused, not created)
|
||||
"""
|
||||
summary = {
|
||||
'containers_down': [],
|
||||
'total_checked': 0,
|
||||
'window_name': None
|
||||
}
|
||||
|
||||
try:
|
||||
|
||||
problematic_states = ['exited', 'dead', 'paused', 'removing']
|
||||
|
||||
# Check all containers across all hosts
|
||||
for host_id, host in monitor.hosts.items():
|
||||
if not host.client:
|
||||
continue
|
||||
|
||||
try:
|
||||
containers = host.client.containers.list(all=True)
|
||||
summary['total_checked'] += len(containers)
|
||||
|
||||
for container in containers:
|
||||
if container.status in problematic_states:
|
||||
# Get exit code if container exited
|
||||
exit_code = None
|
||||
if container.status == 'exited':
|
||||
try:
|
||||
exit_code = container.attrs.get('State', {}).get('ExitCode')
|
||||
except (AttributeError, KeyError, TypeError) as e:
|
||||
logger.debug(f"Could not get exit code for container {container.id[:12]}: {e}")
|
||||
|
||||
summary['containers_down'].append({
|
||||
'id': container.id[:12],
|
||||
'name': container.name,
|
||||
'host_id': host_id,
|
||||
'host_name': host.name,
|
||||
'state': container.status,
|
||||
'exit_code': exit_code,
|
||||
'image': container.image.tags[0] if container.image.tags else 'unknown'
|
||||
})
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error checking containers on host {host.name}: {e}")
|
||||
|
||||
# Send alert if any containers are down
|
||||
if summary['containers_down'] and notification_service:
|
||||
await self._send_post_blackout_alert(notification_service, summary)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error checking container states after blackout: {e}")
|
||||
|
||||
return summary
|
||||
|
||||
    async def _send_post_blackout_alert(self, notification_service, summary: Dict):
        """Send alert for containers found in problematic state after blackout.

        For each down container, collects the enabled alert rules whose
        trigger states and container criteria match, then dispatches one
        AlertEvent through every matching rule's channels. Failures are
        logged per rule/container so one bad channel does not block others.

        Args:
            notification_service: Service whose send_alert() delivers the event.
            summary: Result of check_container_states_after_blackout();
                only the 'containers_down' entries are read here.
        """
        try:
            containers_down = summary['containers_down']

            # Get all alert rules that monitor state changes
            alert_rules = self.db.get_alert_rules()

            # For each container that's down, check if it matches any alert rules
            for container_info in containers_down:
                # Find matching alert rules for this container
                matching_rules = []
                for rule in alert_rules:
                    if not rule.enabled:
                        continue

                    # Check if this rule monitors the problematic state
                    if rule.trigger_states and container_info['state'] in rule.trigger_states:
                        # Check if container matches rule's container pattern
                        if self._container_matches_rule(container_info, rule):
                            matching_rules.append(rule)

                # Send alert through matching rules
                if matching_rules:
                    # Imported lazily — presumably to avoid a circular import
                    # at module load; verify before hoisting to the top.
                    from notifications import AlertEvent
                    # old_state is a sentinel: the real pre-blackout state was
                    # not observed while alerts were suppressed.
                    # NOTE(review): timestamp is naive local time, while
                    # is_in_blackout_window works in UTC+offset — confirm
                    # consumers expect a naive datetime here.
                    event = AlertEvent(
                        container_id=container_info['id'],
                        container_name=container_info['name'],
                        host_id=container_info['host_id'],
                        host_name=container_info['host_name'],
                        old_state='unknown_during_blackout',
                        new_state=container_info['state'],
                        exit_code=container_info.get('exit_code'),
                        timestamp=datetime.now(),
                        image=container_info['image'],
                        triggered_by='post_blackout_check'
                    )

                    # Send through each matching rule's channels.
                    # The single event instance is reused for every rule; the
                    # notes text is identical each iteration, so re-setting it
                    # in the loop is harmless.
                    for rule in matching_rules:
                        try:
                            # Add note about blackout in the event
                            event.notes = f"Container found in {container_info['state']} state after maintenance window ended"
                            await notification_service.send_alert(event, rule)
                        except Exception as e:
                            logger.error(f"Failed to send post-blackout alert for {container_info['name']}: {e}")

        except Exception as e:
            logger.error(f"Error sending post-blackout alerts: {e}")
|
||||
|
||||
def _container_matches_rule(self, container_info: Dict, rule) -> bool:
|
||||
"""Check if container matches an alert rule's container criteria"""
|
||||
try:
|
||||
# If rule has specific container+host pairs
|
||||
if hasattr(rule, 'containers') and rule.containers:
|
||||
for container_spec in rule.containers:
|
||||
if (container_spec.container_name == container_info['name'] and
|
||||
container_spec.host_id == container_info['host_id']):
|
||||
return True
|
||||
return False
|
||||
|
||||
# Otherwise, rule applies to all containers
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error matching container to rule: {e}")
|
||||
return False
|
||||
|
||||
async def start_monitoring(self, notification_service, monitor, connection_manager=None):
|
||||
"""Start monitoring for blackout window transitions
|
||||
|
||||
Args:
|
||||
notification_service: The notification service instance
|
||||
monitor: The DockerMonitor instance (reused, not created)
|
||||
connection_manager: Optional WebSocket connection manager
|
||||
"""
|
||||
self._connection_manager = connection_manager
|
||||
self._monitor = monitor # Store monitor reference
|
||||
|
||||
async def monitor_loop():
|
||||
was_in_blackout = False
|
||||
|
||||
while True:
|
||||
try:
|
||||
is_blackout, window_name = self.is_in_blackout_window()
|
||||
|
||||
# Check if blackout status changed
|
||||
if was_in_blackout != is_blackout:
|
||||
# Broadcast status change to all WebSocket clients
|
||||
if self._connection_manager:
|
||||
await self._connection_manager.broadcast({
|
||||
'type': 'blackout_status_changed',
|
||||
'data': {
|
||||
'is_blackout': is_blackout,
|
||||
'window_name': window_name
|
||||
}
|
||||
})
|
||||
|
||||
# If we just exited blackout, process suppressed alerts
|
||||
if was_in_blackout and not is_blackout:
|
||||
logger.info(f"Blackout window ended. Processing suppressed alerts...")
|
||||
await notification_service.process_suppressed_alerts(self._monitor)
|
||||
|
||||
was_in_blackout = is_blackout
|
||||
|
||||
# Check every 15 seconds for more responsive updates
|
||||
await asyncio.sleep(15)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in blackout monitoring: {e}")
|
||||
await asyncio.sleep(15)
|
||||
|
||||
self._check_task = asyncio.create_task(monitor_loop())
|
||||
|
||||
def stop_monitoring(self):
|
||||
"""Stop the monitoring task"""
|
||||
if self._check_task:
|
||||
self._check_task.cancel()
|
||||
self._check_task = None
|
||||
Reference in New Issue
Block a user