""" Blackout Window Management for DockMon Handles alert suppression during maintenance windows """ import asyncio import logging from datetime import datetime, time, timedelta, timezone from typing import Dict, List, Optional, Tuple from database import DatabaseManager logger = logging.getLogger(__name__) class BlackoutManager: """Manages blackout windows and deferred alerts""" def __init__(self, db: DatabaseManager): self.db = db self._check_task: Optional[asyncio.Task] = None self._last_check: Optional[datetime] = None self._connection_manager = None # Will be set when monitoring starts def is_in_blackout_window(self) -> Tuple[bool, Optional[str]]: """ Check if current time is within any blackout window Returns: (is_blackout, window_name) """ try: settings = self.db.get_settings() if not settings or not settings.blackout_windows: return False, None # Get timezone offset from settings (in minutes), default to 0 (UTC) timezone_offset = getattr(settings, 'timezone_offset', 0) # Get current time in UTC and convert to user's timezone now_utc = datetime.now(timezone.utc) now_local = now_utc + timedelta(minutes=timezone_offset) current_time = now_local.time() current_weekday = now_local.weekday() # 0=Monday, 6=Sunday for window in settings.blackout_windows: if not window.get('enabled', True): continue days = window.get('days', []) start_str = window.get('start', '00:00') end_str = window.get('end', '00:00') start_time = datetime.strptime(start_str, '%H:%M').time() end_time = datetime.strptime(end_str, '%H:%M').time() # Handle overnight windows (e.g., 23:00 to 02:00) if start_time > end_time: # For overnight windows, check if we're in the late night part (before midnight) # or the early morning part (after midnight) if current_time >= start_time: # Late night part - check if today is in the window if current_weekday in days: window_name = window.get('name', f"{start_str}-{end_str}") return True, window_name elif current_time < end_time: # Early morning part - check if YESTERDAY was in the window prev_day = (current_weekday - 1) % 7 if prev_day in days: window_name = window.get('name', f"{start_str}-{end_str}") return True, window_name else: # Regular same-day window if current_weekday in days and start_time <= current_time < end_time: window_name = window.get('name', f"{start_str}-{end_str}") return True, window_name return False, None except Exception as e: logger.error(f"Error checking blackout window: {e}") return False, None def get_last_window_end_time(self) -> Optional[datetime]: """Get when the last blackout window ended (for tracking)""" return getattr(self, '_last_window_end', None) def set_last_window_end_time(self, end_time: datetime): """Set when the last blackout window ended""" self._last_window_end = end_time async def check_container_states_after_blackout(self, notification_service, monitor) -> Dict: """ Check all container states after blackout window ends. Alert if any containers are in problematic states. Returns summary of what was found. 
    def get_last_window_end_time(self) -> Optional[datetime]:
        """Get when the last blackout window ended (for tracking)"""
        return getattr(self, '_last_window_end', None)

    def set_last_window_end_time(self, end_time: datetime):
        """Set when the last blackout window ended"""
        self._last_window_end = end_time

    async def check_container_states_after_blackout(self, notification_service, monitor) -> Dict:
        """
        Check all container states after a blackout window ends.
        Alert if any containers are in problematic states.
        Returns a summary of what was found.

        Args:
            notification_service: The notification service instance
            monitor: The DockerMonitor instance (reused, not created)
        """
        summary = {
            'containers_down': [],
            'total_checked': 0,
            'window_name': None
        }

        try:
            problematic_states = ['exited', 'dead', 'paused', 'removing']

            # Check all containers across all hosts
            for host_id, host in monitor.hosts.items():
                if not host.client:
                    continue

                try:
                    containers = host.client.containers.list(all=True)
                    summary['total_checked'] += len(containers)

                    for container in containers:
                        if container.status in problematic_states:
                            # Get the exit code if the container exited
                            exit_code = None
                            if container.status == 'exited':
                                try:
                                    exit_code = container.attrs.get('State', {}).get('ExitCode')
                                except (AttributeError, KeyError, TypeError) as e:
                                    logger.debug(f"Could not get exit code for container {container.id[:12]}: {e}")

                            summary['containers_down'].append({
                                'id': container.id[:12],
                                'name': container.name,
                                'host_id': host_id,
                                'host_name': host.name,
                                'state': container.status,
                                'exit_code': exit_code,
                                'image': container.image.tags[0] if container.image.tags else 'unknown'
                            })

                except Exception as e:
                    logger.error(f"Error checking containers on host {host.name}: {e}")

            # Send an alert if any containers are down
            if summary['containers_down'] and notification_service:
                await self._send_post_blackout_alert(notification_service, summary)

        except Exception as e:
            logger.error(f"Error checking container states after blackout: {e}")

        return summary

    async def _send_post_blackout_alert(self, notification_service, summary: Dict):
        """Send alerts for containers found in a problematic state after blackout"""
        try:
            containers_down = summary['containers_down']

            # Get all alert rules; state-based rules are filtered below
            alert_rules = self.db.get_alert_rules()

            # For each container that is down, check whether it matches any alert rules
            for container_info in containers_down:
                # Find matching alert rules for this container
                matching_rules = []
                for rule in alert_rules:
                    if not rule.enabled:
                        continue

                    # Check if this rule monitors the problematic state
                    if rule.trigger_states and container_info['state'] in rule.trigger_states:
                        # Check if the container matches the rule's container criteria
                        if self._container_matches_rule(container_info, rule):
                            matching_rules.append(rule)

                # Send an alert through the matching rules
                if matching_rules:
                    from notifications import AlertEvent

                    event = AlertEvent(
                        container_id=container_info['id'],
                        container_name=container_info['name'],
                        host_id=container_info['host_id'],
                        host_name=container_info['host_name'],
                        old_state='unknown_during_blackout',
                        new_state=container_info['state'],
                        exit_code=container_info.get('exit_code'),
                        timestamp=datetime.now(),
                        image=container_info['image'],
                        triggered_by='post_blackout_check'
                    )

                    # Add a note about the blackout to the event
                    event.notes = f"Container found in {container_info['state']} state after maintenance window ended"

                    # Send through each matching rule's channels
                    for rule in matching_rules:
                        try:
                            await notification_service.send_alert(event, rule)
                        except Exception as e:
                            logger.error(f"Failed to send post-blackout alert for {container_info['name']}: {e}")

        except Exception as e:
            logger.error(f"Error sending post-blackout alerts: {e}")
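    # Illustrative shape of the alert-rule attributes read by the matching code
    # in this module (above and in _container_matches_rule below). This is
    # inferred from the attribute access here, not an authoritative model:
    #
    #   rule.enabled         -> bool
    #   rule.trigger_states  -> e.g. ["exited", "dead"]
    #   rule.containers      -> optional list of specs, each exposing
    #                           .container_name and .host_id; when empty or
    #                           absent, the rule applies to every container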
    def _container_matches_rule(self, container_info: Dict, rule) -> bool:
        """Check if a container matches an alert rule's container criteria"""
        try:
            # If the rule has specific container+host pairs, require an exact match
            if hasattr(rule, 'containers') and rule.containers:
                for container_spec in rule.containers:
                    if (container_spec.container_name == container_info['name'] and
                            container_spec.host_id == container_info['host_id']):
                        return True
                return False

            # Otherwise, the rule applies to all containers
            return True

        except Exception as e:
            logger.error(f"Error matching container to rule: {e}")
            return False

    async def start_monitoring(self, notification_service, monitor, connection_manager=None):
        """Start monitoring for blackout window transitions

        Args:
            notification_service: The notification service instance
            monitor: The DockerMonitor instance (reused, not created)
            connection_manager: Optional WebSocket connection manager
        """
        self._connection_manager = connection_manager
        self._monitor = monitor  # Store monitor reference

        async def monitor_loop():
            was_in_blackout = False

            while True:
                try:
                    is_blackout, window_name = self.is_in_blackout_window()

                    # Check if the blackout status changed
                    if was_in_blackout != is_blackout:
                        # Broadcast the status change to all WebSocket clients
                        if self._connection_manager:
                            await self._connection_manager.broadcast({
                                'type': 'blackout_status_changed',
                                'data': {
                                    'is_blackout': is_blackout,
                                    'window_name': window_name
                                }
                            })

                        # If we just exited a blackout, process suppressed alerts
                        if was_in_blackout and not is_blackout:
                            logger.info("Blackout window ended. Processing suppressed alerts...")
                            await notification_service.process_suppressed_alerts(self._monitor)

                    was_in_blackout = is_blackout

                    # Check every 15 seconds for more responsive updates
                    await asyncio.sleep(15)

                except Exception as e:
                    logger.error(f"Error in blackout monitoring: {e}")
                    await asyncio.sleep(15)

        self._check_task = asyncio.create_task(monitor_loop())

    def stop_monitoring(self):
        """Stop the monitoring task"""
        if self._check_task:
            self._check_task.cancel()
            self._check_task = None
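
# Typical wiring (a sketch, not part of this module's API; assumes the DockMon
# application already holds a DatabaseManager, notification service, DockerMonitor
# and WebSocket connection manager -- the variable names below are illustrative):
#
#   blackout = BlackoutManager(db)
#   await blackout.start_monitoring(notification_service, monitor, connection_manager)
#   ...
#   in_blackout, window_name = blackout.is_in_blackout_window()
#   ...
#   blackout.stop_monitoring()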