361 lines
15 KiB
Python
361 lines
15 KiB
Python
"""
|
|
Client for communicating with the Go stats service
|
|
"""
|
|
import aiohttp
|
|
import asyncio
|
|
import logging
|
|
import os
|
|
from typing import Dict, Optional, Callable
|
|
import json
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
STATS_SERVICE_URL = "http://localhost:8081"
|
|
TOKEN_FILE_PATH = "/tmp/stats-service-token"
|
|
|
|
|
|
class StatsServiceClient:
|
|
"""Client for the Go stats service"""
|
|
|
|
def __init__(self, base_url: str = STATS_SERVICE_URL):
|
|
self.base_url = base_url
|
|
self.session: Optional[aiohttp.ClientSession] = None
|
|
self.token: Optional[str] = None
|
|
self.ws_connection: Optional[aiohttp.ClientWebSocketResponse] = None
|
|
self.ws_task: Optional[asyncio.Task] = None
|
|
self.event_callback: Optional[Callable] = None
|
|
self._token_lock = asyncio.Lock()
|
|
|
|
async def _load_token(self) -> str:
|
|
"""Load auth token from file (with retry for startup race condition)"""
|
|
# Fast path: check without lock first (double-check locking pattern)
|
|
if self.token:
|
|
return self.token
|
|
|
|
async with self._token_lock:
|
|
# Check again after acquiring lock (another thread may have loaded it)
|
|
if self.token:
|
|
return self.token
|
|
|
|
# Retry logic: Wait up to 5 seconds for token file to appear
|
|
for attempt in range(10):
|
|
try:
|
|
if os.path.exists(TOKEN_FILE_PATH):
|
|
with open(TOKEN_FILE_PATH, 'r') as f:
|
|
self.token = f.read().strip()
|
|
logger.info("Loaded stats service auth token")
|
|
return self.token
|
|
except Exception as e:
|
|
logger.warning(f"Failed to read token file (attempt {attempt + 1}): {e}")
|
|
|
|
await asyncio.sleep(0.5)
|
|
|
|
raise RuntimeError(f"Failed to load stats service token from {TOKEN_FILE_PATH}")
|
|
|
|
async def _get_session(self) -> aiohttp.ClientSession:
|
|
"""Get or create aiohttp session with auth header"""
|
|
if self.session is None or self.session.closed:
|
|
token = await self._load_token()
|
|
timeout = aiohttp.ClientTimeout(total=5)
|
|
headers = {"Authorization": f"Bearer {token}"}
|
|
self.session = aiohttp.ClientSession(timeout=timeout, headers=headers)
|
|
return self.session
|
|
|
|
async def close(self):
|
|
"""Close the HTTP session and WebSocket connection"""
|
|
# Close WebSocket
|
|
if self.ws_task:
|
|
self.ws_task.cancel()
|
|
try:
|
|
await self.ws_task
|
|
except asyncio.CancelledError:
|
|
pass
|
|
|
|
if self.ws_connection and not self.ws_connection.closed:
|
|
await self.ws_connection.close()
|
|
|
|
# Close HTTP session
|
|
if self.session and not self.session.closed:
|
|
await self.session.close()
|
|
|
|
async def health_check(self) -> bool:
|
|
"""Check if stats service is healthy"""
|
|
try:
|
|
session = await self._get_session()
|
|
async with session.get(f"{self.base_url}/health") as resp:
|
|
return resp.status == 200
|
|
except Exception as e:
|
|
logger.warning(f"Stats service health check failed: {e}")
|
|
return False
|
|
|
|
async def add_docker_host(self, host_id: str, host_address: str, tls_ca: str = None, tls_cert: str = None, tls_key: str = None) -> bool:
|
|
"""Register a Docker host with the stats service"""
|
|
try:
|
|
session = await self._get_session()
|
|
payload = {"host_id": host_id, "host_address": host_address}
|
|
|
|
# Add TLS certificates if provided
|
|
if tls_ca and tls_cert and tls_key:
|
|
payload["tls_ca_cert"] = tls_ca
|
|
payload["tls_cert"] = tls_cert
|
|
payload["tls_key"] = tls_key
|
|
|
|
async with session.post(
|
|
f"{self.base_url}/api/hosts/add",
|
|
json=payload
|
|
) as resp:
|
|
if resp.status == 200:
|
|
logger.info(f"Registered host {host_id} with stats service")
|
|
return True
|
|
else:
|
|
logger.error(f"Failed to register host {host_id}: {resp.status}")
|
|
return False
|
|
except Exception as e:
|
|
logger.error(f"Error registering host {host_id} with stats service: {e}")
|
|
return False
|
|
|
|
async def remove_docker_host(self, host_id: str) -> bool:
|
|
"""Remove a Docker host from the stats service"""
|
|
try:
|
|
session = await self._get_session()
|
|
async with session.post(
|
|
f"{self.base_url}/api/hosts/remove",
|
|
json={"host_id": host_id}
|
|
) as resp:
|
|
if resp.status == 200:
|
|
logger.info(f"Removed host {host_id[:8]} from stats service")
|
|
return True
|
|
else:
|
|
logger.warning(f"Failed to remove host {host_id[:8]} from stats service: {resp.status}")
|
|
return False
|
|
except asyncio.TimeoutError:
|
|
# Timeout during host removal is expected - Go service closes connections immediately
|
|
logger.debug(f"Timeout removing host {host_id[:8]} from stats service (expected during cleanup)")
|
|
return False
|
|
except Exception as e:
|
|
logger.warning(f"Error removing host {host_id[:8]} from stats service: {e}")
|
|
return False
|
|
|
|
async def start_container_stream(self, container_id: str, container_name: str, host_id: str) -> bool:
|
|
"""Start stats streaming for a container"""
|
|
try:
|
|
session = await self._get_session()
|
|
async with session.post(
|
|
f"{self.base_url}/api/streams/start",
|
|
json={
|
|
"container_id": container_id,
|
|
"container_name": container_name,
|
|
"host_id": host_id
|
|
}
|
|
) as resp:
|
|
if resp.status == 200:
|
|
logger.debug(f"Started stats stream for container {container_id[:12]}")
|
|
return True
|
|
else:
|
|
error_text = await resp.text()
|
|
logger.warning(f"Failed to start stream for {container_id[:12]}: HTTP {resp.status} - {error_text}")
|
|
return False
|
|
except asyncio.TimeoutError:
|
|
# Timeout errors are expected during host cleanup - log at debug level
|
|
logger.debug(f"Timeout starting stream for {container_id[:12]} (expected during host cleanup)")
|
|
return False
|
|
except Exception as e:
|
|
logger.warning(f"Error starting stream for {container_id[:12]}: {type(e).__name__}: {str(e)}", exc_info=True)
|
|
return False
|
|
|
|
async def stop_container_stream(self, container_id: str, host_id: str) -> bool:
|
|
"""Stop stats streaming for a container"""
|
|
try:
|
|
session = await self._get_session()
|
|
async with session.post(
|
|
f"{self.base_url}/api/streams/stop",
|
|
json={"container_id": container_id, "host_id": host_id}
|
|
) as resp:
|
|
if resp.status == 200:
|
|
logger.debug(f"Stopped stats stream for container {container_id[:12]}")
|
|
return True
|
|
else:
|
|
logger.warning(f"Failed to stop stream for {container_id[:12]}: {resp.status}")
|
|
return False
|
|
except asyncio.TimeoutError:
|
|
# Timeout errors are expected when bulk stopping streams - log at debug level
|
|
logger.debug(f"Timeout stopping stream for {container_id[:12]} (expected during bulk stop)")
|
|
return False
|
|
except Exception as e:
|
|
logger.warning(f"Error stopping stream for {container_id[:12]}: {e}")
|
|
return False
|
|
|
|
async def get_host_stats(self) -> Dict[str, Dict]:
|
|
"""
|
|
Get aggregated stats for all hosts
|
|
Returns: {host_id: {cpu_percent, memory_percent, ...}}
|
|
"""
|
|
try:
|
|
session = await self._get_session()
|
|
async with session.get(f"{self.base_url}/api/stats/hosts") as resp:
|
|
if resp.status == 200:
|
|
stats = await resp.json()
|
|
logger.debug(f"Received stats for {len(stats)} hosts from stats service")
|
|
return stats
|
|
else:
|
|
logger.error(f"Failed to get host stats: {resp.status}")
|
|
return {}
|
|
except Exception as e:
|
|
logger.error(f"Error getting host stats from stats service: {e}")
|
|
return {}
|
|
|
|
async def get_container_stats(self) -> Dict[str, Dict]:
|
|
"""
|
|
Get stats for all containers (for debugging)
|
|
Returns: {container_id: {cpu_percent, memory_percent, ...}}
|
|
"""
|
|
try:
|
|
session = await self._get_session()
|
|
async with session.get(f"{self.base_url}/api/stats/containers") as resp:
|
|
if resp.status == 200:
|
|
return await resp.json()
|
|
else:
|
|
logger.error(f"Failed to get container stats: {resp.status}")
|
|
return {}
|
|
except Exception as e:
|
|
logger.error(f"Error getting container stats from stats service: {e}")
|
|
return {}
|
|
|
|
# Event service methods
|
|
|
|
async def add_event_host(self, host_id: str, host_address: str, tls_ca: str = None, tls_cert: str = None, tls_key: str = None) -> bool:
|
|
"""Register a Docker host with the event monitoring service"""
|
|
try:
|
|
session = await self._get_session()
|
|
payload = {"host_id": host_id, "host_address": host_address}
|
|
|
|
# Add TLS certificates if provided
|
|
if tls_ca and tls_cert and tls_key:
|
|
payload["tls_ca_cert"] = tls_ca
|
|
payload["tls_cert"] = tls_cert
|
|
payload["tls_key"] = tls_key
|
|
|
|
async with session.post(
|
|
f"{self.base_url}/api/events/hosts/add",
|
|
json=payload
|
|
) as resp:
|
|
if resp.status == 200:
|
|
logger.info(f"Registered host {host_id[:8]} with event service")
|
|
return True
|
|
else:
|
|
logger.error(f"Failed to register host {host_id[:8]} with event service: {resp.status}")
|
|
return False
|
|
except Exception as e:
|
|
logger.error(f"Error registering host {host_id[:8]} with event service: {e}")
|
|
return False
|
|
|
|
async def remove_event_host(self, host_id: str) -> bool:
|
|
"""Remove a Docker host from event monitoring"""
|
|
try:
|
|
session = await self._get_session()
|
|
async with session.post(
|
|
f"{self.base_url}/api/events/hosts/remove",
|
|
json={"host_id": host_id}
|
|
) as resp:
|
|
if resp.status == 200:
|
|
logger.info(f"Removed host {host_id[:8]} from event service")
|
|
return True
|
|
else:
|
|
logger.warning(f"Failed to remove host {host_id[:8]} from event service: {resp.status}")
|
|
return False
|
|
except Exception as e:
|
|
logger.warning(f"Error removing host {host_id[:8]} from event service: {e}")
|
|
return False
|
|
|
|
async def get_recent_events(self, host_id: Optional[str] = None) -> list:
|
|
"""Get recent cached events"""
|
|
try:
|
|
session = await self._get_session()
|
|
url = f"{self.base_url}/api/events/recent"
|
|
if host_id:
|
|
url += f"?host_id={host_id}"
|
|
|
|
async with session.get(url) as resp:
|
|
if resp.status == 200:
|
|
return await resp.json()
|
|
else:
|
|
logger.error(f"Failed to get recent events: {resp.status}")
|
|
return []
|
|
except Exception as e:
|
|
logger.error(f"Error getting recent events: {e}")
|
|
return []
|
|
|
|
async def connect_event_stream(self, event_callback: Callable):
|
|
"""
|
|
Connect to the WebSocket event stream
|
|
|
|
Args:
|
|
event_callback: Async function to call with each event
|
|
"""
|
|
self.event_callback = event_callback
|
|
|
|
# Start WebSocket connection in background
|
|
self.ws_task = asyncio.create_task(self._event_stream_loop())
|
|
logger.info("Started event stream WebSocket connection task")
|
|
|
|
async def _event_stream_loop(self):
|
|
"""Background task that maintains WebSocket connection and processes events"""
|
|
backoff = 1
|
|
max_backoff = 30
|
|
|
|
while True:
|
|
try:
|
|
# Load token
|
|
token = await self._load_token()
|
|
|
|
# Connect to WebSocket with token in URL
|
|
ws_url = f"{self.base_url.replace('http', 'ws')}/ws/events?token={token}"
|
|
|
|
session = await self._get_session()
|
|
async with session.ws_connect(ws_url) as ws:
|
|
self.ws_connection = ws
|
|
logger.info("Connected to event stream WebSocket")
|
|
backoff = 1 # Reset backoff on successful connection
|
|
|
|
# Process messages
|
|
async for msg in ws:
|
|
if msg.type == aiohttp.WSMsgType.TEXT:
|
|
try:
|
|
event = json.loads(msg.data)
|
|
if self.event_callback:
|
|
await self.event_callback(event)
|
|
except json.JSONDecodeError as e:
|
|
logger.error(f"Failed to decode event JSON: {e}")
|
|
except Exception as e:
|
|
logger.error(f"Error processing event: {e}")
|
|
|
|
elif msg.type == aiohttp.WSMsgType.ERROR:
|
|
logger.error(f"WebSocket error: {ws.exception()}")
|
|
break
|
|
|
|
elif msg.type == aiohttp.WSMsgType.CLOSED:
|
|
logger.warning("WebSocket connection closed by server")
|
|
break
|
|
|
|
self.ws_connection = None
|
|
|
|
except asyncio.CancelledError:
|
|
logger.info("Event stream WebSocket task cancelled")
|
|
break
|
|
|
|
except Exception as e:
|
|
logger.error(f"Event stream WebSocket error: {e}, reconnecting in {backoff}s")
|
|
await asyncio.sleep(backoff)
|
|
backoff = min(backoff * 2, max_backoff)
|
|
|
|
|
|
# Global instance
|
|
_stats_client = None
|
|
|
|
def get_stats_client() -> StatsServiceClient:
|
|
"""Get the global stats client instance"""
|
|
global _stats_client
|
|
if _stats_client is None:
|
|
_stats_client = StatsServiceClient()
|
|
return _stats_client
|