Switched from Dockmon to Beszel
150
dockmon/stats-service/aggregator.go
Normal file
@@ -0,0 +1,150 @@
package main

import (
	"context"
	"log"
	"time"
)

// Aggregator aggregates container stats into host-level metrics
type Aggregator struct {
	cache             *StatsCache
	streamManager     *StreamManager
	aggregateInterval time.Duration
}

// NewAggregator creates a new aggregator
func NewAggregator(cache *StatsCache, streamManager *StreamManager, interval time.Duration) *Aggregator {
	return &Aggregator{
		cache:             cache,
		streamManager:     streamManager,
		aggregateInterval: interval,
	}
}

// Start begins the aggregation loop
func (a *Aggregator) Start(ctx context.Context) {
	ticker := time.NewTicker(a.aggregateInterval)
	defer ticker.Stop()

	log.Printf("Aggregator started (interval: %v)", a.aggregateInterval)

	// Run once immediately
	a.aggregate()

	for {
		select {
		case <-ctx.Done():
			log.Println("Aggregator stopped")
			return
		case <-ticker.C:
			a.aggregate()
		}
	}
}

// aggregate calculates host-level stats from container stats
func (a *Aggregator) aggregate() {
	containerStats := a.cache.GetAllContainerStats()

	// Group containers by host
	hostContainers := make(map[string][]*ContainerStats)
	for _, stats := range containerStats {
		hostContainers[stats.HostID] = append(hostContainers[stats.HostID], stats)
	}

	// Aggregate stats for each host that has a registered Docker client
	for hostID, containers := range hostContainers {
		// Only aggregate if the host still has a registered Docker client.
		// This prevents recreating stats for hosts that were just deleted.
		if a.streamManager.HasHost(hostID) {
			hostStats := a.aggregateHostStats(hostID, containers)
			a.cache.UpdateHostStats(hostStats)
		}
	}
}

// aggregateHostStats aggregates stats for a single host
func (a *Aggregator) aggregateHostStats(hostID string, containers []*ContainerStats) *HostStats {
	if len(containers) == 0 {
		return &HostStats{
			HostID:         hostID,
			ContainerCount: 0,
		}
	}

	var (
		totalCPU        float64
		totalMemUsage   uint64
		totalMemLimit   uint64
		totalNetRx      uint64
		totalNetTx      uint64
		validContainers int
	)

	const maxUint64 = ^uint64(0)

	// Only count containers updated in the last 30 seconds
	cutoff := time.Now().Add(-30 * time.Second)

	for _, stats := range containers {
		if stats.LastUpdate.Before(cutoff) {
			continue // Skip stale stats
		}

		totalCPU += stats.CPUPercent
		totalMemUsage += stats.MemoryUsage
		totalMemLimit += stats.MemoryLimit

		// Check for overflow before adding network bytes
		if maxUint64-totalNetRx < stats.NetworkRx {
			log.Printf("Warning: Network RX overflow prevented for host %s", truncateID(hostID, 8))
			totalNetRx = maxUint64 // Cap at max instead of wrapping
		} else {
			totalNetRx += stats.NetworkRx
		}

		if maxUint64-totalNetTx < stats.NetworkTx {
			log.Printf("Warning: Network TX overflow prevented for host %s", truncateID(hostID, 8))
			totalNetTx = maxUint64
		} else {
			totalNetTx += stats.NetworkTx
		}

		validContainers++
	}

	// Calculate totals and percentages
	var cpuPercent, memPercent float64

	// CPU is the sum of all container CPU percentages (represents total host CPU usage)
	cpuPercent = totalCPU

	if totalMemLimit > 0 {
		memPercent = (float64(totalMemUsage) / float64(totalMemLimit)) * 100.0
	}

	// Round to 1 decimal place
	cpuPercent = roundToDecimal(cpuPercent, 1)
	memPercent = roundToDecimal(memPercent, 1)

	return &HostStats{
		HostID:           hostID,
		CPUPercent:       cpuPercent,
		MemoryPercent:    memPercent,
		MemoryUsedBytes:  totalMemUsage,
		MemoryLimitBytes: totalMemLimit,
		NetworkRxBytes:   totalNetRx,
		NetworkTxBytes:   totalNetTx,
		ContainerCount:   validContainers,
	}
}

// roundToDecimal rounds a float to n decimal places
func roundToDecimal(value float64, places int) float64 {
	shift := float64(1)
	for i := 0; i < places; i++ {
		shift *= 10
	}
	return float64(int(value*shift+0.5)) / shift
}
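Editor's note: roundToDecimal's truncation trick (int(x+0.5)) rounds correctly only for non-negative inputs, which holds for the percentages aggregated above. A minimal sketch of a hypothetical standard-library alternative, assuming "math" is imported:

// Hypothetical alternative, not part of this commit: math.Round also handles
// negative values and sidesteps int overflow on very large floats.
func roundToDecimalStd(value float64, places int) float64 {
	shift := math.Pow(10, float64(places))
	return math.Round(value*shift) / shift
}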
172
dockmon/stats-service/cache.go
Normal file
@@ -0,0 +1,172 @@
package main

import (
	"sync"
	"time"
)

// ContainerStats holds real-time stats for a single container
type ContainerStats struct {
	ContainerID   string    `json:"container_id"`
	ContainerName string    `json:"container_name"`
	HostID        string    `json:"host_id"`
	CPUPercent    float64   `json:"cpu_percent"`
	MemoryUsage   uint64    `json:"memory_usage"`
	MemoryLimit   uint64    `json:"memory_limit"`
	MemoryPercent float64   `json:"memory_percent"`
	NetworkRx     uint64    `json:"network_rx"`
	NetworkTx     uint64    `json:"network_tx"`
	DiskRead      uint64    `json:"disk_read"`
	DiskWrite     uint64    `json:"disk_write"`
	LastUpdate    time.Time `json:"last_update"`
}

// HostStats holds aggregated stats for a host
type HostStats struct {
	HostID           string    `json:"host_id"`
	CPUPercent       float64   `json:"cpu_percent"`
	MemoryPercent    float64   `json:"memory_percent"`
	MemoryUsedBytes  uint64    `json:"memory_used_bytes"`
	MemoryLimitBytes uint64    `json:"memory_limit_bytes"`
	NetworkRxBytes   uint64    `json:"network_rx_bytes"`
	NetworkTxBytes   uint64    `json:"network_tx_bytes"`
	ContainerCount   int       `json:"container_count"`
	LastUpdate       time.Time `json:"last_update"`
}

// StatsCache is a thread-safe cache for container and host stats
type StatsCache struct {
	mu             sync.RWMutex
	containerStats map[string]*ContainerStats // key: composite key (hostID:containerID)
	hostStats      map[string]*HostStats      // key: hostID
}

// NewStatsCache creates a new stats cache
func NewStatsCache() *StatsCache {
	return &StatsCache{
		containerStats: make(map[string]*ContainerStats),
		hostStats:      make(map[string]*HostStats),
	}
}

// UpdateContainerStats updates stats for a container
func (c *StatsCache) UpdateContainerStats(stats *ContainerStats) {
	c.mu.Lock()
	defer c.mu.Unlock()

	stats.LastUpdate = time.Now()
	// Use composite key to support containers with duplicate IDs on different hosts
	compositeKey := stats.HostID + ":" + stats.ContainerID
	c.containerStats[compositeKey] = stats
}

// GetContainerStats retrieves stats for a specific container
func (c *StatsCache) GetContainerStats(containerID, hostID string) (*ContainerStats, bool) {
	c.mu.RLock()
	defer c.mu.RUnlock()

	compositeKey := hostID + ":" + containerID
	stats, ok := c.containerStats[compositeKey]
	return stats, ok
}

// GetAllContainerStats returns all container stats
func (c *StatsCache) GetAllContainerStats() map[string]*ContainerStats {
	c.mu.RLock()
	defer c.mu.RUnlock()

	// Return a copy to avoid race conditions
	result := make(map[string]*ContainerStats, len(c.containerStats))
	for k, v := range c.containerStats {
		statsCopy := *v
		result[k] = &statsCopy
	}
	return result
}

// RemoveContainerStats removes stats for a container (when it stops)
func (c *StatsCache) RemoveContainerStats(containerID, hostID string) {
	c.mu.Lock()
	defer c.mu.Unlock()

	compositeKey := hostID + ":" + containerID
	delete(c.containerStats, compositeKey)
}

// UpdateHostStats updates aggregated stats for a host
func (c *StatsCache) UpdateHostStats(stats *HostStats) {
	c.mu.Lock()
	defer c.mu.Unlock()

	stats.LastUpdate = time.Now()
	c.hostStats[stats.HostID] = stats
}

// GetHostStats retrieves stats for a specific host
func (c *StatsCache) GetHostStats(hostID string) (*HostStats, bool) {
	c.mu.RLock()
	defer c.mu.RUnlock()

	stats, ok := c.hostStats[hostID]
	return stats, ok
}

// GetAllHostStats returns all host stats
func (c *StatsCache) GetAllHostStats() map[string]*HostStats {
	c.mu.RLock()
	defer c.mu.RUnlock()

	// Return a copy to avoid race conditions
	result := make(map[string]*HostStats, len(c.hostStats))
	for k, v := range c.hostStats {
		statsCopy := *v
		result[k] = &statsCopy
	}
	return result
}

// RemoveHostStats removes all stats for a specific host
func (c *StatsCache) RemoveHostStats(hostID string) {
	c.mu.Lock()
	defer c.mu.Unlock()

	// Remove host stats
	delete(c.hostStats, hostID)

	// Remove all container stats for this host
	for id, stats := range c.containerStats {
		if stats.HostID == hostID {
			delete(c.containerStats, id)
		}
	}
}

// CleanStaleStats removes stats older than maxAge
func (c *StatsCache) CleanStaleStats(maxAge time.Duration) {
	c.mu.Lock()
	defer c.mu.Unlock()

	now := time.Now()

	// Clean container stats
	for id, stats := range c.containerStats {
		if now.Sub(stats.LastUpdate) > maxAge {
			delete(c.containerStats, id)
		}
	}

	// Clean host stats
	for id, stats := range c.hostStats {
		if now.Sub(stats.LastUpdate) > maxAge {
			delete(c.hostStats, id)
		}
	}
}

// GetStats returns a summary of cache state
func (c *StatsCache) GetStats() (containerCount, hostCount int) {
	c.mu.RLock()
	defer c.mu.RUnlock()

	return len(c.containerStats), len(c.hostStats)
}
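Editor's note: a minimal usage sketch (hypothetical, not part of the commit) showing why the composite "hostID:containerID" key matters; it assumes the types defined above:

// Containers with the same ID on different hosts do not collide in the cache.
func exampleCacheUsage() {
	cache := NewStatsCache()

	cache.UpdateContainerStats(&ContainerStats{HostID: "host-a", ContainerID: "abc123", CPUPercent: 1.5})
	cache.UpdateContainerStats(&ContainerStats{HostID: "host-b", ContainerID: "abc123", CPUPercent: 7.0})

	if stats, ok := cache.GetContainerStats("abc123", "host-a"); ok {
		_ = stats.CPUPercent // 1.5, independent of host-b's entry
	}
}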
126
dockmon/stats-service/event_broadcaster.go
Normal file
@@ -0,0 +1,126 @@
package main

import (
	"encoding/json"
	"log"
	"sync"

	"github.com/gorilla/websocket"
)

// EventBroadcaster manages WebSocket connections and broadcasts events
type EventBroadcaster struct {
	mu             sync.RWMutex
	connections    map[*websocket.Conn]*sync.Mutex // Each connection has its own write mutex
	maxConnections int
}

// NewEventBroadcaster creates a new event broadcaster
func NewEventBroadcaster() *EventBroadcaster {
	return &EventBroadcaster{
		connections:    make(map[*websocket.Conn]*sync.Mutex),
		maxConnections: 100, // Limit to 100 concurrent WebSocket connections
	}
}

// AddConnection registers a new WebSocket connection
func (eb *EventBroadcaster) AddConnection(conn *websocket.Conn) error {
	eb.mu.Lock()
	defer eb.mu.Unlock()

	// Check connection limit
	if len(eb.connections) >= eb.maxConnections {
		log.Printf("WebSocket connection limit reached (%d), rejecting new connection", eb.maxConnections)
		return &websocket.CloseError{Code: websocket.ClosePolicyViolation, Text: "Connection limit reached"}
	}

	eb.connections[conn] = &sync.Mutex{} // Create a dedicated mutex for this connection
	log.Printf("WebSocket connected to events. Total connections: %d", len(eb.connections))
	return nil
}

// RemoveConnection unregisters a WebSocket connection
func (eb *EventBroadcaster) RemoveConnection(conn *websocket.Conn) {
	eb.mu.Lock()
	defer eb.mu.Unlock()
	delete(eb.connections, conn)
	log.Printf("WebSocket disconnected from events. Total connections: %d", len(eb.connections))
}

// Broadcast sends an event to all connected WebSocket clients
func (eb *EventBroadcaster) Broadcast(event DockerEvent) {
	// Marshal event to JSON
	data, err := json.Marshal(event)
	if err != nil {
		log.Printf("Error marshaling event: %v", err)
		return
	}

	// Track dead connections
	var deadConnections []*websocket.Conn

	// Get snapshot of connections with their mutexes
	eb.mu.RLock()
	connMutexes := make(map[*websocket.Conn]*sync.Mutex, len(eb.connections))
	for conn, mu := range eb.connections {
		connMutexes[conn] = mu
	}
	eb.mu.RUnlock()

	// Send to all connections (with per-connection write lock)
	for conn, mu := range connMutexes {
		mu.Lock()
		err := conn.WriteMessage(websocket.TextMessage, data)
		mu.Unlock()

		if err != nil {
			log.Printf("Error sending event to WebSocket: %v", err)
			deadConnections = append(deadConnections, conn)
		}
	}

	// Clean up dead connections
	if len(deadConnections) > 0 {
		// Remove from map first (fast, under lock)
		eb.mu.Lock()
		var connectionsToClose []*websocket.Conn
		for _, conn := range deadConnections {
			// Only delete if connection still exists in map
			if _, exists := eb.connections[conn]; exists {
				delete(eb.connections, conn)
				connectionsToClose = append(connectionsToClose, conn)
			}
		}
		eb.mu.Unlock()

		// Close connections outside lock (slow, can block)
		for _, conn := range connectionsToClose {
			conn.Close()
		}
	}
}

// GetConnectionCount returns the number of active WebSocket connections
func (eb *EventBroadcaster) GetConnectionCount() int {
	eb.mu.RLock()
	defer eb.mu.RUnlock()
	return len(eb.connections)
}

// CloseAll closes all WebSocket connections
func (eb *EventBroadcaster) CloseAll() {
	eb.mu.Lock()
	var connectionsToClose []*websocket.Conn
	for conn := range eb.connections {
		connectionsToClose = append(connectionsToClose, conn)
	}
	eb.connections = make(map[*websocket.Conn]*sync.Mutex)
	eb.mu.Unlock()

	// Close connections outside lock (can block on network I/O)
	for _, conn := range connectionsToClose {
		conn.Close()
	}

	log.Println("Closed all event WebSocket connections")
}
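Editor's note: a hypothetical handler sketch (not part of this commit) showing how an HTTP endpoint could hand upgraded connections to the broadcaster. It assumes "net/http" is imported; the Upgrader settings, in particular the permissive CheckOrigin, are assumptions and would need the real origin policy:

func eventsWSHandler(eb *EventBroadcaster) http.HandlerFunc {
	upgrader := websocket.Upgrader{
		CheckOrigin: func(r *http.Request) bool { return true }, // tighten in production
	}
	return func(w http.ResponseWriter, r *http.Request) {
		conn, err := upgrader.Upgrade(w, r, nil)
		if err != nil {
			return // Upgrade already wrote an HTTP error response
		}
		if err := eb.AddConnection(conn); err != nil {
			conn.Close() // over the connection limit
			return
		}
		defer func() {
			eb.RemoveConnection(conn)
			conn.Close()
		}()
		// Drain reads so close frames and pings from the client are processed.
		for {
			if _, _, err := conn.ReadMessage(); err != nil {
				return
			}
		}
	}
}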
106
dockmon/stats-service/event_cache.go
Normal file
@@ -0,0 +1,106 @@
package main

import (
	"sync"
)

// EventCache stores recent events for each host (ring buffer)
type EventCache struct {
	mu      sync.RWMutex
	events  map[string][]DockerEvent // key: hostID, value: ring buffer of events
	maxSize int                      // maximum events to keep per host
}

// NewEventCache creates a new event cache
func NewEventCache(maxSize int) *EventCache {
	return &EventCache{
		events:  make(map[string][]DockerEvent),
		maxSize: maxSize,
	}
}

// AddEvent adds an event to the cache for a specific host
func (ec *EventCache) AddEvent(hostID string, event DockerEvent) {
	ec.mu.Lock()
	defer ec.mu.Unlock()

	// Initialize slice if needed
	if _, exists := ec.events[hostID]; !exists {
		ec.events[hostID] = make([]DockerEvent, 0, ec.maxSize)
	}

	// Add event
	ec.events[hostID] = append(ec.events[hostID], event)

	// Trim if over max size (keep most recent)
	if len(ec.events[hostID]) > ec.maxSize {
		ec.events[hostID] = ec.events[hostID][len(ec.events[hostID])-ec.maxSize:]
	}
}

// GetRecentEvents returns recent events for a specific host
func (ec *EventCache) GetRecentEvents(hostID string, limit int) []DockerEvent {
	ec.mu.RLock()
	defer ec.mu.RUnlock()

	events, exists := ec.events[hostID]
	if !exists || len(events) == 0 {
		return []DockerEvent{}
	}

	// Return last N events
	if limit <= 0 || limit > len(events) {
		limit = len(events)
	}

	// Return copy to avoid race conditions
	result := make([]DockerEvent, limit)
	copy(result, events[len(events)-limit:])
	return result
}

// GetAllRecentEvents returns recent events for all hosts
func (ec *EventCache) GetAllRecentEvents(limit int) map[string][]DockerEvent {
	ec.mu.RLock()
	defer ec.mu.RUnlock()

	result := make(map[string][]DockerEvent)

	for hostID, events := range ec.events {
		if len(events) == 0 {
			continue
		}

		// Get last N events
		count := limit
		if count <= 0 || count > len(events) {
			count = len(events)
		}

		// Copy to avoid race conditions
		hostEvents := make([]DockerEvent, count)
		copy(hostEvents, events[len(events)-count:])
		result[hostID] = hostEvents
	}

	return result
}

// ClearHost removes all cached events for a specific host
func (ec *EventCache) ClearHost(hostID string) {
	ec.mu.Lock()
	defer ec.mu.Unlock()
	delete(ec.events, hostID)
}

// GetStats returns cache statistics
func (ec *EventCache) GetStats() (hostCount int, totalEvents int) {
	ec.mu.RLock()
	defer ec.mu.RUnlock()

	hostCount = len(ec.events)
	for _, events := range ec.events {
		totalEvents += len(events)
	}
	return
}
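Editor's note: a small hypothetical sketch (not part of the commit) illustrating the trim behavior in AddEvent; only the maxSize most recent events per host survive:

func exampleEventCache() {
	ec := NewEventCache(3)
	for i := 0; i < 5; i++ {
		ec.AddEvent("host-a", DockerEvent{Action: "start"})
	}
	// A limit larger than the stored count is capped in GetRecentEvents.
	recent := ec.GetRecentEvents("host-a", 10)
	_ = len(recent) // == 3
}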
351
dockmon/stats-service/event_manager.go
Normal file
@@ -0,0 +1,351 @@
package main

import (
	"context"
	"crypto/tls"
	"crypto/x509"
	"fmt"
	"log"
	"net"
	"net/http"
	"sync"
	"time"

	"github.com/docker/docker/api/types/events"
	"github.com/docker/docker/api/types/filters"
	"github.com/docker/docker/client"
)

// DockerEvent represents a Docker container event
type DockerEvent struct {
	Action        string            `json:"action"`
	ContainerID   string            `json:"container_id"`
	ContainerName string            `json:"container_name"`
	Image         string            `json:"image"`
	HostID        string            `json:"host_id"`
	Timestamp     string            `json:"timestamp"`
	Attributes    map[string]string `json:"attributes"`
}

// EventManager manages Docker event streams for multiple hosts
type EventManager struct {
	mu          sync.RWMutex
	hosts       map[string]*eventStream // key: hostID
	broadcaster *EventBroadcaster
	eventCache  *EventCache
}

// eventStream represents a single Docker host event stream
type eventStream struct {
	hostID   string
	hostAddr string
	client   *client.Client
	ctx      context.Context
	cancel   context.CancelFunc
	active   bool
}

// createEventTLSOption creates a Docker client TLS option from PEM-encoded certificates
func createEventTLSOption(caCertPEM, certPEM, keyPEM string) (client.Opt, error) {
	// Parse CA certificate
	caCertPool := x509.NewCertPool()
	if !caCertPool.AppendCertsFromPEM([]byte(caCertPEM)) {
		return nil, fmt.Errorf("failed to parse CA certificate")
	}

	// Parse client certificate and key
	clientCert, err := tls.X509KeyPair([]byte(certPEM), []byte(keyPEM))
	if err != nil {
		return nil, fmt.Errorf("failed to parse client certificate/key: %v", err)
	}

	// Create TLS config
	tlsConfig := &tls.Config{
		Certificates: []tls.Certificate{clientCert},
		RootCAs:      caCertPool,
		MinVersion:   tls.VersionTLS12,
	}

	// Create HTTP client with TLS transport and timeouts.
	// Note: No overall Timeout set because Docker API streaming operations (stats, events)
	// are long-running connections that should not be killed by a timeout.
	httpClient := &http.Client{
		Transport: &http.Transport{
			DialContext: (&net.Dialer{
				Timeout:   30 * time.Second, // Connection establishment timeout
				KeepAlive: 30 * time.Second, // TCP keepalive interval
			}).DialContext,
			TLSClientConfig:       tlsConfig,
			TLSHandshakeTimeout:   10 * time.Second,
			IdleConnTimeout:       90 * time.Second,
			ResponseHeaderTimeout: 10 * time.Second,
		},
	}

	return client.WithHTTPClient(httpClient), nil
}

// NewEventManager creates a new event manager
func NewEventManager(broadcaster *EventBroadcaster, cache *EventCache) *EventManager {
	return &EventManager{
		hosts:       make(map[string]*eventStream),
		broadcaster: broadcaster,
		eventCache:  cache,
	}
}

// AddHost starts monitoring Docker events for a host
func (em *EventManager) AddHost(hostID, hostAddress, tlsCACert, tlsCert, tlsKey string) error {
	// Create Docker client FIRST (before acquiring lock or stopping old stream)
	var dockerClient *client.Client
	var err error

	if hostAddress == "" || hostAddress == "unix:///var/run/docker.sock" {
		// Local Docker socket
		dockerClient, err = client.NewClientWithOpts(
			client.FromEnv,
			client.WithAPIVersionNegotiation(),
		)
	} else {
		// Remote Docker host - check if TLS is needed
		clientOpts := []client.Opt{
			client.WithHost(hostAddress),
			client.WithAPIVersionNegotiation(),
		}

		// If TLS certificates provided, configure TLS
		if tlsCACert != "" && tlsCert != "" && tlsKey != "" {
			tlsOpt, err := createEventTLSOption(tlsCACert, tlsCert, tlsKey)
			if err != nil {
				return fmt.Errorf("failed to create TLS config: %v", err)
			}
			clientOpts = append(clientOpts, tlsOpt)
		}

		dockerClient, err = client.NewClientWithOpts(clientOpts...)
	}

	if err != nil {
		return err
	}

	// Now that the new client is successfully created, acquire the lock and swap
	em.mu.Lock()
	defer em.mu.Unlock()

	// If already monitoring, stop the old stream (only after new client succeeds)
	if stream, exists := em.hosts[hostID]; exists && stream.active {
		log.Printf("Stopping existing event monitoring for host %s to update", truncateID(hostID, 8))
		stream.cancel()
		stream.active = false
		if stream.client != nil {
			stream.client.Close()
		}
	}

	// Create context for this stream
	ctx, cancel := context.WithCancel(context.Background())

	stream := &eventStream{
		hostID:   hostID,
		hostAddr: hostAddress,
		client:   dockerClient,
		ctx:      ctx,
		cancel:   cancel,
		active:   true,
	}

	em.hosts[hostID] = stream

	// Start event stream in goroutine
	go em.streamEvents(stream)

	log.Printf("Started event monitoring for host %s (%s)", truncateID(hostID, 8), hostAddress)
	return nil
}

// RemoveHost stops monitoring events for a host
func (em *EventManager) RemoveHost(hostID string) {
	em.mu.Lock()
	defer em.mu.Unlock()

	if stream, exists := em.hosts[hostID]; exists {
		stream.cancel()
		stream.active = false
		if stream.client != nil {
			stream.client.Close()
		}

		// Clear cached events for this host
		em.eventCache.ClearHost(hostID)
		delete(em.hosts, hostID)
		log.Printf("Stopped event monitoring for host %s", truncateID(hostID, 8))
	}
}

// StopAll stops all event monitoring
func (em *EventManager) StopAll() {
	em.mu.Lock()
	defer em.mu.Unlock()

	for hostID, stream := range em.hosts {
		stream.cancel()
		stream.active = false
		if stream.client != nil {
			stream.client.Close()
		}
		log.Printf("Stopped event monitoring for host %s", truncateID(hostID, 8))
	}

	em.hosts = make(map[string]*eventStream)
}

// GetActiveHosts returns count of active event streams
func (em *EventManager) GetActiveHosts() int {
	em.mu.RLock()
	defer em.mu.RUnlock()
	return len(em.hosts)
}

// streamEvents listens to Docker events for a specific host
func (em *EventManager) streamEvents(stream *eventStream) {
	defer func() {
		if r := recover(); r != nil {
			log.Printf("Recovered from panic in event stream for %s: %v", truncateID(stream.hostID, 8), r)
		}
	}()

	// Retry loop with exponential backoff
	backoff := time.Second
	maxBackoff := 30 * time.Second
	// Track if we've received any successful events (to know when to reset backoff)
	receivedSuccessfulEvent := false

	for {
		select {
		case <-stream.ctx.Done():
			log.Printf("Event stream for host %s stopped", truncateID(stream.hostID, 8))
			return
		default:
		}

		// Listen to container events only
		eventFilters := filters.NewArgs()
		eventFilters.Add("type", "container")

		eventOptions := events.ListOptions{
			Filters: eventFilters,
		}

		eventsChan, errChan := stream.client.Events(stream.ctx, eventOptions)

		for {
			select {
			case <-stream.ctx.Done():
				return

			case err := <-errChan:
				if err != nil {
					log.Printf("Event stream error for host %s: %v (retrying in %v)", truncateID(stream.hostID, 8), err, backoff)
					time.Sleep(backoff)
					// Only increase backoff if we never got a successful event
					if !receivedSuccessfulEvent {
						backoff = min(backoff*2, maxBackoff)
					} else {
						// We had a successful connection before, reset backoff
						backoff = time.Second
					}
					goto reconnect
				}

			case event := <-eventsChan:
				// Reset backoff on first successful event after reconnection
				if !receivedSuccessfulEvent {
					backoff = time.Second
					receivedSuccessfulEvent = true
				}

				// Process the event with panic recovery
				func() {
					defer func() {
						if r := recover(); r != nil {
							log.Printf("Recovered from panic in processEvent for host %s: %v", truncateID(stream.hostID, 8), r)
						}
					}()
					em.processEvent(stream.hostID, event)
				}()
			}
		}

	reconnect:
		// Continue to next iteration (backoff sleep already happened above).
		// A Go label must be attached to a statement, so the continue is explicit.
		continue
	}
}

// processEvent converts Docker event to our format and broadcasts it
func (em *EventManager) processEvent(hostID string, event events.Message) {
	// Extract container info
	containerID := event.Actor.ID

	// Safely extract attributes with defensive access pattern
	containerName := ""
	image := ""
	if attrs := event.Actor.Attributes; attrs != nil {
		if name, ok := attrs["name"]; ok {
			containerName = name
		}
		if img, ok := attrs["image"]; ok {
			image = img
		}
	}

	// Create our event
	dockerEvent := DockerEvent{
		Action:        string(event.Action),
		ContainerID:   containerID,
		ContainerName: containerName,
		Image:         image,
		HostID:        hostID,
		Timestamp:     time.Unix(event.Time, 0).Format(time.RFC3339),
		Attributes:    event.Actor.Attributes,
	}

	// Only log important events (not noisy exec_* events)
	action := dockerEvent.Action
	if action != "" && !isExecEvent(action) && isImportantEvent(action) {
		log.Printf("Event: %s - container %s (%s) on host %s",
			dockerEvent.Action,
			dockerEvent.ContainerName,
			truncateID(dockerEvent.ContainerID, 12),
			truncateID(hostID, 8))
	}

	// Add to cache
	em.eventCache.AddEvent(hostID, dockerEvent)

	// Broadcast to all WebSocket clients
	em.broadcaster.Broadcast(dockerEvent)
}

// isExecEvent checks if the event is an exec_* event (noisy)
func isExecEvent(action string) bool {
	return len(action) > 5 && action[:5] == "exec_"
}

// isImportantEvent checks if the event should be logged
func isImportantEvent(action string) bool {
	importantEvents := map[string]bool{
		"create":        true,
		"start":         true,
		"stop":          true,
		"die":           true,
		"kill":          true,
		"destroy":       true,
		"pause":         true,
		"unpause":       true,
		"restart":       true,
		"oom":           true,
		"health_status": true,
	}
	return importantEvents[action]
}
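Editor's note: a hypothetical wiring sketch (main.go below does the equivalent through HTTP endpoints) monitoring the local Docker socket; the host ID is made up:

func exampleEventManager() {
	em := NewEventManager(NewEventBroadcaster(), NewEventCache(100))

	// An empty address selects the local unix-socket branch in AddHost;
	// the TLS arguments are blank because none is needed locally.
	if err := em.AddHost("local-host-id", "", "", "", ""); err != nil {
		log.Fatalf("AddHost failed: %v", err)
	}
	defer em.StopAll()
}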
40
dockmon/stats-service/go.mod
Normal file
@@ -0,0 +1,40 @@
module github.com/dockmon/stats-service

go 1.23.0

toolchain go1.23.12

require (
	github.com/docker/docker v27.5.1+incompatible
	github.com/gorilla/websocket v1.5.3
)

require github.com/docker/go-connections v0.5.0 // indirect

require (
	github.com/Microsoft/go-winio v0.6.1 // indirect
	github.com/containerd/log v0.1.0 // indirect
	github.com/distribution/reference v0.6.0 // indirect
	github.com/docker/go-units v0.5.0 // indirect
	github.com/felixge/httpsnoop v1.0.4 // indirect
	github.com/go-logr/logr v1.4.3 // indirect
	github.com/go-logr/stdr v1.2.2 // indirect
	github.com/gogo/protobuf v1.3.2 // indirect
	github.com/moby/docker-image-spec v1.3.1 // indirect
	github.com/moby/term v0.5.0 // indirect
	github.com/morikuni/aec v1.0.0 // indirect
	github.com/opencontainers/go-digest v1.0.0 // indirect
	github.com/opencontainers/image-spec v1.1.0 // indirect
	github.com/pkg/errors v0.9.1 // indirect
	go.opentelemetry.io/auto/sdk v1.1.0 // indirect
	go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0 // indirect
	go.opentelemetry.io/otel v1.38.0 // indirect
	go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.38.0 // indirect
	go.opentelemetry.io/otel/metric v1.38.0 // indirect
	go.opentelemetry.io/otel/trace v1.38.0 // indirect
	golang.org/x/mod v0.8.0 // indirect
	golang.org/x/sys v0.35.0 // indirect
	golang.org/x/time v0.5.0 // indirect
	golang.org/x/tools v0.6.0 // indirect
	gotest.tools/v3 v3.5.1 // indirect
)
128
dockmon/stats-service/go.sum
Normal file
@@ -0,0 +1,128 @@
github.com/Azure/go-ansiterm v0.0.0-20210617225240-d185dfc1b5a1 h1:UQHMgLO+TxOElx5B5HZ4hJQsoJ/PvUvKRhJHDQXO8P8=
github.com/Azure/go-ansiterm v0.0.0-20210617225240-d185dfc1b5a1/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E=
github.com/Microsoft/go-winio v0.6.1 h1:9/kr64B9VUZrLm5YYwbGtUJnMgqWVOdUAXu6Migciow=
github.com/Microsoft/go-winio v0.6.1/go.mod h1:LRdKpFKfdobln8UmuiYcKPot9D2v6svN5+sAH+4kjUM=
github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1xcsSM=
github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw=
github.com/containerd/log v0.1.0 h1:TCJt7ioM2cr/tfR8GPbGf9/VRAX8D2B4PjzCpfX540I=
github.com/containerd/log v0.1.0/go.mod h1:VRRf09a7mHDIRezVKTRCrOq78v577GXq3bSa3EhrzVo=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5QvfrDyIgxBk=
github.com/distribution/reference v0.6.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E=
github.com/docker/docker v27.5.1+incompatible h1:4PYU5dnBYqRQi0294d1FBECqT9ECWeQAIfE8q4YnPY8=
github.com/docker/docker v27.5.1+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk=
github.com/docker/go-connections v0.5.0 h1:USnMq7hx7gwdVZq1L49hLXaFtUdTADjXGp+uj1Br63c=
github.com/docker/go-connections v0.5.0/go.mod h1:ov60Kzw0kKElRwhNs9UlUHAE/F9Fe6GLaXnqyDdmEXc=
github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4=
github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk=
github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI=
github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=
github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q=
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg=
github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2 h1:8Tjv8EJ+pM1xP8mK6egEbD1OgnVTyacbefKhmbLhIhU=
github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2/go.mod h1:pkJQ2tZHJ0aFOVEEot6oZmaVEZcRme73eIFmhiVuRWs=
github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
github.com/moby/docker-image-spec v1.3.1 h1:jMKff3w6PgbfSa69GfNg+zN/XLhfXJGnEx3Nl2EsFP0=
github.com/moby/docker-image-spec v1.3.1/go.mod h1:eKmb5VW8vQEh/BAr2yvVNvuiJuY6UIocYsFu/DxxRpo=
github.com/moby/term v0.5.0 h1:xt8Q1nalod/v7BqbG21f8mQPqH+xAaC9C3N3wfWbVP0=
github.com/moby/term v0.5.0/go.mod h1:8FzsFHVUBGZdbDsJw/ot+X+d5HLUbvklYLJ9uGfcI3Y=
github.com/morikuni/aec v1.0.0 h1:nP9CBfwrvYnBRgY6qfDQkygYDmYwOilePFkwzv4dU8A=
github.com/morikuni/aec v1.0.0/go.mod h1:BbKIizmSmc5MMPqRYbxO4ZU0S0+P200+tUnFx7PXmsc=
github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U=
github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM=
github.com/opencontainers/image-spec v1.1.0 h1:8SG7/vwALn54lVB/0yZ/MMwhFrPYtpEHQb2IpWsCzug=
github.com/opencontainers/image-spec v1.1.0/go.mod h1:W4s4sFTMaBeK1BQLXbG4AdM2szdn85PY75RI83NrTrM=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ=
github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ=
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA=
go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A=
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0 h1:RbKq8BG0FI8OiXhBfcRtqqHcZcka+gU3cskNuf05R18=
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0/go.mod h1:h06DGIukJOevXaj/xrNjhi/2098RZzcLTbc0jDAUbsg=
go.opentelemetry.io/otel v1.38.0 h1:RkfdswUDRimDg0m2Az18RKOsnI8UDzppJAtj01/Ymk8=
go.opentelemetry.io/otel v1.38.0/go.mod h1:zcmtmQ1+YmQM9wrNsTGV/q/uyusom3P8RxwExxkZhjM=
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0 h1:GqRJVj7UmLjCVyVJ3ZFLdPRmhDUp2zFmQe3RHIOsw24=
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0/go.mod h1:ri3aaHSmCTVYu2AWv44YMauwAQc0aqI9gHKIcSbI1pU=
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.38.0 h1:aTL7F04bJHUlztTsNGJ2l+6he8c+y/b//eR0jjjemT4=
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.38.0/go.mod h1:kldtb7jDTeol0l3ewcmd8SDvx3EmIE7lyvqbasU3QC4=
go.opentelemetry.io/otel/metric v1.38.0 h1:Kl6lzIYGAh5M159u9NgiRkmoMKjvbsKtYRwgfrA6WpA=
go.opentelemetry.io/otel/metric v1.38.0/go.mod h1:kB5n/QoRM8YwmUahxvI3bO34eVtQf2i4utNVLr9gEmI=
go.opentelemetry.io/otel/sdk v1.38.0 h1:l48sr5YbNf2hpCUj/FoGhW9yDkl+Ma+LrVl8qaM5b+E=
go.opentelemetry.io/otel/sdk v1.38.0/go.mod h1:ghmNdGlVemJI3+ZB5iDEuk4bWA3GkTpW+DOoZMYBVVg=
go.opentelemetry.io/otel/sdk/metric v1.38.0 h1:aSH66iL0aZqo//xXzQLYozmWrXxyFkBJ6qT5wthqPoM=
go.opentelemetry.io/otel/sdk/metric v1.38.0/go.mod h1:dg9PBnW9XdQ1Hd6ZnRz689CbtrUp0wMMs9iPcgT9EZA=
go.opentelemetry.io/otel/trace v1.38.0 h1:Fxk5bKrDZJUH+AMyyIXGcFAPah0oRcT+LuNtJrmcNLE=
go.opentelemetry.io/otel/trace v1.38.0/go.mod h1:j1P9ivuFsTceSWe1oY+EeW3sc+Pp42sO++GHkg4wwhs=
go.opentelemetry.io/proto/otlp v1.7.1 h1:gTOMpGDb0WTBOP8JaO72iL3auEZhVmAQg4ipjOVAtj4=
go.opentelemetry.io/proto/otlp v1.7.1/go.mod h1:b2rVh6rfI/s2pHWNlB7ILJcRALpcNDzKhACevjI+ZnE=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.8.0 h1:LUYupSeNrTNCGzR/hVBk2NHZO4hXcVaW1k4Qx7rjPx8=
golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE=
golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.1.0 h1:wsuoTGHzEhffawBOhz5CYhcrV4IdKZbEyZjBMuTp12o=
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI=
golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.28.0 h1:rhazDwis8INMIwQ4tpjLDzUhx6RlXqZNPEM0huQojng=
golang.org/x/text v0.28.0/go.mod h1:U8nCwOR8jO/marOQ0QbDiOngZVEBB7MAiitBuMjXiNU=
golang.org/x/time v0.5.0 h1:o7cqy6amK/52YcAKIPlM3a+Fpj35zvRj2TP+e1xFSfk=
golang.org/x/time v0.5.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
golang.org/x/tools v0.6.0 h1:BOw41kyTf3PuCW1pVQf8+Cyg8pMlkYB1oo9iJ6D/lKM=
golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
google.golang.org/genproto/googleapis/api v0.0.0-20250825161204-c5933d9347a5 h1:BIRfGDEjiHRrk0QKZe3Xv2ieMhtgRGeLcZQ0mIVn4EY=
google.golang.org/genproto/googleapis/api v0.0.0-20250825161204-c5933d9347a5/go.mod h1:j3QtIyytwqGr1JUDtYXwtMXWPKsEa5LtzIFN1Wn5WvE=
google.golang.org/genproto/googleapis/rpc v0.0.0-20250825161204-c5933d9347a5 h1:eaY8u2EuxbRv7c3NiGK0/NedzVsCcV6hDuU5qPX5EGE=
google.golang.org/genproto/googleapis/rpc v0.0.0-20250825161204-c5933d9347a5/go.mod h1:M4/wBTSeyLxupu3W3tJtOgB14jILAS/XWPSSa3TAlJc=
google.golang.org/grpc v1.75.0 h1:+TW+dqTd2Biwe6KKfhE5JpiYIBWq865PhKGSXiivqt4=
google.golang.org/grpc v1.75.0/go.mod h1:JtPAzKiq4v1xcAB2hydNlWI2RnF85XXcV0mhKXr2ecQ=
google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc=
google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gotest.tools/v3 v3.5.1 h1:EENdUnS3pdur5nybKYIh2Vfgc8IUNBjxDPSjtiJcOzU=
gotest.tools/v3 v3.5.1/go.mod h1:isy3WKz7GK6uNw/sbHzfKBLvlvXwUyV06n6brMxxopU=
569
dockmon/stats-service/main.go
Normal file
@@ -0,0 +1,569 @@
package main

import (
	"context"
	"crypto/rand"
	"crypto/subtle"
	"encoding/hex"
	"encoding/json"
	"log"
	"net/http"
	"os"
	"os/signal"
	"strconv"
	"strings"
	"syscall"
	"time"

	"github.com/gorilla/websocket"
)

// Configuration with environment variable support
var config = struct {
	TokenFilePath       string
	Port                string
	AggregationInterval time.Duration
	EventCacheSize      int
	CleanupInterval     time.Duration
	MaxRequestBodySize  int64
	AllowedOrigins      string
}{
	TokenFilePath:       getEnv("TOKEN_FILE_PATH", "/tmp/stats-service-token"),
	Port:                getEnv("STATS_SERVICE_PORT", "8081"),
	AggregationInterval: getEnvDuration("AGGREGATION_INTERVAL", "1s"),
	EventCacheSize:      getEnvInt("EVENT_CACHE_SIZE", 100),
	CleanupInterval:     getEnvDuration("CLEANUP_INTERVAL", "60s"),
	MaxRequestBodySize:  getEnvInt64("MAX_REQUEST_BODY_SIZE", 1048576), // 1MB default
	AllowedOrigins:      getEnv("ALLOWED_ORIGINS", "http://localhost:8080,http://127.0.0.1:8080,http://localhost,http://127.0.0.1"),
}

// getEnv gets an environment variable with fallback
func getEnv(key, fallback string) string {
	if value := os.Getenv(key); value != "" {
		return value
	}
	return fallback
}

// getEnvInt gets an integer environment variable with fallback
func getEnvInt(key string, fallback int) int {
	if value := os.Getenv(key); value != "" {
		if intVal, err := strconv.Atoi(value); err == nil {
			return intVal
		}
	}
	return fallback
}

// getEnvInt64 gets an int64 environment variable with fallback
func getEnvInt64(key string, fallback int64) int64 {
	if value := os.Getenv(key); value != "" {
		if intVal, err := strconv.ParseInt(value, 10, 64); err == nil {
			return intVal
		}
	}
	return fallback
}

// getEnvDuration gets a duration environment variable with fallback
func getEnvDuration(key string, fallback string) time.Duration {
	value := getEnv(key, fallback)
	if duration, err := time.ParseDuration(value); err == nil {
		return duration
	}
	// Fallback parsing
	if duration, err := time.ParseDuration(fallback); err == nil {
		return duration
	}
	return 1 * time.Second // Ultimate fallback
}

// generateToken creates a cryptographically secure random token
func generateToken() (string, error) {
	bytes := make([]byte, 32) // 256-bit token
	if _, err := rand.Read(bytes); err != nil {
		return "", err
	}
	return hex.EncodeToString(bytes), nil
}

// limitRequestBody limits the size of request bodies
func limitRequestBody(next http.HandlerFunc) http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		r.Body = http.MaxBytesReader(w, r.Body, config.MaxRequestBodySize)
		next(w, r)
	}
}

// jsonResponse writes a JSON response and handles encoding errors
func jsonResponse(w http.ResponseWriter, data interface{}) {
	w.Header().Set("Content-Type", "application/json")
	if err := json.NewEncoder(w).Encode(data); err != nil {
		log.Printf("Error encoding JSON response: %v", err)
		// Can't send an error status if the response was already partially sent
	}
}

// authMiddleware validates the Bearer token using constant-time comparison
func authMiddleware(token string, next http.HandlerFunc) http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		// Get Authorization header
		authHeader := r.Header.Get("Authorization")
		expectedAuth := "Bearer " + token

		// Use constant-time comparison to prevent timing attacks.
		// Check length first (still constant-time for the comparison itself).
		if len(authHeader) != len(expectedAuth) {
			http.Error(w, "Unauthorized", http.StatusUnauthorized)
			log.Printf("Unauthorized request from %s to %s (length mismatch)", r.RemoteAddr, r.URL.Path)
			return
		}

		// Constant-time comparison of the full auth header
		if subtle.ConstantTimeCompare([]byte(authHeader), []byte(expectedAuth)) != 1 {
			http.Error(w, "Unauthorized", http.StatusUnauthorized)
			log.Printf("Unauthorized request from %s to %s", r.RemoteAddr, r.URL.Path)
			return
		}

		next(w, r)
	}
}
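Editor's note: a hypothetical caller sketch (not part of this commit) showing how a client such as the Python backend would authenticate: read the shared token file and present it as a Bearer credential. The path and port are the defaults from the config block above; "os", "net/http", and "strings" are assumed imported.

func exampleAuthorizedRequest() (*http.Response, error) {
	tokenBytes, err := os.ReadFile("/tmp/stats-service-token")
	if err != nil {
		return nil, err
	}
	req, err := http.NewRequest(http.MethodGet, "http://localhost:8081/api/stats/hosts", nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("Authorization", "Bearer "+strings.TrimSpace(string(tokenBytes)))
	return http.DefaultClient.Do(req)
}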
|
||||
|
||||
func main() {
|
||||
log.Println("Starting DockMon Stats Service...")
|
||||
|
||||
// Generate random token
|
||||
token, err := generateToken()
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to generate token: %v", err)
|
||||
}
|
||||
|
||||
// Write token to file for Python backend
|
||||
if err := os.WriteFile(config.TokenFilePath, []byte(token), 0600); err != nil {
|
||||
log.Fatalf("Failed to write token file: %v", err)
|
||||
}
|
||||
log.Printf("Generated temporary auth token for stats service")
|
||||
log.Printf("Configuration: port=%s, aggregation=%v, cache_size=%d, cleanup=%v",
|
||||
config.Port, config.AggregationInterval, config.EventCacheSize, config.CleanupInterval)
|
||||
|
||||
// Create stats cache
|
||||
cache := NewStatsCache()
|
||||
|
||||
// Create stream manager
|
||||
streamManager := NewStreamManager(cache)
|
||||
|
||||
// Create aggregator with configured interval
|
||||
aggregator := NewAggregator(cache, streamManager, config.AggregationInterval)
|
||||
|
||||
// Create event management components with configured cache size
|
||||
eventCache := NewEventCache(config.EventCacheSize)
|
||||
eventBroadcaster := NewEventBroadcaster()
|
||||
eventManager := NewEventManager(eventBroadcaster, eventCache)
|
||||
|
||||
// Create context for graceful shutdown
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
|
||||
// Start aggregator
|
||||
go aggregator.Start(ctx)
|
||||
|
||||
// Start cleanup routine (remove stale stats every 60 seconds)
|
||||
go func() {
|
||||
ticker := time.NewTicker(60 * time.Second)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
cache.CleanStaleStats(60 * time.Second)
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
// Create HTTP server
|
||||
mux := http.NewServeMux()
|
||||
|
||||
// Health check endpoint
|
||||
mux.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) {
|
||||
_, totalEvents := eventCache.GetStats()
|
||||
jsonResponse(w, map[string]interface{}{
|
||||
"status": "ok",
|
||||
"service": "dockmon-stats",
|
||||
"stats_streams": streamManager.GetStreamCount(),
|
||||
"event_hosts": eventManager.GetActiveHosts(),
|
||||
"event_connections": eventBroadcaster.GetConnectionCount(),
|
||||
"cached_events": totalEvents,
|
||||
})
|
||||
})
|
||||
|
||||
// Get all host stats (main endpoint for Python backend) - PROTECTED
|
||||
mux.HandleFunc("/api/stats/hosts", authMiddleware(token, func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
hostStats := cache.GetAllHostStats()
|
||||
json.NewEncoder(w).Encode(hostStats)
|
||||
}))
|
||||
|
||||
// Get stats for a specific host - PROTECTED
|
||||
mux.HandleFunc("/api/stats/host/", authMiddleware(token, func(w http.ResponseWriter, r *http.Request) {
|
||||
hostID := r.URL.Path[len("/api/stats/host/"):]
|
||||
if hostID == "" {
|
||||
http.Error(w, "host_id required", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
stats, ok := cache.GetHostStats(hostID)
|
||||
if !ok {
|
||||
http.NotFound(w, r)
|
||||
return
|
||||
}
|
||||
|
||||
jsonResponse(w,stats)
|
||||
}))
|
||||
|
||||
// Get all container stats (for debugging) - PROTECTED
|
||||
mux.HandleFunc("/api/stats/containers", authMiddleware(token, func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
containerStats := cache.GetAllContainerStats()
|
||||
json.NewEncoder(w).Encode(containerStats)
|
||||
}))
|
||||
|
||||
// Start stream for a container (called by Python backend) - PROTECTED
|
||||
mux.HandleFunc("/api/streams/start", authMiddleware(token, limitRequestBody(func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.Method != http.MethodPost {
|
||||
http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
|
||||
return
|
||||
}
|
||||
|
||||
var req struct {
|
||||
ContainerID string `json:"container_id"`
|
||||
ContainerName string `json:"container_name"`
|
||||
HostID string `json:"host_id"`
|
||||
}
|
||||
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
http.Error(w, err.Error(), http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
// Validate required fields
|
||||
if req.ContainerID == "" || req.HostID == "" {
|
||||
http.Error(w, "container_id and host_id are required", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
if err := streamManager.StartStream(ctx, req.ContainerID, req.ContainerName, req.HostID); err != nil {
|
||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
w.WriteHeader(http.StatusOK)
|
||||
json.NewEncoder(w).Encode(map[string]string{"status": "started"})
|
||||
})))
|
||||
|
||||
// Stop stream for a container - PROTECTED
|
||||
mux.HandleFunc("/api/streams/stop", authMiddleware(token, limitRequestBody(func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.Method != http.MethodPost {
|
||||
http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
|
||||
return
|
||||
}
|
||||
|
||||
var req struct {
|
||||
ContainerID string `json:"container_id"`
|
||||
HostID string `json:"host_id"`
|
||||
}
|
||||
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
http.Error(w, err.Error(), http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
// Validate required fields
|
||||
if req.ContainerID == "" || req.HostID == "" {
|
||||
http.Error(w, "container_id and host_id are required", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
streamManager.StopStream(req.ContainerID, req.HostID)
|
||||
|
||||
w.WriteHeader(http.StatusOK)
|
||||
json.NewEncoder(w).Encode(map[string]string{"status": "stopped"})
|
||||
})))
|
||||
|
||||
// Add Docker host - PROTECTED
|
||||
mux.HandleFunc("/api/hosts/add", authMiddleware(token, func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.Method != http.MethodPost {
|
||||
http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
|
||||
return
|
||||
}
|
||||
|
||||
var req struct {
|
||||
HostID string `json:"host_id"`
|
||||
HostAddress string `json:"host_address"`
|
||||
TLSCACert string `json:"tls_ca_cert,omitempty"`
|
||||
TLSCert string `json:"tls_cert,omitempty"`
|
||||
TLSKey string `json:"tls_key,omitempty"`
|
||||
}
|
||||
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
http.Error(w, err.Error(), http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
// Validate required fields
|
||||
if req.HostID == "" || req.HostAddress == "" {
|
||||
http.Error(w, "host_id and host_address are required", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
if err := streamManager.AddDockerHost(req.HostID, req.HostAddress, req.TLSCACert, req.TLSCert, req.TLSKey); err != nil {
|
||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
w.WriteHeader(http.StatusOK)
|
||||
json.NewEncoder(w).Encode(map[string]string{"status": "added"})
|
||||
}))

// Remove Docker host - PROTECTED
mux.HandleFunc("/api/hosts/remove", authMiddleware(token, func(w http.ResponseWriter, r *http.Request) {
if r.Method != http.MethodPost {
http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
return
}

var req struct {
HostID string `json:"host_id"`
}

if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
http.Error(w, err.Error(), http.StatusBadRequest)
return
}

// Validate required fields
if req.HostID == "" {
http.Error(w, "host_id is required", http.StatusBadRequest)
return
}

streamManager.RemoveDockerHost(req.HostID)

w.WriteHeader(http.StatusOK)
json.NewEncoder(w).Encode(map[string]string{"status": "removed"})
}))

// Debug endpoint - PROTECTED
mux.HandleFunc("/debug/stats", authMiddleware(token, func(w http.ResponseWriter, r *http.Request) {
containerCount, hostCount := cache.GetStats()
jsonResponse(w, map[string]interface{}{
"streams": streamManager.GetStreamCount(),
"containers": containerCount,
"hosts": hostCount,
})
}))

// === Event Monitoring Endpoints ===

// Start monitoring events for a host - PROTECTED
mux.HandleFunc("/api/events/hosts/add", authMiddleware(token, func(w http.ResponseWriter, r *http.Request) {
if r.Method != http.MethodPost {
http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
return
}

var req struct {
HostID string `json:"host_id"`
HostAddress string `json:"host_address"`
TLSCACert string `json:"tls_ca_cert,omitempty"`
TLSCert string `json:"tls_cert,omitempty"`
TLSKey string `json:"tls_key,omitempty"`
}

if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
http.Error(w, err.Error(), http.StatusBadRequest)
return
}

// Validate required fields
if req.HostID == "" || req.HostAddress == "" {
http.Error(w, "host_id and host_address are required", http.StatusBadRequest)
return
}

if err := eventManager.AddHost(req.HostID, req.HostAddress, req.TLSCACert, req.TLSCert, req.TLSKey); err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}

w.WriteHeader(http.StatusOK)
json.NewEncoder(w).Encode(map[string]string{"status": "started"})
}))

// Stop monitoring events for a host - PROTECTED
mux.HandleFunc("/api/events/hosts/remove", authMiddleware(token, func(w http.ResponseWriter, r *http.Request) {
if r.Method != http.MethodPost {
http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
return
}

var req struct {
HostID string `json:"host_id"`
}

if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
http.Error(w, err.Error(), http.StatusBadRequest)
return
}

// Validate required fields
if req.HostID == "" {
http.Error(w, "host_id is required", http.StatusBadRequest)
return
}

eventManager.RemoveHost(req.HostID)

w.WriteHeader(http.StatusOK)
json.NewEncoder(w).Encode(map[string]string{"status": "stopped"})
}))

// Get recent events - PROTECTED
mux.HandleFunc("/api/events/recent", authMiddleware(token, func(w http.ResponseWriter, r *http.Request) {
hostID := r.URL.Query().Get("host_id")

var events interface{}
if hostID != "" {
// Get events for specific host
events = eventCache.GetRecentEvents(hostID, 50)
} else {
// Get events for all hosts
events = eventCache.GetAllRecentEvents(50)
}

jsonResponse(w, events)
}))
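
// Editorial note (not part of the original source): GET /api/events/recent?host_id=host-1
// with the Authorization header returns up to the 50 most recent events for that
// host; omitting host_id returns recent events across all monitored hosts.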

// WebSocket endpoint for event streaming - PROTECTED
mux.HandleFunc("/ws/events", func(w http.ResponseWriter, r *http.Request) {
// Validate token from query parameter or header
tokenParam := r.URL.Query().Get("token")
authHeader := r.Header.Get("Authorization")

validToken := false
if tokenParam == token {
validToken = true
} else if authHeader == "Bearer "+token {
validToken = true
}

if !validToken {
http.Error(w, "Unauthorized", http.StatusUnauthorized)
log.Printf("Unauthorized WebSocket connection attempt from %s", r.RemoteAddr)
return
}

// Upgrade to WebSocket
upgrader := websocket.Upgrader{
CheckOrigin: func(r *http.Request) bool {
origin := r.Header.Get("Origin")
if origin == "" {
return true // Allow same-origin requests
}
// Check against configured allowed origins
allowedOrigins := strings.Split(config.AllowedOrigins, ",")
for _, allowed := range allowedOrigins {
if strings.TrimSpace(allowed) == origin {
return true
}
}
return false
},
}

conn, err := upgrader.Upgrade(w, r, nil)
if err != nil {
log.Printf("WebSocket upgrade failed: %v", err)
return
}

// Register connection
if err := eventBroadcaster.AddConnection(conn); err != nil {
log.Printf("Failed to register connection: %v", err)
conn.WriteMessage(websocket.CloseMessage, websocket.FormatCloseMessage(websocket.ClosePolicyViolation, "Connection limit reached"))
conn.Close()
return
}

// Handle connection (read loop to detect disconnect)
go func() {
defer func() {
eventBroadcaster.RemoveConnection(conn)
conn.Close()
}()

for {
// Read messages (just to detect disconnect, we don't expect any)
_, _, err := conn.ReadMessage()
if err != nil {
break
}
}
}()
})
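
// Editorial example (not part of the original source; URL and token value are
// placeholders, using the gorilla/websocket package already imported for the
// upgrader): a client can authenticate with the token query parameter instead
// of the Authorization header:
//
//	conn, _, err := websocket.DefaultDialer.Dial("ws://localhost:8080/ws/events?token="+token, nil)
//	if err != nil {
//	    log.Fatalf("dial: %v", err)
//	}
//	defer conn.Close()
//	for {
//	    _, msg, err := conn.ReadMessage()
//	    if err != nil {
//	        break
//	    }
//	    log.Printf("event: %s", msg)
//	}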

// Create server with configured port
srv := &http.Server{
Addr: ":" + config.Port,
Handler: mux,
ReadTimeout: 10 * time.Second,
WriteTimeout: 10 * time.Second,
IdleTimeout: 60 * time.Second,
}

// Start server in goroutine
go func() {
log.Printf("Stats service listening on %s", srv.Addr)
if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed {
log.Fatalf("Server error: %v", err)
}
}()

// Wait for interrupt signal
sigChan := make(chan os.Signal, 1)
signal.Notify(sigChan, os.Interrupt, syscall.SIGTERM)
<-sigChan

log.Println("Shutting down stats service...")

// Graceful shutdown
shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 10*time.Second)
defer shutdownCancel()

// Stop all stats streams
streamManager.StopAllStreams()

// Stop all event monitoring
eventManager.StopAll()

// Close all event WebSocket connections
eventBroadcaster.CloseAll()

// Stop HTTP server
if err := srv.Shutdown(shutdownCtx); err != nil {
log.Printf("Server shutdown error: %v", err)
}

// Cancel context to stop aggregator
cancel()

// Clean up token file
if err := os.Remove(config.TokenFilePath); err != nil {
log.Printf("Warning: Failed to remove token file: %v", err)
} else {
log.Println("Removed token file")
}

log.Println("Stats service stopped")
}
440
dockmon/stats-service/streamer.go
Normal file
@@ -0,0 +1,440 @@
package main

import (
"context"
"crypto/tls"
"crypto/x509"
"encoding/json"
"fmt"
"io"
"log"
"net"
"net/http"
"strings"
"sync"
"time"

"github.com/docker/docker/api/types"
"github.com/docker/docker/client"
)

// ContainerInfo holds basic container information
type ContainerInfo struct {
ID string
Name string
HostID string
}

// createTLSOption creates a Docker client TLS option from PEM-encoded certificates
func createTLSOption(caCertPEM, certPEM, keyPEM string) (client.Opt, error) {
// Parse CA certificate
caCertPool := x509.NewCertPool()
if !caCertPool.AppendCertsFromPEM([]byte(caCertPEM)) {
return nil, fmt.Errorf("failed to parse CA certificate")
}

// Parse client certificate and key
clientCert, err := tls.X509KeyPair([]byte(certPEM), []byte(keyPEM))
if err != nil {
return nil, fmt.Errorf("failed to parse client certificate/key: %v", err)
}

// Create TLS config
tlsConfig := &tls.Config{
Certificates: []tls.Certificate{clientCert},
RootCAs: caCertPool,
MinVersion: tls.VersionTLS12,
}

// Create HTTP client with TLS transport and timeouts
// Note: No overall Timeout set because Docker API streaming operations (stats, events)
// are long-running connections that should not be killed by a timeout
httpClient := &http.Client{
Transport: &http.Transport{
DialContext: (&net.Dialer{
Timeout: 30 * time.Second, // Connection establishment timeout
KeepAlive: 30 * time.Second, // TCP keepalive interval
}).DialContext,
TLSClientConfig: tlsConfig,
TLSHandshakeTimeout: 10 * time.Second,
IdleConnTimeout: 90 * time.Second,
ResponseHeaderTimeout: 10 * time.Second,
},
}

return client.WithHTTPClient(httpClient), nil
}
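
// Editorial example (a minimal sketch; the address and PEM variables are
// placeholders): the option returned above composes with other client options,
// mirroring how AddDockerHost assembles a remote TLS client:
//
//	tlsOpt, err := createTLSOption(caPEM, certPEM, keyPEM)
//	if err != nil {
//	    return err
//	}
//	cli, err := client.NewClientWithOpts(
//	    client.WithHost("tcp://10.0.0.5:2376"),
//	    client.WithAPIVersionNegotiation(),
//	    tlsOpt,
//	)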

// StreamManager manages persistent stats streams for all containers
type StreamManager struct {
cache *StatsCache
clients map[string]*client.Client // hostID -> Docker client
clientsMu sync.RWMutex
streams map[string]context.CancelFunc // composite key (hostID:containerID) -> cancel function
streamsMu sync.RWMutex
containers map[string]*ContainerInfo // composite key (hostID:containerID) -> info
containersMu sync.RWMutex
}

// NewStreamManager creates a new stream manager
func NewStreamManager(cache *StatsCache) *StreamManager {
return &StreamManager{
cache: cache,
clients: make(map[string]*client.Client),
streams: make(map[string]context.CancelFunc),
containers: make(map[string]*ContainerInfo),
}
}

// AddDockerHost adds a Docker host client
func (sm *StreamManager) AddDockerHost(hostID, hostAddress, tlsCACert, tlsCert, tlsKey string) error {
// Create Docker client for this host FIRST (before acquiring lock)
var cli *client.Client
var err error

if hostAddress == "" || hostAddress == "unix:///var/run/docker.sock" {
// Local Docker socket
cli, err = client.NewClientWithOpts(
client.FromEnv,
client.WithAPIVersionNegotiation(),
)
} else {
// Remote Docker host - check if TLS is needed
clientOpts := []client.Opt{
client.WithHost(hostAddress),
client.WithAPIVersionNegotiation(),
}

// If TLS certificates provided, configure TLS
if tlsCACert != "" && tlsCert != "" && tlsKey != "" {
tlsOpt, err := createTLSOption(tlsCACert, tlsCert, tlsKey)
if err != nil {
return fmt.Errorf("failed to create TLS config: %v", err)
}
clientOpts = append(clientOpts, tlsOpt)
}

cli, err = client.NewClientWithOpts(clientOpts...)
}

if err != nil {
return err
}

// Track whether client was successfully stored to prevent leak
clientStored := false
defer func() {
if !clientStored && cli != nil {
cli.Close()
log.Printf("Cleaned up unstored Docker client for host %s", truncateID(hostID, 8))
}
}()

// Now that new client is successfully created, acquire lock and swap
sm.clientsMu.Lock()
defer sm.clientsMu.Unlock()

// Close existing client if it exists (only after new one succeeds)
if existingClient, exists := sm.clients[hostID]; exists {
existingClient.Close()
log.Printf("Closed existing Docker client for host %s", truncateID(hostID, 8))
}

sm.clients[hostID] = cli
clientStored = true // Mark as successfully stored
log.Printf("Added Docker host: %s (%s)", truncateID(hostID, 8), hostAddress)

// Initialize host stats with zero values so the host appears immediately in the UI
sm.cache.UpdateHostStats(&HostStats{
HostID: hostID,
ContainerCount: 0,
})

return nil
}

// RemoveDockerHost removes a Docker host client and stops all its streams
func (sm *StreamManager) RemoveDockerHost(hostID string) {
// First, find all containers for this host
sm.containersMu.RLock()
containersToStop := make([]string, 0)
for compositeKey, info := range sm.containers {
if info.HostID == hostID {
containersToStop = append(containersToStop, compositeKey)
}
}
sm.containersMu.RUnlock()

// Stop all streams for containers on this host
// Do this BEFORE closing the client to avoid streams trying to use a closed client
for _, compositeKey := range containersToStop {
// Extract container ID from composite key (format: hostID:containerID)
parts := strings.SplitN(compositeKey, ":", 2)
if len(parts) == 2 {
sm.StopStream(parts[1], parts[0]) // containerID, hostID
}
}

// Now close and remove the Docker client
sm.clientsMu.Lock()
defer sm.clientsMu.Unlock()
if cli, exists := sm.clients[hostID]; exists {
cli.Close()
delete(sm.clients, hostID)
log.Printf("Removed Docker host: %s", truncateID(hostID, 8))
}

// Remove all stats for this host from cache
sm.cache.RemoveHostStats(hostID)
}

// StartStream starts a persistent stats stream for a container
func (sm *StreamManager) StartStream(ctx context.Context, containerID, containerName, hostID string) error {
// Create composite key to support containers with duplicate IDs on different hosts
compositeKey := fmt.Sprintf("%s:%s", hostID, containerID)

// Acquire locks in consistent order: clientsMu → streamsMu → containersMu (when needed)
sm.clientsMu.RLock()
sm.streamsMu.Lock()

// Check if stream already exists
if _, exists := sm.streams[compositeKey]; exists {
sm.streamsMu.Unlock()
sm.clientsMu.RUnlock()
return nil // Already streaming
}

// Check if client exists
_, clientExists := sm.clients[hostID]
if !clientExists {
sm.streamsMu.Unlock()
sm.clientsMu.RUnlock()
log.Printf("Warning: No Docker client for host %s", truncateID(hostID, 8))
return nil
}

// Create cancellable context for this stream
streamCtx, cancel := context.WithCancel(ctx)
sm.streams[compositeKey] = cancel

// Release locks before acquiring containersMu to prevent nested locking
sm.streamsMu.Unlock()
sm.clientsMu.RUnlock()

// Store container info with separate lock
sm.containersMu.Lock()
sm.containers[compositeKey] = &ContainerInfo{
ID: containerID,
Name: containerName,
HostID: hostID,
}
sm.containersMu.Unlock()

// Start streaming goroutine (no locks held)
go sm.streamStats(streamCtx, containerID, containerName, hostID)

log.Printf("Started stats stream for container %s (%s) on host %s", containerName, truncateID(containerID, 12), truncateID(hostID, 12))
return nil
}
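
// Editorial note (not part of the original source): the fixed clientsMu →
// streamsMu → containersMu ordering above is what prevents deadlock. If one
// goroutine took streamsMu then clientsMu while another took clientsMu then
// streamsMu, each could block forever waiting for the lock the other holds;
// acquiring in a single global order makes that cycle impossible.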

// StopStream stops the stats stream for a container
func (sm *StreamManager) StopStream(containerID, hostID string) {
// Create composite key to support containers with duplicate IDs on different hosts
compositeKey := fmt.Sprintf("%s:%s", hostID, containerID)

sm.streamsMu.Lock()
defer sm.streamsMu.Unlock()
cancel, exists := sm.streams[compositeKey]
if exists {
cancel()
delete(sm.streams, compositeKey)
}

sm.containersMu.Lock()
defer sm.containersMu.Unlock()
delete(sm.containers, compositeKey)

// Remove from cache
sm.cache.RemoveContainerStats(containerID, hostID)

log.Printf("Stopped stats stream for container %s", truncateID(containerID, 12))
}

// streamStats maintains a persistent stats stream for a single container
func (sm *StreamManager) streamStats(ctx context.Context, containerID, containerName, hostID string) {
defer func() {
if r := recover(); r != nil {
log.Printf("Recovered from panic in stats stream for %s: %v", truncateID(containerID, 12), r)
}
}()

// Retry loop - restart stream if it fails
backoff := time.Second
maxBackoff := 30 * time.Second

for {
select {
case <-ctx.Done():
return
default:
}

// Get current Docker client (may have changed if host was updated)
sm.clientsMu.RLock()
cli, ok := sm.clients[hostID]
sm.clientsMu.RUnlock() // Manual unlock needed - we're in a loop

if !ok {
log.Printf("No Docker client for host %s (container %s), retrying in %v", truncateID(hostID, 8), truncateID(containerID, 12), backoff)
time.Sleep(backoff)
backoff = min(backoff*2, maxBackoff)
continue
}

// Open stats stream
stats, err := cli.ContainerStats(ctx, containerID, true) // stream=true
if err != nil {
log.Printf("Error opening stats stream for %s: %v (retrying in %v)", truncateID(containerID, 12), err, backoff)
time.Sleep(backoff)
backoff = min(backoff*2, maxBackoff)
continue
}

// Reset backoff on successful connection
backoff = time.Second

// Read stats from stream
decoder := json.NewDecoder(stats.Body)

for {
select {
case <-ctx.Done():
stats.Body.Close()
return
default:
}

var stat types.StatsJSON
if err := decoder.Decode(&stat); err != nil {
stats.Body.Close()
if err == io.EOF || err == context.Canceled {
log.Printf("Stats stream ended for %s", truncateID(containerID, 12))
} else {
log.Printf("Error decoding stats for %s: %v", truncateID(containerID, 12), err)
}
break // Break inner loop, will retry in outer loop
}

// Calculate and cache stats
sm.processStats(&stat, containerID, containerName, hostID)
}

// Brief pause before reconnecting
time.Sleep(time.Second)
}
}
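
// Editorial note (not part of the original source): with a 1s initial backoff
// doubled up to the 30s cap, consecutive failed attempts wait 1s, 2s, 4s, 8s,
// 16s, then 30s thereafter; any successful connection resets the wait to 1s.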

// processStats calculates metrics from raw Docker stats
func (sm *StreamManager) processStats(stat *types.StatsJSON, containerID, containerName, hostID string) {
// Calculate CPU percentage
cpuPercent := calculateCPUPercent(stat)

// Memory stats
memUsage := stat.MemoryStats.Usage
memLimit := stat.MemoryStats.Limit
memPercent := 0.0
if memLimit > 0 {
memPercent = (float64(memUsage) / float64(memLimit)) * 100.0
}

// Network stats
var netRx, netTx uint64
for _, net := range stat.Networks {
netRx += net.RxBytes
netTx += net.TxBytes
}

// Disk I/O stats
var diskRead, diskWrite uint64
for _, bio := range stat.BlkioStats.IoServiceBytesRecursive {
if bio.Op == "Read" {
diskRead += bio.Value
} else if bio.Op == "Write" {
diskWrite += bio.Value
}
}

// Update cache
sm.cache.UpdateContainerStats(&ContainerStats{
ContainerID: containerID,
ContainerName: containerName,
HostID: hostID,
CPUPercent: roundToDecimal(cpuPercent, 1),
MemoryUsage: memUsage,
MemoryLimit: memLimit,
MemoryPercent: roundToDecimal(memPercent, 1),
NetworkRx: netRx,
NetworkTx: netTx,
DiskRead: diskRead,
DiskWrite: diskWrite,
})
}

// calculateCPUPercent calculates CPU percentage from Docker stats
func calculateCPUPercent(stat *types.StatsJSON) float64 {
// CPU calculation similar to `docker stats` command
cpuDelta := float64(stat.CPUStats.CPUUsage.TotalUsage) - float64(stat.PreCPUStats.CPUUsage.TotalUsage)
systemDelta := float64(stat.CPUStats.SystemUsage) - float64(stat.PreCPUStats.SystemUsage)

if systemDelta > 0.0 && cpuDelta > 0.0 {
numCPUs := float64(len(stat.CPUStats.CPUUsage.PercpuUsage))
if numCPUs == 0 {
numCPUs = 1.0
}
return (cpuDelta / systemDelta) * numCPUs * 100.0
}
return 0.0
}
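
// Editorial worked example (illustrative numbers): if the container consumed
// 200ms of CPU time between samples (cpuDelta = 2e8 ns), total system usage
// advanced by 1s (systemDelta = 1e9 ns), and PercpuUsage reports 4 CPUs, the
// function returns (2e8 / 1e9) * 4 * 100 = 80.0.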

// GetStreamCount returns the number of active streams
func (sm *StreamManager) GetStreamCount() int {
sm.streamsMu.RLock()
defer sm.streamsMu.RUnlock()
return len(sm.streams)
}

// HasHost checks if a Docker host is registered
func (sm *StreamManager) HasHost(hostID string) bool {
sm.clientsMu.RLock()
defer sm.clientsMu.RUnlock()
_, exists := sm.clients[hostID]
return exists
}

// StopAllStreams stops all active streams and closes all Docker clients
func (sm *StreamManager) StopAllStreams() {
// Stop all streams (the map is keyed by composite hostID:containerID keys)
sm.streamsMu.Lock()
for compositeKey, cancel := range sm.streams {
cancel()
log.Printf("Stopped stream for %s", truncateID(compositeKey, 12))
}
sm.streams = make(map[string]context.CancelFunc)
sm.streamsMu.Unlock()

// Close all Docker clients
sm.clientsMu.Lock()
for hostID, cli := range sm.clients {
cli.Close()
log.Printf("Closed Docker client for host %s", truncateID(hostID, 8))
}
sm.clients = make(map[string]*client.Client)
sm.clientsMu.Unlock()
}

func min(a, b time.Duration) time.Duration {
if a < b {
return a
}
return b
}
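
// Editorial note (assumption about the toolchain): Go 1.21 added a built-in
// generic min, which would make this helper unnecessary on newer toolchains;
// it is kept here as written for compatibility with older ones.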
9
dockmon/stats-service/utils.go
Normal file
@@ -0,0 +1,9 @@
package main

// truncateID truncates an ID string to the specified length
func truncateID(id string, length int) string {
if len(id) <= length {
return id
}
return id[:length]
}