Switched from Dockmon to Beszel

2025-10-31 17:13:00 +01:00
parent cc6454cef9
commit f4a4142799
75 changed files with 24313 additions and 122 deletions

View File

@@ -0,0 +1,150 @@
package main
import (
"context"
"log"
"math"
"time"
)
// Aggregator aggregates container stats into host-level metrics
type Aggregator struct {
cache *StatsCache
streamManager *StreamManager
aggregateInterval time.Duration
}
// NewAggregator creates a new aggregator
func NewAggregator(cache *StatsCache, streamManager *StreamManager, interval time.Duration) *Aggregator {
return &Aggregator{
cache: cache,
streamManager: streamManager,
aggregateInterval: interval,
}
}
// Start begins the aggregation loop
func (a *Aggregator) Start(ctx context.Context) {
ticker := time.NewTicker(a.aggregateInterval)
defer ticker.Stop()
log.Printf("Aggregator started (interval: %v)", a.aggregateInterval)
// Run once immediately
a.aggregate()
for {
select {
case <-ctx.Done():
log.Println("Aggregator stopped")
return
case <-ticker.C:
a.aggregate()
}
}
}
// aggregate calculates host-level stats from container stats
func (a *Aggregator) aggregate() {
containerStats := a.cache.GetAllContainerStats()
// Group containers by host
hostContainers := make(map[string][]*ContainerStats)
for _, stats := range containerStats {
hostContainers[stats.HostID] = append(hostContainers[stats.HostID], stats)
}
// Aggregate stats for each host that has a registered Docker client
for hostID, containers := range hostContainers {
// Only aggregate if the host still has a registered Docker client
// This prevents recreating stats for hosts that were just deleted
if a.streamManager.HasHost(hostID) {
hostStats := a.aggregateHostStats(hostID, containers)
a.cache.UpdateHostStats(hostStats)
}
}
}
// aggregateHostStats aggregates stats for a single host
func (a *Aggregator) aggregateHostStats(hostID string, containers []*ContainerStats) *HostStats {
if len(containers) == 0 {
return &HostStats{
HostID: hostID,
ContainerCount: 0,
}
}
var (
totalCPU float64
totalMemUsage uint64
totalMemLimit uint64
totalNetRx uint64
totalNetTx uint64
validContainers int
)
const maxUint64 = ^uint64(0)
// Only count containers updated in the last 30 seconds
cutoff := time.Now().Add(-30 * time.Second)
for _, stats := range containers {
if stats.LastUpdate.Before(cutoff) {
continue // Skip stale stats
}
totalCPU += stats.CPUPercent
totalMemUsage += stats.MemoryUsage
totalMemLimit += stats.MemoryLimit
// Check for overflow before adding network bytes
if maxUint64-totalNetRx < stats.NetworkRx {
log.Printf("Warning: Network RX overflow prevented for host %s", truncateID(hostID, 8))
totalNetRx = maxUint64 // Cap at max instead of wrapping
} else {
totalNetRx += stats.NetworkRx
}
if maxUint64-totalNetTx < stats.NetworkTx {
log.Printf("Warning: Network TX overflow prevented for host %s", truncateID(hostID, 8))
totalNetTx = maxUint64
} else {
totalNetTx += stats.NetworkTx
}
validContainers++
}
// Calculate totals and percentages
var cpuPercent, memPercent float64
// Host CPU is the sum of container CPU percentages; on multi-core hosts this can exceed 100
cpuPercent = totalCPU
if totalMemLimit > 0 {
memPercent = (float64(totalMemUsage) / float64(totalMemLimit)) * 100.0
}
// Round to 1 decimal place
cpuPercent = roundToDecimal(cpuPercent, 1)
memPercent = roundToDecimal(memPercent, 1)
return &HostStats{
HostID: hostID,
CPUPercent: cpuPercent,
MemoryPercent: memPercent,
MemoryUsedBytes: totalMemUsage,
MemoryLimitBytes: totalMemLimit,
NetworkRxBytes: totalNetRx,
NetworkTxBytes: totalNetTx,
ContainerCount: validContainers,
}
}
// roundToDecimal rounds a float to n decimal places
// (math.Round avoids the truncation bug of int(value*shift+0.5),
// which mis-rounds negative values and overflows for large inputs)
func roundToDecimal(value float64, places int) float64 {
shift := math.Pow(10, float64(places))
return math.Round(value*shift) / shift
}
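
A minimal sketch (not part of the commit) of the staleness rule in aggregateHostStats: samples older than 30 seconds are skipped, totals are summed over the rest, and percentages are rounded to one decimal. Passing nil for the StreamManager is safe here only because aggregateHostStats never touches it; fmt would need to be imported.

    func ExampleAggregateHostStats() {
        a := NewAggregator(NewStatsCache(), nil, time.Second)
        now := time.Now()
        containers := []*ContainerStats{
            {HostID: "h1", CPUPercent: 12.34, MemoryUsage: 512, MemoryLimit: 1024, LastUpdate: now},
            {HostID: "h1", CPUPercent: 99.9, LastUpdate: now.Add(-2 * time.Minute)}, // stale: skipped
        }
        hs := a.aggregateHostStats("h1", containers)
        fmt.Printf("cpu=%.1f mem=%.1f%% containers=%d\n", hs.CPUPercent, hs.MemoryPercent, hs.ContainerCount)
        // cpu=12.3 mem=50.0% containers=1
    }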

View File

@@ -0,0 +1,172 @@
package main
import (
"sync"
"time"
)
// ContainerStats holds real-time stats for a single container
type ContainerStats struct {
ContainerID string `json:"container_id"`
ContainerName string `json:"container_name"`
HostID string `json:"host_id"`
CPUPercent float64 `json:"cpu_percent"`
MemoryUsage uint64 `json:"memory_usage"`
MemoryLimit uint64 `json:"memory_limit"`
MemoryPercent float64 `json:"memory_percent"`
NetworkRx uint64 `json:"network_rx"`
NetworkTx uint64 `json:"network_tx"`
DiskRead uint64 `json:"disk_read"`
DiskWrite uint64 `json:"disk_write"`
LastUpdate time.Time `json:"last_update"`
}
// HostStats holds aggregated stats for a host
type HostStats struct {
HostID string `json:"host_id"`
CPUPercent float64 `json:"cpu_percent"`
MemoryPercent float64 `json:"memory_percent"`
MemoryUsedBytes uint64 `json:"memory_used_bytes"`
MemoryLimitBytes uint64 `json:"memory_limit_bytes"`
NetworkRxBytes uint64 `json:"network_rx_bytes"`
NetworkTxBytes uint64 `json:"network_tx_bytes"`
ContainerCount int `json:"container_count"`
LastUpdate time.Time `json:"last_update"`
}
// StatsCache is a thread-safe cache for container and host stats
type StatsCache struct {
mu sync.RWMutex
containerStats map[string]*ContainerStats // key: composite key (hostID:containerID)
hostStats map[string]*HostStats // key: hostID
}
// NewStatsCache creates a new stats cache
func NewStatsCache() *StatsCache {
return &StatsCache{
containerStats: make(map[string]*ContainerStats),
hostStats: make(map[string]*HostStats),
}
}
// UpdateContainerStats updates stats for a container
func (c *StatsCache) UpdateContainerStats(stats *ContainerStats) {
c.mu.Lock()
defer c.mu.Unlock()
stats.LastUpdate = time.Now()
// Use composite key to support containers with duplicate IDs on different hosts
compositeKey := stats.HostID + ":" + stats.ContainerID
c.containerStats[compositeKey] = stats
}
// GetContainerStats retrieves stats for a specific container
func (c *StatsCache) GetContainerStats(containerID, hostID string) (*ContainerStats, bool) {
c.mu.RLock()
defer c.mu.RUnlock()
compositeKey := hostID + ":" + containerID
stats, ok := c.containerStats[compositeKey]
return stats, ok
}
// GetAllContainerStats returns all container stats
func (c *StatsCache) GetAllContainerStats() map[string]*ContainerStats {
c.mu.RLock()
defer c.mu.RUnlock()
// Return a copy to avoid race conditions
result := make(map[string]*ContainerStats, len(c.containerStats))
for k, v := range c.containerStats {
statsCopy := *v
result[k] = &statsCopy
}
return result
}
// RemoveContainerStats removes stats for a container (when it stops)
func (c *StatsCache) RemoveContainerStats(containerID, hostID string) {
c.mu.Lock()
defer c.mu.Unlock()
compositeKey := hostID + ":" + containerID
delete(c.containerStats, compositeKey)
}
// UpdateHostStats updates aggregated stats for a host
func (c *StatsCache) UpdateHostStats(stats *HostStats) {
c.mu.Lock()
defer c.mu.Unlock()
stats.LastUpdate = time.Now()
c.hostStats[stats.HostID] = stats
}
// GetHostStats retrieves stats for a specific host
func (c *StatsCache) GetHostStats(hostID string) (*HostStats, bool) {
c.mu.RLock()
defer c.mu.RUnlock()
stats, ok := c.hostStats[hostID]
return stats, ok
}
// GetAllHostStats returns all host stats
func (c *StatsCache) GetAllHostStats() map[string]*HostStats {
c.mu.RLock()
defer c.mu.RUnlock()
// Return a copy to avoid race conditions
result := make(map[string]*HostStats, len(c.hostStats))
for k, v := range c.hostStats {
statsCopy := *v
result[k] = &statsCopy
}
return result
}
// RemoveHostStats removes all stats for a specific host
func (c *StatsCache) RemoveHostStats(hostID string) {
c.mu.Lock()
defer c.mu.Unlock()
// Remove host stats
delete(c.hostStats, hostID)
// Remove all container stats for this host
for id, stats := range c.containerStats {
if stats.HostID == hostID {
delete(c.containerStats, id)
}
}
}
// CleanStaleStats removes stats older than maxAge
func (c *StatsCache) CleanStaleStats(maxAge time.Duration) {
c.mu.Lock()
defer c.mu.Unlock()
now := time.Now()
// Clean container stats
for id, stats := range c.containerStats {
if now.Sub(stats.LastUpdate) > maxAge {
delete(c.containerStats, id)
}
}
// Clean host stats
for id, stats := range c.hostStats {
if now.Sub(stats.LastUpdate) > maxAge {
delete(c.hostStats, id)
}
}
}
// GetStats returns a summary of cache state
func (c *StatsCache) GetStats() (containerCount, hostCount int) {
c.mu.RLock()
defer c.mu.RUnlock()
return len(c.containerStats), len(c.hostStats)
}
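
A hedged usage sketch (illustrative only, log import assumed): the hostID:containerID composite key keeps identical container IDs on different hosts independent, and GetAllContainerStats hands back copies so readers cannot race with writers.

    cache := NewStatsCache()
    cache.UpdateContainerStats(&ContainerStats{ContainerID: "abc123", HostID: "host-a", CPUPercent: 5})
    cache.UpdateContainerStats(&ContainerStats{ContainerID: "abc123", HostID: "host-b", CPUPercent: 9})
    if s, ok := cache.GetContainerStats("abc123", "host-a"); ok {
        log.Printf("host-a: %.1f%% CPU", s.CPUPercent) // 5.0; host-b's entry is separate
    }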

View File

@@ -0,0 +1,126 @@
package main
import (
"encoding/json"
"log"
"sync"
"github.com/gorilla/websocket"
)
// EventBroadcaster manages WebSocket connections and broadcasts events
type EventBroadcaster struct {
mu sync.RWMutex
connections map[*websocket.Conn]*sync.Mutex // Each connection has its own write mutex
maxConnections int
}
// NewEventBroadcaster creates a new event broadcaster
func NewEventBroadcaster() *EventBroadcaster {
return &EventBroadcaster{
connections: make(map[*websocket.Conn]*sync.Mutex),
maxConnections: 100, // Limit to 100 concurrent WebSocket connections
}
}
// AddConnection registers a new WebSocket connection
func (eb *EventBroadcaster) AddConnection(conn *websocket.Conn) error {
eb.mu.Lock()
defer eb.mu.Unlock()
// Check connection limit
if len(eb.connections) >= eb.maxConnections {
log.Printf("WebSocket connection limit reached (%d), rejecting new connection", eb.maxConnections)
return &websocket.CloseError{Code: websocket.ClosePolicyViolation, Text: "Connection limit reached"}
}
eb.connections[conn] = &sync.Mutex{} // Create a dedicated mutex for this connection
log.Printf("WebSocket connected to events. Total connections: %d", len(eb.connections))
return nil
}
// RemoveConnection unregisters a WebSocket connection
func (eb *EventBroadcaster) RemoveConnection(conn *websocket.Conn) {
eb.mu.Lock()
defer eb.mu.Unlock()
delete(eb.connections, conn)
log.Printf("WebSocket disconnected from events. Total connections: %d", len(eb.connections))
}
// Broadcast sends an event to all connected WebSocket clients
func (eb *EventBroadcaster) Broadcast(event DockerEvent) {
// Marshal event to JSON
data, err := json.Marshal(event)
if err != nil {
log.Printf("Error marshaling event: %v", err)
return
}
// Track dead connections
var deadConnections []*websocket.Conn
// Get snapshot of connections with their mutexes
eb.mu.RLock()
connMutexes := make(map[*websocket.Conn]*sync.Mutex, len(eb.connections))
for conn, mu := range eb.connections {
connMutexes[conn] = mu
}
eb.mu.RUnlock()
// Send to all connections (with per-connection write lock)
for conn, mu := range connMutexes {
mu.Lock()
err := conn.WriteMessage(websocket.TextMessage, data)
mu.Unlock()
if err != nil {
log.Printf("Error sending event to WebSocket: %v", err)
deadConnections = append(deadConnections, conn)
}
}
// Clean up dead connections
if len(deadConnections) > 0 {
// Remove from map first (fast, under lock)
eb.mu.Lock()
var connectionsToClose []*websocket.Conn
for _, conn := range deadConnections {
// Only delete if connection still exists in map
if _, exists := eb.connections[conn]; exists {
delete(eb.connections, conn)
connectionsToClose = append(connectionsToClose, conn)
}
}
eb.mu.Unlock()
// Close connections outside lock (slow, can block)
for _, conn := range connectionsToClose {
conn.Close()
}
}
}
// GetConnectionCount returns the number of active WebSocket connections
func (eb *EventBroadcaster) GetConnectionCount() int {
eb.mu.RLock()
defer eb.mu.RUnlock()
return len(eb.connections)
}
// CloseAll closes all WebSocket connections
func (eb *EventBroadcaster) CloseAll() {
eb.mu.Lock()
var connectionsToClose []*websocket.Conn
for conn := range eb.connections {
connectionsToClose = append(connectionsToClose, conn)
}
eb.connections = make(map[*websocket.Conn]*sync.Mutex)
eb.mu.Unlock()
// Close connections outside lock (can block on network I/O)
for _, conn := range connectionsToClose {
conn.Close()
}
log.Println("Closed all event WebSocket connections")
}
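
gorilla/websocket permits at most one concurrent writer per connection, which is why each connection carries its own write mutex instead of the broadcaster holding one global lock during network writes. A hedged sketch of concurrent use (conn obtained elsewhere from an Upgrader):

    eb := NewEventBroadcaster()
    if err := eb.AddConnection(conn); err == nil {
        // Safe from any number of goroutines: Broadcast takes the per-connection
        // write mutex before calling WriteMessage, so frames never interleave.
        go eb.Broadcast(DockerEvent{Action: "start", HostID: "h1"})
        go eb.Broadcast(DockerEvent{Action: "die", HostID: "h1"})
    }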

View File

@@ -0,0 +1,106 @@
package main
import (
"sync"
)
// EventCache stores the most recent events for each host, dropping the oldest once maxSize is exceeded
type EventCache struct {
mu sync.RWMutex
events map[string][]DockerEvent // key: hostID, value: most recent events (trimmed to maxSize)
maxSize int // maximum events to keep per host
}
// NewEventCache creates a new event cache
func NewEventCache(maxSize int) *EventCache {
return &EventCache{
events: make(map[string][]DockerEvent),
maxSize: maxSize,
}
}
// AddEvent adds an event to the cache for a specific host
func (ec *EventCache) AddEvent(hostID string, event DockerEvent) {
ec.mu.Lock()
defer ec.mu.Unlock()
// Initialize slice if needed
if _, exists := ec.events[hostID]; !exists {
ec.events[hostID] = make([]DockerEvent, 0, ec.maxSize)
}
// Add event
ec.events[hostID] = append(ec.events[hostID], event)
// Trim if over max size (keep most recent)
if len(ec.events[hostID]) > ec.maxSize {
ec.events[hostID] = ec.events[hostID][len(ec.events[hostID])-ec.maxSize:]
}
}
// GetRecentEvents returns recent events for a specific host
func (ec *EventCache) GetRecentEvents(hostID string, limit int) []DockerEvent {
ec.mu.RLock()
defer ec.mu.RUnlock()
events, exists := ec.events[hostID]
if !exists || len(events) == 0 {
return []DockerEvent{}
}
// Return last N events
if limit <= 0 || limit > len(events) {
limit = len(events)
}
// Return copy to avoid race conditions
result := make([]DockerEvent, limit)
copy(result, events[len(events)-limit:])
return result
}
// GetAllRecentEvents returns recent events for all hosts
func (ec *EventCache) GetAllRecentEvents(limit int) map[string][]DockerEvent {
ec.mu.RLock()
defer ec.mu.RUnlock()
result := make(map[string][]DockerEvent)
for hostID, events := range ec.events {
if len(events) == 0 {
continue
}
// Get last N events
count := limit
if count <= 0 || count > len(events) {
count = len(events)
}
// Copy to avoid race conditions
hostEvents := make([]DockerEvent, count)
copy(hostEvents, events[len(events)-count:])
result[hostID] = hostEvents
}
return result
}
// ClearHost removes all cached events for a specific host
func (ec *EventCache) ClearHost(hostID string) {
ec.mu.Lock()
defer ec.mu.Unlock()
delete(ec.events, hostID)
}
// GetStats returns cache statistics
func (ec *EventCache) GetStats() (hostCount int, totalEvents int) {
ec.mu.RLock()
defer ec.mu.RUnlock()
hostCount = len(ec.events)
for _, events := range ec.events {
totalEvents += len(events)
}
return
}
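
A hedged sketch of the trimming behavior (log import assumed): with maxSize = 2, a third AddEvent drops the oldest entry, and GetRecentEvents returns the survivors oldest-first.

    ec := NewEventCache(2)
    ec.AddEvent("h1", DockerEvent{Action: "create"})
    ec.AddEvent("h1", DockerEvent{Action: "start"})
    ec.AddEvent("h1", DockerEvent{Action: "die"}) // "create" is trimmed away
    for _, e := range ec.GetRecentEvents("h1", 10) {
        log.Println(e.Action) // prints "start", then "die"
    }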

View File

@@ -0,0 +1,351 @@
package main
import (
"context"
"crypto/tls"
"crypto/x509"
"fmt"
"log"
"net"
"net/http"
"sync"
"time"
"github.com/docker/docker/api/types/events"
"github.com/docker/docker/api/types/filters"
"github.com/docker/docker/client"
)
// DockerEvent represents a Docker container event
type DockerEvent struct {
Action string `json:"action"`
ContainerID string `json:"container_id"`
ContainerName string `json:"container_name"`
Image string `json:"image"`
HostID string `json:"host_id"`
Timestamp string `json:"timestamp"`
Attributes map[string]string `json:"attributes"`
}
// EventManager manages Docker event streams for multiple hosts
type EventManager struct {
mu sync.RWMutex
hosts map[string]*eventStream // key: hostID
broadcaster *EventBroadcaster
eventCache *EventCache
}
// eventStream represents a single Docker host event stream
type eventStream struct {
hostID string
hostAddr string
client *client.Client
ctx context.Context
cancel context.CancelFunc
active bool
}
// createEventTLSOption creates a Docker client TLS option from PEM-encoded certificates
func createEventTLSOption(caCertPEM, certPEM, keyPEM string) (client.Opt, error) {
// Parse CA certificate
caCertPool := x509.NewCertPool()
if !caCertPool.AppendCertsFromPEM([]byte(caCertPEM)) {
return nil, fmt.Errorf("failed to parse CA certificate")
}
// Parse client certificate and key
clientCert, err := tls.X509KeyPair([]byte(certPEM), []byte(keyPEM))
if err != nil {
return nil, fmt.Errorf("failed to parse client certificate/key: %v", err)
}
// Create TLS config
tlsConfig := &tls.Config{
Certificates: []tls.Certificate{clientCert},
RootCAs: caCertPool,
MinVersion: tls.VersionTLS12,
}
// Create HTTP client with TLS transport and timeouts
// Note: No overall Timeout set because Docker API streaming operations (stats, events)
// are long-running connections that should not be killed by a timeout
httpClient := &http.Client{
Transport: &http.Transport{
DialContext: (&net.Dialer{
Timeout: 30 * time.Second, // Connection establishment timeout
KeepAlive: 30 * time.Second, // TCP keepalive interval
}).DialContext,
TLSClientConfig: tlsConfig,
TLSHandshakeTimeout: 10 * time.Second,
IdleConnTimeout: 90 * time.Second,
ResponseHeaderTimeout: 10 * time.Second,
},
}
return client.WithHTTPClient(httpClient), nil
}
// NewEventManager creates a new event manager
func NewEventManager(broadcaster *EventBroadcaster, cache *EventCache) *EventManager {
return &EventManager{
hosts: make(map[string]*eventStream),
broadcaster: broadcaster,
eventCache: cache,
}
}
// AddHost starts monitoring Docker events for a host
func (em *EventManager) AddHost(hostID, hostAddress, tlsCACert, tlsCert, tlsKey string) error {
// Create Docker client FIRST (before acquiring lock or stopping old stream)
var dockerClient *client.Client
var err error
if hostAddress == "" || hostAddress == "unix:///var/run/docker.sock" {
// Local Docker socket
dockerClient, err = client.NewClientWithOpts(
client.FromEnv,
client.WithAPIVersionNegotiation(),
)
} else {
// Remote Docker host - check if TLS is needed
clientOpts := []client.Opt{
client.WithHost(hostAddress),
client.WithAPIVersionNegotiation(),
}
// If TLS certificates provided, configure TLS
if tlsCACert != "" && tlsCert != "" && tlsKey != "" {
tlsOpt, err := createEventTLSOption(tlsCACert, tlsCert, tlsKey)
if err != nil {
return fmt.Errorf("failed to create TLS config: %v", err)
}
clientOpts = append(clientOpts, tlsOpt)
}
dockerClient, err = client.NewClientWithOpts(clientOpts...)
}
if err != nil {
return err
}
// Now that new client is successfully created, acquire lock and swap
em.mu.Lock()
defer em.mu.Unlock()
// If already monitoring, stop the old stream (only after new client succeeds)
if stream, exists := em.hosts[hostID]; exists && stream.active {
log.Printf("Stopping existing event monitoring for host %s to update", truncateID(hostID, 8))
stream.cancel()
stream.active = false
if stream.client != nil {
stream.client.Close()
}
}
// Create context for this stream
ctx, cancel := context.WithCancel(context.Background())
stream := &eventStream{
hostID: hostID,
hostAddr: hostAddress,
client: dockerClient,
ctx: ctx,
cancel: cancel,
active: true,
}
em.hosts[hostID] = stream
// Start event stream in goroutine
go em.streamEvents(stream)
log.Printf("Started event monitoring for host %s (%s)", truncateID(hostID, 8), hostAddress)
return nil
}
// RemoveHost stops monitoring events for a host
func (em *EventManager) RemoveHost(hostID string) {
em.mu.Lock()
defer em.mu.Unlock()
if stream, exists := em.hosts[hostID]; exists {
stream.cancel()
stream.active = false
if stream.client != nil {
stream.client.Close()
}
// Clear cached events for this host
em.eventCache.ClearHost(hostID)
delete(em.hosts, hostID)
log.Printf("Stopped event monitoring for host %s", truncateID(hostID, 8))
}
}
// StopAll stops all event monitoring
func (em *EventManager) StopAll() {
em.mu.Lock()
defer em.mu.Unlock()
for hostID, stream := range em.hosts {
stream.cancel()
stream.active = false
if stream.client != nil {
stream.client.Close()
}
log.Printf("Stopped event monitoring for host %s", truncateID(hostID, 8))
}
em.hosts = make(map[string]*eventStream)
}
// GetActiveHosts returns count of active event streams
func (em *EventManager) GetActiveHosts() int {
em.mu.RLock()
defer em.mu.RUnlock()
return len(em.hosts)
}
// streamEvents listens to Docker events for a specific host
func (em *EventManager) streamEvents(stream *eventStream) {
defer func() {
if r := recover(); r != nil {
log.Printf("Recovered from panic in event stream for %s: %v", truncateID(stream.hostID, 8), r)
}
}()
// Retry loop with exponential backoff
backoff := time.Second
maxBackoff := 30 * time.Second
// Track if we've received any successful events (to know when to reset backoff)
receivedSuccessfulEvent := false
for {
select {
case <-stream.ctx.Done():
log.Printf("Event stream for host %s stopped", truncateID(stream.hostID, 8))
return
default:
}
// Listen to container events only
eventFilters := filters.NewArgs()
eventFilters.Add("type", "container")
eventOptions := events.ListOptions{
Filters: eventFilters,
}
eventsChan, errChan := stream.client.Events(stream.ctx, eventOptions)
readLoop:
for {
select {
case <-stream.ctx.Done():
return
case err := <-errChan:
if err != nil {
log.Printf("Event stream error for host %s: %v (retrying in %v)", truncateID(stream.hostID, 8), err, backoff)
}
// Sleep before reconnecting so a closed error channel cannot busy-loop
time.Sleep(backoff)
// Only increase backoff if this connection never delivered an event
if !receivedSuccessfulEvent {
backoff = min(backoff*2, maxBackoff)
} else {
// The connection worked before; start the next attempt with a short backoff
backoff = time.Second
}
// Start fresh for the next connection attempt
receivedSuccessfulEvent = false
// Leave the read loop and let the outer loop re-dial
// (labeled break replaces a goto to a label at the end of the loop,
// which is not followed by a statement and does not compile)
break readLoop
case event := <-eventsChan:
// Reset backoff on first successful event after reconnection
if !receivedSuccessfulEvent {
backoff = time.Second
receivedSuccessfulEvent = true
}
// Process the event with panic recovery
func() {
defer func() {
if r := recover(); r != nil {
log.Printf("Recovered from panic in processEvent for host %s: %v", truncateID(stream.hostID, 8), r)
}
}()
em.processEvent(stream.hostID, event)
}()
}
}
}
}
// processEvent converts Docker event to our format and broadcasts it
func (em *EventManager) processEvent(hostID string, event events.Message) {
// Extract container info
containerID := event.Actor.ID
// Safely extract attributes with defensive access pattern
containerName := ""
image := ""
if attrs := event.Actor.Attributes; attrs != nil {
if name, ok := attrs["name"]; ok {
containerName = name
}
if img, ok := attrs["image"]; ok {
image = img
}
}
// Create our event
dockerEvent := DockerEvent{
Action: string(event.Action),
ContainerID: containerID,
ContainerName: containerName,
Image: image,
HostID: hostID,
Timestamp: time.Unix(event.Time, 0).Format(time.RFC3339),
Attributes: event.Actor.Attributes,
}
// Only log important events (not noisy exec_* events)
action := dockerEvent.Action
if action != "" && !isExecEvent(action) && isImportantEvent(action) {
log.Printf("Event: %s - container %s (%s) on host %s",
dockerEvent.Action,
dockerEvent.ContainerName,
truncateID(dockerEvent.ContainerID, 12),
truncateID(hostID, 8))
}
// Add to cache
em.eventCache.AddEvent(hostID, dockerEvent)
// Broadcast to all WebSocket clients
em.broadcaster.Broadcast(dockerEvent)
}
// isExecEvent checks if the event is an exec_* event (noisy)
func isExecEvent(action string) bool {
return len(action) > 5 && action[:5] == "exec_"
}
// isImportantEvent checks if the event should be logged
func isImportantEvent(action string) bool {
importantEvents := map[string]bool{
"create": true,
"start": true,
"stop": true,
"die": true,
"kill": true,
"destroy": true,
"pause": true,
"unpause": true,
"restart": true,
"oom": true,
"health_status": true,
}
return importantEvents[action]
}
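
For reference, a hedged illustration (values invented) of what one broadcast event looks like, given the DockerEvent JSON tags above:

    evt := DockerEvent{
        Action:        "start",
        ContainerID:   "4f1a2b3c4d5e",
        ContainerName: "web",
        Image:         "nginx:latest",
        HostID:        "h1",
        Timestamp:     "2025-10-31T17:13:00+01:00",
        Attributes:    map[string]string{"image": "nginx:latest", "name": "web"},
    }
    // json.Marshal(evt) produces:
    // {"action":"start","container_id":"4f1a2b3c4d5e","container_name":"web",
    //  "image":"nginx:latest","host_id":"h1","timestamp":"2025-10-31T17:13:00+01:00",
    //  "attributes":{"image":"nginx:latest","name":"web"}}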

View File

@@ -0,0 +1,40 @@
module github.com/dockmon/stats-service
go 1.23.0
toolchain go1.23.12
require (
github.com/docker/docker v27.5.1+incompatible
github.com/gorilla/websocket v1.5.3
)
require github.com/docker/go-connections v0.5.0 // indirect
require (
github.com/Microsoft/go-winio v0.6.1 // indirect
github.com/containerd/log v0.1.0 // indirect
github.com/distribution/reference v0.6.0 // indirect
github.com/docker/go-units v0.5.0 // indirect
github.com/felixge/httpsnoop v1.0.4 // indirect
github.com/go-logr/logr v1.4.3 // indirect
github.com/go-logr/stdr v1.2.2 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/moby/docker-image-spec v1.3.1 // indirect
github.com/moby/term v0.5.0 // indirect
github.com/morikuni/aec v1.0.0 // indirect
github.com/opencontainers/go-digest v1.0.0 // indirect
github.com/opencontainers/image-spec v1.1.0 // indirect
github.com/pkg/errors v0.9.1 // indirect
go.opentelemetry.io/auto/sdk v1.1.0 // indirect
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0 // indirect
go.opentelemetry.io/otel v1.38.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.38.0 // indirect
go.opentelemetry.io/otel/metric v1.38.0 // indirect
go.opentelemetry.io/otel/trace v1.38.0 // indirect
golang.org/x/mod v0.8.0 // indirect
golang.org/x/sys v0.35.0 // indirect
golang.org/x/time v0.5.0 // indirect
golang.org/x/tools v0.6.0 // indirect
gotest.tools/v3 v3.5.1 // indirect
)

View File

@@ -0,0 +1,128 @@
github.com/Azure/go-ansiterm v0.0.0-20210617225240-d185dfc1b5a1 h1:UQHMgLO+TxOElx5B5HZ4hJQsoJ/PvUvKRhJHDQXO8P8=
github.com/Azure/go-ansiterm v0.0.0-20210617225240-d185dfc1b5a1/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E=
github.com/Microsoft/go-winio v0.6.1 h1:9/kr64B9VUZrLm5YYwbGtUJnMgqWVOdUAXu6Migciow=
github.com/Microsoft/go-winio v0.6.1/go.mod h1:LRdKpFKfdobln8UmuiYcKPot9D2v6svN5+sAH+4kjUM=
github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1xcsSM=
github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw=
github.com/containerd/log v0.1.0 h1:TCJt7ioM2cr/tfR8GPbGf9/VRAX8D2B4PjzCpfX540I=
github.com/containerd/log v0.1.0/go.mod h1:VRRf09a7mHDIRezVKTRCrOq78v577GXq3bSa3EhrzVo=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5QvfrDyIgxBk=
github.com/distribution/reference v0.6.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E=
github.com/docker/docker v27.5.1+incompatible h1:4PYU5dnBYqRQi0294d1FBECqT9ECWeQAIfE8q4YnPY8=
github.com/docker/docker v27.5.1+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk=
github.com/docker/go-connections v0.5.0 h1:USnMq7hx7gwdVZq1L49hLXaFtUdTADjXGp+uj1Br63c=
github.com/docker/go-connections v0.5.0/go.mod h1:ov60Kzw0kKElRwhNs9UlUHAE/F9Fe6GLaXnqyDdmEXc=
github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4=
github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk=
github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI=
github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=
github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q=
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg=
github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2 h1:8Tjv8EJ+pM1xP8mK6egEbD1OgnVTyacbefKhmbLhIhU=
github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2/go.mod h1:pkJQ2tZHJ0aFOVEEot6oZmaVEZcRme73eIFmhiVuRWs=
github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
github.com/moby/docker-image-spec v1.3.1 h1:jMKff3w6PgbfSa69GfNg+zN/XLhfXJGnEx3Nl2EsFP0=
github.com/moby/docker-image-spec v1.3.1/go.mod h1:eKmb5VW8vQEh/BAr2yvVNvuiJuY6UIocYsFu/DxxRpo=
github.com/moby/term v0.5.0 h1:xt8Q1nalod/v7BqbG21f8mQPqH+xAaC9C3N3wfWbVP0=
github.com/moby/term v0.5.0/go.mod h1:8FzsFHVUBGZdbDsJw/ot+X+d5HLUbvklYLJ9uGfcI3Y=
github.com/morikuni/aec v1.0.0 h1:nP9CBfwrvYnBRgY6qfDQkygYDmYwOilePFkwzv4dU8A=
github.com/morikuni/aec v1.0.0/go.mod h1:BbKIizmSmc5MMPqRYbxO4ZU0S0+P200+tUnFx7PXmsc=
github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U=
github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM=
github.com/opencontainers/image-spec v1.1.0 h1:8SG7/vwALn54lVB/0yZ/MMwhFrPYtpEHQb2IpWsCzug=
github.com/opencontainers/image-spec v1.1.0/go.mod h1:W4s4sFTMaBeK1BQLXbG4AdM2szdn85PY75RI83NrTrM=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ=
github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ=
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA=
go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A=
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0 h1:RbKq8BG0FI8OiXhBfcRtqqHcZcka+gU3cskNuf05R18=
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0/go.mod h1:h06DGIukJOevXaj/xrNjhi/2098RZzcLTbc0jDAUbsg=
go.opentelemetry.io/otel v1.38.0 h1:RkfdswUDRimDg0m2Az18RKOsnI8UDzppJAtj01/Ymk8=
go.opentelemetry.io/otel v1.38.0/go.mod h1:zcmtmQ1+YmQM9wrNsTGV/q/uyusom3P8RxwExxkZhjM=
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0 h1:GqRJVj7UmLjCVyVJ3ZFLdPRmhDUp2zFmQe3RHIOsw24=
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0/go.mod h1:ri3aaHSmCTVYu2AWv44YMauwAQc0aqI9gHKIcSbI1pU=
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.38.0 h1:aTL7F04bJHUlztTsNGJ2l+6he8c+y/b//eR0jjjemT4=
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.38.0/go.mod h1:kldtb7jDTeol0l3ewcmd8SDvx3EmIE7lyvqbasU3QC4=
go.opentelemetry.io/otel/metric v1.38.0 h1:Kl6lzIYGAh5M159u9NgiRkmoMKjvbsKtYRwgfrA6WpA=
go.opentelemetry.io/otel/metric v1.38.0/go.mod h1:kB5n/QoRM8YwmUahxvI3bO34eVtQf2i4utNVLr9gEmI=
go.opentelemetry.io/otel/sdk v1.38.0 h1:l48sr5YbNf2hpCUj/FoGhW9yDkl+Ma+LrVl8qaM5b+E=
go.opentelemetry.io/otel/sdk v1.38.0/go.mod h1:ghmNdGlVemJI3+ZB5iDEuk4bWA3GkTpW+DOoZMYBVVg=
go.opentelemetry.io/otel/sdk/metric v1.38.0 h1:aSH66iL0aZqo//xXzQLYozmWrXxyFkBJ6qT5wthqPoM=
go.opentelemetry.io/otel/sdk/metric v1.38.0/go.mod h1:dg9PBnW9XdQ1Hd6ZnRz689CbtrUp0wMMs9iPcgT9EZA=
go.opentelemetry.io/otel/trace v1.38.0 h1:Fxk5bKrDZJUH+AMyyIXGcFAPah0oRcT+LuNtJrmcNLE=
go.opentelemetry.io/otel/trace v1.38.0/go.mod h1:j1P9ivuFsTceSWe1oY+EeW3sc+Pp42sO++GHkg4wwhs=
go.opentelemetry.io/proto/otlp v1.7.1 h1:gTOMpGDb0WTBOP8JaO72iL3auEZhVmAQg4ipjOVAtj4=
go.opentelemetry.io/proto/otlp v1.7.1/go.mod h1:b2rVh6rfI/s2pHWNlB7ILJcRALpcNDzKhACevjI+ZnE=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.8.0 h1:LUYupSeNrTNCGzR/hVBk2NHZO4hXcVaW1k4Qx7rjPx8=
golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE=
golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.1.0 h1:wsuoTGHzEhffawBOhz5CYhcrV4IdKZbEyZjBMuTp12o=
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI=
golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.28.0 h1:rhazDwis8INMIwQ4tpjLDzUhx6RlXqZNPEM0huQojng=
golang.org/x/text v0.28.0/go.mod h1:U8nCwOR8jO/marOQ0QbDiOngZVEBB7MAiitBuMjXiNU=
golang.org/x/time v0.5.0 h1:o7cqy6amK/52YcAKIPlM3a+Fpj35zvRj2TP+e1xFSfk=
golang.org/x/time v0.5.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
golang.org/x/tools v0.6.0 h1:BOw41kyTf3PuCW1pVQf8+Cyg8pMlkYB1oo9iJ6D/lKM=
golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
google.golang.org/genproto/googleapis/api v0.0.0-20250825161204-c5933d9347a5 h1:BIRfGDEjiHRrk0QKZe3Xv2ieMhtgRGeLcZQ0mIVn4EY=
google.golang.org/genproto/googleapis/api v0.0.0-20250825161204-c5933d9347a5/go.mod h1:j3QtIyytwqGr1JUDtYXwtMXWPKsEa5LtzIFN1Wn5WvE=
google.golang.org/genproto/googleapis/rpc v0.0.0-20250825161204-c5933d9347a5 h1:eaY8u2EuxbRv7c3NiGK0/NedzVsCcV6hDuU5qPX5EGE=
google.golang.org/genproto/googleapis/rpc v0.0.0-20250825161204-c5933d9347a5/go.mod h1:M4/wBTSeyLxupu3W3tJtOgB14jILAS/XWPSSa3TAlJc=
google.golang.org/grpc v1.75.0 h1:+TW+dqTd2Biwe6KKfhE5JpiYIBWq865PhKGSXiivqt4=
google.golang.org/grpc v1.75.0/go.mod h1:JtPAzKiq4v1xcAB2hydNlWI2RnF85XXcV0mhKXr2ecQ=
google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc=
google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gotest.tools/v3 v3.5.1 h1:EENdUnS3pdur5nybKYIh2Vfgc8IUNBjxDPSjtiJcOzU=
gotest.tools/v3 v3.5.1/go.mod h1:isy3WKz7GK6uNw/sbHzfKBLvlvXwUyV06n6brMxxopU=

View File

@@ -0,0 +1,569 @@
package main
import (
"context"
"crypto/rand"
"crypto/subtle"
"encoding/hex"
"encoding/json"
"log"
"net/http"
"os"
"os/signal"
"strconv"
"strings"
"syscall"
"time"
"github.com/gorilla/websocket"
)
// Configuration with environment variable support
var config = struct {
TokenFilePath string
Port string
AggregationInterval time.Duration
EventCacheSize int
CleanupInterval time.Duration
MaxRequestBodySize int64
AllowedOrigins string
}{
TokenFilePath: getEnv("TOKEN_FILE_PATH", "/tmp/stats-service-token"),
Port: getEnv("STATS_SERVICE_PORT", "8081"),
AggregationInterval: getEnvDuration("AGGREGATION_INTERVAL", "1s"),
EventCacheSize: getEnvInt("EVENT_CACHE_SIZE", 100),
CleanupInterval: getEnvDuration("CLEANUP_INTERVAL", "60s"),
MaxRequestBodySize: getEnvInt64("MAX_REQUEST_BODY_SIZE", 1048576), // 1MB default
AllowedOrigins: getEnv("ALLOWED_ORIGINS", "http://localhost:8080,http://127.0.0.1:8080,http://localhost,http://127.0.0.1"),
}
// getEnv gets environment variable with fallback
func getEnv(key, fallback string) string {
if value := os.Getenv(key); value != "" {
return value
}
return fallback
}
// getEnvInt gets integer environment variable with fallback
func getEnvInt(key string, fallback int) int {
if value := os.Getenv(key); value != "" {
if intVal, err := strconv.Atoi(value); err == nil {
return intVal
}
}
return fallback
}
// getEnvInt64 gets int64 environment variable with fallback
func getEnvInt64(key string, fallback int64) int64 {
if value := os.Getenv(key); value != "" {
if intVal, err := strconv.ParseInt(value, 10, 64); err == nil {
return intVal
}
}
return fallback
}
// getEnvDuration gets duration environment variable with fallback
func getEnvDuration(key string, fallback string) time.Duration {
value := getEnv(key, fallback)
if duration, err := time.ParseDuration(value); err == nil {
return duration
}
// Fallback parsing
if duration, err := time.ParseDuration(fallback); err == nil {
return duration
}
return 1 * time.Second // Ultimate fallback
}
// generateToken creates a cryptographically secure random token
func generateToken() (string, error) {
bytes := make([]byte, 32) // 256-bit token
if _, err := rand.Read(bytes); err != nil {
return "", err
}
return hex.EncodeToString(bytes), nil
}
// limitRequestBody limits the size of request bodies
func limitRequestBody(next http.HandlerFunc) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
r.Body = http.MaxBytesReader(w, r.Body, config.MaxRequestBodySize)
next(w, r)
}
}
// jsonResponse writes a JSON response and handles encoding errors
func jsonResponse(w http.ResponseWriter, data interface{}) {
w.Header().Set("Content-Type", "application/json")
if err := json.NewEncoder(w).Encode(data); err != nil {
log.Printf("Error encoding JSON response: %v", err)
// Can't send error status if response already partially sent
}
}
// authMiddleware validates the Bearer token using constant-time comparison
func authMiddleware(token string, next http.HandlerFunc) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
// Get Authorization header
authHeader := r.Header.Get("Authorization")
expectedAuth := "Bearer " + token
// Use constant-time comparison to prevent timing attacks
// Check length first (still constant-time for the comparison itself)
if len(authHeader) != len(expectedAuth) {
http.Error(w, "Unauthorized", http.StatusUnauthorized)
log.Printf("Unauthorized request from %s to %s (length mismatch)", r.RemoteAddr, r.URL.Path)
return
}
// Constant-time comparison of the full auth header
if subtle.ConstantTimeCompare([]byte(authHeader), []byte(expectedAuth)) != 1 {
http.Error(w, "Unauthorized", http.StatusUnauthorized)
log.Printf("Unauthorized request from %s to %s", r.RemoteAddr, r.URL.Path)
return
}
next(w, r)
}
}
func main() {
log.Println("Starting DockMon Stats Service...")
// Generate random token
token, err := generateToken()
if err != nil {
log.Fatalf("Failed to generate token: %v", err)
}
// Write token to file for Python backend
if err := os.WriteFile(config.TokenFilePath, []byte(token), 0600); err != nil {
log.Fatalf("Failed to write token file: %v", err)
}
log.Printf("Generated temporary auth token for stats service")
log.Printf("Configuration: port=%s, aggregation=%v, cache_size=%d, cleanup=%v",
config.Port, config.AggregationInterval, config.EventCacheSize, config.CleanupInterval)
// Create stats cache
cache := NewStatsCache()
// Create stream manager
streamManager := NewStreamManager(cache)
// Create aggregator with configured interval
aggregator := NewAggregator(cache, streamManager, config.AggregationInterval)
// Create event management components with configured cache size
eventCache := NewEventCache(config.EventCacheSize)
eventBroadcaster := NewEventBroadcaster()
eventManager := NewEventManager(eventBroadcaster, eventCache)
// Create context for graceful shutdown
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
// Start aggregator
go aggregator.Start(ctx)
// Start cleanup routine (drop stats older than the configured cleanup interval)
go func() {
ticker := time.NewTicker(config.CleanupInterval)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
cache.CleanStaleStats(config.CleanupInterval)
}
}
}()
// Create HTTP server
mux := http.NewServeMux()
// Health check endpoint
mux.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) {
_, totalEvents := eventCache.GetStats()
jsonResponse(w, map[string]interface{}{
"status": "ok",
"service": "dockmon-stats",
"stats_streams": streamManager.GetStreamCount(),
"event_hosts": eventManager.GetActiveHosts(),
"event_connections": eventBroadcaster.GetConnectionCount(),
"cached_events": totalEvents,
})
})
// Get all host stats (main endpoint for Python backend) - PROTECTED
mux.HandleFunc("/api/stats/hosts", authMiddleware(token, func(w http.ResponseWriter, r *http.Request) {
jsonResponse(w, cache.GetAllHostStats())
}))
// Get stats for a specific host - PROTECTED
mux.HandleFunc("/api/stats/host/", authMiddleware(token, func(w http.ResponseWriter, r *http.Request) {
hostID := r.URL.Path[len("/api/stats/host/"):]
if hostID == "" {
http.Error(w, "host_id required", http.StatusBadRequest)
return
}
stats, ok := cache.GetHostStats(hostID)
if !ok {
http.NotFound(w, r)
return
}
jsonResponse(w, stats)
}))
// Get all container stats (for debugging) - PROTECTED
mux.HandleFunc("/api/stats/containers", authMiddleware(token, func(w http.ResponseWriter, r *http.Request) {
jsonResponse(w, cache.GetAllContainerStats())
}))
// Start stream for a container (called by Python backend) - PROTECTED
mux.HandleFunc("/api/streams/start", authMiddleware(token, limitRequestBody(func(w http.ResponseWriter, r *http.Request) {
if r.Method != http.MethodPost {
http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
return
}
var req struct {
ContainerID string `json:"container_id"`
ContainerName string `json:"container_name"`
HostID string `json:"host_id"`
}
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
http.Error(w, err.Error(), http.StatusBadRequest)
return
}
// Validate required fields
if req.ContainerID == "" || req.HostID == "" {
http.Error(w, "container_id and host_id are required", http.StatusBadRequest)
return
}
if err := streamManager.StartStream(ctx, req.ContainerID, req.ContainerName, req.HostID); err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
jsonResponse(w, map[string]string{"status": "started"})
})))
// Stop stream for a container - PROTECTED
mux.HandleFunc("/api/streams/stop", authMiddleware(token, limitRequestBody(func(w http.ResponseWriter, r *http.Request) {
if r.Method != http.MethodPost {
http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
return
}
var req struct {
ContainerID string `json:"container_id"`
HostID string `json:"host_id"`
}
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
http.Error(w, err.Error(), http.StatusBadRequest)
return
}
// Validate required fields
if req.ContainerID == "" || req.HostID == "" {
http.Error(w, "container_id and host_id are required", http.StatusBadRequest)
return
}
streamManager.StopStream(req.ContainerID, req.HostID)
jsonResponse(w, map[string]string{"status": "stopped"})
})))
// Add Docker host - PROTECTED
mux.HandleFunc("/api/hosts/add", authMiddleware(token, func(w http.ResponseWriter, r *http.Request) {
if r.Method != http.MethodPost {
http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
return
}
var req struct {
HostID string `json:"host_id"`
HostAddress string `json:"host_address"`
TLSCACert string `json:"tls_ca_cert,omitempty"`
TLSCert string `json:"tls_cert,omitempty"`
TLSKey string `json:"tls_key,omitempty"`
}
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
http.Error(w, err.Error(), http.StatusBadRequest)
return
}
// Validate required fields
if req.HostID == "" || req.HostAddress == "" {
http.Error(w, "host_id and host_address are required", http.StatusBadRequest)
return
}
if err := streamManager.AddDockerHost(req.HostID, req.HostAddress, req.TLSCACert, req.TLSCert, req.TLSKey); err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
jsonResponse(w, map[string]string{"status": "added"})
}))
// Remove Docker host - PROTECTED
mux.HandleFunc("/api/hosts/remove", authMiddleware(token, func(w http.ResponseWriter, r *http.Request) {
if r.Method != http.MethodPost {
http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
return
}
var req struct {
HostID string `json:"host_id"`
}
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
http.Error(w, err.Error(), http.StatusBadRequest)
return
}
// Validate required fields
if req.HostID == "" {
http.Error(w, "host_id is required", http.StatusBadRequest)
return
}
streamManager.RemoveDockerHost(req.HostID)
jsonResponse(w, map[string]string{"status": "removed"})
}))
// Debug endpoint - PROTECTED
mux.HandleFunc("/debug/stats", authMiddleware(token, func(w http.ResponseWriter, r *http.Request) {
containerCount, hostCount := cache.GetStats()
jsonResponse(w, map[string]interface{}{
"streams": streamManager.GetStreamCount(),
"containers": containerCount,
"hosts": hostCount,
})
}))
// === Event Monitoring Endpoints ===
// Start monitoring events for a host - PROTECTED
mux.HandleFunc("/api/events/hosts/add", authMiddleware(token, func(w http.ResponseWriter, r *http.Request) {
if r.Method != http.MethodPost {
http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
return
}
var req struct {
HostID string `json:"host_id"`
HostAddress string `json:"host_address"`
TLSCACert string `json:"tls_ca_cert,omitempty"`
TLSCert string `json:"tls_cert,omitempty"`
TLSKey string `json:"tls_key,omitempty"`
}
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
http.Error(w, err.Error(), http.StatusBadRequest)
return
}
// Validate required fields
if req.HostID == "" || req.HostAddress == "" {
http.Error(w, "host_id and host_address are required", http.StatusBadRequest)
return
}
if err := eventManager.AddHost(req.HostID, req.HostAddress, req.TLSCACert, req.TLSCert, req.TLSKey); err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
jsonResponse(w, map[string]string{"status": "started"})
}))
// Stop monitoring events for a host - PROTECTED
mux.HandleFunc("/api/events/hosts/remove", authMiddleware(token, func(w http.ResponseWriter, r *http.Request) {
if r.Method != http.MethodPost {
http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
return
}
var req struct {
HostID string `json:"host_id"`
}
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
http.Error(w, err.Error(), http.StatusBadRequest)
return
}
// Validate required fields
if req.HostID == "" {
http.Error(w, "host_id is required", http.StatusBadRequest)
return
}
eventManager.RemoveHost(req.HostID)
jsonResponse(w, map[string]string{"status": "stopped"})
}))
// Get recent events - PROTECTED
mux.HandleFunc("/api/events/recent", authMiddleware(token, func(w http.ResponseWriter, r *http.Request) {
hostID := r.URL.Query().Get("host_id")
var events interface{}
if hostID != "" {
// Get events for specific host
events = eventCache.GetRecentEvents(hostID, 50)
} else {
// Get events for all hosts
events = eventCache.GetAllRecentEvents(50)
}
jsonResponse(w, events)
}))
// WebSocket endpoint for event streaming - PROTECTED
mux.HandleFunc("/ws/events", func(w http.ResponseWriter, r *http.Request) {
// Validate token from query parameter or header (constant-time, matching authMiddleware)
tokenParam := r.URL.Query().Get("token")
authHeader := r.Header.Get("Authorization")
validToken := subtle.ConstantTimeCompare([]byte(tokenParam), []byte(token)) == 1 ||
subtle.ConstantTimeCompare([]byte(authHeader), []byte("Bearer "+token)) == 1
if !validToken {
http.Error(w, "Unauthorized", http.StatusUnauthorized)
log.Printf("Unauthorized WebSocket connection attempt from %s", r.RemoteAddr)
return
}
// Upgrade to WebSocket
upgrader := websocket.Upgrader{
CheckOrigin: func(r *http.Request) bool {
origin := r.Header.Get("Origin")
if origin == "" {
return true // No Origin header: non-browser client, allow
}
// Check against configured allowed origins
allowedOrigins := strings.Split(config.AllowedOrigins, ",")
for _, allowed := range allowedOrigins {
if strings.TrimSpace(allowed) == origin {
return true
}
}
return false
},
}
conn, err := upgrader.Upgrade(w, r, nil)
if err != nil {
log.Printf("WebSocket upgrade failed: %v", err)
return
}
// Register connection
if err := eventBroadcaster.AddConnection(conn); err != nil {
log.Printf("Failed to register connection: %v", err)
conn.WriteMessage(websocket.CloseMessage, websocket.FormatCloseMessage(websocket.ClosePolicyViolation, "Connection limit reached"))
conn.Close()
return
}
// Handle connection (read loop to detect disconnect)
go func() {
defer func() {
eventBroadcaster.RemoveConnection(conn)
conn.Close()
}()
for {
// Read messages (just to detect disconnect, we don't expect any)
_, _, err := conn.ReadMessage()
if err != nil {
break
}
}
}()
})
// Create server with configured port
srv := &http.Server{
Addr: ":" + config.Port,
Handler: mux,
ReadTimeout: 10 * time.Second,
WriteTimeout: 10 * time.Second,
IdleTimeout: 60 * time.Second,
}
// Start server in goroutine
go func() {
log.Printf("Stats service listening on %s", srv.Addr)
if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed {
log.Fatalf("Server error: %v", err)
}
}()
// Wait for interrupt signal
sigChan := make(chan os.Signal, 1)
signal.Notify(sigChan, os.Interrupt, syscall.SIGTERM)
<-sigChan
log.Println("Shutting down stats service...")
// Graceful shutdown
shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 10*time.Second)
defer shutdownCancel()
// Stop all stats streams
streamManager.StopAllStreams()
// Stop all event monitoring
eventManager.StopAll()
// Close all event WebSocket connections
eventBroadcaster.CloseAll()
// Stop HTTP server
if err := srv.Shutdown(shutdownCtx); err != nil {
log.Printf("Server shutdown error: %v", err)
}
// Cancel context to stop aggregator
cancel()
// Clean up token file
if err := os.Remove(config.TokenFilePath); err != nil {
log.Printf("Warning: Failed to remove token file: %v", err)
} else {
log.Println("Removed token file")
}
log.Println("Stats service stopped")
}
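
A hedged consumer sketch (not part of the commit): read the shared token file and call the protected hosts endpoint, mirroring what the Python backend does. The path and port are the defaults configured above.

    package main

    import (
        "fmt"
        "io"
        "net/http"
        "os"
        "strings"
    )

    func main() {
        tokenBytes, err := os.ReadFile("/tmp/stats-service-token")
        if err != nil {
            panic(err)
        }
        req, _ := http.NewRequest(http.MethodGet, "http://localhost:8081/api/stats/hosts", nil)
        req.Header.Set("Authorization", "Bearer "+strings.TrimSpace(string(tokenBytes)))
        resp, err := http.DefaultClient.Do(req)
        if err != nil {
            panic(err)
        }
        defer resp.Body.Close()
        body, _ := io.ReadAll(resp.Body)
        fmt.Println(resp.Status, string(body)) // JSON map keyed by host_id
    }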

View File

@@ -0,0 +1,440 @@
package main
import (
"context"
"crypto/tls"
"crypto/x509"
"encoding/json"
"fmt"
"io"
"log"
"net"
"net/http"
"strings"
"sync"
"time"
"github.com/docker/docker/api/types"
"github.com/docker/docker/client"
)
// ContainerInfo holds basic container information
type ContainerInfo struct {
ID string
Name string
HostID string
}
// createTLSOption creates a Docker client TLS option from PEM-encoded certificates
func createTLSOption(caCertPEM, certPEM, keyPEM string) (client.Opt, error) {
// Parse CA certificate
caCertPool := x509.NewCertPool()
if !caCertPool.AppendCertsFromPEM([]byte(caCertPEM)) {
return nil, fmt.Errorf("failed to parse CA certificate")
}
// Parse client certificate and key
clientCert, err := tls.X509KeyPair([]byte(certPEM), []byte(keyPEM))
if err != nil {
return nil, fmt.Errorf("failed to parse client certificate/key: %v", err)
}
// Create TLS config
tlsConfig := &tls.Config{
Certificates: []tls.Certificate{clientCert},
RootCAs: caCertPool,
MinVersion: tls.VersionTLS12,
}
// Create HTTP client with TLS transport and timeouts
// Note: No overall Timeout set because Docker API streaming operations (stats, events)
// are long-running connections that should not be killed by a timeout
httpClient := &http.Client{
Transport: &http.Transport{
DialContext: (&net.Dialer{
Timeout: 30 * time.Second, // Connection establishment timeout
KeepAlive: 30 * time.Second, // TCP keepalive interval
}).DialContext,
TLSClientConfig: tlsConfig,
TLSHandshakeTimeout: 10 * time.Second,
IdleConnTimeout: 90 * time.Second,
ResponseHeaderTimeout: 10 * time.Second,
},
}
return client.WithHTTPClient(httpClient), nil
}
// StreamManager manages persistent stats streams for all containers
type StreamManager struct {
cache *StatsCache
clients map[string]*client.Client // hostID -> Docker client
clientsMu sync.RWMutex
streams map[string]context.CancelFunc // composite key (hostID:containerID) -> cancel function
streamsMu sync.RWMutex
containers map[string]*ContainerInfo // composite key (hostID:containerID) -> info
containersMu sync.RWMutex
}
// NewStreamManager creates a new stream manager
func NewStreamManager(cache *StatsCache) *StreamManager {
return &StreamManager{
cache: cache,
clients: make(map[string]*client.Client),
streams: make(map[string]context.CancelFunc),
containers: make(map[string]*ContainerInfo),
}
}
// AddDockerHost adds a Docker host client
func (sm *StreamManager) AddDockerHost(hostID, hostAddress, tlsCACert, tlsCert, tlsKey string) error {
// Create Docker client for this host FIRST (before acquiring lock)
var cli *client.Client
var err error
if hostAddress == "" || hostAddress == "unix:///var/run/docker.sock" {
// Local Docker socket
cli, err = client.NewClientWithOpts(
client.FromEnv,
client.WithAPIVersionNegotiation(),
)
} else {
// Remote Docker host - check if TLS is needed
clientOpts := []client.Opt{
client.WithHost(hostAddress),
client.WithAPIVersionNegotiation(),
}
// If TLS certificates provided, configure TLS
if tlsCACert != "" && tlsCert != "" && tlsKey != "" {
tlsOpt, err := createTLSOption(tlsCACert, tlsCert, tlsKey)
if err != nil {
return fmt.Errorf("failed to create TLS config: %v", err)
}
clientOpts = append(clientOpts, tlsOpt)
}
cli, err = client.NewClientWithOpts(clientOpts...)
}
if err != nil {
return err
}
// Track whether client was successfully stored to prevent leak
clientStored := false
defer func() {
if !clientStored && cli != nil {
cli.Close()
log.Printf("Cleaned up unstored Docker client for host %s", truncateID(hostID, 8))
}
}()
// Now that new client is successfully created, acquire lock and swap
sm.clientsMu.Lock()
defer sm.clientsMu.Unlock()
// Close existing client if it exists (only after new one succeeds)
if existingClient, exists := sm.clients[hostID]; exists {
existingClient.Close()
log.Printf("Closed existing Docker client for host %s", truncateID(hostID, 8))
}
sm.clients[hostID] = cli
clientStored = true // Mark as successfully stored
log.Printf("Added Docker host: %s (%s)", truncateID(hostID, 8), hostAddress)
// Initialize host stats with zero values so the host appears immediately in the UI
sm.cache.UpdateHostStats(&HostStats{
HostID: hostID,
ContainerCount: 0,
})
return nil
}
// RemoveDockerHost removes a Docker host client and stops all its streams
func (sm *StreamManager) RemoveDockerHost(hostID string) {
// First, find all containers for this host
sm.containersMu.RLock()
containersToStop := make([]string, 0)
for compositeKey, info := range sm.containers {
if info.HostID == hostID {
containersToStop = append(containersToStop, compositeKey)
}
}
sm.containersMu.RUnlock()
// Stop all streams for containers on this host
// Do this BEFORE closing the client to avoid streams trying to use a closed client
for _, compositeKey := range containersToStop {
// Extract container ID from composite key (format: hostID:containerID)
parts := strings.SplitN(compositeKey, ":", 2)
if len(parts) == 2 {
sm.StopStream(parts[1], parts[0]) // containerID, hostID
}
}
// Now close and remove the Docker client
sm.clientsMu.Lock()
defer sm.clientsMu.Unlock()
if cli, exists := sm.clients[hostID]; exists {
cli.Close()
delete(sm.clients, hostID)
log.Printf("Removed Docker host: %s", truncateID(hostID, 8))
}
// Remove all stats for this host from cache
sm.cache.RemoveHostStats(hostID)
}
// StartStream starts a persistent stats stream for a container
func (sm *StreamManager) StartStream(ctx context.Context, containerID, containerName, hostID string) error {
// Create composite key to support containers with duplicate IDs on different hosts
compositeKey := fmt.Sprintf("%s:%s", hostID, containerID)
// Acquire locks in consistent order: clientsMu → streamsMu → containersMu (when needed)
sm.clientsMu.RLock()
sm.streamsMu.Lock()
// Check if stream already exists
if _, exists := sm.streams[compositeKey]; exists {
sm.streamsMu.Unlock()
sm.clientsMu.RUnlock()
return nil // Already streaming
}
// Check if client exists
_, clientExists := sm.clients[hostID]
if !clientExists {
sm.streamsMu.Unlock()
sm.clientsMu.RUnlock()
log.Printf("Warning: No Docker client for host %s", truncateID(hostID, 8))
return nil
}
// Create cancellable context for this stream
streamCtx, cancel := context.WithCancel(ctx)
sm.streams[compositeKey] = cancel
// Release locks before acquiring containersMu to prevent nested locking
sm.streamsMu.Unlock()
sm.clientsMu.RUnlock()
// Store container info with separate lock
sm.containersMu.Lock()
sm.containers[compositeKey] = &ContainerInfo{
ID: containerID,
Name: containerName,
HostID: hostID,
}
sm.containersMu.Unlock()
// Start streaming goroutine (no locks held)
go sm.streamStats(streamCtx, containerID, containerName, hostID)
log.Printf("Started stats stream for container %s (%s) on host %s", containerName, truncateID(containerID, 12), truncateID(hostID, 12))
return nil
}
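// Hypothetical discovery wiring showing how StartStream might be driven: list
// the containers running on a host and make sure each one has a stream.
// ContainerList and types.ContainerListOptions are standard Docker SDK calls
// for the SDK vintage implied by types.StatsJSON elsewhere in this file; the
// loop itself is an assumption about the surrounding service, not code from it.
func exampleSyncStreams(ctx context.Context, sm *StreamManager, cli *client.Client, hostID string) error {
containers, err := cli.ContainerList(ctx, types.ContainerListOptions{})
if err != nil {
return err
}
for _, c := range containers {
name := ""
if len(c.Names) > 0 {
name = strings.TrimPrefix(c.Names[0], "/") // Docker prefixes names with "/"
}
if err := sm.StartStream(ctx, c.ID, name, hostID); err != nil {
log.Printf("start stream for %s: %v", truncateID(c.ID, 12), err)
}
}
return nil
}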
// StopStream stops the stats stream for a container
func (sm *StreamManager) StopStream(containerID, hostID string) {
// Create composite key to support containers with duplicate IDs on different hosts
compositeKey := fmt.Sprintf("%s:%s", hostID, containerID)
sm.streamsMu.Lock()
defer sm.streamsMu.Unlock()
cancel, exists := sm.streams[compositeKey]
if exists {
cancel()
delete(sm.streams, compositeKey)
}
sm.containersMu.Lock()
defer sm.containersMu.Unlock()
delete(sm.containers, compositeKey)
// Remove from cache
sm.cache.RemoveContainerStats(containerID, hostID)
log.Printf("Stopped stats stream for container %s", truncateID(containerID, 12))
}
// streamStats maintains a persistent stats stream for a single container
func (sm *StreamManager) streamStats(ctx context.Context, containerID, containerName, hostID string) {
defer func() {
if r := recover(); r != nil {
log.Printf("Recovered from panic in stats stream for %s: %v", truncateID(containerID, 12), r)
}
}()
// Retry loop - restart stream if it fails
backoff := time.Second
maxBackoff := 30 * time.Second
for {
select {
case <-ctx.Done():
return
default:
}
// Get current Docker client (may have changed if host was updated)
sm.clientsMu.RLock()
cli, ok := sm.clients[hostID]
sm.clientsMu.RUnlock() // Manual unlock needed - we're in a loop
if !ok {
log.Printf("No Docker client for host %s (container %s), retrying in %v", truncateID(hostID, 8), truncateID(containerID, 12), backoff)
time.Sleep(backoff)
backoff = min(backoff*2, maxBackoff)
continue
}
// Open stats stream
stats, err := cli.ContainerStats(ctx, containerID, true) // stream=true
if err != nil {
log.Printf("Error opening stats stream for %s: %v (retrying in %v)", truncateID(containerID, 12), err, backoff)
time.Sleep(backoff)
backoff = min(backoff*2, maxBackoff)
continue
}
// Reset backoff on successful connection
backoff = time.Second
// Read stats from stream
decoder := json.NewDecoder(stats.Body)
for {
select {
case <-ctx.Done():
stats.Body.Close()
return
default:
}
var stat types.StatsJSON
if err := decoder.Decode(&stat); err != nil {
stats.Body.Close()
// Cancellation surfaces here as a wrapped read error, so a direct
// comparison with context.Canceled would miss it; ask the context instead.
if err == io.EOF || ctx.Err() != nil {
log.Printf("Stats stream ended for %s", truncateID(containerID, 12))
} else {
log.Printf("Error decoding stats for %s: %v", truncateID(containerID, 12), err)
}
break // Break inner loop, will retry in outer loop
}
// Calculate and cache stats
sm.processStats(&stat, containerID, containerName, hostID)
}
// Brief pause before reconnecting
time.Sleep(time.Second)
}
}
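// The retry pattern used by streamStats above, reduced to a standalone sketch
// for clarity. streamStats inlines this logic rather than calling a helper;
// this function exists only to illustrate its shape.
func retryWithBackoff(ctx context.Context, attempt func() error) {
backoff := time.Second
maxBackoff := 30 * time.Second
for ctx.Err() == nil {
if err := attempt(); err == nil {
backoff = time.Second // success: the next failure starts from a small delay again
continue
}
time.Sleep(backoff)
backoff = min(backoff*2, maxBackoff) // double on each failure, capped at 30s
}
}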
// processStats calculates metrics from raw Docker stats
func (sm *StreamManager) processStats(stat *types.StatsJSON, containerID, containerName, hostID string) {
// Calculate CPU percentage
cpuPercent := calculateCPUPercent(stat)
// Memory stats
memUsage := stat.MemoryStats.Usage
memLimit := stat.MemoryStats.Limit
memPercent := 0.0
if memLimit > 0 {
memPercent = (float64(memUsage) / float64(memLimit)) * 100.0
}
// Network stats
var netRx, netTx uint64
for _, net := range stat.Networks {
netRx += net.RxBytes
netTx += net.TxBytes
}
// Disk I/O stats
var diskRead, diskWrite uint64
for _, bio := range stat.BlkioStats.IoServiceBytesRecursive {
// cgroup v1 reports "Read"/"Write" while cgroup v2 reports lowercase
// "read"/"write", so compare case-insensitively.
switch strings.ToLower(bio.Op) {
case "read":
diskRead += bio.Value
case "write":
diskWrite += bio.Value
}
}
// Update cache
sm.cache.UpdateContainerStats(&ContainerStats{
ContainerID: containerID,
ContainerName: containerName,
HostID: hostID,
CPUPercent: roundToDecimal(cpuPercent, 1),
MemoryUsage: memUsage,
MemoryLimit: memLimit,
MemoryPercent: roundToDecimal(memPercent, 1),
NetworkRx: netRx,
NetworkTx: netTx,
DiskRead: diskRead,
DiskWrite: diskWrite,
})
}
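// A worked input for processStats, with invented numbers: two interfaces sum
// to NetworkRx=150 and NetworkTx=15, and the blkio entries split by Op into
// DiskRead=4096 and DiskWrite=8192. Illustrative only; nothing calls this.
func exampleProcessStatsInput() *types.StatsJSON {
stat := &types.StatsJSON{}
stat.Networks = map[string]types.NetworkStats{
"eth0": {RxBytes: 100, TxBytes: 10},
"eth1": {RxBytes: 50, TxBytes: 5},
}
stat.BlkioStats.IoServiceBytesRecursive = []types.BlkioStatEntry{
{Op: "Read", Value: 4096},
{Op: "Write", Value: 8192},
}
return stat
}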
// calculateCPUPercent calculates CPU percentage from Docker stats
func calculateCPUPercent(stat *types.StatsJSON) float64 {
// CPU calculation similar to `docker stats` command
cpuDelta := float64(stat.CPUStats.CPUUsage.TotalUsage) - float64(stat.PreCPUStats.CPUUsage.TotalUsage)
systemDelta := float64(stat.CPUStats.SystemUsage) - float64(stat.PreCPUStats.SystemUsage)
if systemDelta > 0.0 && cpuDelta > 0.0 {
numCPUs := float64(len(stat.CPUStats.CPUUsage.PercpuUsage))
if numCPUs == 0 {
numCPUs = 1.0
}
return (cpuDelta / systemDelta) * numCPUs * 100.0
}
return 0.0
}
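// Worked example of the formula above, with invented numbers: the container
// used 200ms of CPU time while the host accumulated 1s across 4 online CPUs,
// giving (0.2 / 1.0) * 4 * 100 = 80. As with `docker stats`, 100% means one
// fully used core, so a 4-CPU host tops out at 400%.
func exampleCPUPercent() float64 {
stat := &types.StatsJSON{}
stat.PreCPUStats.CPUUsage.TotalUsage = 0
stat.CPUStats.CPUUsage.TotalUsage = 200_000_000 // +200ms, in nanoseconds
stat.PreCPUStats.SystemUsage = 0
stat.CPUStats.SystemUsage = 1_000_000_000 // +1s of host CPU time
stat.CPUStats.OnlineCPUs = 4
return calculateCPUPercent(stat) // 80.0
}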
// GetStreamCount returns the number of active streams
func (sm *StreamManager) GetStreamCount() int {
sm.streamsMu.RLock()
defer sm.streamsMu.RUnlock()
return len(sm.streams)
}
// HasHost checks if a Docker host is registered
func (sm *StreamManager) HasHost(hostID string) bool {
sm.clientsMu.RLock()
defer sm.clientsMu.RUnlock()
_, exists := sm.clients[hostID]
return exists
}
// StopAllStreams stops all active streams and closes all Docker clients
func (sm *StreamManager) StopAllStreams() {
// Stop all streams
sm.streamsMu.Lock()
for compositeKey, cancel := range sm.streams {
cancel()
// Keys here are hostID:containerID composites, so log them whole rather
// than truncating to what would mostly be the host ID prefix.
log.Printf("Stopped stream for %s", compositeKey)
}
sm.streams = make(map[string]context.CancelFunc)
sm.streamsMu.Unlock()
// Close all Docker clients
sm.clientsMu.Lock()
for hostID, cli := range sm.clients {
cli.Close()
log.Printf("Closed Docker client for host %s", truncateID(hostID, 8))
}
sm.clients = make(map[string]*client.Client)
sm.clientsMu.Unlock()
}
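// Hypothetical shutdown wiring; how the surrounding main() actually invokes
// StopAllStreams is outside this excerpt, so treat this as an assumption. The
// property worth noting lives in StopAllStreams itself: every stream is
// cancelled before any client is closed, the same ordering RemoveDockerHost uses.
func exampleShutdown(ctx context.Context, sm *StreamManager) {
<-ctx.Done() // e.g. a context cancelled by signal handling elsewhere
sm.StopAllStreams()
}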
// min returns the smaller of two durations. (On Go 1.21+ the predeclared min
// builtin would serve; this package-level helper shadows it within the package.)
func min(a, b time.Duration) time.Duration {
if a < b {
return a
}
return b
}

View File

@@ -0,0 +1,9 @@
package main
// truncateID truncates an ID string to the specified length
func truncateID(id string, length int) string {
if len(id) <= length {
return id
}
return id[:length]
}
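// For example, truncateID("4f5a9c0d1e2b3a4c", 8) == "4f5a9c0d", while IDs at
// or under the limit pass through unchanged. Slicing by bytes is safe on the
// assumption that Docker container and host IDs are plain ASCII hex strings.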