// mq/dag/monitoring.go

package dag

import (
	"context"
	"fmt"
	"sync"
	"time"

	"github.com/oarkflow/mq"
	"github.com/oarkflow/mq/logger"
)

// MonitoringMetrics holds comprehensive metrics for DAG monitoring
type MonitoringMetrics struct {
	mu                   sync.RWMutex
	TasksTotal           int64
	TasksCompleted       int64
	TasksFailed          int64
	TasksCancelled       int64
	TasksInProgress      int64
	NodesExecuted        map[string]int64
	NodeExecutionTimes   map[string][]time.Duration
	NodeFailures         map[string]int64
	AverageExecutionTime time.Duration
	TotalExecutionTime   time.Duration
	StartTime            time.Time
	LastTaskCompletedAt  time.Time
	ActiveTasks          map[string]time.Time
	NodeProcessingStats  map[string]*NodeStats
}

// NodeStats holds statistics for individual nodes
type NodeStats struct {
	ExecutionCount   int64
	SuccessCount     int64
	FailureCount     int64
	TotalDuration    time.Duration
	AverageDuration  time.Duration
	MinDuration      time.Duration
	MaxDuration      time.Duration
	LastExecuted     time.Time
	LastSuccess      time.Time
	LastFailure      time.Time
	CurrentlyRunning int64
}

// NewMonitoringMetrics creates a new metrics instance
func NewMonitoringMetrics() *MonitoringMetrics {
	return &MonitoringMetrics{
		NodesExecuted:       make(map[string]int64),
		NodeExecutionTimes:  make(map[string][]time.Duration),
		NodeFailures:        make(map[string]int64),
		StartTime:           time.Now(),
		ActiveTasks:         make(map[string]time.Time),
		NodeProcessingStats: make(map[string]*NodeStats),
	}
}

// RecordTaskStart records the start of a task
func (m *MonitoringMetrics) RecordTaskStart(taskID string) {
	m.mu.Lock()
	defer m.mu.Unlock()
	m.TasksTotal++
	m.TasksInProgress++
	m.ActiveTasks[taskID] = time.Now()
}

// RecordTaskCompletion records task completion
func (m *MonitoringMetrics) RecordTaskCompletion(taskID string, status mq.Status) {
	m.mu.Lock()
	defer m.mu.Unlock()
	if startTime, exists := m.ActiveTasks[taskID]; exists {
		duration := time.Since(startTime)
		m.TotalExecutionTime += duration
		m.LastTaskCompletedAt = time.Now()
		delete(m.ActiveTasks, taskID)
		m.TasksInProgress--
	}
	switch status {
	case mq.Completed:
		m.TasksCompleted++
	case mq.Failed:
		m.TasksFailed++
	case mq.Cancelled:
		m.TasksCancelled++
	}
	// Update the average over all finished tasks, since TotalExecutionTime
	// accumulates completed, failed and cancelled tasks alike.
	if finished := m.TasksCompleted + m.TasksFailed + m.TasksCancelled; finished > 0 {
		m.AverageExecutionTime = m.TotalExecutionTime / time.Duration(finished)
	}
}

// RecordNodeExecution records node execution metrics
func (m *MonitoringMetrics) RecordNodeExecution(nodeID string, duration time.Duration, success bool) {
	m.mu.Lock()
	defer m.mu.Unlock()
	// Initialize node stats if they do not exist yet
	if _, exists := m.NodeProcessingStats[nodeID]; !exists {
		m.NodeProcessingStats[nodeID] = &NodeStats{
			MinDuration: duration,
			MaxDuration: duration,
		}
	}
	stats := m.NodeProcessingStats[nodeID]
	stats.ExecutionCount++
	stats.TotalDuration += duration
	stats.AverageDuration = stats.TotalDuration / time.Duration(stats.ExecutionCount)
	stats.LastExecuted = time.Now()
	if duration < stats.MinDuration || stats.MinDuration == 0 {
		stats.MinDuration = duration
	}
	if duration > stats.MaxDuration {
		stats.MaxDuration = duration
	}
	if success {
		stats.SuccessCount++
		stats.LastSuccess = time.Now()
	} else {
		stats.FailureCount++
		stats.LastFailure = time.Now()
		m.NodeFailures[nodeID]++
	}
	// Legacy tracking: keep only the last 100 execution times per node
	m.NodesExecuted[nodeID]++
	if len(m.NodeExecutionTimes[nodeID]) >= 100 {
		m.NodeExecutionTimes[nodeID] = m.NodeExecutionTimes[nodeID][1:]
	}
	m.NodeExecutionTimes[nodeID] = append(m.NodeExecutionTimes[nodeID], duration)
}

// RecordNodeStart records when a node starts processing
func (m *MonitoringMetrics) RecordNodeStart(nodeID string) {
	m.mu.Lock()
	defer m.mu.Unlock()
	// Create the stats entry on first sight so a node's first run is tracked too
	if _, exists := m.NodeProcessingStats[nodeID]; !exists {
		m.NodeProcessingStats[nodeID] = &NodeStats{}
	}
	m.NodeProcessingStats[nodeID].CurrentlyRunning++
}

// RecordNodeEnd records when a node finishes processing
func (m *MonitoringMetrics) RecordNodeEnd(nodeID string) {
	m.mu.Lock()
	defer m.mu.Unlock()
	if stats, exists := m.NodeProcessingStats[nodeID]; exists && stats.CurrentlyRunning > 0 {
		stats.CurrentlyRunning--
	}
}
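
// Illustrative helper (a sketch, not part of the original API): the three
// node-level hooks above are meant to be used together around a node's
// processing call. observeNode and its process parameter are assumed names;
// a nil error is treated as success.
func observeNode(m *MonitoringMetrics, nodeID string, process func() error) error {
	m.RecordNodeStart(nodeID)
	start := time.Now()
	err := process()
	m.RecordNodeEnd(nodeID)
	m.RecordNodeExecution(nodeID, time.Since(start), err == nil)
	return err
}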

// GetSnapshot returns a snapshot of current metrics
func (m *MonitoringMetrics) GetSnapshot() *MonitoringMetrics {
	m.mu.RLock()
	defer m.mu.RUnlock()
	snapshot := &MonitoringMetrics{
		TasksTotal:           m.TasksTotal,
		TasksCompleted:       m.TasksCompleted,
		TasksFailed:          m.TasksFailed,
		TasksCancelled:       m.TasksCancelled,
		TasksInProgress:      m.TasksInProgress,
		AverageExecutionTime: m.AverageExecutionTime,
		TotalExecutionTime:   m.TotalExecutionTime,
		StartTime:            m.StartTime,
		LastTaskCompletedAt:  m.LastTaskCompletedAt,
		NodesExecuted:        make(map[string]int64),
		NodeExecutionTimes:   make(map[string][]time.Duration),
		NodeFailures:         make(map[string]int64),
		ActiveTasks:          make(map[string]time.Time),
		NodeProcessingStats:  make(map[string]*NodeStats),
	}
	// Deep copy maps
	for k, v := range m.NodesExecuted {
		snapshot.NodesExecuted[k] = v
	}
	for k, v := range m.NodeFailures {
		snapshot.NodeFailures[k] = v
	}
	for k, v := range m.ActiveTasks {
		snapshot.ActiveTasks[k] = v
	}
	for k, v := range m.NodeExecutionTimes {
		snapshot.NodeExecutionTimes[k] = make([]time.Duration, len(v))
		copy(snapshot.NodeExecutionTimes[k], v)
	}
	for k, v := range m.NodeProcessingStats {
		snapshot.NodeProcessingStats[k] = &NodeStats{
			ExecutionCount:   v.ExecutionCount,
			SuccessCount:     v.SuccessCount,
			FailureCount:     v.FailureCount,
			TotalDuration:    v.TotalDuration,
			AverageDuration:  v.AverageDuration,
			MinDuration:      v.MinDuration,
			MaxDuration:      v.MaxDuration,
			LastExecuted:     v.LastExecuted,
			LastSuccess:      v.LastSuccess,
			LastFailure:      v.LastFailure,
			CurrentlyRunning: v.CurrentlyRunning,
		}
	}
	return snapshot
}

// GetNodeStats returns statistics for a specific node
func (m *MonitoringMetrics) GetNodeStats(nodeID string) *NodeStats {
	m.mu.RLock()
	defer m.mu.RUnlock()
	if stats, exists := m.NodeProcessingStats[nodeID]; exists {
		// Return a copy
		return &NodeStats{
			ExecutionCount:   stats.ExecutionCount,
			SuccessCount:     stats.SuccessCount,
			FailureCount:     stats.FailureCount,
			TotalDuration:    stats.TotalDuration,
			AverageDuration:  stats.AverageDuration,
			MinDuration:      stats.MinDuration,
			MaxDuration:      stats.MaxDuration,
			LastExecuted:     stats.LastExecuted,
			LastSuccess:      stats.LastSuccess,
			LastFailure:      stats.LastFailure,
			CurrentlyRunning: stats.CurrentlyRunning,
		}
	}
	return nil
}
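
// Illustrative only (not part of the original API): a caller can derive a
// per-node success rate from the copy returned by GetNodeStats. The
// nodeSuccessRate name is an assumption; a false second return value means
// the node has not executed yet.
func nodeSuccessRate(m *MonitoringMetrics, nodeID string) (float64, bool) {
	stats := m.GetNodeStats(nodeID)
	if stats == nil || stats.ExecutionCount == 0 {
		return 0, false
	}
	return float64(stats.SuccessCount) / float64(stats.ExecutionCount), true
}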

// Monitor provides comprehensive monitoring capabilities for DAG
type Monitor struct {
	dag              *DAG
	metrics          *MonitoringMetrics
	logger           logger.Logger
	alertThresholds  *AlertThresholds
	webhookURL       string
	alertHandlers    []AlertHandler
	monitoringActive bool
	stopCh           chan struct{}
	mu               sync.RWMutex
}

// AlertThresholds defines thresholds for alerting
type AlertThresholds struct {
	MaxFailureRate      float64       // Maximum allowed failure rate (0.0 - 1.0)
	MaxExecutionTime    time.Duration // Maximum allowed execution time
	MaxTasksInProgress  int64         // Maximum allowed concurrent tasks
	MinSuccessRate      float64       // Minimum required success rate
	MaxNodeFailures     int64         // Maximum failures per node
	HealthCheckInterval time.Duration // How often to check health
}

// AlertHandler defines the interface for handling alerts
type AlertHandler interface {
	HandleAlert(alert Alert) error
}

// Alert represents a monitoring alert
type Alert struct {
	Type      string
	Severity  string
	Message   string
	NodeID    string
	TaskID    string
	Timestamp time.Time
	Metrics   map[string]interface{}
}
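
// logAlertHandler is an illustrative AlertHandler implementation (a sketch,
// not part of the original file): it writes alerts to the injected logger.
// Handlers that fan alerts out to webhooks, email or paging would follow the
// same shape.
type logAlertHandler struct {
	Logger logger.Logger
}

func (h *logAlertHandler) HandleAlert(alert Alert) error {
	h.Logger.Warn("DAG alert",
		logger.Field{Key: "type", Value: alert.Type},
		logger.Field{Key: "severity", Value: alert.Severity},
		logger.Field{Key: "message", Value: alert.Message},
	)
	return nil
}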

// NewMonitor creates a new DAG monitor
func NewMonitor(dag *DAG, logger logger.Logger) *Monitor {
	return &Monitor{
		dag:     dag,
		metrics: NewMonitoringMetrics(),
		logger:  logger,
		alertThresholds: &AlertThresholds{
			MaxFailureRate:      0.1, // 10% failure rate
			MaxExecutionTime:    5 * time.Minute,
			MaxTasksInProgress:  1000,
			MinSuccessRate:      0.9, // 90% success rate
			MaxNodeFailures:     10,
			HealthCheckInterval: 30 * time.Second,
		},
		stopCh: make(chan struct{}),
	}
}

// Start begins monitoring
func (m *Monitor) Start(ctx context.Context) {
	m.mu.Lock()
	if m.monitoringActive {
		m.mu.Unlock()
		return
	}
	m.monitoringActive = true
	// Recreate the stop channel so the monitor can be restarted after Stop
	m.stopCh = make(chan struct{})
	m.mu.Unlock()
	// Start health check routine
	go m.healthCheckRoutine(ctx)
	m.logger.Info("DAG monitoring started")
}

// Stop stops monitoring
func (m *Monitor) Stop() {
	m.mu.Lock()
	defer m.mu.Unlock()
	if !m.monitoringActive {
		return
	}
	close(m.stopCh)
	m.monitoringActive = false
	m.logger.Info("DAG monitoring stopped")
}

// SetAlertThresholds updates alert thresholds
func (m *Monitor) SetAlertThresholds(thresholds *AlertThresholds) {
	m.mu.Lock()
	defer m.mu.Unlock()
	m.alertThresholds = thresholds
}

// AddAlertHandler adds an alert handler
func (m *Monitor) AddAlertHandler(handler AlertHandler) {
	m.mu.Lock()
	defer m.mu.Unlock()
	m.alertHandlers = append(m.alertHandlers, handler)
}

// GetMetrics returns a snapshot of the current metrics
func (m *Monitor) GetMetrics() *MonitoringMetrics {
	return m.metrics.GetSnapshot()
}

// healthCheckRoutine performs periodic health checks
func (m *Monitor) healthCheckRoutine(ctx context.Context) {
	// Read the interval and stop channel under the lock to avoid racing with
	// SetAlertThresholds and Start/Stop
	m.mu.RLock()
	interval := m.alertThresholds.HealthCheckInterval
	stopCh := m.stopCh
	m.mu.RUnlock()
	ticker := time.NewTicker(interval)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return
		case <-stopCh:
			return
		case <-ticker.C:
			m.performHealthCheck()
		}
	}
}

// performHealthCheck checks system health and triggers alerts
func (m *Monitor) performHealthCheck() {
	snapshot := m.metrics.GetSnapshot()
	// Copy the thresholds under the lock so SetAlertThresholds cannot race with this check
	m.mu.RLock()
	thresholds := *m.alertThresholds
	m.mu.RUnlock()
	// Check failure rate
	if snapshot.TasksTotal > 0 {
		failureRate := float64(snapshot.TasksFailed) / float64(snapshot.TasksTotal)
		if failureRate > thresholds.MaxFailureRate {
			m.triggerAlert(Alert{
				Type:      "high_failure_rate",
				Severity:  "warning",
				Message:   fmt.Sprintf("High failure rate: %.2f%%", failureRate*100),
				Timestamp: time.Now(),
				Metrics: map[string]interface{}{
					"failure_rate": failureRate,
					"total_tasks":  snapshot.TasksTotal,
					"failed_tasks": snapshot.TasksFailed,
				},
			})
		}
	}
	// Check tasks in progress
	if snapshot.TasksInProgress > thresholds.MaxTasksInProgress {
		m.triggerAlert(Alert{
			Type:      "high_task_load",
			Severity:  "warning",
			Message:   fmt.Sprintf("High number of tasks in progress: %d", snapshot.TasksInProgress),
			Timestamp: time.Now(),
			Metrics: map[string]interface{}{
				"tasks_in_progress": snapshot.TasksInProgress,
				"threshold":         thresholds.MaxTasksInProgress,
			},
		})
	}
	// Check node failures
	for nodeID, failures := range snapshot.NodeFailures {
		if failures > thresholds.MaxNodeFailures {
			m.triggerAlert(Alert{
				Type:      "node_failures",
				Severity:  "error",
				Message:   fmt.Sprintf("Node %s has %d failures", nodeID, failures),
				NodeID:    nodeID,
				Timestamp: time.Now(),
				Metrics: map[string]interface{}{
					"node_id":  nodeID,
					"failures": failures,
				},
			})
		}
	}
	// Check average execution time
	if snapshot.AverageExecutionTime > thresholds.MaxExecutionTime {
		m.triggerAlert(Alert{
			Type:      "slow_execution",
			Severity:  "warning",
			Message:   fmt.Sprintf("Average execution time is high: %v", snapshot.AverageExecutionTime),
			Timestamp: time.Now(),
			Metrics: map[string]interface{}{
				"average_execution_time": snapshot.AverageExecutionTime,
				"threshold":              thresholds.MaxExecutionTime,
			},
		})
	}
}

// triggerAlert sends alerts to all registered handlers
func (m *Monitor) triggerAlert(alert Alert) {
	m.logger.Warn("Alert triggered",
		logger.Field{Key: "type", Value: alert.Type},
		logger.Field{Key: "severity", Value: alert.Severity},
		logger.Field{Key: "message", Value: alert.Message},
	)
	// Snapshot the handler list under the lock so AddAlertHandler cannot race with the loop
	m.mu.RLock()
	handlers := append([]AlertHandler(nil), m.alertHandlers...)
	m.mu.RUnlock()
	for _, handler := range handlers {
		if err := handler.HandleAlert(alert); err != nil {
			m.logger.Error("Alert handler failed",
				logger.Field{Key: "error", Value: err.Error()},
			)
		}
	}
}
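
// exampleMonitorUsage is an illustrative wiring sketch (not part of the
// original file): it shows how a Monitor might be created, configured, fed
// task-level events, and shut down. The d, log and taskID values are assumed
// to come from the caller; real node/task recording would normally happen
// inside the DAG's processing path.
func exampleMonitorUsage(ctx context.Context, d *DAG, log logger.Logger, taskID string) {
	monitor := NewMonitor(d, log)
	// Custom handlers can be registered via monitor.AddAlertHandler(...),
	// e.g. the logAlertHandler sketch above.
	monitor.SetAlertThresholds(&AlertThresholds{
		MaxFailureRate:      0.2,
		MaxExecutionTime:    10 * time.Minute,
		MaxTasksInProgress:  500,
		MinSuccessRate:      0.8,
		MaxNodeFailures:     5,
		HealthCheckInterval: time.Minute,
	})
	monitor.Start(ctx)
	defer monitor.Stop()

	// Record one task lifecycle around the actual DAG processing (elided here).
	monitor.metrics.RecordTaskStart(taskID)
	monitor.metrics.RecordTaskCompletion(taskID, mq.Completed)

	_ = monitor.GetMetrics() // snapshot for dashboards or periodic logging
}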