Oarkflow
2025-07-30 12:29:04 +05:45
parent 2829e73450
commit d814019d73
10 changed files with 2356 additions and 507 deletions

@@ -71,17 +71,9 @@ func (m *MonitoringMetrics) RecordTaskCompletion(taskID string, status mq.Status
m.mu.Lock()
defer m.mu.Unlock()
if startTime, exists := m.ActiveTasks[taskID]; exists {
duration := time.Since(startTime)
m.TotalExecutionTime += duration
m.LastTaskCompletedAt = time.Now()
delete(m.ActiveTasks, taskID)
m.TasksInProgress--
// Update average execution time
if m.TasksCompleted > 0 {
m.AverageExecutionTime = m.TotalExecutionTime / time.Duration(m.TasksCompleted+1)
}
m.TasksInProgress--
if m.TasksInProgress < 0 {
m.TasksInProgress = 0
}
switch status {
@@ -92,6 +84,9 @@ func (m *MonitoringMetrics) RecordTaskCompletion(taskID string, status mq.Status
case mq.Cancelled:
m.TasksCancelled++
}
m.LastTaskCompletedAt = time.Now()
delete(m.ActiveTasks, taskID)
}
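
As an aside, a minimal standalone sketch of the pattern this hunk preserves: in-flight tasks are tracked by start time in a map, and a duration is derived when they complete. All names below are illustrative, not part of this commit.

package main

import (
	"fmt"
	"sync"
	"time"
)

// tracker mirrors the idea behind ActiveTasks: task ID -> start time.
type tracker struct {
	mu     sync.Mutex
	active map[string]time.Time
}

func (t *tracker) start(id string) {
	t.mu.Lock()
	defer t.mu.Unlock()
	t.active[id] = time.Now()
}

// complete removes the task and reports how long it was in flight.
func (t *tracker) complete(id string) (time.Duration, bool) {
	t.mu.Lock()
	defer t.mu.Unlock()
	started, ok := t.active[id]
	if !ok {
		return 0, false // unknown task; nothing to record
	}
	delete(t.active, id)
	return time.Since(started), true
}

func main() {
	t := &tracker{active: make(map[string]time.Time)}
	t.start("task-1")
	time.Sleep(10 * time.Millisecond)
	if d, ok := t.complete("task-1"); ok {
		fmt.Println("task-1 took", d)
	}
}
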
// RecordNodeExecution records node execution metrics
@@ -131,11 +126,27 @@ func (m *MonitoringMetrics) RecordNodeExecution(nodeID string, duration time.Dur
// Legacy tracking
m.NodesExecuted[nodeID]++
if len(m.NodeExecutionTimes[nodeID]) > 100 {
// Keep only last 100 execution times
m.NodeExecutionTimes[nodeID] = m.NodeExecutionTimes[nodeID][1:]
}
m.NodeExecutionTimes[nodeID] = append(m.NodeExecutionTimes[nodeID], duration)
// Keep only last 100 execution times per node to prevent memory bloat
if len(m.NodeExecutionTimes[nodeID]) > 100 {
m.NodeExecutionTimes[nodeID] = m.NodeExecutionTimes[nodeID][len(m.NodeExecutionTimes[nodeID])-100:]
}
// Calculate average execution time
var totalDuration time.Duration
var totalExecutions int64
for _, durations := range m.NodeExecutionTimes {
for _, d := range durations {
totalDuration += d
totalExecutions++
}
}
if totalExecutions > 0 {
m.AverageExecutionTime = totalDuration / time.Duration(totalExecutions)
}
m.TotalExecutionTime += duration
}
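
For reference, a standalone sketch of the bounded-history idiom introduced above: append the new duration, re-slice so only the most recent 100 entries remain, then average over what is left. Names are illustrative, not code from the repository.

package main

import (
	"fmt"
	"time"
)

const maxHistory = 100

// record appends a duration and trims the slice so only the most
// recent maxHistory entries are retained, bounding memory use.
func record(history []time.Duration, d time.Duration) []time.Duration {
	history = append(history, d)
	if len(history) > maxHistory {
		history = history[len(history)-maxHistory:]
	}
	return history
}

// average computes the mean over whatever history remains.
func average(history []time.Duration) time.Duration {
	if len(history) == 0 {
		return 0
	}
	var total time.Duration
	for _, d := range history {
		total += d
	}
	return total / time.Duration(len(history))
}

func main() {
	var history []time.Duration
	for i := 0; i < 250; i++ {
		history = record(history, time.Duration(i)*time.Millisecond)
	}
	fmt.Println(len(history), average(history)) // 100 entries, averaged over the last 100
}

Re-slicing keeps the old backing array alive until a later append reallocates it, but for 100 duration values that overhead is negligible.
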
// RecordNodeStart records when a node starts processing
@@ -145,6 +156,10 @@ func (m *MonitoringMetrics) RecordNodeStart(nodeID string) {
if stats, exists := m.NodeProcessingStats[nodeID]; exists {
stats.CurrentlyRunning++
} else {
m.NodeProcessingStats[nodeID] = &NodeStats{
CurrentlyRunning: 1,
}
}
}
@@ -153,8 +168,11 @@ func (m *MonitoringMetrics) RecordNodeEnd(nodeID string) {
m.mu.Lock()
defer m.mu.Unlock()
if stats, exists := m.NodeProcessingStats[nodeID]; exists && stats.CurrentlyRunning > 0 {
if stats, exists := m.NodeProcessingStats[nodeID]; exists {
stats.CurrentlyRunning--
if stats.CurrentlyRunning < 0 {
stats.CurrentlyRunning = 0
}
}
}
@@ -190,24 +208,14 @@ func (m *MonitoringMetrics) GetSnapshot() *MonitoringMetrics {
for k, v := range m.ActiveTasks {
snapshot.ActiveTasks[k] = v
}
for k, v := range m.NodeExecutionTimes {
snapshot.NodeExecutionTimes[k] = make([]time.Duration, len(v))
copy(snapshot.NodeExecutionTimes[k], v)
}
for k, v := range m.NodeProcessingStats {
snapshot.NodeProcessingStats[k] = &NodeStats{
ExecutionCount: v.ExecutionCount,
SuccessCount: v.SuccessCount,
FailureCount: v.FailureCount,
TotalDuration: v.TotalDuration,
AverageDuration: v.AverageDuration,
MinDuration: v.MinDuration,
MaxDuration: v.MaxDuration,
LastExecuted: v.LastExecuted,
LastSuccess: v.LastSuccess,
LastFailure: v.LastFailure,
CurrentlyRunning: v.CurrentlyRunning,
}
statsCopy := *v
snapshot.NodeProcessingStats[k] = &statsCopy
}
for k, v := range m.NodeExecutionTimes {
timesCopy := make([]time.Duration, len(v))
copy(timesCopy, v)
snapshot.NodeExecutionTimes[k] = timesCopy
}
return snapshot
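
The new statsCopy := *v form works because NodeStats appears to hold only value-typed fields (counters, durations, timestamps), so dereferencing and re-addressing produces an independent copy; if the struct ever gains slice, map, or pointer fields, the copy would share them. A small standalone sketch of the idiom, with illustrative types:

package main

import (
	"fmt"
	"time"
)

// stats stands in for a value-only struct such as NodeStats.
type stats struct {
	ExecutionCount int64
	TotalDuration  time.Duration
}

func snapshot(src map[string]*stats) map[string]*stats {
	out := make(map[string]*stats, len(src))
	for k, v := range src {
		copyV := *v     // copies all value fields
		out[k] = &copyV // store a pointer to the independent copy
	}
	return out
}

func main() {
	live := map[string]*stats{"node-a": {ExecutionCount: 1}}
	snap := snapshot(live)
	live["node-a"].ExecutionCount = 99
	fmt.Println(snap["node-a"].ExecutionCount) // still 1: the snapshot is unaffected
}
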
@@ -219,45 +227,32 @@ func (m *MonitoringMetrics) GetNodeStats(nodeID string) *NodeStats {
defer m.mu.RUnlock()
if stats, exists := m.NodeProcessingStats[nodeID]; exists {
// Return a copy
return &NodeStats{
ExecutionCount: stats.ExecutionCount,
SuccessCount: stats.SuccessCount,
FailureCount: stats.FailureCount,
TotalDuration: stats.TotalDuration,
AverageDuration: stats.AverageDuration,
MinDuration: stats.MinDuration,
MaxDuration: stats.MaxDuration,
LastExecuted: stats.LastExecuted,
LastSuccess: stats.LastSuccess,
LastFailure: stats.LastFailure,
CurrentlyRunning: stats.CurrentlyRunning,
}
statsCopy := *stats
return &statsCopy
}
return nil
}
// Monitor provides comprehensive monitoring capabilities for DAG
type Monitor struct {
dag *DAG
metrics *MonitoringMetrics
logger logger.Logger
alertThresholds *AlertThresholds
webhookURL string
alertHandlers []AlertHandler
monitoringActive bool
stopCh chan struct{}
mu sync.RWMutex
dag *DAG
metrics *MonitoringMetrics
logger logger.Logger
thresholds *AlertThresholds
handlers []AlertHandler
stopCh chan struct{}
running bool
mu sync.RWMutex
}
// AlertThresholds defines thresholds for alerting
type AlertThresholds struct {
MaxFailureRate float64 // Maximum allowed failure rate (0.0 - 1.0)
MaxExecutionTime time.Duration // Maximum allowed execution time
MaxTasksInProgress int64 // Maximum allowed concurrent tasks
MinSuccessRate float64 // Minimum required success rate
MaxNodeFailures int64 // Maximum failures per node
HealthCheckInterval time.Duration // How often to check health
MaxFailureRate float64 `json:"max_failure_rate"`
MaxExecutionTime time.Duration `json:"max_execution_time"`
MaxTasksInProgress int64 `json:"max_tasks_in_progress"`
MinSuccessRate float64 `json:"min_success_rate"`
MaxNodeFailures int64 `json:"max_node_failures"`
HealthCheckInterval time.Duration `json:"health_check_interval"`
}
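
With the JSON tags added here, the thresholds can be serialized for configuration or debugging endpoints; note that encoding/json renders time.Duration fields as integer nanoseconds by default. A sketch, with the struct mirrored locally purely so the example stands alone:

package main

import (
	"encoding/json"
	"fmt"
	"time"
)

// thresholds mirrors AlertThresholds above, for illustration only.
type thresholds struct {
	MaxFailureRate      float64       `json:"max_failure_rate"`
	MaxExecutionTime    time.Duration `json:"max_execution_time"`
	MaxTasksInProgress  int64         `json:"max_tasks_in_progress"`
	MinSuccessRate      float64       `json:"min_success_rate"`
	MaxNodeFailures     int64         `json:"max_node_failures"`
	HealthCheckInterval time.Duration `json:"health_check_interval"`
}

func main() {
	t := thresholds{
		MaxFailureRate:      0.1,
		MaxExecutionTime:    5 * time.Minute,
		MaxTasksInProgress:  1000,
		MinSuccessRate:      0.9,
		MaxNodeFailures:     10,
		HealthCheckInterval: 30 * time.Second,
	}
	b, _ := json.Marshal(t)
	// Durations serialize as nanosecond integers, e.g. 300000000000 for 5m.
	fmt.Println(string(b))
}
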
// AlertHandler defines interface for handling alerts
@@ -267,44 +262,66 @@ type AlertHandler interface {
// Alert represents a monitoring alert
type Alert struct {
Type string
Severity string
Message string
NodeID string
TaskID string
Timestamp time.Time
Metrics map[string]interface{}
ID string `json:"id"`
Timestamp time.Time `json:"timestamp"`
Severity AlertSeverity `json:"severity"`
Type AlertType `json:"type"`
Message string `json:"message"`
Details map[string]interface{} `json:"details"`
NodeID string `json:"node_id,omitempty"`
TaskID string `json:"task_id,omitempty"`
Threshold interface{} `json:"threshold,omitempty"`
ActualValue interface{} `json:"actual_value,omitempty"`
}
type AlertSeverity string
const (
AlertSeverityInfo AlertSeverity = "info"
AlertSeverityWarning AlertSeverity = "warning"
AlertSeverityCritical AlertSeverity = "critical"
)
type AlertType string
const (
AlertTypeFailureRate AlertType = "failure_rate"
AlertTypeExecutionTime AlertType = "execution_time"
AlertTypeTaskLoad AlertType = "task_load"
AlertTypeNodeFailures AlertType = "node_failures"
AlertTypeCircuitBreaker AlertType = "circuit_breaker"
AlertTypeHealthCheck AlertType = "health_check"
)
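
The AlertHandler interface itself is outside this hunk, but triggerAlert below calls HandleAlert(alert) and checks a returned error, so an implementation presumably looks like the sketch here. The types are mirrored minimally so the example compiles on its own; all names are illustrative.

package main

import (
	"fmt"
	"time"
)

// Minimal mirrors of the types above, only so this sketch is self-contained.
type alert struct {
	ID        string
	Severity  string
	Type      string
	Message   string
	Timestamp time.Time
}

type alertHandler interface {
	HandleAlert(a alert) error
}

// logHandler is the simplest possible handler: print and never fail.
// Since triggerAlert dispatches each alert on its own goroutine,
// real handlers should be safe for concurrent use.
type logHandler struct{}

func (logHandler) HandleAlert(a alert) error {
	fmt.Printf("[%s] %s: %s\n", a.Severity, a.Type, a.Message)
	return nil
}

func main() {
	var h alertHandler = logHandler{}
	_ = h.HandleAlert(alert{
		ID:        "demo",
		Severity:  "warning",
		Type:      "task_load",
		Message:   "High task load detected",
		Timestamp: time.Now(),
	})
}
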
// NewMonitor creates a new DAG monitor
func NewMonitor(dag *DAG, logger logger.Logger) *Monitor {
return &Monitor{
dag: dag,
metrics: NewMonitoringMetrics(),
logger: logger,
alertThresholds: &AlertThresholds{
MaxFailureRate: 0.1, // 10% failure rate
thresholds: &AlertThresholds{
MaxFailureRate: 0.1, // 10%
MaxExecutionTime: 5 * time.Minute,
MaxTasksInProgress: 1000,
MinSuccessRate: 0.9, // 90% success rate
MinSuccessRate: 0.9, // 90%
MaxNodeFailures: 10,
HealthCheckInterval: 30 * time.Second,
},
stopCh: make(chan struct{}),
handlers: make([]AlertHandler, 0),
stopCh: make(chan struct{}),
}
}
// Start begins monitoring
func (m *Monitor) Start(ctx context.Context) {
m.mu.Lock()
if m.monitoringActive {
m.mu.Unlock()
defer m.mu.Unlock()
if m.running {
return
}
m.monitoringActive = true
m.mu.Unlock()
// Start health check routine
m.running = true
go m.healthCheckRoutine(ctx)
m.logger.Info("DAG monitoring started")
@@ -315,12 +332,13 @@ func (m *Monitor) Stop() {
m.mu.Lock()
defer m.mu.Unlock()
if !m.monitoringActive {
if !m.running {
return
}
m.running = false
close(m.stopCh)
m.monitoringActive = false
m.logger.Info("DAG monitoring stopped")
}
@@ -328,14 +346,14 @@ func (m *Monitor) Stop() {
func (m *Monitor) SetAlertThresholds(thresholds *AlertThresholds) {
m.mu.Lock()
defer m.mu.Unlock()
m.alertThresholds = thresholds
m.thresholds = thresholds
}
// AddAlertHandler adds an alert handler
func (m *Monitor) AddAlertHandler(handler AlertHandler) {
m.mu.Lock()
defer m.mu.Unlock()
m.alertHandlers = append(m.alertHandlers, handler)
m.handlers = append(m.handlers, handler)
}
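
Putting the pieces together, wiring the monitor up might look like the sketch below, written as if it lived in the same package; setupMonitoring and myHandler are illustrative names, and the *DAG and logger.Logger are assumed to be constructed elsewhere.

func setupMonitoring(ctx context.Context, d *DAG, lg logger.Logger) *Monitor {
	m := NewMonitor(d, lg)

	// Tighten the defaults shown in NewMonitor above.
	m.SetAlertThresholds(&AlertThresholds{
		MaxFailureRate:      0.05,
		MaxExecutionTime:    2 * time.Minute,
		MaxTasksInProgress:  500,
		MinSuccessRate:      0.95,
		MaxNodeFailures:     5,
		HealthCheckInterval: 15 * time.Second,
	})

	// Register handlers before starting; triggerAlert fans each alert
	// out to every registered handler.
	m.AddAlertHandler(myHandler{}) // myHandler: any AlertHandler implementation

	m.Start(ctx)
	return m // call m.Stop() during shutdown
}
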
// GetMetrics returns current metrics
@@ -345,7 +363,7 @@ func (m *Monitor) GetMetrics() *MonitoringMetrics {
// healthCheckRoutine performs periodic health checks
func (m *Monitor) healthCheckRoutine(ctx context.Context) {
ticker := time.NewTicker(m.alertThresholds.HealthCheckInterval)
ticker := time.NewTicker(m.thresholds.HealthCheckInterval)
defer ticker.Stop()
for {
@@ -362,50 +380,57 @@ func (m *Monitor) healthCheckRoutine(ctx context.Context) {
// performHealthCheck checks system health and triggers alerts
func (m *Monitor) performHealthCheck() {
snapshot := m.metrics.GetSnapshot()
metrics := m.GetMetrics()
// Check failure rate
if snapshot.TasksTotal > 0 {
failureRate := float64(snapshot.TasksFailed) / float64(snapshot.TasksTotal)
if failureRate > m.alertThresholds.MaxFailureRate {
if metrics.TasksTotal > 0 {
failureRate := float64(metrics.TasksFailed) / float64(metrics.TasksTotal)
if failureRate > m.thresholds.MaxFailureRate {
m.triggerAlert(Alert{
Type: "high_failure_rate",
Severity: "warning",
Message: fmt.Sprintf("High failure rate: %.2f%%", failureRate*100),
Timestamp: time.Now(),
Metrics: map[string]interface{}{
"failure_rate": failureRate,
"total_tasks": snapshot.TasksTotal,
"failed_tasks": snapshot.TasksFailed,
ID: mq.NewID(),
Timestamp: time.Now(),
Severity: AlertSeverityCritical,
Type: AlertTypeFailureRate,
Message: "High failure rate detected",
Threshold: m.thresholds.MaxFailureRate,
ActualValue: failureRate,
Details: map[string]interface{}{
"failed_tasks": metrics.TasksFailed,
"total_tasks": metrics.TasksTotal,
},
})
}
}
// Check tasks in progress
if snapshot.TasksInProgress > m.alertThresholds.MaxTasksInProgress {
// Check task load
if metrics.TasksInProgress > m.thresholds.MaxTasksInProgress {
m.triggerAlert(Alert{
Type: "high_task_load",
Severity: "warning",
Message: fmt.Sprintf("High number of tasks in progress: %d", snapshot.TasksInProgress),
Timestamp: time.Now(),
Metrics: map[string]interface{}{
"tasks_in_progress": snapshot.TasksInProgress,
"threshold": m.alertThresholds.MaxTasksInProgress,
ID: mq.NewID(),
Timestamp: time.Now(),
Severity: AlertSeverityWarning,
Type: AlertTypeTaskLoad,
Message: "High task load detected",
Threshold: m.thresholds.MaxTasksInProgress,
ActualValue: metrics.TasksInProgress,
Details: map[string]interface{}{
"tasks_in_progress": metrics.TasksInProgress,
},
})
}
// Check node failures
for nodeID, failures := range snapshot.NodeFailures {
if failures > m.alertThresholds.MaxNodeFailures {
for nodeID, failures := range metrics.NodeFailures {
if failures > m.thresholds.MaxNodeFailures {
m.triggerAlert(Alert{
Type: "node_failures",
Severity: "error",
Message: fmt.Sprintf("Node %s has %d failures", nodeID, failures),
NodeID: nodeID,
Timestamp: time.Now(),
Metrics: map[string]interface{}{
ID: mq.NewID(),
Timestamp: time.Now(),
Severity: AlertSeverityCritical,
Type: AlertTypeNodeFailures,
Message: fmt.Sprintf("Node %s has too many failures", nodeID),
NodeID: nodeID,
Threshold: m.thresholds.MaxNodeFailures,
ActualValue: failures,
Details: map[string]interface{}{
"node_id": nodeID,
"failures": failures,
},
@@ -414,15 +439,17 @@ func (m *Monitor) performHealthCheck() {
}
// Check execution time
if snapshot.AverageExecutionTime > m.alertThresholds.MaxExecutionTime {
if metrics.AverageExecutionTime > m.thresholds.MaxExecutionTime {
m.triggerAlert(Alert{
Type: "slow_execution",
Severity: "warning",
Message: fmt.Sprintf("Average execution time is high: %v", snapshot.AverageExecutionTime),
Timestamp: time.Now(),
Metrics: map[string]interface{}{
"average_execution_time": snapshot.AverageExecutionTime,
"threshold": m.alertThresholds.MaxExecutionTime,
ID: mq.NewID(),
Timestamp: time.Now(),
Severity: AlertSeverityWarning,
Type: AlertTypeExecutionTime,
Message: "Average execution time is too high",
Threshold: m.thresholds.MaxExecutionTime,
ActualValue: metrics.AverageExecutionTime,
Details: map[string]interface{}{
"average_execution_time": metrics.AverageExecutionTime.String(),
},
})
}
@@ -431,16 +458,20 @@ func (m *Monitor) performHealthCheck() {
// triggerAlert sends alerts to all registered handlers
func (m *Monitor) triggerAlert(alert Alert) {
m.logger.Warn("Alert triggered",
logger.Field{Key: "type", Value: alert.Type},
logger.Field{Key: "severity", Value: alert.Severity},
logger.Field{Key: "alert_id", Value: alert.ID},
logger.Field{Key: "type", Value: string(alert.Type)},
logger.Field{Key: "severity", Value: string(alert.Severity)},
logger.Field{Key: "message", Value: alert.Message},
)
for _, handler := range m.alertHandlers {
if err := handler.HandleAlert(alert); err != nil {
m.logger.Error("Alert handler failed",
logger.Field{Key: "error", Value: err.Error()},
)
}
for _, handler := range m.handlers {
go func(h AlertHandler, a Alert) {
if err := h.HandleAlert(a); err != nil {
m.logger.Error("Alert handler error",
logger.Field{Key: "error", Value: err.Error()},
logger.Field{Key: "alert_id", Value: a.ID},
)
}
}(handler, alert)
}
}
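
Dispatching each handler on its own goroutine means a slow handler no longer blocks the health-check loop or the other handlers, at the cost that handlers may run concurrently and alerts can be handled out of order. A handler written with that in mind, sketched as same-package code with illustrative names:

// countingHandler tolerates the concurrent dispatch above by guarding
// its state with a mutex (illustrative only, not part of this commit).
type countingHandler struct {
	mu    sync.Mutex
	count map[AlertType]int
}

func (h *countingHandler) HandleAlert(a Alert) error {
	h.mu.Lock()
	defer h.mu.Unlock()
	if h.count == nil {
		h.count = make(map[AlertType]int)
	}
	h.count[a.Type]++
	return nil
}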