package dag

import (
	"context"
	"fmt"
	"sync"
	"time"

	"github.com/oarkflow/mq"
	"github.com/oarkflow/mq/logger"
)

// MonitoringMetrics holds comprehensive metrics for DAG monitoring
type MonitoringMetrics struct {
	mu                   sync.RWMutex
	TasksTotal           int64
	TasksCompleted       int64
	TasksFailed          int64
	TasksCancelled       int64
	TasksInProgress      int64
	NodesExecuted        map[string]int64
	NodeExecutionTimes   map[string][]time.Duration
	NodeFailures         map[string]int64
	AverageExecutionTime time.Duration
	TotalExecutionTime   time.Duration
	StartTime            time.Time
	LastTaskCompletedAt  time.Time
	ActiveTasks          map[string]time.Time
	NodeProcessingStats  map[string]*NodeStats
}

// NodeStats holds statistics for individual nodes
type NodeStats struct {
	ExecutionCount   int64
	SuccessCount     int64
	FailureCount     int64
	TotalDuration    time.Duration
	AverageDuration  time.Duration
	MinDuration      time.Duration
	MaxDuration      time.Duration
	LastExecuted     time.Time
	LastSuccess      time.Time
	LastFailure      time.Time
	CurrentlyRunning int64
}

// NewMonitoringMetrics creates a new metrics instance
func NewMonitoringMetrics() *MonitoringMetrics {
	return &MonitoringMetrics{
		NodesExecuted:       make(map[string]int64),
		NodeExecutionTimes:  make(map[string][]time.Duration),
		NodeFailures:        make(map[string]int64),
		StartTime:           time.Now(),
		ActiveTasks:         make(map[string]time.Time),
		NodeProcessingStats: make(map[string]*NodeStats),
	}
}

// RecordTaskStart records the start of a task
func (m *MonitoringMetrics) RecordTaskStart(taskID string) {
	m.mu.Lock()
	defer m.mu.Unlock()

	m.TasksTotal++
	m.TasksInProgress++
	m.ActiveTasks[taskID] = time.Now()
}

// RecordTaskCompletion records a task reaching a terminal state
// (completed, failed, or cancelled).
func (m *MonitoringMetrics) RecordTaskCompletion(taskID string, status mq.Status) {
	m.mu.Lock()
	defer m.mu.Unlock()

	if startTime, exists := m.ActiveTasks[taskID]; exists {
		duration := time.Since(startTime)
		m.TotalExecutionTime += duration
		m.LastTaskCompletedAt = time.Now()
		delete(m.ActiveTasks, taskID)
		m.TasksInProgress--
	}

	switch status {
	case mq.Completed:
		m.TasksCompleted++
	case mq.Failed:
		m.TasksFailed++
	case mq.Cancelled:
		m.TasksCancelled++
	}

	// Update the running average over all tasks that have reached a
	// terminal state, after the counters above have been updated.
	if finished := m.TasksCompleted + m.TasksFailed + m.TasksCancelled; finished > 0 {
		m.AverageExecutionTime = m.TotalExecutionTime / time.Duration(finished)
	}
}

// RecordNodeExecution records node execution metrics
func (m *MonitoringMetrics) RecordNodeExecution(nodeID string, duration time.Duration, success bool) {
	m.mu.Lock()
	defer m.mu.Unlock()

	// Initialize node stats on first execution
	if _, exists := m.NodeProcessingStats[nodeID]; !exists {
		m.NodeProcessingStats[nodeID] = &NodeStats{
			MinDuration: duration,
			MaxDuration: duration,
		}
	}

	stats := m.NodeProcessingStats[nodeID]
	stats.ExecutionCount++
	stats.TotalDuration += duration
	stats.AverageDuration = stats.TotalDuration / time.Duration(stats.ExecutionCount)
	stats.LastExecuted = time.Now()

	if duration < stats.MinDuration || stats.MinDuration == 0 {
		stats.MinDuration = duration
	}
	if duration > stats.MaxDuration {
		stats.MaxDuration = duration
	}

	if success {
		stats.SuccessCount++
		stats.LastSuccess = time.Now()
	} else {
		stats.FailureCount++
		stats.LastFailure = time.Now()
		m.NodeFailures[nodeID]++
	}

	// Legacy tracking: keep only the last 100 execution times per node.
	m.NodesExecuted[nodeID]++
	if len(m.NodeExecutionTimes[nodeID]) >= 100 {
		m.NodeExecutionTimes[nodeID] = m.NodeExecutionTimes[nodeID][1:]
	}
	m.NodeExecutionTimes[nodeID] = append(m.NodeExecutionTimes[nodeID], duration)
}
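
// recentAverage is an illustrative helper (a sketch, not part of the original
// API) showing how the last-100 window kept by RecordNodeExecution can be
// consumed; it returns the mean of the retained execution times for a node.
func (m *MonitoringMetrics) recentAverage(nodeID string) time.Duration {
	m.mu.RLock()
	defer m.mu.RUnlock()

	times := m.NodeExecutionTimes[nodeID]
	if len(times) == 0 {
		return 0
	}
	var total time.Duration
	for _, d := range times {
		total += d
	}
	return total / time.Duration(len(times))
}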

// RecordNodeStart records when a node starts processing
func (m *MonitoringMetrics) RecordNodeStart(nodeID string) {
	m.mu.Lock()
	defer m.mu.Unlock()

	// Create the stats entry the first time a node is seen so that starts
	// occurring before the first RecordNodeExecution are not dropped.
	if _, exists := m.NodeProcessingStats[nodeID]; !exists {
		m.NodeProcessingStats[nodeID] = &NodeStats{}
	}
	m.NodeProcessingStats[nodeID].CurrentlyRunning++
}

// RecordNodeEnd records when a node finishes processing
func (m *MonitoringMetrics) RecordNodeEnd(nodeID string) {
	m.mu.Lock()
	defer m.mu.Unlock()

	if stats, exists := m.NodeProcessingStats[nodeID]; exists && stats.CurrentlyRunning > 0 {
		stats.CurrentlyRunning--
	}
}
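
// exampleMetricsFlow sketches the intended call sequence for recording one
// task (illustrative only; in practice the DAG runtime drives these calls,
// and the task and node IDs below are hypothetical).
func exampleMetricsFlow(metrics *MonitoringMetrics) {
	metrics.RecordTaskStart("task-1")

	metrics.RecordNodeStart("node-a")
	metrics.RecordNodeExecution("node-a", 150*time.Millisecond, true)
	metrics.RecordNodeEnd("node-a")

	metrics.RecordTaskCompletion("task-1", mq.Completed)
}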

// GetSnapshot returns a snapshot of current metrics
func (m *MonitoringMetrics) GetSnapshot() *MonitoringMetrics {
	m.mu.RLock()
	defer m.mu.RUnlock()

	snapshot := &MonitoringMetrics{
		TasksTotal:           m.TasksTotal,
		TasksCompleted:       m.TasksCompleted,
		TasksFailed:          m.TasksFailed,
		TasksCancelled:       m.TasksCancelled,
		TasksInProgress:      m.TasksInProgress,
		AverageExecutionTime: m.AverageExecutionTime,
		TotalExecutionTime:   m.TotalExecutionTime,
		StartTime:            m.StartTime,
		LastTaskCompletedAt:  m.LastTaskCompletedAt,
		NodesExecuted:        make(map[string]int64),
		NodeExecutionTimes:   make(map[string][]time.Duration),
		NodeFailures:         make(map[string]int64),
		ActiveTasks:          make(map[string]time.Time),
		NodeProcessingStats:  make(map[string]*NodeStats),
	}

	// Deep copy maps so callers cannot mutate live state
	for k, v := range m.NodesExecuted {
		snapshot.NodesExecuted[k] = v
	}
	for k, v := range m.NodeFailures {
		snapshot.NodeFailures[k] = v
	}
	for k, v := range m.ActiveTasks {
		snapshot.ActiveTasks[k] = v
	}
	for k, v := range m.NodeExecutionTimes {
		snapshot.NodeExecutionTimes[k] = make([]time.Duration, len(v))
		copy(snapshot.NodeExecutionTimes[k], v)
	}
	for k, v := range m.NodeProcessingStats {
		// NodeStats has no reference fields, so a value copy is a deep copy.
		statsCopy := *v
		snapshot.NodeProcessingStats[k] = &statsCopy
	}

	return snapshot
}
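
// successRate is an illustrative helper (a sketch, not part of the original
// API) that derives an overall success rate from a snapshot; "finished"
// counts every terminal state so in-flight tasks do not skew the ratio.
func successRate(s *MonitoringMetrics) float64 {
	finished := s.TasksCompleted + s.TasksFailed + s.TasksCancelled
	if finished == 0 {
		return 1.0
	}
	return float64(s.TasksCompleted) / float64(finished)
}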

// GetNodeStats returns a copy of the statistics for a specific node, or nil
// if the node has not been recorded.
func (m *MonitoringMetrics) GetNodeStats(nodeID string) *NodeStats {
	m.mu.RLock()
	defer m.mu.RUnlock()

	if stats, exists := m.NodeProcessingStats[nodeID]; exists {
		statsCopy := *stats
		return &statsCopy
	}
	return nil
}

// Monitor provides comprehensive monitoring capabilities for a DAG
type Monitor struct {
	dag              *DAG
	metrics          *MonitoringMetrics
	logger           logger.Logger
	alertThresholds  *AlertThresholds
	webhookURL       string
	alertHandlers    []AlertHandler
	monitoringActive bool
	stopCh           chan struct{}
	mu               sync.RWMutex
}

// AlertThresholds defines thresholds for alerting
type AlertThresholds struct {
	MaxFailureRate      float64       // Maximum allowed failure rate (0.0 - 1.0)
	MaxExecutionTime    time.Duration // Maximum allowed average execution time
	MaxTasksInProgress  int64         // Maximum allowed concurrent tasks
	MinSuccessRate      float64       // Minimum required success rate
	MaxNodeFailures     int64         // Maximum failures per node
	HealthCheckInterval time.Duration // How often to check health
}

// AlertHandler defines interface for handling alerts
type AlertHandler interface {
	HandleAlert(alert Alert) error
}
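
// LogAlertHandler is a minimal, illustrative AlertHandler implementation (a
// sketch, not part of the original API) that simply forwards alerts to the
// provided logger.
type LogAlertHandler struct {
	Logger logger.Logger
}

func (h *LogAlertHandler) HandleAlert(alert Alert) error {
	h.Logger.Warn("alert received",
		logger.Field{Key: "type", Value: alert.Type},
		logger.Field{Key: "severity", Value: alert.Severity},
		logger.Field{Key: "message", Value: alert.Message},
	)
	return nil
}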

// Alert represents a monitoring alert
type Alert struct {
	Type      string
	Severity  string
	Message   string
	NodeID    string
	TaskID    string
	Timestamp time.Time
	Metrics   map[string]interface{}
}

// NewMonitor creates a new DAG monitor
func NewMonitor(dag *DAG, logger logger.Logger) *Monitor {
	return &Monitor{
		dag:     dag,
		metrics: NewMonitoringMetrics(),
		logger:  logger,
		alertThresholds: &AlertThresholds{
			MaxFailureRate:      0.1, // 10% failure rate
			MaxExecutionTime:    5 * time.Minute,
			MaxTasksInProgress:  1000,
			MinSuccessRate:      0.9, // 90% success rate
			MaxNodeFailures:     10,
			HealthCheckInterval: 30 * time.Second,
		},
		stopCh: make(chan struct{}),
	}
}
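
// exampleMonitorSetup sketches how a monitor might be wired up (illustrative
// only; the *DAG and logger.Logger values are assumed to be supplied by the
// caller, and the threshold values are arbitrary).
func exampleMonitorSetup(ctx context.Context, d *DAG, log logger.Logger) *Monitor {
	monitor := NewMonitor(d, log)
	monitor.SetAlertThresholds(&AlertThresholds{
		MaxFailureRate:      0.05,
		MaxExecutionTime:    2 * time.Minute,
		MaxTasksInProgress:  500,
		MinSuccessRate:      0.95,
		MaxNodeFailures:     5,
		HealthCheckInterval: 15 * time.Second,
	})
	monitor.AddAlertHandler(&LogAlertHandler{Logger: log})
	monitor.Start(ctx)
	return monitor
}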

// Start begins monitoring; calling Start on an already-active monitor is a no-op
func (m *Monitor) Start(ctx context.Context) {
	m.mu.Lock()
	if m.monitoringActive {
		m.mu.Unlock()
		return
	}
	m.monitoringActive = true
	// Recreate the stop channel so the monitor can be restarted after Stop.
	m.stopCh = make(chan struct{})
	stopCh := m.stopCh
	m.mu.Unlock()

	// Start the health check routine
	go m.healthCheckRoutine(ctx, stopCh)

	m.logger.Info("DAG monitoring started")
}

// Stop stops monitoring
func (m *Monitor) Stop() {
	m.mu.Lock()
	defer m.mu.Unlock()

	if !m.monitoringActive {
		return
	}

	close(m.stopCh)
	m.monitoringActive = false
	m.logger.Info("DAG monitoring stopped")
}

// SetAlertThresholds updates alert thresholds
func (m *Monitor) SetAlertThresholds(thresholds *AlertThresholds) {
	m.mu.Lock()
	defer m.mu.Unlock()
	m.alertThresholds = thresholds
}

// AddAlertHandler adds an alert handler
func (m *Monitor) AddAlertHandler(handler AlertHandler) {
	m.mu.Lock()
	defer m.mu.Unlock()
	m.alertHandlers = append(m.alertHandlers, handler)
}

// GetMetrics returns a snapshot of the current metrics
func (m *Monitor) GetMetrics() *MonitoringMetrics {
	return m.metrics.GetSnapshot()
}

// healthCheckRoutine performs periodic health checks until the context is
// cancelled or stopCh is closed
func (m *Monitor) healthCheckRoutine(ctx context.Context, stopCh <-chan struct{}) {
	// Read the interval under the lock so it cannot race with SetAlertThresholds.
	m.mu.RLock()
	interval := m.alertThresholds.HealthCheckInterval
	m.mu.RUnlock()

	ticker := time.NewTicker(interval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return
		case <-stopCh:
			return
		case <-ticker.C:
			m.performHealthCheck()
		}
	}
}

// performHealthCheck checks system health against the configured thresholds
// and triggers alerts
func (m *Monitor) performHealthCheck() {
	// Copy the thresholds under the lock so the checks below cannot race
	// with SetAlertThresholds.
	m.mu.RLock()
	thresholds := *m.alertThresholds
	m.mu.RUnlock()

	snapshot := m.metrics.GetSnapshot()

	// Check failure rate
	if snapshot.TasksTotal > 0 {
		failureRate := float64(snapshot.TasksFailed) / float64(snapshot.TasksTotal)
		if failureRate > thresholds.MaxFailureRate {
			m.triggerAlert(Alert{
				Type:      "high_failure_rate",
				Severity:  "warning",
				Message:   fmt.Sprintf("High failure rate: %.2f%%", failureRate*100),
				Timestamp: time.Now(),
				Metrics: map[string]interface{}{
					"failure_rate": failureRate,
					"total_tasks":  snapshot.TasksTotal,
					"failed_tasks": snapshot.TasksFailed,
				},
			})
		}
	}

	// Check tasks in progress
	if snapshot.TasksInProgress > thresholds.MaxTasksInProgress {
		m.triggerAlert(Alert{
			Type:      "high_task_load",
			Severity:  "warning",
			Message:   fmt.Sprintf("High number of tasks in progress: %d", snapshot.TasksInProgress),
			Timestamp: time.Now(),
			Metrics: map[string]interface{}{
				"tasks_in_progress": snapshot.TasksInProgress,
				"threshold":         thresholds.MaxTasksInProgress,
			},
		})
	}

	// Check per-node failures
	for nodeID, failures := range snapshot.NodeFailures {
		if failures > thresholds.MaxNodeFailures {
			m.triggerAlert(Alert{
				Type:      "node_failures",
				Severity:  "error",
				Message:   fmt.Sprintf("Node %s has %d failures", nodeID, failures),
				NodeID:    nodeID,
				Timestamp: time.Now(),
				Metrics: map[string]interface{}{
					"node_id":  nodeID,
					"failures": failures,
				},
			})
		}
	}

	// Check average execution time
	if snapshot.AverageExecutionTime > thresholds.MaxExecutionTime {
		m.triggerAlert(Alert{
			Type:      "slow_execution",
			Severity:  "warning",
			Message:   fmt.Sprintf("Average execution time is high: %v", snapshot.AverageExecutionTime),
			Timestamp: time.Now(),
			Metrics: map[string]interface{}{
				"average_execution_time": snapshot.AverageExecutionTime,
				"threshold":              thresholds.MaxExecutionTime,
			},
		})
	}
}

// triggerAlert logs the alert and fans it out to all registered handlers
func (m *Monitor) triggerAlert(alert Alert) {
	m.logger.Warn("Alert triggered",
		logger.Field{Key: "type", Value: alert.Type},
		logger.Field{Key: "severity", Value: alert.Severity},
		logger.Field{Key: "message", Value: alert.Message},
	)

	// Copy the handler slice under the read lock so AddAlertHandler cannot
	// race with the fan-out below.
	m.mu.RLock()
	handlers := make([]AlertHandler, len(m.alertHandlers))
	copy(handlers, m.alertHandlers)
	m.mu.RUnlock()

	for _, handler := range handlers {
		if err := handler.HandleAlert(alert); err != nil {
			m.logger.Error("Alert handler failed",
				logger.Field{Key: "error", Value: err.Error()},
			)
		}
	}
}