mirror of
https://github.com/oarkflow/mq.git
synced 2025-10-05 07:57:00 +08:00
563 lines
14 KiB
Go
563 lines
14 KiB
Go
package mq
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/oarkflow/mq/logger"
|
|
)
|
|
|
|
// TraceEvent represents a single event in a message's lifecycle
|
|
type TraceEvent struct {
|
|
Timestamp time.Time `json:"timestamp"`
|
|
EventType TraceEventType `json:"event_type"`
|
|
Source string `json:"source"`
|
|
Details map[string]string `json:"details,omitempty"`
|
|
Duration time.Duration `json:"duration,omitempty"`
|
|
Error string `json:"error,omitempty"`
|
|
}
|
|
|
|
// TraceEventType defines types of trace events
|
|
type TraceEventType string
|
|
|
|
const (
|
|
TraceEventEnqueued TraceEventType = "ENQUEUED"
|
|
TraceEventDequeued TraceEventType = "DEQUEUED"
|
|
TraceEventProcessing TraceEventType = "PROCESSING"
|
|
TraceEventCompleted TraceEventType = "COMPLETED"
|
|
TraceEventFailed TraceEventType = "FAILED"
|
|
TraceEventRetried TraceEventType = "RETRIED"
|
|
TraceEventRejected TraceEventType = "REJECTED"
|
|
TraceEventAcked TraceEventType = "ACKED"
|
|
TraceEventNacked TraceEventType = "NACKED"
|
|
TraceEventTimeout TraceEventType = "TIMEOUT"
|
|
TraceEventDLQ TraceEventType = "DLQ"
|
|
)
|
|
|
|
// MessageTrace represents the complete trace of a message
|
|
type MessageTrace struct {
|
|
TraceID string `json:"trace_id"`
|
|
SpanID string `json:"span_id"`
|
|
ParentSpanID string `json:"parent_span_id,omitempty"`
|
|
TaskID string `json:"task_id"`
|
|
QueueName string `json:"queue_name"`
|
|
StartTime time.Time `json:"start_time"`
|
|
EndTime time.Time `json:"end_time,omitempty"`
|
|
Duration time.Duration `json:"duration,omitempty"`
|
|
Events []*TraceEvent `json:"events"`
|
|
Tags map[string]string `json:"tags,omitempty"`
|
|
Status string `json:"status"`
|
|
mu sync.RWMutex
|
|
}
|
|
|
|
// TraceManager manages distributed tracing for messages
|
|
type TraceManager struct {
|
|
traces map[string]*MessageTrace
|
|
mu sync.RWMutex
|
|
logger logger.Logger
|
|
storage TraceStorage
|
|
shutdown chan struct{}
|
|
retention time.Duration
|
|
exportInterval time.Duration
|
|
onTraceComplete func(*MessageTrace)
|
|
}
|
|
|
|
// TraceStorage interface for persistent trace storage
|
|
type TraceStorage interface {
|
|
Store(ctx context.Context, trace *MessageTrace) error
|
|
Get(ctx context.Context, traceID string) (*MessageTrace, error)
|
|
Query(ctx context.Context, filter TraceFilter) ([]*MessageTrace, error)
|
|
Delete(ctx context.Context, traceID string) error
|
|
DeleteOlderThan(ctx context.Context, duration time.Duration) (int, error)
|
|
Close() error
|
|
}
|
|
|
|
// TraceFilter for querying traces
|
|
type TraceFilter struct {
|
|
QueueName string
|
|
Status string
|
|
StartTime time.Time
|
|
EndTime time.Time
|
|
MinDuration time.Duration
|
|
MaxDuration time.Duration
|
|
HasError bool
|
|
Limit int
|
|
Offset int
|
|
}
|
|
|
|
// TraceConfig holds tracing configuration
|
|
type TraceConfig struct {
|
|
Storage TraceStorage
|
|
Retention time.Duration
|
|
ExportInterval time.Duration
|
|
Logger logger.Logger
|
|
}
|
|
|
|
// NewTraceManager creates a new trace manager
|
|
func NewTraceManager(config TraceConfig) *TraceManager {
|
|
if config.Retention == 0 {
|
|
config.Retention = 24 * time.Hour
|
|
}
|
|
if config.ExportInterval == 0 {
|
|
config.ExportInterval = 30 * time.Second
|
|
}
|
|
|
|
tm := &TraceManager{
|
|
traces: make(map[string]*MessageTrace),
|
|
logger: config.Logger,
|
|
storage: config.Storage,
|
|
shutdown: make(chan struct{}),
|
|
retention: config.Retention,
|
|
exportInterval: config.ExportInterval,
|
|
}
|
|
|
|
go tm.exportLoop()
|
|
go tm.cleanupLoop()
|
|
|
|
return tm
|
|
}
|
|
|
|
// StartTrace initiates tracing for a message
|
|
func (tm *TraceManager) StartTrace(ctx context.Context, task *Task, queueName string) *MessageTrace {
|
|
trace := &MessageTrace{
|
|
TraceID: task.TraceID,
|
|
SpanID: task.SpanID,
|
|
TaskID: task.ID,
|
|
QueueName: queueName,
|
|
StartTime: time.Now(),
|
|
Events: make([]*TraceEvent, 0),
|
|
Tags: task.Tags,
|
|
Status: "in_progress",
|
|
}
|
|
|
|
tm.mu.Lock()
|
|
tm.traces[trace.TraceID] = trace
|
|
tm.mu.Unlock()
|
|
|
|
tm.RecordEvent(trace.TraceID, TraceEventEnqueued, "broker", nil)
|
|
|
|
return trace
|
|
}
|
|
|
|
// RecordEvent records an event in a trace
|
|
func (tm *TraceManager) RecordEvent(traceID string, eventType TraceEventType, source string, details map[string]string) {
|
|
tm.mu.RLock()
|
|
trace, exists := tm.traces[traceID]
|
|
tm.mu.RUnlock()
|
|
|
|
if !exists {
|
|
tm.logger.Warn("Trace not found for event recording",
|
|
logger.Field{Key: "traceID", Value: traceID},
|
|
logger.Field{Key: "eventType", Value: eventType})
|
|
return
|
|
}
|
|
|
|
trace.mu.Lock()
|
|
defer trace.mu.Unlock()
|
|
|
|
event := &TraceEvent{
|
|
Timestamp: time.Now(),
|
|
EventType: eventType,
|
|
Source: source,
|
|
Details: details,
|
|
}
|
|
|
|
trace.Events = append(trace.Events, event)
|
|
|
|
tm.logger.Debug("Recorded trace event",
|
|
logger.Field{Key: "traceID", Value: traceID},
|
|
logger.Field{Key: "eventType", Value: eventType},
|
|
logger.Field{Key: "source", Value: source})
|
|
}
|
|
|
|
// RecordError records an error event in a trace
|
|
func (tm *TraceManager) RecordError(traceID string, source string, err error, details map[string]string) {
|
|
tm.mu.RLock()
|
|
trace, exists := tm.traces[traceID]
|
|
tm.mu.RUnlock()
|
|
|
|
if !exists {
|
|
return
|
|
}
|
|
|
|
trace.mu.Lock()
|
|
defer trace.mu.Unlock()
|
|
|
|
if details == nil {
|
|
details = make(map[string]string)
|
|
}
|
|
|
|
event := &TraceEvent{
|
|
Timestamp: time.Now(),
|
|
EventType: TraceEventFailed,
|
|
Source: source,
|
|
Details: details,
|
|
Error: err.Error(),
|
|
}
|
|
|
|
trace.Events = append(trace.Events, event)
|
|
trace.Status = "failed"
|
|
}
|
|
|
|
// CompleteTrace marks a trace as complete
|
|
func (tm *TraceManager) CompleteTrace(traceID string, status string) {
|
|
tm.mu.Lock()
|
|
trace, exists := tm.traces[traceID]
|
|
if !exists {
|
|
tm.mu.Unlock()
|
|
return
|
|
}
|
|
delete(tm.traces, traceID)
|
|
tm.mu.Unlock()
|
|
|
|
trace.mu.Lock()
|
|
trace.EndTime = time.Now()
|
|
trace.Duration = trace.EndTime.Sub(trace.StartTime)
|
|
trace.Status = status
|
|
trace.mu.Unlock()
|
|
|
|
// Store in persistent storage
|
|
if tm.storage != nil {
|
|
go func() {
|
|
if err := tm.storage.Store(context.Background(), trace); err != nil {
|
|
tm.logger.Error("Failed to store trace",
|
|
logger.Field{Key: "traceID", Value: traceID},
|
|
logger.Field{Key: "error", Value: err})
|
|
}
|
|
}()
|
|
}
|
|
|
|
if tm.onTraceComplete != nil {
|
|
go tm.onTraceComplete(trace)
|
|
}
|
|
|
|
tm.logger.Debug("Completed trace",
|
|
logger.Field{Key: "traceID", Value: traceID},
|
|
logger.Field{Key: "duration", Value: trace.Duration},
|
|
logger.Field{Key: "status", Value: status})
|
|
}
|
|
|
|
// GetTrace retrieves a trace by ID
|
|
func (tm *TraceManager) GetTrace(ctx context.Context, traceID string) (*MessageTrace, error) {
|
|
// Check in-memory traces first
|
|
tm.mu.RLock()
|
|
trace, exists := tm.traces[traceID]
|
|
tm.mu.RUnlock()
|
|
|
|
if exists {
|
|
return trace, nil
|
|
}
|
|
|
|
// Check persistent storage
|
|
if tm.storage != nil {
|
|
return tm.storage.Get(ctx, traceID)
|
|
}
|
|
|
|
return nil, fmt.Errorf("trace not found: %s", traceID)
|
|
}
|
|
|
|
// QueryTraces queries traces based on filter
|
|
func (tm *TraceManager) QueryTraces(ctx context.Context, filter TraceFilter) ([]*MessageTrace, error) {
|
|
if tm.storage != nil {
|
|
return tm.storage.Query(ctx, filter)
|
|
}
|
|
|
|
// Fallback to in-memory search
|
|
tm.mu.RLock()
|
|
defer tm.mu.RUnlock()
|
|
|
|
var results []*MessageTrace
|
|
for _, trace := range tm.traces {
|
|
if tm.matchesFilter(trace, filter) {
|
|
results = append(results, trace)
|
|
}
|
|
}
|
|
|
|
return results, nil
|
|
}
|
|
|
|
// matchesFilter checks if a trace matches the filter
|
|
func (tm *TraceManager) matchesFilter(trace *MessageTrace, filter TraceFilter) bool {
|
|
if filter.QueueName != "" && trace.QueueName != filter.QueueName {
|
|
return false
|
|
}
|
|
if filter.Status != "" && trace.Status != filter.Status {
|
|
return false
|
|
}
|
|
if !filter.StartTime.IsZero() && trace.StartTime.Before(filter.StartTime) {
|
|
return false
|
|
}
|
|
if !filter.EndTime.IsZero() && !trace.EndTime.IsZero() && trace.EndTime.After(filter.EndTime) {
|
|
return false
|
|
}
|
|
if filter.MinDuration > 0 && trace.Duration < filter.MinDuration {
|
|
return false
|
|
}
|
|
if filter.MaxDuration > 0 && trace.Duration > filter.MaxDuration {
|
|
return false
|
|
}
|
|
return true
|
|
}
|
|
|
|
// exportLoop periodically exports traces to storage
|
|
func (tm *TraceManager) exportLoop() {
|
|
ticker := time.NewTicker(tm.exportInterval)
|
|
defer ticker.Stop()
|
|
|
|
for {
|
|
select {
|
|
case <-ticker.C:
|
|
tm.exportTraces()
|
|
case <-tm.shutdown:
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// exportTraces exports active traces to storage
|
|
func (tm *TraceManager) exportTraces() {
|
|
if tm.storage == nil {
|
|
return
|
|
}
|
|
|
|
tm.mu.RLock()
|
|
traces := make([]*MessageTrace, 0, len(tm.traces))
|
|
for _, trace := range tm.traces {
|
|
traces = append(traces, trace)
|
|
}
|
|
tm.mu.RUnlock()
|
|
|
|
for _, trace := range traces {
|
|
if err := tm.storage.Store(context.Background(), trace); err != nil {
|
|
tm.logger.Error("Failed to export trace",
|
|
logger.Field{Key: "traceID", Value: trace.TraceID},
|
|
logger.Field{Key: "error", Value: err})
|
|
}
|
|
}
|
|
}
|
|
|
|
// cleanupLoop periodically cleans up old traces
|
|
func (tm *TraceManager) cleanupLoop() {
|
|
ticker := time.NewTicker(1 * time.Hour)
|
|
defer ticker.Stop()
|
|
|
|
for {
|
|
select {
|
|
case <-ticker.C:
|
|
tm.cleanup()
|
|
case <-tm.shutdown:
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// cleanup removes old traces
|
|
func (tm *TraceManager) cleanup() {
|
|
if tm.storage != nil {
|
|
deleted, err := tm.storage.DeleteOlderThan(context.Background(), tm.retention)
|
|
if err != nil {
|
|
tm.logger.Error("Failed to cleanup old traces",
|
|
logger.Field{Key: "error", Value: err})
|
|
} else if deleted > 0 {
|
|
tm.logger.Info("Cleaned up old traces",
|
|
logger.Field{Key: "deleted", Value: deleted})
|
|
}
|
|
}
|
|
}
|
|
|
|
// SetOnTraceComplete sets callback for trace completion
|
|
func (tm *TraceManager) SetOnTraceComplete(fn func(*MessageTrace)) {
|
|
tm.onTraceComplete = fn
|
|
}
|
|
|
|
// GetStats returns tracing statistics
|
|
func (tm *TraceManager) GetStats() map[string]any {
|
|
tm.mu.RLock()
|
|
defer tm.mu.RUnlock()
|
|
|
|
activeTraces := len(tm.traces)
|
|
|
|
return map[string]any{
|
|
"active_traces": activeTraces,
|
|
"retention": tm.retention,
|
|
"export_interval": tm.exportInterval,
|
|
}
|
|
}
|
|
|
|
// Shutdown gracefully shuts down the trace manager
|
|
func (tm *TraceManager) Shutdown(ctx context.Context) error {
|
|
close(tm.shutdown)
|
|
|
|
// Export remaining traces
|
|
tm.exportTraces()
|
|
|
|
if tm.storage != nil {
|
|
return tm.storage.Close()
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// InMemoryTraceStorage implements in-memory trace storage
|
|
type InMemoryTraceStorage struct {
|
|
traces map[string]*MessageTrace
|
|
mu sync.RWMutex
|
|
}
|
|
|
|
// NewInMemoryTraceStorage creates a new in-memory trace storage
|
|
func NewInMemoryTraceStorage() *InMemoryTraceStorage {
|
|
return &InMemoryTraceStorage{
|
|
traces: make(map[string]*MessageTrace),
|
|
}
|
|
}
|
|
|
|
func (s *InMemoryTraceStorage) Store(ctx context.Context, trace *MessageTrace) error {
|
|
s.mu.Lock()
|
|
defer s.mu.Unlock()
|
|
|
|
// Deep copy to avoid concurrent modifications
|
|
data, err := json.Marshal(trace)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
var traceCopy MessageTrace
|
|
if err := json.Unmarshal(data, &traceCopy); err != nil {
|
|
return err
|
|
}
|
|
|
|
s.traces[trace.TraceID] = &traceCopy
|
|
return nil
|
|
}
|
|
|
|
func (s *InMemoryTraceStorage) Get(ctx context.Context, traceID string) (*MessageTrace, error) {
|
|
s.mu.RLock()
|
|
defer s.mu.RUnlock()
|
|
|
|
trace, exists := s.traces[traceID]
|
|
if !exists {
|
|
return nil, fmt.Errorf("trace not found: %s", traceID)
|
|
}
|
|
|
|
return trace, nil
|
|
}
|
|
|
|
func (s *InMemoryTraceStorage) Query(ctx context.Context, filter TraceFilter) ([]*MessageTrace, error) {
|
|
s.mu.RLock()
|
|
defer s.mu.RUnlock()
|
|
|
|
var results []*MessageTrace
|
|
for _, trace := range s.traces {
|
|
results = append(results, trace)
|
|
}
|
|
|
|
// Apply pagination
|
|
start := filter.Offset
|
|
if start >= len(results) {
|
|
return []*MessageTrace{}, nil
|
|
}
|
|
|
|
end := start + filter.Limit
|
|
if filter.Limit == 0 || end > len(results) {
|
|
end = len(results)
|
|
}
|
|
|
|
return results[start:end], nil
|
|
}
|
|
|
|
func (s *InMemoryTraceStorage) Delete(ctx context.Context, traceID string) error {
|
|
s.mu.Lock()
|
|
defer s.mu.Unlock()
|
|
|
|
delete(s.traces, traceID)
|
|
return nil
|
|
}
|
|
|
|
func (s *InMemoryTraceStorage) DeleteOlderThan(ctx context.Context, duration time.Duration) (int, error) {
|
|
s.mu.Lock()
|
|
defer s.mu.Unlock()
|
|
|
|
cutoff := time.Now().Add(-duration)
|
|
deleted := 0
|
|
|
|
for traceID, trace := range s.traces {
|
|
if !trace.EndTime.IsZero() && trace.EndTime.Before(cutoff) {
|
|
delete(s.traces, traceID)
|
|
deleted++
|
|
}
|
|
}
|
|
|
|
return deleted, nil
|
|
}
|
|
|
|
func (s *InMemoryTraceStorage) Close() error {
|
|
s.mu.Lock()
|
|
defer s.mu.Unlock()
|
|
|
|
s.traces = nil
|
|
return nil
|
|
}
|
|
|
|
// MessageLifecycleTracker tracks the complete lifecycle of messages
|
|
type MessageLifecycleTracker struct {
|
|
traceManager *TraceManager
|
|
logger logger.Logger
|
|
}
|
|
|
|
// NewMessageLifecycleTracker creates a new lifecycle tracker
|
|
func NewMessageLifecycleTracker(traceManager *TraceManager, log logger.Logger) *MessageLifecycleTracker {
|
|
return &MessageLifecycleTracker{
|
|
traceManager: traceManager,
|
|
logger: log,
|
|
}
|
|
}
|
|
|
|
// TrackEnqueue tracks message enqueue
|
|
func (lt *MessageLifecycleTracker) TrackEnqueue(ctx context.Context, task *Task, queueName string) {
|
|
lt.traceManager.StartTrace(ctx, task, queueName)
|
|
}
|
|
|
|
// TrackDequeue tracks message dequeue
|
|
func (lt *MessageLifecycleTracker) TrackDequeue(traceID, consumerID string) {
|
|
lt.traceManager.RecordEvent(traceID, TraceEventDequeued, consumerID, nil)
|
|
}
|
|
|
|
// TrackProcessing tracks message processing start
|
|
func (lt *MessageLifecycleTracker) TrackProcessing(traceID, workerID string) {
|
|
lt.traceManager.RecordEvent(traceID, TraceEventProcessing, workerID, nil)
|
|
}
|
|
|
|
// TrackCompletion tracks successful completion
|
|
func (lt *MessageLifecycleTracker) TrackCompletion(traceID string, duration time.Duration) {
|
|
details := map[string]string{
|
|
"duration": duration.String(),
|
|
}
|
|
lt.traceManager.RecordEvent(traceID, TraceEventCompleted, "worker", details)
|
|
lt.traceManager.CompleteTrace(traceID, "completed")
|
|
}
|
|
|
|
// TrackError tracks processing error
|
|
func (lt *MessageLifecycleTracker) TrackError(traceID, source string, err error) {
|
|
lt.traceManager.RecordError(traceID, source, err, nil)
|
|
}
|
|
|
|
// TrackRetry tracks retry attempt
|
|
func (lt *MessageLifecycleTracker) TrackRetry(traceID string, retryCount int) {
|
|
details := map[string]string{
|
|
"retry_count": fmt.Sprintf("%d", retryCount),
|
|
}
|
|
lt.traceManager.RecordEvent(traceID, TraceEventRetried, "broker", details)
|
|
}
|
|
|
|
// TrackDLQ tracks movement to dead letter queue
|
|
func (lt *MessageLifecycleTracker) TrackDLQ(traceID, reason string) {
|
|
details := map[string]string{
|
|
"reason": reason,
|
|
}
|
|
lt.traceManager.RecordEvent(traceID, TraceEventDLQ, "broker", details)
|
|
lt.traceManager.CompleteTrace(traceID, "dlq")
|
|
}
|