This commit is contained in:
Oarkflow
2025-08-02 16:17:20 +05:45
parent ffbf9f99df
commit 271beed429
5 changed files with 212 additions and 113 deletions

View File

@@ -52,14 +52,14 @@ type Config struct {
BufferPoolSize int BufferPoolSize int
} }
// DefaultConfig returns default configuration // DefaultConfig returns default configuration with NO timeouts for persistent connections
func DefaultConfig() *Config { func DefaultConfig() *Config {
return &Config{ return &Config{
MaxMessageSize: MaxMessageSize, MaxMessageSize: MaxMessageSize,
MaxHeaderSize: MaxHeaderSize, MaxHeaderSize: MaxHeaderSize,
MaxQueueLength: MaxQueueLength, MaxQueueLength: MaxQueueLength,
ReadTimeout: 30 * time.Second, ReadTimeout: 0, // NO read timeout for persistent broker-consumer connections
WriteTimeout: 30 * time.Second, WriteTimeout: 0, // NO write timeout for persistent broker-consumer connections
EnableCompression: false, EnableCompression: false,
BufferPoolSize: 1000, BufferPoolSize: 1000,
} }
@@ -252,7 +252,7 @@ func (c *Codec) SendMessage(ctx context.Context, conn net.Conn, msg *Message) er
return c.sendRawMessage(ctx, conn, msg) return c.sendRawMessage(ctx, conn, msg)
} }
// sendRawMessage handles the actual sending of a message or fragment // sendRawMessage handles the actual sending of a message or fragment WITHOUT timeouts
func (c *Codec) sendRawMessage(ctx context.Context, conn net.Conn, msg *Message) error { func (c *Codec) sendRawMessage(ctx context.Context, conn net.Conn, msg *Message) error {
// Serialize message // Serialize message
data, err := msg.Serialize() data, err := msg.Serialize()
@@ -283,17 +283,21 @@ func (c *Codec) sendRawMessage(ctx context.Context, conn net.Conn, msg *Message)
binary.BigEndian.PutUint32(buffer.B[:4], uint32(len(data))) binary.BigEndian.PutUint32(buffer.B[:4], uint32(len(data)))
copy(buffer.B[4:], data) copy(buffer.B[4:], data)
// Set timeout // CRITICAL: DO NOT set any write deadlines for broker-consumer connections
deadline := time.Now().Add(c.config.WriteTimeout) // These connections must remain open indefinitely for persistent communication
if ctxDeadline, ok := ctx.Deadline(); ok && ctxDeadline.Before(deadline) { // Only set timeout if explicitly configured AND not zero (for backward compatibility)
deadline = ctxDeadline if c.config.WriteTimeout > 0 {
} deadline := time.Now().Add(c.config.WriteTimeout)
if ctxDeadline, ok := ctx.Deadline(); ok && ctxDeadline.Before(deadline) {
deadline = ctxDeadline
}
if err := conn.SetWriteDeadline(deadline); err != nil { if err := conn.SetWriteDeadline(deadline); err != nil {
c.incrementErrors() c.incrementErrors()
return fmt.Errorf("failed to set write deadline: %w", err) return fmt.Errorf("failed to set write deadline: %w", err)
}
defer conn.SetWriteDeadline(time.Time{})
} }
defer conn.SetWriteDeadline(time.Time{})
// Write with buffering // Write with buffering
writer := bufio.NewWriter(conn) writer := bufio.NewWriter(conn)
@@ -318,7 +322,7 @@ func (c *Codec) sendRawMessage(ctx context.Context, conn net.Conn, msg *Message)
return nil return nil
} }
// ReadMessage reads a message with proper error handling and timeouts // ReadMessage reads a message WITHOUT timeouts for persistent broker-consumer connections
func (c *Codec) ReadMessage(ctx context.Context, conn net.Conn) (*Message, error) { func (c *Codec) ReadMessage(ctx context.Context, conn net.Conn) (*Message, error) {
// Check context cancellation before proceeding // Check context cancellation before proceeding
if err := ctx.Err(); err != nil { if err := ctx.Err(); err != nil {
@@ -326,24 +330,22 @@ func (c *Codec) ReadMessage(ctx context.Context, conn net.Conn) (*Message, error
return nil, fmt.Errorf("context ended before read: %w", err) return nil, fmt.Errorf("context ended before read: %w", err)
} }
// Check context cancellation before proceeding // CRITICAL: DO NOT set any read deadlines for broker-consumer connections
if err := ctx.Err(); err != nil { // These connections must remain open indefinitely for persistent communication
c.incrementErrors() // Only set timeout if explicitly configured AND not zero (for backward compatibility)
return nil, fmt.Errorf("context ended before read: %w", err) if c.config.ReadTimeout > 0 {
} deadline := time.Now().Add(c.config.ReadTimeout)
if ctxDeadline, ok := ctx.Deadline(); ok && ctxDeadline.Before(deadline) {
deadline = ctxDeadline
}
// Set timeout if err := conn.SetReadDeadline(deadline); err != nil {
deadline := time.Now().Add(c.config.ReadTimeout) c.incrementErrors()
if ctxDeadline, ok := ctx.Deadline(); ok && ctxDeadline.Before(deadline) { return nil, fmt.Errorf("failed to set read deadline: %w", err)
deadline = ctxDeadline }
defer conn.SetReadDeadline(time.Time{})
} }
if err := conn.SetReadDeadline(deadline); err != nil {
c.incrementErrors()
return nil, fmt.Errorf("failed to set read deadline: %w", err)
}
defer conn.SetReadDeadline(time.Time{})
// Read length prefix // Read length prefix
lengthBytes := make([]byte, 4) lengthBytes := make([]byte, 4)
if _, err := io.ReadFull(conn, lengthBytes); err != nil { if _, err := io.ReadFull(conn, lengthBytes); err != nil {

View File

@@ -593,10 +593,10 @@ func DefaultProductionConfig() *ProductionConfig {
Address: "localhost", Address: "localhost",
Port: 8080, Port: 8080,
MaxConnections: 1000, MaxConnections: 1000,
ConnectionTimeout: 30 * time.Second, ConnectionTimeout: 0, // NO timeout for broker-consumer connections
ReadTimeout: 30 * time.Second, ReadTimeout: 0, // NO read timeout - consumers need persistent connections
WriteTimeout: 30 * time.Second, WriteTimeout: 0, // NO write timeout - allow unlimited time for large messages
IdleTimeout: 5 * time.Minute, IdleTimeout: 0, // NO idle timeout - keep connections alive indefinitely
KeepAlive: true, KeepAlive: true,
KeepAlivePeriod: 30 * time.Second, KeepAlivePeriod: 30 * time.Second,
MaxQueueDepth: 10000, MaxQueueDepth: 10000,

View File

@@ -652,8 +652,9 @@ func (c *Consumer) Consume(ctx context.Context) error {
} }
} }
// processWithTimeout processes messages WITHOUT I/O timeouts for persistent broker connections
func (c *Consumer) processWithTimeout(ctx context.Context) error { func (c *Consumer) processWithTimeout(ctx context.Context) error {
// Consumer should wait indefinitely for messages from broker - no I/O timeout // Consumer should wait indefinitely for messages from broker - NO I/O timeout
// Only individual task processing should have timeouts, not the consumer connection // Only individual task processing should have timeouts, not the consumer connection
c.connMutex.RLock() c.connMutex.RLock()
conn := c.conn conn := c.conn
@@ -663,7 +664,9 @@ func (c *Consumer) processWithTimeout(ctx context.Context) error {
return fmt.Errorf("no connection available") return fmt.Errorf("no connection available")
} }
// Read message without timeout - consumer should be long-running background service // CRITICAL: Never set any connection timeouts for broker-consumer communication
// The consumer must maintain a persistent connection to the broker indefinitely
// Read message without ANY timeout - consumer should be long-running background service
err := c.readMessage(ctx, conn) err := c.readMessage(ctx, conn)
// If message was processed successfully, reset reconnection attempts // If message was processed successfully, reset reconnection attempts

View File

@@ -3,10 +3,10 @@
"address": "localhost", "address": "localhost",
"port": 8080, "port": 8080,
"max_connections": 1000, "max_connections": 1000,
"connection_timeout": "5s", "connection_timeout": "0s",
"read_timeout": "300s", "read_timeout": "0s",
"write_timeout": "30s", "write_timeout": "0s",
"idle_timeout": "600s", "idle_timeout": "0s",
"keep_alive": true, "keep_alive": true,
"keep_alive_period": "60s", "keep_alive_period": "60s",
"max_queue_depth": 10000, "max_queue_depth": 10000,
@@ -29,71 +29,111 @@
"max_retries": 3, "max_retries": 3,
"initial_delay": "1s", "initial_delay": "1s",
"max_backoff": "10s", "max_backoff": "10s",
"confirm_delivery": true, "jitter_percent": 0.5,
"connection_pool_size": 10,
"publish_timeout": "5s", "publish_timeout": "5s",
"connection_pool_size": 10 "enable_batching": false,
"batch_size": 100,
"batch_timeout": "1s"
}, },
"pool": { "pool": {
"min_workers": 1,
"max_workers": 100,
"queue_size": 1000, "queue_size": 1000,
"max_workers": 20,
"max_memory_load": 1073741824, "max_memory_load": 1073741824,
"idle_timeout": "300s", "task_timeout": "30s",
"graceful_shutdown_timeout": "30s", "idle_worker_timeout": "5m",
"task_timeout": "60s", "enable_dynamic_scaling": true,
"enable_metrics": true, "scaling_factor": 1.5,
"enable_diagnostics": true "scaling_interval": "1m",
"max_queue_wait_time": "10s",
"enable_work_stealing": false,
"enable_priority_scheduling": true,
"graceful_shutdown_timeout": "30s"
}, },
"security": { "security": {
"enable_tls": false, "enable_tls": false,
"tls_cert_path": "./certs/server.crt", "tls_cert_path": "",
"tls_key_path": "./certs/server.key", "tls_key_path": "",
"tls_ca_path": "./certs/ca.crt", "tls_ca_path": "",
"enable_auth": false, "tls_insecure_skip_verify": false,
"auth_provider": "jwt", "enable_authentication": false,
"jwt_secret": "your-secret-key", "authentication_method": "basic",
"enable_authorization": false,
"enable_encryption": false, "enable_encryption": false,
"encryption_key": "32-byte-encryption-key-here!!" "encryption_key": "",
"enable_audit_log": false,
"audit_log_path": "/var/log/mq/audit.log",
"session_timeout": "30m",
"max_login_attempts": 3,
"lockout_duration": "15m"
}, },
"monitoring": { "monitoring": {
"metrics_port": 9090,
"health_check_port": 9091,
"enable_metrics": true, "enable_metrics": true,
"enable_health_checks": true, "metrics_port": 9090,
"metrics_interval": "10s", "metrics_path": "/metrics",
"enable_health_check": true,
"health_check_port": 8081,
"health_check_path": "/health",
"health_check_interval": "30s", "health_check_interval": "30s",
"retention_period": "24h", "enable_tracing": false,
"enable_tracing": true, "tracing_endpoint": "",
"jaeger_endpoint": "http://localhost:14268/api/traces" "tracing_sample_rate": 0.1,
"enable_logging": true,
"log_level": "info",
"log_format": "json",
"log_output": "stdout",
"log_file_path": "/var/log/mq/app.log",
"log_max_size": 100,
"log_max_backups": 10,
"log_max_age": 30,
"enable_profiling": false,
"profiling_port": 6060
}, },
"persistence": { "persistence": {
"enable": true, "enable_persistence": false,
"provider": "postgres", "storage_type": "memory",
"connection_string": "postgres://user:password@localhost:5432/mq_db?sslmode=disable", "connection_string": "",
"max_connections": 50, "max_connections": 10,
"connection_timeout": "30s", "connection_timeout": "10s",
"enable_migrations": true, "retention_period": "168h",
"backup_enabled": true, "cleanup_interval": "1h",
"backup_interval": "6h" "backup_enabled": false,
"backup_interval": "6h",
"backup_path": "/var/backup/mq",
"compression_enabled": true,
"encryption_enabled": false,
"replication_enabled": false,
"replication_nodes": [ ]
}, },
"clustering": { "clustering": {
"enable": false, "enable_clustering": false,
"node_id": "node-1", "node_id": "",
"cluster_name": "mq-cluster", "cluster_nodes": [ ],
"peers": [ ], "discovery_method": "static",
"election_timeout": "5s", "discovery_endpoint": "",
"heartbeat_interval": "1s", "heartbeat_interval": "5s",
"enable_auto_discovery": false, "election_timeout": "15s",
"discovery_port": 7946 "enable_load_balancing": false,
"load_balancing_strategy": "round_robin",
"enable_failover": false,
"failover_timeout": "30s",
"enable_replication": false,
"replication_factor": 3,
"consistency_level": "strong"
}, },
"rate_limit": { "rate_limit": {
"enable_broker_rate_limit": false,
"broker_rate": 1000, "broker_rate": 1000,
"broker_burst": 100, "broker_burst": 100,
"consumer_rate": 500, "enable_consumer_rate_limit": false,
"consumer_burst": 50, "consumer_rate": 100,
"publisher_rate": 200, "consumer_burst": 10,
"publisher_burst": 20, "enable_publisher_rate_limit": false,
"global_rate": 2000, "publisher_rate": 100,
"global_burst": 200 "publisher_burst": 10,
}, "enable_per_queue_rate_limit": false,
"last_updated": "2025-07-29T00:00:00Z" "per_queue_rate": 50,
"per_queue_burst": 5
}
} }

110
mq.go
View File

@@ -745,52 +745,104 @@ func (b *Broker) Start(ctx context.Context) error {
if b.opts.tlsConfig.UseTLS { if b.opts.tlsConfig.UseTLS {
cert, err := tls.LoadX509KeyPair(b.opts.tlsConfig.CertPath, b.opts.tlsConfig.KeyPath) cert, err := tls.LoadX509KeyPair(b.opts.tlsConfig.CertPath, b.opts.tlsConfig.KeyPath)
if err != nil { if err != nil {
return fmt.Errorf("failed to load TLS certificates: %v", err) return WrapError(err, "failed to load TLS certificates for broker", "BROKER_TLS_CERT_ERROR")
} }
tlsConfig := &tls.Config{ tlsConfig := &tls.Config{
Certificates: []tls.Certificate{cert}, Certificates: []tls.Certificate{cert},
} }
listener, err = tls.Listen("tcp", b.opts.brokerAddr, tlsConfig) listener, err = tls.Listen("tcp", b.opts.brokerAddr, tlsConfig)
if err != nil { if err != nil {
return fmt.Errorf("failed to start TLS listener: %v", err) return WrapError(err, "TLS broker failed to listen on "+b.opts.brokerAddr, "BROKER_TLS_LISTEN_ERROR")
} }
log.Println("BROKER - RUNNING_TLS ~> started on", b.opts.brokerAddr)
} else { } else {
listener, err = net.Listen("tcp", b.opts.brokerAddr) listener, err = net.Listen("tcp", b.opts.brokerAddr)
if err != nil { if err != nil {
return fmt.Errorf("failed to start TCP listener: %v", err) return WrapError(err, "broker failed to listen on "+b.opts.brokerAddr, "BROKER_LISTEN_ERROR")
} }
log.Println("BROKER - RUNNING ~> started on", b.opts.brokerAddr)
} }
b.listener = listener b.listener = listener
defer b.Close() defer b.Close()
const maxConcurrentConnections = 100 const maxConcurrentConnections = 100
sem := make(chan struct{}, maxConcurrentConnections) sem := make(chan struct{}, maxConcurrentConnections)
for { for {
conn, err := listener.Accept() select {
if err != nil { case <-ctx.Done():
b.OnError(ctx, conn, err) log.Printf("BROKER - Shutdown signal received")
time.Sleep(50 * time.Millisecond) return ctx.Err()
continue default:
} conn, err := listener.Accept()
sem <- struct{}{} if err != nil {
go func(c net.Conn) { if atomic.LoadInt32(&b.isShutdown) == 1 {
defer func() { return nil
<-sem
c.Close()
}()
for {
err := b.readMessage(ctx, c)
if err != nil {
if netErr, ok := err.(net.Error); ok && netErr.Temporary() {
log.Println("Temporary network error, retrying:", netErr)
continue
}
log.Println("Connection closed due to error:", err)
break
} }
log.Printf("BROKER - Error accepting connection: %v", err)
continue
} }
}(conn)
// Configure connection for broker-consumer communication with NO timeouts
if tcpConn, ok := conn.(*net.TCPConn); ok {
// Enable TCP keep-alive for all connections
tcpConn.SetKeepAlive(true)
tcpConn.SetKeepAlivePeriod(30 * time.Second)
// NEVER set any deadlines for broker-consumer connections
// These connections must remain open indefinitely for persistent communication
// DO NOT call: tcpConn.SetReadDeadline() or tcpConn.SetWriteDeadline()
log.Printf("BROKER - TCP keep-alive enabled for connection from %s (NO timeouts)", conn.RemoteAddr())
}
sem <- struct{}{}
go func() {
defer func() { <-sem }()
defer conn.Close()
b.handleConnection(ctx, conn)
}()
}
}
}
// handleConnection serves a single client connection until the context is
// cancelled or the connection fails.
//
// It calls b.readMessage in a loop. This function sets no read or write
// deadline on conn itself; whether readMessage applies one is not visible here.
// Errors returned by readMessage are classified by their message text:
//   - exactly "EOF", or containing "closed network connection": logged at
//     debug level and the loop ends;
//   - containing "timeout": logged as a warning and the read is retried;
//   - anything else: logged as an error and the loop ends.
//
// conn is closed on every return path, and a panic raised while serving the
// connection is recovered and logged instead of propagating to the caller.
func (b *Broker) handleConnection(ctx context.Context, conn net.Conn) {
// Runs on every exit path: log a recovered panic (if any), then close conn.
// The error returned by conn.Close is discarded.
defer func() {
if r := recover(); r != nil {
b.logger.Error("Connection handler panic",
logger.Field{Key: "panic", Value: fmt.Sprintf("%v", r)},
logger.Field{Key: "remote_addr", Value: conn.RemoteAddr().String()})
}
conn.Close()
}()
// No deadlines are set on conn in this function: broker-consumer connections
// are meant to stay open indefinitely.
for {
// Cancellation is only observed here, between reads.
// NOTE(review): a readMessage call that is already blocked is not interrupted
// by this select — confirm that readMessage or the shutdown path unblocks it
// (for example by closing conn).
select {
case <-ctx.Done():
b.logger.Debug("Context cancelled, closing connection",
logger.Field{Key: "remote_addr", Value: conn.RemoteAddr().String()})
return
default:
// Read the next message; no deadline is applied at this level.
if err := b.readMessage(ctx, conn); err != nil {
// Treated as a normal disconnect.
// NOTE(review): matching on message text is fragile; if readMessage wraps
// errors with %w, errors.Is against io.EOF / net.ErrClosed would be
// sturdier — confirm before changing.
if err.Error() == "EOF" || strings.Contains(err.Error(), "closed network connection") {
b.logger.Debug("Connection closed by client",
logger.Field{Key: "remote_addr", Value: conn.RemoteAddr().String()})
return
}
// Timeouts are not expected because no deadline is set here, so they are
// logged and the connection is kept.
// NOTE(review): the retry is immediate (no backoff), and any error whose
// text contains "timeout" takes this path — verify this cannot spin.
if strings.Contains(err.Error(), "timeout") {
b.logger.Warn("Unexpected timeout on connection (should not happen)",
logger.Field{Key: "remote_addr", Value: conn.RemoteAddr().String()},
logger.Field{Key: "error", Value: err.Error()})
continue
}
// Any other error ends the connection.
b.logger.Error("Connection error",
logger.Field{Key: "remote_addr", Value: conn.RemoteAddr().String()},
logger.Field{Key: "error", Value: err.Error()})
return
}
}
} }
} }
@@ -1500,7 +1552,9 @@ func (b *Broker) startEnhancedBroker(ctx context.Context) error {
func (b *Broker) handleEnhancedConnection(ctx context.Context, conn net.Conn) { func (b *Broker) handleEnhancedConnection(ctx context.Context, conn net.Conn) {
defer func() { defer func() {
if r := recover(); r != nil { if r := recover(); r != nil {
b.logger.Error("Connection handler panic", logger.Field{Key: "panic", Value: fmt.Sprintf("%v", r)}) b.logger.Error("Connection handler panic",
logger.Field{Key: "panic", Value: fmt.Sprintf("%v", r)},
logger.Field{Key: "remote_addr", Value: conn.RemoteAddr().String()})
} }
conn.Close() conn.Close()
}() }()