
Add least time load balancing strategy

commit a754236ce5 (parent 067c7e7152)
Author: Simon Delicata, 2025-10-23 16:16:05 +02:00 (committed by GitHub)
25 changed files with 1830 additions and 134 deletions


@@ -13,6 +13,8 @@ import (
"github.com/traefik/traefik/v3/pkg/ip"
)
+ var errNoAvailableServer = errors.New("no available server")
type namedHandler struct {
http.Handler
name string
@@ -114,15 +116,13 @@ func (b *Balancer) SetStatus(ctx context.Context, childName string, up bool) {
// Not thread safe.
func (b *Balancer) RegisterStatusUpdater(fn func(up bool)) error {
if !b.wantsHealthCheck {
- return errors.New("healthCheck not enabled in config for this weighted service")
+ return errors.New("healthCheck not enabled in config for this HRW service")
}
b.updaters = append(b.updaters, fn)
return nil
}
- var errNoAvailableServer = errors.New("no available server")
func (b *Balancer) nextServer(ip string) (*namedHandler, error) {
b.handlersMu.RLock()
var healthy []*namedHandler


@@ -0,0 +1,373 @@
package leasttime
import (
"context"
"errors"
"math"
"net/http"
"net/http/httptrace"
"sync"
"sync/atomic"
"time"
"github.com/rs/zerolog/log"
"github.com/traefik/traefik/v3/pkg/config/dynamic"
"github.com/traefik/traefik/v3/pkg/server/service/loadbalancer"
)
const sampleSize = 100 // Number of response time samples to track.
var errNoAvailableServer = errors.New("no available server")
// namedHandler wraps an HTTP handler with metrics and server information.
// Tracks response time (TTFB) and inflight request count for load balancing decisions.
type namedHandler struct {
http.Handler
name string
weight float64
deadlineMu sync.RWMutex
deadline float64 // WRR tie-breaking (EDF scheduling).
inflightCount atomic.Int64 // Number of inflight requests.
responseTimeMu sync.RWMutex
responseTimes [sampleSize]float64 // Fixed-size ring buffer (TTFB measurements in ms).
responseTimeIdx int // Current position in ring buffer.
responseTimeSum float64 // Sum of all values in buffer.
sampleCount int // Number of samples collected so far.
}
// updateResponseTime updates the average response time for this server using a ring buffer.
func (s *namedHandler) updateResponseTime(elapsed time.Duration) {
s.responseTimeMu.Lock()
defer s.responseTimeMu.Unlock()
ms := float64(elapsed.Milliseconds())
if s.sampleCount < sampleSize {
// Still filling the buffer.
s.responseTimes[s.responseTimeIdx] = ms
s.responseTimeSum += ms
s.sampleCount++
} else {
// Buffer is full, replace oldest value.
oldValue := s.responseTimes[s.responseTimeIdx]
s.responseTimes[s.responseTimeIdx] = ms
s.responseTimeSum = s.responseTimeSum - oldValue + ms
}
s.responseTimeIdx = (s.responseTimeIdx + 1) % sampleSize
}
// getAvgResponseTime returns the average response time in milliseconds.
// Returns 0 if no samples have been collected yet (cold start).
func (s *namedHandler) getAvgResponseTime() float64 {
s.responseTimeMu.RLock()
defer s.responseTimeMu.RUnlock()
if s.sampleCount == 0 {
return 0
}
return s.responseTimeSum / float64(s.sampleCount)
}
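
For intuition, the sketch below replays the same ring-buffer logic with sampleSize reduced to 3 (a standalone toy, not part of the committed code): once the window is full, each new sample displaces the oldest one, so the average tracks only the last sampleSize measurements.

package main

import "fmt"

const sampleSize = 3 // reduced from 100 for readability

func main() {
	var buf [sampleSize]float64
	var sum float64
	idx, count := 0, 0
	for _, ms := range []float64{10, 20, 30, 40} {
		if count < sampleSize {
			sum += ms // still filling the buffer
			count++
		} else {
			sum += ms - buf[idx] // evict the oldest sample
		}
		buf[idx] = ms
		idx = (idx + 1) % sampleSize
		fmt.Printf("avg = %.1f ms\n", sum/float64(count))
	}
	// Prints 10.0, 15.0, 20.0, then 30.0 once the 10 ms sample falls out.
}
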
func (s *namedHandler) getDeadline() float64 {
s.deadlineMu.RLock()
defer s.deadlineMu.RUnlock()
return s.deadline
}
func (s *namedHandler) setDeadline(deadline float64) {
s.deadlineMu.Lock()
defer s.deadlineMu.Unlock()
s.deadline = deadline
}
// Balancer implements the least-time load balancing algorithm.
// It selects the server with the lowest average response time (TTFB) and fewest active connections.
type Balancer struct {
wantsHealthCheck bool
// handlersMu protects the handlers slice, the status and the fenced maps.
handlersMu sync.RWMutex
handlers []*namedHandler
// status is a record of which child services of the Balancer are healthy, keyed
// by name of child service. A service is initially added to the map when it is
// created via Add, and it is later removed or added to the map as needed,
// through the SetStatus method.
status map[string]struct{}
// fenced is the list of terminating yet still serving child services.
fenced map[string]struct{}
// updaters is the list of hooks that are run (to update the Balancer
// parent(s)), whenever the Balancer status changes.
// No mutex is needed, as it is modified only during the configuration build.
updaters []func(bool)
sticky *loadbalancer.Sticky
// curDeadlineMu protects curDeadline. It is kept separate from handlersMu
// to reduce lock contention during tie-breaking.
curDeadlineMu sync.RWMutex
// curDeadline is used for WRR tie-breaking (EDF scheduling).
curDeadline float64
}
// New creates a new least-time load balancer.
func New(stickyConfig *dynamic.Sticky, wantsHealthCheck bool) *Balancer {
balancer := &Balancer{
status: make(map[string]struct{}),
fenced: make(map[string]struct{}),
wantsHealthCheck: wantsHealthCheck,
}
if stickyConfig != nil && stickyConfig.Cookie != nil {
balancer.sticky = loadbalancer.NewSticky(*stickyConfig.Cookie)
}
return balancer
}
// SetStatus sets on the balancer that its given child is now of the given
// status. childName is only needed for logging purposes.
func (b *Balancer) SetStatus(ctx context.Context, childName string, up bool) {
b.handlersMu.Lock()
defer b.handlersMu.Unlock()
upBefore := len(b.status) > 0
status := "DOWN"
if up {
status = "UP"
}
log.Ctx(ctx).Debug().Msgf("Setting status of %s to %v", childName, status)
if up {
b.status[childName] = struct{}{}
} else {
delete(b.status, childName)
}
upAfter := len(b.status) > 0
status = "DOWN"
if upAfter {
status = "UP"
}
// No Status Change.
if upBefore == upAfter {
// We're still with the same status, no need to propagate.
log.Ctx(ctx).Debug().Msgf("Still %s, no need to propagate", status)
return
}
// Status Change.
log.Ctx(ctx).Debug().Msgf("Propagating new %s status", status)
for _, fn := range b.updaters {
fn(upAfter)
}
}
// RegisterStatusUpdater adds fn to the list of hooks that are run when the
// status of the Balancer changes.
// Not thread safe.
func (b *Balancer) RegisterStatusUpdater(fn func(up bool)) error {
if !b.wantsHealthCheck {
return errors.New("healthCheck not enabled in config for this LeastTime service")
}
b.updaters = append(b.updaters, fn)
return nil
}
// getHealthyServers returns the list of healthy, non-fenced servers.
func (b *Balancer) getHealthyServers() []*namedHandler {
b.handlersMu.RLock()
defer b.handlersMu.RUnlock()
var healthy []*namedHandler
for _, h := range b.handlers {
if _, ok := b.status[h.name]; ok {
if _, fenced := b.fenced[h.name]; !fenced {
healthy = append(healthy, h)
}
}
}
return healthy
}
// selectWRR selects a server from candidates using Weighted Round Robin (EDF scheduling).
// This is used for tie-breaking when multiple servers have identical scores.
func (b *Balancer) selectWRR(candidates []*namedHandler) *namedHandler {
if len(candidates) == 0 {
return nil
}
selected := candidates[0]
minDeadline := math.MaxFloat64
// Find handler with earliest deadline.
for _, h := range candidates {
handlerDeadline := h.getDeadline()
if handlerDeadline < minDeadline {
minDeadline = handlerDeadline
selected = h
}
}
// Update deadline based on when this server was selected (minDeadline),
// not the global curDeadline. This ensures proper weighted distribution.
newDeadline := minDeadline + 1/selected.weight
selected.setDeadline(newDeadline)
// Track the maximum deadline assigned for initializing new servers.
b.curDeadlineMu.Lock()
if newDeadline > b.curDeadline {
b.curDeadline = newDeadline
}
b.curDeadlineMu.Unlock()
return selected
}
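
To see the weighting, take two tied candidates with hypothetical weights 3 and 1: each selection pushes the winner's deadline back by 1/weight, so the heavier server wins three tie-breaks for every one won by the lighter server. A standalone simulation of the same EDF rule:

package main

import "fmt"

func main() {
	// Initial deadlines are 1/weight, as when servers are first added.
	deadlines := map[string]float64{"a": 1.0 / 3, "b": 1.0}
	weights := map[string]float64{"a": 3, "b": 1}
	for i := 0; i < 8; i++ {
		// Pick the earliest deadline, then push it back by 1/weight.
		sel := "a"
		if deadlines["b"] < deadlines["a"] {
			sel = "b"
		}
		deadlines[sel] += 1 / weights[sel]
		fmt.Print(sel, " ")
	}
	// Output: a a a b a a a b -> a is selected three times as often as b.
}
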
// nextServer selects the healthy server with the lowest score, where
// score = (avgResponseTime × (1 + inflightCount)) / weight.
func (b *Balancer) nextServer() (*namedHandler, error) {
healthy := b.getHealthyServers()
if len(healthy) == 0 {
return nil, errNoAvailableServer
}
if len(healthy) == 1 {
return healthy[0], nil
}
// Calculate scores and find minimum.
minScore := math.MaxFloat64
var candidates []*namedHandler
for _, h := range healthy {
avgRT := h.getAvgResponseTime()
inflight := float64(h.inflightCount.Load())
score := (avgRT * (1 + inflight)) / h.weight
if score < minScore {
minScore = score
candidates = []*namedHandler{h}
} else if score == minScore {
candidates = append(candidates, h)
}
}
if len(candidates) == 1 {
return candidates[0], nil
}
// Multiple servers with same score: use WRR (EDF) tie-breaking.
selected := b.selectWRR(candidates)
if selected == nil {
return nil, errNoAvailableServer
}
return selected, nil
}
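
A worked example of the score with made-up numbers: a server averaging 50 ms TTFB with 2 inflight requests and weight 1 scores 50 × (1 + 2) / 1 = 150, while a server averaging 80 ms with no inflight requests scores 80 × (1 + 0) / 1 = 80, so the slower but idle server is chosen. Note also that a cold server (no samples yet) reports an average of 0 and thus a score of 0, which steers initial traffic toward it until real measurements arrive.
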
func (b *Balancer) ServeHTTP(rw http.ResponseWriter, req *http.Request) {
// Handle sticky sessions first.
if b.sticky != nil {
h, rewrite, err := b.sticky.StickyHandler(req)
if err != nil {
log.Error().Err(err).Msg("Error while getting sticky handler")
} else if h != nil {
b.handlersMu.RLock()
_, ok := b.status[h.Name]
b.handlersMu.RUnlock()
if ok {
if rewrite {
if err := b.sticky.WriteStickyCookie(rw, h.Name); err != nil {
log.Error().Err(err).Msg("Writing sticky cookie")
}
}
h.ServeHTTP(rw, req)
return
}
}
}
server, err := b.nextServer()
if err != nil {
if errors.Is(err, errNoAvailableServer) {
http.Error(rw, errNoAvailableServer.Error(), http.StatusServiceUnavailable)
} else {
http.Error(rw, err.Error(), http.StatusInternalServerError)
}
return
}
if b.sticky != nil {
if err := b.sticky.WriteStickyCookie(rw, server.name); err != nil {
log.Error().Err(err).Msg("Error while writing sticky cookie")
}
}
// Track inflight requests.
server.inflightCount.Add(1)
defer server.inflightCount.Add(-1)
startTime := time.Now()
trace := &httptrace.ClientTrace{
GotFirstResponseByte: func() {
// Update average response time (TTFB).
server.updateResponseTime(time.Since(startTime))
},
}
traceCtx := httptrace.WithClientTrace(req.Context(), trace)
server.ServeHTTP(rw, req.WithContext(traceCtx))
}
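
The trace relies on the proxy propagating this request context to its outbound request; assuming it does, GotFirstResponseByte runs when the first byte of the backend response arrives, making the recorded duration a TTFB measurement. A minimal standalone illustration of the same stdlib mechanism (the URL is just a placeholder):

package main

import (
	"fmt"
	"net/http"
	"net/http/httptrace"
	"time"
)

func main() {
	start := time.Now()
	trace := &httptrace.ClientTrace{
		GotFirstResponseByte: func() {
			fmt.Println("TTFB:", time.Since(start))
		},
	}
	req, err := http.NewRequest(http.MethodGet, "https://example.com", nil)
	if err != nil {
		panic(err)
	}
	req = req.WithContext(httptrace.WithClientTrace(req.Context(), trace))
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	resp.Body.Close()
}
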
// AddServer adds a handler with a server.
func (b *Balancer) AddServer(name string, handler http.Handler, server dynamic.Server) {
b.Add(name, handler, server.Weight, server.Fenced)
}
// Add adds a handler.
// A handler with a non-positive weight is ignored.
func (b *Balancer) Add(name string, handler http.Handler, weight *int, fenced bool) {
w := 1
if weight != nil {
w = *weight
}
if w <= 0 { // non-positive weight is meaningless.
return
}
h := &namedHandler{Handler: handler, name: name, weight: float64(w)}
// Initialize deadline by adding 1/weight to current deadline.
// This staggers servers to prevent all starting at the same time.
var deadline float64
b.curDeadlineMu.RLock()
deadline = b.curDeadline + 1/h.weight
b.curDeadlineMu.RUnlock()
h.setDeadline(deadline)
// Update balancer's current deadline with the new server's deadline.
b.curDeadlineMu.Lock()
b.curDeadline = deadline
b.curDeadlineMu.Unlock()
b.handlersMu.Lock()
b.handlers = append(b.handlers, h)
b.status[name] = struct{}{}
if fenced {
b.fenced[name] = struct{}{}
}
b.handlersMu.Unlock()
if b.sticky != nil {
b.sticky.AddHandler(name, handler)
}
}
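
Taken together, a minimal usage sketch of the new package (the server names, ports, and weight are invented; New and Add are the functions defined above):

package main

import (
	"net/http"
	"net/http/httputil"
	"net/url"

	"github.com/traefik/traefik/v3/pkg/server/service/loadbalancer/leasttime"
)

func main() {
	// No sticky sessions and no health check in this sketch.
	lb := leasttime.New(nil, false)

	for name, raw := range map[string]string{
		"server-a": "http://127.0.0.1:8081",
		"server-b": "http://127.0.0.1:8082",
	} {
		target, err := url.Parse(raw)
		if err != nil {
			panic(err)
		}
		weight := 1
		// Registers the backend as healthy and non-fenced.
		lb.Add(name, httputil.NewSingleHostReverseProxy(target), &weight, false)
	}

	http.ListenAndServe(":8080", lb)
}
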

File diff suppressed because it is too large.


@@ -14,6 +14,8 @@ import (
"github.com/traefik/traefik/v3/pkg/server/service/loadbalancer"
)
+ var errNoAvailableServer = errors.New("no available server")
type namedHandler struct {
http.Handler
@@ -81,7 +83,7 @@ func New(stickyConfig *dynamic.Sticky, wantsHealthCheck bool) *Balancer {
}
// SetStatus sets on the balancer that its given child is now of the given
- // status. balancerName is only needed for logging purposes.
+ // status. childName is only needed for logging purposes.
func (b *Balancer) SetStatus(ctx context.Context, childName string, up bool) {
b.handlersMu.Lock()
defer b.handlersMu.Unlock()
@@ -126,14 +128,12 @@ func (b *Balancer) SetStatus(ctx context.Context, childName string, up bool) {
// Not thread safe.
func (b *Balancer) RegisterStatusUpdater(fn func(up bool)) error {
if !b.wantsHealthCheck {
- return errors.New("healthCheck not enabled in config for this weighted service")
+ return errors.New("healthCheck not enabled in config for this P2C service")
}
b.updaters = append(b.updaters, fn)
return nil
}
- var errNoAvailableServer = errors.New("no available server")
func (b *Balancer) nextServer() (*namedHandler, error) {
// We kept the same representation (map) as in the WRR strategy to improve maintainability.
// However, with the P2C strategy, we only need a slice of healthy servers.


@@ -12,6 +12,8 @@ import (
"github.com/traefik/traefik/v3/pkg/server/service/loadbalancer"
)
+ var errNoAvailableServer = errors.New("no available server")
type namedHandler struct {
http.Handler
name string
@@ -94,7 +96,7 @@ func (b *Balancer) Pop() interface{} {
}
// SetStatus sets on the balancer that its given child is now of the given
- // status. balancerName is only needed for logging purposes.
+ // status. childName is only needed for logging purposes.
func (b *Balancer) SetStatus(ctx context.Context, childName string, up bool) {
b.handlersMu.Lock()
defer b.handlersMu.Unlock()
@@ -139,14 +141,12 @@ func (b *Balancer) SetStatus(ctx context.Context, childName string, up bool) {
// Not thread safe.
func (b *Balancer) RegisterStatusUpdater(fn func(up bool)) error {
if !b.wantsHealthCheck {
- return errors.New("healthCheck not enabled in config for this weighted service")
+ return errors.New("healthCheck not enabled in config for this WRR service")
}
b.updaters = append(b.updaters, fn)
return nil
}
- var errNoAvailableServer = errors.New("no available server")
func (b *Balancer) nextServer() (*namedHandler, error) {
b.handlersMu.Lock()
defer b.handlersMu.Unlock()


@@ -29,6 +29,7 @@ import (
"github.com/traefik/traefik/v3/pkg/server/provider"
"github.com/traefik/traefik/v3/pkg/server/service/loadbalancer/failover"
"github.com/traefik/traefik/v3/pkg/server/service/loadbalancer/hrw"
"github.com/traefik/traefik/v3/pkg/server/service/loadbalancer/leasttime"
"github.com/traefik/traefik/v3/pkg/server/service/loadbalancer/mirror"
"github.com/traefik/traefik/v3/pkg/server/service/loadbalancer/p2c"
"github.com/traefik/traefik/v3/pkg/server/service/loadbalancer/wrr"
@@ -402,6 +403,8 @@ func (m *Manager) getLoadBalancerServiceHandler(ctx context.Context, serviceName
lb = p2c.New(service.Sticky, service.HealthCheck != nil)
case dynamic.BalancerStrategyHRW:
lb = hrw.New(service.HealthCheck != nil)
+ case dynamic.BalancerStrategyLeastTime:
+ lb = leasttime.New(service.Sticky, service.HealthCheck != nil)
default:
return nil, fmt.Errorf("unsupported load-balancer strategy %q", service.Strategy)
}
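
For reference, a service selecting the new strategy through the dynamic configuration would look roughly like the sketch below; the helper function is hypothetical, but the field names and the BalancerStrategyLeastTime constant appear in this diff.

package config

import "github.com/traefik/traefik/v3/pkg/config/dynamic"

// leastTimeService is a hypothetical helper showing the shape of the config.
func leastTimeService() *dynamic.ServersLoadBalancer {
	return &dynamic.ServersLoadBalancer{
		Strategy: dynamic.BalancerStrategyLeastTime,
		Servers: []dynamic.Server{
			{URL: "http://localhost:8080"},
		},
	}
}
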


@@ -82,6 +82,20 @@ func TestGetLoadBalancer(t *testing.T) {
fwd: &forwarderMock{},
expectError: false,
},
+ {
+ desc: "Fails when unsupported strategy is set",
+ serviceName: "test",
+ service: &dynamic.ServersLoadBalancer{
+ Strategy: "invalid",
+ Servers: []dynamic.Server{
+ {
+ URL: "http://localhost:8080",
+ },
+ },
+ },
+ fwd: &forwarderMock{},
+ expectError: true,
+ },
}
for _, test := range testCases {