fix: nodes no longer beg to be kept alive
This commit is contained in:
parent
7c4154a459
commit
a32b0f728e
12 changed files with 146 additions and 62 deletions
|
|
@ -28,6 +28,7 @@ func New(state *state.RuntimeState, config config.MasterConfig) *Role {
|
|||
config.ObserverInterval,
|
||||
config.BackoffSeconds,
|
||||
config.BackoffCount,
|
||||
config.NodeTimeout,
|
||||
),
|
||||
}
|
||||
}
|
||||
|
|
@ -57,7 +58,7 @@ func (r *Role) notify(path types.Path, v any) {
|
|||
}
|
||||
}
|
||||
|
||||
func (r *Role) onJoin(node types.Node) ([]types.Node, error) {
|
||||
func (r *Role) onJoin(node types.Node) (map[string]types.Node, error) {
|
||||
if err := r.state.Registry.AddNode(node); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
|
@ -68,7 +69,7 @@ func (r *Role) onJoin(node types.Node) ([]types.Node, error) {
|
|||
}
|
||||
|
||||
func (r *Role) onLeave(node types.Node) (bool, error) {
|
||||
if err := r.state.Registry.RemoveNode(node.Hostname); err != nil {
|
||||
if err := r.state.Registry.RemoveNode(node); err != nil {
|
||||
return false, err
|
||||
}
|
||||
|
||||
|
|
@ -78,6 +79,8 @@ func (r *Role) onLeave(node types.Node) (bool, error) {
|
|||
}
|
||||
|
||||
func (r *Role) onKeepAlive(node types.Node) (bool, error) {
|
||||
r.observer.onKeepAlive(node)
|
||||
|
||||
if ok := r.state.Registry.Exists(node.Hostname); !ok {
|
||||
_, err := r.onJoin(node)
|
||||
return true, err
|
||||
|
|
|
|||
|
|
@ -2,8 +2,10 @@ package master
|
|||
|
||||
import (
|
||||
"context"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"git.wzray.com/homelab/hivemind/internal/registry"
|
||||
"git.wzray.com/homelab/hivemind/internal/state"
|
||||
"git.wzray.com/homelab/hivemind/internal/types"
|
||||
"git.wzray.com/homelab/hivemind/internal/web/client"
|
||||
|
|
@ -15,6 +17,9 @@ type observer struct {
|
|||
interval int
|
||||
backoff int
|
||||
backoffCount int
|
||||
nodeTimeout int64
|
||||
keepalives map[string]int64
|
||||
lock sync.RWMutex
|
||||
}
|
||||
|
||||
func newObserver(
|
||||
|
|
@ -22,17 +27,35 @@ func newObserver(
|
|||
interval int,
|
||||
backoff int,
|
||||
backoffCount int,
|
||||
nodeTimeout int,
|
||||
) *observer {
|
||||
return &observer{
|
||||
state: state,
|
||||
interval: interval,
|
||||
backoff: backoff,
|
||||
backoffCount: backoffCount,
|
||||
nodeTimeout: int64(nodeTimeout) * 1000,
|
||||
keepalives: make(map[string]int64),
|
||||
}
|
||||
}
|
||||
|
||||
func (o *observer) pollNodes(ctx context.Context, onLeave func(types.Node) error) {
|
||||
for _, n := range o.state.Registry.Nodes() {
|
||||
nodes := o.state.Registry.Nodes()
|
||||
toCheck := make([]types.Node, 0, len(nodes))
|
||||
now := time.Now().UnixMilli()
|
||||
|
||||
for name, last := range o.keepalives {
|
||||
if name == o.state.Self.Hostname {
|
||||
continue
|
||||
}
|
||||
|
||||
if now-last > o.nodeTimeout {
|
||||
toCheck = append(toCheck, nodes[name])
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: think about this
|
||||
for _, n := range toCheck {
|
||||
name := n.Hostname
|
||||
logger := log.With().Str("name", name).Logger()
|
||||
logger.Debug().Msg("checking node")
|
||||
|
|
@ -71,9 +94,42 @@ func (o *observer) pollNodes(ctx context.Context, onLeave func(types.Node) error
|
|||
}
|
||||
}
|
||||
|
||||
func (o *observer) onKeepAlive(node types.Node) {
|
||||
o.lock.Lock()
|
||||
defer o.lock.Unlock()
|
||||
o.keepalives[node.Hostname] = time.Now().UnixMilli()
|
||||
}
|
||||
|
||||
func (o *observer) onEvent(event registry.RegistryEvent) {
|
||||
o.lock.Lock()
|
||||
defer o.lock.Unlock()
|
||||
|
||||
switch event.Event {
|
||||
case registry.EventNodeJoin:
|
||||
for _, node := range event.Nodes {
|
||||
o.keepalives[node.Hostname] = time.Now().UnixMilli()
|
||||
}
|
||||
case registry.EventNodeLeave:
|
||||
for _, node := range event.Nodes {
|
||||
delete(o.keepalives, node.Hostname)
|
||||
}
|
||||
case registry.EventSet:
|
||||
o.keepalives = make(map[string]int64)
|
||||
now := time.Now().UnixMilli()
|
||||
|
||||
for _, node := range event.Nodes {
|
||||
o.keepalives[node.Hostname] = now
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (o *observer) Start(ctx context.Context, onLeave func(types.Node) error) {
|
||||
registryEvents := o.state.Registry.Subscribe()
|
||||
|
||||
for {
|
||||
select {
|
||||
case n := <-registryEvents:
|
||||
o.onEvent(n)
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-time.After(time.Duration(o.interval) * time.Second):
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue