ir: Add health status reporting on reconfiguration #1311

Merged
fyrchik merged 2 commits from elebedeva/frostfs-node:fix/ir-reload-notify-systemd into master 2024-09-04 19:51:11 +00:00
9 changed files with 37 additions and 6 deletions

View file

@ -7,6 +7,7 @@ import (
configViper "git.frostfs.info/TrueCloudLab/frostfs-node/cmd/internal/common/config"
"git.frostfs.info/TrueCloudLab/frostfs-node/internal/logs"
control "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/services/control/ir"
"github.com/spf13/viper"
"go.uber.org/zap"
)
@ -81,6 +82,10 @@ func watchForSignal(cancel func()) {
return
case <-sighupCh:
log.Info(logs.FrostFSNodeSIGHUPHasBeenReceivedRereadingConfiguration)
if !innerRing.CompareAndSwapHealthStatus(control.HealthStatus_READY, control.HealthStatus_RECONFIGURING) {
log.Info(logs.FrostFSNodeSIGHUPSkip)
break
}
err := reloadConfig()
if err != nil {
log.Error(logs.FrostFSNodeConfigurationReading, zap.Error(err))
@ -92,6 +97,7 @@ func watchForSignal(cancel func()) {
if err != nil {
log.Error(logs.FrostFSNodeConfigurationReading, zap.Error(err))
}
innerRing.CompareAndSwapHealthStatus(control.HealthStatus_RECONFIGURING, control.HealthStatus_READY)
log.Info(logs.FrostFSNodeConfigurationHasBeenReloadedSuccessfully)
}
}

6
go.mod
View file

@ -59,7 +59,7 @@ require (
github.com/beorn7/perks v1.0.1 // indirect
github.com/bits-and-blooms/bitset v1.13.0 // indirect
github.com/cenkalti/backoff/v4 v4.2.1 // indirect
github.com/cespare/xxhash/v2 v2.2.0 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/consensys/bavard v0.1.13 // indirect
github.com/consensys/gnark-crypto v0.12.2-0.20231222162921-eb75782795d2 // indirect
github.com/cpuguy83/go-md2man/v2 v2.0.4 // indirect
@ -120,8 +120,8 @@ require (
golang.org/x/crypto v0.21.0 // indirect
golang.org/x/net v0.23.0 // indirect
golang.org/x/text v0.16.0 // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20240311132316-a219d84964c2 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20240314234333-6e1732d8331c // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20240318140521-94a12d6c2237 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20240318140521-94a12d6c2237 // indirect
gopkg.in/ini.v1 v1.67.0 // indirect
lukechampine.com/blake3 v1.2.1 // indirect
rsc.io/tmplfunc v0.0.3 // indirect

BIN
go.sum

Binary file not shown.

View file

@ -161,6 +161,16 @@ func (s *Server) setHealthStatus(hs control.HealthStatus) {
}
}
func (s *Server) CompareAndSwapHealthStatus(oldSt, newSt control.HealthStatus) (swapped bool) {
if swapped = s.healthStatus.CompareAndSwap(int32(oldSt), int32(newSt)); swapped {
s.notifySystemd(newSt)
if s.irMetrics != nil {
s.irMetrics.SetHealth(int32(newSt))
}
}
return
}
// HealthStatus returns the current health status of the IR application.
func (s *Server) HealthStatus() control.HealthStatus {
return control.HealthStatus(s.healthStatus.Load())
@ -186,6 +196,8 @@ func (s *Server) notifySystemd(st control.HealthStatus) {
err = sdnotify.FlagAndStatus(sdnotify.ReadyEnabled)
case control.HealthStatus_SHUTTING_DOWN:
err = sdnotify.FlagAndStatus(sdnotify.StoppingEnabled)
case control.HealthStatus_RECONFIGURING:
err = sdnotify.FlagAndStatus(sdnotify.ReloadingEnabled)
default:
err = sdnotify.Status(fmt.Sprintf("%v", st))
}

Binary file not shown.

View file

@ -26,4 +26,7 @@ enum HealthStatus {
// IR application is shutting down.
SHUTTING_DOWN = 3;
// IR application is reconfiguring.
RECONFIGURING = 4;
}

View file

@ -0,0 +1,10 @@
package sdnotify
import (
// For go:linkname to work.
_ "unsafe"
)
//go:noescape
//go:linkname nanotime runtime.nanotime
func nanotime() int64

View file

@ -0,0 +1,2 @@
// The file is intentionally empty.
// It is a workaround for https://github.com/golang/go/issues/15006

View file

@ -6,7 +6,6 @@ import (
"net"
"os"
"strings"
"time"
)
const (
@ -17,7 +16,6 @@ const (
var (
socket *net.UnixAddr
start = time.Now()
errSocketVariableIsNotPresent = errors.New("\"NOTIFY_SOCKET\" environment variable is not present")
errSocketIsNotInitialized = errors.New("socket is not initialized")
@ -53,7 +51,7 @@ func FlagAndStatus(status string) error {
// must be sent, containing "READY=1".
//
// For MONOTONIC_USEC format refer to https://www.man7.org/linux/man-pages/man3/sd_notify.3.html
fyrchik marked this conversation as resolved
Review

Maybe in IR SIGHUP is so fast that we send the same MONOTONIC_USEC? This would've explained the problem.

Maybe in IR SIGHUP is so fast that we send the same `MONOTONIC_USEC`? This would've explained the problem.
Review

Receiving the same MONOTONIC_USEC doesn't seem like a problem to systemd. I tried sending time.Since(time.Time{}) (always the same value), systemd is OK with it, service reload is successful.

It appears as if systemd does not accept values less than some minimum and greater than some maximum, and time in us since start is deemed as being too small. Time in ns since start works fine most of the time but not always (i've got hang-ups a couple times). Haven't found those min & max values in systemd source code yet.

Passing a math.MaxInt64 as MONOTONIC_USEC works fine but math.MaxUint64 is not accepted.

Receiving the same `MONOTONIC_USEC` doesn't seem like a problem to `systemd`. I tried sending `time.Since(time.Time{})` (always the same value), `systemd` is OK with it, service reload is successful. It appears as if `systemd` does not accept values less than some minimum and greater than some maximum, and time in us since start is deemed as being too small. Time in ns since start works fine most of the time but not always (i've got hang-ups a couple times). Haven't found those min & max values in systemd source code yet. Passing a `math.MaxInt64` as `MONOTONIC_USEC` works fine but `math.MaxUint64` is not accepted.
status += fmt.Sprintf("\nMONOTONIC_USEC=%d", uint64(time.Since(start))/1e3 /* microseconds in nanoseconds */)
status += fmt.Sprintf("\nMONOTONIC_USEC=%d", uint64(nanotime())/1e3 /* microseconds in nanoseconds */)
}
status += "\nSTATUS=" + strings.TrimSuffix(status, "=1")
return Send(status)