ir: Add health status reporting on reconfiguration #1311

Merged
fyrchik merged 2 commits from elebedeva/frostfs-node:fix/ir-reload-notify-systemd into master 2024-09-04 19:51:11 +00:00
9 changed files with 55 additions and 18 deletions

View file

@ -7,6 +7,7 @@ import (
configViper "git.frostfs.info/TrueCloudLab/frostfs-node/cmd/internal/common/config"
"git.frostfs.info/TrueCloudLab/frostfs-node/internal/logs"
control "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/services/control/ir"
"github.com/spf13/viper"
"go.uber.org/zap"
)
@ -81,6 +82,10 @@ func watchForSignal(cancel func()) {
return
case <-sighupCh:
log.Info(logs.FrostFSNodeSIGHUPHasBeenReceivedRereadingConfiguration)
if !innerRing.CompareAndSwapHealthStatus(control.HealthStatus_READY, control.HealthStatus_RECONFIGURING) {
log.Info(logs.FrostFSNodeSIGHUPSkip)
break
}
err := reloadConfig()
if err != nil {
log.Error(logs.FrostFSNodeConfigurationReading, zap.Error(err))
@ -92,6 +97,7 @@ func watchForSignal(cancel func()) {
if err != nil {
log.Error(logs.FrostFSNodeConfigurationReading, zap.Error(err))
}
innerRing.CompareAndSwapHealthStatus(control.HealthStatus_RECONFIGURING, control.HealthStatus_READY)
log.Info(logs.FrostFSNodeConfigurationHasBeenReloadedSuccessfully)
}
}

6
go.mod
View file

@ -59,7 +59,7 @@ require (
github.com/beorn7/perks v1.0.1 // indirect
github.com/bits-and-blooms/bitset v1.13.0 // indirect
github.com/cenkalti/backoff/v4 v4.2.1 // indirect
github.com/cespare/xxhash/v2 v2.2.0 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/consensys/bavard v0.1.13 // indirect
github.com/consensys/gnark-crypto v0.12.2-0.20231222162921-eb75782795d2 // indirect
github.com/cpuguy83/go-md2man/v2 v2.0.4 // indirect
@ -120,8 +120,8 @@ require (
golang.org/x/crypto v0.21.0 // indirect
golang.org/x/net v0.23.0 // indirect
golang.org/x/text v0.16.0 // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20240311132316-a219d84964c2 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20240314234333-6e1732d8331c // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20240318140521-94a12d6c2237 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20240318140521-94a12d6c2237 // indirect
gopkg.in/ini.v1 v1.67.0 // indirect
lukechampine.com/blake3 v1.2.1 // indirect
rsc.io/tmplfunc v0.0.3 // indirect

12
go.sum
View file

@ -33,8 +33,8 @@ github.com/bits-and-blooms/bitset v1.13.0 h1:bAQ9OPNFYbGHV6Nez0tmNI0RiEu7/hxlYJR
github.com/bits-and-blooms/bitset v1.13.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
github.com/cenkalti/backoff/v4 v4.2.1 h1:y4OZtCnogmCPw98Zjyt5a6+QwPLGkiQsYW5oUqylYbM=
github.com/cenkalti/backoff/v4 v4.2.1/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE=
github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44=
github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/cheggaaa/pb v1.0.29 h1:FckUN5ngEk2LpvuG0fw1GEFx6LtyY2pWI/Z2QgCnEYo=
github.com/cheggaaa/pb v1.0.29/go.mod h1:W40334L7FMC5JKWldsTWbdGjLo0RxUKK73K+TuPxX30=
github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI=
@ -380,10 +380,10 @@ golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8T
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20220517211312-f3a8303e98df/go.mod h1:K8+ghG5WaK9qNqU5K3HdILfMLy1f3aNYFI/wnl100a8=
google.golang.org/genproto/googleapis/api v0.0.0-20240311132316-a219d84964c2 h1:rIo7ocm2roD9DcFIX67Ym8icoGCKSARAiPljFhh5suQ=
google.golang.org/genproto/googleapis/api v0.0.0-20240311132316-a219d84964c2/go.mod h1:O1cOfN1Cy6QEYr7VxtjOyP5AdAuR0aJ/MYZaaof623Y=
google.golang.org/genproto/googleapis/rpc v0.0.0-20240314234333-6e1732d8331c h1:lfpJ/2rWPa/kJgxyyXM8PrNnfCzcmxJ265mADgwmvLI=
google.golang.org/genproto/googleapis/rpc v0.0.0-20240314234333-6e1732d8331c/go.mod h1:WtryC6hu0hhx87FDGxWCDptyssuo68sk10vYjF+T9fY=
google.golang.org/genproto/googleapis/api v0.0.0-20240318140521-94a12d6c2237 h1:RFiFrvy37/mpSpdySBDrUdipW/dHwsRwh3J3+A9VgT4=
google.golang.org/genproto/googleapis/api v0.0.0-20240318140521-94a12d6c2237/go.mod h1:Z5Iiy3jtmioajWHDGFk7CeugTyHtPvMHA4UTmUkyalE=
google.golang.org/genproto/googleapis/rpc v0.0.0-20240318140521-94a12d6c2237 h1:NnYq6UN9ReLM9/Y01KWNOWyI5xQ9kbIms5GGJVwS/Yc=
google.golang.org/genproto/googleapis/rpc v0.0.0-20240318140521-94a12d6c2237/go.mod h1:WtryC6hu0hhx87FDGxWCDptyssuo68sk10vYjF+T9fY=
google.golang.org/grpc v1.63.2 h1:MUeiw1B2maTVZthpU5xvASfTh3LDbxHd6IJ6QQVU+xM=
google.golang.org/grpc v1.63.2/go.mod h1:WAX/8DgncnokcFUldAxq7GeB5DXHDbMF+lLvDomNkRA=
google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8=

View file

@ -161,6 +161,16 @@ func (s *Server) setHealthStatus(hs control.HealthStatus) {
}
}
// CompareAndSwapHealthStatus switches the stored health status from oldSt to
// newSt through healthStatus.CompareAndSwap and reports whether the switch
// took place.
//
// Only when the swap succeeds is the new status forwarded to systemd via
// notifySystemd and, if irMetrics is set, to the metrics via SetHealth.
// When the stored status differs from oldSt, nothing is changed or reported
// and false is returned.
func (s *Server) CompareAndSwapHealthStatus(oldSt, newSt control.HealthStatus) (swapped bool) {
	// Guard: the stored status was not oldSt, so there is nothing to publish.
	if !s.healthStatus.CompareAndSwap(int32(oldSt), int32(newSt)) {
		return false
	}

	s.notifySystemd(newSt)

	// Metrics are optional; skip the update when they are not configured.
	if s.irMetrics != nil {
		s.irMetrics.SetHealth(int32(newSt))
	}

	return true
}
// HealthStatus returns the current health status of the IR application.
func (s *Server) HealthStatus() control.HealthStatus {
return control.HealthStatus(s.healthStatus.Load())
@ -186,6 +196,8 @@ func (s *Server) notifySystemd(st control.HealthStatus) {
err = sdnotify.FlagAndStatus(sdnotify.ReadyEnabled)
case control.HealthStatus_SHUTTING_DOWN:
err = sdnotify.FlagAndStatus(sdnotify.StoppingEnabled)
case control.HealthStatus_RECONFIGURING:
err = sdnotify.FlagAndStatus(sdnotify.ReloadingEnabled)
default:
err = sdnotify.Status(fmt.Sprintf("%v", st))
}

View file

@ -32,6 +32,8 @@ const (
HealthStatus_READY HealthStatus = 2
// IR application is shutting down.
HealthStatus_SHUTTING_DOWN HealthStatus = 3
// IR application is reconfiguring.
HealthStatus_RECONFIGURING HealthStatus = 4
)
// Enum value maps for HealthStatus.
@ -41,12 +43,14 @@ var (
1: "STARTING",
2: "READY",
3: "SHUTTING_DOWN",
4: "RECONFIGURING",
}
HealthStatus_value = map[string]int32{
"HEALTH_STATUS_UNDEFINED": 0,
"STARTING": 1,
"READY": 2,
"SHUTTING_DOWN": 3,
"RECONFIGURING": 4,
}
)
@ -144,17 +148,19 @@ var file_pkg_services_control_ir_types_proto_rawDesc = []byte{
0x22, 0x36, 0x0a, 0x09, 0x53, 0x69, 0x67, 0x6e, 0x61, 0x74, 0x75, 0x72, 0x65, 0x12, 0x10, 0x0a,
0x03, 0x6b, 0x65, 0x79, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x03, 0x6b, 0x65, 0x79, 0x12,
0x17, 0x0a, 0x04, 0x73, 0x69, 0x67, 0x6e, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x09, 0x73,
0x69, 0x67, 0x6e, 0x61, 0x74, 0x75, 0x72, 0x65, 0x2a, 0x57, 0x0a, 0x0c, 0x48, 0x65, 0x61, 0x6c,
0x69, 0x67, 0x6e, 0x61, 0x74, 0x75, 0x72, 0x65, 0x2a, 0x6a, 0x0a, 0x0c, 0x48, 0x65, 0x61, 0x6c,
0x74, 0x68, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x1b, 0x0a, 0x17, 0x48, 0x45, 0x41, 0x4c,
0x54, 0x48, 0x5f, 0x53, 0x54, 0x41, 0x54, 0x55, 0x53, 0x5f, 0x55, 0x4e, 0x44, 0x45, 0x46, 0x49,
0x4e, 0x45, 0x44, 0x10, 0x00, 0x12, 0x0c, 0x0a, 0x08, 0x53, 0x54, 0x41, 0x52, 0x54, 0x49, 0x4e,
0x47, 0x10, 0x01, 0x12, 0x09, 0x0a, 0x05, 0x52, 0x45, 0x41, 0x44, 0x59, 0x10, 0x02, 0x12, 0x11,
0x0a, 0x0d, 0x53, 0x48, 0x55, 0x54, 0x54, 0x49, 0x4e, 0x47, 0x5f, 0x44, 0x4f, 0x57, 0x4e, 0x10,
0x03, 0x42, 0x44, 0x5a, 0x42, 0x67, 0x69, 0x74, 0x2e, 0x66, 0x72, 0x6f, 0x73, 0x74, 0x66, 0x73,
0x2e, 0x69, 0x6e, 0x66, 0x6f, 0x2f, 0x54, 0x72, 0x75, 0x65, 0x43, 0x6c, 0x6f, 0x75, 0x64, 0x4c,
0x61, 0x62, 0x2f, 0x66, 0x72, 0x6f, 0x73, 0x74, 0x66, 0x73, 0x2d, 0x6e, 0x6f, 0x64, 0x65, 0x2f,
0x70, 0x6b, 0x67, 0x2f, 0x73, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x73, 0x2f, 0x69, 0x72, 0x2f,
0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33,
0x03, 0x12, 0x11, 0x0a, 0x0d, 0x52, 0x45, 0x43, 0x4f, 0x4e, 0x46, 0x49, 0x47, 0x55, 0x52, 0x49,
0x4e, 0x47, 0x10, 0x04, 0x42, 0x44, 0x5a, 0x42, 0x67, 0x69, 0x74, 0x2e, 0x66, 0x72, 0x6f, 0x73,
0x74, 0x66, 0x73, 0x2e, 0x69, 0x6e, 0x66, 0x6f, 0x2f, 0x54, 0x72, 0x75, 0x65, 0x43, 0x6c, 0x6f,
0x75, 0x64, 0x4c, 0x61, 0x62, 0x2f, 0x66, 0x72, 0x6f, 0x73, 0x74, 0x66, 0x73, 0x2d, 0x6e, 0x6f,
0x64, 0x65, 0x2f, 0x70, 0x6b, 0x67, 0x2f, 0x73, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x73, 0x2f,
0x69, 0x72, 0x2f, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74,
0x6f, 0x33,
}
var (

View file

@ -26,4 +26,7 @@ enum HealthStatus {
// IR application is shutting down.
SHUTTING_DOWN = 3;
// IR application is reconfiguring.
RECONFIGURING = 4;
}

View file

@ -0,0 +1,10 @@
package sdnotify
import (
// For go:linkname to work.
_ "unsafe"
)
// nanotime is declared without a body: the go:linkname directive below binds
// it to runtime.nanotime, so the implementation comes from the Go runtime.
// NOTE(review): presumably returns a monotonic clock reading in nanoseconds;
// confirm against the Go runtime before relying on the unit.
//
//go:noescape
//go:linkname nanotime runtime.nanotime
func nanotime() int64

View file

@ -0,0 +1,2 @@
// The file is intentionally empty.
// It is a workaround for https://github.com/golang/go/issues/15006

View file

@ -6,7 +6,6 @@ import (
"net"
"os"
"strings"
"time"
)
const (
@ -17,7 +16,6 @@ const (
var (
socket *net.UnixAddr
start = time.Now()
errSocketVariableIsNotPresent = errors.New("\"NOTIFY_SOCKET\" environment variable is not present")
errSocketIsNotInitialized = errors.New("socket is not initialized")
@ -53,7 +51,7 @@ func FlagAndStatus(status string) error {
// must be sent, containing "READY=1".
//
// For MONOTONIC_USEC format refer to https://www.man7.org/linux/man-pages/man3/sd_notify.3.html
fyrchik marked this conversation as resolved
Review

Maybe in IR SIGHUP is so fast that we send the same MONOTONIC_USEC? This would've explained the problem.

Maybe in IR SIGHUP is so fast that we send the same `MONOTONIC_USEC`? This would've explained the problem.
Review

Receiving the same MONOTONIC_USEC doesn't seem like a problem to systemd. I tried sending time.Since(time.Time{}) (always the same value), systemd is OK with it, service reload is successful.

It appears that systemd does not accept values below some minimum or above some maximum, and the time in microseconds since start is deemed too small. Time in nanoseconds since start works fine most of the time, but not always (I've seen hang-ups a couple of times). I haven't found those min and max values in the systemd source code yet.

Passing a math.MaxInt64 as MONOTONIC_USEC works fine but math.MaxUint64 is not accepted.

Receiving the same `MONOTONIC_USEC` doesn't seem to be a problem for `systemd`. I tried sending `time.Since(time.Time{})` (always the same value); `systemd` is OK with it, and the service reload is successful. It appears that `systemd` does not accept values below some minimum or above some maximum, and the time in microseconds since start is deemed too small. Time in nanoseconds since start works fine most of the time, but not always (I've seen hang-ups a couple of times). I haven't found those min and max values in the systemd source code yet. Passing `math.MaxInt64` as `MONOTONIC_USEC` works fine, but `math.MaxUint64` is not accepted.
status += fmt.Sprintf("\nMONOTONIC_USEC=%d", uint64(time.Since(start))/1e3 /* microseconds in nanoseconds */)
status += fmt.Sprintf("\nMONOTONIC_USEC=%d", uint64(nanotime())/1e3 /* microseconds in nanoseconds */)
}
status += "\nSTATUS=" + strings.TrimSuffix(status, "=1")
return Send(status)