systemd notify protocol #810

Merged
fyrchik merged 5 commits from elebedeva/frostfs-node:sd-notify into master 2024-09-04 19:51:04 +00:00
12 changed files with 144 additions and 6 deletions

View file

@ -62,6 +62,7 @@ import (
"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/services/util/response" "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/services/util/response"
"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/util" "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/util"
"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/util/logger" "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/util/logger"
"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/util/sdnotify"
"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/util/state" "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/util/state"
"git.frostfs.info/TrueCloudLab/frostfs-observability/logging/lokicore" "git.frostfs.info/TrueCloudLab/frostfs-observability/logging/lokicore"
"git.frostfs.info/TrueCloudLab/frostfs-observability/tracing" "git.frostfs.info/TrueCloudLab/frostfs-observability/tracing"
@ -361,6 +362,8 @@ type internals struct {
healthStatus *atomic.Int32 healthStatus *atomic.Int32
// is node under maintenance // is node under maintenance
isMaintenance atomic.Bool isMaintenance atomic.Bool
sdNotify bool
} }
// starts node's maintenance. // starts node's maintenance.
@ -632,9 +635,18 @@ func initInternals(appCfg *config.Config, log *logger.Logger) internals {
log: log, log: log,
apiVersion: version.Current(), apiVersion: version.Current(),
healthStatus: &healthStatus, healthStatus: &healthStatus,
sdNotify: initSdNotify(appCfg),
} }
} }
func initSdNotify(appCfg *config.Config) bool {
if config.BoolSafe(appCfg.Sub("systemdnotify"), "enabled") {
acid-ant marked this conversation as resolved Outdated

Do we need this variable?

Do we need this variable?

no, removed

no, removed
fatalOnErr(sdnotify.InitSocket())
return true
}
return false
acid-ant marked this conversation as resolved Outdated

Can you merge above two lines?

Can you merge above two lines?
}
func initShared(appCfg *config.Config, key *keys.PrivateKey, netState *networkState, relayOnly bool) shared { func initShared(appCfg *config.Config, key *keys.PrivateKey, netState *networkState, relayOnly bool) shared {
var netAddr network.AddressGroup var netAddr network.AddressGroup

View file

@ -2,6 +2,7 @@ package main
import ( import (
"context" "context"
"fmt"
"net" "net"
controlconfig "git.frostfs.info/TrueCloudLab/frostfs-node/cmd/frostfs-node/config/control" controlconfig "git.frostfs.info/TrueCloudLab/frostfs-node/cmd/frostfs-node/config/control"
@ -9,6 +10,7 @@ import (
"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/services/control" "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/services/control"
controlSvc "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/services/control/server" controlSvc "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/services/control/server"
"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/services/tree" "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/services/tree"
"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/util/sdnotify"
cid "git.frostfs.info/TrueCloudLab/frostfs-sdk-go/container/id" cid "git.frostfs.info/TrueCloudLab/frostfs-sdk-go/container/id"
"go.uber.org/zap" "go.uber.org/zap"
"google.golang.org/grpc" "google.golang.org/grpc"
@ -83,12 +85,14 @@ func (c *cfg) NetmapStatus() control.NetmapStatus {
} }
func (c *cfg) setHealthStatus(st control.HealthStatus) { func (c *cfg) setHealthStatus(st control.HealthStatus) {
fyrchik marked this conversation as resolved Outdated

Can we move fmt.Sprintf to a separate function which accepts st? It can later be extended to accept any string and there is less opportunities to make an error somewhere.

Can we move `fmt.Sprintf` to a separate function which accepts `st`? It can later be extended to accept any string and there is less opportunities to make an error somewhere.

Logging can also be done inside this function.

Logging can also be done inside this function.

done

done
c.notifySystemd(st)
c.healthStatus.Store(int32(st)) c.healthStatus.Store(int32(st))
c.metricsCollector.State().SetHealth(int32(st)) c.metricsCollector.State().SetHealth(int32(st))
} }
func (c *cfg) compareAndSwapHealthStatus(oldSt, newSt control.HealthStatus) (swapped bool) { func (c *cfg) compareAndSwapHealthStatus(oldSt, newSt control.HealthStatus) (swapped bool) {
if swapped = c.healthStatus.CompareAndSwap(int32(oldSt), int32(newSt)); swapped { if swapped = c.healthStatus.CompareAndSwap(int32(oldSt), int32(newSt)); swapped {
c.notifySystemd(newSt)
c.metricsCollector.State().SetHealth(int32(newSt)) c.metricsCollector.State().SetHealth(int32(newSt))
} }

What happens here is sdnotify is disabled?

What happens here is `sdnotify` is disabled?

sysdnotify.Status() returns an error connection refused, and this error will be logged.

```sysdnotify.Status()``` returns an error ```connection refused```, and this error will be logged.
return return
@ -96,6 +100,7 @@ func (c *cfg) compareAndSwapHealthStatus(oldSt, newSt control.HealthStatus) (swa
func (c *cfg) swapHealthStatus(st control.HealthStatus) (old control.HealthStatus) { func (c *cfg) swapHealthStatus(st control.HealthStatus) (old control.HealthStatus) {
old = control.HealthStatus(c.healthStatus.Swap(int32(st))) old = control.HealthStatus(c.healthStatus.Swap(int32(st)))
c.notifySystemd(st)
c.metricsCollector.State().SetHealth(int32(st)) c.metricsCollector.State().SetHealth(int32(st))
return return
} }
@ -103,3 +108,23 @@ func (c *cfg) swapHealthStatus(st control.HealthStatus) (old control.HealthStatu
func (c *cfg) HealthStatus() control.HealthStatus { func (c *cfg) HealthStatus() control.HealthStatus {
return control.HealthStatus(c.healthStatus.Load()) return control.HealthStatus(c.healthStatus.Load())
} }
func (c *cfg) notifySystemd(st control.HealthStatus) {
if !c.sdNotify {

This way allows to drop indent:

if !c.sdNotify {
     return
}
var err error

Not required to fix, just FYI.

This way allows to drop indent: ``` if !c.sdNotify { return } var err error ``` Not required to fix, just FYI.
return
}
var err error

How about merging above three lines in one?

How about merging above three lines in one?
switch st {
fyrchik marked this conversation as resolved Outdated

It is debug log? I believe this is not worth logging especially with info.

It is debug log? I believe this is not worth logging especially with `info`.

removed

removed
aarifullin marked this conversation as resolved Outdated

My point is optional, but how about to introduce constants like

const (
   SystemdReadyEnabled      = "READY=1"
   SystemdStoppingEnabled   = "STOPPING=1"
   SystemdReloadingEnabled  = "RELOADING=1"
)

and replace these strings by the constants?

My point is optional, but how about to introduce constants like ```go const ( SystemdReadyEnabled = "READY=1" SystemdStoppingEnabled = "STOPPING=1" SystemdReloadingEnabled = "RELOADING=1" ) ``` and replace these strings by the constants?

added constants

added constants
case control.HealthStatus_READY:
err = sdnotify.FlagAndStatus(sdnotify.ReadyEnabled)
case control.HealthStatus_SHUTTING_DOWN:
err = sdnotify.FlagAndStatus(sdnotify.StoppingEnabled)
case control.HealthStatus_RECONFIGURING:
err = sdnotify.FlagAndStatus(sdnotify.ReloadingEnabled)
default:
err = sdnotify.Status(fmt.Sprintf("%v", st))
}
if err != nil {
c.log.Error(logs.FailedToReportStatusToSystemd, zap.Error(err))
}
}

View file

@ -23,7 +23,8 @@ var _ engine.LocalOverrideEngine = (*accessPolicyEngine)(nil)
func newAccessPolicyEngine( func newAccessPolicyEngine(
morphChainStorage engine.MorphRuleChainStorage, morphChainStorage engine.MorphRuleChainStorage,
localOverrideDatabase chainbase.LocalOverrideDatabase) *accessPolicyEngine { localOverrideDatabase chainbase.LocalOverrideDatabase,
) *accessPolicyEngine {
return &accessPolicyEngine{ return &accessPolicyEngine{
chainRouter: engine.NewDefaultChainRouterWithLocalOverrides( chainRouter: engine.NewDefaultChainRouterWithLocalOverrides(
morphChainStorage, morphChainStorage,

View file

@ -119,3 +119,6 @@ prometheus:
enabled: true enabled: true
address: localhost:9090 # Endpoint for application prometheus metrics; disabled by default address: localhost:9090 # Endpoint for application prometheus metrics; disabled by default
shutdown_timeout: 30s # Timeout for metrics HTTP server graceful shutdown shutdown_timeout: 30s # Timeout for metrics HTTP server graceful shutdown
systemdnotify:
enabled: true

View file

@ -1,6 +1,9 @@
logger: logger:
level: debug # logger level: one of "debug", "info" (default), "warn", "error", "dpanic", "panic", "fatal" level: debug # logger level: one of "debug", "info" (default), "warn", "error", "dpanic", "panic", "fatal"
systemdnotify:
enabled: true
pprof: pprof:
enabled: true enabled: true
address: localhost:6060 # endpoint for Node profiling address: localhost:6060 # endpoint for Node profiling

View file

@ -3,7 +3,8 @@ Description=FrostFS InnerRing node
Requires=network.target Requires=network.target
[Service] [Service]
Type=simple Type=notify
NotifyAccess=all
ExecStart=/usr/bin/frostfs-ir --config /etc/frostfs/ir/config.yml ExecStart=/usr/bin/frostfs-ir --config /etc/frostfs/ir/config.yml
User=frostfs-ir User=frostfs-ir
Group=frostfs-ir Group=frostfs-ir

View file

@ -3,7 +3,8 @@ Description=FrostFS Storage node
Requires=network.target Requires=network.target
[Service] [Service]
Type=simple Type=notify
NotifyAccess=all
ExecStart=/usr/bin/frostfs-node --config /etc/frostfs/storage/config.yml ExecStart=/usr/bin/frostfs-node --config /etc/frostfs/storage/config.yml
User=frostfs-storage User=frostfs-storage
Group=frostfs-storage Group=frostfs-storage

View file

@ -554,4 +554,5 @@ const (
BlobovniczaSavingCountersToMetaSuccess = "saving counters to blobovnicza's meta completed successfully" BlobovniczaSavingCountersToMetaSuccess = "saving counters to blobovnicza's meta completed successfully"
BlobovniczaSavingCountersToMetaFailed = "saving counters to blobovnicza's meta failed" BlobovniczaSavingCountersToMetaFailed = "saving counters to blobovnicza's meta failed"
ObjectRemovalFailureExistsInWritecache = "can't remove object: object must be flushed from writecache" ObjectRemovalFailureExistsInWritecache = "can't remove object: object must be flushed from writecache"
FailedToReportStatusToSystemd = "failed to report status to systemd"
) )

View file

@ -22,9 +22,7 @@ type boltLocalOverrideStorage struct {
db *bbolt.DB db *bbolt.DB
} }
var ( var chainBucket = []byte{0}
chainBucket = []byte{0}
)
var ( var (
ErrChainBucketNotFound = logicerr.New("chain root bucket has not been found") ErrChainBucketNotFound = logicerr.New("chain root bucket has not been found")

View file

@ -24,6 +24,7 @@ import (
control "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/services/control/ir" control "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/services/control/ir"
"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/util/logger" "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/util/logger"
"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/util/precision" "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/util/precision"
"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/util/sdnotify"
"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/util/state" "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/util/state"
"github.com/nspcc-dev/neo-go/pkg/core/block" "github.com/nspcc-dev/neo-go/pkg/core/block"
"github.com/nspcc-dev/neo-go/pkg/core/transaction" "github.com/nspcc-dev/neo-go/pkg/core/transaction"
@ -73,6 +74,7 @@ type (
predefinedValidators keys.PublicKeys predefinedValidators keys.PublicKeys
initialEpochTickDelta uint32 initialEpochTickDelta uint32
withoutMainNet bool withoutMainNet bool
sdNotify bool
// runtime processors // runtime processors
netmapProcessor *netmap.Processor netmapProcessor *netmap.Processor
@ -336,6 +338,11 @@ func New(ctx context.Context, log *logger.Logger, cfg *viper.Viper, errChan chan
irMetrics: metrics, irMetrics: metrics,
} }
server.sdNotify, err = server.initSdNotify(cfg)
if err != nil {
return nil, err
}
server.setHealthStatus(control.HealthStatus_HEALTH_STATUS_UNDEFINED) server.setHealthStatus(control.HealthStatus_HEALTH_STATUS_UNDEFINED)
// parse notary support // parse notary support
@ -404,6 +411,13 @@ func New(ctx context.Context, log *logger.Logger, cfg *viper.Viper, errChan chan
return server, nil return server, nil
} }
func (s *Server) initSdNotify(cfg *viper.Viper) (bool, error) {
if cfg.GetBool("systemdnotify.enabled") {
return true, sdnotify.InitSocket()
}
return false, nil
}
acid-ant marked this conversation as resolved Outdated

How about this:

	if cfg.GetBool("systemdnotify.enabled") {
		return true, sdnotify.InitSocket()
	}
	return false, nil
How about this: ``` if cfg.GetBool("systemdnotify.enabled") { return true, sdnotify.InitSocket() } return false, nil ```

fixed

fixed
func createListener(ctx context.Context, cli *client.Client, p *chainParams) (event.Listener, error) { func createListener(ctx context.Context, cli *client.Client, p *chainParams) (event.Listener, error) {
var ( var (
sub subscriber.Subscriber sub subscriber.Subscriber

View file

@ -7,6 +7,7 @@ import (
"git.frostfs.info/TrueCloudLab/frostfs-node/internal/logs" "git.frostfs.info/TrueCloudLab/frostfs-node/internal/logs"
"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/innerring/processors/governance" "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/innerring/processors/governance"
control "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/services/control/ir" control "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/services/control/ir"
"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/util/sdnotify"
"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/util/state" "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/util/state"
"github.com/nspcc-dev/neo-go/pkg/util" "github.com/nspcc-dev/neo-go/pkg/util"
"github.com/spf13/viper" "github.com/spf13/viper"
@ -154,6 +155,7 @@ func (s *Server) ResetEpochTimer(h uint32) error {
func (s *Server) setHealthStatus(hs control.HealthStatus) { func (s *Server) setHealthStatus(hs control.HealthStatus) {
s.healthStatus.Store(int32(hs)) s.healthStatus.Store(int32(hs))
s.notifySystemd(hs)
if s.irMetrics != nil { if s.irMetrics != nil {
s.irMetrics.SetHealth(int32(hs)) s.irMetrics.SetHealth(int32(hs))
} }
@ -173,3 +175,21 @@ func initPersistentStateStorage(cfg *viper.Viper) (*state.PersistentStorage, err
return persistStorage, nil return persistStorage, nil
} }
func (s *Server) notifySystemd(st control.HealthStatus) {
if !s.sdNotify {
return
fyrchik marked this conversation as resolved Outdated

Why do we use strings here? Maybe move the switch from sdnotify.Status to here? This way it would be immediately obvious what do we send.

Why do we use strings here? Maybe move the switch from `sdnotify.Status` to here? This way it would be immediately obvious what do we send.

fixed

fixed
}
var err error
switch st {
dstepanov-yadro marked this conversation as resolved Outdated

zap.String("error", err.Error()) -> zap.Error(err)

`zap.String("error", err.Error())` -> `zap.Error(err)`

fixed

fixed
case control.HealthStatus_READY:
err = sdnotify.FlagAndStatus(sdnotify.ReadyEnabled)
case control.HealthStatus_SHUTTING_DOWN:
err = sdnotify.FlagAndStatus(sdnotify.StoppingEnabled)
default:
err = sdnotify.Status(fmt.Sprintf("%v", st))
}
if err != nil {
s.log.Error(logs.FailedToReportStatusToSystemd, zap.Error(err))
}
}

View file

@ -0,0 +1,59 @@
package sdnotify
import (
"fmt"
"net"
"os"
"strings"
)
const (
ReadyEnabled = "READY=1"
StoppingEnabled = "STOPPING=1"
ReloadingEnabled = "RELOADING=1"
)

Why socket name configured only via env?

Why socket name configured only via env?

This is how systemd communicates socket path to our service.

This is how systemd communicates socket path to our service.
var socket *net.UnixAddr
// Initializes socket with provided name of
// environment variable.
func InitSocket() error {
notifySocket := os.Getenv("NOTIFY_SOCKET")
fyrchik marked this conversation as resolved Outdated

We should take it from config, not from env
How about systemdnotify.socket?

We should take it from config, not from env How about `systemdnotify.socket`?

systemd can only pass socket to service through an environmental variable

```systemd``` can only pass socket to service through an environmental variable
if notifySocket == "" {
return fmt.Errorf("\"NOTIFY_SOCKET\" environment variable is not present")
}
socket = &net.UnixAddr{
Name: notifySocket,
Net: "unixgram",
}
return nil

Why is the error ignored?

Why is the error ignored?

moved this switch to the frostfs-node control.go and to frostfs-ir state.go, where this error may be logged

moved this switch to the frostfs-node control.go and to frostfs-ir state.go, where this error may be logged

It seems the switch is still here.

It seems the switch is still here.

fixed

fixed
}
// FlagAndStatus sends systemd a combination of a
// well-known status and STATUS=%s{status}, separated by newline.
func FlagAndStatus(status string) error {
status += "\nSTATUS=" + strings.TrimSuffix(status, "=1")

We do 2 sends in this function, can they be combined in a single message?

We do 2 sends in this function, can they be combined in a single message?

Messages like "STATUS=%s" allow us to see them in systemctl status output, while messages like "READY=1" are actual notifications to systemd. If we don't need to see status through systemctl status, then second status report can be removed.

Messages like "STATUS=%s" allow us to see them in ```systemctl status``` output, while messages like "READY=1" are actual notifications to systemd. If we don't need to see status through ```systemctl status```, then second status report can be removed.

From here https://www.man7.org/linux/man-pages/man3/sd_notify.3.html :

The state parameter should contain a newline-separated list of variable assignments, similar in style to an environment block.

So will Send(fmt.Sprintf("RELOADING=1\nSTATUS=%s", status)) work? It is not a big problem, but it seems both variables are better changed together.

From here https://www.man7.org/linux/man-pages/man3/sd_notify.3.html : >The state parameter should contain a newline-separated list of variable assignments, similar in style to an environment block. So will `Send(fmt.Sprintf("RELOADING=1\nSTATUS=%s", status))` work? It is not a big problem, but it seems both variables are better changed together.

added a function that combines two notifications

added a function that combines two notifications
return Send(status)
}
// Status sends systemd notify STATUS=%s{status}.
func Status(status string) error {
return Send(fmt.Sprintf("STATUS=%s", status))
}

Why you are skipping check for NOTIFY_SOCKET here?

Why you are skipping check for `NOTIFY_SOCKET` here?

it was done for testing purposes only
replaced with an error

it was done for testing purposes only replaced with an error
// Send state through the notify socket if any.
// If the notify socket was not detected, it returns an error.
func Send(state string) error {
if socket == nil {
return fmt.Errorf("socket is not initialized")
}
conn, err := net.DialUnix(socket.Net, nil, socket)
if err != nil {
return fmt.Errorf("can't open unix socket: %v", err)
}
defer conn.Close()
if _, err = conn.Write([]byte(state)); err != nil {
return fmt.Errorf("can't write into the unix socket: %v", err)
}
return nil
}