node: React on SIGHUP only when node in READY state #748

Merged
fyrchik merged 1 commit from acid-ant/frostfs-node:bugfix/49-node-stuck-middle into master 2023-10-24 15:55:30 +00:00
6 changed files with 35 additions and 7 deletions

View file

@ -1040,7 +1040,6 @@ func (c *cfg) signalWatcher(ctx context.Context) {
c.reloadConfig(ctx)
case syscall.SIGTERM, syscall.SIGINT:
c.log.Info(logs.FrostFSNodeTerminationSignalHasBeenReceivedStopping)
// TODO (@acid-ant): #49 need to cover case when stuck at the middle(node health UNDEFINED or STARTING)
c.shutdown()
@ -1062,6 +1061,12 @@ func (c *cfg) signalWatcher(ctx context.Context) {
func (c *cfg) reloadConfig(ctx context.Context) {
c.log.Info(logs.FrostFSNodeSIGHUPHasBeenReceivedRereadingConfiguration)
if !c.compareAndSwapHealthStatus(control.HealthStatus_READY, control.HealthStatus_RECONFIGURING) {
c.log.Info(logs.FrostFSNodeSIGHUPSkip)
return
}
defer c.compareAndSwapHealthStatus(control.HealthStatus_RECONFIGURING, control.HealthStatus_READY)
err := c.readConfig(c.appCfg)
if err != nil {
c.log.Error(logs.FrostFSNodeConfigurationReading, zap.Error(err))
@ -1142,7 +1147,14 @@ func (c *cfg) createTombstoneSource() *tombstone.ExpirationChecker {
}
func (c *cfg) shutdown() {
c.setHealthStatus(control.HealthStatus_SHUTTING_DOWN)
old := c.swapHealthStatus(control.HealthStatus_SHUTTING_DOWN)
if old == control.HealthStatus_SHUTTING_DOWN {
c.log.Info(logs.FrostFSNodeShutdownSkip)
return
}
if old == control.HealthStatus_STARTING {
c.log.Warn(logs.FrostFSNodeShutdownWhenNotReady)
}
c.ctxCancel()
c.done <- struct{}{}

View file

@ -83,10 +83,20 @@ func (c *cfg) NetmapStatus() control.NetmapStatus {
func (c *cfg) setHealthStatus(st control.HealthStatus) {
c.healthStatus.Store(int32(st))
c.metricsCollector.State().SetHealth(int32(st))
}
if c.metricsCollector != nil {
c.metricsCollector.State().SetHealth(int32(st))
// compareAndSwapHealthStatus replaces the stored health status with newSt only
// when the stored value currently equals oldSt, delegating the check-and-set to
// healthStatus.CompareAndSwap. It reports whether the replacement took place.
//
// The health metric is updated to newSt only on a successful swap; a failed
// comparison leaves both the stored status and the metric untouched.
func (c *cfg) compareAndSwapHealthStatus(oldSt, newSt control.HealthStatus) (swapped bool) {
	swapped = c.healthStatus.CompareAndSwap(int32(oldSt), int32(newSt))
	if !swapped {
		return false
	}

	c.metricsCollector.State().SetHealth(int32(newSt))
	return true
}
// swapHealthStatus unconditionally stores st as the health status via
// healthStatus.Swap, publishes st to the health metric, and returns the status
// that was stored immediately before the call.
func (c *cfg) swapHealthStatus(st control.HealthStatus) (old control.HealthStatus) {
	code := int32(st)
	prev := c.healthStatus.Swap(code)
	c.metricsCollector.State().SetHealth(code)
	return control.HealthStatus(prev)
}
func (c *cfg) HealthStatus() control.HealthStatus {

Can metricsCollector be nil?

Can `metricsCollector` be nil?

In theory yes, but by the time this method is called it is already initialized.

In theory yes, but by the time this method is called it is already initialized.

We already use it in multiple places without a nil check and, as you said, it is always initialized. Let's drop the nil check?

We already use it in multiple places without a nil check and, as you said, it is always initialized. Let's drop the nil check?

Agree, removed.

Agree, removed.

View file

@ -60,13 +60,13 @@ func main() {
var ctx context.Context
ctx, c.ctxCancel = context.WithCancel(context.Background())
initApp(ctx, c)
c.setHealthStatus(control.HealthStatus_STARTING)
initApp(ctx, c)
bootUp(ctx, c)
c.setHealthStatus(control.HealthStatus_READY)
c.compareAndSwapHealthStatus(control.HealthStatus_STARTING, control.HealthStatus_READY)
wait(c)
}

View file

@ -436,6 +436,9 @@ const (
FrostFSNodeInternalApplicationError = "internal application error"
FrostFSNodeInternalErrorProcessingIsComplete = "internal error processing is complete"
FrostFSNodeSIGHUPHasBeenReceivedRereadingConfiguration = "SIGHUP has been received, rereading configuration..."
FrostFSNodeSIGHUPSkip = "node not ready for reconfiguration, skipped SIGHUP"
FrostFSNodeShutdownSkip = "node already is going to shutting down, skipped shutdown"
FrostFSNodeShutdownWhenNotReady = "node is going to shutting down when subsystems still initializing"
FrostFSNodeConfigurationReading = "configuration reading"
FrostFSNodeLoggerConfigurationPreparation = "logger configuration preparation"
FrostFSNodeTracingConfigationUpdated = "tracing configation updated"

Binary file not shown.

View file

@ -115,6 +115,9 @@ enum HealthStatus {
// Storage node application is shutting down.
SHUTTING_DOWN = 3;
// Storage node application is reconfiguring.
RECONFIGURING = 4;
}
// Shard description.