[#242] pool: Log error that caused healthy status change #242

Merged
fyrchik merged 1 commit from mbiryukova/frostfs-sdk-go:feature/log_unhealthy_error into master 2024-07-23 06:41:58 +00:00
2 changed files with 21 additions and 16 deletions

View file

@ -195,9 +195,9 @@ func (m *mockClient) dial(context.Context) error {
return nil return nil
} }
func (m *mockClient) restartIfUnhealthy(ctx context.Context) (healthy bool, changed bool) { func (m *mockClient) restartIfUnhealthy(ctx context.Context) (changed bool, err error) {
_, err := m.endpointInfo(ctx, prmEndpointInfo{}) _, err = m.endpointInfo(ctx, prmEndpointInfo{})
healthy = err == nil healthy := err == nil
changed = healthy != m.isHealthy() changed = healthy != m.isHealthy()
if healthy { if healthy {
m.setHealthy() m.setHealthy()

View file

@ -86,7 +86,7 @@ type client interface {
// see clientWrapper.dial. // see clientWrapper.dial.
dial(ctx context.Context) error dial(ctx context.Context) error
// see clientWrapper.restartIfUnhealthy. // see clientWrapper.restartIfUnhealthy.
restartIfUnhealthy(ctx context.Context) (bool, bool) restartIfUnhealthy(ctx context.Context) (bool, error)
// see clientWrapper.close. // see clientWrapper.close.
close() error close() error
} }
@ -373,11 +373,11 @@ func (c *clientWrapper) dial(ctx context.Context) error {
} }
// restartIfUnhealthy checks healthy status of client and recreate it if status is unhealthy. // restartIfUnhealthy checks healthy status of client and recreate it if status is unhealthy.
// Return current healthy status and indicating if status was changed by this function call. // Indicating if status was changed by this function call and returns error that caused unhealthy status.
func (c *clientWrapper) restartIfUnhealthy(ctx context.Context) (healthy, changed bool) { func (c *clientWrapper) restartIfUnhealthy(ctx context.Context) (changed bool, err error) {
var wasHealthy bool var wasHealthy bool
dkirillov marked this conversation as resolved Outdated

What do you think about keeping signature of this method intact but adding logging of error (we know log and error here)?

Or at least consider drop healthy bool from return params. We can know this status from error.

What do you think about keeping signature of this method intact but adding logging of error (we know log and error here)? Or at least consider drop `healthy bool` from return params. We can know this status from error.

I would prefer the second option, because it's like in tree pool

I would prefer the second option, because it's like in tree pool
if _, err := c.endpointInfo(ctx, prmEndpointInfo{}); err == nil { if _, err = c.endpointInfo(ctx, prmEndpointInfo{}); err == nil {
return true, false return false, nil
} else if !errors.Is(err, errPoolClientUnhealthy) { } else if !errors.Is(err, errPoolClientUnhealthy) {
wasHealthy = true wasHealthy = true
} }
@ -403,22 +403,22 @@ func (c *clientWrapper) restartIfUnhealthy(ctx context.Context) (healthy, change
GRPCDialOptions: c.prm.dialOptions, GRPCDialOptions: c.prm.dialOptions,
} }
if err := cl.Dial(ctx, prmDial); err != nil { if err = cl.Dial(ctx, prmDial); err != nil {
c.setUnhealthyOnDial() c.setUnhealthyOnDial()
return false, wasHealthy return wasHealthy, err
} }
c.clientMutex.Lock() c.clientMutex.Lock()
c.client = &cl c.client = &cl
c.clientMutex.Unlock() c.clientMutex.Unlock()
if _, err := cl.EndpointInfo(ctx, sdkClient.PrmEndpointInfo{}); err != nil { if _, err = cl.EndpointInfo(ctx, sdkClient.PrmEndpointInfo{}); err != nil {
c.setUnhealthy() c.setUnhealthy()
return false, wasHealthy return wasHealthy, err
} }
c.setHealthy() c.setHealthy()
return true, !wasHealthy return !wasHealthy, nil
} }
func (c *clientWrapper) getClient() (*sdkClient.Client, error) { func (c *clientWrapper) getClient() (*sdkClient.Client, error) {
@ -2198,7 +2198,8 @@ func (p *Pool) updateInnerNodesHealth(ctx context.Context, i int, bufferWeights
tctx, c := context.WithTimeout(ctx, options.nodeRequestTimeout) tctx, c := context.WithTimeout(ctx, options.nodeRequestTimeout)
defer c() defer c()
healthy, changed := cli.restartIfUnhealthy(tctx) changed, err := cli.restartIfUnhealthy(tctx)
healthy := err == nil
if healthy { if healthy {
bufferWeights[j] = options.nodesParams[i].weights[j] bufferWeights[j] = options.nodesParams[i].weights[j]
} else { } else {
@ -2207,8 +2208,12 @@ func (p *Pool) updateInnerNodesHealth(ctx context.Context, i int, bufferWeights
} }
if changed { if changed {
p.log(zap.DebugLevel, "health has changed", fields := []zap.Field{zap.String("address", cli.address()), zap.Bool("healthy", healthy)}
zap.String("address", cli.address()), zap.Bool("healthy", healthy)) if err != nil {
fields = append(fields, zap.String("reason", err.Error()))
  1. It's more correct to use zap.Error formater instead stringifying the error :)
  2. For me "health has changed" with "error" fields seems unclear. It's optinally but I can suggest to log message like health was changed but client error occurred if err != nil
1. It's more correct to use `zap.Error` formater instead stringifying the error :) 2. For me "health has changed" with `"error"` fields seems unclear. It's optinally but I can suggest to log message like `health was changed but client error occurred` if `err != nil`

ЕМНИП We use zap.String because zap.Error produces unnecessary big stacktraces.

ЕМНИП We use `zap.String` because `zap.Error` produces unnecessary big stacktraces.

For error I suggest replacing it with a reason, it is not an error related to the operation being logged (health status changed), it is the reason.

For `error` I suggest replacing it with a `reason`, it is not an error related to the operation being logged (health status changed), it is the reason.
}
p.log(zap.DebugLevel, "health has changed", fields...)
healthyChanged.Store(true) healthyChanged.Store(true)
} }
}(j, cli) }(j, cli)