Reconnect gRPC servers #836

Merged
fyrchik merged 1 commit from dstepanov-yadro/frostfs-node:feat/grpc_init_lazy into master 2024-09-04 19:51:04 +00:00
2 changed files with 79 additions and 41 deletions

View file

@ -426,11 +426,26 @@ type dynamicConfiguration struct {
metrics *httpComponent metrics *httpComponent
} }
type appConfigGuard struct {
mtx sync.RWMutex
}
func (g *appConfigGuard) LockAppConfigShared() func() {
g.mtx.RLock()
return func() { g.mtx.RUnlock() }
}
func (g *appConfigGuard) LockAppConfigExclusive() func() {
g.mtx.Lock()
return func() { g.mtx.Unlock() }
}
type cfg struct { type cfg struct {
applicationConfiguration applicationConfiguration
internals internals
shared shared
dynamicConfiguration dynamicConfiguration
appConfigGuard
// configuration of the internal // configuration of the internal
// services // services
@ -474,8 +489,9 @@ type cfgGRPC struct {
// handlers must be protected with guard // handlers must be protected with guard
handlers []func(e string, l net.Listener, s *grpc.Server) handlers []func(e string, l net.Listener, s *grpc.Server)
maxChunkSize uint64 maxChunkSize uint64
maxAddrAmount uint64 maxAddrAmount uint64
reconnectTimeout time.Duration
} }
func (c *cfgGRPC) append(e string, l net.Listener, s *grpc.Server) { func (c *cfgGRPC) append(e string, l net.Listener, s *grpc.Server) {
@ -1204,7 +1220,7 @@ func (c *cfg) reloadConfig(ctx context.Context) {
} }
defer c.compareAndSwapHealthStatus(control.HealthStatus_RECONFIGURING, control.HealthStatus_READY) defer c.compareAndSwapHealthStatus(control.HealthStatus_RECONFIGURING, control.HealthStatus_READY)
err := c.readConfig(c.appCfg) err := c.reloadAppConfig()
if err != nil { if err != nil {
c.log.Error(logs.FrostFSNodeConfigurationReading, zap.Error(err)) c.log.Error(logs.FrostFSNodeConfigurationReading, zap.Error(err))
return return
@ -1271,6 +1287,13 @@ func (c *cfg) reloadConfig(ctx context.Context) {
c.log.Info(logs.FrostFSNodeConfigurationHasBeenReloadedSuccessfully) c.log.Info(logs.FrostFSNodeConfigurationHasBeenReloadedSuccessfully)
} }
func (c *cfg) reloadAppConfig() error {
unlock := c.LockAppConfigExclusive()
defer unlock()
return c.readConfig(c.appCfg)
}
func (c *cfg) createTombstoneSource() *tombstone.ExpirationChecker { func (c *cfg) createTombstoneSource() *tombstone.ExpirationChecker {
var tssPrm tsourse.TombstoneSourcePrm var tssPrm tsourse.TombstoneSourcePrm
tssPrm.SetGetService(c.cfgObject.getSvc) tssPrm.SetGetService(c.cfgObject.getSvc)

View file

@ -49,6 +49,7 @@ func initGRPC(c *cfg) {
if successCount == 0 { if successCount == 0 {
fatalOnErr(errors.New("could not listen to any gRPC endpoints")) fatalOnErr(errors.New("could not listen to any gRPC endpoints"))
} }
c.cfgGRPC.reconnectTimeout = grpcconfig.ReconnectTimeout(c.appCfg)
for _, endpoint := range endpointsToReconnect { for _, endpoint := range endpointsToReconnect {
scheduleReconnect(endpoint, c) scheduleReconnect(endpoint, c)
@ -60,49 +61,13 @@ func scheduleReconnect(endpoint string, c *cfg) {
go func() { go func() {
defer c.wg.Done() defer c.wg.Done()
timeout := grpcconfig.ReconnectTimeout(c.appCfg) t := time.NewTicker(c.cfgGRPC.reconnectTimeout)
t := time.NewTicker(timeout)
for { for {
select { select {
case <-t.C: case <-t.C:
c.log.Info(logs.FrostFSNodeGRPCReconnecting, zap.String("endpoint", endpoint)) if tryReconnect(endpoint, c) {
var success, found bool
grpcconfig.IterateEndpoints(c.appCfg, func(sc *grpcconfig.Config) {
if sc.Endpoint() != endpoint {
return
}
found = true
serverOpts, ok := getGrpcServerOpts(c, sc)
if !ok {
return
}
lis, err := net.Listen("tcp", sc.Endpoint())
if err != nil {
c.metricsCollector.GrpcServerMetrics().MarkUnhealthy(sc.Endpoint())
c.log.Error(logs.FrostFSNodeCantListenGRPCEndpoint, zap.Error(err))
return
}
c.metricsCollector.GrpcServerMetrics().MarkHealthy(sc.Endpoint())
srv := grpc.NewServer(serverOpts...)
c.onShutdown(func() {
stopGRPC("FrostFS Public API", srv, c.log)
})
c.cfgGRPC.appendAndHandle(sc.Endpoint(), lis, srv)
success = true
})
if !found {
c.log.Warn(logs.FrostFSNodeGRPCServerConfigNotFound, zap.String("endpoint", endpoint))
return return
} }
if success {
c.log.Info(logs.FrostFSNodeGRPCReconnectedSuccessfully, zap.String("endpoint", endpoint))
return
}
c.log.Warn(logs.FrostFSNodeGRPCReconnectFailed, zap.Duration("next_try_in", timeout))
case <-c.done: case <-c.done:
return return
} }
@ -110,6 +75,56 @@ func scheduleReconnect(endpoint string, c *cfg) {
}() }()
} }
func tryReconnect(endpoint string, c *cfg) bool {
c.log.Info(logs.FrostFSNodeGRPCReconnecting, zap.String("endpoint", endpoint))
serverOpts, found := getGRPCEndpointOpts(endpoint, c)
if !found {
c.log.Warn(logs.FrostFSNodeGRPCServerConfigNotFound, zap.String("endpoint", endpoint))
return true
}
lis, err := net.Listen("tcp", endpoint)
if err != nil {
c.metricsCollector.GrpcServerMetrics().MarkUnhealthy(endpoint)
c.log.Error(logs.FrostFSNodeCantListenGRPCEndpoint, zap.Error(err))
c.log.Warn(logs.FrostFSNodeGRPCReconnectFailed, zap.Duration("next_try_in", c.cfgGRPC.reconnectTimeout))
return false
}
c.metricsCollector.GrpcServerMetrics().MarkHealthy(endpoint)
srv := grpc.NewServer(serverOpts...)
c.onShutdown(func() {
stopGRPC("FrostFS Public API", srv, c.log)
})
c.cfgGRPC.appendAndHandle(endpoint, lis, srv)
c.log.Info(logs.FrostFSNodeGRPCReconnectedSuccessfully, zap.String("endpoint", endpoint))
return true
}
func getGRPCEndpointOpts(endpoint string, c *cfg) (result []grpc.ServerOption, found bool) {
unlock := c.LockAppConfigShared()
defer unlock()
grpcconfig.IterateEndpoints(c.appCfg, func(sc *grpcconfig.Config) {
if found {
return
}
if sc.Endpoint() != endpoint {
return
}
var ok bool
result, ok = getGrpcServerOpts(c, sc)
if !ok {
return
}
found = true
})
return
}
func getGrpcServerOpts(c *cfg, sc *grpcconfig.Config) ([]grpc.ServerOption, bool) { func getGrpcServerOpts(c *cfg, sc *grpcconfig.Config) ([]grpc.ServerOption, bool) {
serverOpts := []grpc.ServerOption{ serverOpts := []grpc.ServerOption{
grpc.MaxRecvMsgSize(maxRecvMsgSize), grpc.MaxRecvMsgSize(maxRecvMsgSize),