[#291] server auto re-binding
/ Vulncheck (pull_request) Failing after 1m38s Details
/ DCO (pull_request) Successful in 1m43s Details
/ Builds (1.20) (pull_request) Successful in 2m17s Details
/ Builds (1.21) (pull_request) Successful in 1m57s Details
/ Lint (pull_request) Successful in 5m7s Details
/ Tests (1.20) (pull_request) Successful in 2m32s Details
/ Tests (1.21) (pull_request) Successful in 2m8s Details

Signed-off-by: Pavel Pogodaev <p.pogodaev@yadro.com>
pull/309/head
Pavel Pogodaev 2024-02-11 21:00:56 +03:00
parent 94bd1dfe28
commit bfcde09f07
7 changed files with 147 additions and 27 deletions

View File

@ -26,6 +26,7 @@ This document outlines major changes between releases.
- Support `policy` contract (#259) - Support `policy` contract (#259)
- Support `proxy` contract (#287) - Support `proxy` contract (#287)
- Authmate: support custom attributes (#292) - Authmate: support custom attributes (#292)
- Add new `reconnect_interval` config param (#291)
### Changed ### Changed
- Generalise config param `use_default_xmlns_for_complete_multipart` to `use_default_xmlns` so that the default xmlns is used for all requests (#221) - Generalise config param `use_default_xmlns_for_complete_multipart` to `use_default_xmlns` so that the default xmlns is used for all requests (#221)

View File

@ -71,7 +71,9 @@ type (
policyStorage *policy.Storage policyStorage *policy.Storage
servers []Server servers []Server
unbindServers []ServerInfo
mu sync.RWMutex
controlAPI *grpc.Server controlAPI *grpc.Server
@ -88,6 +90,7 @@ type (
logLevel zap.AtomicLevel logLevel zap.AtomicLevel
maxClient maxClientsConfig maxClient maxClientsConfig
defaultMaxAge int defaultMaxAge int
reconnectInterval time.Duration
notificatorEnabled bool notificatorEnabled bool
resolveZoneList []string resolveZoneList []string
isResolveListAllow bool // True if ResolveZoneList contains allowed zones isResolveListAllow bool // True if ResolveZoneList contains allowed zones
@ -205,6 +208,7 @@ func newAppSettings(log *Logger, v *viper.Viper, key *keys.PrivateKey) *appSetti
logLevel: log.lvl, logLevel: log.lvl,
maxClient: newMaxClients(v), maxClient: newMaxClients(v),
defaultMaxAge: fetchDefaultMaxAge(v, log.logger), defaultMaxAge: fetchDefaultMaxAge(v, log.logger),
reconnectInterval: fetchReconnectInterval(v),
notificatorEnabled: v.GetBool(cfgEnableNATS), notificatorEnabled: v.GetBool(cfgEnableNATS),
frostfsidValidation: v.GetBool(cfgFrostfsIDValidationEnabled), frostfsidValidation: v.GetBool(cfgFrostfsIDValidationEnabled),
} }
@ -699,17 +703,23 @@ func (a *App) Serve(ctx context.Context) {
a.startServices() a.startServices()
for i := range a.servers { servs := a.getServers()
go func(i int) {
a.log.Info(logs.StartingServer, zap.String("address", a.servers[i].Address()))
if err := srv.Serve(a.servers[i].Listener()); err != nil && err != http.ErrServerClosed { for i := range servs {
a.metrics.MarkUnhealthy(a.servers[i].Address()) go func(i int) {
a.log.Info(logs.StartingServer, zap.String("address", servs[i].Address()))
if err := srv.Serve(servs[i].Listener()); err != nil && err != http.ErrServerClosed {
a.metrics.MarkUnhealthy(servs[i].Address())
a.log.Fatal(logs.ListenAndServe, zap.Error(err)) a.log.Fatal(logs.ListenAndServe, zap.Error(err))
} }
}(i) }(i)
} }
if len(a.unbindServers) != 0 {
a.scheduleReconnect(ctx, srv)
}
go func() { go func() {
address := a.cfg.GetString(cfgControlGRPCEndpoint) address := a.cfg.GetString(cfgControlGRPCEndpoint)
a.log.Info(logs.StartingControlAPI, zap.String("address", address)) a.log.Info(logs.StartingControlAPI, zap.String("address", address))
@ -826,7 +836,7 @@ func (a *App) startServices() {
} }
func (a *App) initServers(ctx context.Context) { func (a *App) initServers(ctx context.Context) {
serversInfo := fetchServers(a.cfg) serversInfo := fetchServers(a.cfg, a.log)
a.servers = make([]Server, 0, len(serversInfo)) a.servers = make([]Server, 0, len(serversInfo))
for _, serverInfo := range serversInfo { for _, serverInfo := range serversInfo {
@ -836,6 +846,7 @@ func (a *App) initServers(ctx context.Context) {
} }
srv, err := newServer(ctx, serverInfo) srv, err := newServer(ctx, serverInfo)
if err != nil { if err != nil {
a.unbindServers = append(a.unbindServers, serverInfo)
a.metrics.MarkUnhealthy(serverInfo.Address) a.metrics.MarkUnhealthy(serverInfo.Address)
a.log.Warn(logs.FailedToAddServer, append(fields, zap.Error(err))...) a.log.Warn(logs.FailedToAddServer, append(fields, zap.Error(err))...)
continue continue
@ -852,21 +863,24 @@ func (a *App) initServers(ctx context.Context) {
} }
func (a *App) updateServers() error { func (a *App) updateServers() error {
serversInfo := fetchServers(a.cfg) serversInfo := fetchServers(a.cfg, a.log)
a.mu.Lock()
defer a.mu.Unlock()
var found bool var found bool
for _, serverInfo := range serversInfo { for _, serverInfo := range serversInfo {
index := a.serverIndex(serverInfo.Address) ser := a.getServer(serverInfo.Address)
if index == -1 { if ser != nil {
continue if serverInfo.TLS.Enabled {
} if err := ser.UpdateCert(serverInfo.TLS.CertFile, serverInfo.TLS.KeyFile); err != nil {
return fmt.Errorf("failed to update tls certs: %w", err)
if serverInfo.TLS.Enabled { }
if err := a.servers[index].UpdateCert(serverInfo.TLS.CertFile, serverInfo.TLS.KeyFile); err != nil { found = true
return fmt.Errorf("failed to update tls certs: %w", err)
} }
} else if unbind := a.updateUnbindServerInfo(serverInfo); unbind {
found = true
} }
found = true
} }
if !found { if !found {
@ -876,15 +890,6 @@ func (a *App) updateServers() error {
return nil return nil
} }
func (a *App) serverIndex(address string) int {
for i := range a.servers {
if a.servers[i].Address() == address {
return i
}
}
return -1
}
func (a *App) stopServices() { func (a *App) stopServices() {
ctx, cancel := shutdownContext() ctx, cancel := shutdownContext()
defer cancel() defer cancel()
@ -959,6 +964,31 @@ func (a *App) initHandler() {
} }
} }
// getServer looks up an already bound server by its listen address.
// It returns nil when no server in a.servers reports that address.
// The method does not take a.mu itself; updateServers calls it with the lock held.
func (a *App) getServer(address string) Server {
	for _, candidate := range a.servers {
		if candidate.Address() != address {
			continue
		}
		return candidate
	}

	return nil
}
// updateUnbindServerInfo replaces the stored configuration of a not-yet-bound
// server that has the same address as info. It reports whether such an entry
// was found in a.unbindServers; only the first match is replaced.
func (a *App) updateUnbindServerInfo(info ServerInfo) bool {
	for idx, pending := range a.unbindServers {
		if pending.Address != info.Address {
			continue
		}

		a.unbindServers[idx] = info
		return true
	}

	return false
}
// getServers returns the current a.servers slice, read under the read lock.
// The returned slice is not copied, so it shares its backing array with a.servers.
func (a *App) getServers() []Server {
	a.mu.RLock()
	current := a.servers
	a.mu.RUnlock()

	return current
}
func (a *App) setRuntimeParameters() { func (a *App) setRuntimeParameters() {
if len(os.Getenv("GOMEMLIMIT")) != 0 { if len(os.Getenv("GOMEMLIMIT")) != 0 {
// default limit < yaml limit < app env limit < GOMEMLIMIT // default limit < yaml limit < app env limit < GOMEMLIMIT
@ -974,3 +1004,60 @@ func (a *App) setRuntimeParameters() {
zap.Int64("old_value", previous)) zap.Int64("old_value", previous))
} }
} }
// scheduleReconnect starts a background goroutine that calls tryReconnect
// every a.settings.reconnectInterval. The goroutine exits as soon as
// tryReconnect reports that nothing is left to bind, or when ctx is cancelled.
func (a *App) scheduleReconnect(ctx context.Context, srv *http.Server) {
	go func() {
		ticker := time.NewTicker(a.settings.reconnectInterval)
		defer ticker.Stop()

		for {
			select {
			case <-ctx.Done():
				return
			case <-ticker.C:
			}

			if done := a.tryReconnect(ctx, srv); done {
				return
			}

			// Restart the period after the attempt has finished.
			ticker.Reset(a.settings.reconnectInterval)
		}
	}()
}
// tryReconnect attempts to bind every server listed in a.unbindServers.
//
// A server that binds successfully is appended to a.servers and served by sr
// in its own goroutine. A server that still fails is kept in a.unbindServers
// for the next attempt and marked unhealthy. The whole attempt runs under a.mu.
//
// It returns true when no unbound servers remain.
func (a *App) tryReconnect(ctx context.Context, sr *http.Server) bool {
	a.mu.Lock()
	defer a.mu.Unlock()

	a.log.Info(logs.ServerReconnecting)
	var failedServers []ServerInfo

	for _, serverInfo := range a.unbindServers {
		fields := []zap.Field{
			zap.String("address", serverInfo.Address), zap.Bool("tls enabled", serverInfo.TLS.Enabled),
			zap.String("tls cert", serverInfo.TLS.CertFile), zap.String("tls key", serverInfo.TLS.KeyFile),
		}

		srv, err := newServer(ctx, serverInfo)
		if err != nil {
			// Include the server fields so the log shows which listener failed.
			a.log.Warn(logs.ServerReconnectFailed, append(fields, zap.Error(err))...)
			failedServers = append(failedServers, serverInfo)
			a.metrics.MarkUnhealthy(serverInfo.Address)
			continue
		}

		// Copy the address into a per-iteration variable: before Go 1.22 the
		// range variable is shared between iterations, so a goroutine reading
		// serverInfo directly could observe a later server's address.
		address := serverInfo.Address

		go func() {
			a.log.Info(logs.StartingServer, zap.String("address", srv.Address()))
			a.metrics.MarkHealthy(address)
			// Keep the Serve error local to the goroutine instead of writing
			// to the enclosing iteration's err.
			if err := sr.Serve(srv.Listener()); err != nil && !errors.Is(err, http.ErrServerClosed) {
				a.log.Warn(logs.ListenAndServe, zap.Error(err))
				a.metrics.MarkUnhealthy(address)
			}
		}()

		a.servers = append(a.servers, srv)
		a.log.Info(logs.ServerReconnectedSuccessfully, fields...)
	}

	a.unbindServers = failedServers

	return len(a.unbindServers) == 0
}

View File

@ -59,6 +59,8 @@ const (
defaultConstraintName = "default" defaultConstraintName = "default"
defaultNamespace = "" defaultNamespace = ""
defaultReconnectInterval = time.Minute
) )
var ( var (
@ -222,6 +224,9 @@ const ( // Settings.
// Proxy. // Proxy.
cfgProxyContract = "proxy.contract" cfgProxyContract = "proxy.contract"
// Server.
cfgReconnectInterval = "reconnect_interval"
// envPrefix is an environment variables prefix used for configuration. // envPrefix is an environment variables prefix used for configuration.
envPrefix = "S3_GW" envPrefix = "S3_GW"
) )
@ -244,6 +249,15 @@ func fetchConnectTimeout(cfg *viper.Viper) time.Duration {
return connTimeout return connTimeout
} }
// fetchReconnectInterval reads the reconnect interval from cfg under
// cfgReconnectInterval. A zero or negative value falls back to
// defaultReconnectInterval.
func fetchReconnectInterval(cfg *viper.Viper) time.Duration {
	if interval := cfg.GetDuration(cfgReconnectInterval); interval > 0 {
		return interval
	}

	return defaultReconnectInterval
}
func fetchStreamTimeout(cfg *viper.Viper) time.Duration { func fetchStreamTimeout(cfg *viper.Viper) time.Duration {
streamTimeout := cfg.GetDuration(cfgStreamTimeout) streamTimeout := cfg.GetDuration(cfgStreamTimeout)
if streamTimeout <= 0 { if streamTimeout <= 0 {
@ -611,8 +625,9 @@ func fetchPeers(l *zap.Logger, v *viper.Viper) []pool.NodeParam {
return nodes return nodes
} }
func fetchServers(v *viper.Viper) []ServerInfo { func fetchServers(v *viper.Viper, log *zap.Logger) []ServerInfo {
var servers []ServerInfo var servers []ServerInfo
seen := make(map[string]struct{})
for i := 0; ; i++ { for i := 0; ; i++ {
key := cfgServer + "." + strconv.Itoa(i) + "." key := cfgServer + "." + strconv.Itoa(i) + "."
@ -627,6 +642,11 @@ func fetchServers(v *viper.Viper) []ServerInfo {
break break
} }
if _, ok := seen[serverInfo.Address]; ok {
log.Warn(logs.WarnDuplicateAddress, zap.String("address", serverInfo.Address))
continue
}
seen[serverInfo.Address] = struct{}{}
servers = append(servers, serverInfo) servers = append(servers, serverInfo)
} }

View File

@ -33,6 +33,9 @@ S3_GW_SERVER_1_TLS_ENABLED=true
S3_GW_SERVER_1_TLS_CERT_FILE=/path/to/tls/cert S3_GW_SERVER_1_TLS_CERT_FILE=/path/to/tls/cert
S3_GW_SERVER_1_TLS_KEY_FILE=/path/to/tls/key S3_GW_SERVER_1_TLS_KEY_FILE=/path/to/tls/key
# How often to retry binding the servers that failed to start
S3_GW_RECONNECT_INTERVAL=1m
# Control API # Control API
# List of hex-encoded public keys that have rights to use the Control Service # List of hex-encoded public keys that have rights to use the Control Service
S3_GW_CONTROL_AUTHORIZED_KEYS=035839e45d472a3b7769a2a1bd7d54c4ccd4943c3b40f547870e83a8fcbfb3ce11 028f42cfcb74499d7b15b35d9bff260a1c8d27de4f446a627406a382d8961486d6 S3_GW_CONTROL_AUTHORIZED_KEYS=035839e45d472a3b7769a2a1bd7d54c4ccd4943c3b40f547870e83a8fcbfb3ce11 028f42cfcb74499d7b15b35d9bff260a1c8d27de4f446a627406a382d8961486d6

View File

@ -25,6 +25,8 @@ peers:
priority: 2 priority: 2
weight: 0.9 weight: 0.9
reconnect_interval: 1m
server: server:
- address: 0.0.0.0:8080 - address: 0.0.0.0:8080
tls: tls:

View File

@ -218,6 +218,8 @@ max_clients_deadline: 30s
allowed_access_key_id_prefixes: allowed_access_key_id_prefixes:
- Ck9BHsgKcnwfCTUSFm6pxhoNS4cBqgN2NQ8zVgPjqZDX - Ck9BHsgKcnwfCTUSFm6pxhoNS4cBqgN2NQ8zVgPjqZDX
- 3stjWenX15YwYzczMr88gy3CQr4NYFBQ8P7keGzH5QFn - 3stjWenX15YwYzczMr88gy3CQr4NYFBQ8P7keGzH5QFn
reconnect_interval: 1m
``` ```
| Parameter | Type | SIGHUP reload | Default value | Description | | Parameter | Type | SIGHUP reload | Default value | Description |
@ -233,6 +235,7 @@ allowed_access_key_id_prefixes:
| `max_clients_count` | `int` | no | `100` | Limits for processing of clients' requests. | | `max_clients_count` | `int` | no | `100` | Limits for processing of clients' requests. |
| `max_clients_deadline` | `duration` | no | `30s` | Deadline after which the gate sends error `RequestTimeout` to a client. | | `max_clients_deadline` | `duration` | no | `30s` | Deadline after which the gate sends error `RequestTimeout` to a client. |
| `allowed_access_key_id_prefixes` | `[]string` | no | | List of allowed `AccessKeyID` prefixes which S3 GW serve. If the parameter is omitted, all `AccessKeyID` will be accepted. | | `allowed_access_key_id_prefixes` | `[]string` | no | | List of allowed `AccessKeyID` prefixes which S3 GW serve. If the parameter is omitted, all `AccessKeyID` will be accepted. |
| `reconnect_interval` | `duration` | no | `1m` | Interval between attempts to re-bind listeners that failed to bind. |
### `wallet` section ### `wallet` section

View File

@ -135,6 +135,9 @@ const (
ControlAPIGetPolicy = "get policy request" ControlAPIGetPolicy = "get policy request"
ControlAPIListPolicies = "list policies request" ControlAPIListPolicies = "list policies request"
PolicyValidationFailed = "policy validation failed" PolicyValidationFailed = "policy validation failed"
ServerReconnecting = "reconnecting server..."
ServerReconnectedSuccessfully = "server reconnected successfully"
ServerReconnectFailed = "failed to reconnect server"
ParseTreeNode = "parse tree node" ParseTreeNode = "parse tree node"
FailedToGetRealObjectSize = "failed to get real object size" FailedToGetRealObjectSize = "failed to get real object size"
CouldntDeleteObjectFromStorageContinueDeleting = "couldn't delete object from storage, continue deleting from tree" CouldntDeleteObjectFromStorageContinueDeleting = "couldn't delete object from storage, continue deleting from tree"
@ -149,4 +152,5 @@ const (
InvalidBucketObjectLockEnabledHeader = "invalid X-Amz-Bucket-Object-Lock-Enabled header" InvalidBucketObjectLockEnabledHeader = "invalid X-Amz-Bucket-Object-Lock-Enabled header"
InvalidTreeKV = "invalid tree service meta KV" InvalidTreeKV = "invalid tree service meta KV"
FailedToWriteResponse = "failed to write response" FailedToWriteResponse = "failed to write response"
WarnDuplicateAddress = "duplicate address"
) )