From 87f668128148dc17462dda48cda8bde59a9a452f Mon Sep 17 00:00:00 2001
From: Denis Kirillov <denis@nspcc.ru>
Date: Fri, 29 Jul 2022 09:26:11 +0300
Subject: [PATCH] [#633] Add config param for pool error threshold

Signed-off-by: Denis Kirillov <denis@nspcc.ru>
---
 cmd/s3-gw/app.go          | 6 ++++++
 cmd/s3-gw/app_settings.go | 8 +++++++-
 config/config.env         | 2 ++
 config/config.yaml        | 3 +++
 docs/configuration.md     | 2 ++
 5 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/cmd/s3-gw/app.go b/cmd/s3-gw/app.go
index 24f9531..4a43b88 100644
--- a/cmd/s3-gw/app.go
+++ b/cmd/s3-gw/app.go
@@ -74,6 +74,7 @@ func newApp(ctx context.Context, l *zap.Logger, v *viper.Viper) *App {
 
 		maxClientsCount    = defaultMaxClientsCount
 		maxClientsDeadline = defaultMaxClientsDeadline
+		poolErrorThreshold = defaultPoolErrorThreshold
 	)
 
 	if v := v.GetDuration(cfgConnectTimeout); v > 0 {
@@ -96,6 +97,10 @@ func newApp(ctx context.Context, l *zap.Logger, v *viper.Viper) *App {
 		reBalance = v
 	}
 
+	if v := v.GetUint32(cfgPoolErrorThreshold); v > 0 {
+		poolErrorThreshold = v
+	}
+
 	password := wallet.GetPassword(v, cfgWalletPassphrase)
 	if key, err = wallet.GetKeyFromPath(v.GetString(cfgWallet), v.GetString(cfgAddress), password); err != nil {
 		l.Fatal("could not load NeoFS private key", zap.Error(err))
@@ -114,6 +119,7 @@ func newApp(ctx context.Context, l *zap.Logger, v *viper.Viper) *App {
 	prmPool.SetKey(&key.PrivateKey)
 	prmPool.SetNodeDialTimeout(conTimeout)
 	prmPool.SetHealthcheckTimeout(hckTimeout)
+	prmPool.SetErrorThreshold(poolErrorThreshold)
 	prmPool.SetClientRebalanceInterval(reBalance)
 	for _, peer := range fetchPeers(l, v) {
 		prmPool.AddNode(peer)
diff --git a/cmd/s3-gw/app_settings.go b/cmd/s3-gw/app_settings.go
index ef59fc5..cdd5633 100644
--- a/cmd/s3-gw/app_settings.go
+++ b/cmd/s3-gw/app_settings.go
@@ -23,6 +23,8 @@ const (
 	defaultConnectTimeout     = 10 * time.Second
 	defaultShutdownTimeout    = 15 * time.Second
 
+	defaultPoolErrorThreshold uint32 = 100
+
 	defaultMaxClientsCount    = 100
 	defaultMaxClientsDeadline = time.Second * 30
 )
@@ -40,10 +42,11 @@ const ( // Settings.
 	cfgTLSKeyFile  = "tls.key_file"
 	cfgTLSCertFile = "tls.cert_file"
 
-	// Timeouts.
+	// Pool config.
 	cfgConnectTimeout     = "connect_timeout"
 	cfgHealthcheckTimeout = "healthcheck_timeout"
 	cfgRebalanceInterval  = "rebalance_interval"
+	cfgPoolErrorThreshold = "pool_error_threshold"
 
 	// Caching.
 	cfgObjectsCacheLifetime     = "cache.objects.lifetime"
@@ -211,6 +214,9 @@ func newSettings() *viper.Viper {
 	// logger:
 	v.SetDefault(cfgLoggerLevel, "debug")
 
+	// pool:
+	v.SetDefault(cfgPoolErrorThreshold, defaultPoolErrorThreshold)
+
 	v.SetDefault(cfgPProfAddress, "localhost:8085")
 	v.SetDefault(cfgPrometheusAddress, "localhost:8086")
 
diff --git a/config/config.env b/config/config.env
index 27425ba..88f8ecb 100644
--- a/config/config.env
+++ b/config/config.env
@@ -54,6 +54,8 @@ S3_GW_CONNECT_TIMEOUT=10s
 S3_GW_HEALTHCHECK_TIMEOUT=15s
 # Interval to check node health
 S3_GW_REBALANCE_INTERVAL=60s
+# The number of errors on connection after which a node is considered unhealthy
+S3_GW_POOL_ERROR_THRESHOLD=100
 
 # Limits for processing of clients' requests
 S3_GW_MAX_CLIENTS_COUNT=100
diff --git a/config/config.yaml b/config/config.yaml
index b5956c7..d645dad 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -59,6 +59,9 @@ connect_timeout: 10s
 healthcheck_timeout: 15s
 # Interval to check node health
 rebalance_interval: 60s
+# The number of errors on connection after which a node is considered unhealthy
+pool_error_threshold: 100
+
 
 # Limits for processing of clients' requests
 max_clients_count: 100
diff --git a/docs/configuration.md b/docs/configuration.md
index 6d6d169..d5b6c45 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -149,6 +149,7 @@ resolve_order:
 connect_timeout: 10s
 healthcheck_timeout: 15s
 rebalance_interval: 60s
+pool_error_threshold: 100
 
 max_clients_count: 100
 max_clients_deadline: 30s
@@ -165,6 +166,7 @@ default_policy: REP 3
 | `connect_timeout`      | `duration` | `10s`          | Timeout to connect to a node.                                                                                                                                                                                     |
 | `healthcheck_timeout`  | `duration` | `15s`          | Timeout to check node health during rebalance.                                                                                                                                                                    |
 | `rebalance_interval`   | `duration` | `60s`          | Interval to check node health.                                                                                                                                                                                    |
+| `pool_error_threshold` | `uint32`   | `100`          | The number of errors on connection after which a node is considered unhealthy.                                                                                                                                    |
 | `max_clients_count`    | `int`      | `100`          | Limits for processing of clients' requests.                                                                                                                                                                       |
 | `max_clients_deadline` | `duration` | `30s`          | Deadline after which the gate sends error `RequestTimeout` to a client.                                                                                                                                           |
 | `default_policy`       | `string`   | `REP 3`        | Default policy of placing containers in NeoFS. If a user sends a request `CreateBucket` and doesn't define policy for placing of a container in NeoFS, the S3 Gateway will put the container with default policy. |