From 3ba889d159cc71df0e7981d0d19544b38ec494af Mon Sep 17 00:00:00 2001 From: Alex Vanin Date: Wed, 5 Mar 2025 18:44:11 +0300 Subject: [PATCH] [#xxx] Add circuit breaker configuration in tree pool Circuit breaker prevents port starvation when some storage nodes are up but unsynced. See more details in: https://git.frostfs.info/TrueCloudLab/frostfs-sdk-go/pulls/339 Signed-off-by: Alex Vanin --- cmd/s3-gw/app.go | 2 ++ cmd/s3-gw/app_settings.go | 19 +++++++++++++------ config/config.env | 4 ++++ config/config.yaml | 4 ++++ docs/configuration.md | 4 ++++ go.mod | 2 +- go.sum | 2 ++ 7 files changed, 30 insertions(+), 7 deletions(-) diff --git a/cmd/s3-gw/app.go b/cmd/s3-gw/app.go index 4e0b3d2b..ca5ac13c 100644 --- a/cmd/s3-gw/app.go +++ b/cmd/s3-gw/app.go @@ -799,6 +799,8 @@ func (a *App) initPools(ctx context.Context) { prmTree.SetLogger(a.log) prmTree.SetMaxRequestAttempts(a.cfg.GetInt(cfgTreePoolMaxAttempts)) + prmTree.SetCircuitBreakerThreshold(a.cfg.GetInt(cfgPoolCbThreshold)) + prmTree.SetCircuitBreakerDuration(a.cfg.GetDuration(cfgPoolCbBreakDuration)) interceptors := []grpc.DialOption{ grpc.WithUnaryInterceptor(grpctracing.NewUnaryClientInteceptor()), diff --git a/cmd/s3-gw/app_settings.go b/cmd/s3-gw/app_settings.go index 175f6bb8..f93ae661 100644 --- a/cmd/s3-gw/app_settings.go +++ b/cmd/s3-gw/app_settings.go @@ -42,6 +42,9 @@ const ( defaultStreamTimeout = 10 * time.Second defaultShutdownTimeout = 15 * time.Second + defaultCbThreshold = 10 + defaultCbBreakDuration = 10 * time.Second + defaultLoggerSamplerInterval = 1 * time.Second defaultGracefulCloseOnSwitchTimeout = 10 * time.Second @@ -120,12 +123,14 @@ const ( // Settings. cfgTLSCertFile = "tls.cert_file" // Pool config. 
- cfgConnectTimeout = "connect_timeout" - cfgStreamTimeout = "stream_timeout" - cfgTreeStreamTimeout = "tree_stream_timeout" - cfgHealthcheckTimeout = "healthcheck_timeout" - cfgRebalanceInterval = "rebalance_interval" - cfgPoolErrorThreshold = "pool_error_threshold" + cfgConnectTimeout = "connect_timeout" + cfgStreamTimeout = "stream_timeout" + cfgTreeStreamTimeout = "tree_stream_timeout" + cfgHealthcheckTimeout = "healthcheck_timeout" + cfgRebalanceInterval = "rebalance_interval" + cfgPoolErrorThreshold = "pool_error_threshold" + cfgPoolCbThreshold = "pool_cb_threshold" + cfgPoolCbBreakDuration = "pool_cb_break_duration" // Caching. cfgObjectsCacheLifetime = "cache.objects.lifetime" @@ -945,6 +950,8 @@ func newSettings() *viper.Viper { // pool: v.SetDefault(cfgPoolErrorThreshold, defaultPoolErrorThreshold) v.SetDefault(cfgStreamTimeout, defaultStreamTimeout) + v.SetDefault(cfgPoolCbThreshold, defaultCbThreshold) + v.SetDefault(cfgPoolCbBreakDuration, defaultCbBreakDuration) v.SetDefault(cfgPProfAddress, "localhost:8085") v.SetDefault(cfgPrometheusAddress, "localhost:8086") diff --git a/config/config.env b/config/config.env index 0c314912..d233d895 100644 --- a/config/config.env +++ b/config/config.env @@ -90,6 +90,10 @@ S3_GW_HEALTHCHECK_TIMEOUT=15s S3_GW_REBALANCE_INTERVAL=60s # The number of errors on connection after which node is considered as unhealthy S3_GW_POOL_ERROR_THRESHOLD=100 +# The number of init errors before tree service circuit breaker is closed +S3_GW_POOL_CB_THRESHOLD=10 +# Duration when circuit breaker blocks all tree service inits to remote node +S3_GW_POOL_CB_BREAK_DURATION=10s # Limits for processing of clients' requests S3_GW_MAX_CLIENTS_COUNT=100 diff --git a/config/config.yaml b/config/config.yaml index 9b82b4e1..75a369aa 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -110,6 +110,10 @@ healthcheck_timeout: 15s rebalance_interval: 60s # The number of errors on connection after which node is considered as unhealthy 
pool_error_threshold: 100 +# The number of init errors before tree service circuit breaker is closed +pool_cb_threshold: 10 +# Duration when circuit breaker blocks all tree service inits to remote node +pool_cb_break_duration: 10s # Limits for processing of clients' requests diff --git a/docs/configuration.md b/docs/configuration.md index f6d003d9..2469048a 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -217,6 +217,8 @@ tree_stream_timeout: 10s healthcheck_timeout: 15s rebalance_interval: 60s pool_error_threshold: 100 +pool_cb_threshold: 10 +pool_cb_break_duration: 10s max_clients_count: 100 max_clients_deadline: 30s @@ -241,6 +243,8 @@ source_ip_header: "Source-Ip" | `healthcheck_timeout` | `duration` | no | `15s` | Timeout to check node health during rebalance. | | `rebalance_interval` | `duration` | no | `60s` | Interval to check node health. | | `pool_error_threshold` | `uint32` | no | `100` | The number of errors on connection after which node is considered as unhealthy. | +| `pool_cb_threshold` | `int` | no | `10` | The number of init errors before tree service circuit breaker is closed. | +| `pool_cb_break_duration` | `duration` | no | `10s` | Duration when circuit breaker blocks all tree service inits to remote node. | | `max_clients_count` | `int` | no | `100` | Limits for processing of clients' requests. | | `max_clients_deadline` | `duration` | no | `30s` | Deadline after which the gate sends error `RequestTimeout` to a client. | | `allowed_access_key_id_prefixes` | `[]string` | no | | List of allowed `AccessKeyID` prefixes which S3 GW serve. If the parameter is omitted, all `AccessKeyID` will be accepted. 
| diff --git a/go.mod b/go.mod index 8af225f8..473fee5f 100644 --- a/go.mod +++ b/go.mod @@ -5,7 +5,7 @@ go 1.22 require ( git.frostfs.info/TrueCloudLab/frostfs-contract v0.20.1-0.20241022094040-5f956751d48b git.frostfs.info/TrueCloudLab/frostfs-observability v0.0.0-20241112082307-f17779933e88 - git.frostfs.info/TrueCloudLab/frostfs-sdk-go v0.0.0-20250228093256-2b8329e026c7 + git.frostfs.info/TrueCloudLab/frostfs-sdk-go v0.0.0-20250305114045-7a37613988a4 git.frostfs.info/TrueCloudLab/multinet v0.0.0-20241015075604-6cb0d80e0972 git.frostfs.info/TrueCloudLab/policy-engine v0.0.0-20240822104152-a3bc3099bd5b git.frostfs.info/TrueCloudLab/zapjournald v0.0.0-20240124114243-cb2e66427d02 diff --git a/go.sum b/go.sum index 40430d7d..b5dbd370 100644 --- a/go.sum +++ b/go.sum @@ -44,6 +44,8 @@ git.frostfs.info/TrueCloudLab/frostfs-observability v0.0.0-20241112082307-f17779 git.frostfs.info/TrueCloudLab/frostfs-observability v0.0.0-20241112082307-f17779933e88/go.mod h1:kbwB4v2o6RyOfCo9kEFeUDZIX3LKhmS0yXPrtvzkQ1g= git.frostfs.info/TrueCloudLab/frostfs-sdk-go v0.0.0-20250228093256-2b8329e026c7 h1:T7r38zZ/aT1xTp+AxhizfukW10Rq3WQ5/m3moLGVnSk= git.frostfs.info/TrueCloudLab/frostfs-sdk-go v0.0.0-20250228093256-2b8329e026c7/go.mod h1:aQpPWfG8oyfJ2X+FenPTJpSRWZjwcP5/RAtkW+/VEX8= +git.frostfs.info/TrueCloudLab/frostfs-sdk-go v0.0.0-20250305114045-7a37613988a4 h1:DWMwf08GhGE9Q2g3p8Kyjl0DxPuxY7WmtkkVf4iBiCo= +git.frostfs.info/TrueCloudLab/frostfs-sdk-go v0.0.0-20250305114045-7a37613988a4/go.mod h1:aQpPWfG8oyfJ2X+FenPTJpSRWZjwcP5/RAtkW+/VEX8= git.frostfs.info/TrueCloudLab/hrw v1.2.1 h1:ccBRK21rFvY5R1WotI6LNoPlizk7qSvdfD8lNIRudVc= git.frostfs.info/TrueCloudLab/hrw v1.2.1/go.mod h1:C1Ygde2n843yTZEQ0FP69jYiuaYV0kriLvP4zm8JuvM= git.frostfs.info/TrueCloudLab/multinet v0.0.0-20241015075604-6cb0d80e0972 h1:/960fWeyn2AFHwQUwDsWB3sbP6lTEnFnMzLMM6tx6N8=