[#xxx] Add circuit breaker configuration in tree pool

The circuit breaker prevents port starvation when some
storage nodes are up but unsynced. See more details in:

TrueCloudLab/frostfs-sdk-go#339
Signed-off-by: Alex Vanin <a.vanin@yadro.com>
This commit is contained in:
Alexey Vanin 2025-03-05 18:44:11 +03:00
parent 93964834a8
commit 3ba889d159
7 changed files with 30 additions and 7 deletions

View file

@ -799,6 +799,8 @@ func (a *App) initPools(ctx context.Context) {
prmTree.SetLogger(a.log) prmTree.SetLogger(a.log)
prmTree.SetMaxRequestAttempts(a.cfg.GetInt(cfgTreePoolMaxAttempts)) prmTree.SetMaxRequestAttempts(a.cfg.GetInt(cfgTreePoolMaxAttempts))
prmTree.SetCircuitBreakerThreshold(a.cfg.GetInt(cfgPoolCbThreshold))
prmTree.SetCircuitBreakerDuration(a.cfg.GetDuration(cfgPoolCbBreakDuration))
interceptors := []grpc.DialOption{ interceptors := []grpc.DialOption{
grpc.WithUnaryInterceptor(grpctracing.NewUnaryClientInteceptor()), grpc.WithUnaryInterceptor(grpctracing.NewUnaryClientInteceptor()),

View file

@ -42,6 +42,9 @@ const (
defaultStreamTimeout = 10 * time.Second defaultStreamTimeout = 10 * time.Second
defaultShutdownTimeout = 15 * time.Second defaultShutdownTimeout = 15 * time.Second
defaultCbThreshold = 10
defaultCbBreakDuration = 10 * time.Second
defaultLoggerSamplerInterval = 1 * time.Second defaultLoggerSamplerInterval = 1 * time.Second
defaultGracefulCloseOnSwitchTimeout = 10 * time.Second defaultGracefulCloseOnSwitchTimeout = 10 * time.Second
@ -126,6 +129,8 @@ const ( // Settings.
cfgHealthcheckTimeout = "healthcheck_timeout" cfgHealthcheckTimeout = "healthcheck_timeout"
cfgRebalanceInterval = "rebalance_interval" cfgRebalanceInterval = "rebalance_interval"
cfgPoolErrorThreshold = "pool_error_threshold" cfgPoolErrorThreshold = "pool_error_threshold"
cfgPoolCbThreshold = "pool_cb_threshold"
cfgPoolCbBreakDuration = "pool_cb_break_duration"
// Caching. // Caching.
cfgObjectsCacheLifetime = "cache.objects.lifetime" cfgObjectsCacheLifetime = "cache.objects.lifetime"
@ -945,6 +950,8 @@ func newSettings() *viper.Viper {
// pool: // pool:
v.SetDefault(cfgPoolErrorThreshold, defaultPoolErrorThreshold) v.SetDefault(cfgPoolErrorThreshold, defaultPoolErrorThreshold)
v.SetDefault(cfgStreamTimeout, defaultStreamTimeout) v.SetDefault(cfgStreamTimeout, defaultStreamTimeout)
v.SetDefault(cfgPoolCbThreshold, defaultCbThreshold)
v.SetDefault(cfgPoolCbBreakDuration, defaultCbBreakDuration)
v.SetDefault(cfgPProfAddress, "localhost:8085") v.SetDefault(cfgPProfAddress, "localhost:8085")
v.SetDefault(cfgPrometheusAddress, "localhost:8086") v.SetDefault(cfgPrometheusAddress, "localhost:8086")

View file

@ -90,6 +90,10 @@ S3_GW_HEALTHCHECK_TIMEOUT=15s
S3_GW_REBALANCE_INTERVAL=60s S3_GW_REBALANCE_INTERVAL=60s
# The number of errors on connection after which node is considered as unhealthy # The number of errors on connection after which node is considered as unhealthy
S3_GW_POOL_ERROR_THRESHOLD=100 S3_GW_POOL_ERROR_THRESHOLD=100
# The number of init errors after which the tree service circuit breaker is opened
S3_GW_POOL_CB_THRESHOLD=10
# Duration for which the open circuit breaker blocks all tree service inits to a remote node
S3_GW_POOL_CB_BREAK_DURATION=10s
# Limits for processing of clients' requests # Limits for processing of clients' requests
S3_GW_MAX_CLIENTS_COUNT=100 S3_GW_MAX_CLIENTS_COUNT=100

View file

@ -110,6 +110,10 @@ healthcheck_timeout: 15s
rebalance_interval: 60s rebalance_interval: 60s
# The number of errors on connection after which node is considered as unhealthy # The number of errors on connection after which node is considered as unhealthy
pool_error_threshold: 100 pool_error_threshold: 100
# The number of init errors after which the tree service circuit breaker is opened
pool_cb_threshold: 10
# Duration for which the open circuit breaker blocks all tree service inits to a remote node
pool_cb_break_duration: 10s
# Limits for processing of clients' requests # Limits for processing of clients' requests

View file

@ -217,6 +217,8 @@ tree_stream_timeout: 10s
healthcheck_timeout: 15s healthcheck_timeout: 15s
rebalance_interval: 60s rebalance_interval: 60s
pool_error_threshold: 100 pool_error_threshold: 100
pool_cb_threshold: 10
pool_cb_break_duration: 10s
max_clients_count: 100 max_clients_count: 100
max_clients_deadline: 30s max_clients_deadline: 30s
@ -241,6 +243,8 @@ source_ip_header: "Source-Ip"
| `healthcheck_timeout` | `duration` | no | `15s` | Timeout to check node health during rebalance. | | `healthcheck_timeout` | `duration` | no | `15s` | Timeout to check node health during rebalance. |
| `rebalance_interval` | `duration` | no | `60s` | Interval to check node health. | | `rebalance_interval` | `duration` | no | `60s` | Interval to check node health. |
| `pool_error_threshold` | `uint32` | no | `100` | The number of errors on connection after which node is considered as unhealthy. | | `pool_error_threshold` | `uint32` | no | `100` | The number of errors on connection after which node is considered as unhealthy. |
| `pool_cb_threshold` | `int` | no | `10` | The number of init errors after which the tree service circuit breaker is opened. |
| `pool_cb_break_duration` | `duration` | no | `10s` | Duration for which the open circuit breaker blocks all tree service inits to a remote node. |
| `max_clients_count` | `int` | no | `100` | Limits for processing of clients' requests. | | `max_clients_count` | `int` | no | `100` | Limits for processing of clients' requests. |
| `max_clients_deadline` | `duration` | no | `30s` | Deadline after which the gate sends error `RequestTimeout` to a client. | | `max_clients_deadline` | `duration` | no | `30s` | Deadline after which the gate sends error `RequestTimeout` to a client. |
| `allowed_access_key_id_prefixes` | `[]string` | no | | List of allowed `AccessKeyID` prefixes which S3 GW serve. If the parameter is omitted, all `AccessKeyID` will be accepted. | | `allowed_access_key_id_prefixes` | `[]string` | no | | List of allowed `AccessKeyID` prefixes which S3 GW serve. If the parameter is omitted, all `AccessKeyID` will be accepted. |

2
go.mod
View file

@ -5,7 +5,7 @@ go 1.22
require ( require (
git.frostfs.info/TrueCloudLab/frostfs-contract v0.20.1-0.20241022094040-5f956751d48b git.frostfs.info/TrueCloudLab/frostfs-contract v0.20.1-0.20241022094040-5f956751d48b
git.frostfs.info/TrueCloudLab/frostfs-observability v0.0.0-20241112082307-f17779933e88 git.frostfs.info/TrueCloudLab/frostfs-observability v0.0.0-20241112082307-f17779933e88
git.frostfs.info/TrueCloudLab/frostfs-sdk-go v0.0.0-20250228093256-2b8329e026c7 git.frostfs.info/TrueCloudLab/frostfs-sdk-go v0.0.0-20250305114045-7a37613988a4
git.frostfs.info/TrueCloudLab/multinet v0.0.0-20241015075604-6cb0d80e0972 git.frostfs.info/TrueCloudLab/multinet v0.0.0-20241015075604-6cb0d80e0972
git.frostfs.info/TrueCloudLab/policy-engine v0.0.0-20240822104152-a3bc3099bd5b git.frostfs.info/TrueCloudLab/policy-engine v0.0.0-20240822104152-a3bc3099bd5b
git.frostfs.info/TrueCloudLab/zapjournald v0.0.0-20240124114243-cb2e66427d02 git.frostfs.info/TrueCloudLab/zapjournald v0.0.0-20240124114243-cb2e66427d02

2
go.sum
View file

@ -44,6 +44,8 @@ git.frostfs.info/TrueCloudLab/frostfs-observability v0.0.0-20241112082307-f17779
git.frostfs.info/TrueCloudLab/frostfs-observability v0.0.0-20241112082307-f17779933e88/go.mod h1:kbwB4v2o6RyOfCo9kEFeUDZIX3LKhmS0yXPrtvzkQ1g= git.frostfs.info/TrueCloudLab/frostfs-observability v0.0.0-20241112082307-f17779933e88/go.mod h1:kbwB4v2o6RyOfCo9kEFeUDZIX3LKhmS0yXPrtvzkQ1g=
git.frostfs.info/TrueCloudLab/frostfs-sdk-go v0.0.0-20250228093256-2b8329e026c7 h1:T7r38zZ/aT1xTp+AxhizfukW10Rq3WQ5/m3moLGVnSk= git.frostfs.info/TrueCloudLab/frostfs-sdk-go v0.0.0-20250228093256-2b8329e026c7 h1:T7r38zZ/aT1xTp+AxhizfukW10Rq3WQ5/m3moLGVnSk=
git.frostfs.info/TrueCloudLab/frostfs-sdk-go v0.0.0-20250228093256-2b8329e026c7/go.mod h1:aQpPWfG8oyfJ2X+FenPTJpSRWZjwcP5/RAtkW+/VEX8= git.frostfs.info/TrueCloudLab/frostfs-sdk-go v0.0.0-20250228093256-2b8329e026c7/go.mod h1:aQpPWfG8oyfJ2X+FenPTJpSRWZjwcP5/RAtkW+/VEX8=
git.frostfs.info/TrueCloudLab/frostfs-sdk-go v0.0.0-20250305114045-7a37613988a4 h1:DWMwf08GhGE9Q2g3p8Kyjl0DxPuxY7WmtkkVf4iBiCo=
git.frostfs.info/TrueCloudLab/frostfs-sdk-go v0.0.0-20250305114045-7a37613988a4/go.mod h1:aQpPWfG8oyfJ2X+FenPTJpSRWZjwcP5/RAtkW+/VEX8=
git.frostfs.info/TrueCloudLab/hrw v1.2.1 h1:ccBRK21rFvY5R1WotI6LNoPlizk7qSvdfD8lNIRudVc= git.frostfs.info/TrueCloudLab/hrw v1.2.1 h1:ccBRK21rFvY5R1WotI6LNoPlizk7qSvdfD8lNIRudVc=
git.frostfs.info/TrueCloudLab/hrw v1.2.1/go.mod h1:C1Ygde2n843yTZEQ0FP69jYiuaYV0kriLvP4zm8JuvM= git.frostfs.info/TrueCloudLab/hrw v1.2.1/go.mod h1:C1Ygde2n843yTZEQ0FP69jYiuaYV0kriLvP4zm8JuvM=
git.frostfs.info/TrueCloudLab/multinet v0.0.0-20241015075604-6cb0d80e0972 h1:/960fWeyn2AFHwQUwDsWB3sbP6lTEnFnMzLMM6tx6N8= git.frostfs.info/TrueCloudLab/multinet v0.0.0-20241015075604-6cb0d80e0972 h1:/960fWeyn2AFHwQUwDsWB3sbP6lTEnFnMzLMM6tx6N8=