[#xxx] Add circuit breaker configuration in tree pool
Circuit breaker prevents port starvation when some storage nodes are up but unsynced. See more details in: TrueCloudLab/frostfs-sdk-go#339 Signed-off-by: Alex Vanin <a.vanin@yadro.com>
This commit is contained in:
parent
93964834a8
commit
3ba889d159
7 changed files with 30 additions and 7 deletions
|
@ -799,6 +799,8 @@ func (a *App) initPools(ctx context.Context) {
|
||||||
prmTree.SetLogger(a.log)
|
prmTree.SetLogger(a.log)
|
||||||
|
|
||||||
prmTree.SetMaxRequestAttempts(a.cfg.GetInt(cfgTreePoolMaxAttempts))
|
prmTree.SetMaxRequestAttempts(a.cfg.GetInt(cfgTreePoolMaxAttempts))
|
||||||
|
prmTree.SetCircuitBreakerThreshold(a.cfg.GetInt(cfgPoolCbThreshold))
|
||||||
|
prmTree.SetCircuitBreakerDuration(a.cfg.GetDuration(cfgPoolCbBreakDuration))
|
||||||
|
|
||||||
interceptors := []grpc.DialOption{
|
interceptors := []grpc.DialOption{
|
||||||
grpc.WithUnaryInterceptor(grpctracing.NewUnaryClientInteceptor()),
|
grpc.WithUnaryInterceptor(grpctracing.NewUnaryClientInteceptor()),
|
||||||
|
|
|
@ -42,6 +42,9 @@ const (
|
||||||
defaultStreamTimeout = 10 * time.Second
|
defaultStreamTimeout = 10 * time.Second
|
||||||
defaultShutdownTimeout = 15 * time.Second
|
defaultShutdownTimeout = 15 * time.Second
|
||||||
|
|
||||||
|
defaultCbThreshold = 10
|
||||||
|
defaultCbBreakDuration = 10 * time.Second
|
||||||
|
|
||||||
defaultLoggerSamplerInterval = 1 * time.Second
|
defaultLoggerSamplerInterval = 1 * time.Second
|
||||||
|
|
||||||
defaultGracefulCloseOnSwitchTimeout = 10 * time.Second
|
defaultGracefulCloseOnSwitchTimeout = 10 * time.Second
|
||||||
|
@ -126,6 +129,8 @@ const ( // Settings.
|
||||||
cfgHealthcheckTimeout = "healthcheck_timeout"
|
cfgHealthcheckTimeout = "healthcheck_timeout"
|
||||||
cfgRebalanceInterval = "rebalance_interval"
|
cfgRebalanceInterval = "rebalance_interval"
|
||||||
cfgPoolErrorThreshold = "pool_error_threshold"
|
cfgPoolErrorThreshold = "pool_error_threshold"
|
||||||
|
cfgPoolCbThreshold = "pool_cb_threshold"
|
||||||
|
cfgPoolCbBreakDuration = "pool_cb_break_duration"
|
||||||
|
|
||||||
// Caching.
|
// Caching.
|
||||||
cfgObjectsCacheLifetime = "cache.objects.lifetime"
|
cfgObjectsCacheLifetime = "cache.objects.lifetime"
|
||||||
|
@ -945,6 +950,8 @@ func newSettings() *viper.Viper {
|
||||||
// pool:
|
// pool:
|
||||||
v.SetDefault(cfgPoolErrorThreshold, defaultPoolErrorThreshold)
|
v.SetDefault(cfgPoolErrorThreshold, defaultPoolErrorThreshold)
|
||||||
v.SetDefault(cfgStreamTimeout, defaultStreamTimeout)
|
v.SetDefault(cfgStreamTimeout, defaultStreamTimeout)
|
||||||
|
v.SetDefault(cfgPoolCbThreshold, defaultCbThreshold)
|
||||||
|
v.SetDefault(cfgPoolCbBreakDuration, defaultCbBreakDuration)
|
||||||
|
|
||||||
v.SetDefault(cfgPProfAddress, "localhost:8085")
|
v.SetDefault(cfgPProfAddress, "localhost:8085")
|
||||||
v.SetDefault(cfgPrometheusAddress, "localhost:8086")
|
v.SetDefault(cfgPrometheusAddress, "localhost:8086")
|
||||||
|
|
|
@ -90,6 +90,10 @@ S3_GW_HEALTHCHECK_TIMEOUT=15s
|
||||||
S3_GW_REBALANCE_INTERVAL=60s
|
S3_GW_REBALANCE_INTERVAL=60s
|
||||||
# The number of errors on connection after which node is considered as unhealthy
|
# The number of errors on connection after which node is considered as unhealthy
|
||||||
S3_GW_POOL_ERROR_THRESHOLD=100
|
S3_GW_POOL_ERROR_THRESHOLD=100
|
||||||
|
# The number of tree service init errors after which the circuit breaker opens
|
||||||
|
S3_GW_POOL_CB_THRESHOLD=10
|
||||||
|
# Duration for which the circuit breaker blocks all tree service inits to the remote node
|
||||||
|
S3_GW_POOL_CB_BREAK_DURATION=10s
|
||||||
|
|
||||||
# Limits for processing of clients' requests
|
# Limits for processing of clients' requests
|
||||||
S3_GW_MAX_CLIENTS_COUNT=100
|
S3_GW_MAX_CLIENTS_COUNT=100
|
||||||
|
|
|
@ -110,6 +110,10 @@ healthcheck_timeout: 15s
|
||||||
rebalance_interval: 60s
|
rebalance_interval: 60s
|
||||||
# The number of errors on connection after which node is considered as unhealthy
|
# The number of errors on connection after which node is considered as unhealthy
|
||||||
pool_error_threshold: 100
|
pool_error_threshold: 100
|
||||||
|
# The number of tree service init errors after which the circuit breaker opens
|
||||||
|
pool_cb_threshold: 10
|
||||||
|
# Duration for which the circuit breaker blocks all tree service inits to the remote node
|
||||||
|
pool_cb_break_duration: 10s
|
||||||
|
|
||||||
|
|
||||||
# Limits for processing of clients' requests
|
# Limits for processing of clients' requests
|
||||||
|
|
|
@ -217,6 +217,8 @@ tree_stream_timeout: 10s
|
||||||
healthcheck_timeout: 15s
|
healthcheck_timeout: 15s
|
||||||
rebalance_interval: 60s
|
rebalance_interval: 60s
|
||||||
pool_error_threshold: 100
|
pool_error_threshold: 100
|
||||||
|
pool_cb_threshold: 10
|
||||||
|
pool_cb_break_duration: 10s
|
||||||
|
|
||||||
max_clients_count: 100
|
max_clients_count: 100
|
||||||
max_clients_deadline: 30s
|
max_clients_deadline: 30s
|
||||||
|
@ -241,6 +243,8 @@ source_ip_header: "Source-Ip"
|
||||||
| `healthcheck_timeout` | `duration` | no | `15s` | Timeout to check node health during rebalance. |
|
| `healthcheck_timeout` | `duration` | no | `15s` | Timeout to check node health during rebalance. |
|
||||||
| `rebalance_interval` | `duration` | no | `60s` | Interval to check node health. |
|
| `rebalance_interval` | `duration` | no | `60s` | Interval to check node health. |
|
||||||
| `pool_error_threshold` | `uint32` | no | `100` | The number of errors on connection after which node is considered as unhealthy. |
|
| `pool_error_threshold` | `uint32` | no | `100` | The number of errors on connection after which node is considered as unhealthy. |
|
||||||
|
| `pool_cb_threshold` | `int` | no | `10` | The number of tree service init errors after which the circuit breaker opens. |
|
||||||
|
| `pool_cb_break_duration` | `duration` | no | `10s` | Duration for which the circuit breaker blocks all tree service inits to the remote node. |
|
||||||
| `max_clients_count` | `int` | no | `100` | Limits for processing of clients' requests. |
|
| `max_clients_count` | `int` | no | `100` | Limits for processing of clients' requests. |
|
||||||
| `max_clients_deadline` | `duration` | no | `30s` | Deadline after which the gate sends error `RequestTimeout` to a client. |
|
| `max_clients_deadline` | `duration` | no | `30s` | Deadline after which the gate sends error `RequestTimeout` to a client. |
|
||||||
| `allowed_access_key_id_prefixes` | `[]string` | no | | List of allowed `AccessKeyID` prefixes which S3 GW serve. If the parameter is omitted, all `AccessKeyID` will be accepted. |
|
| `allowed_access_key_id_prefixes` | `[]string` | no | | List of allowed `AccessKeyID` prefixes which S3 GW serve. If the parameter is omitted, all `AccessKeyID` will be accepted. |
|
||||||
|
|
2
go.mod
2
go.mod
|
@ -5,7 +5,7 @@ go 1.22
|
||||||
require (
|
require (
|
||||||
git.frostfs.info/TrueCloudLab/frostfs-contract v0.20.1-0.20241022094040-5f956751d48b
|
git.frostfs.info/TrueCloudLab/frostfs-contract v0.20.1-0.20241022094040-5f956751d48b
|
||||||
git.frostfs.info/TrueCloudLab/frostfs-observability v0.0.0-20241112082307-f17779933e88
|
git.frostfs.info/TrueCloudLab/frostfs-observability v0.0.0-20241112082307-f17779933e88
|
||||||
git.frostfs.info/TrueCloudLab/frostfs-sdk-go v0.0.0-20250228093256-2b8329e026c7
|
git.frostfs.info/TrueCloudLab/frostfs-sdk-go v0.0.0-20250305114045-7a37613988a4
|
||||||
git.frostfs.info/TrueCloudLab/multinet v0.0.0-20241015075604-6cb0d80e0972
|
git.frostfs.info/TrueCloudLab/multinet v0.0.0-20241015075604-6cb0d80e0972
|
||||||
git.frostfs.info/TrueCloudLab/policy-engine v0.0.0-20240822104152-a3bc3099bd5b
|
git.frostfs.info/TrueCloudLab/policy-engine v0.0.0-20240822104152-a3bc3099bd5b
|
||||||
git.frostfs.info/TrueCloudLab/zapjournald v0.0.0-20240124114243-cb2e66427d02
|
git.frostfs.info/TrueCloudLab/zapjournald v0.0.0-20240124114243-cb2e66427d02
|
||||||
|
|
2
go.sum
2
go.sum
|
@ -44,6 +44,8 @@ git.frostfs.info/TrueCloudLab/frostfs-observability v0.0.0-20241112082307-f17779
|
||||||
git.frostfs.info/TrueCloudLab/frostfs-observability v0.0.0-20241112082307-f17779933e88/go.mod h1:kbwB4v2o6RyOfCo9kEFeUDZIX3LKhmS0yXPrtvzkQ1g=
|
git.frostfs.info/TrueCloudLab/frostfs-observability v0.0.0-20241112082307-f17779933e88/go.mod h1:kbwB4v2o6RyOfCo9kEFeUDZIX3LKhmS0yXPrtvzkQ1g=
|
||||||
git.frostfs.info/TrueCloudLab/frostfs-sdk-go v0.0.0-20250228093256-2b8329e026c7 h1:T7r38zZ/aT1xTp+AxhizfukW10Rq3WQ5/m3moLGVnSk=
|
git.frostfs.info/TrueCloudLab/frostfs-sdk-go v0.0.0-20250228093256-2b8329e026c7 h1:T7r38zZ/aT1xTp+AxhizfukW10Rq3WQ5/m3moLGVnSk=
|
||||||
git.frostfs.info/TrueCloudLab/frostfs-sdk-go v0.0.0-20250228093256-2b8329e026c7/go.mod h1:aQpPWfG8oyfJ2X+FenPTJpSRWZjwcP5/RAtkW+/VEX8=
|
git.frostfs.info/TrueCloudLab/frostfs-sdk-go v0.0.0-20250228093256-2b8329e026c7/go.mod h1:aQpPWfG8oyfJ2X+FenPTJpSRWZjwcP5/RAtkW+/VEX8=
|
||||||
|
git.frostfs.info/TrueCloudLab/frostfs-sdk-go v0.0.0-20250305114045-7a37613988a4 h1:DWMwf08GhGE9Q2g3p8Kyjl0DxPuxY7WmtkkVf4iBiCo=
|
||||||
|
git.frostfs.info/TrueCloudLab/frostfs-sdk-go v0.0.0-20250305114045-7a37613988a4/go.mod h1:aQpPWfG8oyfJ2X+FenPTJpSRWZjwcP5/RAtkW+/VEX8=
|
||||||
git.frostfs.info/TrueCloudLab/hrw v1.2.1 h1:ccBRK21rFvY5R1WotI6LNoPlizk7qSvdfD8lNIRudVc=
|
git.frostfs.info/TrueCloudLab/hrw v1.2.1 h1:ccBRK21rFvY5R1WotI6LNoPlizk7qSvdfD8lNIRudVc=
|
||||||
git.frostfs.info/TrueCloudLab/hrw v1.2.1/go.mod h1:C1Ygde2n843yTZEQ0FP69jYiuaYV0kriLvP4zm8JuvM=
|
git.frostfs.info/TrueCloudLab/hrw v1.2.1/go.mod h1:C1Ygde2n843yTZEQ0FP69jYiuaYV0kriLvP4zm8JuvM=
|
||||||
git.frostfs.info/TrueCloudLab/multinet v0.0.0-20241015075604-6cb0d80e0972 h1:/960fWeyn2AFHwQUwDsWB3sbP6lTEnFnMzLMM6tx6N8=
|
git.frostfs.info/TrueCloudLab/multinet v0.0.0-20241015075604-6cb0d80e0972 h1:/960fWeyn2AFHwQUwDsWB3sbP6lTEnFnMzLMM6tx6N8=
|
||||||
|
|
Loading…
Add table
Reference in a new issue