From 5ffbeb76e647236761a47573cc2cdb811b7da1d2 Mon Sep 17 00:00:00 2001 From: Evgenii Stratonikov Date: Mon, 6 Jun 2022 14:46:00 +0300 Subject: [PATCH] [#1456] services/tree: Wait some time before reconnecting after failure If a node is down or failing for some reason, we can expect `Dial` to fail. If we actively try to replicate and `Dial` always takes 2 seconds, replication-related channels quickly become full. That affects the latency of all other write operations. Signed-off-by: Evgenii Stratonikov --- pkg/services/tree/cache.go | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/pkg/services/tree/cache.go b/pkg/services/tree/cache.go index 76d29a00f..54a1f9f9f 100644 --- a/pkg/services/tree/cache.go +++ b/pkg/services/tree/cache.go @@ -2,6 +2,7 @@ package tree import ( "context" + "fmt" "strings" "sync" "time" @@ -17,9 +18,15 @@ type clientCache struct { simplelru.LRU } +type cacheItem struct { + cc *grpc.ClientConn + lastTry time.Time +} + const ( defaultClientCacheSize = 10 defaultClientConnectTimeout = time.Second * 2 + defaultReconnectInterval = time.Second * 15 ) func (c *clientCache) init() { @@ -35,22 +42,35 @@ func (c *clientCache) get(ctx context.Context, netmapAddr string) (TreeServiceCl c.Unlock() if ok { - cc := ccInt.(*grpc.ClientConn) - if s := cc.GetState(); s == connectivity.Idle || s == connectivity.Ready { - return NewTreeServiceClient(cc), nil + item := ccInt.(cacheItem) + if item.cc == nil { + if d := time.Since(item.lastTry); d < defaultReconnectInterval { + return nil, fmt.Errorf("skip connecting to %s (time since last error %s)", + netmapAddr, d) + } + } else { + if s := item.cc.GetState(); s == connectivity.Idle || s == connectivity.Ready { + return NewTreeServiceClient(item.cc), nil + } + _ = item.cc.Close() } - _ = cc.Close() } cc, err := dialTreeService(ctx, netmapAddr) + lastTry := time.Now() + + c.Lock() + if err != nil { + c.LRU.Add(netmapAddr, cacheItem{cc: nil, lastTry: 
lastTry}) + } else { + c.LRU.Add(netmapAddr, cacheItem{cc: cc, lastTry: lastTry}) + } + c.Unlock() + if err != nil { return nil, err } - c.Lock() - c.LRU.Add(netmapAddr, cc) - c.Unlock() - return NewTreeServiceClient(cc), nil }