diff --git a/go.mod b/go.mod index a792a88d8..067bf9653 100644 --- a/go.mod +++ b/go.mod @@ -12,6 +12,7 @@ require ( github.com/apache/thrift v0.13.0 // indirect github.com/aws/aws-sdk-go v1.28.9 github.com/caddyserver/caddy v1.0.4 + github.com/cenkalti/backoff/v4 v4.0.0 github.com/coredns/federation v0.0.0-20190818181423-e032b096babe github.com/coreos/go-systemd v0.0.0-20190212144455-93d5ec2c7f76 // indirect github.com/coreos/pkg v0.0.0-20180928190104-399ea9e2e55f // indirect diff --git a/go.sum b/go.sum index 41a053011..9cbe8fe2e 100644 --- a/go.sum +++ b/go.sum @@ -78,6 +78,8 @@ github.com/bgentry/speakeasy v0.1.0/go.mod h1:+zsyZBPWlz7T6j88CTgSN5bM796AkVf0kB github.com/caddyserver/caddy v1.0.4 h1:wwuGSkUHo6RZ3oMpeTt7J09WBB87X5o+IZN4dKehcQE= github.com/caddyserver/caddy v1.0.4/go.mod h1:uruyfVsyMcDb3IOzSKsi1x0wOjy1my/PxOSTcD+24jM= github.com/cenkalti/backoff/v3 v3.0.0/go.mod h1:cIeZDE3IrqwwJl6VUwCN6trj1oXrTS4rc0ij+ULvLYs= +github.com/cenkalti/backoff/v4 v4.0.0 h1:6VeaLF9aI+MAUQ95106HwWzYZgJJpZ4stumjj6RFYAU= +github.com/cenkalti/backoff/v4 v4.0.0/go.mod h1:eEew/i+1Q6OrCDZh3WiXYv3+nJwBASZ8Bog/87DQnVg= github.com/census-instrumentation/opencensus-proto v0.2.0/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= github.com/cespare/xxhash/v2 v2.1.1 h1:6MnRN8NT7+YBpUIWxHtefFZOKTAPgGjpQSxqLNn0+qY= diff --git a/plugin/forward/README.md b/plugin/forward/README.md index cc1845377..b4307d8dd 100644 --- a/plugin/forward/README.md +++ b/plugin/forward/README.md @@ -9,8 +9,10 @@ The *forward* plugin re-uses already opened sockets to the upstreams. It supports UDP, TCP and DNS-over-TLS and uses in band health checking. -When it detects an error a health check is performed. This checks runs in a loop, every *0.5s*, for -as long as the upstream reports unhealthy. 
Once healthy we stop health checking (until the next +When it detects an error a health check is performed. This check runs in a loop, +starting with a *0.5s* interval and exponentially backing off with randomized intervals +up to *60s* for as long as the upstream reports unhealthy. The exponential backoff +will reset to *0.5s* after 15 minutes. Once healthy we stop health checking (until the next error). The health checks use a recursive DNS query (`. IN NS`) to get upstream health. Any response that is not a network error (REFUSED, NOTIMPL, SERVFAIL, etc) is taken as a healthy upstream. The health check uses the same protocol as specified in **TO**. If `max_fails` is set to 0, no checking diff --git a/plugin/pkg/up/up.go b/plugin/pkg/up/up.go index 8f866311b..71c128234 100644 --- a/plugin/pkg/up/up.go +++ b/plugin/pkg/up/up.go @@ -5,6 +5,8 @@ package up import ( "sync" "time" + + "github.com/cenkalti/backoff/v4" ) // Probe is used to run a single Func until it returns true (indicating a target is healthy). If an Func @@ -13,8 +15,7 @@ import ( type Probe struct { sync.Mutex inprogress int - interval time.Duration - max time.Duration + expBackoff backoff.ExponentialBackOff } // Func is used to determine if a target is alive. If so this function must return nil. @@ -31,7 +32,13 @@ func (p *Probe) Do(f Func) { return } p.inprogress = active - interval := p.interval + interval := p.expBackoff.NextBackOff() + // If exponential backoff has reached the maximum elapsed time (15 minutes), + // reset it and try again + if interval == -1 { + p.expBackoff.Reset() + interval = p.expBackoff.NextBackOff() + } p.Unlock() // Passed the lock. Now run f for as long it returns false. If a true is returned // we return from the goroutine and we can accept another Func to run. 
@@ -42,9 +49,6 @@ func (p *Probe) Do(f Func) { break } time.Sleep(interval) - if i%2 == 0 && i < 4 { // 4 is 2 doubles, so no need to increase anymore - this is *also* checked in double() - p.double() - } p.Lock() if p.inprogress == stop { p.Unlock() @@ -60,15 +64,6 @@ func (p *Probe) Do(f Func) { }() } -func (p *Probe) double() { - p.Lock() - p.interval *= 2 - if p.interval > p.max { - p.interval = p.max - } - p.Unlock() -} - // Stop stops the probing. func (p *Probe) Stop() { p.Lock() @@ -77,10 +72,20 @@ func (p *Probe) Stop() { } // Start will initialize the probe manager, after which probes can be initiated with Do. +// Initializes exponential backoff using the given interval duration func (p *Probe) Start(interval time.Duration) { p.Lock() - p.interval = interval - p.max = interval * multiplier + eB := &backoff.ExponentialBackOff{ + InitialInterval: interval, + RandomizationFactor: backoff.DefaultRandomizationFactor, + Multiplier: backoff.DefaultMultiplier, + MaxInterval: backoff.DefaultMaxInterval, + MaxElapsedTime: backoff.DefaultMaxElapsedTime, + Stop: backoff.Stop, + Clock: backoff.SystemClock, + } + p.expBackoff = *eB + p.expBackoff.Reset() p.Unlock() } @@ -88,6 +93,4 @@ const ( idle = iota active stop - - multiplier = 4 )