middleware/proxy: async health checks (#749)

* Switches out Unhealthy bool for OkUntil timestamp

* Make sure servers are healthy forever if there are no health checks

* Moves health check off into a goroutine to avoid blocking conditions

* Improved logging info

* Fixes initial date

* Fixes health checking; alters tests to adapt to async health checking

* Moves future variable into static upstream and populates it in more places

* Restores silencing of stdout during testing

* Restores silencing of stdout during testing

* keeps check url string once built

* Removes debug message

* uses zero value to signal no checking; reduces in-mutex code to a fetch
This commit is contained in:
ghostflame 2017-06-30 10:13:45 +01:00 committed by Miek Gieben
parent edf71fb168
commit bb05a665eb
9 changed files with 158 additions and 60 deletions

View file

@ -41,9 +41,9 @@ proxy FROM TO... {
* `max_fails` is the number of failures within fail_timeout that are needed before considering * `max_fails` is the number of failures within fail_timeout that are needed before considering
a backend to be down. If 0, the backend will never be marked as down. Default is 1. a backend to be down. If 0, the backend will never be marked as down. Default is 1.
* `health_check` will check path (on port) on each backend. If a backend returns a status code of * `health_check` will check path (on port) on each backend. If a backend returns a status code of
200-399, then that backend is healthy. If it doesn't, the backend is marked as unhealthy for 200-399, then that backend is marked healthy for double the healthcheck duration. If it doesn't,
duration and no requests are routed to it. If this option is not provided then health checks are it is marked as unhealthy and no requests are routed to it. If this option is not provided then
disabled. The default duration is 30 seconds ("30s"). health checks are disabled. The default duration is 30 seconds ("30s").
* **IGNORED_NAMES** in `except` is a space-separated list of domains to exclude from proxying. * **IGNORED_NAMES** in `except` is a space-separated list of domains to exclude from proxying.
Requests that match none of these names will be passed through. Requests that match none of these names will be passed through.
* `spray` when all backends are unhealthy, randomly pick one to send the traffic to. (This is * `spray` when all backends are unhealthy, randomly pick one to send the traffic to. (This is

View file

@ -206,6 +206,7 @@ func newUpstream(hosts []string, old *staticUpstream) Upstream {
Spray: nil, Spray: nil,
FailTimeout: 10 * time.Second, FailTimeout: 10 * time.Second,
MaxFails: 3, MaxFails: 3,
Future: 60 * time.Second,
ex: old.ex, ex: old.ex,
WithoutPathPrefix: old.WithoutPathPrefix, WithoutPathPrefix: old.WithoutPathPrefix,
IgnoredSubDomains: old.IgnoredSubDomains, IgnoredSubDomains: old.IgnoredSubDomains,
@ -218,23 +219,30 @@ func newUpstream(hosts []string, old *staticUpstream) Upstream {
Conns: 0, Conns: 0,
Fails: 0, Fails: 0,
FailTimeout: upstream.FailTimeout, FailTimeout: upstream.FailTimeout,
Unhealthy: false,
CheckDown: func(upstream *staticUpstream) UpstreamHostDownFunc { CheckDown: func(upstream *staticUpstream) UpstreamHostDownFunc {
return func(uh *UpstreamHost) bool { return func(uh *UpstreamHost) bool {
if uh.Unhealthy {
return true down := false
uh.checkMu.Lock()
until := uh.OkUntil
uh.checkMu.Unlock()
if !until.IsZero() && time.Now().After(until) {
down = true
} }
fails := atomic.LoadInt32(&uh.Fails) fails := atomic.LoadInt32(&uh.Fails)
if fails >= upstream.MaxFails && upstream.MaxFails != 0 { if fails >= upstream.MaxFails && upstream.MaxFails != 0 {
return true down = true
} }
return false return down
} }
}(upstream), }(upstream),
WithoutPathPrefix: upstream.WithoutPathPrefix, WithoutPathPrefix: upstream.WithoutPathPrefix,
} }
upstream.Hosts[i] = uh upstream.Hosts[i] = uh
} }
return upstream return upstream

View file

@ -27,6 +27,7 @@ func TestStartupShutdown(t *testing.T) {
Policy: &Random{}, Policy: &Random{},
Spray: nil, Spray: nil,
FailTimeout: 10 * time.Second, FailTimeout: 10 * time.Second,
Future: 60 * time.Second,
MaxFails: 1, MaxFails: 1,
} }
g := newGrpcClient(nil, upstream) g := newGrpcClient(nil, upstream)

View file

@ -30,6 +30,7 @@ func NewLookupWithOption(hosts []string, opts Options) Proxy {
Spray: nil, Spray: nil,
FailTimeout: 10 * time.Second, FailTimeout: 10 * time.Second,
MaxFails: 3, // TODO(miek): disable error checking for simple lookups? MaxFails: 3, // TODO(miek): disable error checking for simple lookups?
Future: 60 * time.Second,
ex: newDNSExWithOption(opts), ex: newDNSExWithOption(opts),
} }
@ -40,21 +41,29 @@ func NewLookupWithOption(hosts []string, opts Options) Proxy {
Fails: 0, Fails: 0,
FailTimeout: upstream.FailTimeout, FailTimeout: upstream.FailTimeout,
Unhealthy: false,
CheckDown: func(upstream *staticUpstream) UpstreamHostDownFunc { CheckDown: func(upstream *staticUpstream) UpstreamHostDownFunc {
return func(uh *UpstreamHost) bool { return func(uh *UpstreamHost) bool {
if uh.Unhealthy {
return true down := false
uh.checkMu.Lock()
until := uh.OkUntil
uh.checkMu.Unlock()
if !until.IsZero() && time.Now().After(until) {
down = true
} }
fails := atomic.LoadInt32(&uh.Fails) fails := atomic.LoadInt32(&uh.Fails)
if fails >= upstream.MaxFails && upstream.MaxFails != 0 { if fails >= upstream.MaxFails && upstream.MaxFails != 0 {
return true down = true
} }
return false return down
} }
}(upstream), }(upstream),
WithoutPathPrefix: upstream.WithoutPathPrefix, WithoutPathPrefix: upstream.WithoutPathPrefix,
} }
upstream.Hosts[i] = uh upstream.Hosts[i] = uh
} }
p.Upstreams = &[]Upstream{upstream} p.Upstreams = &[]Upstream{upstream}

View file

@ -5,6 +5,7 @@ import (
"net/http/httptest" "net/http/httptest"
"os" "os"
"testing" "testing"
"time"
) )
var workableServer *httptest.Server var workableServer *httptest.Server
@ -54,7 +55,7 @@ func TestRoundRobinPolicy(t *testing.T) {
t.Error("Expected second round robin host to be third host in the pool.") t.Error("Expected second round robin host to be third host in the pool.")
} }
// mark host as down // mark host as down
pool[0].Unhealthy = true pool[0].OkUntil = time.Unix(0, 0)
h = rrPolicy.Select(pool) h = rrPolicy.Select(pool)
if h != pool[1] { if h != pool[1] {
t.Error("Expected third round robin host to be first host in the pool.") t.Error("Expected third round robin host to be first host in the pool.")

View file

@ -59,9 +59,11 @@ type UpstreamHost struct {
Name string // IP address (and port) of this upstream host Name string // IP address (and port) of this upstream host
Fails int32 Fails int32
FailTimeout time.Duration FailTimeout time.Duration
Unhealthy bool OkUntil time.Time
CheckDown UpstreamHostDownFunc CheckDown UpstreamHostDownFunc
CheckUrl string
WithoutPathPrefix string WithoutPathPrefix string
Checking bool
checkMu sync.Mutex checkMu sync.Mutex
} }
@ -72,7 +74,17 @@ func (uh *UpstreamHost) Down() bool {
if uh.CheckDown == nil { if uh.CheckDown == nil {
// Default settings // Default settings
fails := atomic.LoadInt32(&uh.Fails) fails := atomic.LoadInt32(&uh.Fails)
return uh.Unhealthy || fails > 0 after := false
uh.checkMu.Lock()
until := uh.OkUntil
uh.checkMu.Unlock()
if !until.IsZero() && time.Now().After(until) {
after = true
}
return after || fails > 0
} }
return uh.CheckDown(uh) return uh.CheckDown(uh)
} }

View file

@ -74,8 +74,10 @@ func TestStop(t *testing.T) {
t.Error("Expected healthchecks to hit test server. Got no healthchecks.") t.Error("Expected healthchecks to hit test server. Got no healthchecks.")
} }
// health checks run in a goroutine now, so one may well occur after we shut down,
// but we only ever expect one more
counterValueAfterWaiting := atomic.LoadInt64(&counter) counterValueAfterWaiting := atomic.LoadInt64(&counter)
if counterValueAfterWaiting != counterValueAfterShutdown { if counterValueAfterWaiting > (counterValueAfterShutdown + 1) {
t.Errorf("Expected no more healthchecks after shutdown. Got: %d healthchecks after shutdown", counterValueAfterWaiting-counterValueAfterShutdown) t.Errorf("Expected no more healthchecks after shutdown. Got: %d healthchecks after shutdown", counterValueAfterWaiting-counterValueAfterShutdown)
} }

View file

@ -36,6 +36,7 @@ type staticUpstream struct {
FailTimeout time.Duration FailTimeout time.Duration
MaxFails int32 MaxFails int32
Future time.Duration
HealthCheck struct { HealthCheck struct {
Path string Path string
Port string Port string
@ -59,6 +60,7 @@ func NewStaticUpstreams(c *caddyfile.Dispenser) ([]Upstream, error) {
Spray: nil, Spray: nil,
FailTimeout: 10 * time.Second, FailTimeout: 10 * time.Second,
MaxFails: 1, MaxFails: 1,
Future: 60 * time.Second,
ex: newDNSEx(), ex: newDNSEx(),
} }
@ -89,21 +91,25 @@ func NewStaticUpstreams(c *caddyfile.Dispenser) ([]Upstream, error) {
Conns: 0, Conns: 0,
Fails: 0, Fails: 0,
FailTimeout: upstream.FailTimeout, FailTimeout: upstream.FailTimeout,
Unhealthy: false,
CheckDown: func(upstream *staticUpstream) UpstreamHostDownFunc { CheckDown: func(upstream *staticUpstream) UpstreamHostDownFunc {
return func(uh *UpstreamHost) bool { return func(uh *UpstreamHost) bool {
down := false
uh.checkMu.Lock() uh.checkMu.Lock()
defer uh.checkMu.Unlock() until := uh.OkUntil
if uh.Unhealthy { uh.checkMu.Unlock()
return true
if !until.IsZero() && time.Now().After(until) {
down = true
} }
fails := atomic.LoadInt32(&uh.Fails) fails := atomic.LoadInt32(&uh.Fails)
if fails >= upstream.MaxFails && upstream.MaxFails != 0 { if fails >= upstream.MaxFails && upstream.MaxFails != 0 {
return true down = true
} }
return false return down
} }
}(upstream), }(upstream),
WithoutPathPrefix: upstream.WithoutPathPrefix, WithoutPathPrefix: upstream.WithoutPathPrefix,
@ -186,6 +192,12 @@ func parseBlock(c *caddyfile.Dispenser, u *staticUpstream) error {
return err return err
} }
u.HealthCheck.Interval = dur u.HealthCheck.Interval = dur
u.Future = 2 * dur
// set a minimum of 3 seconds
if u.Future < (3 * time.Second) {
u.Future = 3 * time.Second
}
} }
case "without": case "without":
if !c.NextArg() { if !c.NextArg() {
@ -247,13 +259,71 @@ func parseBlock(c *caddyfile.Dispenser, u *staticUpstream) error {
return nil return nil
} }
// healthCheckUrl runs a single HTTP health probe against host.CheckUrl and
// records the outcome in host.OkUntil.
//
// It is meant to be started in its own goroutine, one per host, so that a
// host which is slow to time out (for example, one that is absent from the
// network entirely) does not hold up the probes of the other hosts.
//
// host.Checking acts as an "in progress" flag guarded by host.checkMu: if a
// previous probe for the same host has not finished yet, this call returns
// immediately without probing. The mutex is only held for the brief flag and
// timestamp updates, never across http.Get, so callers that take the lock
// are not blocked for the duration of a probe.
//
// nextTs is the time until which the host should be considered healthy when
// the probe succeeds (a response with status 200-399). On a transport error
// or any other status code, a warning is logged and OkUntil is set to
// time.Unix(0, 0) instead.
func healthCheckUrl(nextTs time.Time, host *UpstreamHost) {
	// Claim the in-progress flag. Writing true when it is already true is
	// harmless, so the flag can be set unconditionally and the previous
	// value inspected once the lock has been released.
	host.checkMu.Lock()
	alreadyRunning := host.Checking
	host.Checking = true
	host.checkMu.Unlock()
	if alreadyRunning {
		return
	}

	// The probe itself runs without the lock held; it may take far longer
	// than one check interval when the remote end never answers.
	okUntil := nextTs
	resp, err := http.Get(host.CheckUrl)
	switch {
	case err != nil:
		log.Printf("[WARNING] Host %s health check probe failed: %v\n", host.Name, err)
		okUntil = time.Unix(0, 0)
	default:
		// Drain and close the body before looking at the status code.
		io.Copy(ioutil.Discard, resp.Body)
		resp.Body.Close()
		if code := resp.StatusCode; !(code >= 200 && code < 400) {
			log.Printf("[WARNING] Host %s health check returned HTTP code %d\n",
				host.Name, code)
			okUntil = time.Unix(0, 0)
		}
	}

	// Publish the result and release the in-progress flag together.
	host.checkMu.Lock()
	host.OkUntil = okUntil
	host.Checking = false
	host.checkMu.Unlock()
}
func (u *staticUpstream) healthCheck() { func (u *staticUpstream) healthCheck() {
for _, host := range u.Hosts { for _, host := range u.Hosts {
if host.CheckUrl == "" {
var hostName, checkPort string var hostName, checkPort string
// The DNS server might be an HTTP server. If so, extract its name. // The DNS server might be an HTTP server. If so, extract its name.
if url, err := url.Parse(host.Name); err == nil { ret, err := url.Parse(host.Name)
hostName = url.Host if err == nil && len(ret.Host) > 0 {
hostName = ret.Host
} else { } else {
hostName = host.Name hostName = host.Name
} }
@ -268,25 +338,14 @@ func (u *staticUpstream) healthCheck() {
checkPort = u.HealthCheck.Port checkPort = u.HealthCheck.Port
} }
hostURL := "http://" + net.JoinHostPort(checkHostName, checkPort) + u.HealthCheck.Path host.CheckUrl = "http://" + net.JoinHostPort(checkHostName, checkPort) + u.HealthCheck.Path
host.checkMu.Lock()
defer host.checkMu.Unlock()
if r, err := http.Get(hostURL); err == nil {
io.Copy(ioutil.Discard, r.Body)
r.Body.Close()
if r.StatusCode < 200 || r.StatusCode >= 400 {
log.Printf("[WARNING] Health check URL %s returned HTTP code %d\n",
hostURL, r.StatusCode)
host.Unhealthy = true
} else {
host.Unhealthy = false
}
} else {
log.Printf("[WARNING] Health check probe failed: %v\n", err)
host.Unhealthy = true
} }
// calculate this before the get
nextTs := time.Now().Add(u.Future)
// locks/bools should prevent requests backing up
go healthCheckUrl(nextTs, host)
} }
} }

View file

@ -23,9 +23,14 @@ func TestHealthCheck(t *testing.T) {
Policy: &Random{}, Policy: &Random{},
Spray: nil, Spray: nil,
FailTimeout: 10 * time.Second, FailTimeout: 10 * time.Second,
Future: 60 * time.Second,
MaxFails: 1, MaxFails: 1,
} }
upstream.healthCheck() upstream.healthCheck()
// sleep a bit, it's async now
time.Sleep(time.Duration(2 * time.Second))
if upstream.Hosts[0].Down() { if upstream.Hosts[0].Down() {
t.Error("Expected first host in testpool to not fail healthcheck.") t.Error("Expected first host in testpool to not fail healthcheck.")
} }
@ -40,15 +45,16 @@ func TestSelect(t *testing.T) {
Hosts: testPool()[:3], Hosts: testPool()[:3],
Policy: &Random{}, Policy: &Random{},
FailTimeout: 10 * time.Second, FailTimeout: 10 * time.Second,
Future: 60 * time.Second,
MaxFails: 1, MaxFails: 1,
} }
upstream.Hosts[0].Unhealthy = true upstream.Hosts[0].OkUntil = time.Unix(0, 0)
upstream.Hosts[1].Unhealthy = true upstream.Hosts[1].OkUntil = time.Unix(0, 0)
upstream.Hosts[2].Unhealthy = true upstream.Hosts[2].OkUntil = time.Unix(0, 0)
if h := upstream.Select(); h != nil { if h := upstream.Select(); h != nil {
t.Error("Expected select to return nil as all host are down") t.Error("Expected select to return nil as all host are down")
} }
upstream.Hosts[2].Unhealthy = false upstream.Hosts[2].OkUntil = time.Time{}
if h := upstream.Select(); h == nil { if h := upstream.Select(); h == nil {
t.Error("Expected select to not return nil") t.Error("Expected select to not return nil")
} }