diff --git a/plugin/health/README.md b/plugin/health/README.md index f94355649..e547028c2 100644 --- a/plugin/health/README.md +++ b/plugin/health/README.md @@ -17,9 +17,9 @@ health [ADDRESS] ~~~ Optionally takes an address; the default is `:8080`. The health path is fixed to `/health`. The -health endpoint returns a 200 response code and the word "OK" when CoreDNS is healthy. It returns -a 503. *health* periodically (1s) polls plugin that exports health information. If any of the -plugin signals that it is unhealthy, the server will go unhealthy too. Each plugin that supports +health endpoint returns a 200 response code and the word "OK" when this server is healthy. It returns +a 503. *health* periodically (1s) polls plugins that export health information. If any of the +plugins signal that it is unhealthy, the server will go unhealthy too. Each plugin that supports health checks has a section "Health" in their README. More options can be set with this extended syntax: @@ -33,7 +33,7 @@ health [ADDRESS] { * Where `lameduck` will make the process unhealthy then *wait* for **DURATION** before the process shuts down. -If you have multiple Server Block and need to export health for each of the plugins, you must run +If you have multiple Server Blocks and need to export health for each of the plugins, you must run health endpoints on different ports: ~~~ corefile diff --git a/plugin/health/health.go b/plugin/health/health.go index f6b82804e..782099154 100644 --- a/plugin/health/health.go +++ b/plugin/health/health.go @@ -16,8 +16,9 @@ type health struct { Addr string lameduck time.Duration - ln net.Listener - mux *http.ServeMux + ln net.Listener + nlSetup bool + mux *http.ServeMux // A slice of Healthers that the health plugin will poll every second for their health status. 
h []Healther @@ -45,6 +46,7 @@ func (h *health) OnStartup() error { h.ln = ln h.mux = http.NewServeMux() + h.nlSetup = true h.mux.HandleFunc(path, func(w http.ResponseWriter, r *http.Request) { if h.Ok() { @@ -61,7 +63,13 @@ func (h *health) OnStartup() error { return nil } -func (h *health) OnShutdown() error { +func (h *health) OnRestart() error { return h.OnFinalShutdown() } + +func (h *health) OnFinalShutdown() error { + if !h.nlSetup { + return nil + } + // Stop polling plugins h.pollstop <- true // NACK health @@ -72,11 +80,10 @@ func (h *health) OnShutdown() error { time.Sleep(h.lameduck) } - if h.ln != nil { - return h.ln.Close() - } + h.ln.Close() h.stop <- true + h.nlSetup = false return nil } diff --git a/plugin/health/health_test.go b/plugin/health/health_test.go index 8f325c9c4..acd06ea37 100644 --- a/plugin/health/health_test.go +++ b/plugin/health/health_test.go @@ -17,7 +17,7 @@ func TestHealth(t *testing.T) { if err := h.OnStartup(); err != nil { t.Fatalf("Unable to startup the health server: %v", err) } - defer h.OnShutdown() + defer h.OnFinalShutdown() go func() { <-h.pollstop @@ -73,5 +73,5 @@ func TestHealthLameduck(t *testing.T) { return }() - h.OnShutdown() + h.OnFinalShutdown() } diff --git a/plugin/health/setup.go b/plugin/health/setup.go index bd5b5c3fc..0b90d829a 100644 --- a/plugin/health/setup.go +++ b/plugin/health/setup.go @@ -60,7 +60,8 @@ func setup(c *caddy.Controller) error { }) c.OnStartup(h.OnStartup) - c.OnShutdown(h.OnShutdown) + c.OnRestart(h.OnRestart) + c.OnFinalShutdown(h.OnFinalShutdown) // Don't do AddPlugin, as health is not *really* a plugin just a separate webserver running. return nil diff --git a/plugin/metrics/addr.go b/plugin/metrics/addr.go new file mode 100644 index 000000000..fe8e5e5fe --- /dev/null +++ b/plugin/metrics/addr.go @@ -0,0 +1,52 @@ +package metrics + +// addrs keeps track on which addrs we listen, so we only start one listener, is +// prometheus is used in multiple Server Blocks. 
+type addrs struct { + a map[string]value +} + +type value struct { + state int + f func() error +} + +var uniqAddr addrs + +func newAddress() addrs { + return addrs{a: make(map[string]value)} +} + +func (a addrs) setAddress(addr string, f func() error) { + if a.a[addr].state == done { + return + } + a.a[addr] = value{todo, f} +} + +// setAddressTodo sets addr to 'todo' again. +func (a addrs) setAddressTodo(addr string) { + v, ok := a.a[addr] + if !ok { + return + } + v.state = todo + a.a[addr] = v +} + +// forEachTodo iterates over a and executes f for each element that is 'todo' and sets it to 'done'. +func (a addrs) forEachTodo() error { + for k, v := range a.a { + if v.state == todo { + v.f() + } + v.state = done + a.a[k] = v + } + return nil +} + +const ( + todo = 1 + done = 2 +) diff --git a/plugin/metrics/addr_test.go b/plugin/metrics/addr_test.go new file mode 100644 index 000000000..d7a08656b --- /dev/null +++ b/plugin/metrics/addr_test.go @@ -0,0 +1,17 @@ +package metrics + +import "testing" + +func TestForEachTodo(t *testing.T) { + a, i := newAddress(), 0 + a.setAddress("test", func() error { i++; return nil }) + + a.forEachTodo() + if i != 1 { + t.Errorf("Failed to executed f for %s", "test") + } + a.forEachTodo() + if i != 1 { + t.Errorf("Executed f twice instead of once") + } +} diff --git a/plugin/metrics/metrics.go b/plugin/metrics/metrics.go index d52b6717a..384aa8bb4 100644 --- a/plugin/metrics/metrics.go +++ b/plugin/metrics/metrics.go @@ -19,11 +19,12 @@ import ( // Metrics holds the prometheus configuration. 
The metrics' path is fixed to be /metrics type Metrics struct { - Next plugin.Handler - Addr string - Reg *prometheus.Registry - ln net.Listener - mux *http.ServeMux + Next plugin.Handler + Addr string + Reg *prometheus.Registry + ln net.Listener + lnSetup bool + mux *http.ServeMux zoneNames []string zoneMap map[string]bool @@ -93,7 +94,8 @@ func (m *Metrics) OnStartup() error { } m.ln = ln - ListenAddr = m.ln.Addr().String() + m.lnSetup = true + ListenAddr = m.ln.Addr().String() // For tests m.mux = http.NewServeMux() m.mux.Handle("/metrics", promhttp.HandlerFor(m.Reg, promhttp.HandlerOpts{})) @@ -104,14 +106,29 @@ func (m *Metrics) OnStartup() error { return nil } -// OnShutdown tears down the metrics listener on shutdown and restart. -func (m *Metrics) OnShutdown() error { +// OnRestart stops the listener on reload. +func (m *Metrics) OnRestart() error { + if !m.lnSetup { + return nil + } + + uniqAddr.setAddressTodo(m.Addr) + + m.ln.Close() + m.lnSetup = false + return nil +} + +// OnFinalShutdown tears down the metrics listener on shutdown and restart. +func (m *Metrics) OnFinalShutdown() error { // We allow prometheus statements in multiple Server Blocks, but only the first // will open the listener, for the rest they are all nil; guard against that. 
- if m.ln != nil { - return m.ln.Close() + if !m.lnSetup { + return nil } - return nil + + m.lnSetup = false + return m.ln.Close() } func keys(m map[string]bool) []string { diff --git a/plugin/metrics/metrics_test.go b/plugin/metrics/metrics_test.go index 5cdbb448c..bc1e3beb5 100644 --- a/plugin/metrics/metrics_test.go +++ b/plugin/metrics/metrics_test.go @@ -18,7 +18,7 @@ func TestMetrics(t *testing.T) { if err := met.OnStartup(); err != nil { t.Fatalf("Failed to start metrics handler: %s", err) } - defer met.OnShutdown() + defer met.OnFinalShutdown() met.AddZone("example.org.") diff --git a/plugin/metrics/setup.go b/plugin/metrics/setup.go index 7dac5ee24..5b605f554 100644 --- a/plugin/metrics/setup.go +++ b/plugin/metrics/setup.go @@ -15,7 +15,7 @@ func init() { Action: setup, }) - uniqAddr = addrs{a: make(map[string]int)} + uniqAddr = newAddress() } func setup(c *caddy.Controller) error { @@ -29,14 +29,15 @@ func setup(c *caddy.Controller) error { return m }) - for a, v := range uniqAddr.a { - if v == todo { - c.OncePerServerBlock(m.OnStartup) - } - uniqAddr.a[a] = done - } + c.OncePerServerBlock(func() error { + c.OnStartup(func() error { + return uniqAddr.forEachTodo() + }) + return nil + }) - c.OnShutdown(m.OnShutdown) + c.OnRestart(m.OnRestart) + c.OnFinalShutdown(m.OnFinalShutdown) return nil } @@ -45,7 +46,7 @@ func prometheusParse(c *caddy.Controller) (*Metrics, error) { var met = New(defaultAddr) defer func() { - uniqAddr.SetAddress(met.Addr) + uniqAddr.setAddress(met.Addr, met.OnStartup) }() i := 0 @@ -75,25 +76,5 @@ func prometheusParse(c *caddy.Controller) (*Metrics, error) { return met, nil } -var uniqAddr addrs - -// Keep track on which addrs we listen, so we only start one listener. -type addrs struct { - a map[string]int -} - -func (a *addrs) SetAddress(addr string) { - // If already there and set to done, we've already started this listener. 
- if a.a[addr] == done { - return - } - a.a[addr] = todo -} - // defaultAddr is the address the where the metrics are exported by default. const defaultAddr = "localhost:9153" - -const ( - todo = 1 - done = 2 -) diff --git a/plugin/reload/README.md b/plugin/reload/README.md index 9ebbe2dda..18d40ca5d 100644 --- a/plugin/reload/README.md +++ b/plugin/reload/README.md @@ -13,7 +13,8 @@ or SIGUSR1 after changing the Corefile. The reloads are graceful - you should not see any loss of service when the reload happens. Even if the new Corefile has an error, CoreDNS will continue -to run the old config and an error message will be printed to the log. +to run the old config and an error message will be printed to the log. But see +the Bugs section for failure modes. In some environments (for example, Kubernetes), there may be many CoreDNS instances that started very near the same time and all share a common @@ -59,3 +60,28 @@ Check every 10 seconds (jitter is automatically set to 10 / 2 = 5 in this case): erratic } ~~~ + +## Bugs + +The reload happens without data loss (i.e. DNS queries keep flowing), but there is a corner case +where the reload fails, and you lose functionality. Consider the following Corefile: + +~~~ txt +. { + health :8080 + whoami +} +~~~ + +CoreDNS starts and serves health from :8080. Now you change `:8080` to `:443` not knowing a process +is already listening on that port. The process reloads and performs the following steps: + +1. close the listener on 8080 +2. reload and parse the config again +3. fail to start a new listener on 443 +4. fail loading the new Corefile, abort and keep using the old process + +After the aborted attempt to reload we are left with the old processes running, but the listener is +closed in step 1; so the health endpoint is broken. The same can happen in the prometheus metrics plugin. + +In general be careful with assigning a new port and expecting reload to work fully. 
diff --git a/test/reload_test.go b/test/reload_test.go index 24941c6d4..e49628614 100644 --- a/test/reload_test.go +++ b/test/reload_test.go @@ -1,6 +1,9 @@ package test import ( + "bytes" + "io/ioutil" + "net/http" "testing" "github.com/miekg/dns" @@ -52,3 +55,71 @@ func send(t *testing.T, server string) { t.Fatalf("Expected 2 RRs in additional, got %d", len(r.Extra)) } } + +func TestReloadHealth(t *testing.T) { + corefile := ` +.:0 { + health 127.0.0.1:52182 + whoami +}` + c, err := CoreDNSServer(corefile) + if err != nil { + t.Fatalf("Could not get service instance: %s", err) + } + + // This fails with address 8080 already in use, it shouldn't. + if c1, err := c.Restart(NewInput(corefile)); err != nil { + t.Fatal(err) + } else { + c1.Stop() + } +} + +func TestReloadMetricsHealth(t *testing.T) { + corefile := ` +.:0 { + prometheus 127.0.0.1:53183 + health 127.0.0.1:53184 + whoami +}` + c, err := CoreDNSServer(corefile) + if err != nil { + t.Fatalf("Could not get service instance: %s", err) + } + + c1, err := c.Restart(NewInput(corefile)) + if err != nil { + t.Fatal(err) + } + defer c1.Stop() + + // Send query to trigger monitoring to export on the new process + udp, _ := CoreDNSServerPorts(c1, 0) + m := new(dns.Msg) + m.SetQuestion("example.org.", dns.TypeA) + if _, err := dns.Exchange(m, udp); err != nil { + t.Fatal(err) + } + + // Health + resp, err := http.Get("http://localhost:53184/health") + if err != nil { + t.Fatal(err) + } + ok, _ := ioutil.ReadAll(resp.Body) + resp.Body.Close() + if string(ok) != "OK" { + t.Errorf("Failed to receive OK, got %s", ok) + } + + // Metrics + resp, err = http.Get("http://localhost:53183/metrics") + if err != nil { + t.Fatal(err) + } + const proc = "process_virtual_memory_bytes" + metrics, _ := ioutil.ReadAll(resp.Body) + if !bytes.Contains(metrics, []byte(proc)) { + t.Errorf("Failed to see %s in metric output", proc) + } +}