From acbcad7b4ed0f7e0e5f4d6ec6b23509d41b92950 Mon Sep 17 00:00:00 2001
From: Miek Gieben
Date: Sat, 21 Apr 2018 17:43:02 +0100
Subject: [PATCH] reload: use OnRestart (#1709)

* reload: use OnRestart

Close the listener on OnRestart for health and metrics so the default
setup function can set up the listener again when the plugin is "starting
up". Lightly tested with some SIGUSR1-ing.

Also checked the reload plugin with this, seems fine:

.com.:1043
.:1043
2018/04/20 15:01:25 [INFO] CoreDNS-1.1.1
2018/04/20 15:01:25 [INFO] linux/amd64, go1.10,
CoreDNS-1.1.1
linux/amd64, go1.10,
2018/04/20 15:01:25 [INFO] Running configuration MD5 = aa8b3f03946fb60546ca1f725d482714
2018/04/20 15:02:01 [INFO] Reloading
2018/04/20 15:02:01 [INFO] Running configuration MD5 = b34a96d99e01db4015a892212560155f
2018/04/20 15:02:01 [INFO] Reloading complete
^C2018/04/20 15:02:06 [INFO] SIGINT: Shutting down

With this corefile:

.com {
    proxy . 127.0.0.1:53
    prometheus :9054
    whoami
    reload
}
. {
    proxy . 127.0.0.1:53
    prometheus :9054
    whoami
    reload
}

The prometheus port was 9053; changed that to 9054 so reload would pick it up.

From a cursory look it seems this also fixes:

Fixes #1604 #1618 #1686 #1492

* At least make it test

* Use OnFinalShutdown

* reload: add reload test

This tests #1604 and right now fails.

* Address review comments

* Add a Bugs section explaining things a bit

* compile tests

* Fix tests

* fixes

* slightly less crazy

* try to make prometheus setup less confusing

* Use ephemeral port for test

* Don't use the listener

* These are shared between goroutines, just use the boolean in the main structure.

* Fix text in the reload README

* Set addr to 'todo' when stopping it

* Morph fturb's comment into a test, to test reload and scrape the health and metrics endpoints
---
 plugin/health/README.md        |  8 ++--
 plugin/health/health.go        | 19 ++++++---
 plugin/health/health_test.go   |  4 +-
 plugin/health/setup.go         |  3 +-
 plugin/metrics/addr.go         | 52 +++++++++++++++++++++++++
 plugin/metrics/addr_test.go    | 17 ++++++++
 plugin/metrics/metrics.go      | 39 +++++++++++++------
 plugin/metrics/metrics_test.go |  2 +-
 plugin/metrics/setup.go        | 39 +++++--------------
 plugin/reload/README.md        | 28 +++++++++++++-
 test/reload_test.go            | 71 ++++++++++++++++++++++++++++++++++
 11 files changed, 227 insertions(+), 55 deletions(-)
 create mode 100644 plugin/metrics/addr.go
 create mode 100644 plugin/metrics/addr_test.go

diff --git a/plugin/health/README.md b/plugin/health/README.md
index f94355649..e547028c2 100644
--- a/plugin/health/README.md
+++ b/plugin/health/README.md
@@ -17,9 +17,9 @@ health [ADDRESS]
 ~~~
 
 Optionally takes an address; the default is `:8080`. The health path is fixed to `/health`. The
-health endpoint returns a 200 response code and the word "OK" when CoreDNS is healthy. It returns
-a 503. *health* periodically (1s) polls plugin that exports health information. If any of the
-plugin signals that it is unhealthy, the server will go unhealthy too. Each plugin that supports
+health endpoint returns a 200 response code and the word "OK" when this server is healthy. It returns
+a 503 otherwise. *health* periodically (1s) polls plugins that export health information. If any of the
+plugins signals that it is unhealthy, the server will go unhealthy too. Each plugin that supports
 health checks has a section "Health" in their README.
 
 More options can be set with this extended syntax:
@@ -33,7 +33,7 @@ health [ADDRESS] {
 
 * Where `lameduck` will make the process unhealthy then *wait* for **DURATION** before the process
   shuts down.
 
-If you have multiple Server Block and need to export health for each of the plugins, you must run
+If you have multiple Server Blocks and need to export health for each of the plugins, you must run
 health endpoints on different ports:
 
 ~~~ corefile
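Before the file-by-file diffs, the shape of the fix in one place: a listener-owning plugin must bind in OnStartup, release in OnRestart (so the *next* config's setup can bind, possibly on another address), and release for good in OnFinalShutdown. This is a minimal, hypothetical sketch of that contract; the `server` type and the simulated reload in `main` are invented for illustration, and in CoreDNS these hooks are registered on a `caddy.Controller` as the setup.go changes below show:

~~~ go
package main

import (
	"fmt"
	"net"
)

// server stands in for a plugin (health, metrics) that owns a listener.
type server struct {
	Addr    string
	ln      net.Listener
	lnSetup bool // true while ln is a live listener we are responsible for
}

// OnStartup binds the listener; it runs on first start and after every reload.
func (s *server) OnStartup() error {
	ln, err := net.Listen("tcp", s.Addr)
	if err != nil {
		return err
	}
	s.ln, s.lnSetup = ln, true
	return nil
}

// OnRestart runs when a reload begins: release the port so the new
// configuration's OnStartup can bind it.
func (s *server) OnRestart() error { return s.OnFinalShutdown() }

// OnFinalShutdown runs when the process exits for good.
func (s *server) OnFinalShutdown() error {
	if !s.lnSetup { // guard: this instance may never have owned a listener
		return nil
	}
	s.lnSetup = false
	return s.ln.Close()
}

func main() {
	s := &server{Addr: "127.0.0.1:0"}
	if err := s.OnStartup(); err != nil {
		panic(err)
	}
	fmt.Println("listening on", s.ln.Addr())

	// Simulate a SIGUSR1 reload: close, re-run setup, bind again.
	s.OnRestart()
	if err := s.OnStartup(); err != nil {
		panic(err)
	}
	fmt.Println("relistening on", s.ln.Addr())
	s.OnFinalShutdown()
}
~~~

The `lnSetup` boolean mirrors the `nlSetup`/`lnSetup` fields added in the diffs below: the hooks can fire on an instance that never opened a listener, so closing must be guarded.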
diff --git a/plugin/health/health.go b/plugin/health/health.go
index f6b82804e..782099154 100644
--- a/plugin/health/health.go
+++ b/plugin/health/health.go
@@ -16,8 +16,9 @@ type health struct {
 	Addr     string
 	lameduck time.Duration
 
-	ln  net.Listener
-	mux *http.ServeMux
+	ln      net.Listener
+	nlSetup bool
+	mux     *http.ServeMux
 
 	// A slice of Healthers that the health plugin will poll every second for their health status.
 	h []Healther
@@ -45,6 +46,7 @@ func (h *health) OnStartup() error {
 	h.ln = ln
 
 	h.mux = http.NewServeMux()
+	h.nlSetup = true
 
 	h.mux.HandleFunc(path, func(w http.ResponseWriter, r *http.Request) {
 		if h.Ok() {
@@ -61,7 +63,13 @@ func (h *health) OnStartup() error {
 	return nil
 }
 
-func (h *health) OnShutdown() error {
+func (h *health) OnRestart() error { return h.OnFinalShutdown() }
+
+func (h *health) OnFinalShutdown() error {
+	if !h.nlSetup {
+		return nil
+	}
+
 	// Stop polling plugins
 	h.pollstop <- true
 
 	// NACK health
@@ -72,11 +80,10 @@ func (h *health) OnShutdown() error {
 		time.Sleep(h.lameduck)
 	}
 
-	if h.ln != nil {
-		return h.ln.Close()
-	}
+	h.ln.Close()
 
 	h.stop <- true
+	h.nlSetup = false
 	return nil
 }
diff --git a/plugin/health/health_test.go b/plugin/health/health_test.go
index 8f325c9c4..acd06ea37 100644
--- a/plugin/health/health_test.go
+++ b/plugin/health/health_test.go
@@ -17,7 +17,7 @@ func TestHealth(t *testing.T) {
 	if err := h.OnStartup(); err != nil {
 		t.Fatalf("Unable to startup the health server: %v", err)
 	}
-	defer h.OnShutdown()
+	defer h.OnFinalShutdown()
 
 	go func() {
 		<-h.pollstop
@@ -73,5 +73,5 @@ func TestHealthLameduck(t *testing.T) {
 		return
 	}()
 
-	h.OnShutdown()
+	h.OnFinalShutdown()
 }
diff --git a/plugin/health/setup.go b/plugin/health/setup.go
index bd5b5c3fc..0b90d829a 100644
--- a/plugin/health/setup.go
+++ b/plugin/health/setup.go
@@ -60,7 +60,8 @@ func setup(c *caddy.Controller) error {
 	})
 
 	c.OnStartup(h.OnStartup)
-	c.OnShutdown(h.OnShutdown)
+	c.OnRestart(h.OnRestart)
+	c.OnFinalShutdown(h.OnFinalShutdown)
 
 	// Don't do AddPlugin, as health is not *really* a plugin just a separate webserver running.
 	return nil
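One detail worth calling out in health.go above: OnFinalShutdown NACKs health *before* sleeping for the lameduck window, and only then closes the listener, so load balancers see 503s while draining. Below is a standalone sketch of that ordering. The `healthServer` type and its methods are invented for this example, and it uses Go 1.19's `atomic.Bool` where the real plugin coordinates through channels:

~~~ go
package main

import (
	"net"
	"net/http"
	"sync/atomic"
	"time"
)

type healthServer struct {
	ok       atomic.Bool   // health state; false means NACK with 503
	lameduck time.Duration // how long to advertise unhealthiness before exiting
	ln       net.Listener
}

func (h *healthServer) start(addr string) error {
	ln, err := net.Listen("tcp", addr)
	if err != nil {
		return err
	}
	h.ln = ln
	h.ok.Store(true)

	mux := http.NewServeMux()
	mux.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) {
		if h.ok.Load() {
			w.Write([]byte("OK")) // 200 and the word "OK", as in the README
			return
		}
		w.WriteHeader(http.StatusServiceUnavailable)
	})
	go http.Serve(ln, mux)
	return nil
}

func (h *healthServer) shutdown() error {
	h.ok.Store(false)      // 1. NACK health: /health now answers 503
	time.Sleep(h.lameduck) // 2. keep the 503s flowing for the lameduck window
	return h.ln.Close()    // 3. only then drop the listener
}

func main() {
	h := &healthServer{lameduck: time.Second}
	if err := h.start("127.0.0.1:0"); err != nil {
		panic(err)
	}
	h.shutdown()
}
~~~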
diff --git a/plugin/metrics/addr.go b/plugin/metrics/addr.go
new file mode 100644
index 000000000..fe8e5e5fe
--- /dev/null
+++ b/plugin/metrics/addr.go
@@ -0,0 +1,52 @@
+package metrics
+
+// addrs keeps track of which addrs we listen on, so we only start one
+// listener if prometheus is used in multiple Server Blocks.
+type addrs struct {
+	a map[string]value
+}
+
+type value struct {
+	state int
+	f     func() error
+}
+
+var uniqAddr addrs
+
+func newAddress() addrs {
+	return addrs{a: make(map[string]value)}
+}
+
+func (a addrs) setAddress(addr string, f func() error) {
+	if a.a[addr].state == done {
+		return
+	}
+	a.a[addr] = value{todo, f}
+}
+
+// setAddressTodo sets addr to 'todo' again.
+func (a addrs) setAddressTodo(addr string) {
+	v, ok := a.a[addr]
+	if !ok {
+		return
+	}
+	v.state = todo
+	a.a[addr] = v
+}
+
+// forEachTodo iterates over a, executes f for each element that is 'todo' and sets it to 'done'.
+func (a addrs) forEachTodo() error {
+	for k, v := range a.a {
+		if v.state == todo {
+			v.f()
+		}
+		v.state = done
+		a.a[k] = v
+	}
+	return nil
+}
+
+const (
+	todo = 1
+	done = 2
+)
diff --git a/plugin/metrics/addr_test.go b/plugin/metrics/addr_test.go
new file mode 100644
index 000000000..d7a08656b
--- /dev/null
+++ b/plugin/metrics/addr_test.go
@@ -0,0 +1,17 @@
+package metrics
+
+import "testing"
+
+func TestForEachTodo(t *testing.T) {
+	a, i := newAddress(), 0
+	a.setAddress("test", func() error { i++; return nil })
+
+	a.forEachTodo()
+	if i != 1 {
+		t.Errorf("Failed to execute f for %s", "test")
+	}
+	a.forEachTodo()
+	if i != 1 {
+		t.Errorf("Executed f twice instead of once")
+	}
+}
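addr_test.go above checks that `forEachTodo` fires each start function exactly once; the other half of the story is deduplication across Server Blocks and re-arming on reload. Here is a runnable sketch of both behaviors. The types are a compact copy of addr.go above so the snippet is self-contained, while `main`, the `started` counter, and the addresses are invented for illustration:

~~~ go
package main

import "fmt"

// Compact copy of plugin/metrics/addr.go above, so this sketch runs standalone.
const (
	todo = 1
	done = 2
)

type value struct {
	state int
	f     func() error
}

type addrs struct{ a map[string]value }

func newAddress() addrs { return addrs{a: make(map[string]value)} }

func (a addrs) setAddress(addr string, f func() error) {
	if a.a[addr].state == done {
		return // already listening on this address
	}
	a.a[addr] = value{todo, f}
}

func (a addrs) setAddressTodo(addr string) {
	v, ok := a.a[addr]
	if !ok {
		return
	}
	v.state = todo
	a.a[addr] = v
}

func (a addrs) forEachTodo() error {
	for k, v := range a.a {
		if v.state == todo {
			v.f()
		}
		v.state = done
		a.a[k] = v
	}
	return nil
}

func main() {
	uniq := newAddress()
	started := 0
	start := func() error { started++; return nil }

	// Two Server Blocks declare prometheus on the same address; the second
	// registration just re-registers the same pending entry.
	uniq.setAddress("localhost:9153", start)
	uniq.setAddress("localhost:9153", start)

	uniq.forEachTodo()
	uniq.forEachTodo()
	fmt.Println("after startup:", started) // 1: one listener, not two

	// A reload re-arms the address (this is what OnRestart does) ...
	uniq.setAddressTodo("localhost:9153")
	uniq.forEachTodo()
	fmt.Println("after reload:", started) // 2: started exactly once more
}
~~~

This is also why OnRestart in metrics.go below calls `setAddressTodo`: without it the address would still be marked 'done' after a reload and no listener would come back.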
diff --git a/plugin/metrics/metrics.go b/plugin/metrics/metrics.go
index d52b6717a..384aa8bb4 100644
--- a/plugin/metrics/metrics.go
+++ b/plugin/metrics/metrics.go
@@ -19,11 +19,12 @@ import (
 
 // Metrics holds the prometheus configuration. The metrics' path is fixed to be /metrics
 type Metrics struct {
-	Next plugin.Handler
-	Addr string
-	Reg  *prometheus.Registry
-	ln   net.Listener
-	mux  *http.ServeMux
+	Next    plugin.Handler
+	Addr    string
+	Reg     *prometheus.Registry
+	ln      net.Listener
+	lnSetup bool
+	mux     *http.ServeMux
 
 	zoneNames []string
 	zoneMap   map[string]bool
@@ -93,7 +94,8 @@ func (m *Metrics) OnStartup() error {
 	}
 
 	m.ln = ln
-	ListenAddr = m.ln.Addr().String()
+	m.lnSetup = true
+	ListenAddr = m.ln.Addr().String() // For tests
 
 	m.mux = http.NewServeMux()
 	m.mux.Handle("/metrics", promhttp.HandlerFor(m.Reg, promhttp.HandlerOpts{}))
@@ -104,14 +106,29 @@ func (m *Metrics) OnStartup() error {
 	return nil
 }
 
-// OnShutdown tears down the metrics listener on shutdown and restart.
-func (m *Metrics) OnShutdown() error {
+// OnRestart stops the listener on reload.
+func (m *Metrics) OnRestart() error {
+	if !m.lnSetup {
+		return nil
+	}
+
+	uniqAddr.setAddressTodo(m.Addr)
+
+	m.ln.Close()
+	m.lnSetup = false
+	return nil
+}
+
+// OnFinalShutdown tears down the metrics listener on shutdown.
+func (m *Metrics) OnFinalShutdown() error {
 	// We allow prometheus statements in multiple Server Blocks, but only the first
 	// will open the listener, for the rest they are all nil; guard against that.
-	if m.ln != nil {
-		return m.ln.Close()
+	if !m.lnSetup {
+		return nil
 	}
-	return nil
+
+	m.lnSetup = false
+	return m.ln.Close()
 }
 
 func keys(m map[string]bool) []string {
diff --git a/plugin/metrics/metrics_test.go b/plugin/metrics/metrics_test.go
index 5cdbb448c..bc1e3beb5 100644
--- a/plugin/metrics/metrics_test.go
+++ b/plugin/metrics/metrics_test.go
@@ -18,7 +18,7 @@ func TestMetrics(t *testing.T) {
 	if err := met.OnStartup(); err != nil {
 		t.Fatalf("Failed to start metrics handler: %s", err)
 	}
-	defer met.OnShutdown()
+	defer met.OnFinalShutdown()
 
 	met.AddZone("example.org.")
diff --git a/plugin/metrics/setup.go b/plugin/metrics/setup.go
index 7dac5ee24..5b605f554 100644
--- a/plugin/metrics/setup.go
+++ b/plugin/metrics/setup.go
@@ -15,7 +15,7 @@ func init() {
 		Action: setup,
 	})
 
-	uniqAddr = addrs{a: make(map[string]int)}
+	uniqAddr = newAddress()
 }
 
 func setup(c *caddy.Controller) error {
@@ -29,14 +29,15 @@ func setup(c *caddy.Controller) error {
 		return m
 	})
 
-	for a, v := range uniqAddr.a {
-		if v == todo {
-			c.OncePerServerBlock(m.OnStartup)
-		}
-		uniqAddr.a[a] = done
-	}
+	c.OncePerServerBlock(func() error {
+		c.OnStartup(func() error {
+			return uniqAddr.forEachTodo()
+		})
+		return nil
+	})
 
-	c.OnShutdown(m.OnShutdown)
+	c.OnRestart(m.OnRestart)
+	c.OnFinalShutdown(m.OnFinalShutdown)
 
 	return nil
 }
@@ -45,7 +46,7 @@ func prometheusParse(c *caddy.Controller) (*Metrics, error) {
 	var met = New(defaultAddr)
 
 	defer func() {
-		uniqAddr.SetAddress(met.Addr)
+		uniqAddr.setAddress(met.Addr, met.OnStartup)
 	}()
 
 	i := 0
@@ -75,25 +76,5 @@ func prometheusParse(c *caddy.Controller) (*Metrics, error) {
 	return met, nil
 }
 
-var uniqAddr addrs
-
-// Keep track on which addrs we listen, so we only start one listener.
-type addrs struct {
-	a map[string]int
-}
-
-func (a *addrs) SetAddress(addr string) {
-	// If already there and set to done, we've already started this listener.
-	if a.a[addr] == done {
-		return
-	}
-	a.a[addr] = todo
-}
-
 // defaultAddr is the address the where the metrics are exported by default.
 const defaultAddr = "localhost:9153"
-
-const (
-	todo = 1
-	done = 2
-)
diff --git a/plugin/reload/README.md b/plugin/reload/README.md
index 9ebbe2dda..18d40ca5d 100644
--- a/plugin/reload/README.md
+++ b/plugin/reload/README.md
@@ -13,7 +13,8 @@ or SIGUSR1 after changing the Corefile.
 
 The reloads are graceful - you should not see any loss of service when the
 reload happens. Even if the new Corefile has an error, CoreDNS will continue
-to run the old config and an error message will be printed to the log.
+to run the old config and an error message will be printed to the log. But see
+the Bugs section below for the known failure modes.
 
 In some environments (for example, Kubernetes), there may be many CoreDNS
 instances that started very near the same time and all share a common
@@ -59,3 +60,28 @@ Check every 10 seconds (jitter is automatically set to 10 / 2 = 5 in this case):
     erratic
 }
 ~~~
+
+## Bugs
+
+The reload happens without data loss (i.e. DNS queries keep flowing), but there is a corner case
+where the reload fails and you lose functionality. Consider the following Corefile:
+
+~~~ txt
+. {
+    health :8080
+    whoami
+}
+~~~
+
+CoreDNS starts and serves health from :8080. Now you change `:8080` to `:443`, not knowing a process
+is already listening on that port. The process reloads and performs the following steps:
+
+1. close the listener on :8080
+2. reload and parse the config again
+3. fail to start a new listener on :443
+4. fail loading the new Corefile, abort and keep using the old process
+
+After the aborted attempt to reload we are left with the old process running, but the listener was
+closed in step 1, so the health endpoint is broken. The same can happen with the prometheus metrics
+plugin.
+
+In general, be careful when assigning these endpoints new ports and expecting reload to keep working
+fully.
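The failure mode in the Bugs section above is easy to reproduce outside CoreDNS: once the old listener is closed (step 1), a failure at step 3 cannot be rolled back. A hypothetical sketch of steps 1-4, using ephemeral ports to stand in for :8080 and the already-occupied :443:

~~~ go
package main

import (
	"fmt"
	"net"
)

func must(ln net.Listener, err error) net.Listener {
	if err != nil {
		panic(err)
	}
	return ln
}

func main() {
	// The old config's health listener (an ephemeral port stands in for :8080).
	old := must(net.Listen("tcp", "127.0.0.1:0"))
	oldAddr := old.Addr().String()

	// Another process already owns the port the new config asks for (stands in for :443).
	taken := must(net.Listen("tcp", "127.0.0.1:0"))
	defer taken.Close()

	// Step 1: the reload starts and OnRestart closes the old listener.
	old.Close()

	// Steps 2-3: the new config parses, but binding its address fails.
	if _, err := net.Listen("tcp", taken.Addr().String()); err != nil {
		// Step 4: the reload aborts; the old process keeps serving DNS.
		fmt.Println("reload aborted:", err)
	}

	// But the old health endpoint is gone: nothing listens on oldAddr anymore.
	if _, err := net.Dial("tcp", oldAddr); err != nil {
		fmt.Println("health endpoint dead:", err)
	}
}
~~~

Step 4 is the crux: the rollback restores the old *configuration*, but it cannot restore a listener that step 1 already closed.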
diff --git a/test/reload_test.go b/test/reload_test.go
index 24941c6d4..e49628614 100644
--- a/test/reload_test.go
+++ b/test/reload_test.go
@@ -1,6 +1,9 @@
 package test
 
 import (
+	"bytes"
+	"io/ioutil"
+	"net/http"
 	"testing"
 
 	"github.com/miekg/dns"
@@ -52,3 +55,71 @@ func send(t *testing.T, server string) {
 		t.Fatalf("Expected 2 RRs in additional, got %d", len(r.Extra))
 	}
 }
+
+func TestReloadHealth(t *testing.T) {
+	corefile := `
+.:0 {
+	health 127.0.0.1:52182
+	whoami
+}`
+	c, err := CoreDNSServer(corefile)
+	if err != nil {
+		t.Fatalf("Could not get service instance: %s", err)
+	}
+
+	// Restart with the same corefile; before the OnRestart fix this failed
+	// with "address already in use", which it shouldn't.
+	if c1, err := c.Restart(NewInput(corefile)); err != nil {
+		t.Fatal(err)
+	} else {
+		c1.Stop()
+	}
+}
+
+func TestReloadMetricsHealth(t *testing.T) {
+	corefile := `
+.:0 {
+	prometheus 127.0.0.1:53183
+	health 127.0.0.1:53184
+	whoami
+}`
+	c, err := CoreDNSServer(corefile)
+	if err != nil {
+		t.Fatalf("Could not get service instance: %s", err)
+	}
+
+	c1, err := c.Restart(NewInput(corefile))
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer c1.Stop()
+
+	// Send a query so the new process serves DNS and exports metrics for it.
+	udp, _ := CoreDNSServerPorts(c1, 0)
+	m := new(dns.Msg)
+	m.SetQuestion("example.org.", dns.TypeA)
+	if _, err := dns.Exchange(m, udp); err != nil {
+		t.Fatal(err)
+	}
+
+	// Health
+	resp, err := http.Get("http://localhost:53184/health")
+	if err != nil {
+		t.Fatal(err)
+	}
+	ok, _ := ioutil.ReadAll(resp.Body)
+	resp.Body.Close()
+	if string(ok) != "OK" {
+		t.Errorf("Failed to receive OK, got %s", ok)
+	}
+
+	// Metrics
+	resp, err = http.Get("http://localhost:53183/metrics")
+	if err != nil {
+		t.Fatal(err)
+	}
+	const proc = "process_virtual_memory_bytes"
+	metrics, _ := ioutil.ReadAll(resp.Body)
+	resp.Body.Close()
+	if !bytes.Contains(metrics, []byte(proc)) {
+		t.Errorf("Failed to see %s in metric output", proc)
+	}
+}
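The tests above drive Restart programmatically; the "SIGUSR1-ing" from the commit message can be scripted too. This is a hypothetical helper, not part of the patch: the pid argument, the two-second wait, and the `localhost:8080` address are placeholders for whatever your Corefile uses.

~~~ go
// reloadcheck sends SIGUSR1 to a running CoreDNS and then verifies that the
// health endpoint survived the reload. Unix-only (syscall.Kill).
package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
	"os"
	"strconv"
	"syscall"
	"time"
)

func main() {
	if len(os.Args) != 2 {
		fmt.Fprintln(os.Stderr, "usage: reloadcheck <coredns-pid>")
		os.Exit(1)
	}
	pid, err := strconv.Atoi(os.Args[1])
	if err != nil {
		panic(err)
	}

	// Trigger the graceful reload, exactly like `kill -SIGUSR1` would.
	if err := syscall.Kill(pid, syscall.SIGUSR1); err != nil {
		panic(err)
	}
	time.Sleep(2 * time.Second) // crude: give the reload time to finish

	// Placeholder address; use whatever `health` is bound to in the Corefile.
	resp, err := http.Get("http://localhost:8080/health")
	if err != nil {
		fmt.Println("health endpoint broken after reload:", err)
		return
	}
	defer resp.Body.Close()
	body, _ := ioutil.ReadAll(resp.Body)
	fmt.Printf("health after reload: %d %s\n", resp.StatusCode, body)
}
~~~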