reload: use OnRestart (#1709)

* reload: use OnRestart

Close the listener in OnRestart for health and metrics so the default
setup function can set up the listener again when the plugin is "starting up" after a reload.
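
To illustrate the idea, here is a minimal sketch (hypothetical names, not the actual diff below) of the restart/final-shutdown split for a plugin that owns its own listener:

~~~ go
// Sketch only (illustrative names, not the diff below): a plugin that owns an
// HTTP listener closes it on restart as well as on final shutdown, so that
// setup can bind the address again when the reloaded config starts up.
package example

import "net"

type examplePlugin struct {
	ln      net.Listener
	nlSetup bool
}

func (e *examplePlugin) OnStartup() error {
	ln, err := net.Listen("tcp", ":8080")
	if err != nil {
		return err
	}
	e.ln, e.nlSetup = ln, true
	return nil
}

// OnRestart frees the port so the setup of the new config can claim it again.
func (e *examplePlugin) OnRestart() error { return e.OnFinalShutdown() }

// OnFinalShutdown closes the listener for good.
func (e *examplePlugin) OnFinalShutdown() error {
	if !e.nlSetup {
		return nil
	}
	e.nlSetup = false
	return e.ln.Close()
}
~~~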

Lightly tested with some SIGUSR1-ing. Also checked the reload plugin with
this; it seems fine:

.com.:1043
.:1043
2018/04/20 15:01:25 [INFO] CoreDNS-1.1.1
2018/04/20 15:01:25 [INFO] linux/amd64, go1.10,
CoreDNS-1.1.1
linux/amd64, go1.10,
2018/04/20 15:01:25 [INFO] Running configuration MD5 = aa8b3f03946fb60546ca1f725d482714
2018/04/20 15:02:01 [INFO] Reloading
2018/04/20 15:02:01 [INFO] Running configuration MD5 = b34a96d99e01db4015a892212560155f
2018/04/20 15:02:01 [INFO] Reloading complete
^C2018/04/20 15:02:06 [INFO] SIGINT: Shutting down

With this corefile:
.com {
  proxy . 127.0.0.1:53
  prometheus :9054
  whoami
  reload
}

. {
  proxy . 127.0.0.1:53
  prometheus :9054
  whoami
  reload
}

The prometheus port was 9053; changed that to 9054 so the reload would pick it
up.

From a cursory look it seems this also fixes:
Fixes #1604 #1618 #1686 #1492

* At least make it test

* Use onfinalshutdown

* reload: add reload test

This tests #1604 and right now fails.

* Address review comments

* Add bug section explaining things a bit

* compile tests

* Fix tests

* fixes

* slightly less crazy

* try to make prometheus setup less confusing

* Use ephemeral port for test

* Don't use the listener

* These are shared between goroutines; just use the boolean in the main
  structure.
* Fix text in the reload README.
* Set addr to 'todo' once stopping it (see the sketch after this list).
* Morph fturb's comment into a test, to test reload and scrape the health and
  metrics endpoints.
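
For context, a sketch (a hypothetical test, not part of this commit) of how the address bookkeeping added in plugin/metrics/addr.go below is meant to behave across a reload:

~~~ go
package metrics

import "testing"

// TestAddrReloadSketch is illustrative only: it drives newAddress, setAddress,
// forEachTodo and setAddressTodo the way setup and OnRestart use them.
func TestAddrReloadSketch(t *testing.T) {
	started := 0
	u := newAddress()
	u.setAddress(":9153", func() error { started++; return nil })

	u.forEachTodo() // first startup: runs the startup function once
	u.forEachTodo() // running it again is a no-op, the addr is now 'done'

	u.setAddressTodo(":9153") // OnRestart closed the listener: re-arm the addr
	u.forEachTodo()           // startup of the reloaded config: runs it again

	if started != 2 {
		t.Fatalf("expected the startup function to run twice, got %d", started)
	}
}
~~~
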
Miek Gieben 2018-04-21 17:43:02 +01:00 committed by GitHub
parent 135377bf77
commit acbcad7b4e
11 changed files with 227 additions and 55 deletions


@ -17,9 +17,9 @@ health [ADDRESS]
~~~
Optionally takes an address; the default is `:8080`. The health path is fixed to `/health`. The
health endpoint returns a 200 response code and the word "OK" when CoreDNS is healthy. It returns
a 503. *health* periodically (1s) polls plugin that exports health information. If any of the
plugin signals that it is unhealthy, the server will go unhealthy too. Each plugin that supports
health endpoint returns a 200 response code and the word "OK" when this server is healthy. It returns
a 503. *health* periodically (1s) polls plugins that export health information. If any of the
plugins signals that it is unhealthy, the server will go unhealthy too. Each plugin that supports
health checks has a section "Health" in their README.
More options can be set with this extended syntax:
@ -33,7 +33,7 @@ health [ADDRESS] {
* Where `lameduck` will make the process unhealthy then *wait* for **DURATION** before the process
shuts down.
If you have multiple Server Block and need to export health for each of the plugins, you must run
If you have multiple Server Blocks and need to export health for each of the plugins, you must run
health endpoints on different ports:
~~~ corefile


@ -16,8 +16,9 @@ type health struct {
Addr string
lameduck time.Duration
ln net.Listener
mux *http.ServeMux
ln net.Listener
nlSetup bool
mux *http.ServeMux
// A slice of Healthers that the health plugin will poll every second for their health status.
h []Healther
@ -45,6 +46,7 @@ func (h *health) OnStartup() error {
h.ln = ln
h.mux = http.NewServeMux()
h.nlSetup = true
h.mux.HandleFunc(path, func(w http.ResponseWriter, r *http.Request) {
if h.Ok() {
@ -61,7 +63,13 @@ func (h *health) OnStartup() error {
return nil
}
func (h *health) OnShutdown() error {
func (h *health) OnRestart() error { return h.OnFinalShutdown() }
func (h *health) OnFinalShutdown() error {
if !h.nlSetup {
return nil
}
// Stop polling plugins
h.pollstop <- true
// NACK health
@ -72,11 +80,10 @@ func (h *health) OnShutdown() error {
time.Sleep(h.lameduck)
}
if h.ln != nil {
return h.ln.Close()
}
h.ln.Close()
h.stop <- true
h.nlSetup = false
return nil
}


@ -17,7 +17,7 @@ func TestHealth(t *testing.T) {
if err := h.OnStartup(); err != nil {
t.Fatalf("Unable to startup the health server: %v", err)
}
defer h.OnShutdown()
defer h.OnFinalShutdown()
go func() {
<-h.pollstop
@ -73,5 +73,5 @@ func TestHealthLameduck(t *testing.T) {
return
}()
h.OnShutdown()
h.OnFinalShutdown()
}


@ -60,7 +60,8 @@ func setup(c *caddy.Controller) error {
})
c.OnStartup(h.OnStartup)
c.OnShutdown(h.OnShutdown)
c.OnRestart(h.OnRestart)
c.OnFinalShutdown(h.OnFinalShutdown)
// Don't do AddPlugin, as health is not *really* a plugin just a separate webserver running.
return nil

plugin/metrics/addr.go (new file)

@ -0,0 +1,52 @@
package metrics
// addrs keeps track of which addrs we listen on, so we only start one listener if
// prometheus is used in multiple Server Blocks.
type addrs struct {
a map[string]value
}
type value struct {
state int
f func() error
}
var uniqAddr addrs
func newAddress() addrs {
return addrs{a: make(map[string]value)}
}
func (a addrs) setAddress(addr string, f func() error) {
if a.a[addr].state == done {
return
}
a.a[addr] = value{todo, f}
}
// setAddressTodo sets addr to 'todo' again.
func (a addrs) setAddressTodo(addr string) {
v, ok := a.a[addr]
if !ok {
return
}
v.state = todo
a.a[addr] = v
}
// forEachTodo iterates over a and executes f for each element that is 'todo' and sets it to 'done'.
func (a addrs) forEachTodo() error {
for k, v := range a.a {
if v.state == todo {
v.f()
}
v.state = done
a.a[k] = v
}
return nil
}
const (
todo = 1
done = 2
)


@ -0,0 +1,17 @@
package metrics
import "testing"
func TestForEachTodo(t *testing.T) {
a, i := newAddress(), 0
a.setAddress("test", func() error { i++; return nil })
a.forEachTodo()
if i != 1 {
t.Errorf("Failed to executed f for %s", "test")
}
a.forEachTodo()
if i != 1 {
t.Errorf("Executed f twice instead of once")
}
}


@ -19,11 +19,12 @@ import (
// Metrics holds the prometheus configuration. The metrics' path is fixed to be /metrics
type Metrics struct {
Next plugin.Handler
Addr string
Reg *prometheus.Registry
ln net.Listener
mux *http.ServeMux
Next plugin.Handler
Addr string
Reg *prometheus.Registry
ln net.Listener
lnSetup bool
mux *http.ServeMux
zoneNames []string
zoneMap map[string]bool
@ -93,7 +94,8 @@ func (m *Metrics) OnStartup() error {
}
m.ln = ln
ListenAddr = m.ln.Addr().String()
m.lnSetup = true
ListenAddr = m.ln.Addr().String() // For tests
m.mux = http.NewServeMux()
m.mux.Handle("/metrics", promhttp.HandlerFor(m.Reg, promhttp.HandlerOpts{}))
@ -104,14 +106,29 @@ func (m *Metrics) OnStartup() error {
return nil
}
// OnShutdown tears down the metrics listener on shutdown and restart.
func (m *Metrics) OnShutdown() error {
// OnRestart stops the listener on reload.
func (m *Metrics) OnRestart() error {
if !m.lnSetup {
return nil
}
uniqAddr.setAddressTodo(m.Addr)
m.ln.Close()
m.lnSetup = false
return nil
}
// OnFinalShutdown tears down the metrics listener on shutdown.
func (m *Metrics) OnFinalShutdown() error {
// We allow prometheus statements in multiple Server Blocks, but only the first
// will open the listener, for the rest they are all nil; guard against that.
if m.ln != nil {
return m.ln.Close()
if !m.lnSetup {
return nil
}
return nil
m.lnSetup = false
return m.ln.Close()
}
func keys(m map[string]bool) []string {


@ -18,7 +18,7 @@ func TestMetrics(t *testing.T) {
if err := met.OnStartup(); err != nil {
t.Fatalf("Failed to start metrics handler: %s", err)
}
defer met.OnShutdown()
defer met.OnFinalShutdown()
met.AddZone("example.org.")


@ -15,7 +15,7 @@ func init() {
Action: setup,
})
uniqAddr = addrs{a: make(map[string]int)}
uniqAddr = newAddress()
}
func setup(c *caddy.Controller) error {
@ -29,14 +29,15 @@ func setup(c *caddy.Controller) error {
return m
})
for a, v := range uniqAddr.a {
if v == todo {
c.OncePerServerBlock(m.OnStartup)
}
uniqAddr.a[a] = done
}
c.OncePerServerBlock(func() error {
c.OnStartup(func() error {
return uniqAddr.forEachTodo()
})
return nil
})
c.OnShutdown(m.OnShutdown)
c.OnRestart(m.OnRestart)
c.OnFinalShutdown(m.OnFinalShutdown)
return nil
}
@ -45,7 +46,7 @@ func prometheusParse(c *caddy.Controller) (*Metrics, error) {
var met = New(defaultAddr)
defer func() {
uniqAddr.SetAddress(met.Addr)
uniqAddr.setAddress(met.Addr, met.OnStartup)
}()
i := 0
@ -75,25 +76,5 @@ func prometheusParse(c *caddy.Controller) (*Metrics, error) {
return met, nil
}
var uniqAddr addrs
// Keep track on which addrs we listen, so we only start one listener.
type addrs struct {
a map[string]int
}
func (a *addrs) SetAddress(addr string) {
// If already there and set to done, we've already started this listener.
if a.a[addr] == done {
return
}
a.a[addr] = todo
}
// defaultAddr is the address where the metrics are exported by default.
const defaultAddr = "localhost:9153"
const (
todo = 1
done = 2
)


@ -13,7 +13,8 @@ or SIGUSR1 after changing the Corefile.
The reloads are graceful - you should not see any loss of service when the
reload happens. Even if the new Corefile has an error, CoreDNS will continue
to run the old config and an error message will be printed to the log.
to run the old config and an error message will be printed to the log. But see
the Bugs section for failure modes.
In some environments (for example, Kubernetes), there may be many CoreDNS
instances that started very near the same time and all share a common
@ -59,3 +60,28 @@ Check every 10 seconds (jitter is automatically set to 10 / 2 = 5 in this case):
erratic
}
~~~
## Bugs
The reload happens without data loss (i.e. DNS queries keep flowing), but there is a corner case
where the reload fails, and you lose functionality. Consider the following Corefile:
~~~ txt
. {
health :8080
whoami
}
~~~
CoreDNS starts and serves health from :8080. Now you change `:8080` to `:443`, not knowing a process
is already listening on that port. The process reloads and performs the following steps:
1. close the listener on 8080
2. reload and parse the config again
3. fail to start a new listener on 443
4. fail loading the new Corefile, abort and keep using the old process
After the aborted attempt to reload we are left with the old process running, but the listener is
closed in step 1, so the health endpoint is broken. The same can happen in the prometheus metrics plugin.
In general, be careful with assigning new ports and expecting reload to work fully.


@ -1,6 +1,9 @@
package test
import (
"bytes"
"io/ioutil"
"net/http"
"testing"
"github.com/miekg/dns"
@ -52,3 +55,71 @@ func send(t *testing.T, server string) {
t.Fatalf("Expected 2 RRs in additional, got %d", len(r.Extra))
}
}
func TestReloadHealth(t *testing.T) {
corefile := `
.:0 {
health 127.0.0.1:52182
whoami
}`
c, err := CoreDNSServer(corefile)
if err != nil {
t.Fatalf("Could not get service instance: %s", err)
}
// This fails with address 8080 already in use, it shouldn't.
if c1, err := c.Restart(NewInput(corefile)); err != nil {
t.Fatal(err)
} else {
c1.Stop()
}
}
func TestReloadMetricsHealth(t *testing.T) {
corefile := `
.:0 {
prometheus 127.0.0.1:53183
health 127.0.0.1:53184
whoami
}`
c, err := CoreDNSServer(corefile)
if err != nil {
t.Fatalf("Could not get service instance: %s", err)
}
c1, err := c.Restart(NewInput(corefile))
if err != nil {
t.Fatal(err)
}
defer c1.Stop()
// Send query to trigger monitoring to export on the new process
udp, _ := CoreDNSServerPorts(c1, 0)
m := new(dns.Msg)
m.SetQuestion("example.org.", dns.TypeA)
if _, err := dns.Exchange(m, udp); err != nil {
t.Fatal(err)
}
// Health
resp, err := http.Get("http://localhost:53184/health")
if err != nil {
t.Fatal(err)
}
ok, _ := ioutil.ReadAll(resp.Body)
resp.Body.Close()
if string(ok) != "OK" {
t.Errorf("Failed to receive OK, got %s", ok)
}
// Metrics
resp, err = http.Get("http://localhost:53183/metrics")
if err != nil {
t.Fatal(err)
}
const proc = "process_virtual_memory_bytes"
metrics, _ := ioutil.ReadAll(resp.Body)
if !bytes.Contains(metrics, []byte(proc)) {
t.Errorf("Failed to see %s in metric output", proc)
}
}