Metrics listener fix (#2036)

* Create test to verify correct listener behavior

* Create Unset function to remove todo items

* Reset address for prometheus listener before restarting

* Add inline documentation for Unset function

* Make shutdownTimeout a constant and change to five seconds

* Revert ForEach behavior in uniq package
This commit is contained in:
Zach Eddy 2018-08-21 08:52:25 -07:00 committed by Francois Tur
parent b87ed01bb2
commit 8aa55c5ff2
3 changed files with 108 additions and 13 deletions

View file

@ -2,10 +2,12 @@
package metrics package metrics
import ( import (
"context"
"net" "net"
"net/http" "net/http"
"os" "os"
"sync" "sync"
"time"
"github.com/coredns/coredns/plugin" "github.com/coredns/coredns/plugin"
"github.com/coredns/coredns/plugin/metrics/vars" "github.com/coredns/coredns/plugin/metrics/vars"
@ -22,6 +24,7 @@ type Metrics struct {
ln net.Listener ln net.Listener
lnSetup bool lnSetup bool
mux *http.ServeMux mux *http.ServeMux
srv *http.Server
zoneNames []string zoneNames []string
zoneMap map[string]bool zoneMap map[string]bool
@ -94,9 +97,9 @@ func (m *Metrics) OnStartup() error {
m.mux = http.NewServeMux() m.mux = http.NewServeMux()
m.mux.Handle("/metrics", promhttp.HandlerFor(m.Reg, promhttp.HandlerOpts{})) m.mux.Handle("/metrics", promhttp.HandlerFor(m.Reg, promhttp.HandlerOpts{}))
m.srv = &http.Server{Handler: m.mux}
go func() { go func() {
http.Serve(m.ln, m.mux) m.srv.Serve(m.ln)
}() }()
return nil return nil
} }
@ -106,24 +109,28 @@ func (m *Metrics) OnRestart() error {
if !m.lnSetup { if !m.lnSetup {
return nil return nil
} }
uniqAddr.Unset(m.Addr)
return m.stopServer()
}
uniqAddr.SetTodo(m.Addr) func (m *Metrics) stopServer() error {
if !m.lnSetup {
m.ln.Close() return nil
}
ctx, cancel := context.WithTimeout(context.Background(), shutdownTimeout)
defer cancel()
if err := m.srv.Shutdown(ctx); err != nil {
log.Infof("Failed to stop prometheus http server: %s", err)
return err
}
m.lnSetup = false m.lnSetup = false
m.ln.Close()
return nil return nil
} }
// OnFinalShutdown tears down the metrics listener on shutdown and restart. // OnFinalShutdown tears down the metrics listener on shutdown and restart.
func (m *Metrics) OnFinalShutdown() error { func (m *Metrics) OnFinalShutdown() error {
// We allow prometheus statements in multiple Server Blocks, but only the first return m.stopServer()
// will open the listener, for the rest they are all nil; guard against that.
if !m.lnSetup {
return nil
}
m.lnSetup = false
return m.ln.Close()
} }
func keys(m map[string]bool) []string { func keys(m map[string]bool) []string {
@ -138,6 +145,10 @@ func keys(m map[string]bool) []string {
// we listen on "localhost:0" and need to retrieve the actual address. // we listen on "localhost:0" and need to retrieve the actual address.
var ListenAddr string var ListenAddr string
// shutdownTimeout is the maximum amount of time the metrics plugin will wait
// before erroring when it tries to close the metrics server
const shutdownTimeout time.Duration = time.Second * 5
var buildInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{ var buildInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: plugin.Namespace, Namespace: plugin.Namespace,
Name: "build_info", Name: "build_info",

View file

@ -24,6 +24,13 @@ func (u U) Set(key string, f func() error) {
u.u[key] = item{todo, f} u.u[key] = item{todo, f}
} }
// Unset removes the 'todo' associated with a key
func (u U) Unset(key string) {
if _, ok := u.u[key]; ok {
delete(u.u, key)
}
}
// SetTodo sets key to 'todo' again. // SetTodo sets key to 'todo' again.
func (u U) SetTodo(key string) { func (u U) SetTodo(key string) {
v, ok := u.u[key] v, ok := u.u[key]

View file

@ -2,6 +2,7 @@ package test
import ( import (
"bytes" "bytes"
"fmt"
"io/ioutil" "io/ioutil"
"net/http" "net/http"
"strings" "strings"
@ -125,4 +126,80 @@ func TestReloadMetricsHealth(t *testing.T) {
} }
} }
func collectMetricsInfo(addr, proc string) error {
cl := &http.Client{}
resp, err := cl.Get(fmt.Sprintf("http://%s/metrics", addr))
if err != nil {
return err
}
metrics, _ := ioutil.ReadAll(resp.Body)
if !bytes.Contains(metrics, []byte(proc)) {
return fmt.Errorf("failed to see %s in metric output", proc)
}
return nil
}
// TestReloadSeveralTimeMetrics ensures that metrics are not pushed to
// prometheus once the metrics plugin is removed and a coredns
// reload is triggered
// 1. check that metrics have not been exported to prometheus before coredns starts
// 2. ensure that build-related metrics have been exported once coredns starts
// 3. trigger multiple reloads without changing the corefile
// 4. remove the metrics plugin and trigger a final reload
// 5. ensure the original prometheus exporter has not received more metrics
func TestReloadSeveralTimeMetrics(t *testing.T) {
//TODO: add a tool that find an available port because this needs to be a port
// that is not used in another test
promAddress := "127.0.0.1:53185"
proc := "coredns_build_info"
corefileWithMetrics := `
.:0 {
prometheus ` + promAddress + `
whoami
}`
corefileWithoutMetrics := `
.:0 {
whoami
}`
if err := collectMetricsInfo(promAddress, proc); err == nil {
t.Errorf("Prometheus is listening before the test started")
}
serverWithMetrics, err := CoreDNSServer(corefileWithMetrics)
if err != nil {
if strings.Contains(err.Error(), inUse) {
return
}
t.Errorf("Could not get service instance: %s", err)
}
// verify prometheus is running
if err := collectMetricsInfo(promAddress, proc); err != nil {
t.Errorf("Prometheus is not listening : %s", err)
}
reloadCount := 2
for i := 0; i < reloadCount; i++ {
serverReload, err := serverWithMetrics.Restart(
NewInput(corefileWithMetrics),
)
if err != nil {
t.Errorf("Could not restart CoreDNS : %s, at loop %v", err, i)
}
if err := collectMetricsInfo(promAddress, proc); err != nil {
t.Errorf("Prometheus is not listening : %s", err)
}
serverWithMetrics = serverReload
}
// reload without prometheus
serverWithoutMetrics, err := serverWithMetrics.Restart(
NewInput(corefileWithoutMetrics),
)
if err != nil {
t.Errorf("Could not restart a second time CoreDNS : %s", err)
}
serverWithoutMetrics.Stop()
// verify that metrics have not been pushed
if err := collectMetricsInfo(promAddress, proc); err == nil {
t.Errorf("Prometheus is still listening")
}
}
const inUse = "address already in use" const inUse = "address already in use"