Metrics listener fix (#2036)
* Create test to verify correct listener behavior * Create Unset function to remove todo items * Reset address for prometheus listener before restarting * Add inline documentation for Unset function * Make shutdownTimeout a constant and change to five seconds * Revert ForEach behavior in uniq package
This commit is contained in:
parent
b87ed01bb2
commit
8aa55c5ff2
3 changed files with 108 additions and 13 deletions
|
@ -2,10 +2,12 @@
|
||||||
package metrics
|
package metrics
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"context"
|
||||||
"net"
|
"net"
|
||||||
"net/http"
|
"net/http"
|
||||||
"os"
|
"os"
|
||||||
"sync"
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
"github.com/coredns/coredns/plugin"
|
"github.com/coredns/coredns/plugin"
|
||||||
"github.com/coredns/coredns/plugin/metrics/vars"
|
"github.com/coredns/coredns/plugin/metrics/vars"
|
||||||
|
@ -22,6 +24,7 @@ type Metrics struct {
|
||||||
ln net.Listener
|
ln net.Listener
|
||||||
lnSetup bool
|
lnSetup bool
|
||||||
mux *http.ServeMux
|
mux *http.ServeMux
|
||||||
|
srv *http.Server
|
||||||
|
|
||||||
zoneNames []string
|
zoneNames []string
|
||||||
zoneMap map[string]bool
|
zoneMap map[string]bool
|
||||||
|
@ -94,9 +97,9 @@ func (m *Metrics) OnStartup() error {
|
||||||
|
|
||||||
m.mux = http.NewServeMux()
|
m.mux = http.NewServeMux()
|
||||||
m.mux.Handle("/metrics", promhttp.HandlerFor(m.Reg, promhttp.HandlerOpts{}))
|
m.mux.Handle("/metrics", promhttp.HandlerFor(m.Reg, promhttp.HandlerOpts{}))
|
||||||
|
m.srv = &http.Server{Handler: m.mux}
|
||||||
go func() {
|
go func() {
|
||||||
http.Serve(m.ln, m.mux)
|
m.srv.Serve(m.ln)
|
||||||
}()
|
}()
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
@ -106,24 +109,28 @@ func (m *Metrics) OnRestart() error {
|
||||||
if !m.lnSetup {
|
if !m.lnSetup {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
uniqAddr.Unset(m.Addr)
|
||||||
|
return m.stopServer()
|
||||||
|
}
|
||||||
|
|
||||||
uniqAddr.SetTodo(m.Addr)
|
func (m *Metrics) stopServer() error {
|
||||||
|
if !m.lnSetup {
|
||||||
m.ln.Close()
|
return nil
|
||||||
|
}
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), shutdownTimeout)
|
||||||
|
defer cancel()
|
||||||
|
if err := m.srv.Shutdown(ctx); err != nil {
|
||||||
|
log.Infof("Failed to stop prometheus http server: %s", err)
|
||||||
|
return err
|
||||||
|
}
|
||||||
m.lnSetup = false
|
m.lnSetup = false
|
||||||
|
m.ln.Close()
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// OnFinalShutdown tears down the metrics listener on shutdown and restart.
|
// OnFinalShutdown tears down the metrics listener on shutdown and restart.
|
||||||
func (m *Metrics) OnFinalShutdown() error {
|
func (m *Metrics) OnFinalShutdown() error {
|
||||||
// We allow prometheus statements in multiple Server Blocks, but only the first
|
return m.stopServer()
|
||||||
// will open the listener, for the rest they are all nil; guard against that.
|
|
||||||
if !m.lnSetup {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
m.lnSetup = false
|
|
||||||
return m.ln.Close()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func keys(m map[string]bool) []string {
|
func keys(m map[string]bool) []string {
|
||||||
|
@ -138,6 +145,10 @@ func keys(m map[string]bool) []string {
|
||||||
// we listen on "localhost:0" and need to retrieve the actual address.
|
// we listen on "localhost:0" and need to retrieve the actual address.
|
||||||
var ListenAddr string
|
var ListenAddr string
|
||||||
|
|
||||||
|
// shutdownTimeout is the maximum amount of time the metrics plugin will wait
|
||||||
|
// before erroring when it tries to close the metrics server
|
||||||
|
const shutdownTimeout time.Duration = time.Second * 5
|
||||||
|
|
||||||
var buildInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
var buildInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||||
Namespace: plugin.Namespace,
|
Namespace: plugin.Namespace,
|
||||||
Name: "build_info",
|
Name: "build_info",
|
||||||
|
|
|
@ -24,6 +24,13 @@ func (u U) Set(key string, f func() error) {
|
||||||
u.u[key] = item{todo, f}
|
u.u[key] = item{todo, f}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Unset removes the 'todo' associated with a key
|
||||||
|
func (u U) Unset(key string) {
|
||||||
|
if _, ok := u.u[key]; ok {
|
||||||
|
delete(u.u, key)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// SetTodo sets key to 'todo' again.
|
// SetTodo sets key to 'todo' again.
|
||||||
func (u U) SetTodo(key string) {
|
func (u U) SetTodo(key string) {
|
||||||
v, ok := u.u[key]
|
v, ok := u.u[key]
|
||||||
|
|
|
@ -2,6 +2,7 @@ package test
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
|
"fmt"
|
||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
"net/http"
|
"net/http"
|
||||||
"strings"
|
"strings"
|
||||||
|
@ -125,4 +126,80 @@ func TestReloadMetricsHealth(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func collectMetricsInfo(addr, proc string) error {
|
||||||
|
cl := &http.Client{}
|
||||||
|
resp, err := cl.Get(fmt.Sprintf("http://%s/metrics", addr))
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
metrics, _ := ioutil.ReadAll(resp.Body)
|
||||||
|
if !bytes.Contains(metrics, []byte(proc)) {
|
||||||
|
return fmt.Errorf("failed to see %s in metric output", proc)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestReloadSeveralTimeMetrics ensures that metrics are not pushed to
|
||||||
|
// prometheus once the metrics plugin is removed and a coredns
|
||||||
|
// reload is triggered
|
||||||
|
// 1. check that metrics have not been exported to prometheus before coredns starts
|
||||||
|
// 2. ensure that build-related metrics have been exported once coredns starts
|
||||||
|
// 3. trigger multiple reloads without changing the corefile
|
||||||
|
// 4. remove the metrics plugin and trigger a final reload
|
||||||
|
// 5. ensure the original prometheus exporter has not received more metrics
|
||||||
|
func TestReloadSeveralTimeMetrics(t *testing.T) {
|
||||||
|
//TODO: add a tool that find an available port because this needs to be a port
|
||||||
|
// that is not used in another test
|
||||||
|
promAddress := "127.0.0.1:53185"
|
||||||
|
proc := "coredns_build_info"
|
||||||
|
corefileWithMetrics := `
|
||||||
|
.:0 {
|
||||||
|
prometheus ` + promAddress + `
|
||||||
|
whoami
|
||||||
|
}`
|
||||||
|
corefileWithoutMetrics := `
|
||||||
|
.:0 {
|
||||||
|
whoami
|
||||||
|
}`
|
||||||
|
if err := collectMetricsInfo(promAddress, proc); err == nil {
|
||||||
|
t.Errorf("Prometheus is listening before the test started")
|
||||||
|
}
|
||||||
|
serverWithMetrics, err := CoreDNSServer(corefileWithMetrics)
|
||||||
|
if err != nil {
|
||||||
|
if strings.Contains(err.Error(), inUse) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
t.Errorf("Could not get service instance: %s", err)
|
||||||
|
}
|
||||||
|
// verify prometheus is running
|
||||||
|
if err := collectMetricsInfo(promAddress, proc); err != nil {
|
||||||
|
t.Errorf("Prometheus is not listening : %s", err)
|
||||||
|
}
|
||||||
|
reloadCount := 2
|
||||||
|
for i := 0; i < reloadCount; i++ {
|
||||||
|
serverReload, err := serverWithMetrics.Restart(
|
||||||
|
NewInput(corefileWithMetrics),
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
t.Errorf("Could not restart CoreDNS : %s, at loop %v", err, i)
|
||||||
|
}
|
||||||
|
if err := collectMetricsInfo(promAddress, proc); err != nil {
|
||||||
|
t.Errorf("Prometheus is not listening : %s", err)
|
||||||
|
}
|
||||||
|
serverWithMetrics = serverReload
|
||||||
|
}
|
||||||
|
// reload without prometheus
|
||||||
|
serverWithoutMetrics, err := serverWithMetrics.Restart(
|
||||||
|
NewInput(corefileWithoutMetrics),
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
t.Errorf("Could not restart a second time CoreDNS : %s", err)
|
||||||
|
}
|
||||||
|
serverWithoutMetrics.Stop()
|
||||||
|
// verify that metrics have not been pushed
|
||||||
|
if err := collectMetricsInfo(promAddress, proc); err == nil {
|
||||||
|
t.Errorf("Prometheus is still listening")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const inUse = "address already in use"
|
const inUse = "address already in use"
|
||||||
|
|
Loading…
Add table
Reference in a new issue