coredns/plugin/health/overloaded.go
Ondřej Benkovský a929b0b1ec
plugin/health : rework overloaded goroutine to support graceful shutdown (#5244)
Signed-off-by: Ondřej Benkovský <ondrej.benkovsky@jamf.com>
2022-04-13 13:09:03 -04:00

70 lines
2 KiB
Go

package health
import (
"context"
"net/http"
"time"
"github.com/coredns/coredns/plugin"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
)
// overloaded queries the health end point and updates a metrics showing how long it took.
func (h *health) overloaded(ctx context.Context) {
timeout := 3 * time.Second
client := http.Client{
Timeout: timeout,
}
url := "http://" + h.Addr + "/health"
req, _ := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
tick := time.NewTicker(1 * time.Second)
defer tick.Stop()
for {
select {
case <-tick.C:
start := time.Now()
resp, err := client.Do(req)
if err != nil && ctx.Err() == context.Canceled {
// request was cancelled by parent goroutine
return
}
if err != nil {
HealthDuration.Observe(time.Since(start).Seconds())
HealthFailures.Inc()
log.Warningf("Local health request to %q failed: %s", url, err)
continue
}
resp.Body.Close()
elapsed := time.Since(start)
HealthDuration.Observe(elapsed.Seconds())
if elapsed > time.Second { // 1s is pretty random, but a *local* scrape taking that long isn't good
log.Warningf("Local health request to %q took more than 1s: %s", url, elapsed)
}
case <-ctx.Done():
return
}
}
}
var (
// HealthDuration is the metric used for exporting how fast we can retrieve the /health endpoint.
HealthDuration = promauto.NewHistogram(prometheus.HistogramOpts{
Namespace: plugin.Namespace,
Subsystem: "health",
Name: "request_duration_seconds",
Buckets: plugin.SlimTimeBuckets,
Help: "Histogram of the time (in seconds) each request took.",
})
// HealthFailures is the metric used to count how many times the health request failed
HealthFailures = promauto.NewCounter(prometheus.CounterOpts{
Namespace: plugin.Namespace,
Subsystem: "health",
Name: "request_failures_total",
Help: "The number of times the health check failed.",
})
)