Add a NativeHistogramBucketFactor parameter to the use of `NewHistogramVec` in order to enable use of Prometheus Native Histograms. This will store automatically computed sparse buckets in CoreDNS. If a compatible Prometheus requests native histograms, this data will be returned instead of the static buckets. The default factor of 1.05 should provide high quality resolution data. Signed-off-by: SuperQ <superq@gmail.com>
84 lines
2.5 KiB
Go
84 lines
2.5 KiB
Go
package health
|
|
|
|
import (
|
|
"context"
|
|
"net"
|
|
"net/http"
|
|
"time"
|
|
|
|
"github.com/coredns/coredns/plugin"
|
|
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
"github.com/prometheus/client_golang/prometheus/promauto"
|
|
)
|
|
|
|
// overloaded queries the health end point and updates a metrics showing how long it took.
|
|
func (h *health) overloaded(ctx context.Context) {
|
|
bypassProxy := &http.Transport{
|
|
Proxy: nil,
|
|
DialContext: (&net.Dialer{
|
|
Timeout: 30 * time.Second,
|
|
KeepAlive: 30 * time.Second,
|
|
}).DialContext,
|
|
ForceAttemptHTTP2: true,
|
|
MaxIdleConns: 100,
|
|
IdleConnTimeout: 90 * time.Second,
|
|
TLSHandshakeTimeout: 10 * time.Second,
|
|
ExpectContinueTimeout: 1 * time.Second,
|
|
}
|
|
timeout := 3 * time.Second
|
|
client := http.Client{
|
|
Timeout: timeout,
|
|
Transport: bypassProxy,
|
|
}
|
|
|
|
req, _ := http.NewRequestWithContext(ctx, http.MethodGet, h.healthURI.String(), nil)
|
|
tick := time.NewTicker(1 * time.Second)
|
|
defer tick.Stop()
|
|
|
|
for {
|
|
select {
|
|
case <-tick.C:
|
|
start := time.Now()
|
|
resp, err := client.Do(req)
|
|
if err != nil && ctx.Err() == context.Canceled {
|
|
// request was cancelled by parent goroutine
|
|
return
|
|
}
|
|
if err != nil {
|
|
HealthDuration.Observe(time.Since(start).Seconds())
|
|
HealthFailures.Inc()
|
|
log.Warningf("Local health request to %q failed: %s", req.URL.String(), err)
|
|
continue
|
|
}
|
|
resp.Body.Close()
|
|
elapsed := time.Since(start)
|
|
HealthDuration.Observe(elapsed.Seconds())
|
|
if elapsed > time.Second { // 1s is pretty random, but a *local* scrape taking that long isn't good
|
|
log.Warningf("Local health request to %q took more than 1s: %s", req.URL.String(), elapsed)
|
|
}
|
|
|
|
case <-ctx.Done():
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
var (
|
|
// HealthDuration is the metric used for exporting how fast we can retrieve the /health endpoint.
|
|
HealthDuration = promauto.NewHistogram(prometheus.HistogramOpts{
|
|
Namespace: plugin.Namespace,
|
|
Subsystem: "health",
|
|
Name: "request_duration_seconds",
|
|
Buckets: plugin.SlimTimeBuckets,
|
|
NativeHistogramBucketFactor: plugin.NativeHistogramBucketFactor,
|
|
Help: "Histogram of the time (in seconds) each request took.",
|
|
})
|
|
// HealthFailures is the metric used to count how many times the health request failed
|
|
HealthFailures = promauto.NewCounter(prometheus.CounterOpts{
|
|
Namespace: plugin.Namespace,
|
|
Subsystem: "health",
|
|
Name: "request_failures_total",
|
|
Help: "The number of times the health check failed.",
|
|
})
|
|
)
|