diff --git a/plugin/health/README.md b/plugin/health/README.md index ce2446a21..c8fda61a4 100644 --- a/plugin/health/README.md +++ b/plugin/health/README.md @@ -50,11 +50,13 @@ Doing this is supported but both endpoints ":8080" and ":8081" will export the e If monitoring is enabled (via the *prometheus* plugin) then the following metric is exported: - * `coredns_health_request_duration_seconds{}` - duration to process a HTTP query to the local + * `coredns_health_request_duration_seconds{}` - duration to process a HTTP query to the local `/health` endpoint. As this a local operation it should be fast. A (large) increase in this duration indicates the CoreDNS process is having trouble keeping up with its query load. + * `coredns_health_request_failures_total{}` - The number of times the internal health check loop + failed to query `/health`. -Note that this metric *does not* have a `server` label, because being overloaded is a symptom of +Note that these metrics *do not* have a `server` label, because being overloaded is a symptom of the running process, *not* a specific server. 
## Examples diff --git a/plugin/health/overloaded.go b/plugin/health/overloaded.go index d996827c0..3a4c5f08b 100644 --- a/plugin/health/overloaded.go +++ b/plugin/health/overloaded.go @@ -26,7 +26,8 @@ func (h *health) overloaded() { start := time.Now() resp, err := client.Get(url) if err != nil { - HealthDuration.Observe(timeout.Seconds()) + HealthDuration.Observe(time.Since(start).Seconds()) + HealthFailures.Inc() log.Warningf("Local health request to %q failed: %s", url, err) continue } @@ -49,7 +50,14 @@ var ( Namespace: plugin.Namespace, Subsystem: "health", Name: "request_duration_seconds", - Buckets: plugin.TimeBuckets, + Buckets: plugin.SlimTimeBuckets, Help: "Histogram of the time (in seconds) each request took.", }) + // HealthFailures is the metric used to count how many times the health request failed. + HealthFailures = promauto.NewCounter(prometheus.CounterOpts{ + Namespace: plugin.Namespace, + Subsystem: "health", + Name: "request_failures_total", + Help: "The number of times the health check failed.", + }) ) diff --git a/plugin/plugin.go b/plugin/plugin.go index 9bac48885..51f5ba79c 100644 --- a/plugin/plugin.go +++ b/plugin/plugin.go @@ -105,5 +105,8 @@ const Namespace = "coredns" // TimeBuckets is based on Prometheus client_golang prometheus.DefBuckets var TimeBuckets = prometheus.ExponentialBuckets(0.00025, 2, 16) // from 0.25ms to 8 seconds +// SlimTimeBuckets is a low-cardinality set of duration buckets. +var SlimTimeBuckets = prometheus.ExponentialBuckets(0.00025, 10, 5) // from 0.25ms to 2.5 seconds + // ErrOnce is returned when a plugin doesn't support multiple setups per server. var ErrOnce = errors.New("this plugin can only be used once per Server Block")