plugin/kubernetes: Fix dns programming duration metric (#4255)
* get data reqd to record latency before calling toFuncs * refactor out unnecessary toFunc wrappers * remove latency metric unit tests per PR feedback Signed-off-by: Chris O'Haver <cohaver@infoblox.com>
This commit is contained in:
parent
56eea6e609
commit
9121e78496
9 changed files with 166 additions and 196 deletions
|
@ -1,73 +0,0 @@
|
|||
package kubernetes
|
||||
|
||||
import (
|
||||
"time"
|
||||
|
||||
"github.com/coredns/coredns/plugin"
|
||||
"github.com/coredns/coredns/plugin/kubernetes/object"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/client_golang/prometheus/promauto"
|
||||
api "k8s.io/api/core/v1"
|
||||
meta "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
)
|
||||
|
||||
var (
|
||||
// DNSProgrammingLatency is defined as the time it took to program a DNS instance - from the time
|
||||
// a service or pod has changed to the time the change was propagated and was available to be
|
||||
// served by a DNS server.
|
||||
// The definition of this SLI can be found at https://github.com/kubernetes/community/blob/master/sig-scalability/slos/dns_programming_latency.md
|
||||
// Note that the metrics is partially based on the time exported by the endpoints controller on
|
||||
// the master machine. The measurement may be inaccurate if there is a clock drift between the
|
||||
// node and master machine.
|
||||
// The service_kind label can be one of:
|
||||
// * cluster_ip
|
||||
// * headless_with_selector
|
||||
// * headless_without_selector
|
||||
DNSProgrammingLatency = promauto.NewHistogramVec(prometheus.HistogramOpts{
|
||||
Namespace: plugin.Namespace,
|
||||
Subsystem: pluginName,
|
||||
Name: "dns_programming_duration_seconds",
|
||||
// From 1 millisecond to ~17 minutes.
|
||||
Buckets: prometheus.ExponentialBuckets(0.001, 2, 20),
|
||||
Help: "Histogram of the time (in seconds) it took to program a dns instance.",
|
||||
}, []string{"service_kind"})
|
||||
|
||||
// durationSinceFunc returns the duration elapsed since the given time.
|
||||
// Added as a global variable to allow injection for testing.
|
||||
durationSinceFunc = time.Since
|
||||
)
|
||||
|
||||
func recordDNSProgrammingLatency(svcs []*object.Service, endpoints meta.Object) {
|
||||
// getLastChangeTriggerTime is the time.Time value of the EndpointsLastChangeTriggerTime
|
||||
// annotation stored in the given endpoints object or the "zero" time if the annotation wasn't set
|
||||
var lastChangeTriggerTime time.Time
|
||||
stringVal, ok := endpoints.GetAnnotations()[api.EndpointsLastChangeTriggerTime]
|
||||
if ok {
|
||||
ts, err := time.Parse(time.RFC3339Nano, stringVal)
|
||||
if err != nil {
|
||||
log.Warningf("DnsProgrammingLatency cannot be calculated for Endpoints '%s/%s'; invalid %q annotation RFC3339 value of %q",
|
||||
endpoints.GetNamespace(), endpoints.GetName(), api.EndpointsLastChangeTriggerTime, stringVal)
|
||||
// In case of error val = time.Zero, which is ignored in the upstream code.
|
||||
}
|
||||
lastChangeTriggerTime = ts
|
||||
}
|
||||
|
||||
// isHeadless indicates whether the endpoints object belongs to a headless
|
||||
// service (i.e. clusterIp = None). Note that this can be a false negatives if the service
|
||||
// informer is lagging, i.e. we may not see a recently created service. Given that the services
|
||||
// don't change very often (comparing to much more frequent endpoints changes), cases when this method
|
||||
// will return wrong answer should be relatively rare. Because of that we intentionally accept this
|
||||
// flaw to keep the solution simple.
|
||||
isHeadless := len(svcs) == 1 && svcs[0].ClusterIP == api.ClusterIPNone
|
||||
|
||||
if endpoints == nil || !isHeadless || lastChangeTriggerTime.IsZero() {
|
||||
return
|
||||
}
|
||||
|
||||
// If we're here it means that the Endpoints object is for a headless service and that
|
||||
// the Endpoints object was created by the endpoints-controller (because the
|
||||
// LastChangeTriggerTime annotation is set). It means that the corresponding service is a
|
||||
// "headless service with selector".
|
||||
DNSProgrammingLatency.WithLabelValues("headless_with_selector").
|
||||
Observe(durationSinceFunc(lastChangeTriggerTime).Seconds())
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue