* healthchecks: check on every 3rd failure Check on every third failure and some cleanups to make this possible. A failed healthcheck will never increase Fails, a successfull healthceck will reset Fails to 0. This is a chance this counter now drops below 0, making the upstream super? healthy. This removes the okUntil smartness and condences everything back to 1 metrics: Fails; so it's simpler in that regard. Timout errors are *not* attributed to the local upstream, and don't get counted into the Fails anymore. Meaning the 'dig any isc.org' won't kill your upstream. Added extra test the see if the Fails counter gets reset after 3 failed connection. There is still a disconnect beween HTTP healthceck working the proxy (or lookup) not being able to connect to the upstream. * Fix tests
216 lines
6 KiB
Go
216 lines
6 KiB
Go
// Package proxy is plugin that proxies requests.
|
|
package proxy
|
|
|
|
import (
|
|
"errors"
|
|
"fmt"
|
|
"net"
|
|
"sync/atomic"
|
|
"time"
|
|
|
|
"github.com/coredns/coredns/plugin"
|
|
"github.com/coredns/coredns/plugin/dnstap"
|
|
"github.com/coredns/coredns/plugin/dnstap/msg"
|
|
"github.com/coredns/coredns/plugin/pkg/healthcheck"
|
|
"github.com/coredns/coredns/request"
|
|
|
|
tap "github.com/dnstap/golang-dnstap"
|
|
"github.com/miekg/dns"
|
|
ot "github.com/opentracing/opentracing-go"
|
|
"golang.org/x/net/context"
|
|
)
|
|
|
|
var (
|
|
errUnreachable = errors.New("unreachable backend")
|
|
errInvalidProtocol = errors.New("invalid protocol")
|
|
errInvalidDomain = errors.New("invalid path for proxy")
|
|
)
|
|
|
|
// Proxy represents a plugin instance that can proxy requests to another (DNS) server.
|
|
type Proxy struct {
|
|
Next plugin.Handler
|
|
|
|
// Upstreams is a pointer to a slice, so we can update the upstream (used for Google)
|
|
// midway.
|
|
|
|
Upstreams *[]Upstream
|
|
|
|
// Trace is the Trace plugin, if it is installed
|
|
// This is used by the grpc exchanger to trace through the grpc calls
|
|
Trace plugin.Handler
|
|
}
|
|
|
|
// Upstream manages a pool of proxy upstream hosts. Select should return a
|
|
// suitable upstream host, or nil if no such hosts are available.
|
|
type Upstream interface {
|
|
// The domain name this upstream host should be routed on.
|
|
From() string
|
|
// Selects an upstream host to be routed to.
|
|
Select() *healthcheck.UpstreamHost
|
|
// Checks if subpdomain is not an ignored.
|
|
IsAllowedDomain(string) bool
|
|
// Exchanger returns the exchanger to be used for this upstream.
|
|
Exchanger() Exchanger
|
|
// Stops the upstream from proxying requests to shutdown goroutines cleanly.
|
|
Stop() error
|
|
}
|
|
|
|
// tryDuration is how long to try upstream hosts; failures result in
|
|
// immediate retries until this duration ends or we get a nil host.
|
|
var tryDuration = 16 * time.Second
|
|
|
|
// ServeDNS satisfies the plugin.Handler interface.
|
|
func (p Proxy) ServeDNS(ctx context.Context, w dns.ResponseWriter, r *dns.Msg) (int, error) {
|
|
var span, child ot.Span
|
|
span = ot.SpanFromContext(ctx)
|
|
state := request.Request{W: w, Req: r}
|
|
|
|
upstream := p.match(state)
|
|
if upstream == nil {
|
|
return plugin.NextOrFailure(p.Name(), p.Next, ctx, w, r)
|
|
}
|
|
|
|
for {
|
|
start := time.Now()
|
|
reply := new(dns.Msg)
|
|
var backendErr error
|
|
|
|
// Since Select() should give us "up" hosts, keep retrying
|
|
// hosts until timeout (or until we get a nil host).
|
|
for time.Since(start) < tryDuration {
|
|
host := upstream.Select()
|
|
if host == nil {
|
|
return dns.RcodeServerFailure, fmt.Errorf("%s: %s", errUnreachable, "no upstream host")
|
|
}
|
|
|
|
if span != nil {
|
|
child = span.Tracer().StartSpan("exchange", ot.ChildOf(span.Context()))
|
|
ctx = ot.ContextWithSpan(ctx, child)
|
|
}
|
|
|
|
atomic.AddInt64(&host.Conns, 1)
|
|
queryEpoch := msg.Epoch()
|
|
|
|
RequestCount.WithLabelValues(state.Proto(), upstream.Exchanger().Protocol(), familyToString(state.Family()), host.Name).Add(1)
|
|
|
|
reply, backendErr = upstream.Exchanger().Exchange(ctx, host.Name, state)
|
|
|
|
respEpoch := msg.Epoch()
|
|
atomic.AddInt64(&host.Conns, -1)
|
|
|
|
if child != nil {
|
|
child.Finish()
|
|
}
|
|
|
|
taperr := toDnstap(ctx, host.Name, upstream.Exchanger(), state, reply, queryEpoch, respEpoch)
|
|
|
|
if backendErr == nil {
|
|
w.WriteMsg(reply)
|
|
|
|
RequestDuration.WithLabelValues(state.Proto(), upstream.Exchanger().Protocol(), familyToString(state.Family()), host.Name).Observe(float64(time.Since(start) / time.Millisecond))
|
|
|
|
return 0, taperr
|
|
}
|
|
|
|
// A "ANY isc.org" query is being dropped by ISC's nameserver, we see this as a i/o timeout, but
|
|
// would then mark our upstream is being broken. We should not do this if we consider the error temporary.
|
|
// Of course it could really be that our upstream is broken
|
|
if oe, ok := backendErr.(*net.OpError); ok {
|
|
// Note this keeps looping and trying until tryDuration is hit, at which point our client
|
|
// might be long gone...
|
|
if oe.Timeout() {
|
|
// Our upstream's upstream is problably messing up, continue with next selected
|
|
// host - which my be the *same* one as we don't set any uh.Fails.
|
|
continue
|
|
}
|
|
}
|
|
|
|
timeout := host.FailTimeout
|
|
if timeout == 0 {
|
|
timeout = defaultFailTimeout
|
|
}
|
|
|
|
atomic.AddInt32(&host.Fails, 1)
|
|
fails := atomic.LoadInt32(&host.Fails)
|
|
|
|
go func(host *healthcheck.UpstreamHost, timeout time.Duration) {
|
|
time.Sleep(timeout)
|
|
// we may go negative here, should be rectified by the HC.
|
|
atomic.AddInt32(&host.Fails, -1)
|
|
if fails%failureCheck == 0 { // Kick off healthcheck on eveyry third failure.
|
|
host.HealthCheckURL()
|
|
}
|
|
}(host, timeout)
|
|
}
|
|
|
|
return dns.RcodeServerFailure, fmt.Errorf("%s: %s", errUnreachable, backendErr)
|
|
}
|
|
}
|
|
|
|
func (p Proxy) match(state request.Request) (u Upstream) {
|
|
if p.Upstreams == nil {
|
|
return nil
|
|
}
|
|
|
|
longestMatch := 0
|
|
for _, upstream := range *p.Upstreams {
|
|
from := upstream.From()
|
|
|
|
if !plugin.Name(from).Matches(state.Name()) || !upstream.IsAllowedDomain(state.Name()) {
|
|
continue
|
|
}
|
|
|
|
if lf := len(from); lf > longestMatch {
|
|
longestMatch = lf
|
|
u = upstream
|
|
}
|
|
}
|
|
return u
|
|
|
|
}
|
|
|
|
// Name implements the Handler interface.
|
|
func (p Proxy) Name() string { return "proxy" }
|
|
|
|
func toDnstap(ctx context.Context, host string, ex Exchanger, state request.Request, reply *dns.Msg, queryEpoch, respEpoch uint64) (err error) {
|
|
if tapper := dnstap.TapperFromContext(ctx); tapper != nil {
|
|
// Query
|
|
b := tapper.TapBuilder()
|
|
b.TimeSec = queryEpoch
|
|
if err = b.HostPort(host); err != nil {
|
|
return
|
|
}
|
|
t := ex.Transport()
|
|
if t == "" {
|
|
t = state.Proto()
|
|
}
|
|
if t == "tcp" {
|
|
b.SocketProto = tap.SocketProtocol_TCP
|
|
} else {
|
|
b.SocketProto = tap.SocketProtocol_UDP
|
|
}
|
|
if err = b.Msg(state.Req); err != nil {
|
|
return
|
|
}
|
|
err = tapper.TapMessage(b.ToOutsideQuery(tap.Message_FORWARDER_QUERY))
|
|
if err != nil {
|
|
return
|
|
}
|
|
|
|
// Response
|
|
if reply != nil {
|
|
b.TimeSec = respEpoch
|
|
if err = b.Msg(reply); err != nil {
|
|
return
|
|
}
|
|
err = tapper.TapMessage(b.ToOutsideResponse(tap.Message_FORWARDER_RESPONSE))
|
|
}
|
|
}
|
|
return
|
|
}
|
|
|
|
const (
|
|
defaultFailTimeout = 2 * time.Second
|
|
defaultTimeout = 5 * time.Second
|
|
failureCheck = 3
|
|
)
|