plugin/cache: Add refresh mode setting to serve_stale (#5131)

This PR adds an optional REFRESH_MODE parameter on the serve_stale configuration directive of the
cache plugin, which verifies that the upstream is still unavailable before returning stale entries.

Signed-off-by: Antoine Tollenaere <atollena@gmail.com>
This commit is contained in:
Antoine Tollenaere 2022-05-02 19:16:33 +02:00 committed by GitHub
parent c3572fdb30
commit 66f2ac7568
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 170 additions and 25 deletions

View file

@ -37,7 +37,7 @@ cache [TTL] [ZONES...] {
success CAPACITY [TTL] [MINTTL]
denial CAPACITY [TTL] [MINTTL]
prefetch AMOUNT [[DURATION] [PERCENTAGE%]]
serve_stale [DURATION]
serve_stale [DURATION] [REFRESH_MODE]
}
~~~
@ -57,7 +57,12 @@ cache [TTL] [ZONES...] {
* `serve_stale`, when serve\_stale is set, cache always will serve an expired entry to a client if there is one
available. When this happens, cache will attempt to refresh the cache entry after sending the expired cache
entry to the client. The responses have a TTL of 0. **DURATION** is how far back to consider
stale responses as fresh. The default duration is 1h.
stale responses as fresh. The default duration is 1h. **REFRESH_MODE** controls when the attempt to refresh
the cache happens. `verified` will first verify that an entry is still unavailable from the source before sending
the stale response to the client. `immediate` will immediately send the expired response to the client before
checking to see if the entry is available from the source. **REFRESH_MODE** defaults to `immediate`. Setting this
value to `verified` can lead to increased latency when serving stale responses, but will prevent stale entries
from ever being served if an updated response can be retrieved from the source.
## Capacity and Eviction

31
plugin/cache/cache.go vendored
View file

@ -38,7 +38,9 @@ type Cache struct {
duration time.Duration
percentage int
staleUpTo time.Duration
// Stale serve
staleUpTo time.Duration
verifyStale bool
// Testing.
now func() time.Time
@ -227,6 +229,33 @@ func (w *ResponseWriter) Write(buf []byte) (int, error) {
return n, err
}
// verifyStaleResponseWriter is a response writer that only writes messages if they should replace a
// stale cache entry, and otherwise discards them.
type verifyStaleResponseWriter struct {
*ResponseWriter
refreshed bool // set to true if the last WriteMsg wrote to ResponseWriter, false otherwise.
}
// newVerifyStaleResponseWriter returns a ResponseWriter to be used when verifying stale cache
// entries. It only forward writes if an entry was successfully refreshed according to RFC8767,
// section 4 (response is NoError or NXDomain), and ignores any other response.
func newVerifyStaleResponseWriter(w *ResponseWriter) *verifyStaleResponseWriter {
return &verifyStaleResponseWriter{
w,
false,
}
}
// WriteMsg implements the dns.ResponseWriter interface.
func (w *verifyStaleResponseWriter) WriteMsg(res *dns.Msg) error {
w.refreshed = false
if res.Rcode == dns.RcodeSuccess || res.Rcode == dns.RcodeNameError {
w.refreshed = true
return w.ResponseWriter.WriteMsg(res) // stores to the cache and send to client
}
return nil // else discard
}
const (
maxTTL = dnsutil.MaximumDefaulTTL
minTTL = dnsutil.MinimalDefaultTTL

View file

@ -266,7 +266,7 @@ func TestServeFromStaleCache(t *testing.T) {
req.SetQuestion("cached.org.", dns.TypeA)
ctx := context.TODO()
// Cache example.org.
// Cache cached.org. with 60s TTL
rec := dnstest.NewRecorder(&test.ResponseWriter{})
c.staleUpTo = 1 * time.Hour
c.ServeDNS(ctx, rec, req)
@ -304,6 +304,80 @@ func TestServeFromStaleCache(t *testing.T) {
}
}
func TestServeFromStaleCacheFetchVerify(t *testing.T) {
c := New()
c.Next = ttlBackend(120)
req := new(dns.Msg)
req.SetQuestion("cached.org.", dns.TypeA)
ctx := context.TODO()
// Cache cached.org. with 120s TTL
rec := dnstest.NewRecorder(&test.ResponseWriter{})
c.staleUpTo = 1 * time.Hour
c.verifyStale = true
c.ServeDNS(ctx, rec, req)
if c.pcache.Len() != 1 {
t.Fatalf("Msg with > 0 TTL should have been cached")
}
tests := []struct {
name string
upstreamRCode int
upstreamTtl int
futureMinutes int
expectedRCode int
expectedTtl int
}{
// After 1 minutes of initial TTL, we should see a cached response
{"cached.org.", dns.RcodeSuccess, 200, 1, dns.RcodeSuccess, 60}, // ttl = 120 - 60 -- not refreshed
// After the 2 more minutes, we should see upstream responses because upstream is available
{"cached.org.", dns.RcodeSuccess, 200, 3, dns.RcodeSuccess, 200},
// After the TTL expired, if the server fails we should get the cached entry
{"cached.org.", dns.RcodeServerFailure, 200, 7, dns.RcodeSuccess, 0},
// After 1 more minutes, if the server serves nxdomain we should see them (despite being within the serve stale period)
{"cached.org.", dns.RcodeNameError, 150, 8, dns.RcodeNameError, 150},
}
for i, tt := range tests {
rec := dnstest.NewRecorder(&test.ResponseWriter{})
c.now = func() time.Time { return time.Now().Add(time.Duration(tt.futureMinutes) * time.Minute) }
if tt.upstreamRCode == dns.RcodeSuccess {
c.Next = ttlBackend(tt.upstreamTtl)
} else if tt.upstreamRCode == dns.RcodeServerFailure {
// Make upstream fail, should now rely on cache during the c.staleUpTo period
c.Next = servFailBackend(tt.upstreamTtl)
} else if tt.upstreamRCode == dns.RcodeNameError {
c.Next = nxDomainBackend(tt.upstreamTtl)
} else {
t.Fatal("upstream code not implemented")
}
r := req.Copy()
r.SetQuestion(tt.name, dns.TypeA)
ret, _ := c.ServeDNS(ctx, rec, r)
if ret != tt.expectedRCode {
t.Errorf("Test %d: expected rcode=%v, got rcode=%v", i, tt.expectedRCode, ret)
continue
}
if ret == dns.RcodeSuccess {
recTtl := rec.Msg.Answer[0].Header().Ttl
if tt.expectedTtl != int(recTtl) {
t.Errorf("Test %d: expected TTL=%d, got TTL=%d", i, tt.expectedTtl, recTtl)
}
} else if ret == dns.RcodeNameError {
soaTtl := rec.Msg.Ns[0].Header().Ttl
if tt.expectedTtl != int(soaTtl) {
t.Errorf("Test %d: expected TTL=%d, got TTL=%d", i, tt.expectedTtl, soaTtl)
}
}
}
}
func TestNegativeStaleMaskingPositiveCache(t *testing.T) {
c := New()
c.staleUpTo = time.Minute * 10
@ -454,6 +528,20 @@ func ttlBackend(ttl int) plugin.Handler {
})
}
func servFailBackend(ttl int) plugin.Handler {
return plugin.HandlerFunc(func(ctx context.Context, w dns.ResponseWriter, r *dns.Msg) (int, error) {
m := new(dns.Msg)
m.SetReply(r)
m.Response, m.RecursionAvailable = true, true
m.Ns = []dns.RR{test.SOA(fmt.Sprintf("example.org. %d IN SOA sns.dns.icann.org. noc.dns.icann.org. 2016082540 7200 3600 1209600 3600", ttl))}
m.MsgHdr.Rcode = dns.RcodeServerFailure
w.WriteMsg(m)
return dns.RcodeServerFailure, nil
})
}
func TestComputeTTL(t *testing.T) {
tests := []struct {
msgTTL time.Duration

View file

@ -35,19 +35,29 @@ func (c *Cache) ServeDNS(ctx context.Context, w dns.ResponseWriter, r *dns.Msg)
ttl := 0
i := c.getIgnoreTTL(now, state, server)
if i != nil {
ttl = i.ttl(now)
}
if i == nil {
crr := &ResponseWriter{ResponseWriter: w, Cache: c, state: state, server: server, do: do}
return c.doRefresh(ctx, state, crr)
}
ttl = i.ttl(now)
if ttl < 0 {
servedStale.WithLabelValues(server, c.zonesMetricLabel).Inc()
// serve stale behavior
if c.verifyStale {
crr := &ResponseWriter{ResponseWriter: w, Cache: c, state: state, server: server, do: do}
cw := newVerifyStaleResponseWriter(crr)
ret, err := c.doRefresh(ctx, state, cw)
if cw.refreshed {
return ret, err
}
}
// Adjust the time to get a 0 TTL in the reply built from a stale item.
now = now.Add(time.Duration(ttl) * time.Second)
cw := newPrefetchResponseWriter(server, state, c)
go c.doPrefetch(ctx, state, cw, i, now)
if !c.verifyStale {
cw := newPrefetchResponseWriter(server, state, c)
go c.doPrefetch(ctx, state, cw, i, now)
}
servedStale.WithLabelValues(server, c.zonesMetricLabel).Inc()
} else if c.shouldPrefetch(i, now) {
cw := newPrefetchResponseWriter(server, state, c)
go c.doPrefetch(ctx, state, cw, i, now)
@ -70,7 +80,7 @@ func (c *Cache) doPrefetch(ctx context.Context, state request.Request, cw *Respo
}
}
func (c *Cache) doRefresh(ctx context.Context, state request.Request, cw *ResponseWriter) (int, error) {
func (c *Cache) doRefresh(ctx context.Context, state request.Request, cw dns.ResponseWriter) (int, error) {
if !state.Do() {
setDo(state.Req)
}

12
plugin/cache/setup.go vendored
View file

@ -166,11 +166,11 @@ func cacheParse(c *caddy.Controller) (*Cache, error) {
case "serve_stale":
args := c.RemainingArgs()
if len(args) > 1 {
if len(args) > 2 {
return nil, c.ArgErr()
}
ca.staleUpTo = 1 * time.Hour
if len(args) == 1 {
if len(args) > 0 {
d, err := time.ParseDuration(args[0])
if err != nil {
return nil, err
@ -180,6 +180,14 @@ func cacheParse(c *caddy.Controller) (*Cache, error) {
}
ca.staleUpTo = d
}
ca.verifyStale = false
if len(args) > 1 {
mode := strings.ToLower(args[1])
if mode != "immediate" && mode != "verify" {
return nil, fmt.Errorf("invalid value for serve_stale refresh mode: %s", mode)
}
ca.verifyStale = mode == "verify"
}
default:
return nil, c.ArgErr()
}

View file

@ -117,20 +117,25 @@ func TestSetup(t *testing.T) {
func TestServeStale(t *testing.T) {
tests := []struct {
input string
shouldErr bool
staleUpTo time.Duration
input string
shouldErr bool
staleUpTo time.Duration
verifyStale bool
}{
{"serve_stale", false, 1 * time.Hour},
{"serve_stale 20m", false, 20 * time.Minute},
{"serve_stale 1h20m", false, 80 * time.Minute},
{"serve_stale 0m", false, 0},
{"serve_stale 0", false, 0},
{"serve_stale", false, 1 * time.Hour, false},
{"serve_stale 20m", false, 20 * time.Minute, false},
{"serve_stale 1h20m", false, 80 * time.Minute, false},
{"serve_stale 0m", false, 0, false},
{"serve_stale 0", false, 0, false},
{"serve_stale 0 verify", false, 0, true},
{"serve_stale 0 immediate", false, 0, false},
{"serve_stale 0 VERIFY", false, 0, true},
// fails
{"serve_stale 20", true, 0},
{"serve_stale -20m", true, 0},
{"serve_stale aa", true, 0},
{"serve_stale 1m nono", true, 0},
{"serve_stale 20", true, 0, false},
{"serve_stale -20m", true, 0, false},
{"serve_stale aa", true, 0, false},
{"serve_stale 1m nono", true, 0, false},
{"serve_stale 0 after nono", true, 0, false},
}
for i, test := range tests {
c := caddy.NewTestController("dns", fmt.Sprintf("cache {\n%s\n}", test.input))