From 66f2ac7568ccb0178cc9ce6dbd7320bcd3428d64 Mon Sep 17 00:00:00 2001 From: Antoine Tollenaere Date: Mon, 2 May 2022 19:16:33 +0200 Subject: [PATCH] plugin/cache: Add refresh mode setting to serve_stale (#5131) This PR adds an optional REFRESH_MODE parameter on the serve_stale configuration directive of the cache plugin, which verifies that the upstream is still unavailable before returning stale entries. Signed-off-by: Antoine Tollenaere --- plugin/cache/README.md | 9 +++- plugin/cache/cache.go | 31 ++++++++++++- plugin/cache/cache_test.go | 90 +++++++++++++++++++++++++++++++++++++- plugin/cache/handler.go | 24 +++++++--- plugin/cache/setup.go | 12 ++++- plugin/cache/setup_test.go | 29 +++++++----- 6 files changed, 170 insertions(+), 25 deletions(-) diff --git a/plugin/cache/README.md b/plugin/cache/README.md index 92c231be7..98363abbe 100644 --- a/plugin/cache/README.md +++ b/plugin/cache/README.md @@ -37,7 +37,7 @@ cache [TTL] [ZONES...] { success CAPACITY [TTL] [MINTTL] denial CAPACITY [TTL] [MINTTL] prefetch AMOUNT [[DURATION] [PERCENTAGE%]] - serve_stale [DURATION] + serve_stale [DURATION] [REFRESH_MODE] } ~~~ @@ -57,7 +57,12 @@ cache [TTL] [ZONES...] { * `serve_stale`, when serve\_stale is set, cache always will serve an expired entry to a client if there is one available. When this happens, cache will attempt to refresh the cache entry after sending the expired cache entry to the client. The responses have a TTL of 0. **DURATION** is how far back to consider - stale responses as fresh. The default duration is 1h. + stale responses as fresh. The default duration is 1h. **REFRESH_MODE** controls when the attempt to refresh + the cache happens. `verified` will first verify that an entry is still unavailable from the source before sending + the stale response to the client. `immediate` will immediately send the expired response to the client before + checking to see if the entry is available from the source. **REFRESH_MODE** defaults to `immediate`. Setting this + value to `verified` can lead to increased latency when serving stale responses, but will prevent stale entries + from ever being served if an updated response can be retrieved from the source. ## Capacity and Eviction diff --git a/plugin/cache/cache.go b/plugin/cache/cache.go index 59439653f..58a73e72c 100644 --- a/plugin/cache/cache.go +++ b/plugin/cache/cache.go @@ -38,7 +38,9 @@ type Cache struct { duration time.Duration percentage int - staleUpTo time.Duration + // Stale serve + staleUpTo time.Duration + verifyStale bool // Testing. now func() time.Time @@ -227,6 +229,33 @@ func (w *ResponseWriter) Write(buf []byte) (int, error) { return n, err } +// verifyStaleResponseWriter is a response writer that only writes messages if they should replace a +// stale cache entry, and otherwise discards them. +type verifyStaleResponseWriter struct { + *ResponseWriter + refreshed bool // set to true if the last WriteMsg wrote to ResponseWriter, false otherwise. +} + +// newVerifyStaleResponseWriter returns a ResponseWriter to be used when verifying stale cache +// entries. It only forward writes if an entry was successfully refreshed according to RFC8767, +// section 4 (response is NoError or NXDomain), and ignores any other response. +func newVerifyStaleResponseWriter(w *ResponseWriter) *verifyStaleResponseWriter { + return &verifyStaleResponseWriter{ + w, + false, + } +} + +// WriteMsg implements the dns.ResponseWriter interface. +func (w *verifyStaleResponseWriter) WriteMsg(res *dns.Msg) error { + w.refreshed = false + if res.Rcode == dns.RcodeSuccess || res.Rcode == dns.RcodeNameError { + w.refreshed = true + return w.ResponseWriter.WriteMsg(res) // stores to the cache and send to client + } + return nil // else discard +} + const ( maxTTL = dnsutil.MaximumDefaulTTL minTTL = dnsutil.MinimalDefaultTTL diff --git a/plugin/cache/cache_test.go b/plugin/cache/cache_test.go index d839ea1a3..7f8c28e3f 100644 --- a/plugin/cache/cache_test.go +++ b/plugin/cache/cache_test.go @@ -266,7 +266,7 @@ func TestServeFromStaleCache(t *testing.T) { req.SetQuestion("cached.org.", dns.TypeA) ctx := context.TODO() - // Cache example.org. + // Cache cached.org. with 60s TTL rec := dnstest.NewRecorder(&test.ResponseWriter{}) c.staleUpTo = 1 * time.Hour c.ServeDNS(ctx, rec, req) @@ -304,6 +304,80 @@ func TestServeFromStaleCache(t *testing.T) { } } +func TestServeFromStaleCacheFetchVerify(t *testing.T) { + c := New() + c.Next = ttlBackend(120) + + req := new(dns.Msg) + req.SetQuestion("cached.org.", dns.TypeA) + ctx := context.TODO() + + // Cache cached.org. with 120s TTL + rec := dnstest.NewRecorder(&test.ResponseWriter{}) + c.staleUpTo = 1 * time.Hour + c.verifyStale = true + c.ServeDNS(ctx, rec, req) + if c.pcache.Len() != 1 { + t.Fatalf("Msg with > 0 TTL should have been cached") + } + + tests := []struct { + name string + upstreamRCode int + upstreamTtl int + futureMinutes int + expectedRCode int + expectedTtl int + }{ + // After 1 minutes of initial TTL, we should see a cached response + {"cached.org.", dns.RcodeSuccess, 200, 1, dns.RcodeSuccess, 60}, // ttl = 120 - 60 -- not refreshed + + // After the 2 more minutes, we should see upstream responses because upstream is available + {"cached.org.", dns.RcodeSuccess, 200, 3, dns.RcodeSuccess, 200}, + + // After the TTL expired, if the server fails we should get the cached entry + {"cached.org.", dns.RcodeServerFailure, 200, 7, dns.RcodeSuccess, 0}, + + // After 1 more minutes, if the server serves nxdomain we should see them (despite being within the serve stale period) + {"cached.org.", dns.RcodeNameError, 150, 8, dns.RcodeNameError, 150}, + } + + for i, tt := range tests { + rec := dnstest.NewRecorder(&test.ResponseWriter{}) + c.now = func() time.Time { return time.Now().Add(time.Duration(tt.futureMinutes) * time.Minute) } + + if tt.upstreamRCode == dns.RcodeSuccess { + c.Next = ttlBackend(tt.upstreamTtl) + } else if tt.upstreamRCode == dns.RcodeServerFailure { + // Make upstream fail, should now rely on cache during the c.staleUpTo period + c.Next = servFailBackend(tt.upstreamTtl) + } else if tt.upstreamRCode == dns.RcodeNameError { + c.Next = nxDomainBackend(tt.upstreamTtl) + } else { + t.Fatal("upstream code not implemented") + } + + r := req.Copy() + r.SetQuestion(tt.name, dns.TypeA) + ret, _ := c.ServeDNS(ctx, rec, r) + if ret != tt.expectedRCode { + t.Errorf("Test %d: expected rcode=%v, got rcode=%v", i, tt.expectedRCode, ret) + continue + } + if ret == dns.RcodeSuccess { + recTtl := rec.Msg.Answer[0].Header().Ttl + if tt.expectedTtl != int(recTtl) { + t.Errorf("Test %d: expected TTL=%d, got TTL=%d", i, tt.expectedTtl, recTtl) + } + } else if ret == dns.RcodeNameError { + soaTtl := rec.Msg.Ns[0].Header().Ttl + if tt.expectedTtl != int(soaTtl) { + t.Errorf("Test %d: expected TTL=%d, got TTL=%d", i, tt.expectedTtl, soaTtl) + } + } + } +} + func TestNegativeStaleMaskingPositiveCache(t *testing.T) { c := New() c.staleUpTo = time.Minute * 10 @@ -454,6 +528,20 @@ func ttlBackend(ttl int) plugin.Handler { }) } +func servFailBackend(ttl int) plugin.Handler { + return plugin.HandlerFunc(func(ctx context.Context, w dns.ResponseWriter, r *dns.Msg) (int, error) { + m := new(dns.Msg) + m.SetReply(r) + m.Response, m.RecursionAvailable = true, true + + m.Ns = []dns.RR{test.SOA(fmt.Sprintf("example.org. %d IN SOA sns.dns.icann.org. noc.dns.icann.org. 2016082540 7200 3600 1209600 3600", ttl))} + + m.MsgHdr.Rcode = dns.RcodeServerFailure + w.WriteMsg(m) + return dns.RcodeServerFailure, nil + }) +} + func TestComputeTTL(t *testing.T) { tests := []struct { msgTTL time.Duration diff --git a/plugin/cache/handler.go b/plugin/cache/handler.go index 2b4c89350..d5112fc69 100644 --- a/plugin/cache/handler.go +++ b/plugin/cache/handler.go @@ -35,19 +35,29 @@ func (c *Cache) ServeDNS(ctx context.Context, w dns.ResponseWriter, r *dns.Msg) ttl := 0 i := c.getIgnoreTTL(now, state, server) - if i != nil { - ttl = i.ttl(now) - } if i == nil { crr := &ResponseWriter{ResponseWriter: w, Cache: c, state: state, server: server, do: do} return c.doRefresh(ctx, state, crr) } + ttl = i.ttl(now) if ttl < 0 { - servedStale.WithLabelValues(server, c.zonesMetricLabel).Inc() + // serve stale behavior + if c.verifyStale { + crr := &ResponseWriter{ResponseWriter: w, Cache: c, state: state, server: server, do: do} + cw := newVerifyStaleResponseWriter(crr) + ret, err := c.doRefresh(ctx, state, cw) + if cw.refreshed { + return ret, err + } + } + // Adjust the time to get a 0 TTL in the reply built from a stale item. now = now.Add(time.Duration(ttl) * time.Second) - cw := newPrefetchResponseWriter(server, state, c) - go c.doPrefetch(ctx, state, cw, i, now) + if !c.verifyStale { + cw := newPrefetchResponseWriter(server, state, c) + go c.doPrefetch(ctx, state, cw, i, now) + } + servedStale.WithLabelValues(server, c.zonesMetricLabel).Inc() } else if c.shouldPrefetch(i, now) { cw := newPrefetchResponseWriter(server, state, c) go c.doPrefetch(ctx, state, cw, i, now) @@ -70,7 +80,7 @@ func (c *Cache) doPrefetch(ctx context.Context, state request.Request, cw *Respo } } -func (c *Cache) doRefresh(ctx context.Context, state request.Request, cw *ResponseWriter) (int, error) { +func (c *Cache) doRefresh(ctx context.Context, state request.Request, cw dns.ResponseWriter) (int, error) { if !state.Do() { setDo(state.Req) } diff --git a/plugin/cache/setup.go b/plugin/cache/setup.go index afbf361c5..e5258dc06 100644 --- a/plugin/cache/setup.go +++ b/plugin/cache/setup.go @@ -166,11 +166,11 @@ func cacheParse(c *caddy.Controller) (*Cache, error) { case "serve_stale": args := c.RemainingArgs() - if len(args) > 1 { + if len(args) > 2 { return nil, c.ArgErr() } ca.staleUpTo = 1 * time.Hour - if len(args) == 1 { + if len(args) > 0 { d, err := time.ParseDuration(args[0]) if err != nil { return nil, err @@ -180,6 +180,14 @@ func cacheParse(c *caddy.Controller) (*Cache, error) { } ca.staleUpTo = d } + ca.verifyStale = false + if len(args) > 1 { + mode := strings.ToLower(args[1]) + if mode != "immediate" && mode != "verify" { + return nil, fmt.Errorf("invalid value for serve_stale refresh mode: %s", mode) + } + ca.verifyStale = mode == "verify" + } default: return nil, c.ArgErr() } diff --git a/plugin/cache/setup_test.go b/plugin/cache/setup_test.go index 875af7d03..675147d1b 100644 --- a/plugin/cache/setup_test.go +++ b/plugin/cache/setup_test.go @@ -117,20 +117,25 @@ func TestSetup(t *testing.T) { func TestServeStale(t *testing.T) { tests := []struct { - input string - shouldErr bool - staleUpTo time.Duration + input string + shouldErr bool + staleUpTo time.Duration + verifyStale bool }{ - {"serve_stale", false, 1 * time.Hour}, - {"serve_stale 20m", false, 20 * time.Minute}, - {"serve_stale 1h20m", false, 80 * time.Minute}, - {"serve_stale 0m", false, 0}, - {"serve_stale 0", false, 0}, + {"serve_stale", false, 1 * time.Hour, false}, + {"serve_stale 20m", false, 20 * time.Minute, false}, + {"serve_stale 1h20m", false, 80 * time.Minute, false}, + {"serve_stale 0m", false, 0, false}, + {"serve_stale 0", false, 0, false}, + {"serve_stale 0 verify", false, 0, true}, + {"serve_stale 0 immediate", false, 0, false}, + {"serve_stale 0 VERIFY", false, 0, true}, // fails - {"serve_stale 20", true, 0}, - {"serve_stale -20m", true, 0}, - {"serve_stale aa", true, 0}, - {"serve_stale 1m nono", true, 0}, + {"serve_stale 20", true, 0, false}, + {"serve_stale -20m", true, 0, false}, + {"serve_stale aa", true, 0, false}, + {"serve_stale 1m nono", true, 0, false}, + {"serve_stale 0 after nono", true, 0, false}, } for i, test := range tests { c := caddy.NewTestController("dns", fmt.Sprintf("cache {\n%s\n}", test.input))