certificates/acme/authority.go
David Cowden 9af4dd3692 acme: Retry challenge validation attempts
Section 8.2 of RFC 8555 explains how retries apply to the validation
process. However, much is left up to the implementer.

Add retries every 12 seconds for 2 minutes after a client requests a
validation. The challenge status remains "processing" indefinitely until
a distinct conclusion is reached. This allows a client to continually
re-request a validation by sending a post-get to the challenge resource
until the process fails or succeeds.

Challenges in the processing state include information about why a
validation did not complete in the error field. The server also includes
a Retry-After header to help clients and servers coordinate.

Retries are inherently stateful because they're part of the public API.
When running step-ca in a highly available setup with replicas, care
must be taken to maintain a persistent identifier for each instance
"slot". In kubernetes, this implies a *stateful set*.
2020-05-06 07:39:13 -07:00

473 lines
14 KiB
Go

package acme
import (
"crypto"
"crypto/tls"
"crypto/x509"
"encoding/base64"
"log"
"net"
"net/http"
"net/url"
"os"
"strconv"
"time"
"github.com/pkg/errors"
"github.com/smallstep/certificates/authority/provisioner"
database "github.com/smallstep/certificates/db"
"github.com/smallstep/cli/jose"
"github.com/smallstep/nosql"
)
// Interface is the acme authority interface.
type Interface interface {
DeactivateAccount(provisioner.Interface, string) (*Account, error)
FinalizeOrder(provisioner.Interface, string, string, *x509.CertificateRequest) (*Order, error)
GetAccount(provisioner.Interface, string) (*Account, error)
GetAccountByKey(provisioner.Interface, *jose.JSONWebKey) (*Account, error)
GetAuthz(provisioner.Interface, string, string) (*Authz, error)
GetCertificate(string, string) ([]byte, error)
GetDirectory(provisioner.Interface) *Directory
GetLink(Link, string, bool, ...string) string
GetOrder(provisioner.Interface, string, string) (*Order, error)
GetOrdersByAccount(provisioner.Interface, string) ([]string, error)
LoadProvisionerByID(string) (provisioner.Interface, error)
NewAccount(provisioner.Interface, AccountOptions) (*Account, error)
NewNonce() (string, error)
NewOrder(provisioner.Interface, OrderOptions) (*Order, error)
UpdateAccount(provisioner.Interface, string, []string) (*Account, error)
UseNonce(string) error
ValidateChallenge(provisioner.Interface, string, string, *jose.JSONWebKey) (*Challenge, error)
}
// Authority is the layer that handles all ACME interactions.
type Authority struct {
db nosql.DB
dir *directory
signAuth SignAuthority
}
var (
accountTable = []byte("acme_accounts")
accountByKeyIDTable = []byte("acme_keyID_accountID_index")
authzTable = []byte("acme_authzs")
challengeTable = []byte("acme_challenges")
nonceTable = []byte("nonces")
orderTable = []byte("acme_orders")
ordersByAccountIDTable = []byte("acme_account_orders_index")
certTable = []byte("acme_certs")
ordinal int
)
// Ordinal is used during challenge retries to indicate ownership.
func init() {
ordstr := os.Getenv("STEP_CA_ORDINAL");
if ordstr == "" {
ordinal = 0
} else {
ord, err := strconv.Atoi(ordstr)
if err != nil {
log.Fatal("Unrecognized ordinal ingeter value.")
panic(nil)
}
ordinal = ord
}
}
// NewAuthority returns a new Authority that implements the ACME interface.
func NewAuthority(db nosql.DB, dns, prefix string, signAuth SignAuthority) (*Authority, error) {
if _, ok := db.(*database.SimpleDB); !ok {
// If it's not a SimpleDB then go ahead and bootstrap the DB with the
// necessary ACME tables. SimpleDB should ONLY be used for testing.
tables := [][]byte{accountTable, accountByKeyIDTable, authzTable,
challengeTable, nonceTable, orderTable, ordersByAccountIDTable,
certTable}
for _, b := range tables {
if err := db.CreateTable(b); err != nil {
return nil, errors.Wrapf(err, "error creating table %s",
string(b))
}
}
}
return &Authority{
db: db, dir: newDirectory(dns, prefix), signAuth: signAuth,
}, nil
}
// GetLink returns the requested link from the directory.
func (a *Authority) GetLink(typ Link, provID string, abs bool, inputs ...string) string {
return a.dir.getLink(typ, provID, abs, inputs...)
}
// GetDirectory returns the ACME directory object.
func (a *Authority) GetDirectory(p provisioner.Interface) *Directory {
name := url.PathEscape(p.GetName())
return &Directory{
NewNonce: a.dir.getLink(NewNonceLink, name, true),
NewAccount: a.dir.getLink(NewAccountLink, name, true),
NewOrder: a.dir.getLink(NewOrderLink, name, true),
RevokeCert: a.dir.getLink(RevokeCertLink, name, true),
KeyChange: a.dir.getLink(KeyChangeLink, name, true),
}
}
// LoadProvisionerByID calls out to the SignAuthority interface to load a
// provisioner by ID.
func (a *Authority) LoadProvisionerByID(id string) (provisioner.Interface, error) {
return a.signAuth.LoadProvisionerByID(id)
}
// NewNonce generates, stores, and returns a new ACME nonce.
func (a *Authority) NewNonce() (string, error) {
n, err := newNonce(a.db)
if err != nil {
return "", err
}
return n.ID, nil
}
// UseNonce consumes the given nonce if it is valid, returns error otherwise.
func (a *Authority) UseNonce(nonce string) error {
return useNonce(a.db, nonce)
}
// NewAccount creates, stores, and returns a new ACME account.
func (a *Authority) NewAccount(p provisioner.Interface, ao AccountOptions) (*Account, error) {
acc, err := newAccount(a.db, ao)
if err != nil {
return nil, err
}
return acc.toACME(a.db, a.dir, p)
}
// UpdateAccount updates an ACME account.
func (a *Authority) UpdateAccount(p provisioner.Interface, id string, contact []string) (*Account, error) {
acc, err := getAccountByID(a.db, id)
if err != nil {
return nil, ServerInternalErr(err)
}
if acc, err = acc.update(a.db, contact); err != nil {
return nil, err
}
return acc.toACME(a.db, a.dir, p)
}
// GetAccount returns an ACME account.
func (a *Authority) GetAccount(p provisioner.Interface, id string) (*Account, error) {
acc, err := getAccountByID(a.db, id)
if err != nil {
return nil, err
}
return acc.toACME(a.db, a.dir, p)
}
// DeactivateAccount deactivates an ACME account.
func (a *Authority) DeactivateAccount(p provisioner.Interface, id string) (*Account, error) {
acc, err := getAccountByID(a.db, id)
if err != nil {
return nil, err
}
if acc, err = acc.deactivate(a.db); err != nil {
return nil, err
}
return acc.toACME(a.db, a.dir, p)
}
func keyToID(jwk *jose.JSONWebKey) (string, error) {
kid, err := jwk.Thumbprint(crypto.SHA256)
if err != nil {
return "", ServerInternalErr(errors.Wrap(err, "error generating jwk thumbprint"))
}
return base64.RawURLEncoding.EncodeToString(kid), nil
}
// GetAccountByKey returns the ACME associated with the jwk id.
func (a *Authority) GetAccountByKey(p provisioner.Interface, jwk *jose.JSONWebKey) (*Account, error) {
kid, err := keyToID(jwk)
if err != nil {
return nil, err
}
acc, err := getAccountByKeyID(a.db, kid)
if err != nil {
return nil, err
}
return acc.toACME(a.db, a.dir, p)
}
// GetOrder returns an ACME order.
func (a *Authority) GetOrder(p provisioner.Interface, accID, orderID string) (*Order, error) {
o, err := getOrder(a.db, orderID)
if err != nil {
return nil, err
}
if accID != o.AccountID {
return nil, UnauthorizedErr(errors.New("account does not own order"))
}
if o, err = o.updateStatus(a.db); err != nil {
return nil, err
}
return o.toACME(a.db, a.dir, p)
}
// GetOrdersByAccount returns the list of order urls owned by the account.
func (a *Authority) GetOrdersByAccount(p provisioner.Interface, id string) ([]string, error) {
oids, err := getOrderIDsByAccount(a.db, id)
if err != nil {
return nil, err
}
var ret = []string{}
for _, oid := range oids {
o, err := getOrder(a.db, oid)
if err != nil {
return nil, ServerInternalErr(err)
}
if o.Status == StatusInvalid {
continue
}
ret = append(ret, a.dir.getLink(OrderLink, URLSafeProvisionerName(p), true, o.ID))
}
return ret, nil
}
// NewOrder generates, stores, and returns a new ACME order.
func (a *Authority) NewOrder(p provisioner.Interface, ops OrderOptions) (*Order, error) {
order, err := newOrder(a.db, ops)
if err != nil {
return nil, Wrap(err, "error creating order")
}
return order.toACME(a.db, a.dir, p)
}
// FinalizeOrder attempts to finalize an order and generate a new certificate.
func (a *Authority) FinalizeOrder(p provisioner.Interface, accID, orderID string, csr *x509.CertificateRequest) (*Order, error) {
o, err := getOrder(a.db, orderID)
if err != nil {
return nil, err
}
if accID != o.AccountID {
return nil, UnauthorizedErr(errors.New("account does not own order"))
}
o, err = o.finalize(a.db, csr, a.signAuth, p)
if err != nil {
return nil, Wrap(err, "error finalizing order")
}
return o.toACME(a.db, a.dir, p)
}
// GetAuthz retrieves and attempts to update the status on an ACME authz
// before returning.
func (a *Authority) GetAuthz(p provisioner.Interface, accID, authzID string) (*Authz, error) {
az, err := getAuthz(a.db, authzID)
if err != nil {
return nil, err
}
if accID != az.getAccountID() {
return nil, UnauthorizedErr(errors.New("account does not own authz"))
}
az, err = az.updateStatus(a.db)
if err != nil {
return nil, Wrap(err, "error updating authz status")
}
return az.toACME(a.db, a.dir, p)
}
// The challenge validation state machine looks like:
//
// * https://tools.ietf.org/html/rfc8555#section-7.1.6
//
// While in the processing state, the server may retry as it sees fit. The challenge validation strategy
// needs to be rather specific in order for retries to work in a replicated, crash-proof deployment.
// In general, the goal is to allow requests to hit arbitrary instances of step-ca while managing retry
// responsibility such that multiple instances agree on an owner. Additionally, when a deployment of the
// CA is in progress, the ownership should be carried forward and new, updated (or in general, restarted),
// instances should pick back up where the crashed instance left off.
//
// The steps are:
//
// 1. Upon incoming request to the challenge endpoint, take ownership of the retry responsibility.
// (a) Set Retry.Owner to this instance's ordinal (STEP_CA_ORDINAL).
// (b) Set Retry.NumAttempts to 0 and Retry.MaxAttempts to the desired max.
// (c) Set Challenge.Status to "processing"
// (d) Set retry_after to a time (retryInterval) in the future.
// 2. Perform the validation attempt.
// 3. If the validation attempt results in a challenge that is still processing, schedule a retry.
//
// It's possible that another request to re-attempt the challenge comes in while a retry attempt is
// pending from a previous request. In general, these old attempts will see that Retry.NextAttempt
// is in the future and drop their task. But this also might have happened on another instance, etc.
//
// 4. When the retry timer fires, check to make sure the retry should still process.
// (a) Refresh the challenge from the DB.
// (a) Check that Retry.Owner is equal to this instance's ordinal.
// (b) Check that Retry.NextAttempt is in the past.
// 5. If the retry will commence, immediately update Retry.NextAttempt and save the challenge.
//
// Finally, if this instance is terminated, retries need to be reschedule when the instance restarts. This
// is handled in the acme provisioner (authority/provisioner/acme.go) initialization.
//
// Note: the default ordinal does not need to be changed unless step-ca is running in a replicated scenario.
//
func (a *Authority) ValidateChallenge(p provisioner.Interface, accID, chID string, jwk *jose.JSONWebKey) (*Challenge, error) {
ch, err := getChallenge(a.db, chID)
// Validate the challenge belongs to the account owned by the requester.
if err != nil {
return nil, err
}
if accID != ch.getAccountID() {
return nil, UnauthorizedErr(errors.New("account does not own challenge"))
}
// Take ownership of the challenge status and retry state. The values must be reset.
up := ch.clone()
up.Status = StatusProcessing
up.Retry = &Retry {
Owner: ordinal,
ProvisionerID: p.GetID(),
NumAttempts: 0,
MaxAttempts: 10,
NextAttempt: time.Now().Add(retryInterval).UTC().Format(time.RFC3339),
}
err = up.save(a.db, ch)
if err != nil {
return nil, Wrap(err, "error saving challenge")
}
ch = up
v, err := a.validate(ch, jwk)
// An error here is non-recoverable. Recoverable errors are set on the challenge object
// and should not be returned directly.
if err != nil {
return nil, Wrap(err, "error attempting challenge validation")
}
err = v.save(a.db, ch)
if err != nil {
return nil, Wrap(err, "error saving challenge")
}
ch = v
switch ch.getStatus() {
case StatusValid, StatusInvalid:
break
case StatusProcessing:
if ch.getRetry().Active() {
time.AfterFunc(retryInterval, func() {
a.RetryChallenge(ch.getID())
})
}
default:
panic("post-validation challenge in unexpected state" + ch.getStatus())
}
return ch.toACME(a.dir, p)
}
// The challenge validation process is specific to the type of challenge (dns-01, http-01, tls-alpn-01).
// But, we still pass generic "options" to the polymorphic validate call.
func (a *Authority) validate(ch challenge, jwk *jose.JSONWebKey) (challenge, error) {
client := http.Client{
Timeout: time.Duration(30 * time.Second),
}
dialer := &net.Dialer{
Timeout: 30 * time.Second,
}
return ch.validate(jwk, validateOptions{
httpGet: client.Get,
lookupTxt: net.LookupTXT,
tlsDial: func(network, addr string, config *tls.Config) (*tls.Conn, error) {
return tls.DialWithDialer(dialer, network, addr, config)
},
})
}
const retryInterval = 12 * time.Second
// see: ValidateChallenge
func (a *Authority) RetryChallenge(chID string) {
ch, err := getChallenge(a.db, chID)
if err != nil {
return
}
switch ch.getStatus() {
case StatusPending:
panic("pending challenges must first be moved to the processing state")
case StatusInvalid, StatusValid:
return
case StatusProcessing:
break
default:
panic("unknown challenge state: " + ch.getStatus())
}
// When retrying, check to make sure the ordinal has not changed.
// Make sure there are still retries left.
// Then check to make sure Retry.NextAttempt is in the past.
retry := ch.getRetry()
switch {
case retry.Owner != ordinal:
return
case !retry.Active():
return
}
t, err := time.Parse(time.RFC3339, retry.NextAttempt)
now := time.Now().UTC()
switch {
case err != nil:
return
case t.Before(now):
return
}
// Update the db so that other retries simply drop when their timer fires.
up := ch.clone()
up.Retry.NextAttempt = now.Add(retryInterval).UTC().Format(time.RFC3339)
up.Retry.NumAttempts += 1
err = up.save(a.db, ch)
if err != nil {
return
}
ch = up
p, err := a.LoadProvisionerByID(retry.ProvisionerID)
acc, err := a.GetAccount(p, ch.getAccountID())
v, err := a.validate(up, acc.Key)
if err != nil {
return
}
err = v.save(a.db, ch)
if err != nil {
return
}
ch = v
switch ch.getStatus() {
case StatusValid, StatusInvalid:
break
case StatusProcessing:
if ch.getRetry().Active() {
time.AfterFunc(retryInterval, func() {
a.RetryChallenge(ch.getID())
})
}
default:
panic("post-validation challenge in unexpected state " + ch.getStatus())
}
}
// GetCertificate retrieves the Certificate by ID.
func (a *Authority) GetCertificate(accID, certID string) ([]byte, error) {
cert, err := getCert(a.db, certID)
if err != nil {
return nil, err
}
if accID != cert.AccountID {
return nil, UnauthorizedErr(errors.New("account does not own certificate"))
}
return cert.toACME(a.db, a.dir)
}