frostfs-node/pkg/local_object_storage/metabase/util.go

package meta

import (
	"crypto/sha256"
	"encoding/binary"
	"errors"
	"fmt"

	cid "git.frostfs.info/TrueCloudLab/frostfs-sdk-go/container/id"
	objectSDK "git.frostfs.info/TrueCloudLab/frostfs-sdk-go/object"
	oid "git.frostfs.info/TrueCloudLab/frostfs-sdk-go/object/id"
	"go.etcd.io/bbolt"
)

var (
	// graveyardBucketName stores rows with the objects that have been
	// covered with Tombstone objects. Those objects should not be returned
	// from the node and should not be accepted by the node from other
	// nodes.
	graveyardBucketName = []byte{graveyardPrefix}
	// garbageBucketName stores rows with the objects that should be physically
	// deleted by the node (Garbage Collector routine).
	//
	// The names declared below it are, like the two above, single-byte bucket
	// names made of the corresponding prefix constant.
	garbageBucketName          = []byte{garbagePrefix}
	toMoveItBucketName         = []byte{toMoveItPrefix}
	containerVolumeBucketName  = []byte{containerVolumePrefix}
	containerCounterBucketName = []byte{containerCountersPrefix}
	expEpochToObjectBucketName = []byte{expirationEpochToObjectPrefix}

	// zeroValue is a one-byte value. Note that, despite the name, the byte is
	// 0xFF and not 0x00.
	// NOTE(review): presumably this is the placeholder stored in buckets where
	// only the key matters ("dummy value" / "zero" in the prefix descriptions) —
	// confirm against the callers.
	zeroValue = []byte{0xFF}

	// errInvalidLength is returned by the decoders and encoders of this file
	// when a key or buffer does not have the exact expected size.
	errInvalidLength = errors.New("invalid length")
)

// Prefix bytes for database keys. All ids and addresses are encoded in binary
// unless specified otherwise.
//
// The values are produced by iota, so the position of a constant in this block
// fixes its prefix byte. The blank identifier below keeps the slot of a retired
// prefix so that the values after it do not shift.
//
//nolint:godot
const (
	// graveyardPrefix is used for the graveyard bucket.
	//  Key: object address
	//  Value: tombstone address
	graveyardPrefix = iota
	// garbagePrefix is used for the garbage bucket.
	//  Key: object address
	//  Value: dummy value
	garbagePrefix
	// toMoveItPrefix is used for bucket containing IDs of objects that are candidates for moving
	// to another shard.
	toMoveItPrefix
	// containerVolumePrefix is used for storing container size estimations.
	//  Key: container ID
	//  Value: container size in bytes as little-endian uint64
	containerVolumePrefix
	// lockedPrefix is used for storing locked objects information.
	//  Key: container ID
	//  Value: bucket mapping objects locked to the list of corresponding LOCK objects.
	lockedPrefix
	// shardInfoPrefix is used for storing shard ID. All keys are custom and are not connected to the container.
	shardInfoPrefix

	// ======================
	// Unique index buckets.
	// ======================

	// primaryPrefix is used for prefixing buckets containing objects of REGULAR type.
	//  Key: object ID
	//  Value: marshaled object
	primaryPrefix
	// lockersPrefix is used for prefixing buckets containing objects of LOCK type.
	//  Key: object ID
	//  Value: marshaled object
	lockersPrefix
	// _ is unused. Previous usage was for prefixing buckets containing objects of STORAGEGROUP type.
	//  Key: object ID
	//  Value: marshaled object
	_
	// tombstonePrefix is used for prefixing buckets containing objects of TOMBSTONE type.
	//  Key: object ID
	//  Value: marshaled object
	tombstonePrefix
	// smallPrefix is used for prefixing buckets mapping objects to the blobovniczas they are stored in.
	//  Key: object ID
	//  Value: blobovnicza ID
	smallPrefix
	// rootPrefix is used for prefixing buckets mapping parent object to the split info.
	//  Key: object ID
	//  Value: split info
	rootPrefix

	// ====================
	// FKBT index buckets.
	// ====================

	// ownerPrefix was used for prefixing FKBT index buckets mapping owner to object IDs.
	//  Key: owner ID
	//  Value: bucket containing object IDs as keys
	// Removed in version 3.
	ownerPrefix
	// userAttributePrefix was used for prefixing FKBT index buckets containing objects.
	//  Key: attribute value
	//  Value: bucket containing object IDs as keys
	// NOTE(review): attributeBucketName still builds bucket names with this
	// prefix, so "was used" may be outdated — confirm.
	userAttributePrefix

	// ====================
	// List index buckets.
	// ====================

	// payloadHashPrefix was used for prefixing List index buckets mapping payload hash to a list of object IDs.
	//  Key: payload hash
	//  Value: list of object IDs
	// Removed in version 3.
	payloadHashPrefix
	// parentPrefix is used for prefixing List index buckets mapping parent ID to a list of children IDs.
	//  Key: parent ID
	//  Value: list of object IDs
	parentPrefix
	// splitPrefix is used for prefixing List index buckets mapping split ID to a list of object IDs.
	//  Key: split ID
	//  Value: list of object IDs
	splitPrefix

	// containerCountersPrefix is used for storing container object counters.
	//  Key: container ID + type
	//  Value: container size in bytes as little-endian uint64
	// NOTE(review): the value description is identical to the one of
	// containerVolumePrefix and looks copied — confirm what is actually stored.
	containerCountersPrefix

	// ecInfoPrefix is used for storing relation between EC parent id and chunk id.
	//  Key: container ID + type
	//  Value: Object id
	// NOTE(review): the key description matches containerCountersPrefix and
	// looks copied — confirm the real key layout.
	ecInfoPrefix

	// expirationEpochToObjectPrefix is used for storing relation between expiration epoch and object id.
	//  Key: expiration epoch + object address
	//  Value: zero
	expirationEpochToObjectPrefix

	// objectToExpirationEpochPrefix is used for storing relation between object id and expiration epoch.
	//  Key: object address
	//  Value: expiration epoch
	objectToExpirationEpochPrefix
)

// Sizes, in bytes, of the fixed-length parts of keys and bucket names:
//   - cidSize: an encoded container ID (sha256.Size, i.e. 32);
//   - bucketKeySize: a container-scoped bucket name, one prefix byte plus the
//     container ID;
//   - objectKeySize: an encoded object ID (sha256.Size, i.e. 32);
//   - addressKeySize: an encoded address, container ID followed by object ID;
//   - epochSize: an epoch number stored as a uint64.
const (
	cidSize        = sha256.Size
	bucketKeySize  = 1 + cidSize
	objectKeySize  = sha256.Size
	addressKeySize = cidSize + objectKeySize
	epochSize      = 8
)

// bucketName writes a container-scoped bucket name into key and returns it.
//
// The name is the prefix byte followed by the encoded container ID. key is
// used as the output buffer: its first byte and the bytes after it are
// overwritten, and the result is key[:bucketKeySize], so it shares memory with
// key. The caller should therefore provide at least bucketKeySize bytes.
func bucketName(cnr cid.ID, prefix byte, key []byte) []byte {
	key[0] = prefix
	cnr.Encode(key[1:])
	return key[:bucketKeySize]
}

// primaryBucketName returns the name of the container's bucket with objects of
// REGULAR type: primaryPrefix followed by the encoded container ID.
//
// The name is built in key by bucketName, so the result shares memory with key.
func primaryBucketName(cnr cid.ID, key []byte) []byte {
	return bucketName(cnr, primaryPrefix, key)
}

// tombstoneBucketName returns the name of the container's bucket with objects
// of TOMBSTONE type: tombstonePrefix followed by the encoded container ID.
//
// The name is built in key by bucketName, so the result shares memory with key.
func tombstoneBucketName(cnr cid.ID, key []byte) []byte {
	return bucketName(cnr, tombstonePrefix, key)
}

// smallBucketName returns the name of the container's bucket mapping objects to
// the blobovniczas they are stored in: smallPrefix followed by the encoded
// container ID.
//
// The name is built in key by bucketName, so the result shares memory with key.
func smallBucketName(cnr cid.ID, key []byte) []byte {
	return bucketName(cnr, smallPrefix, key)
}

// attributeBucketName returns the name of a container's attribute index
// bucket: userAttributePrefix, the encoded container ID, then the raw bytes of
// attributeKey.
//
// key serves as the scratch buffer for the fixed-size head of the name. The
// attribute key is appended to that head, so whether the result shares memory
// with key depends on the capacity of key.
func attributeBucketName(cnr cid.ID, attributeKey string, key []byte) []byte {
	key[0] = userAttributePrefix
	cnr.Encode(key[1:])

	name := key[:bucketKeySize]
	name = append(name, attributeKey...)
	return name
}

// cidFromAttributeBucket extracts the container ID from a bucket name produced
// by attributeBucketName.
//
// It returns false, together with a zero ID, when bucketName is shorter than
// bucketKeySize, does not start with userAttributePrefix, or carries bytes that
// cannot be decoded as a container ID.
func cidFromAttributeBucket(bucketName []byte) (cid.ID, bool) {
	if len(bucketName) < bucketKeySize || bucketName[0] != userAttributePrefix {
		return cid.ID{}, false
	}

	// Decode before building the result list: in "return result, result.Decode(...)"
	// the language does not specify whether result is read before or after the
	// call that fills it.
	var result cid.ID
	if err := result.Decode(bucketName[1:bucketKeySize]); err != nil {
		return cid.ID{}, false
	}
	return result, true
}

// attributeFromAttributeBucket extracts the attribute key from a bucket name
// produced by attributeBucketName, i.e. everything after the prefix byte and
// the container ID.
//
// It returns false when bucketName is shorter than bucketKeySize or does not
// start with userAttributePrefix.
func attributeFromAttributeBucket(bucketName []byte) (string, bool) {
	isAttributeBucket := len(bucketName) >= bucketKeySize && bucketName[0] == userAttributePrefix
	if !isAttributeBucket {
		return "", false
	}

	attribute := bucketName[bucketKeySize:]
	return string(attribute), true
}

// rootBucketName returns the name of the container's bucket mapping parent
// objects to their split info: rootPrefix followed by the encoded container ID.
//
// The name is built in key by bucketName, so the result shares memory with key.
func rootBucketName(cnr cid.ID, key []byte) []byte {
	return bucketName(cnr, rootPrefix, key)
}

// parentBucketName returns the name of the container's bucket mapping a parent
// ID to the list of its children IDs: parentPrefix followed by the encoded
// container ID.
//
// The name is built in key by bucketName, so the result shares memory with key.
func parentBucketName(cnr cid.ID, key []byte) []byte {
	return bucketName(cnr, parentPrefix, key)
}

// splitBucketName returns the name of the container's bucket mapping a split ID
// to a list of object IDs: splitPrefix followed by the encoded container ID.
//
// The name is built in key by bucketName, so the result shares memory with key.
func splitBucketName(cnr cid.ID, key []byte) []byte {
	return bucketName(cnr, splitPrefix, key)
}

// ecInfoBucketName returns the name of the container's bucket relating EC
// parent IDs to chunk IDs: ecInfoPrefix followed by the encoded container ID.
//
// The name is built in key by bucketName, so the result shares memory with key.
func ecInfoBucketName(cnr cid.ID, key []byte) []byte {
	return bucketName(cnr, ecInfoPrefix, key)
}

// objectToExpirationEpochBucketName returns the name of the container's bucket
// relating objects to their expiration epoch: objectToExpirationEpochPrefix
// followed by the encoded container ID.
//
// The name is built in key by bucketName, so the result shares memory with key.
func objectToExpirationEpochBucketName(cnr cid.ID, key []byte) []byte {
	return bucketName(cnr, objectToExpirationEpochPrefix, key)
}

// expirationEpochKey builds a key of the expiration-epoch-to-object bucket in a
// freshly allocated buffer.
//
// Layout: epoch as a big-endian uint64 (epochSize bytes), then the encoded
// container ID, then the encoded object ID. parseExpirationEpochKey is the
// inverse operation.
// NOTE(review): big-endian is presumably chosen so that keys order by epoch in
// the bucket — confirm.
func expirationEpochKey(epoch uint64, cnr cid.ID, obj oid.ID) []byte {
	key := make([]byte, epochSize+addressKeySize)

	cnrPart := key[epochSize:]
	objPart := key[epochSize+cidSize:]

	binary.BigEndian.PutUint64(key, epoch)
	cnr.Encode(cnrPart)
	obj.Encode(objPart)

	return key
}

// parseExpirationEpochKey splits a key built by expirationEpochKey into its
// epoch, container ID and object ID.
//
// It returns an error, with zero values for the other results, when the key is
// not exactly epochSize+addressKeySize bytes long or when one of the IDs cannot
// be decoded.
func parseExpirationEpochKey(key []byte) (uint64, cid.ID, oid.ID, error) {
	var (
		cnr cid.ID
		obj oid.ID
	)

	if len(key) != epochSize+addressKeySize {
		return 0, cid.ID{}, oid.ID{}, fmt.Errorf("unexpected expiration epoch to object key length: %d", len(key))
	}

	cnrPart := key[epochSize : epochSize+cidSize]
	objPart := key[epochSize+cidSize:]

	if err := cnr.Decode(cnrPart); err != nil {
		return 0, cid.ID{}, oid.ID{}, fmt.Errorf("failed to decode expiration epoch to object key (container ID): %w", err)
	}
	if err := obj.Decode(objPart); err != nil {
		return 0, cid.ID{}, oid.ID{}, fmt.Errorf("failed to decode expiration epoch to object key (object ID): %w", err)
	}

	// The epoch occupies the leading epochSize bytes in big-endian order.
	return binary.BigEndian.Uint64(key), cnr, obj, nil
}

// addressKey encodes addr into key and returns the encoded part, for K-V tables
// whose key is a whole address.
//
// Layout: encoded container ID followed by the encoded object ID. The result is
// key[:addressKeySize] and therefore shares memory with key.
func addressKey(addr oid.Address, key []byte) []byte {
	cnr, obj := addr.Container(), addr.Object()

	cnr.Encode(key)
	obj.Encode(key[cidSize:])

	return key[:addressKeySize]
}

// decodeAddressFromKey parses an object address laid out by addressKey and
// stores it in dst.
//
// It returns errInvalidLength when k is not exactly addressKeySize bytes long,
// or the decoding error of the container or object ID. dst is left untouched
// on error.
func decodeAddressFromKey(dst *oid.Address, k []byte) error {
	if len(k) != addressKeySize {
		return errInvalidLength
	}

	var (
		cnr cid.ID
		obj oid.ID
	)

	err := cnr.Decode(k[:cidSize])
	if err == nil {
		err = obj.Decode(k[cidSize:])
	}
	if err != nil {
		return err
	}

	dst.SetContainer(cnr)
	dst.SetObject(obj)
	return nil
}

// objectKey returns key for K-V tables when key is an object id.
//
// The object ID is encoded into key, and the result is key[:objectKeySize], so
// it shares memory with key.
func objectKey(obj oid.ID, key []byte) []byte {
	obj.Encode(key)
	return key[:objectKeySize]
}

// firstIrregularObjectType reports the type of the first object key in objs
// that is found in the container's tombstone or lockers bucket.
//
// Keys are checked in the given order, and for every key the tombstone bucket
// is consulted before the lockers bucket. When no key is found in either
// bucket, objectSDK.TypeRegular is returned. Called with a single key, this
// gives the type of that object.
//
// It panics when objs is empty.
func firstIrregularObjectType(tx *bbolt.Tx, idCnr cid.ID, objs ...[]byte) objectSDK.Type {
	if len(objs) == 0 {
		panic("empty object list in firstIrregularObjectType")
	}

	// Each bucket name gets its own fixed-size backing array.
	var tombstoneBuf, lockersBuf [1 + cidSize]byte
	tombstones := tombstoneBucketName(idCnr, tombstoneBuf[:])
	lockers := bucketNameLockers(idCnr, lockersBuf[:])

	for _, objKey := range objs {
		switch {
		case inBucket(tx, tombstones, objKey):
			return objectSDK.TypeTombstone
		case inBucket(tx, lockers, objKey):
			return objectSDK.TypeLock
		}
	}

	return objectSDK.TypeRegular
}

// isLockObject reports whether obj is present in the lockers bucket of the
// container idCnr, i.e. whether it is an object of LOCK type.
func isLockObject(tx *bbolt.Tx, idCnr cid.ID, obj oid.ID) bool {
	bucket := bucketNameLockers(idCnr, make([]byte, bucketKeySize))
	key := objectKey(obj, make([]byte, objectKeySize))

	return inBucket(tx, bucket, key)
}

// NoExpirationEpoch is the expiration epoch value that stands for "no
// expiration epoch", e.g. for stored tombstone labels or locks that do not
// carry one.
const NoExpirationEpoch uint64 = 0

// encodeTombstoneWithExpEpoch writes a tombstone label into dst in the
// following format: tombstone_address + expiration_epoch, where the address is
// the encoded container ID followed by the encoded object ID and the epoch is a
// little-endian uint64.
//
// Returns errInvalidLength if the length of dst isn't exactly
// addressKeySize+epochSize (72 bytes).
//
// The expiration epoch shouldn't be [NoExpirationEpoch], as tombstone labels
// are intended to have a valid expiration epoch.
//
// The use of [NoExpirationEpoch] is allowed only for test purposes.
func encodeTombstoneWithExpEpoch(addr oid.Address, expEpoch uint64, dst []byte) error {
	if len(dst) != addressKeySize+epochSize {
		return errInvalidLength
	}

	cnrPart := dst[:cidSize]
	objPart := dst[cidSize:addressKeySize]
	epochPart := dst[addressKeySize:]

	addr.Container().Encode(cnrPart)
	addr.Object().Encode(objPart)
	binary.LittleEndian.PutUint64(epochPart, expEpoch)

	return nil
}

// decodeTombstoneWithExpEpoch reads a tombstone label stored in one of two
// formats: tombstone address only, or tombstone address followed by a
// little-endian expiration epoch.
//
// On success addr receives the address and expEpoch the epoch, which is
// [NoExpirationEpoch] if the label doesn't have one. Returns errInvalidLength
// for any other label size, or the decoding error of the container or object
// ID; nothing is written to addr or expEpoch in that case.
func decodeTombstoneWithExpEpoch(addr *oid.Address, expEpoch *uint64, src []byte) error {
	hasEpoch := len(src) == addressKeySize+epochSize
	if !hasEpoch && len(src) != addressKeySize {
		return errInvalidLength
	}

	var (
		cnr cid.ID
		obj oid.ID
	)
	if err := cnr.Decode(src[:cidSize]); err != nil {
		return err
	}
	if err := obj.Decode(src[cidSize:addressKeySize]); err != nil {
		return err
	}

	addr.SetContainer(cnr)
	addr.SetObject(obj)

	epoch := NoExpirationEpoch
	if hasEpoch {
		epoch = binary.LittleEndian.Uint64(src[addressKeySize:])
	}
	*expEpoch = epoch

	return nil
}

// lockWithExpEpoch contains the ID and expiration epoch of the lock, both kept
// in their encoded form.
//
// id holds the encoded object ID of the lock. expEpoch holds the expiration
// epoch as a little-endian uint64; all-zero bytes mean [NoExpirationEpoch].
type lockWithExpEpoch struct {
	id       [objectKeySize]byte
	expEpoch [epochSize]byte
}

// decode extracts the ID and expiration epoch of the lock into id and
// expEpoch.
//
// If the lock has no expiration epoch, expEpoch receives [NoExpirationEpoch].
// When the ID cannot be decoded the error is returned and expEpoch is left
// untouched.
func (l lockWithExpEpoch) decode(id *oid.ID, expEpoch *uint64) error {
	err := id.Decode(l.id[:])
	if err == nil {
		*expEpoch = binary.LittleEndian.Uint64(l.expEpoch[:])
	}
	return err
}

// encode encodes the ID and expiration epoch of the lock, overwriting both
// fields of l. The epoch is stored as a little-endian uint64.
func (l *lockWithExpEpoch) encode(id oid.ID, expEpoch uint64) {
	id.Encode(l.id[:])
	binary.LittleEndian.PutUint64(l.expEpoch[:], expEpoch)
}

// decodeLockWithExpEpochList decodes the lock list encoded with
// [encodeLockWithExpEpochList].
//
// The decoded list is a flat sequence of elements: every lock contributes its
// ID (objectKeySize bytes), optionally followed by its expiration epoch
// (epochSize bytes). Locks without an epoch element get [NoExpirationEpoch].
//
// An error is returned when the list cannot be decoded, when an epoch element
// is not preceded by a lock ID, or when an element has any other size.
func decodeLockWithExpEpochList(data []byte) (locks []lockWithExpEpoch, err error) {
	items, err := decodeList(data)
	if err != nil {
		return nil, fmt.Errorf("couldn't decode list: %w", err)
	}

	// pendingID is a lock ID that has been read but not emitted yet, because
	// the next element may still be its expiration epoch.
	var (
		pendingID  []byte
		hasPending bool
	)
	// emit appends the pending lock with the given epoch bytes (nil for none).
	emit := func(epoch []byte) {
		var lock lockWithExpEpoch
		copy(lock.id[:], pendingID)
		copy(lock.expEpoch[:], epoch)
		locks = append(locks, lock)
	}

	for _, item := range items {
		switch len(item) {
		case objectKeySize:
			if hasPending {
				// The previous lock had no epoch element.
				emit(nil)
			}
			pendingID, hasPending = item, true
		case epochSize:
			if !hasPending {
				return nil, errors.New("found expiration epoch but expected lock")
			}
			emit(item)
			hasPending = false
		default:
			return nil, fmt.Errorf("unexpected list element size %d", len(item))
		}
	}

	if hasPending {
		emit(nil)
	}

	return locks, nil
}

// encodeLockWithExpEpochList encodes the lock list.
//
// Every lock contributes its ID and, unless its epoch is
// [NoExpirationEpoch] (all-zero bytes), its expiration epoch right after the
// ID. The resulting sequence is serialized with encodeList; the result can be
// read back with [decodeLockWithExpEpochList].
//
// Returns a wrapped error if the list cannot be encoded.
func encodeLockWithExpEpochList(locks []lockWithExpEpoch) (data []byte, err error) {
	var noEpoch [epochSize]byte

	var xs [][]byte

	// Index into locks rather than ranging by value: the appended slices point
	// into the element's arrays, so they must not refer to a loop variable
	// (shared across iterations before Go 1.22, copied to the heap per
	// iteration since).
	for i := range locks {
		lock := &locks[i]

		xs = append(xs, lock.id[:])
		if lock.expEpoch != noEpoch {
			xs = append(xs, lock.expEpoch[:])
		}
	}

	data, err = encodeList(xs)
	if err != nil {
		return nil, fmt.Errorf("couldn't encode list: %w", err)
	}
	return data, nil
}