frostfs-node/pkg/local_object_storage/metabase/util.go

455 lines
14 KiB
Go
Raw Normal View History

package meta
import (
"crypto/sha256"
"encoding/binary"
"errors"
"fmt"
cid "git.frostfs.info/TrueCloudLab/frostfs-sdk-go/container/id"
objectSDK "git.frostfs.info/TrueCloudLab/frostfs-sdk-go/object"
oid "git.frostfs.info/TrueCloudLab/frostfs-sdk-go/object/id"
"go.etcd.io/bbolt"
)
var (
// graveyardBucketName stores rows with the objects that have been
// covered with Tombstone objects. Those objects should not be returned
// from the node and should not be accepted by the node from other
// nodes.
graveyardBucketName = []byte{graveyardPrefix}
// garbageBucketName stores rows with the objects that should be physically
// deleted by the node (Garbage Collector routine).
garbageBucketName = []byte{garbagePrefix}
// toMoveItBucketName is the name of the bucket with IDs of objects that are
// candidates for moving to another shard.
toMoveItBucketName = []byte{toMoveItPrefix}
// containerVolumeBucketName is the name of the bucket with container size
// estimations.
containerVolumeBucketName = []byte{containerVolumePrefix}
// containerCounterBucketName is the name of the bucket with container object
// counters.
containerCounterBucketName = []byte{containerCountersPrefix}
// expEpochToObjectBucketName is the name of the bucket relating expiration
// epochs to objects.
expEpochToObjectBucketName = []byte{expirationEpochToObjectPrefix}
// zeroValue is a single 0xFF byte.
// NOTE(review): presumably the placeholder stored where only the key matters
// (e.g. the "dummy value" of the garbage bucket) — confirm against callers.
zeroValue = []byte{0xFF}
// errInvalidLength is returned by the encoders and decoders in this file when
// a buffer does not have the expected length.
errInvalidLength = errors.New("invalid length")
)
// Prefix bytes for database keys. All ids and addresses are encoded in binary
// unless specified otherwise.
//
// Values are assigned with iota, so the value of each prefix is determined by
// its position in this block; the blank identifier below keeps the slot of a
// retired prefix occupied.
//
//nolint:godot
const (
// graveyardPrefix is used for the graveyard bucket.
// Key: object address
// Value: tombstone address
graveyardPrefix = iota
// garbagePrefix is used for the garbage bucket.
// Key: object address
// Value: dummy value
garbagePrefix
// toMoveItPrefix is used for bucket containing IDs of objects that are candidates for moving
// to another shard.
toMoveItPrefix
// containerVolumePrefix is used for storing container size estimations.
// Key: container ID
// Value: container size in bytes as little-endian uint64
containerVolumePrefix
// lockedPrefix is used for storing locked objects information.
// Key: container ID
// Value: bucket mapping objects locked to the list of corresponding LOCK objects.
lockedPrefix
// shardInfoPrefix is used for storing shard ID. All keys are custom and are not connected to the container.
shardInfoPrefix
// ======================
// Unique index buckets.
// ======================
// primaryPrefix is used for prefixing buckets containing objects of REGULAR type.
// Key: object ID
// Value: marshalled object
primaryPrefix
// lockersPrefix is used for prefixing buckets containing objects of LOCK type.
// Key: object ID
// Value: marshalled object
lockersPrefix
// _ is unused. Previous usage was for prefixing buckets containing objects of STORAGEGROUP type.
// Key: object ID
// Value: marshaled object
_
// tombstonePrefix is used for prefixing buckets containing objects of TOMBSTONE type.
// Key: object ID
// Value: marshaled object
tombstonePrefix
// smallPrefix is used for prefixing buckets mapping objects to the blobovniczas they are stored in.
// Key: object ID
// Value: blobovnicza ID
smallPrefix
// rootPrefix is used for prefixing buckets mapping parent object to the split info.
// Key: object ID
// Value: split info
rootPrefix
// ====================
// FKBT index buckets.
// ====================
// ownerPrefix was used for prefixing FKBT index buckets mapping owner to object IDs.
// Key: owner ID
// Value: bucket containing object IDs as keys
// removed in version 3
ownerPrefix
// userAttributePrefix was used for prefixing FKBT index buckets containing objects.
// Key: attribute value
// Value: bucket containing object IDs as keys
// NOTE(review): attributeBucketName in this file still builds bucket names
// with this prefix — confirm whether "was used" is accurate.
userAttributePrefix
// ====================
// List index buckets.
// ====================
// payloadHashPrefix was used for prefixing List index buckets mapping payload hash to a list of object IDs.
// Key: payload hash
// Value: list of object IDs
// removed in version 3
payloadHashPrefix
// parentPrefix is used for prefixing List index buckets mapping parent ID to a list of children IDs.
// Key: parent ID
// Value: list of object IDs
parentPrefix
// splitPrefix is used for prefixing List index buckets mapping split ID to a list of object IDs.
// Key: split ID
// Value: list of object IDs
splitPrefix
// containerCountersPrefix is used for storing container object counters.
// Key: container ID + type
// Value: container size in bytes as little-endian uint64
// NOTE(review): the value description is identical to containerVolumePrefix
// and talks about size rather than counters — confirm the actual encoding.
containerCountersPrefix
// ecInfoPrefix is used for storing relation between EC parent id and chunk id.
// Key: container ID + type
// Value: Object id
// NOTE(review): the key description repeats the containerCountersPrefix one
// and does not mention the EC parent id — confirm the actual key layout.
ecInfoPrefix
// expirationEpochToObjectPrefix is used for storing relation between expiration epoch and object id.
// Key: expiration epoch + object address
// Value: zero
expirationEpochToObjectPrefix
// objectToExpirationEpochPrefix is used for storing relation between object and its expiration epoch.
// Key: object address
// Value: expiration epoch
objectToExpirationEpochPrefix
)
// Sizes in bytes of the fixed-length key components used in this file.
const (
// cidSize is the length of an encoded container ID.
cidSize = sha256.Size
// bucketKeySize is the length of a per-container bucket name: one prefix byte
// followed by the container ID.
bucketKeySize = 1 + cidSize
// objectKeySize is the length of an encoded object ID.
objectKeySize = sha256.Size
// addressKeySize is the length of an encoded address: container ID followed
// by object ID.
addressKeySize = cidSize + objectKeySize
// epochSize is the length of an epoch encoded as uint64.
epochSize = 8
)
// bucketName builds the name of a per-container bucket in the caller-supplied
// buffer: key[0] receives prefix and the encoded container ID is written right
// after it.
//
// The returned slice is the first bucketKeySize bytes of key and aliases it,
// so key must have room for at least bucketKeySize bytes.
func bucketName(cnr cid.ID, prefix byte, key []byte) []byte {
	key[0] = prefix

	idPart := key[1:]
	cnr.Encode(idPart)

	name := key[:bucketKeySize]
	return name
}
// primaryBucketName returns the name of the bucket with objects of REGULAR
// type in container cnr: the primaryPrefix byte followed by the encoded
// container ID.
//
// The name is written into key, which must have room for bucketKeySize bytes;
// the returned slice aliases key.
func primaryBucketName(cnr cid.ID, key []byte) []byte {
return bucketName(cnr, primaryPrefix, key)
}
// tombstoneBucketName returns the name of the bucket with objects of TOMBSTONE
// type in container cnr: the tombstonePrefix byte followed by the encoded
// container ID.
//
// The name is written into key, which must have room for bucketKeySize bytes;
// the returned slice aliases key.
func tombstoneBucketName(cnr cid.ID, key []byte) []byte {
return bucketName(cnr, tombstonePrefix, key)
}
// smallBucketName returns the name of the bucket mapping objects of container
// cnr to the blobovniczas they are stored in: the smallPrefix byte followed by
// the encoded container ID.
//
// The name is written into key, which must have room for bucketKeySize bytes;
// the returned slice aliases key.
func smallBucketName(cnr cid.ID, key []byte) []byte {
return bucketName(cnr, smallPrefix, key)
}
// attributeBucketName returns the name of the attribute index bucket of
// container cnr: the userAttributePrefix byte, the encoded container ID and
// then the bytes of attributeKey.
//
// The prefix and container ID are written into key, which must have room for
// bucketKeySize bytes. The attribute key is appended to that part, so the
// result reuses key's backing array when its capacity allows and is
// reallocated otherwise.
func attributeBucketName(cnr cid.ID, attributeKey string, key []byte) []byte {
	name := bucketName(cnr, userAttributePrefix, key)
	return append(name, attributeKey...)
}
// cidFromAttributeBucket extracts the container ID from a bucket name built by
// attributeBucketName.
//
// It returns false if bucketName is shorter than bucketKeySize, does not start
// with userAttributePrefix, or the container ID part fails to decode. In all
// of these cases the returned ID is the zero value.
func cidFromAttributeBucket(bucketName []byte) (cid.ID, bool) {
	if len(bucketName) < bucketKeySize || bucketName[0] != userAttributePrefix {
		return cid.ID{}, false
	}

	// Decode before building the return values: reading result and calling
	// result.Decode in one return statement would rely on an evaluation order
	// the Go spec leaves unspecified.
	var result cid.ID
	if err := result.Decode(bucketName[1:bucketKeySize]); err != nil {
		return cid.ID{}, false
	}
	return result, true
}
// attributeFromAttributeBucket extracts the attribute key from a bucket name
// built by attributeBucketName, i.e. everything after the prefix byte and the
// container ID.
//
// It returns false if bucketName is shorter than bucketKeySize or does not
// start with userAttributePrefix.
func attributeFromAttributeBucket(bucketName []byte) (string, bool) {
	if len(bucketName) >= bucketKeySize && bucketName[0] == userAttributePrefix {
		attribute := bucketName[bucketKeySize:]
		return string(attribute), true
	}
	return "", false
}
// rootBucketName returns the name of the bucket mapping parent objects of
// container cnr to their split info: the rootPrefix byte followed by the
// encoded container ID.
//
// The name is written into key, which must have room for bucketKeySize bytes;
// the returned slice aliases key.
func rootBucketName(cnr cid.ID, key []byte) []byte {
return bucketName(cnr, rootPrefix, key)
}
// parentBucketName returns the name of the bucket mapping parent IDs of
// container cnr to lists of children IDs: the parentPrefix byte followed by
// the encoded container ID.
//
// The name is written into key, which must have room for bucketKeySize bytes;
// the returned slice aliases key.
func parentBucketName(cnr cid.ID, key []byte) []byte {
return bucketName(cnr, parentPrefix, key)
}
// splitBucketName returns the name of the bucket mapping split IDs of
// container cnr to lists of object IDs: the splitPrefix byte followed by the
// encoded container ID.
//
// The name is written into key, which must have room for bucketKeySize bytes;
// the returned slice aliases key.
func splitBucketName(cnr cid.ID, key []byte) []byte {
return bucketName(cnr, splitPrefix, key)
}
// ecInfoBucketName returns the name of the bucket relating EC parent IDs of
// container cnr to chunk IDs: the ecInfoPrefix byte followed by the encoded
// container ID.
//
// The name is written into key, which must have room for bucketKeySize bytes;
// the returned slice aliases key.
func ecInfoBucketName(cnr cid.ID, key []byte) []byte {
return bucketName(cnr, ecInfoPrefix, key)
}
// objectToExpirationEpochBucketName returns the name of the bucket mapping
// objects of container cnr to their expiration epochs: the
// objectToExpirationEpochPrefix byte followed by the encoded container ID.
//
// The name is written into key, which must have room for bucketKeySize bytes;
// the returned slice aliases key.
func objectToExpirationEpochBucketName(cnr cid.ID, key []byte) []byte {
return bucketName(cnr, objectToExpirationEpochPrefix, key)
}
// expirationEpochKey returns a freshly allocated key made of the epoch as a
// big-endian uint64, followed by the encoded container ID and the encoded
// object ID. The key is epochSize+addressKeySize bytes long and can be parsed
// back with parseExpirationEpochKey.
func expirationEpochKey(epoch uint64, cnr cid.ID, obj oid.ID) []byte {
	const (
		cnrOffset = epochSize
		objOffset = epochSize + cidSize
	)

	key := make([]byte, epochSize+addressKeySize)
	binary.BigEndian.PutUint64(key, epoch)
	cnr.Encode(key[cnrOffset:])
	obj.Encode(key[objOffset:])

	return key
}
// parseExpirationEpochKey splits a key produced by expirationEpochKey into the
// epoch (big-endian uint64), the container ID and the object ID.
//
// It returns an error if key is not exactly epochSize+addressKeySize bytes
// long or if either ID fails to decode; the other results are zero values in
// that case.
func parseExpirationEpochKey(key []byte) (uint64, cid.ID, oid.ID, error) {
	const (
		cnrStart = epochSize
		objStart = epochSize + cidSize
	)

	if len(key) != epochSize+addressKeySize {
		return 0, cid.ID{}, oid.ID{}, fmt.Errorf("unexpected expiration epoch to object key length: %d", len(key))
	}

	var (
		cnr cid.ID
		obj oid.ID
	)
	epoch := binary.BigEndian.Uint64(key)

	err := cnr.Decode(key[cnrStart:objStart])
	if err != nil {
		return 0, cid.ID{}, oid.ID{}, fmt.Errorf("failed to decode expiration epoch to object key (container ID): %w", err)
	}
	if err = obj.Decode(key[objStart:]); err != nil {
		return 0, cid.ID{}, oid.ID{}, fmt.Errorf("failed to decode expiration epoch to object key (object ID): %w", err)
	}

	return epoch, cnr, obj, nil
}
// addressKey encodes addr for K-V tables keyed by a whole address: the
// container ID followed by the object ID.
//
// The encoding is written into key, which must have room for addressKeySize
// bytes; the returned slice is the first addressKeySize bytes of key and
// aliases it.
func addressKey(addr oid.Address, key []byte) []byte {
	cnr, obj := addr.Container(), addr.Object()

	cnr.Encode(key)
	obj.Encode(key[cidSize:])

	return key[:addressKeySize]
}
// decodeAddressFromKey parses an object address formed by addressKey and
// stores it in dst.
//
// It returns errInvalidLength if k is not exactly addressKeySize bytes long,
// or the decoding error of the container or object ID. dst is modified only
// when both IDs decode successfully.
func decodeAddressFromKey(dst *oid.Address, k []byte) error {
	if len(k) != addressKeySize {
		return errInvalidLength
	}

	var (
		cnr cid.ID
		obj oid.ID
	)
	cnrPart, objPart := k[:cidSize], k[cidSize:]

	err := cnr.Decode(cnrPart)
	if err == nil {
		err = obj.Decode(objPart)
	}
	if err != nil {
		return err
	}

	dst.SetObject(obj)
	dst.SetContainer(cnr)
	return nil
}
// objectKey returns key for K-V tables when key is an object id.
//
// obj is encoded into the caller-supplied key buffer, which must have room for
// objectKeySize bytes; the returned slice is the first objectKeySize bytes of
// key and aliases it.
func objectKey(obj oid.ID, key []byte) []byte {
obj.Encode(key)
return key[:objectKeySize]
}
// firstIrregularObjectType looks up every element of objs in the tombstone and
// lockers buckets of container idCnr and returns the type of the first hit:
// objectSDK.TypeTombstone or objectSDK.TypeLock. For each element the
// tombstone bucket is checked before the lockers bucket. If no element is
// found in either bucket, objectSDK.TypeRegular is returned.
//
// firstIrregularObjectType(tx, cnr, obj) usage allows getting object type.
//
// Panics if objs is empty.
func firstIrregularObjectType(tx *bbolt.Tx, idCnr cid.ID, objs ...[]byte) objectSDK.Type {
	if len(objs) == 0 {
		panic("empty object list in firstIrregularObjectType")
	}

	// Bucket names are built once in stack buffers and reused for every lookup.
	var tombstoneKey, lockersKey [bucketKeySize]byte
	tombstones := tombstoneBucketName(idCnr, tombstoneKey[:])
	lockers := bucketNameLockers(idCnr, lockersKey[:])

	for _, obj := range objs {
		switch {
		case inBucket(tx, tombstones, obj):
			return objectSDK.TypeTombstone
		case inBucket(tx, lockers, obj):
			return objectSDK.TypeLock
		}
	}

	return objectSDK.TypeRegular
}
// isLockObject reports whether obj is present in the lockers bucket of
// container idCnr, i.e. whether it is an object of LOCK type.
func isLockObject(tx *bbolt.Tx, idCnr cid.ID, obj oid.ID) bool {
	bucket := bucketNameLockers(idCnr, make([]byte, bucketKeySize))
	key := objectKey(obj, make([]byte, objectKeySize))

	return inBucket(tx, bucket, key)
}
// NoExpirationEpoch is the epoch value that stands for "no expiration epoch".
// It is what the decoders in this file report for tombstone labels and locks
// that were stored without an expiration epoch.
const NoExpirationEpoch uint64 = 0
// encodeTombstoneWithExpEpoch encodes a tombstone label into dst in the
// following format: tombstone_address + expiration_epoch, where the address is
// the container ID followed by the object ID and the epoch is a little-endian
// uint64.
//
// Returns errInvalidLength if the buffer length isn't exactly
// addressKeySize+epochSize.
//
// The expiration epoch shouldn't be [NoExpirationEpoch], as tombstone labels
// are intended to have a valid expiration epoch.
//
// The use of [NoExpirationEpoch] is allowed only for test purposes.
func encodeTombstoneWithExpEpoch(addr oid.Address, expEpoch uint64, dst []byte) error {
	const labelSize = addressKeySize + epochSize

	if len(dst) != labelSize {
		return errInvalidLength
	}

	cnrPart := dst[:cidSize]
	objPart := dst[cidSize:addressKeySize]
	epochPart := dst[addressKeySize:]

	addr.Container().Encode(cnrPart)
	addr.Object().Encode(objPart)
	binary.LittleEndian.PutUint64(epochPart, expEpoch)

	return nil
}
// decodeTombstoneWithExpEpoch decodes a tombstone label stored in one of two
// formats: a bare tombstone address, or a tombstone address followed by a
// little-endian uint64 expiration epoch.
//
// Expiration epoch is set to [NoExpirationEpoch] if the label doesn't have one.
//
// Returns errInvalidLength if src matches neither format, or the decoding
// error of the container or object ID. addr and expEpoch are written only
// after both IDs have been decoded successfully.
func decodeTombstoneWithExpEpoch(addr *oid.Address, expEpoch *uint64, src []byte) error {
	var hasEpoch bool
	switch len(src) {
	case addressKeySize:
		// Label without an epoch.
	case addressKeySize + epochSize:
		hasEpoch = true
	default:
		return errInvalidLength
	}

	var (
		cnr cid.ID
		obj oid.ID
	)
	if err := cnr.Decode(src[:cidSize]); err != nil {
		return err
	}
	if err := obj.Decode(src[cidSize:addressKeySize]); err != nil {
		return err
	}

	addr.SetContainer(cnr)
	addr.SetObject(obj)

	epoch := NoExpirationEpoch
	if hasEpoch {
		epoch = binary.LittleEndian.Uint64(src[addressKeySize:])
	}
	*expEpoch = epoch

	return nil
}
// lockWithExpEpoch contains the ID and expiration epoch of the lock.
//
// Both fields hold the encoded form of the values; use the encode and decode
// methods to convert from and to oid.ID and uint64.
type lockWithExpEpoch struct {
// id is the encoded object ID of the lock.
id [objectKeySize]byte
// expEpoch is the expiration epoch as a little-endian uint64. All zero bytes
// correspond to NoExpirationEpoch.
expEpoch [epochSize]byte
}
// decode extracts the lock's object ID into id and its expiration epoch
// (stored as a little-endian uint64) into expEpoch.
//
// A lock stored without an expiration epoch has all-zero epoch bytes and
// therefore yields [NoExpirationEpoch].
//
// If the ID fails to decode, that error is returned and expEpoch is left
// untouched.
func (l lockWithExpEpoch) decode(id *oid.ID, expEpoch *uint64) error {
	err := id.Decode(l.id[:])
	if err == nil {
		*expEpoch = binary.LittleEndian.Uint64(l.expEpoch[:])
	}
	return err
}
// encode encodes the ID and expiration epoch of the lock into l, overwriting
// its previous contents. The epoch is stored as a little-endian uint64.
func (l *lockWithExpEpoch) encode(id oid.ID, expEpoch uint64) {
id.Encode(l.id[:])
binary.LittleEndian.PutUint64(l.expEpoch[:], expEpoch)
}
// decodeLockWithExpEpochList decodes the lock list encoded with
// [encodeLockWithExpEpochList].
//
// The decoded list is a sequence of lock IDs (objectKeySize bytes), each
// optionally followed by its expiration epoch (epochSize bytes).
// If some locks have no expiration epoch, uses [NoExpirationEpoch] instead.
//
// Returns an error if the list itself cannot be decoded, if an epoch appears
// without a preceding lock ID, or if an element has any other size.
func decodeLockWithExpEpochList(data []byte) (locks []lockWithExpEpoch, err error) {
	items, err := decodeList(data)
	if err != nil {
		return nil, fmt.Errorf("couldn't decode list: %w", err)
	}

	// pending is a lock whose ID has been read but whose optional epoch has
	// not been seen yet; it is only meaningful while hasPending is true.
	var (
		pending    lockWithExpEpoch
		hasPending bool
	)

	for _, item := range items {
		switch len(item) {
		case objectKeySize:
			// A new ID closes the previous lock, which had no epoch.
			if hasPending {
				locks = append(locks, pending)
			}
			pending = lockWithExpEpoch{}
			copy(pending.id[:], item)
			hasPending = true
		case epochSize:
			if !hasPending {
				return nil, errors.New("found expiration epoch but expected lock")
			}
			copy(pending.expEpoch[:], item)
			locks = append(locks, pending)
			hasPending = false
		default:
			return nil, fmt.Errorf("unexpected list element size %d", len(item))
		}
	}

	// The last lock in the list may come without an epoch.
	if hasPending {
		locks = append(locks, pending)
	}

	return locks, nil
}
// encodeLockWithExpEpochList encodes the lock list.
//
// Each lock contributes its ID as one list element, followed by its expiration
// epoch as a second element.
// If some locks have [NoExpirationEpoch], encodes only their IDs.
//
// Returns the encoded list, or a wrapped error if encoding fails.
func encodeLockWithExpEpochList(locks []lockWithExpEpoch) (data []byte, err error) {
	var noEpoch [epochSize]byte

	var xs [][]byte
	for i := range locks {
		// Slice the element in place rather than a range copy: slices of the
		// loop variable would all alias one array under pre-Go-1.22 loop
		// semantics, and would force a heap copy per iteration otherwise.
		lock := &locks[i]

		xs = append(xs, lock.id[:])
		if lock.expEpoch != noEpoch {
			xs = append(xs, lock.expEpoch[:])
		}
	}

	data, err = encodeList(xs)
	if err != nil {
		return nil, fmt.Errorf("couldn't encode list: %w", err)
	}
	return data, nil
}