454 lines
14 KiB
Go
454 lines
14 KiB
Go
package meta
|
|
|
|
import (
|
|
"crypto/sha256"
|
|
"encoding/binary"
|
|
"errors"
|
|
"fmt"
|
|
|
|
cid "git.frostfs.info/TrueCloudLab/frostfs-sdk-go/container/id"
|
|
objectSDK "git.frostfs.info/TrueCloudLab/frostfs-sdk-go/object"
|
|
oid "git.frostfs.info/TrueCloudLab/frostfs-sdk-go/object/id"
|
|
"go.etcd.io/bbolt"
|
|
)
|
|
|
|
var (
|
|
// graveyardBucketName stores rows with the objects that have been
|
|
// covered with Tombstone objects. That objects should not be returned
|
|
// from the node and should not be accepted by the node from other
|
|
// nodes.
|
|
graveyardBucketName = []byte{graveyardPrefix}
|
|
// garbageBucketName stores rows with the objects that should be physically
|
|
// deleted by the node (Garbage Collector routine).
|
|
garbageBucketName = []byte{garbagePrefix}
|
|
toMoveItBucketName = []byte{toMoveItPrefix}
|
|
containerVolumeBucketName = []byte{containerVolumePrefix}
|
|
containerCounterBucketName = []byte{containerCountersPrefix}
|
|
expEpochToObjectBucketName = []byte{expirationEpochToObjectPrefix}
|
|
|
|
zeroValue = []byte{0xFF}
|
|
|
|
errInvalidLength = errors.New("invalid length")
|
|
)
|
|
|
|
// Prefix bytes used in database keys. Unless stated otherwise, all IDs and
// addresses are stored in binary form.
//
// The values are assigned sequentially by iota: inserting, removing or
// reordering an entry changes the value of every entry after it.
//
//nolint:godot
const (
	// graveyardPrefix: the graveyard bucket.
	//  Key: object address
	//  Value: tombstone address
	graveyardPrefix = iota
	// garbagePrefix: the garbage bucket.
	//  Key: object address
	//  Value: dummy value
	garbagePrefix
	// toMoveItPrefix: bucket with IDs of objects that are candidates for
	// moving to another shard.
	toMoveItPrefix
	// containerVolumePrefix: container size estimations.
	//  Key: container ID
	//  Value: container size in bytes as little-endian uint64
	containerVolumePrefix
	// lockedPrefix: information about locked objects.
	//  Key: container ID
	//  Value: bucket mapping objects locked to the list of corresponding LOCK objects.
	lockedPrefix
	// shardInfoPrefix: shard ID storage. All keys are custom and are not
	// connected to the container.
	shardInfoPrefix

	// ======================
	// Unique index buckets.
	// ======================

	// primaryPrefix: buckets containing objects of REGULAR type.
	//  Key: object ID
	//  Value: marshaled object
	primaryPrefix
	// lockersPrefix: buckets containing objects of LOCK type.
	//  Key: object ID
	//  Value: marshaled object
	lockersPrefix
	// _ is unused; the slot is kept so that later values do not shift.
	// It previously prefixed buckets containing objects of STORAGEGROUP type.
	//  Key: object ID
	//  Value: marshaled object
	_
	// tombstonePrefix: buckets containing objects of TOMBSTONE type.
	//  Key: object ID
	//  Value: marshaled object
	tombstonePrefix
	// smallPrefix: buckets mapping objects to the blobovniczas they are
	// stored in.
	//  Key: object ID
	//  Value: blobovnicza ID
	smallPrefix
	// rootPrefix: buckets mapping a parent object to the split info.
	//  Key: object ID
	//  Value: split info
	rootPrefix

	// ====================
	// FKBT index buckets.
	// ====================

	// ownerPrefix was used for FKBT index buckets mapping owner to object IDs.
	//  Key: owner ID
	//  Value: bucket containing object IDs as keys
	// removed in version 3
	ownerPrefix
	// userAttributePrefix was used for FKBT index buckets containing objects.
	//  Key: attribute value
	//  Value: bucket containing object IDs as keys
	userAttributePrefix

	// ====================
	// List index buckets.
	// ====================

	// payloadHashPrefix was used for List index buckets mapping payload hash
	// to a list of object IDs.
	//  Key: payload hash
	//  Value: list of object IDs
	// removed in version 3
	payloadHashPrefix
	// parentPrefix: List index buckets mapping parent ID to a list of
	// children IDs.
	//  Key: parent ID
	//  Value: list of object IDs
	parentPrefix
	// splitPrefix: List index buckets mapping split ID to a list of object IDs.
	//  Key: split ID
	//  Value: list of object IDs
	splitPrefix

	// containerCountersPrefix: container object counters.
	//  Key: container ID + type
	//  Value: container size in bytes as little-endian uint64
	// NOTE(review): the value description is word for word the one of
	// containerVolumePrefix — confirm it is accurate for counters.
	containerCountersPrefix

	// ecInfoPrefix: relation between EC parent id and chunk id.
	//  Key: container ID + type
	//  Value: Object id
	// NOTE(review): the key description repeats the one of
	// containerCountersPrefix — confirm it is accurate here.
	ecInfoPrefix

	// expirationEpochToObjectPrefix: relation from expiration epoch to object.
	//  Key: expiration epoch + object address
	//  Value: zero
	expirationEpochToObjectPrefix

	// objectToExpirationEpochPrefix: relation from object to its expiration
	// epoch.
	//  Key: object address
	//  Value: expiration epoch
	objectToExpirationEpochPrefix
)
|
|
|
|
// Sizes, in bytes, of the fixed-width parts the database keys are built from.
const (
	// cidSize is the size of a container ID inside a key.
	cidSize = sha256.Size
	// objectKeySize is the size of an object ID inside a key.
	objectKeySize = sha256.Size
	// epochSize is the size of an epoch number inside a key or value.
	epochSize = 8

	// bucketKeySize is the size of a per-container bucket name:
	// one prefix byte followed by the container ID.
	bucketKeySize = cidSize + 1
	// addressKeySize is the size of an object address:
	// container ID followed by object ID.
	addressKeySize = cidSize + objectKeySize
)
|
|
|
|
func bucketName(cnr cid.ID, prefix byte, key []byte) []byte {
|
|
key[0] = prefix
|
|
cnr.Encode(key[1:])
|
|
return key[:bucketKeySize]
|
|
}
|
|
|
|
// primaryBucketName returns <CID>.
|
|
func primaryBucketName(cnr cid.ID, key []byte) []byte {
|
|
return bucketName(cnr, primaryPrefix, key)
|
|
}
|
|
|
|
// tombstoneBucketName returns <CID>_TS.
|
|
func tombstoneBucketName(cnr cid.ID, key []byte) []byte {
|
|
return bucketName(cnr, tombstonePrefix, key)
|
|
}
|
|
|
|
// smallBucketName returns <CID>_small.
|
|
func smallBucketName(cnr cid.ID, key []byte) []byte {
|
|
return bucketName(cnr, smallPrefix, key)
|
|
}
|
|
|
|
// attributeBucketName returns <CID>_<attributeKey>.
|
|
func attributeBucketName(cnr cid.ID, attributeKey string, key []byte) []byte {
|
|
key[0] = userAttributePrefix
|
|
cnr.Encode(key[1:])
|
|
return append(key[:bucketKeySize], attributeKey...)
|
|
}
|
|
|
|
func cidFromAttributeBucket(bucketName []byte) (cid.ID, bool) {
|
|
if len(bucketName) < bucketKeySize || bucketName[0] != userAttributePrefix {
|
|
return cid.ID{}, false
|
|
}
|
|
var result cid.ID
|
|
return result, result.Decode(bucketName[1:bucketKeySize]) == nil
|
|
}
|
|
|
|
func attributeFromAttributeBucket(bucketName []byte) (string, bool) {
|
|
if len(bucketName) < bucketKeySize || bucketName[0] != userAttributePrefix {
|
|
return "", false
|
|
}
|
|
return string(bucketName[bucketKeySize:]), true
|
|
}
|
|
|
|
// rootBucketName returns <CID>_root.
|
|
func rootBucketName(cnr cid.ID, key []byte) []byte {
|
|
return bucketName(cnr, rootPrefix, key)
|
|
}
|
|
|
|
// parentBucketName returns <CID>_parent.
|
|
func parentBucketName(cnr cid.ID, key []byte) []byte {
|
|
return bucketName(cnr, parentPrefix, key)
|
|
}
|
|
|
|
// splitBucketName returns <CID>_splitid.
|
|
func splitBucketName(cnr cid.ID, key []byte) []byte {
|
|
return bucketName(cnr, splitPrefix, key)
|
|
}
|
|
|
|
// ecInfoBucketName returns <CID>_ecinfo.
|
|
func ecInfoBucketName(cnr cid.ID, key []byte) []byte {
|
|
return bucketName(cnr, ecInfoPrefix, key)
|
|
}
|
|
|
|
// objectToExpirationEpochBucketName returns objectToExpirationEpochPrefix_<CID>.
|
|
func objectToExpirationEpochBucketName(cnr cid.ID, key []byte) []byte {
|
|
return bucketName(cnr, objectToExpirationEpochPrefix, key)
|
|
}
|
|
|
|
func expirationEpochKey(epoch uint64, cnr cid.ID, obj oid.ID) []byte {
|
|
result := make([]byte, epochSize+addressKeySize)
|
|
binary.BigEndian.PutUint64(result, epoch)
|
|
cnr.Encode(result[epochSize:])
|
|
obj.Encode(result[epochSize+cidSize:])
|
|
return result
|
|
}
|
|
|
|
func parseExpirationEpochKey(key []byte) (uint64, cid.ID, oid.ID, error) {
|
|
if len(key) != epochSize+addressKeySize {
|
|
return 0, cid.ID{}, oid.ID{}, fmt.Errorf("unexpected expiration epoch to object key length: %d", len(key))
|
|
}
|
|
epoch := binary.BigEndian.Uint64(key)
|
|
var cnr cid.ID
|
|
if err := cnr.Decode(key[epochSize : epochSize+cidSize]); err != nil {
|
|
return 0, cid.ID{}, oid.ID{}, fmt.Errorf("failed to decode expiration epoch to object key (container ID): %w", err)
|
|
}
|
|
var obj oid.ID
|
|
if err := obj.Decode(key[epochSize+cidSize:]); err != nil {
|
|
return 0, cid.ID{}, oid.ID{}, fmt.Errorf("failed to decode expiration epoch to object key (object ID): %w", err)
|
|
}
|
|
return epoch, cnr, obj, nil
|
|
}
|
|
|
|
// addressKey returns key for K-V tables when key is a whole address.
|
|
func addressKey(addr oid.Address, key []byte) []byte {
|
|
addr.Container().Encode(key)
|
|
addr.Object().Encode(key[cidSize:])
|
|
return key[:addressKeySize]
|
|
}
|
|
|
|
// parses object address formed by addressKey.
|
|
func decodeAddressFromKey(dst *oid.Address, k []byte) error {
|
|
if len(k) != addressKeySize {
|
|
return errInvalidLength
|
|
}
|
|
|
|
var cnr cid.ID
|
|
if err := cnr.Decode(k[:cidSize]); err != nil {
|
|
return err
|
|
}
|
|
|
|
var obj oid.ID
|
|
if err := obj.Decode(k[cidSize:]); err != nil {
|
|
return err
|
|
}
|
|
|
|
dst.SetObject(obj)
|
|
dst.SetContainer(cnr)
|
|
return nil
|
|
}
|
|
|
|
// objectKey returns key for K-V tables when key is an object id.
|
|
func objectKey(obj oid.ID, key []byte) []byte {
|
|
obj.Encode(key)
|
|
return key[:objectKeySize]
|
|
}
|
|
|
|
// if meets irregular object container in objs - returns its type, otherwise returns object.TypeRegular.
|
|
//
|
|
// firstIrregularObjectType(tx, cnr, obj) usage allows getting object type.
|
|
func firstIrregularObjectType(tx *bbolt.Tx, idCnr cid.ID, objs ...[]byte) objectSDK.Type {
|
|
if len(objs) == 0 {
|
|
panic("empty object list in firstIrregularObjectType")
|
|
}
|
|
|
|
var keys [2][1 + cidSize]byte
|
|
|
|
irregularTypeBuckets := [...]struct {
|
|
typ objectSDK.Type
|
|
name []byte
|
|
}{
|
|
{objectSDK.TypeTombstone, tombstoneBucketName(idCnr, keys[0][:])},
|
|
{objectSDK.TypeLock, bucketNameLockers(idCnr, keys[1][:])},
|
|
}
|
|
|
|
for i := range objs {
|
|
for j := range irregularTypeBuckets {
|
|
if inBucket(tx, irregularTypeBuckets[j].name, objs[i]) {
|
|
return irregularTypeBuckets[j].typ
|
|
}
|
|
}
|
|
}
|
|
|
|
return objectSDK.TypeRegular
|
|
}
|
|
|
|
// return true if provided object is of LOCK type.
|
|
func isLockObject(tx *bbolt.Tx, idCnr cid.ID, obj oid.ID) bool {
|
|
return inBucket(tx,
|
|
bucketNameLockers(idCnr, make([]byte, bucketKeySize)),
|
|
objectKey(obj, make([]byte, objectKeySize)))
|
|
}
|
|
|
|
// NoExpirationEpoch is the epoch value that stands for a missing expiration
// epoch.
const NoExpirationEpoch = uint64(0)
|
|
|
|
// encodeTombstoneWithExpEpoch encodes a tombstone label in the following
|
|
// format: tombstone_address + expiration_epoch.
|
|
//
|
|
// Returns an error if the buffer length isn't 32.
|
|
//
|
|
// The expiration epoch shouldn't be [NoExpirationEpoch], as tombstone labels
|
|
// are intended to have a valid expiration epoch.
|
|
//
|
|
// The use of [NoExpirationEpoch] is allowed only for test purposes.
|
|
func encodeTombstoneWithExpEpoch(addr oid.Address, expEpoch uint64, dst []byte) error {
|
|
if len(dst) != addressKeySize+epochSize {
|
|
return errInvalidLength
|
|
}
|
|
|
|
addr.Container().Encode(dst[:cidSize])
|
|
addr.Object().Encode(dst[cidSize:addressKeySize])
|
|
binary.LittleEndian.PutUint64(dst[addressKeySize:], expEpoch)
|
|
|
|
return nil
|
|
}
|
|
|
|
// decodeTombstoneWithExpEpoch decodes a tombstone label in the following
|
|
// formats: tombstone address or tombstone address + expiration epoch.
|
|
//
|
|
// Expiration epoch is set to [NoExpirationEpoch] if the label doesn't have one.
|
|
func decodeTombstoneWithExpEpoch(addr *oid.Address, expEpoch *uint64, src []byte) error {
|
|
if len(src) != addressKeySize && len(src) != addressKeySize+epochSize {
|
|
return errInvalidLength
|
|
}
|
|
|
|
var cnt cid.ID
|
|
if err := cnt.Decode(src[:cidSize]); err != nil {
|
|
return err
|
|
}
|
|
|
|
var obj oid.ID
|
|
if err := obj.Decode(src[cidSize:addressKeySize]); err != nil {
|
|
return err
|
|
}
|
|
|
|
addr.SetContainer(cnt)
|
|
addr.SetObject(obj)
|
|
|
|
if len(src) > addressKeySize {
|
|
*expEpoch = binary.LittleEndian.Uint64(src[addressKeySize:])
|
|
} else {
|
|
*expEpoch = NoExpirationEpoch
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// lockWithExpEpoch contains the ID and expiration epoch of the lock.
|
|
type lockWithExpEpoch struct {
|
|
id [objectKeySize]byte
|
|
expEpoch [epochSize]byte
|
|
}
|
|
|
|
// decode decodes the ID and expiration epoch of the lock.
|
|
//
|
|
// If the lock has no expiration epoch, uses [NoExpirationEpoch] instead.
|
|
func (l lockWithExpEpoch) decode(id *oid.ID, expEpoch *uint64) error {
|
|
if err := id.Decode(l.id[:]); err != nil {
|
|
return err
|
|
}
|
|
*expEpoch = binary.LittleEndian.Uint64(l.expEpoch[:])
|
|
return nil
|
|
}
|
|
|
|
// encode encodes the ID and expiration epoch of the lock.
|
|
func (l *lockWithExpEpoch) encode(id oid.ID, expEpoch uint64) {
|
|
id.Encode(l.id[:])
|
|
binary.LittleEndian.PutUint64(l.expEpoch[:], expEpoch)
|
|
}
|
|
|
|
// decodeLockWithExpEpochList decodes the lock list encoded with
|
|
// [encodeLockWithExpEpochList].
|
|
//
|
|
// If some locks have no expiration epoch, uses [NoExpirationEpoch] instead.
|
|
func decodeLockWithExpEpochList(data []byte) (locks []lockWithExpEpoch, err error) {
|
|
xs, err := decodeList(data)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("couldn't decode list: %w", err)
|
|
}
|
|
|
|
var id []byte
|
|
var idMatched bool
|
|
|
|
for _, x := range xs {
|
|
switch size := len(x); {
|
|
case size == objectKeySize && !idMatched:
|
|
id = x
|
|
idMatched = true
|
|
case size == objectKeySize && idMatched:
|
|
var lock lockWithExpEpoch
|
|
copy(lock.id[:], id)
|
|
locks = append(locks, lock)
|
|
id = x
|
|
case size == epochSize && idMatched:
|
|
var lock lockWithExpEpoch
|
|
copy(lock.id[:], id)
|
|
copy(lock.expEpoch[:], x)
|
|
locks = append(locks, lock)
|
|
idMatched = false
|
|
case size == epochSize && !idMatched:
|
|
return nil, errors.New("found expiration epoch but expected lock")
|
|
default:
|
|
return nil, fmt.Errorf("unexpected list element size %d", size)
|
|
}
|
|
}
|
|
|
|
if idMatched {
|
|
var lock lockWithExpEpoch
|
|
copy(lock.id[:], id)
|
|
locks = append(locks, lock)
|
|
}
|
|
|
|
return locks, nil
|
|
}
|
|
|
|
// encodeLockWithExpEpochList encodes the lock list.
|
|
//
|
|
// If some locks have [NoExpirationEpoch], encodes only their IDs.
|
|
func encodeLockWithExpEpochList(locks []lockWithExpEpoch) (data []byte, err error) {
|
|
var noEpoch [epochSize]byte
|
|
|
|
var xs [][]byte
|
|
|
|
for _, lock := range locks {
|
|
xs = append(xs, lock.id[:])
|
|
if lock.expEpoch != noEpoch {
|
|
xs = append(xs, lock.expEpoch[:])
|
|
}
|
|
}
|
|
|
|
if data, err = encodeList(xs); err != nil {
|
|
return nil, fmt.Errorf("couldn't encode list: %w", err)
|
|
}
|
|
return
|
|
}
|