forked from TrueCloudLab/frostfs-node
Aleksey Savchuk
6ccac0a476
This function was very obfuscated. I hope the newer version is more clear, but IMHO it keeps being bad because: - Its name is confusing because it checks both the graveyard and the garbage. - It has no interface. We use that function in several metabase methods, it just returns some 'magic' uint8 numbers and has no doc comment, I mean it's ridiculous. - It checks out for 'the node being in incorrect state' for some reason but that result isn't used further. I kept a comment about that but it has no logic for me. Signed-off-by: Aleksey Savchuk <a.savchuk@yadro.com>
540 lines
14 KiB
Go
540 lines
14 KiB
Go
package meta
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"time"
|
|
|
|
"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/core/object"
|
|
storagelog "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/local_object_storage/internal/log"
|
|
"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/local_object_storage/internal/metaerr"
|
|
"git.frostfs.info/TrueCloudLab/frostfs-observability/tracing"
|
|
"git.frostfs.info/TrueCloudLab/frostfs-sdk-go/client"
|
|
cid "git.frostfs.info/TrueCloudLab/frostfs-sdk-go/container/id"
|
|
objectSDK "git.frostfs.info/TrueCloudLab/frostfs-sdk-go/object"
|
|
oid "git.frostfs.info/TrueCloudLab/frostfs-sdk-go/object/id"
|
|
"go.etcd.io/bbolt"
|
|
"go.opentelemetry.io/otel/attribute"
|
|
"go.opentelemetry.io/otel/trace"
|
|
)
|
|
|
|
// errFailedToRemoveUniqueIndexes is returned by deleteObject when the unique
// index records of an object could not be removed.
var errFailedToRemoveUniqueIndexes = errors.New("can't remove unique indexes")
|
|
|
|
// DeletePrm groups the parameters of Delete operation.
|
|
type DeletePrm struct {
|
|
addrs []oid.Address
|
|
}
|
|
|
|
// DeleteRes groups the resulting values of Delete operation.
|
|
type DeleteRes struct {
|
|
phyCount uint64
|
|
logicCount uint64
|
|
userCount uint64
|
|
phySize uint64
|
|
logicSize uint64
|
|
removedByCnrID map[cid.ID]ObjectCounters
|
|
}
|
|
|
|
// LogicCount returns the number of removed logic
|
|
// objects.
|
|
func (d DeleteRes) LogicCount() uint64 {
|
|
return d.logicCount
|
|
}
|
|
|
|
func (d DeleteRes) UserCount() uint64 {
|
|
return d.userCount
|
|
}
|
|
|
|
// RemovedByCnrID returns the number of removed objects by container ID.
|
|
func (d DeleteRes) RemovedByCnrID() map[cid.ID]ObjectCounters {
|
|
return d.removedByCnrID
|
|
}
|
|
|
|
// PhyCount returns the number of removed physical objects.
|
|
func (d DeleteRes) PhyCount() uint64 {
|
|
return d.phyCount
|
|
}
|
|
|
|
// PhySize returns the size of removed physical objects.
|
|
func (d DeleteRes) PhySize() uint64 {
|
|
return d.phySize
|
|
}
|
|
|
|
// LogicSize returns the size of removed logical objects.
|
|
func (d DeleteRes) LogicSize() uint64 {
|
|
return d.logicSize
|
|
}
|
|
|
|
// SetAddresses is a Delete option to set the addresses of the objects to delete.
|
|
//
|
|
// Option is required.
|
|
func (p *DeletePrm) SetAddresses(addrs ...oid.Address) {
|
|
p.addrs = addrs
|
|
}
|
|
|
|
type referenceNumber struct {
|
|
all, cur int
|
|
|
|
obj *objectSDK.Object
|
|
}
|
|
|
|
// referenceCounter maps the string form of a parent's address key to the
// reference bookkeeping of that parent.
type referenceCounter map[string]*referenceNumber
|
|
|
|
// Delete removed object records from metabase indexes.
|
|
func (db *DB) Delete(ctx context.Context, prm DeletePrm) (DeleteRes, error) {
|
|
var (
|
|
startedAt = time.Now()
|
|
deleted = false
|
|
)
|
|
defer func() {
|
|
db.metrics.AddMethodDuration("Delete", time.Since(startedAt), deleted)
|
|
}()
|
|
|
|
_, span := tracing.StartSpanFromContext(ctx, "metabase.Delete",
|
|
trace.WithAttributes(
|
|
attribute.Int("addr_count", len(prm.addrs)),
|
|
))
|
|
defer span.End()
|
|
|
|
db.modeMtx.RLock()
|
|
defer db.modeMtx.RUnlock()
|
|
|
|
if db.mode.NoMetabase() {
|
|
return DeleteRes{}, ErrDegradedMode
|
|
} else if db.mode.ReadOnly() {
|
|
return DeleteRes{}, ErrReadOnlyMode
|
|
}
|
|
|
|
var err error
|
|
var res DeleteRes
|
|
|
|
err = db.boltDB.Batch(func(tx *bbolt.Tx) error {
|
|
res, err = db.deleteGroup(tx, prm.addrs)
|
|
return err
|
|
})
|
|
if err == nil {
|
|
deleted = true
|
|
for i := range prm.addrs {
|
|
storagelog.Write(db.log,
|
|
storagelog.AddressField(prm.addrs[i]),
|
|
storagelog.OpField("metabase DELETE"))
|
|
}
|
|
}
|
|
return res, metaerr.Wrap(err)
|
|
}
|
|
|
|
// deleteGroup deletes object from the metabase. Handles removal of the
|
|
// references of the split objects.
|
|
func (db *DB) deleteGroup(tx *bbolt.Tx, addrs []oid.Address) (DeleteRes, error) {
|
|
res := DeleteRes{
|
|
removedByCnrID: make(map[cid.ID]ObjectCounters),
|
|
}
|
|
refCounter := make(referenceCounter, len(addrs))
|
|
currEpoch := db.epochState.CurrentEpoch()
|
|
|
|
for i := range addrs {
|
|
r, err := db.delete(tx, addrs[i], refCounter, currEpoch)
|
|
if err != nil {
|
|
return DeleteRes{}, err
|
|
}
|
|
|
|
applyDeleteSingleResult(r, &res, addrs, i)
|
|
}
|
|
|
|
if err := db.updateCountersDelete(tx, res); err != nil {
|
|
return DeleteRes{}, err
|
|
}
|
|
|
|
for _, refNum := range refCounter {
|
|
if refNum.cur == refNum.all {
|
|
err := db.deleteObject(tx, refNum.obj, true)
|
|
if err != nil {
|
|
return DeleteRes{}, err
|
|
}
|
|
}
|
|
}
|
|
|
|
return res, nil
|
|
}
|
|
|
|
func (db *DB) updateCountersDelete(tx *bbolt.Tx, res DeleteRes) error {
|
|
if res.phyCount > 0 {
|
|
err := db.updateShardObjectCounter(tx, phy, res.phyCount, false)
|
|
if err != nil {
|
|
return fmt.Errorf("could not decrease phy object counter: %w", err)
|
|
}
|
|
}
|
|
|
|
if res.logicCount > 0 {
|
|
err := db.updateShardObjectCounter(tx, logical, res.logicCount, false)
|
|
if err != nil {
|
|
return fmt.Errorf("could not decrease logical object counter: %w", err)
|
|
}
|
|
}
|
|
|
|
if res.userCount > 0 {
|
|
err := db.updateShardObjectCounter(tx, user, res.userCount, false)
|
|
if err != nil {
|
|
return fmt.Errorf("could not decrease user object counter: %w", err)
|
|
}
|
|
}
|
|
|
|
if err := db.updateContainerCounter(tx, res.removedByCnrID, false); err != nil {
|
|
return fmt.Errorf("could not decrease container object counter: %w", err)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func applyDeleteSingleResult(r deleteSingleResult, res *DeleteRes, addrs []oid.Address, i int) {
|
|
if r.Phy {
|
|
if v, ok := res.removedByCnrID[addrs[i].Container()]; ok {
|
|
v.Phy++
|
|
res.removedByCnrID[addrs[i].Container()] = v
|
|
} else {
|
|
res.removedByCnrID[addrs[i].Container()] = ObjectCounters{
|
|
Phy: 1,
|
|
}
|
|
}
|
|
|
|
res.phyCount++
|
|
res.phySize += r.Size
|
|
}
|
|
|
|
if r.Logic {
|
|
if v, ok := res.removedByCnrID[addrs[i].Container()]; ok {
|
|
v.Logic++
|
|
res.removedByCnrID[addrs[i].Container()] = v
|
|
} else {
|
|
res.removedByCnrID[addrs[i].Container()] = ObjectCounters{
|
|
Logic: 1,
|
|
}
|
|
}
|
|
|
|
res.logicCount++
|
|
res.logicSize += r.Size
|
|
}
|
|
|
|
if r.User {
|
|
if v, ok := res.removedByCnrID[addrs[i].Container()]; ok {
|
|
v.User++
|
|
res.removedByCnrID[addrs[i].Container()] = v
|
|
} else {
|
|
res.removedByCnrID[addrs[i].Container()] = ObjectCounters{
|
|
User: 1,
|
|
}
|
|
}
|
|
|
|
res.userCount++
|
|
}
|
|
}
|
|
|
|
// deleteSingleResult describes the outcome of removing a single object.
type deleteSingleResult struct {
	// Phy reports that the object counts towards the physical counter.
	Phy bool
	// Logic reports that the object counts towards the logical counter.
	Logic bool
	// User reports that the object counts towards the user counter.
	User bool
	// Size is the payload size of the removed object.
	Size uint64
}
|
|
|
|
// delete removes object indexes from the metabase. Counts the references
|
|
// of the object that is being removed.
|
|
// The first return value indicates if an object has been removed. (removing a
|
|
// non-exist object is error-free). The second return value indicates if an
|
|
// object was available before the removal (for calculating the logical object
|
|
// counter). The third return value The fourth return value is removed object payload size.
|
|
func (db *DB) delete(tx *bbolt.Tx, addr oid.Address, refCounter referenceCounter, currEpoch uint64) (deleteSingleResult, error) {
|
|
key := make([]byte, addressKeySize)
|
|
addrKey := addressKey(addr, key)
|
|
garbageBKT := tx.Bucket(garbageBucketName)
|
|
|
|
removeAvailableObject := inGraveyardWithKey(tx, addrKey) == 0
|
|
|
|
// unmarshal object, work only with physically stored (raw == true) objects
|
|
obj, err := db.get(tx, addr, key, false, true, currEpoch)
|
|
if err != nil {
|
|
if client.IsErrObjectNotFound(err) {
|
|
addrKey = addressKey(addr, key)
|
|
if garbageBKT != nil {
|
|
err := garbageBKT.Delete(addrKey)
|
|
if err != nil {
|
|
return deleteSingleResult{}, fmt.Errorf("could not remove from garbage bucket: %w", err)
|
|
}
|
|
}
|
|
return deleteSingleResult{}, nil
|
|
}
|
|
var siErr *objectSDK.SplitInfoError
|
|
var ecErr *objectSDK.ECInfoError
|
|
if errors.As(err, &siErr) || errors.As(err, &ecErr) {
|
|
// if object is virtual (parent) then do nothing, it will be deleted with last child
|
|
// if object is erasure-coded it will be deleted with the last chunk presented on the shard
|
|
return deleteSingleResult{}, nil
|
|
}
|
|
|
|
return deleteSingleResult{}, err
|
|
}
|
|
|
|
addrKey = addressKey(addr, key)
|
|
// remove record from the garbage bucket
|
|
if garbageBKT != nil {
|
|
err := garbageBKT.Delete(addrKey)
|
|
if err != nil {
|
|
return deleteSingleResult{}, fmt.Errorf("could not remove from garbage bucket: %w", err)
|
|
}
|
|
}
|
|
|
|
// if object is an only link to a parent, then remove parent
|
|
if parent := obj.Parent(); parent != nil {
|
|
parAddr := object.AddressOf(parent)
|
|
sParAddr := addressKey(parAddr, key)
|
|
k := string(sParAddr)
|
|
|
|
nRef, ok := refCounter[k]
|
|
if !ok {
|
|
nRef = &referenceNumber{
|
|
all: parentLength(tx, parAddr),
|
|
obj: parent,
|
|
}
|
|
|
|
refCounter[k] = nRef
|
|
}
|
|
|
|
nRef.cur++
|
|
}
|
|
|
|
isUserObject := IsUserObject(obj)
|
|
|
|
// remove object
|
|
err = db.deleteObject(tx, obj, false)
|
|
if err != nil {
|
|
return deleteSingleResult{}, fmt.Errorf("could not remove object: %w", err)
|
|
}
|
|
|
|
if err := deleteECRelatedInfo(tx, garbageBKT, obj, addr.Container(), refCounter); err != nil {
|
|
return deleteSingleResult{}, err
|
|
}
|
|
|
|
return deleteSingleResult{
|
|
Phy: true,
|
|
Logic: removeAvailableObject,
|
|
User: isUserObject && removeAvailableObject,
|
|
Size: obj.PayloadSize(),
|
|
}, nil
|
|
}
|
|
|
|
func (db *DB) deleteObject(
|
|
tx *bbolt.Tx,
|
|
obj *objectSDK.Object,
|
|
isParent bool,
|
|
) error {
|
|
err := delUniqueIndexes(tx, obj, isParent)
|
|
if err != nil {
|
|
return errFailedToRemoveUniqueIndexes
|
|
}
|
|
|
|
err = updateListIndexes(tx, obj, delListIndexItem)
|
|
if err != nil {
|
|
return fmt.Errorf("can't remove list indexes: %w", err)
|
|
}
|
|
|
|
if isParent {
|
|
// remove record from the garbage bucket, because regular object deletion does nothing for virtual object
|
|
garbageBKT := tx.Bucket(garbageBucketName)
|
|
if garbageBKT != nil {
|
|
key := make([]byte, addressKeySize)
|
|
addrKey := addressKey(object.AddressOf(obj), key)
|
|
err := garbageBKT.Delete(addrKey)
|
|
if err != nil {
|
|
return fmt.Errorf("could not remove from garbage bucket: %w", err)
|
|
}
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// parentLength returns amount of available children from parentid index.
|
|
func parentLength(tx *bbolt.Tx, addr oid.Address) int {
|
|
bucketName := make([]byte, bucketKeySize)
|
|
|
|
bkt := tx.Bucket(parentBucketName(addr.Container(), bucketName[:]))
|
|
if bkt == nil {
|
|
return 0
|
|
}
|
|
|
|
lst, err := decodeList(bkt.Get(objectKey(addr.Object(), bucketName[:])))
|
|
if err != nil {
|
|
return 0
|
|
}
|
|
|
|
return len(lst)
|
|
}
|
|
|
|
func delUniqueIndexItem(tx *bbolt.Tx, item namedBucketItem) {
|
|
bkt := tx.Bucket(item.name)
|
|
if bkt != nil {
|
|
_ = bkt.Delete(item.key) // ignore error, best effort there
|
|
}
|
|
}
|
|
|
|
func delListIndexItem(tx *bbolt.Tx, item namedBucketItem) error {
|
|
bkt := tx.Bucket(item.name)
|
|
if bkt == nil {
|
|
return nil
|
|
}
|
|
|
|
lst, err := decodeList(bkt.Get(item.key))
|
|
if err != nil || len(lst) == 0 {
|
|
return nil
|
|
}
|
|
|
|
// remove element from the list
|
|
for i := range lst {
|
|
if bytes.Equal(item.val, lst[i]) {
|
|
copy(lst[i:], lst[i+1:])
|
|
lst = lst[:len(lst)-1]
|
|
break
|
|
}
|
|
}
|
|
|
|
// if list empty, remove the key from <list> bucket
|
|
if len(lst) == 0 {
|
|
_ = bkt.Delete(item.key) // ignore error, best effort there
|
|
|
|
return nil
|
|
}
|
|
|
|
// if list is not empty, then update it
|
|
encodedLst, err := encodeList(lst)
|
|
if err != nil {
|
|
return nil // ignore error, best effort there
|
|
}
|
|
|
|
_ = bkt.Put(item.key, encodedLst) // ignore error, best effort there
|
|
return nil
|
|
}
|
|
|
|
func delUniqueIndexes(tx *bbolt.Tx, obj *objectSDK.Object, isParent bool) error {
|
|
addr := object.AddressOf(obj)
|
|
|
|
objKey := objectKey(addr.Object(), make([]byte, objectKeySize))
|
|
cnr := addr.Container()
|
|
bucketName := make([]byte, bucketKeySize)
|
|
|
|
// add value to primary unique bucket
|
|
if !isParent {
|
|
switch obj.Type() {
|
|
case objectSDK.TypeRegular:
|
|
bucketName = primaryBucketName(cnr, bucketName)
|
|
case objectSDK.TypeTombstone:
|
|
bucketName = tombstoneBucketName(cnr, bucketName)
|
|
case objectSDK.TypeLock:
|
|
bucketName = bucketNameLockers(cnr, bucketName)
|
|
default:
|
|
return ErrUnknownObjectType
|
|
}
|
|
|
|
delUniqueIndexItem(tx, namedBucketItem{
|
|
name: bucketName,
|
|
key: objKey,
|
|
})
|
|
} else {
|
|
delUniqueIndexItem(tx, namedBucketItem{
|
|
name: parentBucketName(cnr, bucketName),
|
|
key: objKey,
|
|
})
|
|
}
|
|
|
|
delUniqueIndexItem(tx, namedBucketItem{ // remove from storage id index
|
|
name: smallBucketName(cnr, bucketName),
|
|
key: objKey,
|
|
})
|
|
delUniqueIndexItem(tx, namedBucketItem{ // remove from root index
|
|
name: rootBucketName(cnr, bucketName),
|
|
key: objKey,
|
|
})
|
|
|
|
if expEpoch, ok := hasExpirationEpoch(obj); ok {
|
|
delUniqueIndexItem(tx, namedBucketItem{
|
|
name: expEpochToObjectBucketName,
|
|
key: expirationEpochKey(expEpoch, cnr, addr.Object()),
|
|
})
|
|
delUniqueIndexItem(tx, namedBucketItem{
|
|
name: objectToExpirationEpochBucketName(cnr, make([]byte, bucketKeySize)),
|
|
key: objKey,
|
|
})
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func deleteECRelatedInfo(tx *bbolt.Tx, garbageBKT *bbolt.Bucket, obj *objectSDK.Object, cnr cid.ID, refCounter referenceCounter) error {
|
|
ech := obj.ECHeader()
|
|
if ech == nil {
|
|
return nil
|
|
}
|
|
|
|
hasAnyChunks := hasAnyECChunks(tx, ech, cnr)
|
|
// drop EC parent GC mark if current EC chunk is the last one
|
|
if !hasAnyChunks && garbageBKT != nil {
|
|
var ecParentAddress oid.Address
|
|
ecParentAddress.SetContainer(cnr)
|
|
ecParentAddress.SetObject(ech.Parent())
|
|
addrKey := addressKey(ecParentAddress, make([]byte, addressKeySize))
|
|
err := garbageBKT.Delete(addrKey)
|
|
if err != nil {
|
|
return fmt.Errorf("could not remove EC parent from garbage bucket: %w", err)
|
|
}
|
|
}
|
|
|
|
// also drop EC parent root info if current EC chunk is the last one
|
|
if !hasAnyChunks {
|
|
delUniqueIndexItem(tx, namedBucketItem{
|
|
name: rootBucketName(cnr, make([]byte, bucketKeySize)),
|
|
key: objectKey(ech.Parent(), make([]byte, objectKeySize)),
|
|
})
|
|
}
|
|
|
|
if ech.ParentSplitParentID() == nil {
|
|
return nil
|
|
}
|
|
|
|
var splitParentAddress oid.Address
|
|
splitParentAddress.SetContainer(cnr)
|
|
splitParentAddress.SetObject(*ech.ParentSplitParentID())
|
|
|
|
if ref, ok := refCounter[string(addressKey(splitParentAddress, make([]byte, addressKeySize)))]; ok {
|
|
// linking object is already processing
|
|
// so just inform that one more reference was deleted
|
|
// split info and gc marks will be deleted after linking object delete
|
|
ref.cur++
|
|
return nil
|
|
}
|
|
|
|
if parentLength(tx, splitParentAddress) > 0 {
|
|
// linking object still exists, so leave split info and gc mark deletion for linking object processing
|
|
return nil
|
|
}
|
|
|
|
// drop split parent gc mark
|
|
if garbageBKT != nil {
|
|
addrKey := addressKey(splitParentAddress, make([]byte, addressKeySize))
|
|
err := garbageBKT.Delete(addrKey)
|
|
if err != nil {
|
|
return fmt.Errorf("could not remove EC parent from garbage bucket: %w", err)
|
|
}
|
|
}
|
|
|
|
// drop split info
|
|
delUniqueIndexItem(tx, namedBucketItem{
|
|
name: rootBucketName(cnr, make([]byte, bucketKeySize)),
|
|
key: objectKey(*ech.ParentSplitParentID(), make([]byte, objectKeySize)),
|
|
})
|
|
return nil
|
|
}
|
|
|
|
func hasAnyECChunks(tx *bbolt.Tx, ech *objectSDK.ECHeader, cnr cid.ID) bool {
|
|
data := getFromBucket(tx, ecInfoBucketName(cnr, make([]byte, bucketKeySize)),
|
|
objectKey(ech.Parent(), make([]byte, objectKeySize)))
|
|
return len(data) > 0
|
|
}
|