forked from TrueCloudLab/frostfs-node
612 lines
15 KiB
Go
612 lines
15 KiB
Go
package meta
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"encoding/binary"
|
|
"errors"
|
|
"fmt"
|
|
gio "io"
|
|
"time"
|
|
|
|
objectCore "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/core/object"
|
|
storagelog "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/local_object_storage/internal/log"
|
|
"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/local_object_storage/internal/metaerr"
|
|
"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/local_object_storage/util"
|
|
"git.frostfs.info/TrueCloudLab/frostfs-observability/tracing"
|
|
cid "git.frostfs.info/TrueCloudLab/frostfs-sdk-go/container/id"
|
|
objectSDK "git.frostfs.info/TrueCloudLab/frostfs-sdk-go/object"
|
|
oid "git.frostfs.info/TrueCloudLab/frostfs-sdk-go/object/id"
|
|
"github.com/nspcc-dev/neo-go/pkg/io"
|
|
"go.etcd.io/bbolt"
|
|
"go.opentelemetry.io/otel/attribute"
|
|
"go.opentelemetry.io/otel/trace"
|
|
)
|
|
|
|
// namedBucketItem is a single index entry: the name of the bucket it belongs
// to plus the key/value pair to be stored in that bucket.
type namedBucketItem struct {
	name []byte
	key  []byte
	val  []byte
}
|
|
|
|
// PutPrm groups the parameters of Put operation.
|
|
type PutPrm struct {
|
|
obj *objectSDK.Object
|
|
|
|
id []byte
|
|
}
|
|
|
|
// PutRes groups the resulting values of Put operation.
type PutRes struct {
	// Inserted is true when Put took the insert path for a new record
	// rather than updating an already existing one.
	Inserted bool
}
|
|
|
|
// SetObject is a Put option to set object to save.
|
|
func (p *PutPrm) SetObject(obj *objectSDK.Object) {
|
|
p.obj = obj
|
|
}
|
|
|
|
// SetStorageID is a Put option to set storage ID to save.
|
|
func (p *PutPrm) SetStorageID(id []byte) {
|
|
p.id = id
|
|
}
|
|
|
|
// Errors reported by Put and its helpers.
var (
	// ErrUnknownObjectType is returned when the object type is none of the
	// types that have a primary bucket (regular, tombstone, lock).
	ErrUnknownObjectType = errors.New("unknown object type")

	// ErrIncorrectSplitInfoUpdate is returned when split info is updated for
	// an object that has no split info record in the root index.
	ErrIncorrectSplitInfoUpdate = errors.New("updating split info on object without it")

	// ErrIncorrectRootObject is returned when split info is derived from an
	// object that is neither a linking nor a last object.
	ErrIncorrectRootObject = errors.New("invalid root object")
)
|
|
|
|
// Put saves object header in metabase. Object payload expected to be cut.
|
|
//
|
|
// Returns an error of type apistatus.ObjectAlreadyRemoved if object has been placed in graveyard.
|
|
// Returns the object.ErrObjectIsExpired if the object is presented but already expired.
|
|
func (db *DB) Put(ctx context.Context, prm PutPrm) (res PutRes, err error) {
|
|
var (
|
|
startedAt = time.Now()
|
|
success = false
|
|
)
|
|
defer func() {
|
|
db.metrics.AddMethodDuration("Put", time.Since(startedAt), success)
|
|
}()
|
|
|
|
_, span := tracing.StartSpanFromContext(ctx, "metabase.Put",
|
|
trace.WithAttributes(
|
|
attribute.String("address", objectCore.AddressOf(prm.obj).EncodeToString()),
|
|
))
|
|
defer span.End()
|
|
|
|
db.modeMtx.RLock()
|
|
defer db.modeMtx.RUnlock()
|
|
|
|
if db.mode.NoMetabase() {
|
|
return res, ErrDegradedMode
|
|
} else if db.mode.ReadOnly() {
|
|
return res, ErrReadOnlyMode
|
|
}
|
|
|
|
currEpoch := db.epochState.CurrentEpoch()
|
|
|
|
err = db.boltDB.Batch(func(tx *bbolt.Tx) error {
|
|
var e error
|
|
res, e = db.put(tx, prm.obj, prm.id, nil, currEpoch)
|
|
return e
|
|
})
|
|
if err == nil {
|
|
success = true
|
|
storagelog.Write(db.log,
|
|
storagelog.AddressField(objectCore.AddressOf(prm.obj)),
|
|
storagelog.OpField("metabase PUT"))
|
|
}
|
|
|
|
return res, metaerr.Wrap(err)
|
|
}
|
|
|
|
func (db *DB) put(tx *bbolt.Tx,
|
|
obj *objectSDK.Object,
|
|
id []byte,
|
|
si *objectSDK.SplitInfo,
|
|
currEpoch uint64,
|
|
) (PutRes, error) {
|
|
cnr, ok := obj.ContainerID()
|
|
if !ok {
|
|
return PutRes{}, errors.New("missing container in object")
|
|
}
|
|
|
|
isParent := si != nil
|
|
|
|
exists, err := db.exists(tx, objectCore.AddressOf(obj), currEpoch)
|
|
|
|
var splitInfoError *objectSDK.SplitInfoError
|
|
if errors.As(err, &splitInfoError) {
|
|
exists = true // object exists, however it is virtual
|
|
} else if err != nil {
|
|
return PutRes{}, err // return any error besides SplitInfoError
|
|
}
|
|
|
|
if exists {
|
|
return PutRes{}, db.updateObj(tx, obj, id, si, isParent)
|
|
}
|
|
|
|
return PutRes{Inserted: true}, db.insertObject(tx, obj, id, si, isParent, cnr, currEpoch)
|
|
}
|
|
|
|
func (db *DB) updateObj(tx *bbolt.Tx, obj *objectSDK.Object, id []byte, si *objectSDK.SplitInfo, isParent bool) error {
|
|
// most right child and split header overlap parent so we have to
|
|
// check if object exists to not overwrite it twice
|
|
|
|
// When storage engine moves objects between different sub-storages,
|
|
// it calls metabase.Put method with new storage ID, thus triggering this code.
|
|
if !isParent && id != nil {
|
|
return setStorageID(tx, objectCore.AddressOf(obj), id, true)
|
|
}
|
|
|
|
// when storage already has last object in split hierarchy and there is
|
|
// a linking object to put (or vice versa), we should update split info
|
|
// with object ids of these objects
|
|
if isParent {
|
|
return updateSplitInfo(tx, objectCore.AddressOf(obj), si)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (db *DB) insertObject(tx *bbolt.Tx, obj *objectSDK.Object, id []byte, si *objectSDK.SplitInfo, isParent bool, cnr cid.ID, currEpoch uint64) error {
|
|
if par := obj.Parent(); par != nil && !isParent { // limit depth by two
|
|
parentSI, err := splitInfoFromObject(obj)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
_, err = db.put(tx, par, id, parentSI, currEpoch)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
err := putUniqueIndexes(tx, obj, si, id)
|
|
if err != nil {
|
|
return fmt.Errorf("can't put unique indexes: %w", err)
|
|
}
|
|
|
|
err = updateListIndexes(tx, obj, putListIndexItem)
|
|
if err != nil {
|
|
return fmt.Errorf("can't put list indexes: %w", err)
|
|
}
|
|
|
|
err = updateFKBTIndexes(tx, obj, putFKBTIndexItem)
|
|
if err != nil {
|
|
return fmt.Errorf("can't put fake bucket tree indexes: %w", err)
|
|
}
|
|
|
|
// update container volume size estimation
|
|
if obj.Type() == objectSDK.TypeRegular && !isParent {
|
|
err = changeContainerSize(tx, cnr, obj.PayloadSize(), true)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
if !isParent {
|
|
if err = db.incCounters(tx, cnr, IsUserObject(obj)); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func putUniqueIndexes(
|
|
tx *bbolt.Tx,
|
|
obj *objectSDK.Object,
|
|
si *objectSDK.SplitInfo,
|
|
id []byte,
|
|
) error {
|
|
isParent := si != nil
|
|
addr := objectCore.AddressOf(obj)
|
|
cnr := addr.Container()
|
|
objKey := objectKey(addr.Object(), make([]byte, objectKeySize))
|
|
|
|
bucketName := make([]byte, bucketKeySize)
|
|
// add value to primary unique bucket
|
|
if !isParent {
|
|
switch obj.Type() {
|
|
case objectSDK.TypeRegular:
|
|
bucketName = primaryBucketName(cnr, bucketName)
|
|
case objectSDK.TypeTombstone:
|
|
bucketName = tombstoneBucketName(cnr, bucketName)
|
|
case objectSDK.TypeLock:
|
|
bucketName = bucketNameLockers(cnr, bucketName)
|
|
default:
|
|
return ErrUnknownObjectType
|
|
}
|
|
|
|
rawObject, err := obj.CutPayload().Marshal()
|
|
if err != nil {
|
|
return fmt.Errorf("can't marshal object header: %w", err)
|
|
}
|
|
|
|
err = putUniqueIndexItem(tx, namedBucketItem{
|
|
name: bucketName,
|
|
key: objKey,
|
|
val: rawObject,
|
|
})
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// index storageID if it is present
|
|
if id != nil {
|
|
if err = setStorageID(tx, objectCore.AddressOf(obj), id, false); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
}
|
|
|
|
// index root object
|
|
if obj.Type() == objectSDK.TypeRegular && !obj.HasParent() {
|
|
var (
|
|
err error
|
|
splitInfo []byte
|
|
)
|
|
|
|
if isParent {
|
|
splitInfo, err = si.Marshal()
|
|
if err != nil {
|
|
return fmt.Errorf("can't marshal split info: %w", err)
|
|
}
|
|
}
|
|
|
|
err = putUniqueIndexItem(tx, namedBucketItem{
|
|
name: rootBucketName(cnr, bucketName),
|
|
key: objKey,
|
|
val: splitInfo,
|
|
})
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if ecHead := obj.GetECHeader(); ecHead != nil {
|
|
err = putECInfo(tx, cnr, objKey, ecHead)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// updateIndexItemFunc applies a single index item to the transaction. It is
// the callback type accepted by updateListIndexes and updateFKBTIndexes.
type updateIndexItemFunc = func(tx *bbolt.Tx, item namedBucketItem) error
|
|
|
|
func updateListIndexes(tx *bbolt.Tx, obj *objectSDK.Object, f updateIndexItemFunc) error {
|
|
idObj, _ := obj.ID()
|
|
cnr, _ := obj.ContainerID()
|
|
objKey := objectKey(idObj, make([]byte, objectKeySize))
|
|
bucketName := make([]byte, bucketKeySize)
|
|
|
|
cs, _ := obj.PayloadChecksum()
|
|
|
|
// index payload hashes
|
|
err := f(tx, namedBucketItem{
|
|
name: payloadHashBucketName(cnr, bucketName),
|
|
key: cs.Value(),
|
|
val: objKey,
|
|
})
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
idParent, ok := obj.ParentID()
|
|
|
|
// index parent ids
|
|
if ok {
|
|
err := f(tx, namedBucketItem{
|
|
name: parentBucketName(cnr, bucketName),
|
|
key: objectKey(idParent, make([]byte, objectKeySize)),
|
|
val: objKey,
|
|
})
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
// index split ids
|
|
if obj.SplitID() != nil {
|
|
err := f(tx, namedBucketItem{
|
|
name: splitBucketName(cnr, bucketName),
|
|
key: obj.SplitID().ToV2(),
|
|
val: objKey,
|
|
})
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func updateFKBTIndexes(tx *bbolt.Tx, obj *objectSDK.Object, f updateIndexItemFunc) error {
|
|
id, _ := obj.ID()
|
|
cnr, _ := obj.ContainerID()
|
|
objKey := objectKey(id, make([]byte, objectKeySize))
|
|
|
|
attrs := obj.Attributes()
|
|
|
|
key := make([]byte, bucketKeySize)
|
|
err := f(tx, namedBucketItem{
|
|
name: ownerBucketName(cnr, key),
|
|
key: []byte(obj.OwnerID().EncodeToString()),
|
|
val: objKey,
|
|
})
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// user specified attributes
|
|
for i := range attrs {
|
|
key = attributeBucketName(cnr, attrs[i].Key(), key)
|
|
err := f(tx, namedBucketItem{
|
|
name: key,
|
|
key: []byte(attrs[i].Value()),
|
|
val: objKey,
|
|
})
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
type bucketContainer interface {
|
|
Bucket([]byte) *bbolt.Bucket
|
|
CreateBucket([]byte) (*bbolt.Bucket, error)
|
|
CreateBucketIfNotExists([]byte) (*bbolt.Bucket, error)
|
|
}
|
|
|
|
func createBucketLikelyExists[T bucketContainer](tx T, name []byte) (*bbolt.Bucket, error) {
|
|
if bkt := tx.Bucket(name); bkt != nil {
|
|
return bkt, nil
|
|
}
|
|
return tx.CreateBucket(name)
|
|
}
|
|
|
|
func putUniqueIndexItem(tx *bbolt.Tx, item namedBucketItem) error {
|
|
bkt, err := createBucketLikelyExists(tx, item.name)
|
|
if err != nil {
|
|
return fmt.Errorf("can't create index %v: %w", item.name, err)
|
|
}
|
|
|
|
return bkt.Put(item.key, item.val)
|
|
}
|
|
|
|
func putFKBTIndexItem(tx *bbolt.Tx, item namedBucketItem) error {
|
|
bkt, err := createBucketLikelyExists(tx, item.name)
|
|
if err != nil {
|
|
return fmt.Errorf("can't create index %v: %w", item.name, err)
|
|
}
|
|
|
|
fkbtRoot, err := createBucketLikelyExists(bkt, item.key)
|
|
if err != nil {
|
|
return fmt.Errorf("can't create fake bucket tree index %v: %w", item.key, err)
|
|
}
|
|
|
|
return fkbtRoot.Put(item.val, zeroValue)
|
|
}
|
|
|
|
func putListIndexItem(tx *bbolt.Tx, item namedBucketItem) error {
|
|
bkt, err := createBucketLikelyExists(tx, item.name)
|
|
if err != nil {
|
|
return fmt.Errorf("can't create index %v: %w", item.name, err)
|
|
}
|
|
|
|
lst, err := decodeList(bkt.Get(item.key))
|
|
if err != nil {
|
|
return fmt.Errorf("can't decode leaf list %v: %w", item.key, err)
|
|
}
|
|
|
|
lst = append(lst, item.val)
|
|
|
|
encodedLst, err := encodeList(lst)
|
|
if err != nil {
|
|
return fmt.Errorf("can't encode leaf list %v: %w", item.key, err)
|
|
}
|
|
|
|
return bkt.Put(item.key, encodedLst)
|
|
}
|
|
|
|
// encodeList decodes list of bytes into a single blog for list bucket indexes.
|
|
func encodeList(lst [][]byte) ([]byte, error) {
|
|
w := io.NewBufBinWriter()
|
|
w.WriteVarUint(uint64(len(lst)))
|
|
for i := range lst {
|
|
w.WriteVarBytes(lst[i])
|
|
}
|
|
if w.Err != nil {
|
|
return nil, w.Err
|
|
}
|
|
return w.Bytes(), nil
|
|
}
|
|
|
|
// decodeList decodes blob into the list of bytes from list bucket index.
//
// The blob starts with a var-uint element count followed by that many
// elements, each a var-uint length and then that many bytes. The returned
// elements are sub-slices of data, not copies. The result has room for one
// more element, so a single append does not reallocate.
//
// An empty blob decodes to a nil list. A blob that is shorter than its
// element count or element lengths require yields io.ErrUnexpectedEOF.
func decodeList(data []byte) (lst [][]byte, err error) {
	if len(data) == 0 {
		return nil, nil
	}

	size, n, err := getVarUint(data)
	if err != nil {
		return nil, err
	}

	total := uint64(len(data))
	offset := uint64(n)

	// Every element takes at least one byte (its length prefix), so a count
	// above the number of remaining bytes can only come from a corrupted
	// blob. Rejecting it here keeps make from panicking or allocating an
	// absurd amount of memory.
	if size > total-offset {
		return nil, gio.ErrUnexpectedEOF
	}

	lst = make([][]byte, size, size+1)
	for i := range lst {
		sz, n, err := getVarUint(data[offset:])
		if err != nil {
			return nil, err
		}
		offset += uint64(n)

		// Compare against the remaining length instead of computing
		// offset+sz first: the sum may wrap around for a corrupted length.
		if sz > total-offset {
			return nil, gio.ErrUnexpectedEOF
		}

		next := offset + sz
		lst[i] = data[offset:next]
		offset = next
	}

	return lst, nil
}

// getVarUint reads a variable-length unsigned integer from the start of
// data. A first byte below 0xfd is the value itself; 0xfd, 0xfe and 0xff
// announce a little-endian 2-, 4- or 8-byte value that follows.
//
// It returns the value and the number of bytes it occupies. If data is too
// short, io.ErrUnexpectedEOF is returned and the value is zero.
func getVarUint(data []byte) (uint64, int, error) {
	if len(data) == 0 {
		return 0, 0, gio.ErrUnexpectedEOF
	}

	switch b := data[0]; b {
	case 0xfd:
		if len(data) < 3 {
			return 0, 1, gio.ErrUnexpectedEOF
		}
		return uint64(binary.LittleEndian.Uint16(data[1:])), 3, nil
	case 0xfe:
		if len(data) < 5 {
			return 0, 1, gio.ErrUnexpectedEOF
		}
		return uint64(binary.LittleEndian.Uint32(data[1:])), 5, nil
	case 0xff:
		if len(data) < 9 {
			return 0, 1, gio.ErrUnexpectedEOF
		}
		return binary.LittleEndian.Uint64(data[1:]), 9, nil
	default:
		return uint64(b), 1, nil
	}
}
|
|
|
|
// setStorageID for existing objects if they were moved from one
|
|
// storage location to another.
|
|
func setStorageID(tx *bbolt.Tx, addr oid.Address, id []byte, override bool) error {
|
|
key := make([]byte, bucketKeySize)
|
|
bkt, err := createBucketLikelyExists(tx, smallBucketName(addr.Container(), key))
|
|
if err != nil {
|
|
return err
|
|
}
|
|
key = objectKey(addr.Object(), key)
|
|
if override || bkt.Get(key) == nil {
|
|
return bkt.Put(key, id)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// updateSpliInfo for existing objects if storage filled with extra information
|
|
// about last object in split hierarchy or linking object.
|
|
func updateSplitInfo(tx *bbolt.Tx, addr oid.Address, from *objectSDK.SplitInfo) error {
|
|
key := make([]byte, bucketKeySize)
|
|
bkt := tx.Bucket(rootBucketName(addr.Container(), key))
|
|
if bkt == nil {
|
|
// if object doesn't exists and we want to update split info on it
|
|
// then ignore, this should never happen
|
|
return ErrIncorrectSplitInfoUpdate
|
|
}
|
|
|
|
objectKey := objectKey(addr.Object(), key)
|
|
|
|
rawSplitInfo := bkt.Get(objectKey)
|
|
if len(rawSplitInfo) == 0 {
|
|
return ErrIncorrectSplitInfoUpdate
|
|
}
|
|
|
|
to := objectSDK.NewSplitInfo()
|
|
|
|
err := to.Unmarshal(rawSplitInfo)
|
|
if err != nil {
|
|
return fmt.Errorf("can't unmarshal split info from root index: %w", err)
|
|
}
|
|
|
|
result := util.MergeSplitInfo(from, to)
|
|
|
|
rawSplitInfo, err = result.Marshal()
|
|
if err != nil {
|
|
return fmt.Errorf("can't marhsal merged split info: %w", err)
|
|
}
|
|
|
|
return bkt.Put(objectKey, rawSplitInfo)
|
|
}
|
|
|
|
// splitInfoFromObject returns split info based on last or linkin object.
|
|
// Otherwise returns nil, nil.
|
|
func splitInfoFromObject(obj *objectSDK.Object) (*objectSDK.SplitInfo, error) {
|
|
if obj.Parent() == nil {
|
|
return nil, nil
|
|
}
|
|
|
|
info := objectSDK.NewSplitInfo()
|
|
info.SetSplitID(obj.SplitID())
|
|
|
|
switch {
|
|
case isLinkObject(obj):
|
|
id, ok := obj.ID()
|
|
if !ok {
|
|
return nil, errors.New("missing object ID")
|
|
}
|
|
|
|
info.SetLink(id)
|
|
case isLastObject(obj):
|
|
id, ok := obj.ID()
|
|
if !ok {
|
|
return nil, errors.New("missing object ID")
|
|
}
|
|
|
|
info.SetLastPart(id)
|
|
default:
|
|
return nil, ErrIncorrectRootObject // should never happen
|
|
}
|
|
|
|
return info, nil
|
|
}
|
|
|
|
// isLinkObject returns true if object contains parent header and list
|
|
// of children.
|
|
func isLinkObject(obj *objectSDK.Object) bool {
|
|
return len(obj.Children()) > 0 && obj.Parent() != nil
|
|
}
|
|
|
|
// isLastObject returns true if object contains only parent header without list
|
|
// of children.
|
|
func isLastObject(obj *objectSDK.Object) bool {
|
|
return len(obj.Children()) == 0 && obj.Parent() != nil
|
|
}
|
|
|
|
func putECInfo(tx *bbolt.Tx,
|
|
cnr cid.ID, objKey []byte,
|
|
ecHead *objectSDK.ECHeader,
|
|
) error {
|
|
parentID := objectKey(ecHead.Parent(), make([]byte, objectKeySize))
|
|
bucketName := make([]byte, bucketKeySize)
|
|
|
|
val := getFromBucket(tx, ecInfoBucketName(cnr, bucketName), parentID)
|
|
if len(val) == 0 {
|
|
val = objKey
|
|
} else {
|
|
offset := 0
|
|
found := false
|
|
for offset < len(val) {
|
|
if bytes.Equal(objKey, val[offset:offset+objectKeySize]) {
|
|
found = true
|
|
break
|
|
}
|
|
offset += objectKeySize
|
|
}
|
|
if !found {
|
|
val = append(val, objKey...)
|
|
}
|
|
}
|
|
return putUniqueIndexItem(tx, namedBucketItem{
|
|
name: ecInfoBucketName(cnr, make([]byte, bucketKeySize)),
|
|
key: parentID,
|
|
val: val,
|
|
})
|
|
}
|