package meta import ( "bytes" "context" "encoding/binary" "encoding/hex" "errors" "fmt" "os" "strconv" "sync/atomic" "time" "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/core/container" objectV2 "git.frostfs.info/TrueCloudLab/frostfs-sdk-go/api/object" cid "git.frostfs.info/TrueCloudLab/frostfs-sdk-go/container/id" oid "git.frostfs.info/TrueCloudLab/frostfs-sdk-go/object/id" "go.etcd.io/bbolt" "golang.org/x/sync/errgroup" ) const ( upgradeLogFrequency = 50_000 upgradeWorkersCount = 1_000 compactMaxTxSize = 256 << 20 upgradeTimeout = 1 * time.Second ) var updates = map[uint64]func(ctx context.Context, db *bbolt.DB, cs container.InfoProvider, log func(a ...any)) error{ 2: upgradeFromV2ToV3, 3: func(_ context.Context, _ *bbolt.DB, _ container.InfoProvider, log func(a ...any)) error { log("metabase already upgraded") return nil }, } func Upgrade(ctx context.Context, path string, compact bool, cs container.InfoProvider, log func(a ...any)) error { if _, err := os.Stat(path); err != nil { return fmt.Errorf("check metabase existence: %w", err) } opts := bbolt.DefaultOptions opts.Timeout = upgradeTimeout db, err := bbolt.Open(path, os.ModePerm, opts) if err != nil { return fmt.Errorf("open metabase: %w", err) } var version uint64 if err := db.View(func(tx *bbolt.Tx) error { var e error version, e = currentVersion(tx) return e }); err != nil { return err } updater, found := updates[version] if !found { return fmt.Errorf("unsupported version %d: no update available", version) } if err := db.Update(func(tx *bbolt.Tx) error { b := tx.Bucket(shardInfoBucket) return b.Put(upgradeKey, zeroValue) }); err != nil { return fmt.Errorf("set upgrade key %w", err) } if err := updater(ctx, db, cs, log); err != nil { return fmt.Errorf("update metabase schema: %w", err) } if err := db.Update(func(tx *bbolt.Tx) error { b := tx.Bucket(shardInfoBucket) return b.Delete(upgradeKey) }); err != nil { return fmt.Errorf("delete upgrade key %w", err) } if compact { log("compacting metabase...") err := compactDB(db) if err != nil { return fmt.Errorf("compact metabase: %w", err) } log("metabase compacted") } return db.Close() } func compactDB(db *bbolt.DB) error { sourcePath := db.Path() tmpFileName := sourcePath + "." + time.Now().Format(time.RFC3339) f, err := os.Stat(sourcePath) if err != nil { return err } dst, err := bbolt.Open(tmpFileName, f.Mode(), &bbolt.Options{ Timeout: 100 * time.Millisecond, NoSync: true, }) if err != nil { return fmt.Errorf("can't open new metabase to compact: %w", err) } if err := bbolt.Compact(dst, db, compactMaxTxSize); err != nil { return fmt.Errorf("compact metabase: %w", errors.Join(err, dst.Close(), os.Remove(tmpFileName))) } if err := dst.Sync(); err != nil { return fmt.Errorf("sync compacted metabase: %w", errors.Join(err, os.Remove(tmpFileName))) } if err := dst.Close(); err != nil { return fmt.Errorf("close compacted metabase: %w", errors.Join(err, os.Remove(tmpFileName))) } if err := db.Close(); err != nil { return fmt.Errorf("close source metabase: %w", errors.Join(err, os.Remove(tmpFileName))) } if err := os.Rename(tmpFileName, sourcePath); err != nil { return fmt.Errorf("replace source metabase with compacted: %w", errors.Join(err, os.Remove(tmpFileName))) } return nil } func upgradeFromV2ToV3(ctx context.Context, db *bbolt.DB, cs container.InfoProvider, log func(a ...any)) error { if err := createExpirationEpochBuckets(ctx, db, log); err != nil { return err } eg, ctx := errgroup.WithContext(ctx) eg.Go(func() error { return dropUserAttributes(ctx, db, cs, log) }) eg.Go(func() error { return dropOwnerIDIndex(ctx, db, log) }) eg.Go(func() error { return dropPayloadChecksumIndex(ctx, db, log) }) if err := eg.Wait(); err != nil { return err } return db.Update(func(tx *bbolt.Tx) error { return updateVersion(tx, version) }) } type objectIDToExpEpoch struct { containerID cid.ID objectID oid.ID expirationEpoch uint64 } func createExpirationEpochBuckets(ctx context.Context, db *bbolt.DB, log func(a ...any)) error { log("filling expiration epoch buckets...") if err := db.Update(func(tx *bbolt.Tx) error { _, err := tx.CreateBucketIfNotExists(expEpochToObjectBucketName) return err }); err != nil { return err } objects := make(chan objectIDToExpEpoch) eg, ctx := errgroup.WithContext(ctx) eg.Go(func() error { return selectObjectsWithExpirationEpoch(ctx, db, objects) }) var count atomic.Uint64 for range upgradeWorkersCount { eg.Go(func() error { for { select { case <-ctx.Done(): return ctx.Err() case obj, ok := <-objects: if !ok { return nil } if err := db.Batch(func(tx *bbolt.Tx) error { if err := putUniqueIndexItem(tx, namedBucketItem{ name: expEpochToObjectBucketName, key: expirationEpochKey(obj.expirationEpoch, obj.containerID, obj.objectID), val: zeroValue, }); err != nil { return err } val := make([]byte, epochSize) binary.LittleEndian.PutUint64(val, obj.expirationEpoch) return putUniqueIndexItem(tx, namedBucketItem{ name: objectToExpirationEpochBucketName(obj.containerID, make([]byte, bucketKeySize)), key: objectKey(obj.objectID, make([]byte, objectKeySize)), val: val, }) }); err != nil { return err } } if c := count.Add(1); c%upgradeLogFrequency == 0 { log("expiration epoch filled for", c, "objects...") } } }) } err := eg.Wait() if err != nil { log("expiration epoch buckets completed completed with error:", err) return err } log("filling expiration epoch buckets completed successfully, total", count.Load(), "objects") return nil } func selectObjectsWithExpirationEpoch(ctx context.Context, db *bbolt.DB, objects chan objectIDToExpEpoch) error { defer close(objects) const batchSize = 1000 it := &objectsWithExpirationEpochBatchIterator{ lastAttributeKey: usrAttrPrefix, } for { if err := getNextObjectsWithExpirationEpochBatch(ctx, db, it, batchSize); err != nil { return err } for _, item := range it.items { select { case <-ctx.Done(): return ctx.Err() case objects <- item: } } if len(it.items) < batchSize { return nil } it.items = nil } } var ( usrAttrPrefix = []byte{userAttributePrefix} errBatchSizeLimit = errors.New("batch size limit") ) type objectsWithExpirationEpochBatchIterator struct { lastAttributeKey []byte lastAttributeValue []byte lastAttrKeyValueItem []byte items []objectIDToExpEpoch } // - {prefix}{containerID}{attributeKey} <- bucket // -- {attributeValue} <- bucket, expirationEpoch // --- {objectID}: zeroValue <- record func getNextObjectsWithExpirationEpochBatch(ctx context.Context, db *bbolt.DB, it *objectsWithExpirationEpochBatchIterator, batchSize int) error { seekAttrValue := it.lastAttributeValue seekAttrKVItem := it.lastAttrKeyValueItem err := db.View(func(tx *bbolt.Tx) error { attrKeyC := tx.Cursor() for attrKey, _ := attrKeyC.Seek(it.lastAttributeKey); attrKey != nil && bytes.HasPrefix(attrKey, usrAttrPrefix); attrKey, _ = attrKeyC.Next() { select { case <-ctx.Done(): return ctx.Err() default: } if len(attrKey) <= 1+cidSize { continue } attributeKey := string(attrKey[1+cidSize:]) if attributeKey != objectV2.SysAttributeExpEpoch { continue } var containerID cid.ID if err := containerID.Decode(attrKey[1 : 1+cidSize]); err != nil { return fmt.Errorf("decode container id from user attribute bucket: %w", err) } if err := iterateExpirationAttributeKeyBucket(ctx, tx.Bucket(attrKey), it, batchSize, containerID, attrKey, seekAttrValue, seekAttrKVItem); err != nil { return err } seekAttrValue = nil seekAttrKVItem = nil } return nil }) if err != nil && !errors.Is(err, errBatchSizeLimit) { return err } return nil } func iterateExpirationAttributeKeyBucket(ctx context.Context, b *bbolt.Bucket, it *objectsWithExpirationEpochBatchIterator, batchSize int, containerID cid.ID, attrKey, seekAttrValue, seekAttrKVItem []byte) error { attrValueC := b.Cursor() for attrValue, v := attrValueC.Seek(seekAttrValue); attrValue != nil; attrValue, v = attrValueC.Next() { select { case <-ctx.Done(): return ctx.Err() default: } if v != nil { continue // need to iterate over buckets, not records } expirationEpoch, err := strconv.ParseUint(string(attrValue), 10, 64) if err != nil { return fmt.Errorf("could not parse expiration epoch: %w", err) } expirationEpochBucket := b.Bucket(attrValue) attrKeyValueC := expirationEpochBucket.Cursor() for attrKeyValueItem, v := attrKeyValueC.Seek(seekAttrKVItem); attrKeyValueItem != nil; attrKeyValueItem, v = attrKeyValueC.Next() { select { case <-ctx.Done(): return ctx.Err() default: } if v == nil { continue // need to iterate over records, not buckets } if bytes.Equal(it.lastAttributeKey, attrKey) && bytes.Equal(it.lastAttributeValue, attrValue) && bytes.Equal(it.lastAttrKeyValueItem, attrKeyValueItem) { continue } var objectID oid.ID if err := objectID.Decode(attrKeyValueItem); err != nil { return fmt.Errorf("decode object id from container '%s' expiration epoch %d: %w", containerID, expirationEpoch, err) } it.lastAttributeKey = bytes.Clone(attrKey) it.lastAttributeValue = bytes.Clone(attrValue) it.lastAttrKeyValueItem = bytes.Clone(attrKeyValueItem) it.items = append(it.items, objectIDToExpEpoch{ containerID: containerID, objectID: objectID, expirationEpoch: expirationEpoch, }) if len(it.items) == batchSize { return errBatchSizeLimit } } seekAttrKVItem = nil } return nil } func dropUserAttributes(ctx context.Context, db *bbolt.DB, cs container.InfoProvider, log func(a ...any)) error { log("deleting user attribute buckets...") const batch = 1000 prefix := []byte{userAttributePrefix} last := prefix var count uint64 for { select { case <-ctx.Done(): return ctx.Err() default: } var keys [][]byte if err := db.View(func(tx *bbolt.Tx) error { c := tx.Cursor() for k, _ := c.Seek(last); k != nil && bytes.HasPrefix(k, prefix) && len(keys) < batch; k, _ = c.Next() { if bytes.Equal(last, k) { continue } keys = append(keys, bytes.Clone(k)) } return nil }); err != nil { log("deleting user attribute buckets completed with an error:", err) return err } if len(keys) == 0 { log("deleting user attribute buckets completed successfully, deleted", count, "buckets") return nil } last = keys[len(keys)-1] cnt, err := dropNonIndexedUserAttributeBuckets(db, cs, keys) if err != nil { log("deleting user attribute buckets completed with an error:", err) return err } count += cnt cnt, err = dropEmptyUserAttributeBuckets(ctx, db, keys) if err != nil { log("deleting user attribute buckets completed with an error:", err) return err } count += cnt log("deleted", count, "user attribute buckets") } } func dropNonIndexedUserAttributeBuckets(db *bbolt.DB, cs container.InfoProvider, keys [][]byte) (uint64, error) { keysToDrop, err := selectUserAttributeKeysToDrop(keys, cs) if err != nil { return 0, fmt.Errorf("select non indexed user attributes: %w", err) } if err := db.Batch(func(tx *bbolt.Tx) error { for _, k := range keysToDrop { if err := tx.DeleteBucket(k); err != nil { return err } } return nil }); err != nil { return 0, fmt.Errorf("drop non indexed user attributes: %w", err) } return uint64(len(keysToDrop)), nil } func selectUserAttributeKeysToDrop(keys [][]byte, cs container.InfoProvider) ([][]byte, error) { var keysToDrop [][]byte for _, key := range keys { attr, ok := attributeFromAttributeBucket(key) if !ok { return nil, fmt.Errorf("failed to parse attribute key from user attribute bucket key %s", hex.EncodeToString(key)) } if !IsAtrributeIndexed(attr) { keysToDrop = append(keysToDrop, key) continue } contID, ok := cidFromAttributeBucket(key) if !ok { return nil, fmt.Errorf("failed to parse container ID from user attribute bucket key %s", hex.EncodeToString(key)) } info, err := cs.Info(contID) if err != nil { return nil, err } if info.Removed || !info.Indexed { keysToDrop = append(keysToDrop, key) } } return keysToDrop, nil } func dropEmptyUserAttributeBuckets(ctx context.Context, db *bbolt.DB, keys [][]byte) (uint64, error) { var dropBuckets [][]byte for _, key := range keys { select { case <-ctx.Done(): return 0, ctx.Err() default: } if err := dropEmptyNestedBuckets(ctx, db, key); err != nil { return 0, err } empty, exists, err := bucketIsEmpty(db, key) if err != nil { return 0, err } if empty && exists { dropBuckets = append(dropBuckets, key) } } if len(dropBuckets) == 0 { return 0, nil } if err := db.Batch(func(tx *bbolt.Tx) error { for _, key := range dropBuckets { if err := tx.DeleteBucket(key); err != nil { return err } } return nil }); err != nil { return 0, fmt.Errorf("drop empty user attributes buckets: %w", err) } return uint64(len(dropBuckets)), nil } func bucketIsEmpty(db *bbolt.DB, bucketKey []byte) (bool, bool, error) { var empty bool var exists bool if err := db.View(func(tx *bbolt.Tx) error { b := tx.Bucket(bucketKey) if b == nil { return nil } exists = true empty = !hasAnyItem(b) return nil }); err != nil { return false, false, fmt.Errorf("bucket empty check: %w", err) } return empty, exists, nil } func dropEmptyNestedBuckets(ctx context.Context, db *bbolt.DB, rootBucketKey []byte) error { var last []byte for { select { case <-ctx.Done(): return ctx.Err() default: } var dropBuckets [][]byte var err error dropBuckets, last, err = selectEmptyNestedBuckets(ctx, db, rootBucketKey, last) if err != nil { return fmt.Errorf("select empty nested buckets: %w", err) } if len(dropBuckets) == 0 { return nil } if err := db.Batch(func(tx *bbolt.Tx) error { rootBucket := tx.Bucket(rootBucketKey) if rootBucket == nil { return nil } for _, sb := range dropBuckets { if err := rootBucket.DeleteBucket(sb); err != nil { return err } } return nil }); err != nil { return fmt.Errorf("drop empty nested buckets: %w", err) } } } func selectEmptyNestedBuckets(ctx context.Context, db *bbolt.DB, rootBucketKey, last []byte) ([][]byte, []byte, error) { const batchSize = 1000 var result [][]byte if err := db.View(func(tx *bbolt.Tx) error { rootBucket := tx.Bucket(rootBucketKey) if rootBucket == nil { return nil } c := rootBucket.Cursor() for k, v := c.Seek(last); k != nil && len(result) < batchSize; k, v = c.Next() { select { case <-ctx.Done(): return ctx.Err() default: } if bytes.Equal(last, k) { continue } last = bytes.Clone(k) if v != nil { // record continue } nestedBucket := rootBucket.Bucket(k) if nestedBucket == nil { continue } if !hasAnyItem(nestedBucket) { result = append(result, bytes.Clone(k)) } } return nil }); err != nil { return nil, nil, err } return result, last, nil } func dropOwnerIDIndex(ctx context.Context, db *bbolt.DB, log func(a ...any)) error { return dropBucketsByPrefix(ctx, db, []byte{ownerPrefix}, func(a ...any) { log(append([]any{"owner ID index:"}, a...)...) }) } func dropPayloadChecksumIndex(ctx context.Context, db *bbolt.DB, log func(a ...any)) error { return dropBucketsByPrefix(ctx, db, []byte{payloadHashPrefix}, func(a ...any) { log(append([]any{"payload checksum:"}, a...)...) }) } func dropBucketsByPrefix(ctx context.Context, db *bbolt.DB, prefix []byte, log func(a ...any)) error { log("deleting buckets...") const batch = 1000 var count uint64 for { select { case <-ctx.Done(): return ctx.Err() default: } var keys [][]byte if err := db.View(func(tx *bbolt.Tx) error { c := tx.Cursor() for k, _ := c.Seek(prefix); k != nil && bytes.HasPrefix(k, prefix) && len(keys) < batch; k, _ = c.Next() { keys = append(keys, bytes.Clone(k)) } return nil }); err != nil { log("deleting buckets completed with an error:", err) return err } if len(keys) == 0 { log("deleting buckets completed successfully, deleted", count, "buckets") return nil } if err := db.Batch(func(tx *bbolt.Tx) error { for _, k := range keys { if err := tx.DeleteBucket(k); err != nil { return err } } return nil }); err != nil { log("deleting buckets completed with an error:", err) return err } count += uint64(len(keys)) log("deleted", count, "buckets") } }