diff --git a/pkg/local_object_storage/pilorama/batch.go b/pkg/local_object_storage/pilorama/batch.go
index 5722c68aa..c65488b74 100644
--- a/pkg/local_object_storage/pilorama/batch.go
+++ b/pkg/local_object_storage/pilorama/batch.go
@@ -1,6 +1,7 @@
 package pilorama
 
 import (
+	"encoding/binary"
 	"sort"
 	"sync"
 	"time"
@@ -49,10 +50,66 @@ func (b *batch) run() {
 		sort.Slice(b.operations, func(i, j int) bool {
 			return b.operations[i].Time < b.operations[j].Time
 		})
-		b.operations = removeDuplicatesInPlace(b.operations)
-		var lm Move
-		return b.forest.applyOperation(bLog, bTree, b.operations, &lm)
+
+		// Our main use-case is addition of new items. In this case,
+		// we do not need to perform undo()/redo(), just do().
+		// https://github.com/trvedata/move-op/blob/6c23447c12a7862ff31b7fc2205f6c90fbdb9dc0/proof/Move_Create.thy#L259
+		//
+		// For this optimization to work we need to ensure three things:
+		// 1. The node itself is not yet in tree.
+		// 2. The node is not a parent. This case is not mentioned in the article, because
+		//    they consider a "static order" (perform all CREATE operations before MOVE).
+		//    We need this because if node _is_ a parent, we could violate (3) for some late operation.
+		//    See TestForest_ApplySameOperation for details.
+		// 3. Parent of each operation is already in tree.
+		var parents map[uint64]struct{}
+		var cKey [maxKeySize]byte
+		var slow bool
+		for i := range b.operations {
+			_, _, _, inTree := b.forest.getState(bTree, stateKey(cKey[:], b.operations[i].Child))
+			if inTree {
+				slow = true
+				break
+			}
+
+			key := childrenKey(cKey[:], b.operations[i].Child, 0)
+			k, _ := bTree.Cursor().Seek(key)
+			if len(k) == childrenKeySize && binary.LittleEndian.Uint64(k[1:]) == b.operations[i].Child {
+				slow = true
+				break
+			}
+
+			if b.operations[i].Parent == RootID {
+				continue
+			} else if parents == nil {
+				// Attaching key only to root is done frequently,
+				// no allocations are performed unless necessary.
+				parents = make(map[uint64]struct{})
+			} else if _, ok := parents[b.operations[i].Parent]; ok {
+				continue
+			}
+
+			p := b.operations[i].Parent
+			_, ts, _, inTree := b.forest.getState(bTree, stateKey(cKey[:], p))
+			if !inTree || b.operations[0].Time < ts {
+				slow = true
+				break
+			}
+			parents[b.operations[i].Parent] = struct{}{}
+		}
+
+		if slow {
+			var lm Move
+			return b.forest.applyOperation(bLog, bTree, b.operations, &lm)
+		}
+
+		for i := range b.operations {
+			if err := b.forest.do(bLog, bTree, cKey[:], b.operations[i]); err != nil {
+				return err
+			}
+		}
+		return nil
 	})
 	for i := range b.results {
 		b.results[i] <- err
diff --git a/pkg/local_object_storage/pilorama/bench_test.go b/pkg/local_object_storage/pilorama/bench_test.go
new file mode 100644
index 000000000..e729b9ea6
--- /dev/null
+++ b/pkg/local_object_storage/pilorama/bench_test.go
@@ -0,0 +1,55 @@
+package pilorama
+
+import (
+	"context"
+	"os"
+	"path/filepath"
+	"runtime"
+	"sync/atomic"
+	"testing"
+
+	cidtest "git.frostfs.info/TrueCloudLab/frostfs-sdk-go/container/id/test"
+	"github.com/stretchr/testify/require"
+)
+
+func getTimestamp(reorder int, ts Timestamp) Timestamp {
+	base := ts / Timestamp(reorder)
+	rem := ts % Timestamp(reorder)
+	return base*Timestamp(reorder) + Timestamp(reorder) - rem
+}
+
+func BenchmarkCreate(b *testing.B) {
+	// Use `os.TempDir` because we construct multiple times in the same test.
+	tmpDir, err := os.MkdirTemp(os.TempDir(), "*")
+	require.NoError(b, err)
+
+	f := NewBoltForest(
+		WithPath(filepath.Join(tmpDir, "test.db")),
+		WithMaxBatchSize(runtime.GOMAXPROCS(0)))
+	require.NoError(b, f.Open(false))
+	require.NoError(b, f.Init())
+	b.Cleanup(func() {
+		require.NoError(b, f.Close())
+		require.NoError(b, os.RemoveAll(tmpDir))
+	})
+
+	cid := cidtest.ID()
+	treeID := "tree"
+	ctx := context.Background()
+	var index atomic.Int32
+	index.Store(-1)
+	b.SetParallelism(2)
+	b.RunParallel(func(pb *testing.PB) {
+		for pb.Next() {
+			i := index.Add(1)
+			op := &Move{
+				Meta:   Meta{Time: getTimestamp(runtime.GOMAXPROCS(0)*2, Timestamp(i+1))},
+				Child:  Node(i + 1),
+				Parent: RootID,
+			}
+			if err := f.TreeApply(ctx, cid, treeID, op, true); err != nil {
+				b.FailNow()
+			}
+		}
+	})
+}
diff --git a/pkg/local_object_storage/pilorama/boltdb.go b/pkg/local_object_storage/pilorama/boltdb.go
index e7bcb110b..4c7258b03 100644
--- a/pkg/local_object_storage/pilorama/boltdb.go
+++ b/pkg/local_object_storage/pilorama/boltdb.go
@@ -37,6 +37,11 @@ type boltForest struct {
 	cfg
 }
 
+const (
+	childrenKeySize = 17
+	maxKeySize      = childrenKeySize
+)
+
 var (
 	dataBucket = []byte{0}
 	logBucket  = []byte{1}
@@ -185,7 +190,7 @@ func (t *boltForest) TreeMove(ctx context.Context, d CIDDescriptor, treeID strin
 		if lm.Child == RootID {
 			lm.Child = t.findSpareID(bTree)
 		}
-		return t.do(bLog, bTree, make([]byte, 17), &lm)
+		return t.do(bLog, bTree, make([]byte, maxKeySize), &lm)
 	}))
 }
 
@@ -340,7 +345,7 @@ func (t *boltForest) TreeAddByPath(ctx context.Context, d CIDDescriptor, treeID
 	}
 
 	var lm []Move
-	var key [17]byte
+	var key [maxKeySize]byte
 
 	fullID := bucketName(d.CID, treeID)
 	err := t.db.Batch(func(tx *bbolt.Tx) error {
@@ -542,7 +547,7 @@ func (t *boltForest) getTreeBuckets(tx *bbolt.Tx, treeRoot []byte) (*bbolt.Bucke
 // applyOperations applies log operations. Assumes lm are sorted by timestamp.
 func (t *boltForest) applyOperation(logBucket, treeBucket *bbolt.Bucket, ms []*Move, lm *Move) error {
 	var tmp Move
-	var cKey [17]byte
+	var cKey [maxKeySize]byte
 
 	c := logBucket.Cursor()
 
@@ -864,7 +869,7 @@ func (t *boltForest) TreeGetChildren(ctx context.Context, cid cidSDK.ID, treeID
 		b := treeRoot.Bucket(dataBucket)
 
 		c := b.Cursor()
-		for k, _ := c.Seek(key); len(k) == 17 && binary.LittleEndian.Uint64(k[1:]) == nodeID; k, _ = c.Next() {
+		for k, _ := c.Seek(key); len(k) == childrenKeySize && binary.LittleEndian.Uint64(k[1:]) == nodeID; k, _ = c.Next() {
 			children = append(children, binary.LittleEndian.Uint64(k[9:]))
 		}
 		return nil
@@ -1093,7 +1098,7 @@ func childrenKey(key []byte, child, parent Node) []byte {
 	key[0] = 'c'
 	binary.LittleEndian.PutUint64(key[1:], parent)
 	binary.LittleEndian.PutUint64(key[9:], child)
-	return key[:childrenKeySize]
+	return key[:childrenKeySize]
 }
 
 // 'i' + attribute name (string) + attribute value (string) + parent (id) + node (id) -> 0/1.
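Reviewer note, not part of the patch: the fast path in batch.go is only taken when every queued Move is a plain creation, and the three numbered conditions in the new comment decide that. Below is a minimal, self-contained Go sketch of the same decision made against a plain in-memory map, useful for reasoning about the conditions without the BoltDB details. The names opsAreCreateOnly, Op and state are hypothetical and do not exist in pilorama; the real code performs the equivalent checks with getState and a cursor seek over childrenKey entries, and it also memoizes already-checked parents, which the sketch omits.

package main

import "fmt"

// Illustrative stand-ins for pilorama's Node, Timestamp and RootID.
type (
	Node      = uint64
	Timestamp = uint64
)

const RootID Node = 0

// Op mirrors the Move fields used by the fast-path check.
type Op struct {
	Time   Timestamp
	Child  Node
	Parent Node
}

// state is a toy stand-in for the per-node state kept in the tree bucket:
// the parent a node is attached to and the timestamp of that attachment.
type state struct {
	Parent Node
	Time   Timestamp
}

// opsAreCreateOnly reports whether every operation attaches a brand-new leaf,
// i.e. whether the whole batch can be applied with do() alone, without undo()/redo().
// It checks the same three conditions as the fast path in batch.go, but against
// an in-memory map instead of BoltDB buckets. ops are assumed sorted by Time,
// as they are in batch.run after sort.Slice.
func opsAreCreateOnly(tree map[Node]state, ops []Op) bool {
	childCount := make(map[Node]int) // parent -> number of children already in the tree
	for _, s := range tree {
		childCount[s.Parent]++
	}
	for _, op := range ops {
		if _, inTree := tree[op.Child]; inTree {
			return false // (1) the node is already in the tree
		}
		if childCount[op.Child] > 0 {
			return false // (2) the node is already a parent of some other node
		}
		if op.Parent == RootID {
			continue // attaching directly to the root is always fine
		}
		parent, inTree := tree[op.Parent]
		// (3) the parent must already be in the tree and must have been attached
		// no later than the earliest operation in the batch.
		if !inTree || ops[0].Time < parent.Time {
			return false
		}
	}
	return true
}

func main() {
	tree := map[Node]state{1: {Parent: RootID, Time: 1}}
	ops := []Op{
		{Time: 10, Child: 2, Parent: 1},
		{Time: 11, Child: 3, Parent: RootID},
	}
	fmt.Println(opsAreCreateOnly(tree, ops)) // true: both operations only add new leaves
}

As in the patch, any failed check simply means falling back to the existing applyOperation path (the slow flag), so the optimization can only affect speed, never correctness.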