From 58367e4df6a49edb4d422b97c6da5d401c52d4b5 Mon Sep 17 00:00:00 2001
From: Evgenii Stratonikov
Date: Wed, 1 Feb 2023 11:58:16 +0300
Subject: [PATCH] [#2232] pilorama: Merge in-queue batches

To achieve high performance we must choose proper values for both the
batch size and the batch delay. For user operations we want a low delay.
However, a low delay prevents tree synchronization operations from
forming big enough batches. For these operations, batching gives the
most benefit not only in terms of on-CPU execution cost, but also by
speeding up transaction persistence (`fsync`).

In this commit we try merging batches that are already _triggered_, but
not yet _started to execute_. This way we can still queue batches for
execution after the provided delay, while also allowing multiple
already-formed batches to execute faster.

Signed-off-by: Evgenii Stratonikov
---
 CHANGELOG.md                                |  1 +
 pkg/local_object_storage/pilorama/batch.go  | 21 +++++++++++++++------
 pkg/local_object_storage/pilorama/boltdb.go |  2 ++
 3 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3103208e20..40d86d9186 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,6 +15,7 @@ Changelog for FrostFS Node
 - `common.PrintVerbose` prints via `cobra.Command.Printf` (#1962)
 - Env prefix in configuration changed to `FROSTFS_*` (#43)
 - Link object is broadcast throughout the whole container now (#57)
+- Pilorama now can merge multiple batches into one (#2231)
 
 ### Fixed
 - Increase payload size metric on shards' `put` operation (#1794)
diff --git a/pkg/local_object_storage/pilorama/batch.go b/pkg/local_object_storage/pilorama/batch.go
index 59712b7617..43375ba1b9 100644
--- a/pkg/local_object_storage/pilorama/batch.go
+++ b/pkg/local_object_storage/pilorama/batch.go
@@ -10,8 +10,11 @@ import (
 )
 
 type batch struct {
-	forest     *boltForest
-	timer      *time.Timer
+	forest *boltForest
+	timer  *time.Timer
+	// mtx protects timer and operations fields.
+	// Because mtx can be taken inside a transaction,
+	// transactions MUST NOT be executed with the mutex taken to avoid a deadlock.
 	mtx        sync.Mutex
 	start      sync.Once
 	cid        cidSDK.ID
@@ -24,16 +27,12 @@ func (b *batch) trigger() {
 	b.mtx.Lock()
 	if b.timer != nil {
 		b.timer.Stop()
-		b.timer = nil
 	}
 	b.mtx.Unlock()
 	b.start.Do(b.run)
 }
 
 func (b *batch) run() {
-	sort.Slice(b.operations, func(i, j int) bool {
-		return b.operations[i].Time < b.operations[j].Time
-	})
 	fullID := bucketName(b.cid, b.treeID)
 	err := b.forest.db.Update(func(tx *bbolt.Tx) error {
 		bLog, bTree, err := b.forest.getTreeBuckets(tx, fullID)
@@ -41,6 +40,16 @@ func (b *batch) run() {
 			return err
 		}
 
+		b.mtx.Lock()
+		b.timer = nil
+		b.mtx.Unlock()
+
+		// Sorting without a mutex is ok, because we append to this slice only if timer is non-nil.
+		// See (*boltForest).addBatch for details.
+		sort.Slice(b.operations, func(i, j int) bool {
+			return b.operations[i].Time < b.operations[j].Time
+		})
+
 		var lm Move
 		return b.forest.applyOperation(bLog, bTree, b.operations, &lm)
 	})
diff --git a/pkg/local_object_storage/pilorama/boltdb.go b/pkg/local_object_storage/pilorama/boltdb.go
index 0f546ef610..247d07d280 100644
--- a/pkg/local_object_storage/pilorama/boltdb.go
+++ b/pkg/local_object_storage/pilorama/boltdb.go
@@ -377,7 +377,9 @@ func (t *boltForest) addBatch(d CIDDescriptor, treeID string, m *Move, ch chan e
 		results:    []chan<- error{ch},
 		operations: []*Move{m},
 	}
+	b.mtx.Lock()
 	b.timer = time.AfterFunc(t.db.MaxBatchDelay, b.trigger)
+	b.mtx.Unlock()
 	t.batches = append(t.batches, b)
 	t.mtx.Unlock()
 }
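
For illustration, below is a minimal, self-contained Go sketch of the batching scheme this patch describes, not the patch's own code: the names (`forest`, `op`, `add`, `maxDelay`) are hypothetical and the bbolt transaction is replaced by a print. The point it demonstrates is that a batch keeps accepting operations after its delay timer has fired, right up until `run` marks it as started by clearing the timer under the mutex; producers that observe a nil timer allocate a new batch instead.

```go
package main

import (
	"fmt"
	"sort"
	"sync"
	"time"
)

// op is a stand-in for pilorama's Move: the sketch only needs a timestamp to sort by.
type op struct{ ts uint64 }

// batch collects operations and executes them exactly once, either when its
// delay timer fires or when a producer decides the batch is big enough.
type batch struct {
	mtx   sync.Mutex // protects timer and ops
	start sync.Once
	timer *time.Timer
	ops   []op
}

// trigger stops the timer (if any) and starts execution exactly once.
func (b *batch) trigger() {
	b.mtx.Lock()
	if b.timer != nil {
		b.timer.Stop()
	}
	b.mtx.Unlock()
	b.start.Do(b.run)
}

func (b *batch) run() {
	// Clearing the timer under the mutex marks the batch as "started":
	// producers that observe a nil timer must allocate a new batch.
	// Anything appended before this point still belongs to this batch.
	b.mtx.Lock()
	b.timer = nil
	b.mtx.Unlock()

	// Sorting without the mutex is fine: appends only happen while timer != nil.
	sort.Slice(b.ops, func(i, j int) bool { return b.ops[i].ts < b.ops[j].ts })

	// The real code applies the operations inside a single bbolt write transaction.
	fmt.Println("applying", len(b.ops), "operations in one write transaction")
}

// forest owns the queue of batches, mirroring the role of boltForest.
type forest struct {
	mtx      sync.Mutex
	maxDelay time.Duration
	batches  []*batch
}

// add merges the operation into the last batch if that batch has not started
// executing yet (its timer is still set), even if the timer has already fired.
// Otherwise it creates a fresh batch with its own delay timer.
func (f *forest) add(o op) {
	f.mtx.Lock()
	defer f.mtx.Unlock()

	if n := len(f.batches); n > 0 {
		b := f.batches[n-1]
		b.mtx.Lock()
		if b.timer != nil { // triggered or not, run() has not claimed it yet
			b.ops = append(b.ops, o)
			b.mtx.Unlock()
			return
		}
		b.mtx.Unlock()
	}

	b := &batch{ops: []op{o}}
	b.mtx.Lock()
	b.timer = time.AfterFunc(f.maxDelay, b.trigger)
	b.mtx.Unlock()
	f.batches = append(f.batches, b)
}

func main() {
	f := &forest{maxDelay: 10 * time.Millisecond}
	for i := 0; i < 100; i++ {
		f.add(op{ts: uint64(i)})
	}
	time.Sleep(100 * time.Millisecond) // let the delay timer fire and the batch run
}
```

The same reasoning backs the comment added in batch.go: once `run` has cleared the timer under the lock, no producer will append to that slice again, so sorting it outside the mutex is safe.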