[#1648] writecache: Fix race condition when reporting cache size metrics

There is a race condition when multiple cache operations try to report the cache size metrics simultaneously. Consider the following example:
- the initial total size of objects stored in the cache is 2
- worker X deletes an object and reads the cache size, which is 1
- worker Y deletes an object and reads the cache size, which is 0
- worker Y reports the cache size it learnt, which is 0
- worker X reports the cache size it learnt, which is 1

As a result, the observed cache size is 1 (i.e. one object remains in the cache), which is incorrect because the actual cache size is 0.

To fix this, a separate worker for reporting the cache size metric has been created. All operations should use a queue (a buffered channel) to request the reporter worker to report the metrics. Currently, all queue writes are non-blocking.

Signed-off-by: Aleksey Savchuk <a.savchuk@yadro.com>
2025-02-18 10:51:43 +03:00 · 2025-02-18 10:51:43 +03:00 · 840fc3a31b
commit 840fc3a31b
parent 9b29e7392f
6 changed files with 3 additions and 11 deletions
--- a/pkg/local_object_storage/writecache/cache.go
+++ b/pkg/local_object_storage/writecache/cache.go
@@ -94,7 +94,6 @@ func (c *cache) Open(_ context.Context, mod mode.Mode) error {
 	if err != nil {
 		return metaerr.Wrap(err)
 	}
-	c.initCounters()
 	return nil
 }

--- a/pkg/local_object_storage/writecache/delete.go
+++ b/pkg/local_object_storage/writecache/delete.go
@@ -52,8 +52,6 @@ func (c *cache) Delete(ctx context.Context, addr oid.Address) error {
 			storagelog.OpField("fstree DELETE"),
 		)
 		deleted = true
-		// counter changed by fstree
-		c.estimateCacheSize()
 	}
 	return metaerr.Wrap(err)
 }
--- a/pkg/local_object_storage/writecache/flush.go
+++ b/pkg/local_object_storage/writecache/flush.go
@@ -87,6 +87,9 @@ func (c *cache) pushToFlushQueue(ctx context.Context, fl *flushLimiter) {
 			}

 			c.modeMtx.RUnlock()
+
+			// counter changed by fstree
+			c.estimateCacheSize()
 		case <-ctx.Done():
 			return
 		}
--- a/pkg/local_object_storage/writecache/put.go
+++ b/pkg/local_object_storage/writecache/put.go
@@ -73,8 +73,6 @@ func (c *cache) putBig(ctx context.Context, prm common.PutPrm) error {
 		storagelog.StorageTypeField(wcStorageType),
 		storagelog.OpField("fstree PUT"),
 	)
-	// counter changed by fstree
-	c.estimateCacheSize()

 	return nil
 }
--- a/pkg/local_object_storage/writecache/state.go
+++ b/pkg/local_object_storage/writecache/state.go
@@ -18,7 +18,3 @@ func (c *cache) hasEnoughSpace(objectSize uint64) bool {
 	}
 	return c.maxCacheSize >= size+objectSize
 }
-
-func (c *cache) initCounters() {
-	c.estimateCacheSize()
-}
--- a/pkg/local_object_storage/writecache/storage.go
+++ b/pkg/local_object_storage/writecache/storage.go
@@ -51,7 +51,5 @@ func (c *cache) deleteFromDisk(ctx context.Context, addr oid.Address, size uint6
 			storagelog.OpField("fstree DELETE"),
 		)
 		c.metrics.Evict(StorageTypeFSTree)
-		// counter changed by fstree
-		c.estimateCacheSize()
 	}
 }