[#2239] writecache: Fix possible deadlock

LRU `Peek`/`Contains` take LRU mutex _inside_ of a `View` transaction. `View` transaction itself takes `mmapLock` [1], which is lifted after tx finishes (in `tx.Commit()` -> `tx.close()` -> `tx.db.removeTx`) When we evict items from LRU cache mutex order is different: first we take LRU mutex and then execute `Batch` which _does_ take `mmapLock` in case we need to remap. Thus the deadlock. [1] 8f4a7e1f92/db.go (L708) Signed-off-by: Evgenii Stratonikov <e.stratonikov@yadro.com>
2023-02-06 16:03:37 +03:00 · 2023-02-06 16:03:37 +03:00 · e0309e398c
commit e0309e398c
parent 58367e4df6
3 changed files with 14 additions and 6 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -30,6 +30,7 @@ Changelog for FrostFS Node
 - Restore subscriptions correctly on morph client switch (#2212)
 - Expired objects could be returned if not marked with GC yet (#2213)
 - `neofs-adm morph dump-hashes` now properly iterates over custom domain (#2224)
+- Possible deadlock in write-cache (#2239)

 ### Removed
 ### Updated
--- a/pkg/local_object_storage/writecache/flush.go
+++ b/pkg/local_object_storage/writecache/flush.go
@ -99,10 +99,6 @@ func (c *cache) flushDB() {
 					lastKey = slice.Copy(k)
 				}

-				if _, ok := c.flushed.Peek(string(k)); ok {
-					continue
-				}
-
 				m = append(m, objectInfo{
 					addr: string(k),
 					data: slice.Copy(v),
@ -111,12 +107,18 @@ func (c *cache) flushDB() {
 			return nil
 		})

+		var count int
 		for i := range m {
+			if c.flushed.Contains(m[i].addr) {
+				continue
+			}
+
 			obj := object.New()
 			if err := obj.Unmarshal(m[i].data); err != nil {
 				continue
 			}

+			count++
 			select {
 			case c.flushCh <- obj:
 			case <-c.closeCh:
@ -125,7 +127,7 @@ func (c *cache) flushDB() {
 			}
 		}

-		if len(m) == 0 {
+		if count == 0 {
 			c.modeMtx.RUnlock()
 			break
 		}
@ -133,7 +135,7 @@ func (c *cache) flushDB() {
 		c.modeMtx.RUnlock()

 		c.log.Debug("tried to flush items from write-cache",
-			zap.Int("count", len(m)),
+			zap.Int("count", count),
 			zap.String("start", base58.Encode(lastKey)))
 	}
 }
--- a/pkg/local_object_storage/writecache/storage.go
+++ b/pkg/local_object_storage/writecache/storage.go
@ -23,6 +23,11 @@ type store struct {
 	maxFlushedMarksCount int
 	maxRemoveBatchSize   int

+	// flushed contains addresses of objects that were already flushed to the main storage.
+	// We use LRU cache instead of map here to facilitate removing of unused object in favour of
+	// frequently read ones.
+	// MUST NOT be used inside bolt db transaction because it's eviction handler
+	// removes untracked items from the database.
 	flushed simplelru.LRUCache[string, bool]
 	db      *bbolt.DB