[#2239] writecache: Fix possible deadlock

LRU `Peek`/`Contains` take the LRU mutex _inside_ of a `View` transaction.
The `View` transaction itself takes `mmapLock` [1], which is released after the tx
finishes (in `tx.Commit()` -> `tx.close()` -> `tx.db.removeTx`).

When we evict items from the LRU cache, the mutex order is different:
first we take the LRU mutex and then execute `Batch`, which _does_ take
`mmapLock` in case we need to remap. Thus the deadlock.

[1] 8f4a7e1f92/db.go (L708)

Signed-off-by: Evgenii Stratonikov <e.stratonikov@yadro.com>
This commit is contained in:
Evgenii Stratonikov 2023-02-06 16:03:37 +03:00 committed by fyrchik
parent 58367e4df6
commit e0309e398c
3 changed files with 14 additions and 6 deletions

View file

@ -30,6 +30,7 @@ Changelog for FrostFS Node
- Restore subscriptions correctly on morph client switch (#2212) - Restore subscriptions correctly on morph client switch (#2212)
- Expired objects could be returned if not marked with GC yet (#2213) - Expired objects could be returned if not marked with GC yet (#2213)
- `neofs-adm morph dump-hashes` now properly iterates over custom domain (#2224) - `neofs-adm morph dump-hashes` now properly iterates over custom domain (#2224)
- Possible deadlock in write-cache (#2239)
### Removed ### Removed
### Updated ### Updated

View file

@ -99,10 +99,6 @@ func (c *cache) flushDB() {
lastKey = slice.Copy(k) lastKey = slice.Copy(k)
} }
if _, ok := c.flushed.Peek(string(k)); ok {
continue
}
m = append(m, objectInfo{ m = append(m, objectInfo{
addr: string(k), addr: string(k),
data: slice.Copy(v), data: slice.Copy(v),
@ -111,12 +107,18 @@ func (c *cache) flushDB() {
return nil return nil
}) })
var count int
for i := range m { for i := range m {
if c.flushed.Contains(m[i].addr) {
continue
}
obj := object.New() obj := object.New()
if err := obj.Unmarshal(m[i].data); err != nil { if err := obj.Unmarshal(m[i].data); err != nil {
continue continue
} }
count++
select { select {
case c.flushCh <- obj: case c.flushCh <- obj:
case <-c.closeCh: case <-c.closeCh:
@ -125,7 +127,7 @@ func (c *cache) flushDB() {
} }
} }
if len(m) == 0 { if count == 0 {
c.modeMtx.RUnlock() c.modeMtx.RUnlock()
break break
} }
@ -133,7 +135,7 @@ func (c *cache) flushDB() {
c.modeMtx.RUnlock() c.modeMtx.RUnlock()
c.log.Debug("tried to flush items from write-cache", c.log.Debug("tried to flush items from write-cache",
zap.Int("count", len(m)), zap.Int("count", count),
zap.String("start", base58.Encode(lastKey))) zap.String("start", base58.Encode(lastKey)))
} }
} }

View file

@ -23,6 +23,11 @@ type store struct {
maxFlushedMarksCount int maxFlushedMarksCount int
maxRemoveBatchSize int maxRemoveBatchSize int
	// flushed contains addresses of objects that were already flushed to the main storage.
	// We use an LRU cache instead of a map here to facilitate removal of unused objects in
	// favour of frequently read ones.
	// MUST NOT be used inside a bolt db transaction because its eviction handler
	// removes untracked items from the database.
flushed simplelru.LRUCache[string, bool] flushed simplelru.LRUCache[string, bool]
db *bbolt.DB db *bbolt.DB