[#2239] writecache: Fix possible deadlock

LRU `Peek`/`Contains` take LRU mutex _inside_ of a `View` transaction.
`View` transaction itself takes `mmapLock` [1], which is lifted after tx
finishes (in `tx.Commit()` -> `tx.close()` -> `tx.db.removeTx`)

When we evict items from LRU cache mutex order is different:
first we take LRU mutex and then execute `Batch` which _does_ take
`mmapLock` in case we need to remap. Thus the deadlock.

[1] 8f4a7e1f92/db.go (L708)

Signed-off-by: Evgenii Stratonikov <e.stratonikov@yadro.com>
KirillovDenis/poc/impersonate
Evgenii Stratonikov 2023-02-06 16:03:37 +03:00 committed by fyrchik
parent 58367e4df6
commit e0309e398c
3 changed files with 14 additions and 6 deletions

View File

@ -30,6 +30,7 @@ Changelog for FrostFS Node
- Restore subscriptions correctly on morph client switch (#2212)
- Expired objects could be returned if not marked with GC yet (#2213)
- `neofs-adm morph dump-hashes` now properly iterates over custom domain (#2224)
- Possible deadlock in write-cache (#2239)
### Removed
### Updated

View File

@ -99,10 +99,6 @@ func (c *cache) flushDB() {
lastKey = slice.Copy(k)
}
if _, ok := c.flushed.Peek(string(k)); ok {
continue
}
m = append(m, objectInfo{
addr: string(k),
data: slice.Copy(v),
@ -111,12 +107,18 @@ func (c *cache) flushDB() {
return nil
})
var count int
for i := range m {
if c.flushed.Contains(m[i].addr) {
continue
}
obj := object.New()
if err := obj.Unmarshal(m[i].data); err != nil {
continue
}
count++
select {
case c.flushCh <- obj:
case <-c.closeCh:
@ -125,7 +127,7 @@ func (c *cache) flushDB() {
}
}
if len(m) == 0 {
if count == 0 {
c.modeMtx.RUnlock()
break
}
@ -133,7 +135,7 @@ func (c *cache) flushDB() {
c.modeMtx.RUnlock()
c.log.Debug("tried to flush items from write-cache",
zap.Int("count", len(m)),
zap.Int("count", count),
zap.String("start", base58.Encode(lastKey)))
}
}

View File

@ -23,6 +23,11 @@ type store struct {
maxFlushedMarksCount int
maxRemoveBatchSize int
// flushed contains addresses of objects that were already flushed to the main storage.
// We use LRU cache instead of map here to facilitate removing of unused object in favour of
// frequently read ones.
// MUST NOT be used inside bolt db transaction because it's eviction handler
// removes untracked items from the database.
flushed simplelru.LRUCache[string, bool]
db *bbolt.DB