[#2239] writecache: Fix possible deadlock
LRU `Peek`/`Contains` take the LRU mutex _inside_ a `View` transaction.
The `View` transaction itself takes `mmapLock` [1], which is released only
after the transaction finishes (in `tx.Commit()` -> `tx.close()` ->
`tx.db.removeTx`).

When we evict items from the LRU cache, the lock order is reversed: first
we take the LRU mutex and then execute `Batch`, which _does_ take
`mmapLock` when the data file needs to be remapped. Hence the deadlock.
[1] 8f4a7e1f92/db.go (L708)
Signed-off-by: Evgenii Stratonikov <e.stratonikov@yadro.com>
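
To make the inverted lock order concrete, here is a minimal, self-contained sketch (plain mutexes stand in for the LRU mutex and bbolt's `mmapLock`; this is not the write-cache code) of the same ABBA pattern — each path takes the two locks in the opposite order, so the goroutines block each other:

```go
package main

import (
	"sync"
	"time"
)

func main() {
	var lruMu sync.Mutex  // stands in for the LRU cache mutex
	var mmapMu sync.Mutex // stands in for bbolt's mmapLock

	var wg sync.WaitGroup
	wg.Add(2)

	// Flush path: View transaction (mmapLock) -> flushed.Peek (LRU mutex).
	go func() {
		defer wg.Done()
		mmapMu.Lock()
		time.Sleep(10 * time.Millisecond) // let the other path grab lruMu
		lruMu.Lock()                      // blocks: the eviction path holds lruMu
		lruMu.Unlock()
		mmapMu.Unlock()
	}()

	// Eviction path: LRU mutex -> Batch that needs mmapLock to remap.
	go func() {
		defer wg.Done()
		lruMu.Lock()
		time.Sleep(10 * time.Millisecond) // let the other path grab mmapMu
		mmapMu.Lock()                     // blocks: the flush path holds mmapMu
		mmapMu.Unlock()
		lruMu.Unlock()
	}()

	// Never returns; the Go runtime reports
	// "fatal error: all goroutines are asleep - deadlock!".
	wg.Wait()
}
```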
parent 58367e4df6
commit e0309e398c

3 changed files with 14 additions and 6 deletions
@@ -30,6 +30,7 @@ Changelog for FrostFS Node
 - Restore subscriptions correctly on morph client switch (#2212)
 - Expired objects could be returned if not marked with GC yet (#2213)
 - `neofs-adm morph dump-hashes` now properly iterates over custom domain (#2224)
+- Possible deadlock in write-cache (#2239)
 
 ### Removed
 ### Updated
@@ -99,10 +99,6 @@ func (c *cache) flushDB() {
 				lastKey = slice.Copy(k)
 			}
 
-			if _, ok := c.flushed.Peek(string(k)); ok {
-				continue
-			}
-
 			m = append(m, objectInfo{
 				addr: string(k),
 				data: slice.Copy(v),
@@ -111,12 +107,18 @@ func (c *cache) flushDB() {
 			return nil
 		})
 
+		var count int
 		for i := range m {
+			if c.flushed.Contains(m[i].addr) {
+				continue
+			}
+
 			obj := object.New()
 			if err := obj.Unmarshal(m[i].data); err != nil {
 				continue
 			}
 
+			count++
 			select {
 			case c.flushCh <- obj:
 			case <-c.closeCh:
@@ -125,7 +127,7 @@ func (c *cache) flushDB() {
 			}
 		}
 
-		if len(m) == 0 {
+		if count == 0 {
 			c.modeMtx.RUnlock()
 			break
 		}
@@ -133,7 +135,7 @@ func (c *cache) flushDB() {
 		c.modeMtx.RUnlock()
 
 		c.log.Debug("tried to flush items from write-cache",
-			zap.Int("count", len(m)),
+			zap.Int("count", count),
 			zap.String("start", base58.Encode(lastKey)))
 	}
 }
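
Taken together, the `flushDB` hunks above move the `flushed` check out of the bolt transaction: the `View` callback only copies candidate keys and values into `m`, `Contains` is called afterwards on the copied data, and `count` (rather than `len(m)`) now tells whether anything was actually queued. A rough, self-contained sketch of this "copy inside the transaction, check the LRU outside" pattern (bucket and variable names are assumptions, not the frostfs-node code):

```go
package main

import (
	"fmt"
	"log"

	lru "github.com/hashicorp/golang-lru/v2"
	"go.etcd.io/bbolt"
)

func main() {
	db, err := bbolt.Open("/tmp/wc-pattern.db", 0o600, nil)
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	bucket := []byte("objects") // assumed bucket name
	if err := db.Update(func(tx *bbolt.Tx) error {
		b, err := tx.CreateBucketIfNotExists(bucket)
		if err != nil {
			return err
		}
		return b.Put([]byte("addr-1"), []byte("payload"))
	}); err != nil {
		log.Fatal(err)
	}

	flushed, err := lru.New[string, bool](128)
	if err != nil {
		log.Fatal(err)
	}
	flushed.Add("addr-0", true) // pretend addr-0 was flushed earlier

	// Step 1: inside the View transaction, only copy data out. No LRU calls
	// here, so the LRU mutex is never requested while mmapLock is held.
	type objectInfo struct{ addr, data string }
	var m []objectInfo
	_ = db.View(func(tx *bbolt.Tx) error {
		return tx.Bucket(bucket).ForEach(func(k, v []byte) error {
			m = append(m, objectInfo{addr: string(k), data: string(v)})
			return nil
		})
	})

	// Step 2: consult the LRU only after the transaction has closed.
	var count int
	for i := range m {
		if flushed.Contains(m[i].addr) {
			continue
		}
		count++
		fmt.Println("would flush", m[i].addr)
	}
	fmt.Println("queued:", count)
}
```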
@@ -23,6 +23,11 @@ type store struct {
 	maxFlushedMarksCount int
 	maxRemoveBatchSize   int
 
+	// flushed contains addresses of objects that were already flushed to the main storage.
+	// We use LRU cache instead of map here to facilitate removing of unused object in favour of
+	// frequently read ones.
+	// MUST NOT be used inside bolt db transaction because it's eviction handler
+	// removes untracked items from the database.
 	flushed simplelru.LRUCache[string, bool]
 	db      *bbolt.DB
 
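
The restriction documented in the new comment exists because the eviction handler of the `flushed` cache deletes evicted addresses from the database. Below is a minimal sketch of that wiring — a hypothetical mutex-guarded wrapper around `simplelru` with assumed bucket and helper names, not the actual frostfs-node constructor: the handler runs while the cache mutex is held and opens a `Batch` transaction, so calling `Peek`/`Contains` from inside an open bolt transaction would recreate the ABBA ordering described in the commit message.

```go
package main

import (
	"log"
	"sync"

	"github.com/hashicorp/golang-lru/v2/simplelru"
	"go.etcd.io/bbolt"
)

// flushedCache is a hypothetical wrapper for illustration: a mutex-guarded LRU
// whose eviction handler deletes the evicted address from bbolt.
type flushedCache struct {
	mu  sync.Mutex
	lru *simplelru.LRU[string, bool]
}

func newFlushedCache(db *bbolt.DB, bucket []byte, size int) (*flushedCache, error) {
	c := &flushedCache{}
	l, err := simplelru.NewLRU[string, bool](size, func(addr string, _ bool) {
		// Called from Add while c.mu is held; Batch may need bbolt's mmapLock.
		_ = db.Batch(func(tx *bbolt.Tx) error {
			return tx.Bucket(bucket).Delete([]byte(addr))
		})
	})
	if err != nil {
		return nil, err
	}
	c.lru = l
	return c, nil
}

func (c *flushedCache) Add(addr string) {
	c.mu.Lock()
	defer c.mu.Unlock()
	c.lru.Add(addr, true) // may evict -> Batch -> mmapLock, all while mu is held
}

// Contains takes the same mutex, so calling it from inside an open bolt
// transaction (which already holds mmapLock) risks the deadlock above.
func (c *flushedCache) Contains(addr string) bool {
	c.mu.Lock()
	defer c.mu.Unlock()
	return c.lru.Contains(addr)
}

func main() {
	db, err := bbolt.Open("/tmp/flushed-sketch.db", 0o600, nil)
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	bucket := []byte("objects") // assumed bucket name
	if err := db.Update(func(tx *bbolt.Tx) error {
		_, err := tx.CreateBucketIfNotExists(bucket)
		return err
	}); err != nil {
		log.Fatal(err)
	}

	cache, err := newFlushedCache(db, bucket, 2)
	if err != nil {
		log.Fatal(err)
	}
	cache.Add("addr-1")
	cache.Add("addr-2")
	cache.Add("addr-3") // evicts addr-1 and deletes it from bbolt via Batch
	log.Println("addr-2 marked as flushed:", cache.Contains("addr-2"))
}
```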