[#2239] writecache: Fix possible deadlock
LRU `Peek`/`Contains` take the LRU mutex _inside_ of a `View` transaction.
The `View` transaction itself takes `mmapLock` [1], which is released only after
the transaction finishes (in `tx.Commit()` -> `tx.close()` -> `tx.db.removeTx`).

When we evict items from the LRU cache, the mutex order is reversed:
first we take the LRU mutex and then execute `Batch`, which _does_ take
`mmapLock` when a remap is needed. Thus the deadlock.

[1] 8f4a7e1f92/db.go (L708)
Signed-off-by: Evgenii Stratonikov <e.stratonikov@yadro.com>
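To make the lock-order inversion concrete, here is a minimal sketch of the two paths, assuming a bbolt `db` and the `flushed` LRU from the store ("LRU mutex" below means the lock guarding `flushed`, as described above); `pathA`/`pathB` are hypothetical helpers, not code from this commit:

```go
package sketch

import (
	"github.com/hashicorp/golang-lru/v2/simplelru"
	"go.etcd.io/bbolt"
)

// Path A (flush): View holds mmapLock until the tx closes; Peek then
// acquires the LRU mutex while mmapLock is still held.
func pathA(db *bbolt.DB, flushed simplelru.LRUCache[string, bool]) error {
	return db.View(func(tx *bbolt.Tx) error {
		_, _ = flushed.Peek("addr") // LRU mutex taken under mmapLock
		return nil
	})
}

// Path B (eviction): the LRU mutex is held while items are evicted; the
// eviction handler calls db.Batch, which takes mmapLock exclusively if the
// data file must be remapped. Path A orders mmapLock -> LRU mutex, path B
// orders LRU mutex -> mmapLock: run concurrently, the two can deadlock.
func pathB(flushed simplelru.LRUCache[string, bool]) {
	flushed.Add("addr", true) // may trigger the evict handler -> db.Batch
}
```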
parent 58367e4df6
commit e0309e398c
3 changed files with 14 additions and 6 deletions
```diff
@@ -30,6 +30,7 @@ Changelog for FrostFS Node
 - Restore subscriptions correctly on morph client switch (#2212)
 - Expired objects could be returned if not marked with GC yet (#2213)
 - `neofs-adm morph dump-hashes` now properly iterates over custom domain (#2224)
+- Possible deadlock in write-cache (#2239)
 
 ### Removed
 ### Updated
```
```diff
@@ -99,10 +99,6 @@ func (c *cache) flushDB() {
 					lastKey = slice.Copy(k)
 				}
 
-				if _, ok := c.flushed.Peek(string(k)); ok {
-					continue
-				}
-
 				m = append(m, objectInfo{
 					addr: string(k),
 					data: slice.Copy(v),
@@ -111,12 +107,18 @@ func (c *cache) flushDB() {
 			return nil
 		})
 
+		var count int
 		for i := range m {
+			if c.flushed.Contains(m[i].addr) {
+				continue
+			}
+
 			obj := object.New()
 			if err := obj.Unmarshal(m[i].data); err != nil {
 				continue
 			}
 
+			count++
 			select {
 			case c.flushCh <- obj:
 			case <-c.closeCh:
@@ -125,7 +127,7 @@ func (c *cache) flushDB() {
 			}
 		}
 
-		if len(m) == 0 {
+		if count == 0 {
 			c.modeMtx.RUnlock()
 			break
 		}
@@ -133,7 +135,7 @@ func (c *cache) flushDB() {
 		c.modeMtx.RUnlock()
 
 		c.log.Debug("tried to flush items from write-cache",
-			zap.Int("count", len(m)),
+			zap.Int("count", count),
 			zap.String("start", base58.Encode(lastKey)))
 	}
 }
```
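The shape of the fix in one place: copy everything needed out of the `View` transaction first, then consult the LRU only after the transaction has closed and released `mmapLock`. A self-contained sketch of that pattern (function and bucket name are illustrative, not the repository's code):

```go
package sketch

import (
	"github.com/hashicorp/golang-lru/v2/simplelru"
	"go.etcd.io/bbolt"
)

type entry struct{ k, v []byte }

// collectThenFilter mirrors the fixed flushDB flow: no LRU calls inside the
// transaction; Contains is checked only after View has returned.
func collectThenFilter(db *bbolt.DB, flushed simplelru.LRUCache[string, bool]) ([][]byte, error) {
	var m []entry
	err := db.View(func(tx *bbolt.Tx) error {
		b := tx.Bucket([]byte("objects")) // illustrative bucket name
		if b == nil {
			return nil
		}
		return b.ForEach(func(k, v []byte) error {
			// k and v are only valid inside the tx, so copy them out.
			m = append(m, entry{
				k: append([]byte(nil), k...),
				v: append([]byte(nil), v...),
			})
			return nil
		})
	}) // tx closed here: mmapLock is released before any LRU access
	if err != nil {
		return nil, err
	}

	var out [][]byte
	for i := range m {
		if flushed.Contains(string(m[i].k)) { // LRU mutex taken outside the tx
			continue
		}
		out = append(out, m[i].v)
	}
	return out, nil
}
```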
```diff
@@ -23,6 +23,11 @@ type store struct {
 	maxFlushedMarksCount int
 	maxRemoveBatchSize   int
 
+	// flushed contains addresses of objects that were already flushed to the main storage.
+	// We use an LRU cache instead of a map here to facilitate removal of unused objects in
+	// favour of frequently read ones.
+	// MUST NOT be used inside a bolt db transaction because its eviction handler
+	// removes untracked items from the database.
 	flushed simplelru.LRUCache[string, bool]
 	db      *bbolt.DB
 
```
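For context, the "MUST NOT" note above exists because eviction itself writes to the database. An assumed-shape sketch of how such a cache could be wired up (the handler body and bucket name are illustrative, not the repository's code):

```go
package sketch

import (
	"github.com/hashicorp/golang-lru/v2/simplelru"
	"go.etcd.io/bbolt"
)

// newFlushedCache builds an LRU whose eviction handler removes the evicted
// key from bbolt. The handler runs while the LRU mutex is held, and Batch
// may take mmapLock to remap the data file; hence the cache must never be
// touched from inside a bolt transaction.
func newFlushedCache(db *bbolt.DB, size int) (simplelru.LRUCache[string, bool], error) {
	return simplelru.NewLRU[string, bool](size, func(key string, _ bool) {
		_ = db.Batch(func(tx *bbolt.Tx) error {
			b := tx.Bucket([]byte("objects")) // illustrative bucket name
			if b == nil {
				return nil
			}
			return b.Delete([]byte(key))
		})
	})
}
```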