From 5f1d5a1897d101ab4a4a2cd51b51d0362ec9978a Mon Sep 17 00:00:00 2001 From: Leo Luan Date: Tue, 15 Sep 2020 01:32:16 -0700 Subject: [PATCH] vfs: Fix a deadlock vulnerability in downloaders.Close The downloaders.Close() call acquires the downloaders' mutex before calling the wait group wait and the main downloaders thread has a periodical (5 seconds interval) call to kick its waiters and the waiter dispatch function tries to get the mutex. So a deadlock can occur if the Close() call starts, gets the mutex, while the main downloader thread already got the timer's tick and proceeded to call kickWaiters. The deadlock happens when the Close call gets the mutex between the timer's kick and the main downloader thread gets the mutex first. So it's a pretty short period of time and it probably explains why the problem has not surfaced, maybe something like tens of nanoseconds out of 5 seconds (~10^^-8). It took 5 days of continued stressing the Close calls for the deadlock to appear. --- vfs/vfscache/downloaders/downloaders.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vfs/vfscache/downloaders/downloaders.go b/vfs/vfscache/downloaders/downloaders.go index b7cde1d9a..6ed57f6d1 100644 --- a/vfs/vfscache/downloaders/downloaders.go +++ b/vfs/vfscache/downloaders/downloaders.go @@ -230,7 +230,11 @@ func (dls *Downloaders) Close(inErr error) (err error) { } } dls.cancel() + // dls may have entered the periodical (every 5 seconds) kickWaiters() call + // unlock the mutex to allow it to finish so that we can get its dls.wg.Done() + dls.mu.Unlock() dls.wg.Wait() + dls.mu.Lock() dls.dls = nil dls._dispatchWaiters() dls._closeWaiters(inErr)