grpclb: fix deadlock in grpclb connection cache (#3017)

Before the fix, if the timer to remove a SubConn fired at the same time as
NewSubConn cancelled the timer, the timer callback returned without releasing
the mutex, which caused a deadlock.
This commit is contained in:
Menghan Li
2019-09-10 12:40:48 -07:00
committed by GitHub
parent 4ccf24ac5d
commit ac35b67779
2 changed files with 44 additions and 1 deletions

View File

@ -173,13 +173,13 @@ func (ccc *lbCacheClientConn) RemoveSubConn(sc balancer.SubConn) {
timer := time.AfterFunc(ccc.timeout, func() {
ccc.mu.Lock()
defer ccc.mu.Unlock()
if entry.abortDeleting {
return
}
ccc.cc.RemoveSubConn(sc)
delete(ccc.subConnToAddr, sc)
delete(ccc.subConnCache, addr)
ccc.mu.Unlock()
})
entry.cancel = func() {
if !timer.Stop() {

View File

@ -217,3 +217,46 @@ func TestLBCacheClientConnReuse(t *testing.T) {
t.Fatal(err)
}
}
// TestLBCache_RemoveTimer_New_Race tests that if the timer to remove a SubConn
// fires at the same time NewSubConn cancels the timer, it doesn't cause
// deadlock.
func TestLBCache_RemoveTimer_New_Race(t *testing.T) {
	mcc := newMockClientConn()
	if err := checkMockCC(mcc, 0); err != nil {
		t.Fatal(err)
	}
	ccc := newLBCacheClientConn(mcc)
	// Use the shortest possible timeout so that the removal timer fires
	// essentially as soon as RemoveSubConn schedules it.
	ccc.timeout = time.Nanosecond
	if err := checkCacheCC(ccc, 0, 0); err != nil {
		t.Fatal(err)
	}

	sc, err := ccc.NewSubConn([]resolver.Address{{Addr: "address1"}}, balancer.NewSubConnOptions{})
	if err != nil {
		t.Fatal(err)
	}
	// One subconn in MockCC.
	if err := checkMockCC(mcc, 1); err != nil {
		t.Fatal(err)
	}
	// No subconn being deleted, and one in CacheCC.
	if err := checkCacheCC(ccc, 0, 1); err != nil {
		t.Fatal(err)
	}

	// The channel is buffered so the goroutine can always deliver its result
	// and exit, even if the test has already given up waiting. Errors are
	// reported through the channel because t.Fatal must only be called from
	// the goroutine running the test function.
	result := make(chan error, 1)
	go func() {
		for i := 0; i < 1000; i++ {
			// Remove starts a timer with 1 ns timeout, the NewSubConn will
			// race with the timer.
			ccc.RemoveSubConn(sc)
			newSC, err := ccc.NewSubConn([]resolver.Address{{Addr: "address1"}}, balancer.NewSubConnOptions{})
			if err != nil {
				result <- err
				return
			}
			sc = newSC
		}
		result <- nil
	}()

	select {
	case <-time.After(time.Second):
		t.Fatal("Test didn't finish within 1 second. Deadlock")
	case err := <-result:
		if err != nil {
			t.Fatal(err)
		}
	}
}