
Commit 2749ad4

Add back-off controller for reconnection sleep time when a connection loss is detected immediately after connecting. eclipse-paho#589
Signed-off-by: Daichi Tomaru <[email protected]>
1 parent 4b066a0 commit 2749ad4

3 files changed: +141 -8 lines changed

backoff.go

+75
@@ -0,0 +1,75 @@
/*
 * Copyright (c) 2021 IBM Corp and others.
 *
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v2.0
 * and Eclipse Distribution License v1.0 which accompany this distribution.
 *
 * The Eclipse Public License is available at
 *    https://www.eclipse.org/legal/epl-2.0/
 * and the Eclipse Distribution License is available at
 *    http://www.eclipse.org/org/documents/edl-v10.php.
 *
 * Contributors:
 *    Matt Brittan
 *    Daichi Tomaru
 */

package mqtt

import (
	"sync"
	"time"
)

// Controller for the sleep period, with backoff, when a reconnection attempt or a connection loss occurs.
// It keeps a status for each situation that causes a retry.
type backoffController struct {
	sync.RWMutex
	statusMap map[string]*backoffStatus
}

type backoffStatus struct {
	lastSleepPeriod time.Duration
	lastErrorTime   time.Time
}

func newBackoffController() *backoffController {
	return &backoffController{
		statusMap: map[string]*backoffStatus{},
	}
}

// Calculate the next sleep period from the initial and maximum periods and the elapsed time since the last error.
// The returned values are the next sleep period and whether the error situation is continual.
// If connection errors occur continuously, the sleep period is increased exponentially.
// If a lot of time has passed since the last error, the sleep period is reset to the initial value.
func (b *backoffController) getBackoffSleepTime(
	initSleepPeriod time.Duration, maxSleepPeriod time.Duration, situation string, processTime time.Duration,
) (time.Duration, bool) {
	b.Lock()
	defer b.Unlock()

	status, exist := b.statusMap[situation]
	if !exist {
		b.statusMap[situation] = &backoffStatus{initSleepPeriod, time.Now()}
		return initSleepPeriod, false
	}

	oldTime := status.lastErrorTime
	status.lastErrorTime = time.Now()

	// When a lot of time has passed since the last error, the sleep period is reset.
	if status.lastErrorTime.Sub(oldTime) > (processTime*2 + status.lastSleepPeriod) {
		status.lastSleepPeriod = initSleepPeriod
		return initSleepPeriod, false
	}

	if nextSleepPeriod := status.lastSleepPeriod * 2; nextSleepPeriod <= maxSleepPeriod {
		status.lastSleepPeriod = nextSleepPeriod
	} else {
		status.lastSleepPeriod = maxSleepPeriod
	}

	return status.lastSleepPeriod, true
}
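To illustrate the calling pattern, here is a minimal sketch of a retry loop driving the controller. It is not part of this commit: connectWithRetry and doConnect are hypothetical names, the 10-second/2-minute bounds are arbitrary, and the sketch assumes it lives inside package mqtt because backoffController is unexported.

// Hypothetical helper (illustration only, not from this commit): retry doConnect
// until it succeeds, sleeping between attempts according to the backoff controller.
func connectWithRetry(b *backoffController, doConnect func() error) {
	for {
		if err := doConnect(); err == nil {
			return // connected
		}
		// The first failure returns the initial period (10s); consecutive failures
		// return 20s, 40s, ... capped at 2 minutes. After a long error-free gap
		// (more than processTime*2 + lastSleepPeriod) the period resets to 10s.
		sleep, _ := b.getBackoffSleepTime(10*time.Second, 2*time.Minute, "example-retry", 5*time.Second)
		time.Sleep(sleep)
	}
}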

backoff_test.go

+54
@@ -0,0 +1,54 @@
/*
 * Copyright (c) 2021 IBM Corp and others.
 *
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v2.0
 * and Eclipse Distribution License v1.0 which accompany this distribution.
 *
 * The Eclipse Public License is available at
 *    https://www.eclipse.org/legal/epl-2.0/
 * and the Eclipse Distribution License is available at
 *    http://www.eclipse.org/org/documents/edl-v10.php.
 *
 * Contributors:
 *    Matt Brittan
 *    Daichi Tomaru
 */

package mqtt

import (
	"testing"
	"time"
)

func TestGetBackoffSleepTime(t *testing.T) {
	// Test adding a new situation
	controller := newBackoffController()
	if s, c := controller.getBackoffSleepTime(1*time.Second, 5*time.Second, "not-exist", 1*time.Second); !((s == 1*time.Second) && !c) {
		t.Errorf("When a new situation is added, the period should be initSleepPeriod and it should not be a continual error. s:%d c:%t", s, c)
	}

	// Test a continual error in the same situation and the capping of the sleep period at maxSleepPeriod
	controller.getBackoffSleepTime(10*time.Second, 30*time.Second, "multi", 1*time.Second)
	if s, c := controller.getBackoffSleepTime(10*time.Second, 30*time.Second, "multi", 1*time.Second); !((s == 20*time.Second) && c) {
		t.Errorf("When the same situation is reported again, the period should be increased and it should be regarded as a continual error. s:%d c:%t", s, c)
	}
	if s, c := controller.getBackoffSleepTime(10*time.Second, 30*time.Second, "multi", 1*time.Second); !((s == 30*time.Second) && c) {
		t.Errorf("The same situation is reported three times. 10 * 2 * 2 = 40 but maxSleepPeriod is 30, so the next period should be 30. s:%d c:%t", s, c)
	}

	// Test the reset by elapsed time: sleeping 1*2 + 1*2 + 1 = 5s exceeds processTime*2 + lastSleepPeriod = 4s.
	controller.getBackoffSleepTime(1*time.Second, 128*time.Second, "elapsed", 1*time.Second)
	controller.getBackoffSleepTime(1*time.Second, 128*time.Second, "elapsed", 1*time.Second)
	time.Sleep((1*2 + 1*2 + 1) * time.Second)
	if s, c := controller.getBackoffSleepTime(1*time.Second, 128*time.Second, "elapsed", 1*time.Second); !((s == 1*time.Second) && !c) {
		t.Errorf("The reset should be triggered by elapsed time. s:%d c:%t", s, c)
	}

	// Test when the initial and maximum periods are the same.
	controller.getBackoffSleepTime(1*time.Second, 1*time.Second, "same", 1*time.Second)
	if s, c := controller.getBackoffSleepTime(1*time.Second, 1*time.Second, "same", 1*time.Second); !((s == 1*time.Second) && c) {
		t.Errorf("Sleep time should always be 1. s:%d c:%t", s, c)
	}
}

client.go

+12 -8
@@ -141,6 +141,8 @@ type client struct {
 	stop         chan struct{}  // Closed to request that workers stop
 	workers      sync.WaitGroup // used to wait for workers to complete (ping, keepalive, errwatch, resume)
 	commsStopped chan struct{}  // closed when the comms routines have stopped (kept running until after workers have closed to avoid deadlocks)
+
+	backoff *backoffController
 }

 // NewClient will create an MQTT v3.1.1 client with all of the options specified
@@ -169,6 +171,7 @@ func NewClient(o *ClientOptions) Client {
 	c.msgRouter.setDefaultHandler(c.options.DefaultPublishHandler)
 	c.obound = make(chan *PacketAndToken)
 	c.oboundP = make(chan *PacketAndToken)
+	c.backoff = newBackoffController()
 	return c
 }

@@ -302,10 +305,17 @@ func (c *client) Connect() Token {
 func (c *client) reconnect(connectionUp connCompletedFn) {
 	DEBUG.Println(CLI, "enter reconnect")
 	var (
-		sleep = 1 * time.Second
+		initSleep = 1 * time.Second
 		conn      net.Conn
 	)

+	// If the reason for the connection loss is the same as the previous one, a sleep is applied before the connection attempt starts.
+	// The sleep time is increased exponentially while the same situation continues.
+	if slp, isSameErr := c.backoff.getBackoffSleepTime(initSleep, c.options.MaxReconnectInterval, "connectionLost", 5*time.Second); isSameErr {
+		DEBUG.Println(CLI, "Detected continual connection loss after reconnect, sleeping for", int(slp.Seconds()), "seconds")
+		time.Sleep(slp)
+	}
+
 	for {
 		if nil != c.options.OnReconnecting {
 			c.options.OnReconnecting(c, &c.options)
@@ -315,15 +325,9 @@ func (c *client) reconnect(connectionUp connCompletedFn) {
 		if err == nil {
 			break
 		}
+		sleep, _ := c.backoff.getBackoffSleepTime(initSleep, c.options.MaxReconnectInterval, "attemptReconnection", c.options.ConnectTimeout)
 		DEBUG.Println(CLI, "Reconnect failed, sleeping for", int(sleep.Seconds()), "seconds:", err)
 		time.Sleep(sleep)
-		if sleep < c.options.MaxReconnectInterval {
-			sleep *= 2
-		}
-
-		if sleep > c.options.MaxReconnectInterval {
-			sleep = c.options.MaxReconnectInterval
-		}

 		if c.status.ConnectionStatus() != reconnecting { // Disconnect may have been called
 			if err := connectionUp(false); err != nil { // Should always return an error

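For reference, both backoff situations introduced in reconnect are bounded by existing client options: MaxReconnectInterval is passed as the maximum sleep period for "connectionLost" and "attemptReconnection", and ConnectTimeout is passed as processTime for "attemptReconnection". Below is a minimal sketch of setting those options; the broker URL and values are placeholders and are not part of this commit.

package main

import (
	"time"

	mqtt "github.com/eclipse/paho.mqtt.golang"
)

func main() {
	// Placeholder broker and values, shown only to indicate which options feed the backoff controller.
	opts := mqtt.NewClientOptions().
		AddBroker("tcp://broker.example.com:1883").
		SetAutoReconnect(true).
		SetMaxReconnectInterval(30 * time.Second). // upper bound for both backoff situations
		SetConnectTimeout(5 * time.Second)         // used as processTime for "attemptReconnection"
	client := mqtt.NewClient(opts)
	_ = client
}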