Skip to content
This repository was archived by the owner on Nov 7, 2023. It is now read-only.

Commit c814dd1

Browse files
authored
Merge pull request #160 from cloudflare/repair-exp-backoff
add exponential decay to repair delay
2 parents 33e5bfb + 6617155 commit c814dd1

File tree

4 files changed

+103
-25
lines changed

4 files changed

+103
-25
lines changed

cmd/argot/main.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ func main() {
5454
connlimit := couple.Flag("connection-limit", "profiling bind address").Default("512").Int()
5555
repairdelay := couple.Flag("repair-delay", "period between tunnel repair attempts").Default(argotunnel.RepairDelayDefault.String()).Duration()
5656
repairjitter := couple.Flag("repair-jitter", "linear jitter as a fraction of repair-delay").Default(strconv.FormatFloat(argotunnel.RepairJitterDefault, 'E', -1, 64)).Float64()
57+
repairsteps := couple.Flag("repair-steps", "number of exponential steps used during tunnel repair").Default(strconv.FormatUint(argotunnel.RepairStepsDefault, 10)).Uint()
5758
resyncperiod := couple.Flag("resync-period", "period between synchronization attempts").Default(argotunnel.ResyncPeriodDefault.String()).Duration()
5859
taglimit := couple.Flag("tag-limit", "number of tags allowed per tunnel").Default(strconv.Itoa(argotunnel.TagLimitDefault)).Int()
5960
transportlogenable := couple.Flag("transport-log-enable", "enable transport logging").Bool()
@@ -176,7 +177,7 @@ func main() {
176177
}
177178

178179
argotunnel.EnableMetrics(5 * time.Second)
179-
argotunnel.SetRepairBackoff(*repairdelay, *repairjitter)
180+
argotunnel.SetRepairBackoff(*repairdelay, *repairjitter, *repairsteps)
180181
argotunnel.SetTagLimit(*taglimit)
181182
argotunnel.SetVersion(version)
182183

hack/tunnel.gosrc

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -196,11 +196,11 @@ func ServeTunnel(
196196
// Treat panics as recoverable errors
197197
defer func() {
198198
if r := recover(); r != nil {
199-
recoverable = true
200-
err = fmt.Errorf("tunnel: %v", r)
201199
config.Logger.WithFields(log.Fields{
202-
"trace": debug.Stack(),
200+
"trace": string(debug.Stack()),
203201
}).Errorf("tunnel runtime panic: %v", r)
202+
err = fmt.Errorf("tunnel: %v", r)
203+
recoverable = true
204204
}
205205
}()
206206

internal/argotunnel/tunnel.go

Lines changed: 37 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"crypto/x509"
77
"encoding/pem"
88
"fmt"
9+
"math/rand"
910
"net"
1011
"net/http"
1112
"reflect"
@@ -17,15 +18,16 @@ import (
1718
"github.com/cloudflare/cloudflared/origin"
1819
"github.com/cloudflare/cloudflared/tunnelrpc/pogs"
1920
"github.com/sirupsen/logrus"
20-
"k8s.io/apimachinery/pkg/util/rand"
21-
"k8s.io/apimachinery/pkg/util/wait"
21+
utilrand "k8s.io/apimachinery/pkg/util/rand"
2222
)
2323

2424
const (
2525
// RepairDelayDefault the default base time to wait between repairs
26-
RepairDelayDefault = 40 * time.Millisecond
26+
RepairDelayDefault = 100 * time.Millisecond
2727
// RepairJitterDefault the default linear jitter applied to the wait on repair
28-
RepairJitterDefault = 1.0
28+
RepairJitterDefault = 0.5
29+
// RepairStepsDefault the default exponential steps used during repair
30+
RepairStepsDefault = 4
2931
// TagLimitDefault the default number of unique tags
3032
TagLimitDefault = 32
3133

@@ -35,17 +37,20 @@ const (
3537
var repairBackoff = struct {
3638
delay time.Duration
3739
jitter float64
40+
steps uint
3841
setRepair sync.Once
3942
}{
4043
delay: RepairDelayDefault,
4144
jitter: RepairJitterDefault,
45+
steps: RepairStepsDefault,
4246
}
4347

4448
// SetRepairBackoff configures the repair backoff used by all tunnels
45-
func SetRepairBackoff(delay time.Duration, jitter float64) {
49+
func SetRepairBackoff(delay time.Duration, jitter float64, steps uint) {
4650
repairBackoff.setRepair.Do(func() {
4751
repairBackoff.delay = delay
4852
repairBackoff.jitter = jitter
53+
repairBackoff.steps = steps
4954
})
5055
}
5156

@@ -90,15 +95,16 @@ type tunnelLink interface {
9095
}
9196

9297
type syncTunnelLink struct {
93-
mu sync.RWMutex
94-
rule tunnelRule
95-
cert []byte
96-
opts tunnelOptions
97-
config *origin.TunnelConfig
98-
errCh chan error
99-
quitCh chan struct{}
100-
stopCh chan struct{}
101-
log *logrus.Logger
98+
mu sync.RWMutex
99+
rule tunnelRule
100+
cert []byte
101+
opts tunnelOptions
102+
config *origin.TunnelConfig
103+
errCh chan error
104+
quitCh chan struct{}
105+
stopCh chan struct{}
106+
repiars uint
107+
log *logrus.Logger
102108
}
103109

104110
func (l *syncTunnelLink) host() string {
@@ -203,7 +209,7 @@ func newLinkTunnelConfig(rule tunnelRule, cert []byte, options tunnelOptions) *o
203209
Retries: options.retries,
204210
HeartbeatInterval: options.heartbeatInterval,
205211
MaxHeartbeats: options.heartbeatCount,
206-
ClientID: rand.String(32),
212+
ClientID: utilrand.String(32),
207213
BuildInfo: origin.GetBuildInfo(),
208214
ReportedVersion: versionConfig.version,
209215
LBPool: options.lbPool,
@@ -220,6 +226,8 @@ func newLinkTunnelConfig(rule tunnelRule, cert []byte, options tunnelOptions) *o
220226
RunFromTerminal: false, // bool
221227
NoChunkedEncoding: options.noChunkedEncoding,
222228
CompressionQuality: options.compressionQuality,
229+
IncidentLookup: origin.NewIncidentLookup(),
230+
CloseConnOnce: &sync.Once{},
223231
}
224232
}
225233

@@ -365,7 +373,7 @@ func repairFunc(l *syncTunnelLink) func() {
365373
}).Errorf("link exited with error (%s) '%v', repairing ...", reflect.TypeOf(err), err)
366374

367375
// linear back-off on runtime error
368-
delay := wait.Jitter(repairBackoff.delay, repairBackoff.jitter)
376+
delay := repairDelay(ll.repiars, repairBackoff.delay, repairBackoff.jitter, repairBackoff.steps)
369377
log.WithFields(logrus.Fields{
370378
"origin": ll.config.OriginUrl,
371379
"hostname": ll.rule.host,
@@ -393,12 +401,24 @@ func repairFunc(l *syncTunnelLink) func() {
393401
}
394402

395403
close(ll.stopCh)
396-
ll.config.ClientID = rand.String(32)
404+
ll.config.ClientID = utilrand.String(32)
397405
ll.stopCh = make(chan struct{})
406+
ll.repiars++
398407
go launchFunc(ll)()
399408
}()
400409
}
401410
}
402411
}
403412
}
404413
}
414+
415+
func repairDelay(step uint, delay time.Duration, jitter float64, steps uint) time.Duration {
416+
d := delay
417+
if steps > 0 {
418+
d = (1 << (step % steps)) * delay
419+
}
420+
if jitter > 0 {
421+
d += time.Duration(rand.Float64() * jitter * float64(delay))
422+
}
423+
return d
424+
}

internal/argotunnel/tunnel_test.go

Lines changed: 61 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -318,32 +318,89 @@ func TestParseTags(t *testing.T) {
318318
func TestSetRepairBackoff(t *testing.T) {
319319
repairDelay := repairBackoff.delay
320320
repairJitter := repairBackoff.jitter
321+
repairSteps := repairBackoff.steps
321322
repairs := []struct {
322323
delay time.Duration
323324
jitter float64
325+
steps uint
324326
}{
325327
{
326328
delay: 1 * time.Millisecond,
327-
jitter: 0.5,
329+
jitter: 0.25,
330+
steps: 1,
328331
},
329332
{
330333
delay: 10 * time.Millisecond,
331-
jitter: .25,
334+
jitter: .125,
335+
steps: 2,
332336
},
333337
{
334338
delay: 100 * time.Millisecond,
335-
jitter: .125,
339+
jitter: .0625,
340+
steps: 3,
336341
},
337342
}
338343

339344
for _, r := range repairs {
340-
SetRepairBackoff(r.delay, r.jitter)
345+
SetRepairBackoff(r.delay, r.jitter, r.steps)
341346
}
342347

343348
assert.Equalf(t, repairs[0].delay, repairBackoff.delay, "test repair delay matches first set")
344349
assert.Equalf(t, repairs[0].jitter, repairBackoff.jitter, "test repair jitter matches first set")
350+
assert.Equalf(t, repairs[0].steps, repairBackoff.steps, "test repair steps matches first set")
345351
assert.NotEqualf(t, repairBackoff.delay, repairDelay, "test repair delay does not match default")
346352
assert.NotEqualf(t, repairBackoff.jitter, repairJitter, "test repair jitter does not match default")
353+
assert.NotEqualf(t, repairBackoff.steps, repairSteps, "test repair steps does not match default")
354+
}
355+
356+
func TestRepairDelay(t *testing.T) {
357+
t.Parallel()
358+
for name, test := range map[string]struct {
359+
step uint
360+
delay time.Duration
361+
jitter float64
362+
steps uint
363+
out time.Duration
364+
}{
365+
"step-0-no-jitter": {
366+
step: 0,
367+
delay: 10 * time.Millisecond,
368+
jitter: 0.0,
369+
steps: 4,
370+
out: 10 * time.Millisecond,
371+
},
372+
"step-0-no-jitter-no-steps": {
373+
step: 0,
374+
delay: 10 * time.Millisecond,
375+
jitter: 0.0,
376+
steps: 0,
377+
out: 10 * time.Millisecond,
378+
},
379+
"step-1-no-jitter": {
380+
step: 1,
381+
delay: 10 * time.Millisecond,
382+
jitter: 0.0,
383+
steps: 4,
384+
out: 20 * time.Millisecond,
385+
},
386+
"step-2-no-jitter": {
387+
step: 2,
388+
delay: 10 * time.Millisecond,
389+
jitter: 0.0,
390+
steps: 4,
391+
out: 40 * time.Millisecond,
392+
},
393+
"step-4-no-jitter": {
394+
step: 4,
395+
delay: 10 * time.Millisecond,
396+
jitter: 0.0,
397+
steps: 4,
398+
out: 10 * time.Millisecond,
399+
},
400+
} {
401+
out := repairDelay(test.step, test.delay, test.jitter, test.steps)
402+
assert.Equalf(t, test.out, out, "test '%s' value mismatch", name)
403+
}
347404
}
348405

349406
func TestSetTagLimit(t *testing.T) {

0 commit comments

Comments
 (0)