Skip to content

Commit b2ef076

Browse files
authored
fix: [2.6] prevent panic in standby mixcoord during shutdown #45859 (#45898)
issue: #45728 pr: #45730 When mixcoord is in standby mode and shutdown is triggered, the ProcessActiveStandBy goroutine may panic if context cancellation occurs. This happens because the error handling didn't check for context.Canceled errors before panicking. Changes: - Add context cancellation check in mix_coord Register() before panic - Check s.ctx.Err() == context.Canceled and gracefully exit - Remove unused ForceActiveStandby() function from session_util This ensures standby mixcoord can shutdown gracefully without panic when context is cancelled during the standby process. Signed-off-by: Wei Liu <[email protected]>
1 parent 5ba7c4e commit b2ef076

File tree

5 files changed

+4
-168
lines changed

5 files changed

+4
-168
lines changed

internal/coordinator/mix_coord.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,10 @@ func (s *mixCoordImpl) Register() error {
121121
if s.enableActiveStandBy {
122122
go func() {
123123
if err := s.session.ProcessActiveStandBy(s.activateFunc); err != nil {
124+
if s.ctx.Err() == context.Canceled {
125+
log.Info("standby process canceled due to server shutdown")
126+
return
127+
}
124128
log.Error("failed to activate standby server", zap.Error(err))
125129
panic(err)
126130
}

internal/util/sessionutil/mock_session.go

Lines changed: 0 additions & 46 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

internal/util/sessionutil/session.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,6 @@ type SessionInterface interface {
4545
Disconnected() bool
4646
SetEnableActiveStandBy(enable bool)
4747
ProcessActiveStandBy(activateFunc func() error) error
48-
ForceActiveStandby(activateFunc func() error) error
4948

5049
GetAddress() string
5150
GetServerID() int64

internal/util/sessionutil/session_util.go

Lines changed: 0 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -1230,73 +1230,6 @@ func (s *Session) ProcessActiveStandBy(activateFunc func() error) error {
12301230
return nil
12311231
}
12321232

1233-
func (s *Session) ForceActiveStandby(activateFunc func() error) error {
1234-
s.activeKey = path.Join(s.metaRoot, DefaultServiceRoot, s.ServerName)
1235-
1236-
// force register to the active_key.
1237-
forceRegisterActiveFn := func() error {
1238-
log.Info(fmt.Sprintf("try to register as ACTIVE %v service...", s.ServerName))
1239-
sessionJSON, err := json.Marshal(s)
1240-
if err != nil {
1241-
log.Error("json marshal error", zap.Error(err))
1242-
return err
1243-
}
1244-
1245-
// try to release old session first
1246-
sessions, _, err := s.GetSessions(s.ServerName)
1247-
if err != nil {
1248-
return err
1249-
}
1250-
1251-
if len(sessions) != 0 {
1252-
activeSess := sessions[s.ServerName]
1253-
if activeSess == nil || activeSess.LeaseID == nil {
1254-
// force delete all old sessions
1255-
s.etcdCli.Delete(s.ctx, s.activeKey)
1256-
for _, sess := range sessions {
1257-
if sess.ServerID != s.ServerID {
1258-
sess.getCompleteKey()
1259-
key := path.Join(s.metaRoot, DefaultServiceRoot, fmt.Sprintf("%s-%d", sess.ServerName, sess.ServerID))
1260-
s.etcdCli.Delete(s.ctx, key)
1261-
}
1262-
}
1263-
} else {
1264-
// force release old active session
1265-
_, _ = s.etcdCli.Revoke(s.ctx, *activeSess.LeaseID)
1266-
}
1267-
}
1268-
1269-
// then try to register as active
1270-
resp, err := s.etcdCli.Txn(s.ctx).If(
1271-
clientv3.Compare(
1272-
clientv3.Version(s.activeKey),
1273-
"=",
1274-
0)).
1275-
Then(clientv3.OpPut(s.activeKey, string(sessionJSON), clientv3.WithLease(*s.LeaseID))).Commit()
1276-
1277-
if err != nil || !resp.Succeeded {
1278-
msg := fmt.Sprintf("failed to force register ACTIVE %s", s.ServerName)
1279-
log.Error(msg, zap.Error(err), zap.Any("resp", resp))
1280-
return errors.New(msg)
1281-
}
1282-
1283-
log.Info(fmt.Sprintf("force register ACTIVE %s", s.ServerName))
1284-
return nil
1285-
}
1286-
1287-
err := retry.Do(s.ctx, forceRegisterActiveFn, retry.Attempts(uint(s.sessionRetryTimes)))
1288-
if err != nil {
1289-
log.Warn(fmt.Sprintf("failed to force register ACTIVE %s", s.ServerName))
1290-
return err
1291-
}
1292-
s.updateStandby(false)
1293-
log.Info(fmt.Sprintf("serverName: %v quit STANDBY mode, this node will become ACTIVE, ID: %d", s.ServerName, s.ServerID))
1294-
if activateFunc != nil {
1295-
return activateFunc()
1296-
}
1297-
return nil
1298-
}
1299-
13001233
func filterEmptyStrings(s []string) []string {
13011234
var filtered []string
13021235
for _, str := range s {

internal/util/sessionutil/session_util_test.go

Lines changed: 0 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -962,60 +962,6 @@ func (s *SessionSuite) TestRevoke() {
962962
}
963963
}
964964

965-
func (s *SessionSuite) TestForceActiveWithLeaseID() {
966-
ctx := context.Background()
967-
role := "test"
968-
sess1 := NewSessionWithEtcd(ctx, s.metaRoot, s.client, WithResueNodeID(false))
969-
sess1.Init(role, "normal1", false, false)
970-
sess1.Register()
971-
sess1.ProcessActiveStandBy(nil)
972-
973-
sess2 := NewSessionWithEtcd(ctx, s.metaRoot, s.client, WithResueNodeID(false))
974-
sess2.Init(role, "normal2", false, false)
975-
sess2.Register()
976-
sess2.ForceActiveStandby(nil)
977-
978-
defer func() {
979-
sess1.Stop()
980-
sess2.Stop()
981-
}()
982-
sessions, _, err := sess2.GetSessions(role)
983-
s.NoError(err)
984-
s.Len(sessions, 2)
985-
sess := sessions[role]
986-
s.NotNil(sess)
987-
s.Equal(sess.Address, "normal2")
988-
s.Equal(sess.ServerID, sess2.ServerID)
989-
}
990-
991-
func (s *SessionSuite) TestForceActiveWithDelete() {
992-
ctx := context.Background()
993-
role := "test"
994-
sess1 := NewSessionWithEtcd(ctx, s.metaRoot, s.client, WithResueNodeID(false))
995-
sess1.Init(role, "normal1", false, false)
996-
sessionJSON, err := json.Marshal(sess1)
997-
s.NoError(err)
998-
s.client.Put(ctx, path.Join(s.metaRoot, DefaultServiceRoot, fmt.Sprintf("%s-%d", role, 1)), string(sessionJSON))
999-
s.client.Put(ctx, path.Join(s.metaRoot, DefaultServiceRoot, role), string(sessionJSON))
1000-
1001-
sess2 := NewSessionWithEtcd(ctx, s.metaRoot, s.client, WithResueNodeID(false))
1002-
sess2.Init(role, "normal2", false, false)
1003-
sess2.Register()
1004-
sess2.ForceActiveStandby(nil)
1005-
1006-
defer func() {
1007-
sess1.Stop()
1008-
sess2.Stop()
1009-
}()
1010-
sessions, _, err := sess2.GetSessions(role)
1011-
s.NoError(err)
1012-
s.Len(sessions, 2)
1013-
sess := sessions[role]
1014-
s.NotNil(sess)
1015-
s.Equal(sess.Address, "normal2")
1016-
s.Equal(sess.ServerID, sess2.ServerID)
1017-
}
1018-
1019965
func (s *SessionSuite) TestKeepAliveRetryActiveCancel() {
1020966
ctx := context.Background()
1021967
session := NewSessionWithEtcd(ctx, s.metaRoot, s.client)

0 commit comments

Comments
 (0)