Skip to content

Commit 75670e4

Browse files
authored
Simplify the alarm recovery record. (#13585)
1 parent 33316cf commit 75670e4

File tree

7 files changed

+40
-103
lines changed

7 files changed

+40
-103
lines changed

docs/en/status/query_alarm_runtime_status.md

Lines changed: 24 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ Return the detailed information of the alarm running rule.
7373
{
7474
"scope": "SERVICE",
7575
"name": "mock_b_service",
76-
"formattedMessage": "Response time of mock_b_service is more than upper baseline in 1 minutes of last 10 minutes."
76+
"formattedMessage": "Service mock_b_service response time is more than 1000ms of last 10 minutes."
7777
}
7878
],
7979
"tags": [
@@ -108,12 +108,12 @@ Return the detailed information of the alarm running rule.
108108
{
109109
"scope": "SERVICE",
110110
"name": "mock_a_service",
111-
"formattedMessage": "Response time of mock_a_service is more than upper baseline in 1 minutes of last 10 minutes."
111+
"formattedMessage": "Service mock_a_service response time is more than 1000ms of last 10 minutes."
112112
},
113113
{
114114
"scope": "SERVICE",
115115
"name": "mock_c_service",
116-
"formattedMessage": "Response time of service mock_c_service is more than upper baseline in 1 minutes of last 10 minutes."
116+
"formattedMessage": "Service mock_c_service response time is more than 1000ms of last 10 minutes."
117117
}
118118
],
119119
"tags": [
@@ -155,13 +155,13 @@ Return the running context of the alarm rule.
155155
"status": {
156156
"ruleId": "service_resp_time_rule",
157157
"expression": "sum(service_resp_time > 1000) >= 1",
158-
"endTime": "2025-11-10T09:39:00.000",
158+
"endTime": "2025-11-19T15:20:00.000",
159159
"additionalPeriod": 0,
160160
"size": 10,
161-
"silencePeriod": 3,
162-
"recoveryObservationPeriod": 2,
161+
"silencePeriod": 10,
162+
"recoveryObservationPeriod": 0,
163163
"silenceCountdown": 10,
164-
"recoveryObservationCountdown": 2,
164+
"recoveryObservationCountdown": 0,
165165
"currentState": "FIRING",
166166
"entityName": "mock_b_service",
167167
"windowValues": [
@@ -195,26 +195,30 @@ Return the running context of the alarm rule.
195195
},
196196
{
197197
"index": 7,
198+
"metrics": []
199+
},
200+
{
201+
"index": 8,
198202
"metrics": [
199203
{
200204
"name": "service_resp_time",
201-
"timeBucket": 202502121437,
205+
"timeBucket": 202511191519,
202206
"value": "6000"
203207
}
204208
]
205209
},
206-
{
207-
"index": 8,
208-
"metrics": []
209-
},
210210
{
211211
"index": 9,
212212
"metrics": []
213213
}
214214
],
215215
"mqeMetricsSnapshot": {
216-
"service_resp_time": "[{\"metric\":{\"labels\":[]},\"values\":[{\"id\":\"202502121430\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202502121431\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202502121432\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202502121433\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202502121434\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202502121435\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202502121436\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202502121437\",\"doubleValue\":6000.0,\"isEmptyValue\":false},{\"id\":\"202502121438\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202502121439\",\"doubleValue\":0.0,\"isEmptyValue\":true}]}]",
217-
"baseline(service_resp_time,upper)": "[{\"metric\":{\"labels\":[]},\"values\":[{\"id\":\"202502121430\",\"doubleValue\":10.0,\"isEmptyValue\":false},{\"id\":\"202502121431\",\"doubleValue\":10.0,\"isEmptyValue\":false},{\"id\":\"202502121432\",\"doubleValue\":10.0,\"isEmptyValue\":false},{\"id\":\"202502121433\",\"doubleValue\":10.0,\"isEmptyValue\":false},{\"id\":\"202502121434\",\"doubleValue\":10.0,\"isEmptyValue\":false},{\"id\":\"202502121435\",\"doubleValue\":10.0,\"isEmptyValue\":false},{\"id\":\"202502121436\",\"doubleValue\":10.0,\"isEmptyValue\":false},{\"id\":\"202502121437\",\"doubleValue\":10.0,\"isEmptyValue\":false},{\"id\":\"202502121438\",\"doubleValue\":10.0,\"isEmptyValue\":false},{\"id\":\"202502121439\",\"doubleValue\":10.0,\"isEmptyValue\":false}]}]"
216+
"service_resp_time": "[{\"metric\":{\"labels\":[]},\"values\":[{\"id\":\"202511191511\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202511191512\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202511191513\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202511191514\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202511191515\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202511191516\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202511191517\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202511191518\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202511191519\",\"doubleValue\":6000.0,\"isEmptyValue\":false},{\"id\":\"202511191520\",\"doubleValue\":0.0,\"isEmptyValue\":true}]}]"
217+
},
218+
"lastAlarmTime": 1763536823628,
219+
"lastAlarmMessage": "Service mock_b_service response time is more than 1000ms of last 10 minutes.",
220+
"lastAlarmMqeMetricsSnapshot": {
221+
"service_resp_time": "[{\"metric\":{\"labels\":[]},\"values\":[{\"id\":\"202511191511\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202511191512\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202511191513\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202511191514\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202511191515\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202511191516\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202511191517\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202511191518\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202511191519\",\"doubleValue\":6000.0,\"isEmptyValue\":false},{\"id\":\"202511191520\",\"doubleValue\":0.0,\"isEmptyValue\":true}]}]"
218222
}
219223
}
220224
},
@@ -227,7 +231,8 @@ Return the running context of the alarm rule.
227231
"size": 0,
228232
"silenceCountdown": 0,
229233
"recoveryObservationCountdown": 0,
230-
"windowValues": []
234+
"windowValues": [],
235+
"lastAlarmTime": 0
231236
}
232237
}
233238
]
@@ -237,8 +242,11 @@ Return the running context of the alarm rule.
237242
`silenceCountdown` is the countdown of the silence period. -1 means silence countdown is not running.
238243
`recoveryObservationCountdown` is the countdown of the recovery observation period.
239244
`windowValues` is the original metrics data when the metrics come in. The `index` is the index of the window, starting from 0.
240-
`mqeMetricsSnapshot` is the metrics data in the MQE format which is generated when executing the checking.
245+
`mqeMetricsSnapshot` is the current metrics data in the MQE format, generated when the check is executed.
241246
These data will be calculated according to the expression.
247+
`lastAlarmTime` is the time at which the alarm was last triggered, as a Unix timestamp in milliseconds. It is reset to 0 when the alarm recovers.
248+
`lastAlarmMessage` is the message of the most recently triggered alarm.
249+
`lastAlarmMqeMetricsSnapshot` is the snapshot of the metrics data in the MQE format, taken when the last alarm was triggered.
242250

243251
## Get Errors When Querying Status from OAP Instances
244252

oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmStatusWatcher.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import java.util.List;
2323
import java.util.Map;
2424
import java.util.stream.Collectors;
25+
import org.apache.skywalking.oap.server.core.alarm.AlarmMessage;
2526
import org.apache.skywalking.oap.server.core.alarm.AlarmModule;
2627
import org.apache.skywalking.oap.server.core.alarm.AlarmRulesWatcherService;
2728
import org.apache.skywalking.oap.server.core.alarm.AlarmStatusWatcherService;
@@ -169,6 +170,12 @@ public String getAlarmRuleContext(final String ruleName, final String entityName
169170
}
170171
});
171172
runningContext.setMqeMetricsSnapshot(window.getMqeMetricsSnapshot());
173+
AlarmMessage lastAlarmMessage = window.getLastAlarmMessage();
174+
if (lastAlarmMessage != null) {
175+
runningContext.setLastAlarmTime(window.getLastAlarmMessage().getStartTime());
176+
runningContext.setLastAlarmMessage(window.getLastAlarmMessage().getAlarmMessage());
177+
runningContext.setLastAlarmMqeMetricsSnapshot(window.getLastAlarmMessage().getMqeMetricsSnapshot());
178+
}
172179
return GSON.toJson(runningContext);
173180
}
174181
}

oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,7 @@ public class Window {
260260
private final AlarmStateMachine stateMachine;
261261
private LinkedList<Map<String, Metrics>> values;
262262
private ReentrantLock lock = new ReentrantLock();
263+
@Getter
263264
private AlarmMessage lastAlarmMessage;
264265
@Getter
265266
private JsonObject mqeMetricsSnapshot;

oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/status/AlarmRunningContext.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,9 @@ public class AlarmRunningContext {
3838
private String entityName;
3939
private List<WindowValue> windowValues = new ArrayList<>();
4040
private JsonObject mqeMetricsSnapshot;
41+
private long lastAlarmTime;
42+
private String lastAlarmMessage;
43+
private JsonObject lastAlarmMqeMetricsSnapshot;
4144

4245
@Data
4346
public static class Metric {

oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmRecoveryRecord.java

Lines changed: 3 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121
import lombok.Getter;
2222
import lombok.Setter;
2323
import org.apache.skywalking.oap.server.core.analysis.Stream;
24-
import org.apache.skywalking.oap.server.core.analysis.manual.searchtag.Tag;
2524
import org.apache.skywalking.oap.server.core.analysis.record.Record;
2625
import org.apache.skywalking.oap.server.core.analysis.worker.RecordStreamProcessor;
2726
import org.apache.skywalking.oap.server.core.source.DefaultScopeDefine;
@@ -30,114 +29,50 @@
3029
import org.apache.skywalking.oap.server.core.storage.annotation.BanyanDB;
3130
import org.apache.skywalking.oap.server.core.storage.annotation.Column;
3231
import org.apache.skywalking.oap.server.core.storage.annotation.ElasticSearch;
33-
import org.apache.skywalking.oap.server.core.storage.annotation.SQLDatabase;
3432
import org.apache.skywalking.oap.server.core.storage.type.Convert2Entity;
3533
import org.apache.skywalking.oap.server.core.storage.type.Convert2Storage;
3634
import org.apache.skywalking.oap.server.core.storage.type.StorageBuilder;
3735

38-
import java.util.List;
39-
4036
import static org.apache.skywalking.oap.server.core.source.DefaultScopeDefine.ALARM_RECOVERY;
41-
import static org.apache.skywalking.oap.server.core.storage.StorageData.TIME_BUCKET;
4237

4338
@Getter
4439
@Setter
4540
@ScopeDeclaration(id = ALARM_RECOVERY, name = "AlarmRecovery")
4641
@Stream(name = AlarmRecoveryRecord.INDEX_NAME, scopeId = DefaultScopeDefine.ALARM_RECOVERY, builder = AlarmRecoveryRecord.Builder.class, processor = RecordStreamProcessor.class)
47-
@SQLDatabase.ExtraColumn4AdditionalEntity(additionalTable = AlarmRecoveryRecord.ADDITIONAL_TAG_TABLE, parentColumn = TIME_BUCKET)
48-
@BanyanDB.TimestampColumn(AlarmRecoveryRecord.START_TIME)
42+
@BanyanDB.TimestampColumn(AlarmRecoveryRecord.RECOVERY_TIME)
4943
@BanyanDB.Group(streamGroup = BanyanDB.StreamGroup.RECORDS)
5044
public class AlarmRecoveryRecord extends Record {
5145
public static final String INDEX_NAME = "alarm_recovery_record";
52-
public static final String ADDITIONAL_TAG_TABLE = "alarm_record_tag";
5346
public static final String UUID = "uuid";
54-
public static final String SCOPE = "scope";
55-
public static final String NAME = "name";
56-
public static final String ID0 = "id0";
57-
public static final String ID1 = "id1";
58-
public static final String START_TIME = "start_time";
5947
public static final String RECOVERY_TIME = "recovery_time";
60-
public static final String ALARM_MESSAGE = "alarm_message";
61-
public static final String RULE_NAME = "rule_name";
62-
public static final String TAGS = "tags";
63-
public static final String TAGS_RAW_DATA = "tags_raw_data";
64-
public static final String SNAPSHOT = "snapshot";
6548

6649
@Override
6750
public StorageID id() {
68-
return new StorageID()
69-
.append(TIME_BUCKET, getTimeBucket())
70-
.append(RULE_NAME, ruleName)
71-
.append(ID0, id0)
72-
.append(ID1, id1);
51+
return new StorageID().append(UUID, uuid);
7352
}
7453

75-
@Column(name = SCOPE)
76-
private int scope;
77-
@Column(name = NAME, storageOnly = true, length = 512)
78-
private String name;
79-
@Column(name = ID0, storageOnly = true, length = 512)
80-
@BanyanDB.SeriesID(index = 0)
81-
private String id0;
82-
@Column(name = ID1, storageOnly = true)
83-
private String id1;
84-
@ElasticSearch.EnableDocValues
85-
@Column(name = START_TIME)
86-
private long startTime;
8754
@ElasticSearch.EnableDocValues
8855
@Column(name = RECOVERY_TIME)
8956
private long recoveryTime;
90-
@Column(name = ALARM_MESSAGE, length = 512)
91-
@ElasticSearch.MatchQuery
92-
@BanyanDB.MatchQuery(analyzer = BanyanDB.MatchQuery.AnalyzerType.SIMPLE)
93-
private String alarmMessage;
94-
@Column(name = RULE_NAME)
95-
private String ruleName;
57+
@BanyanDB.SeriesID(index = 0)
9658
@Column(name = UUID)
9759
private String uuid;
98-
@Column(name = TAGS, indexOnly = true)
99-
@SQLDatabase.AdditionalEntity(additionalTables = {ADDITIONAL_TAG_TABLE})
100-
private List<String> tagsInString;
101-
@Column(name = TAGS_RAW_DATA, storageOnly = true, length = Tag.TAG_LENGTH)
102-
private byte[] tagsRawData;
103-
@Column(name = SNAPSHOT, storageOnly = true, length = 50000)
104-
private String snapshot;
10560

10661
public static class Builder implements StorageBuilder<AlarmRecoveryRecord> {
10762
@Override
10863
public AlarmRecoveryRecord storage2Entity(final Convert2Entity converter) {
10964
AlarmRecoveryRecord record = new AlarmRecoveryRecord();
110-
record.setScope(((Number) converter.get(SCOPE)).intValue());
111-
record.setName((String) converter.get(NAME));
11265
record.setUuid((String) converter.get(UUID));
113-
record.setId0((String) converter.get(ID0));
114-
record.setId1((String) converter.get(ID1));
115-
record.setAlarmMessage((String) converter.get(ALARM_MESSAGE));
116-
record.setStartTime(((Number) converter.get(START_TIME)).longValue());
11766
record.setRecoveryTime(((Number) converter.get(RECOVERY_TIME)).longValue());
11867
record.setTimeBucket(((Number) converter.get(TIME_BUCKET)).longValue());
119-
record.setRuleName((String) converter.get(RULE_NAME));
120-
record.setTagsRawData(converter.getBytes(TAGS_RAW_DATA));
121-
record.setSnapshot((String) converter.get(SNAPSHOT));
122-
// Don't read the TAGS as they are only for query.
12368
return record;
12469
}
12570

12671
@Override
12772
public void entity2Storage(final AlarmRecoveryRecord storageData, final Convert2Storage converter) {
128-
converter.accept(SCOPE, storageData.getScope());
129-
converter.accept(NAME, storageData.getName());
13073
converter.accept(UUID, storageData.getUuid());
131-
converter.accept(ID0, storageData.getId0());
132-
converter.accept(ID1, storageData.getId1());
133-
converter.accept(ALARM_MESSAGE, storageData.getAlarmMessage());
134-
converter.accept(START_TIME, storageData.getStartTime());
13574
converter.accept(RECOVERY_TIME, storageData.getRecoveryTime());
13675
converter.accept(TIME_BUCKET, storageData.getTimeBucket());
137-
converter.accept(RULE_NAME, storageData.getRuleName());
138-
converter.accept(TAGS_RAW_DATA, storageData.getTagsRawData());
139-
converter.accept(TAGS, storageData.getTagsInString());
140-
converter.accept(SNAPSHOT, storageData.getSnapshot());
14176
}
14277
}
14378
}

oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmStandardPersistence.java

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -89,23 +89,8 @@ public void doAlarmRecovery(List<AlarmMessage> alarmMessage) {
8989
AlarmRecoveryMessage alarmRecoveryMessage = (AlarmRecoveryMessage) message;
9090
AlarmRecoveryRecord record = new AlarmRecoveryRecord();
9191
record.setUuid(message.getUuid());
92-
record.setScope(message.getScopeId());
93-
record.setId0(message.getId0());
94-
record.setId1(message.getId1());
95-
record.setName(message.getName());
96-
record.setAlarmMessage(message.getAlarmMessage());
97-
record.setStartTime(message.getStartTime());
9892
record.setRecoveryTime(alarmRecoveryMessage.getRecoveryTime());
9993
record.setTimeBucket(TimeBucket.getRecordTimeBucket(message.getStartTime()));
100-
record.setRuleName(message.getRuleName());
101-
Collection<Tag> tags = appendSearchableTags(message.getTags());
102-
addAutocompleteTags(tags, TimeBucket.getMinuteTimeBucket(message.getStartTime()));
103-
record.setTagsRawData(gson.toJson(message.getTags()).getBytes(Charsets.UTF_8));
104-
record.setTagsInString(Tag.Util.toStringList(new ArrayList<>(tags)));
105-
AlarmSnapshotRecord snapshot = new AlarmSnapshotRecord();
106-
snapshot.setExpression(message.getExpression());
107-
snapshot.setMetrics(message.getMqeMetricsSnapshot());
108-
record.setSnapshot(gson.toJson(snapshot));
10994
RecordStreamProcessor.getInstance().in(record);
11095
});
11196
}

oap-server/server-storage-plugin/storage-banyandb-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/banyandb/stream/BanyanDBAlarmQueryDAO.java

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -52,10 +52,8 @@ public class BanyanDBAlarmQueryDAO extends AbstractBanyanDBDAO implements IAlarm
5252
private static final Set<String> TAGS = ImmutableSet.of(AlarmRecord.SCOPE,
5353
AlarmRecord.NAME, AlarmRecord.ID0, AlarmRecord.ID1, AlarmRecord.UUID, AlarmRecord.ALARM_MESSAGE,
5454
AlarmRecord.START_TIME, AlarmRecord.RULE_NAME, AlarmRecord.TAGS, AlarmRecord.TAGS_RAW_DATA, AlarmRecord.SNAPSHOT);
55-
private static final Set<String> RECOVERY_TAGS = ImmutableSet.of(AlarmRecoveryRecord.SCOPE,
56-
AlarmRecoveryRecord.NAME, AlarmRecord.ID0, AlarmRecoveryRecord.ID1, AlarmRecoveryRecord.UUID,
57-
AlarmRecoveryRecord.ALARM_MESSAGE, AlarmRecoveryRecord.START_TIME, AlarmRecoveryRecord.RECOVERY_TIME,
58-
AlarmRecoveryRecord.RULE_NAME, AlarmRecoveryRecord.TAGS, AlarmRecoveryRecord.TAGS_RAW_DATA, AlarmRecoveryRecord.SNAPSHOT);
55+
private static final Set<String> RECOVERY_TAGS = ImmutableSet.of(
56+
AlarmRecoveryRecord.UUID, AlarmRecoveryRecord.RECOVERY_TIME);
5957

6058
public BanyanDBAlarmQueryDAO(BanyanDBStorageClient client) {
6159
super(client);

0 commit comments

Comments
 (0)