Skip to content

Commit a3ba466

Browse files
authored
Database - create postgresql backend and support backend switch with kusto (#117)
## Pull Request Overview

This PR adds PostgreSQL as an alternative storage backend alongside the existing Kusto implementation. It introduces a storage abstraction layer through `ltp-storage-common`, implements a PostgreSQL SDK with schema management, and updates all services to support dual backends via a factory pattern.

- Introduces the `ltp-storage-common` package with shared data schemas and a storage factory
- Implements `postgresql-sdk` with full CRUD operations, Alembic migrations, and a Kusto-compatible interface, and adds a schema management service with health checks and migration support
- Updates `kusto-sdk`, `alert-manager`, and `cluster-local-storage` to use the factory pattern for backend selection
1 parent 7e1a263 commit a3ba466

File tree

99 files changed

+6466
-454
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

99 files changed

+6466
-454
lines changed

src/alert-manager/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,6 @@ node_modules/
33

44
deploy/icm-certs/
55

6+
**/postgresql-sdk/
67
**/kusto-sdk/
8+
**/ltp-storage-common/

src/alert-manager/build/alert-handler.common.dockerfile

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,4 +18,8 @@ COPY ./src/alert-handler .
1818
RUN corepack enable && corepack install -g [email protected]
1919
RUN yarn install
2020

21+
# install python and dependencies
22+
RUN apt-get install -y python3-pip
23+
RUN pip3 install --no-cache-dir -r requirements.txt --break-system-packages
24+
2125
ENTRYPOINT ["npm", "start"]

src/alert-manager/build/build-pre.sh

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,25 @@
55

66
pushd $(dirname "$0") > /dev/null
77

8+
# Copy ltp-storage-common (shared data schemas, interfaces, and factory)
9+
cp -arfT "../../ltp-storage-common" "../src/node-recycler/ltp-storage-common"
10+
cp -arfT "../../ltp-storage-common" "../src/node-issue-classifier/ltp-storage-common"
11+
cp -arfT "../../ltp-storage-common" "../src/alert-parser/ltp-storage-common"
12+
cp -arfT "../../ltp-storage-common" "../src/job-data-recorder/ltp-storage-common"
13+
cp -arfT "../../ltp-storage-common" "../src/alert-handler/ltp-storage-common"
14+
15+
# Copy SDKs (backend implementations)
816
cp -arfT "../../kusto-sdk" "../src/node-recycler/kusto-sdk"
17+
cp -arfT "../../postgresql-sdk/sdk" "../src/node-recycler/postgresql-sdk"
918
cp -arfT "../../kusto-sdk" "../src/node-issue-classifier/kusto-sdk"
19+
cp -arfT "../../postgresql-sdk/sdk" "../src/node-issue-classifier/postgresql-sdk"
1020
cp -arfT "../../kusto-sdk" "../src/alert-parser/kusto-sdk"
21+
cp -arfT "../../postgresql-sdk/sdk" "../src/alert-parser/postgresql-sdk"
1122
cp -arfT "../../kusto-sdk" "../src/job-data-recorder/kusto-sdk"
23+
cp -arfT "../../postgresql-sdk/sdk" "../src/job-data-recorder/postgresql-sdk"
24+
cp -arfT "../../postgresql-sdk/sdk" "../src/alert-handler/postgresql-sdk"
25+
26+
# Copy other dependencies
1227
cp -arfT "../src/node-failure-detection/monitor/data_sources.py" "../src/job-data-recorder/data_sources.py"
1328
cp -arfT "../../database-controller/sdk" "../src/job-status-change-notification/openpaidbsdk"
1429

src/alert-manager/config/alert-manager.yaml

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,32 @@ actions-available:
1212
- drain-nodes
1313
- fix-nvidia-gpu-low-perf
1414
- cert-expiration-checker
15+
16+
# ============================================================================
17+
# Storage Backend Configuration
18+
# ============================================================================
19+
# Global storage backend setting (applies to every service unless that service's own section sets `storage-backend`)
20+
# Options: 'kusto' or 'postgresql'
21+
storage-backend: kusto
22+
23+
# Kusto (Azure Data Explorer) Configuration
24+
# Only required if using Kusto backend (storage-backend: kusto)
25+
kusto:
26+
kusto-cluster: "" # e.g., https://your-cluster.kusto.windows.net
27+
kusto-database: "" # e.g., YourDatabase
28+
kusto-user-assigned-client-id: "" # For managed identity authentication
29+
node-status-table: NodeStatusRecord
30+
node-action-table: NodeActionRecord
31+
kusto-vm-table: "" # VM info table name
32+
33+
# PostgreSQL Configuration
34+
# Only required if using PostgreSQL backend (storage-backend: postgresql)
35+
postgresql:
36+
schema: ltp_sdk # Default schema name
37+
38+
# ============================================================================
39+
# Service Configuration
40+
# ============================================================================
1541
alert-handler:
1642
log-level: 'info'
1743
port: 9095
@@ -30,10 +56,8 @@ job-status-change-notification:
3056
db-poller-interval-second: 600
3157
node-issue-classifier:
3258
configured: False
33-
kusto: False
3459
node-recycler:
3560
configured: False
36-
kusto: False
3761
icm:
3862
host: prod.microsofticm.com
3963
crt_name: cert.pem

src/alert-manager/deploy/alert-manager-deployment.yaml.template

Lines changed: 82 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,14 @@ spec:
8080
{%- endif %}
8181
- name: LOG_LEVEL
8282
value: {{ cluster_cfg["alert-manager"]["alert-handler"]["log-level"] }}
83+
- name: LTP_STORAGE_BACKEND_DEFAULT
84+
value: "{{ cluster_cfg['alert-manager']['alert-handler'].get('storage-backend', cluster_cfg['alert-manager'].get('storage-backend', 'kusto')) }}"
85+
{% if cluster_cfg["alert-manager"]["alert-handler"].get("storage-backend", cluster_cfg["alert-manager"].get("storage-backend", "kusto")) == "postgresql" %}
86+
- name: POSTGRES_CONNECTION_STR
87+
value: "{{ cluster_cfg['postgresql']['connection-str'] }}"
88+
- name: POSTGRES_SCHEMA
89+
value: "{{ cluster_cfg['alert-manager']['postgresql'].get('schema', 'ltp_sdk') }}"
90+
{% endif %}
8391
{% if 'email-admin' in cluster_cfg["alert-manager"]["actions-available"] %}
8492
- name: EMAIL_CONFIGS_ADMIN_RECEIVER
8593
value: {{ cluster_cfg["alert-manager"]["alert-handler"]["email-configs"]["admin-receiver"] }}
@@ -141,23 +149,28 @@ spec:
141149
value: {{ cluster_cfg["alert-manager"]["node-issue-classifier"]["log-level"] | default(cluster_cfg["alert-manager"]["log-level"]) | default("INFO") }}
142150
- name: CLASSIFICATION_INTERVAL_MINUTES
143151
value: "{{ cluster_cfg["alert-manager"]["node-issue-classifier"]["classification-interval-minutes"] | default("10") }}"
144-
{% if cluster_cfg["alert-manager"]["node-issue-classifier"]["kusto"] %}
152+
- name: COOCCURRED_HARDWARE_FAILURE_LIMIT
153+
value: "{{ cluster_cfg["alert-manager"]["node-issue-classifier"]["coccurred-hardware-failure-node-limit"] | default("5") }}"
154+
{% if cluster_cfg["alert-manager"]["node-issue-classifier"].get("storage-backend", cluster_cfg["alert-manager"].get("storage-backend", "kusto")) == "kusto" %}
145155
- name: KUSTO_USER_ASSIGNED_CLIENT_ID
146-
value: {{ cluster_cfg["alert-manager"]["kusto-user-assigned-client-id"] }}
156+
value: {{ cluster_cfg["alert-manager"]["kusto"]["kusto-user-assigned-client-id"] }}
147157
- name: LTP_KUSTO_CLUSTER_URI
148-
value: {{ cluster_cfg["alert-manager"]["node-issue-classifier"]["kusto-cluster"] | default(cluster_cfg["alert-manager"]["kusto-cluster"]) }}
158+
value: {{ cluster_cfg["alert-manager"]["node-issue-classifier"].get("kusto-cluster", cluster_cfg["alert-manager"]["kusto"]["kusto-cluster"]) }}
149159
- name: LTP_KUSTO_DATABASE_NAME
150-
value: {{ cluster_cfg["alert-manager"]["node-issue-classifier"]["kusto-database"] | default(cluster_cfg["alert-manager"]["kusto-database"]) }}
160+
value: {{ cluster_cfg["alert-manager"]["node-issue-classifier"].get("kusto-database", cluster_cfg["alert-manager"]["kusto"]["kusto-database"]) }}
151161
- name: KUSTO_NODE_STATUS_TABLE_NAME
152-
value: {{ cluster_cfg["alert-manager"]["node-issue-classifier"]["node-status-table"]
153-
| default(cluster_cfg["alert-manager"]["node-status-table"])
154-
| default("NodeStatusRecord") }}
162+
value: {{ cluster_cfg["alert-manager"]["node-issue-classifier"].get("node-status-table", cluster_cfg["alert-manager"]["kusto"]["node-status-table"]) }}
155163
- name: KUSTO_NODE_ACTION_TABLE_NAME
156-
value: {{ cluster_cfg["alert-manager"]["node-issue-classifier"]["node-action-table"]
157-
| default(cluster_cfg["alert-manager"]["node-action-table"])
158-
| default("NodeActionRecord") }}
159-
- name: COOCCURRED_HARDWARE_FAILURE_LIMIT
160-
value: "{{ cluster_cfg["alert-manager"]["node-issue-classifier"]["coccurred_hardware_failure_node_limit"] | default("5") }}"
164+
value: {{ cluster_cfg["alert-manager"]["node-issue-classifier"].get("node-action-table", cluster_cfg["alert-manager"]["kusto"]["node-action-table"]) }}
165+
{% endif %}
166+
# Storage backend selection
167+
- name: LTP_STORAGE_BACKEND_DEFAULT
168+
value: "{{ cluster_cfg['alert-manager']['node-issue-classifier'].get('storage-backend', cluster_cfg['alert-manager'].get('storage-backend', 'kusto')) }}"
169+
{% if cluster_cfg["alert-manager"]["node-issue-classifier"].get("storage-backend", cluster_cfg["alert-manager"].get("storage-backend", "kusto")) == "postgresql" %}
170+
- name: POSTGRES_CONNECTION_STR
171+
value: "{{ cluster_cfg['postgresql']['connection-str'] }}"
172+
- name: POSTGRES_SCHEMA
173+
value: "{{ cluster_cfg['alert-manager']['postgresql'].get('schema', 'ltp_sdk') }}"
161174
{% endif %}
162175
resources:
163176
requests:
@@ -187,30 +200,37 @@ spec:
187200
value: {{ cluster_cfg["alert-manager"]["alert-parser"]["uri"] | default(cluster_cfg["alert-manager"]["prometheus-uri"]) }}
188201
- name: PAI_TOKEN
189202
value: {{ cluster_cfg["alert-manager"]["pai-bearer-token"] }}
203+
{% if cluster_cfg["alert-manager"]["alert-parser"].get("storage-backend", cluster_cfg["alert-manager"].get("storage-backend", "kusto")) == "kusto" %}
190204
- name: KUSTO_USER_ASSIGNED_CLIENT_ID
191-
value: {{ cluster_cfg["alert-manager"]["kusto-user-assigned-client-id"] }}
205+
value: {{ cluster_cfg["alert-manager"]["kusto"]["kusto-user-assigned-client-id"] }}
192206
- name: LTP_KUSTO_CLUSTER_URI
193-
value: {{ cluster_cfg["alert-manager"]["alert-parser"]["kusto-cluster"] | default(cluster_cfg["alert-manager"]["kusto-cluster"]) }}
207+
value: {{ cluster_cfg["alert-manager"]["alert-parser"].get("kusto-cluster", cluster_cfg["alert-manager"]["kusto"]["kusto-cluster"]) }}
194208
- name: LTP_KUSTO_DATABASE_NAME
195-
value: {{ cluster_cfg["alert-manager"]["alert-parser"]["kusto-database"] | default(cluster_cfg["alert-manager"]["kusto-database"]) }}
209+
value: {{ cluster_cfg["alert-manager"]["alert-parser"].get("kusto-database", cluster_cfg["alert-manager"]["kusto"]["kusto-database"]) }}
196210
- name: KUSTO_NODE_STATUS_TABLE_NAME
197-
value: {{ cluster_cfg["alert-manager"]["alert-parser"]["node-status-table"]
198-
| default(cluster_cfg["alert-manager"]["node-status-table"])
199-
| default("NodeStatusRecord") }}
211+
value: {{ cluster_cfg["alert-manager"]["alert-parser"].get("node-status-table", cluster_cfg["alert-manager"]["kusto"]["node-status-table"]) }}
200212
- name: KUSTO_NODE_ACTION_TABLE_NAME
201-
value: {{ cluster_cfg["alert-manager"]["alert-parser"]["node-action-table"]
202-
| default(cluster_cfg["alert-manager"]["node-action-table"])
203-
| default("NodeActionRecord") }}
213+
value: {{ cluster_cfg["alert-manager"]["alert-parser"].get("node-action-table", cluster_cfg["alert-manager"]["kusto"]["node-action-table"]) }}
204214
- name: KUSTO_ALERT_CLUSTER
205-
value: "{{ cluster_cfg['alert-manager']['alert-parser']['alert-kusto-cluster'] | default('https://ltp-kusto-alerts.westus2.kusto.windows.net') }}"
215+
value: "{{ cluster_cfg['alert-manager']['alert-parser'].get('alert-kusto-cluster', 'https://ltp-kusto-alerts.westus2.kusto.windows.net') }}"
206216
- name: KUSTO_ALERT_DATABASE
207-
value: "{{ cluster_cfg['alert-manager']['alert-parser']['alert-kusto-database'] | default('DefaultWorkspace-id-westus2') }}"
217+
value: "{{ cluster_cfg['alert-manager']['alert-parser'].get('alert-kusto-database', 'DefaultWorkspace-id-westus2') }}"
208218
- name: LTP_KUSTO_VM_INFO_TABLE_NAME
209-
value: {{ cluster_cfg["alert-manager"]["alert-parser"]["kusto-vm-table"] | default(cluster_cfg["alert-manager"]["kusto-vm-table"]) }}
219+
value: {{ cluster_cfg["alert-manager"]["alert-parser"].get("kusto-vm-table", cluster_cfg["alert-manager"]["kusto"].get("kusto-vm-table", "")) }}
220+
{% endif %}
210221
- name: LTP_VMSS_IDS
211222
value: {{ cluster_cfg["alert-manager"]["alert-parser"]["vmss_ids"] | default(cluster_cfg["alert-manager"]["vmss_ids"]) }}
212223
- name: AZURE_CLIENT_ID
213224
value: {{ cluster_cfg["alert-manager"]["alert-parser"]["vmss_client_id"] | default(cluster_cfg["alert-manager"]["vmss_client_id"]) }}
225+
# Storage backend selection
226+
- name: LTP_STORAGE_BACKEND_DEFAULT
227+
value: "{{ cluster_cfg['alert-manager']['alert-parser'].get('storage-backend', cluster_cfg['alert-manager'].get('storage-backend', 'kusto')) }}"
228+
{% if cluster_cfg["alert-manager"]["alert-parser"].get("storage-backend", cluster_cfg["alert-manager"].get("storage-backend", "kusto")) == "postgresql" %}
229+
- name: POSTGRES_CONNECTION_STR
230+
value: "{{ cluster_cfg['postgresql']['connection-str'] }}"
231+
- name: POSTGRES_SCHEMA
232+
value: "{{ cluster_cfg['alert-manager']['postgresql'].get('schema', 'ltp_sdk') }}"
233+
{% endif %}
214234
resources:
215235
requests:
216236
memory: "2Gi"
@@ -232,21 +252,17 @@ spec:
232252
value: {{ cluster_cfg["cluster"]["common"]["cluster-id"] }}
233253
- name: ENVIRONMENT
234254
value: {{ cluster_cfg["alert-manager"]["node-recycler"]["environment"] | default("prod") }}
235-
{% if cluster_cfg["alert-manager"]["node-recycler"]["kusto"] %}
255+
{% if cluster_cfg["alert-manager"]["node-recycler"].get("storage-backend", cluster_cfg["alert-manager"].get("storage-backend", "kusto")) == "kusto" %}
236256
- name: KUSTO_USER_ASSIGNED_CLIENT_ID
237-
value: {{ cluster_cfg["alert-manager"]["kusto-user-assigned-client-id"] }}
257+
value: {{ cluster_cfg["alert-manager"]["kusto"]["kusto-user-assigned-client-id"] }}
238258
- name: LTP_KUSTO_CLUSTER_URI
239-
value: {{ cluster_cfg["alert-manager"]["node-recycler"]["kusto-cluster"] | default(cluster_cfg["alert-manager"]["kusto-cluster"]) }}
259+
value: {{ cluster_cfg["alert-manager"]["node-recycler"].get("kusto-cluster", cluster_cfg["alert-manager"]["kusto"]["kusto-cluster"]) }}
240260
- name: LTP_KUSTO_DATABASE_NAME
241-
value: {{ cluster_cfg["alert-manager"]["node-recycler"]["kusto-database"] | default(cluster_cfg["alert-manager"]["kusto-database"]) }}
261+
value: {{ cluster_cfg["alert-manager"]["node-recycler"].get("kusto-database", cluster_cfg["alert-manager"]["kusto"]["kusto-database"]) }}
242262
- name: KUSTO_NODE_STATUS_TABLE_NAME
243-
value: {{ cluster_cfg["alert-manager"]["node-recycler"]["node-status-table"]
244-
| default(cluster_cfg["alert-manager"]["node-status-table"])
245-
| default("NodeStatusRecord") }}
263+
value: {{ cluster_cfg["alert-manager"]["node-recycler"].get("node-status-table", cluster_cfg["alert-manager"]["kusto"]["node-status-table"]) }}
246264
- name: KUSTO_NODE_ACTION_TABLE_NAME
247-
value: {{ cluster_cfg["alert-manager"]["node-recycler"]["node-action-table"]
248-
| default(cluster_cfg["alert-manager"]["node-action-table"])
249-
| default("NodeActionRecord") }}
265+
value: {{ cluster_cfg["alert-manager"]["node-recycler"].get("node-action-table", cluster_cfg["alert-manager"]["kusto"]["node-action-table"]) }}
250266
{% endif %}
251267
{% if "icm" in cluster_cfg["alert-manager"]["node-recycler"] %}
252268
- name: ICM_HOST
@@ -272,6 +288,15 @@ spec:
272288
value: {{ cluster_cfg["rest-server"]["uri"] }}
273289
- name: REST_SERVER_TOKEN
274290
value: {{ cluster_cfg["alert-manager"]["pai-bearer-token"] }}
291+
# Storage backend selection
292+
- name: LTP_STORAGE_BACKEND_DEFAULT
293+
value: "{{ cluster_cfg['alert-manager']['node-recycler'].get('storage-backend', cluster_cfg['alert-manager'].get('storage-backend', 'kusto')) }}"
294+
{% if cluster_cfg["alert-manager"]["node-recycler"].get("storage-backend", cluster_cfg["alert-manager"].get("storage-backend", "kusto")) == "postgresql" %}
295+
- name: POSTGRES_CONNECTION_STR
296+
value: "{{ cluster_cfg['postgresql']['connection-str'] }}"
297+
- name: POSTGRES_SCHEMA
298+
value: "{{ cluster_cfg['alert-manager']['postgresql'].get('schema', 'ltp_sdk') }}"
299+
{% endif %}
275300
resources:
276301
requests:
277302
memory: "1Gi"
@@ -377,26 +402,35 @@ spec:
377402
value: "{{ cluster_cfg["alert-manager"]["job-data-recorder"]["run-interval"] | default("3h") }}"
378403
- name: RECORD_RETAIN_TIME
379404
value: "{{ cluster_cfg["alert-manager"]["job-data-recorder"]["record-retain-time"] | default("30d") }}"
380-
- name: KUSTO_METRICS_TABLE
381-
value: "{{ cluster_cfg["alert-manager"]["job-data-recorder"]["kusto-metrics-table"] | default("JobSummary") }}"
382-
- name: KUSTO_REACT_TABLE
383-
value: "{{ cluster_cfg["alert-manager"]["job-data-recorder"]["kusto-react-table"] | default("JobReactTime") }}"
384-
- name: KUSTO_USER_ASSIGNED_CLIENT_ID
385-
value: {{ cluster_cfg["alert-manager"]["kusto-user-assigned-client-id"] }}
386-
- name: LTP_KUSTO_CLUSTER_URI
387-
value: {{ cluster_cfg["alert-manager"]["job-data-recorder"]["kusto-cluster"] | default(cluster_cfg["alert-manager"]["kusto-cluster"]) }}
388-
- name: LTP_KUSTO_DATABASE_NAME
389-
value: {{ cluster_cfg["alert-manager"]["job-data-recorder"]["kusto-database"] | default(cluster_cfg["alert-manager"]["kusto-database"]) }}
390405
- name: REST_SERVER_URI
391406
value: {{ cluster_cfg["rest-server"]["uri"] }}
392407
- name: PROMETHEUS_SERVER_URI
393408
value: {{ cluster_cfg["alert-manager"]["job-data-recorder"]["prometheus-uri"] | default(cluster_cfg["alert-manager"]["prometheus-uri"]) }}
394-
- name: KUSTO_NODE_ACTION_TABLE_NAME
395-
value: {{ cluster_cfg["alert-manager"]["job-data-recorder"]["node-action-table"]
396-
| default(cluster_cfg["alert-manager"]["node-action-table"])
397-
| default("NodeActionRecord") }}
398409
- name: PAI_TOKEN
399410
value: {{ cluster_cfg["alert-manager"]["pai-bearer-token"] }}
411+
{% if cluster_cfg["alert-manager"]["job-data-recorder"].get("storage-backend", cluster_cfg["alert-manager"].get("storage-backend", "kusto")) == "kusto" %}
412+
- name: KUSTO_METRICS_TABLE
413+
value: "{{ cluster_cfg["alert-manager"]["job-data-recorder"].get("kusto-metrics-table", "JobSummary") }}"
414+
- name: KUSTO_REACT_TABLE
415+
value: "{{ cluster_cfg["alert-manager"]["job-data-recorder"].get("kusto-react-table", "JobReactTime") }}"
416+
- name: KUSTO_USER_ASSIGNED_CLIENT_ID
417+
value: {{ cluster_cfg["alert-manager"]["kusto"]["kusto-user-assigned-client-id"] }}
418+
- name: LTP_KUSTO_CLUSTER_URI
419+
value: {{ cluster_cfg["alert-manager"]["job-data-recorder"].get("kusto-cluster", cluster_cfg["alert-manager"]["kusto"]["kusto-cluster"]) }}
420+
- name: LTP_KUSTO_DATABASE_NAME
421+
value: {{ cluster_cfg["alert-manager"]["job-data-recorder"].get("kusto-database", cluster_cfg["alert-manager"]["kusto"]["kusto-database"]) }}
422+
- name: KUSTO_NODE_ACTION_TABLE_NAME
423+
value: {{ cluster_cfg["alert-manager"]["job-data-recorder"].get("node-action-table", cluster_cfg["alert-manager"]["kusto"]["node-action-table"]) }}
424+
{% endif %}
425+
# Storage backend selection
426+
- name: LTP_STORAGE_BACKEND_DEFAULT
427+
value: "{{ cluster_cfg['alert-manager']['job-data-recorder'].get('storage-backend', cluster_cfg['alert-manager'].get('storage-backend', 'kusto')) }}"
428+
{% if cluster_cfg["alert-manager"]["job-data-recorder"].get("storage-backend", cluster_cfg["alert-manager"].get("storage-backend", "kusto")) == "postgresql" %}
429+
- name: POSTGRES_CONNECTION_STR
430+
value: "{{ cluster_cfg['postgresql']['connection-str'] }}"
431+
- name: POSTGRES_SCHEMA
432+
value: "{{ cluster_cfg['alert-manager']['postgresql'].get('schema', 'ltp_sdk') }}"
433+
{% endif %}
400434
resources:
401435
requests:
402436
memory: "2Gi"

0 commit comments

Comments
 (0)