Skip to content

Commit 9cbe992

Browse files
committed
Resolve #9: Observability: health endpoint, Prometheus/OTLP HTTP metrics, tests
* processing delay is measured using video frame attributes. Timings are stored in the attributes and passed through the pipeline * measurements logic in `Measurements` * simple health check endpoint to check the process is alive * Prometheus and OTLP HTTP metric exporters * made signal handling, health check and metrics export loosely coupled components. They can be composed as required * adjusted config logic to include lists and alternative environment names, better naming (`endpoint`), a config to force client without SSL * histogram boundaries can be configured because it's unclear which default is best * essential tests, some test refactoring * tools to run pipeline elements for manual testing `python -m tests.helpers <tool>` * include Prometheus and OTLP Collector images and configs into the dev container environment for the manual testing. Better rootless Docker experience with `.env` files (thanks to compose.yaml). One can stop extra containers with VS Code * fix `ClientService` and `ServerService` loops to stop when any unexpected exception occurs
1 parent 0d82fca commit 9cbe992

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

42 files changed

+5778
-270
lines changed

.devcontainer/compose.yaml

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
services:
2+
cloud-pin:
3+
build:
4+
dockerfile: Dockerfile.${DOCKER_MODE:-default}
5+
args:
6+
REMOTE_USER: ${USER}
7+
REMOTE_UID: ${REMOTE_UID:-1000}
8+
REMOTE_GID: ${REMOTE_GID:-1000}
9+
command: /bin/sh -c "while sleep 1000; do :; done"
10+
volumes:
11+
- ..:/workspace:cached
12+
depends_on:
13+
prometheus:
14+
condition: service_started
15+
required: false
16+
otlp-collector:
17+
condition: service_started
18+
required: false
19+
prometheus:
20+
image: prom/prometheus:latest
21+
ports:
22+
- 9090:9090
23+
volumes:
24+
- ./configs/prometheus.yml:/etc/prometheus/prometheus.yml
25+
command:
26+
- --config.file=/etc/prometheus/prometheus.yml
27+
- --storage.tsdb.path=/prometheus
28+
- --web.enable-otlp-receiver
29+
otlp-collector:
30+
image: otel/opentelemetry-collector-contrib:latest
31+
volumes:
32+
- ./configs/otlp-collector.yaml:/etc/otelcol-contrib/config.yaml
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
receivers:
2+
otlp:
3+
protocols:
4+
grpc:
5+
endpoint: 0.0.0.0:4317
6+
http:
7+
endpoint: 0.0.0.0:4318
8+
9+
processors:
10+
batch:
11+
12+
exporters:
13+
debug:
14+
verbosity: detailed
15+
otlp/jaeger:
16+
endpoint: jaeger:4317
17+
tls:
18+
insecure: true
19+
prometheus:
20+
endpoint: 0.0.0.0:8889
21+
22+
service:
23+
pipelines:
24+
traces:
25+
receivers: [otlp]
26+
processors: [batch]
27+
exporters: [debug]
28+
metrics:
29+
receivers: [otlp]
30+
exporters: [debug, prometheus]
31+
logs:
32+
receivers: [otlp]
33+
processors: [batch]
34+
exporters: [debug]
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
global:
2+
scrape_interval: 5s
3+
evaluation_interval: 5s
4+
5+
scrape_configs:
6+
- job_name: prometheus
7+
static_configs:
8+
- targets: ['localhost:9090']
9+
- job_name: cloud-pin-client
10+
static_configs:
11+
- targets: ["cloud-pin:8081"]
12+
- job_name: cloud-pin-server
13+
static_configs:
14+
- targets: ["cloud-pin:8082"]
15+
- job_name: otlp-collector
16+
static_configs:
17+
- targets: ["otlp-collector:8889"]

.devcontainer/detect_rootless.sh

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,24 @@
1-
ROOTLESS_DETECTED=`docker context inspect | sed -n s/rootless/ROOTLESS/ip`
1+
#!/usr/bin/env sh
22

3-
if [ "$ROOTLESS_DETECTED" -a "$DOCKER_MODE" != "rootless" ]
3+
ROOTLESS_DETECTED=`docker context inspect | sed -n s/rootless/ROOTLESS/ip`
4+
5+
if [ -z "$ROOTLESS_DETECTED" ]
46
then
5-
echo "\nERROR: Docker runs in ROOTLESS MODE. " >&2
6-
echo "Set DOCKER_MODE environment variable to 'rootless' and try again. " >&2
7-
echo "\tFor instance run VS Code with 'DOCKER_MODE=rootless code .'\n" >&2
7+
echo "\nINFO: Rootful Docker is detected. OK "
8+
exit 0
9+
fi
10+
11+
ROOTLESS_SET_BY_ENV_FILE=`cat .devcontainer/.env 2>&1 | sed -n s/DOCKER_MODE=rootless/ROOTLESS/ip`
812

9-
exit 1
13+
if [ "$ROOTLESS_SET_BY_ENV_FILE" -o "$DOCKER_MODE" = "rootless" ]
14+
then
15+
echo "\nINFO: Rootless Docker is detected. OK "
16+
exit 0
1017
fi
18+
19+
echo "\nERROR: Docker runs in ROOTLESS MODE. " >&2
20+
echo "Set DOCKER_MODE environment variable to 'rootless' and try again. " >&2
21+
echo "You can do it with .env file as well by placing it at '.devcontainer/.env'. " >&2
22+
echo "\tFor instance run VS Code with 'DOCKER_MODE=rootless code .'" >&2
23+
echo "\tor create .env file with 'echo DOCKER_MODE=rootless >> .devcontainer/.env'\n" >&2
24+
exit 1

.devcontainer/devcontainer.json

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,9 @@
11
{
2-
"build": {
3-
"dockerfile": "Dockerfile.${localEnv:DOCKER_MODE:default}",
4-
"args": {
5-
"REMOTE_USER": "${localEnv:USER}",
6-
"REMOTE_UID": "${localEnv:REMOTE_UID:1000}",
7-
"REMOTE_GID": "${localEnv:REMOTE_GID:1000}"
8-
}
9-
},
2+
"dockerComposeFile": "./compose.yaml",
103
"remoteUser": "${localEnv:REMOTE_USER}",
4+
"service": "cloud-pin",
5+
"runServices": ["cloud-pin"],
6+
"workspaceFolder": "/workspace",
117
"features": {
128
"ghcr.io/devcontainers-extra/features/uv:1": {},
139
"ghcr.io/devcontainers/features/git:1": {}

pyproject.toml

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,13 @@ description = "A service to run Savant pipeline remotely (cloud) via WebSockets
55
readme = "README.md"
66
requires-python = ">=3.14"
77
dependencies = [
8+
"aiohttp>=3.13.2",
89
"omegaconf>=2.3.0",
10+
"opentelemetry-exporter-otlp-proto-http>=1.38.0",
11+
"opentelemetry-exporter-prometheus>=0.59b0",
12+
"opentelemetry-sdk>=1.38.0",
913
"picows>=1.10.1",
14+
"prometheus-client>=0.23.1",
1015
]
1116

1217
[tool.pyrefly]
@@ -33,10 +38,13 @@ prerequisite_platform = [
3338
dev = [
3439
"faker>=37.11.0",
3540
"filelock>=3.20.0",
36-
"pyrefly>=0.39.0",
41+
"freezegun>=1.5.5",
42+
"pillow>=12.0.0",
43+
"pyrefly>=0.40.0",
3744
"pytest>=8.4.2",
3845
"pytest-asyncio>=1.2.0",
3946
"pytest-randomly>=4.0.1",
47+
"pytest-vcr>=1.0.2",
4048
"pytest-xdist>=3.8.0",
41-
"ruff>=0.14.1",
49+
"ruff>=0.14.4",
4250
]

savant_cloudpin/__main__.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,9 @@
33
from savant_rs.py.log import get_logger, init_logging
44

55
from savant_cloudpin.cfg import SENSITIVE_KEYS, dump_to_yaml, load_config
6+
from savant_cloudpin.observability import serve_health_endpoint, serve_metrics
67
from savant_cloudpin.services import create_service
8+
from savant_cloudpin.signals import handle_signals
79

810

911
async def serve() -> None:
@@ -17,7 +19,14 @@ async def serve() -> None:
1719
logger.debug(f"Configuration details:\n{config_yaml}")
1820

1921
logger.info("Running main loop ...")
20-
async with create_service(config) as service:
22+
async with (
23+
handle_signals() as handler,
24+
serve_health_endpoint(config.observability.health),
25+
serve_metrics(config.observability.metrics),
26+
create_service(config) as service,
27+
):
28+
handler.append(service.stop_running)
29+
2130
await service.run()
2231
logger.info("Main loop stopped")
2332

savant_cloudpin/cfg/__init__.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,12 @@
44
ClientServiceConfig,
55
ClientSSLConfig,
66
ClientWSConfig,
7+
HealthConfig,
8+
HistogramBoundaries,
9+
MetricsConfig,
710
ObservabilityConfig,
11+
OTLPMetricConfig,
12+
PrometheusConfig,
813
ReaderConfig,
914
ServerServiceConfig,
1015
ServerSSLConfig,
@@ -16,9 +21,14 @@
1621
"ClientServiceConfig",
1722
"ClientSSLConfig",
1823
"ClientWSConfig",
24+
"HealthConfig",
25+
"HistogramBoundaries",
26+
"MetricsConfig",
1927
"dump_to_yaml",
2028
"load_config",
2129
"ObservabilityConfig",
30+
"OTLPMetricConfig",
31+
"PrometheusConfig",
2232
"ReaderConfig",
2333
"SENSITIVE_KEYS",
2434
"ServerServiceConfig",

savant_cloudpin/cfg/_bootstrap.py

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -52,13 +52,28 @@ def merge_env_config(
5252
env_cfg = utils.drop_none_values(env_cfg)
5353
cfg = OmegaConf.merge(yml_cfg, env_cfg, cli_cfg)
5454

55-
ssl = "websockets" in cfg and "ssl" in cfg.websockets
56-
ssl = cfg.websockets.ssl if ssl else {}
55+
assert isinstance(cfg, DictConfig)
56+
cfg.websockets = utils.drop_none_values(cfg.get("websockets", {}))
57+
ssl = cfg.websockets.get("ssl", None)
58+
cfg.observability = utils.drop_none_values(cfg.get("observability", {}))
59+
health = cfg.observability.get("health", {})
60+
metrics = cfg.observability.get("metrics", {})
61+
otlp = metrics.get("otlp", {})
62+
prometheus = metrics.get("prometheus", {})
5763

5864
cfg = OmegaConf.merge(default_cfg, cfg)
5965
assert isinstance(cfg, DictConfig)
60-
if not any(val is not None for val in ssl.values()):
61-
cfg.websockets.ssl = None
66+
if not ssl:
67+
cfg.websockets["ssl"] = None
68+
if not health:
69+
cfg.observability["health"] = None
70+
if not metrics:
71+
cfg.observability["metrics"] = None
72+
else:
73+
if not otlp:
74+
cfg.observability.metrics.otlp = None
75+
if not prometheus:
76+
cfg.observability.metrics.prometheus = None
6277
return cfg
6378

6479

savant_cloudpin/cfg/_defaults.py

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,11 @@
22
ClientServiceConfig,
33
ClientSSLConfig,
44
ClientWSConfig,
5+
HealthConfig,
6+
MetricsConfig,
57
ObservabilityConfig,
8+
OTLPMetricConfig,
9+
PrometheusConfig,
610
ReaderConfig,
711
ServerServiceConfig,
812
ServerSSLConfig,
@@ -23,19 +27,20 @@
2327
)
2428

2529
DEFAULT_OBSERVABILITY_CONFIG = ObservabilityConfig(
26-
log_spec="${oc.env:LOGLEVEL,warning}"
30+
health=HealthConfig(endpoint="???"),
31+
metrics=MetricsConfig(
32+
otlp=OTLPMetricConfig(endpoint="???"),
33+
prometheus=PrometheusConfig(
34+
endpoint="???",
35+
),
36+
),
2737
)
2838

2939
DEFAULT_CLIENT_CONFIG = ClientServiceConfig(
3040
websockets=ClientWSConfig(
31-
server_url="???",
41+
endpoint="???",
3242
api_key="???",
33-
ssl=ClientSSLConfig(
34-
ca_file=None,
35-
cert_file="???",
36-
key_file="???",
37-
check_hostname=False,
38-
),
43+
ssl=ClientSSLConfig(),
3944
),
4045
source=DEFAULT_SOURCE_CONFIG,
4146
sink=DEFAULT_SINK_CONFIG,
@@ -44,13 +49,11 @@
4449

4550
DEFAULT_SERVER_CONFIG = ServerServiceConfig(
4651
websockets=ServerWSConfig(
47-
server_url="???",
52+
endpoint="???",
4853
api_key="???",
4954
ssl=ServerSSLConfig(
50-
ca_file=None,
5155
cert_file="???",
5256
key_file="???",
53-
client_cert_required=True,
5457
),
5558
),
5659
source=DEFAULT_SOURCE_CONFIG,

0 commit comments

Comments
 (0)