Skip to content

Commit ff184a0

Browse files
Merge pull request #1700 from roboflow/lean/webrtc-data-decoupling
WebRTC data decoupling
2 parents ff4db8b + dc95986 commit ff184a0

File tree

9 files changed

+948
-183
lines changed

9 files changed

+948
-183
lines changed

examples/webrtc/webrtc_worker.py

Lines changed: 409 additions & 37 deletions
Large diffs are not rendered by default.

inference/core/exceptions.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -216,3 +216,7 @@ def __init__(self, message: str, inner_error: Exception):
216216
@property
217217
def inner_error(self) -> Exception:
218218
return self._inner_error
219+
220+
221+
class WebRTCConfigurationError(Exception):
222+
pass

inference/core/interfaces/http/error_handlers.py

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,7 @@
2828
RoboflowAPITimeoutError,
2929
RoboflowAPIUnsuccessfulRequestError,
3030
ServiceConfigurationError,
31+
WebRTCConfigurationError,
3132
WorkspaceLoadError,
3233
)
3334
from inference.core.interfaces.stream_manager.api.errors import (
@@ -358,6 +359,15 @@ def wrapped_route(*args, **kwargs):
358359
"inner_error_type": error.inner_error_type,
359360
},
360361
)
362+
except WebRTCConfigurationError as error:
363+
logger.error("%s: %s", type(error).__name__, error)
364+
resp = JSONResponse(
365+
status_code=400,
366+
content={
367+
"message": str(error),
368+
"error_type": "WebRTCConfigurationError",
369+
},
370+
)
361371
except Exception as error:
362372
logger.exception("%s: %s", type(error).__name__, error)
363373
resp = JSONResponse(status_code=500, content={"message": "Internal error."})
@@ -661,6 +671,15 @@ async def wrapped_route(*args, **kwargs):
661671
"inner_error_type": error.inner_error_type,
662672
},
663673
)
674+
except WebRTCConfigurationError as error:
675+
logger.error("%s: %s", type(error).__name__, error)
676+
resp = JSONResponse(
677+
status_code=400,
678+
content={
679+
"message": str(error),
680+
"error_type": "WebRTCConfigurationError",
681+
},
682+
)
664683
except Exception as error:
665684
logger.exception("%s: %s", type(error).__name__, error)
666685
resp = JSONResponse(status_code=500, content={"message": "Internal error."})

inference/core/interfaces/http/http_api.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,7 @@
171171
MissingServiceSecretError,
172172
RoboflowAPINotAuthorizedError,
173173
RoboflowAPINotNotFoundError,
174+
WebRTCConfigurationError,
174175
WorkspaceLoadError,
175176
)
176177
from inference.core.interfaces.base import BaseInterface
@@ -1467,6 +1468,7 @@ async def initialise_webrtc_worker(
14671468
"RoboflowAPINotAuthorizedError": RoboflowAPINotAuthorizedError,
14681469
"RoboflowAPINotNotFoundError": RoboflowAPINotNotFoundError,
14691470
"ValidationError": ValidationError,
1471+
"WebRTCConfigurationError": WebRTCConfigurationError,
14701472
}
14711473
exc = expected_exceptions.get(
14721474
worker_result.exception_type, Exception

inference/core/interfaces/stream_manager/manager_app/entities.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -112,8 +112,8 @@ class InitialiseWebRTCPipelinePayload(InitialisePipelinePayload):
112112
WEBRTC_REALTIME_PROCESSING # this parameter controls only webrtc processing, not inference pipeline strategies
113113
)
114114
webrtc_turn_config: Optional[WebRTCTURNConfig] = None
115-
stream_output: Optional[List[Optional[str]]] = Field(default_factory=list)
116-
data_output: Optional[List[Optional[str]]] = Field(default_factory=list)
115+
stream_output: Optional[List[str]] = Field(default_factory=list)
116+
data_output: Optional[List[str]] = Field(default_factory=list)
117117
webcam_fps: Optional[float] = (
118118
None # TODO: this parameter is now passed for both webcam and video source
119119
)
@@ -124,8 +124,8 @@ class InitialiseWebRTCPipelinePayload(InitialisePipelinePayload):
124124

125125

126126
class WebRTCData(BaseModel):
127-
stream_output: Optional[str] = None
128-
data_output: Optional[str] = None
127+
stream_output: Optional[List[str]] = None
128+
data_output: Optional[List[str]] = None
129129

130130

131131
class ConsumeResultsPayload(BaseModel):

inference/core/interfaces/stream_manager/manager_app/webrtc.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -338,7 +338,7 @@ def __init__(
338338
video_transform_track: VideoTransformTrack,
339339
asyncio_loop: asyncio.AbstractEventLoop,
340340
stream_output: Optional[str] = None,
341-
data_output: Optional[str] = None,
341+
data_output: Optional[List[str]] = None,
342342
*args,
343343
**kwargs,
344344
):
@@ -347,7 +347,7 @@ def __init__(
347347
self.video_transform_track: VideoTransformTrack = video_transform_track
348348
self._consumers_signalled: bool = False
349349
self.stream_output: Optional[str] = stream_output
350-
self.data_output: Optional[str] = data_output
350+
self.data_output: Optional[List[str]] = data_output
351351
self.data_channel: Optional[RTCDataChannel] = None
352352

353353

@@ -384,7 +384,7 @@ async def init_rtc_peer_connection(
384384
webrtc_realtime_processing: bool = True,
385385
webcam_fps: Optional[float] = None,
386386
stream_output: Optional[str] = None,
387-
data_output: Optional[str] = None,
387+
data_output: Optional[List[str]] = None,
388388
) -> RTCPeerConnectionWithFPS:
389389
relay = MediaRelay()
390390
video_transform_track = VideoTransformTrack(

inference/core/interfaces/webrtc_worker/entities.py

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
from typing import List, Literal, Optional, Union
1+
from enum import Enum
2+
from typing import Any, Dict, List, Literal, Optional, Union
23

34
from pydantic import BaseModel, Field
45

@@ -21,8 +22,8 @@ class WebRTCWorkerRequest(BaseModel):
2122
webrtc_realtime_processing: bool = (
2223
WEBRTC_REALTIME_PROCESSING # when set to True, MediaRelay.subscribe will be called with buffered=False
2324
)
24-
stream_output: Optional[List[Optional[str]]] = Field(default_factory=list)
25-
data_output: Optional[List[Optional[str]]] = Field(default_factory=list)
25+
stream_output: Optional[List[str]] = Field(default=None)
26+
data_output: Optional[List[str]] = Field(default=None)
2627
declared_fps: Optional[float] = None
2728
rtsp_url: Optional[str] = None
2829
processing_timeout: Optional[int] = WEBRTC_MODAL_FUNCTION_TIME_LIMIT
@@ -53,8 +54,15 @@ class WebRTCVideoMetadata(BaseModel):
5354

5455

5556
class WebRTCOutput(BaseModel):
56-
output_name: Optional[str] = None
57-
serialized_output_data: Optional[str] = None
57+
"""Output sent via WebRTC data channel.
58+
59+
serialized_output_data contains a dictionary with workflow outputs:
60+
- If data_output is None or []: no data sent (only metadata)
61+
- If data_output is ["*"]: all workflow outputs (excluding images, unless explicitly named)
62+
- If data_output is ["field1", "field2"]: only those fields (including images if explicitly named)
63+
"""
64+
65+
serialized_output_data: Optional[Dict[str, Any]] = None
5866
video_metadata: Optional[WebRTCVideoMetadata] = None
5967
errors: List[str] = Field(default_factory=list)
6068

@@ -66,3 +74,15 @@ class WebRTCWorkerResult(BaseModel):
6674
error_message: Optional[str] = None
6775
error_context: Optional[str] = None
6876
inner_error: Optional[str] = None
77+
78+
79+
class StreamOutputMode(str, Enum):
80+
AUTO_DETECT = "auto_detect" # None -> auto-detect first image
81+
NO_VIDEO = "no_video" # [] -> no video track
82+
SPECIFIC_FIELD = "specific" # ["field"] -> use specific field
83+
84+
85+
class DataOutputMode(str, Enum):
86+
NONE = "none" # None or [] -> no data sent
87+
ALL = "all" # ["*"] -> send all (skip images)
88+
SPECIFIC = "specific" # ["field1", "field2"] -> send only these

inference/core/interfaces/webrtc_worker/utils.py

Lines changed: 44 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -15,16 +15,38 @@
1515
logging.getLogger("aiortc").setLevel(logging.WARNING)
1616

1717

18+
def detect_image_output(
19+
workflow_output: Dict[str, Union[WorkflowImageData, Any]],
20+
) -> Optional[str]:
21+
"""Detect the first available image output field in workflow output."""
22+
for output_name in workflow_output.keys():
23+
if (
24+
get_frame_from_workflow_output(
25+
workflow_output=workflow_output,
26+
frame_output_key=output_name,
27+
)
28+
is not None
29+
):
30+
return output_name
31+
return None
32+
33+
1834
def process_frame(
1935
frame: VideoFrame,
2036
frame_id: int,
2137
inference_pipeline: InferencePipeline,
22-
stream_output: str,
38+
stream_output: Optional[str] = None,
39+
render_output: bool = True,
2340
include_errors_on_frame: bool = True,
24-
) -> Tuple[Dict[str, Union[WorkflowImageData, Any]], VideoFrame, List[str]]:
41+
) -> Tuple[
42+
Dict[str, Union[WorkflowImageData, Any]],
43+
Optional[VideoFrame],
44+
List[str],
45+
]:
2546
np_image = frame.to_ndarray(format="bgr24")
2647
workflow_output: Dict[str, Union[WorkflowImageData, Any]] = {}
2748
errors = []
49+
2850
try:
2951
video_frame = InferenceVideoFrame(
3052
image=np_image,
@@ -34,36 +56,40 @@ def process_frame(
3456
fps=30, # placeholder
3557
measured_fps=30, # placeholder
3658
)
37-
workflow_output: Dict[str, Union[WorkflowImageData, Any]] = (
38-
inference_pipeline._on_video_frame([video_frame])[0]
59+
workflow_output = inference_pipeline._on_video_frame([video_frame])[0]
60+
except Exception as e:
61+
logger.exception("Error in workflow processing")
62+
errors.append(str(e))
63+
64+
if not render_output:
65+
return workflow_output, None, errors
66+
67+
if stream_output is None:
68+
errors.append("stream_output is required when render_output=True")
69+
return (
70+
workflow_output,
71+
VideoFrame.from_ndarray(np_image, format="bgr24"),
72+
errors,
3973
)
40-
result_np_image: Optional[np.ndarray] = get_frame_from_workflow_output(
74+
75+
result_np_image: Optional[np.ndarray] = None
76+
try:
77+
result_np_image = get_frame_from_workflow_output(
4178
workflow_output=workflow_output,
4279
frame_output_key=stream_output,
4380
)
44-
if result_np_image is None:
45-
for k in workflow_output.keys():
46-
result_np_image = get_frame_from_workflow_output(
47-
workflow_output=workflow_output,
48-
frame_output_key=k,
49-
)
50-
if result_np_image is not None:
51-
errors.append(
52-
f"'{stream_output}' not found in workflow outputs, using '{k}' instead"
53-
)
54-
break
5581
if result_np_image is None:
5682
errors.append("Visualisation blocks were not executed")
5783
errors.append("or workflow was not configured to output visuals.")
5884
errors.append("Please try to adjust the scene so models detect objects")
5985
errors.append("or stop preview, update workflow and try again.")
6086
result_np_image = np_image
6187
except Exception as e:
62-
logger.exception("Error in inference pipeline")
88+
logger.exception("Error extracting visual output")
6389
result_np_image = np_image
6490
errors.append(str(e))
6591

66-
if include_errors_on_frame:
92+
if include_errors_on_frame and errors:
6793
result_np_image = overlay_text_on_np_frame(
6894
frame=result_np_image,
6995
text=errors,

0 commit comments

Comments (0)