roboflow
diff --git a/inference/core/interfaces/webrtc_worker/webrtc.py b/inference/core/interfaces/webrtc_worker/webrtc.py
Lines changed: 22 additions & 10 deletions
diff --git a/inference_experimental/inference_exp/__init__.py b/inference_experimental/inference_exp/__init__.py
Lines changed: 1 addition & 1 deletion
diff --git a/inference_experimental/inference_exp/models/auto_loaders/core.py b/inference_experimental/inference_exp/models/auto_loaders/core.py
Lines changed: 2 additions & 2 deletions
diff --git a/inference_experimental/inference_exp/models/auto_loaders/models_registry.py b/inference_experimental/inference_exp/models/auto_loaders/models_registry.py
Lines changed: 6 additions & 2 deletions
diff --git a/inference_experimental/inference_exp/models/base/documents_parsing.py b/inference_experimental/inference_exp/models/base/documents_parsing.py
Lines changed: 2 additions & 4 deletions
diff --git a/inference_experimental/inference_exp/models/depth_anything_v2/depth_anything_v2_hf.py b/inference_experimental/inference_exp/models/depth_anything_v2/depth_anything_v2_hf.py
Lines changed: 1 addition & 3 deletions
diff --git a/inference_experimental/inference_exp/models/doctr/doctr_torch.py b/inference_experimental/inference_exp/models/doctr/doctr_torch.py
Lines changed: 58 additions & 48 deletions
diff --git a/inference_experimental/inference_exp/models/moondream2/moondream2_hf.py b/inference_experimental/inference_exp/models/moondream2/moondream2_hf.py
Lines changed: 2 additions & 2 deletions
diff --git a/inference_experimental/inference_exp/models/rfdetr/rfdetr_instance_segmentation_pytorch.py b/inference_experimental/inference_exp/models/rfdetr/rfdetr_instance_segmentation_pytorch.py
Lines changed: 3 additions & 1 deletion
diff --git a/inference_experimental/inference_exp/models/rfdetr/rfdetr_object_detection_pytorch.py b/inference_experimental/inference_exp/models/rfdetr/rfdetr_object_detection_pytorch.py
Lines changed: 3 additions & 1 deletion
@@ -162,10 +162,11 @@ def send_chunked_data(
             f"Sending response for frame {frame_id}: {total_chunks} chunk(s), {len(payload_bytes)} bytes"
         )
 
+    view = memoryview(payload_bytes)
     for chunk_index in range(total_chunks):
         start = chunk_index * chunk_size
         end = min(start + chunk_size, len(payload_bytes))
-        chunk_data = payload_bytes[start:end]
+        chunk_data = view[start:end]
 
         message = create_chunked_binary_message(
             frame_id, chunk_index, total_chunks, chunk_data
@@ -305,7 +306,9 @@ async def _send_data_output(
 
         if self._data_mode == DataOutputMode.NONE:
             # Even empty responses use binary protocol
-            json_bytes = json.dumps(webrtc_output.model_dump()).encode("utf-8")
+            json_bytes = await asyncio.to_thread(
+                lambda: json.dumps(webrtc_output.model_dump()).encode("utf-8")
+            )
             send_chunked_data(self.data_channel, self._received_frames, json_bytes)
             return
 
@@ -371,15 +374,22 @@ async def _handle_data_channel_frame(self, message: bytes) -> None:
                     f"Received frame {frame_id}: {total_chunks} chunk(s), {len(jpeg_bytes)} bytes JPEG"
                 )
 
-            nparr = np.frombuffer(jpeg_bytes, np.uint8)
-            np_image = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
+            def _decode_to_frame(jpeg_bytes: bytes) -> VideoFrame:
+                nparr = np.frombuffer(jpeg_bytes, np.uint8)
+                np_image = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
+
+                if np_image is None:
+                    raise ValueError("cv2.imdecode returned None")
+
+                return VideoFrame.from_ndarray(np_image, format="bgr24")
 
-            if np_image is None:
-                logger.error(f"Failed to decode JPEG for frame {frame_id}")
+            try:
+                video_frame = await asyncio.to_thread(_decode_to_frame, jpeg_bytes)
+            except Exception as e:
+                logger.error(f"Failed to decode JPEG for frame {frame_id}: {e}")
                 return
 
-            video_frame = VideoFrame.from_ndarray(np_image, format="bgr24")
-            await self._data_frame_queue.put((frame_id, video_frame))
+            self._data_frame_queue.put_nowait((frame_id, video_frame))
 
             if frame_id % 100 == 1:
                 logger.info(f"Queued frame {frame_id}")
@@ -440,8 +450,10 @@ async def process_frames_data_only(self):
                 )
 
                 # Send data via data channel
-                await self._send_data_output(
-                    workflow_output, frame_timestamp, frame, errors
+                asyncio.create_task(
+                    self._send_data_output(
+                        workflow_output, frame_timestamp, frame, errors
+                    )
                 )
 
         except asyncio.CancelledError:
 
@@ -12,7 +12,7 @@
     MultiLabelClassificationPrediction,
 )
 from inference_exp.models.base.depth_estimation import DepthEstimationModel
-from inference_exp.models.base.documents_parsing import DocumentParsingModel
+from inference_exp.models.base.documents_parsing import StructuredOCRModel
 from inference_exp.models.base.embeddings import TextImageEmbeddingModel
 from inference_exp.models.base.instance_segmentation import (
     InstanceDetections,
 
@@ -57,7 +57,7 @@
     MultiLabelClassificationModel,
 )
 from inference_exp.models.base.depth_estimation import DepthEstimationModel
-from inference_exp.models.base.documents_parsing import DocumentParsingModel
+from inference_exp.models.base.documents_parsing import StructuredOCRModel
 from inference_exp.models.base.embeddings import TextImageEmbeddingModel
 from inference_exp.models.base.instance_segmentation import InstanceSegmentationModel
 from inference_exp.models.base.keypoints_detection import KeyPointsDetectionModel
@@ -79,7 +79,7 @@
     ClassificationModel,
     MultiLabelClassificationModel,
     DepthEstimationModel,
-    DocumentParsingModel,
+    StructuredOCRModel,
     TextImageEmbeddingModel,
     InstanceSegmentationModel,
     KeyPointsDetectionModel,
 
@@ -15,6 +15,7 @@
 CLASSIFICATION_TASK = "classification"
 MULTI_LABEL_CLASSIFICATION_TASK = "multi-label-classification"
 DEPTH_ESTIMATION_TASK = "depth-estimation"
+STRUCTURED_OCR_TASK = "structured-ocr"
 
 
 @dataclass(frozen=True)
@@ -356,8 +357,11 @@ class RegistryEntry:
     ),
     ("depth-anything-v2", DEPTH_ESTIMATION_TASK, BackendType.HF): LazyClass(
         module_name="inference_exp.models.depth_anything_v2.depth_anything_v2_hf",
-        class_name="DepthAnythingV2HF"
-    )
+        class_name="DepthAnythingV2HF",
+    ),
+    ("doctr", STRUCTURED_OCR_TASK, BackendType.TORCH): LazyClass(
+        module_name="inference_exp.models.doctr.doctr_torch", class_name="DocTR"
+    ),
 }
 
 
 
@@ -11,15 +11,13 @@
 )
 
 
-class DocumentParsingModel(
+class StructuredOCRModel(
     ABC, Generic[PreprocessedInputs, PreprocessingMetadata, RawPrediction]
 ):
 
     @classmethod
     @abstractmethod
-    def from_pretrained(
-        cls, model_name_or_path: str, **kwargs
-    ) -> "DocumentParsingModel":
+    def from_pretrained(cls, model_name_or_path: str, **kwargs) -> "StructuredOCRModel":
         pass
 
     @property
 
@@ -28,9 +28,7 @@ def from_pretrained(
             local_files_only=local_files_only,
         ).to(device)
         processor = AutoImageProcessor.from_pretrained(
-            model_name_or_path,
-            local_files_only=local_files_only,
-            use_fast=True
+            model_name_or_path, local_files_only=local_files_only, use_fast=True
         )
         return cls(model=model, processor=processor, device=device)
 
 
@@ -1,88 +1,98 @@
-import os
 from dataclasses import dataclass
 from typing import Callable, List, Optional, Tuple, Union
 
 import numpy as np
 import torch
 from doctr.io import Document
-from doctr.models import ocr_predictor
+from doctr.models import detection_predictor, ocr_predictor, recognition_predictor
 from inference_exp import Detections
 from inference_exp.configuration import DEFAULT_DEVICE
 from inference_exp.entities import ColorFormat, ImageDimensions
 from inference_exp.errors import CorruptedModelPackageError, ModelRuntimeError
-from inference_exp.models.base.documents_parsing import DocumentParsingModel
+from inference_exp.models.base.documents_parsing import StructuredOCRModel
 from inference_exp.models.common.model_packages import get_model_package_contents
 from inference_exp.utils.file_system import read_json
 
-WEIGHTS_NAMES_MAPPING = {
-    "db_resnet50": "db_resnet50-79bd7d70.pt",
-    "db_resnet34": "db_resnet34-cb6aed9e.pt",
-    "db_mobilenet_v3_large": "db_mobilenet_v3_large-21748dd0.pt",
-    "crnn_vgg16_bn": "crnn_vgg16_bn-9762b0b0.pt",
-    "crnn_mobilenet_v3_small": "crnn_mobilenet_v3_small_pt-3b919a02.pt",
-    "crnn_mobilenet_v3_large": "crnn_mobilenet_v3_large_pt-f5259ec2.pt",
+SUPPORTED_DETECTION_MODELS = {
+    "fast_base",
+    "fast_small",
+    "fast_tiny",
+    "db_resnet50",
+    "db_resnet34",
+    "db_mobilenet_v3_large",
+    "linknet_resnet18",
+    "linknet_resnet34",
+    "linknet_resnet50",
+}
+SUPPORTED_RECOGNITION_MODELS = {
+    "crnn_vgg16_bn",
+    "crnn_mobilenet_v3_small",
+    "crnn_mobilenet_v3_large",
+    "master",
+    "sar_resnet31",
+    "vitstr_small",
+    "vitstr_base",
+    "parseq",
 }
 
 
-class DocTR(DocumentParsingModel[List[np.ndarray], ImageDimensions, Document]):
+class DocTR(StructuredOCRModel[List[np.ndarray], ImageDimensions, Document]):
 
     @classmethod
     def from_pretrained(
         cls,
         model_name_or_path: str,
         device: torch.device = DEFAULT_DEVICE,
+        assume_straight_pages: bool = True,
+        preserve_aspect_ratio: bool = True,
+        detection_max_batch_size: int = 2,
+        recognition_max_batch_size: int = 128,
         **kwargs,
-    ) -> "DocumentParsingModel":
-        os.environ["DOCTR_CACHE_DIR"] = model_name_or_path
+    ) -> "StructuredOCRModel":
         model_package_content = get_model_package_contents(
             model_package_dir=model_name_or_path,
-            elements=["doctr_det", "doctr_rec", "config.json"],
+            elements=["detection_weights.pt", "recognition_weights.pt", "config.json"],
         )
         config = parse_model_config(config_path=model_package_content["config.json"])
-        os.makedirs(f"{model_name_or_path}/doctr_det/models/", exist_ok=True)
-        os.makedirs(f"{model_name_or_path}/doctr_rec/models/", exist_ok=True)
-        det_model_source_path = os.path.join(
-            model_name_or_path, "doctr_det", config.det_model, "model.pt"
-        )
-        rec_model_source_path = os.path.join(
-            model_name_or_path, "doctr_rec", config.rec_model, "model.pt"
-        )
-        if not os.path.exists(det_model_source_path):
-            raise CorruptedModelPackageError(
-                message="Could not initialize DocTR model - could not find detection model weights.",
-                help_url="https://todo",
-            )
-        if not os.path.exists(rec_model_source_path):
-            raise CorruptedModelPackageError(
-                message="Could not initialize DocTR model - could not find recognition model weights.",
-                help_url="https://todo",
-            )
-        if config.det_model not in WEIGHTS_NAMES_MAPPING:
+        if config.det_model not in SUPPORTED_DETECTION_MODELS:
             raise CorruptedModelPackageError(
                 message=f"{config.det_model} model denoted in configuration not supported as DocTR detection model.",
                 help_url="https://todo",
             )
-        if config.rec_model not in WEIGHTS_NAMES_MAPPING:
+        if config.rec_model not in SUPPORTED_RECOGNITION_MODELS:
             raise CorruptedModelPackageError(
-                message=f"{config.det_model} model denoted in configuration not supported as DocTR recognition model.",
+                message=f"{config.rec_model} model denoted in configuration not supported as DocTR recognition model.",
                 help_url="https://todo",
             )
-        det_model_target_path = os.path.join(
-            model_name_or_path, "models", WEIGHTS_NAMES_MAPPING[config.det_model]
+        det_model = detection_predictor(
+            arch=config.det_model,
+            pretrained=False,
+            assume_straight_pages=assume_straight_pages,
+            preserve_aspect_ratio=preserve_aspect_ratio,
+            batch_size=detection_max_batch_size,
+        )
+        det_model.model.to(device)
+        detector_weights = torch.load(
+            model_package_content["detection_weights.pt"],
+            weights_only=True,
+            map_location=device,
+        )
+        det_model.model.load_state_dict(detector_weights)
+        rec_model = recognition_predictor(
+            arch=config.rec_model,
+            pretrained=False,
+            batch_size=recognition_max_batch_size,
         )
-        rec_model_target_path = os.path.join(
-            model_name_or_path, "models", WEIGHTS_NAMES_MAPPING[config.rec_model]
+        rec_model.model.to(device)
+        rec_weights = torch.load(
+            model_package_content["recognition_weights.pt"],
+            weights_only=True,
+            map_location=device,
         )
-        if os.path.exists(det_model_target_path):
-            os.remove(det_model_target_path)
-        os.symlink(det_model_source_path, det_model_target_path)
-        if os.path.exists(rec_model_target_path):
-            os.remove(rec_model_target_path)
-        os.symlink(rec_model_source_path, rec_model_target_path)
+        rec_model.model.load_state_dict(rec_weights)
         model = ocr_predictor(
-            det_arch=config.det_model,
-            reco_arch=config.rec_model,
-            pretrained=True,
+            det_arch=det_model.model,
+            reco_arch=rec_model.model,
         ).to(device=device)
         return cls(model=model, device=device)
 
 
@@ -37,8 +37,8 @@ def from_pretrained(
         if torch.mps.is_available():
             raise ModelRuntimeError(
                 message=f"This model cannot run on Apple device with MPS unit - original implementation contains bug "
-                        f"preventing proper allocation of tensors which causes runtime error. Run this model on the "
-                        f"machine with Nvidia GPU or x86 CPU.",
+                f"preventing proper allocation of tensors which causes runtime error. Run this model on the "
+                f"machine with Nvidia GPU or x86 CPU.",
                 help_url="https://todo",
             )
         model_package_content = get_model_package_contents(
 
@@ -124,7 +124,9 @@ def from_pretrained(
         model_config = CONFIG_FOR_MODEL_TYPE[model_type](device=device)
         checkpoint_num_classes = weights_dict["class_embed.bias"].shape[0]
         model_config.num_classes = checkpoint_num_classes - 1
-        model_config.resolution = inference_config.network_input.training_input_size.height
+        model_config.resolution = (
+            inference_config.network_input.training_input_size.height
+        )
         model = build_model(config=model_config)
         model.load_state_dict(weights_dict)
         model = model.eval().to(device)
 
@@ -130,7 +130,9 @@ def from_pretrained(
         model_config = CONFIG_FOR_MODEL_TYPE[model_type](device=device)
         checkpoint_num_classes = weights_dict["class_embed.bias"].shape[0]
         model_config.num_classes = checkpoint_num_classes - 1
-        model_config.resolution = inference_config.network_input.training_input_size.height
+        model_config.resolution = (
+            inference_config.network_input.training_input_size.height
+        )
         model = build_model(config=model_config)
         model.load_state_dict(weights_dict)
         model = model.eval().to(device)
Original file line number	Diff line number	Diff line change
`@@ -12,7 +12,7 @@`
`12`	`12`	`MultiLabelClassificationPrediction,`
`13`	`13`	`)`
`14`	`14`	`from inference_exp.models.base.depth_estimation import DepthEstimationModel`
`15`		`-from inference_exp.models.base.documents_parsing import DocumentParsingModel`
	`15`	`+from inference_exp.models.base.documents_parsing import StructuredOCRModel`
`16`	`16`	`from inference_exp.models.base.embeddings import TextImageEmbeddingModel`
`17`	`17`	`from inference_exp.models.base.instance_segmentation import (`
`18`	`18`	`InstanceDetections,`