Commit ea56c11

Merge branch 'main' into feat/openai-block-v4

2 parents 23e3094 + b3aeb4d

3 files changed: +233 −93 lines changed

inference/core/workflows/core_steps/models/foundation/google_vision_ocr/v1.py

Lines changed: 165 additions & 91 deletions
@@ -7,6 +7,7 @@
 from pydantic import ConfigDict, Field
 from supervision.config import CLASS_NAME_DATA_FIELD
 
+from inference.core.roboflow_api import post_to_roboflow_api
 from inference.core.workflows.core_steps.common.utils import (
     attach_parents_coordinates_to_sv_detections,
 )
@@ -22,6 +23,7 @@
 from inference.core.workflows.execution_engine.entities.types import (
     IMAGE_KIND,
     OBJECT_DETECTION_PREDICTION_KIND,
+    ROBOFLOW_MANAGED_KEY,
     SECRET_KIND,
     STRING_KIND,
     Selector,
@@ -40,7 +42,8 @@
 - `text_detection`: optimized for areas of text within a larger image.
 - `ocr_text_detection`: optimized for dense text documents.
 
-You need to provide your Google Vision API key to use this block.
+Provide your Google Vision API key or set the value to ``rf_key:account`` (or
+``rf_key:user:<id>``) to proxy requests through Roboflow's API.
 """
 
 
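As a point of reference, a step using this block can now opt into the proxy simply by leaving the key at its new default. A minimal sketch of a step definition; the block type name `roboflow_core/google_vision_ocr@v1` is an assumption, not confirmed by this diff:

step = {
    # Hypothetical block type name; confirm against the block's registration.
    "type": "roboflow_core/google_vision_ocr@v1",
    "name": "ocr",
    "image": "$inputs.image",
    "ocr_type": "text_detection",
    # "rf_key:account" (the new default) or "rf_key:user:<id>" routes the
    # request through Roboflow's API proxy; any other string is used as a
    # direct Google Vision API key, and a selector like
    # "$inputs.google_api_key" is resolved by the workflow engine.
    "api_key": "rf_key:account",
}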
@@ -80,17 +83,20 @@ class BlockManifest(WorkflowBlockManifest):
             },
         },
     )
+    api_key: Union[
+        Selector(kind=[STRING_KIND, SECRET_KIND, ROBOFLOW_MANAGED_KEY]), str
+    ] = Field(
+        default="rf_key:account",
+        description="Your Google Vision API key",
+        examples=["xxx-xxx", "$inputs.google_api_key"],
+        private=True,
+    )
     language_hints: Optional[List[str]] = Field(
         default=None,
         description="Optional list of language codes to pass to the OCR API. If not provided, the API will attempt to detect the language automatically."
        "If provided, language codes must be supported by the OCR API, visit https://cloud.google.com/vision/docs/languages for list of supported language codes.",
         examples=[["en", "fr"], ["de"]],
     )
-    api_key: Union[Selector(kind=[STRING_KIND, SECRET_KIND]), str] = Field(
-        description="Your Google Vision API key",
-        examples=["xxx-xxx", "$inputs.google_api_key"],
-        private=True,
-    )
 
     @classmethod
     def describe_outputs(cls) -> List[OutputDefinition]:
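For readers skimming the manifest change: `language_hints` feeds straight into the Vision API payload built by `_build_request_json` further down this diff. A sketch of the assumed request body for one image with hints supplied:

payload = {
    "requests": [
        {
            "image": {"content": "<base64-encoded image>"},
            "features": [{"type": "TEXT_DETECTION"}],
            # Present only when language_hints is provided, e.g. ["en", "fr"]:
            "imageContext": {"languageHints": ["en", "fr"]},
        }
    ]
}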
@@ -109,6 +115,16 @@ def get_execution_engine_compatibility(cls) -> Optional[str]:
 
 class GoogleVisionOCRBlockV1(WorkflowBlock):
 
+    def __init__(
+        self,
+        api_key: Optional[str],
+    ):
+        self._roboflow_api_key = api_key
+
+    @classmethod
+    def get_init_parameters(cls) -> List[str]:
+        return ["api_key"]
+
     @classmethod
     def get_manifest(cls) -> Type[WorkflowBlockManifest]:
         return BlockManifest
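The init/run split matters here: the Roboflow API key used to authenticate proxied calls arrives once at construction time (via `get_init_parameters`), while the Google key or `rf_key:` value arrives with each `run()` call. A minimal sketch of the assumed engine-side wiring, with a placeholder key value:

# Sketch only: the execution engine is assumed to resolve each name returned
# by get_init_parameters() and pass it to the block's constructor.
init_values = {"api_key": "<roboflow-api-key>"}  # placeholder value
block = GoogleVisionOCRBlockV1(
    **{name: init_values[name] for name in GoogleVisionOCRBlockV1.get_init_parameters()}
)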
@@ -118,105 +134,163 @@ def run(
         image: WorkflowImageData,
         ocr_type: Literal["text_detection", "ocr_text_detection"],
         language_hints: Optional[List[str]],
-        api_key: str,
+        api_key: str = "rf_key:account",
     ) -> BlockResult:
-        # Decide which type of OCR to use and make the request to Google Vision API
+        # Decide which type of OCR to use
         if ocr_type == "text_detection":
-            type = "TEXT_DETECTION"
+            detection_type = "TEXT_DETECTION"
         elif ocr_type == "ocr_text_detection":
-            type = "DOCUMENT_TEXT_DETECTION"
+            detection_type = "DOCUMENT_TEXT_DETECTION"
         else:
             raise ValueError(f"Invalid ocr_type: {ocr_type}")
 
-        request_json = {
-            "requests": [
-                {
-                    "image": {"content": image.base64_image},
-                    "features": [{"type": type}],
-                }
-            ]
-        }
-
-        if language_hints is not None:
-            for r in request_json["requests"]:
-                r["imageContext"] = {"languageHints": language_hints}
-        response = requests.post(
-            "https://vision.googleapis.com/v1/images:annotate",
-            params={"key": api_key},
-            json=request_json,
+        request_json = _build_request_json(
+            image=image,
+            detection_type=detection_type,
+            language_hints=language_hints,
         )
 
-        if response.status_code != 200:
-            raise RuntimeError(
-                f"Request to Google Cloud Vision API failed: {str(response.json())}"
+        # Route to proxy or direct API based on api_key format
+        if api_key.startswith(("rf_key:account", "rf_key:user:")):
+            result = _execute_proxied_google_vision_request(
+                roboflow_api_key=self._roboflow_api_key,
+                google_vision_api_key=api_key,
+                request_json=request_json,
+            )
+        else:
+            result = _execute_google_vision_request(
+                api_key=api_key,
+                request_json=request_json,
             )
 
-        result = response.json()["responses"][0]
-
-        # Check for image without text
-        if "textAnnotations" not in result or not result["textAnnotations"]:
-            return {
-                "text": "",
-                "language": "",
-                "predictions": sv.Detections.empty(),
-            }
-
-        # Extract predictions from the response
-        text = result["textAnnotations"][0]["description"]
-        language = result["textAnnotations"][0]["locale"]
-
-        xyxy = []
-        confidence = []
-        classes = []
-        detections_id = []
-
-        for page in result["fullTextAnnotation"]["pages"]:
-            for block in page["blocks"]:
-                # Get bounding box coordinates
-                box = block["boundingBox"]["vertices"]
-                x_min = min(v.get("x", 0) for v in box)
-                y_min = min(v.get("y", 0) for v in box)
-                x_max = max(v.get("x", 0) for v in box)
-                y_max = max(v.get("y", 0) for v in box)
-                xyxy.append([x_min, y_min, x_max, y_max])
-
-                # Only DOCUMENT_TEXT_DETECTION provides confidence score, use 1.0 otherwise
-                confidence.append(block.get("confidence", 1.0))
-
-                # Get block text
-                block_text = []
-                for paragraph in block["paragraphs"]:
-                    for word in paragraph["words"]:
-                        word_text = "".join(
-                            symbol["text"] for symbol in word["symbols"]
-                        )
-                        block_text.append(word_text)
-                classes.append(" ".join(block_text))
-
-                # Create unique detection id for each block
-                detections_id.append(uuid4())
-
-        predictions = sv.Detections(
-            xyxy=np.array(xyxy),
-            confidence=np.array(confidence),
-            class_id=np.arange(len(classes)),
-            data={CLASS_NAME_DATA_FIELD: np.array(classes)},
-        )
+        return _parse_google_vision_response(result=result, image=image)
+
+
+def _build_request_json(
+    image: WorkflowImageData,
+    detection_type: str,
+    language_hints: Optional[List[str]],
+) -> dict:
+    ocr_request = {
+        "image": {"content": image.base64_image},
+        "features": [{"type": detection_type}],
+    }
+
+    if language_hints is not None:
+        ocr_request["imageContext"] = {"languageHints": language_hints}
+
+    return {"requests": [ocr_request]}
 
-        predictions[DETECTION_ID_KEY] = np.array(detections_id)
-        predictions[PREDICTION_TYPE_KEY] = np.array(["ocr"] * len(predictions))
-        image_height, image_width = image.numpy_image.shape[:2]
-        predictions[IMAGE_DIMENSIONS_KEY] = np.array(
-            [[image_height, image_width]] * len(predictions)
+
+def _execute_proxied_google_vision_request(
+    roboflow_api_key: str,
+    google_vision_api_key: str,
+    request_json: dict,
+) -> dict:
+    payload = {
+        "google_vision_api_key": google_vision_api_key,
+        "request_json": request_json,
+    }
+
+    try:
+        response_data = post_to_roboflow_api(
+            endpoint="apiproxy/google_vision_ocr",
+            api_key=roboflow_api_key,
+            payload=payload,
         )
+        return response_data["responses"][0]
+    except requests.exceptions.RequestException as e:
+        raise RuntimeError(f"Failed to connect to Roboflow proxy: {e}") from e
+    except (KeyError, IndexError) as e:
+        raise RuntimeError(
+            f"Invalid response structure from Roboflow proxy: {e}"
+        ) from e
 
-        predictions = attach_parents_coordinates_to_sv_detections(
-            detections=predictions,
-            image=image,
+
+def _execute_google_vision_request(
+    api_key: str,
+    request_json: dict,
+) -> dict:
+    response = requests.post(
+        "https://vision.googleapis.com/v1/images:annotate",
+        params={"key": api_key},
+        json=request_json,
+    )
+
+    if response.status_code != 200:
+        raise RuntimeError(
+            f"Request to Google Cloud Vision API failed: {str(response.json())}"
         )
 
+    return response.json()["responses"][0]
+
+
+def _parse_google_vision_response(
+    result: dict,
+    image: WorkflowImageData,
+) -> BlockResult:
+    # Check for image without text
+    if "textAnnotations" not in result or not result["textAnnotations"]:
         return {
-            "text": text,
-            "language": language,
-            "predictions": predictions,
+            "text": "",
+            "language": "",
+            "predictions": sv.Detections.empty(),
         }
+
+    # Extract predictions from the response
+    text = result["textAnnotations"][0]["description"]
+    language = result["textAnnotations"][0]["locale"]
+
+    xyxy = []
+    confidence = []
+    classes = []
+    detections_id = []
+
+    for page in result["fullTextAnnotation"]["pages"]:
+        for block in page["blocks"]:
+            # Get bounding box coordinates
+            box = block["boundingBox"]["vertices"]
+            x_min = min(v.get("x", 0) for v in box)
+            y_min = min(v.get("y", 0) for v in box)
+            x_max = max(v.get("x", 0) for v in box)
+            y_max = max(v.get("y", 0) for v in box)
+            xyxy.append([x_min, y_min, x_max, y_max])
+
+            # Only DOCUMENT_TEXT_DETECTION provides confidence score, use 1.0 otherwise
+            confidence.append(block.get("confidence", 1.0))
+
+            # Get block text
+            block_text = []
+            for paragraph in block["paragraphs"]:
+                for word in paragraph["words"]:
+                    word_text = "".join(symbol["text"] for symbol in word["symbols"])
+                    block_text.append(word_text)
+            classes.append(" ".join(block_text))
+
+            # Create unique detection id for each block
+            detections_id.append(uuid4())
+
+    predictions = sv.Detections(
+        xyxy=np.array(xyxy),
+        confidence=np.array(confidence),
+        class_id=np.arange(len(classes)),
+        data={CLASS_NAME_DATA_FIELD: np.array(classes)},
+    )
+
+    predictions[DETECTION_ID_KEY] = np.array(detections_id)
+    predictions[PREDICTION_TYPE_KEY] = np.array(["ocr"] * len(predictions))
+    image_height, image_width = image.numpy_image.shape[:2]
+    predictions[IMAGE_DIMENSIONS_KEY] = np.array(
+        [[image_height, image_width]] * len(predictions)
+    )
+
+    predictions = attach_parents_coordinates_to_sv_detections(
+        detections=predictions,
+        image=image,
+    )
+
+    return {
+        "text": text,
+        "language": language,
+        "predictions": predictions,
+    }
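The routing rule added to `run()` is a plain prefix check on the key string. A quick illustration of which values take which path (the Google-style key below is made up):

for key in ("rf_key:account", "rf_key:user:1234", "AIzaSy-made-up-key"):
    proxied = key.startswith(("rf_key:account", "rf_key:user:"))
    print(key, "->", "Roboflow proxy" if proxied else "direct Google Vision request")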

tests/inference/hosted_platform_tests/workflows_examples/test_workflow_with_google_ocr.py

Lines changed: 1 addition & 1 deletion
@@ -54,7 +54,7 @@
 }
 
 
-@pytest.mark.skipif(GOOGLE_VISION_API_KEY is None, reason="No OpenAI API key provided")
+@pytest.mark.skipif(GOOGLE_VISION_API_KEY is None, reason="No Google API key provided")
 @pytest.mark.flaky(retries=4, delay=1)
 def test_workflow_with_google_api_ocr(
     object_detection_service_url: str,
