Fix/yolov8face roi padding #1014 (#1017)

ryan-goosling · web-flow · commit ca2d1bda472f · 2025-07-18T12:31:51.000+02:00
* Fix: YoloV8face converter does not apply ROI padding (#1014) * fixed readme for age_gender_rec * removed converter yolov5face * fixed top, left -> left, top annotations for coordinates
diff --git a/samples/age_gender_recognition/README.md b/samples/age_gender_recognition/README.md
@@ -1,4 +1,4 @@
-# Faces detection, tracking and age-gender recognition (YoloV5face, Nvidia Tracker, Age-Gender model)
+# Faces detection, tracking and age-gender recognition (YoloV8face, Nvidia Tracker, Age-Gender model)
 
 **NB**: The demo uses **YOLOV8-Face** model which takes up to **30-40 minutes** to compile to TensorRT engine. The first launch takes an enormous time.
 
diff --git a/savant/converter/yolo_v5face.py b/savant/converter/yolo_v5face.py
diff --git a/savant/converter/yolo_v8face.py b/savant/converter/yolo_v8face.py
@@ -39,7 +39,7 @@ def __call__(
         :param output_layers: Output layer tensor
         :param model: Model definition, required parameters: input tensor shape,
             maintain_aspect_ratio
-        :param roi: [top, left, width, height] of the rectangle
+        :param roi: [left, top, width, height] of the rectangle
             on which the model infers
         :return: a combination of :py:class:`.BaseObjectModelOutputConverter` and
             :py:class:`.BaseAttributeModelOutputConverter` outputs:
@@ -51,8 +51,9 @@ def __call__(
         """
         attr_name = model.output.attributes[0].name
 
-        ration_width = roi[2] / model.input.shape[2]
-        ratio_height = roi[3] / model.input.shape[1]
+        roi_left, roi_top, roi_width, roi_height = roi
+        ratio_width = roi_width / model.input.shape[2]
+        ratio_height = roi_height / model.input.shape[1]
 
         raw_predictions = np.transpose(output_layers[0])
 
@@ -76,13 +77,22 @@ def __call__(
         xywh = selected_nms_predictions[:, :4]
         conf = selected_nms_predictions[:, 4:5]
         class_num = np.zeros_like(conf)
-        xywh *= np.tile(np.float32([ration_width, ratio_height]), 2)
+
+        # Scale and shift bbox coordinates
+        xywh *= np.tile(np.float32([ratio_width, ratio_height]), 2)
+        xywh[:, 0] += roi_left  # x center
+        xywh[:, 1] += roi_top  # y center
+
         bbox_output = np.concatenate((class_num, conf, xywh), axis=1)
 
+        # Process landmarks (5 points, each with x, y, conf)
         landmarks = (
             selected_nms_predictions[:, 5:20]
-            * np.tile(np.float32([ration_width, ratio_height, 1.0]), 5)
+            * np.tile(np.float32([ratio_width, ratio_height, 1.0]), 5)
         ).reshape(-1, 5, 3)
+        landmarks[:, :, 0] += roi_left  # x
+        landmarks[:, :, 1] += roi_top  # y
+
         landmarks_output = [
             [(attr_name, lms, conf)]
             for lms, conf in zip(

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-# Faces detection, tracking and age-gender recognition (YoloV5face, Nvidia Tracker, Age-Gender model)`
	`1`	`+# Faces detection, tracking and age-gender recognition (YoloV8face, Nvidia Tracker, Age-Gender model)`
`2`	`2`
`3`	`3`	`NB: The demo uses YOLOV8-Face model which takes up to 30-40 minutes to compile to TensorRT engine. The first launch takes an enormous time.`
`4`	`4`