New Annotators for SimpleShading, DiffuseAlbedo, and Fast Depth for ShadowHandEnv #4066
First render settings file, hunk @@ -1,3 +1,5 @@:

rtx.sdg.force.disableColorRender=false
rtx.translucency.enabled = false
rtx.reflections.enabled = false
Second render settings file, hunk @@ -1,3 +1,5 @@:

rtx.sdg.force.disableColorRender=false
rtx.translucency.enabled = true
rtx.reflections.enabled = true
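These RTX flags can also be toggled at runtime through the carb settings API instead of the experience files. A minimal sketch, assuming the standard Omniverse settings interface; the chosen values are illustrative:

```python
import carb.settings

# Sketch: apply the same RTX flags programmatically.
# The "/rtx/..." paths mirror the rtx.* keys in the config files above.
settings = carb.settings.get_settings()
settings.set("/rtx/sdg/force/disableColorRender", False)
settings.set("/rtx/translucency/enabled", True)  # false in the first file, true in the second
settings.set("/rtx/reflections/enabled", True)   # false in the first file, true in the second
```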
Tiled camera sensor (registers the new annotators and wires them into the data pipeline):

@@ -189,12 +189,28 @@ def _initialize_impl(self):
         )
         self._render_product_paths = [rp.path]
 
+        rep.AnnotatorRegistry.register_annotator_from_aov(
+            aov="DiffuseAlbedoSD", output_data_type=np.uint8, output_channels=4
+        )
+        rep.AnnotatorRegistry.register_annotator_from_aov(
+            aov="SimpleShadingSD", output_data_type=np.uint8, output_channels=4
+        )
         # Define the annotators based on requested data types
         self._annotators = dict()
         for annotator_type in self.cfg.data_types:
             if annotator_type == "rgba" or annotator_type == "rgb":
                 annotator = rep.AnnotatorRegistry.get_annotator("rgb", device=self.device, do_array_copy=False)
                 self._annotators["rgba"] = annotator
+            elif annotator_type == "diffuse_albedo":
+                annotator = rep.AnnotatorRegistry.get_annotator(
+                    "DiffuseAlbedoSD", device=self.device, do_array_copy=False
+                )
+                self._annotators["diffuse_albedo"] = annotator
+            elif annotator_type == "simple_shading":
+                annotator = rep.AnnotatorRegistry.get_annotator(
+                    "SimpleShadingSD", device=self.device, do_array_copy=False
+                )
+                self._annotators["simple_shading"] = annotator
             elif annotator_type == "depth" or annotator_type == "distance_to_image_plane":
                 # keep depth for backwards compatibility
                 annotator = rep.AnnotatorRegistry.get_annotator(

Contributor (on the register_annotator_from_aov calls): Please add comments on what these are and why they are needed.

Contributor (on the SimpleShadingSD registration): Should output_channels be 3 directly? Later on, slicing is done to that anyway.

Contributor: Is this only for Isaac Sim 5.1?
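For context, the new outputs are requested like any other tiled-camera data type. A minimal sketch, assuming the current isaaclab module layout and illustrative camera parameters; the diffuse_albedo and simple_shading keys exist only with this PR applied:

```python
import isaaclab.sim as sim_utils
from isaaclab.sensors import TiledCameraCfg

# Sketch: request the new annotator outputs alongside RGB and depth.
tiled_camera_cfg = TiledCameraCfg(
    prim_path="/World/envs/env_.*/Camera",
    width=120,
    height=120,
    data_types=["rgb", "distance_to_image_plane", "diffuse_albedo", "simple_shading"],
    spawn=sim_utils.PinholeCameraCfg(),
)

# After the scene is built and stepped, each output is a (num_envs, H, W, C) tensor, e.g.
#   camera.data.output["diffuse_albedo"]  -> uint8, 3 channels
#   camera.data.output["simple_shading"]  -> uint8, 3 channels
```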
@@ -254,13 +270,16 @@ def _update_buffers_impl(self, env_ids: Sequence[int]):
             else:
                 tiled_data_buffer = tiled_data_buffer.to(device=self.device)
 
-            # process data for different segmentation types
+            # process data for different segmentation types and custom annotators
             # Note: Replicator returns raw buffers of dtype uint32 for segmentation types
             # so we need to convert them to uint8 4 channel images for colorized types
+            # Note: Custom annotators (diffuse_albedo, simple_shading) also return 4 channel data
             if (
                 (data_type == "semantic_segmentation" and self.cfg.colorize_semantic_segmentation)
                 or (data_type == "instance_segmentation_fast" and self.cfg.colorize_instance_segmentation)
                 or (data_type == "instance_id_segmentation_fast" and self.cfg.colorize_instance_id_segmentation)
+                or data_type == "diffuse_albedo"
+                or data_type == "simple_shading"
             ):
                 tiled_data_buffer = wp.array(
                     ptr=tiled_data_buffer.ptr, shape=(*tiled_data_buffer.shape, 4), dtype=wp.uint8, device=self.device
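The wp.array(ptr=...) call above reinterprets the raw annotator buffer as a 4-channel uint8 image without copying. A small standalone sketch of the same aliasing pattern; shapes and device are illustrative:

```python
import numpy as np
import warp as wp

wp.init()

# Sketch: view a (H, W) packed-RGBA uint32 buffer as a (H, W, 4) uint8 array
# by aliasing the same memory, mirroring the zero-copy reinterpretation above.
packed = wp.array(np.zeros((2, 2), dtype=np.uint32), dtype=wp.uint32, device="cpu")
unpacked = wp.array(ptr=packed.ptr, shape=(*packed.shape, 4), dtype=wp.uint8, device="cpu")
print(unpacked.numpy().shape)  # (2, 2, 4)
```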
@@ -271,6 +290,13 @@ def _update_buffers_impl(self, env_ids: Sequence[int]):
             if data_type == "motion_vectors":
                 tiled_data_buffer = tiled_data_buffer[:, :, :2].contiguous()
 
+            # For diffuse albedo, keep only the first three channels (RGB)
+            if data_type == "diffuse_albedo":
+                tiled_data_buffer = tiled_data_buffer[:, :, :3].contiguous()
+            # For simple shading, keep only the first three channels (RGB)
+            if data_type == "simple_shading":
+                tiled_data_buffer = tiled_data_buffer[:, :, :3].contiguous()
+
             wp.launch(
                 kernel=reshape_tiled_image,
                 dim=(self._view.count, self.cfg.height, self.cfg.width),
@@ -347,6 +373,16 @@ def _create_buffers(self):
         if "rgb" in self.cfg.data_types:
             # RGB is the first 3 channels of RGBA
             data_dict["rgb"] = data_dict["rgba"][..., :3]
+        if "diffuse_albedo" in self.cfg.data_types:
+            data_dict["diffuse_albedo"] = torch.zeros(
+                (self._view.count, self.cfg.height, self.cfg.width, 4), device=self.device, dtype=torch.uint8
+            ).contiguous()
+            data_dict["diffuse_albedo"] = data_dict["diffuse_albedo"][..., :3]
+        if "simple_shading" in self.cfg.data_types:
+            data_dict["simple_shading"] = torch.zeros(
+                (self._view.count, self.cfg.height, self.cfg.width, 4), device=self.device, dtype=torch.uint8
+            ).contiguous()
+            data_dict["simple_shading"] = data_dict["simple_shading"][..., :3]
         if "distance_to_image_plane" in self.cfg.data_types:
             data_dict["distance_to_image_plane"] = torch.zeros(
                 (self._view.count, self.cfg.height, self.cfg.width, 1), device=self.device, dtype=torch.float32

Contributor (on lines +376 to +380): style: The buffer is created with 4 channels and then immediately sliced to 3 channels. Consider initializing with 3 channels directly for clarity (a suggested change was attached).

Contributor (on lines +381 to +385): Same style comment for the simple_shading buffer: consider initializing with 3 channels directly (a suggested change was attached).
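The attached suggested changes are not included in this view; a hedged sketch of what the reviewers describe (allocating the buffers with 3 channels directly, so no slicing is needed) could look like this:

```python
# Sketch of the reviewers' suggestion, not the actual suggested diff:
# allocate the RGB buffers with 3 channels directly instead of slicing a 4-channel buffer.
if "diffuse_albedo" in self.cfg.data_types:
    data_dict["diffuse_albedo"] = torch.zeros(
        (self._view.count, self.cfg.height, self.cfg.width, 3), device=self.device, dtype=torch.uint8
    ).contiguous()
if "simple_shading" in self.cfg.data_types:
    data_dict["simple_shading"] = torch.zeros(
        (self._view.count, self.cfg.height, self.cfg.width, 3), device=self.device, dtype=torch.uint8
    ).contiguous()
```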
Feature extractor used by the ShadowHand vision environment:

@@ -16,9 +16,9 @@
 class FeatureExtractorNetwork(nn.Module):
     """CNN architecture used to regress keypoint positions of the in-hand cube from image data."""
 
-    def __init__(self):
+    def __init__(self, num_channel):
         super().__init__()
-        num_channel = 7
+        self.num_channel = num_channel
         self.cnn = nn.Sequential(
             nn.Conv2d(num_channel, 16, kernel_size=6, stride=2, padding=0),
             nn.ReLU(),
@@ -45,8 +45,11 @@ def __init__(self):
 
     def forward(self, x):
         x = x.permute(0, 3, 1, 2)
-        x[:, 0:3, :, :] = self.data_transforms(x[:, 0:3, :, :])
-        x[:, 4:7, :, :] = self.data_transforms(x[:, 4:7, :, :])
+        if self.num_channel == 7:
+            x[:, 0:3, :, :] = self.data_transforms(x[:, 0:3, :, :])
+            x[:, 4:7, :, :] = self.data_transforms(x[:, 4:7, :, :])
+        elif self.num_channel == 3:
+            x[:, 0:3, :, :] = self.data_transforms(x[:, 0:3, :, :])
         cnn_x = self.cnn(x)
         out = self.linear(cnn_x.view(-1, 128))
         return out
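With the channel count now a constructor argument, the network can be exercised on dummy data. A minimal sketch, assuming FeatureExtractorNetwork is in scope and assuming a 120x120 input resolution (the actual resolution comes from the environment's camera config):

```python
import torch

# Sketch: single-stream (3-channel) variant, e.g. albedo-only or shading-only input.
net = FeatureExtractorNetwork(num_channel=3)
dummy = torch.zeros((8, 120, 120, 3))  # (N, H, W, C) layout; forward() permutes to NCHW
prediction = net(dummy)
print(prediction.shape)  # (8, output_dim) keypoint/pose prediction
```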
@@ -65,6 +68,8 @@ class FeatureExtractorCfg:
     write_image_to_file: bool = False
     """If True, the images from the camera sensor are written to file. Default is False."""
 
+    num_channel: int = 7
+
 
 class FeatureExtractor:
     """Class for extracting features from image data.
@@ -86,7 +91,7 @@ def __init__(self, cfg: FeatureExtractorCfg, device: str, log_dir: str | None = None):
         self.device = device
 
         # Feature extractor model
-        self.feature_extractor = FeatureExtractorNetwork()
+        self.feature_extractor = FeatureExtractorNetwork(self.cfg.num_channel)
         self.feature_extractor.to(self.device)
 
         self.step_count = 0
@@ -112,8 +117,13 @@ def __init__(self, cfg: FeatureExtractorCfg, device: str, log_dir: str | None = None):
         self.feature_extractor.eval()
 
     def _preprocess_images(
-        self, rgb_img: torch.Tensor, depth_img: torch.Tensor, segmentation_img: torch.Tensor
-    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        self,
+        rgb_img: torch.Tensor,
+        depth_img: torch.Tensor,
+        segmentation_img: torch.Tensor,
+        albedo_img: torch.Tensor | None = None,
+        simple_shading_img: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor | None, torch.Tensor | None, torch.Tensor | None, torch.Tensor | None, torch.Tensor | None]:
         """Preprocesses the input images.
 
         Args:
@@ -122,33 +132,63 @@ def _preprocess_images(
             segmentation_img (torch.Tensor): Segmentation image tensor. Shape: (N, H, W, 3)
 
         Returns:
-            tuple[torch.Tensor, torch.Tensor, torch.Tensor]: Preprocessed RGB, depth, and segmentation
+            tuple[torch.Tensor | None, torch.Tensor | None, torch.Tensor | None, torch.Tensor | None, torch.Tensor | None]:
+                Preprocessed RGB, depth, segmentation, albedo, and simple shading images
         """
-        rgb_img = rgb_img / 255.0
+        if rgb_img is not None:
+            rgb_img = rgb_img / 255.0
         # process depth image
-        depth_img[depth_img == float("inf")] = 0
-        depth_img /= 5.0
-        depth_img /= torch.max(depth_img)
+        if depth_img is not None:
+            depth_img[depth_img == float("inf")] = 0
+            depth_img /= 5.0
+            depth_img /= torch.max(depth_img)
         # process segmentation image
-        segmentation_img = segmentation_img / 255.0
-        mean_tensor = torch.mean(segmentation_img, dim=(1, 2), keepdim=True)
-        segmentation_img -= mean_tensor
-        return rgb_img, depth_img, segmentation_img
+        if segmentation_img is not None:
+            segmentation_img = segmentation_img / 255.0
+            mean_tensor = torch.mean(segmentation_img, dim=(1, 2), keepdim=True)
+            segmentation_img -= mean_tensor
+        # process albedo image
+        if albedo_img is not None:
+            albedo_img = albedo_img / 255.0
+        # process simple shading image
+        if simple_shading_img is not None:
+            simple_shading_img = simple_shading_img / 255.0
+        return rgb_img, depth_img, segmentation_img, albedo_img, simple_shading_img
 
-    def _save_images(self, rgb_img: torch.Tensor, depth_img: torch.Tensor, segmentation_img: torch.Tensor):
+    def _save_images(
+        self,
+        rgb_img: torch.Tensor | None,
+        depth_img: torch.Tensor | None,
+        segmentation_img: torch.Tensor | None,
+        albedo_img: torch.Tensor | None,
+        simple_shading_img: torch.Tensor | None,
+    ):
         """Writes image buffers to file.
 
         Args:
             rgb_img (torch.Tensor): RGB image tensor. Shape: (N, H, W, 3).
             depth_img (torch.Tensor): Depth image tensor. Shape: (N, H, W, 1).
             segmentation_img (torch.Tensor): Segmentation image tensor. Shape: (N, H, W, 3).
         """
-        save_images_to_file(rgb_img, "shadow_hand_rgb.png")
-        save_images_to_file(depth_img, "shadow_hand_depth.png")
-        save_images_to_file(segmentation_img, "shadow_hand_segmentation.png")
+        if rgb_img is not None:
+            save_images_to_file(rgb_img, "shadow_hand_rgb.png")
+        if depth_img is not None:
+            save_images_to_file(depth_img, "shadow_hand_depth.png")
+        if segmentation_img is not None:
+            save_images_to_file(segmentation_img, "shadow_hand_segmentation.png")
+        if albedo_img is not None:
+            save_images_to_file(albedo_img, "shadow_hand_diffuse_albedo.png")
+        if simple_shading_img is not None:
+            save_images_to_file(simple_shading_img, "shadow_hand_simple_shading.png")
 
     def step(
-        self, rgb_img: torch.Tensor, depth_img: torch.Tensor, segmentation_img: torch.Tensor, gt_pose: torch.Tensor
+        self,
+        rgb_img: torch.Tensor = None,
+        depth_img: torch.Tensor = None,
+        segmentation_img: torch.Tensor = None,
+        albedo_img: torch.Tensor = None,
+        simple_shading_img: torch.Tensor = None,
+        gt_pose: torch.Tensor = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         """Extracts the features using the images and trains the model if the train flag is set to True.

Contributor (on the Returns docstring): We don't do type annotations on docstrings since they are harder to maintain. Please remove instances of that everywhere.
@@ -162,15 +202,28 @@ def step(
             tuple[torch.Tensor, torch.Tensor]: Pose loss and predicted pose.
         """
 
-        rgb_img, depth_img, segmentation_img = self._preprocess_images(rgb_img, depth_img, segmentation_img)
+        rgb_img, depth_img, segmentation_img, albedo_img, simple_shading_img = self._preprocess_images(
+            rgb_img, depth_img, segmentation_img, albedo_img, simple_shading_img
+        )
 
         if self.cfg.write_image_to_file:
-            self._save_images(rgb_img, depth_img, segmentation_img)
+            self._save_images(rgb_img, depth_img, segmentation_img, albedo_img, simple_shading_img)
 
         if self.cfg.train:
             with torch.enable_grad():
                 with torch.inference_mode(False):
-                    img_input = torch.cat((rgb_img, depth_img, segmentation_img), dim=-1)
+                    if rgb_img is not None and depth_img is not None and segmentation_img is not None:
+                        img_input = torch.cat((rgb_img, depth_img, segmentation_img), dim=-1)
+                    elif albedo_img is not None:
+                        img_input = albedo_img
+                    elif simple_shading_img is not None:
+                        img_input = simple_shading_img
+                    elif rgb_img is not None:
+                        img_input = rgb_img
+                    elif depth_img is not None:
+                        img_input = depth_img
+                    elif segmentation_img is not None:
+                        img_input = segmentation_img
                     self.optimizer.zero_grad()
 
                     predicted_pose = self.feature_extractor(img_input)
@@ -189,6 +242,17 @@ def step(
 
             return pose_loss, predicted_pose
         else:
-            img_input = torch.cat((rgb_img, depth_img, segmentation_img), dim=-1)
+            if albedo_img is not None:
+                img_input = albedo_img
+            elif simple_shading_img is not None:
+                img_input = simple_shading_img
+            elif rgb_img is not None and depth_img is not None and segmentation_img is not None:
+                img_input = torch.cat((rgb_img, depth_img, segmentation_img), dim=-1)
+            elif rgb_img is not None:
+                img_input = rgb_img
+            elif depth_img is not None:
+                img_input = depth_img
+            elif segmentation_img is not None:
+                img_input = segmentation_img
             predicted_pose = self.feature_extractor(img_input)
             return None, predicted_pose
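With the new optional arguments, the extractor can be driven from a single annotator stream. A minimal inference-style sketch, assuming FeatureExtractor and FeatureExtractorCfg from this file are in scope, with illustrative shapes and device:

```python
import torch

# Sketch: run the extractor on diffuse-albedo images only, matching num_channel=3.
cfg = FeatureExtractorCfg(train=False, num_channel=3)
extractor = FeatureExtractor(cfg, device="cuda:0")

albedo = torch.zeros((8, 120, 120, 3), device="cuda:0")  # (N, H, W, 3) image batch
pose_loss, predicted_pose = extractor.step(albedo_img=albedo)  # pose_loss is None when not training
```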
Reviewer comment: Should this be set to False only if rendering is not "rgb" in the tiled camera? Wondering if the logic belongs here or in the camera class.