diff --git a/apps/rendering_modes/balanced.kit b/apps/rendering_modes/balanced.kit
index ee92625fd7e..d9b793f2915 100644
--- a/apps/rendering_modes/balanced.kit
+++ b/apps/rendering_modes/balanced.kit
@@ -1,3 +1,5 @@
+rtx.sdg.force.disableColorRender = false
+
 rtx.translucency.enabled = false
 rtx.reflections.enabled = false
 
diff --git a/apps/rendering_modes/performance.kit b/apps/rendering_modes/performance.kit
index 3cfe6e8c0e2..3925a8e1dff 100644
--- a/apps/rendering_modes/performance.kit
+++ b/apps/rendering_modes/performance.kit
@@ -1,3 +1,5 @@
+rtx.sdg.force.disableColorRender = true  # change to false for RGB baselines
+
 rtx.translucency.enabled = false
 rtx.reflections.enabled = false
 
diff --git a/apps/rendering_modes/quality.kit b/apps/rendering_modes/quality.kit
index 8e966ddfd3b..2aa8d8eae98 100644
--- a/apps/rendering_modes/quality.kit
+++ b/apps/rendering_modes/quality.kit
@@ -1,3 +1,5 @@
+rtx.sdg.force.disableColorRender = false
+
 rtx.translucency.enabled = true
 rtx.reflections.enabled = true
 
diff --git a/source/isaaclab/isaaclab/sensors/camera/tiled_camera.py b/source/isaaclab/isaaclab/sensors/camera/tiled_camera.py
index 3e9982135c5..a36111853e7 100644
--- a/source/isaaclab/isaaclab/sensors/camera/tiled_camera.py
+++ b/source/isaaclab/isaaclab/sensors/camera/tiled_camera.py
@@ -189,12 +189,28 @@ def _initialize_impl(self):
         )
         self._render_product_paths = [rp.path]
 
+        rep.AnnotatorRegistry.register_annotator_from_aov(
+            aov="DiffuseAlbedoSD", output_data_type=np.uint8, output_channels=4
+        )
+        rep.AnnotatorRegistry.register_annotator_from_aov(
+            aov="SimpleShadingSD", output_data_type=np.uint8, output_channels=4
+        )
         # Define the annotators based on requested data types
         self._annotators = dict()
         for annotator_type in self.cfg.data_types:
             if annotator_type == "rgba" or annotator_type == "rgb":
                 annotator = rep.AnnotatorRegistry.get_annotator("rgb", device=self.device, do_array_copy=False)
                 self._annotators["rgba"] = annotator
+            elif annotator_type == "diffuse_albedo":
+                annotator = rep.AnnotatorRegistry.get_annotator(
+                    "DiffuseAlbedoSD", device=self.device, do_array_copy=False
+                )
+                self._annotators["diffuse_albedo"] = annotator
+            elif annotator_type == "simple_shading":
+                annotator = rep.AnnotatorRegistry.get_annotator(
+                    "SimpleShadingSD", device=self.device, do_array_copy=False
+                )
+                self._annotators["simple_shading"] = annotator
             elif annotator_type == "depth" or annotator_type == "distance_to_image_plane":
                 # keep depth for backwards compatibility
                 annotator = rep.AnnotatorRegistry.get_annotator(
@@ -254,13 +270,16 @@ def _update_buffers_impl(self, env_ids: Sequence[int]):
             else:
                 tiled_data_buffer = tiled_data_buffer.to(device=self.device)
 
-            # process data for different segmentation types
+            # process data for different segmentation types and custom annotators
             # Note: Replicator returns raw buffers of dtype uint32 for segmentation types
             # so we need to convert them to uint8 4 channel images for colorized types
+            # Note: Custom annotators (diffuse_albedo, simple_shading) also return 4 channel data
             if (
                 (data_type == "semantic_segmentation" and self.cfg.colorize_semantic_segmentation)
                 or (data_type == "instance_segmentation_fast" and self.cfg.colorize_instance_segmentation)
                 or (data_type == "instance_id_segmentation_fast" and self.cfg.colorize_instance_id_segmentation)
+                or data_type == "diffuse_albedo"
+                or data_type == "simple_shading"
             ):
                 tiled_data_buffer = wp.array(
                     ptr=tiled_data_buffer.ptr, shape=(*tiled_data_buffer.shape, 4), dtype=wp.uint8, device=self.device
@@ -271,6 +290,13 @@ def _update_buffers_impl(self, env_ids: Sequence[int]):
             if data_type == "motion_vectors":
                 tiled_data_buffer = tiled_data_buffer[:, :, :2].contiguous()
 
+            # For diffuse albedo, keep only the first three channels (RGB)
+            if data_type == "diffuse_albedo":
+                tiled_data_buffer = tiled_data_buffer[:, :, :3].contiguous()
+            # For simple shading, keep only the first three channels (RGB)
+            if data_type == "simple_shading":
+                tiled_data_buffer = tiled_data_buffer[:, :, :3].contiguous()
+
             wp.launch(
                 kernel=reshape_tiled_image,
                 dim=(self._view.count, self.cfg.height, self.cfg.width),
@@ -347,6 +373,16 @@ def _create_buffers(self):
         if "rgb" in self.cfg.data_types:
             # RGB is the first 3 channels of RGBA
             data_dict["rgb"] = data_dict["rgba"][..., :3]
+        if "diffuse_albedo" in self.cfg.data_types:
+            data_dict["diffuse_albedo"] = torch.zeros(
+                (self._view.count, self.cfg.height, self.cfg.width, 4), device=self.device, dtype=torch.uint8
+            ).contiguous()
+            data_dict["diffuse_albedo"] = data_dict["diffuse_albedo"][..., :3]
+        if "simple_shading" in self.cfg.data_types:
+            data_dict["simple_shading"] = torch.zeros(
+                (self._view.count, self.cfg.height, self.cfg.width, 4), device=self.device, dtype=torch.uint8
+            ).contiguous()
+            data_dict["simple_shading"] = data_dict["simple_shading"][..., :3]
         if "distance_to_image_plane" in self.cfg.data_types:
             data_dict["distance_to_image_plane"] = torch.zeros(
                 (self._view.count, self.cfg.height, self.cfg.width, 1), device=self.device, dtype=torch.float32
diff --git a/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/__init__.py b/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/__init__.py
index ed316e6e267..3ce6c42c5b6 100644
--- a/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/__init__.py
+++ b/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/__init__.py
@@ -64,6 +64,63 @@
     },
 )
 
+
+gym.register(
+    id="Isaac-Repose-Cube-Shadow-Segmentation-Direct-v0",
+    entry_point=f"{__name__}.shadow_hand_vision_env:ShadowHandVisionEnv",
+    disable_env_checker=True,
+    kwargs={
+        "env_cfg_entry_point": f"{__name__}.shadow_hand_vision_env:ShadowHandVisionSegmentationEnvCfg",
+        "rsl_rl_cfg_entry_point": f"{agents.__name__}.rsl_rl_ppo_cfg:ShadowHandVisionFFPPORunnerCfg",
+        "rl_games_cfg_entry_point": f"{agents.__name__}:rl_games_ppo_vision_cfg.yaml",
+    },
+)
+
+
+gym.register(
+    id="Isaac-Repose-Cube-Shadow-RGB-Direct-v0",
+    entry_point=f"{__name__}.shadow_hand_vision_env:ShadowHandVisionEnv",
+    disable_env_checker=True,
+    kwargs={
+        "env_cfg_entry_point": f"{__name__}.shadow_hand_vision_env:ShadowHandVisionRGBEnvCfg",
+        "rsl_rl_cfg_entry_point": f"{agents.__name__}.rsl_rl_ppo_cfg:ShadowHandVisionFFPPORunnerCfg",
+        "rl_games_cfg_entry_point": f"{agents.__name__}:rl_games_ppo_vision_cfg.yaml",
+    },
+)
+
+gym.register(
+    id="Isaac-Repose-Cube-Shadow-DiffuseAlbedo-Direct-v0",
+    entry_point=f"{__name__}.shadow_hand_vision_env:ShadowHandVisionEnv",
+    disable_env_checker=True,
+    kwargs={
+        "env_cfg_entry_point": f"{__name__}.shadow_hand_vision_env:ShadowHandVisionDiffuseAlbedoEnvCfg",
+        "rsl_rl_cfg_entry_point": f"{agents.__name__}.rsl_rl_ppo_cfg:ShadowHandVisionFFPPORunnerCfg",
+        "rl_games_cfg_entry_point": f"{agents.__name__}:rl_games_ppo_vision_cfg.yaml",
+    },
+)
+
+gym.register(
+    id="Isaac-Repose-Cube-Shadow-SimpleShading-Direct-v0",
+    entry_point=f"{__name__}.shadow_hand_vision_env:ShadowHandVisionEnv",
+    disable_env_checker=True,
+    kwargs={
+        "env_cfg_entry_point": f"{__name__}.shadow_hand_vision_env:ShadowHandVisionSimpleShadingEnvCfg",
+        "rsl_rl_cfg_entry_point": f"{agents.__name__}.rsl_rl_ppo_cfg:ShadowHandVisionFFPPORunnerCfg",
+        "rl_games_cfg_entry_point": f"{agents.__name__}:rl_games_ppo_vision_cfg.yaml",
+    },
+)
+
+gym.register(
+    id="Isaac-Repose-Cube-Shadow-Depth-Direct-v0",
+    entry_point=f"{__name__}.shadow_hand_vision_env:ShadowHandVisionEnv",
+    disable_env_checker=True,
+    kwargs={
+        "env_cfg_entry_point": f"{__name__}.shadow_hand_vision_env:ShadowHandVisionDepthEnvCfg",
+        "rsl_rl_cfg_entry_point": f"{agents.__name__}.rsl_rl_ppo_cfg:ShadowHandVisionFFPPORunnerCfg",
+        "rl_games_cfg_entry_point": f"{agents.__name__}:rl_games_ppo_vision_cfg.yaml",
+    },
+)
+
 gym.register(
     id="Isaac-Repose-Cube-Shadow-Vision-Direct-Play-v0",
     entry_point=f"{__name__}.shadow_hand_vision_env:ShadowHandVisionEnv",
diff --git a/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/feature_extractor.py b/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/feature_extractor.py
index 82d76ec7f1e..a9f7785654c 100644
--- a/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/feature_extractor.py
+++ b/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/feature_extractor.py
@@ -16,9 +16,9 @@ class FeatureExtractorNetwork(nn.Module):
     """CNN architecture used to regress keypoint positions of the in-hand cube from image data."""
 
-    def __init__(self):
+    def __init__(self, num_channel):
         super().__init__()
-        num_channel = 7
+        self.num_channel = num_channel
         self.cnn = nn.Sequential(
             nn.Conv2d(num_channel, 16, kernel_size=6, stride=2, padding=0),
             nn.ReLU(),
@@ -45,8 +45,11 @@ def __init__(self):
 
     def forward(self, x):
         x = x.permute(0, 3, 1, 2)
-        x[:, 0:3, :, :] = self.data_transforms(x[:, 0:3, :, :])
-        x[:, 4:7, :, :] = self.data_transforms(x[:, 4:7, :, :])
+        if self.num_channel == 7:
+            x[:, 0:3, :, :] = self.data_transforms(x[:, 0:3, :, :])
+            x[:, 4:7, :, :] = self.data_transforms(x[:, 4:7, :, :])
+        elif self.num_channel == 3:
+            x[:, 0:3, :, :] = self.data_transforms(x[:, 0:3, :, :])
         cnn_x = self.cnn(x)
         out = self.linear(cnn_x.view(-1, 128))
         return out
@@ -65,6 +68,8 @@ class FeatureExtractorCfg:
     write_image_to_file: bool = False
     """If True, the images from the camera sensor are written to file. Default is False."""
 
+    num_channel: int = 7
+
 
 class FeatureExtractor:
    """Class for extracting features from image data.
@@ -86,7 +91,7 @@ def __init__(self, cfg: FeatureExtractorCfg, device: str, log_dir: str | None =
         self.device = device
 
         # Feature extractor model
-        self.feature_extractor = FeatureExtractorNetwork()
+        self.feature_extractor = FeatureExtractorNetwork(self.cfg.num_channel)
         self.feature_extractor.to(self.device)
 
         self.step_count = 0
@@ -112,8 +117,13 @@ def __init__(self, cfg: FeatureExtractorCfg, device: str, log_dir: str | None =
             self.feature_extractor.eval()
 
     def _preprocess_images(
-        self, rgb_img: torch.Tensor, depth_img: torch.Tensor, segmentation_img: torch.Tensor
-    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        self,
+        rgb_img: torch.Tensor | None,
+        depth_img: torch.Tensor | None,
+        segmentation_img: torch.Tensor | None,
+        albedo_img: torch.Tensor | None = None,
+        simple_shading_img: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor | None, torch.Tensor | None, torch.Tensor | None, torch.Tensor | None, torch.Tensor | None]:
         """Preprocesses the input images.
 
         Args:
@@ -122,20 +132,37 @@ def _preprocess_images(
             segmentation_img (torch.Tensor): Segmentation image tensor. Shape: (N, H, W, 3)
 
         Returns:
-            tuple[torch.Tensor, torch.Tensor, torch.Tensor]: Preprocessed RGB, depth, and segmentation
+            tuple[torch.Tensor | None, torch.Tensor | None, torch.Tensor | None, torch.Tensor | None, torch.Tensor | None]:
+                Preprocessed RGB, depth, segmentation, albedo, and simple shading images
         """
-        rgb_img = rgb_img / 255.0
+        if rgb_img is not None:
+            rgb_img = rgb_img / 255.0
         # process depth image
-        depth_img[depth_img == float("inf")] = 0
-        depth_img /= 5.0
-        depth_img /= torch.max(depth_img)
+        if depth_img is not None:
+            depth_img[depth_img == float("inf")] = 0
+            depth_img /= 5.0
+            depth_img /= torch.max(depth_img)
         # process segmentation image
-        segmentation_img = segmentation_img / 255.0
-        mean_tensor = torch.mean(segmentation_img, dim=(1, 2), keepdim=True)
-        segmentation_img -= mean_tensor
-        return rgb_img, depth_img, segmentation_img
-
-    def _save_images(self, rgb_img: torch.Tensor, depth_img: torch.Tensor, segmentation_img: torch.Tensor):
+        if segmentation_img is not None:
+            segmentation_img = segmentation_img / 255.0
+            mean_tensor = torch.mean(segmentation_img, dim=(1, 2), keepdim=True)
+            segmentation_img -= mean_tensor
+        # process albedo image
+        if albedo_img is not None:
+            albedo_img = albedo_img / 255.0
+        # process simple shading image
+        if simple_shading_img is not None:
+            simple_shading_img = simple_shading_img / 255.0
+        return rgb_img, depth_img, segmentation_img, albedo_img, simple_shading_img
+
+    def _save_images(
+        self,
+        rgb_img: torch.Tensor | None,
+        depth_img: torch.Tensor | None,
+        segmentation_img: torch.Tensor | None,
+        albedo_img: torch.Tensor | None,
+        simple_shading_img: torch.Tensor | None,
+    ):
         """Writes image buffers to file.
 
         Args:
@@ -143,12 +170,25 @@ def _save_images(self, rgb_img: torch.Tensor, depth_img: torch.Tensor, segmentat
             depth_img (torch.Tensor): Depth image tensor. Shape: (N, H, W, 1).
             segmentation_img (torch.Tensor): Segmentation image tensor. Shape: (N, H, W, 3).
         """
-        save_images_to_file(rgb_img, "shadow_hand_rgb.png")
-        save_images_to_file(depth_img, "shadow_hand_depth.png")
-        save_images_to_file(segmentation_img, "shadow_hand_segmentation.png")
+        if rgb_img is not None:
+            save_images_to_file(rgb_img, "shadow_hand_rgb.png")
+        if depth_img is not None:
+            save_images_to_file(depth_img, "shadow_hand_depth.png")
+        if segmentation_img is not None:
+            save_images_to_file(segmentation_img, "shadow_hand_segmentation.png")
+        if albedo_img is not None:
+            save_images_to_file(albedo_img, "shadow_hand_diffuse_albedo.png")
+        if simple_shading_img is not None:
+            save_images_to_file(simple_shading_img, "shadow_hand_simple_shading.png")
 
     def step(
-        self, rgb_img: torch.Tensor, depth_img: torch.Tensor, segmentation_img: torch.Tensor, gt_pose: torch.Tensor
+        self,
+        rgb_img: torch.Tensor | None = None,
+        depth_img: torch.Tensor | None = None,
+        segmentation_img: torch.Tensor | None = None,
+        albedo_img: torch.Tensor | None = None,
+        simple_shading_img: torch.Tensor | None = None,
+        gt_pose: torch.Tensor | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         """Extracts the features using the images and trains the model if the train flag is set to True.
 
@@ -162,15 +202,28 @@
             tuple[torch.Tensor, torch.Tensor]: Pose loss and predicted pose.
""" - rgb_img, depth_img, segmentation_img = self._preprocess_images(rgb_img, depth_img, segmentation_img) + rgb_img, depth_img, segmentation_img, albedo_img, simple_shading_img = self._preprocess_images( + rgb_img, depth_img, segmentation_img, albedo_img, simple_shading_img + ) if self.cfg.write_image_to_file: - self._save_images(rgb_img, depth_img, segmentation_img) + self._save_images(rgb_img, depth_img, segmentation_img, albedo_img, simple_shading_img) if self.cfg.train: with torch.enable_grad(): with torch.inference_mode(False): - img_input = torch.cat((rgb_img, depth_img, segmentation_img), dim=-1) + if rgb_img is not None and depth_img is not None and segmentation_img is not None: + img_input = torch.cat((rgb_img, depth_img, segmentation_img), dim=-1) + elif albedo_img is not None: + img_input = albedo_img + elif simple_shading_img is not None: + img_input = simple_shading_img + elif rgb_img is not None: + img_input = rgb_img + elif depth_img is not None: + img_input = depth_img + elif segmentation_img is not None: + img_input = segmentation_img self.optimizer.zero_grad() predicted_pose = self.feature_extractor(img_input) @@ -189,6 +242,17 @@ def step( return pose_loss, predicted_pose else: - img_input = torch.cat((rgb_img, depth_img, segmentation_img), dim=-1) + if albedo_img is not None: + img_input = albedo_img + elif simple_shading_img is not None: + img_input = simple_shading_img + elif rgb_img is not None and depth_img is not None and segmentation_img is not None: + img_input = torch.cat((rgb_img, depth_img, segmentation_img), dim=-1) + elif rgb_img is not None: + img_input = rgb_img + elif depth_img is not None: + img_input = depth_img + elif segmentation_img is not None: + img_input = segmentation_img predicted_pose = self.feature_extractor(img_input) return None, predicted_pose diff --git a/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/shadow_hand_vision_env.py b/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/shadow_hand_vision_env.py index 13bc6a55328..6a8890c7ed2 100644 --- a/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/shadow_hand_vision_env.py +++ b/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/shadow_hand_vision_env.py @@ -44,7 +44,122 @@ class ShadowHandVisionEnvCfg(ShadowHandEnvCfg): width=120, height=120, ) - feature_extractor = FeatureExtractorCfg() + feature_extractor = FeatureExtractorCfg(num_channel=7) + + # env + observation_space = 164 + 27 # state observation + vision CNN embedding + state_space = 187 + 27 # asymettric states + vision CNN embedding + + +@configclass +class ShadowHandVisionRGBEnvCfg(ShadowHandEnvCfg): + # scene + scene: InteractiveSceneCfg = InteractiveSceneCfg(num_envs=1225, env_spacing=2.0, replicate_physics=True) + + # camera + tiled_camera: TiledCameraCfg = TiledCameraCfg( + prim_path="/World/envs/env_.*/Camera", + offset=TiledCameraCfg.OffsetCfg(pos=(0, -0.35, 1.0), rot=(0.7071, 0.0, 0.7071, 0.0), convention="world"), + data_types=["rgb"], + spawn=sim_utils.PinholeCameraCfg( + focal_length=24.0, focus_distance=400.0, horizontal_aperture=20.955, clipping_range=(0.1, 20.0) + ), + width=120, + height=120, + ) + feature_extractor = FeatureExtractorCfg(num_channel=3) + + # env + observation_space = 164 + 27 # state observation + vision CNN embedding + state_space = 187 + 27 # asymettric states + vision CNN embedding + + +@configclass +class ShadowHandVisionDiffuseAlbedoEnvCfg(ShadowHandEnvCfg): + # scene + scene: InteractiveSceneCfg = InteractiveSceneCfg(num_envs=1225, env_spacing=2.0, 
replicate_physics=True) + + # camera + tiled_camera: TiledCameraCfg = TiledCameraCfg( + prim_path="/World/envs/env_.*/Camera", + offset=TiledCameraCfg.OffsetCfg(pos=(0, -0.35, 1.0), rot=(0.7071, 0.0, 0.7071, 0.0), convention="world"), + data_types=["diffuse_albedo"], + spawn=sim_utils.PinholeCameraCfg( + focal_length=24.0, focus_distance=400.0, horizontal_aperture=20.955, clipping_range=(0.1, 20.0) + ), + width=120, + height=120, + ) + feature_extractor = FeatureExtractorCfg(num_channel=3) + + # env + observation_space = 164 + 27 # state observation + vision CNN embedding + state_space = 187 + 27 # asymettric states + vision CNN embedding + + +@configclass +class ShadowHandVisionSimpleShadingEnvCfg(ShadowHandEnvCfg): + # scene + scene: InteractiveSceneCfg = InteractiveSceneCfg(num_envs=1225, env_spacing=2.0, replicate_physics=True) + + # camera + tiled_camera: TiledCameraCfg = TiledCameraCfg( + prim_path="/World/envs/env_.*/Camera", + offset=TiledCameraCfg.OffsetCfg(pos=(0, -0.35, 1.0), rot=(0.7071, 0.0, 0.7071, 0.0), convention="world"), + data_types=["simple_shading"], + spawn=sim_utils.PinholeCameraCfg( + focal_length=24.0, focus_distance=400.0, horizontal_aperture=20.955, clipping_range=(0.1, 20.0) + ), + width=120, + height=120, + ) + feature_extractor = FeatureExtractorCfg(num_channel=3) + + # env + observation_space = 164 + 27 # state observation + vision CNN embedding + state_space = 187 + 27 # asymettric states + vision CNN embedding + + +@configclass +class ShadowHandVisionDepthEnvCfg(ShadowHandEnvCfg): + # scene + scene: InteractiveSceneCfg = InteractiveSceneCfg(num_envs=1225, env_spacing=2.0, replicate_physics=True) + + # camera + tiled_camera: TiledCameraCfg = TiledCameraCfg( + prim_path="/World/envs/env_.*/Camera", + offset=TiledCameraCfg.OffsetCfg(pos=(0, -0.35, 1.0), rot=(0.7071, 0.0, 0.7071, 0.0), convention="world"), + data_types=["depth"], + spawn=sim_utils.PinholeCameraCfg( + focal_length=24.0, focus_distance=400.0, horizontal_aperture=20.955, clipping_range=(0.1, 20.0) + ), + width=120, + height=120, + ) + feature_extractor = FeatureExtractorCfg(num_channel=1) + + # env + observation_space = 164 + 27 # state observation + vision CNN embedding + state_space = 187 + 27 # asymettric states + vision CNN embedding + + +@configclass +class ShadowHandVisionSegmentationEnvCfg(ShadowHandEnvCfg): + # scene + scene: InteractiveSceneCfg = InteractiveSceneCfg(num_envs=1225, env_spacing=2.0, replicate_physics=True) + + # camera + tiled_camera: TiledCameraCfg = TiledCameraCfg( + prim_path="/World/envs/env_.*/Camera", + offset=TiledCameraCfg.OffsetCfg(pos=(0, -0.35, 1.0), rot=(0.7071, 0.0, 0.7071, 0.0), convention="world"), + data_types=["semantic_segmentation"], + spawn=sim_utils.PinholeCameraCfg( + focal_length=24.0, focus_distance=400.0, horizontal_aperture=20.955, clipping_range=(0.1, 20.0) + ), + width=120, + height=120, + ) + feature_extractor = FeatureExtractorCfg(num_channel=3) # env observation_space = 164 + 27 # state observation + vision CNN embedding
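
Usage sketch (not part of the patch): the snippet below shows how the new "diffuse_albedo" data type and the channel-configurable feature extractor introduced above could be combined. It is a minimal sketch that assumes a running Isaac Lab application (the camera and the shadow-hand modules cannot be imported outside the Omniverse app); the import paths, prim path, and device string are assumptions based on the package layout in this patch rather than values it pins down. Also note that performance.kit now sets rtx.sdg.force.disableColorRender = true, so the plain "rgb" type requires flipping that flag back, as the comment in that file indicates.

    import torch

    import isaaclab.sim as sim_utils
    from isaaclab.sensors import TiledCamera, TiledCameraCfg
    from isaaclab_tasks.direct.shadow_hand.feature_extractor import FeatureExtractor, FeatureExtractorCfg

    # Camera publishing the new annotator-backed data type ("simple_shading" works the same way).
    camera_cfg = TiledCameraCfg(
        prim_path="/World/envs/env_.*/Camera",  # assumed prim path, matching the configs above
        data_types=["diffuse_albedo"],
        spawn=sim_utils.PinholeCameraCfg(focal_length=24.0, clipping_range=(0.1, 20.0)),
        width=120,
        height=120,
    )
    camera = TiledCamera(cfg=camera_cfg)

    # 3-channel variant of the keypoint regressor; num_channel=7 remains the RGB+depth+segmentation default.
    extractor = FeatureExtractor(FeatureExtractorCfg(num_channel=3, train=False), device="cuda:0")

    # ... after the simulation has stepped and camera.update(dt) has been called:
    albedo = camera.data.output["diffuse_albedo"]  # uint8, shape (num_envs, 120, 120, 3)
    # _preprocess_images scales the buffer to [0, 1] internally before the CNN forward pass.
    _, predicted_pose = extractor.step(albedo_img=albedo)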