New Annotators for SimpleShading, DiffuseAlbedo, and Fast Depth for ShadowHandEnv #4066
First render settings file, hunk @@ -1,3 +1,5 @@:

rtx.sdg.force.disableColorRender=false
rtx.translucency.enabled = false
rtx.reflections.enabled = false
Second render settings file, hunk @@ -1,3 +1,5 @@:

rtx.sdg.force.disableColorRender=false
rtx.translucency.enabled = true
rtx.reflections.enabled = true
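These RTX flags can also be toggled at runtime through the carb settings API instead of the experience files. A minimal sketch, assuming the standard Omniverse settings interface; the chosen values are illustrative:

```python
import carb.settings

# Sketch: apply the same RTX flags programmatically.
# The "/rtx/..." paths mirror the rtx.* keys in the config files above.
settings = carb.settings.get_settings()
settings.set("/rtx/sdg/force/disableColorRender", False)
settings.set("/rtx/translucency/enabled", True)  # false in the first file, true in the second
settings.set("/rtx/reflections/enabled", True)   # false in the first file, true in the second
```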
Tiled camera sensor (registers the new annotators and wires them into the data pipeline):

@@ -189,12 +189,28 @@ def _initialize_impl(self):
         )
         self._render_product_paths = [rp.path]
 
+        rep.AnnotatorRegistry.register_annotator_from_aov(
+            aov="DiffuseAlbedoSD", output_data_type=np.uint8, output_channels=4
+        )
+        rep.AnnotatorRegistry.register_annotator_from_aov(
+            aov="SimpleShadingSD", output_data_type=np.uint8, output_channels=4
+        )
         # Define the annotators based on requested data types
         self._annotators = dict()
         for annotator_type in self.cfg.data_types:
             if annotator_type == "rgba" or annotator_type == "rgb":
                 annotator = rep.AnnotatorRegistry.get_annotator("rgb", device=self.device, do_array_copy=False)
                 self._annotators["rgba"] = annotator
+            elif annotator_type == "diffuse_albedo":
+                annotator = rep.AnnotatorRegistry.get_annotator(
+                    "DiffuseAlbedoSD", device=self.device, do_array_copy=False
+                )
+                self._annotators["diffuse_albedo"] = annotator
+            elif annotator_type == "simple_shading":
+                annotator = rep.AnnotatorRegistry.get_annotator(
+                    "SimpleShadingSD", device=self.device, do_array_copy=False
+                )
+                self._annotators["simple_shading"] = annotator
             elif annotator_type == "depth" or annotator_type == "distance_to_image_plane":
                 # keep depth for backwards compatibility
                 annotator = rep.AnnotatorRegistry.get_annotator(

Contributor (on the register_annotator_from_aov calls): Please add comments on what these are and why they are needed.

Contributor (on the SimpleShadingSD registration): Should output_channels be 3 directly? Later on, slicing is done to that anyway.

Contributor: Is this only for Isaac Sim 5.1?
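For context, the new outputs are requested like any other tiled-camera data type. A minimal sketch, assuming the current isaaclab module layout and illustrative camera parameters; the diffuse_albedo and simple_shading keys exist only with this PR applied:

```python
import isaaclab.sim as sim_utils
from isaaclab.sensors import TiledCameraCfg

# Sketch: request the new annotator outputs alongside RGB and depth.
tiled_camera_cfg = TiledCameraCfg(
    prim_path="/World/envs/env_.*/Camera",
    width=120,
    height=120,
    data_types=["rgb", "distance_to_image_plane", "diffuse_albedo", "simple_shading"],
    spawn=sim_utils.PinholeCameraCfg(),
)

# After the scene is built and stepped, each output is a (num_envs, H, W, C) tensor, e.g.
#   camera.data.output["diffuse_albedo"]  -> uint8, 3 channels
#   camera.data.output["simple_shading"]  -> uint8, 3 channels
```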
@@ -254,13 +270,16 @@ def _update_buffers_impl(self, env_ids: Sequence[int]):
             else:
                 tiled_data_buffer = tiled_data_buffer.to(device=self.device)
 
-            # process data for different segmentation types
+            # process data for different segmentation types and custom annotators
             # Note: Replicator returns raw buffers of dtype uint32 for segmentation types
             # so we need to convert them to uint8 4 channel images for colorized types
+            # Note: Custom annotators (diffuse_albedo, simple_shading) also return 4 channel data
             if (
                 (data_type == "semantic_segmentation" and self.cfg.colorize_semantic_segmentation)
                 or (data_type == "instance_segmentation_fast" and self.cfg.colorize_instance_segmentation)
                 or (data_type == "instance_id_segmentation_fast" and self.cfg.colorize_instance_id_segmentation)
+                or data_type == "diffuse_albedo"
+                or data_type == "simple_shading"
             ):
                 tiled_data_buffer = wp.array(
                     ptr=tiled_data_buffer.ptr, shape=(*tiled_data_buffer.shape, 4), dtype=wp.uint8, device=self.device
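The wp.array(ptr=...) call above reinterprets the raw annotator buffer as a 4-channel uint8 image without copying. A small standalone sketch of the same aliasing pattern; shapes and device are illustrative:

```python
import numpy as np
import warp as wp

wp.init()

# Sketch: view a (H, W) packed-RGBA uint32 buffer as a (H, W, 4) uint8 array
# by aliasing the same memory, mirroring the zero-copy reinterpretation above.
packed = wp.array(np.zeros((2, 2), dtype=np.uint32), dtype=wp.uint32, device="cpu")
unpacked = wp.array(ptr=packed.ptr, shape=(*packed.shape, 4), dtype=wp.uint8, device="cpu")
print(unpacked.numpy().shape)  # (2, 2, 4)
```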
@@ -271,6 +290,13 @@ def _update_buffers_impl(self, env_ids: Sequence[int]):
             if data_type == "motion_vectors":
                 tiled_data_buffer = tiled_data_buffer[:, :, :2].contiguous()
 
+            # For diffuse albedo, keep only the first three channels (RGB)
+            if data_type == "diffuse_albedo":
+                tiled_data_buffer = tiled_data_buffer[:, :, :3].contiguous()
+            # For simple shading, keep only the first three channels (RGB)
+            if data_type == "simple_shading":
+                tiled_data_buffer = tiled_data_buffer[:, :, :3].contiguous()
+
             wp.launch(
                 kernel=reshape_tiled_image,
                 dim=(self._view.count, self.cfg.height, self.cfg.width),
@@ -347,6 +373,16 @@ def _create_buffers(self):
         if "rgb" in self.cfg.data_types:
             # RGB is the first 3 channels of RGBA
             data_dict["rgb"] = data_dict["rgba"][..., :3]
+        if "diffuse_albedo" in self.cfg.data_types:
+            data_dict["diffuse_albedo"] = torch.zeros(
+                (self._view.count, self.cfg.height, self.cfg.width, 4), device=self.device, dtype=torch.uint8
+            ).contiguous()
+            data_dict["diffuse_albedo"] = data_dict["diffuse_albedo"][..., :3]
+        if "simple_shading" in self.cfg.data_types:
+            data_dict["simple_shading"] = torch.zeros(
+                (self._view.count, self.cfg.height, self.cfg.width, 4), device=self.device, dtype=torch.uint8
+            ).contiguous()
+            data_dict["simple_shading"] = data_dict["simple_shading"][..., :3]
         if "distance_to_image_plane" in self.cfg.data_types:
             data_dict["distance_to_image_plane"] = torch.zeros(
                 (self._view.count, self.cfg.height, self.cfg.width, 1), device=self.device, dtype=torch.float32

Contributor (on lines +376 to +380): style: The buffer is created with 4 channels and then immediately sliced to 3 channels. Consider initializing with 3 channels directly for clarity (a suggested change was attached).

Contributor (on lines +381 to +385): Same style comment for the simple_shading buffer: consider initializing with 3 channels directly (a suggested change was attached).
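The attached suggested changes are not included in this view; a hedged sketch of what the reviewers describe (allocating the buffers with 3 channels directly, so no slicing is needed) could look like this:

```python
# Sketch of the reviewers' suggestion, not the actual suggested diff:
# allocate the RGB buffers with 3 channels directly instead of slicing a 4-channel buffer.
if "diffuse_albedo" in self.cfg.data_types:
    data_dict["diffuse_albedo"] = torch.zeros(
        (self._view.count, self.cfg.height, self.cfg.width, 3), device=self.device, dtype=torch.uint8
    ).contiguous()
if "simple_shading" in self.cfg.data_types:
    data_dict["simple_shading"] = torch.zeros(
        (self._view.count, self.cfg.height, self.cfg.width, 3), device=self.device, dtype=torch.uint8
    ).contiguous()
```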
Feature extractor used by the ShadowHand vision environment:

@@ -16,9 +16,9 @@
 class FeatureExtractorNetwork(nn.Module):
     """CNN architecture used to regress keypoint positions of the in-hand cube from image data."""
 
-    def __init__(self):
+    def __init__(self, num_channel):
         super().__init__()
-        num_channel = 7
+        self.num_channel = num_channel
         self.cnn = nn.Sequential(
             nn.Conv2d(num_channel, 16, kernel_size=6, stride=2, padding=0),
             nn.ReLU(),
@@ -45,8 +45,11 @@ def __init__(self):
 
     def forward(self, x):
         x = x.permute(0, 3, 1, 2)
-        x[:, 0:3, :, :] = self.data_transforms(x[:, 0:3, :, :])
-        x[:, 4:7, :, :] = self.data_transforms(x[:, 4:7, :, :])
+        if self.num_channel == 7:
+            x[:, 0:3, :, :] = self.data_transforms(x[:, 0:3, :, :])
+            x[:, 4:7, :, :] = self.data_transforms(x[:, 4:7, :, :])
+        elif self.num_channel == 3:
+            x[:, 0:3, :, :] = self.data_transforms(x[:, 0:3, :, :])
         cnn_x = self.cnn(x)
         out = self.linear(cnn_x.view(-1, 128))
         return out
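With the channel count now a constructor argument, the network can be exercised on dummy data. A minimal sketch, assuming FeatureExtractorNetwork is in scope and assuming a 120x120 input resolution (the actual resolution comes from the environment's camera config):

```python
import torch

# Sketch: single-stream (3-channel) variant, e.g. albedo-only or shading-only input.
net = FeatureExtractorNetwork(num_channel=3)
dummy = torch.zeros((8, 120, 120, 3))  # (N, H, W, C) layout; forward() permutes to NCHW
prediction = net(dummy)
print(prediction.shape)  # (8, output_dim) keypoint/pose prediction
```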
@@ -65,6 +68,8 @@ class FeatureExtractorCfg:
     write_image_to_file: bool = False
     """If True, the images from the camera sensor are written to file. Default is False."""
 
+    num_channel: int = 7
+
 
 class FeatureExtractor:
     """Class for extracting features from image data.
@@ -86,7 +91,7 @@ def __init__(self, cfg: FeatureExtractorCfg, device: str, log_dir: str | None = None):
         self.device = device
 
         # Feature extractor model
-        self.feature_extractor = FeatureExtractorNetwork()
+        self.feature_extractor = FeatureExtractorNetwork(self.cfg.num_channel)
         self.feature_extractor.to(self.device)
 
         self.step_count = 0
@@ -112,8 +117,13 @@ def __init__(self, cfg: FeatureExtractorCfg, device: str, log_dir: str | None = None):
         self.feature_extractor.eval()
 
     def _preprocess_images(
-        self, rgb_img: torch.Tensor, depth_img: torch.Tensor, segmentation_img: torch.Tensor
-    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        self,
+        rgb_img: torch.Tensor,
+        depth_img: torch.Tensor,
+        segmentation_img: torch.Tensor,
+        albedo_img: torch.Tensor | None = None,
+        simple_shading_img: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor | None, torch.Tensor | None, torch.Tensor | None, torch.Tensor | None, torch.Tensor | None]:
         """Preprocesses the input images.
 
         Args:
@@ -122,33 +132,63 @@ def _preprocess_images(
             segmentation_img (torch.Tensor): Segmentation image tensor. Shape: (N, H, W, 3)
 
         Returns:
-            tuple[torch.Tensor, torch.Tensor, torch.Tensor]: Preprocessed RGB, depth, and segmentation
+            tuple[torch.Tensor | None, torch.Tensor | None, torch.Tensor | None, torch.Tensor | None, torch.Tensor | None]:
+                Preprocessed RGB, depth, segmentation, albedo, and simple shading images
         """
-        rgb_img = rgb_img / 255.0
+        if rgb_img is not None:
+            rgb_img = rgb_img / 255.0
         # process depth image
-        depth_img[depth_img == float("inf")] = 0
-        depth_img /= 5.0
-        depth_img /= torch.max(depth_img)
+        if depth_img is not None:
+            depth_img[depth_img == float("inf")] = 0
+            depth_img /= 5.0
+            depth_img /= torch.max(depth_img)
         # process segmentation image
-        segmentation_img = segmentation_img / 255.0
-        mean_tensor = torch.mean(segmentation_img, dim=(1, 2), keepdim=True)
-        segmentation_img -= mean_tensor
-        return rgb_img, depth_img, segmentation_img
+        if segmentation_img is not None:
+            segmentation_img = segmentation_img / 255.0
+            mean_tensor = torch.mean(segmentation_img, dim=(1, 2), keepdim=True)
+            segmentation_img -= mean_tensor
+        # process albedo image
+        if albedo_img is not None:
+            albedo_img = albedo_img / 255.0
+        # process simple shading image
+        if simple_shading_img is not None:
+            simple_shading_img = simple_shading_img / 255.0
+        return rgb_img, depth_img, segmentation_img, albedo_img, simple_shading_img
 
-    def _save_images(self, rgb_img: torch.Tensor, depth_img: torch.Tensor, segmentation_img: torch.Tensor):
+    def _save_images(
+        self,
+        rgb_img: torch.Tensor | None,
+        depth_img: torch.Tensor | None,
+        segmentation_img: torch.Tensor | None,
+        albedo_img: torch.Tensor | None,
+        simple_shading_img: torch.Tensor | None,
+    ):
         """Writes image buffers to file.
 
         Args:
             rgb_img (torch.Tensor): RGB image tensor. Shape: (N, H, W, 3).
             depth_img (torch.Tensor): Depth image tensor. Shape: (N, H, W, 1).
             segmentation_img (torch.Tensor): Segmentation image tensor. Shape: (N, H, W, 3).
         """
-        save_images_to_file(rgb_img, "shadow_hand_rgb.png")
-        save_images_to_file(depth_img, "shadow_hand_depth.png")
-        save_images_to_file(segmentation_img, "shadow_hand_segmentation.png")
+        if rgb_img is not None:
+            save_images_to_file(rgb_img, "shadow_hand_rgb.png")
+        if depth_img is not None:
+            save_images_to_file(depth_img, "shadow_hand_depth.png")
+        if segmentation_img is not None:
+            save_images_to_file(segmentation_img, "shadow_hand_segmentation.png")
+        if albedo_img is not None:
+            save_images_to_file(albedo_img, "shadow_hand_diffuse_albedo.png")
+        if simple_shading_img is not None:
+            save_images_to_file(simple_shading_img, "shadow_hand_simple_shading.png")
 
     def step(
-        self, rgb_img: torch.Tensor, depth_img: torch.Tensor, segmentation_img: torch.Tensor, gt_pose: torch.Tensor
+        self,
+        rgb_img: torch.Tensor = None,
+        depth_img: torch.Tensor = None,
+        segmentation_img: torch.Tensor = None,
+        albedo_img: torch.Tensor = None,
+        simple_shading_img: torch.Tensor = None,
+        gt_pose: torch.Tensor = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         """Extracts the features using the images and trains the model if the train flag is set to True.

Contributor (on the Returns docstring): We don't do type annotations on docstrings since they are harder to maintain. Please remove instances of that everywhere.
@@ -162,15 +202,28 @@ def step(
             tuple[torch.Tensor, torch.Tensor]: Pose loss and predicted pose.
         """
 
-        rgb_img, depth_img, segmentation_img = self._preprocess_images(rgb_img, depth_img, segmentation_img)
+        rgb_img, depth_img, segmentation_img, albedo_img, simple_shading_img = self._preprocess_images(
+            rgb_img, depth_img, segmentation_img, albedo_img, simple_shading_img
+        )
 
         if self.cfg.write_image_to_file:
-            self._save_images(rgb_img, depth_img, segmentation_img)
+            self._save_images(rgb_img, depth_img, segmentation_img, albedo_img, simple_shading_img)
 
         if self.cfg.train:
             with torch.enable_grad():
                 with torch.inference_mode(False):
-                    img_input = torch.cat((rgb_img, depth_img, segmentation_img), dim=-1)
+                    if rgb_img is not None and depth_img is not None and segmentation_img is not None:
+                        img_input = torch.cat((rgb_img, depth_img, segmentation_img), dim=-1)
+                    elif albedo_img is not None:
+                        img_input = albedo_img
+                    elif simple_shading_img is not None:
+                        img_input = simple_shading_img
+                    elif rgb_img is not None:
+                        img_input = rgb_img
+                    elif depth_img is not None:
+                        img_input = depth_img
+                    elif segmentation_img is not None:
+                        img_input = segmentation_img
                     self.optimizer.zero_grad()
 
                     predicted_pose = self.feature_extractor(img_input)
@@ -189,6 +242,17 @@ def step(
 
             return pose_loss, predicted_pose
         else:
-            img_input = torch.cat((rgb_img, depth_img, segmentation_img), dim=-1)
+            if albedo_img is not None:
+                img_input = albedo_img
+            elif simple_shading_img is not None:
+                img_input = simple_shading_img
+            elif rgb_img is not None and depth_img is not None and segmentation_img is not None:
+                img_input = torch.cat((rgb_img, depth_img, segmentation_img), dim=-1)
+            elif rgb_img is not None:
+                img_input = rgb_img
+            elif depth_img is not None:
+                img_input = depth_img
+            elif segmentation_img is not None:
+                img_input = segmentation_img
             predicted_pose = self.feature_extractor(img_input)
             return None, predicted_pose
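With the new optional arguments, the extractor can be driven from a single annotator stream. A minimal inference-style sketch, assuming FeatureExtractor and FeatureExtractorCfg from this file are in scope, with illustrative shapes and device:

```python
import torch

# Sketch: run the extractor on diffuse-albedo images only, matching num_channel=3.
cfg = FeatureExtractorCfg(train=False, num_channel=3)
extractor = FeatureExtractor(cfg, device="cuda:0")

albedo = torch.zeros((8, 120, 120, 3), device="cuda:0")  # (N, H, W, 3) image batch
pose_loss, predicted_pose = extractor.step(albedo_img=albedo)  # pose_loss is None when not training
```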
Reviewer comment: Should this be set to False only if rendering is not "rgb" in the tiled camera? Wondering if the logic belongs here or in the camera class.