Skip to content

Commit c80b502

Browse files
prajjwal1 authored and facebook-github-bot committed
Add Metric compatibility test for RecMetricsModule (#3586)
Summary: In response to https://www.internalfb.com/sevmanager/view/592632, we add a Metric module compatibility test wherein we try to load an older metric module with the latest RecMetricModule. We have a predefined state dict, which simulates a metric module obtained from an older code. We compare its keys with the latest state dict of RecMetricModule Differential Revision: D88207525
1 parent a533012 commit c80b502

File tree

1 file changed

+48
-0
lines changed

1 file changed

+48
-0
lines changed

torchrec/metrics/tests/test_metric_module.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import os
1616
import tempfile
1717
import unittest
18+
from collections import OrderedDict
1819
from typing import Any, Callable, Dict, List, Optional
1920
from unittest.mock import MagicMock, patch
2021

@@ -223,6 +224,53 @@ def test_rectask_info(self) -> None:
223224
metric_module_unified_task_info.rec_metrics[0]._tasks,
224225
)
225226

227+
def test_compatibility_with_older_metric_module(self) -> None:
228+
"""
229+
This test checks if latest RecMetricModule can load up
230+
metric module from an older checkpoint
231+
"""
232+
# This simulates what an older checkpoint may have
233+
predefined_state_dict = OrderedDict(
234+
{
235+
"rec_metrics.rec_metrics.0._metrics_computations.0.cross_entropy_sum": torch.tensor(
236+
[0.0], dtype=torch.float64
237+
),
238+
"rec_metrics.rec_metrics.0._metrics_computations.0.weighted_num_samples": torch.tensor(
239+
[0.0], dtype=torch.float64
240+
),
241+
"rec_metrics.rec_metrics.0._metrics_computations.0.pos_labels": torch.tensor(
242+
[0.0], dtype=torch.float64
243+
),
244+
"rec_metrics.rec_metrics.0._metrics_computations.0.neg_labels": torch.tensor(
245+
[0.0], dtype=torch.float64
246+
),
247+
"throughput_metric.total_examples": torch.Tensor(0),
248+
"throughput_metric.warmup_examples": torch.tensor(0),
249+
"throughput_metric.time_lapse_after_warmup": torch.tensor(
250+
0.0, dtype=torch.float64
251+
),
252+
}
253+
)
254+
255+
# This is the latest RecMetricModule
256+
mock_optimizer = MockOptimizer()
257+
config = DefaultMetricsConfig
258+
latest_metric_module = generate_metric_module(
259+
TestMetricModule,
260+
metrics_config=config,
261+
batch_size=128,
262+
world_size=64,
263+
my_rank=0,
264+
state_metrics_mapping={StateMetricEnum.OPTIMIZERS: mock_optimizer},
265+
device=torch.device("cpu"),
266+
)
267+
tc = unittest.TestCase()
268+
tc.assertEqual(
269+
predefined_state_dict.keys(),
270+
latest_metric_module.state_dict().keys(),
271+
"RecMetricModule state_dict keys have changed - ensure backward compatibility with older checkpoints",
272+
)
273+
226274
@staticmethod
227275
def _run_trainer_checkpointing(rank: int, world_size: int, backend: str) -> None:
228276
dist.init_process_group(

0 commit comments

Comments
 (0)