
Commit a4cbe36

cademirch and cmeesters authored
fix: efficiency report jobsteps (#338)
Fixes #337. This PR ensures that for each job in the sacct output only the job step is reported, and that each record carries a rule name and requested memory. It also refactors the efficiency report slightly to allow unit testing of the sacct-output parsing.

## Summary by CodeRabbit

* **Refactor**
  * Split the efficiency report into separate data-fetching and parsing stages for more reliable SLURM data handling.
  * Improved handling of job steps with inheritance of job metadata (rule names, requested memory) and robust CPU/memory efficiency calculations with low-efficiency warnings.
* **Tests**
  * Added a test verifying SLURM output parsing, metadata inheritance for job steps, and correct memory/efficiency reporting.

---------

Co-authored-by: Christian Meesters <[email protected]>
1 parent d8d605c commit a4cbe36
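
For orientation, here is a minimal sketch of how the two stages introduced by this commit compose. The run UUID and the 80% threshold are placeholder values, and the snippet assumes the plugin is importable; it is not part of the commit itself.

```python
import logging

from snakemake_executor_plugin_slurm.efficiency_report import (
    get_sacct_data,
    parse_sacct_data,
)

logger = logging.getLogger(__name__)
run_uuid = "b10191d0-6985-4c3a-8ccb-aa7d23ebffc7"  # placeholder workflow UUID

# Stage 1: fetch raw `sacct --parsable2` lines for the workflow.
lines = get_sacct_data(run_uuid, logger)

# Stage 2: parse into a DataFrame; only job steps remain, each carrying the
# rule name and requested memory inherited from its parent job.
if lines:
    df = parse_sacct_data(lines, e_threshold=80.0, run_uuid=run_uuid, logger=logger)
    print(df[["JobID", "RuleName", "CPU Efficiency (%)", "Memory Usage (%)"]])
```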

File tree

2 files changed: +83 -19 lines changed

snakemake_executor_plugin_slurm/efficiency_report.py

Lines changed: 51 additions & 19 deletions
@@ -55,15 +55,10 @@ def parse_reqmem(reqmem, number_of_nodes=1):
     return 0


-def create_efficiency_report(e_threshold, run_uuid, e_report_path, logger):
-    """
-    Fetch sacct job data for a Snakemake workflow
-    and compute efficiency metrics.
-    """
+def get_sacct_data(run_uuid, logger):
+    """Fetch raw sacct data for a workflow."""
     cmd = f"sacct --name={run_uuid} --parsable2 --noheader"
-    cmd += (
-        " --format=JobID,JobName,Comment,Elapsed,TotalCPU," "NNodes,NCPUS,MaxRSS,ReqMem"
-    )
+    cmd += " --format=JobID,JobName,Comment,Elapsed,TotalCPU,NNodes,NCPUS,MaxRSS,ReqMem"

     try:
         result = subprocess.run(
@@ -74,12 +69,14 @@ def create_efficiency_report(e_threshold, run_uuid, e_report_path, logger):
             logger.warning(f"No job data found for workflow {run_uuid}.")
             return None
         lines = raw.split("\n")
+        return lines

     except subprocess.CalledProcessError:
         logger.error(f"Failed to retrieve job data for workflow {run_uuid}.")
         return None

-    # Convert to DataFrame
+
+def parse_sacct_data(lines, e_threshold, run_uuid, logger):
     df = pd.DataFrame(
         (line.split("|") for line in lines),
         columns=[
@@ -120,20 +117,44 @@ def create_efficiency_report(e_threshold, run_uuid, e_report_path, logger):
     df["Elapsed_sec"] = df["Elapsed"].apply(time_to_seconds)
     df["TotalCPU_sec"] = df["TotalCPU"].apply(time_to_seconds)

-    # Compute CPU efficiency
-    df["CPU Efficiency (%)"] = (
-        df["TotalCPU_sec"]
-        / (df["Elapsed_sec"].clip(lower=1) * df["NCPUS"].clip(lower=1))
-    ) * 100
-    df.replace([np.inf, -np.inf], 0, inplace=True)
-
     # Convert MaxRSS
     df["MaxRSS_MB"] = df["MaxRSS"].apply(parse_maxrss)

     # Convert ReqMem and calculate memory efficiency
     df["RequestedMem_MB"] = df.apply(
         lambda row: parse_reqmem(row["ReqMem"], row["NNodes"]), axis=1
     )
+
+    # Drop all rows containing "batch" or "extern" as job names
+    df = df[~df["JobName"].str.contains("batch|extern", na=False)]
+
+    # Extract main job ID for grouping
+    df["MainJobID"] = df["JobID"].str.extract(r"^(\d+)", expand=False)
+
+    # Separate main jobs and job steps
+    main_jobs = df[~df["JobID"].str.contains(r"\.\d+", regex=True)].copy()
+    job_steps = df[df["JobID"].str.contains(r"\.\d+", regex=True)].copy()
+
+    # Create maps from main jobs for inheritance
+    if not nocomment:
+        rule_name_map = main_jobs.set_index("MainJobID")["RuleName"].to_dict()
+    mem_map = main_jobs.set_index("MainJobID")["RequestedMem_MB"].to_dict()
+
+    # Inherit data from main jobs to job steps
+    if not nocomment:
+        job_steps["RuleName"] = job_steps["MainJobID"].map(rule_name_map).fillna("")
+    job_steps["RequestedMem_MB"] = job_steps["MainJobID"].map(mem_map).fillna(0)
+
+    # Use job steps as the final dataset (they have the actual resource usage)
+    df = job_steps.copy()
+
+    # Compute CPU efficiency
+    df["CPU Efficiency (%)"] = (
+        df["TotalCPU_sec"]
+        / (df["Elapsed_sec"].clip(lower=1) * df["NCPUS"].clip(lower=1))
+    ) * 100
+    df.replace([np.inf, -np.inf], 0, inplace=True)
+
     df["Memory Usage (%)"] = df.apply(
         lambda row: (
             (row["MaxRSS_MB"] / row["RequestedMem_MB"] * 100)
@@ -145,9 +166,6 @@ def create_efficiency_report(e_threshold, run_uuid, e_report_path, logger):

     df["Memory Usage (%)"] = df["Memory Usage (%)"].fillna(0).round(2)

-    # Drop all rows containing "batch" or "extern" as job names
-    df = df[~df["JobName"].str.contains("batch|extern", na=False)]
-
     # Log warnings for low efficiency
     for _, row in df.iterrows():
         if row["CPU Efficiency (%)"] < e_threshold:
@@ -164,6 +182,20 @@ def create_efficiency_report(e_threshold, run_uuid, e_report_path, logger):
                 f"({row['JobName']}) has low CPU efficiency: "
                 f"{row['CPU Efficiency (%)']}%."
             )
+    return df
+
+
+def create_efficiency_report(e_threshold, run_uuid, e_report_path, logger):
+    """
+    Fetch sacct job data for a Snakemake workflow
+    and compute efficiency metrics.
+    """
+    lines = get_sacct_data(run_uuid, logger)
+
+    if lines is None or not lines:
+        return None
+
+    df = parse_sacct_data(lines, e_threshold, run_uuid, logger)

     # we construct a path object to allow for a customi
     # logdir, if specified
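
The heart of this change is the main-job/job-step split. Below is a self-contained sketch of that mechanism on toy records (invented values, not the plugin's real data): main jobs carry the metadata, steps carry the measured usage, so the steps inherit `RuleName` and `RequestedMem_MB` via per-job lookup maps before being used as the final dataset.

```python
import pandas as pd

# Toy rows mimicking parsed sacct records (values invented for illustration).
df = pd.DataFrame(
    {
        "JobID": ["10294159", "10294159.0", "10294160", "10294160.0"],
        "RuleName": ["rule_a", "", "rule_b", ""],
        "RequestedMem_MB": [32000.0, 0.0, 16000.0, 0.0],
    }
)

# A step ID looks like "<jobid>.<n>"; the bare "<jobid>" row is the main job.
df["MainJobID"] = df["JobID"].str.extract(r"^(\d+)", expand=False)
main_jobs = df[~df["JobID"].str.contains(r"\.\d+", regex=True)]
job_steps = df[df["JobID"].str.contains(r"\.\d+", regex=True)].copy()

# Inherit rule name and requested memory from the parent job.
rule_map = main_jobs.set_index("MainJobID")["RuleName"].to_dict()
mem_map = main_jobs.set_index("MainJobID")["RequestedMem_MB"].to_dict()
job_steps["RuleName"] = job_steps["MainJobID"].map(rule_map).fillna("")
job_steps["RequestedMem_MB"] = job_steps["MainJobID"].map(mem_map).fillna(0)

print(job_steps)  # one row per step, each with rule name and requested memory
```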

tests/tests.py

Lines changed: 32 additions & 0 deletions
@@ -9,6 +9,7 @@
 import pytest

 from snakemake_executor_plugin_slurm import ExecutorSettings
+from snakemake_executor_plugin_slurm.efficiency_report import parse_sacct_data
 from snakemake_executor_plugin_slurm.utils import set_gres_string
 from snakemake_executor_plugin_slurm.submit_string import get_submit_command
 from snakemake_interface_common.exceptions import WorkflowError
@@ -27,6 +28,37 @@ def get_executor_settings(self) -> Optional[ExecutorSettingsBase]:
     )


+def test_parse_sacct_data():
+    from io import StringIO
+
+    test_data = [
+        "10294159|b10191d0-6985-4c3a-8ccb-"
+        "aa7d23ebffc7|rule_bam_bwa_mem_mosdepth_"
+        "simulate_reads|00:01:31|00:24.041|1|1||32000M",
+        "10294159.batch|batch||00:01:31|00:03.292|1|1|71180K|",
+        "10294159.0|python3.12||00:01:10|00:20.749|1|1|183612K|",
+        "10294160|b10191d0-6985-4c3a-8ccb-"
+        "aa7d23ebffc7|rule_bam_bwa_mem_mosdepth_"
+        "simulate_reads|00:01:30|00:24.055|1|1||32000M",
+        "10294160.batch|batch||00:01:30|00:03.186|1|1|71192K|",
+        "10294160.0|python3.12||00:01:10|00:20.868|1|1|184352K|",
+    ]
+    df = parse_sacct_data(
+        lines=test_data, e_threshold=0.0, run_uuid="test", logger=None
+    )
+    output = StringIO()
+    df.to_csv(output, index=False)
+    print(output.getvalue())
+    # this should only be two rows once collapsed
+    assert len(df) == 2
+    # check that RuleName is properly inherited from main jobs
+    assert all(df["RuleName"] == "rule_bam_bwa_mem_mosdepth_simulate_reads")
+    # check that RequestedMem_MB is properly inherited
+    assert all(df["RequestedMem_MB"] == 32000.0)
+    # check that MaxRSS_MB is properly calculated from job steps
+    assert df.iloc[0]["MaxRSS_MB"] > 0  # Should have actual memory usage from job step
+
+
 class TestEfficiencyReport(snakemake.common.tests.TestWorkflowsLocalStorageBase):
     __test__ = True