Commit 2863f78

fix: partition cluster selection (#385)
The automated partition selection did not consider a multicluster setup, which can require selecting a partition based on the cluster keyword. The plugin now treats the resource keys `cluster`, `clusters`, and `slurm_cluster` equivalently.

## Summary by CodeRabbit

* **New Features**
  * Added support for multiple cluster specification formats (`cluster`, `clusters`, `slurm_cluster`), treated equivalently across configurations.
* **Bug Fixes**
  * Improved partition selection to enforce strict cluster matching when a cluster is explicitly specified.
* **Documentation**
  * Clarified that partition selection can target specific clusters using cluster resource keys.
* **Tests**
  * Updated tests to verify cluster constraint handling in partition selection logic.
1 parent a3e91cd commit 2863f78

File tree

- docs/further.md
- snakemake_executor_plugin_slurm/__init__.py
- snakemake_executor_plugin_slurm/partitions.py
- snakemake_executor_plugin_slurm/submit_string.py
- tests/test_partition_selection.py

5 files changed: +121 −18 lines changed

docs/further.md

Lines changed: 2 additions & 0 deletions
```diff
@@ -70,6 +70,8 @@ The SLURM executor plugin supports automatic partition selection based on job re
 
 *Jobs that explicitly specify a `slurm_partition` resource will bypass automatic selection and use the specified partition directly.*
 
+> **Note:** Partition selection supports specifying the target cluster using any of the resource keys `cluster`, `clusters`, or `slurm_cluster` in your workflow profile or the partition configuration file. All three are treated equivalently by the plugin.
+
 ##### Partition Limits Specification
 
 To enable automatic partition selection, create a YAML configuration file that defines the available partitions and their resource limits. This file should be structured as follows:
```
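The hunk ends just before the example structure that the last sentence announces. For orientation, a configuration in that format might look like the sketch below; the partition names and limit keys are borrowed from the tests in this commit, and the values are illustrative, not the documented schema itself.

```yaml
# Hypothetical partitions.yaml; keys mirror those exercised by the tests
# in this commit, values are illustrative.
partitions:
  normal-small:
    cluster: normal          # `clusters` or `slurm_cluster` work equivalently
    max_runtime: 360
    max_threads: 32
    max_mem_mb: 64000
  deviating-small:
    cluster: deviating
    max_runtime: 360
    max_threads: 32
    max_mem_mb: 64000
  no-cluster:                # no cluster key: only jobs without a cluster match
    max_runtime: 360
    max_threads: 32
    max_mem_mb: 64000
```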

snakemake_executor_plugin_slurm/__init__.py

Lines changed: 36 additions & 2 deletions
```diff
@@ -868,10 +868,44 @@ def get_partition_arg(self, job: JobExecutorInterface):
         else raises an error - implicitly.
         """
         partition = None
+
+        # Check if a specific partition is requested
         if job.resources.get("slurm_partition"):
-            partition = job.resources.slurm_partition
-        elif self._partitions:
+            # But also check if there's a cluster requirement that might override it
+            job_cluster = (
+                job.resources.get("slurm_cluster")
+                or job.resources.get("cluster")
+                or job.resources.get("clusters")
+            )
+
+            if job_cluster and self._partitions:
+                # If a cluster is specified, verify the partition exists and matches.
+                # Otherwise, use auto-selection to find a partition for that cluster.
+                partition_obj = next(
+                    (
+                        p
+                        for p in self._partitions
+                        if p.name == job.resources.slurm_partition
+                    ),
+                    None,
+                )
+                if (
+                    partition_obj
+                    and partition_obj.partition_cluster
+                    and partition_obj.partition_cluster != job_cluster
+                ):
+                    # Partition exists but is for a different cluster:
+                    # use auto-selection
+                    partition = get_best_partition(self._partitions, job, self.logger)
+                else:
+                    partition = job.resources.slurm_partition
+            else:
+                partition = job.resources.slurm_partition
+
+        # If no partition was selected yet, try auto-selection
+        if not partition and self._partitions:
             partition = get_best_partition(self._partitions, job, self.logger)
+
         # we didn't get a partition yet so try fallback.
         if not partition:
             if self._fallback_partition is None:
```
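The net effect: an explicitly requested partition is honored unless the partition file pins it to a different cluster than the job requests, in which case auto-selection takes over. A hypothetical rule exercising both resources (rule and file names are made up for illustration):

```python
# Snakefile fragment; rule and file names are illustrative.
rule example:
    output:
        "results/out.txt",
    resources:
        slurm_partition="normal-small",  # honored only if its configured
        cluster="normal",                # cluster matches this request
    shell:
        "touch {output}"
```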

snakemake_executor_plugin_slurm/partitions.py

Lines changed: 19 additions & 8 deletions
```diff
@@ -39,12 +39,16 @@ def read_partition_file(partition_file: Path) -> List["Partition"]:
             raise KeyError("Partition name cannot be empty")
 
         # Extract optional cluster name from partition config
-        cluster = partition_config.pop("cluster", None)
+        cluster = None
+        for key in ("slurm_cluster", "cluster", "clusters"):
+            if key in partition_config:
+                cluster = partition_config.pop(key)
+                break
 
         out.append(
             Partition(
                 name=partition_name,
-                cluster=cluster,
+                partition_cluster=cluster,
                 limits=PartitionLimits(**partition_config),
             )
         )
@@ -60,7 +64,7 @@ def get_best_partition(
     for p in candidate_partitions:
         score = p.score_job_fit(job)
         logger.debug(f"Partition '{p.name}' score for job {job.name}: {score}")
-        if score is not None:
+        if score is not None and score > 0:
             scored_partitions.append((p, score))
 
     if scored_partitions:
@@ -241,7 +245,7 @@ class Partition:
 
     name: str
     limits: PartitionLimits
-    cluster: Optional[str] = None
+    partition_cluster: Optional[str] = None
 
     def score_job_fit(self, job: JobExecutorInterface) -> Optional[float]:
         """
@@ -269,14 +273,21 @@ def score_job_fit(self, job: JobExecutorInterface) -> Optional[float]:
         # Accept multiple possible resource names for cluster specification
         job_cluster = (
             job.resources.get("slurm_cluster")
-            or job.resources.get("clusters")
             or job.resources.get("cluster")
+            or job.resources.get("clusters")
         )
 
+        # Enforce strict cluster eligibility:
+        # - If the job specifies a cluster, only partitions with a matching cluster
+        #   are eligible
+        # - If the job does not specify a cluster, only partitions without a cluster
+        #   are eligible
         if job_cluster is not None:
-            # Job specifies a cluster - partition must match
-            if self.cluster is not None and self.cluster != job_cluster:
-                return None  # Partition is for a different cluster
+            if self.partition_cluster != job_cluster:
+                return None  # Not eligible
+        else:
+            if self.partition_cluster is not None:
+                return None  # Not eligible
 
         for resource_key, limit in numerical_resources.items():
             job_requirement = job.resources.get(resource_key, 0)
```
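Note that the strict eligibility branch reduces to a plain equality check between the job's cluster and the partition's cluster, with `None` matching only `None`. A standalone sketch of the predicate (a hypothetical helper, not part of the plugin):

```python
from typing import Optional


def cluster_eligible(
    job_cluster: Optional[str], partition_cluster: Optional[str]
) -> bool:
    # Strict matching: a job with a cluster only fits partitions pinned to
    # that cluster; a job without one only fits unpinned partitions.
    return job_cluster == partition_cluster


assert cluster_eligible("normal", "normal")
assert not cluster_eligible("normal", None)  # allowed before this commit
assert not cluster_eligible(None, "normal")  # also newly excluded
assert cluster_eligible(None, None)
```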

snakemake_executor_plugin_slurm/submit_string.py

Lines changed: 10 additions & 2 deletions
```diff
@@ -42,8 +42,16 @@ def get_submit_command(job, params):
     # "-p '{partition_name}'"
     call += f" {params.partition}"
 
-    if job.resources.get("clusters"):
-        call += f" --clusters {safe_quote(job.resources.clusters)}"
+    # Add cluster specification if provided.
+    # Check for cluster first (singular), then fall back to clusters (plural)
+    # for backwards compatibility.
+    cluster_val = (
+        job.resources.get("cluster")
+        or job.resources.get("clusters")
+        or job.resources.get("slurm_cluster")
+    )
+    if cluster_val:
+        call += f" --cluster {safe_quote(cluster_val)}"
 
     if job.resources.get("runtime"):
         call += f" -t {safe_quote(job.resources.runtime)}"
```

tests/test_partition_selection.py

Lines changed: 54 additions & 6 deletions
```diff
@@ -591,8 +591,8 @@ def test_cluster_specification_via_slurm_cluster(
         # Verify cluster field is read correctly
         # Find partitions by name instead of assuming order
         partition_map = {p.name: p for p in partitions}
-        assert partition_map["normal-small"].cluster == "normal"
-        assert partition_map["deviating-small"].cluster == "deviating"
+        assert partition_map["normal-small"].partition_cluster == "normal"
+        assert partition_map["deviating-small"].partition_cluster == "deviating"
 
         # Job targeting 'normal' cluster
         job = mock_job(threads=16, mem_mb=32000, slurm_cluster="normal")
@@ -664,10 +664,59 @@ def test_cluster_mismatch_excludes_partitions(
         finally:
             temp_path.unlink()
 
+    def test_job_with_cluster_does_not_select_partition_without_cluster(
+        self, multicluster_partition_config, temp_yaml_file, mock_job, mock_logger
+    ):
+        """
+        Test that jobs with a cluster requirement do not select partitions
+        without a cluster.
+        """
+        config = dict(multicluster_partition_config)
+        config["partitions"]["no-cluster"] = {
+            "max_runtime": 360,
+            "max_threads": 32,
+            "max_mem_mb": 64000,
+        }
+        temp_path = temp_yaml_file(config)
+        try:
+            partitions = read_partition_file(temp_path)
+            job = mock_job(threads=16, mem_mb=32000, slurm_cluster="normal")
+            selected_partition = get_best_partition(partitions, job, mock_logger)
+            # Should select a partition with cluster 'normal', not 'no-cluster'
+            assert selected_partition in ["normal-small", "normal-large"]
+        finally:
+            temp_path.unlink()
+
+    def test_job_without_cluster_can_select_partition_without_cluster(
+        self, multicluster_partition_config, temp_yaml_file, mock_job, mock_logger
+    ):
+        """
+        Test that jobs without a cluster requirement can select partitions
+        without a cluster.
+        """
+        config = dict(multicluster_partition_config)
+        config["partitions"]["no-cluster"] = {
+            "max_runtime": 360,
+            "max_threads": 32,
+            "max_mem_mb": 64000,
+        }
+        temp_path = temp_yaml_file(config)
+        try:
+            partitions = read_partition_file(temp_path)
+            job = mock_job(threads=16, mem_mb=32000)
+            selected_partition = get_best_partition(partitions, job, mock_logger)
+            # Should be able to select 'no-cluster' partition
+            assert selected_partition in ["normal-small", "normal-large", "no-cluster"]
+        finally:
+            temp_path.unlink()
+
     def test_job_without_cluster_uses_any_partition(
         self, multicluster_partition_config, temp_yaml_file, mock_job, mock_logger
     ):
-        """Test that jobs without cluster specification can use any partition."""
+        """
+        Test that jobs without cluster specification can use any partition
+        without a cluster assignment.
+        """
         temp_path = temp_yaml_file(multicluster_partition_config)
 
         try:
@@ -677,9 +726,8 @@ def test_job_without_cluster_uses_any_partition(
             job = mock_job(threads=16, mem_mb=32000)
             selected_partition = get_best_partition(partitions, job, mock_logger)
 
-            # Should select a partition (any cluster is acceptable)
-            assert selected_partition is not None
-            assert mock_logger.info.call_count >= 1
+            # Should return None since all partitions have a cluster assignment
+            assert selected_partition is None
         finally:
             temp_path.unlink()
```
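The `multicluster_partition_config` fixture is not part of this diff; a minimal stand-in consistent with the partition names and limit keys used in these tests might look like the following (hypothetical — the real fixture lives in the suite's conftest, and the limit values here are guesses):

```python
# Hypothetical stand-in for the multicluster_partition_config fixture.
multicluster_partition_config = {
    "partitions": {
        "normal-small": {
            "cluster": "normal",
            "max_runtime": 360,
            "max_threads": 32,
            "max_mem_mb": 64000,
        },
        "normal-large": {
            "cluster": "normal",
            "max_runtime": 1440,
            "max_threads": 128,
            "max_mem_mb": 256000,
        },
        "deviating-small": {
            "cluster": "deviating",
            "max_runtime": 360,
            "max_threads": 32,
            "max_mem_mb": 64000,
        },
    }
}
```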

0 commit comments