|
8 | 8 | * reserved. |
9 | 9 | * Copyright (c) 2023 Jeffrey M. Squyres. All rights reserved. |
10 | 10 | * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. |
| 11 | + * Copyright (c) 2025 Bull SAS. All rights reserved. |
11 | 12 | * $COPYRIGHT$ |
12 | 13 | * |
13 | 14 | * Additional copyrights may follow |
|
19 | 20 | #include "instance.h" |
20 | 21 |
|
21 | 22 | #include "opal/util/arch.h" |
| 23 | +#include "opal/util/proc.h" |
22 | 24 |
|
23 | 25 | #include "opal/util/show_help.h" |
24 | 26 | #include "opal/util/argv.h" |
|
39 | 41 | #include "ompi/dpm/dpm.h" |
40 | 42 | #include "ompi/file/file.h" |
41 | 43 | #include "ompi/mpiext/mpiext.h" |
| 44 | +#include "ompi/runtime/ompi_rte.h" |
42 | 45 |
|
43 | 46 | #include "ompi/mca/hook/base/base.h" |
44 | 47 | #include "ompi/mca/op/base/base.h" |
@@ -110,13 +113,17 @@ static void ompi_instance_construct (ompi_instance_t *instance) |
110 | 113 | instance->i_name[0] = '\0'; |
111 | 114 | instance->i_flags = 0; |
112 | 115 | instance->i_keyhash = NULL; |
| 116 | + OBJ_CONSTRUCT(&instance->i_spawned_proc_namelists, opal_list_t); |
| 117 | + OBJ_CONSTRUCT(&instance->i_spawned_proc_lock, opal_mutex_t); |
113 | 118 | OBJ_CONSTRUCT(&instance->s_lock, opal_mutex_t); |
114 | 119 | instance->errhandler_type = OMPI_ERRHANDLER_TYPE_INSTANCE; |
115 | 120 | instance->bsend_buffer = NULL; |
116 | 121 | } |
117 | 122 |
|
118 | 123 | static void ompi_instance_destruct(ompi_instance_t *instance) |
119 | 124 | { |
| 125 | + OBJ_DESTRUCT(&instance->i_spawned_proc_namelists); |
| 126 | + OBJ_DESTRUCT(&instance->i_spawned_proc_lock); |
120 | 127 | OBJ_DESTRUCT(&instance->s_lock); |
121 | 128 | } |
122 | 129 |
|
@@ -177,18 +184,90 @@ static int ompi_instance_print_error (const char *error, int ret) |
177 | 184 | return ret; |
178 | 185 | } |
179 | 186 |
|
| 187 | +/* This function is only needed for the world paradigm, because for now it is |
| 188 | + * the only paradigm in which processes can be spawned */ |
| 189 | +void ompi_proc_retain_spawned_jobids(ompi_proc_t **spawned_procs, size_t list_size) { |
| 190 | + const ompi_proc_t *spawned_proc; |
| 191 | + opal_namelist_t *registered_proc; |
| 192 | + ompi_process_name_t name; |
| 193 | + ompi_rte_cmp_bitmask_t mask; |
| 194 | + |
| 195 | + /* NULL if session paradigm, not NULL if world paradigm */ |
| 196 | + if (ompi_mpi_instance_default == NULL) { |
| 197 | + return; |
| 198 | + } |
| 199 | + |
| 200 | + /* compare registered names on the jobid field only */ |
| 201 | + mask = OMPI_RTE_CMP_JOBID; |
| 202 | + |
| 203 | + for (size_t i = 0; i < list_size; i++) { |
| 204 | + /* The idea is to filter the procs that have the same jobid, |
| 205 | + * aka the jobs in the same instance. |
| 206 | + * After that we lookup if the jobid is already present, meaning this |
| 207 | + * instance is already registered via the jobid of its procs. |
| 208 | + * If the jobid is not present we add it */ |
| 209 | + |
| 210 | + int found = 0; |
| 211 | + spawned_proc = spawned_procs[i]; |
| 212 | + if (OMPI_PROC_MY_NAME->jobid == spawned_proc->super.proc_name.jobid) { |
| 213 | + continue; |
| 214 | + } |
| 215 | + |
| 216 | + name.jobid = spawned_proc->super.proc_name.jobid; |
| 217 | + name.vpid = spawned_proc->super.proc_name.vpid; |
| 218 | + |
| 219 | + opal_mutex_lock(&ompi_mpi_instance_default->i_spawned_proc_lock); |
| 220 | + OPAL_LIST_FOREACH(registered_proc, |
| 221 | + &ompi_mpi_instance_default->i_spawned_proc_namelists, |
| 222 | + opal_namelist_t) { |
| 223 | + if (OPAL_EQUAL == ompi_rte_compare_name_fields(mask, |
| 224 | + &registered_proc->name, &name)) { |
| 225 | + found = 1; |
| 226 | + break; |
| 227 | + } |
| 228 | + } |
| 229 | + |
| 230 | + if (0 == found) { |
| 231 | + opal_namelist_t *namelist = OBJ_NEW(opal_namelist_t); |
| 232 | + namelist->name.jobid = name.jobid; |
| 233 | + namelist->name.vpid = 0; /* not needed for lookup */ |
| 234 | + opal_list_append(&ompi_mpi_instance_default->i_spawned_proc_namelists, |
| 235 | + &namelist->super); |
| 236 | + } |
| 237 | + opal_mutex_unlock(&ompi_mpi_instance_default->i_spawned_proc_lock); |
| 238 | + } |
| 239 | + return; |
| 240 | +} |
| 241 | + |
180 | 242 | static int ompi_mpi_instance_cleanup_pml (void) |
181 | 243 | { |
182 | 244 | /* call del_procs on all allocated procs even though some may not be known |
183 | 245 | * to the pml layer. the pml layer is expected to be resilient and ignore |
184 | 246 | * any unknown procs. */ |
185 | 247 | size_t nprocs = 0; |
186 | 248 | ompi_proc_t **procs; |
| 249 | + opal_namelist_t *registered_name; |
| 250 | + opal_namelist_t *next; |
187 | 251 |
|
188 | 252 | procs = ompi_proc_get_allocated (&nprocs); |
189 | 253 | MCA_PML_CALL(del_procs(procs, nprocs)); |
190 | 254 | free(procs); |
191 | 255 |
|
| 256 | + /* If we are in the world paradigm and spawned processes, we need to clean them up */ |
| 257 | + if (ompi_mpi_instance_default != NULL) { |
| 258 | + |
| 259 | + /* Let's loop on all spawned jobids and del_proc the concerned procs */ |
| 260 | + OPAL_LIST_FOREACH_SAFE(registered_name, next, |
| 261 | + &ompi_mpi_instance_default->i_spawned_proc_namelists, |
| 262 | + opal_namelist_t) { |
| 263 | + |
| 264 | + procs = ompi_proc_get_by_name(&registered_name->name, &nprocs); |
| 265 | + MCA_PML_CALL(del_procs(procs, nprocs)); |
| 266 | + opal_list_remove_item(&ompi_mpi_instance_default->i_spawned_proc_namelists, |
| 267 | + &registered_name->super); |
| 268 | + } |
| 269 | + } |
| 270 | + |
192 | 271 | return OMPI_SUCCESS; |
193 | 272 | } |
194 | 273 |
|
@@ -989,14 +1068,14 @@ int ompi_mpi_instance_finalize (ompi_instance_t **instance) |
989 | 1068 | { |
990 | 1069 | int ret = OMPI_SUCCESS; |
991 | 1070 |
|
992 | | - OBJ_RELEASE(*instance); |
993 | | - |
994 | 1071 | opal_mutex_lock (&instance_lock); |
995 | 1072 | if (0 == opal_atomic_add_fetch_32 (&ompi_instance_count, -1)) { |
996 | 1073 | ret = ompi_mpi_instance_finalize_common (); |
997 | 1074 | } |
998 | 1075 | opal_mutex_unlock (&instance_lock); |
999 | 1076 |
|
| 1077 | + OBJ_RELEASE(*instance); |
| 1078 | + |
1000 | 1079 | *instance = &ompi_mpi_instance_null.instance; |
1001 | 1080 |
|
1002 | 1081 | return ret; |
|
0 commit comments