Skip to content

Commit 959dc30

Browse files
DUPRAT, JULIEN and EmmanuelBRELLE
authored and committed
[PML/UBCL] add_procs fallbacks on higher transport when required
Signed-off-by: Brelle Emmanuel <[email protected]>
1 parent 9ca90fa commit 959dc30

File tree

1 file changed

+57
-14
lines changed

1 file changed

+57
-14
lines changed

ompi/mca/pml/ubcl/pml_ubcl_endpoint.c

Lines changed: 57 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -254,7 +254,7 @@ static int mca_pml_ubcl_create_recv_endpoint(uint64_t sender_rank, const int typ
254254

255255
err = ubcl_export_local_endpoint_handle(type, endpoint_h, &remote_rank_u64);
256256
if (UBCL_SUCCESS != err) {
257-
return OMPI_ERROR;
257+
return ubcl_error_to_ompi(err);
258258
}
259259

260260
return OMPI_SUCCESS;
@@ -269,11 +269,11 @@ static int mca_pml_ubcl_create_self_endpoints(uint64_t remote_rank)
269269

270270
err = ubcl_export_local_endpoint_handle(type, endpoint_h, &my_rank);
271271
if (UBCL_SUCCESS != err) {
272-
return OMPI_ERROR;
272+
return ubcl_error_to_ompi(err);
273273
}
274274
err = ubcl_create_remote_endpoint(my_rank, my_rank, type, endpoint_h);
275275
if (UBCL_SUCCESS != err) {
276-
return OMPI_ERROR;
276+
return ubcl_error_to_ompi(err);
277277
}
278278

279279
return OMPI_SUCCESS;
@@ -295,6 +295,25 @@ static int get_endpoint_type(ompi_proc_t *proc)
295295
}
296296
}
297297

298+
static enum ubcl_endpoint_type_t mca_pml_ubcl_get_higher_transport(
299+
enum ubcl_endpoint_type_t type)
300+
{
301+
switch ((int) type) {
302+
case UBCL_ENDPOINT_TYPE_SELF:
303+
case UBCL_ENDPOINT_TYPE_SHMEM:
304+
type++;
305+
break;
306+
/* There are no valid higher transport */
307+
case UBCL_ENDPOINT_TYPE_BXI:
308+
default:
309+
type = UBCL_ENDPOINT_TYPE_NONE;
310+
/* Not a valid transport */
311+
break;
312+
}
313+
314+
return type;
315+
}
316+
298317
void mca_pml_ubcl_endpoint_retain(ompi_proc_t *proc)
299318
{
300319
mca_common_ubcl_endpoint_t *endpoint = NULL;
@@ -311,6 +330,7 @@ void mca_pml_ubcl_endpoint_retain(ompi_proc_t *proc)
311330
static int mca_pml_ubcl_create_endpoints(ompi_proc_t *proc)
312331
{
313332
int err = OMPI_SUCCESS;
333+
enum ubcl_endpoint_type_t type;
314334
mca_common_ubcl_endpoint_t *new_endpoint;
315335

316336
new_endpoint = malloc(sizeof(mca_common_ubcl_endpoint_t));
@@ -321,32 +341,55 @@ static int mca_pml_ubcl_create_endpoints(ompi_proc_t *proc)
321341

322342
new_endpoint->refcount = 0; //we increment it to 1 in endpoint_retain
323343
new_endpoint->rank = mca_pml_forge_rank(proc);
324-
new_endpoint->type = get_endpoint_type(proc);
344+
type = get_endpoint_type(proc);
325345

326-
if (UBCL_ENDPOINT_TYPE_SELF == new_endpoint->type) {
346+
if (UBCL_ENDPOINT_TYPE_SELF == type) {
327347
err = mca_pml_ubcl_create_self_endpoints((uint64_t) new_endpoint->rank);
328-
goto end;
329-
}
330348

331-
err = mca_pml_ubcl_create_recv_endpoint(new_endpoint->rank, new_endpoint->type);
332-
if (OMPI_SUCCESS != err) {
333-
mca_pml_ubcl_warn(err, "Failed to create recv endpoint for rank %zu\n",
334-
new_endpoint->rank);
335-
return err;
349+
/* If the transport is unavailable (either explicitly disabled,
350+
* or just unavailable) we do not return any error
351+
* If UBCL encountered another error we return it */
352+
if (OMPI_SUCCESS == err) {
353+
goto end;
354+
} else if (OMPI_ERR_NOT_AVAILABLE != err) {
355+
return err;
356+
}
336357
}
337358

338-
err = mca_pml_ubcl_create_send_endpoint(proc, new_endpoint->rank, new_endpoint->type);
359+
/* If a transport is unavailable only a higher transport can take its place,
360+
* ie. if SHM is unavailable, SELF cannot replace it but BXI can */
361+
do {
362+
err = mca_pml_ubcl_create_recv_endpoint(new_endpoint->rank, type);
363+
364+
if (OMPI_ERR_NOT_AVAILABLE == err) {
365+
type = mca_pml_ubcl_get_higher_transport(type);
366+
if (UBCL_ENDPOINT_TYPE_NONE == type) {
367+
mca_pml_ubcl_warn(err, "Failed to create recv endpoint for rank %zu\n",
368+
new_endpoint->rank);
369+
return err;
370+
}
371+
} else if (OMPI_SUCCESS != err) {
372+
mca_pml_ubcl_warn(err, "Failed to create recv endpoint for rank %zu\n",
373+
new_endpoint->rank);
374+
return err;
375+
}
376+
} while (OMPI_SUCCESS != err);
377+
378+
/* No need to loop again, if the transport became unavailable between
379+
* the last operation and this one we can consider this an error */
380+
err = mca_pml_ubcl_create_send_endpoint(proc, new_endpoint->rank, type);
339381
if (OMPI_SUCCESS != err) {
340382
mca_pml_ubcl_warn(err, "Failed to create send endpoint for rank %zu\n",
341383
new_endpoint->rank);
342384
return err;
343385
}
344386

345387
end:
388+
new_endpoint->type = type;
346389
(proc)->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML] = new_endpoint;
347390
mca_pml_ubcl_endpoint_retain(proc);
348391

349-
return err;
392+
return UBCL_SUCCESS;
350393
}
351394

352395
int mca_pml_ubcl_add_procs(ompi_proc_t **procs, size_t nprocs)

0 commit comments

Comments
 (0)