@@ -254,7 +254,7 @@ static int mca_pml_ubcl_create_recv_endpoint(uint64_t sender_rank, const int typ
254254
255255 err = ubcl_export_local_endpoint_handle (type , endpoint_h , & remote_rank_u64 );
256256 if (UBCL_SUCCESS != err ) {
257- return OMPI_ERROR ;
257+ return ubcl_error_to_ompi ( err ) ;
258258 }
259259
260260 return OMPI_SUCCESS ;
@@ -269,11 +269,11 @@ static int mca_pml_ubcl_create_self_endpoints(uint64_t remote_rank)
269269
270270 err = ubcl_export_local_endpoint_handle (type , endpoint_h , & my_rank );
271271 if (UBCL_SUCCESS != err ) {
272- return OMPI_ERROR ;
272+ return ubcl_error_to_ompi ( err ) ;
273273 }
274274 err = ubcl_create_remote_endpoint (my_rank , my_rank , type , endpoint_h );
275275 if (UBCL_SUCCESS != err ) {
276- return OMPI_ERROR ;
276+ return ubcl_error_to_ompi ( err ) ;
277277 }
278278
279279 return OMPI_SUCCESS ;
@@ -295,6 +295,25 @@ static int get_endpoint_type(ompi_proc_t *proc)
295295 }
296296}
297297
298+ static enum ubcl_endpoint_type_t mca_pml_ubcl_get_higher_transport (
299+ enum ubcl_endpoint_type_t type )
300+ {
301+ switch ((int ) type ) {
302+ case UBCL_ENDPOINT_TYPE_SELF :
303+ case UBCL_ENDPOINT_TYPE_SHMEM :
304+ type ++ ;
305+ break ;
306+ /* There are no valid higher transport */
307+ case UBCL_ENDPOINT_TYPE_BXI :
308+ default :
309+ type = UBCL_ENDPOINT_TYPE_NONE ;
310+ /* Not a valid transport */
311+ break ;
312+ }
313+
314+ return type ;
315+ }
316+
298317void mca_pml_ubcl_endpoint_retain (ompi_proc_t * proc )
299318{
300319 mca_common_ubcl_endpoint_t * endpoint = NULL ;
@@ -311,6 +330,7 @@ void mca_pml_ubcl_endpoint_retain(ompi_proc_t *proc)
311330static int mca_pml_ubcl_create_endpoints (ompi_proc_t * proc )
312331{
313332 int err = OMPI_SUCCESS ;
333+ enum ubcl_endpoint_type_t type ;
314334 mca_common_ubcl_endpoint_t * new_endpoint ;
315335
316336 new_endpoint = malloc (sizeof (mca_common_ubcl_endpoint_t ));
@@ -321,32 +341,55 @@ static int mca_pml_ubcl_create_endpoints(ompi_proc_t *proc)
321341
322342 new_endpoint -> refcount = 0 ; //we increment it to 1 in endpoint_retain
323343 new_endpoint -> rank = mca_pml_forge_rank (proc );
324- new_endpoint -> type = get_endpoint_type (proc );
344+ type = get_endpoint_type (proc );
325345
326- if (UBCL_ENDPOINT_TYPE_SELF == new_endpoint -> type ) {
346+ if (UBCL_ENDPOINT_TYPE_SELF == type ) {
327347 err = mca_pml_ubcl_create_self_endpoints ((uint64_t ) new_endpoint -> rank );
328- goto end ;
329- }
330348
331- err = mca_pml_ubcl_create_recv_endpoint (new_endpoint -> rank , new_endpoint -> type );
332- if (OMPI_SUCCESS != err ) {
333- mca_pml_ubcl_warn (err , "Failed to create recv endpoint for rank %zu\n" ,
334- new_endpoint -> rank );
335- return err ;
349+ /* If the transport is unavailable (either explicitly disabled,
350+ * or just unavailable), we do not return any error.
351+ * If UBCL encountered another error, we return it. */
352+ if (OMPI_SUCCESS == err ) {
353+ goto end ;
354+ } else if (OMPI_ERR_NOT_AVAILABLE != err ) {
355+ return err ;
356+ }
336357 }
337358
338- err = mca_pml_ubcl_create_send_endpoint (proc , new_endpoint -> rank , new_endpoint -> type );
359+ /* If a transport is unavailable, only a higher transport can take its place,
360+ * i.e. if SHM is unavailable, SELF cannot replace it but BXI can */
361+ do {
362+ err = mca_pml_ubcl_create_recv_endpoint (new_endpoint -> rank , type );
363+
364+ if (OMPI_ERR_NOT_AVAILABLE == err ) {
365+ type = mca_pml_ubcl_get_higher_transport (type );
366+ if (UBCL_ENDPOINT_TYPE_NONE == type ) {
367+ mca_pml_ubcl_warn (err , "Failed to create recv endpoint for rank %zu\n" ,
368+ new_endpoint -> rank );
369+ return err ;
370+ }
371+ } else if (OMPI_SUCCESS != err ) {
372+ mca_pml_ubcl_warn (err , "Failed to create recv endpoint for rank %zu\n" ,
373+ new_endpoint -> rank );
374+ return err ;
375+ }
376+ } while (OMPI_SUCCESS != err );
377+
378+ /* No need to loop again: if the transport became unavailable between
379+ * the last operation and this one, we can consider this an error */
380+ err = mca_pml_ubcl_create_send_endpoint (proc , new_endpoint -> rank , type );
339381 if (OMPI_SUCCESS != err ) {
340382 mca_pml_ubcl_warn (err , "Failed to create send endpoint for rank %zu\n" ,
341383 new_endpoint -> rank );
342384 return err ;
343385 }
344386
345387end :
388+ new_endpoint -> type = type ;
346389 (proc )-> proc_endpoints [OMPI_PROC_ENDPOINT_TAG_PML ] = new_endpoint ;
347390 mca_pml_ubcl_endpoint_retain (proc );
348391
349- return err ;
392+ return UBCL_SUCCESS ;
350393}
351394
352395int mca_pml_ubcl_add_procs (ompi_proc_t * * procs , size_t nprocs )
0 commit comments