Skip to content

Commit fa5977d

Browse files
jakemoroni authored and gregkh committed
IB/cm: use rwlock for MAD agent lock
[ Upstream commit 4dab26b ]

In workloads where there are many processes establishing connections using RDMA CM in parallel (large scale MPI), there can be heavy contention for mad_agent_lock in cm_alloc_msg.

This contention can occur while inside of a spin_lock_irq region, leading to interrupts being disabled for extended durations on many cores. Furthermore, it leads to the serialization of rdma_create_ah calls, which has negative performance impacts for NICs which are capable of processing multiple address handle creations in parallel.

The end result is the machine becoming unresponsive, hung task warnings, netdev TX timeouts, etc.

Since the lock appears to be only for protection from cm_remove_one, it can be changed to a rwlock to resolve these issues.

Reproducer:

Server:
  for i in $(seq 1 512); do
    ucmatose -c 32 -p $((i + 5000)) &
  done

Client:
  for i in $(seq 1 512); do
    ucmatose -c 32 -p $((i + 5000)) -s 10.2.0.52 &
  done

Fixes: 76039ac ("IB/cm: Protect cm_dev, cm_ports and mad_agent with kref and lock")
Link: https://patch.msgid.link/r/[email protected]
Signed-off-by: Jacob Moroni <[email protected]>
Acked-by: Eric Dumazet <[email protected]>
Reviewed-by: Zhu Yanjun <[email protected]>
Reviewed-by: Jason Gunthorpe <[email protected]>
Signed-off-by: Jason Gunthorpe <[email protected]>
Signed-off-by: Sasha Levin <[email protected]>
1 parent a284820 commit fa5977d

File tree

1 file changed

+8
-8
lines changed
  • drivers/infiniband/core

1 file changed

+8
-8
lines changed

drivers/infiniband/core/cm.c

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -166,7 +166,7 @@ struct cm_port {
166166
struct cm_device {
167167
struct kref kref;
168168
struct list_head list;
169-
spinlock_t mad_agent_lock;
169+
rwlock_t mad_agent_lock;
170170
struct ib_device *ib_device;
171171
u8 ack_delay;
172172
int going_down;
@@ -284,7 +284,7 @@ static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv)
284284
if (!cm_id_priv->av.port)
285285
return ERR_PTR(-EINVAL);
286286

287-
spin_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
287+
read_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
288288
mad_agent = cm_id_priv->av.port->mad_agent;
289289
if (!mad_agent) {
290290
m = ERR_PTR(-EINVAL);
@@ -315,7 +315,7 @@ static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv)
315315
m->context[0] = cm_id_priv;
316316

317317
out:
318-
spin_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
318+
read_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
319319
return m;
320320
}
321321

@@ -1294,10 +1294,10 @@ static __be64 cm_form_tid(struct cm_id_private *cm_id_priv)
12941294
if (!cm_id_priv->av.port)
12951295
return cpu_to_be64(low_tid);
12961296

1297-
spin_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
1297+
read_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
12981298
if (cm_id_priv->av.port->mad_agent)
12991299
hi_tid = ((u64)cm_id_priv->av.port->mad_agent->hi_tid) << 32;
1300-
spin_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
1300+
read_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
13011301
return cpu_to_be64(hi_tid | low_tid);
13021302
}
13031303

@@ -4374,7 +4374,7 @@ static int cm_add_one(struct ib_device *ib_device)
43744374
return -ENOMEM;
43754375

43764376
kref_init(&cm_dev->kref);
4377-
spin_lock_init(&cm_dev->mad_agent_lock);
4377+
rwlock_init(&cm_dev->mad_agent_lock);
43784378
cm_dev->ib_device = ib_device;
43794379
cm_dev->ack_delay = ib_device->attrs.local_ca_ack_delay;
43804380
cm_dev->going_down = 0;
@@ -4490,9 +4490,9 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data)
44904490
* The above ensures no call paths from the work are running,
44914491
* the remaining paths all take the mad_agent_lock.
44924492
*/
4493-
spin_lock(&cm_dev->mad_agent_lock);
4493+
write_lock(&cm_dev->mad_agent_lock);
44944494
port->mad_agent = NULL;
4495-
spin_unlock(&cm_dev->mad_agent_lock);
4495+
write_unlock(&cm_dev->mad_agent_lock);
44964496
ib_unregister_mad_agent(mad_agent);
44974497
ib_port_unregister_client_groups(ib_device, i,
44984498
cm_counter_groups);

0 commit comments

Comments
 (0)