Skip to content

Commit 8bfabd1

Browse files
authored
Merge pull request #34 from alexcrichton/fix-linux-deadlock
Optionally increase the size of the pipe buffer
2 parents 5756d02 + 865b7e3 commit 8bfabd1

File tree

2 files changed

+128
-0
lines changed

2 files changed

+128
-0
lines changed

src/unix.rs

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ pub struct Acquired {
2323
impl Client {
2424
pub fn new(limit: usize) -> io::Result<Client> {
2525
let client = unsafe { Client::mk()? };
26+
client.configure_capacity(limit)?;
2627
// I don't think the character written here matters, but I could be
2728
// wrong!
2829
for _ in 0..limit {
@@ -63,6 +64,70 @@ impl Client {
6364
Ok(Client::from_fds(pipes[0], pipes[1]))
6465
}
6566

67+
    /// Ensure the jobserver pipe is large enough to hold `required_capacity`
    /// tokens without risking the single-page deadlock described below.
    /// No-op on non-Linux targets (the `unsafe` block is compiled out).
    fn configure_capacity(&self, required_capacity: usize) -> io::Result<()> {
        // On Linux we may need to increase the capacity of the pipe for the
        // jobserver to work correctly. Linux seems to exhibit behavior where it
        // implements a ring-buffer internally but apparently the ring-ness of
        // the ring-buffer is connected to *pages* of the ring buffer rather
        // than actual bytes of the ring buffer. This means that if the pipe has
        // only one page of capacity we can hit a possible deadlock situation
        // where a bunch of threads are writing to the pipe but they're all
        // blocked, despite the current used capacity of the pipe being less
        // than a page.
        //
        // This was first discovered in rust-lang/cargo#9739 where a system with
        // a large amount of concurrency would hang in `cargo build` when the
        // jobserver pipe only had one page of capacity. This was reduced to a
        // reproduction program [1] which indeed showed that the system would
        // deadlock if the capacity of the pipe was just one page.
        //
        // To fix this issue, on Linux only, we may increase the capacity of the
        // pipe. The main thing here is that if the capacity of the pipe is a
        // single page we try to increase it to two pages, otherwise we fail
        // because a deadlock might happen. While we're at it this goes ahead
        // and factors in the `required_capacity` requested by the client to
        // this calculation as well. If for some reason you want 10_000 units of
        // concurrency in the pipe that means we'll need more than 2 pages
        // (typically 8192 bytes), so we round that up to 3 pages as well.
        //
        // Someone with more understanding of linux pipes and how they buffer
        // internally should probably review this at some point. The exact cause
        // of the deadlock seems a little uncertain and it's not clear why the
        // example program [1] deadlocks and why simply adding another page
        // fixes things. Is this a kernel bug? Do we need to always guarantee at
        // least one free page? I'm not sure! Hopefully for now this is enough
        // to fix the problem until machines start having more than 4k cores,
        // which seems like it might be awhile.
        //
        // [1]: https://github.com/rust-lang/cargo/issues/9739#issuecomment-889183009
        #[cfg(target_os = "linux")]
        unsafe {
            let page_size = libc::sysconf(libc::_SC_PAGESIZE);
            // Query the pipe's current capacity in bytes via the write end.
            let actual_capacity = cvt(libc::fcntl(self.write.as_raw_fd(), libc::F_GETPIPE_SZ))?;

            if let Some(c) = calculate_capacity(
                required_capacity,
                actual_capacity as usize,
                page_size as usize,
            ) {
                // Growing the pipe can fail (e.g. exceeding
                // /proc/sys/fs/pipe-max-size without privilege); surface a
                // descriptive error since silently continuing risks deadlock.
                cvt(libc::fcntl(self.write.as_raw_fd(), libc::F_SETPIPE_SZ, c)).map_err(|e| {
                    io::Error::new(
                        e.kind(),
                        format!(
                            "failed to increase jobserver pipe capacity from {} to {}; \
                             jobserver otherwise might deadlock",
                            actual_capacity, c,
                        ),
                    )
                })?;
            }
        }

        Ok(())
    }
130+
66131
pub unsafe fn open(s: &str) -> Option<Client> {
67132
let mut parts = s.splitn(2, ',');
68133
let read = parts.next().unwrap();
@@ -337,3 +402,44 @@ extern "C" fn sigusr1_handler(
337402
) {
338403
// nothing to do
339404
}
405+
406+
/// Decide whether the jobserver pipe needs a larger capacity and, if so,
/// return the byte count to request via `F_SETPIPE_SZ`.
///
/// Returns `None` when the pipe already holds `required_capacity` bytes of
/// tokens *and* spans more than one page; otherwise returns a page-aligned
/// capacity of at least two pages (a single-page pipe is the configuration
/// known to deadlock — see `configure_capacity`).
#[allow(dead_code)]
fn calculate_capacity(
    required_capacity: usize,
    actual_capacity: usize,
    page_size: usize,
) -> Option<usize> {
    // Round `a` up to the next multiple of `b`; `b` must be a power of two.
    fn round_up_to(a: usize, b: usize) -> usize {
        assert!(b.is_power_of_two());
        (a + (b - 1)) & !(b - 1)
    }

    if actual_capacity < required_capacity {
        // Pipe can't hold all the tokens: grow to fit, and never settle for
        // fewer than two pages to dodge the one-page deadlock. (The rounded
        // value is always at least one page here since required_capacity > 0.)
        let rounded = round_up_to(required_capacity, page_size);
        Some(rounded.max(page_size * 2))
    } else if actual_capacity <= page_size {
        // Room for the tokens, but only one page of capacity — the known
        // deadlocky configuration — so bump to two pages.
        Some(page_size * 2)
    } else {
        None
    }
}
431+
432+
#[cfg(test)]
mod tests {
    use super::calculate_capacity;

    /// Covers the interesting boundary cases of the pipe-capacity
    /// calculation: roomy pipes, single-page pipes, and pipes too small
    /// for the requested token count.
    #[test]
    fn test_calculate_capacity() {
        // A multi-page pipe with plenty of headroom needs no adjustment.
        assert_eq!(calculate_capacity(1, 65536, 4096), None);
        assert_eq!(calculate_capacity(500, 65536, 4096), None);

        // A single-page pipe is always grown to two pages, even when the
        // token count itself would fit...
        assert_eq!(calculate_capacity(1, 4096, 4096), Some(8192));
        assert_eq!(calculate_capacity(4096, 4096, 4096), Some(8192));

        // ...and likewise when demand exceeds the current capacity.
        assert_eq!(calculate_capacity(5000, 4096, 4096), Some(8192));
        assert_eq!(calculate_capacity(8192, 4096, 4096), Some(8192));
    }
}

tests/server.rs

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,3 +156,25 @@ fn zero_client() {
156156
assert!(rx.try_recv().is_err());
157157
}
158158
}
159+
160+
#[test]
fn highly_concurrent() {
    // Total acquisitions per thread; 80 threads hammering an 80-token
    // jobserver reproduces the Linux one-page pipe deadlock if the pipe
    // capacity was not increased.
    const N: usize = 10000;

    let client = t!(Client::new(80));

    let workers: Vec<_> = (0..80)
        .map(|_| {
            let handle = client.clone();
            std::thread::spawn(move || {
                for _ in 0..N {
                    // Acquire and immediately release a token.
                    drop(handle.acquire().unwrap());
                }
            })
        })
        .collect();

    for worker in workers {
        worker.join().unwrap();
    }
}

0 commit comments

Comments
 (0)