Skip to content

Commit 5987feb

Browse files
committed
Make PQcancel use the PGconn's tcp_user_timeout and keepalives settings.
If connectivity to the server has been lost or become flaky, the user might well try to send a query cancel. It's highly annoying if PQcancel hangs up in such a case, but that's exactly what's likely to happen. To ameliorate this problem, apply the PGconn's tcp_user_timeout and keepalives settings to the TCP connection used to send the cancel. This should be safe on Unix machines, since POSIX specifies that setsockopt() is async-signal-safe. We are guessing that WSAIoctl(SIO_KEEPALIVE_VALS) is similarly safe on Windows. (Note that at least in psql and our other frontend programs, there's no safety issue involved anyway, since we run PQcancel in its own thread rather than in a signal handler.) Most of the value here comes from the expectation that tcp_user_timeout will be applied as a connection timeout. That appears to happen on Linux, even though its tcp(7) man page claims differently. The keepalive options probably won't help much, but as long as we can apply them for not much code, we might as well. Jelte Fennema, reviewed by Fujii Masao and myself Discussion: https://postgr.es/m/AM5PR83MB017870DE81FC84D5E21E9D1EF7AA9@AM5PR83MB0178.EURPRD83.prod.outlook.com
1 parent cc333f3 commit 5987feb

File tree

3 files changed

+204
-56
lines changed

3 files changed

+204
-56
lines changed

doc/src/sgml/libpq.sgml

+2-2
Original file line numberDiff line numberDiff line change
@@ -5707,8 +5707,8 @@ int PQrequestCancel(PGconn *conn);
57075707
<structname>PGconn</structname> object, and in case of failure stores the
57085708
error message in the <structname>PGconn</structname> object (whence it can
57095709
be retrieved by <xref linkend="libpq-PQerrorMessage"/>). Although
5710-
the functionality is the same, this approach creates hazards for
5711-
multiple-thread programs and signal handlers, since it is possible
5710+
the functionality is the same, this approach is not safe within
5711+
multiple-thread programs or signal handlers, since it is possible
57125712
that overwriting the <structname>PGconn</structname>'s error message will
57135713
mess up the operation currently in progress on the connection.
57145714
</para>

src/interfaces/libpq/fe-connect.c

+195-54
Original file line numberDiff line numberDiff line change
@@ -1944,34 +1944,25 @@ setKeepalivesCount(PGconn *conn)
19441944
/*
19451945
* Enable keepalives and set the keepalive values on Win32,
19461946
* where they are always set in one batch.
1947+
*
1948+
* CAUTION: This needs to be signal safe, since it's used by PQcancel.
19471949
*/
19481950
static int
1949-
setKeepalivesWin32(PGconn *conn)
1951+
setKeepalivesWin32(pgsocket sock, int idle, int interval)
19501952
{
19511953
struct tcp_keepalive ka;
19521954
DWORD retsize;
1953-
int idle = 0;
1954-
int interval = 0;
19551955

1956-
if (conn->keepalives_idle &&
1957-
!parse_int_param(conn->keepalives_idle, &idle, conn,
1958-
"keepalives_idle"))
1959-
return 0;
19601956
if (idle <= 0)
19611957
idle = 2 * 60 * 60; /* 2 hours = default */
1962-
1963-
if (conn->keepalives_interval &&
1964-
!parse_int_param(conn->keepalives_interval, &interval, conn,
1965-
"keepalives_interval"))
1966-
return 0;
19671958
if (interval <= 0)
19681959
interval = 1; /* 1 second = default */
19691960

19701961
ka.onoff = 1;
19711962
ka.keepalivetime = idle * 1000;
19721963
ka.keepaliveinterval = interval * 1000;
19731964

1974-
if (WSAIoctl(conn->sock,
1965+
if (WSAIoctl(sock,
19751966
SIO_KEEPALIVE_VALS,
19761967
(LPVOID) &ka,
19771968
sizeof(ka),
@@ -1981,6 +1972,26 @@ setKeepalivesWin32(PGconn *conn)
19811972
NULL,
19821973
NULL)
19831974
!= 0)
1975+
return 0;
1976+
return 1;
1977+
}
1978+
1979+
static int
1980+
prepKeepalivesWin32(PGconn *conn)
1981+
{
1982+
int idle = -1;
1983+
int interval = -1;
1984+
1985+
if (conn->keepalives_idle &&
1986+
!parse_int_param(conn->keepalives_idle, &idle, conn,
1987+
"keepalives_idle"))
1988+
return 0;
1989+
if (conn->keepalives_interval &&
1990+
!parse_int_param(conn->keepalives_interval, &interval, conn,
1991+
"keepalives_interval"))
1992+
return 0;
1993+
1994+
if (!setKeepalivesWin32(conn->sock, idle, interval))
19841995
{
19851996
appendPQExpBuffer(&conn->errorMessage,
19861997
libpq_gettext("%s(%s) failed: error code %d\n"),
@@ -2644,7 +2655,7 @@ PQconnectPoll(PGconn *conn)
26442655
err = 1;
26452656
#else /* WIN32 */
26462657
#ifdef SIO_KEEPALIVE_VALS
2647-
else if (!setKeepalivesWin32(conn))
2658+
else if (!prepKeepalivesWin32(conn))
26482659
err = 1;
26492660
#endif /* SIO_KEEPALIVE_VALS */
26502661
#endif /* WIN32 */
@@ -4380,8 +4391,53 @@ PQgetCancel(PGconn *conn)
43804391
memcpy(&cancel->raddr, &conn->raddr, sizeof(SockAddr));
43814392
cancel->be_pid = conn->be_pid;
43824393
cancel->be_key = conn->be_key;
4394+
/* We use -1 to indicate an unset connection option */
4395+
cancel->pgtcp_user_timeout = -1;
4396+
cancel->keepalives = -1;
4397+
cancel->keepalives_idle = -1;
4398+
cancel->keepalives_interval = -1;
4399+
cancel->keepalives_count = -1;
4400+
if (conn->pgtcp_user_timeout != NULL)
4401+
{
4402+
if (!parse_int_param(conn->pgtcp_user_timeout,
4403+
&cancel->pgtcp_user_timeout,
4404+
conn, "tcp_user_timeout"))
4405+
goto fail;
4406+
}
4407+
if (conn->keepalives != NULL)
4408+
{
4409+
if (!parse_int_param(conn->keepalives,
4410+
&cancel->keepalives,
4411+
conn, "keepalives"))
4412+
goto fail;
4413+
}
4414+
if (conn->keepalives_idle != NULL)
4415+
{
4416+
if (!parse_int_param(conn->keepalives_idle,
4417+
&cancel->keepalives_idle,
4418+
conn, "keepalives_idle"))
4419+
goto fail;
4420+
}
4421+
if (conn->keepalives_interval != NULL)
4422+
{
4423+
if (!parse_int_param(conn->keepalives_interval,
4424+
&cancel->keepalives_interval,
4425+
conn, "keepalives_interval"))
4426+
goto fail;
4427+
}
4428+
if (conn->keepalives_count != NULL)
4429+
{
4430+
if (!parse_int_param(conn->keepalives_count,
4431+
&cancel->keepalives_count,
4432+
conn, "keepalives_count"))
4433+
goto fail;
4434+
}
43834435

43844436
return cancel;
4437+
4438+
fail:
4439+
free(cancel);
4440+
return NULL;
43854441
}
43864442

43874443
/* PQfreeCancel: free a cancel structure */
@@ -4394,28 +4450,46 @@ PQfreeCancel(PGcancel *cancel)
43944450

43954451

43964452
/*
4397-
* PQcancel and PQrequestCancel: attempt to request cancellation of the
4398-
* current operation.
4453+
* Sets an integer socket option on a TCP socket, if the provided value is
4454+
* not negative. Returns false if setsockopt fails for some reason.
4455+
*
4456+
* CAUTION: This needs to be signal safe, since it's used by PQcancel.
4457+
*/
4458+
#if defined(TCP_USER_TIMEOUT) || !defined(WIN32)
4459+
static bool
4460+
optional_setsockopt(int fd, int protoid, int optid, int value)
4461+
{
4462+
if (value < 0)
4463+
return true;
4464+
if (setsockopt(fd, protoid, optid, (char *) &value, sizeof(value)) < 0)
4465+
return false;
4466+
return true;
4467+
}
4468+
#endif
4469+
4470+
4471+
/*
4472+
* PQcancel: request query cancel
43994473
*
44004474
* The return value is true if the cancel request was successfully
44014475
* dispatched, false if not (in which case an error message is available).
44024476
* Note: successful dispatch is no guarantee that there will be any effect at
44034477
* the backend. The application must read the operation result as usual.
44044478
*
4479+
* On failure, an error message is stored in *errbuf, which must be of size
4480+
* errbufsize (recommended size is 256 bytes). *errbuf is not changed on
4481+
* success return.
4482+
*
44054483
* CAUTION: we want this routine to be safely callable from a signal handler
44064484
* (for example, an application might want to call it in a SIGINT handler).
44074485
* This means we cannot use any C library routine that might be non-reentrant.
44084486
* malloc/free are often non-reentrant, and anything that might call them is
44094487
* just as dangerous. We avoid sprintf here for that reason. Building up
44104488
* error messages with strcpy/strcat is tedious but should be quite safe.
44114489
* We also save/restore errno in case the signal handler support doesn't.
4412-
*
4413-
* internal_cancel() is an internal helper function to make code-sharing
4414-
* between the two versions of the cancel function possible.
44154490
*/
4416-
static int
4417-
internal_cancel(SockAddr *raddr, int be_pid, int be_key,
4418-
char *errbuf, int errbufsize)
4491+
int
4492+
PQcancel(PGcancel *cancel, char *errbuf, int errbufsize)
44194493
{
44204494
int save_errno = SOCK_ERRNO;
44214495
pgsocket tmpsock = PGINVALID_SOCKET;
@@ -4426,18 +4500,98 @@ internal_cancel(SockAddr *raddr, int be_pid, int be_key,
44264500
CancelRequestPacket cp;
44274501
} crp;
44284502

4503+
if (!cancel)
4504+
{
4505+
strlcpy(errbuf, "PQcancel() -- no cancel object supplied", errbufsize);
4506+
/* strlcpy probably doesn't change errno, but be paranoid */
4507+
SOCK_ERRNO_SET(save_errno);
4508+
return false;
4509+
}
4510+
44294511
/*
44304512
* We need to open a temporary connection to the postmaster. Do this with
44314513
* only kernel calls.
44324514
*/
4433-
if ((tmpsock = socket(raddr->addr.ss_family, SOCK_STREAM, 0)) == PGINVALID_SOCKET)
4515+
if ((tmpsock = socket(cancel->raddr.addr.ss_family, SOCK_STREAM, 0)) == PGINVALID_SOCKET)
44344516
{
44354517
strlcpy(errbuf, "PQcancel() -- socket() failed: ", errbufsize);
44364518
goto cancel_errReturn;
44374519
}
4520+
4521+
/*
4522+
* Since this connection will only be used to send a single packet of
4523+
* data, we don't need NODELAY. We also don't set the socket to
4524+
* nonblocking mode, because the API definition of PQcancel requires the
4525+
* cancel to be sent in a blocking way.
4526+
*
4527+
* We do set socket options related to keepalives and other TCP timeouts.
4528+
* This ensures that this function does not block indefinitely when
4529+
* reasonable keepalive and timeout settings have been provided.
4530+
*/
4531+
if (!IS_AF_UNIX(cancel->raddr.addr.ss_family) &&
4532+
cancel->keepalives != 0)
4533+
{
4534+
#ifndef WIN32
4535+
if (!optional_setsockopt(tmpsock, SOL_SOCKET, SO_KEEPALIVE, 1))
4536+
{
4537+
strlcpy(errbuf, "PQcancel() -- setsockopt(SO_KEEPALIVE) failed: ", errbufsize);
4538+
goto cancel_errReturn;
4539+
}
4540+
4541+
#ifdef PG_TCP_KEEPALIVE_IDLE
4542+
if (!optional_setsockopt(tmpsock, IPPROTO_TCP, PG_TCP_KEEPALIVE_IDLE,
4543+
cancel->keepalives_idle))
4544+
{
4545+
strlcpy(errbuf, "PQcancel() -- setsockopt(" PG_TCP_KEEPALIVE_IDLE_STR ") failed: ", errbufsize);
4546+
goto cancel_errReturn;
4547+
}
4548+
#endif
4549+
4550+
#ifdef TCP_KEEPINTVL
4551+
if (!optional_setsockopt(tmpsock, IPPROTO_TCP, TCP_KEEPINTVL,
4552+
cancel->keepalives_interval))
4553+
{
4554+
strlcpy(errbuf, "PQcancel() -- setsockopt(TCP_KEEPINTVL) failed: ", errbufsize);
4555+
goto cancel_errReturn;
4556+
}
4557+
#endif
4558+
4559+
#ifdef TCP_KEEPCNT
4560+
if (!optional_setsockopt(tmpsock, IPPROTO_TCP, TCP_KEEPCNT,
4561+
cancel->keepalives_count))
4562+
{
4563+
strlcpy(errbuf, "PQcancel() -- setsockopt(TCP_KEEPCNT) failed: ", errbufsize);
4564+
goto cancel_errReturn;
4565+
}
4566+
#endif
4567+
4568+
#else /* WIN32 */
4569+
4570+
#ifdef SIO_KEEPALIVE_VALS
4571+
if (!setKeepalivesWin32(tmpsock,
4572+
cancel->keepalives_idle,
4573+
cancel->keepalives_interval))
4574+
{
4575+
strlcpy(errbuf, "PQcancel() -- WSAIoctl(SIO_KEEPALIVE_VALS) failed: ", errbufsize);
4576+
goto cancel_errReturn;
4577+
}
4578+
#endif /* SIO_KEEPALIVE_VALS */
4579+
#endif /* WIN32 */
4580+
4581+
/* TCP_USER_TIMEOUT works the same way on Unix and Windows */
4582+
#ifdef TCP_USER_TIMEOUT
4583+
if (!optional_setsockopt(tmpsock, IPPROTO_TCP, TCP_USER_TIMEOUT,
4584+
cancel->pgtcp_user_timeout))
4585+
{
4586+
strlcpy(errbuf, "PQcancel() -- setsockopt(TCP_USER_TIMEOUT) failed: ", errbufsize);
4587+
goto cancel_errReturn;
4588+
}
4589+
#endif
4590+
}
4591+
44384592
retry3:
4439-
if (connect(tmpsock, (struct sockaddr *) &raddr->addr,
4440-
raddr->salen) < 0)
4593+
if (connect(tmpsock, (struct sockaddr *) &cancel->raddr.addr,
4594+
cancel->raddr.salen) < 0)
44414595
{
44424596
if (SOCK_ERRNO == EINTR)
44434597
/* Interrupted system call - we'll just try again */
@@ -4446,16 +4600,12 @@ internal_cancel(SockAddr *raddr, int be_pid, int be_key,
44464600
goto cancel_errReturn;
44474601
}
44484602

4449-
/*
4450-
* We needn't set nonblocking I/O or NODELAY options here.
4451-
*/
4452-
44534603
/* Create and send the cancel request packet. */
44544604

44554605
crp.packetlen = pg_hton32((uint32) sizeof(crp));
44564606
crp.cp.cancelRequestCode = (MsgType) pg_hton32(CANCEL_REQUEST_CODE);
4457-
crp.cp.backendPID = pg_hton32(be_pid);
4458-
crp.cp.cancelAuthCode = pg_hton32(be_key);
4607+
crp.cp.backendPID = pg_hton32(cancel->be_pid);
4608+
crp.cp.cancelAuthCode = pg_hton32(cancel->be_key);
44594609

44604610
retry4:
44614611
if (send(tmpsock, (char *) &crp, sizeof(crp), 0) != (int) sizeof(crp))
@@ -4524,27 +4674,6 @@ internal_cancel(SockAddr *raddr, int be_pid, int be_key,
45244674
return false;
45254675
}
45264676

4527-
/*
4528-
* PQcancel: request query cancel
4529-
*
4530-
* Returns true if able to send the cancel request, false if not.
4531-
*
4532-
* On failure, an error message is stored in *errbuf, which must be of size
4533-
* errbufsize (recommended size is 256 bytes). *errbuf is not changed on
4534-
* success return.
4535-
*/
4536-
int
4537-
PQcancel(PGcancel *cancel, char *errbuf, int errbufsize)
4538-
{
4539-
if (!cancel)
4540-
{
4541-
strlcpy(errbuf, "PQcancel() -- no cancel object supplied", errbufsize);
4542-
return false;
4543-
}
4544-
4545-
return internal_cancel(&cancel->raddr, cancel->be_pid, cancel->be_key,
4546-
errbuf, errbufsize);
4547-
}
45484677

45494678
/*
45504679
* PQrequestCancel: old, not thread-safe function for requesting query cancel
@@ -4562,6 +4691,7 @@ int
45624691
PQrequestCancel(PGconn *conn)
45634692
{
45644693
int r;
4694+
PGcancel *cancel;
45654695

45664696
/* Check we have an open connection */
45674697
if (!conn)
@@ -4577,8 +4707,19 @@ PQrequestCancel(PGconn *conn)
45774707
return false;
45784708
}
45794709

4580-
r = internal_cancel(&conn->raddr, conn->be_pid, conn->be_key,
4581-
conn->errorMessage.data, conn->errorMessage.maxlen);
4710+
cancel = PQgetCancel(conn);
4711+
if (cancel)
4712+
{
4713+
r = PQcancel(cancel, conn->errorMessage.data,
4714+
conn->errorMessage.maxlen);
4715+
PQfreeCancel(cancel);
4716+
}
4717+
else
4718+
{
4719+
strlcpy(conn->errorMessage.data, "out of memory",
4720+
conn->errorMessage.maxlen);
4721+
r = false;
4722+
}
45824723

45834724
if (!r)
45844725
conn->errorMessage.len = strlen(conn->errorMessage.data);

src/interfaces/libpq/libpq-int.h

+7
Original file line numberDiff line numberDiff line change
@@ -583,6 +583,13 @@ struct pg_cancel
583583
SockAddr raddr; /* Remote address */
584584
int be_pid; /* PID of backend --- needed for cancels */
585585
int be_key; /* key of backend --- needed for cancels */
586+
int pgtcp_user_timeout; /* tcp user timeout */
587+
int keepalives; /* use TCP keepalives? */
588+
int keepalives_idle; /* time between TCP keepalives */
589+
int keepalives_interval; /* time between TCP keepalive
590+
* retransmits */
591+
int keepalives_count; /* maximum number of TCP keepalive
592+
* retransmits */
586593
};
587594

588595

0 commit comments

Comments
 (0)