| /*------------------------------------------------------------------------- |
| * |
| * socket.c |
| * Microsoft Windows Win32 Socket Functions |
| * |
| * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group |
| * |
| * IDENTIFICATION |
| * src/backend/port/win32/socket.c |
| * |
| *------------------------------------------------------------------------- |
| */ |
| |
| #include "postgres.h" |
| |
/*
 * Selects emulated non-blocking behaviour for pgwin32_recv() and
 * pgwin32_send().
 *
 * The socket emulation layer always puts the underlying socket into
 * non-blocking mode so that signals can be delivered while waiting.  A
 * caller that really wants non-blocking semantics therefore cannot express
 * that on the socket itself and has to set this flag instead: while it is
 * nonzero, a receive or send that would have to wait fails with EWOULDBLOCK
 * rather than waiting.
 *
 * The flag is global and so affects *all* socket operations; set it only
 * for very short periods of time.
 */
int			pgwin32_noblock = 0;
| |
| /* Undef the macros defined in win32.h, so we can access system functions */ |
| #undef socket |
| #undef bind |
| #undef listen |
| #undef accept |
| #undef connect |
| #undef select |
| #undef recv |
| #undef send |
| |
| /* |
| * Blocking socket functions implemented so they listen on both |
| * the socket and the signal event, required for signal handling. |
| */ |
| |
/*
 * Convert the last socket error code into errno
 *
 * The Winsock error is fetched exactly once with WSAGetLastError() and
 * mapped to a Berkeley-style errno value.  Unrecognized codes are reported
 * at NOTICE level and mapped to EINVAL.
 *
 * Note: where there is a direct correspondence between a WSAxxx error code
 * and a Berkeley error symbol, this mapping is actually a no-op, because
 * in win32_port.h we redefine the network-related Berkeley error symbols to
 * have the values of their WSAxxx counterparts.  The point of the switch is
 * mostly to translate near-miss error codes into something that's sensible
 * in the Berkeley universe.
 */
static void
TranslateSocketError(void)
{
	/*
	 * Capture the code up front.  The error-reporting machinery runs other
	 * code before its arguments are evaluated, so asking for the last error
	 * again inside ereport() could report a different value than the one we
	 * dispatched on.
	 */
	int			wsaerr = WSAGetLastError();

	switch (wsaerr)
	{
		case WSAEINVAL:
		case WSANOTINITIALISED:
		case WSAEINVALIDPROVIDER:
		case WSAEINVALIDPROCTABLE:
		case WSAEDESTADDRREQ:
			errno = EINVAL;
			break;
		case WSAEINPROGRESS:
			errno = EINPROGRESS;
			break;
		case WSAEFAULT:
			errno = EFAULT;
			break;
		case WSAEISCONN:
			errno = EISCONN;
			break;
		case WSAEMSGSIZE:
			errno = EMSGSIZE;
			break;
		case WSAEAFNOSUPPORT:
			errno = EAFNOSUPPORT;
			break;
		case WSAEMFILE:
			errno = EMFILE;
			break;
		case WSAENOBUFS:
			errno = ENOBUFS;
			break;
		case WSAEPROTONOSUPPORT:
		case WSAEPROTOTYPE:
		case WSAESOCKTNOSUPPORT:
			errno = EPROTONOSUPPORT;
			break;
		case WSAECONNABORTED:
			errno = ECONNABORTED;
			break;
		case WSAECONNREFUSED:
			errno = ECONNREFUSED;
			break;
		case WSAECONNRESET:
			errno = ECONNRESET;
			break;
		case WSAEINTR:
			errno = EINTR;
			break;
		case WSAENOTSOCK:
			errno = ENOTSOCK;
			break;
		case WSAEOPNOTSUPP:
			errno = EOPNOTSUPP;
			break;
		case WSAEWOULDBLOCK:
			errno = EWOULDBLOCK;
			break;
		case WSAEACCES:
			errno = EACCES;
			break;
		case WSAEADDRINUSE:
			errno = EADDRINUSE;
			break;
		case WSAEADDRNOTAVAIL:
			errno = EADDRNOTAVAIL;
			break;
		case WSAEHOSTDOWN:
			errno = EHOSTDOWN;
			break;
		case WSAEHOSTUNREACH:
		case WSAHOST_NOT_FOUND:
			errno = EHOSTUNREACH;
			break;
		case WSAENETDOWN:
			errno = ENETDOWN;
			break;
		case WSAENETUNREACH:
			errno = ENETUNREACH;
			break;
		case WSAENETRESET:
			errno = ENETRESET;
			break;
		case WSAENOTCONN:
		case WSAESHUTDOWN:
		case WSAEDISCON:
			errno = ENOTCONN;
			break;
		case WSAETIMEDOUT:
			errno = ETIMEDOUT;
			break;
		default:
			ereport(NOTICE,
					(errmsg_internal("unrecognized win32 socket error code: %d",
									 wsaerr)));
			errno = EINVAL;
			break;
	}
}
| |
/*
 * Dispatch queued signals if UNBLOCKED_SIGNAL_QUEUE() says there are any.
 *
 * Returns 1 with errno set to EINTR when signals were dispatched, so the
 * caller can abandon its operation; returns 0 (errno untouched) otherwise.
 */
static int
pgwin32_poll_signals(void)
{
	/* Nothing pending: the common case */
	if (!UNBLOCKED_SIGNAL_QUEUE())
		return 0;

	pgwin32_dispatch_queued_signals();
	errno = EINTR;
	return 1;
}
| |
| static int |
| isDataGram(SOCKET s) |
| { |
| int type; |
| int typelen = sizeof(type); |
| |
| if (getsockopt(s, SOL_SOCKET, SO_TYPE, (char *) &type, &typelen)) |
| return 1; |
| |
| return (type == SOCK_DGRAM) ? 1 : 0; |
| } |
| |
| int |
| pgwin32_waitforsinglesocket(SOCKET s, int what, int timeout) |
| { |
| static HANDLE waitevent = INVALID_HANDLE_VALUE; |
| static SOCKET current_socket = INVALID_SOCKET; |
| static int isUDP = 0; |
| HANDLE events[2]; |
| int r; |
| |
| /* Create an event object just once and use it on all future calls */ |
| if (waitevent == INVALID_HANDLE_VALUE) |
| { |
| waitevent = CreateEvent(NULL, TRUE, FALSE, NULL); |
| |
| if (waitevent == INVALID_HANDLE_VALUE) |
| ereport(ERROR, |
| (errmsg_internal("could not create socket waiting event: error code %lu", GetLastError()))); |
| } |
| else if (!ResetEvent(waitevent)) |
| ereport(ERROR, |
| (errmsg_internal("could not reset socket waiting event: error code %lu", GetLastError()))); |
| |
| /* |
| * Track whether socket is UDP or not. (NB: most likely, this is both |
| * useless and wrong; there is no reason to think that the behavior of |
| * WSAEventSelect is different for TCP and UDP.) |
| */ |
| if (current_socket != s) |
| isUDP = isDataGram(s); |
| current_socket = s; |
| |
| /* |
| * Attach event to socket. NOTE: we must detach it again before |
| * returning, since other bits of code may try to attach other events to |
| * the socket. |
| */ |
| if (WSAEventSelect(s, waitevent, what) != 0) |
| { |
| TranslateSocketError(); |
| return 0; |
| } |
| |
| events[0] = pgwin32_signal_event; |
| events[1] = waitevent; |
| |
| /* |
| * Just a workaround of unknown locking problem with writing in UDP socket |
| * under high load: Client's pgsql backend sleeps infinitely in |
| * WaitForMultipleObjectsEx, pgstat process sleeps in pgwin32_select(). |
| * So, we will wait with small timeout(0.1 sec) and if socket is still |
| * blocked, try WSASend (see comments in pgwin32_select) and wait again. |
| */ |
| if ((what & FD_WRITE) && isUDP) |
| { |
| for (;;) |
| { |
| r = WaitForMultipleObjectsEx(2, events, FALSE, 100, TRUE); |
| |
| if (r == WAIT_TIMEOUT) |
| { |
| char c; |
| WSABUF buf; |
| DWORD sent; |
| |
| buf.buf = &c; |
| buf.len = 0; |
| |
| r = WSASend(s, &buf, 1, &sent, 0, NULL, NULL); |
| if (r == 0) /* Completed - means things are fine! */ |
| { |
| WSAEventSelect(s, NULL, 0); |
| return 1; |
| } |
| else if (WSAGetLastError() != WSAEWOULDBLOCK) |
| { |
| TranslateSocketError(); |
| WSAEventSelect(s, NULL, 0); |
| return 0; |
| } |
| } |
| else |
| break; |
| } |
| } |
| else |
| r = WaitForMultipleObjectsEx(2, events, FALSE, timeout, TRUE); |
| |
| WSAEventSelect(s, NULL, 0); |
| |
| if (r == WAIT_OBJECT_0 || r == WAIT_IO_COMPLETION) |
| { |
| pgwin32_dispatch_queued_signals(); |
| errno = EINTR; |
| return 0; |
| } |
| if (r == WAIT_OBJECT_0 + 1) |
| return 1; |
| if (r == WAIT_TIMEOUT) |
| { |
| errno = EWOULDBLOCK; |
| return 0; |
| } |
| ereport(ERROR, |
| (errmsg_internal("unrecognized return value from WaitForMultipleObjects: %d (error code %lu)", r, GetLastError()))); |
| return 0; |
| } |
| |
| /* |
| * Create a socket, setting it to overlapped and non-blocking |
| */ |
| SOCKET |
| pgwin32_socket(int af, int type, int protocol) |
| { |
| SOCKET s; |
| unsigned long on = 1; |
| |
| s = WSASocket(af, type, protocol, NULL, 0, WSA_FLAG_OVERLAPPED); |
| if (s == INVALID_SOCKET) |
| { |
| TranslateSocketError(); |
| return INVALID_SOCKET; |
| } |
| |
| if (ioctlsocket(s, FIONBIO, &on)) |
| { |
| TranslateSocketError(); |
| closesocket(s); |
| return INVALID_SOCKET; |
| } |
| errno = 0; |
| |
| return s; |
| } |
| |
| int |
| pgwin32_bind(SOCKET s, struct sockaddr *addr, int addrlen) |
| { |
| int res; |
| |
| res = bind(s, addr, addrlen); |
| if (res < 0) |
| TranslateSocketError(); |
| return res; |
| } |
| |
| int |
| pgwin32_listen(SOCKET s, int backlog) |
| { |
| int res; |
| |
| res = listen(s, backlog); |
| if (res < 0) |
| TranslateSocketError(); |
| return res; |
| } |
| |
| SOCKET |
| pgwin32_accept(SOCKET s, struct sockaddr *addr, int *addrlen) |
| { |
| SOCKET rs; |
| |
| /* |
| * Poll for signals, but don't return with EINTR, since we don't handle |
| * that in pqcomm.c |
| */ |
| pgwin32_poll_signals(); |
| |
| rs = WSAAccept(s, addr, addrlen, NULL, 0); |
| if (rs == INVALID_SOCKET) |
| { |
| TranslateSocketError(); |
| return INVALID_SOCKET; |
| } |
| return rs; |
| } |
| |
| |
| /* No signal delivery during connect. */ |
| int |
| pgwin32_connect(SOCKET s, const struct sockaddr *addr, int addrlen) |
| { |
| int r; |
| |
| r = WSAConnect(s, addr, addrlen, NULL, NULL, NULL, NULL); |
| if (r == 0) |
| return 0; |
| |
| if (WSAGetLastError() != WSAEWOULDBLOCK) |
| { |
| TranslateSocketError(); |
| return -1; |
| } |
| |
| while (pgwin32_waitforsinglesocket(s, FD_CONNECT, INFINITE) == 0) |
| { |
| /* Loop endlessly as long as we are just delivering signals */ |
| } |
| |
| return 0; |
| } |
| |
| int |
| pgwin32_recv(SOCKET s, char *buf, int len, int f) |
| { |
| WSABUF wbuf; |
| int r; |
| DWORD b; |
| DWORD flags = f; |
| int n; |
| |
| if (pgwin32_poll_signals()) |
| return -1; |
| |
| wbuf.len = len; |
| wbuf.buf = buf; |
| |
| r = WSARecv(s, &wbuf, 1, &b, &flags, NULL, NULL); |
| if (r != SOCKET_ERROR) |
| return b; /* success */ |
| |
| if (WSAGetLastError() != WSAEWOULDBLOCK) |
| { |
| TranslateSocketError(); |
| return -1; |
| } |
| |
| if (pgwin32_noblock) |
| { |
| /* |
| * No data received, and we are in "emulated non-blocking mode", so |
| * return indicating that we'd block if we were to continue. |
| */ |
| errno = EWOULDBLOCK; |
| return -1; |
| } |
| |
| /* We're in blocking mode, so wait for data */ |
| |
| for (n = 0; n < 5; n++) |
| { |
| if (pgwin32_waitforsinglesocket(s, FD_READ | FD_CLOSE | FD_ACCEPT, |
| INFINITE) == 0) |
| return -1; /* errno already set */ |
| |
| r = WSARecv(s, &wbuf, 1, &b, &flags, NULL, NULL); |
| if (r != SOCKET_ERROR) |
| return b; /* success */ |
| if (WSAGetLastError() != WSAEWOULDBLOCK) |
| { |
| TranslateSocketError(); |
| return -1; |
| } |
| |
| /* |
| * There seem to be cases on win2k (at least) where WSARecv can return |
| * WSAEWOULDBLOCK even when pgwin32_waitforsinglesocket claims the |
| * socket is readable. In this case, just sleep for a moment and try |
| * again. We try up to 5 times - if it fails more than that it's not |
| * likely to ever come back. |
| */ |
| pg_usleep(10000); |
| } |
| ereport(NOTICE, |
| (errmsg_internal("could not read from ready socket (after retries)"))); |
| errno = EWOULDBLOCK; |
| return -1; |
| } |
| |
| /* |
| * The second argument to send() is defined by SUS to be a "const void *" |
| * and so we use the same signature here to keep compilers happy when |
| * handling callers. |
| * |
| * But the buf member of a WSABUF struct is defined as "char *", so we cast |
| * the second argument to that here when assigning it, also to keep compilers |
| * happy. |
| */ |
| |
| int |
| pgwin32_send(SOCKET s, const void *buf, int len, int flags) |
| { |
| WSABUF wbuf; |
| int r; |
| DWORD b; |
| |
| if (pgwin32_poll_signals()) |
| return -1; |
| |
| wbuf.len = len; |
| wbuf.buf = (char *) buf; |
| |
| /* |
| * Readiness of socket to send data to UDP socket may be not true: socket |
| * can become busy again! So loop until send or error occurs. |
| */ |
| for (;;) |
| { |
| r = WSASend(s, &wbuf, 1, &b, flags, NULL, NULL); |
| if (r != SOCKET_ERROR && b > 0) |
| /* Write succeeded right away */ |
| return b; |
| |
| if (r == SOCKET_ERROR && |
| WSAGetLastError() != WSAEWOULDBLOCK) |
| { |
| TranslateSocketError(); |
| return -1; |
| } |
| |
| if (pgwin32_noblock) |
| { |
| /* |
| * No data sent, and we are in "emulated non-blocking mode", so |
| * return indicating that we'd block if we were to continue. |
| */ |
| errno = EWOULDBLOCK; |
| return -1; |
| } |
| |
| /* No error, zero bytes */ |
| |
| if (pgwin32_waitforsinglesocket(s, FD_WRITE | FD_CLOSE, INFINITE) == 0) |
| return -1; |
| } |
| |
| return -1; |
| } |
| |
| |
| /* |
| * Wait for activity on one or more sockets. |
| * While waiting, allow signals to run |
| * |
| * NOTE! Currently does not implement exceptfds check, |
| * since it is not used in postgresql! |
| */ |
| int |
| pgwin32_select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, const struct timeval *timeout) |
| { |
| WSAEVENT events[FD_SETSIZE * 2]; /* worst case is readfds totally |
| * different from writefds, so |
| * 2*FD_SETSIZE sockets */ |
| SOCKET sockets[FD_SETSIZE * 2]; |
| int numevents = 0; |
| int i; |
| int r; |
| DWORD timeoutval = WSA_INFINITE; |
| FD_SET outreadfds; |
| FD_SET outwritefds; |
| int nummatches = 0; |
| |
| Assert(exceptfds == NULL); |
| |
| if (pgwin32_poll_signals()) |
| return -1; |
| |
| FD_ZERO(&outreadfds); |
| FD_ZERO(&outwritefds); |
| |
| /* |
| * Windows does not guarantee to log an FD_WRITE network event indicating |
| * that more data can be sent unless the previous send() failed with |
| * WSAEWOULDBLOCK. While our caller might well have made such a call, we |
| * cannot assume that here. Therefore, if waiting for write-ready, force |
| * the issue by doing a dummy send(). If the dummy send() succeeds, |
| * assume that the socket is in fact write-ready, and return immediately. |
| * Also, if it fails with something other than WSAEWOULDBLOCK, return a |
| * write-ready indication to let our caller deal with the error condition. |
| */ |
| if (writefds != NULL) |
| { |
| for (i = 0; i < writefds->fd_count; i++) |
| { |
| char c; |
| WSABUF buf; |
| DWORD sent; |
| |
| buf.buf = &c; |
| buf.len = 0; |
| |
| r = WSASend(writefds->fd_array[i], &buf, 1, &sent, 0, NULL, NULL); |
| if (r == 0 || WSAGetLastError() != WSAEWOULDBLOCK) |
| FD_SET(writefds->fd_array[i], &outwritefds); |
| } |
| |
| /* If we found any write-ready sockets, just return them immediately */ |
| if (outwritefds.fd_count > 0) |
| { |
| memcpy(writefds, &outwritefds, sizeof(fd_set)); |
| if (readfds) |
| FD_ZERO(readfds); |
| return outwritefds.fd_count; |
| } |
| } |
| |
| |
| /* Now set up for an actual select */ |
| |
| if (timeout != NULL) |
| { |
| /* timeoutval is in milliseconds */ |
| timeoutval = timeout->tv_sec * 1000 + timeout->tv_usec / 1000; |
| } |
| |
| if (readfds != NULL) |
| { |
| for (i = 0; i < readfds->fd_count; i++) |
| { |
| events[numevents] = WSACreateEvent(); |
| sockets[numevents] = readfds->fd_array[i]; |
| numevents++; |
| } |
| } |
| if (writefds != NULL) |
| { |
| for (i = 0; i < writefds->fd_count; i++) |
| { |
| if (!readfds || |
| !FD_ISSET(writefds->fd_array[i], readfds)) |
| { |
| /* If the socket is not in the read list */ |
| events[numevents] = WSACreateEvent(); |
| sockets[numevents] = writefds->fd_array[i]; |
| numevents++; |
| } |
| } |
| } |
| |
| for (i = 0; i < numevents; i++) |
| { |
| int flags = 0; |
| |
| if (readfds && FD_ISSET(sockets[i], readfds)) |
| flags |= FD_READ | FD_ACCEPT | FD_CLOSE; |
| |
| if (writefds && FD_ISSET(sockets[i], writefds)) |
| flags |= FD_WRITE | FD_CLOSE; |
| |
| if (WSAEventSelect(sockets[i], events[i], flags) != 0) |
| { |
| TranslateSocketError(); |
| /* release already-assigned event objects */ |
| while (--i >= 0) |
| WSAEventSelect(sockets[i], NULL, 0); |
| for (i = 0; i < numevents; i++) |
| WSACloseEvent(events[i]); |
| return -1; |
| } |
| } |
| |
| events[numevents] = pgwin32_signal_event; |
| r = WaitForMultipleObjectsEx(numevents + 1, events, FALSE, timeoutval, TRUE); |
| if (r != WAIT_TIMEOUT && r != WAIT_IO_COMPLETION && r != (WAIT_OBJECT_0 + numevents)) |
| { |
| /* |
| * We scan all events, even those not signaled, in case more than one |
| * event has been tagged but Wait.. can only return one. |
| */ |
| WSANETWORKEVENTS resEvents; |
| |
| for (i = 0; i < numevents; i++) |
| { |
| ZeroMemory(&resEvents, sizeof(resEvents)); |
| if (WSAEnumNetworkEvents(sockets[i], events[i], &resEvents) != 0) |
| elog(ERROR, "failed to enumerate network events: error code %d", |
| WSAGetLastError()); |
| /* Read activity? */ |
| if (readfds && FD_ISSET(sockets[i], readfds)) |
| { |
| if ((resEvents.lNetworkEvents & FD_READ) || |
| (resEvents.lNetworkEvents & FD_ACCEPT) || |
| (resEvents.lNetworkEvents & FD_CLOSE)) |
| { |
| FD_SET(sockets[i], &outreadfds); |
| |
| nummatches++; |
| } |
| } |
| /* Write activity? */ |
| if (writefds && FD_ISSET(sockets[i], writefds)) |
| { |
| if ((resEvents.lNetworkEvents & FD_WRITE) || |
| (resEvents.lNetworkEvents & FD_CLOSE)) |
| { |
| FD_SET(sockets[i], &outwritefds); |
| |
| nummatches++; |
| } |
| } |
| } |
| } |
| |
| /* Clean up all the event objects */ |
| for (i = 0; i < numevents; i++) |
| { |
| WSAEventSelect(sockets[i], NULL, 0); |
| WSACloseEvent(events[i]); |
| } |
| |
| if (r == WSA_WAIT_TIMEOUT) |
| { |
| if (readfds) |
| FD_ZERO(readfds); |
| if (writefds) |
| FD_ZERO(writefds); |
| return 0; |
| } |
| |
| /* Signal-like events. */ |
| if (r == WAIT_OBJECT_0 + numevents || r == WAIT_IO_COMPLETION) |
| { |
| pgwin32_dispatch_queued_signals(); |
| errno = EINTR; |
| if (readfds) |
| FD_ZERO(readfds); |
| if (writefds) |
| FD_ZERO(writefds); |
| return -1; |
| } |
| |
| /* Overwrite socket sets with our resulting values */ |
| if (readfds) |
| memcpy(readfds, &outreadfds, sizeof(fd_set)); |
| if (writefds) |
| memcpy(writefds, &outwritefds, sizeof(fd_set)); |
| return nummatches; |
| } |