blob: efaa73ee751fb01c7a76f3e1c1456a8f7d2e00e5 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "kudu/security/tls_socket.h"

#include <openssl/err.h>
#include <openssl/ssl.h>
#include <sys/socket.h>

#include <cerrno>
#include <cstddef>
#include <functional>
#include <limits>
#include <string>
#include <utility>

#include <glog/logging.h>

#include "kudu/gutil/basictypes.h"
#include "kudu/gutil/strings/substitute.h"
#include "kudu/security/openssl_util.h"
#include "kudu/util/errno.h"
#include "kudu/util/net/sockaddr.h"
#include "kudu/util/net/socket.h"
using std::string;
using strings::Substitute;
namespace kudu {
namespace security {
// Wraps the descriptor 'fd' and takes ownership of the OpenSSL handle 'ssl'.
//
// Corking is enabled by default and switched off only when the descriptor
// turns out to be a Unix domain socket.
TlsSocket::TlsSocket(int fd, c_unique_ptr<SSL> ssl)
    : Socket(fd),
      ssl_(std::move(ssl)) {
  use_cork_ = true;
#ifndef __APPLE__
  // macOS does not provide `SO_DOMAIN`. Skipping the probe there is harmless
  // because SetTcpCork() is a no-op on macOS.
  int socket_domain = 0;
  socklen_t domain_len = sizeof(socket_domain);
  // Short-circuit evaluation keeps getsockopt() from being called on an
  // invalid descriptor, and 'socket_domain' from being read if the call failed.
  const bool is_unix_domain_socket =
      fd >= 0 &&
      getsockopt(fd, SOL_SOCKET, SO_DOMAIN, &socket_domain, &domain_len) == 0 &&
      socket_domain == AF_UNIX;
  if (is_unix_domain_socket) {
    use_cork_ = false;
  }
#endif
}
// Closes the socket on destruction. Close() reports failure through its
// returned Status, which a destructor cannot propagate, so the result is
// explicitly discarded.
TlsSocket::~TlsSocket() {
  ignore_result(TlsSocket::Close());
}
// Writes up to 'amt' bytes from 'buf' through the TLS session and stores the
// number of bytes accepted in '*nwritten'.
//
// Returns OK with '*nwritten' == 0 when 'amt' is 0, or when OpenSSL reports
// SSL_ERROR_WANT_WRITE without an accompanying errno (socket not ready yet).
// Any other failure is returned as a NetworkError.
Status TlsSocket::Write(const uint8_t *buf, int32_t amt, int32_t *nwritten) {
  CHECK(ssl_);
  SCOPED_OPENSSL_NO_PENDING_ERRORS;
  *nwritten = 0;

  // An empty write is a no-op. It does happen now and then (eg a response with
  // an empty sidecar) and must be handled here because a return value of '0'
  // from SSL_write signals certain kinds of errors.
  if (PREDICT_FALSE(amt == 0)) {
    return Status::OK();
  }

  // Clear errno so that a non-zero value afterwards can be attributed to this call.
  errno = 0;
  const int32_t rc = SSL_write(ssl_.get(), buf, amt);
  const int write_errno = errno;

  // Success: report how much was written.
  if (rc > 0) {
    *nwritten = rc;
    return Status::OK();
  }

  const auto ssl_error = SSL_get_error(ssl_.get(), rc);
  if (ssl_error != SSL_ERROR_WANT_WRITE) {
    return Status::NetworkError("failed to write to TLS socket",
                                GetSSLErrorDescription(ssl_error));
  }

  // SSL_ERROR_WANT_WRITE with no errno: the socket is not ready to write yet.
  if (write_errno == 0) {
    return Status::OK();
  }
  return Status::NetworkError("SSL_write error",
                              ErrnoToString(write_errno), write_errno);
}
// Writes the 'iov_len' buffers described by 'iov' through the TLS session and
// stores the total number of bytes accepted in '*nwritten'.
//
// A short count is not an error: if some bytes were written before a temporary
// socket error occurred, OK is returned and '*nwritten' holds the partial
// count. Other failures from Write() or SetTcpCork() are returned as-is.
Status TlsSocket::Writev(const struct ::iovec *iov, int iov_len, int64_t *nwritten) {
  SCOPED_OPENSSL_NO_PENDING_ERRORS;
  CHECK(ssl_);

  // Write() takes its length as an int32_t, so no single call may be asked to
  // write more than this many bytes.
  const size_t kMaxWriteSize =
      static_cast<size_t>(std::numeric_limits<int32_t>::max());

  // Since OpenSSL doesn't support any kind of writev() call itself, this function
  // sets TCP_CORK and then calls Write() for each of the buffers in the iovec,
  // then unsets TCP_CORK. This causes the Linux kernel to buffer up the packets
  // while corked and then send a minimal number of packets upon uncorking, whereas
  // otherwise it would have sent at least one packet per Write call. This is
  // beneficial since it avoids generating lots of small packets, each of which has
  // overhead in the network stack, etc.
  //
  // The downside, though, is that we need to make (iov_len + 2) system calls, each
  // of which has some significant overhead (even more so after spectre/meltdown
  // mitigations were enabled). This can take significant CPU in the reactor thread,
  // especially when the underlying buffers are small.
  //
  // To mitigate this, we handle a common case where the iovec has a few buffers,
  // but the total iovec length is actually short. This is the case in many types
  // of RPC requests/responses. In this case, it's cheaper to copy all of the buffers
  // into a socket-local buffer 'buf_' and do a single Write call, vs doing the
  // emulated Writev approach described above.
  if (iov_len > 1) {
    size_t total_size = 0;
    for (int i = 0; i < iov_len; i++) {
      total_size += iov[i].iov_len;
    }
    // Assume we can copy about 8 bytes per cycle, and a syscall takes about 1300 cycles,
    // based on some quick benchmarking of 'setsockopt' on a GCP Sky Lake VM.
    //
    // cycles for memcpy and one write = total_size / 8 + syscall
    // cycles for cork, N writes, uncork = syscall * (2 + iov_len)
    //
    // Solve the inequality to find where memcpy is faster:
    // total_size / 8 + syscall < syscall * (2 + iov_len)
    // total_size / 8 < syscall * (1 + iov_len)
    // total_size < 8 * syscall * (1 + iov_len)
    //
    // The threshold is computed in size_t so that a large 'iov_len' cannot
    // overflow 'int' arithmetic.
    const size_t max_copy_size =
        static_cast<size_t>(8) * 1300 * (static_cast<size_t>(iov_len) + 1);
    // The copied buffer is handed to a single Write() call, so it must also fit
    // in the int32_t length that Write() accepts.
    if (total_size <= max_copy_size && total_size <= kMaxWriteSize) {
      buf_.clear();
      buf_.reserve(total_size);
      for (int i = 0; i < iov_len; i++) {
        buf_.append(iov[i].iov_base, iov[i].iov_len);
      }
      // TODO(todd) Write()'s 'nwritten' parameter is int32_t* instead of int64_t*
      // so we need this temporary. We should change Write() to use size_t as well.
      int32_t n = 0;
      Status s = Write(buf_.data(), buf_.size(), &n);
      *nwritten = n;
      return s;
    }
  }

  *nwritten = 0;
  // Allows packets to be aggressively accumulated before sending.
  bool do_cork = use_cork_ && iov_len > 1;
  if (do_cork) {
    RETURN_NOT_OK(SetTcpCork(1));
  }
  Status write_status = Status::OK();
  for (int i = 0; i < iov_len; ++i) {
    const size_t entry_size = iov[i].iov_len;
    // Clamp rather than silently narrowing: an entry larger than INT32_MAX
    // would otherwise turn into a negative or wrapped length. A clamped entry
    // is necessarily a short write, which the check below reports as partial.
    const int32_t frame_size = static_cast<int32_t>(
        entry_size < kMaxWriteSize ? entry_size : kMaxWriteSize);
    int32_t bytes_written = 0;
    // Don't return before unsetting TCP_CORK.
    write_status = Write(static_cast<uint8_t*>(iov[i].iov_base), frame_size, &bytes_written);
    if (!write_status.ok()) break;
    // nwritten should have the correct amount written.
    *nwritten += bytes_written;
    // Stop at the first entry that was not written in full; it is compared
    // against the real entry size so that clamped entries also stop here.
    if (static_cast<size_t>(bytes_written) < entry_size) break;
  }
  if (do_cork) {
    RETURN_NOT_OK(SetTcpCork(0));
  }
  // If we did manage to write something, but not everything, due to a temporary socket
  // error, then we should still return an OK status indicating a successful _partial_
  // write.
  if (*nwritten > 0 && Socket::IsTemporarySocketError(write_status.posix_code())) {
    return Status::OK();
  }
  return write_status;
}
// Reads up to 'amt' bytes from the TLS session into 'buf' and stores the
// number of bytes read in '*nread'.
//
// Returns OK with '*nread' == 0 when OpenSSL reports SSL_ERROR_WANT_READ
// without an accompanying errno (nothing available to read yet). Every failure
// is returned as a NetworkError, and '*nread' is 0 in that case.
Status TlsSocket::Recv(uint8_t *buf, int32_t amt, int32_t *nread) {
  SCOPED_OPENSSL_NO_PENDING_ERRORS;
  CHECK(ssl_);
  // Give the output parameter a defined value on every return path, including
  // the error returns below, the same way Write() zeroes '*nwritten' up front.
  *nread = 0;

  // Clear errno so that a non-zero value afterwards can be attributed to this call.
  errno = 0;
  const int32_t bytes_read = SSL_read(ssl_.get(), buf, amt);
  const int save_errno = errno;
  if (bytes_read > 0) {
    *nread = bytes_read;
    return Status::OK();
  }

  // Looking up the peer address and formatting the message is only worth
  // doing when an error is actually reported, so both are deferred: the
  // "nothing to read yet" outcome below never needs them.
  const auto remote_description = [this]() -> string {
    Sockaddr remote;
    const Status s = GetPeerAddress(&remote);
    return s.ok() ? remote.ToString() : "unknown";
  };
  const auto read_failure_message = [&remote_description]() -> string {
    return Substitute("failed to read from TLS socket (remote: $0)",
                      remote_description());
  };

  if (bytes_read == 0 && SSL_get_shutdown(ssl_.get()) == SSL_RECEIVED_SHUTDOWN) {
    return Status::NetworkError(read_failure_message(),
                                ErrnoToString(ESHUTDOWN), ESHUTDOWN);
  }

  const auto error_code = SSL_get_error(ssl_.get(), bytes_read);
  if (error_code == SSL_ERROR_WANT_READ) {
    if (save_errno != 0) {
      return Status::NetworkError("SSL_read error from " + remote_description(),
                                  ErrnoToString(save_errno), save_errno);
    }
    // Nothing available to read yet.
    return Status::OK();
  }

  const string err_string = read_failure_message();
  if (error_code == SSL_ERROR_SYSCALL && ERR_peek_error() == 0) {
    // From the OpenSSL docs:
    // Some I/O error occurred. The OpenSSL error queue may contain more
    // information on the error. If the error queue is empty (i.e.
    // ERR_get_error() returns 0), ret can be used to find out more about
    // the error: If ret == 0, an EOF was observed that violates the pro-
    // tocol. If ret == -1, the underlying BIO reported an I/O error (for
    // socket I/O on Unix systems, consult errno for details).
    if (bytes_read == 0) {
      // "EOF was observed that violates the protocol" (eg the other end disconnected)
      return Status::NetworkError(err_string, ErrnoToString(ECONNRESET), ECONNRESET);
    }
    if (bytes_read == -1 && save_errno != 0) {
      return Status::NetworkError(err_string, ErrnoToString(save_errno), save_errno);
    }
    return Status::NetworkError(err_string, "unknown ERROR_SYSCALL");
  }
  return Status::NetworkError(err_string, GetSSLErrorDescription(error_code));
}
// Sends the TLS shutdown notification, releases the OpenSSL handle and closes
// the underlying socket. Calling it again after the handle has been released
// is a no-op that returns OK.
//
// A failure to close the underlying socket takes precedence in the returned
// Status; otherwise the outcome of the TLS shutdown is returned.
Status TlsSocket::Close() {
  SCOPED_OPENSSL_NO_PENDING_ERRORS;
  errno = 0;

  // Socket is already closed.
  if (!ssl_) {
    return Status::OK();
  }

  // Kick off the TLS shutdown. There is no need to wait for the peer's
  // response because the underlying socket will not be reused.
  Status shutdown_status = Status::OK();
  const int32_t rc = SSL_shutdown(ssl_.get());
  if (rc < 0) {
    shutdown_status = Status::NetworkError(
        "TlsSocket::Close",
        GetSSLErrorDescription(SSL_get_error(ssl_.get(), rc)));
  }
  ssl_.reset();

  // Close the underlying socket.
  RETURN_NOT_OK(Socket::Close());
  return shutdown_status;
}
} // namespace security
} // namespace kudu