src/kudu/security/tls_socket.cc - kudu - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.

 #include "kudu/security/tls_socket.h"

 #include <openssl/err.h>
 #include <openssl/ssl.h>
 #include <sys/socket.h>

 #include <cerrno>
 #include <cstddef>
 #include <functional>
 #include <string>
 #include <utility>

 #include <glog/logging.h>

 #include "kudu/gutil/basictypes.h"
 #include "kudu/gutil/strings/substitute.h"
 #include "kudu/security/openssl_util.h"
 #include "kudu/util/errno.h"
 #include "kudu/util/net/sockaddr.h"
 #include "kudu/util/net/socket.h"

 using std::string;
 using strings::Substitute;

 namespace kudu {
 namespace security {

 TlsSocket::TlsSocket(int fd, c_unique_ptr<SSL> ssl)
     : Socket(fd),
       ssl_(std::move(ssl)) {
   use_cork_ = true;

 #ifndef __APPLE__
 // `SO_DOMAIN` is not available on macOS. This code can be safely
 // skipped because SetTcpCork() is a no-op on macOS.
   if (fd >= 0) {
     int dom;
     socklen_t len = sizeof(dom);
     if (getsockopt(fd, SOL_SOCKET, SO_DOMAIN, &dom, &len) == 0 &&
         dom == AF_UNIX) {
       use_cork_ = false;
     }
   }
 #endif
 }

 TlsSocket::~TlsSocket() {
   ignore_result(Close());
 }

 Status TlsSocket::Write(const uint8_t *buf, int32_t amt, int32_t *nwritten) {
   CHECK(ssl_);
   SCOPED_OPENSSL_NO_PENDING_ERRORS;

   *nwritten = 0;
   if (PREDICT_FALSE(amt == 0)) {
     // Writing an empty buffer is a no-op. This happens occasionally, eg in the
     // case where the response has an empty sidecar. We have to special case
     // it, because SSL_write can return '0' to indicate certain types of errors.
     return Status::OK();
   }

   errno = 0;
   int32_t bytes_written = SSL_write(ssl_.get(), buf, amt);
   int save_errno = errno;
   if (bytes_written <= 0) {
     auto error_code = SSL_get_error(ssl_.get(), bytes_written);
     if (error_code == SSL_ERROR_WANT_WRITE) {
       if (save_errno != 0) {
         return Status::NetworkError("SSL_write error",
                                     ErrnoToString(save_errno), save_errno);
       }
       // Socket not ready to write yet.
       return Status::OK();
     }
     return Status::NetworkError("failed to write to TLS socket",
                                 GetSSLErrorDescription(error_code));
   }
   *nwritten = bytes_written;
   return Status::OK();
 }

 Status TlsSocket::Writev(const struct ::iovec *iov, int iov_len, int64_t *nwritten) {
   SCOPED_OPENSSL_NO_PENDING_ERRORS;
   CHECK(ssl_);

   // Since OpenSSL doesn't support any kind of writev() call itself, this function
   // sets TCP_CORK and then calls Write() for each of the buffers in the iovec,
   // then unsets TCP_CORK. This causes the Linux kernel to buffer up the packets
   // while corked and then send a minimal number of packets upon uncorking, whereas
   // otherwise it would have sent at least packet per Write call. This is beneficial
   // since it avoids generating lots of small packets, each of which has overhead in
   // the network stack, etc.
   //
   // The downside, though, is that we need to make (iov_len + 2) system calls, each
   // of which has some significant overhead (even moreso after spectre/meltdown
   // mitigations were enabled). This can take significant CPU in the reactor thread,
   // especially when the underlying buffers are small.
   //
   // To mitigate this, we handle a common case where the iovec has a few buffers,
   // but the total iovec length is actually short. This is the case in many types
   // of RPC requests/responses. In this case, it's cheaper to copy all of the buffers
   // into a socket-local buffer 'buf_' and do a single Write call, vs doing the
   // emulated Writev approach described above.
   if (iov_len > 1) {
     size_t total_size = 0;
     for (int i = 0; i < iov_len; i++) {
       total_size += iov[i].iov_len;
     }
     // Assume we can copy about 8 bytes per cycle, and a syscall takes about 1300 cycles,
     // based on some quick benchmarking of 'setsockopt' on a GCP Sky Lake VM.
     //
     // cycles for memcpy and one write = total_size / 8 + syscall
     // cycles for cork, N writes, uncork = syscall * (2 + iov_len)
     //
     // Solve the inequality to find where memcpy is faster:
     // total_size / 8 + syscall < syscall * (2 + iov_len)
     // total_size / 8 < syscall * (1 + iov_len)
     // total_size < 8 * syscall * (1 + iov_len)
     size_t max_copy_size = 8 * 1300 * (1 + iov_len);
     if (total_size <= max_copy_size) {
       buf_.clear();
       buf_.reserve(total_size);
       for (int i = 0; i < iov_len; i++) {
         buf_.append(iov[i].iov_base, iov[i].iov_len);
       }
       // TODO(todd) Write()'s 'nwritten' parameter is int32_t* instead of int64_t*
       // so we need this temporary. We should change Write() to use size_t as well.
       int32_t n = 0;
       Status s = Write(buf_.data(), buf_.size(), &n);
       *nwritten = n;
       return s;
     }
   }

   *nwritten = 0;
   // Allows packets to be aggresively be accumulated before sending.
   bool do_cork = use_cork_ && iov_len > 1;
   if (do_cork) {
     RETURN_NOT_OK(SetTcpCork(1));
   }
   Status write_status = Status::OK();
   for (int i = 0; i < iov_len; ++i) {
     int32_t frame_size = iov[i].iov_len;
     int32_t bytes_written;
     // Don't return before unsetting TCP_CORK.
     write_status = Write(static_cast<uint8_t*>(iov[i].iov_base), frame_size, &bytes_written);
     if (!write_status.ok()) break;

     // nwritten should have the correct amount written.
     *nwritten += bytes_written;
     if (bytes_written < frame_size) break;
   }
   if (do_cork) {
     RETURN_NOT_OK(SetTcpCork(0));
   }
   // If we did manage to write something, but not everything, due to a temporary socket
   // error, then we should still return an OK status indicating a successful _partial_
   // write.
   if (*nwritten > 0 && Socket::IsTemporarySocketError(write_status.posix_code())) {
     return Status::OK();
   }
   return write_status;
 }

 Status TlsSocket::Recv(uint8_t *buf, int32_t amt, int32_t *nread) {
   SCOPED_OPENSSL_NO_PENDING_ERRORS;

   CHECK(ssl_);
   errno = 0;
   int32_t bytes_read = SSL_read(ssl_.get(), buf, amt);
   int save_errno = errno;
   if (bytes_read <= 0) {
     Sockaddr remote;
     Status s = GetPeerAddress(&remote);
     const string remote_str = s.ok() ? remote.ToString() : "unknown";
     string kErrString = Substitute("failed to read from TLS socket (remote: $0)",
                                    remote_str);

     if (bytes_read == 0 && SSL_get_shutdown(ssl_.get()) == SSL_RECEIVED_SHUTDOWN) {
       return Status::NetworkError(kErrString, ErrnoToString(ESHUTDOWN), ESHUTDOWN);
     }
     auto error_code = SSL_get_error(ssl_.get(), bytes_read);
     if (error_code == SSL_ERROR_WANT_READ) {
       if (save_errno != 0) {
         return Status::NetworkError("SSL_read error from " + remote_str,
                                     ErrnoToString(save_errno), save_errno);
       }
       // Nothing available to read yet.
       *nread = 0;
       return Status::OK();
     }
     if (error_code == SSL_ERROR_SYSCALL && ERR_peek_error() == 0) {
       // From the OpenSSL docs:
       //   Some I/O error occurred.  The OpenSSL error queue may contain more
       //   information on the error.  If the error queue is empty (i.e.
       //   ERR_get_error() returns 0), ret can be used to find out more about
       //   the error: If ret == 0, an EOF was observed that violates the pro-
       //   tocol.  If ret == -1, the underlying BIO reported an I/O error (for
       //   socket I/O on Unix systems, consult errno for details).
       if (bytes_read == 0) {
         // "EOF was observed that violates the protocol" (eg the other end disconnected)
         return Status::NetworkError(kErrString, ErrnoToString(ECONNRESET), ECONNRESET);
       }
       if (bytes_read == -1 && save_errno != 0) {
         return Status::NetworkError(kErrString, ErrnoToString(save_errno), save_errno);
       }
       return Status::NetworkError(kErrString, "unknown ERROR_SYSCALL");
     }
     return Status::NetworkError(kErrString, GetSSLErrorDescription(error_code));
   }
   *nread = bytes_read;
   return Status::OK();
 }

 Status TlsSocket::Close() {
   SCOPED_OPENSSL_NO_PENDING_ERRORS;
   errno = 0;

   if (!ssl_) {
     // Socket is already closed.
     return Status::OK();
   }

   // Start the TLS shutdown processes. We don't care about waiting for the
   // response, since the underlying socket will not be reused.
   int32_t ret = SSL_shutdown(ssl_.get());
   Status ssl_shutdown;
   if (ret >= 0) {
     ssl_shutdown = Status::OK();
   } else {
     auto error_code = SSL_get_error(ssl_.get(), ret);
     ssl_shutdown = Status::NetworkError("TlsSocket::Close", GetSSLErrorDescription(error_code));
   }

   ssl_.reset();

   // Close the underlying socket.
   RETURN_NOT_OK(Socket::Close());
   return ssl_shutdown;
 }

 } // namespace security
 } // namespace kudu
	// Licensed to the Apache Software Foundation (ASF) under one
	// or more contributor license agreements. See the NOTICE file
	// distributed with this work for additional information
	// regarding copyright ownership. The ASF licenses this file
	// to you under the Apache License, Version 2.0 (the
	// "License"); you may not use this file except in compliance
	// with the License. You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing,
	// software distributed under the License is distributed on an
	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	// KIND, either express or implied. See the License for the
	// specific language governing permissions and limitations
	// under the License.

	#include "kudu/security/tls_socket.h"

	#include <openssl/err.h>
	#include <openssl/ssl.h>
	#include <sys/socket.h>

	#include <cerrno>
	#include <cstddef>
	#include <functional>
	#include <string>
	#include <utility>

	#include <glog/logging.h>

	#include "kudu/gutil/basictypes.h"
	#include "kudu/gutil/strings/substitute.h"
	#include "kudu/security/openssl_util.h"
	#include "kudu/util/errno.h"
	#include "kudu/util/net/sockaddr.h"
	#include "kudu/util/net/socket.h"

	using std::string;
	using strings::Substitute;

	namespace kudu {
	namespace security {

	TlsSocket::TlsSocket(int fd, c_unique_ptr<SSL> ssl)
	: Socket(fd),
	ssl_(std::move(ssl)) {
	use_cork_ = true;

	#ifndef __APPLE__
	// `SO_DOMAIN` is not available on macOS. This code can be safely
	// skipped because SetTcpCork() is a no-op on macOS.
	if (fd >= 0) {
	int dom;
	socklen_t len = sizeof(dom);
	if (getsockopt(fd, SOL_SOCKET, SO_DOMAIN, &dom, &len) == 0 &&
	dom == AF_UNIX) {
	use_cork_ = false;
	}
	}
	#endif
	}

	TlsSocket::~TlsSocket() {
	ignore_result(Close());
	}

	Status TlsSocket::Write(const uint8_t buf, int32_t amt, int32_t nwritten) {
	CHECK(ssl_);
	SCOPED_OPENSSL_NO_PENDING_ERRORS;

	*nwritten = 0;
	if (PREDICT_FALSE(amt == 0)) {
	// Writing an empty buffer is a no-op. This happens occasionally, eg in the
	// case where the response has an empty sidecar. We have to special case
	// it, because SSL_write can return '0' to indicate certain types of errors.
	return Status::OK();
	}

	errno = 0;
	int32_t bytes_written = SSL_write(ssl_.get(), buf, amt);
	int save_errno = errno;
	if (bytes_written <= 0) {
	auto error_code = SSL_get_error(ssl_.get(), bytes_written);
	if (error_code == SSL_ERROR_WANT_WRITE) {
	if (save_errno != 0) {
	return Status::NetworkError("SSL_write error",
	ErrnoToString(save_errno), save_errno);
	}
	// Socket not ready to write yet.
	return Status::OK();
	}
	return Status::NetworkError("failed to write to TLS socket",
	GetSSLErrorDescription(error_code));
	}
	*nwritten = bytes_written;
	return Status::OK();
	}

	Status TlsSocket::Writev(const struct ::iovec iov, int iov_len, int64_t nwritten) {
	SCOPED_OPENSSL_NO_PENDING_ERRORS;
	CHECK(ssl_);

	// Since OpenSSL doesn't support any kind of writev() call itself, this function
	// sets TCP_CORK and then calls Write() for each of the buffers in the iovec,
	// then unsets TCP_CORK. This causes the Linux kernel to buffer up the packets
	// while corked and then send a minimal number of packets upon uncorking, whereas
	// otherwise it would have sent at least packet per Write call. This is beneficial
	// since it avoids generating lots of small packets, each of which has overhead in
	// the network stack, etc.
	//
	// The downside, though, is that we need to make (iov_len + 2) system calls, each
	// of which has some significant overhead (even moreso after spectre/meltdown
	// mitigations were enabled). This can take significant CPU in the reactor thread,
	// especially when the underlying buffers are small.
	//
	// To mitigate this, we handle a common case where the iovec has a few buffers,
	// but the total iovec length is actually short. This is the case in many types
	// of RPC requests/responses. In this case, it's cheaper to copy all of the buffers
	// into a socket-local buffer 'buf_' and do a single Write call, vs doing the
	// emulated Writev approach described above.
	if (iov_len > 1) {
	size_t total_size = 0;
	for (int i = 0; i < iov_len; i++) {
	total_size += iov[i].iov_len;
	}
	// Assume we can copy about 8 bytes per cycle, and a syscall takes about 1300 cycles,
	// based on some quick benchmarking of 'setsockopt' on a GCP Sky Lake VM.
	//
	// cycles for memcpy and one write = total_size / 8 + syscall
	// cycles for cork, N writes, uncork = syscall * (2 + iov_len)
	//
	// Solve the inequality to find where memcpy is faster:
	// total_size / 8 + syscall < syscall * (2 + iov_len)
	// total_size / 8 < syscall * (1 + iov_len)
	// total_size < 8 * syscall * (1 + iov_len)
	size_t max_copy_size = 8 * 1300 * (1 + iov_len);
	if (total_size <= max_copy_size) {
	buf_.clear();
	buf_.reserve(total_size);
	for (int i = 0; i < iov_len; i++) {
	buf_.append(iov[i].iov_base, iov[i].iov_len);
	}
	// TODO(todd) Write()'s 'nwritten' parameter is int32_t* instead of int64_t*
	// so we need this temporary. We should change Write() to use size_t as well.
	int32_t n = 0;
	Status s = Write(buf_.data(), buf_.size(), &n);
	*nwritten = n;
	return s;
	}
	}

	*nwritten = 0;
	// Allows packets to be aggresively be accumulated before sending.
	bool do_cork = use_cork_ && iov_len > 1;
	if (do_cork) {
	RETURN_NOT_OK(SetTcpCork(1));
	}
	Status write_status = Status::OK();
	for (int i = 0; i < iov_len; ++i) {
	int32_t frame_size = iov[i].iov_len;
	int32_t bytes_written;
	// Don't return before unsetting TCP_CORK.
	write_status = Write(static_cast<uint8_t*>(iov[i].iov_base), frame_size, &bytes_written);
	if (!write_status.ok()) break;

	// nwritten should have the correct amount written.
	*nwritten += bytes_written;
	if (bytes_written < frame_size) break;
	}
	if (do_cork) {
	RETURN_NOT_OK(SetTcpCork(0));
	}
	// If we did manage to write something, but not everything, due to a temporary socket
	// error, then we should still return an OK status indicating a successful _partial_
	// write.
	if (*nwritten > 0 && Socket::IsTemporarySocketError(write_status.posix_code())) {
	return Status::OK();
	}
	return write_status;
	}

	Status TlsSocket::Recv(uint8_t buf, int32_t amt, int32_t nread) {
	SCOPED_OPENSSL_NO_PENDING_ERRORS;

	CHECK(ssl_);
	errno = 0;
	int32_t bytes_read = SSL_read(ssl_.get(), buf, amt);
	int save_errno = errno;
	if (bytes_read <= 0) {
	Sockaddr remote;
	Status s = GetPeerAddress(&remote);
	const string remote_str = s.ok() ? remote.ToString() : "unknown";
	string kErrString = Substitute("failed to read from TLS socket (remote: $0)",
	remote_str);

	if (bytes_read == 0 && SSL_get_shutdown(ssl_.get()) == SSL_RECEIVED_SHUTDOWN) {
	return Status::NetworkError(kErrString, ErrnoToString(ESHUTDOWN), ESHUTDOWN);
	}
	auto error_code = SSL_get_error(ssl_.get(), bytes_read);
	if (error_code == SSL_ERROR_WANT_READ) {
	if (save_errno != 0) {
	return Status::NetworkError("SSL_read error from " + remote_str,
	ErrnoToString(save_errno), save_errno);
	}
	// Nothing available to read yet.
	*nread = 0;
	return Status::OK();
	}
	if (error_code == SSL_ERROR_SYSCALL && ERR_peek_error() == 0) {
	// From the OpenSSL docs:
	// Some I/O error occurred. The OpenSSL error queue may contain more
	// information on the error. If the error queue is empty (i.e.
	// ERR_get_error() returns 0), ret can be used to find out more about
	// the error: If ret == 0, an EOF was observed that violates the pro-
	// tocol. If ret == -1, the underlying BIO reported an I/O error (for
	// socket I/O on Unix systems, consult errno for details).
	if (bytes_read == 0) {
	// "EOF was observed that violates the protocol" (eg the other end disconnected)
	return Status::NetworkError(kErrString, ErrnoToString(ECONNRESET), ECONNRESET);
	}
	if (bytes_read == -1 && save_errno != 0) {
	return Status::NetworkError(kErrString, ErrnoToString(save_errno), save_errno);
	}
	return Status::NetworkError(kErrString, "unknown ERROR_SYSCALL");
	}
	return Status::NetworkError(kErrString, GetSSLErrorDescription(error_code));
	}
	*nread = bytes_read;
	return Status::OK();
	}

	Status TlsSocket::Close() {
	SCOPED_OPENSSL_NO_PENDING_ERRORS;
	errno = 0;

	if (!ssl_) {
	// Socket is already closed.
	return Status::OK();
	}

	// Start the TLS shutdown processes. We don't care about waiting for the
	// response, since the underlying socket will not be reused.
	int32_t ret = SSL_shutdown(ssl_.get());
	Status ssl_shutdown;
	if (ret >= 0) {
	ssl_shutdown = Status::OK();
	} else {
	auto error_code = SSL_get_error(ssl_.get(), ret);
	ssl_shutdown = Status::NetworkError("TlsSocket::Close", GetSSLErrorDescription(error_code));
	}

	ssl_.reset();

	// Close the underlying socket.
	RETURN_NOT_OK(Socket::Close());
	return ssl_shutdown;
	}

	} // namespace security
	} // namespace kudu