// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <gmock/gmock.h>
#include <deque>
#include <iomanip>
#include <iostream>
#include <memory>
#include <string>
#include <thread>
#include <vector>
#include <process/collect.hpp>
#include <process/count_down_latch.hpp>
#include <process/future.hpp>
#include <process/gmock.hpp>
#include <process/gtest.hpp>
#include <process/owned.hpp>
#include <process/process.hpp>
#include <process/protobuf.hpp>
#include <process/metrics/counter.hpp>
#include <process/metrics/metrics.hpp>
#include <stout/duration.hpp>
#include <stout/gtest.hpp>
#include <stout/hashmap.hpp>
#include <stout/hashset.hpp>
#include <stout/stopwatch.hpp>
#include "benchmarks.pb.h"
#include "mpsc_linked_queue.hpp"
namespace http = process::http;
namespace metrics = process::metrics;
using process::CountDownLatch;
using process::Future;
using process::MessageEvent;
using process::Owned;
using process::Process;
using process::ProcessBase;
using process::Promise;
using process::UPID;
using std::cout;
using std::endl;
using std::ostringstream;
using std::string;
using std::vector;
using testing::WithParamInterface;
int main(int argc, char** argv)
{
// Initialize Google Mock/Test.
testing::InitGoogleMock(&argc, argv);
// NOTE: Windows does not support the signal semantics required for
// these handlers to be useful.
#ifndef __WINDOWS__
// Install GLOG's signal handler.
google::InstallFailureSignalHandler();
// We reset GLOG's signal handler for SIGTERM because
// 'SubprocessTest.Status' sends SIGTERM to a subprocess, which would
// otherwise result in a stack trace.
os::signals::reset(SIGTERM);
#endif // __WINDOWS__
// Add the libprocess test event listeners.
::testing::TestEventListeners& listeners =
::testing::UnitTest::GetInstance()->listeners();
listeners.Append(process::ClockTestEventListener::instance());
listeners.Append(process::FilterTestEventListener::instance());
int result = RUN_ALL_TESTS();
process::finalize(true);
return result;
}
// TODO(jmlvanre): Factor out the client / server behavior so that we
// can make separate binaries for the client and server. This is
// useful for attaching performance tools to them separately.
// A process that emulates the 'client' side of a ping pong game.
// An HTTP '/run' request performs a run and returns the time elapsed.
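// For example (with hypothetical PIDs and port; the actual values
// depend on the spawned processes):
//
//   GET /(1)/run?server=(2)@127.0.0.1:5050
//       &messageSize=3B&requests=10000&concurrency=250
//
// On success, the response body contains the stringified duration of
// the run.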
class ClientProcess : public Process<ClientProcess>
{
public:
ClientProcess()
: requests(0),
responses(0),
totalRequests(0),
concurrency(0) {}
~ClientProcess() override {}
protected:
void initialize() override
{
install("pong", &ClientProcess::pong);
route("/run", None(), &ClientProcess::run);
}
private:
Future<http::Response> run(const http::Request& request)
{
if (duration.get() != nullptr) {
return http::BadRequest("A run is already in progress");
}
hashmap<string, Option<string>> parameters {
{"server", request.url.query.get("server")},
{"messageSize", request.url.query.get("messageSize")},
{"requests", request.url.query.get("requests")},
{"concurrency", request.url.query.get("concurrency")},
};
// Ensure all parameters were provided.
foreachpair (const string& parameter,
const Option<string>& value,
parameters) {
if (value.isNone()) {
return http::BadRequest("Missing '" + parameter + "' parameter");
}
}
server = UPID(parameters["server"].get());
link(server);
Try<Bytes> messageSize = Bytes::parse(parameters["messageSize"].get());
if (messageSize.isError()) {
return http::BadRequest("Invalid 'messageSize': " + messageSize.error());
}
message = string(messageSize->bytes(), '1');
Try<size_t> numify_ = numify<size_t>(parameters["requests"].get());
if (numify_.isError()) {
return http::BadRequest("Invalid 'requests': " + numify_.error());
}
totalRequests = numify_.get();
numify_ = numify<size_t>(parameters["concurrency"].get());
if (numify_.isError()) {
return http::BadRequest("Invalid 'concurrency': " + numify_.error());
}
concurrency = numify_.get();
if (concurrency > totalRequests) {
concurrency = totalRequests;
}
return _run()
.then([](const Duration& duration) -> Future<http::Response> {
return http::OK(stringify(duration));
});
}
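// Starts the stopwatch and primes the pipeline with the initial
// batch of `concurrency` pings; the returned future is satisfied
// once all responses have been received.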
Future<Duration> _run()
{
duration = Owned<Promise<Duration>>(new Promise<Duration>());
watch.start();
while (requests < concurrency) {
send(server, "ping", message.c_str(), message.size());
++requests;
}
return duration->future();
}
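// Handles a "pong" from the server by either completing the run once
// all responses have arrived, or keeping the pipeline full by
// sending another ping.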
void pong(const UPID& from, const string& body)
{
++responses;
if (responses == totalRequests) {
duration->set(watch.elapsed());
duration.reset();
} else if (requests < totalRequests) {
send(server, "ping", message.c_str(), message.size());
++requests;
}
}
// The address of the ponger (server).
UPID server;
Stopwatch watch;
Owned<Promise<Duration>> duration;
string message;
size_t requests;
size_t responses;
size_t totalRequests;
size_t concurrency;
};
// A process that emulates the 'server' side of a ping pong game.
// Note that the server links to any client communicating with it.
class ServerProcess : public Process<ServerProcess>
{
public:
~ServerProcess() override {}
protected:
void initialize() override
{
// TODO(bmahler): Move in the message when move support is added.
install("ping", &ServerProcess::ping);
}
private:
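// Handles a "ping" by linking to the sender (on first contact) and
// echoing the body back as a "pong".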
void ping(const UPID& from, const string& body)
{
if (!links.contains(from)) {
link(from);
links.insert(from);
}
send(from, "pong", body.c_str(), body.size());
}
hashset<UPID> links;
};
// TODO(bmahler): Since there is no forking here, libprocess
// avoids going through sockets for local messages. Either fork
// or have the ability to disable local messages in libprocess.
// Launches many clients against a central server and measures
// client throughput.
TEST(ProcessTest, Process_BENCHMARK_ClientServer)
{
const size_t numRequests = 10000;
const size_t concurrency = 250;
const size_t numClients = 8;
const Bytes messageSize = Bytes(3);
ServerProcess server;
const UPID serverPid = spawn(&server);
// Launch the clients.
vector<Owned<ClientProcess>> clients;
for (size_t i = 0; i < numClients; i++) {
clients.push_back(Owned<ClientProcess>(new ClientProcess()));
spawn(clients.back().get());
}
// Start the ping / pongs!
const string query = strings::join(
"&",
"server=" + stringify(serverPid),
"requests=" + stringify(numRequests),
"concurrency=" + stringify(concurrency),
"messageSize=" + stringify(messageSize));
Stopwatch watch;
watch.start();
vector<Future<http::Response>> futures;
foreach (const Owned<ClientProcess>& client, clients) {
futures.push_back(http::get(client->self(), "run", query));
}
Future<vector<http::Response>> responses = collect(futures);
AWAIT_READY(responses);
Duration elapsed = watch.elapsed();
// Print the throughput of each client.
size_t i = 0;
foreach (const http::Response& response, responses.get()) {
ASSERT_EQ(http::Status::OK, response.code);
ASSERT_EQ(http::Status::string(http::Status::OK), response.status);
Try<Duration> elapsed = Duration::parse(response.body);
ASSERT_SOME(elapsed);
double throughput = numRequests / elapsed->secs();
cout << "Client " << i << ": " << throughput << " rpcs / sec" << endl;
i++;
}
double throughput = (numRequests * numClients) / elapsed.secs();
cout << "Estimated Total: " << throughput << " rpcs / sec" << endl;
foreach (const Owned<ClientProcess>& client, clients) {
terminate(*client);
wait(*client);
}
terminate(server);
wait(server);
}
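// A process that links to the given PID upon initialization.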
class LinkerProcess : public Process<LinkerProcess>
{
public:
LinkerProcess(const UPID& _to) : to(_to) {}
void initialize() override
{
link(to);
}
private:
UPID to;
};
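// A process that terminates itself upon a dispatched request.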
class EphemeralProcess : public Process<EphemeralProcess>
{
public:
void terminate()
{
process::terminate(self());
}
};
// Simulate the scenario discussed in MESOS-2182. We first establish a
// large number of links by creating many linker-linkee pairs, and then
// introduce a large number of ephemeral process exits as well as event
// dispatches.
TEST(ProcessTest, Process_BENCHMARK_LargeNumberOfLinks)
{
int links = 5000;
int iterations = 10000;
// Keep track of all the linked processes we created.
vector<ProcessBase*> processes;
// Establish a large number of links.
for (int i = 0; i < links; i++) {
ProcessBase* linkee = new ProcessBase();
LinkerProcess* linker = new LinkerProcess(linkee->self());
processes.push_back(linkee);
processes.push_back(linker);
spawn(linkee);
spawn(linker);
}
// Generate large number of dispatches and process exits by spawning
// and then terminating EphemeralProcesses.
vector<ProcessBase*> ephemeralProcesses;
Stopwatch watch;
watch.start();
for (int i = 0; i < iterations; i++) {
EphemeralProcess* process = new EphemeralProcess();
ephemeralProcesses.push_back(process);
spawn(process);
// NOTE: We let the EphemeralProcess terminate itself to make sure
// all dispatches are actually executed (otherwise, the 'wait' below
// would block).
dispatch(process->self(), &EphemeralProcess::terminate);
}
foreach (ProcessBase* process, ephemeralProcesses) {
wait(process);
delete process;
}
cout << "Elapsed: " << watch.elapsed() << endl;
foreach (ProcessBase* process, processes) {
terminate(process);
wait(process);
delete process;
}
}
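// A process that replies with a "pong" to every "ping" it receives.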
class Destination : public Process<Destination>
{
protected:
void consume(MessageEvent&& event) override
{
if (event.message.name == "ping") {
send(event.message.from, "pong");
}
}
};
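// A process that, upon receiving a "run" message, sends an initial
// batch of up to 1000 pings to the destination, sends one more ping
// per pong received until `repeat` pings have been sent, and counts
// down the latch once all pongs have arrived.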
class Client : public Process<Client>
{
public:
Client(const UPID& destination, CountDownLatch* latch, long repeat)
: destination(destination), latch(latch), repeat(repeat) {}
protected:
void consume(MessageEvent&& event) override
{
if (event.message.name == "pong") {
received += 1;
if (sent < repeat) {
send(destination, "ping");
sent += 1;
} else if (received >= repeat) {
latch->decrement();
}
} else if (event.message.name == "run") {
for (long l = 0; l < std::min(1000L, repeat); l++) {
send(destination, "ping");
sent += 1;
}
}
}
private:
UPID destination;
CountDownLatch* latch;
long repeat;
long sent = 0L;
long received = 0L;
};
// See
// https://github.com/akka/akka/blob/7ac37e7536547c57ab639ed8746c7b4e5ff2f69b/akka-actor-tests/src/test/scala/akka/performance/microbench/TellThroughputPerformanceSpec.scala
// for the inspiration for this benchmark (this file was deleted in
// this commit:
// https://github.com/akka/akka/commit/a02e138f3bc7c21c2b2511ea19203a52d74584d5).
//
// This benchmark was discussed here:
// http://letitcrash.com/post/17607272336/scalability-of-fork-join-pool
TEST(ProcessTest, Process_BENCHMARK_ThroughputPerformance)
{
long repeatFactor = 500L;
long defaultRepeat = 30000L * repeatFactor;
const long numberOfClients = process::workers();
CountDownLatch latch(numberOfClients - 1);
long repeat = defaultRepeat;
auto repeatsPerClient = repeat / numberOfClients;
vector<Owned<Destination>> destinations;
vector<Owned<Client>> clients;
for (long _ = 0; _ < numberOfClients; _++) {
Owned<Destination> destination(new Destination());
spawn(*destination);
Owned<Client> client(new Client(
destination->self(),
&latch,
repeatsPerClient));
spawn(*client);
destinations.push_back(destination);
clients.push_back(client);
}
Stopwatch watch;
watch.start();
foreach (const Owned<Client>& client, clients) {
post(client->self(), "run");
}
AWAIT_READY(latch.triggered());
Duration elapsed = watch.elapsed();
double throughput = (double) repeat / elapsed.secs();
cout << "Estimated Total: " << std::fixed << throughput << endl;
foreach (const Owned<Client>& client, clients) {
terminate(client->self());
wait(client->self());
}
foreach (const Owned<Destination>& destination, destinations) {
terminate(destination->self());
wait(destination->self());
}
}
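// A process that measures the overhead of dispatch/defer round trips:
// each handler invocation dispatches a no-op to itself and defers the
// next invocation onto its result, until `repeat` invocations have
// completed.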
class DispatchProcess : public Process<DispatchProcess>
{
public:
struct Movable
{
std::vector<int> data;
};
// This simulates protobuf objects, which do not support moves.
struct Copyable
{
std::vector<int> data;
Copyable(std::vector<int>&& data) : data(std::move(data)) {}
Copyable(const Copyable& that) = default;
Copyable& operator=(const Copyable&) = default;
};
DispatchProcess(Promise<Nothing> *promise, long repeat)
: promise(promise), repeat(repeat) {}
template <typename T>
Future<Nothing> handler(const T& data)
{
count++;
if (count >= repeat) {
promise->set(Nothing());
return Nothing();
}
// NOTE: The prefix `this->` is required here, otherwise it will
// not compile when permissiveness is disabled (e.g. with MSVC on
// Windows).
dispatch(this->self(), &Self::_handler).then(
defer(this->self(), &Self::handler<T>, data));
return Nothing();
}
template <typename T>
static void run(const string& name, long repeats)
{
Promise<Nothing> promise;
Owned<DispatchProcess> process(new DispatchProcess(&promise, repeats));
spawn(*process);
T data{std::vector<int>(10240, 42)};
Stopwatch watch;
watch.start();
dispatch(process.get(), &DispatchProcess::handler<T>, data);
AWAIT_READY(promise.future());
cout << name << " elapsed: " << watch.elapsed() << endl;
terminate(process.get());
wait(process.get());
}
private:
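// A no-op dispatch target, used only to force a round trip through
// the process's event queue.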
Future<Nothing> _handler()
{
return Nothing();
}
Promise<Nothing> *promise;
long repeat;
long count = 0;
};
TEST(ProcessTest, Process_BENCHMARK_DispatchDefer)
{
constexpr long repeats = 100000;
// Test performance separately for objects which support std::move
// and those which don't (e.g., protobufs).
// NOTE: The DispatchProcess::handler code is not fully optimized to
// take advantage of std::move support (e.g., the parameter is passed
// by const reference, so some copying is unavoidable); however, this
// resembles how most handlers are currently implemented.
DispatchProcess::run<DispatchProcess::Movable>("Movable", repeats);
DispatchProcess::run<DispatchProcess::Copyable>("Copyable", repeats);
}
class ProtobufInstallHandlerBenchmarkProcess
: public ProtobufProcess<ProtobufInstallHandlerBenchmarkProcess>
{
public:
ProtobufInstallHandlerBenchmarkProcess()
{
install<tests::Message>(&Self::handle);
}
// TODO(dzhuk): Add benchmark for handlers taking individual
// message fields as parameters.
void handle(const tests::Message& message)
{
// Intentionally a no-op, as ProtobufProcess performance is measured
// from receipt of the MessageEvent until the handler is called.
}
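// Serializes a message tree with the given number of sub-messages,
// then measures how many message events can be consumed per second
// over a one-second window.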
void run(int submessages)
{
tests::Message message = createMessage(submessages);
std::string data;
bool success = message.SerializeToString(&data);
CHECK(success);
Stopwatch watch;
watch.start();
size_t count;
for (count = 0; watch.elapsed() < Seconds(1); count++) {
MessageEvent event(self(), self(), message.GetTypeName(),
data.c_str(), data.length());
consume(std::move(event));
}
watch.stop();
double messagesPerSecond = count / watch.elapsed().secs();
cout << "Size: " << std::setw(5) << message.ByteSizeLong() << " bytes,"
<< " throughput: " << std::setw(9) << std::setprecision(0)
<< std::fixed << messagesPerSecond << " messages/s" << endl;
}
private:
// Returns a message tree containing `submessages` sub-messages; the
// branching factor is 4 and each message carries a payload of two
// integers, e.g.:
//
//                       m
//              /   /    |    \    \
//          [1,1]  m     m     m    m
//               //|\\ //|\\ //|\\ //|\\
//          [1,1]...  [1,1]...  [1,1]...  [1,1]...
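//
// For example, `createMessage(5)` yields a root with four children
// and a single grandchild under the first child, i.e., 5 sub-messages
// in total.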
tests::Message createMessage(size_t submessages)
{
tests::Message root;
// Construct the message tree level by level, similar to a
// breadth-first traversal, where `submessages` defines the total
// number of sub-messages added to the tree. Messages in the queue
// still need a payload and children to be added.
std::deque<tests::Message*> nodes;
nodes.push_back(&root);
while (!nodes.empty()) {
tests::Message* message = nodes.front();
nodes.pop_front();
message->mutable_payload()->Resize(2, 1);
for (size_t i = 0; i < 4; i++) {
if (submessages == 0) {
// No more nodes need to be added, but keep processing the
// queue to add the payloads.
break;
}
tests::Message* child = message->add_submessages();
nodes.push_back(child);
submessages--;
}
}
return root;
}
};
// Measures performance of message passing in ProtobufProcess.
TEST(ProcessTest, Process_BENCHMARK_ProtobufInstallHandler)
{
const int submessages[] = {0, 1, 5, 10, 50, 100, 500, 1000, 5000, 10000};
ProtobufInstallHandlerBenchmarkProcess process;
foreach (int num_submessages, submessages) {
process.run(num_submessages);
}
}
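// Measures the throughput of MpscLinkedQueue with multiple producers
// and the consumer all running concurrently.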
TEST(ProcessTest, Process_BENCHMARK_MpscLinkedQueue)
{
// NOTE: We set the number of producers to be one less than the
// hardware concurrency so that the consumer doesn't have to compete
// with the producers for processing time.
const unsigned int producerCount = std::thread::hardware_concurrency() - 1;
const int messageCount = 10000000;
const int totalCount = messageCount * producerCount;
std::string* s = new std::string("");
process::MpscLinkedQueue<std::string> q;
Stopwatch consumerWatch;
auto consumer = std::thread([totalCount, &q, &consumerWatch]() {
consumerWatch.start();
for (int i = totalCount; i > 0;) {
if (q.dequeue() != nullptr) {
i--;
}
}
consumerWatch.stop();
});
std::vector<std::thread> producers;
Stopwatch producerWatch;
producerWatch.start();
for (unsigned int t = 0; t < producerCount; t++) {
// We want to capture `messageCount`, `s`, and `&q` here. Since
// `messageCount` is a constant integer variable initialized with a
// compile-time expression in a "reaching scope", it can be captured
// without being mentioned in the capture list; see, e.g.,
// https://stackoverflow.com/a/43468519/176922.
//
// We capture implicitly instead of explicitly since this part of the
// standard is not supported by MSVC, while Clang supports it and
// emits a warning for unneeded captures.
producers.push_back(std::thread([&]() {
for (int i = 0; i < messageCount; i++) {
q.enqueue(s);
}
}));
}
for (std::thread& producer : producers) {
producer.join();
}
producerWatch.stop();
consumer.join();
Duration producerElapsed = producerWatch.elapsed();
Duration consumerElapsed = consumerWatch.elapsed();
double consumerThroughput = (double) totalCount / consumerElapsed.secs();
double producerThroughput = (double) totalCount / producerElapsed.secs();
double throughput = consumerThroughput + producerThroughput;
cout << "Estimated producer throughput (" << producerCount << " threads): "
<< std::fixed << producerThroughput << " op/s" << endl;
cout << "Estimated consumer throughput: "
<< std::fixed << consumerThroughput << " op/s" << endl;
cout << "Estimated total throughput: "
<< std::fixed << throughput << " op/s" << endl;
}
class Metrics_BENCHMARK_Test : public ::testing::Test,
public WithParamInterface<size_t> {};
// Parameterized by the number of metrics.
INSTANTIATE_TEST_CASE_P(
MetricsCount,
Metrics_BENCHMARK_Test,
::testing::Values(1u, 100u, 1000u, 10000u, 100000u));
// Tests the performance of metrics fetching when there
// are a large number of metrics.
TEST_P(Metrics_BENCHMARK_Test, Scalability)
{
size_t metrics_count = GetParam();
vector<metrics::Counter> counters;
counters.reserve(metrics_count);
for (size_t i = 0; i < metrics_count; ++i) {
counters.push_back(
metrics::Counter("metrics/keys/can/be/somewhat/long/"
"so/we/use/a/fairly/long/key/here/"
"to/test/a/more/pathological/case/" +
stringify(i)));
}
Stopwatch watch;
watch.start();
for (size_t i = 0; i < metrics_count; ++i) {
metrics::add(counters[i]).get();
}
watch.stop();
std::cout << "Added " << metrics_count << " counters in "
<< watch.elapsed() << std::endl;
watch.start();
metrics::snapshot(None()).get();
watch.stop();
std::cout << "Snapshot of " << metrics_count << " counters in "
<< watch.elapsed() << std::endl;
UPID upid("metrics", process::address());
watch.start();
http::get(upid, "snapshot").get();
watch.stop();
std::cout << "HTTP /snapshot of " << metrics_count << " counters in "
<< watch.elapsed() << std::endl;
watch.start();
for (size_t i = 0; i < metrics_count; ++i) {
metrics::remove(counters[i]).get();
}
watch.stop();
std::cout << "Removed " << metrics_count << " counters in "
<< watch.elapsed() << std::endl;
}
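// Measures dequeue throughput on an empty queue, i.e., the cost of
// polling when there is nothing to consume.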
TEST(ProcessTest, Process_BENCHMARK_MpscLinkedQueueEmpty)
{
const int messageCount = 1000000000;
process::MpscLinkedQueue<std::string> q;
Stopwatch consumerWatch;
consumerWatch.start();
for (int i = messageCount; i > 0; i--) {
q.dequeue();
}
consumerWatch.stop();
Duration consumerElapsed = consumerWatch.elapsed();
double consumerThroughput = messageCount / consumerElapsed.secs();
cout << "Estimated consumer throughput: "
<< std::fixed << consumerThroughput << " op/s" << endl;
}
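// Measures queue throughput when the consumer starts dequeuing only
// after all producers have finished, so reads are never contended.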
TEST(ProcessTest, Process_BENCHMARK_MpscLinkedQueueNonContendedRead)
{
// NOTE: We set the number of producers to be one less than the
// hardware concurrency so that the consumer doesn't have to compete
// with the producers for processing time.
const unsigned int producerCount = std::thread::hardware_concurrency() - 1;
const int messageCount = 10000000;
const int totalCount = messageCount * producerCount;
std::string* s = new std::string("");
process::MpscLinkedQueue<std::string> q;
std::vector<std::thread> producers;
for (unsigned int t = 0; t < producerCount; t++) {
producers.push_back(std::thread([&]() {
for (int i = 0; i < messageCount; i++) {
q.enqueue(s);
}
}));
}
Stopwatch producerWatch;
producerWatch.start();
for (std::thread& producer : producers) {
producer.join();
}
producerWatch.stop();
Stopwatch consumerWatch;
consumerWatch.start();
for (int i = totalCount; i > 0;) {
if (q.dequeue() != nullptr) {
i--;
}
}
consumerWatch.stop();
Duration producerElapsed = producerWatch.elapsed();
Duration consumerElapsed = consumerWatch.elapsed();
double consumerThroughput = totalCount / consumerElapsed.secs();
double producerThroughput = totalCount / producerElapsed.secs();
double throughput = consumerThroughput + producerThroughput;
cout << "Estimated producer throughput (" << producerCount << " threads): "
<< std::fixed << producerThroughput << " op/s" << endl;
cout << "Estimated consumer throughput: "
<< std::fixed << consumerThroughput << " op/s" << endl;
cout << "Estimated total throughput: "
<< std::fixed << throughput << " op/s" << endl;
}