// src/kudu/util/process_memory.cc - kudu - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.

 #include <cstddef>
 #include <cstdint>
 #include <limits>
 #include <memory>
 #include <ostream>

 #include <gflags/gflags.h>
 #include <glog/logging.h>
 #ifdef TCMALLOC_ENABLED
 #include <gperftools/malloc_extension.h>  // IWYU pragma: keep
 #endif

 #include "kudu/util/process_memory.h"

 #include "kudu/gutil/atomicops.h"
 #include "kudu/gutil/macros.h"
 #include "kudu/gutil/once.h"
 #include "kudu/gutil/port.h"
 #include "kudu/gutil/strings/substitute.h"
 #include "kudu/gutil/walltime.h"          // IWYU pragma: keep
 #include "kudu/util/debug/trace_event.h"  // IWYU pragma: keep
 #include "kudu/util/env.h"
 #include "kudu/util/flag_tags.h"
 #include "kudu/util/locks.h"
 #include "kudu/util/mem_tracker.h"        // IWYU pragma: keep
 #include "kudu/util/random.h"
 #include "kudu/util/status.h"

 // Command-line flags controlling the process-wide memory limits.
 // ------------------------------------------------------------

 // Hard limit, in bytes. DoInitLimits() replaces a value of 0 with 80% of the
 // total system RAM.
 DEFINE_int64(memory_limit_hard_bytes, 0,
              "Maximum amount of memory this daemon should use, in bytes. "
              "A value of 0 autosizes based on the total system memory. "
              "A value of -1 disables all memory limiting.");
 TAG_FLAG(memory_limit_hard_bytes, stable);

 // Percentage of the hard limit used by DoInitLimits() to derive the
 // memory-pressure threshold.
 DEFINE_int32(memory_pressure_percentage, 60,
              "Percentage of the hard memory limit that this daemon may "
              "consume before flushing of in-memory data becomes prioritized.");
 TAG_FLAG(memory_pressure_percentage, advanced);

 // Percentage of the hard limit used by DoInitLimits() to derive the soft limit.
 DEFINE_int32(memory_limit_soft_percentage, 80,
              "Percentage of the hard memory limit that this daemon may "
              "consume before memory throttling of writes begins. The greater "
              "the excess, the higher the chance of throttling. In general, a "
              "lower soft limit leads to smoother write latencies but "
              "decreased throughput, and vice versa for a higher soft limit.");
 TAG_FLAG(memory_limit_soft_percentage, advanced);

 // Defined and validated here; not otherwise read in the visible code.
 DEFINE_int32(memory_limit_warn_threshold_percentage, 98,
              "Percentage of the hard memory limit that this daemon may "
              "consume before WARNING level messages are periodically logged.");
 TAG_FLAG(memory_limit_warn_threshold_percentage, advanced);

 #ifdef TCMALLOC_ENABLED
 // When true, MaybeGCAfterRelease() never calls GcTcmalloc().
 DEFINE_bool(disable_tcmalloc_gc_by_memory_tracker_for_testing, false,
             "For testing only! Whether to disable tcmalloc GC by memory tracker.");
 TAG_FLAG(disable_tcmalloc_gc_by_memory_tracker_for_testing, hidden);

 // GcTcmalloc() applies this percentage to the currently allocated bytes to
 // compute how many free page-heap bytes tcmalloc may retain.
 DEFINE_int32(tcmalloc_max_free_bytes_percentage, 10,
              "Maximum percentage of the RSS that tcmalloc is allowed to use for "
              "reserved but unallocated memory.");
 TAG_FLAG(tcmalloc_max_free_bytes_percentage, advanced);
 #endif

 using strings::Substitute;

 namespace kudu {
 namespace process_memory {

 namespace {
 // Process-wide thresholds, in bytes. They are written by DoInitLimits() and
 // read by the accessors and predicates below after those call InitLimits().
 int64_t g_hard_limit;
 int64_t g_soft_limit;
 int64_t g_pressure_threshold;

 // Random source consulted by SoftLimitExceeded() when consumption lies between
 // the soft and hard limits. Allocated by DoInitLimits().
 ThreadSafeRandom* g_rand = nullptr;

 #ifdef TCMALLOC_ENABLED
 // Running total of bytes reported to MaybeGCAfterRelease() since the counter
 // was last reset. Once it exceeds kGcReleaseSize, MaybeGCAfterRelease() resets
 // it to 0 and runs GcTcmalloc().
 Atomic64 g_released_memory_since_gc;

 // Size, in bytes, that is considered a large value for Release() (or Consume() with
 // a negative value). If tcmalloc is used, this can trigger it to GC.
 // A higher value will make us call into tcmalloc less often (and therefore be more
 // efficient). A lower value will mean our memory overhead is lower.
 // TODO(todd): this is a stopgap.
 const int64_t kGcReleaseSize = 128 * 1024L * 1024L;

 #endif // TCMALLOC_ENABLED

 } // anonymous namespace


 // Flag validation
 // ------------------------------------------------------------
 // Flag validator for percentage-valued flags.
 //
 // Accepts 'value' only when it lies in the closed range [0, 100]. A rejected
 // value is reported at ERROR level together with the name of the flag.
 //
 // Returns true when 'value' is acceptable, false otherwise.
 static bool ValidatePercentage(const char* flagname, int value) {
   const bool out_of_range = value < 0 || value > 100;
   if (out_of_range) {
     LOG(ERROR) << Substitute("$0 must be a percentage, value $1 is invalid",
                              flagname, value);
   }
   return !out_of_range;
 }

 // Register ValidatePercentage() as the validator of the percentage flags.
 // NOTE(review): --memory_pressure_percentage is also described as a percentage
 // but has no validator registered here; confirm whether that is intentional.
 DEFINE_validator(memory_limit_soft_percentage, &ValidatePercentage);
 DEFINE_validator(memory_limit_warn_threshold_percentage, &ValidatePercentage);
 #ifdef TCMALLOC_ENABLED
 DEFINE_validator(tcmalloc_max_free_bytes_percentage, &ValidatePercentage);
 #endif

 // Wrappers around tcmalloc functionality
 // ------------------------------------------------------------
 #ifdef TCMALLOC_ENABLED
 // Reads the numeric tcmalloc property named 'prop' through MallocExtension.
 //
 // A failed lookup is reported with LOG(DFATAL). 'value' starts at 0 so that,
 // in builds where DFATAL does not terminate the process, a failed lookup does
 // not return an indeterminate value.
 //
 // Returns the property value converted to int64_t.
 static int64_t GetTCMallocProperty(const char* prop) {
   size_t value = 0;
   if (!MallocExtension::instance()->GetNumericProperty(prop, &value)) {
     LOG(DFATAL) << "Failed to get tcmalloc property " << prop;
   }
   return value;
 }

 // Returns the value of tcmalloc's "generic.current_allocated_bytes" property.
 int64_t GetTCMallocCurrentAllocatedBytes() {
   const char* const kAllocatedBytesProperty = "generic.current_allocated_bytes";
   return GetTCMallocProperty(kAllocatedBytesProperty);
 }

 // Asks tcmalloc to return free page-heap memory to the system when the free
 // bytes exceed --tcmalloc_max_free_bytes_percentage percent of the bytes
 // currently allocated by the application.
 void GcTcmalloc() {
   TRACE_EVENT0("process", "GcTcmalloc");

   // Bytes held in tcmalloc's page heap free list, i.e. reserved by tcmalloc but
   // not in use.
   const int64_t free_bytes = GetTCMallocProperty("tcmalloc.pageheap_free_bytes");
   // Bytes allocated by the application.
   const int64_t allocated_bytes = GetTCMallocCurrentAllocatedBytes();

   const int64_t allowed_free_bytes =
       allocated_bytes * FLAGS_tcmalloc_max_free_bytes_percentage / 100.0;
   if (free_bytes <= allowed_free_bytes) {
     return;
   }

   // Hand memory back in 1MB requests so that tcmalloc drops its page heap lock
   // between requests and other threads can make progress. The calling thread is
   // still held up, but that beats stalling every thread.
   for (int64_t remaining = free_bytes - allowed_free_bytes; remaining > 0;
        remaining -= 1024 * 1024) {
     MallocExtension::instance()->ReleaseToSystem(1024 * 1024);
   }
 }
 #endif // TCMALLOC_ENABLED


 // Consumption and soft memory limit behavior
 // ------------------------------------------------------------
 namespace {
 // Computes the process-wide memory thresholds from the memory flags and
 // allocates the random source used by SoftLimitExceeded(). Run through
 // GoogleOnceInit() by InitLimits().
 //
 // --memory_limit_hard_bytes is interpreted as follows:
 //   > 0 : the hard limit, in bytes.
 //   0   : autosize to 80% of the total RAM reported by Env.
 //   < 0 : memory limiting is disabled (the flag documents -1 for this).
 //
 // The soft limit and the memory-pressure threshold are percentages of the hard
 // limit, taken from --memory_limit_soft_percentage and
 // --memory_pressure_percentage.
 void DoInitLimits() {
   int64_t limit = FLAGS_memory_limit_hard_bytes;
   if (limit == 0) {
     // If no limit is provided, we'll use 80% of system RAM.
     int64_t total_ram;
     CHECK_OK(Env::Default()->GetTotalRAMBytes(&total_ram));
     limit = total_ram * 4;
     limit /= 5;
   }

   if (limit < 0) {
     // Limiting is disabled. Thresholds derived from a negative hard limit would
     // be <= 0, which every consumption value meets or exceeds, so the process
     // would look permanently over its limits. Pin all three thresholds at the
     // largest representable value instead; this also avoids overflowing the
     // percentage multiplications below.
     const int64_t kUnlimited = std::numeric_limits<int64_t>::max();
     g_hard_limit = kUnlimited;
     g_soft_limit = kUnlimited;
     g_pressure_threshold = kUnlimited;
   } else {
     g_hard_limit = limit;
     g_soft_limit = FLAGS_memory_limit_soft_percentage * g_hard_limit / 100;
     g_pressure_threshold = FLAGS_memory_pressure_percentage * g_hard_limit / 100;
   }

   // Not freed anywhere in this file.
   g_rand = new ThreadSafeRandom(1);
 }

 void InitLimits() {
   static GoogleOnceType once;
   GoogleOnceInit(&once, &DoInitLimits);
 }

 } // anonymous namespace

 // Returns the current memory consumption of the process, in bytes.
 //
 // With tcmalloc, this is a cached copy of tcmalloc's currently allocated bytes.
 // The cache is refreshed when more than kReadIntervalMicros have passed since
 // the last refresh and the calling thread obtains 'read_lock' via try_lock();
 // every other call returns the cached value.
 //
 // Without tcmalloc, this is the consumption of the root MemTracker.
 int64_t CurrentConsumption() {
 #ifdef TCMALLOC_ENABLED
   const int64_t kReadIntervalMicros = 50000;
   static Atomic64 last_read_time = 0;
   static simple_spinlock read_lock;
   static Atomic64 consumption = 0;
   uint64_t time = GetMonoTimeMicros();
   // 'last_read_time' is stored below with NoBarrier_Store, possibly by another
   // thread, so read it with the matching atomic load rather than a plain read.
   const Atomic64 last_read = base::subtle::NoBarrier_Load(&last_read_time);
   if (time > last_read + kReadIntervalMicros && read_lock.try_lock()) {
     base::subtle::NoBarrier_Store(&consumption, GetTCMallocCurrentAllocatedBytes());
     // Re-fetch the time after getting the consumption. This way, in case fetching
     // consumption is extremely slow for some reason (eg due to lots of contention
     // in tcmalloc) we at least ensure that we wait at least another full interval
     // before fetching the information again.
     time = GetMonoTimeMicros();
     base::subtle::NoBarrier_Store(&last_read_time, time);
     read_lock.unlock();
   }

   return base::subtle::NoBarrier_Load(&consumption);
 #else
   // Without tcmalloc, we have no reliable way of determining our own heap
   // size (e.g. mallinfo doesn't work in ASAN builds). So, we'll fall back
   // to just looking at the sum of our tracked memory.
   return MemTracker::GetRootTracker()->consumption();
 #endif
 }

 // Returns the hard memory limit, in bytes, after making sure the limits have
 // been initialized.
 int64_t HardLimit() {
   InitLimits();
   const int64_t hard_limit_bytes = g_hard_limit;
   return hard_limit_bytes;
 }

 // Returns the soft memory limit, in bytes, after making sure the limits have
 // been initialized.
 int64_t SoftLimit() {
   InitLimits();
   const int64_t soft_limit_bytes = g_soft_limit;
   return soft_limit_bytes;
 }

 int64_t MemoryPressureThreshold() {
   InitLimits();
   return g_pressure_threshold;
 }

 // Reports whether current consumption has reached the memory-pressure
 // threshold.
 //
 // 'current_capacity_pct' may be null. When it is non-null and the result is
 // true, it receives consumption as a percentage of the hard limit; it is left
 // untouched when the result is false.
 bool UnderMemoryPressure(double* current_capacity_pct) {
   InitLimits();
   const int64_t used_bytes = CurrentConsumption();
   const bool under_pressure = used_bytes >= g_pressure_threshold;
   if (under_pressure && current_capacity_pct != nullptr) {
     *current_capacity_pct = static_cast<double>(used_bytes) / g_hard_limit * 100;
   }
   return under_pressure;
 }

 // Decides whether the caller should treat the process as over its soft limit.
 //
 //  - Consumption above the hard limit: always true.
 //  - Soft limit equal to the hard limit, or consumption below the soft limit:
 //    always false.
 //  - Otherwise: a random amount drawn with g_rand->Uniform64(hard - soft) is
 //    added to consumption, and the result is true when the sum exceeds the
 //    hard limit.
 //
 // 'current_capacity_pct' may be null. When it is non-null and the result is
 // true, it receives consumption as a percentage of the hard limit; it is left
 // untouched when the result is false.
 bool SoftLimitExceeded(double* current_capacity_pct) {
   InitLimits();
   const int64_t used_bytes = CurrentConsumption();

   bool exceeded;
   if (used_bytes > g_hard_limit) {
     // Past the actual limit.
     exceeded = true;
   } else if (g_hard_limit == g_soft_limit || used_bytes < g_soft_limit) {
     // Either no soft limit is defined, or we are still below it.
     exceeded = false;
   } else {
     // Over the soft threshold: were we randomly chosen to be over the limit?
     exceeded =
         used_bytes + g_rand->Uniform64(g_hard_limit - g_soft_limit) > g_hard_limit;
   }

   if (exceeded && current_capacity_pct != nullptr) {
     *current_capacity_pct = static_cast<double>(used_bytes) / g_hard_limit * 100;
   }
   return exceeded;
 }

 // Records that 'released_bytes' were released and, with tcmalloc enabled, runs
 // GcTcmalloc() once the running total exceeds kGcReleaseSize (unless the
 // testing flag disables it). The running total is reset to 0 before the GC.
 // Without tcmalloc this function does nothing.
 void MaybeGCAfterRelease(int64_t released_bytes) {
 #ifdef TCMALLOC_ENABLED
   const int64_t released_total = base::subtle::NoBarrier_AtomicIncrement(
       &g_released_memory_since_gc, released_bytes);
   const bool should_gc = released_total > kGcReleaseSize
       && !FLAGS_disable_tcmalloc_gc_by_memory_tracker_for_testing;
   if (PREDICT_FALSE(should_gc)) {
     base::subtle::NoBarrier_Store(&g_released_memory_since_gc, 0);
     GcTcmalloc();
   }
 #endif
 }

 } // namespace process_memory
 } // namespace kudu
	// Licensed to the Apache Software Foundation (ASF) under one
	// or more contributor license agreements. See the NOTICE file
	// distributed with this work for additional information
	// regarding copyright ownership. The ASF licenses this file
	// to you under the Apache License, Version 2.0 (the
	// "License"); you may not use this file except in compliance
	// with the License. You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing,
	// software distributed under the License is distributed on an
	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	// KIND, either express or implied. See the License for the
	// specific language governing permissions and limitations
	// under the License.

	#include <cstddef>
	#include <memory>
	#include <ostream>

	#include <gflags/gflags.h>
	#include <glog/logging.h>
	#ifdef TCMALLOC_ENABLED
	#include <gperftools/malloc_extension.h> // IWYU pragma: keep
	#endif

	#include "kudu/util/process_memory.h"

	#include "kudu/gutil/atomicops.h"
	#include "kudu/gutil/macros.h"
	#include "kudu/gutil/once.h"
	#include "kudu/gutil/port.h"
	#include "kudu/gutil/strings/substitute.h"
	#include "kudu/gutil/walltime.h" // IWYU pragma: keep
	#include "kudu/util/debug/trace_event.h" // IWYU pragma: keep
	#include "kudu/util/env.h"
	#include "kudu/util/flag_tags.h"
	#include "kudu/util/locks.h"
	#include "kudu/util/mem_tracker.h" // IWYU pragma: keep
	#include "kudu/util/random.h"
	#include "kudu/util/status.h"

	// Command-line flags controlling the process-wide memory limits.
	// ------------------------------------------------------------

	// Hard limit, in bytes. DoInitLimits() replaces a value of 0 with 80% of the
	// total system RAM.
	DEFINE_int64(memory_limit_hard_bytes, 0,
	"Maximum amount of memory this daemon should use, in bytes. "
	"A value of 0 autosizes based on the total system memory. "
	"A value of -1 disables all memory limiting.");
	TAG_FLAG(memory_limit_hard_bytes, stable);

	// Percentage of the hard limit used by DoInitLimits() to derive the
	// memory-pressure threshold.
	DEFINE_int32(memory_pressure_percentage, 60,
	"Percentage of the hard memory limit that this daemon may "
	"consume before flushing of in-memory data becomes prioritized.");
	TAG_FLAG(memory_pressure_percentage, advanced);

	// Percentage of the hard limit used by DoInitLimits() to derive the soft limit.
	DEFINE_int32(memory_limit_soft_percentage, 80,
	"Percentage of the hard memory limit that this daemon may "
	"consume before memory throttling of writes begins. The greater "
	"the excess, the higher the chance of throttling. In general, a "
	"lower soft limit leads to smoother write latencies but "
	"decreased throughput, and vice versa for a higher soft limit.");
	TAG_FLAG(memory_limit_soft_percentage, advanced);

	// Defined and validated here; not otherwise read in the visible code.
	DEFINE_int32(memory_limit_warn_threshold_percentage, 98,
	"Percentage of the hard memory limit that this daemon may "
	"consume before WARNING level messages are periodically logged.");
	TAG_FLAG(memory_limit_warn_threshold_percentage, advanced);

	#ifdef TCMALLOC_ENABLED
	// When true, MaybeGCAfterRelease() never calls GcTcmalloc().
	DEFINE_bool(disable_tcmalloc_gc_by_memory_tracker_for_testing, false,
	"For testing only! Whether to disable tcmalloc GC by memory tracker.");
	TAG_FLAG(disable_tcmalloc_gc_by_memory_tracker_for_testing, hidden);

	// GcTcmalloc() applies this percentage to the currently allocated bytes to
	// compute how many free page-heap bytes tcmalloc may retain.
	DEFINE_int32(tcmalloc_max_free_bytes_percentage, 10,
	"Maximum percentage of the RSS that tcmalloc is allowed to use for "
	"reserved but unallocated memory.");
	TAG_FLAG(tcmalloc_max_free_bytes_percentage, advanced);
	#endif

	using strings::Substitute;

	namespace kudu {
	namespace process_memory {

	namespace {
	// Process-wide thresholds, in bytes. They are written by DoInitLimits() and
	// read by the accessors and predicates below after those call InitLimits().
	int64_t g_hard_limit;
	int64_t g_soft_limit;
	int64_t g_pressure_threshold;

	// Random source consulted by SoftLimitExceeded() when consumption lies between
	// the soft and hard limits. Allocated by DoInitLimits().
	ThreadSafeRandom* g_rand = nullptr;

	#ifdef TCMALLOC_ENABLED
	// Running total of bytes reported to MaybeGCAfterRelease() since the counter
	// was last reset. Once it exceeds kGcReleaseSize, MaybeGCAfterRelease() resets
	// it to 0 and runs GcTcmalloc().
	Atomic64 g_released_memory_since_gc;

	// Size, in bytes, that is considered a large value for Release() (or Consume() with
	// a negative value). If tcmalloc is used, this can trigger it to GC.
	// A higher value will make us call into tcmalloc less often (and therefore be more
	// efficient). A lower value will mean our memory overhead is lower.
	// TODO(todd): this is a stopgap.
	const int64_t kGcReleaseSize = 128 * 1024L * 1024L;

	#endif // TCMALLOC_ENABLED

	} // anonymous namespace


	// Flag validation
	// ------------------------------------------------------------
	// Flag validator for percentage-valued flags.
	//
	// Accepts 'value' only when it lies in the closed range [0, 100]. A rejected
	// value is reported at ERROR level together with the name of the flag.
	//
	// Returns true when 'value' is acceptable, false otherwise.
	static bool ValidatePercentage(const char* flagname, int value) {
	  const bool out_of_range = value < 0 || value > 100;
	  if (out_of_range) {
	    LOG(ERROR) << Substitute("$0 must be a percentage, value $1 is invalid",
	                             flagname, value);
	  }
	  return !out_of_range;
	}

	// Register ValidatePercentage() as the validator of the percentage flags.
	// NOTE(review): --memory_pressure_percentage is also described as a percentage
	// but has no validator registered here; confirm whether that is intentional.
	DEFINE_validator(memory_limit_soft_percentage, &ValidatePercentage);
	DEFINE_validator(memory_limit_warn_threshold_percentage, &ValidatePercentage);
	#ifdef TCMALLOC_ENABLED
	DEFINE_validator(tcmalloc_max_free_bytes_percentage, &ValidatePercentage);
	#endif

	// Wrappers around tcmalloc functionality
	// ------------------------------------------------------------
	#ifdef TCMALLOC_ENABLED
	// Reads the numeric tcmalloc property named 'prop' through MallocExtension.
	//
	// A failed lookup is reported with LOG(DFATAL). 'value' starts at 0 so that,
	// in builds where DFATAL does not terminate the process, a failed lookup does
	// not return an indeterminate value.
	//
	// Returns the property value converted to int64_t.
	static int64_t GetTCMallocProperty(const char* prop) {
	  size_t value = 0;
	  if (!MallocExtension::instance()->GetNumericProperty(prop, &value)) {
	    LOG(DFATAL) << "Failed to get tcmalloc property " << prop;
	  }
	  return value;
	}

	// Returns the value of tcmalloc's "generic.current_allocated_bytes" property.
	int64_t GetTCMallocCurrentAllocatedBytes() {
	  const char* const kAllocatedBytesProperty = "generic.current_allocated_bytes";
	  return GetTCMallocProperty(kAllocatedBytesProperty);
	}

	// Asks tcmalloc to return free page-heap memory to the system when the free
	// bytes exceed --tcmalloc_max_free_bytes_percentage percent of the bytes
	// currently allocated by the application.
	void GcTcmalloc() {
	  TRACE_EVENT0("process", "GcTcmalloc");

	  // Bytes held in tcmalloc's page heap free list, i.e. reserved by tcmalloc but
	  // not in use.
	  const int64_t free_bytes = GetTCMallocProperty("tcmalloc.pageheap_free_bytes");
	  // Bytes allocated by the application.
	  const int64_t allocated_bytes = GetTCMallocCurrentAllocatedBytes();

	  const int64_t allowed_free_bytes =
	      allocated_bytes * FLAGS_tcmalloc_max_free_bytes_percentage / 100.0;
	  if (free_bytes <= allowed_free_bytes) {
	    return;
	  }

	  // Hand memory back in 1MB requests so that tcmalloc drops its page heap lock
	  // between requests and other threads can make progress. The calling thread is
	  // still held up, but that beats stalling every thread.
	  for (int64_t remaining = free_bytes - allowed_free_bytes; remaining > 0;
	       remaining -= 1024 * 1024) {
	    MallocExtension::instance()->ReleaseToSystem(1024 * 1024);
	  }
	}
	#endif // TCMALLOC_ENABLED


	// Consumption and soft memory limit behavior
	// ------------------------------------------------------------
	namespace {
	// Computes the process-wide memory thresholds from the memory flags and
	// allocates the random source used by SoftLimitExceeded(). Run through
	// GoogleOnceInit() by InitLimits().
	//
	// --memory_limit_hard_bytes is interpreted as follows:
	//   > 0 : the hard limit, in bytes.
	//   0   : autosize to 80% of the total RAM reported by Env.
	//   < 0 : memory limiting is disabled (the flag documents -1 for this).
	//
	// The soft limit and the memory-pressure threshold are percentages of the hard
	// limit, taken from --memory_limit_soft_percentage and
	// --memory_pressure_percentage.
	void DoInitLimits() {
	  int64_t limit = FLAGS_memory_limit_hard_bytes;
	  if (limit == 0) {
	    // If no limit is provided, we'll use 80% of system RAM.
	    int64_t total_ram;
	    CHECK_OK(Env::Default()->GetTotalRAMBytes(&total_ram));
	    limit = total_ram * 4;
	    limit /= 5;
	  }

	  if (limit < 0) {
	    // Limiting is disabled. Thresholds derived from a negative hard limit would
	    // be <= 0, which every consumption value meets or exceeds, so the process
	    // would look permanently over its limits. Pin all three thresholds at the
	    // largest representable value instead; this also avoids overflowing the
	    // percentage multiplications below.
	    const int64_t kUnlimited = std::numeric_limits<int64_t>::max();
	    g_hard_limit = kUnlimited;
	    g_soft_limit = kUnlimited;
	    g_pressure_threshold = kUnlimited;
	  } else {
	    g_hard_limit = limit;
	    g_soft_limit = FLAGS_memory_limit_soft_percentage * g_hard_limit / 100;
	    g_pressure_threshold = FLAGS_memory_pressure_percentage * g_hard_limit / 100;
	  }

	  // Not freed anywhere in this file.
	  g_rand = new ThreadSafeRandom(1);
	}

	void InitLimits() {
	static GoogleOnceType once;
	GoogleOnceInit(&once, &DoInitLimits);
	}

	} // anonymous namespace

	// Returns the current memory consumption of the process, in bytes.
	//
	// With tcmalloc, this is a cached copy of tcmalloc's currently allocated bytes.
	// The cache is refreshed when more than kReadIntervalMicros have passed since
	// the last refresh and the calling thread obtains 'read_lock' via try_lock();
	// every other call returns the cached value.
	//
	// Without tcmalloc, this is the consumption of the root MemTracker.
	int64_t CurrentConsumption() {
	#ifdef TCMALLOC_ENABLED
	  const int64_t kReadIntervalMicros = 50000;
	  static Atomic64 last_read_time = 0;
	  static simple_spinlock read_lock;
	  static Atomic64 consumption = 0;
	  uint64_t time = GetMonoTimeMicros();
	  // 'last_read_time' is stored below with NoBarrier_Store, possibly by another
	  // thread, so read it with the matching atomic load rather than a plain read.
	  const Atomic64 last_read = base::subtle::NoBarrier_Load(&last_read_time);
	  if (time > last_read + kReadIntervalMicros && read_lock.try_lock()) {
	    base::subtle::NoBarrier_Store(&consumption, GetTCMallocCurrentAllocatedBytes());
	    // Re-fetch the time after getting the consumption. This way, in case fetching
	    // consumption is extremely slow for some reason (eg due to lots of contention
	    // in tcmalloc) we at least ensure that we wait at least another full interval
	    // before fetching the information again.
	    time = GetMonoTimeMicros();
	    base::subtle::NoBarrier_Store(&last_read_time, time);
	    read_lock.unlock();
	  }

	  return base::subtle::NoBarrier_Load(&consumption);
	#else
	  // Without tcmalloc, we have no reliable way of determining our own heap
	  // size (e.g. mallinfo doesn't work in ASAN builds). So, we'll fall back
	  // to just looking at the sum of our tracked memory.
	  return MemTracker::GetRootTracker()->consumption();
	#endif
	}

	// Returns the hard memory limit, in bytes, after making sure the limits have
	// been initialized.
	int64_t HardLimit() {
	  InitLimits();
	  const int64_t hard_limit_bytes = g_hard_limit;
	  return hard_limit_bytes;
	}

	// Returns the soft memory limit, in bytes, after making sure the limits have
	// been initialized.
	int64_t SoftLimit() {
	  InitLimits();
	  const int64_t soft_limit_bytes = g_soft_limit;
	  return soft_limit_bytes;
	}

	int64_t MemoryPressureThreshold() {
	InitLimits();
	return g_pressure_threshold;
	}

	// Reports whether current consumption has reached the memory-pressure
	// threshold.
	//
	// 'current_capacity_pct' may be null. When it is non-null and the result is
	// true, it receives consumption as a percentage of the hard limit; it is left
	// untouched when the result is false.
	bool UnderMemoryPressure(double* current_capacity_pct) {
	  InitLimits();
	  const int64_t used_bytes = CurrentConsumption();
	  const bool under_pressure = used_bytes >= g_pressure_threshold;
	  if (under_pressure && current_capacity_pct != nullptr) {
	    // Write through the out-parameter: consumption / hard limit, scaled to a
	    // percentage.
	    *current_capacity_pct = static_cast<double>(used_bytes) / g_hard_limit * 100;
	  }
	  return under_pressure;
	}

	// Decides whether the caller should treat the process as over its soft limit.
	//
	//  - Consumption above the hard limit: always true.
	//  - Soft limit equal to the hard limit, or consumption below the soft limit:
	//    always false.
	//  - Otherwise: a random amount drawn with g_rand->Uniform64(hard - soft) is
	//    added to consumption, and the result is true when the sum exceeds the
	//    hard limit.
	//
	// 'current_capacity_pct' may be null. When it is non-null and the result is
	// true, it receives consumption as a percentage of the hard limit; it is left
	// untouched when the result is false.
	bool SoftLimitExceeded(double* current_capacity_pct) {
	  InitLimits();
	  const int64_t used_bytes = CurrentConsumption();

	  bool exceeded;
	  if (used_bytes > g_hard_limit) {
	    // Past the actual limit.
	    exceeded = true;
	  } else if (g_hard_limit == g_soft_limit || used_bytes < g_soft_limit) {
	    // Either no soft limit is defined, or we are still below it.
	    exceeded = false;
	  } else {
	    // Over the soft threshold: were we randomly chosen to be over the limit?
	    exceeded =
	        used_bytes + g_rand->Uniform64(g_hard_limit - g_soft_limit) > g_hard_limit;
	  }

	  if (exceeded && current_capacity_pct != nullptr) {
	    // Write through the out-parameter: consumption / hard limit, scaled to a
	    // percentage.
	    *current_capacity_pct = static_cast<double>(used_bytes) / g_hard_limit * 100;
	  }
	  return exceeded;
	}

	// Records that 'released_bytes' were released and, with tcmalloc enabled, runs
	// GcTcmalloc() once the running total exceeds kGcReleaseSize (unless the
	// testing flag disables it). The running total is reset to 0 before the GC.
	// Without tcmalloc this function does nothing.
	void MaybeGCAfterRelease(int64_t released_bytes) {
	#ifdef TCMALLOC_ENABLED
	  const int64_t released_total = base::subtle::NoBarrier_AtomicIncrement(
	      &g_released_memory_since_gc, released_bytes);
	  const bool should_gc = released_total > kGcReleaseSize
	      && !FLAGS_disable_tcmalloc_gc_by_memory_tracker_for_testing;
	  if (PREDICT_FALSE(should_gc)) {
	    base::subtle::NoBarrier_Store(&g_released_memory_since_gc, 0);
	    GcTcmalloc();
	  }
	#endif
	}

	} // namespace process_memory
	} // namespace kudu