// geode-core/src/main/java/org/apache/geode/SystemFailure.java - geode - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
  * agreements. See the NOTICE file distributed with this work for additional information regarding
  * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance with the License. You may obtain a
  * copy of the License at
  *
  * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software distributed under the License
  * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
  * or implied. See the License for the specific language governing permissions and limitations under
  * the License.
  */
 package org.apache.geode;

 import org.jgroups.annotations.GuardedBy;

 import org.apache.geode.annotations.internal.MakeNotStatic;
 import org.apache.geode.annotations.internal.MutableForTesting;
 import org.apache.geode.internal.ExitCode;
 import org.apache.geode.internal.SystemFailureTestHook;
 import org.apache.geode.internal.admin.remote.RemoteGfManagerAgent;
 import org.apache.geode.internal.cache.GemFireCacheImpl;
 import org.apache.geode.logging.internal.executors.LoggingThread;
 import org.apache.geode.util.internal.GeodeGlossary;

 /**
  *
  * Catches and responds to JVM failure
  * <p>
  * This class represents a catastrophic failure of the system, especially the Java virtual machine.
  * Any class may, at any time, indicate that a system failure has occurred by calling
  * {@link #initiateFailure(Error)} (or, less commonly, {@link #setFailure(Error)}).
  * <p>
  * In practice, the most common type of failure that is likely to be reported by an otherwise
  * healthy JVM is {@link OutOfMemoryError}. However, GemFire will report any occurrence of
  * {@link VirtualMachineError} as a JVM failure.
  * <p>
  * When a failure is reported, you must assume that the JVM has <em>broken its fundamental execution
  * contract</em> with your application. No programming invariant can be assumed to be true, and your
  * entire application must be regarded as corrupted.
  * <h1>Failure Hooks</h1> GemFire uses this class to disable its distributed system (group
  * communication) and any open caches. It also provides a hook for you to respond to after GemFire
  * disables itself.
  * <h1>Failure WatchDog</h1> When {@link #startThreads()} is called, a "watchdog" {@link Thread} is
  * started that periodically checks to see if system corruption has been reported. When system
  * corruption is detected, this thread proceeds to:
  *
  * <ol>
  * <li><em>Close GemFire</em> -- Group communication is ceased (this cache member recuses itself
  * from the distributed system) and the cache is further poisoned (it is pointless to try to cleanly
  * close it at this point.).
  * <p>
  * After this has successfully ended, we launch a</li>
  * <li><em>failure action</em>, a user-defined Runnable {@link #setFailureAction(Runnable)}. By
  * default, this Runnable performs nothing. If you feel you need to perform an action before exiting
  * the JVM, this hook gives you a means of attempting some action. Whatever you attempt should be
  * extremely simple, since your Java execution environment has been corrupted.
  * <p>
  * GemStone recommends that you employ
  * <a href="http://wrapper.tanukisoftware.org/doc/english/introduction.html"> Java Service
  * Wrapper</a> to detect when your JVM exits and to perform appropriate failure and restart actions.
  * </li>
  * <li>Finally, if the application has granted the watchdog permission to exit the JVM (via
  * {@link #setExitOK(boolean)}), the watchdog calls {@link System#exit(int)} with an argument of 1.
  * If you have not granted this class permission to close the JVM, you are <em>strongly</em> advised
  * to call it in your failure action (in the previous step).</li>
  * </ol>
  * <p>
  * Each of these actions will be run exactly once in the above described order. However, if either
  * step throws any type of error ({@link Throwable}), the watchdog will assume that the JVM is still
  * under duress (esp. an {@link OutOfMemoryError}), will wait a bit, and then retry the failed
  * action.
  * <p>
  * It bears repeating that you should be very cautious of any Runnables you ask this class to run.
  * By definition the JVM is <em>very sick</em> when failure has been signalled.
  *
  * <h1>Failure Proctor</h1> In addition to the failure watchdog, {@link #startThreads()} creates a
  * second thread (the "proctor") that monitors free memory. It does this by examining
  * {@link Runtime#freeMemory() free memory}, {@link Runtime#totalMemory() total memory} and
  * {@link Runtime#maxMemory() maximum memory}. If the amount of available memory stays below a given
  * {@link #setFailureMemoryThreshold(long) threshold} for more than {@link #MEMORY_MAX_WAIT}
  * seconds, the watchdog is notified.
  * <p>
  * Note that the proctor can be effectively disabled by
  * {@link SystemFailure#setFailureMemoryThreshold(long) setting} the failure memory threshold to a
  * negative value.
  * <p>
  * The proctor is a second line of defense, attempting to detect OutOfMemoryError conditions in
  * circumstances where nothing alerted the watchdog. For instance, a third-party jar might
  * incorrectly handle this error and leave your virtual machine in a "stuck" state.
  * <p>
  * Note that the proctor does not relieve you of the obligation to follow the best practices in the
  * next section.
  * <h1>Best Practices</h1>
  * <h2>Catch and Handle VirtualMachineError</h2> If you feel obliged to catch <em>either</em>
  * {@link Error} or {@link Throwable}, you <em>must</em> also check for
  * {@link VirtualMachineError} like so:
  *
  * <pre>
         catch (VirtualMachineError err) {
           SystemFailure.{@link #initiateFailure(Error) initiateFailure}(err);
           // If this ever returns, rethrow the error.  We're poisoned
           // now, so don't let this thread continue.
           throw err;
         }
  * </pre>
  *
  * <h2>Periodically Check For Errors</h2> Check for serious system errors at appropriate points in
  * your algorithms. You may elect to use the {@link #checkFailure()} utility function, but you are
  * not required to (you could just see if {@link SystemFailure#getFailure()} returns a non-null
  * result).
  * <p>
  * A job processing loop is a good candidate, for instance, in
  * org.apache.org.jgroups.protocols.UDP#run(), which implements {@link Thread#run}:
  *
  * <pre>
          for (;;)  {
            SystemFailure.{@link #checkFailure() checkFailure}();
            if (mcast_recv_sock == null || mcast_recv_sock.isClosed()) break;
            if (Thread.currentThread().isInterrupted()) break;
           ...
  * </pre>
  *
  * <h2>Catches of Error and Throwable Should Check for Failure</h2> Keep in mind that peculiar or
  * flat-out <em>impossible</em> exceptions may ensue after a VirtualMachineError has been thrown
  * <em>anywhere</em> in your virtual machine. Whenever you catch {@link Error} or {@link Throwable},
  * you should also make sure that you aren't dealing with a corrupted JVM:
  *
  * <pre>
        catch (Throwable t) {
          // Whenever you catch Error or Throwable, you must also
          // catch VirtualMachineError (see above).  However, there is
          // _still_ a possibility that you are dealing with a cascading
          // error condition, so you also need to check to see if the JVM
          // is still usable:
          SystemFailure.{@link #checkFailure() checkFailure}();
          ...
        }
  * </pre>
  *
  * @since GemFire 5.1
  *
  * @deprecated since Geode 1.11 because it is potentially counterproductive to try
  *             to mitigate a VirtualMachineError since the JVM (spec) makes no guarantees about the
  *             soundness of the JVM after such an error. In the presence of a VirtualMachineError,
  *             the simplest solution is really the only solution: exit the JVM as soon as possible.
  *
  */
 @Deprecated
 @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "DM_GC",
     justification = "This class performs System.gc as last ditch effort during out-of-memory condition.")
 public final class SystemFailure {

   /**
    * Maximum time, in milliseconds, that {@link #stopWatchDog()} and {@link #stopProctor()} wait in
    * {@link Thread#join(long)} for their thread to finish. Not final so that tests can change it.
    */
   @MutableForTesting
   static int SHUTDOWN_WAIT = 1000;
   /**
    * Message printed by the default failure action. It is a preallocated constant because producing
    * a message after a failure may itself use memory (in the form of an iterator).
    */
   static final String JVM_CORRUPTION =
       "JVM corruption has been detected";
   // Logged by the watchdog, together with exitExcuse, right before it exits the JVM.
   private static final String CALLING_SYSTEM_EXIT =
       "Since this is a dedicated cache server and the JVM has been corrupted, this process will now terminate. Permission to call System#exit(int) was given in the following context.";

   /**
    * The underlying failure, or null while no failure has been reported.
    * <p>
    * This is usually an instance of {@link VirtualMachineError}, but it is not required to be such.
    * The field is volatile so that it can be read without acquiring a lock.
    *
    * @see #getFailure()
    * @see #initiateFailure(Error)
    */
   @MakeNotStatic
   protected static volatile Error failure = null;

   /**
    * User-replaceable runnable that the watchdog runs last, after the emergency close.
    * <p>
    * The default prints {@link #JVM_CORRUPTION} and the stack trace of {@link #failure} to
    * {@code System.err}.
    * <p>
    * NOTE(review): the default dereferences {@link #failure} without a null check. The watchdog only
    * runs the action once a failure is set, but {@link #setFailureAction(Runnable)} returns this
    * runnable to callers, who could run it earlier.
    *
    * @see #setFailureAction(Runnable)
    */
   @MakeNotStatic
   private static volatile Runnable failureAction = () -> {
     System.err.println(JVM_CORRUPTION);
     failure.printStackTrace();
   };

   /**
    * Whether the watchdog may exit the JVM once failure processing has completed.
    *
    * @see #setExitOK(boolean)
    */
   @MakeNotStatic
   private static volatile boolean exitOK = false;

   /**
    * Records the call stack of whoever last enabled {@link #exitOK}, so that an exit of the JVM can
    * be attributed to the code that allowed it. Null while exiting is not permitted.
    */
   @MakeNotStatic
   private static volatile Throwable exitExcuse;

   /**
    * Grants or revokes permission for the watchdog to call {@link System#exit(int)} once failure
    * processing has completed.
    * <p>
    * When permission is granted, a {@link Throwable} capturing the caller's stack is stored so the
    * eventual exit can be traced back to this call; when it is revoked, that record is cleared.
    * <p>
    * This may be dynamically modified while the system is running.
    *
    * @param newVal true if it is OK to exit the process
    * @return the previous value
    */
   public static boolean setExitOK(boolean newVal) {
     final boolean previous = exitOK;
     exitOK = newVal;
     // The flag is deliberately read back rather than using newVal, matching the field's state.
     exitExcuse = exitOK ? new Throwable("SystemFailure exitOK set") : null;
     return previous;
   }

   /**
    * Tells whether the given Error is considered fatal to the JVM, meaning the JVM should be shut
    * down. Only {@link OutOfMemoryError} and {@link UnknownError} (and their subclasses) qualify.
    * Code should call {@link #initiateFailure(Error)} or {@link #setFailure(Error)} if this returns
    * true.
    *
    * @param err an Error; null yields false
    * @return whether the given error is fatal to the JVM
    */
   public static boolean isJVMFailureError(Error err) {
     if (err instanceof OutOfMemoryError) {
       return true;
     }
     return err instanceof UnknownError;
   }

   /**
    * Disallow instance creation: this class exposes only static state and static methods.
    */
   private SystemFailure() {
     // intentionally empty
   }

   /**
    * Monitor that guards the {@link #watchDog} and {@link #proctor} thread references and on which
    * the watchdog waits; notifying it wakes the watchdog up.
    *
    * @see #notifyWatchDog()
    * @see #startProctor()
    * @see #startWatchDog()
    */
   private static final Object failureSync = new Object();

   /**
    * True once the watchdog has completed {@link #emergencyClose()} without an error.
    *
    * @see #emergencyClose()
    */
   @MakeNotStatic
   private static volatile boolean gemfireCloseCompleted = false;

   /**
    * True once the watchdog has run the user-defined failure action without an error (or found no
    * action to run).
    *
    * @see #setFailureAction(Runnable)
    */
   @MakeNotStatic
   private static volatile boolean failureActionCompleted = false;

   /**
    * This is the amount of time, in seconds, the watchdog periodically awakens to see if the system
    * has been corrupted.
    * <p>
    * The watchdog will be explicitly awakened by calls to {@link #setFailure(Error)} or
    * {@link #initiateFailure(Error)}, but it will awaken of its own accord periodically to check for
    * failure even if the above calls do not occur.
    * <p>
    * This can be set with the system property <code>gemfire.WATCHDOG_WAIT</code>. The default is 15
    * sec.
    */
   private static final int WATCHDOG_WAIT =
       Integer.getInteger(GeodeGlossary.GEMFIRE_PREFIX + "WATCHDOG_WAIT", 15);

   /**
    * The watchdog thread, or null if it has never been started.
    */
   @GuardedBy("failureSync")
   @MakeNotStatic
   private static Thread watchDog;

   // Set by signalCacheClose() and cleared by signalCacheCreate(); both run loops exit when true.
   @MakeNotStatic
   private static volatile boolean isCacheClosing = false;

   /**
    * Should be invoked when GemFire cache is being created. Clears the "cache is closing" flag that
    * {@link #signalCacheClose()} sets.
    */
   public static void signalCacheCreate() {
     isCacheClosing = false;
   }

   /**
    * Should be invoked when GemFire cache is closing or closed.
    * <p>
    * Raises the "cache is closing" flag and interrupts the proctor and watchdog threads, if they
    * have been created, so that they notice the flag promptly.
    */
   public static void signalCacheClose() {
     isCacheClosing = true;

     final Thread proctorThread = proctor;
     if (proctorThread != null) {
       proctorThread.interrupt();
     }

     final Thread watchDogThread = watchDog;
     if (watchDogThread != null) {
       watchDogThread.interrupt();
     }
   }

   /**
    * Starts the watchdog thread unless one is already alive. Does nothing once the failure action
    * has completed.
    */
   private static void startWatchDog() {
     if (failureActionCompleted) {
       return;
     }
     synchronized (failureSync) {
       final boolean alreadyRunning = watchDog != null && watchDog.isAlive();
       if (!alreadyRunning) {
         watchDog = new LoggingThread("SystemFailure WatchDog", SystemFailure::runWatchDog);
         watchDog.start();
       }
     }
   }

   /**
    * Asks the watchdog thread to stop and waits briefly for it to finish.
    * <p>
    * The stop flag is raised and the watchdog is woken under {@link #failureSync}. The caller then
    * waits up to 100 ms for the thread to end; if it is still alive it is interrupted and given up
    * to {@link #SHUTDOWN_WAIT} more milliseconds.
    * <p>
    * If the calling thread is interrupted while waiting, the shutdown sequence still runs to
    * completion and the caller's interrupt status is restored before returning.
    */
   private static void stopWatchDog() {
     Thread target = null;
     synchronized (failureSync) {
       stopping = true;
       if (watchDog != null && watchDog.isAlive()) {
         failureSync.notifyAll();
         target = watchDog;
       }
     }
     if (target == null) {
       return;
     }

     // Remember an interrupt instead of re-asserting it right away: re-asserting it now would make
     // the second join() below fail immediately.
     boolean interrupted = false;
     try {
       target.join(100);
     } catch (InterruptedException e) {
       interrupted = true;
     }
     if (target.isAlive()) {
       target.interrupt();
       try {
         target.join(SHUTDOWN_WAIT);
       } catch (InterruptedException e) {
         interrupted = true;
       }
     }
     if (interrupted) {
       Thread.currentThread().interrupt();
     }
   }

   /**
    * This is the run loop for the watchdog thread.
    * <p>
    * Each iteration waits on {@link #failureSync} for up to {@link #WATCHDOG_WAIT} seconds (or until
    * notified). Once a failure has been reported the loop performs, in order: the emergency close,
    * the failure action, stopping the proctor and, if permitted, exiting the JVM. A step that throws
    * is logged and retried on the next iteration; a step that completed is not repeated because its
    * completion flag is set.
    */
   private static void runWatchDog() {

     // Set once the "failure detected" warning has actually been printed.
     boolean warned = false;

     logFine(WATCHDOG_NAME, "Starting");
     while (!stopping) {
       try {
         if (isCacheClosing) {
           break;
         }
         // Sleep or get notified...
         synchronized (failureSync) {
           if (stopping) {
             return;
           }
           logFine(WATCHDOG_NAME, "Waiting for disaster");
           try {
             failureSync.wait(WATCHDOG_WAIT * 1000L);
           } catch (InterruptedException e) {
             // Ignore
           }
           if (stopping) {
             return;
           }
         }

         // Perform watchdog sentinel duties.

         // NOTE(review): the message says "exiting" but there is no return here; control falls
         // through to the checks below.
         if (failureActionCompleted) {
           logInfo(WATCHDOG_NAME, "all actions completed; exiting");
         }
         if (failure == null) {
           logFine(WATCHDOG_NAME, "no failure detected");
           continue;
         }
         if (!warned) {
           // logWarning returns false when printing failed, so the warning is retried next time.
           warned = logWarning(WATCHDOG_NAME, "failure detected", failure);
         }

         if (!gemfireCloseCompleted) {
           logInfo(WATCHDOG_NAME, "closing GemFire");
           try {
             emergencyClose();
           } catch (Throwable t) {
             logWarning(WATCHDOG_NAME, "trouble closing GemFire", t);
             // Retry the close on the next iteration.
             continue;
           }
           gemfireCloseCompleted = true;
         }

         if (!failureActionCompleted) {
           // avoid potential race condition setting the runnable
           Runnable r = failureAction;
           if (r != null) {
             logInfo(WATCHDOG_NAME, "running user's runnable");
             try {
               r.run();
             } catch (Throwable t) {
               logWarning(WATCHDOG_NAME, "trouble running user's runnable", t);
               // Retry the action on the next iteration.
               continue;
             }
           }
           failureActionCompleted = true;
         }

         stopping = true;
         stopProctor();

         if (exitOK) {
           logWarning(WATCHDOG_NAME, CALLING_SYSTEM_EXIT, exitExcuse);

           // ATTENTION: there are VERY FEW places in GemFire where it is
           // acceptable to call System.exit. This is one of those
           // places...
           ExitCode.FATAL.doSystemExit();
         }

         logInfo(WATCHDOG_NAME, "exiting");
         return;
       } catch (Throwable t) {
         // Anything unexpected is logged and the loop keeps running.
         logWarning(WATCHDOG_NAME, "thread encountered a problem: " + t, t);
       }
     }
   }

   /**
    * The proctor thread, which spies on system statistics looking for the low memory threshold.
    * Null if it has never been started.
    *
    * @see #minimumMemoryThreshold
    */
   @GuardedBy("failureSync")
   @MakeNotStatic
   private static Thread proctor;

   /**
    * This mutex controls access to {@link #firstStarveTime} and {@link #minimumMemoryThreshold}.
    * <p>
    * I'm hoping that a fat lock is never created here, so that an object allocation isn't necessary
    * to acquire this mutex. You'd have to have A LOT of contention on this mutex in order for a fat
    * lock to be created, which indicates IMHO a serious problem in your applications.
    */
   private static final Object memorySync = new Object();

   /**
    * This is the minimum amount of free memory, in bytes, that the proctor will tolerate before
    * declaring a system failure. The initial value is read from the system property
    * <code>gemfire.SystemFailure.chronic_memory_threshold</code> and defaults to 1048576.
    *
    * @see #setFailureMemoryThreshold(long)
    */
   @GuardedBy("memorySync")
   @MakeNotStatic
   private static long minimumMemoryThreshold = Long.getLong(
       GeodeGlossary.GEMFIRE_PREFIX + "SystemFailure.chronic_memory_threshold", 1048576);

   /**
    * This is the interval, in seconds, that the proctor thread will awaken and poll system free
    * memory.
    *
    * The default is 1 sec. This can be set using the system property
    * <code>gemfire.SystemFailure.MEMORY_POLL_INTERVAL</code>.
    *
    * @see #setFailureMemoryThreshold(long)
    */
   private static final long MEMORY_POLL_INTERVAL =
       Long.getLong(GeodeGlossary.GEMFIRE_PREFIX + "SystemFailure.MEMORY_POLL_INTERVAL", 1);

   /**
    * This is the maximum amount of time, in seconds, that the proctor thread will tolerate seeing
    * free memory stay below {@link #setFailureMemoryThreshold(long)}, after which point it will
    * declare a system failure.
    *
    * The default is 15 sec. This can be set using the system property
    * <code>gemfire.SystemFailure.MEMORY_MAX_WAIT</code>.
    *
    * @see #setFailureMemoryThreshold(long)
    */
   public static final long MEMORY_MAX_WAIT =
       Long.getLong(GeodeGlossary.GEMFIRE_PREFIX + "SystemFailure.MEMORY_MAX_WAIT", 15);

   /**
    * Flag that determines whether or not we monitor memory on our own. If this flag is set, the
    * proctor will check freeMemory, invoke GC when free memory first gets low, and report its own
    * OutOfMemoryError if free memory stays below the threshold for {@link #MEMORY_MAX_WAIT}
    * seconds.
    *
    * The default is false, so this monitoring is turned off. This monitoring has been found to be
    * unreliable in non-Sun VMs when the VM is under stress or behaves in unpredictable ways.
    *
    * @since GemFire 6.5
    */
   private static final boolean MONITOR_MEMORY =
       Boolean.getBoolean(GeodeGlossary.GEMFIRE_PREFIX + "SystemFailure.MONITOR_MEMORY");

   /**
    * Starts the proctor thread unless one is already alive. If a failure has already been reported,
    * no proctor is started and the watchdog is notified instead.
    *
    * @see #proctor
    */
   private static void startProctor() {
     if (failure == null) {
       synchronized (failureSync) {
         final boolean alreadyRunning = proctor != null && proctor.isAlive();
         if (!alreadyRunning) {
           proctor = new LoggingThread("SystemFailure Proctor", SystemFailure::runProctor);
           proctor.start();
         }
       }
     } else {
       notifyWatchDog();
     }
   }

   /**
    * Asks the proctor thread to stop and waits up to {@link #SHUTDOWN_WAIT} milliseconds for it to
    * finish.
    * <p>
    * The stop flag is raised under {@link #failureSync}; a proctor that is still alive is then
    * interrupted and joined. If the calling thread is interrupted while waiting, its interrupt
    * status is restored before returning instead of being silently discarded.
    */
   private static void stopProctor() {
     final Thread target;
     synchronized (failureSync) {
       stopping = true;
       target = proctor;
     }
     if (target == null || !target.isAlive()) {
       return;
     }
     target.interrupt();
     try {
       target.join(SHUTDOWN_WAIT);
     } catch (InterruptedException e) {
       // Give up waiting, but let the caller see that it was interrupted.
       Thread.currentThread().interrupt();
     }
   }

   /**
    * Symbolic representation of an invalid starve time, i.e. "memory is not currently low".
    */
   private static final long NEVER_STARVED = Long.MAX_VALUE;

   /**
    * The time, in milliseconds as returned by {@link System#currentTimeMillis()}, at which the
    * proctor first saw the current episode of memory starvation, or {@link #NEVER_STARVED}.
    */
   @GuardedBy("memorySync")
   @MakeNotStatic
   private static long firstStarveTime = NEVER_STARVED;

   /**
    * This is the previous measure of total memory. If it changes, we reset the proctor's starve
    * statistic.
    */
   @MakeNotStatic
   private static long lastTotalMemory = 0;

   /**
    * This is the run loop for the proctor thread.
    * <p>
    * Every {@link #MEMORY_POLL_INTERVAL} seconds the loop wakes up and exits if the service is
    * stopping, the failure action has completed, or a failure has already been reported (in which
    * case the watchdog is notified first). The loop also ends once the cache is closing.
    * <p>
    * When {@link #MONITOR_MEMORY} is set, the loop additionally compares free memory against
    * {@link #minimumMemoryThreshold} once the heap has grown to its maximum size. The first low
    * reading is recorded and a GC is requested; if free memory is still below the threshold
    * {@link #MEMORY_MAX_WAIT} seconds later, a preallocated {@link OutOfMemoryError} is reported
    * through {@link #setFailure(Error)} and the loop exits.
    * <p>
    * Every access to {@link #firstStarveTime} and {@link #minimumMemoryThreshold} is made under
    * {@link #memorySync}, because {@link #setFailureMemoryThreshold(long)} may modify both from
    * another thread.
    */
   private static void runProctor() {
     // Note that the javadocs say this can return Long.MAX_VALUE.
     final long maxMemory = Runtime.getRuntime().maxMemory();

     // The threshold is publicly modifiable, so read it under its mutex.
     final long initialThreshold;
     synchronized (memorySync) {
       initialThreshold = minimumMemoryThreshold;
     }

     // Allocate this error in advance, since it's too late once it's been detected!
     // The reported duration is MEMORY_MAX_WAIT, the limit that is enforced further down.
     final OutOfMemoryError oome = new OutOfMemoryError(
         String.format(
             "%s : memory has remained chronically below %s bytes (out of a maximum of %s ) for %s sec.",
             PROCTOR_NAME, initialThreshold, maxMemory, MEMORY_MAX_WAIT));

     logFine(PROCTOR_NAME,
         "Starting, threshold = " + initialThreshold + "; max = " + maxMemory);
     while (!isCacheClosing) {
       if (stopping) {
         return;
       }

       try {
         try {
           Thread.sleep(MEMORY_POLL_INTERVAL * 1000);
         } catch (InterruptedException e) {
           // ignore; the flags checked below decide whether to keep running
         }

         if (stopping) {
           return;
         }

         if (failureActionCompleted) {
           return;
         }
         if (failure != null) {
           notifyWatchDog();
           logFine(PROCTOR_NAME, "Failure has been reported, exiting");
           return;
         }

         if (!MONITOR_MEMORY) {
           continue;
         }

         long totalMemory = Runtime.getRuntime().totalMemory();
         if (totalMemory < maxMemory) {
           // The heap can still grow, so this does not count as starvation.
           if (DEBUG) {
             logFine(PROCTOR_NAME,
                 "totalMemory (" + totalMemory + ") < maxMemory (" + maxMemory + ")");
           }
           synchronized (memorySync) {
             firstStarveTime = NEVER_STARVED;
           }
           continue;
         }
         if (lastTotalMemory < totalMemory) {
           // The heap grew since the last poll; restart the starvation clock.
           lastTotalMemory = totalMemory;
           synchronized (memorySync) {
             firstStarveTime = NEVER_STARVED;
           }
           continue;
         }
         lastTotalMemory = totalMemory;

         long freeMemory = Runtime.getRuntime().freeMemory();
         if (freeMemory == 0) {
           // This is to workaround X bug #41821 in JRockit. Often, Jrockit returns 0 from
           // Runtime.getRuntime().freeMemory() Allocating this one object and calling again seems to
           // workaround the problem.
           new Object();
           freeMemory = Runtime.getRuntime().freeMemory();
         }
         // Grab the threshold and starve time once, under mutex, because
         // it's publicly modifiable.
         long curThreshold;
         long lastStarveTime;
         synchronized (memorySync) {
           curThreshold = minimumMemoryThreshold;
           lastStarveTime = firstStarveTime;
         }

         if (freeMemory >= curThreshold || curThreshold == 0) {
           // Memory is FINE, reset everything
           if (DEBUG) {
             logFine(PROCTOR_NAME, "Current free memory is: " + freeMemory);
           }

           if (lastStarveTime != NEVER_STARVED) {
             logFine(PROCTOR_NAME, "...low memory has self-corrected.");
           }
           synchronized (memorySync) {
             firstStarveTime = NEVER_STARVED;
           }
           continue;
         }

         // Memory is low
         long now = System.currentTimeMillis();
         if (lastStarveTime == NEVER_STARVED) {
           if (DEBUG) {
             logFine(PROCTOR_NAME,
                 "Noting current memory " + freeMemory + " is less than threshold " + curThreshold);
           } else {
             logWarning(PROCTOR_NAME,
                 "Noting that current memory available is less than the currently designated threshold",
                 null);
           }

           synchronized (memorySync) {
             firstStarveTime = now;
           }
           System.gc(); // Attempt to free memory and avoid overflow
           continue;
         }

         if (now - lastStarveTime < MEMORY_MAX_WAIT * 1000) {
           if (DEBUG) {
             logFine(PROCTOR_NAME, "...memory is still below threshold: " + freeMemory);
           } else {
             logWarning(PROCTOR_NAME,
                 "Noting that current memory available is still below currently designated threshold",
                 null);

           }
           continue;
         }

         logWarning(PROCTOR_NAME, "Memory is chronically low; setting failure!", null);
         SystemFailure.setFailure(oome);
         notifyWatchDog();
         return;
       } catch (Throwable t) {
         // Anything unexpected is logged and the loop keeps running.
         logWarning(PROCTOR_NAME, "thread encountered a problem", t);
       }
     }
   }

   /**
    * Enables some fine logging: {@link #logFine(String, String)} prints only when this is true.
    */
   private static final boolean DEBUG = false;

   // Name that prefixes log lines written by the watchdog.
   private static final String WATCHDOG_NAME = "SystemFailure Watchdog";

   // Name that prefixes log lines written by the proctor.
   private static final String PROCTOR_NAME = "SystemFailure Proctor";

   /**
    * Since it requires object memory to unpack a jar file, make sure this JVM has loaded the classes
    * necessary for closure <em>before</em> it becomes necessary to use them.
    * <p>
    * Note that just touching the class in order to load it is usually sufficient, so all an
    * implementation needs to do is to reference the same classes used in {@link #emergencyClose()}.
    * Just make sure to do it while you still have memory to succeed!
    * <p>
    * This implementation only calls {@link #startThreads()}; it does not reference the classes used
    * by {@link #emergencyClose()} itself.
    */
   public static void loadEmergencyClasses() {
     startThreads();
   }

   /**
    * Attempt to close any and all GemFire resources.
    *
    * The contract of this method is that it should not acquire any synchronization mutexes nor
    * create any objects.
    * <p>
    * The former is because the system is in an undefined state and attempting to acquire the mutex
    * may cause a hang.
    * <p>
    * The latter is because the likelihood is that we are invoking this method due to memory
    * exhaustion, so any attempt to create an object will also cause a hang.
    * <p>
    * This method is not meant to be called directly (but, well, I guess it could). It is public to
    * document the contract that is implemented by <code>emergencyClose</code> in other parts of the
    * system.
    */
   public static void emergencyClose() {
     // Delegate to the emergencyClose of the cache, then of the remote admin agent.
     GemFireCacheImpl.emergencyClose();

     RemoteGfManagerAgent.emergencyClose();

     // If memory was the problem, make an explicit attempt at this point to clean up.
     System.gc();
   }

   /**
    * Throws the recorded system failure, if there is one.
    * <p>
    * When a failure has been recorded this method does not return normally; when none has been
    * recorded it simply returns.
    * <p>
    * Unfortunately, attempting to create a new Throwable at this point may cause the thread to hang
    * (instead of generating another OutOfMemoryError), so we have to make do with whatever Error we
    * have, instead of wrapping it with one pertinent to the current context. See bug 38394.
    *
    * @throws Error the recorded failure itself
    */
   private static void throwFailure() throws Error {
     final Error recorded = failure;
     if (recorded != null) {
       throw recorded;
     }
   }

   /**
    * Notifies the watchdog thread (assumes that {@link #failure} has been set).
    * <p>
    * The watchdog is started first if it is not running, then every thread waiting on
    * {@link #failureSync} is woken.
    */
   private static void notifyWatchDog() {
     startWatchDog();
     synchronized (failureSync) {
       failureSync.notifyAll();
     }
   }

   /**
    * Utility function to check for failures. If a failure has been recorded, the watchdog is
    * notified and the recorded {@link Error} itself is thrown; otherwise this method returns
    * immediately.
    *
    * @see #initiateFailure(Error)
    * @throws InternalGemFireError declared in the signature; this method only rethrows the recorded
    *         failure
    * @throws Error the recorded failure, if the system has been corrupted
    */
   public static void checkFailure() throws InternalGemFireError, Error {
     if (failure != null) {
       notifyWatchDog();
       throwFailure();
     }
   }

   /**
    * Signals that a system failure has occurred and then throws the recorded failure.
    * <p>
    * The failure is recorded through {@link #setFailure(Error)}. If that call records nothing
    * (it returns early for errors the test hook reports as expected) and no failure was recorded
    * before, this method returns normally.
    *
    * @param f the failure to set
    * @throws IllegalArgumentException if f is null
    * @throws InternalGemFireError declared in the signature; this method only rethrows the recorded
    *         failure
    * @throws Error the recorded failure
    */
   public static void initiateFailure(Error f) throws InternalGemFireError, Error {
     SystemFailure.setFailure(f);
     throwFailure();
   }

   /**
    * Records the underlying system failure and notifies the watchdog.
    * <p>
    * Errors that {@link SystemFailureTestHook#errorIsExpected(Error)} reports as expected are
    * ignored. Otherwise the given error replaces any failure recorded earlier.
    * <p>
    * This method does not generate an error, and should only be used in circumstances where
    * execution needs to continue, such as when re-implementing
    * {@link ThreadGroup#uncaughtException(Thread, Throwable)}.
    *
    * @param failure the system failure
    * @throws IllegalArgumentException if you attempt to set the failure to null
    */
   public static void setFailure(Error failure) {
     if (failure == null) {
       throw new IllegalArgumentException(
           "You are not permitted to un-set a system failure.");
     }
     if (!SystemFailureTestHook.errorIsExpected(failure)) {
       SystemFailure.failure = failure;
       notifyWatchDog();
     }
   }

   /**
    * Returns the catastrophic system failure, if any.
    * <p>
    * This is usually (though not necessarily) an instance of {@link VirtualMachineError}.
    * <p>
    * A return value of null indicates that no system failure has yet been detected.
    * <p>
    * Object synchronization can implicitly require object creation (fat locks in JRockit for
    * instance), so the underlying value is not synchronized (it is a volatile). This means the
    * return value from this call is not necessarily the <em>first</em> failure reported by the JVM.
    * <p>
    * Note that even if it <em>were</em> synchronized, it would only be a proximal indicator near the
    * time that the JVM crashed, and may not actually reflect the underlying root cause that
    * generated the failure. For instance, if your JVM is running short of memory, this Throwable is
    * probably an innocent victim and <em>not</em> the actual allocation (or series of allocations)
    * that caused your JVM to exhaust memory.
    * <p>
    * If this function returns a non-null value, keep in mind that the JVM is very limited. In
    * particular, any attempt to allocate objects may fail if the original failure was an
    * OutOfMemoryError.
    *
    * @return the failure, or null if none has been recorded
    */
   public static Error getFailure() {
     return failure;
   }

   /**
    * Replaces the user-defined action that is run in the event that failure has been detected.
    * <p>
    * This action is run <em>after</em> the GemFire cache has been shut down. If it throws any error,
    * it will be reattempted indefinitely until it succeeds. This action may be dynamically modified
    * while the system is running.
    * <p>
    * The default action prints the failure stack trace to System.err.
    *
    * @see #initiateFailure(Error)
    * @param action the Runnable to use
    * @return the previous action
    */
   public static Runnable setFailureAction(Runnable action) {
     final Runnable previous = failureAction;
     failureAction = action;
     return previous;
   }

   /**
    * Set the memory threshold under which system failure will be notified.
    * <p>
    * The new threshold is stored and the recorded starvation time is reset, both under
    * {@link #memorySync}; afterwards the proctor thread is started if it is not already running.
    * <p>
    * This value may be dynamically modified while the system is running. The default is 1048576
    * bytes. This can be set using the system property
    * <code>gemfire.SystemFailure.chronic_memory_threshold</code>.
    *
    * @param newVal threshold in bytes
    * @return the old threshold
    * @see Runtime#freeMemory()
    */
   public static long setFailureMemoryThreshold(long newVal) {
     final long previous;
     synchronized (memorySync) {
       previous = minimumMemoryThreshold;
       minimumMemoryThreshold = newVal;
       // A new threshold starts a fresh measurement.
       firstStarveTime = NEVER_STARVED;
     }
     startProctor();
     return previous;
   }

   /**
    * Writes one log line of the form <code>name: [kind] s</code> to {@code System.err}, followed by
    * the stack trace of {@code t} when one is given. The pieces are printed one by one rather than
    * concatenated into a new string first.
    *
    * @param kind the severity label
    * @param name the name of the logger
    * @param s string to print
    * @param t the call stack, if any
    * @return true if everything was printed, false if printing threw
    */
   private static boolean logStdErr(String kind, String name, String s, Throwable t) {
     try {
       System.err.print(name);
       System.err.print(": [");
       System.err.print(kind);
       System.err.print("] ");
       System.err.println(s);
       if (t != null) {
         t.printStackTrace();
       }
     } catch (Throwable ignored) {
       // out of luck
       return false;
     }
     return true;
   }

   /**
    * Logging can require allocation of objects, so we wrap the logger so that failures are silently
    * ignored. The message is written to {@code System.err} with the label "warning".
    *
    * @param name the name of the logger
    * @param s string to print
    * @param t the call stack, if any
    * @return true if the warning got printed
    */
   protected static boolean logWarning(String name, String s, Throwable t) {
     return logStdErr("warning", name, s, t);
   }

   /**
    * Logging can require allocation of objects, so we wrap the logger so that failures are silently
    * ignored. The message is written to {@code System.err} with the label "info".
    *
    * @param name the name of the logger
    * @param s string to print
    */
   protected static void logInfo(String name, String s) {
     logStdErr("info", name, s, null);
   }

   /**
    * Logging can require allocation of objects, so we wrap the logger so that failures are silently
    * ignored. The message is written to {@code System.err} with the label "fine", and only when
    * {@link #DEBUG} is true.
    *
    * @param name the name of the logger
    * @param s string to print
    */
   protected static void logFine(String name, String s) {
     if (DEBUG) {
       logStdErr("fine", name, s, null);
     }
   }

   // True while the service is shutting down; both run loops return when they see it set.
   // Cleared again by startThreads().
   @MakeNotStatic
   private static volatile boolean stopping;

   /**
    * This starts up the watchdog and proctor threads. This method is called when a Cache is created.
    * <p>
    * The stop flag is cleared first so that the newly started run loops do not exit immediately.
    */
   public static void startThreads() {
     stopping = false;
     startWatchDog();
     startProctor();
   }

   /**
    * This stops the threads that implement this service. This method is called when a Cache is
    * closed.
    * <p>
    * The proctor is stopped before the watchdog.
    */
   public static void stopThreads() {
     // this method fixes bug 45409
     stopping = true;
     stopProctor();
     stopWatchDog();
   }

   // Test accessor: returns the current watchdog thread reference (null if never started).
   // NOTE(review): reads the field without holding failureSync.
   static Thread getWatchDogForTest() {
     return watchDog;
   }

   // Test accessor: returns the current proctor thread reference (null if never started).
   // NOTE(review): reads the field without holding failureSync.
   static Thread getProctorForTest() {
     return proctor;
   }
 }