Add observer flag to disable resource metric collection (#66)

diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md
index 7602314..f7449fa 100644
--- a/RELEASE-NOTES.md
+++ b/RELEASE-NOTES.md
@@ -17,12 +17,16 @@
 - Users may now set a value for the URI fetcher to rename a downloaded artifact to after it
   has been downloaded.
 - Auto pause feature added to VariableBatch strategy and Batch strategy. With this feature enabled,
-  when an update is ROLLING_FORWARD, the update will automatically pause itself right before
+  when an update is `ROLLING_FORWARD`, the update will automatically pause itself right before
   a new batch is started. (This feature is being released as tested but in beta state. We are
   looking to collect feedback before we consider it fully stable.)
-- loader.load() now uses memoization on the config file path so that we only load and process
+- `loader.load()` now uses memoization on the config file path so that we only load and process
   each config file once.
 - Instances run with custom executors will no longer show links to thermos observer.
+- Add observer command line option `--disable_task_resource_collection` to disable the collection of
+  CPU, memory, and disk metrics for observed tasks. This is useful in setups where metrics cannot be
+  gathered reliably (e.g. when using PID namespaces) or when it is expensive due to hundreds of
+  active tasks per host.
 
 ### Deprecations and removals:
 
diff --git a/docs/reference/observer-configuration.md b/docs/reference/observer-configuration.md
index c791b34..6d60251 100644
--- a/docs/reference/observer-configuration.md
+++ b/docs/reference/observer-configuration.md
@@ -23,6 +23,11 @@
   --polling_interval_secs=POLLING_INTERVAL_SECS
                         The number of seconds between observer refresh
                         attempts. [default: 5]
+  --disable_task_resource_collection
+                        Disable collection of CPU and memory statistics for
+                        each active task. Those can be expensive to collect if
+                        there are hundreds of active tasks per host. [default:
+                        False]
   --task_process_collection_interval_secs=TASK_PROCESS_COLLECTION_INTERVAL_SECS
                         The number of seconds between per task process
                         resource collections. [default: 20]
diff --git a/src/main/python/apache/aurora/config/BUILD b/src/main/python/apache/aurora/config/BUILD
index 12e7fe9..2f2d9ee 100644
--- a/src/main/python/apache/aurora/config/BUILD
+++ b/src/main/python/apache/aurora/config/BUILD
@@ -22,7 +22,6 @@
     '3rdparty/python:pystachio',
     '3rdparty/python:twitter.common.lang',
     'api/src/main/thrift/org/apache/aurora/gen',
-    'src/main/python/apache/aurora/common',
     'src/main/python/apache/thermos/config',
   ],
   provides = setup_py(
diff --git a/src/main/python/apache/aurora/tools/thermos_observer.py b/src/main/python/apache/aurora/tools/thermos_observer.py
index fd9465d..4d865ce 100644
--- a/src/main/python/apache/aurora/tools/thermos_observer.py
+++ b/src/main/python/apache/aurora/tools/thermos_observer.py
@@ -61,6 +61,15 @@
 
 
 app.add_option(
+    '--disable_task_resource_collection',
+    dest='disable_task_resource_collection',
+    default=False,
+    action='store_true',
+    help="Disable collection of CPU and memory statistics for each active task. Those can be "
+         "expensive to collect if there are hundreds of active tasks per host.")
+
+
+app.add_option(
     '--task_process_collection_interval_secs',
     dest='task_process_collection_interval_secs',
     type='int',
@@ -127,6 +136,7 @@
       path_detector,
       Amount(options.polling_interval_secs, Time.SECONDS),
       Amount(options.task_process_collection_interval_secs, Time.SECONDS),
+      disable_task_resource_collection=options.disable_task_resource_collection,
       enable_mesos_disk_collector=options.enable_mesos_disk_collector,
       disk_collector_settings=disk_collector_settings)
 
diff --git a/src/main/python/apache/thermos/monitoring/resource.py b/src/main/python/apache/thermos/monitoring/resource.py
index 72ed4e5..007b6aa 100644
--- a/src/main/python/apache/thermos/monitoring/resource.py
+++ b/src/main/python/apache/thermos/monitoring/resource.py
@@ -308,3 +308,25 @@
                     'process_collection_interval and disk_collection_interval.')
 
     log.debug('Stopping resource monitoring for task "%s"', self._task_id)
+
+
+class NullTaskResourceMonitor(ResourceMonitorBase):
+  """ Alternative to TaskResourceMonitor that does not collect any resource metrics at all. It can
+      be used as a fast replacement for TaskResourceMonitor. It is especially useful in setups
+      where metrics cannot be gathered reliably (e.g. when using PID namespaces).
+  """
+
+  def sample(self):
+    return self.sample_at(time.time())
+
+  def sample_at(self, timestamp):
+    return timestamp, self.AggregateResourceResult(0, ProcessSample.empty(), 0)
+
+  def sample_by_process(self, process_name):
+    return ProcessSample.empty()
+
+  def start(self):
+    pass
+
+  def kill(self):
+    pass
diff --git a/src/main/python/apache/thermos/observer/task_observer.py b/src/main/python/apache/thermos/observer/task_observer.py
index 94cd6c5..c0ddf7d 100644
--- a/src/main/python/apache/thermos/observer/task_observer.py
+++ b/src/main/python/apache/thermos/observer/task_observer.py
@@ -33,8 +33,11 @@
 from apache.thermos.monitoring.disk import DiskCollectorSettings
 from apache.thermos.monitoring.monitor import TaskMonitor
 from apache.thermos.monitoring.process import ProcessSample
-from apache.thermos.monitoring.resource import DiskCollectorProvider, TaskResourceMonitor
-
+from apache.thermos.monitoring.resource import (
+  DiskCollectorProvider,
+  NullTaskResourceMonitor,
+  TaskResourceMonitor
+)
 from .detector import ObserverTaskDetector
 from .observed_task import ActiveObservedTask, FinishedObservedTask
 
@@ -60,6 +63,7 @@
       path_detector,
       interval=POLLING_INTERVAL,
       task_process_collection_interval=TaskResourceMonitor.PROCESS_COLLECTION_INTERVAL,
+      disable_task_resource_collection=False,
       enable_mesos_disk_collector=False,
       disk_collector_settings=DiskCollectorSettings()):
 
@@ -71,6 +75,7 @@
     self._interval = interval
     self._task_process_collection_interval = task_process_collection_interval
     self._enable_mesos_disk_collector = enable_mesos_disk_collector
+    self._disable_task_resource_collection = disable_task_resource_collection
     self._disk_collector_settings = disk_collector_settings
     self._active_tasks = {}    # task_id => ActiveObservedTask
     self._finished_tasks = {}  # task_id => FinishedObservedTask
@@ -107,16 +112,21 @@
       return
     task_monitor = TaskMonitor(root, task_id)
 
-    disk_collector_provider = DiskCollectorProvider(
-      self._enable_mesos_disk_collector,
-      self._disk_collector_settings)
+    if self._disable_task_resource_collection:
+      resource_monitor = NullTaskResourceMonitor()
 
-    resource_monitor = TaskResourceMonitor(
-        task_id,
-        task_monitor,
-        disk_collector_provider=disk_collector_provider,
-        process_collection_interval=self._task_process_collection_interval,
-        disk_collection_interval=self._disk_collector_settings.disk_collection_interval)
+    else:
+      disk_collector_provider = DiskCollectorProvider(
+        self._enable_mesos_disk_collector,
+        self._disk_collector_settings)
+
+      resource_monitor = TaskResourceMonitor(
+          task_id,
+          task_monitor,
+          disk_collector_provider=disk_collector_provider,
+          process_collection_interval=self._task_process_collection_interval,
+          disk_collection_interval=self._disk_collector_settings.disk_collection_interval)
+
     resource_monitor.start()
     self._active_tasks[task_id] = ActiveObservedTask(
         root,
diff --git a/src/test/python/apache/thermos/monitoring/test_resource.py b/src/test/python/apache/thermos/monitoring/test_resource.py
index 4445064..5eeda1e 100644
--- a/src/test/python/apache/thermos/monitoring/test_resource.py
+++ b/src/test/python/apache/thermos/monitoring/test_resource.py
@@ -25,6 +25,7 @@
 from apache.thermos.monitoring.resource import (
     DiskCollectorProvider,
     HistoryProvider,
+    NullTaskResourceMonitor,
     ResourceHistory,
     ResourceMonitorBase,
     TaskResourceMonitor
@@ -154,3 +155,17 @@
       task_resource_monitor.sample_by_process('fake-process-name')
 
     assert mock_get_active_processes.mock_calls == [mock.call(task_monitor)]
+
+
+class TestNullTaskResourceMonitor(TestCase):
+  def test_null_sample(self):
+    monitor = NullTaskResourceMonitor()
+    monitor.start()
+
+    null_aggregate = (0, ProcessSample.empty(), 0)
+
+    assert monitor.sample()[1] == null_aggregate
+    assert monitor.sample_at(time())[1] == null_aggregate
+    assert monitor.sample_by_process("any_process") == ProcessSample.empty()
+
+    monitor.kill()