cython timing tests
diff --git a/src/engine/openmp.cc b/src/engine/openmp.cc
index 8fe3939..47b87bc 100644
--- a/src/engine/openmp.cc
+++ b/src/engine/openmp.cc
@@ -47,8 +47,9 @@
   } else {
     if (!omp_num_threads_set_in_environment_) {
       omp_thread_max_ = omp_get_num_procs();
+      CHECK_GT(omp_thread_max_, 0);
 #ifdef ARCH_IS_INTEL_X86
-      omp_thread_max_ >>= 1;
+      omp_thread_max_ = std::max(1, omp_thread_max_ >> 1);
 #endif
       omp_set_num_threads(omp_thread_max_);
     } else {
@@ -61,6 +62,10 @@
 #endif
 }
 
+OpenMP::~OpenMP() {  // Destructor: releases nothing itself, only logs that it ran.
+  LOG(INFO) << "OpenMP::~OpenMP()";  // NOTE(review): looks like a debug trace - confirm intent.
+}
+
 void OpenMP::on_start_worker_thread(bool use_omp) {
 #ifdef _OPENMP
   if (!omp_num_threads_set_in_environment_) {
diff --git a/src/engine/openmp.h b/src/engine/openmp.h
index 800ea2f..312e793 100644
--- a/src/engine/openmp.h
+++ b/src/engine/openmp.h
@@ -31,6 +31,7 @@
 class OpenMP {
  public:
   OpenMP();
+  ~OpenMP();
 
   /*!
    * \brief Get the recommended number of OMP threads to use given the current context
diff --git a/src/profiler/profiler.h b/src/profiler/profiler.h
index 768a0bc..ef0f74e 100644
--- a/src/profiler/profiler.h
+++ b/src/profiler/profiler.h
@@ -27,6 +27,7 @@
 
 #include <dmlc/concurrentqueue.h>
 #include <dmlc/thread_group.h>
+#include <mxnet/base.h>
 #include <vector>
 #include <string>
 #include <cstdint>
@@ -36,6 +37,7 @@
 #include "./vtune.h"
 #include "./aggregate_stats.h"
 
+
 #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__)
 #include <windows.h>
 #else