[VTA] Support TLPP in function simulator. (#3555)

* [VTA] Support TLPP in function simulator.
Issue:
Currently the VTA function simulator only executes instructions serially,
so the dependency logic of the runtime ISA, which is used for task level
pipeline parallelism (TLPP), cannot be verified by the function simulator.

Solution:
Make the simulator driver multi-threaded and support TLPP.

Benefit:
TLPP support in the VTA function simulator makes VTA logic testing,
debugging and changes easier.

replace boost lockfree queue

add configure control for simulator tlpp enable or disable.

change code style into google style.

Wrap queue read/write and sync logic to make function call more simple.

Add some comments.

Remove MT logic, change into Single thread mode.

address review comments.

code style change to match google code style and add comments.

add cmake macro to enable/disable simulator tlpp logic.

submodule update.

correct file name mentioned in comments.

* remove USE_VTA_FSIM_TLPP.
diff --git a/include/vta/sim_tlpp.h b/include/vta/sim_tlpp.h
new file mode 100644
index 0000000..ead07f1
--- /dev/null
+++ b/include/vta/sim_tlpp.h
@@ -0,0 +1,162 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *  Copyright (c) 2019 by Contributors
+ * \file sim_tlpp.h
+ * \brief TVM VTA multiple thread simulator header file.
+ */
+#ifndef VTA_SIM_TLPP_H_
+#define VTA_SIM_TLPP_H_
+#include <vta/hw_spec.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <vector>
+#include <ctime>
+#include <cassert>
+#include <queue>
+
+#define SCOREGEMM "gemm"  // printable core names returned by GetCoreTypeName()
+#define SCORELOAD "load"
+#define SCORESTORE "store"
+#define SCOREUNKNOWN "unknown"
+typedef void (*Run_Function)(const VTAGenericInsn *, void *);  // runs one insn on a device
+typedef enum {COREGEMM = 0, CORELOAD, CORESTORE, COREMAX} CORE_TYPE;  // COREMAX = core count
+typedef std::queue<const void*> Insn_q_t;  // pending instruction pointers of one core
+typedef std::queue<int> Dep_q_t;  // dependency tokens passed between two cores
+/*!
+ * \brief Simulate core level pipeline parallelism logic.
+ */
+class TlppVerify {
+ public:
+    /*! \brief Return the singleton TlppVerify instance. */
+    static TlppVerify *Global() { static TlppVerify Cls; return &Cls;}
+
+    /*!
+     *  \brief Loop to process the queued instructions and verify TLPP logic.
+     *  \param run_function function pointer used to execute one instruction.
+     *  \param fsim_handle pointer to the function simulator class Device.
+     *  \param debug enable/disable debug output.
+     */
+    void TlppSynchronization(Run_Function run_function,
+                             void *fsim_handle,
+                             bool debug = false);
+    /*!
+     *  \brief Push an instruction into its core's queue for later execution.
+     *  \param insn the instruction.
+     */
+    void TlppPushInsn(const VTAGenericInsn *insn);
+    /*! \brief Event pump: run every core named by a pending dependency event. */
+    void EventProcess(void);
+    /*! \brief Schedule a particular core to run. */
+    void CoreRun(CORE_TYPE core_type);
+
+ private:
+    /*! \brief TlppVerify constructor. */
+    TlppVerify();
+    /*!
+     * \brief Clear class variables.
+     */
+    void Clear();
+    /*!
+     * \brief Check whether the dependency conditions of an instruction are
+     *  satisfied (before run) or send its notifications (after run).
+     * \param insn the instruction.
+     * \param before_run true when called before the instruction executes, to
+     *  check and consume the tokens it waits for; false when called after it
+     *  executed, to push the notifications it has to send.
+     */
+    bool InsnDependencyCheck(const VTAGenericInsn *insn, bool before_run);
+    /*!
+     * \brief Get the operation code of an instruction.
+     * \param insn the instruction.
+     */
+    uint64_t GetOperationCode(const VTAGenericInsn *insn);
+    /*!
+     * \brief Find which core should run this instruction.
+     * \param operation_code operation type like load/gemm etc.
+     * \param insn the instruction.
+     */
+    CORE_TYPE GetCoreType(uint64_t operation_code, const VTAGenericInsn *insn);
+    /*!
+     * \brief Pick up the first instruction of the specified core.
+     * \param core_type core type.
+     */
+    const VTAGenericInsn *PickFrontInsn(uint64_t core_type);
+    /*!
+     * \brief Consume one instruction after it passed its dependency check.
+     * \param core_type core type.
+     */
+    void ConsumeFrontInsn(uint64_t core_type);
+    /*!
+     * \brief Process the dependency tokens of one instruction.
+     * \param before_run true: check/pop tokens before run; false: push after.
+     * \param pop_prev instruction waits for a token from the previous core.
+     * \param pop_next instruction waits for a token from the next core.
+     * \param push_prev instruction sends a token to the previous core.
+     * \param push_next instruction sends a token to the next core.
+     * \param pop_prev_q queue holding tokens sent by the previous core.
+     * \param pop_next_q queue holding tokens sent by the next core.
+     * \param push_prev_q queue that receives the token for the previous core.
+     * \param push_next_q queue that receives the token for the next core.
+     * \param push_to_prev_q_indx core to wake up after pushing to push_prev_q.
+     * \param push_to_next_q_indx core to wake up after pushing to push_next_q.
+     * \return false if a required token is missing (before_run only),
+     *  true otherwise.
+     */
+    bool DependencyProcess(bool before_run,
+        bool pop_prev, bool pop_next,
+        bool push_prev, bool push_next,
+        Dep_q_t *pop_prev_q, Dep_q_t *pop_next_q,
+        Dep_q_t *push_prev_q, Dep_q_t *push_next_q,
+        CORE_TYPE push_to_prev_q_indx, CORE_TYPE push_to_next_q_indx);
+    /*!
+     * \brief Return the printable name of a core type.
+     * \param core_type core type.
+     */
+    inline const char * GetCoreTypeName(CORE_TYPE core_type) {
+      return (core_type == COREGEMM) ? SCOREGEMM :
+        (core_type == CORELOAD) ? SCORELOAD :
+        (core_type == CORESTORE) ? SCORESTORE :
+        SCOREUNKNOWN;
+    }
+    /*! debug flag*/
+    bool debug_;
+    /*! function simulator device class pointer*/
+    void *fsim_handle_;
+    /*! function simulator instruction execute function pointer*/
+    Run_Function run_fsim_function_;
+    /*! instruction queue for each core*/
+    Insn_q_t insnq_array_[COREMAX];
+    /*! dependency queue from load to gemm*/
+    Dep_q_t l2g_q_;
+    /*! dependency queue from store to gemm*/
+    Dep_q_t s2g_q_;
+    /*! dependency queue from gemm to load*/
+    Dep_q_t g2l_q_;
+    /*! dependency queue from gemm to store*/
+    Dep_q_t g2s_q_;
+    /*! computation done flag, tested by the scheduling loop*/
+    int done_;
+    /*! event queue for core wake up*/
+    std::queue<CORE_TYPE> dep_push_event_;
+};
+#endif  // VTA_SIM_TLPP_H_
diff --git a/src/sim/sim_driver.cc b/src/sim/sim_driver.cc
index eb49712..871097f 100644
--- a/src/sim/sim_driver.cc
+++ b/src/sim/sim_driver.cc
@@ -25,6 +25,7 @@
 #include <vta/driver.h>
 #include <vta/hw_spec.h>
 #include <tvm/runtime/registry.h>
+#include <vta/sim_tlpp.h>
 #include <type_traits>
 #include <mutex>
 #include <map>
@@ -275,6 +276,7 @@
   Device() {
     prof_ = Profiler::ThreadLocal();
     dram_ = DRAM::Global();
+    ptlpp = TlppVerify::Global();
   }
 
   int Run(vta_phy_addr_t insn_phy_addr,
@@ -286,26 +288,37 @@
     for (uint32_t i = 0; i < insn_count; ++i) {
       this->Run(insn + i);
     }
+    this->TlppSynchronization();
     return 0;
   }
 
  private:
-  void Run(const VTAGenericInsn* insn) {
+  static void Run_Insn(const VTAGenericInsn* insn, void * dev) {
+    Device * device = reinterpret_cast<Device *> (dev);
     const VTAMemInsn* mem = reinterpret_cast<const VTAMemInsn*>(insn);
     const VTAGemInsn* gem = reinterpret_cast<const VTAGemInsn*>(insn);
     const VTAAluInsn* alu = reinterpret_cast<const VTAAluInsn*>(insn);
     switch (mem->opcode) {
-      case VTA_OPCODE_LOAD: RunLoad(mem); break;
-      case VTA_OPCODE_STORE: RunStore(mem); break;
-      case VTA_OPCODE_GEMM: RunGEMM(gem); break;
-      case VTA_OPCODE_ALU: RunALU(alu); break;
-      case VTA_OPCODE_FINISH: ++finish_counter_; break;
+      case VTA_OPCODE_LOAD: device->RunLoad(mem); break;
+      case VTA_OPCODE_STORE: device->RunStore(mem); break;
+      case VTA_OPCODE_GEMM: device->RunGEMM(gem); break;
+      case VTA_OPCODE_ALU: device->RunALU(alu); break;
+      case VTA_OPCODE_FINISH: ++(device->finish_counter_); break;
       default: {
         LOG(FATAL) << "Unknown op_code" << mem->opcode;
       }
     }
   }
 
+ private:
+  void Run(const VTAGenericInsn* insn) {
+    ptlpp->TlppPushInsn(insn);
+  }
+
+  void TlppSynchronization(void) {
+    ptlpp->TlppSynchronization(Run_Insn, reinterpret_cast<void *> (this));
+  }
+
   void RunLoad(const VTAMemInsn* op) {
     if (op->x_size == 0) return;
     if (op->memory_type == VTA_MEM_ID_INP) {
@@ -466,6 +479,7 @@
   Profiler* prof_;
   // The DRAM interface
   DRAM* dram_;
+  TlppVerify* ptlpp;
   // The SRAM
   SRAM<VTA_INP_WIDTH, VTA_BATCH * VTA_BLOCK_IN, VTA_INP_BUFF_DEPTH> inp_;
   SRAM<VTA_WGT_WIDTH, VTA_BLOCK_IN * VTA_BLOCK_OUT, VTA_WGT_BUFF_DEPTH> wgt_;
diff --git a/src/sim/sim_tlpp.cc b/src/sim/sim_tlpp.cc
new file mode 100644
index 0000000..5a97b93
--- /dev/null
+++ b/src/sim/sim_tlpp.cc
@@ -0,0 +1,214 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *  Copyright (c) 2019 by Contributors
+ * \file sim_tlpp.cc
+ * \brief Simulate core level pipeline parallelism logic.
+ */
+#include <vta/sim_tlpp.h>
+TlppVerify::TlppVerify()  // Start idle: debug off, no device or callback bound.
+    : debug_(false), fsim_handle_(nullptr), run_fsim_function_(nullptr), done_(0) {
+}
+
+void TlppVerify::Clear() {
+  // Unbind the simulator callback and the device handle.
+  run_fsim_function_ = nullptr;
+  fsim_handle_ = nullptr;
+  // Drop every instruction still pending on any core.
+  for (Insn_q_t &pending : insnq_array_) {
+    while (!pending.empty()) pending.pop();
+  }
+  done_ = 0;
+}
+
+uint64_t TlppVerify::GetOperationCode(const VTAGenericInsn *insn) {
+  // Read the opcode field through the memory-instruction view of the word.
+  return reinterpret_cast<const VTAMemInsn*>(insn)->opcode;
+}
+
+CORE_TYPE TlppVerify::GetCoreType(uint64_t operation_code,
+                              const VTAGenericInsn *insn) {  // maps an insn to its core
+  CORE_TYPE core_type = COREGEMM;  // fallback for anything not routed below
+  const VTAMemInsn* mem = reinterpret_cast<const VTAMemInsn*>(insn);
+  switch (operation_code) {
+    case VTA_OPCODE_GEMM:
+    case VTA_OPCODE_ALU:
+      core_type = COREGEMM;
+      break;
+    case VTA_OPCODE_LOAD:
+      if (mem->memory_type == VTA_MEM_ID_INP||
+          mem->memory_type == VTA_MEM_ID_WGT) {
+        core_type = CORELOAD;  // input/weight loads run on the load core
+      }
+      break;  // any other load keeps the COREGEMM default
+    case VTA_OPCODE_STORE:
+      core_type = CORESTORE;
+      break;
+    default:
+      break;  // every other opcode keeps the COREGEMM default
+  }
+  return core_type;
+}
+
+bool TlppVerify::DependencyProcess(bool before_run,
+    bool pop_prev, bool pop_next,
+    bool push_prev, bool push_next,
+    Dep_q_t *pop_prev_q, Dep_q_t *pop_next_q,
+    Dep_q_t *push_prev_q, Dep_q_t *push_next_q,
+    CORE_TYPE push_to_prev_q_indx, CORE_TYPE push_to_next_q_indx) {
+
+  int val = 1;  // token payload; this file only ever tests queue sizes
+  if (before_run) {  // pre-run: every requested token must already be queued
+    if (pop_prev && pop_prev_q->size() == 0) {
+      return false;  // nothing consumed; the caller retries later
+    }
+    if (pop_next && pop_next_q->size() == 0) {
+      return false;  // nothing consumed; the caller retries later
+    }
+    if (pop_next) pop_next_q->pop();  // all requested tokens present: consume them
+    if (pop_prev) pop_prev_q->pop();
+  } else {  // post-run: publish tokens and record which core to wake up
+    if (push_prev) {
+      push_prev_q->push(val);
+      dep_push_event_.push(push_to_prev_q_indx);
+    }
+    if (push_next) {
+      push_next_q->push(val);
+      dep_push_event_.push(push_to_next_q_indx);
+    }
+  }
+  return true;
+}
+
+bool TlppVerify::InsnDependencyCheck(const VTAGenericInsn *insn,
+                                     bool before_run) {
+  const VTAMemInsn* mem = reinterpret_cast<const VTAMemInsn*>(insn);
+  bool pop_prev = mem->pop_prev_dep;  // dependency flags, read via the VTAMemInsn view
+  bool pop_next = mem->pop_next_dep;
+  bool push_prev = mem->push_prev_dep;
+  bool push_next = mem->push_next_dep;
+  CORE_TYPE core_type = GetCoreType(GetOperationCode(insn), insn);
+  bool bcheck = false;
+  switch (core_type) {
+    case COREGEMM:  // previous core is load, next core is store
+      bcheck = DependencyProcess(before_run, pop_prev,
+          pop_next, push_prev, push_next,
+          &l2g_q_, &s2g_q_, &g2l_q_, &g2s_q_, CORELOAD, CORESTORE);
+      break;
+    case CORELOAD:  // no previous core: only the "next" (gemm) queues are passed
+      bcheck = DependencyProcess(before_run, pop_prev,
+          pop_next, push_prev, push_next,
+          nullptr, &g2l_q_, nullptr, &l2g_q_, COREMAX, COREGEMM);
+      break;
+    case CORESTORE:  // no next core: only the "previous" (gemm) queues are passed
+      bcheck = DependencyProcess(before_run, pop_prev,
+          pop_next, push_prev, push_next,
+          &g2s_q_, nullptr, &s2g_q_, nullptr, COREGEMM, COREMAX);
+      break;
+    case COREMAX:  // not a real core; GetCoreType() never returns it
+      assert(0);
+      break;
+  }
+
+  return bcheck;
+}
+
+/*! Run core_type's queue in order until it is empty or its head is stalled. */
+void TlppVerify::CoreRun(CORE_TYPE core_type) {
+  const VTAGenericInsn *insn = PickFrontInsn(core_type);
+  while (insn) {
+    /*!
+     * Stall this core until every token the instruction pops is available.
+     */
+    if (!InsnDependencyCheck(insn, true)) {
+      break;
+    }
+    /*!
+     * Execute the instruction.
+     */
+    run_fsim_function_(insn, fsim_handle_);
+    /*!
+     * Push the tokens the instruction produces and queue wake-up events.
+     */
+    InsnDependencyCheck(insn, false);
+    /*!
+     * Latch the done flag once FINISH has run; a later instruction must
+     * not reset it, or the scheduling loop would never terminate.
+     */
+    if (GetOperationCode(insn) == VTA_OPCODE_FINISH) done_ = 1;
+
+    if (debug_) {
+      printf("this is thread for %s\n", GetCoreTypeName(core_type));
+    }
+    ConsumeFrontInsn(core_type);
+    insn = PickFrontInsn(core_type);
+  }
+}
+
+void TlppVerify::EventProcess(void) {  // drain wake-ups; CoreRun() may post more
+  while (!dep_push_event_.empty()) {
+    const CORE_TYPE woken = dep_push_event_.front();
+    // Pop before running so events posted by CoreRun() queue up behind.
+    dep_push_event_.pop();
+    CoreRun(woken);
+  }
+}
+
+void TlppVerify::TlppSynchronization(Run_Function run_function,
+                                         void *fsim_handle,
+                                         bool debug) {
+  fsim_handle_ = fsim_handle;  // bind the device and callback used by CoreRun()
+  run_fsim_function_ = run_function;
+  debug_ = debug;
+  done_ = 0;
+  do {
+    /*
+     * Start each sweep at a time-seeded random core, then visit all cores.
+     */
+    unsigned int seed = time(NULL);
+    uint8_t core_start = rand_r(&seed)%COREMAX;
+    for (int i = 0; i < COREMAX; i++) {
+      CoreRun(static_cast<CORE_TYPE>((core_start + i) % COREMAX));
+    }
+    EventProcess();  // let cores that received tokens during the sweep continue
+  }while (!done_);  // done_ is written by CoreRun() when it executes FINISH
+  Clear();  // drop leftover instructions and unbind the device/callback
+  return;
+}
+
+void TlppVerify::TlppPushInsn(const VTAGenericInsn *insn) {
+  // Route the instruction to the queue of the core that will run it. Only
+  // the pointer is stored, so insn must stay valid until it is executed.
+  const CORE_TYPE target = GetCoreType(GetOperationCode(insn), insn);
+  insnq_array_[target].push(static_cast<const void *>(insn));
+}
+
+const VTAGenericInsn *TlppVerify::PickFrontInsn(uint64_t core_type) {
+  // Peek only: the instruction stays queued until ConsumeFrontInsn().
+  // nullptr means this core has nothing pending.
+  const Insn_q_t &pending = insnq_array_[core_type];
+  if (pending.empty()) return nullptr;
+  return reinterpret_cast<const VTAGenericInsn *> (pending.front());
+}
+
+void TlppVerify::ConsumeFrontInsn(uint64_t core_type) {
+  Insn_q_t &pending = insnq_array_[core_type];
+  // An already-empty queue is left untouched rather than treated as an error.
+  if (!pending.empty()) pending.pop();
+}