diff --git a/apps/verilator/README.md b/apps/verilator/README.md
index f4b0f69..34da6a0 100644
--- a/apps/verilator/README.md
+++ b/apps/verilator/README.md
@@ -29,6 +29,6 @@
 ## Build
 
 1. Build Verilator hardware library by running `make`
-2. Enable Verilator backend by setting `USE_VERILATOR_HW ON` in TVM cmake configuration file (`config.cmake`)
+2. Enable the Verilator backend by setting `USE_VERILATOR ON` in the TVM CMake configuration file (`config.cmake`)
 3. Build and install TVM
 
diff --git a/apps/verilator/scalar_add/Makefile b/apps/verilator/add/Makefile
similarity index 87%
rename from apps/verilator/scalar_add/Makefile
rename to apps/verilator/add/Makefile
index 05513f6..ee8f065 100644
--- a/apps/verilator/scalar_add/Makefile
+++ b/apps/verilator/add/Makefile
@@ -18,20 +18,24 @@
 VERILATOR_BIN := $(shell which verilator)
 VERILATOR_INC_DIR := $(abspath $(dir $(VERILATOR_BIN))/../share/verilator/include)
 TOP_NAME = "Top"
+LIB_NAME = libverilator
 VERILOG_DIR = $(abspath .)/verilog
 SRC_DIR = $(abspath .)/src
-ROOT_DIR = $(abspath ..)
+ROOT_DIR = $(abspath .)
 TVM_DIR = $(abspath ../../../../..)
 OUT_DIR = $(abspath .)/out
+LANES = 1
+LIB_PATH = $(ROOT_DIR)/$(LIB_NAME).so
 
-default: $(ROOT_DIR)/libverilator.so
+default: $(LIB_PATH)
 
-$(ROOT_DIR)/libverilator.so: $(OUT_DIR)/$(TOP_NAME).cpp
+$(LIB_PATH): $(OUT_DIR)/$(TOP_NAME).cpp
 	g++ \
 	-std=c++14 \
 	-O2 \
 	-shared \
 	-fPIC \
+	-DLANES=$(LANES) \
 	-I$(TVM_DIR)/src/runtime/contrib/verilator \
 	-I$(TVM_DIR)/include \
 	-I$(TVM_DIR)/3rdparty/dlpack/include \
@@ -49,6 +53,7 @@
 	-Wno-STMTDLY \
 	-Wno-WIDTH \
 	-Wno-UNOPTFLAT \
+	-DLANES=$(LANES) \
 	--cc \
 	--prefix $(TOP_NAME) \
 	--top-module "driver" \
@@ -56,4 +61,4 @@
 	$^
 
 clean:
-	-rm -rf $(OUT_DIR)
+	-rm -rf $(OUT_DIR) *.so
diff --git a/apps/verilator/scalar_add/src/driver.cc b/apps/verilator/add/src/driver.cc
similarity index 83%
rename from apps/verilator/scalar_add/src/driver.cc
rename to apps/verilator/add/src/driver.cc
index 8d2c0f1..49c2728 100644
--- a/apps/verilator/scalar_add/src/driver.cc
+++ b/apps/verilator/add/src/driver.cc
@@ -28,14 +28,14 @@
 namespace contrib {
 
 extern "C" VerilatorHandle VerilatorAlloc() {
-  Top *top = new Top;
+  Top* top = new Top;
   return static_cast<VerilatorHandle>(top);
 }
 
-extern "C" void VerilatorDealloc(VerilatorHandle handle) { delete static_cast<Top *>(handle); }
+extern "C" void VerilatorDealloc(VerilatorHandle handle) { delete static_cast<Top*>(handle); }
 
 extern "C" int VerilatorRead(VerilatorHandle handle, int id, int addr) {
-  Top *top = static_cast<Top *>(handle);
+  Top* top = static_cast<Top*>(handle);
   top->opcode = 2;
   top->id = id;
   top->addr = addr;
@@ -44,7 +44,7 @@
 }
 
 extern "C" void VerilatorWrite(VerilatorHandle handle, int id, int addr, int value) {
-  Top *top = static_cast<Top *>(handle);
+  Top* top = static_cast<Top*>(handle);
   top->opcode = 1;
   top->id = id;
   top->addr = addr;
@@ -53,12 +53,12 @@
 }
 
 extern "C" void VerilatorReset(VerilatorHandle handle, int n) {
-  Top *top = static_cast<Top *>(handle);
+  Top* top = static_cast<Top*>(handle);
+  top->opcode = 0;
   top->clock = 0;
   top->reset = 1;
   main_time = 0;
-  while (!Verilated::gotFinish() &&
-         main_time < static_cast<vluint64_t>(n * 10)) {
+  while (!Verilated::gotFinish() && main_time < static_cast<vluint64_t>(n * 10)) {
     if ((main_time % 10) == 1) {
       top->clock = 1;
     }
@@ -72,11 +72,11 @@
 }
 
 extern "C" void VerilatorRun(VerilatorHandle handle, int n) {
-  Top *top = static_cast<Top *>(handle);
+  Top* top = static_cast<Top*>(handle);
+  top->opcode = 0;
   top->clock = 0;
   main_time = 0;
-  while (!Verilated::gotFinish() &&
-         main_time < static_cast<vluint64_t>(n * 10)) {
+  while (!Verilated::gotFinish() && main_time < static_cast<vluint64_t>(n * 10)) {
     if ((main_time % 10) == 1) {
       top->clock = 1;
     }
diff --git a/apps/verilator/add/src/kernel.cc b/apps/verilator/add/src/kernel.cc
new file mode 100644
index 0000000..c3b018d
--- /dev/null
+++ b/apps/verilator/add/src/kernel.cc
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <stdint.h>
+#include <iostream>
+
+#include "verilator_device.h"
+#include "verilator_kernel.h"
+
+#ifndef LANES
+#define LANES 1
+#endif
+
+namespace tvm {
+namespace runtime {
+namespace contrib {
+
+extern "C" void verilator_add(VerilatorHandle handle, int* left, int* right, int* out, int p_h_,
+                              int p_w_) {
+  // Walk all p_h_ * p_w_ elements LANES at a time; a short final group is masked, not dropped.
+  const int64_t total = static_cast<int64_t>(p_h_) * p_w_;
+  for (int64_t base = 0; base < total; base += LANES) {
+    for (int64_t j = 0; j < LANES && base + j < total; ++j) {
+      VerilatorWrite(handle, 1, j, left[base + j]);   // id 1: first operand, lane j
+      VerilatorWrite(handle, 2, j, right[base + j]);  // id 2: second operand, lane j
+    }
+    VerilatorRun(handle, 1);  // run the model (n = 1) before the results are read back
+    for (int64_t j = 0; j < LANES && base + j < total; ++j) {
+      out[base + j] = VerilatorRead(handle, 3, j);  // id 3: result, lane j
+    }
+  }
+}
+
+extern "C" void verilator_bias_add(VerilatorHandle handle, int* data, int* bias, int* out, int p_n_,
+                                   int p_c_, int p_h_, int p_w_) {
+  // data/out are indexed as position * p_c_ + channel, bias as channel. Channels are fed to the
+  // model LANES at a time; a channel count that is not a LANES multiple needs one extra group.
+  const int64_t groups = p_c_ / LANES + ((p_c_ % LANES != 0) ? 1 : 0);
+  const int64_t positions = p_n_ * p_h_ * p_w_;
+  for (int64_t pos = 0; pos < positions; ++pos) {
+    const int64_t offset = pos * p_c_;  // index of this position's first channel in data/out
+    for (int64_t group = 0; group < groups; ++group) {
+      const int64_t first = group * LANES;  // first channel covered by this group
+      for (int64_t lane = 0; lane < LANES; ++lane) {
+        const int64_t channel = first + lane;
+        if (channel >= p_c_) continue;  // lanes past the last channel are not written
+        VerilatorWrite(handle, 1, lane, data[offset + channel]);
+        VerilatorWrite(handle, 2, lane, bias[channel]);
+      }
+      VerilatorRun(handle, 1);
+      // Copy back only the lanes that carried a real channel.
+      for (int64_t lane = 0; lane < LANES; ++lane) {
+        const int64_t channel = first + lane;
+        if (channel < p_c_) {
+          out[offset + channel] = VerilatorRead(handle, 3, lane);
+        }
+      }
+    }
+  }
+}
+
+}  // namespace contrib
+}  // namespace runtime
+}  // namespace tvm
diff --git a/apps/verilator/scalar_add/verilog/adder.v b/apps/verilator/add/verilog/add.v
similarity index 61%
rename from apps/verilator/scalar_add/verilog/adder.v
rename to apps/verilator/add/verilog/add.v
index 6497e78..a896074 100644
--- a/apps/verilator/scalar_add/verilog/adder.v
+++ b/apps/verilator/add/verilog/add.v
@@ -17,21 +17,36 @@
  * under the License.
  */
 
-module scalar_add(input clock, input reset);
+module add #(parameter LANES = 1)(input clock, input reset);  // LANES: number of 32-bit lanes
 
-    reg [32-1:0] ra;
-    reg [32-1:0] rb;
-    reg [32-1:0] ry;
+    reg [32-1:0] cc;  // counter: cleared while reset is high, otherwise +1 per rising clock edge
+    reg [32*LANES-1:0] ra;  // LANES packed 32-bit operands; lane i is bits [32*i +: 32]
+    reg [32*LANES-1:0] rb;  // second operand vector, same packing as ra
+    reg [32*LANES-1:0] ry;  // per-lane sums ra + rb, same packing as ra
 
     always @(posedge clock) begin
         if (reset) begin
-            ra <= 0;
-            rb <= 0;
-            ry <= 0;
+            cc <= 0;
         end
         else begin
-            ry <= ra + rb;
+            cc <= cc + 1'b1;
         end
     end
 
+    genvar i;
+    for (i = 0; i < LANES; i++) begin  // generate loop: one registered 32-bit adder per lane
+
+        always @(posedge clock) begin
+            if (reset) begin
+                ra[32*i+:32] <= 0;
+                rb[32*i+:32] <= 0;
+                ry[32*i+:32] <= 0;
+            end
+            else begin
+                ry[32*i+:32] <= ra[32*i+:32] + rb[32*i+:32];  // ra/rb are only cleared in this module
+            end
+        end
+
+    end
+
 endmodule
diff --git a/apps/verilator/add/verilog/driver.v b/apps/verilator/add/verilog/driver.v
new file mode 100644
index 0000000..6b700d7
--- /dev/null
+++ b/apps/verilator/add/verilog/driver.v
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+`ifndef LANES
+`define LANES 1
+`endif
+
+// Host-access wrapper around the `add` DUT. A combinational decoder maps (opcode, id, addr)
+// onto hierarchical reads and writes of the DUT registers cc, ra, rb and ry:
+//   opcode 1 stores `in` into the selected register slice, opcode 2 drives `out` with it,
+//   and any other opcode leaves the registers alone and drives out = 32'hdeadbeef.
+module driver (
+    input  logic          clock,
+    input  logic          reset,
+    input  logic [32-1:0] opcode,  // 1: write register, 2: read register, otherwise idle
+    input  logic [32-1:0] id,      // register select: 0 = cc, 1 = ra, 2 = rb, 3 = ry
+    input  logic [32-1:0] in,      // value stored by a write
+    input  logic [32-1:0] addr,    // index of the 32-bit slice within the selected register
+    output logic [32-1:0] out      // read result; 32'hdeadbeef when no read is selected
+);
+
+    // Store `value` into 32-bit slice `addr` of dut.cc.
+    // NOTE(review): cc is 32 bits wide in add.v, so only addr 0 is in range - confirm callers.
+    function void write_cc;
+        input int value;
+        input int addr;
+        driver.dut.cc[32*addr+:32] = value;
+    endfunction
+
+    // Return 32-bit slice `addr` of dut.cc.
+    function int read_cc;
+        input int addr;
+        return driver.dut.cc[32*addr+:32];
+    endfunction
+
+    // Store `value` into 32-bit slice `addr` of dut.ra.
+    function void write_reg_a;
+        input int value;
+        input int addr;
+        driver.dut.ra[32*addr+:32] = value;
+    endfunction
+
+    // Return 32-bit slice `addr` of dut.ra.
+    function int read_reg_a;
+        input int addr;
+        return driver.dut.ra[32*addr+:32];
+    endfunction
+
+    // Store `value` into 32-bit slice `addr` of dut.rb.
+    function void write_reg_b;
+        input int value;
+        input int addr;
+        driver.dut.rb[32*addr+:32] = value;
+    endfunction
+
+    // Return 32-bit slice `addr` of dut.rb.
+    function int read_reg_b;
+        input int addr;
+        return driver.dut.rb[32*addr+:32];
+    endfunction
+
+    // Store `value` into 32-bit slice `addr` of dut.ry.
+    function void write_reg_y;
+        input int value;
+        input int addr;
+        driver.dut.ry[32*addr+:32] = value;
+    endfunction
+
+    // Return 32-bit slice `addr` of dut.ry.
+    function int read_reg_y;
+        input int addr;
+        return driver.dut.ry[32*addr+:32];
+    endfunction
+
+    // Command decoder. `out` gets its default first so that every path drives it; without it,
+    // writes and unmatched id/opcode values leave `out` undriven and a latch is inferred.
+    always_comb begin
+        out = 32'hdeadbeef;
+        case(opcode)
+            32'd1 : begin  // write `in` to the selected register slice
+                case(id)
+                    32'd0 : write_cc(in, addr);
+                    32'd1 : write_reg_a(in, addr);
+                    32'd2 : write_reg_b(in, addr);
+                    32'd3 : write_reg_y(in, addr);
+                endcase
+            end
+            32'd2 : begin  // read the selected register slice into `out`
+                case(id)
+                    32'd0 : out = read_cc(addr);
+                    32'd1 : out = read_reg_a(addr);
+                    32'd2 : out = read_reg_b(addr);
+                    32'd3 : out = read_reg_y(addr);
+                endcase
+            end
+        endcase
+    end
+
+    // Device under test; its lane count comes from the `LANES define (1 unless overridden).
+    add #(.LANES(`LANES)) dut (.clock(clock), .reset(reset));
+
+endmodule
diff --git a/apps/verilator/scalar_add/src/kernel.cc b/apps/verilator/scalar_add/src/kernel.cc
deleted file mode 100644
index e4ed9dd..0000000
--- a/apps/verilator/scalar_add/src/kernel.cc
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#include <stdint.h>
-
-#include "verilator_device.h"
-#include "verilator_kernel.h"
-
-namespace tvm {
-namespace runtime {
-namespace contrib {
-
-extern "C" void verilator_add(VerilatorHandle handle, int *data, int *weight, int *out, int p_h_, int p_w_) {
-  for (int64_t i = 0; i < p_h_; ++i) {
-    for (int64_t j = 0; j < p_w_; ++j) {
-      int64_t k = i * p_w_ + j;
-      VerilatorWrite(handle, 0, 0, data[k]);
-      VerilatorWrite(handle, 1, 0, weight[k]);
-      VerilatorRun(handle, 1);
-      out[k] = VerilatorRead(handle, 2, 0);
-    }
-  }
-}
-
-}  // namespace contrib
-}  // namespace runtime
-}  // namespace tvm
diff --git a/apps/verilator/scalar_add/verilog/driver.v b/apps/verilator/scalar_add/verilog/driver.v
deleted file mode 100644
index c3674d3..0000000
--- a/apps/verilator/scalar_add/verilog/driver.v
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-module driver (
-    input  logic          clock,
-    input  logic          reset,
-    input  logic [32-1:0] opcode,
-    input  logic [32-1:0] id,
-    input  logic [32-1:0] in,
-    input  logic [32-1:0] addr,
-    output logic [32-1:0] out
-);
-
-    function void write_reg_a;
-        input int value;
-        input int addr;
-        logic [31:0] tmp;
-        begin
-            tmp[0+:32] = 0;
-            tmp[0+:32] = driver.dut.ra;
-            tmp[addr*32+:32] = value;
-            driver.dut.ra = tmp[0+:32];
-        end
-    endfunction
-
-    function int read_reg_a;
-        input int addr;
-        logic [32-1:0] tmp;
-        begin
-            tmp[0+:32] = 0;
-            tmp[0+:32] = driver.dut.ra;
-            return tmp[addr*32+:32];
-        end
-    endfunction
-
-    function void write_reg_b;
-        input int value;
-        input int addr;
-        logic [31:0] tmp;
-        begin
-            tmp[0+:32] = 0;
-            tmp[0+:32] = driver.dut.rb;
-            tmp[addr*32+:32] = value;
-            driver.dut.rb = tmp[0+:32];
-        end
-    endfunction
-
-    function int read_reg_b;
-        input int addr;
-        logic [32-1:0] tmp;
-        begin
-            tmp[0+:32] = 0;
-            tmp[0+:32] = driver.dut.rb;
-            return tmp[addr*32+:32];
-        end
-    endfunction
-
-    function void write_reg_y;
-        input int value;
-        input int addr;
-        logic [31:0] tmp;
-        begin
-            tmp[0+:32] = 0;
-            tmp[0+:32] = driver.dut.ry;
-            tmp[addr*32+:32] = value;
-            driver.dut.ry = tmp[0+:32];
-        end
-    endfunction
-
-    function int read_reg_y;
-        input int addr;
-        logic [32-1:0] tmp;
-        begin
-            tmp[0+:32] = 0;
-            tmp[0+:32] = driver.dut.ry;
-            return tmp[addr*32+:32];
-        end
-    endfunction
-
-    always_comb begin
-        case(opcode)
-            32'd0 : out = 32'hdeadbeef;
-            32'd1 : begin
-                case(id)
-                    32'd0 : write_reg_a(in, addr);
-                    32'd1 : write_reg_b(in, addr);
-                    32'd2 : write_reg_y(in, addr);
-                    default : $error("invalid id");
-                endcase
-            end
-            32'd2 : begin
-                case(id)
-                    32'd0 : out = read_reg_a(addr);
-                    32'd1 : out = read_reg_b(addr);
-                    32'd2 : out = read_reg_y(addr);
-                    default : $error("invalid id");
-                endcase
-            end
-            default : $error("invalid opcode");
-        endcase
-    end
-
-    scalar_add dut (.clock(clock), .reset(reset));
-
-endmodule
