| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| #include <dmlc/logging.h> |
| #include <gtest/gtest.h> |
| #include <tvm/driver/driver_api.h> |
| #include <tvm/runtime/registry.h> |
| #include <tvm/te/operation.h> |
| #include <tvm/topi/cuda/injective.h> |
| |
| #include <cmath> |
| #include <string> |
| |
| TEST(BuildModule, Basic) { |
| using namespace tvm; |
| using namespace tvm::te; |
| auto n = var("n"); |
| Array<PrimExpr> shape; |
| shape.push_back(n); |
| |
| auto A = placeholder(shape, DataType::Float(32), "A"); |
| auto B = placeholder(shape, DataType::Float(32), "B"); |
| |
| auto C = compute( |
| A->shape, [&A, &B](PrimExpr i) { return A[i] + B[i]; }, "C"); |
| |
| auto s = create_schedule({C->op}); |
| |
| auto cAxis = C->op.as<ComputeOpNode>()->axis; |
| |
| IterVar bx, tx; |
| s[C].split(cAxis[0], 64, &bx, &tx); |
| |
| auto args = Array<Tensor>({A, B, C}); |
| std::unordered_map<Tensor, Buffer> binds; |
| |
| auto target = Target("llvm"); |
| |
| auto lowered = lower(s, args, "func", binds); |
| auto module = build(lowered, target, Target()); |
| |
| auto mali_target = Target("opencl -model=Mali-T860MP4@800Mhz -device=mali"); |
| CHECK_EQ(mali_target->kind->name, "opencl"); |
| CHECK_EQ(mali_target->keys.size(), 3); |
| CHECK_EQ(mali_target->keys[0], "mali"); |
| CHECK_EQ(mali_target->keys[1], "opencl"); |
| CHECK_EQ(mali_target->keys[2], "gpu"); |
| CHECK_EQ(mali_target->GetAttr<String>("device").value(), "mali"); |
| CHECK_EQ(mali_target->GetAttr<String>("model").value(), "Mali-T860MP4@800Mhz"); |
| CHECK_EQ(mali_target->GetAttr<Integer>("max_num_threads").value(), 256); |
| } |
| |
| TEST(BuildModule, Heterogeneous) { |
| /* The testing network is like following, where the element-wise add and sub |
| * ops are allocated to GPU and CPU, respectively: |
| * |
| * A B |
| * \ / |
| * elemwise_add (gpu) |
| * \ |
| * copy C |
| * \ / |
| * elemwise_sub (cpu) |
| */ |
| |
| using namespace tvm; |
| using namespace tvm::te; |
| bool enabled = tvm::runtime::RuntimeEnabled("cuda"); |
| if (!enabled) { |
| LOG(INFO) << "Skip heterogeneous test because cuda is not enabled." |
| << "\n"; |
| return; |
| } |
| |
| auto target_llvm = Target("llvm"); |
| auto target_cuda = Target("cuda"); |
| |
| // The shape of input tensors. |
| const int n = 4; |
| Array<PrimExpr> shape{n}; |
| |
| auto A = placeholder(shape, DataType::Float(32), "A"); |
| auto B = placeholder(shape, DataType::Float(32), "B"); |
| auto C = placeholder(shape, DataType::Float(32), "C"); |
| |
| auto elemwise_add = compute( |
| A->shape, [&A, &B](PrimExpr i) { return A[i] + B[i]; }, "elemwise_add"); |
| |
| auto copy = placeholder(shape, DataType::Float(32), "__copy"); |
| auto elemwise_sub = compute( |
| C->shape, [©, &C](PrimExpr i) { return copy[i] - C[i]; }, "elemwise_sub"); |
| |
| With<Target> cuda_scope(target_cuda); |
| auto s1 = topi::cuda::schedule_injective(target_cuda, {elemwise_add}); |
| |
| With<Target> llvm_scope(target_llvm); |
| auto s2 = create_schedule({elemwise_sub->op}); |
| |
| auto args1 = Array<Tensor>({A, B, elemwise_add}); |
| auto args2 = Array<Tensor>({copy, C, elemwise_sub}); |
| |
| std::unordered_map<Tensor, Buffer> binds; |
| auto lowered_s1 = lower(s1, args1, "elemwise_add", binds); |
| auto lowered_s2 = lower(s2, args2, "elemwise_sub", binds); |
| Map<tvm::Target, IRModule> inputs = {{target_cuda, lowered_s1}, {target_llvm, lowered_s2}}; |
| auto module = build(inputs, Target()); |
| |
| // Assertion for build. |
| CHECK_EQ(module->imports().size(), 1); |
| |
| // Execute the graph and check the correctness. |
| // Setup graph json. |
| std::string json = |
| "{\"nodes\": [{\"op\": \"null\", \"name\": \"A\", \"inputs\": []}, " |
| "{\"op\": \"null\", \"name\": \"B\", \"inputs\": []}, {\"op\": " |
| "\"tvm_op\", \"name\": \"elemwise_add\", \"attrs\": {\"flatten_data\": " |
| "\"1\", \"func_name\": \"elemwise_add\", \"num_inputs\": \"2\", " |
| "\"num_outputs\": \"1\"}, \"inputs\": [[0, 0, 0], [1, 0, 0]]}, {\"op\": " |
| "\"tvm_op\", \"name\": \"__copy_add_to_sub\", \"attrs\": " |
| "{\"flatten_data\": \"0\", \"func_name\": \"__copy\", \"num_inputs\": " |
| "\"1\", \"num_outputs\": \"1\"}, \"inputs\": [[2, 0, 0]]}, {\"op\": " |
| "\"null\", \"name\": \"C\", \"inputs\": []}, {\"op\": \"tvm_op\", " |
| "\"name\": \"elemwise_sub\", \"attrs\": {\"flatten_data\": \"0\", " |
| "\"func_name\": \"elemwise_sub\", \"num_inputs\": \"2\", " |
| "\"num_outputs\": \"1\"}, \"inputs\": [[3, 0, 0], [4, 0, 0]]}], " |
| "\"arg_nodes\": [0, 1, 4], \"node_row_ptr\": [0, 1, 2, 3, 4, 5, 6], " |
| "\"heads\": [[5, 0, 0]], \"attrs\": {\"storage_id\": [\"list_int\", [3, " |
| "4, 0, 1, 5, 2]], \"shape\": [\"list_shape\", [[4], [4], [4], [4], [4], " |
| "[4]]], \"device_index\": [\"list_int\", [2, 2, 2, 1, 1, 1]], \"dtype\": " |
| "[\"list_int\", [0, 0, 0, 0, 0, 0]], \"dltype\": [\"list_str\", " |
| "[\"float32\", \"float32\", \"float32\", \"float32\", \"float32\", " |
| "\"float32\"]]}}"; |
| |
| // Setup inputs. |
| auto a_val = runtime::NDArray::Empty({n}, {kDLFloat, 32, 1}, {kDLCPU, 0}); |
| auto b_val = runtime::NDArray::Empty({n}, {kDLFloat, 32, 1}, {kDLCPU, 0}); |
| auto c_val = runtime::NDArray::Empty({n}, {kDLFloat, 32, 1}, {kDLCPU, 0}); |
| |
| auto pa = (float*)(a_val->data); |
| auto pb = (float*)(b_val->data); |
| auto pc = (float*)(c_val->data); |
| |
| // Assign values. |
| for (int i = 0; i < n; i++) { |
| pa[i] = i; |
| pb[i] = i + 1.0; |
| pc[i] = i - 1.0; |
| } |
| |
| // Initialize graph runtime. |
| int cpu_dev_ty = static_cast<int>(kDLCPU); |
| int cpu_dev_id = 0; |
| int gpu_dev_ty = static_cast<int>(kDLGPU); |
| int gpu_dev_id = 0; |
| |
| const runtime::PackedFunc* graph_runtime = |
| tvm::runtime::Registry::Get("tvm.graph_runtime.create"); |
| runtime::Module mod = |
| (*graph_runtime)(json, module, cpu_dev_ty, cpu_dev_id, gpu_dev_ty, gpu_dev_id); |
| |
| // test FFI for module. |
| auto test_ffi = PackedFunc([](TVMArgs args, TVMRetValue* rv) { |
| int tcode = args[1]; |
| CHECK_EQ(args[0].type_code(), tcode); |
| }); |
| |
| test_ffi(runtime::Module(mod), static_cast<int>(kTVMModuleHandle)); |
| test_ffi(Optional<runtime::Module>(mod), static_cast<int>(kTVMModuleHandle)); |
| |
| PackedFunc set_input = mod.GetFunction("set_input", false); |
| PackedFunc run = mod.GetFunction("run", false); |
| PackedFunc get_output = mod.GetFunction("get_output", false); |
| set_input("A", a_val); |
| set_input("B", b_val); |
| set_input("C", c_val); |
| |
| run(); |
| tvm::runtime::NDArray out = get_output(0); |
| float* p_out = (float*)out->data; |
| |
| // Check correctness. |
| for (int i = 0; i < n; ++i) { |
| CHECK_LT(std::fabs(p_out[i] - (i + (i + 1.0) - (i - 1.0))), 1e-5); |
| } |
| } |
| |
| int main(int argc, char** argv) { |
| testing::InitGoogleTest(&argc, argv); |
| testing::FLAGS_gtest_death_test_style = "threadsafe"; |
| return RUN_ALL_TESTS(); |
| } |