/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \file ndarray.cc
* \brief NDArray container infrastructure.
*/
#include <tvm/runtime/c_runtime_api.h>
#include <tvm/runtime/device_api.h>
#include <tvm/runtime/logging.h>
#include <tvm/runtime/ndarray.h>
#include <tvm/runtime/registry.h>
#include "runtime_base.h"
extern "C" {
// DLPack deleter exposed with C linkage.
static void TVMNDArrayDLPackDeleter(DLManagedTensor* tensor);
// helper function to get NDArray's type index, only used by ctypes.
TVM_DLL int TVMArrayGetTypeIndex(TVMArrayHandle handle, unsigned* out_tindex);
}
namespace tvm {
namespace runtime {
inline void VerifyDataType(DLDataType dtype) {
ICHECK_GE(dtype.lanes, 1);
if (dtype.code == kDLFloat) {
ICHECK_EQ(dtype.bits % 8, 0);
} else {
// allow uint1 as a special flag for bool.
if (dtype.bits == 1 && dtype.code == kDLUInt) return;
// allow sub-byte int1/uint4/int4.
if (dtype.bits == 1 && dtype.code == kDLInt) return;
if (dtype.bits == 4 && dtype.code == kDLUInt) return;
if (dtype.bits == 4 && dtype.code == kDLInt) return;
ICHECK_EQ(dtype.bits % 8, 0);
}
ICHECK_EQ(dtype.bits & (dtype.bits - 1), 0);
}
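// Usage sketch (illustrative only, not called here): VerifyDataType accepts the
// usual byte-aligned, power-of-two widths plus the sub-byte special cases above.
//   VerifyDataType(DLDataType{kDLFloat, 32, 1});  // ok: float32
//   VerifyDataType(DLDataType{kDLUInt, 1, 1});    // ok: uint1, used as a bool flag
//   VerifyDataType(DLDataType{kDLInt, 4, 1});     // ok: int4
//   VerifyDataType(DLDataType{kDLFloat, 20, 1});  // fails: not a whole number of bytes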
void ArrayCopyFromBytes(DLTensor* handle, const void* data, size_t nbytes) {
size_t arr_size = GetDataSize(*handle);
ICHECK_EQ(arr_size, nbytes) << "ArrayCopyFromBytes: size mismatch";
ICHECK(IsContiguous(*handle)) << "ArrayCopyFromBytes only support contiguous array for now";
DLTensor from;
from.data = const_cast<void*>(data);
from.device = Device{kDLCPU, 0};
from.ndim = handle->ndim;
from.dtype = handle->dtype;
from.shape = handle->shape;
from.strides = nullptr;
from.byte_offset = 0;
DeviceAPI::Get(handle->device)->CopyDataFromTo(&from, handle, nullptr);
// Synchronize in case the data becomes unavailable later.
DeviceAPI::Get(handle->device)->StreamSync(handle->device, nullptr);
}
void ArrayCopyToBytes(const DLTensor* handle, void* data, size_t nbytes) {
size_t arr_size = GetDataSize(*handle);
ICHECK_EQ(arr_size, nbytes) << "ArrayCopyToBytes: size mismatch";
ICHECK(IsContiguous(*handle)) << "ArrayCopyToBytes only support contiguous array for now";
DLTensor to;
to.data = data;
to.device = Device{kDLCPU, 0};
to.ndim = handle->ndim;
to.dtype = handle->dtype;
to.shape = handle->shape;
to.strides = nullptr;
to.byte_offset = 0;
DeviceAPI::Get(handle->device)->CopyDataFromTo(const_cast<DLTensor*>(handle), &to, nullptr);
// Synchronize in case the data becomes unavailable later.
DeviceAPI::Get(handle->device)->StreamSync(handle->device, nullptr);
}
struct NDArray::Internal {
// Default deleter for the container
static void DefaultDeleter(Object* ptr_obj) {
auto* ptr = static_cast<NDArray::Container*>(ptr_obj);
if (ptr->manager_ctx != nullptr) {
static_cast<NDArray::Container*>(ptr->manager_ctx)->DecRef();
} else if (ptr->dl_tensor.data != nullptr) {
tvm::runtime::DeviceAPI::Get(ptr->dl_tensor.device)
->FreeDataSpace(ptr->dl_tensor.device, ptr->dl_tensor.data);
}
delete ptr;
}
// Deleter for NDArray converted from DLPack.
// This is used for data passed in from an external DLPack (DLManagedTensor)
// that was not allocated inside TVM.
// This enables us to create an NDArray from memory allocated by other
// DLPack-compatible frameworks.
static void DLPackDeleter(Object* ptr_obj) {
auto* ptr = static_cast<NDArray::Container*>(ptr_obj);
DLManagedTensor* tensor = static_cast<DLManagedTensor*>(ptr->manager_ctx);
if (tensor->deleter != nullptr) {
(*tensor->deleter)(tensor);
}
delete ptr;
}
// Deleter for NDArray backed by an external DLTensor.
// The memory is allocated outside of TVM, and responsibility for
// freeing it is assumed to lie outside as well.
static void SelfDeleter(Object* ptr_obj) {
auto* ptr = static_cast<NDArray::Container*>(ptr_obj);
delete ptr;
}
// Local create function which allocates tensor metadata
// but does not allocate space for the data.
static NDArray Create(ShapeTuple shape, DLDataType dtype, Device dev) {
VerifyDataType(dtype);
// critical zone: construct header
NDArray::Container* data = new NDArray::Container();
data->SetDeleter(DefaultDeleter);
// RAII now in effect
NDArray ret(GetObjectPtr<Object>(data));
// setup shape
data->shape_ = std::move(shape);
data->dl_tensor.shape = const_cast<ShapeTuple::index_type*>(data->shape_.data());
data->dl_tensor.ndim = static_cast<int>(data->shape_.size());
// setup dtype
data->dl_tensor.dtype = dtype;
// setup device
data->dl_tensor.device = dev;
return ret;
}
// Implementation of API function
static DLTensor* MoveToFFIHandle(NDArray arr) {
DLTensor* handle = NDArray::FFIGetHandle(arr);
ObjectRef::FFIClearAfterMove(&arr);
return handle;
}
static void FFIDecRef(TVMArrayHandle tensor) { NDArray::FFIDecRef(tensor); }
// Container to DLManagedTensor
static DLManagedTensor* ToDLPack(TVMArrayHandle handle) {
auto* from =
static_cast<NDArray::Container*>(reinterpret_cast<NDArray::ContainerBase*>(handle));
return ToDLPack(from);
}
static DLManagedTensor* ToDLPack(NDArray::Container* from) {
ICHECK(from != nullptr);
DLManagedTensor* ret = new DLManagedTensor();
ret->dl_tensor = from->dl_tensor;
ret->manager_ctx = from;
from->IncRef();
ret->deleter = TVMNDArrayDLPackDeleter;
return ret;
}
// Deleter for the DLManagedTensor created by ToDLPack.
static void NDArrayDLPackDeleter(DLManagedTensor* tensor) {
static_cast<NDArray::Container*>(tensor->manager_ctx)->DecRef();
delete tensor;
}
};
NDArray NDArray::CreateView(ShapeTuple shape, DLDataType dtype) {
ICHECK(data_ != nullptr);
ICHECK(get_mutable()->dl_tensor.strides == nullptr) << "Can only create view for compact tensor";
NDArray ret = Internal::Create(shape, dtype, get_mutable()->dl_tensor.device);
ret.get_mutable()->dl_tensor.byte_offset = this->get_mutable()->dl_tensor.byte_offset;
size_t curr_size = GetDataSize(this->get_mutable()->dl_tensor);
size_t view_size = GetDataSize(ret.get_mutable()->dl_tensor);
ICHECK_LE(view_size, curr_size)
<< "Tries to create a view that has bigger memory than current one";
// increase ref count
get_mutable()->IncRef();
ret.get_mutable()->manager_ctx = get_mutable();
ret.get_mutable()->dl_tensor.data = get_mutable()->dl_tensor.data;
return ret;
}
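// Usage sketch for CreateView (illustrative, not part of the implementation): a
// view shares the underlying buffer with the source array, so it must not need
// more bytes than the source provides.
//   NDArray arr = NDArray::Empty({4, 4}, DLDataType{kDLFloat, 32, 1}, Device{kDLCPU, 0});
//   NDArray flat = arr.CreateView({16}, arr->dtype);                // same storage, new shape
//   NDArray raw = arr.CreateView({64}, DLDataType{kDLUInt, 8, 1});  // reinterpret as bytes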
DLManagedTensor* NDArray::ToDLPack() const { return Internal::ToDLPack(get_mutable()); }
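// Usage sketch for ToDLPack (illustrative; `arr` is an existing NDArray): the
// returned DLManagedTensor keeps a reference to the NDArray and must eventually
// be released through its deleter.
//   DLManagedTensor* managed = arr.ToDLPack();
//   // ... hand `managed` to any DLPack-compatible consumer ...
//   (*managed->deleter)(managed);  // release once the consumer is done with it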
NDArray NDArray::Empty(ShapeTuple shape, DLDataType dtype, Device dev, Optional<String> mem_scope) {
NDArray ret = Internal::Create(shape, dtype, dev);
ret.get_mutable()->dl_tensor.data =
DeviceAPI::Get(ret->device)
->AllocDataSpace(ret->device, shape.size(), shape.data(), ret->dtype, mem_scope);
return ret;
}
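// Usage sketch for Empty (illustrative): allocate an uninitialized 2x3 float32
// array on the CPU and fill it from host memory.
//   NDArray arr = NDArray::Empty({2, 3}, DLDataType{kDLFloat, 32, 1}, Device{kDLCPU, 0});
//   std::vector<float> src(6, 1.0f);
//   arr.CopyFromBytes(src.data(), src.size() * sizeof(float));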
NDArray NDArray::FromExternalDLTensor(const DLTensor& dl_tensor) {
ICHECK(::tvm::runtime::IsContiguous(dl_tensor)) << "External DLTensor must be contiguous.";
ICHECK(IsAligned(dl_tensor)) << "Data in DLTensor is not aligned as required by NDArray";
NDArray::Container* data = new NDArray::Container();
data->SetDeleter(Internal::SelfDeleter);
data->dl_tensor = dl_tensor;
std::vector<ShapeTuple::index_type> shape;
shape.resize(data->dl_tensor.ndim);
shape.assign(data->dl_tensor.shape, data->dl_tensor.shape + data->dl_tensor.ndim);
data->shape_ = ShapeTuple(shape);
data->dl_tensor.shape = const_cast<ShapeTuple::index_type*>(data->shape_.data());
return NDArray(GetObjectPtr<Object>(data));
}
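// Usage sketch for FromExternalDLTensor (illustrative): wrap caller-owned memory
// without copying. The caller keeps ownership and must keep the buffer alive,
// and suitably aligned, for the lifetime of the returned NDArray.
//   alignas(tvm::runtime::kAllocAlignment) float buf[8] = {0};
//   int64_t shape[1] = {8};
//   DLTensor dl{buf, Device{kDLCPU, 0}, 1, DLDataType{kDLFloat, 32, 1}, shape, nullptr, 0};
//   NDArray view = NDArray::FromExternalDLTensor(dl);  // zero-copy wrapper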
NDArray NDArray::NewFromDLTensor(DLTensor* tensor, const Device& dev) {
ICHECK(::tvm::runtime::IsContiguous(*tensor))
<< "DLTensor is not contiguous. Copying from non-contiguous data is currently not supported";
std::vector<int64_t> shape;
for (int64_t i = 0; i < tensor->ndim; i++) {
shape.push_back(tensor->shape[i]);
}
NDArray ary = NDArray::Empty(shape, tensor->dtype, dev);
ary.CopyFrom(tensor);
return ary;
}
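// Usage sketch for NewFromDLTensor (illustrative): unlike FromExternalDLTensor,
// this copies the data, so the result does not alias the caller's buffer and may
// live on a different device.
//   float host[4] = {1.f, 2.f, 3.f, 4.f};
//   int64_t shape[1] = {4};
//   DLTensor src{host, Device{kDLCPU, 0}, 1, DLDataType{kDLFloat, 32, 1}, shape, nullptr, 0};
//   NDArray copy = NDArray::NewFromDLTensor(&src, Device{kDLCPU, 0});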
NDArray NDArray::FromDLPack(DLManagedTensor* tensor) {
NDArray::Container* data = new NDArray::Container();
// construct header
data->SetDeleter(Internal::DLPackDeleter);
// fill up content.
data->manager_ctx = tensor;
ICHECK(::tvm::runtime::IsContiguous(tensor->dl_tensor)) << "DLManagedTensor must be contiguous.";
ICHECK(IsAligned(tensor->dl_tensor))
<< "Data in DLManagedTensor is not aligned as required by NDArray";
data->dl_tensor = tensor->dl_tensor;
// update shape_
std::vector<ShapeTuple::index_type> shape;
shape.resize(data->dl_tensor.ndim);
shape.assign(data->dl_tensor.shape, data->dl_tensor.shape + data->dl_tensor.ndim);
data->shape_ = ShapeTuple(shape);
data->dl_tensor.shape = const_cast<ShapeTuple::index_type*>(data->shape_.data());
return NDArray(GetObjectPtr<Object>(data));
}
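// Usage sketch for FromDLPack (illustrative): take ownership of a DLManagedTensor
// produced by another framework. Its deleter is invoked when the NDArray is
// destroyed, so the caller must not call it again.
//   DLManagedTensor* imported = /* obtained from a DLPack-compatible producer */;
//   NDArray arr = NDArray::FromDLPack(imported);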
void NDArray::CopyToBytes(void* data, size_t nbytes) const {
ICHECK(data != nullptr);
ICHECK(data_ != nullptr);
ArrayCopyToBytes(&get_mutable()->dl_tensor, data, nbytes);
}
void NDArray::CopyFromBytes(const void* data, size_t nbytes) {
ICHECK(data != nullptr);
ICHECK(data_ != nullptr);
ArrayCopyFromBytes(&get_mutable()->dl_tensor, data, nbytes);
}
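// Usage sketch for CopyFromBytes/CopyToBytes (illustrative): round-trip raw host
// bytes through an NDArray; the byte count must match the array size exactly.
//   NDArray arr = NDArray::Empty({3}, DLDataType{kDLInt, 32, 1}, Device{kDLCPU, 0});
//   int32_t in[3] = {1, 2, 3}, out[3] = {0, 0, 0};
//   arr.CopyFromBytes(in, sizeof(in));
//   arr.CopyToBytes(out, sizeof(out));  // out now holds {1, 2, 3}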
void NDArray::CopyFromTo(const DLTensor* from, DLTensor* to, TVMStreamHandle stream) {
size_t from_size = GetDataSize(*from);
size_t to_size = GetDataSize(*to);
ICHECK_EQ(from_size, to_size) << "TVMArrayCopyFromTo: The size must exactly match";
ICHECK(from->device.device_type == to->device.device_type || from->device.device_type == kDLCPU ||
to->device.device_type == kDLCPU || from->device.device_type == kDLCUDAHost ||
to->device.device_type == kDLCUDAHost)
<< "Can not copy across different device types directly. From device type: "
<< from->device.device_type << " to device type: " << to->device.device_type;
// Use the device that is *not* a CPU device to get the correct DeviceAPI.
Device dev = from->device.device_type != kDLCPU ? from->device : to->device;
DeviceAPI::Get(dev)->CopyDataFromTo(const_cast<DLTensor*>(from), to, stream);
}
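// Usage sketch for CopyFromTo (illustrative, assumes a CUDA-enabled build): when
// the device types differ, at least one side must be CPU (or CUDA host) memory.
//   NDArray host = NDArray::Empty({1024}, DLDataType{kDLFloat, 32, 1}, Device{kDLCPU, 0});
//   NDArray gpu = NDArray::Empty({1024}, DLDataType{kDLFloat, 32, 1}, Device{kDLCUDA, 0});
//   gpu.CopyFrom(host);  // dispatches to CopyFromTo internally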
ShapeTuple NDArray::Shape() const { return get_mutable()->shape_; }
runtime::DataType NDArray::DataType() const {
return runtime::DataType(get_mutable()->dl_tensor.dtype);
}
bool NDArray::AbilityOfZeroCopyForDLTensor(DLTensor* tensor, const Device& dev) {
bool device_check = (dev.device_type == tensor->device.device_type);
bool device_id_check = (dev.device_id == tensor->device.device_id);
bool alignment_check = IsAligned(*tensor);
return device_check && device_id_check && alignment_check;
}
bool NDArray::IsAligned(const DLTensor& tensor) {
return (reinterpret_cast<size_t>(static_cast<char*>(tensor.data) + tensor.byte_offset) %
tvm::runtime::kAllocAlignment ==
0);
}
TVM_REGISTER_OBJECT_TYPE(NDArray::Container);
} // namespace runtime
} // namespace tvm
using namespace tvm::runtime;
void TVMNDArrayDLPackDeleter(DLManagedTensor* tensor) {
NDArray::Internal::NDArrayDLPackDeleter(tensor);
}
int TVMArrayGetTypeIndex(TVMArrayHandle handle, unsigned* out_tindex) {
API_BEGIN();
*out_tindex = TVMArrayHandleToObjectHandle(handle)->type_index();
API_END();
}
int TVMArrayAlloc(const tvm_index_t* shape, int ndim, int dtype_code, int dtype_bits,
int dtype_lanes, int device_type, int device_id, TVMArrayHandle* out) {
API_BEGIN();
DLDataType dtype;
dtype.code = static_cast<uint8_t>(dtype_code);
dtype.bits = static_cast<uint8_t>(dtype_bits);
dtype.lanes = static_cast<uint16_t>(dtype_lanes);
tvm::Device dev;
dev.device_type = static_cast<DLDeviceType>(device_type);
dev.device_id = device_id;
auto ndarray = NDArray::Empty(ShapeTuple(shape, shape + ndim), dtype, dev);
*out = NDArray::Internal::MoveToFFIHandle(ndarray);
API_END();
}
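// Usage sketch for the C API (illustrative): allocate and free an array through
// the FFI surface instead of the C++ NDArray class.
//   tvm_index_t shape[2] = {2, 3};
//   TVMArrayHandle handle = nullptr;
//   if (TVMArrayAlloc(shape, 2, kDLFloat, 32, 1, kDLCPU, 0, &handle) != 0) { /* error */ }
//   // ... use `handle`, e.g. TVMArrayCopyFromBytes with a suitable host buffer ...
//   TVMArrayFree(handle);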
TVM_REGISTER_GLOBAL("runtime.TVMArrayAllocWithScope").set_body_typed(NDArray::Empty);
TVM_REGISTER_GLOBAL("runtime.TVMArrayCreateView").set_body_typed([](NDArray arr, ShapeTuple shape) {
NDArray view = arr.CreateView(shape, arr->dtype);
return view;
});
int TVMArrayFree(TVMArrayHandle handle) {
API_BEGIN();
NDArray::Internal::FFIDecRef(handle);
API_END();
}
int TVMArrayCopyFromTo(TVMArrayHandle from, TVMArrayHandle to, TVMStreamHandle stream) {
API_BEGIN();
NDArray::CopyFromTo(from, to, stream);
API_END();
}
int TVMArrayFromDLPack(DLManagedTensor* from, TVMArrayHandle* out) {
API_BEGIN();
*out = NDArray::Internal::MoveToFFIHandle(NDArray::FromDLPack(from));
API_END();
}
int TVMArrayToDLPack(TVMArrayHandle from, DLManagedTensor** out) {
API_BEGIN();
*out = NDArray::Internal::ToDLPack(from);
API_END();
}
void TVMDLManagedTensorCallDeleter(DLManagedTensor* dltensor) { (*(dltensor->deleter))(dltensor); }
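// Usage sketch for the DLPack C API (illustrative; `handle` is an existing
// TVMArrayHandle): export to DLPack and release the result once the consumer is
// done with it.
//   DLManagedTensor* exported = nullptr;
//   TVMArrayToDLPack(handle, &exported);
//   // ... hand `exported` to a DLPack-compatible consumer ...
//   TVMDLManagedTensorCallDeleter(exported);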
int TVMArrayCopyFromBytes(TVMArrayHandle handle, void* data, size_t nbytes) {
API_BEGIN();
ArrayCopyFromBytes(handle, data, nbytes);
API_END();
}
int TVMArrayCopyToBytes(TVMArrayHandle handle, void* data, size_t nbytes) {
API_BEGIN();
ArrayCopyToBytes(handle, data, nbytes);
API_END();
}