blob: aa1453dbe5009ce7fcc452a2c3f7a18b7be14519 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#include "hexagon_buffer.h"
#include <tvm/runtime/module.h>
#include <algorithm>
#include <string>
#include <utility>
#include "hexagon_common.h"
#include "hexagon_device_api.h"
#include "qurt_memory.h"
namespace tvm {
namespace runtime {
namespace hexagon {
// Base class for a single owned memory allocation (RAII).  Derived classes
// (DDRAllocation, VTCMAllocation below) acquire the memory in their
// constructor and release it in their destructor.
struct Allocation {
  // \param allocation_nbytes Requested size of the allocation in bytes.
  // \param alignment Requested alignment in bytes.
  Allocation(size_t allocation_nbytes, size_t alignment)
      : allocation_nbytes_(allocation_nbytes), alignment_(alignment) {}
  virtual ~Allocation() {}
  // Non-copyable and non-movable: each instance uniquely owns data_,
  // and the derived destructor frees it exactly once.
  Allocation(const Allocation&) = delete;
  Allocation& operator=(const Allocation&) = delete;
  Allocation(Allocation&&) = delete;
  Allocation& operator=(Allocation&&) = delete;
  // Pointer to the owned memory; populated by the derived constructor.
  void* data_{nullptr};
  // Size of the allocation in bytes; derived classes may round this up
  // (see VTCMAllocation).
  size_t allocation_nbytes_;
  // Alignment of the allocation in bytes.
  size_t alignment_;
};
// Allocation backed by DDR (main) memory, obtained via posix_memalign.
struct DDRAllocation : public Allocation {
  // \param nbytes Size of the allocation in bytes.
  // \param alignment Alignment in bytes, forwarded to posix_memalign
  //   (which requires a power of two multiple of sizeof(void*) --
  //   callers are assumed to uphold this; it is not checked here).
  DDRAllocation(size_t nbytes, size_t alignment) : Allocation(nbytes, alignment) {
    int ret = posix_memalign(&data_, alignment, nbytes);
    TVM_FFI_ICHECK_EQ(ret, 0);
    // The heap used by malloc on Hexagon is always mapped as cacheable. The heap manager may not
    // perform cache invalidation on a prior memory free. So, a subsequent memory allocation request
    // to the heap manager may allocate memory that resides in part or in full in the cache. Hence,
    // we must invalidate the allocation from the cache to ensure that DMA with cache bypass enabled
    // will function properly. DMA with cache bypass enabled assumes that HexagonBuffer objects are
    // not cached unless explicitly modified by the primfunc. We must invalidate after malloc to
    // uphold this assumption.
    qurt_mem_cache_clean(reinterpret_cast<qurt_addr_t>(data_), nbytes, QURT_MEM_CACHE_INVALIDATE,
                         QURT_MEM_DCACHE);
  }
  ~DDRAllocation() { free(data_); }
};
// Allocation backed by VTCM scratchpad memory, obtained from the device
// API's dynamic VTCM pool.
struct VTCMAllocation : public Allocation {
  // \param nbytes Requested size in bytes; rounded up below to the pool's
  //   granularity for the requested alignment.
  // \param alignment Alignment in bytes; must be <= 128 (0x80) or exactly
  //   2048 (0x800), the only alignments the VTCM pool supports.
  VTCMAllocation(size_t nbytes, size_t alignment) : Allocation(nbytes, alignment) {
    // For simplicity, the current VTCM dynamic pool supports the following alignments: less than
    // or equal to 128 (0x80), and 2k (0x800)
    TVM_FFI_ICHECK((alignment <= 0x80) || (alignment == 0x800))
        << "VTCMAllocation called for invalid alignment " << alignment;
    // Round-up masks are written as explicit unsigned complements rather than
    // negated signed literals (previously `& -0x800`), which relied on an
    // implicit int -> size_t conversion and is a common source of compiler
    // warnings; the computed values are identical.
    if (alignment == 0x800) {
      // Adjust size to be a multiple of 2k so that we will allocate from the front of the pool.
      nbytes = (nbytes + 0x7ff) & ~static_cast<size_t>(0x7ff);
    } else if (alignment <= 0x80) {
      // Adjust size to be a multiple of 128 so that we will allocate from the back of the pool
      // in 128 byte increments.
      nbytes = (nbytes + 0x7f) & ~static_cast<size_t>(0x7f);
    }
    if (allocation_nbytes_ != nbytes) {
      DLOG(INFO) << "VTCMAllocation size adjusted for alignment " << allocation_nbytes_ << " to "
                 << nbytes;
      allocation_nbytes_ = nbytes;
    }
    data_ = HexagonDeviceAPI::Global()->VtcmPool()->Allocate(allocation_nbytes_);
    DLOG(INFO) << "VTCMAllocation " << data_ << " " << allocation_nbytes_ << " " << alignment;
  }
  // Returns the (possibly rounded-up) allocation to the VTCM pool.
  ~VTCMAllocation() {
    DLOG(INFO) << "~VTCMAllocation " << data_ << " " << allocation_nbytes_;
    HexagonDeviceAPI::Global()->VtcmPool()->Free(data_, allocation_nbytes_);
    data_ = nullptr;
  }
};
// Factory for Allocation objects, dispatched at compile time on the
// storage scope.  Only the kDDR and kVTCM specializations below exist;
// any other scope is a link-time error.
template <HexagonBuffer::StorageScope S>
std::unique_ptr<Allocation> Allocator(size_t nbytes, size_t alignment);

// DDR scope: allocate from the heap via DDRAllocation.
template <>
std::unique_ptr<Allocation> Allocator<HexagonBuffer::StorageScope::kDDR>(size_t nbytes,
                                                                         size_t alignment) {
  return std::make_unique<DDRAllocation>(nbytes, alignment);
}

// VTCM scope: allocate from the VTCM pool via VTCMAllocation.
template <>
std::unique_ptr<Allocation> Allocator<HexagonBuffer::StorageScope::kVTCM>(size_t nbytes,
                                                                          size_t alignment) {
  return std::make_unique<VTCMAllocation>(nbytes, alignment);
}
// Constructs a 1-d HexagonBuffer: a single allocation of `nbytes` bytes
// with the given alignment, in the storage scope named by `scope`
// (defaults to DDR when absent; see SetStorageScope).
HexagonBuffer::HexagonBuffer(size_t nbytes, size_t alignment, ffi::Optional<ffi::String> scope)
    : ndim_(1), nbytes_per_allocation_(nbytes) {
  SetStorageScope(scope);
  std::unique_ptr<Allocation> allocation;
  switch (GetStorageScope()) {
    case StorageScope::kDDR:
      allocation = Allocator<StorageScope::kDDR>(nbytes, alignment);
      break;
    case StorageScope::kVTCM:
      allocation = Allocator<StorageScope::kVTCM>(nbytes, alignment);
      break;
  }
  TVM_FFI_ICHECK(allocation != nullptr);
  allocations_.push_back(allocation->data_);
  managed_allocations_.push_back(std::move(allocation));
}
// Constructs a 2-d HexagonBuffer: `nallocs` logical regions of `nbytes`
// bytes each, carved out of a single backing allocation so that every
// region starts on an `alignment`-aligned address.
HexagonBuffer::HexagonBuffer(size_t nallocs, size_t nbytes, size_t alignment,
                             ffi::Optional<ffi::String> scope)
    : ndim_(2), nbytes_per_allocation_(nbytes) {
  SetStorageScope(scope);
  // Round each region up to the alignment so successive regions stay aligned.
  const size_t region_nbytes = ((nbytes + (alignment - 1)) / alignment) * alignment;
  const size_t total_nbytes = nallocs * region_nbytes;
  std::unique_ptr<Allocation> allocation;
  switch (GetStorageScope()) {
    case StorageScope::kDDR:
      allocation = Allocator<StorageScope::kDDR>(total_nbytes, alignment);
      break;
    case StorageScope::kVTCM:
      allocation = Allocator<StorageScope::kVTCM>(total_nbytes, alignment);
      break;
  }
  TVM_FFI_ICHECK(allocation) << "could not create allocation";
  // Record the start of each region within the monolithic allocation.
  auto* base = static_cast<unsigned char*>(allocation->data_);
  for (size_t i = 0; i < nallocs; ++i) {
    allocations_.push_back(base + i * region_nbytes);
  }
  managed_allocations_.push_back(std::move(allocation));
}
// Clearing managed_allocations_ runs each Allocation's destructor,
// releasing the underlying DDR/VTCM memory.
HexagonBuffer::~HexagonBuffer() { managed_allocations_.clear(); }
// Returns the user-visible pointer for this buffer: the allocation itself
// for a 1-d buffer, or a pointer to the array of per-region pointers for a
// 2-d buffer.  Throws for any other dimensionality.
void* HexagonBuffer::GetPointer() {
  TVM_FFI_ICHECK(allocations_.size())
      << "Internal failure, allocations_ should be set in HexagonBuffer constructor";
  switch (ndim_) {
    case 1:
      // A 1-d buffer has exactly one backing allocation.
      TVM_FFI_ICHECK_EQ(allocations_.size(), 1);
      return allocations_[0];
    case 2:
      // A 2-d buffer exposes the table of region pointers.
      return allocations_.data();
    default:
      TVM_FFI_THROW(InternalError) << "HexagonBuffer should be either 1-d or 2-d, not " << ndim_
                                   << "-d";
  }
}
// Accessor for the storage scope selected by SetStorageScope.
HexagonBuffer::StorageScope HexagonBuffer::GetStorageScope() const { return storage_scope_; }
// Resolves an optional scope string to a StorageScope.
// \param scope Optional scope name; nullopt defaults to "global".
//   "global" and "global.ddr" select DDR; "global.vtcm" selects VTCM.
//   Any other name is an internal error.
void HexagonBuffer::SetStorageScope(ffi::Optional<ffi::String> scope) {
  const std::string s = scope.value_or("global");
  if (s == "global" || s == "global.ddr") {
    storage_scope_ = StorageScope::kDDR;
  } else if (s == "global.vtcm") {
    storage_scope_ = StorageScope::kVTCM;
  } else {
    // `s` is already a std::string; the previous std::string(s) conversion
    // here created a needless extra copy.
    TVM_FFI_ICHECK(false) << "Encountered unknown HexagonBuffer storage scope: " << s;
  }
}
// Decomposes a copy of `bytes_to_copy` bytes from `src` to `dest` into a
// list of "micro" copies, each of which lies entirely within a single
// source region AND a single destination region.  Source and destination
// may have different region sizes, so one source region can span several
// destination regions and vice versa.
std::vector<MemoryCopy> BufferSet::MemoryCopies(const BufferSet& dest, const BufferSet& src,
                                                size_t bytes_to_copy) {
  TVM_FFI_ICHECK_LE(bytes_to_copy, src.TotalBytes());
  TVM_FFI_ICHECK_LE(bytes_to_copy, dest.TotalBytes());
  // Address of byte `byte_i` within region `region_i` of `buf`.
  auto pointer_to = [](const BufferSet& buf, size_t region_i, size_t byte_i) -> void* {
    void* region = buf.buffers[region_i];
    return static_cast<unsigned char*>(region) + byte_i;
  };
  // Number of source regions touched (round up for a partial final region).
  size_t num_src_regions = (bytes_to_copy + src.region_size_bytes - 1) / src.region_size_bytes;
  // First, determine all copies that do not cross boundaries in
  // either source or destination region. This requires two loops, as
  // a single source region may overlap one or more destination
  // regions, and vice versa.
  std::vector<MemoryCopy> micro_copies;
  for (size_t src_i = 0; src_i < num_src_regions; src_i++) {
    // [src_region_begin, src_region_end): this source region's byte span,
    // expressed as logical offsets into the overall copy, clamped to
    // bytes_to_copy for the final (possibly partial) region.
    size_t src_region_begin = src_i * src.region_size_bytes;
    size_t src_region_end = std::min((src_i + 1) * src.region_size_bytes, bytes_to_copy);
    // Range of destination regions overlapping this source region
    // (dest_i_end uses (end - 1)/size + 1, i.e. an inclusive-end ceiling).
    size_t dest_i_begin = src_region_begin / dest.region_size_bytes;
    size_t dest_i_end = (src_region_end - 1) / dest.region_size_bytes + 1;
    for (size_t dest_i = dest_i_begin; dest_i < dest_i_end; dest_i++) {
      // Intersection of the source region span and this destination region.
      size_t offset_begin = std::max(src_region_begin, dest_i * dest.region_size_bytes);
      size_t offset_end = std::min(src_region_end, (dest_i + 1) * dest.region_size_bytes);
      size_t num_bytes = offset_end - offset_begin;
      // Convert logical offsets back into in-region byte offsets.
      void* src_ptr = pointer_to(src, src_i, offset_begin % src.region_size_bytes);
      void* dest_ptr = pointer_to(dest, dest_i, offset_begin % dest.region_size_bytes);
      micro_copies.push_back(MemoryCopy(dest_ptr, src_ptr, num_bytes));
    }
  }
  return micro_copies;
}
std::vector<MemoryCopy> MemoryCopy::MergeAdjacent(std::vector<MemoryCopy> micro_copies) {
std::sort(micro_copies.begin(), micro_copies.end(),
[](const MemoryCopy& a, const MemoryCopy& b) { return a.src < b.src; });
std::vector<MemoryCopy> macro_copies;
for (const auto& copy : micro_copies) {
if (macro_copies.size() && macro_copies.back().IsDirectlyBefore(copy)) {
macro_copies.back().num_bytes += copy.num_bytes;
} else {
macro_copies.push_back(copy);
}
}
return macro_copies;
}
// Copies `bytes_to_copy` bytes from `src` to `dest`, handling mismatched
// region sizes on either side.  The *_is_hexbuff flags control cache
// maintenance: HexagonBuffer memory may be accessed by DMA with cache
// bypass, so a HexagonBuffer source must be invalidated BEFORE the memcpy
// and a HexagonBuffer destination flushed AFTER it -- this ordering is
// load-bearing; do not reorder these calls.
void hexagon_buffer_copy_across_regions(const BufferSet& dest, const BufferSet& src,
                                        size_t bytes_to_copy, bool src_is_hexbuff,
                                        bool dest_is_hexbuff) {
  // First, determine all copies that do not cross boundaries in
  // either source or destination region.
  auto micro_copies = BufferSet::MemoryCopies(dest, src, bytes_to_copy);
  // If regions are contiguously allocated, we can reduce the number
  // of copies required by merging adjacent copies.
  auto macro_copies = MemoryCopy::MergeAdjacent(std::move(micro_copies));
  // Finally, do the memory copies.
  for (const auto& copy : macro_copies) {
    // if src is a HexagonBuffer, invalidate it before the memcpy
    if (src_is_hexbuff) {
      qurt_mem_cache_clean(reinterpret_cast<qurt_addr_t>(copy.src), copy.num_bytes,
                           QURT_MEM_CACHE_INVALIDATE, QURT_MEM_DCACHE);
    }
    // TODO(HWE): Switch to ION Buffer to avoid need for memcpy and potentially lighten or alleviate
    // the burden of cache invalidation in this code
    memcpy(copy.dest, copy.src, copy.num_bytes);
    // if dest is a HexagonBuffer, flush it after the memcpy
    if (dest_is_hexbuff) {
      qurt_mem_cache_clean(reinterpret_cast<qurt_addr_t>(copy.dest), copy.num_bytes,
                           QURT_MEM_CACHE_FLUSH, QURT_MEM_DCACHE);
    }
  }
}
void HexagonBuffer::CopyTo(void* data, size_t nbytes) const {
BufferSet src(allocations_.data(), allocations_.size(), nbytes_per_allocation_);
BufferSet dest(&data, 1, nbytes);
hexagon_buffer_copy_across_regions(dest, src, nbytes, true /* src_is_hexbuff */,
false /* dest_is_hexbuff */);
}
void HexagonBuffer::CopyFrom(void* data, size_t nbytes) {
BufferSet src(&data, 1, nbytes);
BufferSet dest(allocations_.data(), allocations_.size(), nbytes_per_allocation_);
hexagon_buffer_copy_across_regions(dest, src, nbytes, false /* src_is_hexbuff */,
true /* dest_is_hexbuff */);
}
void HexagonBuffer::CopyFrom(const HexagonBuffer& other, size_t nbytes) {
BufferSet src(other.allocations_.data(), other.allocations_.size(), other.nbytes_per_allocation_);
BufferSet dest(allocations_.data(), allocations_.size(), nbytes_per_allocation_);
hexagon_buffer_copy_across_regions(dest, src, nbytes, true /* src_is_hexbuff */,
true /* dest_is_hexbuff */);
}
} // namespace hexagon
} // namespace runtime
} // namespace tvm