src/serialization/cnpy.cc - mxnet - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 // File is based on https://github.com/leezu/cnpy/tree/libzip released under MIT License
 // Copyright (C) 2011  Carl Rogers, 2018 Leonard Lausen

 #include "cnpy.h"
 #include <mxnet/op_attr_types.h>
 #include <mxnet/imperative.h>
 #include <string_view>
 #include <cstdint>
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
 #include <algorithm>
 #include <fstream>
 #include <complex>
 #include <numeric>
 #include <limits>
 #include <regex>
 #include <tuple>
 #include <set>
 #include <stdexcept>
 #include <typeinfo>


 namespace mxnet {

 void fortran_order_transpose_prepare(std::vector<dim_t>& shape) {  // NOLINT(runtime/references)
   std::reverse(std::begin(shape), std::end(shape));
 }

 // Returns a newly allocated CPU NDArray holding the transpose of `array`.
 //
 // `shape` is reversed in place first; the reversed shape becomes the shape of
 // the returned array. The transpose itself is executed by invoking the
 // transpose operator imperatively with DispatchMode::kFCompute.
 // NOLINTNEXTLINE(runtime/references)
 NDArray fortran_order_transpose(std::vector<dim_t>& shape, int type_flag, NDArray& array) {
   std::reverse(shape.begin(), shape.end());
   TShape result_shape(shape);
   NDArray result(result_shape, Context::CPU(), false, type_flag);

   // "_npi_transpose" is used when Imperative reports np-shape mode,
   // the classic "transpose" operator otherwise.
   nnvm::NodeAttrs attrs;
   const char* op_name = Imperative::Get()->is_np_shape() ? "_npi_transpose" : "transpose";
   attrs.op = nnvm::Op::Get(op_name);
   attrs.op->attr_parser(&attrs);

   const std::vector<NDArray*> op_inputs {&array};
   const std::vector<NDArray*> op_outputs {&result};
   const std::vector<OpReqType> op_reqs {kWriteTo};  // Transpose does not support kWriteInplace
   Imperative::Get()->InvokeOp(Context::CPU(), attrs, op_inputs, op_outputs,
                               op_reqs, DispatchMode::kFCompute, OpStatePtr());
   return result;
 }


 namespace npy {

 // Byte-order code of the build target as used in NPY dtype descriptors:
 // MXNET_BYTEORDER is the string form ("<" little-endian, ">" big-endian) that
 // dtype_descr splices into descr literals, MXNET_BYTEORDER_CHAR the character
 // form that parse_npy_header_descr compares against when loading a file.
 // The order is taken from the compiler-provided __BYTE_ORDER__ macro when it is
 // available; otherwise a _WIN32 build is treated as little-endian, and any
 // other configuration is a compile-time error.
 #if (defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && \
      __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
 #define MXNET_BYTEORDER "<"
 #define MXNET_BYTEORDER_CHAR '<'
 #elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && \
     __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
 #define MXNET_BYTEORDER ">"
 #define MXNET_BYTEORDER_CHAR '>'
 #elif defined(_WIN32)
 #define MXNET_BYTEORDER "<"
 #define MXNET_BYTEORDER_CHAR '<'
 #else
 #error "endian detection needs to be set up for your compiler"
 #endif

 // Returns the value of the 'descr' entry of an NPY header (surrounding quotes
 // included) for the element type of `blob`.
 //
 // Single-byte types use the '|' (not applicable) byte-order code, multi-byte
 // types use the byte order of the build target (MXNET_BYTEORDER). bfloat16 is
 // written as a one-field structured dtype wrapping a 2-byte unsigned integer.
 // An unknown type flag is a fatal error.
 std::string dtype_descr(const TBlob& blob) {
   switch (blob.type_flag_) {
     case mshadow::kFloat16: return "'" MXNET_BYTEORDER "f2'";
     case mshadow::kFloat32: return "'" MXNET_BYTEORDER "f4'";
     case mshadow::kFloat64: return "'" MXNET_BYTEORDER "f8'";
     case mshadow::kInt8: return "'|i1'";
     case mshadow::kInt16: return "'" MXNET_BYTEORDER "i2'";
     case mshadow::kInt32: return "'" MXNET_BYTEORDER "i4'";
     case mshadow::kInt64: return "'" MXNET_BYTEORDER "i8'";
     case mshadow::kBool: return "'|b1'";
     case mshadow::kUint8: return "'|u1'";
     case mshadow::kUint16: return "'" MXNET_BYTEORDER "u2'";
     case mshadow::kUint32: return "'" MXNET_BYTEORDER "u4'";
     case mshadow::kUint64: return "'" MXNET_BYTEORDER "u8'";
     case mshadow::kBfloat16: return "[('bfloat16', '" MXNET_BYTEORDER "u2')]";
     default: {
       // Leading space keeps the type flag from running into the next word.
       LOG(FATAL) << "Unknown dtype type " << blob.type_flag_ << " encountered.";
       return "";
     }
   }
 }


 // Maps an NPY dtype descriptor (or any text containing one, such as a full
 // NPY header) to the matching mshadow type flag.
 //
 // The lookup is substring based and ignores the byte-order character of
 // multi-byte types. An unrecognised descriptor is a fatal error; -1 is only
 // returned if the fatal log does not abort.
 int dtype_descr(const std::string& dtype_descr) {
   const auto contains = [&dtype_descr](const char* token) {
     return dtype_descr.find(token) != std::string::npos;
   };

   // bfloat16 is stored as the structured dtype "[('bfloat16', '<u2')]", which
   // also contains the uint16 code "u2'". It therefore has to be tested before
   // the plain integer codes, otherwise it would be reported as kUint16.
   if (contains("bfloat16'")) return mshadow::kBfloat16;
   if (contains("f2'")) return mshadow::kFloat16;
   if (contains("f4'")) return mshadow::kFloat32;
   if (contains("f8'")) return mshadow::kFloat64;
   if (contains("|i1'")) return mshadow::kInt8;
   if (contains("i2'")) return mshadow::kInt16;
   if (contains("i4'")) return mshadow::kInt32;
   if (contains("i8'")) return mshadow::kInt64;
   if (contains("|b1'")) return mshadow::kBool;
   if (contains("|u1'")) return mshadow::kUint8;
   if (contains("u2'")) return mshadow::kUint16;
   if (contains("u4'")) return mshadow::kUint32;
   if (contains("u8'")) return mshadow::kUint64;

   LOG(FATAL) << "Unknown dtype descriptor " << dtype_descr << " encountered.";
   return -1;
 }

 // Builds the complete NPY header for `blob`: the magic string, the format
 // version, the little-endian length of the header dict and the dict itself.
 //
 // The dict records the dtype descriptor, C (non-fortran) order and the shape
 // of `blob`. Format version 1.0 (2-byte length) is used whenever the dict
 // length fits into 16 bits, version 2.0 (4-byte length) otherwise.
 std::string create_npy_header(const TBlob& blob) {
   std::string dict("{'descr': ");
   dict.append(dtype_descr(blob));
   dict.append(", 'fortran_order': False, 'shape': (");
   const int ndim = blob.ndim();
   if (ndim != 0) {
     dict.append(std::to_string(blob.shape_[0]));
     for (int axis = 1; axis < ndim; ++axis) {
       dict.append(", ");
       dict.append(std::to_string(blob.shape_[axis]));
     }
     if (ndim == 1) {
       dict.append(",");  // one-element tuples need a trailing comma
     }
   }
   dict.append("), }");

   // Space-pad the dict so that the 10 byte preamble plus the dict (which has
   // to end with a newline) is a multiple of 64 bytes long.
   const std::string::size_type padding = 64 - (10 + dict.size() + 1) % 64;
   dict.append(padding, ' ');
   dict.push_back('\n');
   assert((dict.size() + 10) % 64 == 0);

   const std::string::size_type dict_len = dict.size();
   CHECK(dict_len <= std::numeric_limits<uint32_t>::max())
     << "Shape too large for NPY serialization";
   const bool fits_v1 = dict_len <= std::numeric_limits<uint16_t>::max();

   std::string header;
   header.push_back(static_cast<char>(0x93));
   header.append("NUMPY");
   header.push_back(static_cast<char>(fits_v1 ? 0x01 : 0x02));  // major version of numpy format
   header.push_back(static_cast<char>(0x00));                   // minor version of numpy format
   // Dict length, least significant byte first.
   const int len_bytes = fits_v1 ? 2 : 4;
   for (int i = 0; i < len_bytes; ++i) {
     header.push_back(static_cast<char>((dict_len >> (8 * i)) & 0xFF));
   }

   header.append(dict);
   return header;
 }

 // Consumes the NPY preamble from `strm` (magic string, format version and
 // header length field) and returns the length in bytes of the header dict
 // that follows.
 //
 // The stream is configured to throw on eof, fail and bad conditions, so a
 // truncated file raises std::ios_base::failure. A wrong magic string or an
 // unsupported format version is a fatal error.
 uint32_t parse_npy_header_len(std::ifstream& strm) {
   // exceptions() replaces the mask on every call, so all bits have to be
   // passed at once for all of them to stay enabled.
   strm.exceptions(std::istream::eofbit | std::istream::failbit | std::istream::badbit);

   CHECK_EQ(strm.get(), 0x93);
   CHECK_EQ(strm.get(), 'N');
   CHECK_EQ(strm.get(), 'U');
   CHECK_EQ(strm.get(), 'M');
   CHECK_EQ(strm.get(), 'P');
   CHECK_EQ(strm.get(), 'Y');

   const uint8_t major_version = strm.get();
   CHECK(major_version == 0x01 || major_version == 0x02) << "Unsupported npy major version";
   CHECK(strm.get() == 0x00) << "Unsupported npy minor version";

   // The length field is little-endian: 2 bytes in format version 1.0 and
   // 4 bytes in version 2.0. Each byte is shifted left to its position.
   uint32_t header_len = static_cast<uint32_t>(strm.get());
   header_len |= static_cast<uint32_t>(strm.get()) << 8;
   if (major_version == 0x02) {
     header_len |= static_cast<uint32_t>(strm.get()) << 16;
     header_len |= static_cast<uint32_t>(strm.get()) << 24;
   }
   return header_len;
 }

 std::tuple<int, int, std::vector<dim_t>> parse_npy_header_descr(const std::string& header) {
   // Fortran order
   std::string::size_type loc = header.find("fortran_order");
   CHECK_NE(loc, std::string::npos) << "failed to find NPY header keyword: 'fortran_order'";
   bool fortran_order = (header.substr(loc + 16, 4) == "True" ? true : false);

   // Shape
   loc = header.find('(');
   std::string::size_type end_loc = header.find(')');
   CHECK_NE(loc, std::string::npos) << "failed to find NPY header keyword: '('";
   CHECK_NE(end_loc, std::string::npos) << "failed to find NPY header keyword: ')'";
   std::string shape_str = header.substr(loc+1, end_loc-loc-1);
   std::regex num_regex("[0-9][0-9]*");
   std::smatch sm;
   std::vector<dim_t> shape;
   while (std::regex_search(shape_str, sm, num_regex)) {
     shape.push_back(std::stoi(sm[0].str()));
     shape_str = sm.suffix().str();
   }

   // endian, word size, data type
   // byte order code | stands for not applicable.
   loc = header.find("descr");
   CHECK_NE(loc, std::string::npos) << "failed to find NPY header keyword: 'descr'";
   // May use https://github.com/numpy/numpy/blob/38275835/numpy/core/src/multiarray/ctors.c#L365
   CHECK(header[loc + 9] == MXNET_BYTEORDER_CHAR || header[loc + 9] == '|')
     << "Loading files with non-native endianness "
     << "is not yet supported. Please open the file "
     << "with numpy.load, use byteswap method to "
     << "convert endianness and re-save the file.";

   int type_flag = dtype_descr(header);
   return std::tuple(type_flag, fortran_order, shape);
 }


 void save_array(const std::string& fname, const NDArray& array_) {
   NDArray array;  // a copy on cpu
   if (array_.ctx().dev_mask() != cpu::kDevMask) {
     array = array_.Copy(Context::CPU());
     array.WaitToRead();
   } else {
     array = array_;
     array.WaitToRead();
 #if MXNET_USE_MKLDNN == 1
     if (array.IsMKLDNNData()) {
       array = array.Reorder2Default();
     }
 #endif
   }

   CHECK_EQ(array.storage_type(), kDefaultStorage);

   const TBlob& blob = array.data();
   std::string npy_header = create_npy_header(blob);

   std::ofstream output(fname, std::ios::binary);
   output.write(npy_header.data(), npy_header.size());
   output.write(static_cast<const char*>(blob.dptr_), blob.Size() *
                mshadow::mshadow_sizeof(blob.type_flag_));
 }

 // Loads the NPY file `fname` into a new CPU NDArray.
 //
 // For files stored in fortran order the raw data is read into an array of
 // the reversed shape and transposed afterwards, which restores the shape
 // recorded in the header. A file that cannot be opened or that ends early
 // raises std::ios_base::failure.
 NDArray load_array(const std::string& fname) {
   // exceptions() replaces the mask on every call, so all bits are set at once.
   const std::ios_base::iostate all_errors =
       std::istream::eofbit | std::istream::failbit | std::istream::badbit;

   std::ifstream strm(fname, std::ios::binary);
   strm.exceptions(all_errors);  // throws right away if the file failed to open

   const uint32_t header_len = parse_npy_header_len(strm);
   // parse_npy_header_len installs an exception mask of its own on the stream;
   // set the full mask again so that short reads below are reported.
   strm.exceptions(all_errors);

   std::string header(header_len, ' ');
   strm.read(header.data(), header_len);
   auto[type_flag, fortran_order, shape] = parse_npy_header_descr(header);

   if (fortran_order) {
     fortran_order_transpose_prepare(shape);
   }

   TShape tshape(shape);
   NDArray array(tshape, Context::CPU(), false, type_flag);
   const TBlob& blob = array.data();
   strm.read(reinterpret_cast<char*>(blob.dptr_), blob.Size() *
             mshadow::mshadow_sizeof(blob.type_flag_));

   if (fortran_order) {
     array = fortran_order_transpose(shape, type_flag, array);
   }

   return array;
 }

 }  // namespace npy

 namespace npz {

 size_t npy_header_blob_read_callback(void *pOpaque, mz_uint64 file_ofs, void *pBuf, size_t n) {
     auto[npy_header, blob] = *static_cast<std::tuple<const std::string*, const TBlob*>*>(pOpaque);

     if (file_ofs < npy_header->size() && file_ofs + n < npy_header->size()) {
         // Read n bytes from npy_header
         const void* pSrc = static_cast<const void*>(npy_header->data() + file_ofs);
         std::memcpy(pBuf, pSrc, n);
     } else if (file_ofs < npy_header->size()) {
         // Read npy_header->size() - file_ofs bytes from npy_header
         const void* pSrc = static_cast<const void*>(npy_header->data() + file_ofs);
         const size_t npy_header_n = npy_header->size() - file_ofs;
         std::memcpy(pBuf, pSrc, npy_header_n);

         // Read n - (npy_header->size() - file_ofs) bytes from blob
         void* pBuf_blob = static_cast<void*>(static_cast<char*>(pBuf) + npy_header_n);
         std::memcpy(pBuf_blob, blob->dptr_, n - npy_header_n);
     } else {
         // Read n bytes from blob
         const void* pSrc = static_cast<const void*>(
             static_cast<char*>(blob->dptr_) + file_ofs - npy_header->size());
         std::memcpy(pBuf, pSrc, n);
     }
     return n;
 }


 // Adds `blob` to `archive` as the uncompressed member "<blob_name>.npy".
 // The member content (NPY header followed by the raw blob data) is streamed
 // through npy_header_blob_read_callback. A miniz failure is a fatal error.
 void save_blob(mz_zip_archive* archive, const std::string& blob_name, const TBlob& blob) {
   const std::string npy_header = npy::create_npy_header(blob);
   const std::string member_name = blob_name + ".npy";

   // Total member size: header plus payload.
   mz_uint64 member_size = npy_header.size();
   member_size += blob.Size() * mshadow::mshadow_sizeof(blob.type_flag_);

   // Both pointees outlive the call below, which is the only user of the tuple.
   std::tuple<const std::string*, const TBlob*> callback_data(&npy_header, &blob);
   const auto added = mz_zip_writer_add_read_buf_callback(
       archive, member_name.data(), npy_header_blob_read_callback,
       static_cast<void*>(&callback_data), member_size, nullptr,
       nullptr, 0, MZ_NO_COMPRESSION, nullptr, 0, nullptr, 0);
   CHECK(added) << mz_zip_get_error_string(mz_zip_get_last_error(archive));
 }


 // Save the shape of a sparse ndarray as the scipy compatible member
 // "<blob_name>.npy": a one-dimensional little-endian int64 array holding one
 // entry per dimension of `shape`. A miniz failure is a fatal error.
 void save_shape_array(mz_zip_archive* archive, const std::string& blob_name,
                       const mxnet::TShape& shape) {
   // Special case of create_npy_header for TShape data
   std::string dict("{'descr': '<i8', 'fortran_order': False, 'shape': (");
   dict.append(std::to_string(shape.ndim()));
   dict.append(",), }");
   // Space-pad the dict so that the 10 byte preamble plus the dict (which has
   // to end with a newline) is a multiple of 64 bytes long.
   const std::string::size_type padding = 64 - (10 + dict.size() + 1) % 64;
   dict.append(padding, ' ');
   dict.push_back('\n');
   assert((dict.size() + 10) % 64 == 0);

   const std::string::size_type dict_len = dict.size();
   CHECK(dict_len <= std::numeric_limits<uint32_t>::max())
     << "Shape too large for NPY serialization";
   const bool fits_v1 = dict_len <= std::numeric_limits<uint16_t>::max();

   std::string npy;
   npy.push_back(static_cast<char>(0x93));
   npy.append("NUMPY");
   npy.push_back(static_cast<char>(fits_v1 ? 0x01 : 0x02));  // major version of numpy format
   npy.push_back(static_cast<char>(0x00));                   // minor version of numpy format
   // Dict length, least significant byte first (2 bytes for v1, 4 for v2).
   const int len_bytes = fits_v1 ? 2 : 4;
   for (int i = 0; i < len_bytes; ++i) {
     npy.push_back(static_cast<char>((dict_len >> (8 * i)) & 0xFF));
   }
   npy.append(dict);

   // Add shape data: every dimension as 8 bytes, least significant byte first
   for (const uint64_t dim : shape) {
     for (int shift = 0; shift < 64; shift += 8) {
       npy.push_back(static_cast<char>((dim >> shift) & 0xFF));
     }
   }

   const std::string member_name = blob_name + ".npy";
   const auto added = mz_zip_writer_add_mem(archive, member_name.data(), npy.data(),
                                            npy.size(), MZ_NO_COMPRESSION);
   CHECK(added) << mz_zip_get_error_string(mz_zip_get_last_error(archive));
 }


 // Adds the sparse format name `format` (e.g. "csr") to `archive` as the
 // uncompressed member "<blob_name>.npy". The member is a zero-dimensional
 // NPY array of a fixed-width byte string as long as `format`.
 // A miniz failure is a fatal error.
 void save_format_array(mz_zip_archive* archive, const std::string& blob_name,
                        const std::string_view& format) {
   // Special case of create_npy_header for a scalar byte string.
   // 'S<n>' is the NumPy type code of an n-byte string; '|' marks the byte
   // order as not applicable.
   std::string dict;
   dict += "{'descr': '|S";
   dict += std::to_string(format.size());
   dict += "', 'fortran_order': False, 'shape': (), }";
   // pad with spaces so that preamble+dict is modulo 64 bytes. preamble is
   // 10 bytes. dict needs to end with \n
   int remainder = 64 - (10 + dict.size() + 1) % 64;
   dict.insert(dict.end(), remainder, ' ');
   dict.push_back('\n');
   assert((dict.size() + 10) % 64 == 0);
   std::string npy;
   npy += static_cast<char>(0x93);
   npy += "NUMPY";
   std::string::size_type size = dict.size();
   CHECK(size <= std::numeric_limits<uint32_t>::max());
   if (size <= std::numeric_limits<uint16_t>::max()) {
       npy += static_cast<char>(0x01);  // major version of numpy format
       npy += static_cast<char>(0x00);  // minor version of numpy format
       uint16_t size_ = dict.size();
       npy += static_cast<char>(size_ & 0xFF);
       npy += static_cast<char>(size_ >> 8);
   } else {
       npy += static_cast<char>(0x02);  // major version of numpy format
       npy += static_cast<char>(0x00);  // minor version of numpy format
       uint32_t size_ = dict.size();
       npy += static_cast<char>(size_ & 0xFF);
       npy += static_cast<char>((size_ >> 8) & 0xFF);
       npy += static_cast<char>((size_ >> 16) & 0xFF);
       npy += static_cast<char>((size_ >> 24) & 0xFF);
   }
   npy += dict;

   // Payload: the raw characters of the format name.
   npy += format;

   const std::string blob_name_npy = blob_name + ".npy";
   CHECK(mz_zip_writer_add_mem(archive, blob_name_npy.data(), npy.data(),
                               npy.size(), MZ_NO_COMPRESSION))
       << mz_zip_get_error_string(mz_zip_get_last_error(archive));
 }


 // Adds `array_` to `archive` under the name `array_name`.
 //
 // Arrays that do not live on the CPU are copied to the CPU first. Dense
 // arrays become the single member "<array_name>.npy". CSR and row-sparse
 // arrays become a folder "<array_name>/" holding their data, their auxiliary
 // index arrays, their shape and their format name as separate .npy members.
 // An unknown storage type is a fatal error.
 void save_array(mz_zip_archive* archive, const std::string& array_name, const NDArray& array_) {
   NDArray array;  // a copy on cpu
   if (array_.ctx().dev_mask() != cpu::kDevMask) {
     array = array_.Copy(Context::CPU());
     array.WaitToRead();
   } else {
     array = array_;
     array.WaitToRead();
 #if MXNET_USE_MKLDNN == 1
     if (array.IsMKLDNNData()) {
       array = array.Reorder2Default();
     }
 #endif
   }

   switch (array.storage_type()) {
   case kDefaultStorage: {
     save_blob(archive, array_name, array.data());
     break;
   }
   case kCSRStorage: {
     save_blob(archive, array_name + "/data", array.data());
     save_blob(archive, array_name + "/indptr", array.aux_data(csr::kIndPtr));
     save_blob(archive, array_name + "/indices", array.aux_data(csr::kIdx));
     save_shape_array(archive, array_name + "/shape", array.shape());
     save_format_array(archive, array_name + "/format", "csr");
     break;
   }
   case kRowSparseStorage: {
     save_blob(archive, array_name + "/data", array.data());
     save_blob(archive, array_name + "/indices", array.aux_data(rowsparse::kIdx));
     save_shape_array(archive, array_name + "/shape", array.shape());
     save_format_array(archive, array_name + "/format", "row_sparse");
     break;
   }
   // Leading space keeps the storage type from running into the next word.
   default: LOG(FATAL) << "Unknown storage type " << array.storage_type() << " encountered.";
   }
 }


 // Consumes the NPY preamble (magic string, format version and header length
 // field) from the zip member being extracted through `state` and returns the
 // length in bytes of the header dict that follows.
 //
 // `fname` (member name) and `zip_fname` (archive name) are only used in
 // error messages. A short read, a wrong magic string or an unsupported
 // format version is a fatal error.
 uint32_t parse_npy_header_len(mz_zip_reader_extract_iter_state* state,
                               const std::string_view& fname, const std::string& zip_fname) {
   // 10 bytes of preamble for format version 1.0, 12 bytes for version 2.0.
   std::array<char, 12> buffer {};
   CHECK_EQ(mz_zip_reader_extract_iter_read(state, buffer.data(), 10), 10)
     << "Failed to read from " << fname << " member of " << zip_fname;
   CHECK_EQ(buffer[0], static_cast<char>(0x93));
   CHECK_EQ(buffer[1], 'N');
   CHECK_EQ(buffer[2], 'U');
   CHECK_EQ(buffer[3], 'M');
   CHECK_EQ(buffer[4], 'P');
   CHECK_EQ(buffer[5], 'Y');
   const uint8_t major_version = static_cast<uint8_t>(buffer[6]);
   CHECK(major_version == 0x01 || major_version == 0x02) << "Unsupported npy major version";
   CHECK(buffer[7] == 0x00) << "Unsupported npy minor version";

   // Read bytes as unsigned values: plain char may be signed, in which case
   // bytes >= 0x80 would otherwise be sign-extended.
   const auto byte_at = [&buffer](size_t idx) -> uint32_t {
     return static_cast<unsigned char>(buffer[idx]);
   };

   // The length field is little-endian: 2 bytes in format version 1.0 and
   // 4 bytes in version 2.0. Each byte is shifted left to its position.
   uint32_t header_len = byte_at(8) | (byte_at(9) << 8);
   if (major_version == 0x02) {
     CHECK_EQ(mz_zip_reader_extract_iter_read(state, &buffer[10], 2), 2)
       << "Failed to read from " << fname << " member of " << zip_fname;
     header_len |= (byte_at(10) << 16) | (byte_at(11) << 24);
   }
   return header_len;
 }


 std::pair<std::vector<NDArray>, std::vector<std::string>>
 load_arrays(const std::string& zip_fname) {
   mz_zip_archive archive {};
   CHECK(mz_zip_reader_init_file(&archive, zip_fname.data(), 0))
       << "Failed to open archive " << zip_fname << ": "
       << mz_zip_get_error_string(mz_zip_get_last_error(&archive));

   // Collect the set of file-names per folder in the zip file. If the set of
   // file names in a folder matches the scipy.sparse.save_npz pattern, the
   // folder will be restored as single sparse ndarray.
   std::unordered_map<std::string, std::set<std::string>> names;

   mz_uint num_entries = mz_zip_reader_get_num_files(&archive);
   for (mz_uint i = 0; i < num_entries; i++) {
     mz_uint filename_length = mz_zip_reader_get_filename(&archive, i, nullptr, 0);
     std::string entry_name;
     entry_name.resize(filename_length);  // filename_length includes the \0 terminator
     CHECK_EQ(filename_length, mz_zip_reader_get_filename(&archive, i, entry_name.data(),
                                                          filename_length));
     std::string_view entry_name_v {entry_name.data(), entry_name.size() - 1};  // -1 due to \0
     if (entry_name_v.substr(entry_name_v.size() - 4).compare(".npy") != 0) continue;  // only .npy

     auto dir_sep_search = entry_name_v.rfind("/");
     if (dir_sep_search == std::string::npos) {  // top level file
       [[maybe_unused]] auto[iter, inserted] = names[""].emplace(entry_name_v);
       CHECK(inserted);
     } else {  // file inside a folder
       std::string dirname {entry_name_v.substr(0, dir_sep_search + 1)};
       std::string fname {entry_name_v.substr(dir_sep_search + 1)};
       [[maybe_unused]] auto[iter, inserted] = names[dirname].insert(fname);
       CHECK(inserted);
     }
   }

   // Return values
   std::vector<NDArray> arrays;
   std::vector<std::string> return_names;

   // Patterns used by SciPy to save respective sparse matrix formats to a file
   const std::set<std::string> bsr_csr_csc_pattern
     {"data.npy", "indices.npy", "indptr.npy", "format.npy", "shape.npy"};
   const std::set<std::string> row_sparse_pattern  // MXNet specific format not part of SciPy
     {"data.npy", "indices.npy", "format.npy", "shape.npy"};
   const std::set<std::string> coo_pattern
     {"data.npy", "row.npy", "col.npy", "format.npy", "shape.npy"};
   const std::set<std::string> dia_pattern
     {"data.npy", "offsets.npy", "format.npy", "shape.npy"};
   for (const auto& [dirname, dircontents] : names) {
     if (dircontents == bsr_csr_csc_pattern) {
       // Check format
       std::string fname(dirname);
       fname += "format.npy";
       mz_zip_reader_extract_iter_state* format_file = mz_zip_reader_extract_file_iter_new(
           &archive, fname.data(), 0);
       CHECK(nullptr != format_file) << mz_zip_get_error_string(mz_zip_get_last_error(&archive));

       // In the special case of format.npy we ignore the header as it
       // specifies the string datatype which is unsupported by MXNet
       uint32_t header_len = parse_npy_header_len(format_file, fname, zip_fname);
       std::string header;
       header.resize(header_len);
       CHECK_EQ(mz_zip_reader_extract_iter_read(format_file, header.data(), header_len), header_len)
           << "Failed to read from " << fname << " member of " << zip_fname << ": "
           << mz_zip_get_error_string(mz_zip_get_last_error(&archive));
       // and simply look at the next 3 bytes containing the format string
       std::string format;
       format.resize(3);
       CHECK_EQ(mz_zip_reader_extract_iter_read(format_file, format.data(), 3), 3)
           << "Failed to read from " << fname << " member of " << zip_fname;
       CHECK(mz_zip_reader_extract_iter_free(format_file));

       if (format == "csr") {
         // Prepare reading storage data array
         fname = dirname;
         fname += "data.npy";
         mz_zip_reader_extract_iter_state* data_file = mz_zip_reader_extract_file_iter_new(
             &archive, fname.data(), 0);
         CHECK(nullptr != data_file) << mz_zip_get_error_string(mz_zip_get_last_error(&archive));
         header_len = parse_npy_header_len(data_file, fname, zip_fname);
         header.resize(header_len);
         CHECK_EQ(mz_zip_reader_extract_iter_read(data_file, header.data(), header_len), header_len)
             << "Failed to read from " << fname << " member of " << zip_fname << ": "
             << mz_zip_get_error_string(mz_zip_get_last_error(&archive));
         auto[storage_type_flag, storage_fortran_order, storage_shape] = \
           npy::parse_npy_header_descr(header);
         if (storage_fortran_order) {
           LOG(FATAL) << "Reading fortran order data for sparse arrays not yet implemented.";
         }
         TShape storage_tshape(storage_shape);

         // Prepare reading indptr aux array
         fname = dirname;
         fname += "indptr.npy";
         mz_zip_reader_extract_iter_state* indptr_file = mz_zip_reader_extract_file_iter_new(
             &archive, fname.data(), 0);
         CHECK(nullptr != indptr_file) << mz_zip_get_error_string(mz_zip_get_last_error(&archive));
         header_len = parse_npy_header_len(indptr_file, fname, zip_fname);
         header.resize(header_len);
         CHECK_EQ(mz_zip_reader_extract_iter_read(indptr_file, header.data(), header_len),
                  header_len)
             << "Failed to read from " << fname << " member of " << zip_fname << ": "
             << mz_zip_get_error_string(mz_zip_get_last_error(&archive));
         auto[indptr_type_flag, indptr_fortran_order, indptr_shape] = \
           npy::parse_npy_header_descr(header);
         if (indptr_fortran_order) {
           LOG(FATAL) << "Reading fortran order data for sparse arrays not yet implemented.";
         }
         TShape indptr_tshape(indptr_shape);

         // Prepare reading indices aux array
         fname = dirname;
         fname += "indices.npy";
         mz_zip_reader_extract_iter_state* indices_file = mz_zip_reader_extract_file_iter_new(
             &archive, fname.data(), 0);
         CHECK(nullptr != indices_file) << mz_zip_get_error_string(mz_zip_get_last_error(&archive));
         header_len = parse_npy_header_len(indices_file, fname, zip_fname);
         header.resize(header_len);
         CHECK_EQ(mz_zip_reader_extract_iter_read(indices_file, header.data(), header_len),
                  header_len)
             << "Failed to read from " << fname << " member of " << zip_fname << ": "
             << mz_zip_get_error_string(mz_zip_get_last_error(&archive));
         auto[indices_type_flag, indices_fortran_order, indices_shape] = \
           npy::parse_npy_header_descr(header);
         if (indices_fortran_order) {
           LOG(FATAL) << "Reading fortran order data for sparse arrays not yet implemented.";
         }
         TShape indices_tshape(indices_shape);

         // Read shape data array
         fname = dirname;
         fname += "shape.npy";
         mz_zip_reader_extract_iter_state* shape_file = mz_zip_reader_extract_file_iter_new(
             &archive, fname.data(), 0);
         CHECK(nullptr != shape_file) << mz_zip_get_error_string(mz_zip_get_last_error(&archive));
         header_len = parse_npy_header_len(shape_file, fname, zip_fname);
         header.resize(header_len);
         CHECK_EQ(mz_zip_reader_extract_iter_read(shape_file, header.data(), header_len), header_len)
             << "Failed to read from " << fname << " member of " << zip_fname << ": "
             << mz_zip_get_error_string(mz_zip_get_last_error(&archive));
         auto[shape_type_flag, shape_fortran_order, shape_shape] = \
           npy::parse_npy_header_descr(header);
         if (shape_fortran_order) {
           LOG(FATAL) << "Reading fortran order data for sparse arrays not yet implemented.";
         }
         CHECK_EQ(shape_shape.size(), 1) << "Expected one-dimensional shape of shape information.";
         TShape tshape(shape_shape.at(0), -1);
         if (shape_type_flag == mshadow::kInt64) {  // Used in most SciPy builds
           for (dim_t i = 0; i < shape_shape.at(0); i++) {
             int64_t dim;
             CHECK_EQ(mz_zip_reader_extract_iter_read(shape_file, &dim, 8), 8)
                 << "Failed to read from " << fname << " member of " << zip_fname << ": "
                 << mz_zip_get_error_string(mz_zip_get_last_error(&archive));
             tshape[i] = dim;
           }
         } else if (shape_type_flag == mshadow::kInt32) {  // Used in SciPy pip wheels on Windows
           for (dim_t i = 0; i < shape_shape.at(0); i++) {
             int32_t dim;
             CHECK_EQ(mz_zip_reader_extract_iter_read(shape_file, &dim, 4), 4)
                 << "Failed to read from " << fname << " member of " << zip_fname << ": "
                 << mz_zip_get_error_string(mz_zip_get_last_error(&archive));
             tshape[i] = dim;
           }
         } else {
           LOG(FATAL) << "Expected shape information in int64 or int32 format.";
         }
         CHECK(mz_zip_reader_extract_iter_free(shape_file));

         // Construct aux datastructures
         static_assert(csr::CSRAuxType::kIndPtr == 0);
         static_assert(csr::CSRAuxType::kIdx == 1);
         const std::vector<int> aux_types {indptr_type_flag, indices_type_flag};
         const mxnet::ShapeVector aux_shapes {indptr_tshape, indices_tshape};

         // Allocate NDArray
         NDArray array(NDArrayStorageType::kCSRStorage, tshape, Context::CPU(), false,
                       storage_type_flag, aux_types, aux_shapes, storage_tshape);

         // Read data array
         const TBlob& blob = array.data();
         size_t nbytes = blob.Size() * mshadow::mshadow_sizeof(blob.type_flag_);
         CHECK_EQ(mz_zip_reader_extract_iter_read(data_file, blob.dptr_, nbytes), nbytes)
             << "Failed to read from " << fname << " member of " << zip_fname << ": "
             << mz_zip_get_error_string(mz_zip_get_last_error(&archive));
         CHECK(mz_zip_reader_extract_iter_free(data_file));

         // Read indptr array
         const TBlob& indptr_blob = array.aux_data(csr::CSRAuxType::kIndPtr);
         nbytes = indptr_blob.Size() * mshadow::mshadow_sizeof(indptr_blob.type_flag_);
         CHECK_EQ(mz_zip_reader_extract_iter_read(indptr_file, indptr_blob.dptr_, nbytes), nbytes)
             << "Failed to read from " << fname << " member of " << zip_fname << ": "
             << mz_zip_get_error_string(mz_zip_get_last_error(&archive));
         CHECK(mz_zip_reader_extract_iter_free(indptr_file));

         // Read indices array
         const TBlob& indices_blob = array.aux_data(csr::CSRAuxType::kIdx);
         nbytes = indices_blob.Size() * mshadow::mshadow_sizeof(indices_blob.type_flag_);
         CHECK_EQ(mz_zip_reader_extract_iter_read(indices_file, indices_blob.dptr_, nbytes), nbytes)
             << "Failed to read from " << fname << " member of " << zip_fname << ": "
             << mz_zip_get_error_string(mz_zip_get_last_error(&archive));
         CHECK(mz_zip_reader_extract_iter_free(indices_file));

         arrays.push_back(array);
         return_names.emplace_back(dirname.size() ?   // Exclude "/"
                                   dirname.substr(0, dirname.size() - 1) : dirname);

       } else {
         throw std::runtime_error("Loading " + format + " sparse matrix format is unsupported.");
       }
     } else if (dircontents == row_sparse_pattern) {
       // Check format
       std::string fname(dirname);
       fname += "format.npy";
       mz_zip_reader_extract_iter_state* format_file = mz_zip_reader_extract_file_iter_new(
           &archive, fname.data(), 0);
       CHECK(nullptr != format_file) << mz_zip_get_error_string(mz_zip_get_last_error(&archive));

       // In the special case of format.npy we ignore the header as it
       // specifies the string datatype which is unsupported by MXNet
       uint32_t header_len = parse_npy_header_len(format_file, fname, zip_fname);
       std::string header;
       header.resize(header_len);
       CHECK_EQ(mz_zip_reader_extract_iter_read(format_file, header.data(), header_len), header_len)
           << "Failed to read from " << fname << " member of " << zip_fname << ": "
           << mz_zip_get_error_string(mz_zip_get_last_error(&archive));
       // and simply look at the next 10 bytes containing the format string
       std::string format;
       format.resize(10);
       mz_zip_reader_extract_iter_read(format_file, format.data(), 10);
       CHECK(mz_zip_reader_extract_iter_free(format_file));

       if (format == "row_sparse") {
         // Prepare reading storage data array
         fname = dirname;
         fname += "data.npy";
         mz_zip_reader_extract_iter_state* data_file = mz_zip_reader_extract_file_iter_new(
             &archive, fname.data(), 0);
         CHECK(nullptr != data_file) << mz_zip_get_error_string(mz_zip_get_last_error(&archive));
         header_len = parse_npy_header_len(data_file, fname, zip_fname);
         header.resize(header_len);
         CHECK_EQ(mz_zip_reader_extract_iter_read(data_file, header.data(), header_len), header_len)
             << "Failed to read from " << fname << " member of " << zip_fname << ": "
             << mz_zip_get_error_string(mz_zip_get_last_error(&archive));
         auto[storage_type_flag, storage_fortran_order, storage_shape] = \
           npy::parse_npy_header_descr(header);
         if (storage_fortran_order) {
           LOG(FATAL) << "Reading fortran order data for sparse arrays not yet implemented.";
         }
         TShape storage_tshape(storage_shape);

         // Prepare reading indices aux array
         fname = dirname;
         fname += "indices.npy";
         mz_zip_reader_extract_iter_state* indices_file = mz_zip_reader_extract_file_iter_new(
             &archive, fname.data(), 0);
         CHECK(nullptr != indices_file) << mz_zip_get_error_string(mz_zip_get_last_error(&archive));
         header_len = parse_npy_header_len(indices_file, fname, zip_fname);
         header.resize(header_len);
         CHECK_EQ(mz_zip_reader_extract_iter_read(indices_file, header.data(), header_len),
                  header_len)
             << "Failed to read from " << fname << " member of " << zip_fname << ": "
             << mz_zip_get_error_string(mz_zip_get_last_error(&archive));
         auto[indices_type_flag, indices_fortran_order, indices_shape] = \
           npy::parse_npy_header_descr(header);
         if (indices_fortran_order) {
           LOG(FATAL) << "Reading fortran order data for sparse arrays not yet implemented.";
         }
         TShape indices_tshape(indices_shape);

         // Read shape data array
         fname = dirname;
         fname += "shape.npy";
         mz_zip_reader_extract_iter_state* shape_file = mz_zip_reader_extract_file_iter_new(
             &archive, fname.data(), 0);
         CHECK(nullptr != shape_file) << mz_zip_get_error_string(mz_zip_get_last_error(&archive));
         header_len = parse_npy_header_len(shape_file, fname, zip_fname);
         header.resize(header_len);
         CHECK_EQ(mz_zip_reader_extract_iter_read(shape_file, header.data(), header_len), header_len)
             << "Failed to read from " << fname << " member of " << zip_fname << ": "
             << mz_zip_get_error_string(mz_zip_get_last_error(&archive));
         auto[shape_type_flag, shape_fortran_order, shape_shape] = \
           npy::parse_npy_header_descr(header);
         if (shape_fortran_order) {
           LOG(FATAL) << "Reading fortran order data for sparse arrays not yet implemented.";
         }
         CHECK_EQ(shape_shape.size(), 1) << "Expected one-dimensional shape of shape information.";
         TShape tshape(shape_shape.at(0), -1);
         if (shape_type_flag == mshadow::kInt64) {  // Used in most SciPy builds
           for (dim_t i = 0; i < shape_shape.at(0); i++) {
             int64_t dim;
             CHECK_EQ(mz_zip_reader_extract_iter_read(shape_file, &dim, 8), 8)
                 << "Failed to read from " << fname << " member of " << zip_fname << ": "
                 << mz_zip_get_error_string(mz_zip_get_last_error(&archive));
             tshape[i] = dim;
           }
         } else if (shape_type_flag == mshadow::kInt32) {  // Used in SciPy pip wheels on Windows
           for (dim_t i = 0; i < shape_shape.at(0); i++) {
             int32_t dim;
             CHECK_EQ(mz_zip_reader_extract_iter_read(shape_file, &dim, 4), 4)
                 << "Failed to read from " << fname << " member of " << zip_fname << ": "
                 << mz_zip_get_error_string(mz_zip_get_last_error(&archive));
             tshape[i] = dim;
           }
         } else {
           LOG(FATAL) << "Expected shape information in int64 or int32 format.";
         }
         CHECK(mz_zip_reader_extract_iter_free(shape_file));

         // Construct aux datastructures
         static_assert(rowsparse::RowSparseAuxType::kIdx == 0);
         const std::vector<int> aux_types {indices_type_flag};
         const mxnet::ShapeVector aux_shapes {indices_tshape};

         // Allocate NDArray
         NDArray array(NDArrayStorageType::kRowSparseStorage, tshape, Context::CPU(), false,
                       storage_type_flag, aux_types, aux_shapes, storage_tshape);

         // Read data array
         const TBlob& blob = array.data();
         size_t nbytes = blob.Size() * mshadow::mshadow_sizeof(blob.type_flag_);
         CHECK_EQ(mz_zip_reader_extract_iter_read(data_file, blob.dptr_, nbytes), nbytes)
             << "Failed to read from " << fname << " member of " << zip_fname << ": "
             << mz_zip_get_error_string(mz_zip_get_last_error(&archive));
         CHECK(mz_zip_reader_extract_iter_free(data_file));

         // Read indices array
         const TBlob& indices_blob = array.aux_data(rowsparse::RowSparseAuxType::kIdx);
         nbytes = indices_blob.Size() * mshadow::mshadow_sizeof(indices_blob.type_flag_);
         CHECK_EQ(mz_zip_reader_extract_iter_read(indices_file, indices_blob.dptr_, nbytes), nbytes)
             << "Failed to read from " << fname << " member of " << zip_fname << ": "
             << mz_zip_get_error_string(mz_zip_get_last_error(&archive));
         CHECK(mz_zip_reader_extract_iter_free(indices_file));

         arrays.push_back(array);
         return_names.emplace_back(dirname.size() ?   // Exclude "/"
                                   dirname.substr(0, dirname.size() - 1) : dirname);

       } else {
         throw std::runtime_error("Loading " + format + " sparse matrix format is unsupported.");
       }
     } else if (dircontents == coo_pattern) {
       throw std::runtime_error("Loading COO sparse matrix format is unsupported.");
     } else if (dircontents == dia_pattern) {
       throw std::runtime_error("Loading DIA sparse matrix format is unsupported.");
     } else {  // Folder does not match scipy sparse pattern; treat containing files as dense
       for (const std::string& fname : dircontents) {
         std::string path(dirname);
         path += fname;
         mz_zip_reader_extract_iter_state* file = mz_zip_reader_extract_file_iter_new(
             &archive, path.data(), 0);
         CHECK(nullptr != file) << mz_zip_get_error_string(mz_zip_get_last_error(&archive));

         uint32_t header_len = parse_npy_header_len(file, path, zip_fname);
         std::string header;
         header.resize(header_len);
         CHECK_EQ(mz_zip_reader_extract_iter_read(file, header.data(), header_len), header_len)
             << "Failed to read from " << fname << " member of " << zip_fname << ": "
             << mz_zip_get_error_string(mz_zip_get_last_error(&archive));
         auto[type_flag, fortran_order, shape] = npy::parse_npy_header_descr(header);

         if (fortran_order) {
           fortran_order_transpose_prepare(shape);
         }

         TShape tshape(shape);
         NDArray array(tshape, Context::CPU(), false, type_flag);
         const TBlob& blob = array.data();
         size_t nbytes = blob.Size() * mshadow::mshadow_sizeof(blob.type_flag_);
         CHECK_EQ(mz_zip_reader_extract_iter_read(file, blob.dptr_, nbytes), nbytes)
             << "Failed to read from " << fname << " member of " << zip_fname << ": "
             << mz_zip_get_error_string(mz_zip_get_last_error(&archive));
         CHECK(mz_zip_reader_extract_iter_free(file));

         if (fortran_order) {
           array = fortran_order_transpose(shape, type_flag, array);
         }

         arrays.push_back(array);
         return_names.emplace_back(path.substr(0, path.size() - 4));
       }
     }
   }

   mz_zip_reader_end(&archive);

   return std::make_pair(arrays, return_names);
 }

 }  // namespace npz
 }  // namespace mxnet