// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// From Apache Impala (incubating) as of 2016-01-29.
#include "arrow/util/cpu_info.h"
#ifdef __APPLE__
#include <sys/sysctl.h>
#endif
#include <stdlib.h>
#include <string.h>
#ifndef _MSC_VER
#include <unistd.h>
#endif
#ifdef _WIN32
#include <immintrin.h>
#include <intrin.h>
#include <array>
#include <bitset>
#include "arrow/util/windows_compatibility.h"
#endif
#include <algorithm>
#include <cctype>
#include <cerrno>
#include <cstdint>
#include <fstream>
#include <memory>
#include <mutex>
#include <string>
#include "arrow/result.h"
#include "arrow/util/io_util.h"
#include "arrow/util/logging.h"
#include "arrow/util/optional.h"
#include "arrow/util/string.h"
namespace arrow {
namespace internal {
namespace {
using std::max;
constexpr int64_t kDefaultL1CacheSize = 32 * 1024; // Level 1: 32k
constexpr int64_t kDefaultL2CacheSize = 256 * 1024; // Level 2: 256k
constexpr int64_t kDefaultL3CacheSize = 3072 * 1024; // Level 3: 3M
#if defined(__MINGW64_VERSION_MAJOR) && __MINGW64_VERSION_MAJOR < 5
void __cpuidex(int CPUInfo[4], int function_id, int subfunction_id) {
__asm__ __volatile__("cpuid"
: "=a"(CPUInfo[0]), "=b"(CPUInfo[1]), "=c"(CPUInfo[2]),
"=d"(CPUInfo[3])
: "a"(function_id), "c"(subfunction_id));
}
int64_t _xgetbv(int xcr) {
int out = 0;
__asm__ __volatile__("xgetbv" : "=a"(out) : "c"(xcr) : "%edx");
return out;
}
#endif
#ifdef __APPLE__
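// Example (illustrative): IntegerSysCtlByName("hw.l2cachesize") returns the L2
// cache size in bytes on macOS, or util::nullopt if the key is unavailable.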
util::optional<int64_t> IntegerSysCtlByName(const char* name) {
size_t len = sizeof(int64_t);
int64_t data = 0;
if (sysctlbyname(name, &data, &len, nullptr, 0) == 0) {
return data;
}
// ENOENT is the official errno value for a non-existent sysctl,
// but EINVAL and ENOTSUP have been seen in the wild.
if (errno != ENOENT && errno != EINVAL && errno != ENOTSUP) {
auto st = IOErrorFromErrno(errno, "sysctlbyname failed for '", name, "'");
ARROW_LOG(WARNING) << st.ToString();
}
return util::nullopt;
}
#endif
#if defined(__GNUC__) && defined(__linux__) && defined(__aarch64__)
// There is no direct instruction to query cache sizes on Arm64 (unlike '__cpuid'
// on x86), so read them from '/sys/devices/system/cpu/cpu0/cache/index*/size':
// index0: L1 Dcache
// index1: L1 Icache
// index2: L2 cache
// index3: L3 cache
const char* kL1CacheSizeFile = "/sys/devices/system/cpu/cpu0/cache/index0/size";
const char* kL2CacheSizeFile = "/sys/devices/system/cpu/cpu0/cache/index2/size";
const char* kL3CacheSizeFile = "/sys/devices/system/cpu/cpu0/cache/index3/size";
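// Example (illustrative): an index file containing "48K" parses to
// 48 * 1024 = 49152 bytes; a missing or unreadable file yields 'default_size'.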
int64_t GetArm64CacheSize(const char* filename, int64_t default_size = -1) {
char* content = nullptr;
char* last_char = nullptr;
size_t file_len = 0;
// Read cache file to 'content' for getting cache size.
FILE* cache_file = fopen(filename, "r");
if (cache_file == nullptr) {
return default_size;
}
int res = getline(&content, &file_len, cache_file);
fclose(cache_file);
// getline() may allocate a buffer even on failure, so take ownership of
// 'content' before checking the result.
std::unique_ptr<char, decltype(&free)> content_guard(content, &free);
if (res == -1) {
return default_size;
}
errno = 0;
const auto cardinal_num = strtoull(content, &last_char, 0);
if (errno != 0) {
return default_size;
}
// kB, MB, or GB suffix; the fall-throughs below are intentional so that e.g.
// 'G' multiplies by 1024 three times.
int64_t multip = 1;
switch (*last_char) {
case 'g':
case 'G':
multip *= 1024;  // fall through
case 'm':
case 'M':
multip *= 1024;  // fall through
case 'k':
case 'K':
multip *= 1024;
}
return cardinal_num * multip;
}
#endif
#if !defined(_WIN32) && !defined(__APPLE__)
struct {
std::string name;
int64_t flag;
} flag_mappings[] = {
#if (defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64))
{"ssse3", CpuInfo::SSSE3}, {"sse4_1", CpuInfo::SSE4_1},
{"sse4_2", CpuInfo::SSE4_2}, {"popcnt", CpuInfo::POPCNT},
{"avx", CpuInfo::AVX}, {"avx2", CpuInfo::AVX2},
{"avx512f", CpuInfo::AVX512F}, {"avx512cd", CpuInfo::AVX512CD},
{"avx512vl", CpuInfo::AVX512VL}, {"avx512dq", CpuInfo::AVX512DQ},
{"avx512bw", CpuInfo::AVX512BW}, {"bmi1", CpuInfo::BMI1},
{"bmi2", CpuInfo::BMI2},
#endif
#if defined(__aarch64__)
{"asimd", CpuInfo::ASIMD},
#endif
};
const int64_t num_flags = sizeof(flag_mappings) / sizeof(flag_mappings[0]);
// Helper function to parse hardware flags.
// 'values' contains a list of space-separated flags; check whether the flags we
// care about are present.
// Returns a bitmap of the recognized flags.
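// Example (illustrative): on x86, a "flags" value such as
//   "fpu ... popcnt avx avx2 bmi1 bmi2"
// yields CpuInfo::POPCNT | CpuInfo::AVX | CpuInfo::AVX2 | CpuInfo::BMI1 |
// CpuInfo::BMI2.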
int64_t ParseCPUFlags(const std::string& values) {
int64_t flags = 0;
for (int i = 0; i < num_flags; ++i) {
if (values.find(flag_mappings[i].name) != std::string::npos) {
flags |= flag_mappings[i].flag;
}
}
return flags;
}
#endif
#ifdef _WIN32
bool RetrieveCacheSize(int64_t* cache_sizes) {
if (!cache_sizes) {
return false;
}
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = nullptr;
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer_position = nullptr;
DWORD buffer_size = 0;
size_t offset = 0;
typedef BOOL(WINAPI * GetLogicalProcessorInformationFuncPointer)(void*, void*);
GetLogicalProcessorInformationFuncPointer func_pointer =
(GetLogicalProcessorInformationFuncPointer)GetProcAddress(
GetModuleHandle("kernel32"), "GetLogicalProcessorInformation");
if (!func_pointer) {
return false;
}
// Query the required buffer size; this preliminary call is expected to fail
// with ERROR_INSUFFICIENT_BUFFER and set 'buffer_size'.
if (!func_pointer(buffer, &buffer_size) && GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
return false;
}
buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(buffer_size);
if (!buffer || !func_pointer(buffer, &buffer_size)) {
free(buffer);  // free(nullptr) is a no-op, so this also covers malloc failure
return false;
}
buffer_position = buffer;
while (offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= buffer_size) {
if (RelationCache == buffer_position->Relationship) {
PCACHE_DESCRIPTOR cache = &buffer_position->Cache;
if (cache->Level >= 1 && cache->Level <= 3) {
cache_sizes[cache->Level - 1] += cache->Size;
}
}
offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
buffer_position++;
}
if (buffer) {
free(buffer);
}
return true;
}
// Source: https://en.wikipedia.org/wiki/CPUID
bool RetrieveCPUInfo(int64_t* hardware_flags, std::string* model_name,
CpuInfo::Vendor* vendor) {
if (!hardware_flags || !model_name || !vendor) {
return false;
}
int register_EAX_id = 1;
int highest_valid_id = 0;
int highest_extended_valid_id = 0;
std::bitset<32> features_ECX;
std::array<int, 4> cpu_info;
// Get highest valid id
__cpuid(cpu_info.data(), 0);
highest_valid_id = cpu_info[0];
// HEX of "GenuineIntel": 47656E75 696E6549 6E74656C
// HEX of "AuthenticAMD": 41757468 656E7469 63414D44
if (cpu_info[1] == 0x756e6547 && cpu_info[2] == 0x49656e69 &&
cpu_info[3] == 0x6c65746e) {
*vendor = CpuInfo::Vendor::Intel;
} else if (cpu_info[1] == 0x68747541 && cpu_info[2] == 0x69746e65 &&
cpu_info[3] == 0x444d4163) {
*vendor = CpuInfo::Vendor::AMD;
}
if (highest_valid_id <= register_EAX_id) return false;
// EAX=1: Processor Info and Feature Bits
__cpuidex(cpu_info.data(), register_EAX_id, 0);
features_ECX = cpu_info[2];
// Get highest extended id
__cpuid(cpu_info.data(), 0x80000000);
highest_extended_valid_id = cpu_info[0];
// Retrieve CPU model name
if (highest_extended_valid_id >= static_cast<int>(0x80000004)) {
model_name->clear();
for (int i = 0x80000002; i <= static_cast<int>(0x80000004); ++i) {
__cpuidex(cpu_info.data(), i, 0);
*model_name +=
std::string(reinterpret_cast<char*>(cpu_info.data()), sizeof(cpu_info));
}
}
bool zmm_enabled = false;
if (features_ECX[27]) { // OSXSAVE
// Query if the OS supports saving ZMM registers when switching contexts
int64_t xcr0 = _xgetbv(0);
zmm_enabled = (xcr0 & 0xE0) == 0xE0;
}
if (features_ECX[9]) *hardware_flags |= CpuInfo::SSSE3;
if (features_ECX[19]) *hardware_flags |= CpuInfo::SSE4_1;
if (features_ECX[20]) *hardware_flags |= CpuInfo::SSE4_2;
if (features_ECX[23]) *hardware_flags |= CpuInfo::POPCNT;
if (features_ECX[28]) *hardware_flags |= CpuInfo::AVX;  // ECX bit 28 is AVX
// cpuid with EAX=7, ECX=0: Extended Features
register_EAX_id = 7;
if (highest_valid_id > register_EAX_id) {
__cpuidex(cpu_info.data(), register_EAX_id, 0);
std::bitset<32> features_EBX = cpu_info[1];
if (features_EBX[3]) *hardware_flags |= CpuInfo::BMI1;
if (features_EBX[5]) *hardware_flags |= CpuInfo::AVX2;
if (features_EBX[8]) *hardware_flags |= CpuInfo::BMI2;
// ARROW-11427: only use AVX512 if enabled by the OS
if (zmm_enabled) {
if (features_EBX[16]) *hardware_flags |= CpuInfo::AVX512F;
if (features_EBX[17]) *hardware_flags |= CpuInfo::AVX512DQ;
if (features_EBX[28]) *hardware_flags |= CpuInfo::AVX512CD;
if (features_EBX[30]) *hardware_flags |= CpuInfo::AVX512BW;
if (features_EBX[31]) *hardware_flags |= CpuInfo::AVX512VL;
}
}
return true;
}
#endif
} // namespace
CpuInfo::CpuInfo()
: hardware_flags_(0),
num_cores_(1),
model_name_("unknown"),
vendor_(Vendor::Unknown) {}
std::unique_ptr<CpuInfo> g_cpu_info;
static std::once_flag cpuinfo_initialized;
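// Example (illustrative) of querying the singleton from calling code:
//
//   CpuInfo* cpu_info = CpuInfo::GetInstance();
//   if (cpu_info->IsSupported(CpuInfo::AVX2)) {
//     // dispatch to an AVX2-specialized code path
//   }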
CpuInfo* CpuInfo::GetInstance() {
std::call_once(cpuinfo_initialized, []() {
g_cpu_info.reset(new CpuInfo);
g_cpu_info->Init();
});
return g_cpu_info.get();
}
void CpuInfo::Init() {
std::string line;
std::string name;
std::string value;
float max_mhz = 0;
int num_cores = 0;
memset(&cache_sizes_, 0, sizeof(cache_sizes_));
#ifdef _WIN32
SYSTEM_INFO system_info;
GetSystemInfo(&system_info);
num_cores = system_info.dwNumberOfProcessors;
LARGE_INTEGER performance_frequency;
if (QueryPerformanceFrequency(&performance_frequency)) {
max_mhz = static_cast<float>(performance_frequency.QuadPart);
}
#elif defined(__APPLE__)
// On macOS, get CPU information from system information base
struct SysCtlCpuFeature {
const char* name;
int64_t flag;
};
std::vector<SysCtlCpuFeature> features = {
#if defined(__aarch64__)
// ARM64 (note that this is exposed under Rosetta as well)
{"hw.optional.neon", ASIMD},
#else
// x86
{"hw.optional.sse4_2", SSSE3 | SSE4_1 | SSE4_2 | POPCNT},
{"hw.optional.avx1_0", AVX},
{"hw.optional.avx2_0", AVX2},
{"hw.optional.bmi1", BMI1},
{"hw.optional.bmi2", BMI2},
{"hw.optional.avx512f", AVX512F},
{"hw.optional.avx512cd", AVX512CD},
{"hw.optional.avx512dq", AVX512DQ},
{"hw.optional.avx512bw", AVX512BW},
{"hw.optional.avx512vl", AVX512VL},
#endif
};
for (const auto& feature : features) {
auto v = IntegerSysCtlByName(feature.name);
if (v.value_or(0)) {
hardware_flags_ |= feature.flag;
}
}
#else
// Read from /proc/cpuinfo
std::ifstream cpuinfo("/proc/cpuinfo", std::ios::in);
while (cpuinfo) {
std::getline(cpuinfo, line);
size_t colon = line.find(':');
if (colon != std::string::npos) {
name = TrimString(line.substr(0, colon - 1));
value = TrimString(line.substr(colon + 1, std::string::npos));
if (name.compare("flags") == 0 || name.compare("Features") == 0) {
hardware_flags_ |= ParseCPUFlags(value);
} else if (name.compare("cpu MHz") == 0) {
// Every core may report a different speed; take the max, assuming that a
// busy core will not be in a lower power state.
// TODO: is there a more robust way to do this, such as
// Windows' QueryPerformanceFrequency()?
float mhz = static_cast<float>(atof(value.c_str()));
max_mhz = max(mhz, max_mhz);
} else if (name.compare("processor") == 0) {
++num_cores;
} else if (name.compare("model name") == 0) {
model_name_ = value;
} else if (name.compare("vendor_id") == 0) {
if (value.compare("GenuineIntel") == 0) {
vendor_ = Vendor::Intel;
} else if (value.compare("AuthenticAMD") == 0) {
vendor_ = Vendor::AMD;
}
}
}
}
if (cpuinfo.is_open()) cpuinfo.close();
#endif
#ifdef __APPLE__
// On macOS, get cache size from system information base
SetDefaultCacheSize();
auto c = IntegerSysCtlByName("hw.l1dcachesize");
if (c.has_value()) {
cache_sizes_[0] = *c;
}
c = IntegerSysCtlByName("hw.l2cachesize");
if (c.has_value()) {
cache_sizes_[1] = *c;
}
c = IntegerSysCtlByName("hw.l3cachesize");
if (c.has_value()) {
cache_sizes_[2] = *c;
}
#elif _WIN32
if (!RetrieveCacheSize(cache_sizes_)) {
SetDefaultCacheSize();
}
RetrieveCPUInfo(&hardware_flags_, &model_name_, &vendor_);
#else
SetDefaultCacheSize();
#endif
if (max_mhz != 0) {
cycles_per_ms_ = static_cast<int64_t>(max_mhz);
#ifndef _WIN32
cycles_per_ms_ *= 1000;
#endif
} else {
cycles_per_ms_ = 1000000;
}
original_hardware_flags_ = hardware_flags_;
if (num_cores > 0) {
num_cores_ = num_cores;
} else {
num_cores_ = 1;
}
// Parse the user simd level
ParseUserSimdLevel();
}
void CpuInfo::VerifyCpuRequirements() {
#ifdef ARROW_HAVE_SSE4_2
if (!IsSupported(CpuInfo::SSSE3)) {
DCHECK(false) << "CPU does not support the Supplemental SSE3 instruction set";
}
#endif
#if defined(ARROW_HAVE_NEON)
if (!IsSupported(CpuInfo::ASIMD)) {
DCHECK(false) << "CPU does not support the Armv8 Neon instruction set";
}
#endif
}
bool CpuInfo::CanUseSSE4_2() const {
#if defined(ARROW_HAVE_SSE4_2)
return IsSupported(CpuInfo::SSE4_2);
#else
return false;
#endif
}
void CpuInfo::EnableFeature(int64_t flag, bool enable) {
if (!enable) {
hardware_flags_ &= ~flag;
} else {
// Can't enable a feature that the hardware does not support
DCHECK_NE(original_hardware_flags_ & flag, 0);
hardware_flags_ |= flag;
}
}
int64_t CpuInfo::hardware_flags() { return hardware_flags_; }
int64_t CpuInfo::CacheSize(CacheLevel level) { return cache_sizes_[level]; }
int64_t CpuInfo::cycles_per_ms() { return cycles_per_ms_; }
int CpuInfo::num_cores() { return num_cores_; }
std::string CpuInfo::model_name() { return model_name_; }
void CpuInfo::SetDefaultCacheSize() {
#if defined(_SC_LEVEL1_DCACHE_SIZE) && !defined(__aarch64__)
// Call sysconf to query for the cache sizes
cache_sizes_[0] = sysconf(_SC_LEVEL1_DCACHE_SIZE);
cache_sizes_[1] = sysconf(_SC_LEVEL2_CACHE_SIZE);
cache_sizes_[2] = sysconf(_SC_LEVEL3_CACHE_SIZE);
ARROW_UNUSED(kDefaultL1CacheSize);
ARROW_UNUSED(kDefaultL2CacheSize);
ARROW_UNUSED(kDefaultL3CacheSize);
#elif defined(__GNUC__) && defined(__linux__) && defined(__aarch64__)
cache_sizes_[0] = GetArm64CacheSize(kL1CacheSizeFile, kDefaultL1CacheSize);
cache_sizes_[1] = GetArm64CacheSize(kL2CacheSizeFile, kDefaultL2CacheSize);
cache_sizes_[2] = GetArm64CacheSize(kL3CacheSizeFile, kDefaultL3CacheSize);
#else
// Provide reasonable default values if no info
cache_sizes_[0] = kDefaultL1CacheSize;
cache_sizes_[1] = kDefaultL2CacheSize;
cache_sizes_[2] = kDefaultL3CacheSize;
#endif
}
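// Example (illustrative): running with ARROW_USER_SIMD_LEVEL=AVX2 leaves the
// SSE4.2, AVX and AVX2 flags intact but clears all AVX512 feature flags
// reported by hardware_flags().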
void CpuInfo::ParseUserSimdLevel() {
auto maybe_env_var = GetEnvVar("ARROW_USER_SIMD_LEVEL");
if (!maybe_env_var.ok()) {
// No user settings
return;
}
std::string s = *std::move(maybe_env_var);
std::transform(s.begin(), s.end(), s.begin(),
[](unsigned char c) { return std::toupper(c); });
int level = USER_SIMD_MAX;
// Parse the level
if (s == "AVX512") {
level = USER_SIMD_AVX512;
} else if (s == "AVX2") {
level = USER_SIMD_AVX2;
} else if (s == "AVX") {
level = USER_SIMD_AVX;
} else if (s == "SSE4_2") {
level = USER_SIMD_SSE4_2;
} else if (s == "NONE") {
level = USER_SIMD_NONE;
} else if (!s.empty()) {
ARROW_LOG(WARNING) << "Invalid value for ARROW_USER_SIMD_LEVEL: " << s;
}
// Disable features above the requested level
if (level < USER_SIMD_AVX512) { // Disable all AVX512 features
EnableFeature(AVX512, false);
}
if (level < USER_SIMD_AVX2) { // Disable all AVX2 features
EnableFeature(AVX2 | BMI2, false);
}
if (level < USER_SIMD_AVX) { // Disable all AVX features
EnableFeature(AVX, false);
}
if (level < USER_SIMD_SSE4_2) { // Disable all SSE4_2 features
EnableFeature(SSE4_2 | BMI1, false);
}
}
} // namespace internal
} // namespace arrow