Bug#1042079: onednn 2.7.4-1 FTBFS on arm64
Package: onednn
Version: 2.7.4-1
Followup-For: Bug #1042079
User: ubuntu-devel@lists.ubuntu.com
Usertags: origin-ubuntu mantic ubuntu-patch
Control: tags -1 patch
Dear Maintainer,
In Ubuntu, the attached patch was applied to achieve the following:
* Apply upstream patches to fix build on arm64 (LP: #2028759)
+ d/patches/lp2028759/cpu-aarch64-update-xbyak_aarch64-into-the-latest-ver(1).patch
+ d/patches/lp2028759/cpu-aarch64-fix-getting-cache-sizes-on-macOS.patch
+ d/patches/lp2028759/cpu-aarch64-update-xbyak_aarch64-into-the-latest-ver(2).patch
The second patch (macOS specific) is not essential but allows the third
patch to apply with fewer modifications.
Thanks for considering the patch.
-- System Information:
Debian Release: bookworm/sid
APT prefers lunar-updates
APT policy: (500, 'lunar-updates'), (500, 'lunar-security'), (500, 'lunar'), (100, 'lunar-backports')
Architecture: amd64 (x86_64)
Foreign Architectures: i386
Kernel: Linux 6.1.0-16-generic (SMP w/8 CPU threads; PREEMPT)
Kernel taint flags: TAINT_PROPRIETARY_MODULE, TAINT_OOT_MODULE
Locale: LANG=en_US.UTF-8, LC_CTYPE=en_US.UTF-8 (charmap=UTF-8), LANGUAGE not set
Shell: /bin/sh linked to /usr/bin/dash
Init: systemd (via /run/systemd/system)
LSM: AppArmor: enabled
diff -Nru onednn-2.7.4/debian/patches/lp2028759/cpu-aarch64-fix-getting-cache-sizes-on-macOS.patch onednn-2.7.4/debian/patches/lp2028759/cpu-aarch64-fix-getting-cache-sizes-on-macOS.patch
--- onednn-2.7.4/debian/patches/lp2028759/cpu-aarch64-fix-getting-cache-sizes-on-macOS.patch 1970-01-01 01:00:00.000000000 +0100
+++ onednn-2.7.4/debian/patches/lp2028759/cpu-aarch64-fix-getting-cache-sizes-on-macOS.patch 2023-07-26 12:14:38.000000000 +0200
@@ -0,0 +1,49 @@
+Description: cpu: aarch64: fix getting cache sizes on macOS
+ Notes from ogayot: This is not strictly needed for Ubuntu but allows the
+ subsequent patch (i.e.,
+ cpu-aarch64-update-xbyak_aarch64-into-the-latest-ver(2).patch) to apply without
+ too many modifications.
+Author: Denis Samoilov <denis.samoylov@intel.com>
+Bug-Ubuntu: https://launchpad.net/bugs/2028759
+Bug-Debian: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1042079
+Forwarded: not-needed
+Applied-Upstream: https://github.com/oneapi-src/oneDNN/commit/ba91a536cb59558b3f3880951def160fe24b2820
+Last-Update: 2023-07-26
+---
+This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
+diff --git a/src/cpu/aarch64/xbyak_aarch64/src/util_impl.cpp b/src/cpu/aarch64/xbyak_aarch64/src/util_impl.cpp
+index 59f712c7b..cb800b250 100644
+--- a/src/cpu/aarch64/xbyak_aarch64/src/util_impl.cpp
++++ b/src/cpu/aarch64/xbyak_aarch64/src/util_impl.cpp
+@@ -52,17 +52,25 @@ void Cpu::setCacheHierarchy() {
+ }
+ } else {
+ /**
+- * @ToDo Get chache information by `sysconf`
++ * @ToDo Get cache information by `sysconf`
+ * for the case thd dictionary is unavailable.
+ */
++
++// _SC_LEVEL<L>_DCACHE_SIZE macros are not defined on macOS.
++#if defined(__APPLE__)
++#define GET_CACHE_SIZE(ID) 0
++#else
++#define GET_CACHE_SIZE(ID) sysconf(ID)
++#endif
++
+ #define XBYAK_AARCH64_CACHE_SIZE(LEVEL, SIZE, ID, CORES, VAL) \
+- cache_size = sysconf(ID); \
++ cache_size = GET_CACHE_SIZE(ID); \
+ VAL[LEVEL] = cache_size ? (cache_size / (CORES)) : ((SIZE) / (CORES));
+
+ uint32_t cache_size;
+
+ /* If `sysconf` returns zero as cache sizes, 32KiB, 1MiB, 0 and 0 is set as
+- 1st, 2nd, 3rd and 4th level cache sizes. 2nd cahce is assumed as sharing cache. */
++ 1st, 2nd, 3rd and 4th level cache sizes. 2nd cache is assumed as sharing cache. */
+ XBYAK_AARCH64_CACHE_SIZE(0, 1024 * 32, _SC_LEVEL1_DCACHE_SIZE, 1, coresSharingDataCache_);
+ XBYAK_AARCH64_CACHE_SIZE(1, 1024 * 1024, _SC_LEVEL2_CACHE_SIZE, 1, coresSharingDataCache_);
+ XBYAK_AARCH64_CACHE_SIZE(2, 0, _SC_LEVEL3_CACHE_SIZE, 1, coresSharingDataCache_);
+--
+2.39.2
+
diff -Nru onednn-2.7.4/debian/patches/lp2028759/cpu-aarch64-update-xbyak_aarch64-into-the-latest-ver(1).patch onednn-2.7.4/debian/patches/lp2028759/cpu-aarch64-update-xbyak_aarch64-into-the-latest-ver(1).patch
--- onednn-2.7.4/debian/patches/lp2028759/cpu-aarch64-update-xbyak_aarch64-into-the-latest-ver(1).patch 1970-01-01 01:00:00.000000000 +0100
+++ onednn-2.7.4/debian/patches/lp2028759/cpu-aarch64-update-xbyak_aarch64-into-the-latest-ver(1).patch 2023-07-26 12:14:38.000000000 +0200
@@ -0,0 +1,609 @@
+Description: cpu: aarch64: update xbyak_aarch64 into the latest version
+ - extend error types
+ - fix bug of Label handling
+ - add experimental APIs
+ - getCoresSharingDataCache
+ - getDataCacheLevels
+ - getDataCacheSize
+ - getNumCores
+Author: Kentaro Kawakami <kawakami.k@fujitsu.com>
+Origin: upstream, https://github.com/oneapi-src/oneDNN/pull/1484
+Bug-Ubuntu: https://launchpad.net/bugs/2028759
+Bug-Debian: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1042079
+Applied-Upstream: https://github.com/oneapi-src/oneDNN/commit/837623961e655753c96fa79ae5d6887bf4912da0
+Last-Update: 2023-07-26
+---
+This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
+diff --git a/src/cpu/aarch64/CMakeLists.txt b/src/cpu/aarch64/CMakeLists.txt
+index 612d198b2..d5af77fcc 100644
+--- a/src/cpu/aarch64/CMakeLists.txt
++++ b/src/cpu/aarch64/CMakeLists.txt
+@@ -22,6 +22,7 @@ file(GLOB_RECURSE SOURCES
+
+ file(GLOB XBYAK_AARCH64_FILES
+ ${CMAKE_CURRENT_SOURCE_DIR}/xbyak_aarch64/src/xbyak_aarch64_impl.cpp
++ ${CMAKE_CURRENT_SOURCE_DIR}/xbyak_aarch64/src/util_impl.cpp
+ )
+
+ list(REMOVE_ITEM SOURCES ${XBYAK_AARCH64_FILES})
+diff --git a/src/cpu/aarch64/xbyak_aarch64/CMakeLists.txt b/src/cpu/aarch64/xbyak_aarch64/CMakeLists.txt
+index 39d952439..3397ff22c 100644
+--- a/src/cpu/aarch64/xbyak_aarch64/CMakeLists.txt
++++ b/src/cpu/aarch64/xbyak_aarch64/CMakeLists.txt
+@@ -16,6 +16,7 @@
+
+ file(GLOB SOURCES
+ ${CMAKE_CURRENT_SOURCE_DIR}/src/xbyak_aarch64_impl.cpp
++ ${CMAKE_CURRENT_SOURCE_DIR}/src/util_impl.cpp
+ )
+
+ set(OBJ_LIB ${DNNL_LIBRARY_NAME}_cpu_aarch64_xbyak_aarch64)
+diff --git a/src/cpu/aarch64/xbyak_aarch64/src/util_impl.cpp b/src/cpu/aarch64/xbyak_aarch64/src/util_impl.cpp
+new file mode 100644
+index 000000000..59f712c7b
+--- /dev/null
++++ b/src/cpu/aarch64/xbyak_aarch64/src/util_impl.cpp
+@@ -0,0 +1,267 @@
++/*******************************************************************************
++ * Copyright 2020-2022 FUJITSU LIMITED
++ *
++ * Licensed under the Apache License, Version 2.0 (the "License");
++ * you may not use this file except in compliance with the License.
++ * You may obtain a copy of the License at
++ *
++ * http://www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing, software
++ * distributed under the License is distributed on an "AS IS" BASIS,
++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++ * See the License for the specific language governing permissions and
++ * limitations under the License.
++ *******************************************************************************/
++#include "xbyak_aarch64_util.h"
++
++namespace Xbyak_aarch64 {
++namespace util {
++
++void Cpu::setCacheHierarchy() {
++ /* Cache size of AArch64 CPUs are described in the system registers,
++ which can't be read from user-space applications.
++ Linux provides `sysconf` API and `/sys/devices/system/cpu/`
++ device files to get cache size, but they dosen't always return
++ correct values. It may depend on Linux kernel version and
++ support status of CPUs. To avoid this situation, cahche size is
++ firstly read from `cacheInfoDict`, secondly get by `sysconf`.
++
++ `sysconf` example
++ #include <unistd.h>
++ int main() {
++ reutrn sysconf(_SC_LEVEL1_DCACHE_SIZE);
++ }
++ */
++ const cacheInfo_t *c = nullptr;
++
++ for (size_t j = 0; j < sizeof(cacheInfoDict) / sizeof(cacheInfo_t); j++) {
++ if (cacheInfoDict[j].midr_el1 == midr_el1_) {
++ c = cacheInfoDict + j;
++ break;
++ }
++ }
++
++ if (c != nullptr) {
++ dataCacheLevel_ = c->dataCacheLevel;
++ for (uint32_t i = 0; i < maxNumberCacheLevel; i++) {
++ if (i < c->highestInnerCacheLevel)
++ dataCacheSize_[i] = c->dataCacheSize[i];
++ else
++ dataCacheSize_[i] = c->dataCacheSize[i] / coresSharingDataCache_[i];
++ }
++ } else {
++ /**
++ * @ToDo Get chache information by `sysconf`
++ * for the case thd dictionary is unavailable.
++ */
++#define XBYAK_AARCH64_CACHE_SIZE(LEVEL, SIZE, ID, CORES, VAL) \
++ cache_size = sysconf(ID); \
++ VAL[LEVEL] = cache_size ? (cache_size / (CORES)) : ((SIZE) / (CORES));
++
++ uint32_t cache_size;
++
++ /* If `sysconf` returns zero as cache sizes, 32KiB, 1MiB, 0 and 0 is set as
++ 1st, 2nd, 3rd and 4th level cache sizes. 2nd cahce is assumed as sharing cache. */
++ XBYAK_AARCH64_CACHE_SIZE(0, 1024 * 32, _SC_LEVEL1_DCACHE_SIZE, 1, coresSharingDataCache_);
++ XBYAK_AARCH64_CACHE_SIZE(1, 1024 * 1024, _SC_LEVEL2_CACHE_SIZE, 1, coresSharingDataCache_);
++ XBYAK_AARCH64_CACHE_SIZE(2, 0, _SC_LEVEL3_CACHE_SIZE, 1, coresSharingDataCache_);
++ XBYAK_AARCH64_CACHE_SIZE(3, 0, _SC_LEVEL4_CACHE_SIZE, 1, coresSharingDataCache_);
++
++ XBYAK_AARCH64_CACHE_SIZE(0, 1024 * 32, _SC_LEVEL1_DCACHE_SIZE, 1, dataCacheSize_);
++ XBYAK_AARCH64_CACHE_SIZE(1, 1024 * 1024, _SC_LEVEL2_CACHE_SIZE, 8, dataCacheSize_);
++ XBYAK_AARCH64_CACHE_SIZE(2, 0, _SC_LEVEL3_CACHE_SIZE, 1, dataCacheSize_);
++ XBYAK_AARCH64_CACHE_SIZE(3, 0, _SC_LEVEL4_CACHE_SIZE, 1, dataCacheSize_);
++#undef XBYAK_AARCH64_CACHE_SIZE
++ }
++}
++
++void Cpu::setNumCores() {
++#ifdef __linux__
++ /**
++ * @ToDo There are some methods to get # of cores.
++ Considering various kernel versions and CPUs, a combination of
++ multiple methods may be required.
++ 1) sysconf(_SC_NPROCESSORS_ONLN)
++ 2) /sys/devices/system/cpu/online
++ 3) std::thread::hardware_concurrency()
++ */
++ numCores_[0] = numCores_[1] = sysconf(_SC_NPROCESSORS_ONLN);
++ coresSharingDataCache_[0] = 1;
++
++ /* # of numa nodes: /sys/devices/system/node/node[0-9]+
++ # of cores for each numa node: /sys/devices/system/node/node[0-9]+/cpu[0-9]+
++ It is assumed L2 cache is shared by each numa node. */
++ const int nodes = getFilePathMaxTailNumPlus1(XBYAK_AARCH64_PATH_NODES);
++ int cores = 1;
++
++ if (nodes > 0) {
++ cores = getFilePathMaxTailNumPlus1(XBYAK_AARCH64_PATH_CORES);
++ coresSharingDataCache_[1] = (cores > 0) ? cores : 1;
++ } else {
++ coresSharingDataCache_[1] = 1;
++ }
++#else
++ numCores_[0] = numCores_[1] = 1;
++ for (unsigned int i = 0; i < maxNumberCacheLevel; i++)
++ coresSharingDataCache_[i] = 1;
++
++ coresSharingDataCache_[1] = 8; // Set possible value.
++#endif
++}
++
++void Cpu::setSysRegVal() {
++#ifdef __linux__
++ XBYAK_AARCH64_READ_SYSREG(midr_el1_, MIDR_EL1);
++#endif
++}
++
++/**
++ * Return directory path
++ * @param[in] path ex. /sys/devices/system/node/node
++ * @param[out] buf ex. /sys/devices/system/node
++ */
++int Cpu::getRegEx(char *buf, const char *path, const char *regex) {
++ regex_t regexBuf;
++ regmatch_t match[1];
++
++ if (regcomp(®exBuf, regex, REG_EXTENDED) != 0)
++ throw ERR_INTERNAL;
++
++ const int retVal = regexec(®exBuf, path, 1, match, 0);
++ regfree(®exBuf);
++
++ if (retVal != 0)
++ return -1;
++
++ const int startIdx = match[0].rm_so;
++ const int endIdx = match[0].rm_eo;
++
++ /* Something wrong (multiple match or not match) */
++ if (startIdx == -1 || endIdx == -1 || (endIdx - startIdx - 1) < 1)
++ return -1;
++
++ strncpy(buf, path + startIdx, endIdx - startIdx);
++ buf[endIdx - startIdx] = '\0';
++
++ return 0;
++}
++
++int Cpu::getFilePathMaxTailNumPlus1(const char *path) {
++#ifdef __linux__
++ char dir_path[max_path_len];
++ char file_pattern[max_path_len];
++ int retVal = 0;
++
++ getRegEx(dir_path, path, "/([^/]+/)+");
++ /* Remove last '/'. */
++ dir_path[strlen(dir_path) - 1] = '\0';
++ getRegEx(file_pattern, path, "[^/]+$");
++ strncat(file_pattern, "[0-9]+", 16);
++
++ fflush(stdout);
++
++ DIR *dir = opendir(dir_path);
++ struct dirent *dp;
++
++ dp = readdir(dir);
++ while (dp != NULL) {
++ if (getRegEx(dir_path, dp->d_name, file_pattern) == 0)
++ retVal++;
++ dp = readdir(dir);
++ }
++
++ if (dir != NULL)
++ closedir(dir);
++
++ return retVal;
++#else
++ return 0;
++#endif
++}
++
++Cpu::Cpu() : type_(tNONE), sveLen_(SVE_NONE) {
++#ifdef __linux__
++ unsigned long hwcap = getauxval(AT_HWCAP);
++ if (hwcap & HWCAP_ATOMICS) {
++ type_ |= tATOMIC;
++ }
++
++ if (hwcap & HWCAP_FP) {
++ type_ |= tFP;
++ }
++ if (hwcap & HWCAP_ASIMD) {
++ type_ |= tADVSIMD;
++ }
++#ifdef HWCAP_SVE
++ /* Some old <sys/auxv.h> may not define HWCAP_SVE.
++ In that case, SVE is treated as if it were not supported. */
++ if (hwcap & HWCAP_SVE) {
++ type_ |= tSVE;
++ // svcntb(); if arm_sve.h is available
++ sveLen_ = (sveLen_t)prctl(51); // PR_SVE_GET_VL
++ }
++#endif
++#elif defined(__APPLE__)
++ size_t val = 0;
++ size_t len = sizeof(val);
++
++ if (sysctlbyname(hw_opt_atomics, &val, &len, NULL, 0) != 0)
++ throw Error(ERR_INTERNAL);
++ else
++ type_ |= (val == 1) ? tATOMIC : 0;
++
++ if (sysctlbyname(hw_opt_fp, &val, &len, NULL, 0) != 0)
++ throw Error(ERR_INTERNAL);
++ else
++ type_ |= (val == 1) ? tFP : 0;
++
++ if (sysctlbyname(hw_opt_neon, &val, &len, NULL, 0) != 0)
++ throw Error(ERR_INTERNAL);
++ else
++ type_ |= (val == 1) ? tADVSIMD : 0;
++#endif
++
++ setSysRegVal();
++ setNumCores();
++ setCacheHierarchy();
++}
++
++Type Cpu::getType() const { return type_; }
++bool Cpu::has(Type type) const { return (type & type_) != 0; }
++uint64_t Cpu::getSveLen() const { return sveLen_; }
++bool Cpu::isAtomicSupported() const { return type_ & tATOMIC; }
++const char *Cpu::getImplementer() const {
++ uint64_t implementer = (midr_el1_ >> 24) & 0xff;
++
++ for (size_t i = 0; i < sizeof(implementers) / sizeof(implementer_t); i++) {
++ if (implementers[i].id == implementer)
++ return implementers[i].implementer;
++ }
++
++ return nullptr;
++}
++
++uint32_t Cpu::getCoresSharingDataCache(uint32_t i) const {
++ if (i >= dataCacheLevel_)
++ throw Error(ERR_BAD_PARAMETER);
++ return coresSharingDataCache_[i];
++}
++
++uint32_t Cpu::getDataCacheLevels() const { return dataCacheLevel_; }
++
++uint32_t Cpu::getDataCacheSize(uint32_t i) const {
++ if (i >= dataCacheLevel_)
++ throw Error(ERR_BAD_PARAMETER);
++ return dataCacheSize_[i];
++}
++uint32_t Cpu::getNumCores(Arm64CpuTopologyLevel level) const {
++ switch (level) {
++ case CoreLevel:
++ return numCores_[level - 1];
++ default:
++ throw Error(ERR_BAD_PARAMETER);
++ }
++}
++} // namespace util
++} // namespace Xbyak_aarch64
+diff --git a/src/cpu/aarch64/xbyak_aarch64/src/xbyak_aarch64_impl.h b/src/cpu/aarch64/xbyak_aarch64/src/xbyak_aarch64_impl.h
+index b634958b9..ba6da0fe3 100644
+--- a/src/cpu/aarch64/xbyak_aarch64/src/xbyak_aarch64_impl.h
++++ b/src/cpu/aarch64/xbyak_aarch64/src/xbyak_aarch64_impl.h
+@@ -485,7 +485,7 @@ uint32_t CodeGenerator::CondBrImmEnc(uint32_t cond, int64_t labelOffset) {
+ }
+
+ void CodeGenerator::CondBrImm(Cond cond, const Label &label) {
+- auto encFunc = [=](int64_t labelOffset) { return CondBrImmEnc(cond, labelOffset); };
++ auto encFunc = [&, cond](int64_t labelOffset) { return CondBrImmEnc(cond, labelOffset); };
+ JmpLabel jmpL = JmpLabel(encFunc, size_);
+ uint32_t code = CondBrImmEnc(cond, genLabelOffset(label, jmpL));
+ dd(code);
+@@ -617,7 +617,7 @@ uint32_t CodeGenerator::UncondBrImmEnc(uint32_t op, int64_t labelOffset) {
+ }
+
+ void CodeGenerator::UncondBrImm(uint32_t op, const Label &label) {
+- auto encFunc = [=](int64_t labelOffset) { return UncondBrImmEnc(op, labelOffset); };
++ auto encFunc = [&, op](int64_t labelOffset) { return UncondBrImmEnc(op, labelOffset); };
+ JmpLabel jmpL = JmpLabel(encFunc, size_);
+ uint32_t code = UncondBrImmEnc(op, genLabelOffset(label, jmpL));
+ dd(code);
+@@ -638,7 +638,7 @@ uint32_t CodeGenerator::CompareBrEnc(uint32_t op, const RReg &rt, int64_t labelO
+ }
+
+ void CodeGenerator::CompareBr(uint32_t op, const RReg &rt, const Label &label) {
+- auto encFunc = [=](int64_t labelOffset) { return CompareBrEnc(op, rt, labelOffset); };
++ auto encFunc = [=, &op](int64_t labelOffset) { return CompareBrEnc(op, rt, labelOffset); };
+ JmpLabel jmpL = JmpLabel(encFunc, size_);
+ uint32_t code = CompareBrEnc(op, rt, genLabelOffset(label, jmpL));
+ dd(code);
+@@ -665,7 +665,7 @@ uint32_t CodeGenerator::TestBrEnc(uint32_t op, const RReg &rt, uint32_t imm, int
+ }
+
+ void CodeGenerator::TestBr(uint32_t op, const RReg &rt, uint32_t imm, const Label &label) {
+- auto encFunc = [=](int64_t labelOffset) { return TestBrEnc(op, rt, imm, labelOffset); };
++ auto encFunc = [&, op, rt, imm](int64_t labelOffset) { return TestBrEnc(op, rt, imm, labelOffset); };
+ JmpLabel jmpL = JmpLabel(encFunc, size_);
+ uint32_t code = TestBrEnc(op, rt, imm, genLabelOffset(label, jmpL));
+ dd(code);
+diff --git a/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_err.h b/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_err.h
+index 353df2ede..09403ed34 100644
+--- a/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_err.h
++++ b/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_err.h
+@@ -58,32 +58,37 @@ public:
+ if (err_ <= 0)
+ return;
+ fprintf(stderr, "bad err=%d in Xbyak::Error\n", err_);
+- static const char *tbl[32] = {"none",
+- "code is too big",
+- "label is redefined",
+- "label is too far",
+- "label is not found",
+- "bad parameter",
+- "can't protect",
+- "offset is too big",
+- "can't alloc",
+- "label is not set by L()",
+- "label is already set by L()",
+- "internal error",
+- "illegal register index (can not encoding register index)",
+- "illegal register element index (can not encoding element index)",
+- "illegal predicate register type",
+- "illegal immediate parameter (range error)",
+- "illegal immediate parameter (unavailable value error)",
+- "illegal immediate parameter (condition error)",
+- "illegal shift-mode paramater",
+- "illegal extend-mode parameter",
+- "illegal condition parameter",
+- "illegal barrier option",
+- "illegal const parameter (range error)",
+- "illegal const parameter (unavailable error)",
+- "illegal const parameter (condition error)",
+- "illegal type"};
++ static const char *tbl[32] = {
++ "none",
++ "code is too big",
++ "label is redefined",
++ "label is too far",
++ "label is not found",
++ "bad parameter",
++ "can't protect",
++ "offset is too big",
++ "can't alloc",
++ "label is not set by L()",
++ "label is already set by L()",
++ "internal error",
++ "illegal register index (can not encoding register index)",
++ "illegal register element index (can not encoding element index)",
++ "illegal predicate register type",
++ "illegal immediate parameter (range error)",
++ "illegal immediate parameter (unavailable value error)",
++ "illegal immediate parameter (condition error)",
++ "illegal shift-mode paramater",
++ "illegal extend-mode parameter",
++ "illegal condition parameter",
++ "illegal barrier option",
++ "illegal const parameter (range error)",
++ "illegal const parameter (unavailable error)",
++ "illegal const parameter (condition error)",
++ "illegal type",
++ "bad align",
++ "bad addressing",
++ "bad scale",
++ };
+ if ((size_t)err_ >= sizeof(tbl) / sizeof(tbl[0])) {
+ msg_ = "bad err num";
+ } else {
+diff --git a/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_util.h b/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_util.h
+index c23088e48..f8878ced4 100644
+--- a/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_util.h
++++ b/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_util.h
+@@ -1,6 +1,6 @@
+ #pragma once
+ /*******************************************************************************
+- * Copyright 2020-2021 FUJITSU LIMITED
++ * Copyright 2020-2022 FUJITSU LIMITED
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+@@ -17,10 +17,17 @@
+ #ifndef XBYAK_AARCH64_UTIL_H_
+ #define XBYAK_AARCH64_UTIL_H_
+
++#include <dirent.h>
++#include <regex.h>
+ #include <stdint.h>
++#include <stdio.h>
++#include <stdlib.h>
++#include <string.h>
++
+ #ifdef __linux__
+ #include <sys/auxv.h>
+ #include <sys/prctl.h>
++#include <unistd.h>
+
+ /* In old Linux such as Ubuntu 16.04, HWCAP_ATOMICS, HWCAP_FP, HWCAP_ASIMD
+ can not be found in <bits/hwcap.h> which is included from <sys/auxv.h>.
+@@ -36,8 +43,20 @@
+
+ #include "xbyak_aarch64_err.h"
+
++#define XBYAK_AARCH64_MIDR_EL1(I, V, A, P, R) ((I << 24) | (V << 20) | (A << 16) | (P << 4) | (R << 0))
++#define XBYAK_AARCH64_PATH_NODES "/sys/devices/system/node/node"
++#define XBYAK_AARCH64_PATH_CORES "/sys/devices/system/node/node0/cpu"
++#define XBYAK_AARCH64_READ_SYSREG(var, ID) asm("mrs %0, " #ID : "=r"(var));
++
+ namespace Xbyak_aarch64 {
+ namespace util {
++typedef uint64_t Type;
++
++constexpr uint32_t maxNumberCacheLevel = 4;
++constexpr uint32_t maxTopologyLevel = 2;
++constexpr uint32_t max_path_len = 1024;
++
++enum Arm64CpuTopologyLevel { SmtLevel = 1, CoreLevel = 2 };
+
+ enum sveLen_t {
+ SVE_NONE = 0,
+@@ -59,12 +78,39 @@ enum sveLen_t {
+ SVE_2048 = 16 * 16,
+ };
+
++struct implementer_t {
++ uint32_t id;
++ const char *implementer;
++};
++
++struct cacheInfo_t {
++ uint64_t midr_el1;
++ uint32_t dataCacheLevel;
++ uint32_t highestInnerCacheLevel;
++ uint32_t dataCacheSize[maxNumberCacheLevel];
++};
++
+ #ifdef __APPLE__
+ constexpr char hw_opt_atomics[] = "hw.optional.armv8_1_atomics";
+ constexpr char hw_opt_fp[] = "hw.optional.floatingpoint";
+ constexpr char hw_opt_neon[] = "hw.optional.neon";
+ #endif
+
++const struct implementer_t implementers[] = {{0x00, "Reserved for software use"},
++ {0xC0, "Ampere Computing"},
++ {0x41, "Arm Limited"},
++ {0x42, "Broadcom Corporation"},
++ {0x43, "Cavium Inc."},
++ {0x44, "Digital Equipment Corporation"},
++ {0x46, "Fujitsu Ltd."},
++ {0x49, "Infineon Technologies AG"},
++ {0x4D, "Motorola or Freescale Semiconductor Inc."},
++ {0x4E, "NVIDIA Corporation"},
++ {0x50, "Applied Micro Circuits Corporation"},
++ {0x51, "Qualcomm Inc."},
++ {0x56, "Marvell International Ltd."},
++ {0x69, "Intel Corporation"}};
++
+ /**
+ CPU detection class
+ */
+@@ -72,63 +118,44 @@ class Cpu {
+ uint64_t type_;
+ sveLen_t sveLen_;
+
+-public:
+- typedef uint64_t Type;
++private:
++ const struct cacheInfo_t cacheInfoDict[2] = {
++ {/* A64FX */ XBYAK_AARCH64_MIDR_EL1(0x46, 0x1, 0xf, 0x1, 0x0), 2, 1, {1024 * 64, 1024 * 1024 * 8 * 4, 0, 0}},
++ {/* A64FX */ XBYAK_AARCH64_MIDR_EL1(0x46, 0x2, 0xf, 0x1, 0x0), 2, 1, {1024 * 64, 1024 * 1024 * 8 * 4, 0, 0}},
++ };
+
++ uint32_t coresSharingDataCache_[maxNumberCacheLevel];
++ uint32_t dataCacheSize_[maxNumberCacheLevel];
++ uint32_t dataCacheLevel_;
++ uint64_t midr_el1_;
++ uint32_t numCores_[maxTopologyLevel];
++
++ void setCacheHierarchy();
++ void setNumCores();
++ void setSysRegVal();
++ int getRegEx(char *buf, const char *path, const char *regex);
++ int getFilePathMaxTailNumPlus1(const char *path);
++
++public:
+ static const Type tNONE = 0;
+ static const Type tADVSIMD = 1 << 1;
+ static const Type tFP = 1 << 2;
+ static const Type tSVE = 1 << 3;
+ static const Type tATOMIC = 1 << 4;
+
+- Cpu() : type_(tNONE), sveLen_(SVE_NONE) {
+-#ifdef __linux__
+- unsigned long hwcap = getauxval(AT_HWCAP);
+- if (hwcap & HWCAP_ATOMICS) {
+- type_ |= tATOMIC;
+- }
+-
+- if (hwcap & HWCAP_FP) {
+- type_ |= tFP;
+- }
+- if (hwcap & HWCAP_ASIMD) {
+- type_ |= tADVSIMD;
+- }
+-#ifdef HWCAP_SVE
+- /* Some old <sys/auxv.h> may not define HWCAP_SVE.
+- In that case, SVE is treated as if it were not supported. */
+- if (hwcap & HWCAP_SVE) {
+- type_ |= tSVE;
+- // svcntb(); if arm_sve.h is available
+- sveLen_ = (sveLen_t)prctl(51); // PR_SVE_GET_VL
+- }
+-#endif
+-#elif defined(__APPLE__)
+- size_t val = 0;
+- size_t len = sizeof(val);
+-
+- if (sysctlbyname(hw_opt_atomics, &val, &len, NULL, 0) != 0)
+- throw Error(ERR_INTERNAL);
+- else
+- type_ |= (val == 1) ? tATOMIC : 0;
+-
+- if (sysctlbyname(hw_opt_fp, &val, &len, NULL, 0) != 0)
+- throw Error(ERR_INTERNAL);
+- else
+- type_ |= (val == 1) ? tFP : 0;
+-
+- if (sysctlbyname(hw_opt_neon, &val, &len, NULL, 0) != 0)
+- throw Error(ERR_INTERNAL);
+- else
+- type_ |= (val == 1) ? tADVSIMD : 0;
+-#endif
+- }
+-
+- Type getType() const { return type_; }
+- bool has(Type type) const { return (type & type_) != 0; }
+- uint64_t getSveLen() const { return sveLen_; }
+- bool isAtomicSupported() const { return type_ & tATOMIC; }
++ Cpu();
++
++ Type getType() const;
++ bool has(Type type) const;
++ uint64_t getSveLen() const;
++ bool isAtomicSupported() const;
++ const char *getImplementer() const;
++ uint32_t getCoresSharingDataCache(uint32_t i) const;
++ uint32_t getDataCacheLevels() const;
++ uint32_t getDataCacheSize(uint32_t i) const;
++ uint32_t getNumCores(Arm64CpuTopologyLevel level) const;
+ };
++
+ } // namespace util
+ } // namespace Xbyak_aarch64
+ #endif
+--
+2.39.2
+
diff -Nru onednn-2.7.4/debian/patches/lp2028759/cpu-aarch64-update-xbyak_aarch64-into-the-latest-ver(2).patch onednn-2.7.4/debian/patches/lp2028759/cpu-aarch64-update-xbyak_aarch64-into-the-latest-ver(2).patch
--- onednn-2.7.4/debian/patches/lp2028759/cpu-aarch64-update-xbyak_aarch64-into-the-latest-ver(2).patch 1970-01-01 01:00:00.000000000 +0100
+++ onednn-2.7.4/debian/patches/lp2028759/cpu-aarch64-update-xbyak_aarch64-into-the-latest-ver(2).patch 2023-07-26 12:14:38.000000000 +0200
@@ -0,0 +1,1813 @@
+Description: cpu: aarch64: update xbyak_aarch64 into the latest version
+Author: Kentaro Kawakami <kawakami.k@fujitsu.com>
+Origin: upstream, https://github.com/oneapi-src/oneDNN/pull/1557
+Bug: https://github.com/oneapi-src/oneDNN/issues/1600
+Bug-Ubuntu: https://launchpad.net/bugs/2028759
+Bug-Debian: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1042079
+Applied-Upstream: https://github.com/oneapi-src/oneDNN/commit/856d8bb1ef9ef80e5488f7ce65171d133de0aae1
+Last-Update: 2023-07-26
+---
+This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
+diff --git a/src/cpu/aarch64/cpu_isa_traits.hpp b/src/cpu/aarch64/cpu_isa_traits.hpp
+index a4c9aaf2c..5f16cd05a 100644
+--- a/src/cpu/aarch64/cpu_isa_traits.hpp
++++ b/src/cpu/aarch64/cpu_isa_traits.hpp
+@@ -1,6 +1,6 @@
+ /*******************************************************************************
+-* Copyright 2018-2022 Intel Corporation
+-* Copyright 2020-2022 FUJITSU LIMITED
++* Copyright 2018-2023 Intel Corporation
++* Copyright 2020-2023 FUJITSU LIMITED
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+@@ -181,15 +181,19 @@ static inline bool mayiuse(const cpu_isa_t cpu_isa, bool soft = false) {
+ if ((cpu_isa_mask & cpu_isa) != cpu_isa) return false;
+
+ switch (cpu_isa) {
+- case asimd: return cpu().has(Cpu::tADVSIMD);
++ case asimd: return cpu().has(XBYAK_AARCH64_HWCAP_ADVSIMD);
+ case sve_128:
+- return cpu().has(Cpu::tSVE) && cpu().getSveLen() >= SVE_128;
++ return cpu().has(XBYAK_AARCH64_HWCAP_SVE)
++ && cpu().getSveLen() >= SVE_128;
+ case sve_256:
+- return cpu().has(Cpu::tSVE) && cpu().getSveLen() >= SVE_256;
++ return cpu().has(XBYAK_AARCH64_HWCAP_SVE)
++ && cpu().getSveLen() >= SVE_256;
+ case sve_384:
+- return cpu().has(Cpu::tSVE) && cpu().getSveLen() >= SVE_384;
++ return cpu().has(XBYAK_AARCH64_HWCAP_SVE)
++ && cpu().getSveLen() >= SVE_384;
+ case sve_512:
+- return cpu().has(Cpu::tSVE) && cpu().getSveLen() >= SVE_512;
++ return cpu().has(XBYAK_AARCH64_HWCAP_SVE)
++ && cpu().getSveLen() >= SVE_512;
+ case isa_any: return true;
+ case isa_all: return false;
+ }
+diff --git a/src/cpu/aarch64/xbyak_aarch64/src/err_impl.h b/src/cpu/aarch64/xbyak_aarch64/src/err_impl.h
+new file mode 100644
+index 000000000..51d2f136c
+--- /dev/null
++++ b/src/cpu/aarch64/xbyak_aarch64/src/err_impl.h
+@@ -0,0 +1,59 @@
++#pragma once
++/*******************************************************************************
++ * Copyright 2018-2023 Intel Corporation
++ * Copyright 2020-2023 FUJITSU LIMITED
++ *
++ * Licensed under the Apache License, Version 2.0 (the "License");
++ * you may not use this file except in compliance with the License.
++ * You may obtain a copy of the License at
++ *
++ * http://www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing, software
++ * distributed under the License is distributed on an "AS IS" BASIS,
++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++ * See the License for the specific language governing permissions and
++ * limitations under the License.
++ *******************************************************************************/
++
++Error::Error(int err) : err_(err), msg_("") {
++ if (err_ <= 0)
++ return;
++ fprintf(stderr, "bad err=%d in Xbyak::Error\n", err_);
++ static const char *tbl[32] = {
++ "none",
++ "code is too big",
++ "label is redefined",
++ "label is too far",
++ "label is not found",
++ "bad parameter",
++ "can't protect",
++ "offset is too big",
++ "can't alloc",
++ "label is not set by L()",
++ "label is already set by L()",
++ "internal error",
++ "illegal register index (can not encoding register index)",
++ "illegal register element index (can not encoding element index)",
++ "illegal predicate register type",
++ "illegal immediate parameter (range error)",
++ "illegal immediate parameter (unavailable value error)",
++ "illegal immediate parameter (condition error)",
++ "illegal shift-mode paramater",
++ "illegal extend-mode parameter",
++ "illegal condition parameter",
++ "illegal barrier option",
++ "illegal const parameter (range error)",
++ "illegal const parameter (unavailable error)",
++ "illegal const parameter (condition error)",
++ "illegal type",
++ "bad align",
++ "bad addressing",
++ "bad scale",
++ };
++ if ((size_t)err_ >= sizeof(tbl) / sizeof(tbl[0])) {
++ msg_ = "bad err num";
++ } else {
++ msg_ = tbl[err_];
++ }
++}
+diff --git a/src/cpu/aarch64/xbyak_aarch64/src/util_impl.cpp b/src/cpu/aarch64/xbyak_aarch64/src/util_impl.cpp
+index cb800b250..2e05993a8 100644
+--- a/src/cpu/aarch64/xbyak_aarch64/src/util_impl.cpp
++++ b/src/cpu/aarch64/xbyak_aarch64/src/util_impl.cpp
+@@ -1,5 +1,5 @@
+ /*******************************************************************************
+- * Copyright 2020-2022 FUJITSU LIMITED
++ * Copyright 2020-2023 FUJITSU LIMITED
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+@@ -13,263 +13,189 @@
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *******************************************************************************/
+-#include "xbyak_aarch64_util.h"
+-
+-namespace Xbyak_aarch64 {
+-namespace util {
+-
+-void Cpu::setCacheHierarchy() {
+- /* Cache size of AArch64 CPUs are described in the system registers,
+- which can't be read from user-space applications.
+- Linux provides `sysconf` API and `/sys/devices/system/cpu/`
+- device files to get cache size, but they dosen't always return
+- correct values. It may depend on Linux kernel version and
+- support status of CPUs. To avoid this situation, cahche size is
+- firstly read from `cacheInfoDict`, secondly get by `sysconf`.
++#include <stdint.h>
++#include <stdio.h>
++#include <stdlib.h>
++#include <string.h>
+
+- `sysconf` example
+- #include <unistd.h>
+- int main() {
+- reutrn sysconf(_SC_LEVEL1_DCACHE_SIZE);
+- }
+- */
+- const cacheInfo_t *c = nullptr;
+-
+- for (size_t j = 0; j < sizeof(cacheInfoDict) / sizeof(cacheInfo_t); j++) {
+- if (cacheInfoDict[j].midr_el1 == midr_el1_) {
+- c = cacheInfoDict + j;
+- break;
+- }
+- }
++#include "xbyak_aarch64_err.h"
++#include "xbyak_aarch64_util.h"
+
+- if (c != nullptr) {
+- dataCacheLevel_ = c->dataCacheLevel;
+- for (uint32_t i = 0; i < maxNumberCacheLevel; i++) {
+- if (i < c->highestInnerCacheLevel)
+- dataCacheSize_[i] = c->dataCacheSize[i];
+- else
+- dataCacheSize_[i] = c->dataCacheSize[i] / coresSharingDataCache_[i];
+- }
+- } else {
+- /**
+- * @ToDo Get cache information by `sysconf`
+- * for the case thd dictionary is unavailable.
+- */
++#include "util_impl.h"
+
+-// _SC_LEVEL<L>_DCACHE_SIZE macros are not defined on macOS.
+-#if defined(__APPLE__)
+-#define GET_CACHE_SIZE(ID) 0
++#if defined(__linux__)
++#include "util_impl_linux.h"
++#elif defined(__APPLE__)
++#include "util_impl_mac.h"
++#elif defined(_M_ARM64)
++#include "util_impl_windows.h"
+ #else
+-#define GET_CACHE_SIZE(ID) sysconf(ID)
++#error "Unsupported OS"
+ #endif
+
+-#define XBYAK_AARCH64_CACHE_SIZE(LEVEL, SIZE, ID, CORES, VAL) \
+- cache_size = GET_CACHE_SIZE(ID); \
+- VAL[LEVEL] = cache_size ? (cache_size / (CORES)) : ((SIZE) / (CORES));
+-
+- uint32_t cache_size;
+-
+- /* If `sysconf` returns zero as cache sizes, 32KiB, 1MiB, 0 and 0 is set as
+- 1st, 2nd, 3rd and 4th level cache sizes. 2nd cache is assumed as sharing cache. */
+- XBYAK_AARCH64_CACHE_SIZE(0, 1024 * 32, _SC_LEVEL1_DCACHE_SIZE, 1, coresSharingDataCache_);
+- XBYAK_AARCH64_CACHE_SIZE(1, 1024 * 1024, _SC_LEVEL2_CACHE_SIZE, 1, coresSharingDataCache_);
+- XBYAK_AARCH64_CACHE_SIZE(2, 0, _SC_LEVEL3_CACHE_SIZE, 1, coresSharingDataCache_);
+- XBYAK_AARCH64_CACHE_SIZE(3, 0, _SC_LEVEL4_CACHE_SIZE, 1, coresSharingDataCache_);
+-
+- XBYAK_AARCH64_CACHE_SIZE(0, 1024 * 32, _SC_LEVEL1_DCACHE_SIZE, 1, dataCacheSize_);
+- XBYAK_AARCH64_CACHE_SIZE(1, 1024 * 1024, _SC_LEVEL2_CACHE_SIZE, 8, dataCacheSize_);
+- XBYAK_AARCH64_CACHE_SIZE(2, 0, _SC_LEVEL3_CACHE_SIZE, 1, dataCacheSize_);
+- XBYAK_AARCH64_CACHE_SIZE(3, 0, _SC_LEVEL4_CACHE_SIZE, 1, dataCacheSize_);
+-#undef XBYAK_AARCH64_CACHE_SIZE
++namespace Xbyak_aarch64 {
++namespace util {
++void CpuInfo::dumpCacheInfo() const {
++ printf("numCores=%d\n", numCores_[/* Physical cores */ 1]);
++ for (size_t i = 0; i < maxCacheLevel; i++) {
++ auto cache = &cacheInfo_.levelCache[i];
++ printf("L%zd, %d, %d, %d, %d, %d, %d, %d\n", i + 1, cache->type, cache->size[0], cache->size[1], cache->size[2], cache->sharingCores[0], cache->sharingCores[1], cache->sharingCores[2]);
+ }
+ }
+
+-void Cpu::setNumCores() {
+-#ifdef __linux__
+- /**
+- * @ToDo There are some methods to get # of cores.
+- Considering various kernel versions and CPUs, a combination of
+- multiple methods may be required.
+- 1) sysconf(_SC_NPROCESSORS_ONLN)
+- 2) /sys/devices/system/cpu/online
+- 3) std::thread::hardware_concurrency()
+- */
+- numCores_[0] = numCores_[1] = sysconf(_SC_NPROCESSORS_ONLN);
+- coresSharingDataCache_[0] = 1;
+-
+- /* # of numa nodes: /sys/devices/system/node/node[0-9]+
+- # of cores for each numa node: /sys/devices/system/node/node[0-9]+/cpu[0-9]+
+- It is assumed L2 cache is shared by each numa node. */
+- const int nodes = getFilePathMaxTailNumPlus1(XBYAK_AARCH64_PATH_NODES);
+- int cores = 1;
+-
+- if (nodes > 0) {
+- cores = getFilePathMaxTailNumPlus1(XBYAK_AARCH64_PATH_CORES);
+- coresSharingDataCache_[1] = (cores > 0) ? cores : 1;
++int CpuInfo::getCacheSize(cacheType_t type, uint32_t level) const {
++ if (level <= maxCacheLevel) {
++ auto cache = &cacheInfo_.levelCache[level - 1];
++ switch (type) {
++ case ICache:
++ return cache->size[0];
++ break;
++ case DCache:
++ return cache->size[1];
++ break;
++ case UCache:
++ return cache->size[2];
++ break;
++ default:
++ throw Error(ERR_BAD_PARAMETER);
++ break;
++ }
+ } else {
+- coresSharingDataCache_[1] = 1;
++ throw Error(ERR_BAD_PARAMETER);
+ }
+-#else
+- numCores_[0] = numCores_[1] = 1;
+- for (unsigned int i = 0; i < maxNumberCacheLevel; i++)
+- coresSharingDataCache_[i] = 1;
+-
+- coresSharingDataCache_[1] = 8; // Set possible value.
+-#endif
+-}
+-
+-void Cpu::setSysRegVal() {
+-#ifdef __linux__
+- XBYAK_AARCH64_READ_SYSREG(midr_el1_, MIDR_EL1);
+-#endif
+ }
+
+-/**
+- * Return directory path
+- * @param[in] path ex. /sys/devices/system/node/node
+- * @param[out] buf ex. /sys/devices/system/node
+- */
+-int Cpu::getRegEx(char *buf, const char *path, const char *regex) {
+- regex_t regexBuf;
+- regmatch_t match[1];
+-
+- if (regcomp(&regexBuf, regex, REG_EXTENDED) != 0)
+- throw ERR_INTERNAL;
+-
+- const int retVal = regexec(&regexBuf, path, 1, match, 0);
+- regfree(&regexBuf);
+-
+- if (retVal != 0)
+- return -1;
++Arm64CacheType CpuInfo::getCacheType(int level) const { return cacheInfo_.levelCache[level - 1].type; }
++int CpuInfo::getCodeCacheSize(int level) const { return cacheInfo_.levelCache[level - 1].size[0]; }
++
++int CpuInfo::getCoresSharingDataCache(int level) const {
++ auto cache = &cacheInfo_.levelCache[level - 1];
++ int cores;
++
++ switch (cache->type) {
++ case DataCacheOnly:
++ case SeparateCache:
++ cores = cache->sharingCores[1];
++ break;
++ case UnifiedCache:
++ cores = cache->sharingCores[2];
++ break;
++ default:
++ cores = 0;
++ break;
++ }
+
+- const int startIdx = match[0].rm_so;
+- const int endIdx = match[0].rm_eo;
++ return cores;
++}
+
+- /* Something wrong (multiple match or not match) */
+- if (startIdx == -1 || endIdx == -1 || (endIdx - startIdx - 1) < 1)
+- return -1;
++int CpuInfo::getDataCacheSize(int level) const { return cacheInfo_.levelCache[level - 1].size[1]; }
+
+- strncpy(buf, path + startIdx, endIdx - startIdx);
+- buf[endIdx - startIdx] = '\0';
++const char *CpuInfo::getImplementer() const { return implementer_; }
++int CpuInfo::getLastDataCacheLevel() const { return lastDataCacheLevel_; }
+
+- return 0;
++int CpuInfo::getNumCores(Arm64CpuTopologyLevel level) const {
++ switch (level) {
++ case SmtLevel:
++ return numCores_[0];
++ break;
++ case CoreLevel:
++ return numCores_[1];
++ break;
++ default:
++ return 0;
++ }
+ }
+
+-int Cpu::getFilePathMaxTailNumPlus1(const char *path) {
+-#ifdef __linux__
+- char dir_path[max_path_len];
+- char file_pattern[max_path_len];
+- int retVal = 0;
+-
+- getRegEx(dir_path, path, "/([^/]+/)+");
+- /* Remove last '/'. */
+- dir_path[strlen(dir_path) - 1] = '\0';
+- getRegEx(file_pattern, path, "[^/]+$");
+- strncat(file_pattern, "[0-9]+", 16);
++uint64_t CpuInfo::getSveLen() const { return sveLen_; }
++Type CpuInfo::getType() const { return type_; }
++int CpuInfo::getUnifiedCacheSize(int level) const { return cacheInfo_.levelCache[level - 1].size[2]; }
+
+- fflush(stdout);
+-
+- DIR *dir = opendir(dir_path);
+- struct dirent *dp;
++void CpuInfo::init() {
++ for (size_t i = 0; i < maxCacheLevel; i++) {
++ auto cache = &cacheInfo_.levelCache[i];
++ cache->type = NoCache;
++ cache->size[0] = cache->size[1] = cache->size[2] = 0;
++ cache->sharingCores[0] = cache->sharingCores[1] = cache->sharingCores[2] = 0;
++ }
++}
+
+- dp = readdir(dir);
+- while (dp != NULL) {
+- if (getRegEx(dir_path, dp->d_name, file_pattern) == 0)
+- retVal++;
+- dp = readdir(dir);
++void CpuInfo::put() const {
++ printf("numCores=%d\n", numCores_[0]);
++ for (int level = 1; level <= 3; level++) {
++ printf("L%d unified size = %d\n", level, getUnifiedCacheSize(level));
++ printf("L%d code size = %d\n", level, getCodeCacheSize(level));
++ printf("L%d data size = %d\n", level, getDataCacheSize(level));
+ }
++}
+
+- if (dir != NULL)
+- closedir(dir);
++void CpuInfo::setImplementer() {
++ const uint32_t id = (cacheInfo_.midr_el1 >> 24) & 0xff;
++ const int lastId = sizeof(implementers) / sizeof(implementer_t);
+
+- return retVal;
+-#else
+- return 0;
+-#endif
++ for (int i = 0; i < lastId; i++) {
++ if (implementers[i].id == id) {
++ implementer_ = implementers[i].implementer;
++ return;
++ }
++ }
++ implementer_ = (char *)implementers[lastId - 1].implementer;
+ }
+
+-Cpu::Cpu() : type_(tNONE), sveLen_(SVE_NONE) {
+-#ifdef __linux__
+- unsigned long hwcap = getauxval(AT_HWCAP);
+- if (hwcap & HWCAP_ATOMICS) {
+- type_ |= tATOMIC;
++void CpuInfo::setLastDataCacheLevel() {
++ for (uint32_t i = 0; i < maxCacheLevel; i++) {
++ const Arm64CacheType type = cacheInfo_.levelCache[i].type;
++ if (type == DataCacheOnly || type == SeparateCache || type == UnifiedCache)
++ lastDataCacheLevel_ = i + 1;
+ }
++}
+
+- if (hwcap & HWCAP_FP) {
+- type_ |= tFP;
+- }
+- if (hwcap & HWCAP_ASIMD) {
+- type_ |= tADVSIMD;
+- }
+-#ifdef HWCAP_SVE
+- /* Some old <sys/auxv.h> may not define HWCAP_SVE.
+- In that case, SVE is treated as if it were not supported. */
+- if (hwcap & HWCAP_SVE) {
+- type_ |= tSVE;
+- // svcntb(); if arm_sve.h is available
+- sveLen_ = (sveLen_t)prctl(51); // PR_SVE_GET_VL
+- }
+-#endif
++Cpu::Cpu() {
++#if defined(__linux__)
++ info = new CpuInfoLinux();
+ #elif defined(__APPLE__)
+- size_t val = 0;
+- size_t len = sizeof(val);
+-
+- if (sysctlbyname(hw_opt_atomics, &val, &len, NULL, 0) != 0)
+- throw Error(ERR_INTERNAL);
+- else
+- type_ |= (val == 1) ? tATOMIC : 0;
++ info = new CpuInfoMac();
++#elif defined(_M_ARM64)
++ info = new CpuInfoWindows();
++#endif
++}
+
+- if (sysctlbyname(hw_opt_fp, &val, &len, NULL, 0) != 0)
+- throw Error(ERR_INTERNAL);
+- else
+- type_ |= (val == 1) ? tFP : 0;
++void Cpu::dumpCacheInfo() const { return info->dumpCacheInfo(); }
+
+- if (sysctlbyname(hw_opt_neon, &val, &len, NULL, 0) != 0)
+- throw Error(ERR_INTERNAL);
+- else
+- type_ |= (val == 1) ? tADVSIMD : 0;
++/* 2023.02.11 To be removed. */
++#if defined(__GNUC__) || defined(__clang_version__)
++uint32_t Cpu::getDataCacheLevels() const { return getLastDataCacheLevel(); }
+ #endif
+
+- setSysRegVal();
+- setNumCores();
+- setCacheHierarchy();
+-}
++Arm64CacheType Cpu::getCacheType(const Arm64CacheLevel i) const { return info->getCacheType(i); }
+
+-Type Cpu::getType() const { return type_; }
+-bool Cpu::has(Type type) const { return (type & type_) != 0; }
+-uint64_t Cpu::getSveLen() const { return sveLen_; }
+-bool Cpu::isAtomicSupported() const { return type_ & tATOMIC; }
+-const char *Cpu::getImplementer() const {
+- uint64_t implementer = (midr_el1_ >> 24) & 0xff;
++uint32_t Cpu::getCoresSharingDataCache(const Arm64CacheLevel i) const { return info->getCoresSharingDataCache(i); }
+
+- for (size_t i = 0; i < sizeof(implementers) / sizeof(implementer_t); i++) {
+- if (implementers[i].id == implementer)
+- return implementers[i].implementer;
++uint32_t Cpu::getDataCacheSize(const Arm64CacheLevel i) const {
++ uint32_t size;
++ switch (info->getCacheType(i)) {
++ case DataCacheOnly:
++ case SeparateCache:
++ size = info->getDataCacheSize(i);
++ break;
++ case UnifiedCache:
++ size = info->getUnifiedCacheSize(i);
++ break;
++ default:
++ size = 0;
++ break;
+ }
+
+- return nullptr;
++ return size;
+ }
+
+-uint32_t Cpu::getCoresSharingDataCache(uint32_t i) const {
+- if (i >= dataCacheLevel_)
+- throw Error(ERR_BAD_PARAMETER);
+- return coresSharingDataCache_[i];
+-}
++const char *Cpu::getImplementer() const { return info->getImplementer(); }
+
+-uint32_t Cpu::getDataCacheLevels() const { return dataCacheLevel_; }
++uint32_t Cpu::getLastDataCacheLevel() const { return info->getLastDataCacheLevel(); }
++uint32_t Cpu::getNumCores(Arm64CpuTopologyLevel level) const { return info->getNumCores(level); }
++uint64_t Cpu::getSveLen() const { return info->getSveLen(); }
++Type Cpu::getType() const { return info->getType(); }
++bool Cpu::has(Type type) const { return (type & info->getType()) != 0; }
++bool Cpu::isAtomicSupported() const { return info->getType() & (Type)XBYAK_AARCH64_HWCAP_ATOMIC; }
+
+-uint32_t Cpu::getDataCacheSize(uint32_t i) const {
+- if (i >= dataCacheLevel_)
+- throw Error(ERR_BAD_PARAMETER);
+- return dataCacheSize_[i];
+-}
+-uint32_t Cpu::getNumCores(Arm64CpuTopologyLevel level) const {
+- switch (level) {
+- case CoreLevel:
+- return numCores_[level - 1];
+- default:
+- throw Error(ERR_BAD_PARAMETER);
+- }
+-}
+ } // namespace util
+ } // namespace Xbyak_aarch64
+diff --git a/src/cpu/aarch64/xbyak_aarch64/src/util_impl.h b/src/cpu/aarch64/xbyak_aarch64/src/util_impl.h
+new file mode 100644
+index 000000000..db3a7bd64
+--- /dev/null
++++ b/src/cpu/aarch64/xbyak_aarch64/src/util_impl.h
+@@ -0,0 +1,87 @@
++/*******************************************************************************
++ * Copyright 2020-2023 FUJITSU LIMITED
++ *
++ * Licensed under the Apache License, Version 2.0 (the "License");
++ * you may not use this file except in compliance with the License.
++ * You may obtain a copy of the License at
++ *
++ * http://www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing, software
++ * distributed under the License is distributed on an "AS IS" BASIS,
++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++ * See the License for the specific language governing permissions and
++ * limitations under the License.
++ *******************************************************************************/
++namespace Xbyak_aarch64 {
++namespace util {
++
++struct levelCacheInfo_t {
++ Arm64CacheType type;
++ /* I cache size, D cache size and Unified cache size.
++ Example1: some cache level of a CPU core has separate caches of
++ 32KB I$ and 64KB D$, size[] = {1024 * 32, 1024 * 64, 0},
++ sharingCores[] = {1, 1, 0}.
++ Example2: some cache level of CPU cores has a 1MB unified cache,
++ and it is shared by 12 CPU cores,
++ size[] = {0, 0, 1024 * 1024}, sharingCores[] = {0, 0, 12}.
++ */
++ uint32_t size[3]; // I cache, D cache, Unified cache
++ uint32_t sharingCores[3]; // I cache, D cache, Unified cache
++};
++
++struct cacheInfo_v2_t {
++ uint64_t midr_el1; // used as table index
++ levelCacheInfo_t levelCache[maxCacheLevel];
++};
++
++const struct implementer_t implementers[] = {
++ {0x00, "Reserved for software use"},
++ {0xC0, "Ampere Computing"},
++ {0x41, "Arm Limited"},
++ {0x42, "Broadcom Corporation"},
++ {0x43, "Cavium Inc."},
++ {0x44, "Digital Equipment Corporation"},
++ {0x46, "Fujitsu Ltd."},
++ {0x49, "Infineon Technologies AG"},
++ {0x4D, "Motorola or Freescale Semiconductor Inc."},
++ {0x4E, "NVIDIA Corporation"},
++ {0x50, "Applied Micro Circuits Corporation"},
++ {0x51, "Qualcomm Inc."},
++ {0x56, "Marvell International Ltd."},
++ {0x69, "Intel Corporation"},
++ {0xFE, "Apple Inc."}, // Xbyak_aarch64 original definition
++ {0xFF, "Cannot identified"},
++};
++
++class CpuInfo {
++protected:
++ int numCores_[2] = {}; // [0]: SmtLevel, [1]: CoreLevel
++ cacheInfo_v2_t cacheInfo_;
++ uint32_t lastDataCacheLevel_;
++ Type type_ = 0;
++ uint64_t sveLen_ = 0;
++ const char *implementer_ = nullptr;
++
++ void init();
++ void setImplementer();
++ void setLastDataCacheLevel();
++
++public:
++ CpuInfo() {}
++ void dumpCacheInfo() const;
++ int getCacheSize(cacheType_t type, uint32_t level) const;
++ Arm64CacheType getCacheType(int level) const;
++ int getCodeCacheSize(int level) const;
++ int getCoresSharingDataCache(int level) const;
++ int getDataCacheSize(int level) const;
++ const char *getImplementer() const;
++ int getLastDataCacheLevel() const;
++ int getNumCores(Arm64CpuTopologyLevel level = CoreLevel) const;
++ uint64_t getSveLen() const;
++ Type getType() const;
++ int getUnifiedCacheSize(int level) const;
++ void put() const;
++};
++} // namespace util
++} // namespace Xbyak_aarch64
+diff --git a/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h b/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h
+new file mode 100644
+index 000000000..3531ab3cb
+--- /dev/null
++++ b/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h
+@@ -0,0 +1,426 @@
++/*******************************************************************************
++ * Copyright 2020-2023 FUJITSU LIMITED
++ *
++ * Licensed under the Apache License, Version 2.0 (the "License");
++ * you may not use this file except in compliance with the License.
++ * You may obtain a copy of the License at
++ *
++ * http://www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing, software
++ * distributed under the License is distributed on an "AS IS" BASIS,
++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++ * See the License for the specific language governing permissions and
++ * limitations under the License.
++ *******************************************************************************/
++#ifndef __linux__
++#error "Something wrong"
++#endif
++
++#include <stdint.h>
++#include <stdio.h>
++#include <stdlib.h>
++#include <string.h>
++
++#include <dirent.h>
++#include <regex.h>
++#include <sys/auxv.h>
++#include <sys/prctl.h>
++#include <unistd.h>
++
++#include "xbyak_aarch64_err.h"
++#include "xbyak_aarch64_util.h"
++
++/* In old Linux such as Ubuntu 16.04, HWCAP_ATOMICS, HWCAP_FP, HWCAP_ASIMD
++ can not be found in <bits/hwcap.h> which is included from <sys/auxv.h>.
++ Xbyak_aarch64 uses <asm/hwcap.h> as an alternative.
++ */
++#ifndef HWCAP_FP
++#include <asm/hwcap.h>
++#endif
++
++namespace Xbyak_aarch64 {
++namespace util {
++#define XBYAK_AARCH64_ERROR_ fprintf(stderr, "%s, %d, Error occurrs during read cache infomation.\n", __FILE__, __LINE__);
++#define XBYAK_AARCH64_PATH_NODES "/sys/devices/system/node/node"
++#define XBYAK_AARCH64_PATH_CORES "/sys/devices/system/node/node0/cpu"
++#define XBYAK_AARCH64_PATH_CACHE_DIR "/sys/devices/system/cpu/cpu0/cache"
++#define XBYAK_AARCH64_PATH_CACHE_LEVEL "/sys/devices/system/cpu/cpu0/cache/index0/level"
++#define XBYAK_AARCH64_PATH_CACHE_SIZE "/sys/devices/system/cpu/cpu0/cache/index0/size"
++#define XBYAK_AARCH64_PATH_CACHE_TYPE "/sys/devices/system/cpu/cpu0/cache/index0/type"
++#define XBYAK_AARCH64_PATH_CACHE_LIST "/sys/devices/system/cpu/cpu0/cache/index0/shared_cpu_list"
++#define XBYAK_AARCH64_MIDR_EL1(I, V, A, P, R) ((I << 24) | (V << 20) | (A << 16) | (P << 4) | (R << 0))
++#define XBYAK_AARCH64_READ_SYSREG(var, ID) asm("mrs %0, " #ID : "=r"(var));
++
++class CpuInfoLinux : public CpuInfo {
++public:
++ CpuInfoLinux() {
++ init();
++ setSysRegVal(); // Read MIDR_EL1 before setCacheHierarchy().
++ setNumCores();
++ setHwCap();
++ setCacheHierarchy();
++ setImplementer();
++ }
++
++private:
++ static constexpr int max_path_len = 1024;
++ static constexpr int buf_size = 1024;
++ const struct cacheInfo_v2_t cacheInfoDict[2] = {{/* A64FX */ XBYAK_AARCH64_MIDR_EL1(0x46, 0x1, 0xf, 0x1, 0x0),
++ {{/* L1 */ SeparateCache, {1024 * 64, 1024 * 64, 0}, {1, 1, 0}},
++ {/* L2 */ UnifiedCache, {0, 0, 1024 * 1024 * 8}, {0, 0, 12}},
++ {/* L3 */ NoCache, {0, 0, 0}, {0, 0, 0}},
++ {/* L4 */ NoCache, {0, 0, 0}, {0, 0, 0}},
++ {/* L5 */ NoCache, {0, 0, 0}, {0, 0, 0}},
++ {/* L6 */ NoCache, {0, 0, 0}, {0, 0, 0}},
++ {/* L7 */ NoCache, {0, 0, 0}, {0, 0, 0}}}},
++ {/* A64FX */ XBYAK_AARCH64_MIDR_EL1(0x46, 0x2, 0xf, 0x1, 0x0),
++ {{/* L1 */ SeparateCache, {1024 * 64, 1024 * 64, 0}, {1, 1, 0}},
++ {/* L2 */ UnifiedCache, {0, 0, 1024 * 1024 * 8}, {0, 0, 12}},
++ {/* L3 */ NoCache, {0, 0, 0}, {0, 0, 0}},
++ {/* L4 */ NoCache, {0, 0, 0}, {0, 0, 0}},
++ {/* L5 */ NoCache, {0, 0, 0}, {0, 0, 0}},
++ {/* L6 */ NoCache, {0, 0, 0}, {0, 0, 0}},
++ {/* L7 */ NoCache, {0, 0, 0}, {0, 0, 0}}}}};
++
++ int getFilePathMaxTailNumPlus1(const char *path) {
++ char dir_path[max_path_len];
++ char file_pattern[max_path_len];
++ int retVal = 0;
++
++ getRegEx(dir_path, path, "/([^/]+/)+");
++ /* Remove last '/'. */
++ dir_path[strlen(dir_path) - 1] = '\0';
++ getRegEx(file_pattern, path, "[^/]+$");
++ strncat(file_pattern, "[0-9]+", 16);
++
++ DIR *dir = opendir(dir_path);
++ struct dirent *dp;
++
++ dp = readdir(dir);
++ while (dp != NULL) {
++ if (getRegEx(dir_path, dp->d_name, file_pattern) == 0)
++ retVal++;
++ dp = readdir(dir);
++ }
++
++ if (dir != NULL)
++ closedir(dir);
++
++ return retVal;
++ }
++
++ void getLineInFile(char *buf, const char *path, const int num) {
++ auto chomp = [](char *ptr, const int num) {
++ for (int i = 0; i < num; i++) {
++ if ('\n' == *(ptr + i))
++ *(ptr + i) = '\0';
++ else if ('\0' == *(ptr + i))
++ break;
++ }
++ };
++
++ FILE *fp = fopen(path, "r");
++ if (!(fp && fread(buf, sizeof(char), num, fp)))
++ buf[0] = '\0';
++
++ chomp(buf, buf_size);
++ }
++
++ /**
++ * Return directory path
++ * @param[in] path ex. /sys/devices/system/node/node
++ * @param[out] buf ex. /sys/devices/system/node
++ */
++ int getRegEx(char *buf, const char *path, const char *regex) {
++ regex_t regexBuf;
++ regmatch_t match[1];
++
++ if (regcomp(&regexBuf, regex, REG_EXTENDED) != 0)
++ throw ERR_INTERNAL;
++
++ const int retVal = regexec(&regexBuf, path, 1, match, 0);
++ regfree(&regexBuf);
++
++ if (retVal != 0)
++ return -1;
++
++ const int startIdx = match[0].rm_so;
++ const int endIdx = match[0].rm_eo;
++
++ /* Something wrong (multiple match or not match) */
++ if (startIdx == -1 || endIdx == -1 || (endIdx - startIdx - 1) < 1)
++ return -1;
++
++ strncpy(buf, path + startIdx, endIdx - startIdx);
++ buf[endIdx - startIdx] = '\0';
++
++ return 0;
++ }
++
++ /* Read the following files and set cacheInfo_.
++ If an error occurs, partial results are cleared and false is returned.
++ /sys/devices/system/cpu/cpu0/cache/index[0-9]+/level "1", "2", ...
++ /sys/devices/system/cpu/cpu0/cache/index[0-9]+/size "32K", "1M"
++ /sys/devices/system/cpu/cpu0/cache/index[0-9]+/type "Instruction", "Data", "Unified"
++ /sys/devices/system/cpu/cpu0/cache/index[0-9]+/shared_cpu_list "0", "0-1"
++ */
++ bool readCacheInfoFromSysDevice() {
++ char buf0[buf_size];
++ char buf1[buf_size];
++ char buf2[buf_size];
++ struct dirent *dp;
++ DIR *dir = opendir(XBYAK_AARCH64_PATH_CACHE_DIR);
++ if (dir == NULL)
++ goto init_and_return_false;
++
++ dp = readdir(dir);
++ while (dp != NULL) {
++ regex_t regexBuf;
++ regmatch_t match[2];
++
++ if (regcomp(&regexBuf, "index[0-9]*$", REG_EXTENDED) != 0)
++ throw ERR_INTERNAL;
++
++ if (regexec(&regexBuf, dp->d_name, 1, match, 0) == 0) { // Found index[1-9][0-9]. directory
++ char *dir_name = buf0;
++ char *file_name = buf1;
++ char *buf = buf2;
++ char *end_ptr;
++ strncpy(dir_name, XBYAK_AARCH64_PATH_CACHE_DIR, buf_size);
++ strncat(dir_name, "/", 2);
++ strncat(dir_name, dp->d_name + match[0].rm_so, match[0].rm_eo - match[0].rm_so);
++ strncat(dir_name, "/", 2);
++
++ // Get cache level
++ strncpy(file_name, dir_name, buf_size);
++ strncat(file_name, "level", buf_size);
++ getLineInFile(buf, file_name, buf_size);
++ const long int level = strtol(buf, &end_ptr, 10);
++ if ('\0' != *end_ptr) { // Non-numeric characters exist.
++ XBYAK_AARCH64_ERROR_;
++ goto init_and_return_false;
++ }
++
++ // Get cache size
++ strncpy(file_name, dir_name, buf_size);
++ strncat(file_name, "size", buf_size);
++ getLineInFile(buf, file_name, buf_size);
++ long int size = strtol(buf, &end_ptr, 10);
++ if ('\0' != *end_ptr) {
++ if (strncmp(end_ptr, "K", 2) == 0) {
++ size = size * 1024;
++ } else if (strncmp(end_ptr, "M", 2) == 0) {
++ size = size * 1024 * 1024;
++ } else {
++ XBYAK_AARCH64_ERROR_;
++ goto init_and_return_false;
++ }
++ }
++
++ // Get cache type
++ Arm64CacheType type;
++ strncpy(file_name, dir_name, buf_size);
++ strncat(file_name, "type", buf_size);
++ getLineInFile(buf, file_name, buf_size);
++ if (strncmp(buf, "Instruction", buf_size) == 0) {
++ type = InstCacheOnly;
++ } else if (strncmp(buf, "Data", buf_size) == 0) {
++ type = DataCacheOnly;
++ } else if (strncmp(buf, "Unified", buf_size) == 0) {
++ type = UnifiedCache;
++ } else { // Unconsidered text exists.
++ XBYAK_AARCH64_ERROR_;
++ goto init_and_return_false;
++ }
++
++ /* Get cache-sharing cpu list
++ Example0: "0"
++ Example1: "0-7"
++ Example2: "0,64"
++ Example3: "0-31,64-95" */
++ long int start, end;
++ int sharing_cores = 0;
++ strncpy(file_name, dir_name, buf_size);
++ strncat(file_name, "shared_cpu_list", buf_size);
++ getLineInFile(buf, file_name, buf_size);
++ /* Debug:
++ strncpy(buf, "0", buf_size);
++ strncpy(buf, "4-8", buf_size);
++ strncpy(buf, "2,34,111", buf_size);
++ strncpy(buf, "12-23,48-60", buf_size);
++ */
++ end_ptr = buf;
++ while ('\0' != *buf) {
++ printf("%s\n", buf);
++ start = strtol(buf, &end_ptr, 10);
++ if ('\0' == *end_ptr) {
++ sharing_cores += 1;
++ // No more core exists.
++ break;
++ } else if ('-' == *end_ptr) {
++ buf = end_ptr + 1;
++ end = strtol(buf, &end_ptr, 10);
++ sharing_cores += end - start + 1;
++ buf = end_ptr;
++ while (',' == *buf || ' ' == *buf)
++ buf++;
++ } else if (',' == *end_ptr) {
++ buf = end_ptr + 1;
++ sharing_cores += 1;
++ } else {
++ XBYAK_AARCH64_ERROR_;
++ goto init_and_return_false;
++ }
++ }
++
++ auto cache = &cacheInfo_.levelCache[level - 1];
++
++ switch (type) {
++ case UnifiedCache:
++ cache->type = UnifiedCache;
++ cache->size[2] = size;
++ cache->sharingCores[2] = sharing_cores;
++ break;
++ case InstCacheOnly:
++ cache->type = cache->type == DataCacheOnly ? SeparateCache : InstCacheOnly;
++ cache->size[0] = size;
++ cache->sharingCores[0] = sharing_cores;
++ break;
++ case DataCacheOnly:
++ cache->type = cache->type == InstCacheOnly ? SeparateCache : DataCacheOnly;
++ cache->size[1] = size;
++ cache->sharingCores[1] = sharing_cores;
++ break;
++ default:
++ XBYAK_AARCH64_ERROR_;
++ goto init_and_return_false;
++ }
++ }
++
++ regfree(&regexBuf);
++ dp = readdir(dir); // Try next
++ }
++
++ if (dir != NULL)
++ closedir(dir);
++
++ setLastDataCacheLevel();
++ return true;
++
++ init_and_return_false:
++ init(); // Clear halfway result
++ return false;
++ }
++
++ void setCacheHierarchy() {
++ /* Cache sizes of AArch64 CPUs are described in the system registers,
++ which can't be read from user-space applications.
++ Linux provides the `sysconf` API and the `/sys/devices/system/cpu/`
++ device files to get cache sizes, but they don't always return
++ correct values. It may depend on the Linux kernel version and
++ support status of CPUs. To avoid this situation, the cache size is
++ first looked up in `cacheInfoDict`, then read from sysfs, then from `sysconf`.
++
++ `sysconf` example
++ #include <unistd.h>
++ int main() {
++ return sysconf(_SC_LEVEL1_DCACHE_SIZE);
++ }
++ */
++ const cacheInfo_v2_t *c = nullptr;
++ const uint64_t midr_el1 = cacheInfo_.midr_el1;
++
++ for (size_t j = 0; j < sizeof(cacheInfoDict) / sizeof(cacheInfo_v2_t); j++) {
++ if (cacheInfoDict[j].midr_el1 == midr_el1) {
++ c = cacheInfoDict + j;
++ break;
++ }
++ }
++
++ if (c != nullptr) {
++ for (size_t i = 0; i < maxCacheLevel; i++) {
++ auto dict = &c->levelCache[i];
++ auto cache = &cacheInfo_.levelCache[i];
++ cache->type = dict->type;
++
++ switch (dict->type) {
++ case InstCacheOnly:
++ cache->size[0] = dict->size[0];
++ cache->sharingCores[0] = dict->sharingCores[0];
++ break;
++ case DataCacheOnly:
++ cache->size[1] = dict->size[1];
++ cache->sharingCores[1] = dict->sharingCores[1];
++ break;
++ case SeparateCache:
++ cache->size[0] = dict->size[0];
++ cache->size[1] = dict->size[1];
++ cache->sharingCores[0] = dict->sharingCores[0];
++ cache->sharingCores[1] = dict->sharingCores[1];
++ break;
++ case UnifiedCache:
++ cache->size[2] = dict->size[2];
++ cache->sharingCores[2] = dict->sharingCores[2];
++ break;
++ default:
++ // Do nothing
++ break;
++ }
++ lastDataCacheLevel_ = (dict->size[1] || dict->size[2]) ? i + 1 : lastDataCacheLevel_;
++ }
++ } else if (!readCacheInfoFromSysDevice()) {
++ /**
++ * @ToDo Get cache information by `sysconf`
++ * for the case the dictionary is unavailable.
++ */
++ lastDataCacheLevel_ = 2; // It is assumed L1 and L2 cache exist.
++
++ cacheInfo_.levelCache[0].size[0] = sysconf(_SC_LEVEL1_ICACHE_SIZE); // L1, ICache
++ cacheInfo_.levelCache[0].size[1] = sysconf(_SC_LEVEL1_DCACHE_SIZE); // L1, DCache
++ cacheInfo_.levelCache[1].size[2] = sysconf(_SC_LEVEL2_CACHE_SIZE); // L2, UCache
++ cacheInfo_.levelCache[2].size[2] = sysconf(_SC_LEVEL3_CACHE_SIZE); // L3, UCache
++ }
++ }
++
++ void setHwCap() {
++ unsigned long hwcap = getauxval(AT_HWCAP);
++ if (hwcap & HWCAP_ATOMICS) {
++ type_ |= (Type)XBYAK_AARCH64_HWCAP_ATOMIC;
++ }
++
++ if (hwcap & HWCAP_FP) {
++ type_ |= (Type)XBYAK_AARCH64_HWCAP_FP;
++ }
++ if (hwcap & HWCAP_ASIMD) {
++ type_ |= (Type)XBYAK_AARCH64_HWCAP_ADVSIMD;
++ }
++#ifdef HWCAP_SVE
++ /* Some old <sys/auxv.h> may not define HWCAP_SVE.
++ In that case, SVE is treated as if it were not supported. */
++ if (hwcap & HWCAP_SVE) {
++ type_ |= (Type)XBYAK_AARCH64_HWCAP_SVE;
++ // svcntb(); if arm_sve.h is available
++ sveLen_ = (sveLen_t)prctl(51); // PR_SVE_GET_VL
++ }
++#endif
++ }
++
++ void setNumCores() {
++ /**
++ * @ToDo There are some methods to get # of cores.
++ Considering various kernel versions and CPUs, a combination of
++ multiple methods may be required.
++ 1) sysconf(_SC_NPROCESSORS_ONLN)
++ 2) /sys/devices/system/cpu/online
++ 3) std::thread::hardware_concurrency()
++ */
++ numCores_[0] = numCores_[1] = sysconf(_SC_NPROCESSORS_ONLN);
++ }
++
++ void setSysRegVal() { XBYAK_AARCH64_READ_SYSREG(cacheInfo_.midr_el1, MIDR_EL1); }
++};
++
++#undef XBYAK_AARCH64_ERROR_
++} // namespace util
++} // namespace Xbyak_aarch64
+diff --git a/src/cpu/aarch64/xbyak_aarch64/src/util_impl_mac.h b/src/cpu/aarch64/xbyak_aarch64/src/util_impl_mac.h
+new file mode 100644
+index 000000000..ebd6dba7c
+--- /dev/null
++++ b/src/cpu/aarch64/xbyak_aarch64/src/util_impl_mac.h
+@@ -0,0 +1,129 @@
++/*******************************************************************************
++ * Copyright 2020-2023 FUJITSU LIMITED
++ *
++ * Licensed under the Apache License, Version 2.0 (the "License");
++ * you may not use this file except in compliance with the License.
++ * You may obtain a copy of the License at
++ *
++ * http://www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing, software
++ * distributed under the License is distributed on an "AS IS" BASIS,
++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++ * See the License for the specific language governing permissions and
++ * limitations under the License.
++ *******************************************************************************/
++#ifndef __APPLE__
++#error "Something wrong"
++#endif
++
++#include <stdint.h>
++#include <stdio.h>
++#include <stdlib.h>
++#include <string.h>
++
++#include <sys/sysctl.h>
++
++#include "xbyak_aarch64_err.h"
++#include "xbyak_aarch64_util.h"
++
++namespace Xbyak_aarch64 {
++namespace util {
++constexpr char hw_cacheconfig[] = "hw.cacheconfig";
++constexpr char hw_l1icachesize[] = "hw.l1icachesize";
++constexpr char hw_l1dcachesize[] = "hw.l1dcachesize";
++constexpr char hw_l2cachesize[] = "hw.l2cachesize";
++constexpr char hw_l3cachesize[] = "hw.l3cachesize";
++constexpr char hw_ncpu[] = "hw.ncpu";
++constexpr char hw_opt_atomics[] = "hw.optional.armv8_1_atomics";
++constexpr char hw_opt_fp[] = "hw.optional.floatingpoint";
++constexpr char hw_opt_neon[] = "hw.optional.neon";
++constexpr char hw_perflevel1_logicalcpu[] = "hw.perflevel1.logicalcpu";
++
++class CpuInfoMac : public CpuInfo {
++public:
++ CpuInfoMac() {
++ init();
++ cacheInfo_.midr_el1 = 0xFE << 24;
++ setNumCores();
++ setHwCap();
++ setCacheHierarchy();
++ setImplementer();
++ }
++
++private:
++ uint8_t sysInfoBuf_[128];
++
++ int getSysInfo(char const *name, const size_t len) {
++ size_t len_ = len;
++ int retVal;
++
++ if ((retVal = sysctlbyname(name, sysInfoBuf_, &len_, NULL, 0)) != 0)
++ memset(sysInfoBuf_, 0, sizeof(sysInfoBuf_));
++
++ return retVal;
++ }
++
++ void setCacheHierarchy() {
++ // L1 cache
++ cacheInfo_.levelCache[0].type = SeparateCache;
++ getSysInfo(hw_l1icachesize, sizeof(sysInfoBuf_));
++ cacheInfo_.levelCache[0].size[0] = ((int64_t *)sysInfoBuf_)[0];
++ getSysInfo(hw_l1dcachesize, sizeof(sysInfoBuf_));
++ cacheInfo_.levelCache[0].size[1] = ((int64_t *)sysInfoBuf_)[0];
++
++ // L2 cache
++ cacheInfo_.levelCache[1].type = UnifiedCache;
++ getSysInfo(hw_l2cachesize, sizeof(sysInfoBuf_));
++ cacheInfo_.levelCache[1].size[2] = ((int64_t *)sysInfoBuf_)[0];
++
++ // L3 cache
++ cacheInfo_.levelCache[2].type = UnifiedCache;
++ getSysInfo(hw_l3cachesize, sizeof(sysInfoBuf_));
++ cacheInfo_.levelCache[2].size[2] = ((int64_t *)sysInfoBuf_)[0];
++
++ for (size_t i = 0; i < maxCacheLevel; i++) {
++ const auto cache = &cacheInfo_.levelCache[i];
++ if (cache->size[1] || cache->size[2])
++ lastDataCacheLevel_ = i + 1;
++ }
++
++ getSysInfo(hw_cacheconfig, sizeof(sysInfoBuf_));
++ // L1
++ cacheInfo_.levelCache[0].sharingCores[0] = ((uint64_t *)sysInfoBuf_)[1];
++ cacheInfo_.levelCache[0].sharingCores[1] = ((uint64_t *)sysInfoBuf_)[1];
++ // L2
++ cacheInfo_.levelCache[1].sharingCores[2] = ((uint64_t *)sysInfoBuf_)[2];
++ // L3
++ cacheInfo_.levelCache[2].sharingCores[2] = ((uint64_t *)sysInfoBuf_)[3];
++ }
++
++ void setHwCap() {
++ size_t val = 0;
++ size_t len = sizeof(val);
++
++ if (sysctlbyname(hw_opt_atomics, &val, &len, NULL, 0) != 0)
++ throw Error(ERR_INTERNAL);
++ else
++ type_ |= (val == 1) ? (Type)XBYAK_AARCH64_HWCAP_ATOMIC : 0;
++
++ if (sysctlbyname(hw_opt_fp, &val, &len, NULL, 0) != 0)
++ throw Error(ERR_INTERNAL);
++ else
++ type_ |= (val == 1) ? (Type)XBYAK_AARCH64_HWCAP_FP : 0;
++
++ if (sysctlbyname(hw_opt_neon, &val, &len, NULL, 0) != 0)
++ throw Error(ERR_INTERNAL);
++ else
++ type_ |= (val == 1) ? (Type)XBYAK_AARCH64_HWCAP_ADVSIMD : 0;
++ }
++
++ void setNumCores() {
++ // ToDo: Distinguish physical and logical cores
++ getSysInfo(hw_ncpu, sizeof(int32_t));
++ numCores_[0] = numCores_[1] = ((int32_t *)sysInfoBuf_)[0];
++ }
++};
++
++} // namespace util
++} // namespace Xbyak_aarch64
+diff --git a/src/cpu/aarch64/xbyak_aarch64/src/util_impl_windows.h b/src/cpu/aarch64/xbyak_aarch64/src/util_impl_windows.h
+new file mode 100644
+index 000000000..64c122a85
+--- /dev/null
++++ b/src/cpu/aarch64/xbyak_aarch64/src/util_impl_windows.h
+@@ -0,0 +1,104 @@
++/*******************************************************************************
++ * Copyright 2022-2023 FUJITSU LIMITED
++ *
++ * Licensed under the Apache License, Version 2.0 (the "License");
++ * you may not use this file except in compliance with the License.
++ * You may obtain a copy of the License at
++ *
++ * http://www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing, software
++ * distributed under the License is distributed on an "AS IS" BASIS,
++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++ * See the License for the specific language governing permissions and
++ * limitations under the License.
++ *******************************************************************************/
++#ifndef _M_ARM64
++#error "Something wrong"
++#endif
++
++#include "xbyak_aarch64_err.h"
++#include "xbyak_aarch64_util.h"
++
++#include <stdint.h>
++#include <stdio.h>
++#include <stdlib.h>
++#include <string.h>
++
++#ifndef WIN32_LEAN_AND_MEAN
++#define WIN32_LEAN_AND_MEAN
++#endif
++#ifndef NOMINMAX
++#define NOMINMAX
++#endif
++#include <malloc.h>
++#include <windows.h>
++
++namespace Xbyak_aarch64 {
++namespace util {
++
++class CpuInfoWindows : public CpuInfo {
++public:
++ CpuInfoWindows() {
++ init();
++ setCacheHierarchy();
++ setImplementer();
++ }
++
++private:
++ void setCacheHierarchy() {
++ DWORD bufSize = 0;
++ GetLogicalProcessorInformation(NULL, &bufSize);
++ auto *ptr = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)_alloca(bufSize);
++ if (GetLogicalProcessorInformation(ptr, &bufSize) == FALSE)
++ return;
++
++ DWORD offset = 0;
++ while (offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= bufSize) {
++ switch (ptr->Relationship) {
++ case RelationProcessorCore:
++ numCores_[1]++;
++ break;
++
++ case RelationCache: {
++ const auto cache = &ptr->Cache;
++ auto levelCache = &cacheInfo_.levelCache[cache->Level - 1];
++ ULONG_PTR mask = ptr->ProcessorMask;
++ int count = 0;
++ while (mask) {
++ count += (mask & 0x1) ? 1 : 0;
++ mask = mask >> 1;
++ }
++
++ switch (cache->Type) {
++ case CacheUnified:
++ levelCache->type = UnifiedCache;
++ levelCache->size[2] = cache->Size;
++ levelCache->sharingCores[2] = count;
++ break;
++ case CacheInstruction:
++ levelCache->type = levelCache->type == DataCacheOnly ? SeparateCache : InstCacheOnly;
++ levelCache->size[0] = cache->Size;
++ levelCache->sharingCores[0] = count;
++ break;
++ case CacheData:
++ levelCache->type = levelCache->type == InstCacheOnly ? SeparateCache : DataCacheOnly;
++ levelCache->size[1] = cache->Size;
++ levelCache->sharingCores[1] = count;
++ break;
++ default:
++ break;
++ }
++ }
++ }
++ offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
++ ptr++;
++ }
++ numCores_[0] = numCores_[1];
++
++ setLastDataCacheLevel();
++ }
++};
++
++} // namespace util
++} // namespace Xbyak_aarch64
+diff --git a/src/cpu/aarch64/xbyak_aarch64/src/xbyak_aarch64_impl.cpp b/src/cpu/aarch64/xbyak_aarch64/src/xbyak_aarch64_impl.cpp
+index 588848433..ab30dd348 100644
+--- a/src/cpu/aarch64/xbyak_aarch64/src/xbyak_aarch64_impl.cpp
++++ b/src/cpu/aarch64/xbyak_aarch64/src/xbyak_aarch64_impl.cpp
+@@ -1,5 +1,5 @@
+ /*******************************************************************************
+- * Copyright 2020-2022 FUJITSU LIMITED
++ * Copyright 2020-2023 FUJITSU LIMITED
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+@@ -16,9 +16,15 @@
+ #define XBYAK_AARCH64_MAKE_INSTANCE
+ #include "xbyak_aarch64.h"
+ #include <memory.h>
++#include <stdio.h>
++#ifdef _WIN32
++#include <intrin.h>
++#include <processthreadsapi.h> // FlushInstructionCache
++#endif
+
+ namespace Xbyak_aarch64 {
+
++#include "err_impl.h"
+ #include "xbyak_aarch64_impl.h"
+ #include "xbyak_aarch64_mnemonic.h"
+
+diff --git a/src/cpu/aarch64/xbyak_aarch64/src/xbyak_aarch64_impl.h b/src/cpu/aarch64/xbyak_aarch64/src/xbyak_aarch64_impl.h
+index 0cc233e84..f6d6b5a7a 100644
+--- a/src/cpu/aarch64/xbyak_aarch64/src/xbyak_aarch64_impl.h
++++ b/src/cpu/aarch64/xbyak_aarch64/src/xbyak_aarch64_impl.h
+@@ -1,5 +1,5 @@
+ /*******************************************************************************
+- * Copyright 2020-2022 FUJITSU LIMITED
++ * Copyright 2020-2023 FUJITSU LIMITED
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+@@ -4260,3 +4260,13 @@ void CodeGenerator::SveStorePredVec(const _ZReg &zt, const AdrNoOfs &adr) {
+ uint32_t code = concat({F(0x72, 25), F(3, 23), F(0, 16), F(2, 13), F(0, 10), F(adr.getXn().getIdx(), 5), F(zt.getIdx(), 0)});
+ dd(code);
+ }
++
++void CodeGenerator::clearCache(void *begin, void *end) {
++#ifdef _WIN32
++ FlushInstructionCache(GetCurrentProcess(), begin, ((char *)end) - ((char *)begin));
++#elif defined(__APPLE__)
++ sys_icache_invalidate(begin, ((char *)end) - ((char *)begin));
++#else
++ __builtin___clear_cache((char *)begin, (char *)end);
++#endif
++}
+diff --git a/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64.h b/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64.h
+index 7e0381278..b4f545bcf 100644
+--- a/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64.h
++++ b/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64.h
+@@ -1,6 +1,6 @@
+ #pragma once
+ /*******************************************************************************
+- * Copyright 2019-2021 FUJITSU LIMITED
++ * Copyright 2019-2023 FUJITSU LIMITED
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+@@ -19,7 +19,9 @@
+ #ifndef WIN32_LEAN_AND_MEAN
+ #define WIN32_LEAN_AND_MEAN
+ #endif
++#ifndef NOMINMAX
+ #define NOMINMAX
++#endif
+ #include <windows.h>
+ #undef mvn
+ #endif
+@@ -51,6 +53,7 @@
+ #include <unistd.h>
+ #endif
+
++#include <cstdint>
+ #include <iomanip>
+ #include <sstream>
+
+@@ -66,6 +69,8 @@
+ #endif
+ #endif
+
++#include "xbyak_aarch64_err.h"
++
+ namespace Xbyak_aarch64 {
+ const uint64_t SP_IDX = 31;
+ const uint64_t NUM_VREG_BYTES = 16;
+diff --git a/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_adr.h b/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_adr.h
+index b99130fb3..643678f8a 100644
+--- a/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_adr.h
++++ b/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_adr.h
+@@ -1,6 +1,6 @@
+ #pragma once
+ /*******************************************************************************
+- * Copyright 2019-2022 FUJITSU LIMITED
++ * Copyright 2019-2023 FUJITSU LIMITED
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+@@ -14,8 +14,6 @@
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *******************************************************************************/
+-
+-#include "xbyak_aarch64_err.h"
+ #include "xbyak_aarch64_reg.h"
+
+ enum ShMod { LSL = 0, LSR = 1, ASR = 2, ROR = 3, MSL = 4, NONE = 5 };
+diff --git a/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_code_array.h b/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_code_array.h
+index ad964a43d..8acaf8029 100644
+--- a/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_code_array.h
++++ b/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_code_array.h
+@@ -1,6 +1,6 @@
+ #pragma once
+ /*******************************************************************************
+- * Copyright 2019-2021 FUJITSU LIMITED
++ * Copyright 2019-2023 FUJITSU LIMITED
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+@@ -15,7 +15,6 @@
+ * limitations under the License.
+ *******************************************************************************/
+
+-#include "xbyak_aarch64_err.h"
+ #include "xbyak_aarch64_inner.h"
+
+ static const size_t CSIZE = sizeof(uint32_t);
+diff --git a/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_err.h b/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_err.h
+index 09403ed34..403a9c497 100644
+--- a/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_err.h
++++ b/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_err.h
+@@ -1,6 +1,6 @@
+ #pragma once
+ /*******************************************************************************
+- * Copyright 2019-2021 FUJITSU LIMITED
++ * Copyright 2019-2023 FUJITSU LIMITED
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+@@ -16,6 +16,8 @@
+ *******************************************************************************/
+ #include <exception>
+
++namespace Xbyak_aarch64 {
++
+ enum {
+ ERR_NONE = 0,
+ ERR_CODE_IS_TOO_BIG, // use at CodeArray
+@@ -54,49 +56,11 @@ class Error : public std::exception {
+ const char *msg_;
+
+ public:
+- explicit Error(int err) : err_(err), msg_("") {
+- if (err_ <= 0)
+- return;
+- fprintf(stderr, "bad err=%d in Xbyak::Error\n", err_);
+- static const char *tbl[32] = {
+- "none",
+- "code is too big",
+- "label is redefined",
+- "label is too far",
+- "label is not found",
+- "bad parameter",
+- "can't protect",
+- "offset is too big",
+- "can't alloc",
+- "label is not set by L()",
+- "label is already set by L()",
+- "internal error",
+- "illegal register index (can not encoding register index)",
+- "illegal register element index (can not encoding element index)",
+- "illegal predicate register type",
+- "illegal immediate parameter (range error)",
+- "illegal immediate parameter (unavailable value error)",
+- "illegal immediate parameter (condition error)",
+- "illegal shift-mode paramater",
+- "illegal extend-mode parameter",
+- "illegal condition parameter",
+- "illegal barrier option",
+- "illegal const parameter (range error)",
+- "illegal const parameter (unavailable error)",
+- "illegal const parameter (condition error)",
+- "illegal type",
+- "bad align",
+- "bad addressing",
+- "bad scale",
+- };
+- if ((size_t)err_ >= sizeof(tbl) / sizeof(tbl[0])) {
+- msg_ = "bad err num";
+- } else {
+- msg_ = tbl[err_];
+- }
+- }
++ explicit Error(int err);
+ operator int() const { return err_; }
+ const char *what() const throw() { return msg_; }
+ };
+
+ inline const char *ConvertErrorToString(const Error &err) { return err.what(); }
++
++} // namespace Xbyak_aarch64
+diff --git a/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_gen.h b/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_gen.h
+index 129bc8827..4e828e6db 100644
+--- a/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_gen.h
++++ b/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_gen.h
+@@ -1,6 +1,6 @@
+ #pragma once
+ /*******************************************************************************
+- * Copyright 2019-2022 FUJITSU LIMITED
++ * Copyright 2019-2023 FUJITSU LIMITED
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+@@ -20,7 +20,6 @@
+
+ #include "xbyak_aarch64_adr.h"
+ #include "xbyak_aarch64_code_array.h"
+-#include "xbyak_aarch64_err.h"
+ #include "xbyak_aarch64_label.h"
+ #include "xbyak_aarch64_reg.h"
+
+@@ -740,8 +739,6 @@ public:
+ labelMgr_.set(this);
+ }
+
+- unsigned int getVersion() const { return VERSION; }
+-
+ void L(Label &label) { labelMgr_.defineClabel(label); }
+ Label L() {
+ Label label;
+@@ -771,16 +768,7 @@ public:
+ labelMgr_.set(this);
+ }
+ bool hasUndefinedLabel() const { return labelMgr_.hasUndefClabel(); }
+- void clearCache(void *begin, void *end) {
+-#ifdef _WIN32
+- (void)begin;
+- (void)end;
+-#elif defined(__APPLE__)
+- sys_icache_invalidate(begin, ((char *)end) - ((char *)begin));
+-#else
+- __builtin___clear_cache((char *)begin, (char *)end);
+-#endif
+- }
++ void clearCache(void *begin, void *end);
+ /*
+ MUST call ready() to complete generating code if you use AutoGrow
+ mode.
+@@ -842,6 +830,7 @@ public:
+
+ #include "xbyak_aarch64_meta_mnemonic.h"
+ #include "xbyak_aarch64_mnemonic_def.h"
++#include "xbyak_aarch64_version.h"
+
+ void align(size_t x) {
+ if (x == 4)
+diff --git a/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_inner.h b/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_inner.h
+index 23c41f18a..d79d6d600 100644
+--- a/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_inner.h
++++ b/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_inner.h
+@@ -1,6 +1,6 @@
+ #pragma once
+ /*******************************************************************************
+- * Copyright 2019-2021 FUJITSU LIMITED
++ * Copyright 2019-2023 FUJITSU LIMITED
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+@@ -15,10 +15,7 @@
+ * limitations under the License.
+ *******************************************************************************/
+
+-enum {
+- DEFAULT_MAX_CODE_SIZE = 4096,
+- VERSION = 0x5800 /* 0xABCD = A.BC(D) */
+-};
++enum { DEFAULT_MAX_CODE_SIZE = 4096 };
+
+ namespace inner {
+
+diff --git a/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_label.h b/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_label.h
+index dbf8e4f6f..1c88c8301 100644
+--- a/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_label.h
++++ b/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_label.h
+@@ -1,5 +1,5 @@
+ /*******************************************************************************
+- * Copyright 2019-2021 FUJITSU LIMITED
++ * Copyright 2019-2023 FUJITSU LIMITED
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+@@ -19,7 +19,6 @@
+ #define _XBYAK_AARCH64_LABEL_
+
+ #include "xbyak_aarch64_code_array.h"
+-#include "xbyak_aarch64_err.h"
+ #include "xbyak_aarch64_inner.h"
+
+ struct JmpLabel {
+diff --git a/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_util.h b/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_util.h
+index f8878ced4..6bfcca173 100644
+--- a/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_util.h
++++ b/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_util.h
+@@ -1,6 +1,6 @@
+ #pragma once
+ /*******************************************************************************
+- * Copyright 2020-2022 FUJITSU LIMITED
++ * Copyright 2020-2023 FUJITSU LIMITED
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+@@ -17,46 +17,24 @@
+ #ifndef XBYAK_AARCH64_UTIL_H_
+ #define XBYAK_AARCH64_UTIL_H_
+
+-#include <dirent.h>
+-#include <regex.h>
+-#include <stdint.h>
+-#include <stdio.h>
+-#include <stdlib.h>
+-#include <string.h>
+-
+-#ifdef __linux__
+-#include <sys/auxv.h>
+-#include <sys/prctl.h>
+-#include <unistd.h>
+-
+-/* In old Linux such as Ubuntu 16.04, HWCAP_ATOMICS, HWCAP_FP, HWCAP_ASIMD
+- can not be found in <bits/hwcap.h> which is included from <sys/auxv.h>.
+- Xbyak_aarch64 uses <asm/hwcap.h> as an alternative.
+- */
+-#ifndef HWCAP_FP
+-#include <asm/hwcap.h>
++#if !(defined(__linux__) || defined(__APPLE__) || defined(_M_ARM64))
++#error "Unsupported OS"
+ #endif
+
+-#elif defined(__APPLE__)
+-#include <sys/sysctl.h>
+-#endif
+-
+-#include "xbyak_aarch64_err.h"
+-
+-#define XBYAK_AARCH64_MIDR_EL1(I, V, A, P, R) ((I << 24) | (V << 20) | (A << 16) | (P << 4) | (R << 0))
+-#define XBYAK_AARCH64_PATH_NODES "/sys/devices/system/node/node"
+-#define XBYAK_AARCH64_PATH_CORES "/sys/devices/system/node/node0/cpu"
+-#define XBYAK_AARCH64_READ_SYSREG(var, ID) asm("mrs %0, " #ID : "=r"(var));
++#include <stdint.h>
+
+ namespace Xbyak_aarch64 {
+ namespace util {
+ typedef uint64_t Type;
+
+-constexpr uint32_t maxNumberCacheLevel = 4;
++constexpr uint32_t maxCacheLevel = 7; // Specification of Armv9
+ constexpr uint32_t maxTopologyLevel = 2;
+ constexpr uint32_t max_path_len = 1024;
+
+ enum Arm64CpuTopologyLevel { SmtLevel = 1, CoreLevel = 2 };
++enum Arm64CacheType { NoCache = 0, InstCacheOnly = 1, DataCacheOnly = 2, SeparateCache = 3, UnifiedCache = 4, OtherCache = 5 };
++enum Arm64CacheLevel { L1 = 1, L2, L3, L4, L5, L6, L7 };
++enum cacheType_t { ICache = 0, DCache = 1, UCache = 2 };
+
+ enum sveLen_t {
+ SVE_NONE = 0,
+@@ -78,82 +56,58 @@ enum sveLen_t {
+ SVE_2048 = 16 * 16,
+ };
+
++enum hwCap_t {
++ XBYAK_AARCH64_HWCAP_NONE = 0,
++ XBYAK_AARCH64_HWCAP_ADVSIMD = 1 << 1,
++ XBYAK_AARCH64_HWCAP_FP = 1 << 2,
++ XBYAK_AARCH64_HWCAP_SVE = 1 << 3,
++ XBYAK_AARCH64_HWCAP_ATOMIC = 1 << 4,
++};
++
+ struct implementer_t {
+ uint32_t id;
+ const char *implementer;
+ };
+
+-struct cacheInfo_t {
++/* 2023.02.11
++ cacheInfo_t does not need to be disclosed. */
++#if defined(__GNUC__) || defined(__clang_version__)
++struct __attribute__((deprecated)) cacheInfo_t {
++#elif defined(_MSC_VER)
++struct __declspec(deprecated) cacheInfo_t {
++#endif
+ uint64_t midr_el1;
+ uint32_t dataCacheLevel;
+ uint32_t highestInnerCacheLevel;
+- uint32_t dataCacheSize[maxNumberCacheLevel];
++ uint32_t dataCacheSize[maxCacheLevel];
+ };
+
+-#ifdef __APPLE__
+-constexpr char hw_opt_atomics[] = "hw.optional.armv8_1_atomics";
+-constexpr char hw_opt_fp[] = "hw.optional.floatingpoint";
+-constexpr char hw_opt_neon[] = "hw.optional.neon";
+-#endif
+-
+-const struct implementer_t implementers[] = {{0x00, "Reserved for software use"},
+- {0xC0, "Ampere Computing"},
+- {0x41, "Arm Limited"},
+- {0x42, "Broadcom Corporation"},
+- {0x43, "Cavium Inc."},
+- {0x44, "Digital Equipment Corporation"},
+- {0x46, "Fujitsu Ltd."},
+- {0x49, "Infineon Technologies AG"},
+- {0x4D, "Motorola or Freescale Semiconductor Inc."},
+- {0x4E, "NVIDIA Corporation"},
+- {0x50, "Applied Micro Circuits Corporation"},
+- {0x51, "Qualcomm Inc."},
+- {0x56, "Marvell International Ltd."},
+- {0x69, "Intel Corporation"}};
+-
+ /**
+ CPU detection class
+ */
++class CpuInfo;
+ class Cpu {
+- uint64_t type_;
+- sveLen_t sveLen_;
+-
+ private:
+- const struct cacheInfo_t cacheInfoDict[2] = {
+- {/* A64FX */ XBYAK_AARCH64_MIDR_EL1(0x46, 0x1, 0xf, 0x1, 0x0), 2, 1, {1024 * 64, 1024 * 1024 * 8 * 4, 0, 0}},
+- {/* A64FX */ XBYAK_AARCH64_MIDR_EL1(0x46, 0x2, 0xf, 0x1, 0x0), 2, 1, {1024 * 64, 1024 * 1024 * 8 * 4, 0, 0}},
+- };
+-
+- uint32_t coresSharingDataCache_[maxNumberCacheLevel];
+- uint32_t dataCacheSize_[maxNumberCacheLevel];
+- uint32_t dataCacheLevel_;
+- uint64_t midr_el1_;
+- uint32_t numCores_[maxTopologyLevel];
+-
+- void setCacheHierarchy();
+- void setNumCores();
+- void setSysRegVal();
+- int getRegEx(char *buf, const char *path, const char *regex);
+- int getFilePathMaxTailNumPlus1(const char *path);
++ CpuInfo *info;
+
+ public:
+- static const Type tNONE = 0;
+- static const Type tADVSIMD = 1 << 1;
+- static const Type tFP = 1 << 2;
+- static const Type tSVE = 1 << 3;
+- static const Type tATOMIC = 1 << 4;
+-
+ Cpu();
+
++ void dumpCacheInfo() const;
++ Arm64CacheType getCacheType(const Arm64CacheLevel i) const;
++ uint32_t getCoresSharingDataCache(const Arm64CacheLevel i) const;
++ /* 2023.02.11 */
++#if defined(__GNUC__) || defined(__clang_version__)
++ uint32_t __attribute__((deprecated("Please use getLastDataCacheLevel()"))) getDataCacheLevels() const;
++#endif
++ uint32_t getDataCacheSize(const Arm64CacheLevel i) const;
++ const char *getImplementer() const;
++ uint32_t getLastDataCacheLevel() const;
++ uint32_t getNumCores(Arm64CpuTopologyLevel level) const;
+ Type getType() const;
+- bool has(Type type) const;
+ uint64_t getSveLen() const;
++ bool has(Type type) const;
+ bool isAtomicSupported() const;
+- const char *getImplementer() const;
+- uint32_t getCoresSharingDataCache(uint32_t i) const;
+- uint32_t getDataCacheLevels() const;
+- uint32_t getDataCacheSize(uint32_t i) const;
+- uint32_t getNumCores(Arm64CpuTopologyLevel level) const;
+ };
+
+ } // namespace util
+diff --git a/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_version.h b/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_version.h
+new file mode 100644
+index 000000000..ff04e7af2
+--- /dev/null
++++ b/src/cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_version.h
+@@ -0,0 +1,20 @@
++/*******************************************************************************
++ * Copyright 2022-2023 FUJITSU LIMITED
++ *
++ * Licensed under the Apache License, Version 2.0 (the "License");
++ * you may not use this file except in compliance with the License.
++ * You may obtain a copy of the License at
++ *
++ * http://www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing, software
++ * distributed under the License is distributed on an "AS IS" BASIS,
++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++ * See the License for the specific language governing permissions and
++ * limitations under the License.
++ *******************************************************************************/
++static const int majorVersion = 1;
++static const int minorVersion = 0;
++static const int patchVersion = 0;
++static int getVersion() { return (majorVersion << 16) + (minorVersion << 8) + patchVersion; }
++static const char *getVersionString() { return "1.0.0"; }
+--
+2.39.2
+
diff -Nru onednn-2.7.4/debian/patches/series onednn-2.7.4/debian/patches/series
--- onednn-2.7.4/debian/patches/series 2023-07-16 04:46:53.000000000 +0200
+++ onednn-2.7.4/debian/patches/series 2023-07-26 12:14:38.000000000 +0200
@@ -1,3 +1,6 @@
#simde
#0002-Fix-building-with-GCC-11.patch
fix-gcc13-ftbfs.patch
+lp2028759/cpu-aarch64-update-xbyak_aarch64-into-the-latest-ver(1).patch
+lp2028759/cpu-aarch64-fix-getting-cache-sizes-on-macOS.patch
+lp2028759/cpu-aarch64-update-xbyak_aarch64-into-the-latest-ver(2).patch
Reply to: