mirror of
https://github.com/xmrig/xmrig.git
synced 2026-04-17 04:59:28 -04:00
Compare commits
68 Commits
v6.24.0
...
a189d84fcd
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a189d84fcd | ||
|
|
cb6001945e | ||
|
|
f16a06eb67 | ||
|
|
9d71358f46 | ||
|
|
5a80c65d31 | ||
|
|
67cc6cfd1c | ||
|
|
db24bf5154 | ||
|
|
0d9a372e49 | ||
|
|
c1e3d386fe | ||
|
|
5ca4828255 | ||
|
|
1a04bf2904 | ||
|
|
5feb764b27 | ||
|
|
cb7511507f | ||
|
|
6e6eab1763 | ||
|
|
f35f9d7241 | ||
|
|
45d0a15c98 | ||
|
|
f4845cbd68 | ||
|
|
ed80a8a828 | ||
|
|
9e5492eecc | ||
|
|
e41b28ef78 | ||
|
|
1bd59129c4 | ||
|
|
8ccf7de304 | ||
|
|
30ffb9cb27 | ||
|
|
d3a84c4b52 | ||
|
|
eb49237aaa | ||
|
|
e1efd3dc7f | ||
|
|
e3d0135708 | ||
|
|
f661e1eb30 | ||
|
|
99488751f1 | ||
|
|
5fb0321c84 | ||
|
|
753859caea | ||
|
|
712a5a5e66 | ||
|
|
290a0de6e5 | ||
|
|
e0564b5fdd | ||
|
|
482a1f0b40 | ||
|
|
856813c1ae | ||
|
|
23da1a90f5 | ||
|
|
7981e4a76a | ||
|
|
7ef5142a52 | ||
|
|
db5c6d9190 | ||
|
|
e88009d575 | ||
|
|
5115597e7f | ||
|
|
4cdc35f966 | ||
|
|
b02519b9f5 | ||
|
|
a44b21cef3 | ||
|
|
ea832899f2 | ||
|
|
3ecacf0ac2 | ||
|
|
27c8e60919 | ||
|
|
985fe06e8d | ||
|
|
75b63ddde9 | ||
|
|
643b65f2c0 | ||
|
|
116ba1828f | ||
|
|
da5a5674b4 | ||
|
|
6cc4819cec | ||
|
|
a659397c41 | ||
|
|
20acfd0d79 | ||
|
|
da683d8c3e | ||
|
|
255565b533 | ||
|
|
878e83bf59 | ||
|
|
7abf17cb59 | ||
|
|
eeec5ecd10 | ||
|
|
93f5067999 | ||
|
|
dd6671bc59 | ||
|
|
a1ee2fd9d2 | ||
|
|
2619131176 | ||
|
|
1161f230c5 | ||
|
|
d2363ba28b | ||
|
|
1676da1fe9 |
3
.codespellrc
Normal file
3
.codespellrc
Normal file
@@ -0,0 +1,3 @@
|
||||
[codespell]
|
||||
skip = ./src/3rdparty,./src/crypto/ghostrider,./src/crypto/randomx/blake2,./src/crypto/cn/sse2neon.h,./src/backend/opencl/cl/cn/groestl256.cl,./src/backend/opencl/cl/cn/jh.cl
|
||||
ignore-words-list = Carmel,vor
|
||||
2
.gitignore
vendored
2
.gitignore
vendored
@@ -4,3 +4,5 @@ scripts/deps
|
||||
/CMakeLists.txt.user
|
||||
/.idea
|
||||
/src/backend/opencl/cl/cn/cryptonight_gen.cl
|
||||
.vscode
|
||||
/.qtcreator
|
||||
|
||||
26
CHANGELOG.md
26
CHANGELOG.md
@@ -1,3 +1,17 @@
|
||||
# v6.25.0
|
||||
- [#3680](https://github.com/xmrig/xmrig/pull/3680) Added `armv8l` to the list of 32-bit ARM targets.
|
||||
- [#3708](https://github.com/xmrig/xmrig/pull/3708) Minor Aarch64 JIT changes (better instruction selection, don't emit instructions that add 0, etc).
|
||||
- [#3718](https://github.com/xmrig/xmrig/pull/3718) Solo mining: added support for FCMP++ hardfork.
|
||||
- [#3722](https://github.com/xmrig/xmrig/pull/3722) Added Zen4 (Hawk Point) CPUs detection.
|
||||
- [#3725](https://github.com/xmrig/xmrig/pull/3725) Added **RISC-V** support with JIT compiler.
|
||||
- [#3731](https://github.com/xmrig/xmrig/pull/3731) Added initial Haiku OS support.
|
||||
- [#3733](https://github.com/xmrig/xmrig/pull/3733) Added detection for MSVC/2026.
|
||||
- [#3736](https://github.com/xmrig/xmrig/pull/3736) RISC-V: added vectorized dataset init.
|
||||
- [#3740](https://github.com/xmrig/xmrig/pull/3740) RISC-V: added vectorized soft AES.
|
||||
- [#3743](https://github.com/xmrig/xmrig/pull/3743) Linux: added support for transparent huge pages.
|
||||
- Improved LibreSSL support.
|
||||
- Improved compatibility for automatically enabling huge pages on Linux systems without NUMA support.
|
||||
|
||||
# v6.24.0
|
||||
- [#3671](https://github.com/xmrig/xmrig/pull/3671) Fixed detection of L2 cache size for some complex NUMA topologies.
|
||||
- [#3674](https://github.com/xmrig/xmrig/pull/3674) Fixed ARMv7 build.
|
||||
@@ -146,7 +160,7 @@
|
||||
# v6.16.2
|
||||
- [#2751](https://github.com/xmrig/xmrig/pull/2751) Fixed crash on CPUs supporting VAES and running GCC-compiled xmrig.
|
||||
- [#2761](https://github.com/xmrig/xmrig/pull/2761) Fixed broken auto-tuning in GCC Windows build.
|
||||
- [#2771](https://github.com/xmrig/xmrig/issues/2771) Fixed environment variables support for GhostRider and KawPow.
|
||||
- [#2771](https://github.com/xmrig/xmrig/issues/2771) Fixed environment variables support for GhostRider and KawPow.
|
||||
- [#2769](https://github.com/xmrig/xmrig/pull/2769) Performance fixes:
|
||||
- Fixed several performance bottlenecks introduced in v6.16.1.
|
||||
- Fixed overall GCC-compiled build performance, it's the same speed as MSVC build now.
|
||||
@@ -454,7 +468,7 @@
|
||||
- Compiler for Windows gcc builds updated to v10.1.
|
||||
|
||||
# v5.11.1
|
||||
- [#1652](https://github.com/xmrig/xmrig/pull/1652) Up to 1% RandomX perfomance improvement on recent AMD CPUs.
|
||||
- [#1652](https://github.com/xmrig/xmrig/pull/1652) Up to 1% RandomX performance improvement on recent AMD CPUs.
|
||||
- [#1306](https://github.com/xmrig/xmrig/issues/1306) Fixed possible double connection to a pool.
|
||||
- [#1654](https://github.com/xmrig/xmrig/issues/1654) Fixed build with LibreSSL.
|
||||
|
||||
@@ -560,9 +574,9 @@
|
||||
- Added automatic huge pages configuration on Linux if use the miner with root privileges.
|
||||
- **Added [automatic Intel prefetchers configuration](https://xmrig.com/docs/miner/randomx-optimization-guide#intel-specific-optimizations) on Linux.**
|
||||
- Added new option `wrmsr` in `randomx` object with command line equivalent `--randomx-wrmsr=6`.
|
||||
- [#1396](https://github.com/xmrig/xmrig/pull/1396) [#1401](https://github.com/xmrig/xmrig/pull/1401) New performance optimizations for Ryzen CPUs.
|
||||
- [#1385](https://github.com/xmrig/xmrig/issues/1385) Added `max-threads-hint` option support for RandomX dataset initialization threads.
|
||||
- [#1386](https://github.com/xmrig/xmrig/issues/1386) Added `priority` option support for RandomX dataset initialization threads.
|
||||
- [#1396](https://github.com/xmrig/xmrig/pull/1396) [#1401](https://github.com/xmrig/xmrig/pull/1401) New performance optimizations for Ryzen CPUs.
|
||||
- [#1385](https://github.com/xmrig/xmrig/issues/1385) Added `max-threads-hint` option support for RandomX dataset initialization threads.
|
||||
- [#1386](https://github.com/xmrig/xmrig/issues/1386) Added `priority` option support for RandomX dataset initialization threads.
|
||||
- For official builds all dependencies (libuv, hwloc, openssl) updated to recent versions.
|
||||
- Windows `msvc` builds now use Visual Studio 2019 instead of 2017.
|
||||
|
||||
@@ -608,7 +622,7 @@ This release based on 4.x.x series and include all features from v4.6.2-beta, ch
|
||||
- Removed command line option `--http-enabled`, HTTP API enabled automatically if any other `--http-*` option provided.
|
||||
- [#1172](https://github.com/xmrig/xmrig/issues/1172) **Added OpenCL mining backend.**
|
||||
- [#268](https://github.com/xmrig/xmrig-amd/pull/268) [#270](https://github.com/xmrig/xmrig-amd/pull/270) [#271](https://github.com/xmrig/xmrig-amd/pull/271) [#273](https://github.com/xmrig/xmrig-amd/pull/273) [#274](https://github.com/xmrig/xmrig-amd/pull/274) [#1171](https://github.com/xmrig/xmrig/pull/1171) Added RandomX support for OpenCL, thanks [@SChernykh](https://github.com/SChernykh).
|
||||
- Algorithm `cn/wow` removed, as no longer alive.
|
||||
- Algorithm `cn/wow` removed, as no longer alive.
|
||||
|
||||
# Previous versions
|
||||
[doc/CHANGELOG_OLD.md](doc/CHANGELOG_OLD.md)
|
||||
|
||||
@@ -95,7 +95,7 @@ set(HEADERS_CRYPTO
|
||||
src/crypto/common/VirtualMemory.h
|
||||
)
|
||||
|
||||
if (XMRIG_ARM)
|
||||
if (XMRIG_ARM OR XMRIG_RISCV)
|
||||
set(HEADERS_CRYPTO "${HEADERS_CRYPTO}" src/crypto/cn/CryptoNight_arm.h)
|
||||
else()
|
||||
set(HEADERS_CRYPTO "${HEADERS_CRYPTO}" src/crypto/cn/CryptoNight_x86.h)
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
XMRig is a high performance, open source, cross platform RandomX, KawPow, CryptoNight and [GhostRider](https://github.com/xmrig/xmrig/tree/master/src/crypto/ghostrider#readme) unified CPU/GPU miner and [RandomX benchmark](https://xmrig.com/benchmark). Official binaries are available for Windows, Linux, macOS and FreeBSD.
|
||||
|
||||
## Mining backends
|
||||
- **CPU** (x86/x64/ARMv7/ARMv8)
|
||||
- **CPU** (x86/x64/ARMv7/ARMv8/RISC-V)
|
||||
- **OpenCL** for AMD GPUs.
|
||||
- **CUDA** for NVIDIA GPUs via external [CUDA plugin](https://github.com/xmrig/xmrig-cuda).
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
if (WITH_ASM AND NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8)
|
||||
if (WITH_ASM AND NOT XMRIG_ARM AND NOT XMRIG_RISCV AND CMAKE_SIZEOF_VOID_P EQUAL 8)
|
||||
set(XMRIG_ASM_LIBRARY "xmrig-asm")
|
||||
|
||||
if (CMAKE_C_COMPILER_ID MATCHES MSVC)
|
||||
|
||||
129
cmake/cpu.cmake
129
cmake/cpu.cmake
@@ -21,6 +21,19 @@ if (NOT VAES_SUPPORTED)
|
||||
set(WITH_VAES OFF)
|
||||
endif()
|
||||
|
||||
# Detect RISC-V architecture early (before it's used below)
|
||||
if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv64|riscv|rv64)$")
|
||||
set(RISCV_TARGET 64)
|
||||
set(XMRIG_RISCV ON)
|
||||
add_definitions(-DXMRIG_RISCV)
|
||||
message(STATUS "Detected RISC-V 64-bit architecture (${CMAKE_SYSTEM_PROCESSOR})")
|
||||
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv32|rv32)$")
|
||||
set(RISCV_TARGET 32)
|
||||
set(XMRIG_RISCV ON)
|
||||
add_definitions(-DXMRIG_RISCV)
|
||||
message(STATUS "Detected RISC-V 32-bit architecture (${CMAKE_SYSTEM_PROCESSOR})")
|
||||
endif()
|
||||
|
||||
if (XMRIG_64_BIT AND CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|AMD64)$")
|
||||
add_definitions(-DRAPIDJSON_SSE2)
|
||||
else()
|
||||
@@ -29,6 +42,120 @@ else()
|
||||
set(WITH_VAES OFF)
|
||||
endif()
|
||||
|
||||
# Disable x86-specific features for RISC-V
|
||||
if (XMRIG_RISCV)
|
||||
set(WITH_SSE4_1 OFF)
|
||||
set(WITH_AVX2 OFF)
|
||||
set(WITH_VAES OFF)
|
||||
|
||||
# default build uses the RV64GC baseline
|
||||
set(RVARCH "rv64gc")
|
||||
|
||||
enable_language(ASM)
|
||||
|
||||
try_run(RANDOMX_VECTOR_RUN_FAIL
|
||||
RANDOMX_VECTOR_COMPILE_OK
|
||||
${CMAKE_CURRENT_BINARY_DIR}/
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/src/crypto/randomx/tests/riscv64_vector.s
|
||||
COMPILE_DEFINITIONS "-march=rv64gcv")
|
||||
|
||||
if (RANDOMX_VECTOR_COMPILE_OK AND NOT RANDOMX_VECTOR_RUN_FAIL)
|
||||
set(RVARCH_V ON)
|
||||
message(STATUS "RISC-V vector extension detected")
|
||||
else()
|
||||
set(RVARCH_V OFF)
|
||||
endif()
|
||||
|
||||
try_run(RANDOMX_ZICBOP_RUN_FAIL
|
||||
RANDOMX_ZICBOP_COMPILE_OK
|
||||
${CMAKE_CURRENT_BINARY_DIR}/
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/src/crypto/randomx/tests/riscv64_zicbop.s
|
||||
COMPILE_DEFINITIONS "-march=rv64gc_zicbop")
|
||||
|
||||
if (RANDOMX_ZICBOP_COMPILE_OK AND NOT RANDOMX_ZICBOP_RUN_FAIL)
|
||||
set(RVARCH_ZICBOP ON)
|
||||
message(STATUS "RISC-V zicbop extension detected")
|
||||
else()
|
||||
set(RVARCH_ZICBOP OFF)
|
||||
endif()
|
||||
|
||||
try_run(RANDOMX_ZBA_RUN_FAIL
|
||||
RANDOMX_ZBA_COMPILE_OK
|
||||
${CMAKE_CURRENT_BINARY_DIR}/
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/src/crypto/randomx/tests/riscv64_zba.s
|
||||
COMPILE_DEFINITIONS "-march=rv64gc_zba")
|
||||
|
||||
if (RANDOMX_ZBA_COMPILE_OK AND NOT RANDOMX_ZBA_RUN_FAIL)
|
||||
set(RVARCH_ZBA ON)
|
||||
message(STATUS "RISC-V zba extension detected")
|
||||
else()
|
||||
set(RVARCH_ZBA OFF)
|
||||
endif()
|
||||
|
||||
try_run(RANDOMX_ZBB_RUN_FAIL
|
||||
RANDOMX_ZBB_COMPILE_OK
|
||||
${CMAKE_CURRENT_BINARY_DIR}/
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/src/crypto/randomx/tests/riscv64_zbb.s
|
||||
COMPILE_DEFINITIONS "-march=rv64gc_zbb")
|
||||
|
||||
if (RANDOMX_ZBB_COMPILE_OK AND NOT RANDOMX_ZBB_RUN_FAIL)
|
||||
set(RVARCH_ZBB ON)
|
||||
message(STATUS "RISC-V zbb extension detected")
|
||||
else()
|
||||
set(RVARCH_ZBB OFF)
|
||||
endif()
|
||||
|
||||
try_run(RANDOMX_ZVKB_RUN_FAIL
|
||||
RANDOMX_ZVKB_COMPILE_OK
|
||||
${CMAKE_CURRENT_BINARY_DIR}/
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/src/crypto/randomx/tests/riscv64_zvkb.s
|
||||
COMPILE_DEFINITIONS "-march=rv64gcv_zvkb")
|
||||
|
||||
if (RANDOMX_ZVKB_COMPILE_OK AND NOT RANDOMX_ZVKB_RUN_FAIL)
|
||||
set(RVARCH_ZVKB ON)
|
||||
message(STATUS "RISC-V zvkb extension detected")
|
||||
else()
|
||||
set(RVARCH_ZVKB OFF)
|
||||
endif()
|
||||
|
||||
try_run(RANDOMX_ZVKNED_RUN_FAIL
|
||||
RANDOMX_ZVKNED_COMPILE_OK
|
||||
${CMAKE_CURRENT_BINARY_DIR}/
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/src/crypto/randomx/tests/riscv64_zvkned.s
|
||||
COMPILE_DEFINITIONS "-march=rv64gcv_zvkned")
|
||||
|
||||
if (RANDOMX_ZVKNED_COMPILE_OK AND NOT RANDOMX_ZVKNED_RUN_FAIL)
|
||||
set(RVARCH_ZVKNED ON)
|
||||
message(STATUS "RISC-V zvkned extension detected")
|
||||
else()
|
||||
set(RVARCH_ZVKNED OFF)
|
||||
endif()
|
||||
|
||||
# for native builds, enable Zba and Zbb if supported by the CPU
|
||||
if (ARCH STREQUAL "native")
|
||||
if (RVARCH_V)
|
||||
set(RVARCH "${RVARCH}v")
|
||||
endif()
|
||||
if (RVARCH_ZICBOP)
|
||||
set(RVARCH "${RVARCH}_zicbop")
|
||||
endif()
|
||||
if (RVARCH_ZBA)
|
||||
set(RVARCH "${RVARCH}_zba")
|
||||
endif()
|
||||
if (RVARCH_ZBB)
|
||||
set(RVARCH "${RVARCH}_zbb")
|
||||
endif()
|
||||
if (RVARCH_ZVKB)
|
||||
set(RVARCH "${RVARCH}_zvkb")
|
||||
endif()
|
||||
if (RVARCH_ZVKNED)
|
||||
set(RVARCH "${RVARCH}_zvkned")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
message(STATUS "Using -march=${RVARCH}")
|
||||
endif()
|
||||
|
||||
add_definitions(-DRAPIDJSON_WRITE_DEFAULT_FLAGS=6) # rapidjson::kWriteNanAndInfFlag | rapidjson::kWriteNanAndInfNullFlag
|
||||
|
||||
if (ARM_V8)
|
||||
@@ -40,7 +167,7 @@ endif()
|
||||
if (NOT ARM_TARGET)
|
||||
if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm64|ARM64|armv8-a)$")
|
||||
set(ARM_TARGET 8)
|
||||
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "^(armv7|armv7f|armv7s|armv7k|armv7-a|armv7l|armv7ve)$")
|
||||
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "^(armv7|armv7f|armv7s|armv7k|armv7-a|armv7l|armv7ve|armv8l)$")
|
||||
set(ARM_TARGET 7)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
@@ -28,6 +28,11 @@ if (CMAKE_CXX_COMPILER_ID MATCHES GNU)
|
||||
elseif (ARM_TARGET EQUAL 7)
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv7-a -mfpu=neon -flax-vector-conversions")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv7-a -mfpu=neon -flax-vector-conversions")
|
||||
elseif (XMRIG_RISCV)
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=${RVARCH}")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=${RVARCH}")
|
||||
|
||||
add_definitions(-DHAVE_ROTR)
|
||||
else()
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maes")
|
||||
@@ -41,6 +46,8 @@ if (CMAKE_CXX_COMPILER_ID MATCHES GNU)
|
||||
else()
|
||||
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static -Wl,--large-address-aware")
|
||||
endif()
|
||||
elseif(CMAKE_SYSTEM_NAME STREQUAL "Haiku")
|
||||
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libgcc")
|
||||
else()
|
||||
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libgcc -static-libstdc++")
|
||||
endif()
|
||||
@@ -74,6 +81,11 @@ elseif (CMAKE_CXX_COMPILER_ID MATCHES Clang)
|
||||
elseif (ARM_TARGET EQUAL 7)
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=neon -march=${CMAKE_SYSTEM_PROCESSOR}")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon -march=${CMAKE_SYSTEM_PROCESSOR}")
|
||||
elseif (XMRIG_RISCV)
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=${RVARCH}")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=${RVARCH}")
|
||||
|
||||
add_definitions(-DHAVE_ROTR)
|
||||
else()
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maes")
|
||||
|
||||
@@ -17,6 +17,10 @@ else()
|
||||
set(XMRIG_OS_LINUX ON)
|
||||
elseif(CMAKE_SYSTEM_NAME STREQUAL FreeBSD OR CMAKE_SYSTEM_NAME STREQUAL DragonFly)
|
||||
set(XMRIG_OS_FREEBSD ON)
|
||||
elseif(CMAKE_SYSTEM_NAME STREQUAL OpenBSD)
|
||||
set(XMRIG_OS_OPENBSD ON)
|
||||
elseif(CMAKE_SYSTEM_NAME STREQUAL "Haiku")
|
||||
set(XMRIG_OS_HAIKU ON)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
@@ -43,6 +47,10 @@ elseif(XMRIG_OS_UNIX)
|
||||
add_definitions(-DXMRIG_OS_LINUX)
|
||||
elseif (XMRIG_OS_FREEBSD)
|
||||
add_definitions(-DXMRIG_OS_FREEBSD)
|
||||
elseif (XMRIG_OS_OPENBSD)
|
||||
add_definitions(-DXMRIG_OS_OPENBSD)
|
||||
elseif (XMRIG_OS_HAIKU)
|
||||
add_definitions(-DXMRIG_OS_HAIKU)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
|
||||
@@ -62,7 +62,7 @@ if (WITH_RANDOMX)
|
||||
src/crypto/randomx/jit_compiler_x86_static.asm
|
||||
src/crypto/randomx/jit_compiler_x86.cpp
|
||||
)
|
||||
elseif (WITH_ASM AND NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8)
|
||||
elseif (WITH_ASM AND NOT XMRIG_ARM AND NOT XMRIG_RISCV AND CMAKE_SIZEOF_VOID_P EQUAL 8)
|
||||
list(APPEND SOURCES_CRYPTO
|
||||
src/crypto/randomx/jit_compiler_x86_static.S
|
||||
src/crypto/randomx/jit_compiler_x86.cpp
|
||||
@@ -80,6 +80,39 @@ if (WITH_RANDOMX)
|
||||
else()
|
||||
set_property(SOURCE src/crypto/randomx/jit_compiler_a64_static.S PROPERTY LANGUAGE C)
|
||||
endif()
|
||||
elseif (XMRIG_RISCV AND CMAKE_SIZEOF_VOID_P EQUAL 8)
|
||||
list(APPEND SOURCES_CRYPTO
|
||||
src/crypto/randomx/jit_compiler_rv64_static.S
|
||||
src/crypto/randomx/jit_compiler_rv64_vector_static.S
|
||||
src/crypto/randomx/jit_compiler_rv64.cpp
|
||||
src/crypto/randomx/jit_compiler_rv64_vector.cpp
|
||||
src/crypto/randomx/aes_hash_rv64_vector.cpp
|
||||
src/crypto/randomx/aes_hash_rv64_zvkned.cpp
|
||||
)
|
||||
# cheat because cmake and ccache hate each other
|
||||
set_property(SOURCE src/crypto/randomx/jit_compiler_rv64_static.S PROPERTY LANGUAGE C)
|
||||
set_property(SOURCE src/crypto/randomx/jit_compiler_rv64_vector_static.S PROPERTY LANGUAGE C)
|
||||
|
||||
set(RV64_VECTOR_FILE_ARCH "rv64gcv")
|
||||
|
||||
if (ARCH STREQUAL "native")
|
||||
if (RVARCH_ZICBOP)
|
||||
set(RV64_VECTOR_FILE_ARCH "${RV64_VECTOR_FILE_ARCH}_zicbop")
|
||||
endif()
|
||||
if (RVARCH_ZBA)
|
||||
set(RV64_VECTOR_FILE_ARCH "${RV64_VECTOR_FILE_ARCH}_zba")
|
||||
endif()
|
||||
if (RVARCH_ZBB)
|
||||
set(RV64_VECTOR_FILE_ARCH "${RV64_VECTOR_FILE_ARCH}_zbb")
|
||||
endif()
|
||||
if (RVARCH_ZVKB)
|
||||
set(RV64_VECTOR_FILE_ARCH "${RV64_VECTOR_FILE_ARCH}_zvkb")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
set_source_files_properties(src/crypto/randomx/jit_compiler_rv64_vector_static.S PROPERTIES COMPILE_FLAGS "-march=${RV64_VECTOR_FILE_ARCH}")
|
||||
set_source_files_properties(src/crypto/randomx/aes_hash_rv64_vector.cpp PROPERTIES COMPILE_FLAGS "-O3 -march=${RV64_VECTOR_FILE_ARCH}")
|
||||
set_source_files_properties(src/crypto/randomx/aes_hash_rv64_zvkned.cpp PROPERTIES COMPILE_FLAGS "-O3 -march=${RV64_VECTOR_FILE_ARCH}_zvkned")
|
||||
else()
|
||||
list(APPEND SOURCES_CRYPTO
|
||||
src/crypto/randomx/jit_compiler_fallback.cpp
|
||||
@@ -116,7 +149,7 @@ if (WITH_RANDOMX)
|
||||
)
|
||||
endif()
|
||||
|
||||
if (WITH_MSR AND NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8 AND (XMRIG_OS_WIN OR XMRIG_OS_LINUX))
|
||||
if (WITH_MSR AND NOT XMRIG_ARM AND NOT XMRIG_RISCV AND CMAKE_SIZEOF_VOID_P EQUAL 8 AND (XMRIG_OS_WIN OR XMRIG_OS_LINUX))
|
||||
add_definitions(/DXMRIG_FEATURE_MSR)
|
||||
add_definitions(/DXMRIG_FIX_RYZEN)
|
||||
message("-- WITH_MSR=ON")
|
||||
@@ -157,6 +190,15 @@ if (WITH_RANDOMX)
|
||||
list(APPEND HEADERS_CRYPTO src/crypto/rx/Profiler.h)
|
||||
list(APPEND SOURCES_CRYPTO src/crypto/rx/Profiler.cpp)
|
||||
endif()
|
||||
|
||||
if (WITH_VAES)
|
||||
set(SOURCES_CRYPTO "${SOURCES_CRYPTO}" src/crypto/randomx/aes_hash_vaes512.cpp)
|
||||
if (CMAKE_C_COMPILER_ID MATCHES MSVC)
|
||||
set_source_files_properties(src/crypto/randomx/aes_hash_vaes512.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX512")
|
||||
elseif (CMAKE_C_COMPILER_ID MATCHES GNU OR CMAKE_C_COMPILER_ID MATCHES Clang)
|
||||
set_source_files_properties(src/crypto/randomx/aes_hash_vaes512.cpp PROPERTIES COMPILE_FLAGS "-mavx512f -mvaes")
|
||||
endif()
|
||||
endif()
|
||||
else()
|
||||
remove_definitions(/DXMRIG_ALGO_RANDOMX)
|
||||
endif()
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
# HTTP API
|
||||
|
||||
If you want use HTTP API you need enable it (`"enabled": true,`) then choice `port` and optionaly `host`. API not available if miner built without HTTP support (`-DWITH_HTTP=OFF`).
|
||||
If you want use HTTP API you need enable it (`"enabled": true,`) then choice `port` and optionally `host`. API not available if miner built without HTTP support (`-DWITH_HTTP=OFF`).
|
||||
|
||||
Offical HTTP client for API: http://workers.xmrig.info/
|
||||
Official HTTP client for API: http://workers.xmrig.info/
|
||||
|
||||
Example configuration:
|
||||
|
||||
|
||||
@@ -17,7 +17,7 @@ Double check that you see `Huge pages 100%` both for dataset and for all threads
|
||||
|
||||
### Benchmark with custom config
|
||||
|
||||
You can run benchmark with any configuration you want. Just start without command line parameteres, use regular config.json and add `"benchmark":"1M",` on the next line after pool url.
|
||||
You can run benchmark with any configuration you want. Just start without command line parameters, use regular config.json and add `"benchmark":"1M",` on the next line after pool url.
|
||||
|
||||
# Stress test
|
||||
|
||||
@@ -26,4 +26,4 @@ You can also run continuous stress-test that is as close to the real RandomX min
|
||||
xmrig --stress
|
||||
xmrig --stress -a rx/wow
|
||||
```
|
||||
This will require Internet connection and will run indefinitely.
|
||||
This will require Internet connection and will run indefinitely.
|
||||
|
||||
@@ -57,7 +57,7 @@
|
||||
# v4.0.0-beta
|
||||
- [#1172](https://github.com/xmrig/xmrig/issues/1172) **Added OpenCL mining backend.**
|
||||
- [#268](https://github.com/xmrig/xmrig-amd/pull/268) [#270](https://github.com/xmrig/xmrig-amd/pull/270) [#271](https://github.com/xmrig/xmrig-amd/pull/271) [#273](https://github.com/xmrig/xmrig-amd/pull/273) [#274](https://github.com/xmrig/xmrig-amd/pull/274) [#1171](https://github.com/xmrig/xmrig/pull/1171) Added RandomX support for OpenCL, thanks [@SChernykh](https://github.com/SChernykh).
|
||||
- Algorithm `cn/wow` removed, as no longer alive.
|
||||
- Algorithm `cn/wow` removed, as no longer alive.
|
||||
|
||||
# v3.2.0
|
||||
- Added per pool option `coin` with single possible value `monero` for pools without algorithm negotiation, for upcoming Monero fork.
|
||||
@@ -103,7 +103,7 @@
|
||||
- [#1105](https://github.com/xmrig/xmrig/issues/1105) Improved auto configuration for `cn-pico` algorithm.
|
||||
- Added commands `pause` and `resume` via JSON RPC 2.0 API (`POST /json_rpc`).
|
||||
- Added command line option `--export-topology` for export hwloc topology to a XML file.
|
||||
- Breaked backward compatibility with previous configs and command line, `variant` option replaced to `algo`, global option `algo` removed, all CPU related settings moved to `cpu` object.
|
||||
- Broken backward compatibility with previous configs and command line, `variant` option replaced to `algo`, global option `algo` removed, all CPU related settings moved to `cpu` object.
|
||||
- Options `av`, `safe` and `max-cpu-usage` removed.
|
||||
- Algorithm `cn/msr` renamed to `cn/fast`.
|
||||
- Algorithm `cn/xtl` removed.
|
||||
@@ -122,7 +122,7 @@
|
||||
- [#1092](https://github.com/xmrig/xmrig/issues/1092) Fixed crash if wrong CPU affinity used.
|
||||
- [#1103](https://github.com/xmrig/xmrig/issues/1103) Improved auto configuration for RandomX for CPUs where L2 cache is limiting factor.
|
||||
- [#1105](https://github.com/xmrig/xmrig/issues/1105) Improved auto configuration for `cn-pico` algorithm.
|
||||
- [#1106](https://github.com/xmrig/xmrig/issues/1106) Fixed `hugepages` field in summary API.
|
||||
- [#1106](https://github.com/xmrig/xmrig/issues/1106) Fixed `hugepages` field in summary API.
|
||||
- Added alternative short format for CPU threads.
|
||||
- Changed format for CPU threads with intensity above 1.
|
||||
- Name for reference RandomX configuration changed to `rx/test` to avoid potential conflicts in future.
|
||||
@@ -150,7 +150,7 @@
|
||||
- [#1050](https://github.com/xmrig/xmrig/pull/1050) Added RandomXL algorithm for [Loki](https://loki.network/), algorithm name used by miner is `randomx/loki` or `rx/loki`.
|
||||
- Added [flexible](https://github.com/xmrig/xmrig/blob/evo/doc/CPU.md) multi algorithm configuration.
|
||||
- Added unlimited switching between incompatible algorithms, all mining options can be changed in runtime.
|
||||
- Breaked backward compatibility with previous configs and command line, `variant` option replaced to `algo`, global option `algo` removed, all CPU related settings moved to `cpu` object.
|
||||
- Broken backward compatibility with previous configs and command line, `variant` option replaced to `algo`, global option `algo` removed, all CPU related settings moved to `cpu` object.
|
||||
- Options `av`, `safe` and `max-cpu-usage` removed.
|
||||
- Algorithm `cn/msr` renamed to `cn/fast`.
|
||||
- Algorithm `cn/xtl` removed.
|
||||
@@ -183,7 +183,7 @@
|
||||
- [#314](https://github.com/xmrig/xmrig-proxy/issues/314) Added donate over proxy feature.
|
||||
- Added new option `donate-over-proxy`.
|
||||
- Added real graceful exit.
|
||||
|
||||
|
||||
# v2.14.4
|
||||
- [#992](https://github.com/xmrig/xmrig/pull/992) Fixed compilation with Clang 3.5.
|
||||
- [#1012](https://github.com/xmrig/xmrig/pull/1012) Fixed compilation with Clang 9.0.
|
||||
@@ -250,7 +250,7 @@
|
||||
# v2.8.1
|
||||
- [#768](https://github.com/xmrig/xmrig/issues/768) Fixed build with Visual Studio 2015.
|
||||
- [#769](https://github.com/xmrig/xmrig/issues/769) Fixed regression, some ANSI escape sequences was in log with disabled colors.
|
||||
- [#777](https://github.com/xmrig/xmrig/issues/777) Better report about pool connection issues.
|
||||
- [#777](https://github.com/xmrig/xmrig/issues/777) Better report about pool connection issues.
|
||||
- Simplified checks for ASM auto detection, only AES support necessary.
|
||||
- Added missing options to `--help` output.
|
||||
|
||||
@@ -259,7 +259,7 @@
|
||||
- Added global and per thread option `"asm"` and command line equivalent.
|
||||
- **[#758](https://github.com/xmrig/xmrig/issues/758) Added SSL/TLS support for secure connections to pools.**
|
||||
- Added per pool options `"tls"` and `"tls-fingerprint"` and command line equivalents.
|
||||
- [#767](https://github.com/xmrig/xmrig/issues/767) Added config autosave feature, same with GPU miners.
|
||||
- [#767](https://github.com/xmrig/xmrig/issues/767) Added config autosave feature, same with GPU miners.
|
||||
- [#245](https://github.com/xmrig/xmrig-proxy/issues/245) Fixed API ID collision when run multiple miners on same machine.
|
||||
- [#757](https://github.com/xmrig/xmrig/issues/757) Fixed send buffer overflow.
|
||||
|
||||
@@ -346,7 +346,7 @@
|
||||
|
||||
# v2.4.4
|
||||
- Added libmicrohttpd version to --version output.
|
||||
- Fixed bug in singal handler, in some cases miner wasn't shutdown properly.
|
||||
- Fixed bug in signal handler, in some cases miner wasn't shutdown properly.
|
||||
- Fixed recent MSVC 2017 version detection.
|
||||
- [#279](https://github.com/xmrig/xmrig/pull/279) Fixed build on some macOS versions.
|
||||
|
||||
@@ -359,7 +359,7 @@
|
||||
# v2.4.2
|
||||
- [#60](https://github.com/xmrig/xmrig/issues/60) Added FreeBSD support, thanks [vcambur](https://github.com/vcambur).
|
||||
- [#153](https://github.com/xmrig/xmrig/issues/153) Fixed issues with dwarfpool.com.
|
||||
|
||||
|
||||
# v2.4.1
|
||||
- [#147](https://github.com/xmrig/xmrig/issues/147) Fixed comparability with monero-stratum.
|
||||
|
||||
@@ -371,7 +371,7 @@
|
||||
- [#101](https://github.com/xmrig/xmrig/issues/101) Fixed MSVC 2017 (15.3) compile time version detection.
|
||||
- [#108](https://github.com/xmrig/xmrig/issues/108) Silently ignore invalid values for `donate-level` option.
|
||||
- [#111](https://github.com/xmrig/xmrig/issues/111) Fixed build without AEON support.
|
||||
|
||||
|
||||
# v2.3.1
|
||||
- [#68](https://github.com/xmrig/xmrig/issues/68) Fixed compatibility with Docker containers, was nothing print on console.
|
||||
|
||||
@@ -398,7 +398,7 @@
|
||||
# v2.1.0
|
||||
- [#40](https://github.com/xmrig/xmrig/issues/40)
|
||||
Improved miner shutdown, fixed crash on exit for Linux and OS X.
|
||||
- Fixed, login request was contain malformed JSON if username or password has some special characters for example `\`.
|
||||
- Fixed, login request was contain malformed JSON if username or password has some special characters for example `\`.
|
||||
- [#220](https://github.com/fireice-uk/xmr-stak-cpu/pull/220) Better support for Round Robin DNS, IP address now always chosen randomly instead of stuck on first one.
|
||||
- Changed donation address, new [xmrig-proxy](https://github.com/xmrig/xmrig-proxy) is coming soon.
|
||||
|
||||
@@ -418,16 +418,16 @@ Improved miner shutdown, fixed crash on exit for Linux and OS X.
|
||||
- Fixed Windows XP support.
|
||||
- Fixed regression, option `--no-color` was not fully disable colored output.
|
||||
- Show resolved pool IP address in miner output.
|
||||
|
||||
|
||||
# v1.0.1
|
||||
- Fix broken software AES implementation, app has crashed if CPU not support AES-NI, only version 1.0.0 affected.
|
||||
|
||||
# v1.0.0
|
||||
- Miner complete rewritten in C++ with libuv.
|
||||
- This version should be fully compatible (except config file) with previos versions, many new nice features will come in next versions.
|
||||
- This is still beta. If you found regression, stability or perfomance issues or have an idea for new feature please fell free to open new [issue](https://github.com/xmrig/xmrig/issues/new).
|
||||
- This version should be fully compatible (except config file) with previous versions, many new nice features will come in next versions.
|
||||
- This is still beta. If you found regression, stability or performance issues or have an idea for new feature please fell free to open new [issue](https://github.com/xmrig/xmrig/issues/new).
|
||||
- Added new option `--print-time=N`, print hashrate report every N seconds.
|
||||
- New hashrate reports, by default every 60 secons.
|
||||
- New hashrate reports, by default every 60 seconds.
|
||||
- Added Microsoft Visual C++ 2015 and 2017 support.
|
||||
- Removed dependency on libcurl.
|
||||
- To compile this version from source please switch to [dev](https://github.com/xmrig/xmrig/tree/dev) branch.
|
||||
@@ -440,7 +440,7 @@ Improved miner shutdown, fixed crash on exit for Linux and OS X.
|
||||
- Fixed gcc 7.1 support.
|
||||
|
||||
# v0.8.1
|
||||
- Added nicehash support, detects automaticaly by pool URL, for example `cryptonight.eu.nicehash.com:3355` or manually via option `--nicehash`.
|
||||
- Added nicehash support, detects automatically by pool URL, for example `cryptonight.eu.nicehash.com:3355` or manually via option `--nicehash`.
|
||||
|
||||
# v0.8.0
|
||||
- Added double hash mode, also known as lower power mode. `--av=2` and `--av=4`.
|
||||
|
||||
@@ -124,7 +124,7 @@ Force enable (`true`) or disable (`false`) hardware AES support. Default value `
|
||||
Mining threads priority, value from `1` (lowest priority) to `5` (highest possible priority). Default value `null` means miner don't change threads priority at all. Setting priority higher than 2 can make your PC unresponsive.
|
||||
|
||||
#### `memory-pool` (since v4.3.0)
|
||||
Use continuous, persistent memory block for mining threads, useful for preserve huge pages allocation while algorithm switching. Possible values `false` (feature disabled, by default) or `true` or specific count of 2 MB huge pages. It helps to avoid loosing huge pages for scratchpads when RandomX dataset is updated and mining threads restart after a 2-3 days of mining.
|
||||
Use continuous, persistent memory block for mining threads, useful for preserve huge pages allocation while algorithm switching. Possible values `false` (feature disabled, by default) or `true` or specific count of 2 MB huge pages. It helps to avoid losing huge pages for scratchpads when RandomX dataset is updated and mining threads restart after a 2-3 days of mining.
|
||||
|
||||
#### `yield` (since v5.1.1)
|
||||
Prefer system better system response/stability `true` (default value) or maximum hashrate `false`.
|
||||
@@ -133,7 +133,7 @@ Prefer system better system response/stability `true` (default value) or maximum
|
||||
Enable/configure or disable ASM optimizations. Possible values: `true`, `false`, `"intel"`, `"ryzen"`, `"bulldozer"`.
|
||||
|
||||
#### `argon2-impl` (since v3.1.0)
|
||||
Allow override automatically detected Argon2 implementation, this option added mostly for debug purposes, default value `null` means autodetect. This is used in RandomX dataset initialization and also in some other mining algorithms. Other possible values: `"x86_64"`, `"SSE2"`, `"SSSE3"`, `"XOP"`, `"AVX2"`, `"AVX-512F"`. Manual selection has no safe guards - if your CPU doesn't support required instuctions, miner will crash.
|
||||
Allow override automatically detected Argon2 implementation, this option added mostly for debug purposes, default value `null` means autodetect. This is used in RandomX dataset initialization and also in some other mining algorithms. Other possible values: `"x86_64"`, `"SSE2"`, `"SSSE3"`, `"XOP"`, `"AVX2"`, `"AVX-512F"`. Manual selection has no safe guards - if your CPU doesn't support required instructions, miner will crash.
|
||||
|
||||
#### `astrobwt-max-size`
|
||||
AstroBWT algorithm: skip hashes with large stage 2 size, default: `550`, min: `400`, max: `1200`. Optimal value depends on your CPU/GPU
|
||||
|
||||
365
doc/RISCV_PERF_TUNING.md
Normal file
365
doc/RISCV_PERF_TUNING.md
Normal file
@@ -0,0 +1,365 @@
|
||||
# RISC-V Performance Optimization Guide
|
||||
|
||||
This guide provides comprehensive instructions for optimizing XMRig on RISC-V architectures.
|
||||
|
||||
## Build Optimizations
|
||||
|
||||
### Compiler Flags Applied Automatically
|
||||
|
||||
The CMake build now applies aggressive RISC-V-specific optimizations:
|
||||
|
||||
```cmake
|
||||
# RISC-V ISA with extensions
|
||||
-march=rv64gcv_zba_zbb_zbc_zbs
|
||||
|
||||
# Aggressive compiler optimizations
|
||||
-funroll-loops # Unroll loops for ILP (instruction-level parallelism)
|
||||
-fomit-frame-pointer # Free up frame pointer register (RISC-V has limited registers)
|
||||
-fno-common # Better code generation for global variables
|
||||
-finline-functions # Inline more functions for better cache locality
|
||||
-ffast-math # Relaxed FP semantics (safe for mining)
|
||||
-flto # Link-time optimization for cross-module inlining
|
||||
|
||||
# Release build additions
|
||||
-minline-atomics # Inline atomic operations for faster synchronization
|
||||
```
|
||||
|
||||
### Optimal Build Command
|
||||
|
||||
```bash
|
||||
mkdir build && cd build
|
||||
cmake -DCMAKE_BUILD_TYPE=Release ..
|
||||
make -j$(nproc)
|
||||
```
|
||||
|
||||
**Expected build time**: 5-15 minutes depending on CPU
|
||||
|
||||
## Runtime Optimizations
|
||||
|
||||
### 1. Memory Configuration (Most Important)
|
||||
|
||||
Enable huge pages to reduce TLB misses and fragmentation:
|
||||
|
||||
#### Enable 2MB Huge Pages
|
||||
```bash
|
||||
# Calculate required huge pages (1 page = 2MB)
|
||||
# For 2 GB dataset: 1024 pages
|
||||
# For cache + dataset: 1536 pages minimum
|
||||
sudo sysctl -w vm.nr_hugepages=2048
|
||||
```
|
||||
|
||||
Verify:
|
||||
```bash
|
||||
grep HugePages /proc/meminfo
|
||||
# Expected: HugePages_Free should be close to nr_hugepages
|
||||
```
|
||||
|
||||
#### Enable 1GB Huge Pages (Optional but Recommended)
|
||||
|
||||
```bash
|
||||
# Run provided helper script
|
||||
sudo ./scripts/enable_1gb_pages.sh
|
||||
|
||||
# Verify 1GB pages are available
|
||||
cat /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages
|
||||
# Should be: >= 1 (one 1GB page)
|
||||
```
|
||||
|
||||
Update config.json:
|
||||
```json
|
||||
{
|
||||
"cpu": {
|
||||
"huge-pages": true
|
||||
},
|
||||
"randomx": {
|
||||
"1gb-pages": true
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 2. RandomX Mode Selection
|
||||
|
||||
| Mode | Memory | Init Time | Throughput | Recommendation |
|
||||
|------|--------|-----------|-----------|-----------------|
|
||||
| **light** | 256 MB | 10 sec | Low | Testing, resource-constrained |
|
||||
| **fast** | 2 GB | 2-5 min* | High | Production (with huge pages) |
|
||||
| **auto** | 2 GB | Varies | High | Default (uses fast if possible) |
|
||||
|
||||
*With optimizations; can be 30+ minutes without huge pages
|
||||
|
||||
**For RISC-V, use fast mode with huge pages enabled.**
|
||||
|
||||
### 3. Dataset Initialization Threads
|
||||
|
||||
Optimal thread count = 60-75% of CPU cores (leaves headroom for OS/other tasks)
|
||||
|
||||
```json
|
||||
{
|
||||
"randomx": {
|
||||
"init": 4
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Or auto-detect (rewritten for RISC-V):
|
||||
```json
|
||||
{
|
||||
"randomx": {
|
||||
"init": -1
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 4. CPU Affinity (Optional)
|
||||
|
||||
Pin threads to specific cores for better cache locality:
|
||||
|
||||
```json
|
||||
{
|
||||
"cpu": {
|
||||
"rx/0": [
|
||||
{ "threads": 1, "affinity": 0 },
|
||||
{ "threads": 1, "affinity": 1 },
|
||||
{ "threads": 1, "affinity": 2 },
|
||||
{ "threads": 1, "affinity": 3 }
|
||||
]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 5. CPU Governor (Linux)
|
||||
|
||||
Set to performance mode for maximum throughput:
|
||||
|
||||
```bash
|
||||
# Check current governor
|
||||
cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
|
||||
|
||||
# Set to performance (requires root)
|
||||
echo performance | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
|
||||
|
||||
# Verify
|
||||
cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
|
||||
# Should output: performance
|
||||
```
|
||||
|
||||
## Configuration Examples
|
||||
|
||||
### Minimum (Testing)
|
||||
```json
|
||||
{
|
||||
"randomx": {
|
||||
"mode": "light"
|
||||
},
|
||||
"cpu": {
|
||||
"huge-pages": false
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Recommended (Balanced)
|
||||
```json
|
||||
{
|
||||
"randomx": {
|
||||
"mode": "auto",
|
||||
"init": 4,
|
||||
"1gb-pages": true
|
||||
},
|
||||
"cpu": {
|
||||
"huge-pages": true,
|
||||
"priority": 2
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Maximum Performance (Production)
|
||||
```json
|
||||
{
|
||||
"randomx": {
|
||||
"mode": "fast",
|
||||
"init": -1,
|
||||
"1gb-pages": true,
|
||||
"scratchpad_prefetch_mode": 1
|
||||
},
|
||||
"cpu": {
|
||||
"huge-pages": true,
|
||||
"priority": 3,
|
||||
"yield": false
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## CLI Equivalents
|
||||
|
||||
```bash
|
||||
# Light mode
|
||||
./xmrig --randomx-mode=light
|
||||
|
||||
# Fast mode with 4 init threads
|
||||
./xmrig --randomx-mode=fast --randomx-init=4
|
||||
|
||||
# Benchmark
|
||||
./xmrig --bench=1M --algo=rx/0
|
||||
|
||||
# Benchmark Wownero variant (1 MB scratchpad)
|
||||
./xmrig --bench=1M --algo=rx/wow
|
||||
|
||||
# Mine to pool
|
||||
./xmrig -o pool.example.com:3333 -u YOUR_WALLET -p x
|
||||
```
|
||||
|
||||
## Performance Diagnostics
|
||||
|
||||
### Check if Vector Extensions are Detected
|
||||
|
||||
Look for `FEATURES:` line in output:
|
||||
```
|
||||
* CPU: ky,x60 (uarch ky,x1)
|
||||
* FEATURES: rv64imafdcv zba zbb zbc zbs
|
||||
```
|
||||
|
||||
- `v`: Vector extension (RVV) ✓
|
||||
- `zba`, `zbb`, `zbc`, `zbs`: Bit manipulation ✓
|
||||
- If missing, make sure build used `-march=rv64gcv_zba_zbb_zbc_zbs`
|
||||
|
||||
### Verify Huge Pages at Runtime
|
||||
|
||||
```bash
|
||||
# Run xmrig with --bench=1M and check output
|
||||
./xmrig --bench=1M
|
||||
|
||||
# Look for line like:
|
||||
# HUGE PAGES 100% 1 / 1 (1024 MB)
|
||||
```
|
||||
|
||||
- Should show 100% for dataset AND threads
|
||||
- If less, increase `vm.nr_hugepages` and reboot
|
||||
|
||||
### Monitor Performance
|
||||
|
||||
```bash
|
||||
# Run benchmark multiple times to find stable hashrate
|
||||
./xmrig --bench=1M --algo=rx/0
|
||||
./xmrig --bench=10M --algo=rx/0
|
||||
./xmrig --bench=100M --algo=rx/0
|
||||
|
||||
# Check system load and memory during mining
|
||||
while true; do free -h; grep HugePages /proc/meminfo; sleep 2; done
|
||||
```
|
||||
|
||||
## Expected Performance
|
||||
|
||||
### Hardware: Orange Pi RV2 (Ky X1, 8 cores @ ~1.5 GHz)
|
||||
|
||||
| Config | Mode | Hashrate | Init Time |
|
||||
|--------|------|----------|-----------|
|
||||
| Scalar (baseline) | fast | 30 H/s | 10 min |
|
||||
| Scalar + huge pages | fast | 33 H/s | 2 min |
|
||||
| RVV (if enabled) | fast | 70-100 H/s | 3 min |
|
||||
|
||||
*Actual results depend on CPU frequency, memory speed, and load*
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Long Initialization Times (30+ minutes)
|
||||
|
||||
**Cause**: Huge pages not enabled, system using swap
|
||||
**Solution**:
|
||||
1. Enable huge pages: `sudo sysctl -w vm.nr_hugepages=2048`
|
||||
2. Reboot: `sudo reboot`
|
||||
3. Reduce mining threads to free memory
|
||||
4. Check available memory: `free -h`
|
||||
|
||||
### Low Hashrate (50% of expected)
|
||||
|
||||
**Cause**: CPU governor set to power-save, no huge pages, high contention
|
||||
**Solution**:
|
||||
1. Set governor to performance: `echo performance | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor`
|
||||
2. Enable huge pages
|
||||
3. Reduce number of mining threads
|
||||
4. Check system load: `top` or `htop`
|
||||
|
||||
### Dataset Init Crashes or Hangs
|
||||
|
||||
**Cause**: Insufficient memory, corrupted huge pages
|
||||
**Solution**:
|
||||
1. Disable huge pages temporarily: set `huge-pages: false` in config
|
||||
2. Reduce mining threads
|
||||
3. Reboot and re-enable huge pages
|
||||
4. Try light mode: `--randomx-mode=light`
|
||||
|
||||
### Out of Memory During Benchmark
|
||||
|
||||
**Cause**: Not enough RAM for dataset + cache + threads
|
||||
**Solution**:
|
||||
1. Use light mode: `--randomx-mode=light`
|
||||
2. Reduce mining threads: `--threads=1`
|
||||
3. Increase available memory (kill other processes)
|
||||
4. Check: `free -h` before mining
|
||||
|
||||
## Advanced Tuning
|
||||
|
||||
### Vector Length (VLEN) Detection
|
||||
|
||||
RISC-V vector extension variable length (VLEN) affects performance:
|
||||
|
||||
```bash
|
||||
# Check VLEN on your CPU
|
||||
cat /proc/cpuinfo | grep vlen
|
||||
|
||||
# Expected values:
|
||||
# - 128 bits (16 bytes) = minimum
|
||||
# - 256 bits (32 bytes) = common
|
||||
# - 512 bits (64 bytes) = high performance
|
||||
```
|
||||
|
||||
Larger VLEN generally means better performance for vectorized operations.
|
||||
|
||||
### Prefetch Optimization
|
||||
|
||||
The code automatically optimizes memory prefetching for RISC-V:
|
||||
|
||||
```
|
||||
scratchpad_prefetch_mode: 0 = disabled (slowest)
|
||||
scratchpad_prefetch_mode: 1 = prefetch.r (default, recommended)
|
||||
scratchpad_prefetch_mode: 2 = prefetch.w (experimental)
|
||||
```
|
||||
|
||||
### Memory Bandwidth Saturation
|
||||
|
||||
If experiencing memory bandwidth saturation (high latency):
|
||||
|
||||
1. Reduce mining threads
|
||||
2. Increase L2/L3 cache by mining fewer threads per core
|
||||
3. Enable cache QoS (AMD Ryzen): `cache_qos: true`
|
||||
|
||||
## Building with Custom Flags
|
||||
|
||||
To build with custom RISC-V flags:
|
||||
|
||||
```bash
|
||||
mkdir build && cd build
|
||||
cmake -DCMAKE_BUILD_TYPE=Release \
|
||||
-DCMAKE_C_FLAGS="-march=rv64gcv_zba_zbb_zbc_zbs -O3 -funroll-loops -fomit-frame-pointer" \
|
||||
..
|
||||
make -j$(nproc)
|
||||
```
|
||||
|
||||
## Future Optimizations
|
||||
|
||||
- [ ] Zbk* (crypto) support detection and usage
|
||||
- [ ] Optimal VLEN-aware algorithm selection
|
||||
- [ ] Per-core memory affinity (NUMA support)
|
||||
- [ ] Dynamic thread count adjustment based on thermals
|
||||
- [ ] Cross-compile optimizations for various RISC-V cores
|
||||
|
||||
## References
|
||||
|
||||
- [RISC-V Vector Extension Spec](https://github.com/riscv/riscv-v-spec)
|
||||
- [RISC-V Bit Manipulation Spec](https://github.com/riscv/riscv-bitmanip)
|
||||
- [RISC-V Crypto Spec](https://github.com/riscv/riscv-crypto)
|
||||
- [XMRig Documentation](https://xmrig.com/docs)
|
||||
|
||||
---
|
||||
|
||||
For further optimization, enable RVV intrinsics by replacing `sse2rvv.h` with `sse2rvv_optimized.h` in the build.
|
||||
@@ -12,7 +12,7 @@ if grep -E 'AMD Ryzen|AMD EPYC|AuthenticAMD' /proc/cpuinfo > /dev/null;
|
||||
then
|
||||
if grep "cpu family[[:space:]]\{1,\}:[[:space:]]25" /proc/cpuinfo > /dev/null;
|
||||
then
|
||||
if grep "model[[:space:]]\{1,\}:[[:space:]]97" /proc/cpuinfo > /dev/null;
|
||||
if grep "model[[:space:]]\{1,\}:[[:space:]]\(97\|117\)" /proc/cpuinfo > /dev/null;
|
||||
then
|
||||
echo "Detected Zen4 CPU"
|
||||
wrmsr -a 0xc0011020 0x4400000000000
|
||||
|
||||
2
src/3rdparty/argon2/CMakeLists.txt
vendored
2
src/3rdparty/argon2/CMakeLists.txt
vendored
@@ -35,7 +35,7 @@ if (CMAKE_C_COMPILER_ID MATCHES MSVC)
|
||||
add_feature_impl(xop "" HAVE_XOP)
|
||||
add_feature_impl(avx2 "/arch:AVX2" HAVE_AVX2)
|
||||
add_feature_impl(avx512f "/arch:AVX512F" HAVE_AVX512F)
|
||||
elseif (NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8)
|
||||
elseif (NOT XMRIG_ARM AND NOT XMRIG_RISCV AND CMAKE_SIZEOF_VOID_P EQUAL 8)
|
||||
function(add_feature_impl FEATURE GCC_FLAG DEF)
|
||||
add_library(argon2-${FEATURE} STATIC arch/x86_64/lib/argon2-${FEATURE}.c)
|
||||
target_include_directories(argon2-${FEATURE} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../)
|
||||
|
||||
2
src/3rdparty/libethash/endian.h
vendored
2
src/3rdparty/libethash/endian.h
vendored
@@ -31,7 +31,7 @@
|
||||
#include <libkern/OSByteOrder.h>
|
||||
#define ethash_swap_u32(input_) OSSwapInt32(input_)
|
||||
#define ethash_swap_u64(input_) OSSwapInt64(input_)
|
||||
#elif defined(__FreeBSD__) || defined(__DragonFly__) || defined(__NetBSD__)
|
||||
#elif defined(__FreeBSD__) || defined(__DragonFly__) || defined(__NetBSD__) || defined(__HAIKU__)
|
||||
#define ethash_swap_u32(input_) bswap32(input_)
|
||||
#define ethash_swap_u64(input_) bswap64(input_)
|
||||
#elif defined(__OpenBSD__)
|
||||
|
||||
@@ -89,11 +89,16 @@ static void print_cpu(const Config *)
|
||||
{
|
||||
const auto info = Cpu::info();
|
||||
|
||||
Log::print(GREEN_BOLD(" * ") WHITE_BOLD("%-13s%s (%zu)") " %s %sAES%s",
|
||||
Log::print(GREEN_BOLD(" * ") WHITE_BOLD("%-13s%s (%zu)") " %s %s%sAES%s",
|
||||
"CPU",
|
||||
info->brand(),
|
||||
info->packages(),
|
||||
ICpuInfo::is64bit() ? GREEN_BOLD("64-bit") : RED_BOLD("32-bit"),
|
||||
#ifdef XMRIG_RISCV
|
||||
info->hasRISCV_Vector() ? GREEN_BOLD_S "RVV " : RED_BOLD_S "-RVV ",
|
||||
#else
|
||||
"",
|
||||
#endif
|
||||
info->hasAES() ? GREEN_BOLD_S : RED_BOLD_S "-",
|
||||
info->isVM() ? RED_BOLD_S " VM" : ""
|
||||
);
|
||||
|
||||
@@ -48,6 +48,24 @@ static const std::map<int, std::map<uint32_t, uint64_t> > hashCheck = {
|
||||
{ 9000000U, 0x323935102AB6B45CULL },
|
||||
{ 10000000U, 0xB5231262E2792B26ULL }
|
||||
}},
|
||||
{ Algorithm::RX_V2, {
|
||||
# ifndef NDEBUG
|
||||
{ 10000U, 0x57d2051d099613a4ULL },
|
||||
{ 20000U, 0x0bae0155cc797f01ULL },
|
||||
# endif
|
||||
{ 250000U, 0x18cf741a71484072ULL },
|
||||
{ 500000U, 0xcd8c3e6ec31b2faeULL },
|
||||
{ 1000000U, 0x88d6b8fb70cd479dULL },
|
||||
{ 2000000U, 0x0e16828d236a1a63ULL },
|
||||
{ 3000000U, 0x2739bdd0f25b83a6ULL },
|
||||
{ 4000000U, 0x32f42d9006d2d34bULL },
|
||||
{ 5000000U, 0x16d9c6286cb82251ULL },
|
||||
{ 6000000U, 0x1f916ae19d6bcf07ULL },
|
||||
{ 7000000U, 0x1f474f99a873948fULL },
|
||||
{ 8000000U, 0x8d67e0ddf05476bbULL },
|
||||
{ 9000000U, 0x3ebf37dcd5c4a215ULL },
|
||||
{ 10000000U, 0x7efbddff3f30fb74ULL }
|
||||
}},
|
||||
{ Algorithm::RX_WOW, {
|
||||
# ifndef NDEBUG
|
||||
{ 10000U, 0x6B0918757100B338ULL },
|
||||
@@ -88,6 +106,24 @@ static const std::map<int, std::map<uint32_t, uint64_t> > hashCheck1T = {
|
||||
{ 9000000U, 0xC6D39EF59213A07CULL },
|
||||
{ 10000000U, 0x95E6BAE68DD779CDULL }
|
||||
}},
|
||||
{ Algorithm::RX_V2, {
|
||||
# ifndef NDEBUG
|
||||
{ 10000, 0x90eb7c07cd9e0d90ULL },
|
||||
{ 20000, 0x6523a3658d7d9930ULL },
|
||||
# endif
|
||||
{ 250000, 0xf83b6d9d355ee5b1ULL },
|
||||
{ 500000, 0xbea3c1bf1465e9abULL },
|
||||
{ 1000000, 0x9e16f7cb56b366e1ULL },
|
||||
{ 2000000, 0x3b5e671f47e15e55ULL },
|
||||
{ 3000000, 0xec5819c180df03e2ULL },
|
||||
{ 4000000, 0x19d31b498f86aad4ULL },
|
||||
{ 5000000, 0x2487626c75cd12ccULL },
|
||||
{ 6000000, 0xa323a25a5286c39aULL },
|
||||
{ 7000000, 0xa123b100f3104dfcULL },
|
||||
{ 8000000, 0x602db9d83bfa0ddcULL },
|
||||
{ 9000000, 0x98da909e579765ddULL },
|
||||
{ 10000000, 0x3a45b7247cec9895ULL }
|
||||
}},
|
||||
{ Algorithm::RX_WOW, {
|
||||
# ifndef NDEBUG
|
||||
{ 10000U, 0x9EC1B9B8C8C7F082ULL },
|
||||
|
||||
@@ -87,14 +87,14 @@ xmrig::CpuWorker<N>::CpuWorker(size_t id, const CpuLaunchData &data) :
|
||||
if (!cn_heavyZen3Memory) {
|
||||
// Round up number of threads to the multiple of 8
|
||||
const size_t num_threads = ((m_threads + 7) / 8) * 8;
|
||||
cn_heavyZen3Memory = new VirtualMemory(m_algorithm.l3() * num_threads, data.hugePages, false, false, node());
|
||||
cn_heavyZen3Memory = new VirtualMemory(m_algorithm.l3() * num_threads, data.hugePages, false, false, node(), VirtualMemory::kDefaultHugePageSize);
|
||||
}
|
||||
m_memory = cn_heavyZen3Memory;
|
||||
}
|
||||
else
|
||||
# endif
|
||||
{
|
||||
m_memory = new VirtualMemory(m_algorithm.l3() * N, data.hugePages, false, true, node());
|
||||
m_memory = new VirtualMemory(m_algorithm.l3() * N, data.hugePages, false, true, node(), VirtualMemory::kDefaultHugePageSize);
|
||||
}
|
||||
|
||||
# ifdef XMRIG_ALGO_GHOSTRIDER
|
||||
@@ -256,7 +256,10 @@ void xmrig::CpuWorker<N>::start()
|
||||
|
||||
# ifdef XMRIG_ALGO_RANDOMX
|
||||
bool first = true;
|
||||
alignas(16) uint64_t tempHash[8] = {};
|
||||
alignas(64) uint64_t tempHash[8] = {};
|
||||
|
||||
size_t prev_job_size = 0;
|
||||
alignas(64) uint8_t prev_job[Job::kMaxBlobSize] = {};
|
||||
# endif
|
||||
|
||||
while (!Nonce::isOutdated(Nonce::CPU, m_job.sequence())) {
|
||||
@@ -297,6 +300,11 @@ void xmrig::CpuWorker<N>::start()
|
||||
job.generateMinerSignature(m_job.blob(), job.size(), miner_signature_ptr);
|
||||
}
|
||||
randomx_calculate_hash_first(m_vm, tempHash, m_job.blob(), job.size());
|
||||
|
||||
if (RandomX_CurrentConfig.Tweak_V2_COMMITMENT) {
|
||||
prev_job_size = job.size();
|
||||
memcpy(prev_job, m_job.blob(), prev_job_size);
|
||||
}
|
||||
}
|
||||
|
||||
if (!nextRound()) {
|
||||
@@ -307,7 +315,15 @@ void xmrig::CpuWorker<N>::start()
|
||||
memcpy(miner_signature_saved, miner_signature_ptr, sizeof(miner_signature_saved));
|
||||
job.generateMinerSignature(m_job.blob(), job.size(), miner_signature_ptr);
|
||||
}
|
||||
|
||||
randomx_calculate_hash_next(m_vm, tempHash, m_job.blob(), job.size(), m_hash);
|
||||
|
||||
if (RandomX_CurrentConfig.Tweak_V2_COMMITMENT) {
|
||||
memcpy(m_commitment, m_hash, RANDOMX_HASH_SIZE);
|
||||
randomx_calculate_commitment(prev_job, prev_job_size, m_hash, m_hash);
|
||||
prev_job_size = job.size();
|
||||
memcpy(prev_job, m_job.blob(), prev_job_size);
|
||||
}
|
||||
}
|
||||
else
|
||||
# endif
|
||||
|
||||
@@ -83,6 +83,7 @@ private:
|
||||
void allocateCnCtx();
|
||||
void consumeJob();
|
||||
|
||||
alignas(8) uint8_t m_commitment[N * 32]{ 0 };
|
||||
alignas(8) uint8_t m_hash[N * 32]{ 0 };
|
||||
const Algorithm m_algorithm;
|
||||
const Assembly m_assembly;
|
||||
|
||||
@@ -46,7 +46,12 @@ else()
|
||||
set(CPUID_LIB "")
|
||||
endif()
|
||||
|
||||
if (XMRIG_ARM)
|
||||
if (XMRIG_RISCV)
|
||||
list(APPEND SOURCES_BACKEND_CPU
|
||||
src/backend/cpu/platform/lscpu_riscv.cpp
|
||||
src/backend/cpu/platform/BasicCpuInfo_riscv.cpp
|
||||
)
|
||||
elseif (XMRIG_ARM)
|
||||
list(APPEND SOURCES_BACKEND_CPU src/backend/cpu/platform/BasicCpuInfo_arm.cpp)
|
||||
|
||||
if (XMRIG_OS_WIN)
|
||||
|
||||
@@ -85,13 +85,14 @@ public:
|
||||
FLAG_POPCNT,
|
||||
FLAG_CAT_L3,
|
||||
FLAG_VM,
|
||||
FLAG_RISCV_VECTOR,
|
||||
FLAG_MAX
|
||||
};
|
||||
|
||||
ICpuInfo() = default;
|
||||
virtual ~ICpuInfo() = default;
|
||||
|
||||
# if defined(__x86_64__) || defined(_M_AMD64) || defined (__arm64__) || defined (__aarch64__)
|
||||
# if defined(__x86_64__) || defined(_M_AMD64) || defined (__arm64__) || defined (__aarch64__) || defined(__riscv) && (__riscv_xlen == 64)
|
||||
inline constexpr static bool is64bit() { return true; }
|
||||
# else
|
||||
inline constexpr static bool is64bit() { return false; }
|
||||
@@ -109,6 +110,7 @@ public:
|
||||
virtual bool hasOneGbPages() const = 0;
|
||||
virtual bool hasXOP() const = 0;
|
||||
virtual bool isVM() const = 0;
|
||||
virtual bool hasRISCV_Vector() const = 0;
|
||||
virtual bool jccErratum() const = 0;
|
||||
virtual const char *backend() const = 0;
|
||||
virtual const char *brand() const = 0;
|
||||
|
||||
@@ -58,8 +58,8 @@
|
||||
namespace xmrig {
|
||||
|
||||
|
||||
constexpr size_t kCpuFlagsSize = 15;
|
||||
static const std::array<const char *, kCpuFlagsSize> flagNames = { "aes", "vaes", "avx", "avx2", "avx512f", "bmi2", "osxsave", "pdpe1gb", "sse2", "ssse3", "sse4.1", "xop", "popcnt", "cat_l3", "vm" };
|
||||
constexpr size_t kCpuFlagsSize = 16;
|
||||
static const std::array<const char *, kCpuFlagsSize> flagNames = { "aes", "vaes", "avx", "avx2", "avx512f", "bmi2", "osxsave", "pdpe1gb", "sse2", "ssse3", "sse4.1", "xop", "popcnt", "cat_l3", "vm", "rvv" };
|
||||
static_assert(kCpuFlagsSize == ICpuInfo::FLAG_MAX, "kCpuFlagsSize and FLAG_MAX mismatch");
|
||||
|
||||
|
||||
@@ -250,7 +250,7 @@ xmrig::BasicCpuInfo::BasicCpuInfo() :
|
||||
break;
|
||||
|
||||
case 0x19:
|
||||
if (m_model == 0x61) {
|
||||
if ((m_model == 0x61) || (m_model == 0x75)) {
|
||||
m_arch = ARCH_ZEN4;
|
||||
m_msrMod = MSR_MOD_RYZEN_19H_ZEN4;
|
||||
}
|
||||
|
||||
@@ -52,6 +52,7 @@ protected:
|
||||
inline bool hasOneGbPages() const override { return has(FLAG_PDPE1GB); }
|
||||
inline bool hasXOP() const override { return has(FLAG_XOP); }
|
||||
inline bool isVM() const override { return has(FLAG_VM); }
|
||||
inline bool hasRISCV_Vector() const override { return has(FLAG_RISCV_VECTOR); }
|
||||
inline bool jccErratum() const override { return m_jccErratum; }
|
||||
inline const char *brand() const override { return m_brand; }
|
||||
inline const std::vector<int32_t> &units() const override { return m_units; }
|
||||
@@ -65,7 +66,7 @@ protected:
|
||||
inline Vendor vendor() const override { return m_vendor; }
|
||||
inline uint32_t model() const override
|
||||
{
|
||||
# ifndef XMRIG_ARM
|
||||
# if !defined(XMRIG_ARM) && !defined(XMRIG_RISCV)
|
||||
return m_model;
|
||||
# else
|
||||
return 0;
|
||||
@@ -80,7 +81,7 @@ protected:
|
||||
Vendor m_vendor = VENDOR_UNKNOWN;
|
||||
|
||||
private:
|
||||
# ifndef XMRIG_ARM
|
||||
# if !defined(XMRIG_ARM) && !defined(XMRIG_RISCV)
|
||||
uint32_t m_procInfo = 0;
|
||||
uint32_t m_family = 0;
|
||||
uint32_t m_model = 0;
|
||||
|
||||
119
src/backend/cpu/platform/BasicCpuInfo_riscv.cpp
Normal file
119
src/backend/cpu/platform/BasicCpuInfo_riscv.cpp
Normal file
@@ -0,0 +1,119 @@
|
||||
/* XMRig
|
||||
* Copyright (c) 2025 Slayingripper <https://github.com/Slayingripper>
|
||||
* Copyright (c) 2018-2025 SChernykh <https://github.com/SChernykh>
|
||||
* Copyright (c) 2017-2019 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
|
||||
* Copyright (c) 2016-2025 XMRig <support@xmrig.com>
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <array>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <thread>
|
||||
|
||||
|
||||
#include "backend/cpu/platform/BasicCpuInfo.h"
|
||||
#include "base/tools/String.h"
|
||||
#include "3rdparty/rapidjson/document.h"
|
||||
|
||||
|
||||
namespace xmrig {
|
||||
|
||||
|
||||
extern String cpu_name_riscv();
|
||||
extern bool has_riscv_vector();
|
||||
extern bool has_riscv_aes();
|
||||
|
||||
|
||||
} // namespace xmrig
|
||||
|
||||
|
||||
xmrig::BasicCpuInfo::BasicCpuInfo() :
|
||||
m_threads(std::thread::hardware_concurrency())
|
||||
{
|
||||
m_units.resize(m_threads);
|
||||
for (int32_t i = 0; i < static_cast<int32_t>(m_threads); ++i) {
|
||||
m_units[i] = i;
|
||||
}
|
||||
|
||||
memcpy(m_brand, "RISC-V", 6);
|
||||
|
||||
auto name = cpu_name_riscv();
|
||||
if (!name.isNull()) {
|
||||
strncpy(m_brand, name.data(), sizeof(m_brand) - 1);
|
||||
}
|
||||
|
||||
// Check for vector extensions
|
||||
m_flags.set(FLAG_RISCV_VECTOR, has_riscv_vector());
|
||||
|
||||
// Check for AES extensions (Zknd/Zkne)
|
||||
m_flags.set(FLAG_AES, has_riscv_aes());
|
||||
|
||||
// RISC-V typically supports 1GB huge pages
|
||||
m_flags.set(FLAG_PDPE1GB, std::ifstream("/sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages").good());
|
||||
}
|
||||
|
||||
|
||||
const char *xmrig::BasicCpuInfo::backend() const
|
||||
{
|
||||
return "basic/1";
|
||||
}
|
||||
|
||||
|
||||
xmrig::CpuThreads xmrig::BasicCpuInfo::threads(const Algorithm &algorithm, uint32_t) const
|
||||
{
|
||||
# ifdef XMRIG_ALGO_GHOSTRIDER
|
||||
if (algorithm.family() == Algorithm::GHOSTRIDER) {
|
||||
return CpuThreads(threads(), 8);
|
||||
}
|
||||
# endif
|
||||
|
||||
return CpuThreads(threads());
|
||||
}
|
||||
|
||||
|
||||
rapidjson::Value xmrig::BasicCpuInfo::toJSON(rapidjson::Document &doc) const
|
||||
{
|
||||
using namespace rapidjson;
|
||||
auto &allocator = doc.GetAllocator();
|
||||
|
||||
Value out(kObjectType);
|
||||
|
||||
out.AddMember("brand", StringRef(brand()), allocator);
|
||||
out.AddMember("aes", hasAES(), allocator);
|
||||
out.AddMember("avx2", false, allocator);
|
||||
out.AddMember("x64", is64bit(), allocator); // DEPRECATED will be removed in the next major release.
|
||||
out.AddMember("64_bit", is64bit(), allocator);
|
||||
out.AddMember("l2", static_cast<uint64_t>(L2()), allocator);
|
||||
out.AddMember("l3", static_cast<uint64_t>(L3()), allocator);
|
||||
out.AddMember("cores", static_cast<uint64_t>(cores()), allocator);
|
||||
out.AddMember("threads", static_cast<uint64_t>(threads()), allocator);
|
||||
out.AddMember("packages", static_cast<uint64_t>(packages()), allocator);
|
||||
out.AddMember("nodes", static_cast<uint64_t>(nodes()), allocator);
|
||||
out.AddMember("backend", StringRef(backend()), allocator);
|
||||
out.AddMember("msr", "none", allocator);
|
||||
out.AddMember("assembly", "none", allocator);
|
||||
out.AddMember("arch", "riscv64", allocator);
|
||||
|
||||
Value flags(kArrayType);
|
||||
|
||||
if (hasAES()) {
|
||||
flags.PushBack("aes", allocator);
|
||||
}
|
||||
|
||||
out.AddMember("flags", flags, allocator);
|
||||
|
||||
return out;
|
||||
}
|
||||
@@ -87,7 +87,7 @@ static inline size_t countByType(hwloc_topology_t topology, hwloc_obj_type_t typ
|
||||
}
|
||||
|
||||
|
||||
#ifndef XMRIG_ARM
|
||||
#if !defined(XMRIG_ARM) && !defined(XMRIG_RISCV)
|
||||
static inline std::vector<hwloc_obj_t> findByType(hwloc_obj_t obj, hwloc_obj_type_t type)
|
||||
{
|
||||
std::vector<hwloc_obj_t> out;
|
||||
@@ -207,7 +207,7 @@ bool xmrig::HwlocCpuInfo::membind(hwloc_const_bitmap_t nodeset)
|
||||
|
||||
xmrig::CpuThreads xmrig::HwlocCpuInfo::threads(const Algorithm &algorithm, uint32_t limit) const
|
||||
{
|
||||
# ifndef XMRIG_ARM
|
||||
# if !defined(XMRIG_ARM) && !defined(XMRIG_RISCV)
|
||||
if (L2() == 0 && L3() == 0) {
|
||||
return BasicCpuInfo::threads(algorithm, limit);
|
||||
}
|
||||
@@ -277,7 +277,7 @@ xmrig::CpuThreads xmrig::HwlocCpuInfo::allThreads(const Algorithm &algorithm, ui
|
||||
|
||||
void xmrig::HwlocCpuInfo::processTopLevelCache(hwloc_obj_t cache, const Algorithm &algorithm, CpuThreads &threads, size_t limit) const
|
||||
{
|
||||
# ifndef XMRIG_ARM
|
||||
# if !defined(XMRIG_ARM) && !defined(XMRIG_RISCV)
|
||||
constexpr size_t oneMiB = 1024U * 1024U;
|
||||
|
||||
size_t PUs = countByType(cache, HWLOC_OBJ_PU);
|
||||
|
||||
150
src/backend/cpu/platform/lscpu_riscv.cpp
Normal file
150
src/backend/cpu/platform/lscpu_riscv.cpp
Normal file
@@ -0,0 +1,150 @@
|
||||
/* XMRig
|
||||
* Copyright (c) 2025 Slayingripper <https://github.com/Slayingripper>
|
||||
* Copyright (c) 2018-2025 SChernykh <https://github.com/SChernykh>
|
||||
* Copyright (c) 2016-2025 XMRig <support@xmrig.com>
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "base/tools/String.h"
|
||||
#include "3rdparty/fmt/core.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <string>
|
||||
|
||||
namespace xmrig {
|
||||
|
||||
struct riscv_cpu_desc
|
||||
{
|
||||
String model;
|
||||
String isa;
|
||||
String uarch;
|
||||
bool has_vector = false;
|
||||
bool has_aes = false;
|
||||
|
||||
inline bool isReady() const { return !isa.isNull(); }
|
||||
};
|
||||
|
||||
static bool lookup_riscv(char *line, const char *pattern, String &value)
|
||||
{
|
||||
char *p = strstr(line, pattern);
|
||||
if (!p) {
|
||||
return false;
|
||||
}
|
||||
|
||||
p += strlen(pattern);
|
||||
while (isspace(*p)) {
|
||||
++p;
|
||||
}
|
||||
|
||||
if (*p == ':') {
|
||||
++p;
|
||||
}
|
||||
|
||||
while (isspace(*p)) {
|
||||
++p;
|
||||
}
|
||||
|
||||
// Remove trailing newline
|
||||
size_t len = strlen(p);
|
||||
if (len > 0 && p[len - 1] == '\n') {
|
||||
p[len - 1] = '\0';
|
||||
}
|
||||
|
||||
// Ensure we call the const char* assignment (which performs a copy)
|
||||
// instead of the char* overload (which would take ownership of the pointer)
|
||||
value = (const char*)p;
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool read_riscv_cpuinfo(riscv_cpu_desc *desc)
|
||||
{
|
||||
auto fp = fopen("/proc/cpuinfo", "r");
|
||||
if (!fp) {
|
||||
return false;
|
||||
}
|
||||
|
||||
char buf[2048]; // Larger buffer for long ISA strings
|
||||
while (fgets(buf, sizeof(buf), fp) != nullptr) {
|
||||
lookup_riscv(buf, "model name", desc->model);
|
||||
|
||||
if (lookup_riscv(buf, "isa", desc->isa)) {
|
||||
desc->isa.toLower();
|
||||
|
||||
for (const String& s : desc->isa.split('_')) {
|
||||
const char* p = s.data();
|
||||
const size_t n = s.size();
|
||||
|
||||
if ((s.size() > 4) && (memcmp(p, "rv64", 4) == 0)) {
|
||||
for (size_t i = 4; i < n; ++i) {
|
||||
if (p[i] == 'v') {
|
||||
desc->has_vector = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (s == "zve64d") {
|
||||
desc->has_vector = true;
|
||||
}
|
||||
else if ((s == "zvkn") || (s == "zvknc") || (s == "zvkned") || (s == "zvkng")){
|
||||
desc->has_aes = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
lookup_riscv(buf, "uarch", desc->uarch);
|
||||
|
||||
if (desc->isReady()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
fclose(fp);
|
||||
|
||||
return desc->isReady();
|
||||
}
|
||||
|
||||
String cpu_name_riscv()
|
||||
{
|
||||
riscv_cpu_desc desc;
|
||||
if (read_riscv_cpuinfo(&desc)) {
|
||||
if (!desc.uarch.isNull()) {
|
||||
return fmt::format("{} ({})", desc.model, desc.uarch).c_str();
|
||||
}
|
||||
return desc.model;
|
||||
}
|
||||
|
||||
return "RISC-V";
|
||||
}
|
||||
|
||||
bool has_riscv_vector()
|
||||
{
|
||||
riscv_cpu_desc desc;
|
||||
if (read_riscv_cpuinfo(&desc)) {
|
||||
return desc.has_vector;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool has_riscv_aes()
|
||||
{
|
||||
riscv_cpu_desc desc;
|
||||
if (read_riscv_cpuinfo(&desc)) {
|
||||
return desc.has_aes;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
} // namespace xmrig
|
||||
@@ -19,6 +19,7 @@
|
||||
#define ALGO_CN_PICO_TLO 0x63120274
|
||||
#define ALGO_CN_UPX2 0x63110200
|
||||
#define ALGO_RX_0 0x72151200
|
||||
#define ALGO_RX_V2 0x72151202
|
||||
#define ALGO_RX_WOW 0x72141177
|
||||
#define ALGO_RX_ARQMA 0x72121061
|
||||
#define ALGO_RX_SFX 0x72151273
|
||||
|
||||
@@ -706,7 +706,7 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
|
||||
}
|
||||
|
||||
# if (ALGO_FAMILY == FAMILY_CN_HEAVY)
|
||||
/* Also left over threads performe this loop.
|
||||
/* Also left over threads perform this loop.
|
||||
* The left over thread results will be ignored
|
||||
*/
|
||||
#pragma unroll 16
|
||||
@@ -1005,7 +1005,7 @@ __kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global
|
||||
ulong State[8] = { 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0x0001000000000000UL };
|
||||
ulong H[8], M[8];
|
||||
|
||||
// BUG: AMD driver 19.7.X crashs if this is written as loop
|
||||
// BUG: AMD driver 19.7.X crashes if this is written as loop
|
||||
// Thx AMD for so bad software
|
||||
{
|
||||
((ulong8 *)M)[0] = vload8(0, states);
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -10,7 +10,7 @@
|
||||
#else
|
||||
# define STATIC
|
||||
/* taken from https://www.khronos.org/registry/OpenCL/extensions/amd/cl_amd_media_ops.txt
|
||||
* Build-in Function
|
||||
* Built-in Function
|
||||
* uintn amd_bitalign (uintn src0, uintn src1, uintn src2)
|
||||
* Description
|
||||
* dst.s0 = (uint) (((((long)src0.s0) << 32) | (long)src1.s0) >> (src2.s0 & 31))
|
||||
|
||||
@@ -77,7 +77,7 @@ void keccak_f800_round(uint32_t st[25], const int r)
|
||||
void keccak_f800(uint32_t* st)
|
||||
{
|
||||
// Complete all 22 rounds as a separate impl to
|
||||
// evaluate only first 8 words is wasteful of regsters
|
||||
// evaluate only first 8 words is wasteful of registers
|
||||
for (int r = 0; r < 22; r++) {
|
||||
keccak_f800_round(st, r);
|
||||
}
|
||||
@@ -181,7 +181,7 @@ __kernel void progpow_search(__global dag_t const* g_dag, __global uint* job_blo
|
||||
for (int i = 10; i < 25; i++)
|
||||
state[i] = ravencoin_rndc[i-10];
|
||||
|
||||
// Run intial keccak round
|
||||
// Run initial keccak round
|
||||
keccak_f800(state);
|
||||
|
||||
for (int i = 0; i < 8; i++)
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -77,6 +77,7 @@ const char *Algorithm::kCN_UPX2 = "cn/upx2";
|
||||
#ifdef XMRIG_ALGO_RANDOMX
|
||||
const char *Algorithm::kRX = "rx";
|
||||
const char *Algorithm::kRX_0 = "rx/0";
|
||||
const char *Algorithm::kRX_V2 = "rx/2";
|
||||
const char *Algorithm::kRX_WOW = "rx/wow";
|
||||
const char *Algorithm::kRX_ARQ = "rx/arq";
|
||||
const char *Algorithm::kRX_GRAFT = "rx/graft";
|
||||
@@ -143,6 +144,7 @@ static const std::map<uint32_t, const char *> kAlgorithmNames = {
|
||||
|
||||
# ifdef XMRIG_ALGO_RANDOMX
|
||||
ALGO_NAME(RX_0),
|
||||
ALGO_NAME(RX_V2),
|
||||
ALGO_NAME(RX_WOW),
|
||||
ALGO_NAME(RX_ARQ),
|
||||
ALGO_NAME(RX_GRAFT),
|
||||
@@ -253,6 +255,8 @@ static const std::map<const char *, Algorithm::Id, aliasCompare> kAlgorithmAlias
|
||||
ALGO_ALIAS(RX_0, "rx/test"),
|
||||
ALGO_ALIAS(RX_0, "randomx"),
|
||||
ALGO_ALIAS(RX_0, "rx"),
|
||||
ALGO_ALIAS_AUTO(RX_V2), ALGO_ALIAS(RX_V2, "randomx/v2"),
|
||||
ALGO_ALIAS(RX_V2, "rx/v2"),
|
||||
ALGO_ALIAS_AUTO(RX_WOW), ALGO_ALIAS(RX_WOW, "randomx/wow"),
|
||||
ALGO_ALIAS(RX_WOW, "randomwow"),
|
||||
ALGO_ALIAS_AUTO(RX_ARQ), ALGO_ALIAS(RX_ARQ, "randomx/arq"),
|
||||
@@ -350,7 +354,7 @@ std::vector<xmrig::Algorithm> xmrig::Algorithm::all(const std::function<bool(con
|
||||
CN_HEAVY_0, CN_HEAVY_TUBE, CN_HEAVY_XHV,
|
||||
CN_PICO_0, CN_PICO_TLO,
|
||||
CN_UPX2,
|
||||
RX_0, RX_WOW, RX_ARQ, RX_GRAFT, RX_SFX, RX_YADA,
|
||||
RX_0, RX_V2, RX_WOW, RX_ARQ, RX_GRAFT, RX_SFX, RX_YADA,
|
||||
AR2_CHUKWA, AR2_CHUKWA_V2, AR2_WRKZ,
|
||||
KAWPOW_RVN,
|
||||
GHOSTRIDER_RTM
|
||||
|
||||
@@ -73,6 +73,7 @@ public:
|
||||
CN_GR_5 = 0x63120105, // "cn/turtle-lite" GhostRider
|
||||
GHOSTRIDER_RTM = 0x6c150000, // "ghostrider" GhostRider
|
||||
RX_0 = 0x72151200, // "rx/0" RandomX (reference configuration).
|
||||
RX_V2 = 0x72151202, // "rx/2" RandomX (Monero v2).
|
||||
RX_WOW = 0x72141177, // "rx/wow" RandomWOW (Wownero).
|
||||
RX_ARQ = 0x72121061, // "rx/arq" RandomARQ (Arqma).
|
||||
RX_GRAFT = 0x72151267, // "rx/graft" RandomGRAFT (Graft).
|
||||
@@ -139,6 +140,7 @@ public:
|
||||
# ifdef XMRIG_ALGO_RANDOMX
|
||||
static const char *kRX;
|
||||
static const char *kRX_0;
|
||||
static const char* kRX_V2;
|
||||
static const char *kRX_WOW;
|
||||
static const char *kRX_ARQ;
|
||||
static const char *kRX_GRAFT;
|
||||
|
||||
@@ -48,7 +48,7 @@
|
||||
#define KECCAK_ROUNDS 24
|
||||
|
||||
|
||||
/* *************************** Public Inteface ************************ */
|
||||
/* *************************** Public Interface ************************ */
|
||||
|
||||
/* For Init or Reset call these: */
|
||||
sha3_return_t
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/* XMRig
|
||||
* Copyright (c) 2018-2021 SChernykh <https://github.com/SChernykh>
|
||||
* Copyright (c) 2016-2021 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
||||
* Copyright (c) 2018-2025 SChernykh <https://github.com/SChernykh>
|
||||
* Copyright (c) 2016-2025 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
@@ -71,11 +71,11 @@ char *xmrig::Platform::createUserAgent()
|
||||
|
||||
|
||||
#ifndef XMRIG_FEATURE_HWLOC
|
||||
#ifdef __DragonFly__
|
||||
#if defined(__DragonFly__) || defined(XMRIG_OS_OPENBSD) || defined(XMRIG_OS_HAIKU)
|
||||
|
||||
bool xmrig::Platform::setThreadAffinity(uint64_t cpu_id)
|
||||
{
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
@@ -554,6 +554,7 @@ int64_t xmrig::Client::send(size_t size)
|
||||
}
|
||||
|
||||
m_expire = Chrono::steadyMSecs() + kResponseTimeout;
|
||||
startTimeout();
|
||||
return m_sequence++;
|
||||
}
|
||||
|
||||
@@ -661,8 +662,6 @@ void xmrig::Client::onClose()
|
||||
|
||||
void xmrig::Client::parse(char *line, size_t len)
|
||||
{
|
||||
startTimeout();
|
||||
|
||||
LOG_DEBUG("[%s] received (%d bytes): \"%.*s\"", url(), len, static_cast<int>(len), line);
|
||||
|
||||
if (len < 22 || line[0] != '{') {
|
||||
@@ -857,8 +856,6 @@ void xmrig::Client::parseResponse(int64_t id, const rapidjson::Value &result, co
|
||||
void xmrig::Client::ping()
|
||||
{
|
||||
send(snprintf(m_sendBuf.data(), m_sendBuf.size(), "{\"id\":%" PRId64 ",\"jsonrpc\":\"2.0\",\"method\":\"keepalived\",\"params\":{\"id\":\"%s\"}}\n", m_sequence, m_rpcId.data()));
|
||||
|
||||
m_keepAlive = 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
/* XMRig
|
||||
* Copyright (c) 2018 Lee Clagett <https://github.com/vtnerd>
|
||||
* Copyright (c) 2018-2023 SChernykh <https://github.com/SChernykh>
|
||||
* Copyright (c) 2016-2023 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
||||
* Copyright (c) 2018-2025 SChernykh <https://github.com/SChernykh>
|
||||
* Copyright (c) 2016-2025 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
@@ -45,7 +45,7 @@ namespace xmrig {
|
||||
|
||||
|
||||
// https://wiki.openssl.org/index.php/Diffie-Hellman_parameters
|
||||
#if OPENSSL_VERSION_NUMBER < 0x30000000L || defined(LIBRESSL_VERSION_NUMBER)
|
||||
#if OPENSSL_VERSION_NUMBER < 0x30000000L || (defined(LIBRESSL_VERSION_NUMBER) && !defined(LIBRESSL_HAS_TLS1_3))
|
||||
static DH *get_dh2048()
|
||||
{
|
||||
static unsigned char dhp_2048[] = {
|
||||
@@ -152,7 +152,7 @@ bool xmrig::TlsContext::load(const TlsConfig &config)
|
||||
SSL_CTX_set_options(m_ctx, SSL_OP_NO_SSLv2 | SSL_OP_NO_SSLv3);
|
||||
SSL_CTX_set_options(m_ctx, SSL_OP_CIPHER_SERVER_PREFERENCE);
|
||||
|
||||
# if OPENSSL_VERSION_NUMBER >= 0x1010100fL && !defined(LIBRESSL_VERSION_NUMBER)
|
||||
# if OPENSSL_VERSION_NUMBER >= 0x1010100fL || defined(LIBRESSL_HAS_TLS1_3)
|
||||
SSL_CTX_set_max_early_data(m_ctx, 0);
|
||||
# endif
|
||||
|
||||
@@ -180,7 +180,7 @@ bool xmrig::TlsContext::setCipherSuites(const char *ciphersuites)
|
||||
return true;
|
||||
}
|
||||
|
||||
# if OPENSSL_VERSION_NUMBER >= 0x1010100fL && !defined(LIBRESSL_VERSION_NUMBER)
|
||||
# if OPENSSL_VERSION_NUMBER >= 0x1010100fL || defined(LIBRESSL_HAS_TLS1_3)
|
||||
if (SSL_CTX_set_ciphersuites(m_ctx, ciphersuites) == 1) {
|
||||
return true;
|
||||
}
|
||||
@@ -194,7 +194,7 @@ bool xmrig::TlsContext::setCipherSuites(const char *ciphersuites)
|
||||
|
||||
bool xmrig::TlsContext::setDH(const char *dhparam)
|
||||
{
|
||||
# if OPENSSL_VERSION_NUMBER < 0x30000000L || defined(LIBRESSL_VERSION_NUMBER)
|
||||
# if OPENSSL_VERSION_NUMBER < 0x30000000L || (defined(LIBRESSL_VERSION_NUMBER) && !defined(LIBRESSL_HAS_TLS1_3))
|
||||
DH *dh = nullptr;
|
||||
|
||||
if (dhparam != nullptr) {
|
||||
|
||||
@@ -241,8 +241,13 @@ bool xmrig::BlockTemplate::parse(bool hashes)
|
||||
ar(m_amount);
|
||||
ar(m_outputType);
|
||||
|
||||
// output type must be txout_to_key (2) or txout_to_tagged_key (3)
|
||||
if ((m_outputType != 2) && (m_outputType != 3)) {
|
||||
const bool is_fcmp_pp = (m_coin == Coin::MONERO) && (m_version.first >= 17);
|
||||
|
||||
// output type must be txout_to_key (2) or txout_to_tagged_key (3) for versions < 17, and txout_to_carrot_v1 (0) for version FCMP++
|
||||
if (is_fcmp_pp && (m_outputType == 0)) {
|
||||
// all good
|
||||
}
|
||||
else if ((m_outputType != 2) && (m_outputType != 3)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -250,6 +255,11 @@ bool xmrig::BlockTemplate::parse(bool hashes)
|
||||
|
||||
ar(m_ephPublicKey, kKeySize);
|
||||
|
||||
if (is_fcmp_pp) {
|
||||
ar(m_carrotViewTag);
|
||||
ar(m_janusAnchor);
|
||||
}
|
||||
|
||||
if (m_coin == Coin::ZEPHYR) {
|
||||
if (m_outputType != 2) {
|
||||
return false;
|
||||
|
||||
@@ -148,6 +148,8 @@ private:
|
||||
Buffer m_hashes;
|
||||
Buffer m_minerTxMerkleTreeBranch;
|
||||
uint8_t m_rootHash[kHashSize]{};
|
||||
uint8_t m_carrotViewTag[3]{};
|
||||
uint8_t m_janusAnchor[16]{};
|
||||
};
|
||||
|
||||
|
||||
|
||||
@@ -23,7 +23,7 @@
|
||||
#include "crypto/common/VirtualMemory.h"
|
||||
|
||||
|
||||
#if defined(XMRIG_ARM)
|
||||
#if defined(XMRIG_ARM) || defined(XMRIG_RISCV)
|
||||
# include "crypto/cn/CryptoNight_arm.h"
|
||||
#else
|
||||
# include "crypto/cn/CryptoNight_x86.h"
|
||||
|
||||
@@ -30,7 +30,7 @@
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined _MSC_VER || defined XMRIG_ARM
|
||||
#if defined _MSC_VER || defined XMRIG_ARM || defined XMRIG_RISCV
|
||||
# define ABI_ATTRIBUTE
|
||||
#else
|
||||
# define ABI_ATTRIBUTE __attribute__((ms_abi))
|
||||
|
||||
@@ -27,6 +27,9 @@
|
||||
#ifndef XMRIG_CRYPTONIGHT_ARM_H
|
||||
#define XMRIG_CRYPTONIGHT_ARM_H
|
||||
|
||||
#ifdef XMRIG_RISCV
|
||||
# include "crypto/cn/sse2rvv.h"
|
||||
#endif
|
||||
|
||||
#include "base/crypto/keccak.h"
|
||||
#include "crypto/cn/CnAlgo.h"
|
||||
|
||||
@@ -30,7 +30,7 @@
|
||||
#include <math.h>
|
||||
|
||||
// VARIANT ALTERATIONS
|
||||
#ifndef XMRIG_ARM
|
||||
#if !defined(XMRIG_ARM) && !defined(XMRIG_RISCV)
|
||||
# define VARIANT1_INIT(part) \
|
||||
uint64_t tweak1_2_##part = 0; \
|
||||
if (BASE == Algorithm::CN_1) { \
|
||||
@@ -60,7 +60,7 @@
|
||||
}
|
||||
|
||||
|
||||
#ifndef XMRIG_ARM
|
||||
#if !defined(XMRIG_ARM) && !defined(XMRIG_RISCV)
|
||||
# define VARIANT2_INIT(part) \
|
||||
__m128i division_result_xmm_##part = _mm_cvtsi64_si128(static_cast<int64_t>(h##part[12])); \
|
||||
__m128i sqrt_result_xmm_##part = _mm_cvtsi64_si128(static_cast<int64_t>(h##part[13]));
|
||||
|
||||
@@ -235,7 +235,7 @@ static HashReturn Init(hashState *state, int hashbitlen)
|
||||
/*initialize the initial hash value of JH*/
|
||||
state->hashbitlen = hashbitlen;
|
||||
|
||||
/*load the intital hash value into state*/
|
||||
/*load the initial hash value into state*/
|
||||
switch (hashbitlen)
|
||||
{
|
||||
case 224: memcpy(state->x,JH224_H0,128); break;
|
||||
|
||||
@@ -48,7 +48,7 @@
|
||||
multiple of size / 8)
|
||||
|
||||
ptr_cast(x,size) casts a pointer to a pointer to a
|
||||
varaiable of length 'size' bits
|
||||
variable of length 'size' bits
|
||||
*/
|
||||
|
||||
#define ui_type(size) uint##size##_t
|
||||
|
||||
@@ -29,6 +29,8 @@
|
||||
|
||||
#if defined(XMRIG_ARM)
|
||||
# include "crypto/cn/sse2neon.h"
|
||||
#elif defined(XMRIG_RISCV)
|
||||
# include "crypto/cn/sse2rvv.h"
|
||||
#elif defined(__GNUC__)
|
||||
# include <x86intrin.h>
|
||||
#else
|
||||
|
||||
748
src/crypto/cn/sse2rvv.h
Normal file
748
src/crypto/cn/sse2rvv.h
Normal file
@@ -0,0 +1,748 @@
|
||||
/* XMRig
|
||||
* Copyright (c) 2025 Slayingripper <https://github.com/Slayingripper>
|
||||
* Copyright (c) 2018-2025 SChernykh <https://github.com/SChernykh>
|
||||
* Copyright (c) 2016-2025 XMRig <support@xmrig.com>
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
/*
|
||||
* SSE to RISC-V Vector (RVV) optimized compatibility header
|
||||
* Provides both scalar fallback and vectorized implementations using RVV intrinsics
|
||||
*
|
||||
* Based on sse2neon.h concepts, adapted for RISC-V architecture with RVV extensions
|
||||
* Original sse2neon.h: https://github.com/DLTcollab/sse2neon
|
||||
*/
|
||||
|
||||
#ifndef XMRIG_SSE2RVV_OPTIMIZED_H
|
||||
#define XMRIG_SSE2RVV_OPTIMIZED_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
/* Check if RVV is available */
|
||||
#if defined(__riscv_vector)
|
||||
#include <riscv_vector.h>
|
||||
#define USE_RVV_INTRINSICS 1
|
||||
#else
|
||||
#define USE_RVV_INTRINSICS 0
|
||||
#endif
|
||||
|
||||
/* 128-bit vector type */
|
||||
typedef union {
|
||||
uint8_t u8[16];
|
||||
uint16_t u16[8];
|
||||
uint32_t u32[4];
|
||||
uint64_t u64[2];
|
||||
int8_t i8[16];
|
||||
int16_t i16[8];
|
||||
int32_t i32[4];
|
||||
int64_t i64[2];
|
||||
} __m128i_union;
|
||||
|
||||
typedef __m128i_union __m128i;
|
||||
|
||||
/* Set operations */
|
||||
static inline __m128i _mm_set_epi32(int e3, int e2, int e1, int e0)
|
||||
{
|
||||
__m128i result;
|
||||
result.i32[0] = e0;
|
||||
result.i32[1] = e1;
|
||||
result.i32[2] = e2;
|
||||
result.i32[3] = e3;
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_set_epi64x(int64_t e1, int64_t e0)
|
||||
{
|
||||
__m128i result;
|
||||
result.i64[0] = e0;
|
||||
result.i64[1] = e1;
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_setzero_si128(void)
|
||||
{
|
||||
__m128i result;
|
||||
memset(&result, 0, sizeof(result));
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Extract/insert operations */
|
||||
static inline int _mm_cvtsi128_si32(__m128i a)
|
||||
{
|
||||
return a.i32[0];
|
||||
}
|
||||
|
||||
static inline int64_t _mm_cvtsi128_si64(__m128i a)
|
||||
{
|
||||
return a.i64[0];
|
||||
}
|
||||
|
||||
static inline __m128i _mm_cvtsi32_si128(int a)
|
||||
{
|
||||
__m128i result = _mm_setzero_si128();
|
||||
result.i32[0] = a;
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_cvtsi64_si128(int64_t a)
|
||||
{
|
||||
__m128i result = _mm_setzero_si128();
|
||||
result.i64[0] = a;
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Shuffle operations */
|
||||
static inline __m128i _mm_shuffle_epi32(__m128i a, int imm8)
|
||||
{
|
||||
__m128i result;
|
||||
result.u32[0] = a.u32[(imm8 >> 0) & 0x3];
|
||||
result.u32[1] = a.u32[(imm8 >> 2) & 0x3];
|
||||
result.u32[2] = a.u32[(imm8 >> 4) & 0x3];
|
||||
result.u32[3] = a.u32[(imm8 >> 6) & 0x3];
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Logical operations - optimized with RVV when available */
|
||||
static inline __m128i _mm_xor_si128(__m128i a, __m128i b)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
size_t vl = __riscv_vsetvl_e64m1(2);
|
||||
vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl);
|
||||
vuint64m1_t vb = __riscv_vle64_v_u64m1(b.u64, vl);
|
||||
vuint64m1_t vr = __riscv_vxor_vv_u64m1(va, vb, vl);
|
||||
__riscv_vse64_v_u64m1(result.u64, vr, vl);
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
result.u64[0] = a.u64[0] ^ b.u64[0];
|
||||
result.u64[1] = a.u64[1] ^ b.u64[1];
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline __m128i _mm_or_si128(__m128i a, __m128i b)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
size_t vl = __riscv_vsetvl_e64m1(2);
|
||||
vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl);
|
||||
vuint64m1_t vb = __riscv_vle64_v_u64m1(b.u64, vl);
|
||||
vuint64m1_t vr = __riscv_vor_vv_u64m1(va, vb, vl);
|
||||
__riscv_vse64_v_u64m1(result.u64, vr, vl);
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
result.u64[0] = a.u64[0] | b.u64[0];
|
||||
result.u64[1] = a.u64[1] | b.u64[1];
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline __m128i _mm_and_si128(__m128i a, __m128i b)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
size_t vl = __riscv_vsetvl_e64m1(2);
|
||||
vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl);
|
||||
vuint64m1_t vb = __riscv_vle64_v_u64m1(b.u64, vl);
|
||||
vuint64m1_t vr = __riscv_vand_vv_u64m1(va, vb, vl);
|
||||
__riscv_vse64_v_u64m1(result.u64, vr, vl);
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
result.u64[0] = a.u64[0] & b.u64[0];
|
||||
result.u64[1] = a.u64[1] & b.u64[1];
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline __m128i _mm_andnot_si128(__m128i a, __m128i b)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
size_t vl = __riscv_vsetvl_e64m1(2);
|
||||
vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl);
|
||||
vuint64m1_t vb = __riscv_vle64_v_u64m1(b.u64, vl);
|
||||
vuint64m1_t vnot_a = __riscv_vnot_v_u64m1(va, vl);
|
||||
vuint64m1_t vr = __riscv_vand_vv_u64m1(vnot_a, vb, vl);
|
||||
__riscv_vse64_v_u64m1(result.u64, vr, vl);
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
result.u64[0] = (~a.u64[0]) & b.u64[0];
|
||||
result.u64[1] = (~a.u64[1]) & b.u64[1];
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Shift operations */
|
||||
static inline __m128i _mm_slli_si128(__m128i a, int imm8)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result = _mm_setzero_si128();
|
||||
int count = imm8 & 0xFF;
|
||||
if (count > 15) return result;
|
||||
|
||||
size_t vl = __riscv_vsetvl_e8m1(16);
|
||||
vuint8m1_t va = __riscv_vle8_v_u8m1(a.u8, vl);
|
||||
vuint8m1_t vr = __riscv_vslideup_vx_u8m1(__riscv_vmv_v_x_u8m1(0, vl), va, count, vl);
|
||||
__riscv_vse8_v_u8m1(result.u8, vr, vl);
|
||||
return result;
|
||||
#else
|
||||
__m128i result = _mm_setzero_si128();
|
||||
int count = imm8 & 0xFF;
|
||||
if (count > 15) return result;
|
||||
|
||||
for (int i = 0; i < 16 - count; i++) {
|
||||
result.u8[i + count] = a.u8[i];
|
||||
}
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline __m128i _mm_srli_si128(__m128i a, int imm8)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result = _mm_setzero_si128();
|
||||
int count = imm8 & 0xFF;
|
||||
if (count > 15) return result;
|
||||
|
||||
size_t vl = __riscv_vsetvl_e8m1(16);
|
||||
vuint8m1_t va = __riscv_vle8_v_u8m1(a.u8, vl);
|
||||
vuint8m1_t vr = __riscv_vslidedown_vx_u8m1(va, count, vl);
|
||||
__riscv_vse8_v_u8m1(result.u8, vr, vl);
|
||||
return result;
|
||||
#else
|
||||
__m128i result = _mm_setzero_si128();
|
||||
int count = imm8 & 0xFF;
|
||||
if (count > 15) return result;
|
||||
|
||||
for (int i = count; i < 16; i++) {
|
||||
result.u8[i - count] = a.u8[i];
|
||||
}
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline __m128i _mm_slli_epi64(__m128i a, int imm8)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
if (imm8 > 63) {
|
||||
result.u64[0] = 0;
|
||||
result.u64[1] = 0;
|
||||
} else {
|
||||
size_t vl = __riscv_vsetvl_e64m1(2);
|
||||
vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl);
|
||||
vuint64m1_t vr = __riscv_vsll_vx_u64m1(va, imm8, vl);
|
||||
__riscv_vse64_v_u64m1(result.u64, vr, vl);
|
||||
}
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
if (imm8 > 63) {
|
||||
result.u64[0] = 0;
|
||||
result.u64[1] = 0;
|
||||
} else {
|
||||
result.u64[0] = a.u64[0] << imm8;
|
||||
result.u64[1] = a.u64[1] << imm8;
|
||||
}
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline __m128i _mm_srli_epi64(__m128i a, int imm8)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
if (imm8 > 63) {
|
||||
result.u64[0] = 0;
|
||||
result.u64[1] = 0;
|
||||
} else {
|
||||
size_t vl = __riscv_vsetvl_e64m1(2);
|
||||
vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl);
|
||||
vuint64m1_t vr = __riscv_vsrl_vx_u64m1(va, imm8, vl);
|
||||
__riscv_vse64_v_u64m1(result.u64, vr, vl);
|
||||
}
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
if (imm8 > 63) {
|
||||
result.u64[0] = 0;
|
||||
result.u64[1] = 0;
|
||||
} else {
|
||||
result.u64[0] = a.u64[0] >> imm8;
|
||||
result.u64[1] = a.u64[1] >> imm8;
|
||||
}
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Load/store operations - optimized with RVV */
|
||||
static inline __m128i _mm_load_si128(const __m128i* p)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
size_t vl = __riscv_vsetvl_e64m1(2);
|
||||
vuint64m1_t v = __riscv_vle64_v_u64m1((const uint64_t*)p, vl);
|
||||
__riscv_vse64_v_u64m1(result.u64, v, vl);
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
memcpy(&result, p, sizeof(__m128i));
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline __m128i _mm_loadu_si128(const __m128i* p)
|
||||
{
|
||||
__m128i result;
|
||||
memcpy(&result, p, sizeof(__m128i));
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline void _mm_store_si128(__m128i* p, __m128i a)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
size_t vl = __riscv_vsetvl_e64m1(2);
|
||||
vuint64m1_t v = __riscv_vle64_v_u64m1(a.u64, vl);
|
||||
__riscv_vse64_v_u64m1((uint64_t*)p, v, vl);
|
||||
#else
|
||||
memcpy(p, &a, sizeof(__m128i));
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void _mm_storeu_si128(__m128i* p, __m128i a)
|
||||
{
|
||||
memcpy(p, &a, sizeof(__m128i));
|
||||
}
|
||||
|
||||
/* Arithmetic operations - optimized with RVV */
|
||||
static inline __m128i _mm_add_epi64(__m128i a, __m128i b)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
size_t vl = __riscv_vsetvl_e64m1(2);
|
||||
vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl);
|
||||
vuint64m1_t vb = __riscv_vle64_v_u64m1(b.u64, vl);
|
||||
vuint64m1_t vr = __riscv_vadd_vv_u64m1(va, vb, vl);
|
||||
__riscv_vse64_v_u64m1(result.u64, vr, vl);
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
result.u64[0] = a.u64[0] + b.u64[0];
|
||||
result.u64[1] = a.u64[1] + b.u64[1];
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline __m128i _mm_add_epi32(__m128i a, __m128i b)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
size_t vl = __riscv_vsetvl_e32m1(4);
|
||||
vuint32m1_t va = __riscv_vle32_v_u32m1(a.u32, vl);
|
||||
vuint32m1_t vb = __riscv_vle32_v_u32m1(b.u32, vl);
|
||||
vuint32m1_t vr = __riscv_vadd_vv_u32m1(va, vb, vl);
|
||||
__riscv_vse32_v_u32m1(result.u32, vr, vl);
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
for (int i = 0; i < 4; i++) {
|
||||
result.i32[i] = a.i32[i] + b.i32[i];
|
||||
}
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline __m128i _mm_sub_epi64(__m128i a, __m128i b)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
size_t vl = __riscv_vsetvl_e64m1(2);
|
||||
vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl);
|
||||
vuint64m1_t vb = __riscv_vle64_v_u64m1(b.u64, vl);
|
||||
vuint64m1_t vr = __riscv_vsub_vv_u64m1(va, vb, vl);
|
||||
__riscv_vse64_v_u64m1(result.u64, vr, vl);
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
result.u64[0] = a.u64[0] - b.u64[0];
|
||||
result.u64[1] = a.u64[1] - b.u64[1];
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline __m128i _mm_mul_epu32(__m128i a, __m128i b)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
size_t vl = __riscv_vsetvl_e64m1(2);
|
||||
vuint64m1_t va_lo = __riscv_vzext_vf2_u64m1(__riscv_vle32_v_u32mf2(&a.u32[0], 2), vl);
|
||||
vuint64m1_t vb_lo = __riscv_vzext_vf2_u64m1(__riscv_vle32_v_u32mf2(&b.u32[0], 2), vl);
|
||||
vuint64m1_t vr = __riscv_vmul_vv_u64m1(va_lo, vb_lo, vl);
|
||||
__riscv_vse64_v_u64m1(result.u64, vr, vl);
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
result.u64[0] = (uint64_t)a.u32[0] * (uint64_t)b.u32[0];
|
||||
result.u64[1] = (uint64_t)a.u32[2] * (uint64_t)b.u32[2];
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Unpack operations */
|
||||
static inline __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
|
||||
{
|
||||
__m128i result;
|
||||
result.u64[0] = a.u64[0];
|
||||
result.u64[1] = b.u64[0];
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
|
||||
{
|
||||
__m128i result;
|
||||
result.u64[0] = a.u64[1];
|
||||
result.u64[1] = b.u64[1];
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Pause instruction for spin-wait loops */
|
||||
static inline void _mm_pause(void)
|
||||
{
|
||||
/* RISC-V pause hint if available (requires Zihintpause extension) */
|
||||
#if defined(__riscv_zihintpause)
|
||||
__asm__ __volatile__("pause");
|
||||
#else
|
||||
__asm__ __volatile__("nop");
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Memory fence - optimized for RISC-V */
|
||||
static inline void _mm_mfence(void)
|
||||
{
|
||||
__asm__ __volatile__("fence rw,rw" ::: "memory");
|
||||
}
|
||||
|
||||
static inline void _mm_lfence(void)
|
||||
{
|
||||
__asm__ __volatile__("fence r,r" ::: "memory");
|
||||
}
|
||||
|
||||
static inline void _mm_sfence(void)
|
||||
{
|
||||
__asm__ __volatile__("fence w,w" ::: "memory");
|
||||
}
|
||||
|
||||
/* Comparison operations */
|
||||
static inline __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
|
||||
{
|
||||
__m128i result;
|
||||
for (int i = 0; i < 4; i++) {
|
||||
result.u32[i] = (a.u32[i] == b.u32[i]) ? 0xFFFFFFFF : 0;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
|
||||
{
|
||||
__m128i result;
|
||||
for (int i = 0; i < 2; i++) {
|
||||
result.u64[i] = (a.u64[i] == b.u64[i]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Additional shift operations */
|
||||
static inline __m128i _mm_slli_epi32(__m128i a, int imm8)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
if (imm8 > 31) {
|
||||
memset(&result, 0, sizeof(result));
|
||||
} else {
|
||||
size_t vl = __riscv_vsetvl_e32m1(4);
|
||||
vuint32m1_t va = __riscv_vle32_v_u32m1(a.u32, vl);
|
||||
vuint32m1_t vr = __riscv_vsll_vx_u32m1(va, imm8, vl);
|
||||
__riscv_vse32_v_u32m1(result.u32, vr, vl);
|
||||
}
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
if (imm8 > 31) {
|
||||
for (int i = 0; i < 4; i++) result.u32[i] = 0;
|
||||
} else {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
result.u32[i] = a.u32[i] << imm8;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline __m128i _mm_srli_epi32(__m128i a, int imm8)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
if (imm8 > 31) {
|
||||
memset(&result, 0, sizeof(result));
|
||||
} else {
|
||||
size_t vl = __riscv_vsetvl_e32m1(4);
|
||||
vuint32m1_t va = __riscv_vle32_v_u32m1(a.u32, vl);
|
||||
vuint32m1_t vr = __riscv_vsrl_vx_u32m1(va, imm8, vl);
|
||||
__riscv_vse32_v_u32m1(result.u32, vr, vl);
|
||||
}
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
if (imm8 > 31) {
|
||||
for (int i = 0; i < 4; i++) result.u32[i] = 0;
|
||||
} else {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
result.u32[i] = a.u32[i] >> imm8;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
/* 64-bit integer operations */
|
||||
static inline __m128i _mm_set1_epi64x(int64_t a)
|
||||
{
|
||||
__m128i result;
|
||||
result.i64[0] = a;
|
||||
result.i64[1] = a;
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Float type for compatibility */
|
||||
typedef __m128i __m128;
|
||||
|
||||
/* Float operations - simplified scalar implementations */
|
||||
static inline __m128 _mm_set1_ps(float a)
|
||||
{
|
||||
__m128 result;
|
||||
uint32_t val;
|
||||
memcpy(&val, &a, sizeof(float));
|
||||
for (int i = 0; i < 4; i++) {
|
||||
result.u32[i] = val;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128 _mm_setzero_ps(void)
|
||||
{
|
||||
__m128 result;
|
||||
memset(&result, 0, sizeof(result));
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128 _mm_add_ps(__m128 a, __m128 b)
|
||||
{
|
||||
__m128 result;
|
||||
float fa[4], fb[4], fr[4];
|
||||
memcpy(fa, &a, sizeof(__m128));
|
||||
memcpy(fb, &b, sizeof(__m128));
|
||||
for (int i = 0; i < 4; i++) {
|
||||
fr[i] = fa[i] + fb[i];
|
||||
}
|
||||
memcpy(&result, fr, sizeof(__m128));
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128 _mm_mul_ps(__m128 a, __m128 b)
|
||||
{
|
||||
__m128 result;
|
||||
float fa[4], fb[4], fr[4];
|
||||
memcpy(fa, &a, sizeof(__m128));
|
||||
memcpy(fb, &b, sizeof(__m128));
|
||||
for (int i = 0; i < 4; i++) {
|
||||
fr[i] = fa[i] * fb[i];
|
||||
}
|
||||
memcpy(&result, fr, sizeof(__m128));
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128 _mm_and_ps(__m128 a, __m128 b)
|
||||
{
|
||||
__m128 result;
|
||||
result.u64[0] = a.u64[0] & b.u64[0];
|
||||
result.u64[1] = a.u64[1] & b.u64[1];
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128 _mm_or_ps(__m128 a, __m128 b)
|
||||
{
|
||||
__m128 result;
|
||||
result.u64[0] = a.u64[0] | b.u64[0];
|
||||
result.u64[1] = a.u64[1] | b.u64[1];
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128 _mm_cvtepi32_ps(__m128i a)
|
||||
{
|
||||
__m128 result;
|
||||
float fr[4];
|
||||
for (int i = 0; i < 4; i++) {
|
||||
fr[i] = (float)a.i32[i];
|
||||
}
|
||||
memcpy(&result, fr, sizeof(__m128));
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_cvttps_epi32(__m128 a)
|
||||
{
|
||||
__m128i result;
|
||||
float fa[4];
|
||||
memcpy(fa, &a, sizeof(__m128));
|
||||
for (int i = 0; i < 4; i++) {
|
||||
result.i32[i] = (int32_t)fa[i];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Casting operations */
|
||||
static inline __m128 _mm_castsi128_ps(__m128i a)
|
||||
{
|
||||
__m128 result;
|
||||
memcpy(&result, &a, sizeof(__m128));
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_castps_si128(__m128 a)
|
||||
{
|
||||
__m128i result;
|
||||
memcpy(&result, &a, sizeof(__m128));
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Additional set operations */
|
||||
static inline __m128i _mm_set1_epi32(int a)
|
||||
{
|
||||
__m128i result;
|
||||
for (int i = 0; i < 4; i++) {
|
||||
result.i32[i] = a;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/* AES instructions - placeholders for soft_aes compatibility */
|
||||
static inline __m128i _mm_aesenc_si128(__m128i a, __m128i roundkey)
|
||||
{
|
||||
return _mm_xor_si128(a, roundkey);
|
||||
}
|
||||
|
||||
static inline __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
|
||||
{
|
||||
return a;
|
||||
}
|
||||
|
||||
/* Rotate right operation for soft_aes.h */
|
||||
static inline uint32_t _rotr(uint32_t value, unsigned int count)
|
||||
{
|
||||
const unsigned int mask = 31;
|
||||
count &= mask;
|
||||
return (value >> count) | (value << ((-count) & mask));
|
||||
}
|
||||
|
||||
/* ARM NEON compatibility types and intrinsics for RISC-V */
|
||||
typedef __m128i_union uint64x2_t;
|
||||
typedef __m128i_union uint8x16_t;
|
||||
typedef __m128i_union int64x2_t;
|
||||
typedef __m128i_union int32x4_t;
|
||||
|
||||
static inline uint64x2_t vld1q_u64(const uint64_t *ptr)
|
||||
{
|
||||
uint64x2_t result;
|
||||
result.u64[0] = ptr[0];
|
||||
result.u64[1] = ptr[1];
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline int64x2_t vld1q_s64(const int64_t *ptr)
|
||||
{
|
||||
int64x2_t result;
|
||||
result.i64[0] = ptr[0];
|
||||
result.i64[1] = ptr[1];
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline void vst1q_u64(uint64_t *ptr, uint64x2_t val)
|
||||
{
|
||||
ptr[0] = val.u64[0];
|
||||
ptr[1] = val.u64[1];
|
||||
}
|
||||
|
||||
static inline uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b)
|
||||
{
|
||||
return _mm_xor_si128(a, b);
|
||||
}
|
||||
|
||||
static inline uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b)
|
||||
{
|
||||
return _mm_add_epi64(a, b);
|
||||
}
|
||||
|
||||
static inline uint64x2_t vreinterpretq_u64_u8(uint8x16_t a)
|
||||
{
|
||||
uint64x2_t result;
|
||||
memcpy(&result, &a, sizeof(uint64x2_t));
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline uint64_t vgetq_lane_u64(uint64x2_t v, int lane)
|
||||
{
|
||||
return v.u64[lane];
|
||||
}
|
||||
|
||||
static inline int64_t vgetq_lane_s64(int64x2_t v, int lane)
|
||||
{
|
||||
return v.i64[lane];
|
||||
}
|
||||
|
||||
static inline int32_t vgetq_lane_s32(int32x4_t v, int lane)
|
||||
{
|
||||
return v.i32[lane];
|
||||
}
|
||||
|
||||
typedef struct { uint64_t val[1]; } uint64x1_t;
|
||||
|
||||
static inline uint64x1_t vcreate_u64(uint64_t a)
|
||||
{
|
||||
uint64x1_t result;
|
||||
result.val[0] = a;
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high)
|
||||
{
|
||||
uint64x2_t result;
|
||||
result.u64[0] = low.val[0];
|
||||
result.u64[1] = high.val[0];
|
||||
return result;
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* XMRIG_SSE2RVV_OPTIMIZED_H */
|
||||
748
src/crypto/cn/sse2rvv_optimized.h
Normal file
748
src/crypto/cn/sse2rvv_optimized.h
Normal file
@@ -0,0 +1,748 @@
|
||||
/* XMRig
|
||||
* Copyright (c) 2025 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
/*
|
||||
* SSE to RISC-V Vector (RVV) optimized compatibility header
|
||||
* Provides both scalar fallback and vectorized implementations using RVV intrinsics
|
||||
*/
|
||||
|
||||
#ifndef XMRIG_SSE2RVV_OPTIMIZED_H
|
||||
#define XMRIG_SSE2RVV_OPTIMIZED_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
/* Check if RVV is available */
|
||||
#if defined(__riscv_vector)
|
||||
#include <riscv_vector.h>
|
||||
#define USE_RVV_INTRINSICS 1
|
||||
#else
|
||||
#define USE_RVV_INTRINSICS 0
|
||||
#endif
|
||||
|
||||
/* 128-bit vector type */
|
||||
typedef union {
|
||||
uint8_t u8[16];
|
||||
uint16_t u16[8];
|
||||
uint32_t u32[4];
|
||||
uint64_t u64[2];
|
||||
int8_t i8[16];
|
||||
int16_t i16[8];
|
||||
int32_t i32[4];
|
||||
int64_t i64[2];
|
||||
#if USE_RVV_INTRINSICS
|
||||
vuint64m1_t rvv_u64;
|
||||
vuint32m1_t rvv_u32;
|
||||
vuint8m1_t rvv_u8;
|
||||
#endif
|
||||
} __m128i_union;
|
||||
|
||||
typedef __m128i_union __m128i;
|
||||
|
||||
/* Set operations */
|
||||
static inline __m128i _mm_set_epi32(int e3, int e2, int e1, int e0)
|
||||
{
|
||||
__m128i result;
|
||||
result.i32[0] = e0;
|
||||
result.i32[1] = e1;
|
||||
result.i32[2] = e2;
|
||||
result.i32[3] = e3;
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_set_epi64x(int64_t e1, int64_t e0)
|
||||
{
|
||||
__m128i result;
|
||||
result.i64[0] = e0;
|
||||
result.i64[1] = e1;
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_setzero_si128(void)
|
||||
{
|
||||
__m128i result;
|
||||
memset(&result, 0, sizeof(result));
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Extract/insert operations */
|
||||
static inline int _mm_cvtsi128_si32(__m128i a)
|
||||
{
|
||||
return a.i32[0];
|
||||
}
|
||||
|
||||
static inline int64_t _mm_cvtsi128_si64(__m128i a)
|
||||
{
|
||||
return a.i64[0];
|
||||
}
|
||||
|
||||
static inline __m128i _mm_cvtsi32_si128(int a)
|
||||
{
|
||||
__m128i result = _mm_setzero_si128();
|
||||
result.i32[0] = a;
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_cvtsi64_si128(int64_t a)
|
||||
{
|
||||
__m128i result = _mm_setzero_si128();
|
||||
result.i64[0] = a;
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Shuffle operations */
|
||||
static inline __m128i _mm_shuffle_epi32(__m128i a, int imm8)
|
||||
{
|
||||
__m128i result;
|
||||
result.u32[0] = a.u32[(imm8 >> 0) & 0x3];
|
||||
result.u32[1] = a.u32[(imm8 >> 2) & 0x3];
|
||||
result.u32[2] = a.u32[(imm8 >> 4) & 0x3];
|
||||
result.u32[3] = a.u32[(imm8 >> 6) & 0x3];
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Logical operations - optimized with RVV when available */
|
||||
static inline __m128i _mm_xor_si128(__m128i a, __m128i b)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
size_t vl = __riscv_vsetvl_e64m1(2);
|
||||
vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl);
|
||||
vuint64m1_t vb = __riscv_vle64_v_u64m1(b.u64, vl);
|
||||
vuint64m1_t vr = __riscv_vxor_vv_u64m1(va, vb, vl);
|
||||
__riscv_vse64_v_u64m1(result.u64, vr, vl);
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
result.u64[0] = a.u64[0] ^ b.u64[0];
|
||||
result.u64[1] = a.u64[1] ^ b.u64[1];
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline __m128i _mm_or_si128(__m128i a, __m128i b)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
size_t vl = __riscv_vsetvl_e64m1(2);
|
||||
vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl);
|
||||
vuint64m1_t vb = __riscv_vle64_v_u64m1(b.u64, vl);
|
||||
vuint64m1_t vr = __riscv_vor_vv_u64m1(va, vb, vl);
|
||||
__riscv_vse64_v_u64m1(result.u64, vr, vl);
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
result.u64[0] = a.u64[0] | b.u64[0];
|
||||
result.u64[1] = a.u64[1] | b.u64[1];
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline __m128i _mm_and_si128(__m128i a, __m128i b)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
size_t vl = __riscv_vsetvl_e64m1(2);
|
||||
vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl);
|
||||
vuint64m1_t vb = __riscv_vle64_v_u64m1(b.u64, vl);
|
||||
vuint64m1_t vr = __riscv_vand_vv_u64m1(va, vb, vl);
|
||||
__riscv_vse64_v_u64m1(result.u64, vr, vl);
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
result.u64[0] = a.u64[0] & b.u64[0];
|
||||
result.u64[1] = a.u64[1] & b.u64[1];
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline __m128i _mm_andnot_si128(__m128i a, __m128i b)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
size_t vl = __riscv_vsetvl_e64m1(2);
|
||||
vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl);
|
||||
vuint64m1_t vb = __riscv_vle64_v_u64m1(b.u64, vl);
|
||||
vuint64m1_t vnot_a = __riscv_vnot_v_u64m1(va, vl);
|
||||
vuint64m1_t vr = __riscv_vand_vv_u64m1(vnot_a, vb, vl);
|
||||
__riscv_vse64_v_u64m1(result.u64, vr, vl);
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
result.u64[0] = (~a.u64[0]) & b.u64[0];
|
||||
result.u64[1] = (~a.u64[1]) & b.u64[1];
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Shift operations */
|
||||
static inline __m128i _mm_slli_si128(__m128i a, int imm8)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result = _mm_setzero_si128();
|
||||
int count = imm8 & 0xFF;
|
||||
if (count > 15) return result;
|
||||
|
||||
size_t vl = __riscv_vsetvl_e8m1(16);
|
||||
vuint8m1_t va = __riscv_vle8_v_u8m1(a.u8, vl);
|
||||
vuint8m1_t vr = __riscv_vslideup_vx_u8m1(__riscv_vmv_v_x_u8m1(0, vl), va, count, vl);
|
||||
__riscv_vse8_v_u8m1(result.u8, vr, vl);
|
||||
return result;
|
||||
#else
|
||||
__m128i result = _mm_setzero_si128();
|
||||
int count = imm8 & 0xFF;
|
||||
if (count > 15) return result;
|
||||
|
||||
for (int i = 0; i < 16 - count; i++) {
|
||||
result.u8[i + count] = a.u8[i];
|
||||
}
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline __m128i _mm_srli_si128(__m128i a, int imm8)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result = _mm_setzero_si128();
|
||||
int count = imm8 & 0xFF;
|
||||
if (count > 15) return result;
|
||||
|
||||
size_t vl = __riscv_vsetvl_e8m1(16);
|
||||
vuint8m1_t va = __riscv_vle8_v_u8m1(a.u8, vl);
|
||||
vuint8m1_t vr = __riscv_vslidedown_vx_u8m1(va, count, vl);
|
||||
__riscv_vse8_v_u8m1(result.u8, vr, vl);
|
||||
return result;
|
||||
#else
|
||||
__m128i result = _mm_setzero_si128();
|
||||
int count = imm8 & 0xFF;
|
||||
if (count > 15) return result;
|
||||
|
||||
for (int i = count; i < 16; i++) {
|
||||
result.u8[i - count] = a.u8[i];
|
||||
}
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline __m128i _mm_slli_epi64(__m128i a, int imm8)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
if (imm8 > 63) {
|
||||
result.u64[0] = 0;
|
||||
result.u64[1] = 0;
|
||||
} else {
|
||||
size_t vl = __riscv_vsetvl_e64m1(2);
|
||||
vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl);
|
||||
vuint64m1_t vr = __riscv_vsll_vx_u64m1(va, imm8, vl);
|
||||
__riscv_vse64_v_u64m1(result.u64, vr, vl);
|
||||
}
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
if (imm8 > 63) {
|
||||
result.u64[0] = 0;
|
||||
result.u64[1] = 0;
|
||||
} else {
|
||||
result.u64[0] = a.u64[0] << imm8;
|
||||
result.u64[1] = a.u64[1] << imm8;
|
||||
}
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline __m128i _mm_srli_epi64(__m128i a, int imm8)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
if (imm8 > 63) {
|
||||
result.u64[0] = 0;
|
||||
result.u64[1] = 0;
|
||||
} else {
|
||||
size_t vl = __riscv_vsetvl_e64m1(2);
|
||||
vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl);
|
||||
vuint64m1_t vr = __riscv_vsrl_vx_u64m1(va, imm8, vl);
|
||||
__riscv_vse64_v_u64m1(result.u64, vr, vl);
|
||||
}
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
if (imm8 > 63) {
|
||||
result.u64[0] = 0;
|
||||
result.u64[1] = 0;
|
||||
} else {
|
||||
result.u64[0] = a.u64[0] >> imm8;
|
||||
result.u64[1] = a.u64[1] >> imm8;
|
||||
}
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Load/store operations - optimized with RVV */
|
||||
static inline __m128i _mm_load_si128(const __m128i* p)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
size_t vl = __riscv_vsetvl_e64m1(2);
|
||||
vuint64m1_t v = __riscv_vle64_v_u64m1((const uint64_t*)p, vl);
|
||||
__riscv_vse64_v_u64m1(result.u64, v, vl);
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
memcpy(&result, p, sizeof(__m128i));
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline __m128i _mm_loadu_si128(const __m128i* p)
|
||||
{
|
||||
__m128i result;
|
||||
memcpy(&result, p, sizeof(__m128i));
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline void _mm_store_si128(__m128i* p, __m128i a)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
size_t vl = __riscv_vsetvl_e64m1(2);
|
||||
vuint64m1_t v = __riscv_vle64_v_u64m1(a.u64, vl);
|
||||
__riscv_vse64_v_u64m1((uint64_t*)p, v, vl);
|
||||
#else
|
||||
memcpy(p, &a, sizeof(__m128i));
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void _mm_storeu_si128(__m128i* p, __m128i a)
|
||||
{
|
||||
memcpy(p, &a, sizeof(__m128i));
|
||||
}
|
||||
|
||||
/* Arithmetic operations - optimized with RVV */
|
||||
static inline __m128i _mm_add_epi64(__m128i a, __m128i b)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
size_t vl = __riscv_vsetvl_e64m1(2);
|
||||
vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl);
|
||||
vuint64m1_t vb = __riscv_vle64_v_u64m1(b.u64, vl);
|
||||
vuint64m1_t vr = __riscv_vadd_vv_u64m1(va, vb, vl);
|
||||
__riscv_vse64_v_u64m1(result.u64, vr, vl);
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
result.u64[0] = a.u64[0] + b.u64[0];
|
||||
result.u64[1] = a.u64[1] + b.u64[1];
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline __m128i _mm_add_epi32(__m128i a, __m128i b)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
size_t vl = __riscv_vsetvl_e32m1(4);
|
||||
vuint32m1_t va = __riscv_vle32_v_u32m1(a.u32, vl);
|
||||
vuint32m1_t vb = __riscv_vle32_v_u32m1(b.u32, vl);
|
||||
vuint32m1_t vr = __riscv_vadd_vv_u32m1(va, vb, vl);
|
||||
__riscv_vse32_v_u32m1(result.u32, vr, vl);
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
for (int i = 0; i < 4; i++) {
|
||||
result.i32[i] = a.i32[i] + b.i32[i];
|
||||
}
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline __m128i _mm_sub_epi64(__m128i a, __m128i b)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
size_t vl = __riscv_vsetvl_e64m1(2);
|
||||
vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl);
|
||||
vuint64m1_t vb = __riscv_vle64_v_u64m1(b.u64, vl);
|
||||
vuint64m1_t vr = __riscv_vsub_vv_u64m1(va, vb, vl);
|
||||
__riscv_vse64_v_u64m1(result.u64, vr, vl);
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
result.u64[0] = a.u64[0] - b.u64[0];
|
||||
result.u64[1] = a.u64[1] - b.u64[1];
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline __m128i _mm_mul_epu32(__m128i a, __m128i b)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
size_t vl = __riscv_vsetvl_e64m1(2);
|
||||
vuint64m1_t va_lo = __riscv_vzext_vf2_u64m1(__riscv_vle32_v_u32mf2(&a.u32[0], 2), vl);
|
||||
vuint64m1_t vb_lo = __riscv_vzext_vf2_u64m1(__riscv_vle32_v_u32mf2(&b.u32[0], 2), vl);
|
||||
vuint64m1_t vr = __riscv_vmul_vv_u64m1(va_lo, vb_lo, vl);
|
||||
__riscv_vse64_v_u64m1(result.u64, vr, vl);
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
result.u64[0] = (uint64_t)a.u32[0] * (uint64_t)b.u32[0];
|
||||
result.u64[1] = (uint64_t)a.u32[2] * (uint64_t)b.u32[2];
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Unpack operations */
|
||||
static inline __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
|
||||
{
|
||||
__m128i result;
|
||||
result.u64[0] = a.u64[0];
|
||||
result.u64[1] = b.u64[0];
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
|
||||
{
|
||||
__m128i result;
|
||||
result.u64[0] = a.u64[1];
|
||||
result.u64[1] = b.u64[1];
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Pause instruction for spin-wait loops */
|
||||
static inline void _mm_pause(void)
|
||||
{
|
||||
/* RISC-V pause hint if available (requires Zihintpause extension) */
|
||||
#if defined(__riscv_zihintpause)
|
||||
__asm__ __volatile__("pause");
|
||||
#else
|
||||
__asm__ __volatile__("nop");
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Memory fence - optimized for RISC-V */
|
||||
static inline void _mm_mfence(void)
|
||||
{
|
||||
__asm__ __volatile__("fence rw,rw" ::: "memory");
|
||||
}
|
||||
|
||||
static inline void _mm_lfence(void)
|
||||
{
|
||||
__asm__ __volatile__("fence r,r" ::: "memory");
|
||||
}
|
||||
|
||||
static inline void _mm_sfence(void)
|
||||
{
|
||||
__asm__ __volatile__("fence w,w" ::: "memory");
|
||||
}
|
||||
|
||||
/* Comparison operations */
|
||||
static inline __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
|
||||
{
|
||||
__m128i result;
|
||||
for (int i = 0; i < 4; i++) {
|
||||
result.u32[i] = (a.u32[i] == b.u32[i]) ? 0xFFFFFFFF : 0;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
|
||||
{
|
||||
__m128i result;
|
||||
for (int i = 0; i < 2; i++) {
|
||||
result.u64[i] = (a.u64[i] == b.u64[i]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Additional shift operations */
|
||||
static inline __m128i _mm_slli_epi32(__m128i a, int imm8)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
if (imm8 > 31) {
|
||||
memset(&result, 0, sizeof(result));
|
||||
} else {
|
||||
size_t vl = __riscv_vsetvl_e32m1(4);
|
||||
vuint32m1_t va = __riscv_vle32_v_u32m1(a.u32, vl);
|
||||
vuint32m1_t vr = __riscv_vsll_vx_u32m1(va, imm8, vl);
|
||||
__riscv_vse32_v_u32m1(result.u32, vr, vl);
|
||||
}
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
if (imm8 > 31) {
|
||||
for (int i = 0; i < 4; i++) result.u32[i] = 0;
|
||||
} else {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
result.u32[i] = a.u32[i] << imm8;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline __m128i _mm_srli_epi32(__m128i a, int imm8)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
if (imm8 > 31) {
|
||||
memset(&result, 0, sizeof(result));
|
||||
} else {
|
||||
size_t vl = __riscv_vsetvl_e32m1(4);
|
||||
vuint32m1_t va = __riscv_vle32_v_u32m1(a.u32, vl);
|
||||
vuint32m1_t vr = __riscv_vsrl_vx_u32m1(va, imm8, vl);
|
||||
__riscv_vse32_v_u32m1(result.u32, vr, vl);
|
||||
}
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
if (imm8 > 31) {
|
||||
for (int i = 0; i < 4; i++) result.u32[i] = 0;
|
||||
} else {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
result.u32[i] = a.u32[i] >> imm8;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
/* 64-bit integer operations */
|
||||
static inline __m128i _mm_set1_epi64x(int64_t a)
|
||||
{
|
||||
__m128i result;
|
||||
result.i64[0] = a;
|
||||
result.i64[1] = a;
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Float type for compatibility */
|
||||
typedef __m128i __m128;
|
||||
|
||||
/* Float operations - simplified scalar implementations */
|
||||
static inline __m128 _mm_set1_ps(float a)
|
||||
{
|
||||
__m128 result;
|
||||
uint32_t val;
|
||||
memcpy(&val, &a, sizeof(float));
|
||||
for (int i = 0; i < 4; i++) {
|
||||
result.u32[i] = val;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128 _mm_setzero_ps(void)
|
||||
{
|
||||
__m128 result;
|
||||
memset(&result, 0, sizeof(result));
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128 _mm_add_ps(__m128 a, __m128 b)
|
||||
{
|
||||
__m128 result;
|
||||
float fa[4], fb[4], fr[4];
|
||||
memcpy(fa, &a, sizeof(__m128));
|
||||
memcpy(fb, &b, sizeof(__m128));
|
||||
for (int i = 0; i < 4; i++) {
|
||||
fr[i] = fa[i] + fb[i];
|
||||
}
|
||||
memcpy(&result, fr, sizeof(__m128));
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128 _mm_mul_ps(__m128 a, __m128 b)
|
||||
{
|
||||
__m128 result;
|
||||
float fa[4], fb[4], fr[4];
|
||||
memcpy(fa, &a, sizeof(__m128));
|
||||
memcpy(fb, &b, sizeof(__m128));
|
||||
for (int i = 0; i < 4; i++) {
|
||||
fr[i] = fa[i] * fb[i];
|
||||
}
|
||||
memcpy(&result, fr, sizeof(__m128));
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128 _mm_and_ps(__m128 a, __m128 b)
|
||||
{
|
||||
__m128 result;
|
||||
result.u64[0] = a.u64[0] & b.u64[0];
|
||||
result.u64[1] = a.u64[1] & b.u64[1];
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128 _mm_or_ps(__m128 a, __m128 b)
|
||||
{
|
||||
__m128 result;
|
||||
result.u64[0] = a.u64[0] | b.u64[0];
|
||||
result.u64[1] = a.u64[1] | b.u64[1];
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128 _mm_cvtepi32_ps(__m128i a)
|
||||
{
|
||||
__m128 result;
|
||||
float fr[4];
|
||||
for (int i = 0; i < 4; i++) {
|
||||
fr[i] = (float)a.i32[i];
|
||||
}
|
||||
memcpy(&result, fr, sizeof(__m128));
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_cvttps_epi32(__m128 a)
|
||||
{
|
||||
__m128i result;
|
||||
float fa[4];
|
||||
memcpy(fa, &a, sizeof(__m128));
|
||||
for (int i = 0; i < 4; i++) {
|
||||
result.i32[i] = (int32_t)fa[i];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Casting operations */
|
||||
static inline __m128 _mm_castsi128_ps(__m128i a)
|
||||
{
|
||||
__m128 result;
|
||||
memcpy(&result, &a, sizeof(__m128));
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_castps_si128(__m128 a)
|
||||
{
|
||||
__m128i result;
|
||||
memcpy(&result, &a, sizeof(__m128));
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Additional set operations */
|
||||
static inline __m128i _mm_set1_epi32(int a)
|
||||
{
|
||||
__m128i result;
|
||||
for (int i = 0; i < 4; i++) {
|
||||
result.i32[i] = a;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/* AES instructions - placeholders for soft_aes compatibility */
|
||||
static inline __m128i _mm_aesenc_si128(__m128i a, __m128i roundkey)
|
||||
{
|
||||
return _mm_xor_si128(a, roundkey);
|
||||
}
|
||||
|
||||
static inline __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
|
||||
{
|
||||
return a;
|
||||
}
|
||||
|
||||
/* Rotate right operation for soft_aes.h */
|
||||
static inline uint32_t _rotr(uint32_t value, unsigned int count)
|
||||
{
|
||||
const unsigned int mask = 31;
|
||||
count &= mask;
|
||||
return (value >> count) | (value << ((-count) & mask));
|
||||
}
|
||||
|
||||
/* ARM NEON compatibility types and intrinsics for RISC-V */
|
||||
typedef __m128i_union uint64x2_t;
|
||||
typedef __m128i_union uint8x16_t;
|
||||
typedef __m128i_union int64x2_t;
|
||||
typedef __m128i_union int32x4_t;
|
||||
|
||||
static inline uint64x2_t vld1q_u64(const uint64_t *ptr)
|
||||
{
|
||||
uint64x2_t result;
|
||||
result.u64[0] = ptr[0];
|
||||
result.u64[1] = ptr[1];
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline int64x2_t vld1q_s64(const int64_t *ptr)
|
||||
{
|
||||
int64x2_t result;
|
||||
result.i64[0] = ptr[0];
|
||||
result.i64[1] = ptr[1];
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline void vst1q_u64(uint64_t *ptr, uint64x2_t val)
|
||||
{
|
||||
ptr[0] = val.u64[0];
|
||||
ptr[1] = val.u64[1];
|
||||
}
|
||||
|
||||
static inline uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b)
|
||||
{
|
||||
return _mm_xor_si128(a, b);
|
||||
}
|
||||
|
||||
static inline uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b)
|
||||
{
|
||||
return _mm_add_epi64(a, b);
|
||||
}
|
||||
|
||||
static inline uint64x2_t vreinterpretq_u64_u8(uint8x16_t a)
|
||||
{
|
||||
uint64x2_t result;
|
||||
memcpy(&result, &a, sizeof(uint64x2_t));
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline uint64_t vgetq_lane_u64(uint64x2_t v, int lane)
|
||||
{
|
||||
return v.u64[lane];
|
||||
}
|
||||
|
||||
static inline int64_t vgetq_lane_s64(int64x2_t v, int lane)
|
||||
{
|
||||
return v.i64[lane];
|
||||
}
|
||||
|
||||
static inline int32_t vgetq_lane_s32(int32x4_t v, int lane)
|
||||
{
|
||||
return v.i32[lane];
|
||||
}
|
||||
|
||||
typedef struct { uint64_t val[1]; } uint64x1_t;
|
||||
|
||||
static inline uint64x1_t vcreate_u64(uint64_t a)
|
||||
{
|
||||
uint64x1_t result;
|
||||
result.val[0] = a;
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high)
|
||||
{
|
||||
uint64x2_t result;
|
||||
result.u64[0] = low.val[0];
|
||||
result.u64[1] = high.val[0];
|
||||
return result;
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* XMRIG_SSE2RVV_OPTIMIZED_H */
|
||||
571
src/crypto/cn/sse2rvv_scalar_backup.h
Normal file
571
src/crypto/cn/sse2rvv_scalar_backup.h
Normal file
@@ -0,0 +1,571 @@
|
||||
/* XMRig
|
||||
* Copyright (c) 2025 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
/*
|
||||
* SSE to RISC-V compatibility header
|
||||
* Provides scalar implementations of SSE intrinsics for RISC-V architecture
|
||||
*/
|
||||
|
||||
#ifndef XMRIG_SSE2RVV_H
|
||||
#define XMRIG_SSE2RVV_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
/* 128-bit vector type */
|
||||
typedef union {
|
||||
uint8_t u8[16];
|
||||
uint16_t u16[8];
|
||||
uint32_t u32[4];
|
||||
uint64_t u64[2];
|
||||
int8_t i8[16];
|
||||
int16_t i16[8];
|
||||
int32_t i32[4];
|
||||
int64_t i64[2];
|
||||
} __m128i_union;
|
||||
|
||||
typedef __m128i_union __m128i;
|
||||
|
||||
/* Set operations */
|
||||
static inline __m128i _mm_set_epi32(int e3, int e2, int e1, int e0)
|
||||
{
|
||||
__m128i result;
|
||||
result.i32[0] = e0;
|
||||
result.i32[1] = e1;
|
||||
result.i32[2] = e2;
|
||||
result.i32[3] = e3;
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_set_epi64x(int64_t e1, int64_t e0)
|
||||
{
|
||||
__m128i result;
|
||||
result.i64[0] = e0;
|
||||
result.i64[1] = e1;
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_setzero_si128(void)
|
||||
{
|
||||
__m128i result;
|
||||
memset(&result, 0, sizeof(result));
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Extract/insert operations */
|
||||
static inline int _mm_cvtsi128_si32(__m128i a)
|
||||
{
|
||||
return a.i32[0];
|
||||
}
|
||||
|
||||
static inline int64_t _mm_cvtsi128_si64(__m128i a)
|
||||
{
|
||||
return a.i64[0];
|
||||
}
|
||||
|
||||
static inline __m128i _mm_cvtsi32_si128(int a)
|
||||
{
|
||||
__m128i result = _mm_setzero_si128();
|
||||
result.i32[0] = a;
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_cvtsi64_si128(int64_t a)
|
||||
{
|
||||
__m128i result = _mm_setzero_si128();
|
||||
result.i64[0] = a;
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Shuffle operations */
|
||||
static inline __m128i _mm_shuffle_epi32(__m128i a, int imm8)
|
||||
{
|
||||
__m128i result;
|
||||
result.u32[0] = a.u32[(imm8 >> 0) & 0x3];
|
||||
result.u32[1] = a.u32[(imm8 >> 2) & 0x3];
|
||||
result.u32[2] = a.u32[(imm8 >> 4) & 0x3];
|
||||
result.u32[3] = a.u32[(imm8 >> 6) & 0x3];
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Logical operations */
|
||||
static inline __m128i _mm_xor_si128(__m128i a, __m128i b)
|
||||
{
|
||||
__m128i result;
|
||||
result.u64[0] = a.u64[0] ^ b.u64[0];
|
||||
result.u64[1] = a.u64[1] ^ b.u64[1];
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_or_si128(__m128i a, __m128i b)
|
||||
{
|
||||
__m128i result;
|
||||
result.u64[0] = a.u64[0] | b.u64[0];
|
||||
result.u64[1] = a.u64[1] | b.u64[1];
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_and_si128(__m128i a, __m128i b)
|
||||
{
|
||||
__m128i result;
|
||||
result.u64[0] = a.u64[0] & b.u64[0];
|
||||
result.u64[1] = a.u64[1] & b.u64[1];
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_andnot_si128(__m128i a, __m128i b)
|
||||
{
|
||||
__m128i result;
|
||||
result.u64[0] = (~a.u64[0]) & b.u64[0];
|
||||
result.u64[1] = (~a.u64[1]) & b.u64[1];
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Shift operations */
|
||||
static inline __m128i _mm_slli_si128(__m128i a, int imm8)
|
||||
{
|
||||
__m128i result = _mm_setzero_si128();
|
||||
int count = imm8 & 0xFF;
|
||||
if (count > 15) return result;
|
||||
|
||||
for (int i = 0; i < 16 - count; i++) {
|
||||
result.u8[i + count] = a.u8[i];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_srli_si128(__m128i a, int imm8)
|
||||
{
|
||||
__m128i result = _mm_setzero_si128();
|
||||
int count = imm8 & 0xFF;
|
||||
if (count > 15) return result;
|
||||
|
||||
for (int i = count; i < 16; i++) {
|
||||
result.u8[i - count] = a.u8[i];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_slli_epi64(__m128i a, int imm8)
|
||||
{
|
||||
__m128i result;
|
||||
if (imm8 > 63) {
|
||||
result.u64[0] = 0;
|
||||
result.u64[1] = 0;
|
||||
} else {
|
||||
result.u64[0] = a.u64[0] << imm8;
|
||||
result.u64[1] = a.u64[1] << imm8;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_srli_epi64(__m128i a, int imm8)
|
||||
{
|
||||
__m128i result;
|
||||
if (imm8 > 63) {
|
||||
result.u64[0] = 0;
|
||||
result.u64[1] = 0;
|
||||
} else {
|
||||
result.u64[0] = a.u64[0] >> imm8;
|
||||
result.u64[1] = a.u64[1] >> imm8;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Load/store operations */
|
||||
static inline __m128i _mm_load_si128(const __m128i* p)
|
||||
{
|
||||
__m128i result;
|
||||
memcpy(&result, p, sizeof(__m128i));
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_loadu_si128(const __m128i* p)
|
||||
{
|
||||
__m128i result;
|
||||
memcpy(&result, p, sizeof(__m128i));
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline void _mm_store_si128(__m128i* p, __m128i a)
|
||||
{
|
||||
memcpy(p, &a, sizeof(__m128i));
|
||||
}
|
||||
|
||||
static inline void _mm_storeu_si128(__m128i* p, __m128i a)
|
||||
{
|
||||
memcpy(p, &a, sizeof(__m128i));
|
||||
}
|
||||
|
||||
/* Arithmetic operations */
|
||||
static inline __m128i _mm_add_epi64(__m128i a, __m128i b)
|
||||
{
|
||||
__m128i result;
|
||||
result.u64[0] = a.u64[0] + b.u64[0];
|
||||
result.u64[1] = a.u64[1] + b.u64[1];
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_add_epi32(__m128i a, __m128i b)
|
||||
{
|
||||
__m128i result;
|
||||
for (int i = 0; i < 4; i++) {
|
||||
result.i32[i] = a.i32[i] + b.i32[i];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_sub_epi64(__m128i a, __m128i b)
|
||||
{
|
||||
__m128i result;
|
||||
result.u64[0] = a.u64[0] - b.u64[0];
|
||||
result.u64[1] = a.u64[1] - b.u64[1];
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Widening multiply of the even 32-bit lanes (SSE2 pmuludq):
 * lane0 = a[0] * b[0], lane1 = a[2] * b[2], each a full 64-bit product. */
static inline __m128i _mm_mul_epu32(__m128i a, __m128i b)
{
    __m128i r;
    const uint64_t prod_lo = (uint64_t)a.u32[0] * (uint64_t)b.u32[0];
    const uint64_t prod_hi = (uint64_t)a.u32[2] * (uint64_t)b.u32[2];
    r.u64[0] = prod_lo;
    r.u64[1] = prod_hi;
    return r;
}
|
||||
|
||||
/* Unpack operations */
|
||||
/* Interleave the low 64-bit halves: result = { a.lo, b.lo }. */
static inline __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
{
    __m128i r;
    const uint64_t lo_a = a.u64[0];
    const uint64_t lo_b = b.u64[0];
    r.u64[0] = lo_a;
    r.u64[1] = lo_b;
    return r;
}
|
||||
|
||||
/* Interleave the high 64-bit halves: result = { a.hi, b.hi }. */
static inline __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
{
    __m128i r;
    const uint64_t hi_a = a.u64[1];
    const uint64_t hi_b = b.u64[1];
    r.u64[0] = hi_a;
    r.u64[1] = hi_b;
    return r;
}
|
||||
|
||||
/* Pause instruction for spin-wait loops */
|
||||
/* Spin-wait hint, modelled on the x86 PAUSE instruction. */
static inline void _mm_pause(void)
{
    /* RISC-V doesn't have a direct equivalent to x86 PAUSE.
     * Use a simple NOP or yield hint.
     * NOTE(review): on toolchains supporting the Zihintpause extension the
     * "pause" hint instruction would be a closer match - confirm support. */
    __asm__ __volatile__("nop");
}
|
||||
|
||||
/* Memory fence */
|
||||
/* Full memory barrier: a bare RISC-V "fence" assembles to fence iorw,iorw,
 * ordering all prior memory accesses before all later ones. The "memory"
 * clobber also stops the compiler from reordering across it. */
static inline void _mm_mfence(void)
{
    __asm__ __volatile__("fence" ::: "memory");
}
|
||||
|
||||
/* Load barrier: "fence r,r" orders earlier loads before later loads.
 * NOTE(review): x86 LFENCE additionally has instruction-serializing
 * semantics that this does not reproduce - confirm callers only rely on
 * load ordering. */
static inline void _mm_lfence(void)
{
    __asm__ __volatile__("fence r,r" ::: "memory");
}
|
||||
|
||||
/* Store barrier: "fence w,w" orders earlier stores before later stores,
 * mirroring x86 SFENCE for ordinary (non-non-temporal) stores. */
static inline void _mm_sfence(void)
{
    __asm__ __volatile__("fence w,w" ::: "memory");
}
|
||||
|
||||
/* Comparison operations */
|
||||
/* Per-lane 32-bit equality: all-ones mask where equal, zero otherwise.
 * 0u - 1u wraps to 0xFFFFFFFF, giving a branch-free mask. */
static inline __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
{
    __m128i r;
    for (int lane = 0; lane < 4; ++lane) {
        r.u32[lane] = (uint32_t)0 - (uint32_t)(a.u32[lane] == b.u32[lane]);
    }
    return r;
}
|
||||
|
||||
/* Per-lane 64-bit equality: all-ones mask where equal, zero otherwise. */
static inline __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
{
    __m128i r;
    for (int lane = 0; lane < 2; ++lane) {
        r.u64[lane] = (uint64_t)0 - (uint64_t)(a.u64[lane] == b.u64[lane]);
    }
    return r;
}
|
||||
|
||||
/* Additional shift operations */
|
||||
/* Shift each 32-bit lane of `a` left by `imm8` bits.
 * SSE2 semantics: counts outside [0, 31] yield zero. The unsigned cast
 * routes negative counts to the zeroing branch instead of performing an
 * undefined negative-count shift. */
static inline __m128i _mm_slli_epi32(__m128i a, int imm8)
{
    __m128i result;
    if ((unsigned int)imm8 > 31u) {
        for (int i = 0; i < 4; i++) result.u32[i] = 0;
    } else {
        for (int i = 0; i < 4; i++) {
            result.u32[i] = a.u32[i] << imm8;
        }
    }
    return result;
}
|
||||
|
||||
/* Logical right shift of each 32-bit lane of `a` by `imm8` bits.
 * SSE2 semantics: counts outside [0, 31] yield zero. The unsigned cast
 * makes negative counts take the zeroing path, avoiding undefined
 * behaviour from a negative shift amount. */
static inline __m128i _mm_srli_epi32(__m128i a, int imm8)
{
    __m128i result;
    if ((unsigned int)imm8 > 31u) {
        for (int i = 0; i < 4; i++) result.u32[i] = 0;
    } else {
        for (int i = 0; i < 4; i++) {
            result.u32[i] = a.u32[i] >> imm8;
        }
    }
    return result;
}
|
||||
|
||||
/* 64-bit integer operations */
|
||||
static inline __m128i _mm_set1_epi64x(int64_t a)
|
||||
{
|
||||
__m128i result;
|
||||
result.i64[0] = a;
|
||||
result.i64[1] = a;
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Float type for compatibility - we'll treat it as int for simplicity */
|
||||
typedef __m128i __m128;
|
||||
|
||||
/* Float operations - simplified scalar implementations */
|
||||
static inline __m128 _mm_set1_ps(float a)
|
||||
{
|
||||
__m128 result;
|
||||
uint32_t val;
|
||||
memcpy(&val, &a, sizeof(float));
|
||||
for (int i = 0; i < 4; i++) {
|
||||
result.u32[i] = val;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128 _mm_setzero_ps(void)
|
||||
{
|
||||
__m128 result;
|
||||
memset(&result, 0, sizeof(result));
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Lane-wise single-precision addition. The floats are shuttled through
 * plain arrays via memcpy to sidestep union/aliasing concerns. */
static inline __m128 _mm_add_ps(__m128 a, __m128 b)
{
    __m128 r;
    float lhs[4];
    float rhs[4];
    float sum[4];
    memcpy(lhs, &a, sizeof lhs);
    memcpy(rhs, &b, sizeof rhs);
    for (int lane = 0; lane < 4; ++lane) {
        sum[lane] = lhs[lane] + rhs[lane];
    }
    memcpy(&r, sum, sizeof sum);
    return r;
}
|
||||
|
||||
/* Lane-wise single-precision multiplication. */
static inline __m128 _mm_mul_ps(__m128 a, __m128 b)
{
    __m128 r;
    float lhs[4];
    float rhs[4];
    float prod[4];
    memcpy(lhs, &a, sizeof lhs);
    memcpy(rhs, &b, sizeof rhs);
    for (int lane = 0; lane < 4; ++lane) {
        prod[lane] = lhs[lane] * rhs[lane];
    }
    memcpy(&r, prod, sizeof prod);
    return r;
}
|
||||
|
||||
/* Bitwise AND over the full 128 bits (the float lanes are treated purely
 * as bit patterns, matching x86 andps). */
static inline __m128 _mm_and_ps(__m128 a, __m128 b)
{
    __m128 r;
    for (int lane = 0; lane < 2; ++lane) {
        r.u64[lane] = a.u64[lane] & b.u64[lane];
    }
    return r;
}
|
||||
|
||||
/* Bitwise OR over the full 128 bits (bit-pattern semantics, as orps). */
static inline __m128 _mm_or_ps(__m128 a, __m128 b)
{
    __m128 r;
    for (int lane = 0; lane < 2; ++lane) {
        r.u64[lane] = a.u64[lane] | b.u64[lane];
    }
    return r;
}
|
||||
|
||||
/* Convert each signed 32-bit lane to single precision (as cvtdq2ps). */
static inline __m128 _mm_cvtepi32_ps(__m128i a)
{
    __m128 r;
    float converted[4];
    for (int lane = 0; lane < 4; ++lane) {
        converted[lane] = (float)a.i32[lane];
    }
    memcpy(&r, converted, sizeof converted);
    return r;
}
|
||||
|
||||
/* Truncating float -> int32 conversion per lane.
 * A plain C cast is undefined for NaN and out-of-range values; x86
 * cvttps2dq returns the "integer indefinite" value 0x80000000 in those
 * cases, so do the same explicitly. (f != f detects NaN; 2^31 is exactly
 * representable as a float, so the range checks are exact.) */
static inline __m128i _mm_cvttps_epi32(__m128 a)
{
    __m128i result;
    float fa[4];
    memcpy(fa, &a, sizeof(__m128));
    for (int i = 0; i < 4; i++) {
        const float f = fa[i];
        if (f >= 2147483648.0f || f < -2147483648.0f || f != f) {
            result.i32[i] = INT32_MIN;  /* x86 "integer indefinite" */
        } else {
            result.i32[i] = (int32_t)f;
        }
    }
    return result;
}
|
||||
|
||||
/* Casting operations */
|
||||
/* Bit-for-bit reinterpretation of an integer vector as a float vector. */
static inline __m128 _mm_castsi128_ps(__m128i a)
{
    __m128 r;
    memcpy(&r, &a, sizeof r);
    return r;
}
|
||||
|
||||
/* Bit-for-bit reinterpretation of a float vector as an integer vector. */
static inline __m128i _mm_castps_si128(__m128 a)
{
    __m128i r;
    memcpy(&r, &a, sizeof r);
    return r;
}
|
||||
|
||||
/* Additional set operations */
|
||||
static inline __m128i _mm_set1_epi32(int a)
|
||||
{
|
||||
__m128i result;
|
||||
for (int i = 0; i < 4; i++) {
|
||||
result.i32[i] = a;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/* AES instructions - these are placeholders, actual AES is done via soft_aes.h */
|
||||
/* On RISC-V without crypto extensions, these should never be called directly */
|
||||
/* They are only here for compilation compatibility */
|
||||
/* Placeholder for the AESENC instruction; NOT a real AES round.
 * On RISC-V without crypto extensions this must never be reached in hot
 * paths - callers are expected to take the SOFT_AES code path instead. */
static inline __m128i _mm_aesenc_si128(__m128i a, __m128i roundkey)
{
    /* This is a placeholder - actual implementation should use soft_aes */
    /* If this function is called, it means SOFT_AES template parameter wasn't used */
    /* We return a XOR as a minimal fallback, but proper code should use soft_aesenc */
    return _mm_xor_si128(a, roundkey);
}
|
||||
|
||||
/* Placeholder for AESKEYGENASSIST: returns `a` unchanged and ignores
 * `rcon` entirely. Real key expansion must go through
 * soft_aeskeygenassist; this exists only so the header compiles. */
static inline __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
{
    /* Placeholder for AES key generation - should use soft_aeskeygenassist */
    return a;
}
|
||||
|
||||
/* Rotate right operation for soft_aes.h */
|
||||
/* Rotate `value` right by `count` bits; the count is taken modulo 32,
 * so a count of 0 or 32 returns the value unchanged. */
static inline uint32_t _rotr(uint32_t value, unsigned int count)
{
    count %= 32u;
    if (count == 0) {
        return value;
    }
    return (value >> count) | (value << (32u - count));
}
|
||||
|
||||
/* ARM NEON compatibility types and intrinsics for RISC-V */
|
||||
typedef __m128i_union uint64x2_t;
|
||||
typedef __m128i_union uint8x16_t;
|
||||
typedef __m128i_union int64x2_t;
|
||||
typedef __m128i_union int32x4_t;
|
||||
|
||||
static inline uint64x2_t vld1q_u64(const uint64_t *ptr)
|
||||
{
|
||||
uint64x2_t result;
|
||||
result.u64[0] = ptr[0];
|
||||
result.u64[1] = ptr[1];
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline int64x2_t vld1q_s64(const int64_t *ptr)
|
||||
{
|
||||
int64x2_t result;
|
||||
result.i64[0] = ptr[0];
|
||||
result.i64[1] = ptr[1];
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Store both 64-bit lanes to consecutive memory words. */
static inline void vst1q_u64(uint64_t *ptr, uint64x2_t val)
{
    for (int lane = 0; lane < 2; ++lane) {
        ptr[lane] = val.u64[lane];
    }
}
|
||||
|
||||
/* Lane-wise 64-bit exclusive OR. */
static inline uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b)
{
    uint64x2_t r;
    for (int lane = 0; lane < 2; ++lane) {
        r.u64[lane] = a.u64[lane] ^ b.u64[lane];
    }
    return r;
}
|
||||
|
||||
/* Lane-wise 64-bit addition with wrap-around. */
static inline uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b)
{
    uint64x2_t r;
    for (int lane = 0; lane < 2; ++lane) {
        r.u64[lane] = a.u64[lane] + b.u64[lane];
    }
    return r;
}
|
||||
|
||||
/* Bit-for-bit reinterpretation of 16 bytes as two 64-bit lanes. */
static inline uint64x2_t vreinterpretq_u64_u8(uint8x16_t a)
{
    uint64x2_t r;
    memcpy(&r, &a, sizeof r);
    return r;
}
|
||||
|
||||
/* Extract unsigned 64-bit lane `lane` (valid values: 0 or 1).
 * No bounds checking, mirroring the NEON intrinsic's contract. */
static inline uint64_t vgetq_lane_u64(uint64x2_t v, int lane)
{
    return v.u64[lane];
}
|
||||
|
||||
/* Extract signed 64-bit lane `lane` (valid values: 0 or 1); no bounds check. */
static inline int64_t vgetq_lane_s64(int64x2_t v, int lane)
{
    return v.i64[lane];
}
|
||||
|
||||
/* Extract signed 32-bit lane `lane` (valid values: 0..3); no bounds check. */
static inline int32_t vgetq_lane_s32(int32x4_t v, int lane)
{
    return v.i32[lane];
}
|
||||
|
||||
typedef struct { uint64_t val[1]; } uint64x1_t;
|
||||
|
||||
static inline uint64x1_t vcreate_u64(uint64_t a)
|
||||
{
|
||||
uint64x1_t result;
|
||||
result.val[0] = a;
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Concatenate two one-lane vectors: lane 0 = low, lane 1 = high. */
static inline uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high)
{
    uint64x2_t r;
    const uint64_t lo = low.val[0];
    const uint64_t hi = high.val[0];
    r.u64[0] = lo;
    r.u64[1] = hi;
    return r;
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* XMRIG_SSE2RVV_H */
|
||||
@@ -1,6 +1,6 @@
|
||||
/* XMRig
|
||||
* Copyright (c) 2018-2021 SChernykh <https://github.com/SChernykh>
|
||||
* Copyright (c) 2016-2021 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
||||
* Copyright (c) 2018-2025 SChernykh <https://github.com/SChernykh>
|
||||
* Copyright (c) 2016-2025 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
@@ -35,15 +35,69 @@ constexpr size_t twoMiB = 2U * 1024U * 1024U;
|
||||
constexpr size_t oneGiB = 1024U * 1024U * 1024U;
|
||||
|
||||
|
||||
static inline std::string sysfs_path(uint32_t node, size_t hugePageSize, bool nr)
|
||||
static bool sysfs_write(const std::string &path, uint64_t value)
|
||||
{
|
||||
std::ofstream file(path, std::ios::out | std::ios::binary | std::ios::trunc);
|
||||
if (!file.is_open()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
file << value;
|
||||
file.flush();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
static int64_t sysfs_read(const std::string &path)
|
||||
{
|
||||
std::ifstream file(path);
|
||||
if (!file.is_open()) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
uint64_t value = 0;
|
||||
file >> value;
|
||||
|
||||
return value;
|
||||
}
|
||||
|
||||
|
||||
static std::string sysfs_path(uint32_t node, size_t hugePageSize, bool nr)
|
||||
{
|
||||
return fmt::format("/sys/devices/system/node/node{}/hugepages/hugepages-{}kB/{}_hugepages", node, hugePageSize / 1024, nr ? "nr" : "free");
|
||||
}
|
||||
|
||||
|
||||
static inline bool write_nr_hugepages(uint32_t node, size_t hugePageSize, uint64_t count) { return LinuxMemory::write(sysfs_path(node, hugePageSize, true).c_str(), count); }
|
||||
static inline int64_t free_hugepages(uint32_t node, size_t hugePageSize) { return LinuxMemory::read(sysfs_path(node, hugePageSize, false).c_str()); }
|
||||
static inline int64_t nr_hugepages(uint32_t node, size_t hugePageSize) { return LinuxMemory::read(sysfs_path(node, hugePageSize, true).c_str()); }
|
||||
static std::string sysfs_path(size_t hugePageSize, bool nr)
|
||||
{
|
||||
return fmt::format("/sys/kernel/mm/hugepages/hugepages-{}kB/{}_hugepages", hugePageSize / 1024, nr ? "nr" : "free");
|
||||
}
|
||||
|
||||
|
||||
static bool write_nr_hugepages(uint32_t node, size_t hugePageSize, uint64_t count)
|
||||
{
|
||||
if (sysfs_write(sysfs_path(node, hugePageSize, true), count)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return sysfs_write(sysfs_path(hugePageSize, true), count);
|
||||
}
|
||||
|
||||
|
||||
static int64_t sysfs_read_hugepages(uint32_t node, size_t hugePageSize, bool nr)
|
||||
{
|
||||
const int64_t value = sysfs_read(sysfs_path(node, hugePageSize, nr));
|
||||
if (value >= 0) {
|
||||
return value;
|
||||
}
|
||||
|
||||
return sysfs_read(sysfs_path(hugePageSize, nr));
|
||||
}
|
||||
|
||||
|
||||
static inline int64_t free_hugepages(uint32_t node, size_t hugePageSize) { return sysfs_read_hugepages(node, hugePageSize, false); }
|
||||
static inline int64_t nr_hugepages(uint32_t node, size_t hugePageSize) { return sysfs_read_hugepages(node, hugePageSize, true); }
|
||||
|
||||
|
||||
} // namespace xmrig
|
||||
@@ -62,31 +116,3 @@ bool xmrig::LinuxMemory::reserve(size_t size, uint32_t node, size_t hugePageSize
|
||||
|
||||
return write_nr_hugepages(node, hugePageSize, std::max<size_t>(nr_hugepages(node, hugePageSize), 0) + (required - available));
|
||||
}
|
||||
|
||||
|
||||
bool xmrig::LinuxMemory::write(const char *path, uint64_t value)
|
||||
{
|
||||
std::ofstream file(path, std::ios::out | std::ios::binary | std::ios::trunc);
|
||||
if (!file.is_open()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
file << value;
|
||||
file.flush();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
int64_t xmrig::LinuxMemory::read(const char *path)
|
||||
{
|
||||
std::ifstream file(path);
|
||||
if (!file.is_open()) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
uint64_t value = 0;
|
||||
file >> value;
|
||||
|
||||
return value;
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/* XMRig
|
||||
* Copyright (c) 2018-2021 SChernykh <https://github.com/SChernykh>
|
||||
* Copyright (c) 2016-2021 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
||||
* Copyright (c) 2018-2025 SChernykh <https://github.com/SChernykh>
|
||||
* Copyright (c) 2016-2025 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
@@ -31,13 +31,10 @@ class LinuxMemory
|
||||
{
|
||||
public:
|
||||
static bool reserve(size_t size, uint32_t node, size_t hugePageSize);
|
||||
|
||||
static bool write(const char *path, uint64_t value);
|
||||
static int64_t read(const char *path);
|
||||
};
|
||||
|
||||
|
||||
} /* namespace xmrig */
|
||||
} // namespace xmrig
|
||||
|
||||
|
||||
#endif /* XMRIG_LINUXMEMORY_H */
|
||||
#endif // XMRIG_LINUXMEMORY_H
|
||||
|
||||
@@ -49,7 +49,7 @@ xmrig::MemoryPool::MemoryPool(size_t size, bool hugePages, uint32_t node)
|
||||
|
||||
constexpr size_t alignment = 1 << 24;
|
||||
|
||||
m_memory = new VirtualMemory(size * pageSize + alignment, hugePages, false, false, node);
|
||||
m_memory = new VirtualMemory(size * pageSize + alignment, hugePages, false, false, node, VirtualMemory::kDefaultHugePageSize);
|
||||
|
||||
m_alignOffset = (alignment - (((size_t)m_memory->scratchpad()) % alignment)) % alignment;
|
||||
}
|
||||
|
||||
@@ -75,6 +75,16 @@ xmrig::VirtualMemory::VirtualMemory(size_t size, bool hugePages, bool oneGbPages
|
||||
}
|
||||
|
||||
m_scratchpad = static_cast<uint8_t*>(_mm_malloc(m_size, alignSize));
|
||||
|
||||
// Huge pages failed to allocate, but try to enable transparent huge pages for the range
|
||||
if (alignSize >= kDefaultHugePageSize) {
|
||||
if (m_scratchpad) {
|
||||
adviseLargePages(m_scratchpad, m_size);
|
||||
}
|
||||
else {
|
||||
m_scratchpad = static_cast<uint8_t*>(_mm_malloc(m_size, 64));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -65,6 +65,7 @@ public:
|
||||
static void *allocateExecutableMemory(size_t size, bool hugePages);
|
||||
static void *allocateLargePagesMemory(size_t size);
|
||||
static void *allocateOneGbPagesMemory(size_t size);
|
||||
static bool adviseLargePages(void *p, size_t size);
|
||||
static void destroy();
|
||||
static void flushInstructionCache(void *p, size_t size);
|
||||
static void freeLargePagesMemory(void *p, size_t size);
|
||||
|
||||
@@ -86,7 +86,7 @@ bool xmrig::VirtualMemory::isHugepagesAvailable()
|
||||
{
|
||||
# ifdef XMRIG_OS_LINUX
|
||||
return std::ifstream("/proc/sys/vm/nr_hugepages").good() || std::ifstream("/sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages").good();
|
||||
# elif defined(XMRIG_OS_MACOS) && defined(XMRIG_ARM)
|
||||
# elif defined(XMRIG_OS_MACOS) && defined(XMRIG_ARM) || defined(XMRIG_OS_HAIKU)
|
||||
return false;
|
||||
# else
|
||||
return true;
|
||||
@@ -156,7 +156,8 @@ void *xmrig::VirtualMemory::allocateExecutableMemory(size_t size, bool hugePages
|
||||
if (!mem) {
|
||||
mem = mmap(0, size, PROT_READ | PROT_WRITE | SECURE_PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
|
||||
}
|
||||
|
||||
# elif defined(XMRIG_OS_HAIKU)
|
||||
void *mem = mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
|
||||
# else
|
||||
|
||||
void *mem = nullptr;
|
||||
@@ -181,6 +182,8 @@ void *xmrig::VirtualMemory::allocateLargePagesMemory(size_t size)
|
||||
void *mem = mmap(0, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, VM_FLAGS_SUPERPAGE_SIZE_2MB, 0);
|
||||
# elif defined(XMRIG_OS_FREEBSD)
|
||||
void *mem = mmap(0, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_ALIGNED_SUPER | MAP_PREFAULT_READ, -1, 0);
|
||||
# elif defined(XMRIG_OS_HAIKU)
|
||||
void *mem = nullptr;
|
||||
# else
|
||||
void *mem = mmap(0, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE | hugePagesFlag(hugePageSize()), 0, 0);
|
||||
# endif
|
||||
@@ -273,6 +276,16 @@ bool xmrig::VirtualMemory::allocateOneGbPagesMemory()
|
||||
}
|
||||
|
||||
|
||||
bool xmrig::VirtualMemory::adviseLargePages(void *p, size_t size)
|
||||
{
|
||||
# ifdef XMRIG_OS_LINUX
|
||||
return (madvise(p, size, MADV_HUGEPAGE) == 0);
|
||||
# else
|
||||
return false;
|
||||
# endif
|
||||
}
|
||||
|
||||
|
||||
void xmrig::VirtualMemory::freeLargePagesMemory()
|
||||
{
|
||||
if (m_flags.test(FLAG_LOCK)) {
|
||||
|
||||
@@ -260,6 +260,12 @@ bool xmrig::VirtualMemory::allocateOneGbPagesMemory()
|
||||
}
|
||||
|
||||
|
||||
bool xmrig::VirtualMemory::adviseLargePages(void *p, size_t size)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
void xmrig::VirtualMemory::freeLargePagesMemory()
|
||||
{
|
||||
freeLargePagesMemory(m_scratchpad, m_size);
|
||||
|
||||
@@ -26,7 +26,7 @@
|
||||
#define XMRIG_MM_MALLOC_PORTABLE_H
|
||||
|
||||
|
||||
#if defined(XMRIG_ARM) && !defined(__clang__)
|
||||
#if (defined(XMRIG_ARM) || defined(XMRIG_RISCV)) && !defined(__clang__)
|
||||
#include <stdlib.h>
|
||||
|
||||
|
||||
|
||||
@@ -57,6 +57,9 @@
|
||||
|
||||
#if defined(XMRIG_ARM)
|
||||
# include "crypto/cn/sse2neon.h"
|
||||
#elif defined(XMRIG_RISCV)
|
||||
// RISC-V doesn't have SSE/NEON, provide minimal compatibility
|
||||
# define _mm_pause() __asm__ __volatile__("nop")
|
||||
#elif defined(__GNUC__)
|
||||
# include <x86intrin.h>
|
||||
#else
|
||||
@@ -286,7 +289,7 @@ struct HelperThread
|
||||
|
||||
void benchmark()
|
||||
{
|
||||
#ifndef XMRIG_ARM
|
||||
#if !defined(XMRIG_ARM) && !defined(XMRIG_RISCV)
|
||||
static std::atomic<int> done{ 0 };
|
||||
if (done.exchange(1)) {
|
||||
return;
|
||||
@@ -478,7 +481,7 @@ static inline bool findByType(hwloc_obj_t obj, hwloc_obj_type_t type, func lambd
|
||||
|
||||
HelperThread* create_helper_thread(int64_t cpu_index, int priority, const std::vector<int64_t>& affinities)
|
||||
{
|
||||
#ifndef XMRIG_ARM
|
||||
#if !defined(XMRIG_ARM) && !defined(XMRIG_RISCV)
|
||||
hwloc_bitmap_t helper_cpu_set = hwloc_bitmap_alloc();
|
||||
hwloc_bitmap_t main_threads_set = hwloc_bitmap_alloc();
|
||||
|
||||
@@ -807,7 +810,7 @@ void hash_octa(const uint8_t* data, size_t size, uint8_t* output, cryptonight_ct
|
||||
uint32_t cn_indices[6];
|
||||
select_indices(cn_indices, seed);
|
||||
|
||||
#ifdef XMRIG_ARM
|
||||
#if defined(XMRIG_ARM) || defined(XMRIG_RISCV)
|
||||
uint32_t step[6] = { 1, 1, 1, 1, 1, 1 };
|
||||
#else
|
||||
uint32_t step[6] = { 4, 4, 1, 2, 4, 4 };
|
||||
|
||||
@@ -38,6 +38,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#include "crypto/randomx/common.hpp"
|
||||
#include "crypto/rx/Profiler.h"
|
||||
|
||||
#include "backend/cpu/Cpu.h"
|
||||
|
||||
#ifdef XMRIG_RISCV
|
||||
#include "crypto/randomx/aes_hash_rv64_vector.hpp"
|
||||
#include "crypto/randomx/aes_hash_rv64_zvkned.hpp"
|
||||
#endif
|
||||
|
||||
#define AES_HASH_1R_STATE0 0xd7983aad, 0xcc82db47, 0x9fa856de, 0x92b52c0d
|
||||
#define AES_HASH_1R_STATE1 0xace78057, 0xf59e125a, 0x15c7b798, 0x338d996e
|
||||
#define AES_HASH_1R_STATE2 0xe8a07ce4, 0x5079506b, 0xae62c7d0, 0x6a770017
|
||||
@@ -59,14 +66,27 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
Hashing throughput: >20 GiB/s per CPU core with hardware AES
|
||||
*/
|
||||
template<int softAes>
|
||||
void hashAes1Rx4(const void *input, size_t inputSize, void *hash) {
|
||||
void hashAes1Rx4(const void *input, size_t inputSize, void *hash)
|
||||
{
|
||||
#ifdef XMRIG_RISCV
|
||||
if (xmrig::Cpu::info()->hasAES()) {
|
||||
hashAes1Rx4_zvkned(input, inputSize, hash);
|
||||
return;
|
||||
}
|
||||
|
||||
if (xmrig::Cpu::info()->hasRISCV_Vector()) {
|
||||
hashAes1Rx4_RVV<softAes>(input, inputSize, hash);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
const uint8_t* inptr = (uint8_t*)input;
|
||||
const uint8_t* inputEnd = inptr + inputSize;
|
||||
|
||||
rx_vec_i128 state0, state1, state2, state3;
|
||||
rx_vec_i128 in0, in1, in2, in3;
|
||||
|
||||
//intial state
|
||||
//initial state
|
||||
state0 = rx_set_int_vec_i128(AES_HASH_1R_STATE0);
|
||||
state1 = rx_set_int_vec_i128(AES_HASH_1R_STATE1);
|
||||
state2 = rx_set_int_vec_i128(AES_HASH_1R_STATE2);
|
||||
@@ -127,7 +147,20 @@ template void hashAes1Rx4<true>(const void *input, size_t inputSize, void *hash)
|
||||
calls to this function.
|
||||
*/
|
||||
template<int softAes>
|
||||
void fillAes1Rx4(void *state, size_t outputSize, void *buffer) {
|
||||
void fillAes1Rx4(void *state, size_t outputSize, void *buffer)
|
||||
{
|
||||
#ifdef XMRIG_RISCV
|
||||
if (xmrig::Cpu::info()->hasAES()) {
|
||||
fillAes1Rx4_zvkned(state, outputSize, buffer);
|
||||
return;
|
||||
}
|
||||
|
||||
if (xmrig::Cpu::info()->hasRISCV_Vector()) {
|
||||
fillAes1Rx4_RVV<softAes>(state, outputSize, buffer);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
const uint8_t* outptr = (uint8_t*)buffer;
|
||||
const uint8_t* outputEnd = outptr + outputSize;
|
||||
|
||||
@@ -171,7 +204,20 @@ static constexpr randomx::Instruction inst{ 0xFF, 7, 7, 0xFF, 0xFFFFFFFFU };
|
||||
alignas(16) static const randomx::Instruction inst_mask[2] = { inst, inst };
|
||||
|
||||
template<int softAes>
|
||||
void fillAes4Rx4(void *state, size_t outputSize, void *buffer) {
|
||||
void fillAes4Rx4(void *state, size_t outputSize, void *buffer)
|
||||
{
|
||||
#ifdef XMRIG_RISCV
|
||||
if (xmrig::Cpu::info()->hasAES()) {
|
||||
fillAes4Rx4_zvkned(state, outputSize, buffer);
|
||||
return;
|
||||
}
|
||||
|
||||
if (xmrig::Cpu::info()->hasRISCV_Vector()) {
|
||||
fillAes4Rx4_RVV<softAes>(state, outputSize, buffer);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
const uint8_t* outptr = (uint8_t*)buffer;
|
||||
const uint8_t* outputEnd = outptr + outputSize;
|
||||
|
||||
@@ -235,10 +281,34 @@ void fillAes4Rx4(void *state, size_t outputSize, void *buffer) {
|
||||
template void fillAes4Rx4<true>(void *state, size_t outputSize, void *buffer);
|
||||
template void fillAes4Rx4<false>(void *state, size_t outputSize, void *buffer);
|
||||
|
||||
#ifdef XMRIG_VAES
|
||||
void hashAndFillAes1Rx4_VAES512(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state);
|
||||
#endif
|
||||
|
||||
template<int softAes, int unroll>
|
||||
void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state) {
|
||||
void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state)
|
||||
{
|
||||
PROFILE_SCOPE(RandomX_AES);
|
||||
|
||||
#ifdef XMRIG_RISCV
|
||||
if (xmrig::Cpu::info()->hasAES()) {
|
||||
hashAndFillAes1Rx4_zvkned(scratchpad, scratchpadSize, hash, fill_state);
|
||||
return;
|
||||
}
|
||||
|
||||
if (xmrig::Cpu::info()->hasRISCV_Vector()) {
|
||||
hashAndFillAes1Rx4_RVV<softAes, unroll>(scratchpad, scratchpadSize, hash, fill_state);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef XMRIG_VAES
|
||||
if (xmrig::Cpu::info()->arch() == xmrig::ICpuInfo::ARCH_ZEN5) {
|
||||
hashAndFillAes1Rx4_VAES512(scratchpad, scratchpadSize, hash, fill_state);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
uint8_t* scratchpadPtr = (uint8_t*)scratchpad;
|
||||
const uint8_t* scratchpadEnd = scratchpadPtr + scratchpadSize;
|
||||
|
||||
@@ -386,43 +456,54 @@ hashAndFillAes1Rx4_impl* softAESImpl = &hashAndFillAes1Rx4<1,1>;
|
||||
|
||||
void SelectSoftAESImpl(size_t threadsCount)
|
||||
{
|
||||
constexpr uint64_t test_length_ms = 100;
|
||||
const std::array<hashAndFillAes1Rx4_impl *, 4> impl = {
|
||||
&hashAndFillAes1Rx4<1,1>,
|
||||
&hashAndFillAes1Rx4<2,1>,
|
||||
&hashAndFillAes1Rx4<2,2>,
|
||||
&hashAndFillAes1Rx4<2,4>,
|
||||
};
|
||||
size_t fast_idx = 0;
|
||||
double fast_speed = 0.0;
|
||||
for (size_t run = 0; run < 3; ++run) {
|
||||
for (size_t i = 0; i < impl.size(); ++i) {
|
||||
const double t1 = xmrig::Chrono::highResolutionMSecs();
|
||||
std::vector<uint32_t> count(threadsCount, 0);
|
||||
std::vector<std::thread> threads;
|
||||
for (size_t t = 0; t < threadsCount; ++t) {
|
||||
threads.emplace_back([&, t]() {
|
||||
std::vector<uint8_t> scratchpad(10 * 1024);
|
||||
alignas(16) uint8_t hash[64] = {};
|
||||
alignas(16) uint8_t state[64] = {};
|
||||
do {
|
||||
(*impl[i])(scratchpad.data(), scratchpad.size(), hash, state);
|
||||
++count[t];
|
||||
} while (xmrig::Chrono::highResolutionMSecs() - t1 < test_length_ms);
|
||||
});
|
||||
}
|
||||
uint32_t total = 0;
|
||||
for (size_t t = 0; t < threadsCount; ++t) {
|
||||
threads[t].join();
|
||||
total += count[t];
|
||||
}
|
||||
const double t2 = xmrig::Chrono::highResolutionMSecs();
|
||||
const double speed = total * 1e3 / (t2 - t1);
|
||||
if (speed > fast_speed) {
|
||||
fast_idx = i;
|
||||
fast_speed = speed;
|
||||
}
|
||||
}
|
||||
}
|
||||
softAESImpl = impl[fast_idx];
|
||||
constexpr uint64_t test_length_ms = 100;
|
||||
|
||||
const std::array<hashAndFillAes1Rx4_impl *, 4> impl = {
|
||||
&hashAndFillAes1Rx4<1,1>,
|
||||
&hashAndFillAes1Rx4<2,1>,
|
||||
&hashAndFillAes1Rx4<2,2>,
|
||||
&hashAndFillAes1Rx4<2,4>,
|
||||
};
|
||||
|
||||
size_t fast_idx = 0;
|
||||
double fast_speed = 0.0;
|
||||
|
||||
for (size_t run = 0; run < 3; ++run) {
|
||||
for (size_t i = 0; i < impl.size(); ++i) {
|
||||
const double t1 = xmrig::Chrono::highResolutionMSecs();
|
||||
|
||||
std::vector<uint32_t> count(threadsCount, 0);
|
||||
std::vector<std::thread> threads;
|
||||
|
||||
for (size_t t = 0; t < threadsCount; ++t) {
|
||||
threads.emplace_back([&, t]() {
|
||||
std::vector<uint8_t> scratchpad(10 * 1024);
|
||||
|
||||
alignas(16) uint8_t hash[64] = {};
|
||||
alignas(16) uint8_t state[64] = {};
|
||||
|
||||
do {
|
||||
(*impl[i])(scratchpad.data(), scratchpad.size(), hash, state);
|
||||
++count[t];
|
||||
} while (xmrig::Chrono::highResolutionMSecs() - t1 < test_length_ms);
|
||||
});
|
||||
}
|
||||
|
||||
uint32_t total = 0;
|
||||
|
||||
for (size_t t = 0; t < threadsCount; ++t) {
|
||||
threads[t].join();
|
||||
total += count[t];
|
||||
}
|
||||
|
||||
const double t2 = xmrig::Chrono::highResolutionMSecs();
|
||||
const double speed = total * 1e3 / (t2 - t1);
|
||||
|
||||
if (speed > fast_speed) {
|
||||
fast_idx = i;
|
||||
fast_speed = speed;
|
||||
}
|
||||
}
|
||||
}
|
||||
softAESImpl = impl[fast_idx];
|
||||
}
|
||||
|
||||
322
src/crypto/randomx/aes_hash_rv64_vector.cpp
Normal file
322
src/crypto/randomx/aes_hash_rv64_vector.cpp
Normal file
@@ -0,0 +1,322 @@
|
||||
/*
|
||||
Copyright (c) 2025 SChernykh <https://github.com/SChernykh>
|
||||
Copyright (c) 2025 XMRig <support@xmrig.com>
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the copyright holder nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <riscv_vector.h>
|
||||
|
||||
#include "crypto/randomx/soft_aes.h"
|
||||
#include "crypto/randomx/randomx.h"
|
||||
|
||||
static FORCE_INLINE vuint32m1_t softaes_vector_double(
|
||||
vuint32m1_t in,
|
||||
vuint32m1_t key,
|
||||
vuint8m1_t i0, vuint8m1_t i1, vuint8m1_t i2, vuint8m1_t i3,
|
||||
const uint32_t* lut0, const uint32_t* lut1, const uint32_t *lut2, const uint32_t* lut3)
|
||||
{
|
||||
const vuint8m1_t in8 = __riscv_vreinterpret_v_u32m1_u8m1(in);
|
||||
|
||||
const vuint32m1_t index0 = __riscv_vreinterpret_v_u8m1_u32m1(__riscv_vrgather_vv_u8m1(in8, i0, 32));
|
||||
const vuint32m1_t index1 = __riscv_vreinterpret_v_u8m1_u32m1(__riscv_vrgather_vv_u8m1(in8, i1, 32));
|
||||
const vuint32m1_t index2 = __riscv_vreinterpret_v_u8m1_u32m1(__riscv_vrgather_vv_u8m1(in8, i2, 32));
|
||||
const vuint32m1_t index3 = __riscv_vreinterpret_v_u8m1_u32m1(__riscv_vrgather_vv_u8m1(in8, i3, 32));
|
||||
|
||||
vuint32m1_t s0 = __riscv_vluxei32_v_u32m1(lut0, __riscv_vsll_vx_u32m1(index0, 2, 8), 8);
|
||||
vuint32m1_t s1 = __riscv_vluxei32_v_u32m1(lut1, __riscv_vsll_vx_u32m1(index1, 2, 8), 8);
|
||||
vuint32m1_t s2 = __riscv_vluxei32_v_u32m1(lut2, __riscv_vsll_vx_u32m1(index2, 2, 8), 8);
|
||||
vuint32m1_t s3 = __riscv_vluxei32_v_u32m1(lut3, __riscv_vsll_vx_u32m1(index3, 2, 8), 8);
|
||||
|
||||
s0 = __riscv_vxor_vv_u32m1(s0, s1, 8);
|
||||
s2 = __riscv_vxor_vv_u32m1(s2, s3, 8);
|
||||
s0 = __riscv_vxor_vv_u32m1(s0, s2, 8);
|
||||
|
||||
return __riscv_vxor_vv_u32m1(s0, key, 8);
|
||||
}
|
||||
|
||||
// Constants for the RVV soft-AES implementation. Every intrinsic below runs
// with VL = 8 x uint32 (256 bits), so one vector register holds two 16-byte
// AES columns side by side: the "02" arrays interleave columns {0,2} and the
// "13" arrays interleave columns {1,3} of the 64-byte working state.

// Initial state of the AES hash (columns 0/2 and 1/3).
static constexpr uint32_t AES_HASH_1R_STATE02[8] = { 0x92b52c0d, 0x9fa856de, 0xcc82db47, 0xd7983aad, 0x6a770017, 0xae62c7d0, 0x5079506b, 0xe8a07ce4 };
static constexpr uint32_t AES_HASH_1R_STATE13[8] = { 0x338d996e, 0x15c7b798, 0xf59e125a, 0xace78057, 0x630a240c, 0x07ad828d, 0x79a10005, 0x7e994948 };

// Round keys used by fillAes1Rx4 (columns 0/2 and 1/3).
static constexpr uint32_t AES_GEN_1R_KEY02[8] = { 0x6daca553, 0x62716609, 0xdbb5552b, 0xb4f44917, 0x3f1262f1, 0x9f947ec6, 0xf4c0794f, 0x3e20e345 };
static constexpr uint32_t AES_GEN_1R_KEY13[8] = { 0x6d7caf07, 0x846a710d, 0x1725d378, 0x0da1dc4e, 0x6aef8135, 0xb1ba317c, 0x16314c88, 0x49169154 };

// Keys for the two finalization ("extra diffusion") rounds; the same four
// words are repeated so both columns in a vector get the identical round key.
static constexpr uint32_t AES_HASH_1R_XKEY00[8] = { 0xf6fa8389, 0x8b24949f, 0x90dc56bf, 0x06890201, 0xf6fa8389, 0x8b24949f, 0x90dc56bf, 0x06890201 };
static constexpr uint32_t AES_HASH_1R_XKEY11[8] = { 0x61b263d1, 0x51f4e03c, 0xee1043c6, 0xed18f99b, 0x61b263d1, 0x51f4e03c, 0xee1043c6, 0xed18f99b };

// Byte offsets for vluxei32/vsuxei32 indexed accesses: four consecutive
// 32-bit words at +0, then four more at +32 (X2) or +64 (X4) — i.e. two
// 16-byte columns spaced 32 or 64 bytes apart in memory.
static constexpr uint32_t AES_HASH_STRIDE_X2[8] = { 0, 4, 8, 12, 32, 36, 40, 44 };
static constexpr uint32_t AES_HASH_STRIDE_X4[8] = { 0, 4, 8, 12, 64, 68, 72, 76 };
// Calculates a 64-byte fingerprint of the input (RandomX AES hash) using the
// software-AES lookup tables with RVV gathers. Columns 0/2 of the state run
// AES encryption rounds, columns 1/3 run decryption rounds (note the separate
// enc/dec index and table sets passed to softaes_vector_double).
//
// Assumes inputSize is a multiple of 64 — TODO confirm with callers; the loop
// would otherwise read past the end of the input buffer.
//
// NOTE(review): the softAes template parameter is never referenced in the
// body — both instantiations share this single implementation.
template<int softAes>
void hashAes1Rx4_RVV(const void *input, size_t inputSize, void *hash) {
	const uint8_t* inptr = (const uint8_t*)input;
	const uint8_t* inputEnd = inptr + inputSize;

	//initial state (columns 0/2 and 1/3, two columns per vector)
	vuint32m1_t state02 = __riscv_vle32_v_u32m1(AES_HASH_1R_STATE02, 8);
	vuint32m1_t state13 = __riscv_vle32_v_u32m1(AES_HASH_1R_STATE13, 8);

	//byte offsets selecting columns {0,2} (or {1,3} with the +16B base below)
	const vuint32m1_t stride = __riscv_vle32_v_u32m1(AES_HASH_STRIDE_X2, 8);

	//byte-shuffle index vectors implementing ShiftRows for the soft-AES rounds
	const vuint8m1_t lutenc_index0 = __riscv_vle8_v_u8m1(lutEncIndex[0], 32);
	const vuint8m1_t lutenc_index1 = __riscv_vle8_v_u8m1(lutEncIndex[1], 32);
	const vuint8m1_t lutenc_index2 = __riscv_vle8_v_u8m1(lutEncIndex[2], 32);
	const vuint8m1_t lutenc_index3 = __riscv_vle8_v_u8m1(lutEncIndex[3], 32);

	//decryption index sets 0 and 2 are shared with the encryption ones
	const vuint8m1_t& lutdec_index0 = lutenc_index0;
	const vuint8m1_t lutdec_index1 = __riscv_vle8_v_u8m1(lutDecIndex[1], 32);
	const vuint8m1_t& lutdec_index2 = lutenc_index2;
	const vuint8m1_t lutdec_index3 = __riscv_vle8_v_u8m1(lutDecIndex[3], 32);

	//process 64 bytes at a time in 4 lanes
	while (inptr < inputEnd) {
		state02 = softaes_vector_double(state02, __riscv_vluxei32_v_u32m1((uint32_t*)inptr + 0, stride, 8), lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3);
		state13 = softaes_vector_double(state13, __riscv_vluxei32_v_u32m1((uint32_t*)inptr + 4, stride, 8), lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3);

		inptr += 64;
	}

	//two extra rounds to achieve full diffusion
	const vuint32m1_t xkey00 = __riscv_vle32_v_u32m1(AES_HASH_1R_XKEY00, 8);
	const vuint32m1_t xkey11 = __riscv_vle32_v_u32m1(AES_HASH_1R_XKEY11, 8);

	state02 = softaes_vector_double(state02, xkey00, lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3);
	state13 = softaes_vector_double(state13, xkey00, lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3);

	state02 = softaes_vector_double(state02, xkey11, lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3);
	state13 = softaes_vector_double(state13, xkey11, lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3);

	//output hash (64 bytes, columns scattered back with the same stride)
	__riscv_vsuxei32_v_u32m1((uint32_t*)hash + 0, stride, state02, 8);
	__riscv_vsuxei32_v_u32m1((uint32_t*)hash + 4, stride, state13, 8);
}

template void hashAes1Rx4_RVV<false>(const void *input, size_t inputSize, void *hash);
template void hashAes1Rx4_RVV<true>(const void *input, size_t inputSize, void *hash);
// Fills 'buffer' with outputSize bytes of AES-generated data, starting from
// the 64-byte 'state' and writing the evolved state back at the end (so calls
// can be chained). One soft-AES round per 64-byte block: columns 0/2 use
// decryption rounds with KEY02, columns 1/3 use encryption rounds with KEY13.
//
// Assumes outputSize is a multiple of 64 — TODO confirm with callers.
//
// NOTE(review): the softAes template parameter is never referenced in the
// body — both instantiations share this single implementation.
template<int softAes>
void fillAes1Rx4_RVV(void *state, size_t outputSize, void *buffer) {
	const uint8_t* outptr = (uint8_t*)buffer;
	const uint8_t* outputEnd = outptr + outputSize;

	const vuint32m1_t key02 = __riscv_vle32_v_u32m1(AES_GEN_1R_KEY02, 8);
	const vuint32m1_t key13 = __riscv_vle32_v_u32m1(AES_GEN_1R_KEY13, 8);

	//byte offsets selecting columns {0,2} (or {1,3} with the +16B base)
	const vuint32m1_t stride = __riscv_vle32_v_u32m1(AES_HASH_STRIDE_X2, 8);

	//load the caller-provided 64-byte state
	vuint32m1_t state02 = __riscv_vluxei32_v_u32m1((uint32_t*)state + 0, stride, 8);
	vuint32m1_t state13 = __riscv_vluxei32_v_u32m1((uint32_t*)state + 4, stride, 8);

	//byte-shuffle index vectors for the soft-AES rounds
	const vuint8m1_t lutenc_index0 = __riscv_vle8_v_u8m1(lutEncIndex[0], 32);
	const vuint8m1_t lutenc_index1 = __riscv_vle8_v_u8m1(lutEncIndex[1], 32);
	const vuint8m1_t lutenc_index2 = __riscv_vle8_v_u8m1(lutEncIndex[2], 32);
	const vuint8m1_t lutenc_index3 = __riscv_vle8_v_u8m1(lutEncIndex[3], 32);

	//decryption index sets 0 and 2 are shared with the encryption ones
	const vuint8m1_t& lutdec_index0 = lutenc_index0;
	const vuint8m1_t lutdec_index1 = __riscv_vle8_v_u8m1(lutDecIndex[1], 32);
	const vuint8m1_t& lutdec_index2 = lutenc_index2;
	const vuint8m1_t lutdec_index3 = __riscv_vle8_v_u8m1(lutDecIndex[3], 32);

	//one AES round, then emit the 64-byte state, per iteration
	while (outptr < outputEnd) {
		state02 = softaes_vector_double(state02, key02, lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3);
		state13 = softaes_vector_double(state13, key13, lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3);

		__riscv_vsuxei32_v_u32m1((uint32_t*)outptr + 0, stride, state02, 8);
		__riscv_vsuxei32_v_u32m1((uint32_t*)outptr + 4, stride, state13, 8);

		outptr += 64;
	}

	//persist the final state for the next call
	__riscv_vsuxei32_v_u32m1((uint32_t*)state + 0, stride, state02, 8);
	__riscv_vsuxei32_v_u32m1((uint32_t*)state + 4, stride, state13, 8);
}

template void fillAes1Rx4_RVV<false>(void *state, size_t outputSize, void *buffer);
template void fillAes1Rx4_RVV<true>(void *state, size_t outputSize, void *buffer);
// Fills 'buffer' with outputSize bytes using FOUR soft-AES rounds per 64-byte
// block, with per-round keys taken from RandomX_CurrentConfig.fillAes4Rx4_Key.
// The X4 stride gathers two 16-byte keys spaced 64 bytes apart, so each keyNM
// vector holds keys N (for columns 0/2) and M (for columns 1/3).
//
// Assumes outputSize is a multiple of 64 — TODO confirm with callers.
// NOTE: unlike fillAes1Rx4_RVV, the final state is NOT written back to
// 'state'.
//
// NOTE(review): the softAes template parameter is never referenced in the
// body — both instantiations share this single implementation.
template<int softAes>
void fillAes4Rx4_RVV(void *state, size_t outputSize, void *buffer) {
	const uint8_t* outptr = (uint8_t*)buffer;
	const uint8_t* outputEnd = outptr + outputSize;

	const vuint32m1_t stride4 = __riscv_vle32_v_u32m1(AES_HASH_STRIDE_X4, 8);

	//round keys: keys {0,4}, {1,5}, {2,6}, {3,7} interleaved two per vector
	const vuint32m1_t key04 = __riscv_vluxei32_v_u32m1((uint32_t*)(RandomX_CurrentConfig.fillAes4Rx4_Key + 0), stride4, 8);
	const vuint32m1_t key15 = __riscv_vluxei32_v_u32m1((uint32_t*)(RandomX_CurrentConfig.fillAes4Rx4_Key + 1), stride4, 8);
	const vuint32m1_t key26 = __riscv_vluxei32_v_u32m1((uint32_t*)(RandomX_CurrentConfig.fillAes4Rx4_Key + 2), stride4, 8);
	const vuint32m1_t key37 = __riscv_vluxei32_v_u32m1((uint32_t*)(RandomX_CurrentConfig.fillAes4Rx4_Key + 3), stride4, 8);

	//byte offsets selecting columns {0,2} (or {1,3} with the +16B base)
	const vuint32m1_t stride = __riscv_vle32_v_u32m1(AES_HASH_STRIDE_X2, 8);

	//load the caller-provided 64-byte state
	vuint32m1_t state02 = __riscv_vluxei32_v_u32m1((uint32_t*)state + 0, stride, 8);
	vuint32m1_t state13 = __riscv_vluxei32_v_u32m1((uint32_t*)state + 4, stride, 8);

	//byte-shuffle index vectors for the soft-AES rounds
	const vuint8m1_t lutenc_index0 = __riscv_vle8_v_u8m1(lutEncIndex[0], 32);
	const vuint8m1_t lutenc_index1 = __riscv_vle8_v_u8m1(lutEncIndex[1], 32);
	const vuint8m1_t lutenc_index2 = __riscv_vle8_v_u8m1(lutEncIndex[2], 32);
	const vuint8m1_t lutenc_index3 = __riscv_vle8_v_u8m1(lutEncIndex[3], 32);

	//decryption index sets 0 and 2 are shared with the encryption ones
	const vuint8m1_t& lutdec_index0 = lutenc_index0;
	const vuint8m1_t lutdec_index1 = __riscv_vle8_v_u8m1(lutDecIndex[1], 32);
	const vuint8m1_t& lutdec_index2 = lutenc_index2;
	const vuint8m1_t lutdec_index3 = __riscv_vle8_v_u8m1(lutDecIndex[3], 32);

	//four rounds per iteration, then emit the 64-byte state
	while (outptr < outputEnd) {
		state02 = softaes_vector_double(state02, key04, lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3);
		state13 = softaes_vector_double(state13, key04, lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3);

		state02 = softaes_vector_double(state02, key15, lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3);
		state13 = softaes_vector_double(state13, key15, lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3);

		state02 = softaes_vector_double(state02, key26, lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3);
		state13 = softaes_vector_double(state13, key26, lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3);

		state02 = softaes_vector_double(state02, key37, lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3);
		state13 = softaes_vector_double(state13, key37, lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3);

		__riscv_vsuxei32_v_u32m1((uint32_t*)outptr + 0, stride, state02, 8);
		__riscv_vsuxei32_v_u32m1((uint32_t*)outptr + 4, stride, state13, 8);

		outptr += 64;
	}
}

template void fillAes4Rx4_RVV<false>(void *state, size_t outputSize, void *buffer);
template void fillAes4Rx4_RVV<true>(void *state, size_t outputSize, void *buffer);
// Fused pass over the scratchpad: hashes each 64-byte block into the running
// AES hash state and, at the same time, overwrites the block with the next
// 64 bytes of fill-state output (each HASH_STATE(k) reads block k before the
// matching FILL_STATE(k) overwrites it). Writes the evolved fill state back
// to 'fill_state' and the finalized 64-byte hash to 'hash'.
//
// Assumes scratchpadSize is a multiple of 64 * unroll (128 for the
// softAes == 0 path) — TODO confirm with callers.
//
// NOTE(review): softAes == 0 takes a fixed 2x-unrolled path with hashing and
// filling grouped separately; other values dispatch on 'unroll' (4/2/1) with
// hash/fill interleaved per block. All paths call the same
// softaes_vector_double helper.
template<int softAes, int unroll>
void hashAndFillAes1Rx4_RVV(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state) {
	uint8_t* scratchpadPtr = (uint8_t*)scratchpad;
	const uint8_t* scratchpadEnd = scratchpadPtr + scratchpadSize;

	//initial hash state (columns 0/2 and 1/3, two columns per vector)
	vuint32m1_t hash_state02 = __riscv_vle32_v_u32m1(AES_HASH_1R_STATE02, 8);
	vuint32m1_t hash_state13 = __riscv_vle32_v_u32m1(AES_HASH_1R_STATE13, 8);

	const vuint32m1_t key02 = __riscv_vle32_v_u32m1(AES_GEN_1R_KEY02, 8);
	const vuint32m1_t key13 = __riscv_vle32_v_u32m1(AES_GEN_1R_KEY13, 8);

	//byte offsets selecting columns {0,2} (or {1,3} with the +16B base)
	const vuint32m1_t stride = __riscv_vle32_v_u32m1(AES_HASH_STRIDE_X2, 8);

	//load the caller-provided 64-byte fill state
	vuint32m1_t fill_state02 = __riscv_vluxei32_v_u32m1((uint32_t*)fill_state + 0, stride, 8);
	vuint32m1_t fill_state13 = __riscv_vluxei32_v_u32m1((uint32_t*)fill_state + 4, stride, 8);

	//byte-shuffle index vectors for the soft-AES rounds
	const vuint8m1_t lutenc_index0 = __riscv_vle8_v_u8m1(lutEncIndex[0], 32);
	const vuint8m1_t lutenc_index1 = __riscv_vle8_v_u8m1(lutEncIndex[1], 32);
	const vuint8m1_t lutenc_index2 = __riscv_vle8_v_u8m1(lutEncIndex[2], 32);
	const vuint8m1_t lutenc_index3 = __riscv_vle8_v_u8m1(lutEncIndex[3], 32);

	//decryption index sets 0 and 2 are shared with the encryption ones
	const vuint8m1_t& lutdec_index0 = lutenc_index0;
	const vuint8m1_t lutdec_index1 = __riscv_vle8_v_u8m1(lutDecIndex[1], 32);
	const vuint8m1_t& lutdec_index2 = lutenc_index2;
	const vuint8m1_t lutdec_index3 = __riscv_vle8_v_u8m1(lutDecIndex[3], 32);

	//process 64 bytes at a time in 4 lanes
	while (scratchpadPtr < scratchpadEnd) {
//HASH_STATE(k): absorb scratchpad block k (k * 64 bytes) into the hash state
#define HASH_STATE(k) \
	hash_state02 = softaes_vector_double(hash_state02, __riscv_vluxei32_v_u32m1((uint32_t*)scratchpadPtr + k * 16 + 0, stride, 8), lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3); \
	hash_state13 = softaes_vector_double(hash_state13, __riscv_vluxei32_v_u32m1((uint32_t*)scratchpadPtr + k * 16 + 4, stride, 8), lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3);

//FILL_STATE(k): advance the fill state one round and overwrite block k with it
#define FILL_STATE(k) \
	fill_state02 = softaes_vector_double(fill_state02, key02, lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3); \
	fill_state13 = softaes_vector_double(fill_state13, key13, lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3); \
	__riscv_vsuxei32_v_u32m1((uint32_t*)scratchpadPtr + k * 16 + 0, stride, fill_state02, 8); \
	__riscv_vsuxei32_v_u32m1((uint32_t*)scratchpadPtr + k * 16 + 4, stride, fill_state13, 8);

		switch (softAes) {
		case 0:
			//fixed 2x unroll: hash both blocks first, then overwrite both
			HASH_STATE(0);
			HASH_STATE(1);

			FILL_STATE(0);
			FILL_STATE(1);

			scratchpadPtr += 128;
			break;

		default:
			//interleave hash/fill per block, unrolled 'unroll' times
			switch (unroll) {
			case 4:
				HASH_STATE(0);
				FILL_STATE(0);

				HASH_STATE(1);
				FILL_STATE(1);

				HASH_STATE(2);
				FILL_STATE(2);

				HASH_STATE(3);
				FILL_STATE(3);

				scratchpadPtr += 64 * 4;
				break;

			case 2:
				HASH_STATE(0);
				FILL_STATE(0);

				HASH_STATE(1);
				FILL_STATE(1);

				scratchpadPtr += 64 * 2;
				break;

			default:
				HASH_STATE(0);
				FILL_STATE(0);

				scratchpadPtr += 64;
				break;
			}
			break;
		}
	}

#undef HASH_STATE
#undef FILL_STATE

	//persist the fill state for the next call
	__riscv_vsuxei32_v_u32m1((uint32_t*)fill_state + 0, stride, fill_state02, 8);
	__riscv_vsuxei32_v_u32m1((uint32_t*)fill_state + 4, stride, fill_state13, 8);

	//two extra rounds to achieve full diffusion
	const vuint32m1_t xkey00 = __riscv_vle32_v_u32m1(AES_HASH_1R_XKEY00, 8);
	const vuint32m1_t xkey11 = __riscv_vle32_v_u32m1(AES_HASH_1R_XKEY11, 8);

	hash_state02 = softaes_vector_double(hash_state02, xkey00, lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3);
	hash_state13 = softaes_vector_double(hash_state13, xkey00, lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3);

	hash_state02 = softaes_vector_double(hash_state02, xkey11, lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3);
	hash_state13 = softaes_vector_double(hash_state13, xkey11, lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3);

	//output hash (64 bytes)
	__riscv_vsuxei32_v_u32m1((uint32_t*)hash + 0, stride, hash_state02, 8);
	__riscv_vsuxei32_v_u32m1((uint32_t*)hash + 4, stride, hash_state13, 8);
}

template void hashAndFillAes1Rx4_RVV<0,2>(void* scratchpad, size_t scratchpadSize, void* hash, void* fill_state);
template void hashAndFillAes1Rx4_RVV<1,1>(void* scratchpad, size_t scratchpadSize, void* hash, void* fill_state);
template void hashAndFillAes1Rx4_RVV<2,1>(void* scratchpad, size_t scratchpadSize, void* hash, void* fill_state);
template void hashAndFillAes1Rx4_RVV<2,2>(void* scratchpad, size_t scratchpadSize, void* hash, void* fill_state);
template void hashAndFillAes1Rx4_RVV<2,4>(void* scratchpad, size_t scratchpadSize, void* hash, void* fill_state);
42
src/crypto/randomx/aes_hash_rv64_vector.hpp
Normal file
42
src/crypto/randomx/aes_hash_rv64_vector.hpp
Normal file
@@ -0,0 +1,42 @@
|
||||
/*
|
||||
Copyright (c) 2025 SChernykh <https://github.com/SChernykh>
|
||||
Copyright (c) 2025 XMRig <support@xmrig.com>
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the copyright holder nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once

#include <cstddef>  // size_t — previously relied on the includer to provide it

// RISC-V Vector (soft-AES) implementations of the RandomX AES primitives.
// Definitions live in aes_hash_rv64_vector.cpp.

// Hashes inputSize bytes of 'input' into a 64-byte 'hash'.
template<int softAes>
void hashAes1Rx4_RVV(const void *input, size_t inputSize, void *hash);

// Generates outputSize bytes into 'buffer' from the 64-byte 'state',
// updating 'state' in place.
template<int softAes>
void fillAes1Rx4_RVV(void *state, size_t outputSize, void *buffer);

// Same as fillAes1Rx4_RVV but with four AES rounds per 64-byte block.
template<int softAes>
void fillAes4Rx4_RVV(void *state, size_t outputSize, void *buffer);

// Fused scratchpad hash + refill; see the .cpp for the unroll variants.
template<int softAes, int unroll>
void hashAndFillAes1Rx4_RVV(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state);
199
src/crypto/randomx/aes_hash_rv64_zvkned.cpp
Normal file
199
src/crypto/randomx/aes_hash_rv64_zvkned.cpp
Normal file
@@ -0,0 +1,199 @@
|
||||
/*
|
||||
Copyright (c) 2025 SChernykh <https://github.com/SChernykh>
|
||||
Copyright (c) 2025 XMRig <support@xmrig.com>
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the copyright holder nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "crypto/randomx/aes_hash.hpp"
|
||||
#include "crypto/randomx/randomx.h"
|
||||
#include "crypto/rx/Profiler.h"
|
||||
|
||||
#include <riscv_vector.h>
|
||||
|
||||
// One AES encryption round using the Zvkned middle-round instruction
// (vaesem), applied to both 16-byte columns held in 'a' with round key 'b'.
static FORCE_INLINE vuint32m1_t aesenc_zvkned(vuint32m1_t a, vuint32m1_t b)
{
	return __riscv_vaesem_vv_u32m1(a, b, 8);
}
// One AES decryption round for both 16-byte columns in 'a' with round key 'b'.
// The vaesdm middle round is run with an all-zero round key and 'b' is XORed
// in afterwards — NOTE(review): this reorders the key addition relative to
// vaesdm's built-in one to match x86 aesdec semantics; confirm against the
// Zvkned specification.
static FORCE_INLINE vuint32m1_t aesdec_zvkned(vuint32m1_t a, vuint32m1_t b, vuint32m1_t zero)
{
	const vuint32m1_t round = __riscv_vaesdm_vv_u32m1(a, zero, 8);
	return __riscv_vxor_vv_u32m1(round, b, 8);
}
|
||||
// Constants for the Zvkned implementation. Same interleaved layout as the
// soft-AES file: VL = 8 x uint32 per vector, holding AES columns {0,2} in
// the "02" arrays and columns {1,3} in the "13" arrays.

// Initial state of the AES hash.
static constexpr uint32_t AES_HASH_1R_STATE02[8] = { 0x92b52c0d, 0x9fa856de, 0xcc82db47, 0xd7983aad, 0x6a770017, 0xae62c7d0, 0x5079506b, 0xe8a07ce4 };
static constexpr uint32_t AES_HASH_1R_STATE13[8] = { 0x338d996e, 0x15c7b798, 0xf59e125a, 0xace78057, 0x630a240c, 0x07ad828d, 0x79a10005, 0x7e994948 };

// Round keys used by fillAes1Rx4.
static constexpr uint32_t AES_GEN_1R_KEY02[8] = { 0x6daca553, 0x62716609, 0xdbb5552b, 0xb4f44917, 0x3f1262f1, 0x9f947ec6, 0xf4c0794f, 0x3e20e345 };
static constexpr uint32_t AES_GEN_1R_KEY13[8] = { 0x6d7caf07, 0x846a710d, 0x1725d378, 0x0da1dc4e, 0x6aef8135, 0xb1ba317c, 0x16314c88, 0x49169154 };

// Keys for the two finalization rounds; the same four words are repeated so
// both columns in a vector receive the identical round key.
static constexpr uint32_t AES_HASH_1R_XKEY00[8] = { 0xf6fa8389, 0x8b24949f, 0x90dc56bf, 0x06890201, 0xf6fa8389, 0x8b24949f, 0x90dc56bf, 0x06890201 };
static constexpr uint32_t AES_HASH_1R_XKEY11[8] = { 0x61b263d1, 0x51f4e03c, 0xee1043c6, 0xed18f99b, 0x61b263d1, 0x51f4e03c, 0xee1043c6, 0xed18f99b };

// Byte offsets for indexed loads/stores: two 16-byte columns spaced 32 (X2)
// or 64 (X4) bytes apart.
static constexpr uint32_t AES_HASH_STRIDE_X2[8] = { 0, 4, 8, 12, 32, 36, 40, 44 };
static constexpr uint32_t AES_HASH_STRIDE_X4[8] = { 0, 4, 8, 12, 64, 68, 72, 76 };
// Calculates a 64-byte fingerprint of the input (RandomX AES hash) using the
// Zvkned hardware AES round instructions. Columns 0/2 run encryption rounds,
// columns 1/3 run decryption rounds.
// Assumes inputSize is a multiple of 64 — TODO confirm with callers.
void hashAes1Rx4_zvkned(const void *input, size_t inputSize, void *hash)
{
	const uint8_t* inptr = (const uint8_t*)input;
	const uint8_t* inputEnd = inptr + inputSize;

	//initial state (columns 0/2 and 1/3, two columns per vector)
	vuint32m1_t state02 = __riscv_vle32_v_u32m1(AES_HASH_1R_STATE02, 8);
	vuint32m1_t state13 = __riscv_vle32_v_u32m1(AES_HASH_1R_STATE13, 8);

	//byte offsets selecting columns {0,2} (or {1,3} with the +16B base)
	const vuint32m1_t stride = __riscv_vle32_v_u32m1(AES_HASH_STRIDE_X2, 8);
	const vuint32m1_t zero = {};	//all-zero round key for aesdec_zvkned

	//process 64 bytes at a time in 4 lanes
	while (inptr < inputEnd) {
		state02 = aesenc_zvkned(state02, __riscv_vluxei32_v_u32m1((uint32_t*)inptr + 0, stride, 8));
		state13 = aesdec_zvkned(state13, __riscv_vluxei32_v_u32m1((uint32_t*)inptr + 4, stride, 8), zero);

		inptr += 64;
	}

	//two extra rounds to achieve full diffusion
	const vuint32m1_t xkey00 = __riscv_vle32_v_u32m1(AES_HASH_1R_XKEY00, 8);
	const vuint32m1_t xkey11 = __riscv_vle32_v_u32m1(AES_HASH_1R_XKEY11, 8);

	state02 = aesenc_zvkned(state02, xkey00);
	state13 = aesdec_zvkned(state13, xkey00, zero);

	state02 = aesenc_zvkned(state02, xkey11);
	state13 = aesdec_zvkned(state13, xkey11, zero);

	//output hash (64 bytes)
	__riscv_vsuxei32_v_u32m1((uint32_t*)hash + 0, stride, state02, 8);
	__riscv_vsuxei32_v_u32m1((uint32_t*)hash + 4, stride, state13, 8);
}
// Fills 'buffer' with outputSize bytes of AES-generated data, starting from
// the 64-byte 'state' and writing the evolved state back at the end. One
// hardware AES round per 64-byte block: columns 0/2 decrypt with KEY02,
// columns 1/3 encrypt with KEY13.
// Assumes outputSize is a multiple of 64 — TODO confirm with callers.
void fillAes1Rx4_zvkned(void *state, size_t outputSize, void *buffer)
{
	const uint8_t* outptr = (uint8_t*)buffer;
	const uint8_t* outputEnd = outptr + outputSize;

	const vuint32m1_t key02 = __riscv_vle32_v_u32m1(AES_GEN_1R_KEY02, 8);
	const vuint32m1_t key13 = __riscv_vle32_v_u32m1(AES_GEN_1R_KEY13, 8);

	//byte offsets selecting columns {0,2} (or {1,3} with the +16B base)
	const vuint32m1_t stride = __riscv_vle32_v_u32m1(AES_HASH_STRIDE_X2, 8);
	const vuint32m1_t zero = {};	//all-zero round key for aesdec_zvkned

	//load the caller-provided 64-byte state
	vuint32m1_t state02 = __riscv_vluxei32_v_u32m1((uint32_t*)state + 0, stride, 8);
	vuint32m1_t state13 = __riscv_vluxei32_v_u32m1((uint32_t*)state + 4, stride, 8);

	//one AES round, then emit the 64-byte state, per iteration
	while (outptr < outputEnd) {
		state02 = aesdec_zvkned(state02, key02, zero);
		state13 = aesenc_zvkned(state13, key13);

		__riscv_vsuxei32_v_u32m1((uint32_t*)outptr + 0, stride, state02, 8);
		__riscv_vsuxei32_v_u32m1((uint32_t*)outptr + 4, stride, state13, 8);

		outptr += 64;
	}

	//persist the final state for the next call
	__riscv_vsuxei32_v_u32m1((uint32_t*)state + 0, stride, state02, 8);
	__riscv_vsuxei32_v_u32m1((uint32_t*)state + 4, stride, state13, 8);
}
// Fills 'buffer' with outputSize bytes using FOUR hardware AES rounds per
// 64-byte block, with per-round keys from
// RandomX_CurrentConfig.fillAes4Rx4_Key. Each keyNM vector holds keys N (for
// columns 0/2) and M (for columns 1/3), gathered 64 bytes apart via the X4
// stride.
// Assumes outputSize is a multiple of 64 — TODO confirm with callers.
// NOTE: unlike fillAes1Rx4_zvkned, the final state is NOT written back to
// 'state'.
void fillAes4Rx4_zvkned(void *state, size_t outputSize, void *buffer)
{
	const uint8_t* outptr = (uint8_t*)buffer;
	const uint8_t* outputEnd = outptr + outputSize;

	const vuint32m1_t stride4 = __riscv_vle32_v_u32m1(AES_HASH_STRIDE_X4, 8);

	//round keys: keys {0,4}, {1,5}, {2,6}, {3,7} interleaved two per vector
	const vuint32m1_t key04 = __riscv_vluxei32_v_u32m1((uint32_t*)(RandomX_CurrentConfig.fillAes4Rx4_Key + 0), stride4, 8);
	const vuint32m1_t key15 = __riscv_vluxei32_v_u32m1((uint32_t*)(RandomX_CurrentConfig.fillAes4Rx4_Key + 1), stride4, 8);
	const vuint32m1_t key26 = __riscv_vluxei32_v_u32m1((uint32_t*)(RandomX_CurrentConfig.fillAes4Rx4_Key + 2), stride4, 8);
	const vuint32m1_t key37 = __riscv_vluxei32_v_u32m1((uint32_t*)(RandomX_CurrentConfig.fillAes4Rx4_Key + 3), stride4, 8);

	//byte offsets selecting columns {0,2} (or {1,3} with the +16B base)
	const vuint32m1_t stride = __riscv_vle32_v_u32m1(AES_HASH_STRIDE_X2, 8);
	const vuint32m1_t zero = {};	//all-zero round key for aesdec_zvkned

	//load the caller-provided 64-byte state
	vuint32m1_t state02 = __riscv_vluxei32_v_u32m1((uint32_t*)state + 0, stride, 8);
	vuint32m1_t state13 = __riscv_vluxei32_v_u32m1((uint32_t*)state + 4, stride, 8);

	//four rounds per iteration, then emit the 64-byte state
	while (outptr < outputEnd) {
		state02 = aesdec_zvkned(state02, key04, zero);
		state13 = aesenc_zvkned(state13, key04);

		state02 = aesdec_zvkned(state02, key15, zero);
		state13 = aesenc_zvkned(state13, key15);

		state02 = aesdec_zvkned(state02, key26, zero);
		state13 = aesenc_zvkned(state13, key26);

		state02 = aesdec_zvkned(state02, key37, zero);
		state13 = aesenc_zvkned(state13, key37);

		__riscv_vsuxei32_v_u32m1((uint32_t*)outptr + 0, stride, state02, 8);
		__riscv_vsuxei32_v_u32m1((uint32_t*)outptr + 4, stride, state13, 8);

		outptr += 64;
	}
}
// Fused pass over the scratchpad using Zvkned hardware AES: hashes each
// 64-byte block into the running hash state and then overwrites it with the
// next 64 bytes of fill-state output (the hash reads the block before the
// fill stores clobber it). Writes the evolved fill state back to
// 'fill_state' and the finalized 64-byte hash to 'hash'.
// Assumes scratchpadSize is a multiple of 64 — TODO confirm with callers.
void hashAndFillAes1Rx4_zvkned(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state)
{
	uint8_t* scratchpadPtr = (uint8_t*)scratchpad;
	const uint8_t* scratchpadEnd = scratchpadPtr + scratchpadSize;

	//initial hash state (columns 0/2 and 1/3, two columns per vector)
	vuint32m1_t hash_state02 = __riscv_vle32_v_u32m1(AES_HASH_1R_STATE02, 8);
	vuint32m1_t hash_state13 = __riscv_vle32_v_u32m1(AES_HASH_1R_STATE13, 8);

	const vuint32m1_t key02 = __riscv_vle32_v_u32m1(AES_GEN_1R_KEY02, 8);
	const vuint32m1_t key13 = __riscv_vle32_v_u32m1(AES_GEN_1R_KEY13, 8);

	//byte offsets selecting columns {0,2} (or {1,3} with the +16B base)
	const vuint32m1_t stride = __riscv_vle32_v_u32m1(AES_HASH_STRIDE_X2, 8);
	const vuint32m1_t zero = {};	//all-zero round key for aesdec_zvkned

	//load the caller-provided 64-byte fill state
	vuint32m1_t fill_state02 = __riscv_vluxei32_v_u32m1((uint32_t*)fill_state + 0, stride, 8);
	vuint32m1_t fill_state13 = __riscv_vluxei32_v_u32m1((uint32_t*)fill_state + 4, stride, 8);

	//process 64 bytes at a time in 4 lanes
	while (scratchpadPtr < scratchpadEnd) {
		//absorb the current block into the hash state (reads old contents)
		hash_state02 = aesenc_zvkned(hash_state02, __riscv_vluxei32_v_u32m1((uint32_t*)scratchpadPtr + 0, stride, 8));
		hash_state13 = aesdec_zvkned(hash_state13, __riscv_vluxei32_v_u32m1((uint32_t*)scratchpadPtr + 4, stride, 8), zero);

		//advance the fill state one round and overwrite the block with it
		fill_state02 = aesdec_zvkned(fill_state02, key02, zero);
		fill_state13 = aesenc_zvkned(fill_state13, key13);

		__riscv_vsuxei32_v_u32m1((uint32_t*)scratchpadPtr + 0, stride, fill_state02, 8);
		__riscv_vsuxei32_v_u32m1((uint32_t*)scratchpadPtr + 4, stride, fill_state13, 8);

		scratchpadPtr += 64;
	}

	//persist the fill state for the next call
	__riscv_vsuxei32_v_u32m1((uint32_t*)fill_state + 0, stride, fill_state02, 8);
	__riscv_vsuxei32_v_u32m1((uint32_t*)fill_state + 4, stride, fill_state13, 8);

	//two extra rounds to achieve full diffusion
	const vuint32m1_t xkey00 = __riscv_vle32_v_u32m1(AES_HASH_1R_XKEY00, 8);
	const vuint32m1_t xkey11 = __riscv_vle32_v_u32m1(AES_HASH_1R_XKEY11, 8);

	hash_state02 = aesenc_zvkned(hash_state02, xkey00);
	hash_state13 = aesdec_zvkned(hash_state13, xkey00, zero);

	hash_state02 = aesenc_zvkned(hash_state02, xkey11);
	hash_state13 = aesdec_zvkned(hash_state13, xkey11, zero);

	//output hash (64 bytes)
	__riscv_vsuxei32_v_u32m1((uint32_t*)hash + 0, stride, hash_state02, 8);
	__riscv_vsuxei32_v_u32m1((uint32_t*)hash + 4, stride, hash_state13, 8);
}
35
src/crypto/randomx/aes_hash_rv64_zvkned.hpp
Normal file
35
src/crypto/randomx/aes_hash_rv64_zvkned.hpp
Normal file
@@ -0,0 +1,35 @@
|
||||
/*
|
||||
Copyright (c) 2025 SChernykh <https://github.com/SChernykh>
|
||||
Copyright (c) 2025 XMRig <support@xmrig.com>
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the copyright holder nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once

#include <cstddef>  // size_t — previously relied on the includer to provide it

// RISC-V Zvkned (vector AES instruction) implementations of the RandomX AES
// primitives. Definitions live in aes_hash_rv64_zvkned.cpp.

// Hashes inputSize bytes of 'input' into a 64-byte 'hash'.
void hashAes1Rx4_zvkned(const void *input, size_t inputSize, void *hash);
// Generates outputSize bytes into 'buffer' from the 64-byte 'state',
// updating 'state' in place.
void fillAes1Rx4_zvkned(void *state, size_t outputSize, void *buffer);
// Same as fillAes1Rx4_zvkned but with four AES rounds per 64-byte block.
void fillAes4Rx4_zvkned(void *state, size_t outputSize, void *buffer);
// Fused scratchpad hash + refill.
void hashAndFillAes1Rx4_zvkned(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state);
148
src/crypto/randomx/aes_hash_vaes512.cpp
Normal file
148
src/crypto/randomx/aes_hash_vaes512.cpp
Normal file
@@ -0,0 +1,148 @@
|
||||
/*
|
||||
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
|
||||
Copyright (c) 2026 XMRig <support@xmrig.com>
|
||||
Copyright (c) 2026 SChernykh <https://github.com/SChernykh>
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the copyright holder nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <immintrin.h>
|
||||
|
||||
// Emits four 32-bit words in reversed argument order, letting the constants
// below be written most-significant-word first while still being stored
// lowest-word first in memory.
#define REVERSE_4(A, B, C, D) D, C, B, A

// Initial AES hash state: four 16-byte columns packed into one 512-bit load.
alignas(64) static const uint32_t AES_HASH_1R_STATE[] = {
	REVERSE_4(0xd7983aad, 0xcc82db47, 0x9fa856de, 0x92b52c0d),
	REVERSE_4(0xace78057, 0xf59e125a, 0x15c7b798, 0x338d996e),
	REVERSE_4(0xe8a07ce4, 0x5079506b, 0xae62c7d0, 0x6a770017),
	REVERSE_4(0x7e994948, 0x79a10005, 0x07ad828d, 0x630a240c)
};

// Per-column round keys for the fill step (one 128-bit key per column).
alignas(64) static const uint32_t AES_GEN_1R_KEY[] = {
	REVERSE_4(0xb4f44917, 0xdbb5552b, 0x62716609, 0x6daca553),
	REVERSE_4(0x0da1dc4e, 0x1725d378, 0x846a710d, 0x6d7caf07),
	REVERSE_4(0x3e20e345, 0xf4c0794f, 0x9f947ec6, 0x3f1262f1),
	REVERSE_4(0x49169154, 0x16314c88, 0xb1ba317c, 0x6aef8135)
};

// First finalization round key, broadcast to all four 128-bit lanes.
alignas(64) static const uint32_t AES_HASH_1R_XKEY0[] = {
	REVERSE_4(0x06890201, 0x90dc56bf, 0x8b24949f, 0xf6fa8389),
	REVERSE_4(0x06890201, 0x90dc56bf, 0x8b24949f, 0xf6fa8389),
	REVERSE_4(0x06890201, 0x90dc56bf, 0x8b24949f, 0xf6fa8389),
	REVERSE_4(0x06890201, 0x90dc56bf, 0x8b24949f, 0xf6fa8389)
};

// Second finalization round key, broadcast to all four 128-bit lanes.
alignas(64) static const uint32_t AES_HASH_1R_XKEY1[] = {
	REVERSE_4(0xed18f99b, 0xee1043c6, 0x51f4e03c, 0x61b263d1),
	REVERSE_4(0xed18f99b, 0xee1043c6, 0x51f4e03c, 0x61b263d1),
	REVERSE_4(0xed18f99b, 0xee1043c6, 0x51f4e03c, 0x61b263d1),
	REVERSE_4(0xed18f99b, 0xee1043c6, 0x51f4e03c, 0x61b263d1)
};
void hashAndFillAes1Rx4_VAES512(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state)
|
||||
{
|
||||
uint8_t* scratchpadPtr = (uint8_t*)scratchpad;
|
||||
const uint8_t* scratchpadEnd = scratchpadPtr + scratchpadSize;
|
||||
|
||||
const __m512i fill_key = _mm512_load_si512(AES_GEN_1R_KEY);
|
||||
|
||||
const __m512i initial_hash_state = _mm512_load_si512(AES_HASH_1R_STATE);
|
||||
const __m512i initial_fill_state = _mm512_load_si512(fill_state);
|
||||
|
||||
constexpr uint8_t mask = 0b11001100;
|
||||
|
||||
// enc_data[0] = hash_state[0]
|
||||
// enc_data[1] = fill_state[1]
|
||||
// enc_data[2] = hash_state[2]
|
||||
// enc_data[3] = fill_state[3]
|
||||
__m512i enc_data = _mm512_mask_blend_epi64(mask, initial_hash_state, initial_fill_state);
|
||||
|
||||
// dec_data[0] = fill_state[0]
|
||||
// dec_data[1] = hash_state[1]
|
||||
// dec_data[2] = fill_state[2]
|
||||
// dec_data[3] = hash_state[3]
|
||||
__m512i dec_data = _mm512_mask_blend_epi64(mask, initial_fill_state, initial_hash_state);
|
||||
|
||||
constexpr int PREFETCH_DISTANCE = 7168;
|
||||
|
||||
const uint8_t* prefetchPtr = scratchpadPtr + PREFETCH_DISTANCE;
|
||||
scratchpadEnd -= PREFETCH_DISTANCE;
|
||||
|
||||
for (const uint8_t* p = scratchpadPtr; p < prefetchPtr; p += 256) {
|
||||
_mm_prefetch((const char*)(p + 0), _MM_HINT_T0);
|
||||
_mm_prefetch((const char*)(p + 64), _MM_HINT_T0);
|
||||
_mm_prefetch((const char*)(p + 128), _MM_HINT_T0);
|
||||
_mm_prefetch((const char*)(p + 192), _MM_HINT_T0);
|
||||
}
|
||||
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
while (scratchpadPtr < scratchpadEnd) {
|
||||
const __m512i scratchpad_data = _mm512_load_si512(scratchpadPtr);
|
||||
|
||||
// enc_key[0] = scratchpad_data[0]
|
||||
// enc_key[1] = fill_key[1]
|
||||
// enc_key[2] = scratchpad_data[2]
|
||||
// enc_key[3] = fill_key[3]
|
||||
enc_data = _mm512_aesenc_epi128(enc_data, _mm512_mask_blend_epi64(mask, scratchpad_data, fill_key));
|
||||
|
||||
// dec_key[0] = fill_key[0]
|
||||
// dec_key[1] = scratchpad_data[1]
|
||||
// dec_key[2] = fill_key[2]
|
||||
// dec_key[3] = scratchpad_data[3]
|
||||
dec_data = _mm512_aesdec_epi128(dec_data, _mm512_mask_blend_epi64(mask, fill_key, scratchpad_data));
|
||||
|
||||
// fill_state[0] = dec_data[0]
|
||||
// fill_state[1] = enc_data[1]
|
||||
// fill_state[2] = dec_data[2]
|
||||
// fill_state[3] = enc_data[3]
|
||||
_mm512_store_si512(scratchpadPtr, _mm512_mask_blend_epi64(mask, dec_data, enc_data));
|
||||
|
||||
_mm_prefetch((const char*)prefetchPtr, _MM_HINT_T0);
|
||||
|
||||
scratchpadPtr += 64;
|
||||
prefetchPtr += 64;
|
||||
}
|
||||
prefetchPtr = (const uint8_t*) scratchpad;
|
||||
scratchpadEnd += PREFETCH_DISTANCE;
|
||||
}
|
||||
|
||||
_mm512_store_si512(fill_state, _mm512_mask_blend_epi64(mask, dec_data, enc_data));
|
||||
|
||||
//two extra rounds to achieve full diffusion
|
||||
const __m512i xkey0 = _mm512_load_si512(AES_HASH_1R_XKEY0);
|
||||
const __m512i xkey1 = _mm512_load_si512(AES_HASH_1R_XKEY1);
|
||||
|
||||
enc_data = _mm512_aesenc_epi128(enc_data, xkey0);
|
||||
dec_data = _mm512_aesdec_epi128(dec_data, xkey0);
|
||||
enc_data = _mm512_aesenc_epi128(enc_data, xkey1);
|
||||
dec_data = _mm512_aesdec_epi128(dec_data, xkey1);
|
||||
|
||||
//output hash
|
||||
_mm512_store_si512(hash, _mm512_mask_blend_epi64(mask, enc_data, dec_data));
|
||||
|
||||
// Just in case
|
||||
_mm256_zeroupper();
|
||||
}
|
||||
@@ -1,5 +1,5 @@
|
||||
;# save VM register values
|
||||
add rsp, 40
|
||||
add rsp, 248
|
||||
pop rcx
|
||||
mov qword ptr [rcx+0], r8
|
||||
mov qword ptr [rcx+8], r9
|
||||
|
||||
30
src/crypto/randomx/asm/program_loop_store_hard_aes.inc
Normal file
30
src/crypto/randomx/asm/program_loop_store_hard_aes.inc
Normal file
@@ -0,0 +1,30 @@
|
||||
mov rcx, [rsp+24]
|
||||
mov qword ptr [rcx+0], r8
|
||||
mov qword ptr [rcx+8], r9
|
||||
mov qword ptr [rcx+16], r10
|
||||
mov qword ptr [rcx+24], r11
|
||||
mov qword ptr [rcx+32], r12
|
||||
mov qword ptr [rcx+40], r13
|
||||
mov qword ptr [rcx+48], r14
|
||||
mov qword ptr [rcx+56], r15
|
||||
mov rcx, [rsp+16]
|
||||
aesenc xmm0, xmm4
|
||||
aesdec xmm1, xmm4
|
||||
aesenc xmm2, xmm4
|
||||
aesdec xmm3, xmm4
|
||||
aesenc xmm0, xmm5
|
||||
aesdec xmm1, xmm5
|
||||
aesenc xmm2, xmm5
|
||||
aesdec xmm3, xmm5
|
||||
aesenc xmm0, xmm6
|
||||
aesdec xmm1, xmm6
|
||||
aesenc xmm2, xmm6
|
||||
aesdec xmm3, xmm6
|
||||
aesenc xmm0, xmm7
|
||||
aesdec xmm1, xmm7
|
||||
aesenc xmm2, xmm7
|
||||
aesdec xmm3, xmm7
|
||||
movapd xmmword ptr [rcx+0], xmm0
|
||||
movapd xmmword ptr [rcx+16], xmm1
|
||||
movapd xmmword ptr [rcx+32], xmm2
|
||||
movapd xmmword ptr [rcx+48], xmm3
|
||||
196
src/crypto/randomx/asm/program_loop_store_soft_aes.inc
Normal file
196
src/crypto/randomx/asm/program_loop_store_soft_aes.inc
Normal file
@@ -0,0 +1,196 @@
|
||||
mov rcx, [rsp+24]
|
||||
mov qword ptr [rcx+0], r8
|
||||
mov qword ptr [rcx+8], r9
|
||||
mov qword ptr [rcx+16], r10
|
||||
mov qword ptr [rcx+24], r11
|
||||
mov qword ptr [rcx+32], r12
|
||||
mov qword ptr [rcx+40], r13
|
||||
mov qword ptr [rcx+48], r14
|
||||
mov qword ptr [rcx+56], r15
|
||||
|
||||
movapd xmmword ptr [rsp+40], xmm0
|
||||
movapd xmmword ptr [rsp+56], xmm1
|
||||
movapd xmmword ptr [rsp+72], xmm2
|
||||
movapd xmmword ptr [rsp+88], xmm3
|
||||
movapd xmmword ptr [rsp+104], xmm4
|
||||
movapd xmmword ptr [rsp+120], xmm5
|
||||
movapd xmmword ptr [rsp+136], xmm6
|
||||
movapd xmmword ptr [rsp+152], xmm7
|
||||
|
||||
mov [rsp+168], rax
|
||||
mov [rsp+176], rbx
|
||||
mov [rsp+184], rdx
|
||||
mov [rsp+192], rsi
|
||||
mov [rsp+200], rdi
|
||||
mov [rsp+208], rbp
|
||||
mov [rsp+216], r8
|
||||
mov [rsp+224], r9
|
||||
|
||||
mov r8, [rsp+232] ;# aes_lut_enc
|
||||
mov r9, [rsp+240] ;# aes_lut_dec
|
||||
|
||||
movapd xmm12, xmmword ptr [rsp-8] ;# "call" will overwrite IMUL_RCP's data on stack, so save it
|
||||
|
||||
lea rsi, [rsp+104]
|
||||
lea rdi, [rsp+40]
|
||||
call soft_aes_enc
|
||||
lea rdi, [rsp+56]
|
||||
call soft_aes_dec
|
||||
lea rdi, [rsp+72]
|
||||
call soft_aes_enc
|
||||
lea rdi, [rsp+88]
|
||||
call soft_aes_dec
|
||||
|
||||
lea rsi, [rsp+120]
|
||||
lea rdi, [rsp+40]
|
||||
call soft_aes_enc
|
||||
lea rdi, [rsp+56]
|
||||
call soft_aes_dec
|
||||
lea rdi, [rsp+72]
|
||||
call soft_aes_enc
|
||||
lea rdi, [rsp+88]
|
||||
call soft_aes_dec
|
||||
|
||||
lea rsi, [rsp+136]
|
||||
lea rdi, [rsp+40]
|
||||
call soft_aes_enc
|
||||
lea rdi, [rsp+56]
|
||||
call soft_aes_dec
|
||||
lea rdi, [rsp+72]
|
||||
call soft_aes_enc
|
||||
lea rdi, [rsp+88]
|
||||
call soft_aes_dec
|
||||
|
||||
lea rsi, [rsp+152]
|
||||
lea rdi, [rsp+40]
|
||||
call soft_aes_enc
|
||||
lea rdi, [rsp+56]
|
||||
call soft_aes_dec
|
||||
lea rdi, [rsp+72]
|
||||
call soft_aes_enc
|
||||
lea rdi, [rsp+88]
|
||||
call soft_aes_dec
|
||||
|
||||
movapd xmmword ptr [rsp-8], xmm12
|
||||
|
||||
jmp soft_aes_end
|
||||
|
||||
soft_aes_enc:
|
||||
mov eax, dword ptr [rsi+0]
|
||||
mov ebx, dword ptr [rsi+4]
|
||||
mov ecx, dword ptr [rsi+8]
|
||||
mov edx, dword ptr [rsi+12]
|
||||
|
||||
movzx ebp, byte ptr [rdi+0]
|
||||
xor eax, dword ptr [r8+rbp*4]
|
||||
movzx ebp, byte ptr [rdi+1]
|
||||
xor edx, dword ptr [r8+rbp*4+1024]
|
||||
movzx ebp, byte ptr [rdi+2]
|
||||
xor ecx, dword ptr [r8+rbp*4+2048]
|
||||
movzx ebp, byte ptr [rdi+3]
|
||||
xor ebx, dword ptr [r8+rbp*4+3072]
|
||||
|
||||
movzx ebp, byte ptr [rdi+4]
|
||||
xor ebx, dword ptr [r8+rbp*4]
|
||||
movzx ebp, byte ptr [rdi+5]
|
||||
xor eax, dword ptr [r8+rbp*4+1024]
|
||||
movzx ebp, byte ptr [rdi+6]
|
||||
xor edx, dword ptr [r8+rbp*4+2048]
|
||||
movzx ebp, byte ptr [rdi+7]
|
||||
xor ecx, dword ptr [r8+rbp*4+3072]
|
||||
|
||||
movzx ebp, byte ptr [rdi+8]
|
||||
xor ecx, dword ptr [r8+rbp*4]
|
||||
movzx ebp, byte ptr [rdi+9]
|
||||
xor ebx, dword ptr [r8+rbp*4+1024]
|
||||
movzx ebp, byte ptr [rdi+10]
|
||||
xor eax, dword ptr [r8+rbp*4+2048]
|
||||
movzx ebp, byte ptr [rdi+11]
|
||||
xor edx, dword ptr [r8+rbp*4+3072]
|
||||
|
||||
movzx ebp, byte ptr [rdi+12]
|
||||
xor edx, dword ptr [r8+rbp*4]
|
||||
movzx ebp, byte ptr [rdi+13]
|
||||
xor ecx, dword ptr [r8+rbp*4+1024]
|
||||
movzx ebp, byte ptr [rdi+14]
|
||||
xor ebx, dword ptr [r8+rbp*4+2048]
|
||||
movzx ebp, byte ptr [rdi+15]
|
||||
xor eax, dword ptr [r8+rbp*4+3072]
|
||||
|
||||
mov dword ptr [rdi+0], eax
|
||||
mov dword ptr [rdi+4], ebx
|
||||
mov dword ptr [rdi+8], ecx
|
||||
mov dword ptr [rdi+12], edx
|
||||
|
||||
ret
|
||||
|
||||
soft_aes_dec:
|
||||
mov eax, dword ptr [rsi+0]
|
||||
mov ebx, dword ptr [rsi+4]
|
||||
mov ecx, dword ptr [rsi+8]
|
||||
mov edx, dword ptr [rsi+12]
|
||||
|
||||
movzx ebp, byte ptr [rdi+0]
|
||||
xor eax, dword ptr [r9+rbp*4]
|
||||
movzx ebp, byte ptr [rdi+1]
|
||||
xor ebx, dword ptr [r9+rbp*4+1024]
|
||||
movzx ebp, byte ptr [rdi+2]
|
||||
xor ecx, dword ptr [r9+rbp*4+2048]
|
||||
movzx ebp, byte ptr [rdi+3]
|
||||
xor edx, dword ptr [r9+rbp*4+3072]
|
||||
|
||||
movzx ebp, byte ptr [rdi+4]
|
||||
xor ebx, dword ptr [r9+rbp*4]
|
||||
movzx ebp, byte ptr [rdi+5]
|
||||
xor ecx, dword ptr [r9+rbp*4+1024]
|
||||
movzx ebp, byte ptr [rdi+6]
|
||||
xor edx, dword ptr [r9+rbp*4+2048]
|
||||
movzx ebp, byte ptr [rdi+7]
|
||||
xor eax, dword ptr [r9+rbp*4+3072]
|
||||
|
||||
movzx ebp, byte ptr [rdi+8]
|
||||
xor ecx, dword ptr [r9+rbp*4]
|
||||
movzx ebp, byte ptr [rdi+9]
|
||||
xor edx, dword ptr [r9+rbp*4+1024]
|
||||
movzx ebp, byte ptr [rdi+10]
|
||||
xor eax, dword ptr [r9+rbp*4+2048]
|
||||
movzx ebp, byte ptr [rdi+11]
|
||||
xor ebx, dword ptr [r9+rbp*4+3072]
|
||||
|
||||
movzx ebp, byte ptr [rdi+12]
|
||||
xor edx, dword ptr [r9+rbp*4]
|
||||
movzx ebp, byte ptr [rdi+13]
|
||||
xor eax, dword ptr [r9+rbp*4+1024]
|
||||
movzx ebp, byte ptr [rdi+14]
|
||||
xor ebx, dword ptr [r9+rbp*4+2048]
|
||||
movzx ebp, byte ptr [rdi+15]
|
||||
xor ecx, dword ptr [r9+rbp*4+3072]
|
||||
|
||||
mov dword ptr [rdi+0], eax
|
||||
mov dword ptr [rdi+4], ebx
|
||||
mov dword ptr [rdi+8], ecx
|
||||
mov dword ptr [rdi+12], edx
|
||||
|
||||
ret
|
||||
|
||||
soft_aes_end:
|
||||
|
||||
mov rax, [rsp+168]
|
||||
mov rbx, [rsp+176]
|
||||
mov rcx, [rsp+16]
|
||||
mov rdx, [rsp+184]
|
||||
mov rsi, [rsp+192]
|
||||
mov rdi, [rsp+200]
|
||||
mov rbp, [rsp+208]
|
||||
mov r8, [rsp+216]
|
||||
mov r9, [rsp+224]
|
||||
|
||||
movapd xmm0, xmmword ptr [rsp+40]
|
||||
movapd xmm1, xmmword ptr [rsp+56]
|
||||
movapd xmm2, xmmword ptr [rsp+72]
|
||||
movapd xmm3, xmmword ptr [rsp+88]
|
||||
|
||||
movapd xmmword ptr [rcx+0], xmm0
|
||||
movapd xmmword ptr [rcx+16], xmm1
|
||||
movapd xmmword ptr [rcx+32], xmm2
|
||||
movapd xmmword ptr [rcx+48], xmm3
|
||||
16
src/crypto/randomx/asm/program_read_dataset_v2.inc
Normal file
16
src/crypto/randomx/asm/program_read_dataset_v2.inc
Normal file
@@ -0,0 +1,16 @@
|
||||
mov ecx, ebp ;# ecx = ma
|
||||
and ecx, RANDOMX_DATASET_BASE_MASK
|
||||
xor r8, qword ptr [rdi+rcx]
|
||||
xor rbp, rax ;# modify "ma"
|
||||
mov edx, ebp ;# edx = "ma"
|
||||
ror rbp, 32 ;# swap "ma" and "mx"
|
||||
and edx, RANDOMX_DATASET_BASE_MASK
|
||||
prefetchnta byte ptr [rdi+rdx]
|
||||
xor r9, qword ptr [rdi+rcx+8]
|
||||
xor r10, qword ptr [rdi+rcx+16]
|
||||
xor r11, qword ptr [rdi+rcx+24]
|
||||
xor r12, qword ptr [rdi+rcx+32]
|
||||
xor r13, qword ptr [rdi+rcx+40]
|
||||
xor r14, qword ptr [rdi+rcx+48]
|
||||
xor r15, qword ptr [rdi+rcx+56]
|
||||
|
||||
@@ -225,7 +225,10 @@ namespace randomx {
|
||||
}
|
||||
|
||||
static void exe_CFROUND(RANDOMX_EXE_ARGS) {
|
||||
rx_set_rounding_mode(rotr64(*ibc.isrc, static_cast<uint32_t>(ibc.imm)) % 4);
|
||||
uint64_t isrc = rotr64(*ibc.isrc, ibc.imm);
|
||||
if (!RandomX_CurrentConfig.Tweak_V2_CFROUND || ((isrc & 60) == 0)) {
|
||||
rx_set_rounding_mode(isrc % 4);
|
||||
}
|
||||
}
|
||||
|
||||
static void exe_ISTORE(RANDOMX_EXE_ARGS) {
|
||||
|
||||
@@ -111,6 +111,10 @@ namespace randomx {
|
||||
#define RANDOMX_HAVE_COMPILER 1
|
||||
class JitCompilerA64;
|
||||
using JitCompiler = JitCompilerA64;
|
||||
#elif defined(__riscv) && defined(__riscv_xlen) && (__riscv_xlen == 64)
|
||||
#define RANDOMX_HAVE_COMPILER 1
|
||||
class JitCompilerRV64;
|
||||
using JitCompiler = JitCompilerRV64;
|
||||
#else
|
||||
#define RANDOMX_HAVE_COMPILER 0
|
||||
class JitCompilerFallback;
|
||||
|
||||
@@ -41,7 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define RANDOMX_DATASET_MAX_SIZE 2181038080
|
||||
|
||||
// Increase it if some configs use larger programs
|
||||
#define RANDOMX_PROGRAM_MAX_SIZE 280
|
||||
#define RANDOMX_PROGRAM_MAX_SIZE 384
|
||||
|
||||
// Increase it if some configs use larger scratchpad
|
||||
#define RANDOMX_SCRATCHPAD_L3_MAX_SIZE 2097152
|
||||
|
||||
@@ -174,7 +174,7 @@ FORCE_INLINE void rx_set_rounding_mode(uint32_t mode) {
|
||||
_mm_setcsr(rx_mxcsr_default | (mode << 13));
|
||||
}
|
||||
|
||||
#elif defined(__PPC64__) && defined(__ALTIVEC__) && defined(__VSX__) //sadly only POWER7 and newer will be able to use SIMD acceleration. Earlier processors cant use doubles or 64 bit integers with SIMD
|
||||
#elif defined(__PPC64__) && defined(__ALTIVEC__) && defined(__VSX__) //sadly only POWER7 and newer will be able to use SIMD acceleration. Earlier processors can't use doubles or 64 bit integers with SIMD
|
||||
#include <cstdint>
|
||||
#include <stdexcept>
|
||||
#include <cstdlib>
|
||||
|
||||
@@ -32,6 +32,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#include "crypto/randomx/jit_compiler_x86.hpp"
|
||||
#elif defined(__aarch64__)
|
||||
#include "crypto/randomx/jit_compiler_a64.hpp"
|
||||
#elif defined(__riscv) && defined(__riscv_xlen) && (__riscv_xlen == 64)
|
||||
#include "crypto/randomx/jit_compiler_rv64.hpp"
|
||||
#else
|
||||
#include "crypto/randomx/jit_compiler_fallback.hpp"
|
||||
#endif
|
||||
|
||||
@@ -67,7 +67,6 @@ constexpr uint32_t LDR_LITERAL = 0x58000000;
|
||||
constexpr uint32_t ROR = 0x9AC02C00;
|
||||
constexpr uint32_t ROR_IMM = 0x93C00000;
|
||||
constexpr uint32_t MOV_REG = 0xAA0003E0;
|
||||
constexpr uint32_t MOV_VREG_EL = 0x6E080400;
|
||||
constexpr uint32_t FADD = 0x4E60D400;
|
||||
constexpr uint32_t FSUB = 0x4EE0D400;
|
||||
constexpr uint32_t FEOR = 0x6E201C00;
|
||||
@@ -102,7 +101,7 @@ static size_t CalcDatasetItemSize()
|
||||
((uint8_t*)randomx_calc_dataset_item_aarch64_end - (uint8_t*)randomx_calc_dataset_item_aarch64_store_result);
|
||||
}
|
||||
|
||||
constexpr uint32_t IntRegMap[8] = { 4, 5, 6, 7, 12, 13, 14, 15 };
|
||||
constexpr uint8_t IntRegMap[8] = { 4, 5, 6, 7, 12, 13, 14, 15 };
|
||||
|
||||
JitCompilerA64::JitCompilerA64(bool hugePagesEnable, bool) :
|
||||
hugePages(hugePagesJIT && hugePagesEnable),
|
||||
@@ -115,7 +114,7 @@ JitCompilerA64::~JitCompilerA64()
|
||||
freePagedMemory(code, allocatedSize);
|
||||
}
|
||||
|
||||
void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& config, uint32_t)
|
||||
void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& config, uint32_t flags)
|
||||
{
|
||||
if (!allocatedSize) {
|
||||
allocate(CodeSize);
|
||||
@@ -126,13 +125,16 @@ void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& con
|
||||
}
|
||||
#endif
|
||||
|
||||
vm_flags = flags;
|
||||
|
||||
uint32_t codePos = MainLoopBegin + 4;
|
||||
|
||||
uint32_t mask = ((RandomX_CurrentConfig.Log2_ScratchpadL3 - 7) << 10);
|
||||
// and w16, w10, ScratchpadL3Mask64
|
||||
emit32(0x121A0000 | 16 | (10 << 5) | ((RandomX_CurrentConfig.Log2_ScratchpadL3 - 7) << 10), code, codePos);
|
||||
emit32(0x121A0000 | 16 | (10 << 5) | mask, code, codePos);
|
||||
|
||||
// and w17, w20, ScratchpadL3Mask64
|
||||
emit32(0x121A0000 | 17 | (20 << 5) | ((RandomX_CurrentConfig.Log2_ScratchpadL3 - 7) << 10), code, codePos);
|
||||
emit32(0x121A0000 | 17 | (20 << 5) | mask, code, codePos);
|
||||
|
||||
codePos = PrologueSize;
|
||||
literalPos = ImulRcpLiteralsEnd;
|
||||
@@ -155,19 +157,52 @@ void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& con
|
||||
const uint32_t offset = (((uint8_t*)randomx_program_aarch64_vm_instructions_end) - ((uint8_t*)randomx_program_aarch64)) - codePos;
|
||||
emit32(ARMV8A::B | (offset / 4), code, codePos);
|
||||
|
||||
mask = ((RandomX_CurrentConfig.Log2_DatasetBaseSize - 7) << 10);
|
||||
// and w20, w20, CacheLineAlignMask
|
||||
codePos = (((uint8_t*)randomx_program_aarch64_cacheline_align_mask1) - ((uint8_t*)randomx_program_aarch64));
|
||||
emit32(0x121A0000 | 20 | (20 << 5) | ((RandomX_CurrentConfig.Log2_DatasetBaseSize - 7) << 10), code, codePos);
|
||||
emit32(0x121A0000 | 20 | (20 << 5) | mask, code, codePos);
|
||||
|
||||
// and w10, w10, CacheLineAlignMask
|
||||
codePos = (((uint8_t*)randomx_program_aarch64_cacheline_align_mask2) - ((uint8_t*)randomx_program_aarch64));
|
||||
emit32(0x121A0000 | 10 | (10 << 5) | ((RandomX_CurrentConfig.Log2_DatasetBaseSize - 7) << 10), code, codePos);
|
||||
emit32(0x121A0000 | 10 | (10 << 5) | mask, code, codePos);
|
||||
|
||||
// Update spMix1
|
||||
// eor x10, config.readReg0, config.readReg1
|
||||
codePos = ((uint8_t*)randomx_program_aarch64_update_spMix1) - ((uint8_t*)randomx_program_aarch64);
|
||||
emit32(ARMV8A::EOR | 10 | (IntRegMap[config.readReg0] << 5) | (IntRegMap[config.readReg1] << 16), code, codePos);
|
||||
|
||||
codePos = ((uint8_t*)randomx_program_aarch64_v2_FE_mix) - ((uint8_t*)randomx_program_aarch64);
|
||||
|
||||
// Enable RandomX v2 AES tweak
|
||||
if (RandomX_CurrentConfig.Tweak_V2_AES) {
|
||||
if (flags & RANDOMX_FLAG_HARD_AES) {
|
||||
// Disable the jump to RandomX v1 FE mix code by writing "movi v28.4s, 0" instruction
|
||||
emit32(0x4F00041C, code, codePos);
|
||||
}
|
||||
else {
|
||||
// Jump to RandomX v2 FE mix soft AES code by writing "b randomx_program_aarch64_v2_FE_mix_soft_aes" instruction
|
||||
const uint32_t offset = (uint8_t*)randomx_program_aarch64_v2_FE_mix_soft_aes - (uint8_t*)randomx_program_aarch64_v2_FE_mix;
|
||||
emit32(ARMV8A::B | (offset / 4), code, codePos);
|
||||
}
|
||||
}
|
||||
else {
|
||||
// Restore the jump to RandomX v1 FE mix code
|
||||
const uint32_t offset = (uint8_t*)randomx_program_aarch64_v1_FE_mix - (uint8_t*)randomx_program_aarch64_v2_FE_mix;
|
||||
emit32(ARMV8A::B | (offset / 4), code, codePos);
|
||||
}
|
||||
|
||||
// Apply v2 prefetch tweak
|
||||
if (RandomX_CurrentConfig.Tweak_V2_PREFETCH) {
|
||||
uint32_t dst = (((uint8_t*)randomx_program_aarch64_vm_instructions_end) - ((uint8_t*)randomx_program_aarch64));
|
||||
uint32_t src = (((uint8_t*)randomx_program_aarch64_vm_instructions_end_v2) - ((uint8_t*)randomx_program_aarch64));
|
||||
memcpy(code + dst, code + src, 16);
|
||||
}
|
||||
else {
|
||||
uint32_t dst = (((uint8_t*)randomx_program_aarch64_vm_instructions_end) - ((uint8_t*)randomx_program_aarch64));
|
||||
uint32_t src = (((uint8_t*)randomx_program_aarch64_vm_instructions_end_v1) - ((uint8_t*)randomx_program_aarch64));
|
||||
memcpy(code + dst, code + src, 16);
|
||||
}
|
||||
|
||||
# ifndef XMRIG_OS_APPLE
|
||||
xmrig::VirtualMemory::flushInstructionCache(reinterpret_cast<char*>(code + MainLoopBegin), codePos - MainLoopBegin);
|
||||
# endif
|
||||
@@ -209,19 +244,51 @@ void JitCompilerA64::generateProgramLight(Program& program, ProgramConfiguration
|
||||
// eor w20, config.readReg2, config.readReg3
|
||||
emit32(ARMV8A::EOR32 | 20 | (IntRegMap[config.readReg2] << 5) | (IntRegMap[config.readReg3] << 16), code, codePos);
|
||||
|
||||
// Apply v2 prefetch tweak
|
||||
if (RandomX_CurrentConfig.Tweak_V2_PREFETCH) {
|
||||
uint32_t dst = (((uint8_t*)randomx_program_aarch64_vm_instructions_end_light_tweak) - ((uint8_t*)randomx_program_aarch64));
|
||||
uint32_t src = (((uint8_t*)randomx_program_aarch64_vm_instructions_end_light_v2) - ((uint8_t*)randomx_program_aarch64));
|
||||
memcpy(code + dst, code + src, 8);
|
||||
}
|
||||
else {
|
||||
uint32_t dst = (((uint8_t*)randomx_program_aarch64_vm_instructions_end_light_tweak) - ((uint8_t*)randomx_program_aarch64));
|
||||
uint32_t src = (((uint8_t*)randomx_program_aarch64_vm_instructions_end_light_v1) - ((uint8_t*)randomx_program_aarch64));
|
||||
memcpy(code + dst, code + src, 8);
|
||||
}
|
||||
|
||||
// Jump back to the main loop
|
||||
const uint32_t offset = (((uint8_t*)randomx_program_aarch64_vm_instructions_end_light) - ((uint8_t*)randomx_program_aarch64)) - codePos;
|
||||
emit32(ARMV8A::B | (offset / 4), code, codePos);
|
||||
|
||||
// and w2, w9, CacheLineAlignMask
|
||||
// and w2, w2, CacheLineAlignMask
|
||||
codePos = (((uint8_t*)randomx_program_aarch64_light_cacheline_align_mask) - ((uint8_t*)randomx_program_aarch64));
|
||||
emit32(0x121A0000 | 2 | (9 << 5) | ((RandomX_CurrentConfig.Log2_DatasetBaseSize - 7) << 10), code, codePos);
|
||||
emit32(0x121A0000 | 2 | (2 << 5) | ((RandomX_CurrentConfig.Log2_DatasetBaseSize - 7) << 10), code, codePos);
|
||||
|
||||
// Update spMix1
|
||||
// eor x10, config.readReg0, config.readReg1
|
||||
codePos = ((uint8_t*)randomx_program_aarch64_update_spMix1) - ((uint8_t*)randomx_program_aarch64);
|
||||
emit32(ARMV8A::EOR | 10 | (IntRegMap[config.readReg0] << 5) | (IntRegMap[config.readReg1] << 16), code, codePos);
|
||||
|
||||
codePos = ((uint8_t*)randomx_program_aarch64_v2_FE_mix) - ((uint8_t*)randomx_program_aarch64);
|
||||
|
||||
// Enable RandomX v2 AES tweak
|
||||
if (RandomX_CurrentConfig.Tweak_V2_AES) {
|
||||
if (vm_flags & RANDOMX_FLAG_HARD_AES) {
|
||||
// Disable the jump to RandomX v1 FE mix code by writing "movi v28.4s, 0" instruction
|
||||
emit32(0x4F00041C, code, codePos);
|
||||
}
|
||||
else {
|
||||
// Jump to RandomX v2 FE mix soft AES code by writing "b randomx_program_aarch64_v2_FE_mix_soft_aes" instruction
|
||||
const uint32_t offset = (uint8_t*)randomx_program_aarch64_v2_FE_mix_soft_aes - (uint8_t*)randomx_program_aarch64_v2_FE_mix;
|
||||
emit32(ARMV8A::B | (offset / 4), code, codePos);
|
||||
}
|
||||
}
|
||||
else {
|
||||
// Restore the jump to RandomX v1 FE mix code
|
||||
const uint32_t offset = (uint8_t*)randomx_program_aarch64_v1_FE_mix - (uint8_t*)randomx_program_aarch64_v2_FE_mix;
|
||||
emit32(ARMV8A::B | (offset / 4), code, codePos);
|
||||
}
|
||||
|
||||
// Apply dataset offset
|
||||
codePos = ((uint8_t*)randomx_program_aarch64_light_dataset_offset) - ((uint8_t*)randomx_program_aarch64);
|
||||
|
||||
@@ -497,9 +564,12 @@ void JitCompilerA64::emitMemLoad(uint32_t dst, uint32_t src, Instruction& instr,
|
||||
if (src != dst)
|
||||
{
|
||||
imm &= instr.getModMem() ? (RandomX_CurrentConfig.ScratchpadL1_Size - 1) : (RandomX_CurrentConfig.ScratchpadL2_Size - 1);
|
||||
emitAddImmediate(tmp_reg, src, imm, code, k);
|
||||
uint32_t t = 0x927d0000 | tmp_reg | (tmp_reg << 5);
|
||||
if (imm)
|
||||
emitAddImmediate(tmp_reg, src, imm, code, k);
|
||||
else
|
||||
t = 0x927d0000 | tmp_reg | (src << 5);
|
||||
|
||||
constexpr uint32_t t = 0x927d0000 | tmp_reg | (tmp_reg << 5);
|
||||
const uint32_t andInstrL1 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL1 - 4) << 10);
|
||||
const uint32_t andInstrL2 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL2 - 4) << 10);
|
||||
|
||||
@@ -511,10 +581,18 @@ void JitCompilerA64::emitMemLoad(uint32_t dst, uint32_t src, Instruction& instr,
|
||||
else
|
||||
{
|
||||
imm = (imm & ScratchpadL3Mask) >> 3;
|
||||
emitMovImmediate(tmp_reg, imm, code, k);
|
||||
if (imm)
|
||||
{
|
||||
emitMovImmediate(tmp_reg, imm, code, k);
|
||||
|
||||
// ldr tmp_reg, [x2, tmp_reg, lsl 3]
|
||||
emit32(0xf8607840 | tmp_reg | (tmp_reg << 16), code, k);
|
||||
// ldr tmp_reg, [x2, tmp_reg, lsl 3]
|
||||
emit32(0xf8607840 | tmp_reg | (tmp_reg << 16), code, k);
|
||||
}
|
||||
else
|
||||
{
|
||||
// ldr tmp_reg, [x2]
|
||||
emit32(0xf9400040 | tmp_reg, code, k);
|
||||
}
|
||||
}
|
||||
|
||||
codePos = k;
|
||||
@@ -529,25 +607,22 @@ void JitCompilerA64::emitMemLoadFP(uint32_t src, Instruction& instr, uint8_t* co
|
||||
constexpr uint32_t tmp_reg = 19;
|
||||
|
||||
imm &= instr.getModMem() ? (RandomX_CurrentConfig.ScratchpadL1_Size - 1) : (RandomX_CurrentConfig.ScratchpadL2_Size - 1);
|
||||
emitAddImmediate(tmp_reg, src, imm, code, k);
|
||||
uint32_t t = 0x927d0000 | tmp_reg | (tmp_reg << 5);
|
||||
if (imm)
|
||||
emitAddImmediate(tmp_reg, src, imm, code, k);
|
||||
else
|
||||
t = 0x927d0000 | tmp_reg | (src << 5);
|
||||
|
||||
constexpr uint32_t t = 0x927d0000 | tmp_reg | (tmp_reg << 5);
|
||||
const uint32_t andInstrL1 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL1 - 4) << 10);
|
||||
const uint32_t andInstrL2 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL2 - 4) << 10);
|
||||
|
||||
emit32(instr.getModMem() ? andInstrL1 : andInstrL2, code, k);
|
||||
|
||||
// add tmp_reg, x2, tmp_reg
|
||||
emit32(ARMV8A::ADD | tmp_reg | (2 << 5) | (tmp_reg << 16), code, k);
|
||||
// ldr tmp_reg_fp, [x2, tmp_reg]
|
||||
emit32(0x3ce06800 | tmp_reg_fp | (2 << 5) | (tmp_reg << 16), code, k);
|
||||
|
||||
// ldpsw tmp_reg, tmp_reg + 1, [tmp_reg]
|
||||
emit32(0x69400000 | tmp_reg | (tmp_reg << 5) | ((tmp_reg + 1) << 10), code, k);
|
||||
|
||||
// ins tmp_reg_fp.d[0], tmp_reg
|
||||
emit32(0x4E081C00 | tmp_reg_fp | (tmp_reg << 5), code, k);
|
||||
|
||||
// ins tmp_reg_fp.d[1], tmp_reg + 1
|
||||
emit32(0x4E181C00 | tmp_reg_fp | ((tmp_reg + 1) << 5), code, k);
|
||||
// sxtl.2d tmp_reg_fp, tmp_reg_fp
|
||||
emit32(0x0f20a400 | tmp_reg_fp | (tmp_reg_fp << 5), code, k);
|
||||
|
||||
// scvtf tmp_reg_fp.2d, tmp_reg_fp.2d
|
||||
emit32(0x4E61D800 | tmp_reg_fp | (tmp_reg_fp << 5), code, k);
|
||||
@@ -835,7 +910,8 @@ void JitCompilerA64::h_IROR_R(Instruction& instr, uint32_t& codePos)
|
||||
else
|
||||
{
|
||||
// ror dst, dst, imm
|
||||
emit32(ARMV8A::ROR_IMM | dst | (dst << 5) | ((instr.getImm32() & 63) << 10) | (dst << 16), code, codePos);
|
||||
if ((instr.getImm32() & 63))
|
||||
emit32(ARMV8A::ROR_IMM | dst | (dst << 5) | ((instr.getImm32() & 63) << 10) | (dst << 16), code, codePos);
|
||||
}
|
||||
|
||||
reg_changed_offset[instr.dst] = codePos;
|
||||
@@ -861,7 +937,8 @@ void JitCompilerA64::h_IROL_R(Instruction& instr, uint32_t& codePos)
|
||||
else
|
||||
{
|
||||
// ror dst, dst, imm
|
||||
emit32(ARMV8A::ROR_IMM | dst | (dst << 5) | ((-instr.getImm32() & 63) << 10) | (dst << 16), code, k);
|
||||
if ((instr.getImm32() & 63))
|
||||
emit32(ARMV8A::ROR_IMM | dst | (dst << 5) | ((-instr.getImm32() & 63) << 10) | (dst << 16), code, k);
|
||||
}
|
||||
|
||||
reg_changed_offset[instr.dst] = k;
|
||||
@@ -894,13 +971,8 @@ void JitCompilerA64::h_FSWAP_R(Instruction& instr, uint32_t& codePos)
|
||||
|
||||
const uint32_t dst = instr.dst + 16;
|
||||
|
||||
constexpr uint32_t tmp_reg_fp = 28;
|
||||
constexpr uint32_t src_index1 = 1 << 14;
|
||||
constexpr uint32_t dst_index1 = 1 << 20;
|
||||
|
||||
emit32(ARMV8A::MOV_VREG_EL | tmp_reg_fp | (dst << 5) | src_index1, code, k);
|
||||
emit32(ARMV8A::MOV_VREG_EL | dst | (dst << 5) | dst_index1, code, k);
|
||||
emit32(ARMV8A::MOV_VREG_EL | dst | (tmp_reg_fp << 5), code, k);
|
||||
// ext dst.16b, dst.16b, dst.16b, #0x8
|
||||
emit32(0x6e004000 | dst | (dst << 5) | (dst << 16), code, k);
|
||||
|
||||
codePos = k;
|
||||
}
|
||||
@@ -1032,6 +1104,14 @@ void JitCompilerA64::h_CFROUND(Instruction& instr, uint32_t& codePos)
|
||||
// ror tmp_reg, src, imm
|
||||
emit32(ARMV8A::ROR_IMM | tmp_reg | (src << 5) | ((instr.getImm32() & 63) << 10) | (src << 16), code, k);
|
||||
|
||||
if (RandomX_CurrentConfig.Tweak_V2_CFROUND) {
|
||||
// tst tmp_reg, 60
|
||||
emit32(0xF27E0E9F, code, k);
|
||||
|
||||
// bne next
|
||||
emit32(0x54000081, code, k);
|
||||
}
|
||||
|
||||
// bfi fpcr_tmp_reg, tmp_reg, 40, 2
|
||||
emit32(0xB3580400 | fpcr_tmp_reg | (tmp_reg << 5), code, k);
|
||||
|
||||
@@ -1059,9 +1139,12 @@ void JitCompilerA64::h_ISTORE(Instruction& instr, uint32_t& codePos)
|
||||
else
|
||||
imm &= RandomX_CurrentConfig.ScratchpadL3_Size - 1;
|
||||
|
||||
emitAddImmediate(tmp_reg, dst, imm, code, k);
|
||||
uint32_t t = 0x927d0000 | tmp_reg | (tmp_reg << 5);
|
||||
if (imm)
|
||||
emitAddImmediate(tmp_reg, dst, imm, code, k);
|
||||
else
|
||||
t = 0x927d0000 | tmp_reg | (dst << 5);
|
||||
|
||||
constexpr uint32_t t = 0x927d0000 | tmp_reg | (tmp_reg << 5);
|
||||
const uint32_t andInstrL1 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL1 - 4) << 10);
|
||||
const uint32_t andInstrL2 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL2 - 4) << 10);
|
||||
const uint32_t andInstrL3 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL3 - 4) << 10);
|
||||
|
||||
@@ -83,6 +83,7 @@ namespace randomx {
|
||||
uint32_t literalPos;
|
||||
uint32_t num32bitLiterals = 0;
|
||||
size_t allocatedSize = 0;
|
||||
uint32_t vm_flags = 0;
|
||||
|
||||
void allocate(size_t size);
|
||||
|
||||
|
||||
@@ -31,7 +31,7 @@
|
||||
#define DECL(x) x
|
||||
#endif
|
||||
|
||||
.arch armv8-a
|
||||
.arch armv8-a+crypto
|
||||
.text
|
||||
.global DECL(randomx_program_aarch64)
|
||||
.global DECL(randomx_program_aarch64_main_loop)
|
||||
@@ -41,9 +41,17 @@
|
||||
.global DECL(randomx_program_aarch64_cacheline_align_mask1)
|
||||
.global DECL(randomx_program_aarch64_cacheline_align_mask2)
|
||||
.global DECL(randomx_program_aarch64_update_spMix1)
|
||||
.global DECL(randomx_program_aarch64_v2_FE_mix)
|
||||
.global DECL(randomx_program_aarch64_v1_FE_mix)
|
||||
.global DECL(randomx_program_aarch64_v2_FE_mix_soft_aes)
|
||||
.global DECL(randomx_program_aarch64_vm_instructions_end_light)
|
||||
.global DECL(randomx_program_aarch64_vm_instructions_end_light_tweak)
|
||||
.global DECL(randomx_program_aarch64_light_cacheline_align_mask)
|
||||
.global DECL(randomx_program_aarch64_light_dataset_offset)
|
||||
.global DECL(randomx_program_aarch64_vm_instructions_end_v1)
|
||||
.global DECL(randomx_program_aarch64_vm_instructions_end_v2)
|
||||
.global DECL(randomx_program_aarch64_vm_instructions_end_light_v1)
|
||||
.global DECL(randomx_program_aarch64_vm_instructions_end_light_v2)
|
||||
.global DECL(randomx_init_dataset_aarch64)
|
||||
.global DECL(randomx_init_dataset_aarch64_end)
|
||||
.global DECL(randomx_calc_dataset_item_aarch64)
|
||||
@@ -100,9 +108,9 @@
|
||||
# v26 -> "a2"
|
||||
# v27 -> "a3"
|
||||
# v28 -> temporary
|
||||
# v29 -> E 'and' mask = 0x00ffffffffffffff00ffffffffffffff
|
||||
# v30 -> E 'or' mask = 0x3*00000000******3*00000000******
|
||||
# v31 -> scale mask = 0x81f000000000000081f0000000000000
|
||||
# v29 -> E 'and' mask = 0x00ffffffffffffff'00ffffffffffffff
|
||||
# v30 -> E 'or' mask = 0x3*00000000******'3*00000000******
|
||||
# v31 -> scale mask = 0x80f0000000000000'80f0000000000000
|
||||
|
||||
.balign 4
|
||||
DECL(randomx_program_aarch64):
|
||||
@@ -142,17 +150,14 @@ DECL(randomx_program_aarch64):
|
||||
ldp q26, q27, [x0, 224]
|
||||
|
||||
# Load E 'and' mask
|
||||
mov x16, 0x00FFFFFFFFFFFFFF
|
||||
ins v29.d[0], x16
|
||||
ins v29.d[1], x16
|
||||
movi v29.2d, #0x00FFFFFFFFFFFFFF
|
||||
|
||||
# Load E 'or' mask (stored in reg.f[0])
|
||||
ldr q30, [x0, 64]
|
||||
|
||||
# Load scale mask
|
||||
mov x16, 0x80f0000000000000
|
||||
ins v31.d[0], x16
|
||||
ins v31.d[1], x16
|
||||
dup v31.2d, x16
|
||||
|
||||
# Read fpcr
|
||||
mrs x8, fpcr
|
||||
@@ -162,35 +167,22 @@ DECL(randomx_program_aarch64):
|
||||
str x0, [sp, -16]!
|
||||
|
||||
# Read literals
|
||||
ldr x0, literal_x0
|
||||
ldr x11, literal_x11
|
||||
ldr x21, literal_x21
|
||||
ldr x22, literal_x22
|
||||
ldr x23, literal_x23
|
||||
ldr x24, literal_x24
|
||||
ldr x25, literal_x25
|
||||
ldr x26, literal_x26
|
||||
ldr x27, literal_x27
|
||||
ldr x28, literal_x28
|
||||
ldr x29, literal_x29
|
||||
ldr x30, literal_x30
|
||||
adr x30, literal_v0
|
||||
ldp q0, q1, [x30]
|
||||
ldp q2, q3, [x30, 32]
|
||||
ldp q4, q5, [x30, 64]
|
||||
ldp q6, q7, [x30, 96]
|
||||
ldp q8, q9, [x30, 128]
|
||||
ldp q10, q11, [x30, 160]
|
||||
ldp q12, q13, [x30, 192]
|
||||
ldp q14, q15, [x30, 224]
|
||||
|
||||
ldr q0, literal_v0
|
||||
ldr q1, literal_v1
|
||||
ldr q2, literal_v2
|
||||
ldr q3, literal_v3
|
||||
ldr q4, literal_v4
|
||||
ldr q5, literal_v5
|
||||
ldr q6, literal_v6
|
||||
ldr q7, literal_v7
|
||||
ldr q8, literal_v8
|
||||
ldr q9, literal_v9
|
||||
ldr q10, literal_v10
|
||||
ldr q11, literal_v11
|
||||
ldr q12, literal_v12
|
||||
ldr q13, literal_v13
|
||||
ldr q14, literal_v14
|
||||
ldr q15, literal_v15
|
||||
ldp x0, x11, [x30, -96] // literal_x0
|
||||
ldp x21, x22, [x30, -80] // literal_x21
|
||||
ldp x23, x24, [x30, -64] // literal_x23
|
||||
ldp x25, x26, [x30, -48] // literal_x25
|
||||
ldp x27, x28, [x30, -32] // literal_x27
|
||||
ldp x29, x30, [x30, -16] // literal_x29
|
||||
|
||||
DECL(randomx_program_aarch64_main_loop):
|
||||
# spAddr0 = spMix1 & ScratchpadL3Mask64;
|
||||
@@ -221,40 +213,31 @@ DECL(randomx_program_aarch64_main_loop):
|
||||
eor x15, x15, x19
|
||||
|
||||
# Load group F registers (spAddr1)
|
||||
ldpsw x20, x19, [x17]
|
||||
ins v16.d[0], x20
|
||||
ins v16.d[1], x19
|
||||
ldpsw x20, x19, [x17, 8]
|
||||
ins v17.d[0], x20
|
||||
ins v17.d[1], x19
|
||||
ldpsw x20, x19, [x17, 16]
|
||||
ins v18.d[0], x20
|
||||
ins v18.d[1], x19
|
||||
ldpsw x20, x19, [x17, 24]
|
||||
ins v19.d[0], x20
|
||||
ins v19.d[1], x19
|
||||
ldr q17, [x17]
|
||||
sxtl v16.2d, v17.2s
|
||||
scvtf v16.2d, v16.2d
|
||||
sxtl2 v17.2d, v17.4s
|
||||
scvtf v17.2d, v17.2d
|
||||
|
||||
ldr q19, [x17, 16]
|
||||
sxtl v18.2d, v19.2s
|
||||
scvtf v18.2d, v18.2d
|
||||
sxtl2 v19.2d, v19.4s
|
||||
scvtf v19.2d, v19.2d
|
||||
|
||||
# Load group E registers (spAddr1)
|
||||
ldpsw x20, x19, [x17, 32]
|
||||
ins v20.d[0], x20
|
||||
ins v20.d[1], x19
|
||||
ldpsw x20, x19, [x17, 40]
|
||||
ins v21.d[0], x20
|
||||
ins v21.d[1], x19
|
||||
ldpsw x20, x19, [x17, 48]
|
||||
ins v22.d[0], x20
|
||||
ins v22.d[1], x19
|
||||
ldpsw x20, x19, [x17, 56]
|
||||
ins v23.d[0], x20
|
||||
ins v23.d[1], x19
|
||||
ldr q21, [x17, 32]
|
||||
sxtl v20.2d, v21.2s
|
||||
scvtf v20.2d, v20.2d
|
||||
sxtl2 v21.2d, v21.4s
|
||||
scvtf v21.2d, v21.2d
|
||||
|
||||
ldr q23, [x17, 48]
|
||||
sxtl v22.2d, v23.2s
|
||||
scvtf v22.2d, v22.2d
|
||||
sxtl2 v23.2d, v23.4s
|
||||
scvtf v23.2d, v23.2d
|
||||
|
||||
and v20.16b, v20.16b, v29.16b
|
||||
and v21.16b, v21.16b, v29.16b
|
||||
and v22.16b, v22.16b, v29.16b
|
||||
@@ -267,8 +250,8 @@ DECL(randomx_program_aarch64_main_loop):
|
||||
# Execute VM instructions
|
||||
DECL(randomx_program_aarch64_vm_instructions):
|
||||
|
||||
# 16 KB buffer for generated instructions
|
||||
.fill 4096,4,0
|
||||
# 24 KB buffer for generated instructions
|
||||
.fill 6144,4,0
|
||||
|
||||
literal_x0: .fill 1,8,0
|
||||
literal_x11: .fill 1,8,0
|
||||
@@ -311,6 +294,10 @@ DECL(randomx_program_aarch64_vm_instructions_end):
|
||||
|
||||
# Calculate dataset pointer for dataset prefetch
|
||||
mov w20, w9
|
||||
|
||||
# mx <-> ma
|
||||
ror x9, x9, 32
|
||||
|
||||
DECL(randomx_program_aarch64_cacheline_align_mask1):
|
||||
# Actual mask will be inserted by JIT compiler
|
||||
and x20, x20, 1
|
||||
@@ -319,9 +306,6 @@ DECL(randomx_program_aarch64_cacheline_align_mask1):
|
||||
# Prefetch dataset data
|
||||
prfm pldl2strm, [x20]
|
||||
|
||||
# mx <-> ma
|
||||
ror x9, x9, 32
|
||||
|
||||
DECL(randomx_program_aarch64_cacheline_align_mask2):
|
||||
# Actual mask will be inserted by JIT compiler
|
||||
and x10, x10, 1
|
||||
@@ -352,12 +336,93 @@ DECL(randomx_program_aarch64_update_spMix1):
|
||||
stp x12, x13, [x17, 32]
|
||||
stp x14, x15, [x17, 48]
|
||||
|
||||
# xor group F and group E registers
|
||||
# RandomX v2 AES tweak (mix group F and group E registers using AES)
|
||||
DECL(randomx_program_aarch64_v2_FE_mix):
|
||||
|
||||
# Jump to v1 FE mix code if we're running RandomX v1
|
||||
# JIT compiler will write a "movi v28.4s, 0" (set v28 to all 0) here if we're running RandomX v2
|
||||
# Or, JIT compiler will write a "b randomx_program_aarch64_v2_FE_mix_soft_aes" if we're running RandomX v2 with soft AES
|
||||
b DECL(randomx_program_aarch64_v1_FE_mix)
|
||||
|
||||
# f0 = aesenc(f0, e0), f1 = aesdec(f1, e0), f2 = aesenc(f2, e0), f3 = aesdec(f3, e0)
|
||||
|
||||
aese v16.16b, v28.16b
|
||||
aesd v17.16b, v28.16b
|
||||
aese v18.16b, v28.16b
|
||||
aesd v19.16b, v28.16b
|
||||
|
||||
aesmc v16.16b, v16.16b
|
||||
aesimc v17.16b, v17.16b
|
||||
aesmc v18.16b, v18.16b
|
||||
aesimc v19.16b, v19.16b
|
||||
|
||||
eor v16.16b, v16.16b, v20.16b
|
||||
eor v17.16b, v17.16b, v20.16b
|
||||
eor v18.16b, v18.16b, v20.16b
|
||||
eor v19.16b, v19.16b, v20.16b
|
||||
|
||||
# f0 = aesenc(f0, e1), f1 = aesdec(f1, e1), f2 = aesenc(f2, e1), f3 = aesdec(f3, e1)
|
||||
|
||||
aese v16.16b, v28.16b
|
||||
aesd v17.16b, v28.16b
|
||||
aese v18.16b, v28.16b
|
||||
aesd v19.16b, v28.16b
|
||||
|
||||
aesmc v16.16b, v16.16b
|
||||
aesimc v17.16b, v17.16b
|
||||
aesmc v18.16b, v18.16b
|
||||
aesimc v19.16b, v19.16b
|
||||
|
||||
eor v16.16b, v16.16b, v21.16b
|
||||
eor v17.16b, v17.16b, v21.16b
|
||||
eor v18.16b, v18.16b, v21.16b
|
||||
eor v19.16b, v19.16b, v21.16b
|
||||
|
||||
# f0 = aesenc(f0, e2), f1 = aesdec(f1, e2), f2 = aesenc(f2, e2), f3 = aesdec(f3, e2)
|
||||
|
||||
aese v16.16b, v28.16b
|
||||
aesd v17.16b, v28.16b
|
||||
aese v18.16b, v28.16b
|
||||
aesd v19.16b, v28.16b
|
||||
|
||||
aesmc v16.16b, v16.16b
|
||||
aesimc v17.16b, v17.16b
|
||||
aesmc v18.16b, v18.16b
|
||||
aesimc v19.16b, v19.16b
|
||||
|
||||
eor v16.16b, v16.16b, v22.16b
|
||||
eor v17.16b, v17.16b, v22.16b
|
||||
eor v18.16b, v18.16b, v22.16b
|
||||
eor v19.16b, v19.16b, v22.16b
|
||||
|
||||
# f0 = aesenc(f0, e3), f1 = aesdec(f1, e3), f2 = aesenc(f2, e3), f3 = aesdec(f3, e3)
|
||||
|
||||
aese v16.16b, v28.16b
|
||||
aesd v17.16b, v28.16b
|
||||
aese v18.16b, v28.16b
|
||||
aesd v19.16b, v28.16b
|
||||
|
||||
aesmc v16.16b, v16.16b
|
||||
aesimc v17.16b, v17.16b
|
||||
aesmc v18.16b, v18.16b
|
||||
aesimc v19.16b, v19.16b
|
||||
|
||||
eor v16.16b, v16.16b, v23.16b
|
||||
eor v17.16b, v17.16b, v23.16b
|
||||
eor v18.16b, v18.16b, v23.16b
|
||||
eor v19.16b, v19.16b, v23.16b
|
||||
|
||||
# Skip v1 FE mix code because we already did v2 FE mix
|
||||
b randomx_program_aarch64_FE_store
|
||||
|
||||
DECL(randomx_program_aarch64_v1_FE_mix):
|
||||
eor v16.16b, v16.16b, v20.16b
|
||||
eor v17.16b, v17.16b, v21.16b
|
||||
eor v18.16b, v18.16b, v22.16b
|
||||
eor v19.16b, v19.16b, v23.16b
|
||||
|
||||
randomx_program_aarch64_FE_store:
|
||||
|
||||
# Store FP registers to scratchpad (spAddr0)
|
||||
stp q16, q17, [x16, 0]
|
||||
stp q18, q19, [x16, 32]
|
||||
@@ -402,6 +467,13 @@ DECL(randomx_program_aarch64_vm_instructions_end_light):
|
||||
stp x0, x1, [sp, 64]
|
||||
stp x2, x30, [sp, 80]
|
||||
|
||||
lsr x2, x9, 32
|
||||
|
||||
DECL(randomx_program_aarch64_light_cacheline_align_mask):
|
||||
# Actual mask will be inserted by JIT compiler
|
||||
and w2, w2, 1
|
||||
|
||||
DECL(randomx_program_aarch64_vm_instructions_end_light_tweak):
|
||||
# mx ^= r[readReg2] ^ r[readReg3];
|
||||
eor x9, x9, x20
|
||||
|
||||
@@ -414,10 +486,6 @@ DECL(randomx_program_aarch64_vm_instructions_end_light):
|
||||
# x1 -> pointer to output
|
||||
mov x1, sp
|
||||
|
||||
DECL(randomx_program_aarch64_light_cacheline_align_mask):
|
||||
# Actual mask will be inserted by JIT compiler
|
||||
and w2, w9, 1
|
||||
|
||||
# x2 -> item number
|
||||
lsr x2, x2, 6
|
||||
|
||||
@@ -435,6 +503,500 @@ DECL(randomx_program_aarch64_light_dataset_offset):
|
||||
|
||||
b DECL(randomx_program_aarch64_xor_with_dataset_line)
|
||||
|
||||
DECL(randomx_program_aarch64_vm_instructions_end_v1):
|
||||
lsr x10, x9, 32
|
||||
eor x9, x9, x20
|
||||
mov w20, w9
|
||||
ror x9, x9, 32
|
||||
|
||||
DECL(randomx_program_aarch64_vm_instructions_end_v2):
|
||||
lsr x10, x9, 32
|
||||
ror x9, x9, 32
|
||||
eor x9, x9, x20
|
||||
mov w20, w9
|
||||
|
||||
DECL(randomx_program_aarch64_vm_instructions_end_light_v1):
|
||||
eor x9, x9, x20
|
||||
ror x9, x9, 32
|
||||
|
||||
DECL(randomx_program_aarch64_vm_instructions_end_light_v2):
|
||||
ror x9, x9, 32
|
||||
eor x9, x9, x20
|
||||
|
||||
DECL(randomx_program_aarch64_v2_FE_mix_soft_aes):
|
||||
sub sp, sp, 176
|
||||
|
||||
stp x0, x1, [sp]
|
||||
stp x2, x3, [sp, 16]
|
||||
stp x4, x5, [sp, 32]
|
||||
stp x6, x7, [sp, 48]
|
||||
stp x8, x9, [sp, 64]
|
||||
stp x10, x11, [sp, 80]
|
||||
stp x12, x13, [sp, 96]
|
||||
stp x14, x15, [sp, 112]
|
||||
stp x16, x30, [sp, 128]
|
||||
stp q0, q1, [sp, 144]
|
||||
|
||||
adr x19, randomx_aes_lut_enc
|
||||
adr x20, randomx_aes_lut_dec
|
||||
|
||||
# f0 = aesenc(f0, e0), f0 = aesenc(f0, e1), f0 = aesenc(f0, e2), f0 = aesenc(f0, e3)
|
||||
mov v0.16b, v16.16b
|
||||
mov v1.16b, v20.16b
|
||||
bl randomx_soft_aesenc
|
||||
mov v1.16b, v21.16b
|
||||
bl randomx_soft_aesenc
|
||||
mov v1.16b, v22.16b
|
||||
bl randomx_soft_aesenc
|
||||
mov v1.16b, v23.16b
|
||||
bl randomx_soft_aesenc
|
||||
mov v16.16b, v0.16b
|
||||
|
||||
# f1 = aesdec(f1, e0), f1 = aesdec(f1, e1), f1 = aesdec(f1, e2), f1 = aesdec(f1, e3)
|
||||
mov v0.16b, v17.16b
|
||||
mov v1.16b, v20.16b
|
||||
bl randomx_soft_aesdec
|
||||
mov v1.16b, v21.16b
|
||||
bl randomx_soft_aesdec
|
||||
mov v1.16b, v22.16b
|
||||
bl randomx_soft_aesdec
|
||||
mov v1.16b, v23.16b
|
||||
bl randomx_soft_aesdec
|
||||
mov v17.16b, v0.16b
|
||||
|
||||
# f2 = aesenc(f2, e0), f2 = aesenc(f2, e1), f2 = aesenc(f2, e2), f2 = aesenc(f2, e3)
|
||||
mov v0.16b, v18.16b
|
||||
mov v1.16b, v20.16b
|
||||
bl randomx_soft_aesenc
|
||||
mov v1.16b, v21.16b
|
||||
bl randomx_soft_aesenc
|
||||
mov v1.16b, v22.16b
|
||||
bl randomx_soft_aesenc
|
||||
mov v1.16b, v23.16b
|
||||
bl randomx_soft_aesenc
|
||||
mov v18.16b, v0.16b
|
||||
|
||||
# f3 = aesdec(f3, e0), f3 = aesdec(f3, e1), f3 = aesdec(f3, e2), f3 = aesdec(f3, e3)
|
||||
mov v0.16b, v19.16b
|
||||
mov v1.16b, v20.16b
|
||||
bl randomx_soft_aesdec
|
||||
mov v1.16b, v21.16b
|
||||
bl randomx_soft_aesdec
|
||||
mov v1.16b, v22.16b
|
||||
bl randomx_soft_aesdec
|
||||
mov v1.16b, v23.16b
|
||||
bl randomx_soft_aesdec
|
||||
mov v19.16b, v0.16b
|
||||
|
||||
ldp x0, x1, [sp]
|
||||
ldp x2, x3, [sp, 16]
|
||||
ldp x4, x5, [sp, 32]
|
||||
ldp x6, x7, [sp, 48]
|
||||
ldp x8, x9, [sp, 64]
|
||||
ldp x10, x11, [sp, 80]
|
||||
ldp x12, x13, [sp, 96]
|
||||
ldp x14, x15, [sp, 112]
|
||||
ldp x16, x30, [sp, 128]
|
||||
ldp q0, q1, [sp, 144]
|
||||
|
||||
add sp, sp, 176
|
||||
|
||||
b randomx_program_aarch64_FE_store
|
||||
|
||||
|
||||
randomx_soft_aesenc:
|
||||
umov w4, v0.b[5]
|
||||
umov w1, v0.b[10]
|
||||
umov w12, v0.b[15]
|
||||
umov w9, v0.b[9]
|
||||
umov w2, v0.b[14]
|
||||
umov w11, v0.b[3]
|
||||
umov w5, v0.b[0]
|
||||
umov w16, v0.b[4]
|
||||
add x4, x4, 256
|
||||
add x1, x1, 512
|
||||
add x12, x12, 768
|
||||
umov w3, v0.b[13]
|
||||
umov w8, v0.b[2]
|
||||
umov w7, v0.b[7]
|
||||
add x9, x9, 256
|
||||
add x2, x2, 512
|
||||
add x11, x11, 768
|
||||
ldr w10, [x19, x4, lsl 2]
|
||||
ldr w15, [x19, x5, lsl 2]
|
||||
umov w13, v0.b[8]
|
||||
ldr w14, [x19, x12, lsl 2]
|
||||
umov w6, v0.b[1]
|
||||
ldr w1, [x19, x1, lsl 2]
|
||||
eor w10, w10, w15
|
||||
ldr w2, [x19, x2, lsl 2]
|
||||
umov w5, v0.b[6]
|
||||
ldr w9, [x19, x9, lsl 2]
|
||||
umov w4, v0.b[11]
|
||||
ldr w12, [x19, x16, lsl 2]
|
||||
eor w1, w1, w14
|
||||
ldr w11, [x19, x11, lsl 2]
|
||||
eor w1, w1, w10
|
||||
add x8, x8, 512
|
||||
add x3, x3, 256
|
||||
add x7, x7, 768
|
||||
eor w9, w9, w12
|
||||
fmov s28, w1
|
||||
eor w1, w2, w11
|
||||
umov w10, v0.b[12]
|
||||
eor w1, w1, w9
|
||||
ldr w3, [x19, x3, lsl 2]
|
||||
add x6, x6, 256
|
||||
ldr w9, [x19, x13, lsl 2]
|
||||
ins v28.s[1], w1
|
||||
ldr w2, [x19, x8, lsl 2]
|
||||
add x5, x5, 512
|
||||
ldr w7, [x19, x7, lsl 2]
|
||||
add x4, x4, 768
|
||||
eor w1, w3, w9
|
||||
ldr w3, [x19, x6, lsl 2]
|
||||
eor w2, w2, w7
|
||||
ldr w6, [x19, x10, lsl 2]
|
||||
eor w2, w2, w1
|
||||
ldr w1, [x19, x5, lsl 2]
|
||||
ldr w0, [x19, x4, lsl 2]
|
||||
eor w3, w3, w6
|
||||
ins v28.s[2], w2
|
||||
eor w0, w1, w0
|
||||
eor w0, w0, w3
|
||||
ins v28.s[3], w0
|
||||
eor v0.16b, v1.16b, v28.16b
|
||||
ret
|
||||
|
||||
randomx_soft_aesdec:
|
||||
umov w1, v0.b[10]
|
||||
umov w3, v0.b[7]
|
||||
umov w12, v0.b[13]
|
||||
umov w2, v0.b[14]
|
||||
umov w9, v0.b[11]
|
||||
umov w11, v0.b[1]
|
||||
umov w4, v0.b[0]
|
||||
umov w16, v0.b[4]
|
||||
add x3, x3, 768
|
||||
add x1, x1, 512
|
||||
add x12, x12, 256
|
||||
umov w8, v0.b[5]
|
||||
umov w6, v0.b[2]
|
||||
umov w7, v0.b[15]
|
||||
add x9, x9, 768
|
||||
add x2, x2, 512
|
||||
add x11, x11, 256
|
||||
ldr w15, [x20, x3, lsl 2]
|
||||
ldr w10, [x20, x4, lsl 2]
|
||||
umov w13, v0.b[8]
|
||||
ldr w14, [x20, x12, lsl 2]
|
||||
umov w5, v0.b[9]
|
||||
ldr w1, [x20, x1, lsl 2]
|
||||
umov w3, v0.b[6]
|
||||
ldr w12, [x20, x9, lsl 2]
|
||||
umov w4, v0.b[3]
|
||||
ldr w9, [x20, x16, lsl 2]
|
||||
eor w1, w1, w15
|
||||
ldr w2, [x20, x2, lsl 2]
|
||||
eor w10, w10, w14
|
||||
ldr w11, [x20, x11, lsl 2]
|
||||
eor w1, w1, w10
|
||||
add x8, x8, 256
|
||||
add x6, x6, 512
|
||||
add x7, x7, 768
|
||||
eor w2, w2, w12
|
||||
fmov s28, w1
|
||||
eor w1, w9, w11
|
||||
eor w1, w2, w1
|
||||
umov w9, v0.b[12]
|
||||
ldr w2, [x20, x13, lsl 2]
|
||||
add x5, x5, 256
|
||||
ldr w8, [x20, x8, lsl 2]
|
||||
ins v28.s[1], w1
|
||||
ldr w6, [x20, x6, lsl 2]
|
||||
add x3, x3, 512
|
||||
ldr w7, [x20, x7, lsl 2]
|
||||
add x4, x4, 768
|
||||
eor w2, w2, w8
|
||||
ldr w1, [x20, x9, lsl 2]
|
||||
eor w6, w6, w7
|
||||
ldr w3, [x20, x3, lsl 2]
|
||||
eor w2, w2, w6
|
||||
ldr w4, [x20, x4, lsl 2]
|
||||
ldr w5, [x20, x5, lsl 2]
|
||||
ins v28.s[2], w2
|
||||
eor w0, w1, w5
|
||||
eor w1, w3, w4
|
||||
eor w0, w0, w1
|
||||
ins v28.s[3], w0
|
||||
eor v0.16b, v1.16b, v28.16b
|
||||
ret
|
||||
|
||||
randomx_aes_lut_enc:
|
||||
.word 0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6, 0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591
|
||||
.word 0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56, 0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec
|
||||
.word 0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa, 0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb
|
||||
.word 0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45, 0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b
|
||||
.word 0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c, 0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83
|
||||
.word 0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9, 0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a
|
||||
.word 0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d, 0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f
|
||||
.word 0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df, 0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea
|
||||
.word 0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34, 0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b
|
||||
.word 0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d, 0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413
|
||||
.word 0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1, 0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6
|
||||
.word 0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972, 0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85
|
||||
.word 0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed, 0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511
|
||||
.word 0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe, 0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b
|
||||
.word 0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05, 0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1
|
||||
.word 0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142, 0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf
|
||||
.word 0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3, 0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e
|
||||
.word 0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a, 0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6
|
||||
.word 0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3, 0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b
|
||||
.word 0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428, 0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad
|
||||
.word 0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14, 0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8
|
||||
.word 0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4, 0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2
|
||||
.word 0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda, 0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949
|
||||
.word 0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf, 0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810
|
||||
.word 0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c, 0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697
|
||||
.word 0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e, 0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f
|
||||
.word 0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc, 0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c
|
||||
.word 0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969, 0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27
|
||||
.word 0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122, 0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433
|
||||
.word 0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9, 0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5
|
||||
.word 0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a, 0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0
|
||||
.word 0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e, 0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c
|
||||
|
||||
.word 0x6363c6a5, 0x7c7cf884, 0x7777ee99, 0x7b7bf68d, 0xf2f2ff0d, 0x6b6bd6bd, 0x6f6fdeb1, 0xc5c59154
|
||||
.word 0x30306050, 0x01010203, 0x6767cea9, 0x2b2b567d, 0xfefee719, 0xd7d7b562, 0xabab4de6, 0x7676ec9a
|
||||
.word 0xcaca8f45, 0x82821f9d, 0xc9c98940, 0x7d7dfa87, 0xfafaef15, 0x5959b2eb, 0x47478ec9, 0xf0f0fb0b
|
||||
.word 0xadad41ec, 0xd4d4b367, 0xa2a25ffd, 0xafaf45ea, 0x9c9c23bf, 0xa4a453f7, 0x7272e496, 0xc0c09b5b
|
||||
.word 0xb7b775c2, 0xfdfde11c, 0x93933dae, 0x26264c6a, 0x36366c5a, 0x3f3f7e41, 0xf7f7f502, 0xcccc834f
|
||||
.word 0x3434685c, 0xa5a551f4, 0xe5e5d134, 0xf1f1f908, 0x7171e293, 0xd8d8ab73, 0x31316253, 0x15152a3f
|
||||
.word 0x0404080c, 0xc7c79552, 0x23234665, 0xc3c39d5e, 0x18183028, 0x969637a1, 0x05050a0f, 0x9a9a2fb5
|
||||
.word 0x07070e09, 0x12122436, 0x80801b9b, 0xe2e2df3d, 0xebebcd26, 0x27274e69, 0xb2b27fcd, 0x7575ea9f
|
||||
.word 0x0909121b, 0x83831d9e, 0x2c2c5874, 0x1a1a342e, 0x1b1b362d, 0x6e6edcb2, 0x5a5ab4ee, 0xa0a05bfb
|
||||
.word 0x5252a4f6, 0x3b3b764d, 0xd6d6b761, 0xb3b37dce, 0x2929527b, 0xe3e3dd3e, 0x2f2f5e71, 0x84841397
|
||||
.word 0x5353a6f5, 0xd1d1b968, 0x00000000, 0xededc12c, 0x20204060, 0xfcfce31f, 0xb1b179c8, 0x5b5bb6ed
|
||||
.word 0x6a6ad4be, 0xcbcb8d46, 0xbebe67d9, 0x3939724b, 0x4a4a94de, 0x4c4c98d4, 0x5858b0e8, 0xcfcf854a
|
||||
.word 0xd0d0bb6b, 0xefefc52a, 0xaaaa4fe5, 0xfbfbed16, 0x434386c5, 0x4d4d9ad7, 0x33336655, 0x85851194
|
||||
.word 0x45458acf, 0xf9f9e910, 0x02020406, 0x7f7ffe81, 0x5050a0f0, 0x3c3c7844, 0x9f9f25ba, 0xa8a84be3
|
||||
.word 0x5151a2f3, 0xa3a35dfe, 0x404080c0, 0x8f8f058a, 0x92923fad, 0x9d9d21bc, 0x38387048, 0xf5f5f104
|
||||
.word 0xbcbc63df, 0xb6b677c1, 0xdadaaf75, 0x21214263, 0x10102030, 0xffffe51a, 0xf3f3fd0e, 0xd2d2bf6d
|
||||
.word 0xcdcd814c, 0x0c0c1814, 0x13132635, 0xececc32f, 0x5f5fbee1, 0x979735a2, 0x444488cc, 0x17172e39
|
||||
.word 0xc4c49357, 0xa7a755f2, 0x7e7efc82, 0x3d3d7a47, 0x6464c8ac, 0x5d5dbae7, 0x1919322b, 0x7373e695
|
||||
.word 0x6060c0a0, 0x81811998, 0x4f4f9ed1, 0xdcdca37f, 0x22224466, 0x2a2a547e, 0x90903bab, 0x88880b83
|
||||
.word 0x46468cca, 0xeeeec729, 0xb8b86bd3, 0x1414283c, 0xdedea779, 0x5e5ebce2, 0x0b0b161d, 0xdbdbad76
|
||||
.word 0xe0e0db3b, 0x32326456, 0x3a3a744e, 0x0a0a141e, 0x494992db, 0x06060c0a, 0x2424486c, 0x5c5cb8e4
|
||||
.word 0xc2c29f5d, 0xd3d3bd6e, 0xacac43ef, 0x6262c4a6, 0x919139a8, 0x959531a4, 0xe4e4d337, 0x7979f28b
|
||||
.word 0xe7e7d532, 0xc8c88b43, 0x37376e59, 0x6d6ddab7, 0x8d8d018c, 0xd5d5b164, 0x4e4e9cd2, 0xa9a949e0
|
||||
.word 0x6c6cd8b4, 0x5656acfa, 0xf4f4f307, 0xeaeacf25, 0x6565caaf, 0x7a7af48e, 0xaeae47e9, 0x08081018
|
||||
.word 0xbaba6fd5, 0x7878f088, 0x25254a6f, 0x2e2e5c72, 0x1c1c3824, 0xa6a657f1, 0xb4b473c7, 0xc6c69751
|
||||
.word 0xe8e8cb23, 0xdddda17c, 0x7474e89c, 0x1f1f3e21, 0x4b4b96dd, 0xbdbd61dc, 0x8b8b0d86, 0x8a8a0f85
|
||||
.word 0x7070e090, 0x3e3e7c42, 0xb5b571c4, 0x6666ccaa, 0x484890d8, 0x03030605, 0xf6f6f701, 0x0e0e1c12
|
||||
.word 0x6161c2a3, 0x35356a5f, 0x5757aef9, 0xb9b969d0, 0x86861791, 0xc1c19958, 0x1d1d3a27, 0x9e9e27b9
|
||||
.word 0xe1e1d938, 0xf8f8eb13, 0x98982bb3, 0x11112233, 0x6969d2bb, 0xd9d9a970, 0x8e8e0789, 0x949433a7
|
||||
.word 0x9b9b2db6, 0x1e1e3c22, 0x87871592, 0xe9e9c920, 0xcece8749, 0x5555aaff, 0x28285078, 0xdfdfa57a
|
||||
.word 0x8c8c038f, 0xa1a159f8, 0x89890980, 0x0d0d1a17, 0xbfbf65da, 0xe6e6d731, 0x424284c6, 0x6868d0b8
|
||||
.word 0x414182c3, 0x999929b0, 0x2d2d5a77, 0x0f0f1e11, 0xb0b07bcb, 0x5454a8fc, 0xbbbb6dd6, 0x16162c3a
|
||||
|
||||
.word 0x63c6a563, 0x7cf8847c, 0x77ee9977, 0x7bf68d7b, 0xf2ff0df2, 0x6bd6bd6b, 0x6fdeb16f, 0xc59154c5
|
||||
.word 0x30605030, 0x01020301, 0x67cea967, 0x2b567d2b, 0xfee719fe, 0xd7b562d7, 0xab4de6ab, 0x76ec9a76
|
||||
.word 0xca8f45ca, 0x821f9d82, 0xc98940c9, 0x7dfa877d, 0xfaef15fa, 0x59b2eb59, 0x478ec947, 0xf0fb0bf0
|
||||
.word 0xad41ecad, 0xd4b367d4, 0xa25ffda2, 0xaf45eaaf, 0x9c23bf9c, 0xa453f7a4, 0x72e49672, 0xc09b5bc0
|
||||
.word 0xb775c2b7, 0xfde11cfd, 0x933dae93, 0x264c6a26, 0x366c5a36, 0x3f7e413f, 0xf7f502f7, 0xcc834fcc
|
||||
.word 0x34685c34, 0xa551f4a5, 0xe5d134e5, 0xf1f908f1, 0x71e29371, 0xd8ab73d8, 0x31625331, 0x152a3f15
|
||||
.word 0x04080c04, 0xc79552c7, 0x23466523, 0xc39d5ec3, 0x18302818, 0x9637a196, 0x050a0f05, 0x9a2fb59a
|
||||
.word 0x070e0907, 0x12243612, 0x801b9b80, 0xe2df3de2, 0xebcd26eb, 0x274e6927, 0xb27fcdb2, 0x75ea9f75
|
||||
.word 0x09121b09, 0x831d9e83, 0x2c58742c, 0x1a342e1a, 0x1b362d1b, 0x6edcb26e, 0x5ab4ee5a, 0xa05bfba0
|
||||
.word 0x52a4f652, 0x3b764d3b, 0xd6b761d6, 0xb37dceb3, 0x29527b29, 0xe3dd3ee3, 0x2f5e712f, 0x84139784
|
||||
.word 0x53a6f553, 0xd1b968d1, 0x00000000, 0xedc12ced, 0x20406020, 0xfce31ffc, 0xb179c8b1, 0x5bb6ed5b
|
||||
.word 0x6ad4be6a, 0xcb8d46cb, 0xbe67d9be, 0x39724b39, 0x4a94de4a, 0x4c98d44c, 0x58b0e858, 0xcf854acf
|
||||
.word 0xd0bb6bd0, 0xefc52aef, 0xaa4fe5aa, 0xfbed16fb, 0x4386c543, 0x4d9ad74d, 0x33665533, 0x85119485
|
||||
.word 0x458acf45, 0xf9e910f9, 0x02040602, 0x7ffe817f, 0x50a0f050, 0x3c78443c, 0x9f25ba9f, 0xa84be3a8
|
||||
.word 0x51a2f351, 0xa35dfea3, 0x4080c040, 0x8f058a8f, 0x923fad92, 0x9d21bc9d, 0x38704838, 0xf5f104f5
|
||||
.word 0xbc63dfbc, 0xb677c1b6, 0xdaaf75da, 0x21426321, 0x10203010, 0xffe51aff, 0xf3fd0ef3, 0xd2bf6dd2
|
||||
.word 0xcd814ccd, 0x0c18140c, 0x13263513, 0xecc32fec, 0x5fbee15f, 0x9735a297, 0x4488cc44, 0x172e3917
|
||||
.word 0xc49357c4, 0xa755f2a7, 0x7efc827e, 0x3d7a473d, 0x64c8ac64, 0x5dbae75d, 0x19322b19, 0x73e69573
|
||||
.word 0x60c0a060, 0x81199881, 0x4f9ed14f, 0xdca37fdc, 0x22446622, 0x2a547e2a, 0x903bab90, 0x880b8388
|
||||
.word 0x468cca46, 0xeec729ee, 0xb86bd3b8, 0x14283c14, 0xdea779de, 0x5ebce25e, 0x0b161d0b, 0xdbad76db
|
||||
.word 0xe0db3be0, 0x32645632, 0x3a744e3a, 0x0a141e0a, 0x4992db49, 0x060c0a06, 0x24486c24, 0x5cb8e45c
|
||||
.word 0xc29f5dc2, 0xd3bd6ed3, 0xac43efac, 0x62c4a662, 0x9139a891, 0x9531a495, 0xe4d337e4, 0x79f28b79
|
||||
.word 0xe7d532e7, 0xc88b43c8, 0x376e5937, 0x6ddab76d, 0x8d018c8d, 0xd5b164d5, 0x4e9cd24e, 0xa949e0a9
|
||||
.word 0x6cd8b46c, 0x56acfa56, 0xf4f307f4, 0xeacf25ea, 0x65caaf65, 0x7af48e7a, 0xae47e9ae, 0x08101808
|
||||
.word 0xba6fd5ba, 0x78f08878, 0x254a6f25, 0x2e5c722e, 0x1c38241c, 0xa657f1a6, 0xb473c7b4, 0xc69751c6
|
||||
.word 0xe8cb23e8, 0xdda17cdd, 0x74e89c74, 0x1f3e211f, 0x4b96dd4b, 0xbd61dcbd, 0x8b0d868b, 0x8a0f858a
|
||||
.word 0x70e09070, 0x3e7c423e, 0xb571c4b5, 0x66ccaa66, 0x4890d848, 0x03060503, 0xf6f701f6, 0x0e1c120e
|
||||
.word 0x61c2a361, 0x356a5f35, 0x57aef957, 0xb969d0b9, 0x86179186, 0xc19958c1, 0x1d3a271d, 0x9e27b99e
|
||||
.word 0xe1d938e1, 0xf8eb13f8, 0x982bb398, 0x11223311, 0x69d2bb69, 0xd9a970d9, 0x8e07898e, 0x9433a794
|
||||
.word 0x9b2db69b, 0x1e3c221e, 0x87159287, 0xe9c920e9, 0xce8749ce, 0x55aaff55, 0x28507828, 0xdfa57adf
|
||||
.word 0x8c038f8c, 0xa159f8a1, 0x89098089, 0x0d1a170d, 0xbf65dabf, 0xe6d731e6, 0x4284c642, 0x68d0b868
|
||||
.word 0x4182c341, 0x9929b099, 0x2d5a772d, 0x0f1e110f, 0xb07bcbb0, 0x54a8fc54, 0xbb6dd6bb, 0x162c3a16
|
||||
|
||||
.word 0xc6a56363, 0xf8847c7c, 0xee997777, 0xf68d7b7b, 0xff0df2f2, 0xd6bd6b6b, 0xdeb16f6f, 0x9154c5c5
|
||||
.word 0x60503030, 0x02030101, 0xcea96767, 0x567d2b2b, 0xe719fefe, 0xb562d7d7, 0x4de6abab, 0xec9a7676
|
||||
.word 0x8f45caca, 0x1f9d8282, 0x8940c9c9, 0xfa877d7d, 0xef15fafa, 0xb2eb5959, 0x8ec94747, 0xfb0bf0f0
|
||||
.word 0x41ecadad, 0xb367d4d4, 0x5ffda2a2, 0x45eaafaf, 0x23bf9c9c, 0x53f7a4a4, 0xe4967272, 0x9b5bc0c0
|
||||
.word 0x75c2b7b7, 0xe11cfdfd, 0x3dae9393, 0x4c6a2626, 0x6c5a3636, 0x7e413f3f, 0xf502f7f7, 0x834fcccc
|
||||
.word 0x685c3434, 0x51f4a5a5, 0xd134e5e5, 0xf908f1f1, 0xe2937171, 0xab73d8d8, 0x62533131, 0x2a3f1515
|
||||
.word 0x080c0404, 0x9552c7c7, 0x46652323, 0x9d5ec3c3, 0x30281818, 0x37a19696, 0x0a0f0505, 0x2fb59a9a
|
||||
.word 0x0e090707, 0x24361212, 0x1b9b8080, 0xdf3de2e2, 0xcd26ebeb, 0x4e692727, 0x7fcdb2b2, 0xea9f7575
|
||||
.word 0x121b0909, 0x1d9e8383, 0x58742c2c, 0x342e1a1a, 0x362d1b1b, 0xdcb26e6e, 0xb4ee5a5a, 0x5bfba0a0
|
||||
.word 0xa4f65252, 0x764d3b3b, 0xb761d6d6, 0x7dceb3b3, 0x527b2929, 0xdd3ee3e3, 0x5e712f2f, 0x13978484
|
||||
.word 0xa6f55353, 0xb968d1d1, 0x00000000, 0xc12ceded, 0x40602020, 0xe31ffcfc, 0x79c8b1b1, 0xb6ed5b5b
|
||||
.word 0xd4be6a6a, 0x8d46cbcb, 0x67d9bebe, 0x724b3939, 0x94de4a4a, 0x98d44c4c, 0xb0e85858, 0x854acfcf
|
||||
.word 0xbb6bd0d0, 0xc52aefef, 0x4fe5aaaa, 0xed16fbfb, 0x86c54343, 0x9ad74d4d, 0x66553333, 0x11948585
|
||||
.word 0x8acf4545, 0xe910f9f9, 0x04060202, 0xfe817f7f, 0xa0f05050, 0x78443c3c, 0x25ba9f9f, 0x4be3a8a8
|
||||
.word 0xa2f35151, 0x5dfea3a3, 0x80c04040, 0x058a8f8f, 0x3fad9292, 0x21bc9d9d, 0x70483838, 0xf104f5f5
|
||||
.word 0x63dfbcbc, 0x77c1b6b6, 0xaf75dada, 0x42632121, 0x20301010, 0xe51affff, 0xfd0ef3f3, 0xbf6dd2d2
|
||||
.word 0x814ccdcd, 0x18140c0c, 0x26351313, 0xc32fecec, 0xbee15f5f, 0x35a29797, 0x88cc4444, 0x2e391717
|
||||
.word 0x9357c4c4, 0x55f2a7a7, 0xfc827e7e, 0x7a473d3d, 0xc8ac6464, 0xbae75d5d, 0x322b1919, 0xe6957373
|
||||
.word 0xc0a06060, 0x19988181, 0x9ed14f4f, 0xa37fdcdc, 0x44662222, 0x547e2a2a, 0x3bab9090, 0x0b838888
|
||||
.word 0x8cca4646, 0xc729eeee, 0x6bd3b8b8, 0x283c1414, 0xa779dede, 0xbce25e5e, 0x161d0b0b, 0xad76dbdb
|
||||
.word 0xdb3be0e0, 0x64563232, 0x744e3a3a, 0x141e0a0a, 0x92db4949, 0x0c0a0606, 0x486c2424, 0xb8e45c5c
|
||||
.word 0x9f5dc2c2, 0xbd6ed3d3, 0x43efacac, 0xc4a66262, 0x39a89191, 0x31a49595, 0xd337e4e4, 0xf28b7979
|
||||
.word 0xd532e7e7, 0x8b43c8c8, 0x6e593737, 0xdab76d6d, 0x018c8d8d, 0xb164d5d5, 0x9cd24e4e, 0x49e0a9a9
|
||||
.word 0xd8b46c6c, 0xacfa5656, 0xf307f4f4, 0xcf25eaea, 0xcaaf6565, 0xf48e7a7a, 0x47e9aeae, 0x10180808
|
||||
.word 0x6fd5baba, 0xf0887878, 0x4a6f2525, 0x5c722e2e, 0x38241c1c, 0x57f1a6a6, 0x73c7b4b4, 0x9751c6c6
|
||||
.word 0xcb23e8e8, 0xa17cdddd, 0xe89c7474, 0x3e211f1f, 0x96dd4b4b, 0x61dcbdbd, 0x0d868b8b, 0x0f858a8a
|
||||
.word 0xe0907070, 0x7c423e3e, 0x71c4b5b5, 0xccaa6666, 0x90d84848, 0x06050303, 0xf701f6f6, 0x1c120e0e
|
||||
.word 0xc2a36161, 0x6a5f3535, 0xaef95757, 0x69d0b9b9, 0x17918686, 0x9958c1c1, 0x3a271d1d, 0x27b99e9e
|
||||
.word 0xd938e1e1, 0xeb13f8f8, 0x2bb39898, 0x22331111, 0xd2bb6969, 0xa970d9d9, 0x07898e8e, 0x33a79494
|
||||
.word 0x2db69b9b, 0x3c221e1e, 0x15928787, 0xc920e9e9, 0x8749cece, 0xaaff5555, 0x50782828, 0xa57adfdf
|
||||
.word 0x038f8c8c, 0x59f8a1a1, 0x09808989, 0x1a170d0d, 0x65dabfbf, 0xd731e6e6, 0x84c64242, 0xd0b86868
|
||||
.word 0x82c34141, 0x29b09999, 0x5a772d2d, 0x1e110f0f, 0x7bcbb0b0, 0xa8fc5454, 0x6dd6bbbb, 0x2c3a1616
|
||||
|
||||
randomx_aes_lut_dec:
|
||||
.word 0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a, 0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b
|
||||
.word 0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5, 0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5
|
||||
.word 0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d, 0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b
|
||||
.word 0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295, 0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e
|
||||
.word 0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927, 0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d
|
||||
.word 0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362, 0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9
|
||||
.word 0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52, 0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566
|
||||
.word 0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3, 0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed
|
||||
.word 0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e, 0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4
|
||||
.word 0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4, 0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd
|
||||
.word 0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d, 0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060
|
||||
.word 0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967, 0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879
|
||||
.word 0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000, 0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c
|
||||
.word 0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36, 0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624
|
||||
.word 0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b, 0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c
|
||||
.word 0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12, 0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14
|
||||
.word 0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3, 0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b
|
||||
.word 0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8, 0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684
|
||||
.word 0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7, 0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177
|
||||
.word 0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947, 0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322
|
||||
.word 0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498, 0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f
|
||||
.word 0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54, 0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382
|
||||
.word 0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf, 0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb
|
||||
.word 0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83, 0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef
|
||||
.word 0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029, 0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235
|
||||
.word 0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733, 0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117
|
||||
.word 0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4, 0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546
|
||||
.word 0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb, 0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d
|
||||
.word 0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb, 0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a
|
||||
.word 0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773, 0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478
|
||||
.word 0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2, 0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff
|
||||
.word 0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664, 0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0
|
||||
|
||||
.word 0xa7f45150, 0x65417e53, 0xa4171ac3, 0x5e273a96, 0x6bab3bcb, 0x459d1ff1, 0x58faacab, 0x03e34b93
|
||||
.word 0xfa302055, 0x6d76adf6, 0x76cc8891, 0x4c02f525, 0xd7e54ffc, 0xcb2ac5d7, 0x44352680, 0xa362b58f
|
||||
.word 0x5ab1de49, 0x1bba2567, 0x0eea4598, 0xc0fe5de1, 0x752fc302, 0xf04c8112, 0x97468da3, 0xf9d36bc6
|
||||
.word 0x5f8f03e7, 0x9c921595, 0x7a6dbfeb, 0x595295da, 0x83bed42d, 0x217458d3, 0x69e04929, 0xc8c98e44
|
||||
.word 0x89c2756a, 0x798ef478, 0x3e58996b, 0x71b927dd, 0x4fe1beb6, 0xad88f017, 0xac20c966, 0x3ace7db4
|
||||
.word 0x4adf6318, 0x311ae582, 0x33519760, 0x7f536245, 0x7764b1e0, 0xae6bbb84, 0xa081fe1c, 0x2b08f994
|
||||
.word 0x68487058, 0xfd458f19, 0x6cde9487, 0xf87b52b7, 0xd373ab23, 0x024b72e2, 0x8f1fe357, 0xab55662a
|
||||
.word 0x28ebb207, 0xc2b52f03, 0x7bc5869a, 0x0837d3a5, 0x872830f2, 0xa5bf23b2, 0x6a0302ba, 0x8216ed5c
|
||||
.word 0x1ccf8a2b, 0xb479a792, 0xf207f3f0, 0xe2694ea1, 0xf4da65cd, 0xbe0506d5, 0x6234d11f, 0xfea6c48a
|
||||
.word 0x532e349d, 0x55f3a2a0, 0xe18a0532, 0xebf6a475, 0xec830b39, 0xef6040aa, 0x9f715e06, 0x106ebd51
|
||||
.word 0x8a213ef9, 0x06dd963d, 0x053eddae, 0xbde64d46, 0x8d5491b5, 0x5dc47105, 0xd406046f, 0x155060ff
|
||||
.word 0xfb981924, 0xe9bdd697, 0x434089cc, 0x9ed96777, 0x42e8b0bd, 0x8b890788, 0x5b19e738, 0xeec879db
|
||||
.word 0x0a7ca147, 0x0f427ce9, 0x1e84f8c9, 0x00000000, 0x86800983, 0xed2b3248, 0x70111eac, 0x725a6c4e
|
||||
.word 0xff0efdfb, 0x38850f56, 0xd5ae3d1e, 0x392d3627, 0xd90f0a64, 0xa65c6821, 0x545b9bd1, 0x2e36243a
|
||||
.word 0x670a0cb1, 0xe757930f, 0x96eeb4d2, 0x919b1b9e, 0xc5c0804f, 0x20dc61a2, 0x4b775a69, 0x1a121c16
|
||||
.word 0xba93e20a, 0x2aa0c0e5, 0xe0223c43, 0x171b121d, 0x0d090e0b, 0xc78bf2ad, 0xa8b62db9, 0xa91e14c8
|
||||
.word 0x19f15785, 0x0775af4c, 0xdd99eebb, 0x607fa3fd, 0x2601f79f, 0xf5725cbc, 0x3b6644c5, 0x7efb5b34
|
||||
.word 0x29438b76, 0xc623cbdc, 0xfcedb668, 0xf1e4b863, 0xdc31d7ca, 0x85634210, 0x22971340, 0x11c68420
|
||||
.word 0x244a857d, 0x3dbbd2f8, 0x32f9ae11, 0xa129c76d, 0x2f9e1d4b, 0x30b2dcf3, 0x52860dec, 0xe3c177d0
|
||||
.word 0x16b32b6c, 0xb970a999, 0x489411fa, 0x64e94722, 0x8cfca8c4, 0x3ff0a01a, 0x2c7d56d8, 0x903322ef
|
||||
.word 0x4e4987c7, 0xd138d9c1, 0xa2ca8cfe, 0x0bd49836, 0x81f5a6cf, 0xde7aa528, 0x8eb7da26, 0xbfad3fa4
|
||||
.word 0x9d3a2ce4, 0x9278500d, 0xcc5f6a9b, 0x467e5462, 0x138df6c2, 0xb8d890e8, 0xf7392e5e, 0xafc382f5
|
||||
.word 0x805d9fbe, 0x93d0697c, 0x2dd56fa9, 0x1225cfb3, 0x99acc83b, 0x7d1810a7, 0x639ce86e, 0xbb3bdb7b
|
||||
.word 0x7826cd09, 0x18596ef4, 0xb79aec01, 0x9a4f83a8, 0x6e95e665, 0xe6ffaa7e, 0xcfbc2108, 0xe815efe6
|
||||
.word 0x9be7bad9, 0x366f4ace, 0x099fead4, 0x7cb029d6, 0xb2a431af, 0x233f2a31, 0x94a5c630, 0x66a235c0
|
||||
.word 0xbc4e7437, 0xca82fca6, 0xd090e0b0, 0xd8a73315, 0x9804f14a, 0xdaec41f7, 0x50cd7f0e, 0xf691172f
|
||||
.word 0xd64d768d, 0xb0ef434d, 0x4daacc54, 0x0496e4df, 0xb5d19ee3, 0x886a4c1b, 0x1f2cc1b8, 0x5165467f
|
||||
.word 0xea5e9d04, 0x358c015d, 0x7487fa73, 0x410bfb2e, 0x1d67b35a, 0xd2db9252, 0x5610e933, 0x47d66d13
|
||||
.word 0x61d79a8c, 0x0ca1377a, 0x14f8598e, 0x3c13eb89, 0x27a9ceee, 0xc961b735, 0xe51ce1ed, 0xb1477a3c
|
||||
.word 0xdfd29c59, 0x73f2553f, 0xce141879, 0x37c773bf, 0xcdf753ea, 0xaafd5f5b, 0x6f3ddf14, 0xdb447886
|
||||
.word 0xf3afca81, 0xc468b93e, 0x3424382c, 0x40a3c25f, 0xc31d1672, 0x25e2bc0c, 0x493c288b, 0x950dff41
|
||||
.word 0x01a83971, 0xb30c08de, 0xe4b4d89c, 0xc1566490, 0x84cb7b61, 0xb632d570, 0x5c6c4874, 0x57b8d042
|
||||
|
||||
.word 0xf45150a7, 0x417e5365, 0x171ac3a4, 0x273a965e, 0xab3bcb6b, 0x9d1ff145, 0xfaacab58, 0xe34b9303
|
||||
.word 0x302055fa, 0x76adf66d, 0xcc889176, 0x02f5254c, 0xe54ffcd7, 0x2ac5d7cb, 0x35268044, 0x62b58fa3
|
||||
.word 0xb1de495a, 0xba25671b, 0xea45980e, 0xfe5de1c0, 0x2fc30275, 0x4c8112f0, 0x468da397, 0xd36bc6f9
|
||||
.word 0x8f03e75f, 0x9215959c, 0x6dbfeb7a, 0x5295da59, 0xbed42d83, 0x7458d321, 0xe0492969, 0xc98e44c8
|
||||
.word 0xc2756a89, 0x8ef47879, 0x58996b3e, 0xb927dd71, 0xe1beb64f, 0x88f017ad, 0x20c966ac, 0xce7db43a
|
||||
.word 0xdf63184a, 0x1ae58231, 0x51976033, 0x5362457f, 0x64b1e077, 0x6bbb84ae, 0x81fe1ca0, 0x08f9942b
|
||||
.word 0x48705868, 0x458f19fd, 0xde94876c, 0x7b52b7f8, 0x73ab23d3, 0x4b72e202, 0x1fe3578f, 0x55662aab
|
||||
.word 0xebb20728, 0xb52f03c2, 0xc5869a7b, 0x37d3a508, 0x2830f287, 0xbf23b2a5, 0x0302ba6a, 0x16ed5c82
|
||||
.word 0xcf8a2b1c, 0x79a792b4, 0x07f3f0f2, 0x694ea1e2, 0xda65cdf4, 0x0506d5be, 0x34d11f62, 0xa6c48afe
|
||||
.word 0x2e349d53, 0xf3a2a055, 0x8a0532e1, 0xf6a475eb, 0x830b39ec, 0x6040aaef, 0x715e069f, 0x6ebd5110
|
||||
.word 0x213ef98a, 0xdd963d06, 0x3eddae05, 0xe64d46bd, 0x5491b58d, 0xc471055d, 0x06046fd4, 0x5060ff15
|
||||
.word 0x981924fb, 0xbdd697e9, 0x4089cc43, 0xd967779e, 0xe8b0bd42, 0x8907888b, 0x19e7385b, 0xc879dbee
|
||||
.word 0x7ca1470a, 0x427ce90f, 0x84f8c91e, 0x00000000, 0x80098386, 0x2b3248ed, 0x111eac70, 0x5a6c4e72
|
||||
.word 0x0efdfbff, 0x850f5638, 0xae3d1ed5, 0x2d362739, 0x0f0a64d9, 0x5c6821a6, 0x5b9bd154, 0x36243a2e
|
||||
.word 0x0a0cb167, 0x57930fe7, 0xeeb4d296, 0x9b1b9e91, 0xc0804fc5, 0xdc61a220, 0x775a694b, 0x121c161a
|
||||
.word 0x93e20aba, 0xa0c0e52a, 0x223c43e0, 0x1b121d17, 0x090e0b0d, 0x8bf2adc7, 0xb62db9a8, 0x1e14c8a9
|
||||
.word 0xf1578519, 0x75af4c07, 0x99eebbdd, 0x7fa3fd60, 0x01f79f26, 0x725cbcf5, 0x6644c53b, 0xfb5b347e
|
||||
.word 0x438b7629, 0x23cbdcc6, 0xedb668fc, 0xe4b863f1, 0x31d7cadc, 0x63421085, 0x97134022, 0xc6842011
|
||||
.word 0x4a857d24, 0xbbd2f83d, 0xf9ae1132, 0x29c76da1, 0x9e1d4b2f, 0xb2dcf330, 0x860dec52, 0xc177d0e3
|
||||
.word 0xb32b6c16, 0x70a999b9, 0x9411fa48, 0xe9472264, 0xfca8c48c, 0xf0a01a3f, 0x7d56d82c, 0x3322ef90
|
||||
.word 0x4987c74e, 0x38d9c1d1, 0xca8cfea2, 0xd498360b, 0xf5a6cf81, 0x7aa528de, 0xb7da268e, 0xad3fa4bf
|
||||
.word 0x3a2ce49d, 0x78500d92, 0x5f6a9bcc, 0x7e546246, 0x8df6c213, 0xd890e8b8, 0x392e5ef7, 0xc382f5af
|
||||
.word 0x5d9fbe80, 0xd0697c93, 0xd56fa92d, 0x25cfb312, 0xacc83b99, 0x1810a77d, 0x9ce86e63, 0x3bdb7bbb
|
||||
.word 0x26cd0978, 0x596ef418, 0x9aec01b7, 0x4f83a89a, 0x95e6656e, 0xffaa7ee6, 0xbc2108cf, 0x15efe6e8
|
||||
.word 0xe7bad99b, 0x6f4ace36, 0x9fead409, 0xb029d67c, 0xa431afb2, 0x3f2a3123, 0xa5c63094, 0xa235c066
|
||||
.word 0x4e7437bc, 0x82fca6ca, 0x90e0b0d0, 0xa73315d8, 0x04f14a98, 0xec41f7da, 0xcd7f0e50, 0x91172ff6
|
||||
.word 0x4d768dd6, 0xef434db0, 0xaacc544d, 0x96e4df04, 0xd19ee3b5, 0x6a4c1b88, 0x2cc1b81f, 0x65467f51
|
||||
.word 0x5e9d04ea, 0x8c015d35, 0x87fa7374, 0x0bfb2e41, 0x67b35a1d, 0xdb9252d2, 0x10e93356, 0xd66d1347
|
||||
.word 0xd79a8c61, 0xa1377a0c, 0xf8598e14, 0x13eb893c, 0xa9ceee27, 0x61b735c9, 0x1ce1ede5, 0x477a3cb1
|
||||
.word 0xd29c59df, 0xf2553f73, 0x141879ce, 0xc773bf37, 0xf753eacd, 0xfd5f5baa, 0x3ddf146f, 0x447886db
|
||||
.word 0xafca81f3, 0x68b93ec4, 0x24382c34, 0xa3c25f40, 0x1d1672c3, 0xe2bc0c25, 0x3c288b49, 0x0dff4195
|
||||
.word 0xa8397101, 0x0c08deb3, 0xb4d89ce4, 0x566490c1, 0xcb7b6184, 0x32d570b6, 0x6c48745c, 0xb8d04257
|
||||
|
||||
.word 0x5150a7f4, 0x7e536541, 0x1ac3a417, 0x3a965e27, 0x3bcb6bab, 0x1ff1459d, 0xacab58fa, 0x4b9303e3
|
||||
.word 0x2055fa30, 0xadf66d76, 0x889176cc, 0xf5254c02, 0x4ffcd7e5, 0xc5d7cb2a, 0x26804435, 0xb58fa362
|
||||
.word 0xde495ab1, 0x25671bba, 0x45980eea, 0x5de1c0fe, 0xc302752f, 0x8112f04c, 0x8da39746, 0x6bc6f9d3
|
||||
.word 0x03e75f8f, 0x15959c92, 0xbfeb7a6d, 0x95da5952, 0xd42d83be, 0x58d32174, 0x492969e0, 0x8e44c8c9
|
||||
.word 0x756a89c2, 0xf478798e, 0x996b3e58, 0x27dd71b9, 0xbeb64fe1, 0xf017ad88, 0xc966ac20, 0x7db43ace
|
||||
.word 0x63184adf, 0xe582311a, 0x97603351, 0x62457f53, 0xb1e07764, 0xbb84ae6b, 0xfe1ca081, 0xf9942b08
|
||||
.word 0x70586848, 0x8f19fd45, 0x94876cde, 0x52b7f87b, 0xab23d373, 0x72e2024b, 0xe3578f1f, 0x662aab55
|
||||
.word 0xb20728eb, 0x2f03c2b5, 0x869a7bc5, 0xd3a50837, 0x30f28728, 0x23b2a5bf, 0x02ba6a03, 0xed5c8216
|
||||
.word 0x8a2b1ccf, 0xa792b479, 0xf3f0f207, 0x4ea1e269, 0x65cdf4da, 0x06d5be05, 0xd11f6234, 0xc48afea6
|
||||
.word 0x349d532e, 0xa2a055f3, 0x0532e18a, 0xa475ebf6, 0x0b39ec83, 0x40aaef60, 0x5e069f71, 0xbd51106e
|
||||
.word 0x3ef98a21, 0x963d06dd, 0xddae053e, 0x4d46bde6, 0x91b58d54, 0x71055dc4, 0x046fd406, 0x60ff1550
|
||||
.word 0x1924fb98, 0xd697e9bd, 0x89cc4340, 0x67779ed9, 0xb0bd42e8, 0x07888b89, 0xe7385b19, 0x79dbeec8
|
||||
.word 0xa1470a7c, 0x7ce90f42, 0xf8c91e84, 0x00000000, 0x09838680, 0x3248ed2b, 0x1eac7011, 0x6c4e725a
|
||||
.word 0xfdfbff0e, 0x0f563885, 0x3d1ed5ae, 0x3627392d, 0x0a64d90f, 0x6821a65c, 0x9bd1545b, 0x243a2e36
|
||||
.word 0x0cb1670a, 0x930fe757, 0xb4d296ee, 0x1b9e919b, 0x804fc5c0, 0x61a220dc, 0x5a694b77, 0x1c161a12
|
||||
.word 0xe20aba93, 0xc0e52aa0, 0x3c43e022, 0x121d171b, 0x0e0b0d09, 0xf2adc78b, 0x2db9a8b6, 0x14c8a91e
|
||||
.word 0x578519f1, 0xaf4c0775, 0xeebbdd99, 0xa3fd607f, 0xf79f2601, 0x5cbcf572, 0x44c53b66, 0x5b347efb
|
||||
.word 0x8b762943, 0xcbdcc623, 0xb668fced, 0xb863f1e4, 0xd7cadc31, 0x42108563, 0x13402297, 0x842011c6
|
||||
.word 0x857d244a, 0xd2f83dbb, 0xae1132f9, 0xc76da129, 0x1d4b2f9e, 0xdcf330b2, 0x0dec5286, 0x77d0e3c1
|
||||
.word 0x2b6c16b3, 0xa999b970, 0x11fa4894, 0x472264e9, 0xa8c48cfc, 0xa01a3ff0, 0x56d82c7d, 0x22ef9033
|
||||
.word 0x87c74e49, 0xd9c1d138, 0x8cfea2ca, 0x98360bd4, 0xa6cf81f5, 0xa528de7a, 0xda268eb7, 0x3fa4bfad
|
||||
.word 0x2ce49d3a, 0x500d9278, 0x6a9bcc5f, 0x5462467e, 0xf6c2138d, 0x90e8b8d8, 0x2e5ef739, 0x82f5afc3
|
||||
.word 0x9fbe805d, 0x697c93d0, 0x6fa92dd5, 0xcfb31225, 0xc83b99ac, 0x10a77d18, 0xe86e639c, 0xdb7bbb3b
|
||||
.word 0xcd097826, 0x6ef41859, 0xec01b79a, 0x83a89a4f, 0xe6656e95, 0xaa7ee6ff, 0x2108cfbc, 0xefe6e815
|
||||
.word 0xbad99be7, 0x4ace366f, 0xead4099f, 0x29d67cb0, 0x31afb2a4, 0x2a31233f, 0xc63094a5, 0x35c066a2
|
||||
.word 0x7437bc4e, 0xfca6ca82, 0xe0b0d090, 0x3315d8a7, 0xf14a9804, 0x41f7daec, 0x7f0e50cd, 0x172ff691
|
||||
.word 0x768dd64d, 0x434db0ef, 0xcc544daa, 0xe4df0496, 0x9ee3b5d1, 0x4c1b886a, 0xc1b81f2c, 0x467f5165
|
||||
.word 0x9d04ea5e, 0x015d358c, 0xfa737487, 0xfb2e410b, 0xb35a1d67, 0x9252d2db, 0xe9335610, 0x6d1347d6
|
||||
.word 0x9a8c61d7, 0x377a0ca1, 0x598e14f8, 0xeb893c13, 0xceee27a9, 0xb735c961, 0xe1ede51c, 0x7a3cb147
|
||||
.word 0x9c59dfd2, 0x553f73f2, 0x1879ce14, 0x73bf37c7, 0x53eacdf7, 0x5f5baafd, 0xdf146f3d, 0x7886db44
|
||||
.word 0xca81f3af, 0xb93ec468, 0x382c3424, 0xc25f40a3, 0x1672c31d, 0xbc0c25e2, 0x288b493c, 0xff41950d
|
||||
.word 0x397101a8, 0x08deb30c, 0xd89ce4b4, 0x6490c156, 0x7b6184cb, 0xd570b632, 0x48745c6c, 0xd04257b8
|
||||
|
||||
|
||||
# Input parameters
|
||||
@@ -491,42 +1053,39 @@ DECL(randomx_calc_dataset_item_aarch64):
|
||||
stp x10, x11, [sp, 80]
|
||||
stp x12, x13, [sp, 96]
|
||||
|
||||
ldr x12, superscalarMul0
|
||||
adr x7, superscalarMul0
|
||||
# superscalarMul0, superscalarAdd1
|
||||
ldp x12, x13, [x7]
|
||||
|
||||
mov x8, x0
|
||||
mov x9, x1
|
||||
ldp x8, x9, [sp]
|
||||
mov x10, x2
|
||||
|
||||
# rl[0] = (itemNumber + 1) * superscalarMul0;
|
||||
madd x0, x2, x12, x12
|
||||
|
||||
# rl[1] = rl[0] ^ superscalarAdd1;
|
||||
ldr x12, superscalarAdd1
|
||||
eor x1, x0, x12
|
||||
eor x1, x0, x13
|
||||
|
||||
# rl[2] = rl[0] ^ superscalarAdd2;
|
||||
ldr x12, superscalarAdd2
|
||||
ldp x12, x13, [x7, 16]
|
||||
eor x2, x0, x12
|
||||
|
||||
# rl[3] = rl[0] ^ superscalarAdd3;
|
||||
ldr x12, superscalarAdd3
|
||||
eor x3, x0, x12
|
||||
eor x3, x0, x13
|
||||
|
||||
# rl[4] = rl[0] ^ superscalarAdd4;
|
||||
ldr x12, superscalarAdd4
|
||||
ldp x12, x13, [x7, 32]
|
||||
eor x4, x0, x12
|
||||
|
||||
# rl[5] = rl[0] ^ superscalarAdd5;
|
||||
ldr x12, superscalarAdd5
|
||||
eor x5, x0, x12
|
||||
eor x5, x0, x13
|
||||
|
||||
# rl[6] = rl[0] ^ superscalarAdd6;
|
||||
ldr x12, superscalarAdd6
|
||||
ldp x12, x13, [x7, 48]
|
||||
eor x6, x0, x12
|
||||
|
||||
# rl[7] = rl[0] ^ superscalarAdd7;
|
||||
ldr x12, superscalarAdd7
|
||||
eor x7, x0, x12
|
||||
eor x7, x0, x13
|
||||
|
||||
b DECL(randomx_calc_dataset_item_aarch64_prefetch)
|
||||
|
||||
|
||||
@@ -38,9 +38,17 @@ extern "C" {
|
||||
void randomx_program_aarch64_cacheline_align_mask1();
|
||||
void randomx_program_aarch64_cacheline_align_mask2();
|
||||
void randomx_program_aarch64_update_spMix1();
|
||||
void randomx_program_aarch64_v2_FE_mix();
|
||||
void randomx_program_aarch64_v1_FE_mix();
|
||||
void randomx_program_aarch64_v2_FE_mix_soft_aes();
|
||||
void randomx_program_aarch64_vm_instructions_end_light();
|
||||
void randomx_program_aarch64_vm_instructions_end_light_tweak();
|
||||
void randomx_program_aarch64_light_cacheline_align_mask();
|
||||
void randomx_program_aarch64_light_dataset_offset();
|
||||
void randomx_program_aarch64_vm_instructions_end_v1();
|
||||
void randomx_program_aarch64_vm_instructions_end_v2();
|
||||
void randomx_program_aarch64_vm_instructions_end_light_v1();
|
||||
void randomx_program_aarch64_vm_instructions_end_light_v2();
|
||||
void randomx_init_dataset_aarch64();
|
||||
void randomx_init_dataset_aarch64_end();
|
||||
void randomx_calc_dataset_item_aarch64();
|
||||
|
||||
1204
src/crypto/randomx/jit_compiler_rv64.cpp
Normal file
1204
src/crypto/randomx/jit_compiler_rv64.cpp
Normal file
File diff suppressed because it is too large
Load Diff
151
src/crypto/randomx/jit_compiler_rv64.hpp
Normal file
151
src/crypto/randomx/jit_compiler_rv64.hpp
Normal file
@@ -0,0 +1,151 @@
|
||||
/*
|
||||
Copyright (c) 2023 tevador <tevador@gmail.com>
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the copyright holder nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <vector>
|
||||
#include "crypto/randomx/common.hpp"
|
||||
#include "crypto/randomx/jit_compiler_rv64_static.hpp"
|
||||
|
||||
namespace randomx {
|
||||
|
||||
struct CodeBuffer {
|
||||
uint8_t* code;
|
||||
int32_t codePos;
|
||||
int32_t rcpCount;
|
||||
|
||||
void emit(const uint8_t* src, int32_t len) {
|
||||
memcpy(&code[codePos], src, len);
|
||||
codePos += len;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
void emit(T src) {
|
||||
memcpy(&code[codePos], &src, sizeof(src));
|
||||
codePos += sizeof(src);
|
||||
}
|
||||
|
||||
void emitAt(int32_t codePos, const uint8_t* src, int32_t len) {
|
||||
memcpy(&code[codePos], src, len);
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
void emitAt(int32_t codePos, T src) {
|
||||
memcpy(&code[codePos], &src, sizeof(src));
|
||||
}
|
||||
};
|
||||
|
||||
struct CompilerState : public CodeBuffer {
|
||||
int32_t instructionOffsets[RANDOMX_PROGRAM_MAX_SIZE];
|
||||
int registerUsage[RegistersCount];
|
||||
};
|
||||
|
||||
class Program;
|
||||
struct ProgramConfiguration;
|
||||
class SuperscalarProgram;
|
||||
class Instruction;
|
||||
|
||||
#define HANDLER_ARGS randomx::CompilerState& state, randomx::Instruction isn, int i
|
||||
typedef void(*InstructionGeneratorRV64)(HANDLER_ARGS);
|
||||
|
||||
class JitCompilerRV64 {
|
||||
public:
|
||||
JitCompilerRV64(bool hugePagesEnable, bool optimizedInitDatasetEnable);
|
||||
~JitCompilerRV64();
|
||||
|
||||
void prepare() {}
|
||||
void generateProgram(Program&, ProgramConfiguration&, uint32_t);
|
||||
void generateProgramLight(Program&, ProgramConfiguration&, uint32_t);
|
||||
|
||||
template<size_t N>
|
||||
void generateSuperscalarHash(SuperscalarProgram(&programs)[N]);
|
||||
|
||||
void generateDatasetInitCode() {}
|
||||
|
||||
ProgramFunc* getProgramFunc() {
|
||||
return (ProgramFunc*)(vectorCode ? entryProgramVector : entryProgram);
|
||||
}
|
||||
DatasetInitFunc* getDatasetInitFunc() {
|
||||
return (DatasetInitFunc*)(vectorCode ? entryDataInitVector : entryDataInit);
|
||||
}
|
||||
uint8_t* getCode() {
|
||||
return state.code;
|
||||
}
|
||||
size_t getCodeSize();
|
||||
|
||||
void enableWriting() const;
|
||||
void enableExecution() const;
|
||||
|
||||
static InstructionGeneratorRV64 engine[256];
|
||||
static uint8_t inst_map[256];
|
||||
private:
|
||||
CompilerState state;
|
||||
|
||||
uint8_t* vectorCode = nullptr;
|
||||
size_t vectorCodeSize = 0;
|
||||
|
||||
void* entryDataInit = nullptr;
|
||||
void* entryDataInitVector = nullptr;
|
||||
void* entryProgram = nullptr;
|
||||
void* entryProgramVector = nullptr;
|
||||
|
||||
public:
|
||||
static void v1_IADD_RS(HANDLER_ARGS);
|
||||
static void v1_IADD_M(HANDLER_ARGS);
|
||||
static void v1_ISUB_R(HANDLER_ARGS);
|
||||
static void v1_ISUB_M(HANDLER_ARGS);
|
||||
static void v1_IMUL_R(HANDLER_ARGS);
|
||||
static void v1_IMUL_M(HANDLER_ARGS);
|
||||
static void v1_IMULH_R(HANDLER_ARGS);
|
||||
static void v1_IMULH_M(HANDLER_ARGS);
|
||||
static void v1_ISMULH_R(HANDLER_ARGS);
|
||||
static void v1_ISMULH_M(HANDLER_ARGS);
|
||||
static void v1_IMUL_RCP(HANDLER_ARGS);
|
||||
static void v1_INEG_R(HANDLER_ARGS);
|
||||
static void v1_IXOR_R(HANDLER_ARGS);
|
||||
static void v1_IXOR_M(HANDLER_ARGS);
|
||||
static void v1_IROR_R(HANDLER_ARGS);
|
||||
static void v1_IROL_R(HANDLER_ARGS);
|
||||
static void v1_ISWAP_R(HANDLER_ARGS);
|
||||
static void v1_FSWAP_R(HANDLER_ARGS);
|
||||
static void v1_FADD_R(HANDLER_ARGS);
|
||||
static void v1_FADD_M(HANDLER_ARGS);
|
||||
static void v1_FSUB_R(HANDLER_ARGS);
|
||||
static void v1_FSUB_M(HANDLER_ARGS);
|
||||
static void v1_FSCAL_R(HANDLER_ARGS);
|
||||
static void v1_FMUL_R(HANDLER_ARGS);
|
||||
static void v1_FDIV_M(HANDLER_ARGS);
|
||||
static void v1_FSQRT_R(HANDLER_ARGS);
|
||||
static void v1_CBRANCH(HANDLER_ARGS);
|
||||
static void v1_CFROUND(HANDLER_ARGS);
|
||||
static void v1_ISTORE(HANDLER_ARGS);
|
||||
static void v1_NOP(HANDLER_ARGS);
|
||||
};
|
||||
}
|
||||
1236
src/crypto/randomx/jit_compiler_rv64_static.S
Normal file
1236
src/crypto/randomx/jit_compiler_rv64_static.S
Normal file
File diff suppressed because it is too large
Load Diff
53
src/crypto/randomx/jit_compiler_rv64_static.hpp
Normal file
53
src/crypto/randomx/jit_compiler_rv64_static.hpp
Normal file
@@ -0,0 +1,53 @@
|
||||
/*
|
||||
Copyright (c) 2023 tevador <tevador@gmail.com>
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the copyright holder nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
extern "C" {
|
||||
void randomx_riscv64_literals();
|
||||
void randomx_riscv64_literals_end();
|
||||
void randomx_riscv64_data_init();
|
||||
void randomx_riscv64_fix_data_call();
|
||||
void randomx_riscv64_prologue();
|
||||
void randomx_riscv64_loop_begin();
|
||||
void randomx_riscv64_data_read();
|
||||
void randomx_riscv64_data_read_light();
|
||||
void randomx_riscv64_fix_loop_call();
|
||||
void randomx_riscv64_spad_store();
|
||||
void randomx_riscv64_spad_store_hardaes();
|
||||
void randomx_riscv64_spad_store_softaes();
|
||||
void randomx_riscv64_loop_end();
|
||||
void randomx_riscv64_fix_continue_loop();
|
||||
void randomx_riscv64_epilogue();
|
||||
void randomx_riscv64_softaes();
|
||||
void randomx_riscv64_program_end();
|
||||
void randomx_riscv64_ssh_init();
|
||||
void randomx_riscv64_ssh_load();
|
||||
void randomx_riscv64_ssh_prefetch();
|
||||
void randomx_riscv64_ssh_end();
|
||||
}
|
||||
845
src/crypto/randomx/jit_compiler_rv64_vector.cpp
Normal file
845
src/crypto/randomx/jit_compiler_rv64_vector.cpp
Normal file
@@ -0,0 +1,845 @@
|
||||
/*
|
||||
Copyright (c) 2018-2020, tevador <tevador@gmail.com>
|
||||
Copyright (c) 2019-2021, XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
||||
Copyright (c) 2025, SChernykh <https://github.com/SChernykh>
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the copyright holder nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "crypto/randomx/configuration.h"
|
||||
#include "crypto/randomx/jit_compiler_rv64_vector.h"
|
||||
#include "crypto/randomx/jit_compiler_rv64_vector_static.h"
|
||||
#include "crypto/randomx/reciprocal.h"
|
||||
#include "crypto/randomx/superscalar.hpp"
|
||||
#include "crypto/randomx/program.hpp"
|
||||
|
||||
namespace randomx {
|
||||
|
||||
#define ADDR(x) ((uint8_t*) &(x))
|
||||
#define DIST(x, y) (ADDR(y) - ADDR(x))
|
||||
|
||||
void* generateDatasetInitVectorRV64(uint8_t* buf, SuperscalarProgram* programs, size_t num_programs)
|
||||
{
|
||||
uint8_t* p = buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_sshash_generated_instructions);
|
||||
|
||||
uint8_t* literals = buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_sshash_imul_rcp_literals);
|
||||
uint8_t* cur_literal = literals;
|
||||
|
||||
for (size_t i = 0; i < num_programs; ++i) {
|
||||
// Step 4
|
||||
size_t k = DIST(randomx_riscv64_vector_sshash_cache_prefetch, randomx_riscv64_vector_sshash_xor);
|
||||
memcpy(p, reinterpret_cast<void*>(randomx_riscv64_vector_sshash_cache_prefetch), k);
|
||||
p += k;
|
||||
|
||||
// Step 5
|
||||
for (uint32_t j = 0; j < programs[i].size; ++j) {
|
||||
const uint32_t dst = programs[i].programBuffer[j].dst & 7;
|
||||
const uint32_t src = programs[i].programBuffer[j].src & 7;
|
||||
const uint32_t modShift = (programs[i].programBuffer[j].mod >> 2) & 3;
|
||||
const uint32_t imm32 = programs[i].programBuffer[j].imm32;
|
||||
|
||||
uint32_t inst;
|
||||
#define EMIT(data) inst = (data); memcpy(p, &inst, 4); p += 4
|
||||
|
||||
switch (static_cast<SuperscalarInstructionType>(programs[i].programBuffer[j].opcode)) {
|
||||
case SuperscalarInstructionType::ISUB_R:
|
||||
// 57 00 00 0A vsub.vv v0, v0, v0
|
||||
EMIT(0x0A000057 | (dst << 7) | (src << 15) | (dst << 20));
|
||||
break;
|
||||
|
||||
case SuperscalarInstructionType::IXOR_R:
|
||||
// 57 00 00 2E vxor.vv v0, v0, v0
|
||||
EMIT(0x2E000057 | (dst << 7) | (src << 15) | (dst << 20));
|
||||
break;
|
||||
|
||||
case SuperscalarInstructionType::IADD_RS:
|
||||
if (modShift == 0) {
|
||||
// 57 00 00 02 vadd.vv v0, v0, v0
|
||||
EMIT(0x02000057 | (dst << 7) | (src << 15) | (dst << 20));
|
||||
}
|
||||
else {
|
||||
// 57 39 00 96 vsll.vi v18, v0, 0
|
||||
// 57 00 09 02 vadd.vv v0, v0, v18
|
||||
EMIT(0x96003957 | (modShift << 15) | (src << 20));
|
||||
EMIT(0x02090057 | (dst << 7) | (dst << 20));
|
||||
}
|
||||
break;
|
||||
|
||||
case SuperscalarInstructionType::IMUL_R:
|
||||
// 57 20 00 96 vmul.vv v0, v0, v0
|
||||
EMIT(0x96002057 | (dst << 7) | (src << 15) | (dst << 20));
|
||||
break;
|
||||
|
||||
case SuperscalarInstructionType::IROR_C:
|
||||
{
|
||||
#ifdef __riscv_zvkb
|
||||
// 57 30 00 52 vror.vi v0, v0, 0
|
||||
EMIT(0x52003057 | (dst << 7) | (dst << 20) | ((imm32 & 31) << 15) | ((imm32 & 32) << 21));
|
||||
#else // __riscv_zvkb
|
||||
const uint32_t shift_right = imm32 & 63;
|
||||
const uint32_t shift_left = 64 - shift_right;
|
||||
|
||||
if (shift_right < 32) {
|
||||
// 57 39 00 A2 vsrl.vi v18, v0, 0
|
||||
EMIT(0xA2003957 | (shift_right << 15) | (dst << 20));
|
||||
}
|
||||
else {
|
||||
// 93 02 00 00 li x5, 0
|
||||
// 57 C9 02 A2 vsrl.vx v18, v0, x5
|
||||
EMIT(0x00000293 | (shift_right << 20));
|
||||
EMIT(0xA202C957 | (dst << 20));
|
||||
}
|
||||
|
||||
if (shift_left < 32) {
|
||||
// 57 30 00 96 vsll.vi v0, v0, 0
|
||||
EMIT(0x96003057 | (dst << 7) | (shift_left << 15) | (dst << 20));
|
||||
}
|
||||
else {
|
||||
// 93 02 00 00 li x5, 0
|
||||
// 57 C0 02 96 vsll.vx v0, v0, x5
|
||||
EMIT(0x00000293 | (shift_left << 20));
|
||||
EMIT(0x9602C057 | (dst << 7) | (dst << 20));
|
||||
}
|
||||
|
||||
// 57 00 20 2B vor.vv v0, v18, v0
|
||||
EMIT(0x2B200057 | (dst << 7) | (dst << 15));
|
||||
#endif // __riscv_zvkb
|
||||
}
|
||||
break;
|
||||
|
||||
case SuperscalarInstructionType::IADD_C7:
|
||||
case SuperscalarInstructionType::IADD_C8:
|
||||
case SuperscalarInstructionType::IADD_C9:
|
||||
// B7 02 00 00 lui x5, 0
|
||||
// 9B 82 02 00 addiw x5, x5, 0
|
||||
// 57 C0 02 02 vadd.vx v0, v0, x5
|
||||
EMIT(0x000002B7 | ((imm32 + ((imm32 & 0x800) << 1)) & 0xFFFFF000));
|
||||
EMIT(0x0002829B | ((imm32 & 0x00000FFF) << 20));
|
||||
EMIT(0x0202C057 | (dst << 7) | (dst << 20));
|
||||
break;
|
||||
|
||||
case SuperscalarInstructionType::IXOR_C7:
|
||||
case SuperscalarInstructionType::IXOR_C8:
|
||||
case SuperscalarInstructionType::IXOR_C9:
|
||||
// B7 02 00 00 lui x5, 0
|
||||
// 9B 82 02 00 addiw x5, x5, 0
|
||||
// 57 C0 02 2E vxor.vx v0, v0, x5
|
||||
EMIT(0x000002B7 | ((imm32 + ((imm32 & 0x800) << 1)) & 0xFFFFF000));
|
||||
EMIT(0x0002829B | ((imm32 & 0x00000FFF) << 20));
|
||||
EMIT(0x2E02C057 | (dst << 7) | (dst << 20));
|
||||
break;
|
||||
|
||||
case SuperscalarInstructionType::IMULH_R:
|
||||
// 57 20 00 92 vmulhu.vv v0, v0, v0
|
||||
EMIT(0x92002057 | (dst << 7) | (src << 15) | (dst << 20));
|
||||
break;
|
||||
|
||||
case SuperscalarInstructionType::ISMULH_R:
|
||||
// 57 20 00 9E vmulh.vv v0, v0, v0
|
||||
EMIT(0x9E002057 | (dst << 7) | (src << 15) | (dst << 20));
|
||||
break;
|
||||
|
||||
case SuperscalarInstructionType::IMUL_RCP:
|
||||
{
|
||||
uint32_t offset = cur_literal - literals;
|
||||
|
||||
if (offset == 2040) {
|
||||
literals += 2040;
|
||||
offset = 0;
|
||||
|
||||
// 93 87 87 7F add x15, x15, 2040
|
||||
EMIT(0x7F878793);
|
||||
}
|
||||
|
||||
const uint64_t r = randomx_reciprocal_fast(imm32);
|
||||
memcpy(cur_literal, &r, 8);
|
||||
cur_literal += 8;
|
||||
|
||||
// 83 B2 07 00 ld x5, (x15)
|
||||
// 57 E0 02 96 vmul.vx v0, v0, x5
|
||||
EMIT(0x0007B283 | (offset << 20));
|
||||
EMIT(0x9602E057 | (dst << 7) | (dst << 20));
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
UNREACHABLE;
|
||||
}
|
||||
}
|
||||
|
||||
// Step 6
|
||||
k = DIST(randomx_riscv64_vector_sshash_xor, randomx_riscv64_vector_sshash_end);
|
||||
memcpy(p, reinterpret_cast<void*>(randomx_riscv64_vector_sshash_xor), k);
|
||||
p += k;
|
||||
|
||||
// Step 7. Set cacheIndex to the value of the register that has the longest dependency chain in the SuperscalarHash function executed in step 5.
|
||||
if (i + 1 < num_programs) {
|
||||
// vmv.v.v v9, v0 + programs[i].getAddressRegister()
|
||||
const uint32_t t = 0x5E0004D7 + (static_cast<uint32_t>(programs[i].getAddressRegister()) << 15);
|
||||
memcpy(p, &t, 4);
|
||||
p += 4;
|
||||
}
|
||||
}
|
||||
|
||||
// Emit "J randomx_riscv64_vector_sshash_generated_instructions_end" instruction
|
||||
const uint8_t* e = buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_sshash_generated_instructions_end);
|
||||
const uint32_t k = e - p;
|
||||
const uint32_t j = 0x6F | ((k & 0x7FE) << 20) | ((k & 0x800) << 9) | (k & 0xFF000);
|
||||
memcpy(p, &j, 4);
|
||||
|
||||
char* result = (char*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_sshash_dataset_init));
|
||||
|
||||
#ifdef __GNUC__
|
||||
__builtin___clear_cache(result, (char*)(buf + DIST(randomx_riscv64_vector_sshash_begin, randomx_riscv64_vector_sshash_end)));
|
||||
#endif
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
#define emit16(value) { const uint16_t t = value; memcpy(p, &t, 2); p += 2; }
|
||||
#define emit32(value) { const uint32_t t = value; memcpy(p, &t, 4); p += 4; }
|
||||
#define emit64(value) { const uint64_t t = value; memcpy(p, &t, 8); p += 8; }
|
||||
#define emit_data(arr) { memcpy(p, arr, sizeof(arr)); p += sizeof(arr); }
|
||||
|
||||
static void imm_to_x5(uint32_t imm, uint8_t*& p)
|
||||
{
|
||||
const uint32_t imm_hi = (imm + ((imm & 0x800) << 1)) & 0xFFFFF000U;
|
||||
const uint32_t imm_lo = imm & 0x00000FFFU;
|
||||
|
||||
if (imm_hi == 0) {
|
||||
// li x5, imm_lo
|
||||
emit32(0x00000293 + (imm_lo << 20));
|
||||
return;
|
||||
}
|
||||
|
||||
if (imm_lo == 0) {
|
||||
// lui x5, imm_hi
|
||||
emit32(0x000002B7 + imm_hi);
|
||||
return;
|
||||
}
|
||||
|
||||
if (imm_hi < (32 << 12)) {
|
||||
//c.lui x5, imm_hi
|
||||
emit16(0x6281 + (imm_hi >> 10));
|
||||
}
|
||||
else {
|
||||
// lui x5, imm_hi
|
||||
emit32(0x000002B7 + imm_hi);
|
||||
}
|
||||
|
||||
// addiw x5, x5, imm_lo
|
||||
emit32(0x0002829B | (imm_lo << 20));
|
||||
}
|
||||
|
||||
// Emits machine code (at p) that loads one 64-bit scratchpad word into x5.
//
// src, dst - RandomX integer register indices (mapped to x20-x27)
// mod, imm - fields of the RandomX instruction being compiled
// p        - JIT output pointer, advanced by emit16/emit32
//
// Two addressing cases (per the RandomX spec):
//  * src == dst: the address is imm itself, masked with the L3 mask; the
//    scratchpad base lives in x12.
//  * src != dst: the address is (r[src] + signextend(imm)) masked with the
//    L1 or L2 mask (selected by mod), plus the scratchpad base in x12.
static void loadFromScratchpad(uint32_t src, uint32_t dst, uint32_t mod, uint32_t imm, uint8_t*& p)
{
    if (src == dst) {
        imm &= RandomX_CurrentConfig.ScratchpadL3Mask_Calculated;

        // Pick the shortest sequence that reaches offset "imm" from x12.
        if (imm <= 2047) {
            // Fits the 12-bit signed load offset directly.
            // ld x5, imm(x12)
            emit32(0x00063283 | (imm << 20));
        }
        else if (imm <= 2047 * 2) {
            // Split the offset across an addi and the load displacement.
            // addi x5, x12, 2047
            emit32(0x7FF60293);
            // ld x5, (imm - 2047)(x5)
            emit32(0x0002B283 | ((imm - 2047) << 20));
        }
        else {
            // General case: lui + add + 12-bit displacement. The
            // "(imm & 0x800) << 1" term bumps the upper immediate by one page
            // to compensate for the sign-extension of the low 12 bits in ld.
            // lui x5, imm & 0xFFFFF000U
            emit32(0x000002B7 | ((imm + ((imm & 0x800) << 1)) & 0xFFFFF000U));
            // c.add x5, x12
            emit16(0x92B2);
            // ld x5, (imm & 0xFFF)(x5)
            emit32(0x0002B283 | ((imm & 0xFFF) << 20));
        }

        return;
    }

    uint32_t shift = 32;
    uint32_t mask_reg;

    // mod selects the scratchpad level: x17 holds the L2 mask, x16 the L1 mask
    // (register layout documented in jit_compiler_rv64_vector_static.S).
    if ((mod & 3) == 0) {
        shift -= RandomX_CurrentConfig.Log2_ScratchpadL2;
        mask_reg = 17;
    }
    else {
        shift -= RandomX_CurrentConfig.Log2_ScratchpadL1;
        mask_reg = 16;
    }

    // Sign-extend imm from the scratchpad-level width to 32 bits.
    imm = static_cast<uint32_t>(static_cast<int32_t>(imm << shift) >> shift);

    // 0-0x7FF, 0xFFFFF800-0xFFFFFFFF fit into 12 bit (a single addi instruction).
    // The unsigned subtraction wraps, so one compare covers both ranges.
    if (imm - 0xFFFFF800U < 0x1000U) {
        // addi x5, x20 + src, imm
        emit32(0x000A0293 + (src << 15) + (imm << 20));
    }
    else {
        // Materialize the full 32-bit immediate in x5 first.
        imm_to_x5(imm, p);
        // c.add x5, x20 + src
        emit16(0x92D2 + (src << 2));
    }

    // and x5, x5, mask_reg
    emit32(0x0002F2B3 + (mask_reg << 20));
    // c.add x5, x12
    emit16(0x92B2);
    // ld x5, 0(x5)
    emit32(0x0002B283);
}
|
||||
|
||||
// JIT-compiles a RandomX program for RV64 with the Vector extension.
//
// buf                 - writable copy of the precompiled template from
//                       jit_compiler_rv64_vector_static.S; patched in place
// prog                - the RandomX program to compile
// pcfg                - per-program configuration (readReg0..readReg3)
// inst_map            - maps the 256 opcode bytes to InstructionType values
// entryDataInitScalar - non-null in light mode (dataset items computed on
//                       the fly instead of read from a full dataset)
// datasetOffset       - dataset item offset, stored alongside the light-mode
//                       entry pointer
//
// Returns a pointer to the program entry point inside buf.
void* generateProgramVectorRV64(uint8_t* buf, Program& prog, ProgramConfiguration& pcfg, const uint8_t (&inst_map)[256], void* entryDataInitScalar, uint32_t datasetOffset)
{
    // Patch the runtime address masks read by the template (see
    // randomx_masks in jit_compiler_rv64_vector_static.S).
    uint64_t* params = (uint64_t*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_params));

    params[0] = RandomX_CurrentConfig.ScratchpadL1_Size - 8;
    params[1] = RandomX_CurrentConfig.ScratchpadL2_Size - 8;
    params[2] = RandomX_CurrentConfig.ScratchpadL3_Size - 8;
    params[3] = RandomX_CurrentConfig.DatasetBaseSize - 64;
    params[4] = (1 << RandomX_ConfigurationBase::JumpBits) - 1;

    // IMUL_RCP reciprocals are appended here as they are encountered.
    uint64_t* imul_rcp_literals = (uint64_t*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_imul_rcp_literals));
    uint64_t* cur_literal = imul_rcp_literals;

    // Patch the instructions that XOR the per-program read registers.
    uint32_t* spaddr_xor = (uint32_t*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_main_loop_spaddr_xor));
    uint32_t* spaddr_xor2 = (uint32_t*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_scratchpad_prefetch));
    uint32_t* mx_xor = (uint32_t*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_main_loop_mx_xor));
    uint32_t* mx_xor_light = (uint32_t*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_main_loop_mx_xor_light_mode));

    *spaddr_xor = 0x014A47B3 + (pcfg.readReg0 << 15) + (pcfg.readReg1 << 20); // xor x15, readReg0, readReg1
    *spaddr_xor2 = 0x014A42B3 + (pcfg.readReg0 << 15) + (pcfg.readReg1 << 20); // xor x5, readReg0, readReg1
    const uint32_t mx_xor_value = 0x014A42B3 + (pcfg.readReg2 << 15) + (pcfg.readReg3 << 20); // xor x5, readReg2, readReg3

    *mx_xor = mx_xor_value;
    *mx_xor_light = mx_xor_value;

    if (entryDataInitScalar) {
        // Light mode: store the scalar dataset-init entry point and the
        // dataset offset where the template's light-mode path can load them.
        void* light_mode_data = buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_main_loop_light_mode_data);

        const uint64_t data[2] = { reinterpret_cast<uint64_t>(entryDataInitScalar), datasetOffset };
        memcpy(light_mode_data, &data, sizeof(data));
    }

    // JIT output pointer: program instructions are emitted into the main loop.
    uint8_t* p = (uint8_t*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_main_loop_instructions));

    // Converts the 64-bit integer in x5 to a broadcast double in v16
    // (used by the F-group memory instructions):
    // 57C8025E vmv.v.x v16, x5
    // 57A9034B vsext.vf2 v18, v16
    // 5798214B vfcvt.f.x.v v16, v18
    static constexpr uint8_t group_f_convert[] = {
        0x57, 0xC8, 0x02, 0x5E, 0x57, 0xA9, 0x03, 0x4B, 0x57, 0x98, 0x21, 0x4B
    };

    // Applies the E-group exponent masks to v16 (v12 = 'and' mask, v13 = 'or' mask):
    // 57080627 vand.vv v16, v16, v12
    // 5788062B vor.vv v16, v16, v13
    static constexpr uint8_t group_e_post_process[] = { 0x57, 0x08, 0x06, 0x27, 0x57, 0x88, 0x06, 0x2B };

    // For each integer register: code position just after its last write.
    // CBRANCH uses this as its backward branch target.
    uint8_t* last_modified[RegistersCount] = { p, p, p, p, p, p, p, p };

    for (uint32_t i = 0, n = prog.getSize(); i < n; ++i) {
        Instruction instr = prog(i);

        uint32_t src = instr.src % RegistersCount;
        uint32_t dst = instr.dst % RegistersCount;
        const uint32_t shift = instr.getModShift();
        uint32_t imm = instr.getImm32();
        const uint32_t mod = instr.mod;

        // Integer registers r0-r7 live in x20-x27; F/E/A groups in v0-v11.
        switch (static_cast<InstructionType>(inst_map[instr.opcode])) {
        case InstructionType::IADD_RS:
            if (shift == 0) {
                // c.add x20 + dst, x20 + src
                emit16(0x9A52 + (src << 2) + (dst << 7));
            }
            else {
#ifdef __riscv_zba
                // sh{shift}add x20 + dst, x20 + src, x20 + dst
                emit32(0x214A0A33 + (shift << 13) + (dst << 7) + (src << 15) + (dst << 20));
#else // __riscv_zba
                // slli x5, x20 + src, shift
                emit32(0x000A1293 + (src << 15) + (shift << 20));
                // c.add x20 + dst, x5
                emit16(0x9A16 + (dst << 7));
#endif // __riscv_zba
            }
            if (dst == RegisterNeedsDisplacement) {
                // Per the RandomX spec, r5 as destination also adds imm.
                imm_to_x5(imm, p);

                // c.add x20 + dst, x5
                emit16(0x9A16 + (dst << 7));
            }

            last_modified[dst] = p;
            break;

        case InstructionType::IADD_M:
            loadFromScratchpad(src, dst, mod, imm, p);
            // c.add x20 + dst, x5
            emit16(0x9A16 + (dst << 7));

            last_modified[dst] = p;
            break;

        case InstructionType::ISUB_R:
            if (src != dst) {
                // sub x20 + dst, x20 + dst, x20 + src
                emit32(0x414A0A33 + (dst << 7) + (dst << 15) + (src << 20));
            }
            else {
                // src == dst: subtract imm by adding its negation.
                imm_to_x5(-imm, p);
                // c.add x20 + dst, x5
                emit16(0x9A16 + (dst << 7));
            }

            last_modified[dst] = p;
            break;

        case InstructionType::ISUB_M:
            loadFromScratchpad(src, dst, mod, imm, p);
            // sub x20 + dst, x20 + dst, x5
            emit32(0x405A0A33 + (dst << 7) + (dst << 15));

            last_modified[dst] = p;
            break;

        case InstructionType::IMUL_R:
            if (src != dst) {
                // mul x20 + dst, x20 + dst, x20 + src
                emit32(0x034A0A33 + (dst << 7) + (dst << 15) + (src << 20));
            }
            else {
                // src == dst: multiply by the immediate instead.
                imm_to_x5(imm, p);
                // mul x20 + dst, x20 + dst, x5
                emit32(0x025A0A33 + (dst << 7) + (dst << 15));
            }

            last_modified[dst] = p;
            break;

        case InstructionType::IMUL_M:
            loadFromScratchpad(src, dst, mod, imm, p);
            // mul x20 + dst, x20 + dst, x5
            emit32(0x025A0A33 + (dst << 7) + (dst << 15));

            last_modified[dst] = p;
            break;

        case InstructionType::IMULH_R:
            // mulhu x20 + dst, x20 + dst, x20 + src
            emit32(0x034A3A33 + (dst << 7) + (dst << 15) + (src << 20));

            last_modified[dst] = p;
            break;

        case InstructionType::IMULH_M:
            loadFromScratchpad(src, dst, mod, imm, p);
            // mulhu x20 + dst, x20 + dst, x5
            emit32(0x025A3A33 + (dst << 7) + (dst << 15));

            last_modified[dst] = p;
            break;

        case InstructionType::ISMULH_R:
            // mulh x20 + dst, x20 + dst, x20 + src
            emit32(0x034A1A33 + (dst << 7) + (dst << 15) + (src << 20));

            last_modified[dst] = p;
            break;

        case InstructionType::ISMULH_M:
            loadFromScratchpad(src, dst, mod, imm, p);
            // mulh x20 + dst, x20 + dst, x5
            emit32(0x025A1A33 + (dst << 7) + (dst << 15));

            last_modified[dst] = p;
            break;

        case InstructionType::IMUL_RCP:
            // Zero or a power of 2 is a no-op per the RandomX spec.
            if (!isZeroOrPowerOf2(imm)) {
                const uint64_t offset = (cur_literal - imul_rcp_literals) * 8;
                *(cur_literal++) = randomx_reciprocal_fast(imm);

                // The first 26 reciprocal literals are cached in spare
                // integer/FP registers by the template; the rest stay in
                // memory and are loaded through x18.
                static constexpr uint32_t rcp_regs[26] = {
                    /* Integer */ 8, 10, 28, 29, 30, 31,
                    /* Float */ 0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15, 16, 17, 28, 29, 30, 31
                };

                if (offset < 6 * 8) {
                    // mul x20 + dst, x20 + dst, rcp_reg
                    emit32(0x020A0A33 + (dst << 7) + (dst << 15) + (rcp_regs[offset / 8] << 20));
                }
                else if (offset < 26 * 8) {
                    // fmv.x.d x5, rcp_reg
                    emit32(0xE20002D3 + (rcp_regs[offset / 8] << 15));
                    // mul x20 + dst, x20 + dst, x5
                    emit32(0x025A0A33 + (dst << 7) + (dst << 15));
                }
                else {
                    // ld x5, offset(x18)
                    emit32(0x00093283 + (offset << 20));
                    // mul x20 + dst, x20 + dst, x5
                    emit32(0x025A0A33 + (dst << 7) + (dst << 15));
                }

                last_modified[dst] = p;
            }
            break;

        case InstructionType::INEG_R:
            // sub x20 + dst, x0, x20 + dst
            emit32(0x41400A33 + (dst << 7) + (dst << 20));

            last_modified[dst] = p;
            break;

        case InstructionType::IXOR_R:
            if (src != dst) {
                // xor x20 + dst, x20 + dst, x20 + src
                emit32(0x014A4A33 + (dst << 7) + (dst << 15) + (src << 20));
            }
            else {
                // src == dst: XOR with the immediate instead.
                imm_to_x5(imm, p);
                // xor x20 + dst, x20 + dst, x5
                emit32(0x005A4A33 + (dst << 7) + (dst << 15));
            }

            last_modified[dst] = p;
            break;

        case InstructionType::IXOR_M:
            loadFromScratchpad(src, dst, mod, imm, p);
            // xor x20 + dst, x20 + dst, x5
            emit32(0x005A4A33 + (dst << 7) + (dst << 15));

            last_modified[dst] = p;
            break;

#ifdef __riscv_zbb
        case InstructionType::IROR_R:
            if (src != dst) {
                // ror x20 + dst, x20 + dst, x20 + src
                emit32(0x614A5A33 + (dst << 7) + (dst << 15) + (src << 20));
            }
            else {
                // rori x20 + dst, x20 + dst, imm
                emit32(0x600A5A13 + (dst << 7) + (dst << 15) + ((imm & 63) << 20));
            }

            last_modified[dst] = p;
            break;

        case InstructionType::IROL_R:
            if (src != dst) {
                // rol x20 + dst, x20 + dst, x20 + src
                emit32(0x614A1A33 + (dst << 7) + (dst << 15) + (src << 20));
            }
            else {
                // Left rotate by imm == right rotate by -imm.
                // rori x20 + dst, x20 + dst, -imm
                emit32(0x600A5A13 + (dst << 7) + (dst << 15) + ((-imm & 63) << 20));
            }

            last_modified[dst] = p;
            break;
#else // __riscv_zbb
        // Without Zbb, rotates are synthesized from shift + shift + or.
        case InstructionType::IROR_R:
            if (src != dst) {
                // sub x5, x0, x20 + src
                emit32(0x414002B3 + (src << 20));
                // srl x6, x20 + dst, x20 + src
                emit32(0x014A5333 + (dst << 15) + (src << 20));
                // sll x20 + dst, x20 + dst, x5
                emit32(0x005A1A33 + (dst << 7) + (dst << 15));
                // or x20 + dst, x20 + dst, x6
                emit32(0x006A6A33 + (dst << 7) + (dst << 15));
            }
            else {
                // srli x5, x20 + dst, imm
                emit32(0x000A5293 + (dst << 15) + ((imm & 63) << 20));
                // slli x6, x20 + dst, -imm
                emit32(0x000A1313 + (dst << 15) + ((-imm & 63) << 20));
                // or x20 + dst, x5, x6
                emit32(0x0062EA33 + (dst << 7));
            }

            last_modified[dst] = p;
            break;

        case InstructionType::IROL_R:
            if (src != dst) {
                // sub x5, x0, x20 + src
                emit32(0x414002B3 + (src << 20));
                // sll x6, x20 + dst, x20 + src
                emit32(0x014A1333 + (dst << 15) + (src << 20));
                // srl x20 + dst, x20 + dst, x5
                emit32(0x005A5A33 + (dst << 7) + (dst << 15));
                // or x20 + dst, x20 + dst, x6
                emit32(0x006A6A33 + (dst << 7) + (dst << 15));
            }
            else {
                // srli x5, x20 + dst, -imm
                emit32(0x000A5293 + (dst << 15) + ((-imm & 63) << 20));
                // slli x6, x20 + dst, imm
                emit32(0x000A1313 + (dst << 15) + ((imm & 63) << 20));
                // or x20 + dst, x5, x6
                emit32(0x0062EA33 + (dst << 7));
            }

            last_modified[dst] = p;
            break;
#endif // __riscv_zbb

        case InstructionType::ISWAP_R:
            if (src != dst) {
                // Three-move swap through x5.
                // c.mv x5, x20 + dst
                emit16(0x82D2 + (dst << 2));
                // c.mv x20 + dst, x20 + src
                emit16(0x8A52 + (src << 2) + (dst << 7));
                // c.mv x20 + src, x5
                emit16(0x8A16 + (src << 7));

                last_modified[src] = p;
                last_modified[dst] = p;
            }
            break;

        case InstructionType::FSWAP_R:
            // Rotate the 2-element vector: move element 0 out, slide down.
            // vmv.x.s x5, v0 + dst
            emit32(0x420022D7 + (dst << 20));
            // vslide1down.vx v0 + dst, v0 + dst, x5
            emit32(0x3E02E057 + (dst << 7) + (dst << 20));
            break;

        case InstructionType::FADD_R:
            src %= RegisterCountFlt;
            dst %= RegisterCountFlt;

            // vfadd.vv v0 + dst, v0 + dst, v8 + src
            emit32(0x02041057 + (dst << 7) + (src << 15) + (dst << 20));
            break;

        case InstructionType::FADD_M:
            dst %= RegisterCountFlt;

            // src == RegistersCount forces the src != dst addressing path.
            loadFromScratchpad(src, RegistersCount, mod, imm, p);
            emit_data(group_f_convert);

            // vfadd.vv v0 + dst, v0 + dst, v16
            emit32(0x02081057 + (dst << 7) + (dst << 20));
            break;

        case InstructionType::FSUB_R:
            src %= RegisterCountFlt;
            dst %= RegisterCountFlt;

            // vfsub.vv v0 + dst, v0 + dst, v8 + src
            emit32(0x0A041057 + (dst << 7) + (src << 15) + (dst << 20));
            break;

        case InstructionType::FSUB_M:
            dst %= RegisterCountFlt;

            loadFromScratchpad(src, RegistersCount, mod, imm, p);
            emit_data(group_f_convert);

            // vfsub.vv v0 + dst, v0 + dst, v16
            emit32(0x0A081057 + (dst << 7) + (dst << 20));
            break;

        case InstructionType::FSCAL_R:
            dst %= RegisterCountFlt;

            // v14 = scale mask (flips sign + exponent bits).
            // vxor.vv v0 + dst, v0 + dst, v14
            emit32(0x2E070057 + (dst << 7) + (dst << 20));
            break;

        case InstructionType::FMUL_R:
            src %= RegisterCountFlt;
            dst %= RegisterCountFlt;

            // vfmul.vv v4 + dst, v4 + dst, v8 + src
            emit32(0x92441257 + (dst << 7) + (src << 15) + (dst << 20));
            break;

        case InstructionType::FDIV_M:
            dst %= RegisterCountFlt;

            loadFromScratchpad(src, RegistersCount, mod, imm, p);
            emit_data(group_f_convert);
            emit_data(group_e_post_process);

            // vfdiv.vv v4 + dst, v4 + dst, v16
            emit32(0x82481257 + (dst << 7) + (dst << 20));
            break;

        case InstructionType::FSQRT_R:
            dst %= RegisterCountFlt;

            // vfsqrt.v v4 + dst, v4 + dst
            emit32(0x4E401257 + (dst << 7) + (dst << 20));
            break;

        case InstructionType::CBRANCH:
            {
                // Note: shadows the outer "shift" (getModShift) on purpose.
                const uint32_t shift = (mod >> 4) + RandomX_ConfigurationBase::JumpOffset;

                imm |= (1UL << shift);

                // Guard against the undefined shift by -1 when shift == 0.
                if (RandomX_ConfigurationBase::JumpOffset > 0 || shift > 0) {
                    imm &= ~(1UL << (shift - 1));
                }

                // slli x6, x7, shift
                // x6 = branchMask
                emit32(0x00039313 + (shift << 20));

                // x5 = imm
                imm_to_x5(imm, p);

                // c.add x20 + dst, x5
                emit16(0x9A16 + (dst << 7));

                // and x5, x20 + dst, x6
                emit32(0x006A72B3 + (dst << 15));

                // Branch target: just after the last instruction that wrote dst.
                const int offset = static_cast<int>(last_modified[dst] - p);

                // B-type range is +/-4 KiB; the base opcodes below already
                // have the immediate sign bit set (offset is always <= 0 here).
                if (offset >= -4096) {
                    // beqz x5, offset
                    const uint32_t k = static_cast<uint32_t>(offset);
                    emit32(0x80028063 | ((k & 0x1E) << 7) | ((k & 0x7E0) << 20) | ((k & 0x800) >> 4));
                }
                else {
                    // Out of B-type range: invert the test and use a J-type jump.
                    // bnez x5, 8
                    emit32(0x00029463);
                    // j offset
                    const uint32_t k = static_cast<uint32_t>(offset - 4);
                    emit32(0x8000006F | ((k & 0x7FE) << 20) | ((k & 0x800) << 9) | (k & 0xFF000));
                }

                // A taken branch re-executes everything after this point, so
                // all registers count as modified here (per the RandomX spec).
                for (uint32_t j = 0; j < RegistersCount; ++j) {
                    last_modified[j] = p;
                }
            }
            break;

        case InstructionType::CFROUND:
            // Extract bits (imm-1, imm) of r[src] via a rotate right by imm - 1.
            if ((imm - 1) & 63) {
#ifdef __riscv_zbb
                // rori x5, x20 + src, imm - 1
                emit32(0x600A5293 + (src << 15) + (((imm - 1) & 63) << 20));
#else // __riscv_zbb
                // srli x5, x20 + src, imm - 1
                emit32(0x000A5293 + (src << 15) + (((imm - 1) & 63) << 20));
                // slli x6, x20 + src, 1 - imm
                emit32(0x000A1313 + (src << 15) + (((1 - imm) & 63) << 20));
                // or x5, x5, x6
                emit32(0x0062E2B3);
#endif // __riscv_zbb

                // andi x5, x5, 6
                emit32(0x0062F293);
            }
            else {
                // Rotation amount is 0: mask the source directly.
                // andi x5, x20 + src, 6
                emit32(0x006A7293 + (src << 15));
            }

            // Translate the RandomX rounding mode to the RISC-V frm encoding
            // via a 2-bit-per-entry lookup table held in x6.
            // li x6, 01111000b
            // x6 = CFROUND lookup table
            emit32(0x07800313);
            // srl x5, x6, x5
            emit32(0x005352B3);
            // andi x5, x5, 3
            emit32(0x0032F293);
            // csrw frm, x5
            emit32(0x00229073);
            break;

        case InstructionType::ISTORE:
            {
                uint32_t mask_reg;
                uint32_t shift = 32;

                // Condition >= 14 selects the whole scratchpad (L3), per spec.
                if ((mod >> 4) >= 14) {
                    shift -= RandomX_CurrentConfig.Log2_ScratchpadL3;
                    mask_reg = 1; // x1 = L3 mask
                }
                else {
                    if ((mod & 3) == 0) {
                        shift -= RandomX_CurrentConfig.Log2_ScratchpadL2;
                        mask_reg = 17; // x17 = L2 mask
                    }
                    else {
                        shift -= RandomX_CurrentConfig.Log2_ScratchpadL1;
                        mask_reg = 16; // x16 = L1 mask
                    }
                }

                // Sign-extend imm from the scratchpad-level width.
                imm = static_cast<uint32_t>(static_cast<int32_t>(imm << shift) >> shift);
                imm_to_x5(imm, p);

                // c.add x5, x20 + dst
                emit16(0x92D2 + (dst << 2));
                // and x5, x5, x0 + mask_reg
                emit32(0x0002F2B3 + (mask_reg << 20));
                // c.add x5, x12
                emit16(0x92B2);
                // sd x20 + src, 0(x5)
                emit32(0x0142B023 + (src << 20));
            }
            break;

        default:
            UNREACHABLE;
        }
    }

    // Jump from the end of the generated instructions back into the template
    // (separate exits for fast mode and light mode).
    const uint8_t* e;

    if (entryDataInitScalar) {
        // Emit "J randomx_riscv64_vector_program_main_loop_instructions_end_light_mode" instruction
        e = buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_main_loop_instructions_end_light_mode);
    }
    else {
        // Emit "J randomx_riscv64_vector_program_main_loop_instructions_end" instruction
        e = buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_main_loop_instructions_end);
    }

    // Forward J-type jump (range +/-1 MiB).
    const uint32_t k = e - p;
    emit32(0x6F | ((k & 0x7FE) << 20) | ((k & 0x800) << 9) | (k & 0xFF000));

#ifdef __GNUC__
    // Flush the instruction cache over the patched/generated range.
    char* p1 = (char*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_params));
    char* p2 = (char*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_end));

    __builtin___clear_cache(p1, p2);
#endif

    return buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_begin);
}
|
||||
|
||||
} // namespace randomx
|
||||
45
src/crypto/randomx/jit_compiler_rv64_vector.h
Normal file
45
src/crypto/randomx/jit_compiler_rv64_vector.h
Normal file
@@ -0,0 +1,45 @@
|
||||
/*
|
||||
Copyright (c) 2018-2020, tevador <tevador@gmail.com>
|
||||
Copyright (c) 2019-2021, XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
||||
Copyright (c) 2025, SChernykh <https://github.com/SChernykh>
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the copyright holder nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
|
||||
namespace randomx {

// Forward declarations; full definitions live elsewhere in the RandomX sources.
class SuperscalarProgram;
struct ProgramConfiguration;
class Program;

// Generates RVV (RISC-V Vector) dataset initialization code into "buf" from
// the given SuperscalarHash programs. Presumably returns the entry point of
// the generated routine — see the implementation for details.
void* generateDatasetInitVectorRV64(uint8_t* buf, SuperscalarProgram* programs, size_t num_programs);

// JIT-compiles the RandomX program "prog" into the precompiled RVV template
// held in "buf" and returns a pointer to the program entry point inside
// "buf". "inst_map" maps opcode bytes to instruction types;
// "entryDataInitScalar" is non-null in light mode (with "datasetOffset"
// giving the dataset item offset).
void* generateProgramVectorRV64(uint8_t* buf, Program& prog, ProgramConfiguration& pcfg, const uint8_t (&inst_map)[256], void* entryDataInitScalar, uint32_t datasetOffset);

} // namespace randomx
|
||||
869
src/crypto/randomx/jit_compiler_rv64_vector_static.S
Normal file
869
src/crypto/randomx/jit_compiler_rv64_vector_static.S
Normal file
@@ -0,0 +1,869 @@
|
||||
/*
|
||||
Copyright (c) 2018-2020, tevador <tevador@gmail.com>
|
||||
Copyright (c) 2019-2021, XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
||||
Copyright (c) 2025, SChernykh <https://github.com/SChernykh>
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the copyright holder nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "configuration.h"
|
||||
|
||||
// Compatibility macros
|
||||
|
||||
#if !defined(RANDOMX_CACHE_ACCESSES) && defined(RANDOMX_CACHE_MAX_ACCESSES)
|
||||
#define RANDOMX_CACHE_ACCESSES RANDOMX_CACHE_MAX_ACCESSES
|
||||
#endif
|
||||
|
||||
#if defined(RANDOMX_ARGON_MEMORY)
|
||||
#define RANDOMX_CACHE_MASK RANDOMX_ARGON_MEMORY * 1024 / 64 - 1
|
||||
#elif defined(RANDOMX_CACHE_MAX_SIZE)
|
||||
#define RANDOMX_CACHE_MASK RANDOMX_CACHE_MAX_SIZE / 64 - 1
|
||||
#endif
|
||||
|
||||
#define DECL(x) x
|
||||
|
||||
.text
|
||||
|
||||
#ifndef __riscv_v
|
||||
#error This file requires rv64gcv
|
||||
#endif
|
||||
|
||||
.option pic
|
||||
|
||||
.global DECL(randomx_riscv64_vector_code_begin)
|
||||
|
||||
.global DECL(randomx_riscv64_vector_sshash_begin)
|
||||
.global DECL(randomx_riscv64_vector_sshash_imul_rcp_literals)
|
||||
.global DECL(randomx_riscv64_vector_sshash_dataset_init)
|
||||
.global DECL(randomx_riscv64_vector_sshash_generated_instructions)
|
||||
.global DECL(randomx_riscv64_vector_sshash_generated_instructions_end)
|
||||
.global DECL(randomx_riscv64_vector_sshash_cache_prefetch)
|
||||
.global DECL(randomx_riscv64_vector_sshash_xor)
|
||||
.global DECL(randomx_riscv64_vector_sshash_end)
|
||||
|
||||
.global DECL(randomx_riscv64_vector_program_params)
|
||||
.global DECL(randomx_riscv64_vector_program_imul_rcp_literals)
|
||||
.global DECL(randomx_riscv64_vector_program_begin)
|
||||
.global DECL(randomx_riscv64_vector_program_main_loop_instructions)
|
||||
.global DECL(randomx_riscv64_vector_program_main_loop_instructions_end)
|
||||
.global DECL(randomx_riscv64_vector_program_main_loop_mx_xor)
|
||||
.global DECL(randomx_riscv64_vector_program_main_loop_spaddr_xor)
|
||||
|
||||
.global DECL(randomx_riscv64_vector_program_main_loop_light_mode_data)
|
||||
.global DECL(randomx_riscv64_vector_program_main_loop_instructions_end_light_mode)
|
||||
.global DECL(randomx_riscv64_vector_program_main_loop_mx_xor_light_mode)
|
||||
.global DECL(randomx_riscv64_vector_program_scratchpad_prefetch)
|
||||
|
||||
.global DECL(randomx_riscv64_vector_program_end)
|
||||
|
||||
.global DECL(randomx_riscv64_vector_code_end)
|
||||
|
||||
.balign 8
|
||||
|
||||
DECL(randomx_riscv64_vector_code_begin):
|
||||
|
||||
DECL(randomx_riscv64_vector_sshash_begin):
|
||||
|
||||
sshash_constant_0: .dword 6364136223846793005
|
||||
sshash_constant_1: .dword 9298411001130361340
|
||||
sshash_constant_2: .dword 12065312585734608966
|
||||
sshash_constant_3: .dword 9306329213124626780
|
||||
sshash_constant_4: .dword 5281919268842080866
|
||||
sshash_constant_5: .dword 10536153434571861004
|
||||
sshash_constant_6: .dword 3398623926847679864
|
||||
sshash_constant_7: .dword 9549104520008361294
|
||||
sshash_offsets: .dword 0,1,2,3
|
||||
store_offsets: .dword 0,64,128,192
|
||||
|
||||
DECL(randomx_riscv64_vector_sshash_imul_rcp_literals): .fill 512,8,0
|
||||
|
||||
/*
|
||||
Reference: https://github.com/tevador/RandomX/blob/master/doc/specs.md#73-dataset-block-generation
|
||||
|
||||
Register layout
|
||||
---------------
|
||||
x5 = temporary
|
||||
|
||||
x10 = randomx cache
|
||||
x11 = output buffer
|
||||
x12 = startBlock
|
||||
x13 = endBlock
|
||||
|
||||
x14 = cache mask
|
||||
x15 = imul_rcp literal pointer
|
||||
|
||||
v0-v7 = r0-r7
|
||||
v8 = itemNumber
|
||||
v9 = cacheIndex, then a pointer into cache->memory (for prefetch), then a byte offset into cache->memory
|
||||
|
||||
v10-v17 = sshash constants
|
||||
|
||||
v18 = temporary
|
||||
|
||||
v19 = dataset item store offsets
|
||||
*/
|
||||
|
||||
// Dataset initialization: computes 4 dataset items per loop iteration using
// RVV (one item per vector element). Register roles are documented in the
// block comment above. Inputs: x10 = cache, x11 = output, x12 = startBlock,
// x13 = endBlock.
DECL(randomx_riscv64_vector_sshash_dataset_init):
    // Process 4 64-bit values at a time
    vsetivli zero, 4, e64, m1, ta, ma

    // Load cache->memory pointer
    ld x10, (x10)

    // Init cache mask
    li x14, RANDOMX_CACHE_MASK

    // Init dataset item store offsets
    lla x5, store_offsets
    vle64.v v19, (x5)

    // Init itemNumber vector to (startBlock, startBlock + 1, startBlock + 2, startBlock + 3)
    lla x5, sshash_offsets
    vle64.v v8, (x5)
    vadd.vx v8, v8, x12

    // Load constants (stride = x0 = 0, so a 64-bit value will be broadcast into each element of a vector)
    lla x5, sshash_constant_0
    vlse64.v v10, (x5), x0

    lla x5, sshash_constant_1
    vlse64.v v11, (x5), x0

    lla x5, sshash_constant_2
    vlse64.v v12, (x5), x0

    lla x5, sshash_constant_3
    vlse64.v v13, (x5), x0

    lla x5, sshash_constant_4
    vlse64.v v14, (x5), x0

    lla x5, sshash_constant_5
    vlse64.v v15, (x5), x0

    lla x5, sshash_constant_6
    vlse64.v v16, (x5), x0

    lla x5, sshash_constant_7
    vlse64.v v17, (x5), x0

    // Calculate the end pointer for dataset init
    // x13 = output + (endBlock - startBlock) * 64
    sub x13, x13, x12
    slli x13, x13, 6
    add x13, x13, x11

init_item:
    // Step 1. Init r0-r7

    // r0 = (itemNumber + 1) * 6364136223846793005
    // (vmadd: v0 = v0 * v10 + v10 = (itemNumber + 1) * constant_0)
    vmv.v.v v0, v8
    vmadd.vv v0, v10, v10

    // r_i = r0 ^ c_i for i = 1..7
    vxor.vv v1, v0, v11
    vxor.vv v2, v0, v12
    vxor.vv v3, v0, v13
    vxor.vv v4, v0, v14
    vxor.vv v5, v0, v15
    vxor.vv v6, v0, v16
    vxor.vv v7, v0, v17

    // Step 2. Let cacheIndex = itemNumber
    vmv.v.v v9, v8

    // Step 3 is implicit (all iterations are inlined, there is no "i")

    // Init imul_rcp literal pointer
    lla x15, randomx_riscv64_vector_sshash_imul_rcp_literals

DECL(randomx_riscv64_vector_sshash_generated_instructions):
    // Generated by JIT compiler
    //
    // Step 4. randomx_riscv64_vector_sshash_cache_prefetch
    // Step 5. SuperscalarHash[i]
    // Step 6. randomx_riscv64_vector_sshash_xor
    //
    // Above steps will be repeated RANDOMX_CACHE_ACCESSES times
    .fill RANDOMX_CACHE_ACCESSES * 2048, 4, 0

DECL(randomx_riscv64_vector_sshash_generated_instructions_end):
    // Step 9. Concatenate registers r0-r7 in little endian format to get the final Dataset item data.
    // Indexed stores scatter each register's 4 lanes to the 4 items (v19 = item offsets).
    vsuxei64.v v0, (x11), v19

    add x5, x11, 8
    vsuxei64.v v1, (x5), v19

    add x5, x11, 16
    vsuxei64.v v2, (x5), v19

    add x5, x11, 24
    vsuxei64.v v3, (x5), v19

    add x5, x11, 32
    vsuxei64.v v4, (x5), v19

    add x5, x11, 40
    vsuxei64.v v5, (x5), v19

    add x5, x11, 48
    vsuxei64.v v6, (x5), v19

    add x5, x11, 56
    vsuxei64.v v7, (x5), v19

    // Iterate to the next 4 items (4 items * 64 bytes = 256 bytes of output)
    vadd.vi v8, v8, 4
    add x11, x11, 256
    bltu x11, x13, init_item

    ret
|
||||
|
||||
// Step 4. Load a 64-byte item from the Cache. The item index is given by cacheIndex modulo the total number of 64-byte items in Cache.
|
||||
DECL(randomx_riscv64_vector_sshash_cache_prefetch):
    // v9 = convert from cacheIndex to a direct pointer into cache->memory
    // (cacheIndex & mask) * 64 + cache->memory
    vand.vx v9, v9, x14
    vsll.vi v9, v9, 6
    vadd.vx v9, v9, x10

    // Prefetch all 4 lanes' cache lines; without Zicbop a plain load is used
    // as a prefetch substitute (the loaded value is discarded).

    // Prefetch element 0
    vmv.x.s x5, v9
#ifdef __riscv_zicbop
    prefetch.r (x5)
#else
    ld x5, (x5)
#endif

    // Prefetch element 1
    vslidedown.vi v18, v9, 1
    vmv.x.s x5, v18
#ifdef __riscv_zicbop
    prefetch.r (x5)
#else
    ld x5, (x5)
#endif

    // Prefetch element 2
    vslidedown.vi v18, v9, 2
    vmv.x.s x5, v18
#ifdef __riscv_zicbop
    prefetch.r (x5)
#else
    ld x5, (x5)
#endif

    // Prefetch element 3
    vslidedown.vi v18, v9, 3
    vmv.x.s x5, v18
#ifdef __riscv_zicbop
    prefetch.r (x5)
#else
    ld x5, (x5)
#endif

    // v9 = byte offset into cache->memory
    // (vluxei64 below indexes from the x10 base, so drop the base again)
    vsub.vx v9, v9, x10

// Step 6. XOR all registers with data loaded from randomx cache
// Each vluxei64 gathers qword k of the cache item for all 4 lanes
// (base = x10 + 8*k, per-lane offset in v9).
DECL(randomx_riscv64_vector_sshash_xor):
    vluxei64.v v18, (x10), v9
    vxor.vv v0, v0, v18

    add x5, x10, 8
    vluxei64.v v18, (x5), v9
    vxor.vv v1, v1, v18

    add x5, x10, 16
    vluxei64.v v18, (x5), v9
    vxor.vv v2, v2, v18

    add x5, x10, 24
    vluxei64.v v18, (x5), v9
    vxor.vv v3, v3, v18

    add x5, x10, 32
    vluxei64.v v18, (x5), v9
    vxor.vv v4, v4, v18

    add x5, x10, 40
    vluxei64.v v18, (x5), v9
    vxor.vv v5, v5, v18

    add x5, x10, 48
    vluxei64.v v18, (x5), v9
    vxor.vv v6, v6, v18

    add x5, x10, 56
    vluxei64.v v18, (x5), v9
    vxor.vv v7, v7, v18

DECL(randomx_riscv64_vector_sshash_end):
|
||||
|
||||
/*
|
||||
Reference: https://github.com/tevador/RandomX/blob/master/doc/specs.md#46-vm-execution
|
||||
|
||||
C declarations:
|
||||
|
||||
struct RegisterFile {
|
||||
uint64_t r[8];
|
||||
double f[4][2];
|
||||
double e[4][2];
|
||||
double a[4][2];
|
||||
};
|
||||
|
||||
struct MemoryRegisters {
|
||||
uint32_t mx, ma;
|
||||
uint8_t* memory; // dataset (fast mode) or cache (light mode)
|
||||
};
|
||||
|
||||
void ProgramFunc(RegisterFile* reg, MemoryRegisters* mem, uint8_t* scratchpad, uint64_t iterations);
|
||||
|
||||
Register layout
|
||||
---------------
|
||||
x0 = zero
|
||||
x1 = scratchpad L3 mask
|
||||
x2 = stack pointer
|
||||
x3 = global pointer (unused)
|
||||
x4 = thread pointer (unused)
|
||||
x5 = temporary
|
||||
x6 = temporary
|
||||
x7 = branch mask (unshifted)
|
||||
x8 = frame pointer, also 64-bit literal inside the loop
|
||||
x9 = scratchpad L3 mask (64-byte aligned)
|
||||
x10 = RegisterFile* reg, also 64-bit literal inside the loop
|
||||
x11 = MemoryRegisters* mem, then dataset/cache pointer
|
||||
x12 = scratchpad
|
||||
x13 = iterations
|
||||
x14 = mx, ma (always stored with dataset mask applied)
|
||||
x15 = spAddr0, spAddr1
|
||||
x16 = scratchpad L1 mask
|
||||
x17 = scratchpad L2 mask
|
||||
x18 = IMUL_RCP literals pointer
|
||||
x19 = dataset mask
|
||||
x20-x27 = r0-r7
|
||||
x28-x31 = 64-bit literals
|
||||
|
||||
f0-f7 = 64-bit literals
|
||||
f10-f17 = 64-bit literals
|
||||
f28-f31 = 64-bit literals
|
||||
|
||||
v0-v3 = f0-f3
|
||||
v4-v7 = e0-e3
|
||||
v8-v11 = a0-a3
|
||||
v12 = E 'and' mask = 0x00ffffffffffffff'00ffffffffffffff
|
||||
v13 = E 'or' mask = 0x3*00000000******'3*00000000******
|
||||
v14 = scale mask = 0x80f0000000000000'80f0000000000000
|
||||
|
||||
v15 = unused
|
||||
v16 = temporary
|
||||
v17 = unused
|
||||
v18 = temporary
|
||||
|
||||
v19-v31 = unused
|
||||
*/
|
||||
|
||||
.balign 8
|
||||
|
||||
DECL(randomx_riscv64_vector_program_params):
|
||||
|
||||
// JIT compiler will adjust these values for different RandomX variants
|
||||
randomx_masks: .dword 16376, 262136, 2097144, 2147483584, 255
|
||||
|
||||
DECL(randomx_riscv64_vector_program_imul_rcp_literals):
|
||||
|
||||
imul_rcp_literals: .fill RANDOMX_PROGRAM_MAX_SIZE, 8, 0
|
||||
|
||||
DECL(randomx_riscv64_vector_program_begin):
|
||||
addi sp, sp, -112
|
||||
sd x8, 96(sp) // save old frame pointer
|
||||
addi x8, sp, 112 // setup new frame pointer
|
||||
sd x1, 104(sp) // save return address
|
||||
|
||||
// Save callee-saved registers
|
||||
sd x9, 0(sp)
|
||||
sd x18, 8(sp)
|
||||
sd x19, 16(sp)
|
||||
sd x20, 24(sp)
|
||||
sd x21, 32(sp)
|
||||
sd x22, 40(sp)
|
||||
sd x23, 48(sp)
|
||||
sd x24, 56(sp)
|
||||
sd x25, 64(sp)
|
||||
sd x26, 72(sp)
|
||||
sd x27, 80(sp)
|
||||
|
||||
// Save x10 as it will be used as an IMUL_RCP literal
|
||||
sd x10, 88(sp)
|
||||
|
||||
// Load mx, ma and dataset pointer
|
||||
ld x14, (x11)
|
||||
ld x11, 8(x11)
|
||||
|
||||
// Initialize spAddr0-spAddr1
|
||||
mv x15, x14
|
||||
|
||||
// Set registers r0-r7 to zero
|
||||
li x20, 0
|
||||
li x21, 0
|
||||
li x22, 0
|
||||
li x23, 0
|
||||
li x24, 0
|
||||
li x25, 0
|
||||
li x26, 0
|
||||
li x27, 0
|
||||
|
||||
// Load masks
|
||||
lla x5, randomx_masks
|
||||
ld x16, 0(x5)
|
||||
ld x17, 8(x5)
|
||||
ld x1, 16(x5)
|
||||
ld x19, 24(x5)
|
||||
ld x7, 32(x5)
|
||||
addi x9, x1, -56
|
||||
|
||||
// Set vector registers to 2x64 bit
|
||||
vsetivli zero, 2, e64, m1, ta, ma
|
||||
|
||||
// Apply dataset mask to mx, ma
|
||||
slli x5, x19, 32
|
||||
or x5, x5, x19
|
||||
and x14, x14, x5
|
||||
|
||||
// Load group A registers
|
||||
addi x5, x10, 192
|
||||
vle64.v v8, (x5)
|
||||
|
||||
addi x5, x10, 208
|
||||
vle64.v v9, (x5)
|
||||
|
||||
addi x5, x10, 224
|
||||
vle64.v v10, (x5)
|
||||
|
||||
addi x5, x10, 240
|
||||
vle64.v v11, (x5)
|
||||
|
||||
// Load E 'and' mask
|
||||
vmv.v.i v12, -1
|
||||
vsrl.vi v12, v12, 8
|
||||
|
||||
// Load E 'or' mask (stored in reg.f[0])
|
||||
addi x5, x10, 64
|
||||
vle64.v v13, (x5)
|
||||
|
||||
// Load scale mask
|
||||
lui x5, 0x80f00
|
||||
slli x5, x5, 32
|
||||
vmv.v.x v14, x5
|
||||
|
||||
// IMUL_RCP literals pointer
|
||||
lla x18, imul_rcp_literals
|
||||
|
||||
// Load IMUL_RCP literals
|
||||
ld x8, 0(x18)
|
||||
ld x10, 8(x18)
|
||||
ld x28, 16(x18)
|
||||
ld x29, 24(x18)
|
||||
ld x30, 32(x18)
|
||||
ld x31, 40(x18)
|
||||
fld f0, 48(x18)
|
||||
fld f1, 56(x18)
|
||||
fld f2, 64(x18)
|
||||
fld f3, 72(x18)
|
||||
fld f4, 80(x18)
|
||||
fld f5, 88(x18)
|
||||
fld f6, 96(x18)
|
||||
fld f7, 104(x18)
|
||||
fld f10, 112(x18)
|
||||
fld f11, 120(x18)
|
||||
fld f12, 128(x18)
|
||||
fld f13, 136(x18)
|
||||
fld f14, 144(x18)
|
||||
fld f15, 152(x18)
|
||||
fld f16, 160(x18)
|
||||
fld f17, 168(x18)
|
||||
fld f28, 176(x18)
|
||||
fld f29, 184(x18)
|
||||
fld f30, 192(x18)
|
||||
fld f31, 200(x18)
|
||||
|
||||
randomx_riscv64_vector_program_main_loop:
|
||||
and x5, x15, x9 // x5 = spAddr0 & 64-byte aligned L3 mask
|
||||
add x5, x5, x12 // x5 = &scratchpad[spAddr0 & 64-byte aligned L3 mask]
|
||||
|
||||
// read a 64-byte line from scratchpad (indexed by spAddr0) and XOR it with r0-r7
|
||||
ld x6, 0(x5)
|
||||
xor x20, x20, x6
|
||||
ld x6, 8(x5)
|
||||
xor x21, x21, x6
|
||||
ld x6, 16(x5)
|
||||
xor x22, x22, x6
|
||||
ld x6, 24(x5)
|
||||
xor x23, x23, x6
|
||||
ld x6, 32(x5)
|
||||
xor x24, x24, x6
|
||||
ld x6, 40(x5)
|
||||
xor x25, x25, x6
|
||||
ld x6, 48(x5)
|
||||
xor x26, x26, x6
|
||||
ld x6, 56(x5)
|
||||
xor x27, x27, x6
|
||||
|
||||
srli x5, x15, 32 // x5 = spAddr1
|
||||
and x5, x5, x9 // x5 = spAddr1 & 64-byte aligned L3 mask
|
||||
add x5, x5, x12 // x5 = &scratchpad[spAddr1 & 64-byte aligned L3 mask]
|
||||
|
||||
// read a 64-byte line from scratchpad (indexed by spAddr1) and initialize f0-f3, e0-e3 registers
|
||||
|
||||
// Set vector registers to 2x32 bit
|
||||
vsetivli zero, 2, e32, m1, ta, ma
|
||||
|
||||
// load f0
|
||||
vle32.v v16, (x5)
|
||||
vfwcvt.f.x.v v0, v16
|
||||
|
||||
// load f1
|
||||
addi x6, x5, 8
|
||||
vle32.v v1, (x6)
|
||||
// Use v16 as an intermediary register because vfwcvt accepts only registers with even numbers here
|
||||
vfwcvt.f.x.v v16, v1
|
||||
vmv1r.v v1, v16
|
||||
|
||||
// load f2
|
||||
addi x6, x5, 16
|
||||
vle32.v v16, (x6)
|
||||
vfwcvt.f.x.v v2, v16
|
||||
|
||||
// load f3
|
||||
addi x6, x5, 24
|
||||
vle32.v v3, (x6)
|
||||
vfwcvt.f.x.v v16, v3
|
||||
vmv1r.v v3, v16
|
||||
|
||||
// load e0
|
||||
addi x6, x5, 32
|
||||
vle32.v v16, (x6)
|
||||
vfwcvt.f.x.v v4, v16
|
||||
|
||||
// load e1
|
||||
addi x6, x5, 40
|
||||
vle32.v v5, (x6)
|
||||
vfwcvt.f.x.v v16, v5
|
||||
vmv1r.v v5, v16
|
||||
|
||||
// load e2
|
||||
addi x6, x5, 48
|
||||
vle32.v v16, (x6)
|
||||
vfwcvt.f.x.v v6, v16
|
||||
|
||||
// load e3
|
||||
addi x6, x5, 56
|
||||
vle32.v v7, (x6)
|
||||
vfwcvt.f.x.v v16, v7
|
||||
vmv1r.v v7, v16
|
||||
|
||||
// Set vector registers back to 2x64 bit
|
||||
vsetivli zero, 2, e64, m1, ta, ma
|
||||
|
||||
// post-process e0-e3
|
||||
vand.vv v4, v4, v12
|
||||
vand.vv v5, v5, v12
|
||||
vand.vv v6, v6, v12
|
||||
vand.vv v7, v7, v12
|
||||
|
||||
vor.vv v4, v4, v13
|
||||
vor.vv v5, v5, v13
|
||||
vor.vv v6, v6, v13
|
||||
vor.vv v7, v7, v13
|
||||
|
||||
DECL(randomx_riscv64_vector_program_main_loop_instructions):
|
||||
// Generated by JIT compiler
|
||||
// FDIV_M can generate up to 50 bytes of code (round it up to 52 - a multiple of 4)
|
||||
// +32 bytes for the scratchpad prefetch and the final jump instruction
|
||||
.fill RANDOMX_PROGRAM_MAX_SIZE * 52 + 32, 1, 0
|
||||
|
||||
DECL(randomx_riscv64_vector_program_main_loop_instructions_end):
|
||||
// Calculate dataset pointer for dataset read
|
||||
// Do it here to break false dependency from readReg2 and readReg3 (see below)
|
||||
srli x6, x14, 32 // x6 = ma & dataset mask
|
||||
|
||||
DECL(randomx_riscv64_vector_program_main_loop_mx_xor):
|
||||
xor x5, x24, x26 // x5 = readReg2 ^ readReg3 (JIT compiler will substitute the actual registers)
|
||||
|
||||
and x5, x5, x19 // x5 = (readReg2 ^ readReg3) & dataset mask
|
||||
xor x14, x14, x5 // mx ^= (readReg2 ^ readReg3) & dataset mask
|
||||
|
||||
add x5, x14, x11 // x5 = &dataset[mx & dataset mask]
|
||||
|
||||
#ifdef __riscv_zicbop
|
||||
prefetch.r (x5)
|
||||
#else
|
||||
ld x5, (x5)
|
||||
#endif
|
||||
|
||||
add x5, x6, x11 // x5 = &dataset[ma & dataset mask]
|
||||
|
||||
// read a 64-byte line from dataset and XOR it with r0-r7
|
||||
ld x6, 0(x5)
|
||||
xor x20, x20, x6
|
||||
ld x6, 8(x5)
|
||||
xor x21, x21, x6
|
||||
ld x6, 16(x5)
|
||||
xor x22, x22, x6
|
||||
ld x6, 24(x5)
|
||||
xor x23, x23, x6
|
||||
ld x6, 32(x5)
|
||||
xor x24, x24, x6
|
||||
ld x6, 40(x5)
|
||||
xor x25, x25, x6
|
||||
ld x6, 48(x5)
|
||||
xor x26, x26, x6
|
||||
ld x6, 56(x5)
|
||||
xor x27, x27, x6
|
||||
|
||||
DECL(randomx_riscv64_vector_program_scratchpad_prefetch):
|
||||
xor x5, x20, x22 // spAddr0-spAddr1 = readReg0 ^ readReg1 (JIT compiler will substitute the actual registers)
|
||||
srli x6, x5, 32 // x6 = spAddr1
|
||||
|
||||
and x5, x5, x9 // x5 = spAddr0 & 64-byte aligned L3 mask
|
||||
and x6, x6, x9 // x6 = spAddr1 & 64-byte aligned L3 mask
|
||||
|
||||
c.add x5, x12 // x5 = &scratchpad[spAddr0 & 64-byte aligned L3 mask]
|
||||
c.add x6, x12 // x6 = &scratchpad[spAddr1 & 64-byte aligned L3 mask]
|
||||
|
||||
#ifdef __riscv_zicbop
|
||||
prefetch.r (x5)
|
||||
prefetch.r (x6)
|
||||
#else
|
||||
ld x5, (x5)
|
||||
ld x6, (x6)
|
||||
#endif
|
||||
|
||||
// swap mx <-> ma
|
||||
#ifdef __riscv_zbb
|
||||
rori x14, x14, 32
|
||||
#else
|
||||
srli x5, x14, 32
|
||||
slli x14, x14, 32
|
||||
or x14, x14, x5
|
||||
#endif
|
||||
|
||||
srli x5, x15, 32 // x5 = spAddr1
|
||||
and x5, x5, x9 // x5 = spAddr1 & 64-byte aligned L3 mask
|
||||
add x5, x5, x12 // x5 = &scratchpad[spAddr1 & 64-byte aligned L3 mask]
|
||||
|
||||
// store registers r0-r7 to the scratchpad
|
||||
sd x20, 0(x5)
|
||||
sd x21, 8(x5)
|
||||
sd x22, 16(x5)
|
||||
sd x23, 24(x5)
|
||||
sd x24, 32(x5)
|
||||
sd x25, 40(x5)
|
||||
sd x26, 48(x5)
|
||||
sd x27, 56(x5)
|
||||
|
||||
and x5, x15, x9 // x5 = spAddr0 & 64-byte aligned L3 mask
|
||||
add x5, x5, x12 // x5 = &scratchpad[spAddr0 & 64-byte aligned L3 mask]
|
||||
|
||||
DECL(randomx_riscv64_vector_program_main_loop_spaddr_xor):
|
||||
xor x15, x20, x22 // spAddr0-spAddr1 = readReg0 ^ readReg1 (JIT compiler will substitute the actual registers)
|
||||
|
||||
// store registers f0-f3 to the scratchpad (f0-f3 are first combined with e0-e3)
|
||||
vxor.vv v0, v0, v4
|
||||
vxor.vv v1, v1, v5
|
||||
vxor.vv v2, v2, v6
|
||||
vxor.vv v3, v3, v7
|
||||
|
||||
vse64.v v0, (x5)
|
||||
|
||||
addi x6, x5, 16
|
||||
vse64.v v1, (x6)
|
||||
|
||||
addi x6, x5, 32
|
||||
vse64.v v2, (x6)
|
||||
|
||||
addi x6, x5, 48
|
||||
vse64.v v3, (x6)
|
||||
|
||||
addi x13, x13, -1
|
||||
beqz x13, randomx_riscv64_vector_program_main_loop_end
|
||||
j randomx_riscv64_vector_program_main_loop
|
||||
|
||||
randomx_riscv64_vector_program_main_loop_end:
|
||||
// Restore x8 and x10
|
||||
addi x8, sp, 112
|
||||
ld x10, 88(sp)
|
||||
|
||||
// Store integer registers
|
||||
sd x20, 0(x10)
|
||||
sd x21, 8(x10)
|
||||
sd x22, 16(x10)
|
||||
sd x23, 24(x10)
|
||||
sd x24, 32(x10)
|
||||
sd x25, 40(x10)
|
||||
sd x26, 48(x10)
|
||||
sd x27, 56(x10)
|
||||
|
||||
// Store FP registers
|
||||
addi x5, x10, 64
|
||||
vse64.v v0, (x5)
|
||||
|
||||
addi x5, x10, 80
|
||||
vse64.v v1, (x5)
|
||||
|
||||
addi x5, x10, 96
|
||||
vse64.v v2, (x5)
|
||||
|
||||
addi x5, x10, 112
|
||||
vse64.v v3, (x5)
|
||||
|
||||
addi x5, x10, 128
|
||||
vse64.v v4, (x5)
|
||||
|
||||
addi x5, x10, 144
|
||||
vse64.v v5, (x5)
|
||||
|
||||
addi x5, x10, 160
|
||||
vse64.v v6, (x5)
|
||||
|
||||
addi x5, x10, 176
|
||||
vse64.v v7, (x5)
|
||||
|
||||
// Restore callee-saved registers
|
||||
ld x9, 0(sp)
|
||||
ld x18, 8(sp)
|
||||
ld x19, 16(sp)
|
||||
ld x20, 24(sp)
|
||||
ld x21, 32(sp)
|
||||
ld x22, 40(sp)
|
||||
ld x23, 48(sp)
|
||||
ld x24, 56(sp)
|
||||
ld x25, 64(sp)
|
||||
ld x26, 72(sp)
|
||||
ld x27, 80(sp)
|
||||
|
||||
ld x8, 96(sp) // old frame pointer
|
||||
ld x1, 104(sp) // return address
|
||||
|
||||
addi sp, sp, 112
|
||||
|
||||
ret
|
||||
|
||||
DECL(randomx_riscv64_vector_program_main_loop_light_mode_data):
|
||||
// 1) Pointer to the scalar dataset init function
|
||||
// 2) Dataset offset
|
||||
.dword 0, 0
|
||||
|
||||
DECL(randomx_riscv64_vector_program_main_loop_instructions_end_light_mode):
|
||||
// Calculate dataset pointer for dataset read
|
||||
// Do it here to break false dependency from readReg2 and readReg3 (see below)
|
||||
srli x6, x14, 32 // x6 = ma & dataset mask
|
||||
|
||||
DECL(randomx_riscv64_vector_program_main_loop_mx_xor_light_mode):
|
||||
xor x5, x24, x26 // x5 = readReg2 ^ readReg3 (JIT compiler will substitute the actual registers)
|
||||
and x5, x5, x19 // x5 = (readReg2 ^ readReg3) & dataset mask
|
||||
xor x14, x14, x5 // mx ^= (readReg2 ^ readReg3) & dataset mask
|
||||
|
||||
// Save all registers modified when calling dataset_init_scalar_func_ptr
|
||||
addi sp, sp, -192
|
||||
|
||||
// bytes [0, 127] - saved registers
|
||||
// bytes [128, 191] - output buffer
|
||||
|
||||
sd x1, 0(sp)
|
||||
sd x7, 16(sp)
|
||||
sd x10, 24(sp)
|
||||
sd x11, 32(sp)
|
||||
sd x12, 40(sp)
|
||||
sd x13, 48(sp)
|
||||
sd x14, 56(sp)
|
||||
sd x15, 64(sp)
|
||||
sd x16, 72(sp)
|
||||
sd x17, 80(sp)
|
||||
sd x28, 88(sp)
|
||||
sd x29, 96(sp)
|
||||
sd x30, 104(sp)
|
||||
sd x31, 112(sp)
|
||||
|
||||
// setup randomx_riscv64_vector_sshash_dataset_init's parameters
|
||||
|
||||
// x10 = pointer to pointer to cache memory
|
||||
// pointer to cache memory was saved in "sd x11, 32(sp)", so x10 = sp + 32
|
||||
addi x10, sp, 32
|
||||
|
||||
// x11 = output buffer (64 bytes)
|
||||
addi x11, sp, 128
|
||||
|
||||
// x12 = start block
|
||||
lla x5, randomx_riscv64_vector_program_main_loop_light_mode_data
|
||||
ld x12, 8(x5)
|
||||
add x12, x12, x6
|
||||
srli x12, x12, 6
|
||||
|
||||
// x13 = end block
|
||||
addi x13, x12, 1
|
||||
|
||||
ld x5, 0(x5)
|
||||
jalr x1, 0(x5)
|
||||
|
||||
// restore registers
|
||||
ld x1, 0(sp)
|
||||
ld x7, 16(sp)
|
||||
ld x10, 24(sp)
|
||||
ld x11, 32(sp)
|
||||
ld x12, 40(sp)
|
||||
ld x13, 48(sp)
|
||||
ld x14, 56(sp)
|
||||
ld x15, 64(sp)
|
||||
ld x16, 72(sp)
|
||||
ld x17, 80(sp)
|
||||
ld x28, 88(sp)
|
||||
ld x29, 96(sp)
|
||||
ld x30, 104(sp)
|
||||
ld x31, 112(sp)
|
||||
|
||||
// read a 64-byte line from dataset and XOR it with r0-r7
|
||||
ld x5, 128(sp)
|
||||
xor x20, x20, x5
|
||||
ld x5, 136(sp)
|
||||
xor x21, x21, x5
|
||||
ld x5, 144(sp)
|
||||
xor x22, x22, x5
|
||||
ld x5, 152(sp)
|
||||
xor x23, x23, x5
|
||||
ld x5, 160(sp)
|
||||
xor x24, x24, x5
|
||||
ld x5, 168(sp)
|
||||
xor x25, x25, x5
|
||||
ld x5, 176(sp)
|
||||
xor x26, x26, x5
|
||||
ld x5, 184(sp)
|
||||
xor x27, x27, x5
|
||||
|
||||
addi sp, sp, 192
|
||||
|
||||
j randomx_riscv64_vector_program_scratchpad_prefetch
|
||||
|
||||
DECL(randomx_riscv64_vector_program_end):
|
||||
|
||||
DECL(randomx_riscv64_vector_code_end):
|
||||
74
src/crypto/randomx/jit_compiler_rv64_vector_static.h
Normal file
74
src/crypto/randomx/jit_compiler_rv64_vector_static.h
Normal file
@@ -0,0 +1,74 @@
|
||||
/*
|
||||
Copyright (c) 2018-2020, tevador <tevador@gmail.com>
|
||||
Copyright (c) 2019-2021, XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
||||
Copyright (c) 2025, SChernykh <https://github.com/SChernykh>
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the copyright holder nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#if defined(__cplusplus)
|
||||
#include <cstdint>
|
||||
#else
|
||||
#include <stdint.h>
|
||||
#endif
|
||||
|
||||
#if defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
struct randomx_cache;
|
||||
|
||||
void randomx_riscv64_vector_code_begin();
|
||||
|
||||
void randomx_riscv64_vector_sshash_begin();
|
||||
void randomx_riscv64_vector_sshash_imul_rcp_literals();
|
||||
void randomx_riscv64_vector_sshash_dataset_init(struct randomx_cache* cache, uint8_t* output_buf, uint32_t startBlock, uint32_t endBlock);
|
||||
void randomx_riscv64_vector_sshash_cache_prefetch();
|
||||
void randomx_riscv64_vector_sshash_generated_instructions();
|
||||
void randomx_riscv64_vector_sshash_generated_instructions_end();
|
||||
void randomx_riscv64_vector_sshash_cache_prefetch();
|
||||
void randomx_riscv64_vector_sshash_xor();
|
||||
void randomx_riscv64_vector_sshash_end();
|
||||
|
||||
void randomx_riscv64_vector_program_params();
|
||||
void randomx_riscv64_vector_program_imul_rcp_literals();
|
||||
void randomx_riscv64_vector_program_begin();
|
||||
void randomx_riscv64_vector_program_main_loop_instructions();
|
||||
void randomx_riscv64_vector_program_main_loop_instructions_end();
|
||||
void randomx_riscv64_vector_program_main_loop_mx_xor();
|
||||
void randomx_riscv64_vector_program_main_loop_spaddr_xor();
|
||||
void randomx_riscv64_vector_program_main_loop_light_mode_data();
|
||||
void randomx_riscv64_vector_program_main_loop_instructions_end_light_mode();
|
||||
void randomx_riscv64_vector_program_main_loop_mx_xor_light_mode();
|
||||
void randomx_riscv64_vector_program_end();
|
||||
void randomx_riscv64_vector_program_scratchpad_prefetch();
|
||||
|
||||
void randomx_riscv64_vector_code_end();
|
||||
|
||||
#if defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
@@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#include "crypto/randomx/reciprocal.h"
|
||||
#include "crypto/randomx/superscalar.hpp"
|
||||
#include "crypto/randomx/virtual_memory.hpp"
|
||||
#include "crypto/randomx/soft_aes.h"
|
||||
#include "crypto/rx/Profiler.h"
|
||||
|
||||
#ifdef XMRIG_FIX_RYZEN
|
||||
@@ -116,6 +117,7 @@ namespace randomx {
|
||||
#define codeLoopLoadXOP ADDR(randomx_program_loop_load_xop)
|
||||
#define codeProgramStart ADDR(randomx_program_start)
|
||||
#define codeReadDataset ADDR(randomx_program_read_dataset)
|
||||
#define codeReadDatasetV2 ADDR(randomx_program_read_dataset_v2)
|
||||
#define codeReadDatasetLightSshInit ADDR(randomx_program_read_dataset_sshash_init)
|
||||
#define codeReadDatasetLightSshFin ADDR(randomx_program_read_dataset_sshash_fin)
|
||||
#define codeDatasetInit ADDR(randomx_dataset_init)
|
||||
@@ -125,6 +127,8 @@ namespace randomx {
|
||||
#define codeDatasetInitAVX2SshLoad ADDR(randomx_dataset_init_avx2_ssh_load)
|
||||
#define codeDatasetInitAVX2SshPrefetch ADDR(randomx_dataset_init_avx2_ssh_prefetch)
|
||||
#define codeLoopStore ADDR(randomx_program_loop_store)
|
||||
#define codeLoopStoreHardAES ADDR(randomx_program_loop_store_hard_aes)
|
||||
#define codeLoopStoreSoftAES ADDR(randomx_program_loop_store_soft_aes)
|
||||
#define codeLoopEnd ADDR(randomx_program_loop_end)
|
||||
#define codeEpilogue ADDR(randomx_program_epilogue)
|
||||
#define codeProgramEnd ADDR(randomx_program_end)
|
||||
@@ -136,10 +140,13 @@ namespace randomx {
|
||||
#define prologueSize (codeLoopBegin - codePrologue)
|
||||
#define loopLoadSize (codeLoopLoadXOP - codeLoopLoad)
|
||||
#define loopLoadXOPSize (codeProgramStart - codeLoopLoadXOP)
|
||||
#define readDatasetSize (codeReadDatasetLightSshInit - codeReadDataset)
|
||||
#define readDatasetSize (codeReadDatasetV2 - codeReadDataset)
|
||||
#define readDatasetV2Size (codeReadDatasetLightSshInit - codeReadDatasetV2)
|
||||
#define readDatasetLightInitSize (codeReadDatasetLightSshFin - codeReadDatasetLightSshInit)
|
||||
#define readDatasetLightFinSize (codeLoopStore - codeReadDatasetLightSshFin)
|
||||
#define loopStoreSize (codeLoopEnd - codeLoopStore)
|
||||
#define loopStoreSize (codeLoopStoreHardAES - codeLoopStore)
|
||||
#define loopStoreHardAESSize (codeLoopStoreSoftAES - codeLoopStoreHardAES)
|
||||
#define loopStoreSoftAESSize (codeLoopEnd - codeLoopStoreSoftAES)
|
||||
#define datasetInitSize (codeDatasetInitAVX2Prologue - codeDatasetInit)
|
||||
#define datasetInitAVX2PrologueSize (codeDatasetInitAVX2LoopEnd - codeDatasetInitAVX2Prologue)
|
||||
#define datasetInitAVX2LoopEndSize (codeDatasetInitAVX2Epilogue - codeDatasetInitAVX2LoopEnd)
|
||||
@@ -223,6 +230,8 @@ namespace randomx {
|
||||
JitCompilerX86::JitCompilerX86(bool hugePagesEnable, bool optimizedInitDatasetEnable) {
|
||||
BranchesWithin32B = xmrig::Cpu::info()->jccErratum();
|
||||
|
||||
hasAES = xmrig::Cpu::info()->hasAES();
|
||||
|
||||
hasAVX = xmrig::Cpu::info()->hasAVX();
|
||||
hasAVX2 = xmrig::Cpu::info()->hasAVX2();
|
||||
|
||||
@@ -341,7 +350,14 @@ namespace randomx {
|
||||
vm_flags = flags;
|
||||
|
||||
generateProgramPrologue(prog, pcfg);
|
||||
emit(codeReadDataset, readDatasetSize, code, codePos);
|
||||
|
||||
if (RandomX_CurrentConfig.Tweak_V2_PREFETCH) {
|
||||
emit(codeReadDatasetV2, readDatasetV2Size, code, codePos);
|
||||
}
|
||||
else {
|
||||
emit(codeReadDataset, readDatasetSize, code, codePos);
|
||||
}
|
||||
|
||||
generateProgramEpilogue(prog, pcfg);
|
||||
}
|
||||
|
||||
@@ -424,8 +440,15 @@ namespace randomx {
|
||||
|
||||
void JitCompilerX86::generateProgramPrologue(Program& prog, ProgramConfiguration& pcfg) {
|
||||
codePos = ADDR(randomx_program_prologue_first_load) - ADDR(randomx_program_prologue);
|
||||
*(uint32_t*)(code + codePos + 4) = RandomX_CurrentConfig.ScratchpadL3Mask64_Calculated;
|
||||
*(uint32_t*)(code + codePos + 14) = RandomX_CurrentConfig.ScratchpadL3Mask64_Calculated;
|
||||
|
||||
if (RandomX_CurrentConfig.Tweak_V2_AES && !hasAES) {
|
||||
*(uint64_t*)(code + codePos + 9) = reinterpret_cast<uint64_t>(lutEnc);
|
||||
*(uint64_t*)(code + codePos + 27) = reinterpret_cast<uint64_t>(lutDec);
|
||||
}
|
||||
|
||||
*(uint32_t*)(code + codePos + 47) = RandomX_CurrentConfig.ScratchpadL3Mask64_Calculated;
|
||||
*(uint32_t*)(code + codePos + 57) = RandomX_CurrentConfig.ScratchpadL3Mask64_Calculated;
|
||||
|
||||
if (hasAVX) {
|
||||
uint32_t* p = (uint32_t*)(code + codePos + 61);
|
||||
*p = (*p & 0xFF000000U) | 0x0077F8C5U; // vzeroupper
|
||||
@@ -476,8 +499,21 @@ namespace randomx {
|
||||
*(uint64_t*)(code + codePos) = 0xc03349c08b49ull + (static_cast<uint64_t>(pcfg.readReg0) << 16) + (static_cast<uint64_t>(pcfg.readReg1) << 40);
|
||||
codePos += 6;
|
||||
emit(RandomX_CurrentConfig.codePrefetchScratchpadTweaked, RandomX_CurrentConfig.codePrefetchScratchpadTweakedSize, code, codePos);
|
||||
memcpy(code + codePos, codeLoopStore, loopStoreSize);
|
||||
codePos += loopStoreSize;
|
||||
|
||||
if (RandomX_CurrentConfig.Tweak_V2_AES) {
|
||||
if (hasAES) {
|
||||
memcpy(code + codePos, codeLoopStoreHardAES, loopStoreHardAESSize);
|
||||
codePos += loopStoreHardAESSize;
|
||||
}
|
||||
else {
|
||||
memcpy(code + codePos, codeLoopStoreSoftAES, loopStoreSoftAESSize);
|
||||
codePos += loopStoreSoftAESSize;
|
||||
}
|
||||
}
|
||||
else {
|
||||
memcpy(code + codePos, codeLoopStore, loopStoreSize);
|
||||
codePos += loopStoreSize;
|
||||
}
|
||||
|
||||
if (BranchesWithin32B) {
|
||||
const uint32_t branch_begin = static_cast<uint32_t>(codePos);
|
||||
@@ -1307,7 +1343,7 @@ namespace randomx {
|
||||
uint8_t* const p = code;
|
||||
int32_t t = prevCFROUND;
|
||||
|
||||
if (t > prevFPOperation) {
|
||||
if ((t > prevFPOperation) && !RandomX_CurrentConfig.Tweak_V2_CFROUND) {
|
||||
if (vm_flags & RANDOMX_FLAG_AMD) {
|
||||
memcpy(p + t, NOP26, 26);
|
||||
}
|
||||
@@ -1326,14 +1362,38 @@ namespace randomx {
|
||||
*(uint32_t*)(p + pos + 3) = 0x00C8C148 + (rotate << 24);
|
||||
|
||||
if (vm_flags & RANDOMX_FLAG_AMD) {
|
||||
*(uint64_t*)(p + pos + 7) = 0x742024443B0CE083ULL;
|
||||
*(uint64_t*)(p + pos + 15) = 0x8900EB0414AE0F0AULL;
|
||||
*(uint32_t*)(p + pos + 23) = 0x202444;
|
||||
pos += 26;
|
||||
if (RandomX_CurrentConfig.Tweak_V2_CFROUND) {
|
||||
*(uint32_t*)(p + pos + 7) = 0x1375F0A8;
|
||||
pos += 11;
|
||||
}
|
||||
else {
|
||||
pos += 7;
|
||||
}
|
||||
*(uint64_t*)(p + pos) = 0x742024443B0CE083ULL;
|
||||
*(uint64_t*)(p + pos + 8) = 0x8900EB0414AE0F0AULL;
|
||||
*(uint32_t*)(p + pos + 16) = 0x202444;
|
||||
pos += 19;
|
||||
}
|
||||
else {
|
||||
*(uint64_t*)(p + pos + 7) = 0x0414AE0F0CE083ULL;
|
||||
pos += 14;
|
||||
pos += 7;
|
||||
|
||||
if (RandomX_CurrentConfig.Tweak_V2_CFROUND) {
|
||||
if (BranchesWithin32B) {
|
||||
const uint32_t branch_begin = static_cast<uint32_t>(pos + 2) & 31;
|
||||
|
||||
// If the jump crosses or touches 32-byte boundary, align it
|
||||
if (branch_begin >= 30) {
|
||||
const uint32_t alignment_size = 32 - branch_begin;
|
||||
emit(NOPX[alignment_size - 1], alignment_size, code, pos);
|
||||
}
|
||||
}
|
||||
|
||||
*(uint32_t*)(p + pos) = 0x0775F0A8;
|
||||
pos += 4;
|
||||
}
|
||||
|
||||
*(uint64_t*)(p + pos) = 0x0414AE0F0CE083ULL;
|
||||
pos += 7;
|
||||
}
|
||||
|
||||
codePos = pos;
|
||||
@@ -1343,7 +1403,7 @@ namespace randomx {
|
||||
uint8_t* const p = code;
|
||||
int32_t t = prevCFROUND;
|
||||
|
||||
if (t > prevFPOperation) {
|
||||
if ((t > prevFPOperation) && !RandomX_CurrentConfig.Tweak_V2_CFROUND){
|
||||
if (vm_flags & RANDOMX_FLAG_AMD) {
|
||||
memcpy(p + t, NOP25, 25);
|
||||
}
|
||||
@@ -1361,14 +1421,38 @@ namespace randomx {
|
||||
*(uint64_t*)(p + pos) = 0xC0F0FBC3C4ULL | (src << 32) | (rotate << 40);
|
||||
|
||||
if (vm_flags & RANDOMX_FLAG_AMD) {
|
||||
*(uint64_t*)(p + pos + 6) = 0x742024443B0CE083ULL;
|
||||
*(uint64_t*)(p + pos + 14) = 0x8900EB0414AE0F0AULL;
|
||||
*(uint32_t*)(p + pos + 22) = 0x202444;
|
||||
pos += 25;
|
||||
if (RandomX_CurrentConfig.Tweak_V2_CFROUND) {
|
||||
*(uint32_t*)(p + pos + 6) = 0x1375F0A8;
|
||||
pos += 10;
|
||||
}
|
||||
else {
|
||||
pos += 6;
|
||||
}
|
||||
*(uint64_t*)(p + pos) = 0x742024443B0CE083ULL;
|
||||
*(uint64_t*)(p + pos + 8) = 0x8900EB0414AE0F0AULL;
|
||||
*(uint32_t*)(p + pos + 16) = 0x202444;
|
||||
pos += 19;
|
||||
}
|
||||
else {
|
||||
*(uint64_t*)(p + pos + 6) = 0x0414AE0F0CE083ULL;
|
||||
pos += 13;
|
||||
pos += 6;
|
||||
|
||||
if (RandomX_CurrentConfig.Tweak_V2_CFROUND) {
|
||||
if (BranchesWithin32B) {
|
||||
const uint32_t branch_begin = static_cast<uint32_t>(pos + 2) & 31;
|
||||
|
||||
// If the jump crosses or touches 32-byte boundary, align it
|
||||
if (branch_begin >= 30) {
|
||||
const uint32_t alignment_size = 32 - branch_begin;
|
||||
emit(NOPX[alignment_size - 1], alignment_size, code, pos);
|
||||
}
|
||||
}
|
||||
|
||||
*(uint32_t*)(p + pos) = 0x0775F0A8;
|
||||
pos += 4;
|
||||
}
|
||||
|
||||
*(uint64_t*)(p + pos) = 0x0414AE0F0CE083ULL;
|
||||
pos += 7;
|
||||
}
|
||||
|
||||
codePos = pos;
|
||||
|
||||
@@ -97,6 +97,7 @@ namespace randomx {
|
||||
# endif
|
||||
|
||||
bool BranchesWithin32B = false;
|
||||
bool hasAES;
|
||||
bool hasAVX;
|
||||
bool hasAVX2;
|
||||
bool initDatasetAVX2;
|
||||
|
||||
@@ -48,9 +48,12 @@
|
||||
.global DECL(randomx_program_loop_load_xop)
|
||||
.global DECL(randomx_program_start)
|
||||
.global DECL(randomx_program_read_dataset)
|
||||
.global DECL(randomx_program_read_dataset_v2)
|
||||
.global DECL(randomx_program_read_dataset_sshash_init)
|
||||
.global DECL(randomx_program_read_dataset_sshash_fin)
|
||||
.global DECL(randomx_program_loop_store)
|
||||
.global DECL(randomx_program_loop_store_hard_aes)
|
||||
.global DECL(randomx_program_loop_store_soft_aes)
|
||||
.global DECL(randomx_program_loop_end)
|
||||
.global DECL(randomx_dataset_init)
|
||||
.global DECL(randomx_dataset_init_avx2_prologue)
|
||||
@@ -101,19 +104,23 @@ DECL(randomx_program_prologue):
|
||||
movapd xmm15, xmmword ptr [scaleMask+rip]
|
||||
|
||||
DECL(randomx_program_prologue_first_load):
|
||||
sub rsp, 248
|
||||
mov rdx, 0x1111111111111111
|
||||
mov [rsp+232], rdx ;# aes_lut_enc
|
||||
mov rdx, 0x1111111111111111
|
||||
mov [rsp+240], rdx ;# aes_lut_dec
|
||||
mov rdx, rax
|
||||
and eax, RANDOMX_SCRATCHPAD_MASK
|
||||
ror rdx, 32
|
||||
and edx, RANDOMX_SCRATCHPAD_MASK
|
||||
sub rsp, 40
|
||||
nop
|
||||
nop
|
||||
nop
|
||||
mov dword ptr [rsp], 0x9FC0
|
||||
mov dword ptr [rsp+4], 0xBFC0
|
||||
mov dword ptr [rsp+8], 0xDFC0
|
||||
mov dword ptr [rsp+12], 0xFFC0
|
||||
mov dword ptr [rsp+32], -1
|
||||
nop
|
||||
nop
|
||||
nop
|
||||
jmp DECL(randomx_program_imul_rcp_store)
|
||||
|
||||
.balign 64
|
||||
@@ -139,6 +146,9 @@ DECL(randomx_program_start):
|
||||
DECL(randomx_program_read_dataset):
|
||||
#include "asm/program_read_dataset.inc"
|
||||
|
||||
DECL(randomx_program_read_dataset_v2):
|
||||
#include "asm/program_read_dataset_v2.inc"
|
||||
|
||||
DECL(randomx_program_read_dataset_sshash_init):
|
||||
#include "asm/program_read_dataset_sshash_init.inc"
|
||||
|
||||
@@ -148,6 +158,12 @@ DECL(randomx_program_read_dataset_sshash_fin):
|
||||
DECL(randomx_program_loop_store):
|
||||
#include "asm/program_loop_store.inc"
|
||||
|
||||
DECL(randomx_program_loop_store_hard_aes):
|
||||
#include "asm/program_loop_store_hard_aes.inc"
|
||||
|
||||
DECL(randomx_program_loop_store_soft_aes):
|
||||
#include "asm/program_loop_store_soft_aes.inc"
|
||||
|
||||
DECL(randomx_program_loop_end):
|
||||
nop
|
||||
|
||||
|
||||
@@ -39,6 +39,7 @@ PUBLIC randomx_program_loop_load
|
||||
PUBLIC randomx_program_loop_load_xop
|
||||
PUBLIC randomx_program_start
|
||||
PUBLIC randomx_program_read_dataset
|
||||
PUBLIC randomx_program_read_dataset_v2
|
||||
PUBLIC randomx_program_read_dataset_sshash_init
|
||||
PUBLIC randomx_program_read_dataset_sshash_fin
|
||||
PUBLIC randomx_dataset_init
|
||||
@@ -48,6 +49,8 @@ PUBLIC randomx_dataset_init_avx2_epilogue
|
||||
PUBLIC randomx_dataset_init_avx2_ssh_load
|
||||
PUBLIC randomx_dataset_init_avx2_ssh_prefetch
|
||||
PUBLIC randomx_program_loop_store
|
||||
PUBLIC randomx_program_loop_store_hard_aes
|
||||
PUBLIC randomx_program_loop_store_soft_aes
|
||||
PUBLIC randomx_program_loop_end
|
||||
PUBLIC randomx_program_epilogue
|
||||
PUBLIC randomx_sshash_load
|
||||
@@ -90,19 +93,23 @@ randomx_program_prologue PROC
|
||||
randomx_program_prologue ENDP
|
||||
|
||||
randomx_program_prologue_first_load PROC
|
||||
sub rsp, 248
|
||||
mov rdx, 01111111111111111h
|
||||
mov [rsp+232], rdx ;# aes_lut_enc
|
||||
mov rdx, 01111111111111111h
|
||||
mov [rsp+240], rdx ;# aes_lut_dec
|
||||
mov rdx, rax
|
||||
and eax, RANDOMX_SCRATCHPAD_MASK
|
||||
ror rdx, 32
|
||||
and edx, RANDOMX_SCRATCHPAD_MASK
|
||||
sub rsp, 40
|
||||
nop
|
||||
nop
|
||||
nop
|
||||
mov dword ptr [rsp], 9FC0h
|
||||
mov dword ptr [rsp+4], 0BFC0h
|
||||
mov dword ptr [rsp+8], 0DFC0h
|
||||
mov dword ptr [rsp+12], 0FFC0h
|
||||
mov dword ptr [rsp+32], -1
|
||||
nop
|
||||
nop
|
||||
nop
|
||||
jmp randomx_program_imul_rcp_store
|
||||
randomx_program_prologue_first_load ENDP
|
||||
|
||||
@@ -135,6 +142,10 @@ randomx_program_read_dataset PROC
|
||||
include asm/program_read_dataset.inc
|
||||
randomx_program_read_dataset ENDP
|
||||
|
||||
randomx_program_read_dataset_v2 PROC
|
||||
include asm/program_read_dataset_v2.inc
|
||||
randomx_program_read_dataset_v2 ENDP
|
||||
|
||||
randomx_program_read_dataset_sshash_init PROC
|
||||
include asm/program_read_dataset_sshash_init.inc
|
||||
randomx_program_read_dataset_sshash_init ENDP
|
||||
@@ -147,6 +158,14 @@ randomx_program_loop_store PROC
|
||||
include asm/program_loop_store.inc
|
||||
randomx_program_loop_store ENDP
|
||||
|
||||
randomx_program_loop_store_hard_aes PROC
|
||||
include asm/program_loop_store_hard_aes.inc
|
||||
randomx_program_loop_store_hard_aes ENDP
|
||||
|
||||
randomx_program_loop_store_soft_aes PROC
|
||||
include asm/program_loop_store_soft_aes.inc
|
||||
randomx_program_loop_store_soft_aes ENDP
|
||||
|
||||
randomx_program_loop_end PROC
|
||||
nop
|
||||
randomx_program_loop_end ENDP
|
||||
|
||||
@@ -40,9 +40,12 @@ extern "C" {
|
||||
void randomx_program_loop_load_xop();
|
||||
void randomx_program_start();
|
||||
void randomx_program_read_dataset();
|
||||
void randomx_program_read_dataset_v2();
|
||||
void randomx_program_read_dataset_sshash_init();
|
||||
void randomx_program_read_dataset_sshash_fin();
|
||||
void randomx_program_loop_store();
|
||||
void randomx_program_loop_store_hard_aes();
|
||||
void randomx_program_loop_store_soft_aes();
|
||||
void randomx_program_loop_end();
|
||||
void randomx_dataset_init();
|
||||
void randomx_dataset_init_avx2_prologue();
|
||||
|
||||
@@ -39,6 +39,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#include "crypto/randomx/jit_compiler_x86_static.hpp"
|
||||
#elif (XMRIG_ARM == 8)
|
||||
#include "crypto/randomx/jit_compiler_a64_static.hpp"
|
||||
#elif defined(__riscv) && defined(__riscv_xlen) && (__riscv_xlen == 64)
|
||||
#include "crypto/randomx/jit_compiler_rv64_static.hpp"
|
||||
#endif
|
||||
|
||||
#include "backend/cpu/Cpu.h"
|
||||
@@ -48,6 +50,17 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#include <cassert>
|
||||
|
||||
#include "crypto/rx/Profiler.h"
|
||||
#include "base/net/stratum/Job.h"
|
||||
|
||||
RandomX_ConfigurationMoneroV2::RandomX_ConfigurationMoneroV2()
|
||||
{
|
||||
ProgramSize = 384;
|
||||
|
||||
Tweak_V2_CFROUND = 1;
|
||||
Tweak_V2_AES = 1;
|
||||
Tweak_V2_PREFETCH = 1;
|
||||
Tweak_V2_COMMITMENT = 1;
|
||||
}
|
||||
|
||||
RandomX_ConfigurationWownero::RandomX_ConfigurationWownero()
|
||||
{
|
||||
@@ -148,6 +161,10 @@ RandomX_ConfigurationBase::RandomX_ConfigurationBase()
|
||||
, RANDOMX_FREQ_CFROUND(1)
|
||||
, RANDOMX_FREQ_ISTORE(16)
|
||||
, RANDOMX_FREQ_NOP(0)
|
||||
, Tweak_V2_CFROUND(0)
|
||||
, Tweak_V2_AES(0)
|
||||
, Tweak_V2_PREFETCH(0)
|
||||
, Tweak_V2_COMMITMENT(0)
|
||||
{
|
||||
fillAes4Rx4_Key[0] = rx_set_int_vec_i128(0x99e5d23f, 0x2f546d2b, 0xd1833ddb, 0x6421aadd);
|
||||
fillAes4Rx4_Key[1] = rx_set_int_vec_i128(0xa5dfcde5, 0x06f79d53, 0xb6913f55, 0xb20e3450);
|
||||
@@ -190,7 +207,7 @@ RandomX_ConfigurationBase::RandomX_ConfigurationBase()
|
||||
# endif
|
||||
}
|
||||
|
||||
#if (XMRIG_ARM == 8)
|
||||
#if (XMRIG_ARM == 8) || defined(XMRIG_RISCV)
|
||||
static uint32_t Log2(size_t value) { return (value > 1) ? (Log2(value / 2) + 1) : 0; }
|
||||
#endif
|
||||
|
||||
@@ -274,6 +291,17 @@ typedef void(randomx::JitCompilerX86::* InstructionGeneratorX86_2)(const randomx
|
||||
|
||||
#define JIT_HANDLE(x, prev) randomx::JitCompilerA64::engine[k] = &randomx::JitCompilerA64::h_##x
|
||||
|
||||
#elif defined(XMRIG_RISCV)
|
||||
|
||||
Log2_ScratchpadL1 = Log2(ScratchpadL1_Size);
|
||||
Log2_ScratchpadL2 = Log2(ScratchpadL2_Size);
|
||||
Log2_ScratchpadL3 = Log2(ScratchpadL3_Size);
|
||||
|
||||
#define JIT_HANDLE(x, prev) do { \
|
||||
randomx::JitCompilerRV64::engine[k] = &randomx::JitCompilerRV64::v1_##x; \
|
||||
randomx::JitCompilerRV64::inst_map[k] = static_cast<uint8_t>(randomx::InstructionType::x); \
|
||||
} while (0)
|
||||
|
||||
#else
|
||||
#define JIT_HANDLE(x, prev)
|
||||
#endif
|
||||
@@ -354,6 +382,7 @@ typedef void(randomx::JitCompilerX86::* InstructionGeneratorX86_2)(const randomx
|
||||
}
|
||||
|
||||
RandomX_ConfigurationMonero RandomX_MoneroConfig;
|
||||
RandomX_ConfigurationMoneroV2 RandomX_MoneroConfigV2;
|
||||
RandomX_ConfigurationWownero RandomX_WowneroConfig;
|
||||
RandomX_ConfigurationArqma RandomX_ArqmaConfig;
|
||||
RandomX_ConfigurationGraft RandomX_GraftConfig;
|
||||
@@ -601,4 +630,11 @@ extern "C" {
|
||||
machine->hashAndFill(output, tempHash);
|
||||
}
|
||||
|
||||
void randomx_calculate_commitment(const void* input, size_t inputSize, const void* hash_in, void* com_out) {
|
||||
uint8_t buf[xmrig::Job::kMaxBlobSize + RANDOMX_HASH_SIZE];
|
||||
memcpy(buf, input, inputSize);
|
||||
memcpy(buf + inputSize, hash_in, RANDOMX_HASH_SIZE);
|
||||
rx_blake2b_wrapper::run(com_out, RANDOMX_HASH_SIZE, buf, inputSize + RANDOMX_HASH_SIZE);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -125,6 +125,11 @@ struct RandomX_ConfigurationBase
|
||||
|
||||
rx_vec_i128 fillAes4Rx4_Key[8];
|
||||
|
||||
uint32_t Tweak_V2_CFROUND : 1;
|
||||
uint32_t Tweak_V2_AES : 1;
|
||||
uint32_t Tweak_V2_PREFETCH : 1;
|
||||
uint32_t Tweak_V2_COMMITMENT : 1;
|
||||
|
||||
uint8_t codeSshPrefetchTweaked[20];
|
||||
uint8_t codePrefetchScratchpadTweaked[28];
|
||||
uint32_t codePrefetchScratchpadTweakedSize;
|
||||
@@ -133,7 +138,7 @@ struct RandomX_ConfigurationBase
|
||||
uint32_t ScratchpadL3Mask_Calculated;
|
||||
uint32_t ScratchpadL3Mask64_Calculated;
|
||||
|
||||
# if (XMRIG_ARM == 8)
|
||||
# if (XMRIG_ARM == 8) || defined(XMRIG_RISCV)
|
||||
uint32_t Log2_ScratchpadL1;
|
||||
uint32_t Log2_ScratchpadL2;
|
||||
uint32_t Log2_ScratchpadL3;
|
||||
@@ -143,6 +148,7 @@ struct RandomX_ConfigurationBase
|
||||
};
|
||||
|
||||
struct RandomX_ConfigurationMonero : public RandomX_ConfigurationBase {};
|
||||
struct RandomX_ConfigurationMoneroV2 : public RandomX_ConfigurationBase { RandomX_ConfigurationMoneroV2(); };
|
||||
struct RandomX_ConfigurationWownero : public RandomX_ConfigurationBase { RandomX_ConfigurationWownero(); };
|
||||
struct RandomX_ConfigurationArqma : public RandomX_ConfigurationBase { RandomX_ConfigurationArqma(); };
|
||||
struct RandomX_ConfigurationGraft : public RandomX_ConfigurationBase { RandomX_ConfigurationGraft(); };
|
||||
@@ -150,6 +156,7 @@ struct RandomX_ConfigurationSafex : public RandomX_ConfigurationBase { RandomX_C
|
||||
struct RandomX_ConfigurationYada : public RandomX_ConfigurationBase { RandomX_ConfigurationYada(); };
|
||||
|
||||
extern RandomX_ConfigurationMonero RandomX_MoneroConfig;
|
||||
extern RandomX_ConfigurationMoneroV2 RandomX_MoneroConfigV2;
|
||||
extern RandomX_ConfigurationWownero RandomX_WowneroConfig;
|
||||
extern RandomX_ConfigurationArqma RandomX_ArqmaConfig;
|
||||
extern RandomX_ConfigurationGraft RandomX_GraftConfig;
|
||||
@@ -231,7 +238,7 @@ RANDOMX_EXPORT unsigned long randomx_dataset_item_count(void);
|
||||
*
|
||||
* @param dataset is a pointer to a previously allocated randomx_dataset structure. Must not be NULL.
|
||||
* @param cache is a pointer to a previously allocated and initialized randomx_cache structure. Must not be NULL.
|
||||
* @param startItem is the item number where intialization should start.
|
||||
* @param startItem is the item number where initialization should start.
|
||||
* @param itemCount is the number of items that should be initialized.
|
||||
*/
|
||||
RANDOMX_EXPORT void randomx_init_dataset(randomx_dataset *dataset, randomx_cache *cache, unsigned long startItem, unsigned long itemCount);
|
||||
@@ -318,6 +325,17 @@ RANDOMX_EXPORT void randomx_calculate_hash(randomx_vm *machine, const void *inpu
|
||||
RANDOMX_EXPORT void randomx_calculate_hash_first(randomx_vm* machine, uint64_t (&tempHash)[8], const void* input, size_t inputSize);
|
||||
RANDOMX_EXPORT void randomx_calculate_hash_next(randomx_vm* machine, uint64_t (&tempHash)[8], const void* nextInput, size_t nextInputSize, void* output);
|
||||
|
||||
/**
|
||||
* Calculate a RandomX commitment from a RandomX hash and its input.
|
||||
*
|
||||
* @param input is a pointer to memory that was hashed. Must not be NULL.
|
||||
* @param inputSize is the number of bytes in the input.
|
||||
* @param hash_in is the output from randomx_calculate_hash* (RANDOMX_HASH_SIZE bytes).
|
||||
* @param com_out is a pointer to memory where the commitment will be stored. Must not
|
||||
* be NULL and at least RANDOMX_HASH_SIZE bytes must be available for writing.
|
||||
*/
|
||||
RANDOMX_EXPORT void randomx_calculate_commitment(const void* input, size_t inputSize, const void* hash_in, void* com_out);
|
||||
|
||||
#if defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -73,8 +73,20 @@ uint64_t randomx_reciprocal(uint64_t divisor) {
|
||||
|
||||
#if !RANDOMX_HAVE_FAST_RECIPROCAL
|
||||
|
||||
#ifdef __GNUC__
|
||||
uint64_t randomx_reciprocal_fast(uint64_t divisor)
|
||||
{
|
||||
const uint64_t q = (1ULL << 63) / divisor;
|
||||
const uint64_t r = (1ULL << 63) % divisor;
|
||||
|
||||
const uint64_t shift = 64 - __builtin_clzll(divisor);
|
||||
|
||||
return (q << shift) + ((r << shift) / divisor);
|
||||
}
|
||||
#else
|
||||
uint64_t randomx_reciprocal_fast(uint64_t divisor) {
|
||||
return randomx_reciprocal(divisor);
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@@ -29,15 +29,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#include "crypto/randomx/soft_aes.h"
|
||||
|
||||
alignas(64) uint32_t lutEnc0[256];
|
||||
alignas(64) uint32_t lutEnc1[256];
|
||||
alignas(64) uint32_t lutEnc2[256];
|
||||
alignas(64) uint32_t lutEnc3[256];
|
||||
alignas(64) uint32_t lutEnc[4][256];
|
||||
alignas(64) uint32_t lutDec[4][256];
|
||||
|
||||
alignas(64) uint32_t lutDec0[256];
|
||||
alignas(64) uint32_t lutDec1[256];
|
||||
alignas(64) uint32_t lutDec2[256];
|
||||
alignas(64) uint32_t lutDec3[256];
|
||||
alignas(64) uint8_t lutEncIndex[4][32];
|
||||
alignas(64) uint8_t lutDecIndex[4][32];
|
||||
|
||||
static uint32_t mul_gf2(uint32_t b, uint32_t c)
|
||||
{
|
||||
@@ -99,10 +95,10 @@ static struct SAESInitializer
|
||||
p[2] = s;
|
||||
p[3] = mul_gf2(s, 3);
|
||||
|
||||
lutEnc0[i] = w; w = (w << 8) | (w >> 24);
|
||||
lutEnc1[i] = w; w = (w << 8) | (w >> 24);
|
||||
lutEnc2[i] = w; w = (w << 8) | (w >> 24);
|
||||
lutEnc3[i] = w;
|
||||
lutEnc[0][i] = w; w = (w << 8) | (w >> 24);
|
||||
lutEnc[1][i] = w; w = (w << 8) | (w >> 24);
|
||||
lutEnc[2][i] = w; w = (w << 8) | (w >> 24);
|
||||
lutEnc[3][i] = w;
|
||||
|
||||
s = sbox_reverse[i];
|
||||
p[0] = mul_gf2(s, 0xe);
|
||||
@@ -110,10 +106,54 @@ static struct SAESInitializer
|
||||
p[2] = mul_gf2(s, 0xd);
|
||||
p[3] = mul_gf2(s, 0xb);
|
||||
|
||||
lutDec0[i] = w; w = (w << 8) | (w >> 24);
|
||||
lutDec1[i] = w; w = (w << 8) | (w >> 24);
|
||||
lutDec2[i] = w; w = (w << 8) | (w >> 24);
|
||||
lutDec3[i] = w;
|
||||
lutDec[0][i] = w; w = (w << 8) | (w >> 24);
|
||||
lutDec[1][i] = w; w = (w << 8) | (w >> 24);
|
||||
lutDec[2][i] = w; w = (w << 8) | (w >> 24);
|
||||
lutDec[3][i] = w;
|
||||
}
|
||||
|
||||
memset(lutEncIndex, -1, sizeof(lutEncIndex));
|
||||
memset(lutDecIndex, -1, sizeof(lutDecIndex));
|
||||
|
||||
lutEncIndex[0][ 0] = 0;
|
||||
lutEncIndex[0][ 4] = 4;
|
||||
lutEncIndex[0][ 8] = 8;
|
||||
lutEncIndex[0][12] = 12;
|
||||
lutEncIndex[1][ 0] = 5;
|
||||
lutEncIndex[1][ 4] = 9;
|
||||
lutEncIndex[1][ 8] = 13;
|
||||
lutEncIndex[1][12] = 1;
|
||||
lutEncIndex[2][ 0] = 10;
|
||||
lutEncIndex[2][ 4] = 14;
|
||||
lutEncIndex[2][ 8] = 2;
|
||||
lutEncIndex[2][12] = 6;
|
||||
lutEncIndex[3][ 0] = 15;
|
||||
lutEncIndex[3][ 4] = 3;
|
||||
lutEncIndex[3][ 8] = 7;
|
||||
lutEncIndex[3][12] = 11;
|
||||
|
||||
lutDecIndex[0][ 0] = 0;
|
||||
lutDecIndex[0][ 4] = 4;
|
||||
lutDecIndex[0][ 8] = 8;
|
||||
lutDecIndex[0][12] = 12;
|
||||
lutDecIndex[1][ 0] = 13;
|
||||
lutDecIndex[1][ 4] = 1;
|
||||
lutDecIndex[1][ 8] = 5;
|
||||
lutDecIndex[1][12] = 9;
|
||||
lutDecIndex[2][ 0] = 10;
|
||||
lutDecIndex[2][ 4] = 14;
|
||||
lutDecIndex[2][ 8] = 2;
|
||||
lutDecIndex[2][12] = 6;
|
||||
lutDecIndex[3][ 0] = 7;
|
||||
lutDecIndex[3][ 4] = 11;
|
||||
lutDecIndex[3][ 8] = 15;
|
||||
lutDecIndex[3][12] = 3;
|
||||
|
||||
for (uint32_t i = 0; i < 4; ++i) {
|
||||
for (uint32_t j = 0; j < 16; j += 4) {
|
||||
lutEncIndex[i][j + 16] = lutEncIndex[i][j] + 16;
|
||||
lutDecIndex[i][j + 16] = lutDecIndex[i][j] + 16;
|
||||
}
|
||||
}
|
||||
}
|
||||
} aes_initializer;
|
||||
|
||||
@@ -32,14 +32,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#include <stdint.h>
|
||||
#include "crypto/randomx/intrin_portable.h"
|
||||
|
||||
extern uint32_t lutEnc0[256];
|
||||
extern uint32_t lutEnc1[256];
|
||||
extern uint32_t lutEnc2[256];
|
||||
extern uint32_t lutEnc3[256];
|
||||
extern uint32_t lutDec0[256];
|
||||
extern uint32_t lutDec1[256];
|
||||
extern uint32_t lutDec2[256];
|
||||
extern uint32_t lutDec3[256];
|
||||
extern uint32_t lutEnc[4][256];
|
||||
extern uint32_t lutDec[4][256];
|
||||
|
||||
extern uint8_t lutEncIndex[4][32];
|
||||
extern uint8_t lutDecIndex[4][32];
|
||||
|
||||
template<int soft> rx_vec_i128 aesenc(rx_vec_i128 in, rx_vec_i128 key);
|
||||
template<int soft> rx_vec_i128 aesdec(rx_vec_i128 in, rx_vec_i128 key);
|
||||
@@ -49,25 +46,25 @@ FORCE_INLINE rx_vec_i128 aesenc<1>(rx_vec_i128 in, rx_vec_i128 key) {
|
||||
volatile uint8_t s[16];
|
||||
memcpy((void*) s, &in, 16);
|
||||
|
||||
uint32_t s0 = lutEnc0[s[ 0]];
|
||||
uint32_t s1 = lutEnc0[s[ 4]];
|
||||
uint32_t s2 = lutEnc0[s[ 8]];
|
||||
uint32_t s3 = lutEnc0[s[12]];
|
||||
uint32_t s0 = lutEnc[0][s[ 0]];
|
||||
uint32_t s1 = lutEnc[0][s[ 4]];
|
||||
uint32_t s2 = lutEnc[0][s[ 8]];
|
||||
uint32_t s3 = lutEnc[0][s[12]];
|
||||
|
||||
s0 ^= lutEnc1[s[ 5]];
|
||||
s1 ^= lutEnc1[s[ 9]];
|
||||
s2 ^= lutEnc1[s[13]];
|
||||
s3 ^= lutEnc1[s[ 1]];
|
||||
s0 ^= lutEnc[1][s[ 5]];
|
||||
s1 ^= lutEnc[1][s[ 9]];
|
||||
s2 ^= lutEnc[1][s[13]];
|
||||
s3 ^= lutEnc[1][s[ 1]];
|
||||
|
||||
s0 ^= lutEnc2[s[10]];
|
||||
s1 ^= lutEnc2[s[14]];
|
||||
s2 ^= lutEnc2[s[ 2]];
|
||||
s3 ^= lutEnc2[s[ 6]];
|
||||
s0 ^= lutEnc[2][s[10]];
|
||||
s1 ^= lutEnc[2][s[14]];
|
||||
s2 ^= lutEnc[2][s[ 2]];
|
||||
s3 ^= lutEnc[2][s[ 6]];
|
||||
|
||||
s0 ^= lutEnc3[s[15]];
|
||||
s1 ^= lutEnc3[s[ 3]];
|
||||
s2 ^= lutEnc3[s[ 7]];
|
||||
s3 ^= lutEnc3[s[11]];
|
||||
s0 ^= lutEnc[3][s[15]];
|
||||
s1 ^= lutEnc[3][s[ 3]];
|
||||
s2 ^= lutEnc[3][s[ 7]];
|
||||
s3 ^= lutEnc[3][s[11]];
|
||||
|
||||
return rx_xor_vec_i128(rx_set_int_vec_i128(s3, s2, s1, s0), key);
|
||||
}
|
||||
@@ -77,25 +74,25 @@ FORCE_INLINE rx_vec_i128 aesdec<1>(rx_vec_i128 in, rx_vec_i128 key) {
|
||||
volatile uint8_t s[16];
|
||||
memcpy((void*) s, &in, 16);
|
||||
|
||||
uint32_t s0 = lutDec0[s[ 0]];
|
||||
uint32_t s1 = lutDec0[s[ 4]];
|
||||
uint32_t s2 = lutDec0[s[ 8]];
|
||||
uint32_t s3 = lutDec0[s[12]];
|
||||
uint32_t s0 = lutDec[0][s[ 0]];
|
||||
uint32_t s1 = lutDec[0][s[ 4]];
|
||||
uint32_t s2 = lutDec[0][s[ 8]];
|
||||
uint32_t s3 = lutDec[0][s[12]];
|
||||
|
||||
s0 ^= lutDec1[s[13]];
|
||||
s1 ^= lutDec1[s[ 1]];
|
||||
s2 ^= lutDec1[s[ 5]];
|
||||
s3 ^= lutDec1[s[ 9]];
|
||||
s0 ^= lutDec[1][s[13]];
|
||||
s1 ^= lutDec[1][s[ 1]];
|
||||
s2 ^= lutDec[1][s[ 5]];
|
||||
s3 ^= lutDec[1][s[ 9]];
|
||||
|
||||
s0 ^= lutDec2[s[10]];
|
||||
s1 ^= lutDec2[s[14]];
|
||||
s2 ^= lutDec2[s[ 2]];
|
||||
s3 ^= lutDec2[s[ 6]];
|
||||
s0 ^= lutDec[2][s[10]];
|
||||
s1 ^= lutDec[2][s[14]];
|
||||
s2 ^= lutDec[2][s[ 2]];
|
||||
s3 ^= lutDec[2][s[ 6]];
|
||||
|
||||
s0 ^= lutDec3[s[ 7]];
|
||||
s1 ^= lutDec3[s[11]];
|
||||
s2 ^= lutDec3[s[15]];
|
||||
s3 ^= lutDec3[s[ 3]];
|
||||
s0 ^= lutDec[3][s[ 7]];
|
||||
s1 ^= lutDec[3][s[11]];
|
||||
s2 ^= lutDec[3][s[15]];
|
||||
s3 ^= lutDec[3][s[ 3]];
|
||||
|
||||
return rx_xor_vec_i128(rx_set_int_vec_i128(s3, s2, s1, s0), key);
|
||||
}
|
||||
@@ -110,10 +107,10 @@ FORCE_INLINE rx_vec_i128 aesenc<2>(rx_vec_i128 in, rx_vec_i128 key) {
|
||||
s3 = rx_vec_i128_x(in);
|
||||
|
||||
rx_vec_i128 out = rx_set_int_vec_i128(
|
||||
(lutEnc0[s0 & 0xff] ^ lutEnc1[(s3 >> 8) & 0xff] ^ lutEnc2[(s2 >> 16) & 0xff] ^ lutEnc3[s1 >> 24]),
|
||||
(lutEnc0[s1 & 0xff] ^ lutEnc1[(s0 >> 8) & 0xff] ^ lutEnc2[(s3 >> 16) & 0xff] ^ lutEnc3[s2 >> 24]),
|
||||
(lutEnc0[s2 & 0xff] ^ lutEnc1[(s1 >> 8) & 0xff] ^ lutEnc2[(s0 >> 16) & 0xff] ^ lutEnc3[s3 >> 24]),
|
||||
(lutEnc0[s3 & 0xff] ^ lutEnc1[(s2 >> 8) & 0xff] ^ lutEnc2[(s1 >> 16) & 0xff] ^ lutEnc3[s0 >> 24])
|
||||
(lutEnc[0][s0 & 0xff] ^ lutEnc[1][(s3 >> 8) & 0xff] ^ lutEnc[2][(s2 >> 16) & 0xff] ^ lutEnc[3][s1 >> 24]),
|
||||
(lutEnc[0][s1 & 0xff] ^ lutEnc[1][(s0 >> 8) & 0xff] ^ lutEnc[2][(s3 >> 16) & 0xff] ^ lutEnc[3][s2 >> 24]),
|
||||
(lutEnc[0][s2 & 0xff] ^ lutEnc[1][(s1 >> 8) & 0xff] ^ lutEnc[2][(s0 >> 16) & 0xff] ^ lutEnc[3][s3 >> 24]),
|
||||
(lutEnc[0][s3 & 0xff] ^ lutEnc[1][(s2 >> 8) & 0xff] ^ lutEnc[2][(s1 >> 16) & 0xff] ^ lutEnc[3][s0 >> 24])
|
||||
);
|
||||
|
||||
return rx_xor_vec_i128(out, key);
|
||||
@@ -129,10 +126,10 @@ FORCE_INLINE rx_vec_i128 aesdec<2>(rx_vec_i128 in, rx_vec_i128 key) {
|
||||
s3 = rx_vec_i128_x(in);
|
||||
|
||||
rx_vec_i128 out = rx_set_int_vec_i128(
|
||||
(lutDec0[s0 & 0xff] ^ lutDec1[(s1 >> 8) & 0xff] ^ lutDec2[(s2 >> 16) & 0xff] ^ lutDec3[s3 >> 24]),
|
||||
(lutDec0[s1 & 0xff] ^ lutDec1[(s2 >> 8) & 0xff] ^ lutDec2[(s3 >> 16) & 0xff] ^ lutDec3[s0 >> 24]),
|
||||
(lutDec0[s2 & 0xff] ^ lutDec1[(s3 >> 8) & 0xff] ^ lutDec2[(s0 >> 16) & 0xff] ^ lutDec3[s1 >> 24]),
|
||||
(lutDec0[s3 & 0xff] ^ lutDec1[(s0 >> 8) & 0xff] ^ lutDec2[(s1 >> 16) & 0xff] ^ lutDec3[s2 >> 24])
|
||||
(lutDec[0][s0 & 0xff] ^ lutDec[1][(s1 >> 8) & 0xff] ^ lutDec[2][(s2 >> 16) & 0xff] ^ lutDec[3][s3 >> 24]),
|
||||
(lutDec[0][s1 & 0xff] ^ lutDec[1][(s2 >> 8) & 0xff] ^ lutDec[2][(s3 >> 16) & 0xff] ^ lutDec[3][s0 >> 24]),
|
||||
(lutDec[0][s2 & 0xff] ^ lutDec[1][(s3 >> 8) & 0xff] ^ lutDec[2][(s0 >> 16) & 0xff] ^ lutDec[3][s1 >> 24]),
|
||||
(lutDec[0][s3 & 0xff] ^ lutDec[1][(s0 >> 8) & 0xff] ^ lutDec[2][(s1 >> 16) & 0xff] ^ lutDec[3][s2 >> 24])
|
||||
);
|
||||
|
||||
return rx_xor_vec_i128(out, key);
|
||||
|
||||
12
src/crypto/randomx/tests/riscv64_vector.s
Normal file
12
src/crypto/randomx/tests/riscv64_vector.s
Normal file
@@ -0,0 +1,12 @@
|
||||
/* RISC-V - test if the vector extension is present */
|
||||
|
||||
.text
|
||||
.option arch, rv64gcv
|
||||
.global main
|
||||
|
||||
main:
|
||||
li x5, 4
|
||||
vsetvli x6, x5, e64, m1, ta, ma
|
||||
vxor.vv v0, v0, v0
|
||||
sub x10, x5, x6
|
||||
ret
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user