diff --git a/.gitignore b/.gitignore index 3db117d49..9687ec69d 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,5 @@ scripts/deps /CMakeLists.txt.user /.idea /src/backend/opencl/cl/cn/cryptonight_gen.cl +.vscode +/.qtcreator diff --git a/CHANGELOG.md b/CHANGELOG.md index f2e477dd2..dc25c1b97 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,17 @@ +# v6.25.0 +- [#3680](https://github.com/xmrig/xmrig/pull/3680) Added `armv8l` to the list of 32-bit ARM targets. +- [#3708](https://github.com/xmrig/xmrig/pull/3708) Minor Aarch64 JIT changes (better instruction selection, don't emit instructions that add 0, etc). +- [#3718](https://github.com/xmrig/xmrig/pull/3718) Solo mining: added support for FCMP++ hardfork. +- [#3722](https://github.com/xmrig/xmrig/pull/3722) Added Zen4 (Hawk Point) CPUs detection. +- [#3725](https://github.com/xmrig/xmrig/pull/3725) Added **RISC-V** support with JIT compiler. +- [#3731](https://github.com/xmrig/xmrig/pull/3731) Added initial Haiku OS support. +- [#3733](https://github.com/xmrig/xmrig/pull/3733) Added detection for MSVC/2026. +- [#3736](https://github.com/xmrig/xmrig/pull/3736) RISC-V: added vectorized dataset init. +- [#3740](https://github.com/xmrig/xmrig/pull/3740) RISC-V: added vectorized soft AES. +- [#3743](https://github.com/xmrig/xmrig/pull/3743) Linux: added support for transparent huge pages. +- Improved LibreSSL support. +- Improved compatibility for automatically enabling huge pages on Linux systems without NUMA support. + # v6.24.0 - [#3671](https://github.com/xmrig/xmrig/pull/3671) Fixed detection of L2 cache size for some complex NUMA topologies. - [#3674](https://github.com/xmrig/xmrig/pull/3674) Fixed ARMv7 build. diff --git a/CMakeLists.txt b/CMakeLists.txt index 313923226..4b36a8dfb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -95,7 +95,7 @@ set(HEADERS_CRYPTO src/crypto/common/VirtualMemory.h ) -if (XMRIG_ARM) +if (XMRIG_ARM OR XMRIG_RISCV) set(HEADERS_CRYPTO "${HEADERS_CRYPTO}" src/crypto/cn/CryptoNight_arm.h) else() set(HEADERS_CRYPTO "${HEADERS_CRYPTO}" src/crypto/cn/CryptoNight_x86.h) diff --git a/README.md b/README.md index b4d40751c..7b6e66c54 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ XMRig is a high performance, open source, cross platform RandomX, KawPow, CryptoNight and [GhostRider](https://github.com/xmrig/xmrig/tree/master/src/crypto/ghostrider#readme) unified CPU/GPU miner and [RandomX benchmark](https://xmrig.com/benchmark). Official binaries are available for Windows, Linux, macOS and FreeBSD. ## Mining backends -- **CPU** (x86/x64/ARMv7/ARMv8) +- **CPU** (x86/x64/ARMv7/ARMv8/RISC-V) - **OpenCL** for AMD GPUs. - **CUDA** for NVIDIA GPUs via external [CUDA plugin](https://github.com/xmrig/xmrig-cuda). 
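Note on the "Linux: added support for transparent huge pages" changelog entry above: the patch hunks for that feature are not shown in this excerpt, but on Linux the standard kernel interface for opting a mapping into transparent huge pages is `madvise(MADV_HUGEPAGE)`. The following standalone sketch only illustrates that interface; it is not taken from XMRig and does not claim to match the PR's implementation.

```c
/* Minimal standalone sketch (not from the patch): opt a large anonymous
 * mapping into Linux transparent huge pages via madvise(MADV_HUGEPAGE).
 * Assumes /sys/kernel/mm/transparent_hugepage/enabled is "madvise" or "always". */
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
    const size_t size = (size_t)1 << 30; /* 1 GiB, roughly RandomX-dataset sized */

    void *p = mmap(NULL, size, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

    /* Ask the kernel to back this mapping with transparent huge pages. */
    if (madvise(p, size, MADV_HUGEPAGE) != 0) {
        perror("madvise(MADV_HUGEPAGE)"); /* non-fatal: region falls back to 4 KiB pages */
    }

    munmap(p, size);
    return 0;
}
```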
diff --git a/cmake/asm.cmake b/cmake/asm.cmake index e445defde..30a119c30 100644 --- a/cmake/asm.cmake +++ b/cmake/asm.cmake @@ -1,4 +1,4 @@ -if (WITH_ASM AND NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8) +if (WITH_ASM AND NOT XMRIG_ARM AND NOT XMRIG_RISCV AND CMAKE_SIZEOF_VOID_P EQUAL 8) set(XMRIG_ASM_LIBRARY "xmrig-asm") if (CMAKE_C_COMPILER_ID MATCHES MSVC) diff --git a/cmake/cpu.cmake b/cmake/cpu.cmake index 12dbe9b1b..515c2ccbb 100644 --- a/cmake/cpu.cmake +++ b/cmake/cpu.cmake @@ -21,6 +21,19 @@ if (NOT VAES_SUPPORTED) set(WITH_VAES OFF) endif() +# Detect RISC-V architecture early (before it's used below) +if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv64|riscv|rv64)$") + set(RISCV_TARGET 64) + set(XMRIG_RISCV ON) + add_definitions(-DXMRIG_RISCV) + message(STATUS "Detected RISC-V 64-bit architecture (${CMAKE_SYSTEM_PROCESSOR})") +elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv32|rv32)$") + set(RISCV_TARGET 32) + set(XMRIG_RISCV ON) + add_definitions(-DXMRIG_RISCV) + message(STATUS "Detected RISC-V 32-bit architecture (${CMAKE_SYSTEM_PROCESSOR})") +endif() + if (XMRIG_64_BIT AND CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|AMD64)$") add_definitions(-DRAPIDJSON_SSE2) else() @@ -29,6 +42,57 @@ else() set(WITH_VAES OFF) endif() +# Disable x86-specific features for RISC-V +if (XMRIG_RISCV) + set(WITH_SSE4_1 OFF) + set(WITH_AVX2 OFF) + set(WITH_VAES OFF) + + # default build uses the RV64GC baseline + set(RVARCH "rv64gc") + + # for native builds, enable Zba and Zbb if supported by the CPU + if(ARCH STREQUAL "native") + enable_language(ASM) + + try_run(RANDOMX_VECTOR_RUN_FAIL + RANDOMX_VECTOR_COMPILE_OK + ${CMAKE_CURRENT_BINARY_DIR}/ + ${CMAKE_CURRENT_SOURCE_DIR}/src/crypto/randomx/tests/riscv64_vector.s + COMPILE_DEFINITIONS "-march=rv64gcv_zicbop") + + if (RANDOMX_VECTOR_COMPILE_OK AND NOT RANDOMX_VECTOR_RUN_FAIL) + set(RVARCH "${RVARCH}v_zicbop") + add_definitions(-DXMRIG_RVV_ENABLED) + message(STATUS "RISC-V vector extension detected") + endif() + + try_run(RANDOMX_ZBA_RUN_FAIL + RANDOMX_ZBA_COMPILE_OK + ${CMAKE_CURRENT_BINARY_DIR}/ + ${CMAKE_CURRENT_SOURCE_DIR}/src/crypto/randomx/tests/riscv64_zba.s + COMPILE_DEFINITIONS "-march=rv64gc_zba") + + if (RANDOMX_ZBA_COMPILE_OK AND NOT RANDOMX_ZBA_RUN_FAIL) + set(RVARCH "${RVARCH}_zba") + message(STATUS "RISC-V zba extension detected") + endif() + + try_run(RANDOMX_ZBB_RUN_FAIL + RANDOMX_ZBB_COMPILE_OK + ${CMAKE_CURRENT_BINARY_DIR}/ + ${CMAKE_CURRENT_SOURCE_DIR}/src/crypto/randomx/tests/riscv64_zbb.s + COMPILE_DEFINITIONS "-march=rv64gc_zbb") + + if (RANDOMX_ZBB_COMPILE_OK AND NOT RANDOMX_ZBB_RUN_FAIL) + set(RVARCH "${RVARCH}_zbb") + message(STATUS "RISC-V zbb extension detected") + endif() + endif() + + message(STATUS "Using -march=${RVARCH}") +endif() + add_definitions(-DRAPIDJSON_WRITE_DEFAULT_FLAGS=6) # rapidjson::kWriteNanAndInfFlag | rapidjson::kWriteNanAndInfNullFlag if (ARM_V8) @@ -40,7 +104,7 @@ endif() if (NOT ARM_TARGET) if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm64|ARM64|armv8-a)$") set(ARM_TARGET 8) - elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "^(armv7|armv7f|armv7s|armv7k|armv7-a|armv7l|armv7ve)$") + elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "^(armv7|armv7f|armv7s|armv7k|armv7-a|armv7l|armv7ve|armv8l)$") set(ARM_TARGET 7) endif() endif() diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 9abf212a0..a36d18256 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -28,6 +28,11 @@ if (CMAKE_CXX_COMPILER_ID MATCHES GNU) elseif (ARM_TARGET EQUAL 7) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv7-a -mfpu=neon 
-flax-vector-conversions") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv7-a -mfpu=neon -flax-vector-conversions") + elseif (XMRIG_RISCV) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=${RVARCH}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=${RVARCH}") + + add_definitions(-DHAVE_ROTR) else() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maes") @@ -41,6 +46,8 @@ if (CMAKE_CXX_COMPILER_ID MATCHES GNU) else() set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static -Wl,--large-address-aware") endif() + elseif(CMAKE_SYSTEM_NAME STREQUAL "Haiku") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libgcc") else() set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libgcc -static-libstdc++") endif() @@ -74,6 +81,11 @@ elseif (CMAKE_CXX_COMPILER_ID MATCHES Clang) elseif (ARM_TARGET EQUAL 7) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=neon -march=${CMAKE_SYSTEM_PROCESSOR}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon -march=${CMAKE_SYSTEM_PROCESSOR}") + elseif (XMRIG_RISCV) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=${RVARCH}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=${RVARCH}") + + add_definitions(-DHAVE_ROTR) else() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maes") diff --git a/cmake/os.cmake b/cmake/os.cmake index 8f70e9f42..3025e0c09 100644 --- a/cmake/os.cmake +++ b/cmake/os.cmake @@ -17,6 +17,10 @@ else() set(XMRIG_OS_LINUX ON) elseif(CMAKE_SYSTEM_NAME STREQUAL FreeBSD OR CMAKE_SYSTEM_NAME STREQUAL DragonFly) set(XMRIG_OS_FREEBSD ON) + elseif(CMAKE_SYSTEM_NAME STREQUAL OpenBSD) + set(XMRIG_OS_OPENBSD ON) + elseif(CMAKE_SYSTEM_NAME STREQUAL "Haiku") + set(XMRIG_OS_HAIKU ON) endif() endif() @@ -43,6 +47,10 @@ elseif(XMRIG_OS_UNIX) add_definitions(-DXMRIG_OS_LINUX) elseif (XMRIG_OS_FREEBSD) add_definitions(-DXMRIG_OS_FREEBSD) + elseif (XMRIG_OS_OPENBSD) + add_definitions(-DXMRIG_OS_OPENBSD) + elseif (XMRIG_OS_HAIKU) + add_definitions(-DXMRIG_OS_HAIKU) endif() endif() diff --git a/cmake/randomx.cmake b/cmake/randomx.cmake index a50e078fd..c15024c97 100644 --- a/cmake/randomx.cmake +++ b/cmake/randomx.cmake @@ -62,7 +62,7 @@ if (WITH_RANDOMX) src/crypto/randomx/jit_compiler_x86_static.asm src/crypto/randomx/jit_compiler_x86.cpp ) - elseif (WITH_ASM AND NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8) + elseif (WITH_ASM AND NOT XMRIG_ARM AND NOT XMRIG_RISCV AND CMAKE_SIZEOF_VOID_P EQUAL 8) list(APPEND SOURCES_CRYPTO src/crypto/randomx/jit_compiler_x86_static.S src/crypto/randomx/jit_compiler_x86.cpp @@ -80,6 +80,16 @@ if (WITH_RANDOMX) else() set_property(SOURCE src/crypto/randomx/jit_compiler_a64_static.S PROPERTY LANGUAGE C) endif() + elseif (XMRIG_RISCV AND CMAKE_SIZEOF_VOID_P EQUAL 8) + list(APPEND SOURCES_CRYPTO + src/crypto/randomx/jit_compiler_rv64_static.S + src/crypto/randomx/jit_compiler_rv64_vector_static.S + src/crypto/randomx/jit_compiler_rv64.cpp + src/crypto/randomx/jit_compiler_rv64_vector.cpp + ) + # cheat because cmake and ccache hate each other + set_property(SOURCE src/crypto/randomx/jit_compiler_rv64_static.S PROPERTY LANGUAGE C) + set_property(SOURCE src/crypto/randomx/jit_compiler_rv64_vector_static.S PROPERTY LANGUAGE C) else() list(APPEND SOURCES_CRYPTO src/crypto/randomx/jit_compiler_fallback.cpp @@ -116,7 +126,7 @@ if (WITH_RANDOMX) ) endif() - if (WITH_MSR AND NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8 AND (XMRIG_OS_WIN OR XMRIG_OS_LINUX)) + if (WITH_MSR AND NOT XMRIG_ARM AND NOT XMRIG_RISCV AND CMAKE_SIZEOF_VOID_P EQUAL 8 AND 
(XMRIG_OS_WIN OR XMRIG_OS_LINUX)) add_definitions(/DXMRIG_FEATURE_MSR) add_definitions(/DXMRIG_FIX_RYZEN) message("-- WITH_MSR=ON") diff --git a/doc/RISCV_PERF_TUNING.md b/doc/RISCV_PERF_TUNING.md new file mode 100644 index 000000000..b37a530d3 --- /dev/null +++ b/doc/RISCV_PERF_TUNING.md @@ -0,0 +1,365 @@ +# RISC-V Performance Optimization Guide + +This guide provides comprehensive instructions for optimizing XMRig on RISC-V architectures. + +## Build Optimizations + +### Compiler Flags Applied Automatically + +The CMake build now applies aggressive RISC-V-specific optimizations: + +```cmake +# RISC-V ISA with extensions +-march=rv64gcv_zba_zbb_zbc_zbs + +# Aggressive compiler optimizations +-funroll-loops # Unroll loops for ILP (instruction-level parallelism) +-fomit-frame-pointer # Free up frame pointer register (RISC-V has limited registers) +-fno-common # Better code generation for global variables +-finline-functions # Inline more functions for better cache locality +-ffast-math # Relaxed FP semantics (safe for mining) +-flto # Link-time optimization for cross-module inlining + +# Release build additions +-minline-atomics # Inline atomic operations for faster synchronization +``` + +### Optimal Build Command + +```bash +mkdir build && cd build +cmake -DCMAKE_BUILD_TYPE=Release .. +make -j$(nproc) +``` + +**Expected build time**: 5-15 minutes depending on CPU + +## Runtime Optimizations + +### 1. Memory Configuration (Most Important) + +Enable huge pages to reduce TLB misses and fragmentation: + +#### Enable 2MB Huge Pages +```bash +# Calculate required huge pages (1 page = 2MB) +# For 2 GB dataset: 1024 pages +# For cache + dataset: 1536 pages minimum +sudo sysctl -w vm.nr_hugepages=2048 +``` + +Verify: +```bash +grep HugePages /proc/meminfo +# Expected: HugePages_Free should be close to nr_hugepages +``` + +#### Enable 1GB Huge Pages (Optional but Recommended) + +```bash +# Run provided helper script +sudo ./scripts/enable_1gb_pages.sh + +# Verify 1GB pages are available +cat /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages +# Should be: >= 1 (one 1GB page) +``` + +Update config.json: +```json +{ + "cpu": { + "huge-pages": true + }, + "randomx": { + "1gb-pages": true + } +} +``` + +### 2. RandomX Mode Selection + +| Mode | Memory | Init Time | Throughput | Recommendation | +|------|--------|-----------|-----------|-----------------| +| **light** | 256 MB | 10 sec | Low | Testing, resource-constrained | +| **fast** | 2 GB | 2-5 min* | High | Production (with huge pages) | +| **auto** | 2 GB | Varies | High | Default (uses fast if possible) | + +*With optimizations; can be 30+ minutes without huge pages + +**For RISC-V, use fast mode with huge pages enabled.** + +### 3. Dataset Initialization Threads + +Optimal thread count = 60-75% of CPU cores (leaves headroom for OS/other tasks) + +```json +{ + "randomx": { + "init": 4 + } +} +``` + +Or auto-detect (rewritten for RISC-V): +```json +{ + "randomx": { + "init": -1 + } +} +``` + +### 4. CPU Affinity (Optional) + +Pin threads to specific cores for better cache locality: + +```json +{ + "cpu": { + "rx/0": [ + { "threads": 1, "affinity": 0 }, + { "threads": 1, "affinity": 1 }, + { "threads": 1, "affinity": 2 }, + { "threads": 1, "affinity": 3 } + ] + } +} +``` + +### 5. 
CPU Governor (Linux) + +Set to performance mode for maximum throughput: + +```bash +# Check current governor +cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor + +# Set to performance (requires root) +echo performance | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor + +# Verify +cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor +# Should output: performance +``` + +## Configuration Examples + +### Minimum (Testing) +```json +{ + "randomx": { + "mode": "light" + }, + "cpu": { + "huge-pages": false + } +} +``` + +### Recommended (Balanced) +```json +{ + "randomx": { + "mode": "auto", + "init": 4, + "1gb-pages": true + }, + "cpu": { + "huge-pages": true, + "priority": 2 + } +} +``` + +### Maximum Performance (Production) +```json +{ + "randomx": { + "mode": "fast", + "init": -1, + "1gb-pages": true, + "scratchpad_prefetch_mode": 1 + }, + "cpu": { + "huge-pages": true, + "priority": 3, + "yield": false + } +} +``` + +## CLI Equivalents + +```bash +# Light mode +./xmrig --randomx-mode=light + +# Fast mode with 4 init threads +./xmrig --randomx-mode=fast --randomx-init=4 + +# Benchmark +./xmrig --bench=1M --algo=rx/0 + +# Benchmark Wownero variant (1 MB scratchpad) +./xmrig --bench=1M --algo=rx/wow + +# Mine to pool +./xmrig -o pool.example.com:3333 -u YOUR_WALLET -p x +``` + +## Performance Diagnostics + +### Check if Vector Extensions are Detected + +Look for `FEATURES:` line in output: +``` + * CPU: ky,x60 (uarch ky,x1) + * FEATURES: rv64imafdcv zba zbb zbc zbs +``` + +- `v`: Vector extension (RVV) ✓ +- `zba`, `zbb`, `zbc`, `zbs`: Bit manipulation ✓ +- If missing, make sure build used `-march=rv64gcv_zba_zbb_zbc_zbs` + +### Verify Huge Pages at Runtime + +```bash +# Run xmrig with --bench=1M and check output +./xmrig --bench=1M + +# Look for line like: +# HUGE PAGES 100% 1 / 1 (1024 MB) +``` + +- Should show 100% for dataset AND threads +- If less, increase `vm.nr_hugepages` and reboot + +### Monitor Performance + +```bash +# Run benchmark multiple times to find stable hashrate +./xmrig --bench=1M --algo=rx/0 +./xmrig --bench=10M --algo=rx/0 +./xmrig --bench=100M --algo=rx/0 + +# Check system load and memory during mining +while true; do free -h; grep HugePages /proc/meminfo; sleep 2; done +``` + +## Expected Performance + +### Hardware: Orange Pi RV2 (Ky X1, 8 cores @ ~1.5 GHz) + +| Config | Mode | Hashrate | Init Time | +|--------|------|----------|-----------| +| Scalar (baseline) | fast | 30 H/s | 10 min | +| Scalar + huge pages | fast | 33 H/s | 2 min | +| RVV (if enabled) | fast | 70-100 H/s | 3 min | + +*Actual results depend on CPU frequency, memory speed, and load* + +## Troubleshooting + +### Long Initialization Times (30+ minutes) + +**Cause**: Huge pages not enabled, system using swap +**Solution**: +1. Enable huge pages: `sudo sysctl -w vm.nr_hugepages=2048` +2. Reboot: `sudo reboot` +3. Reduce mining threads to free memory +4. Check available memory: `free -h` + +### Low Hashrate (50% of expected) + +**Cause**: CPU governor set to power-save, no huge pages, high contention +**Solution**: +1. Set governor to performance: `echo performance | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor` +2. Enable huge pages +3. Reduce number of mining threads +4. Check system load: `top` or `htop` + +### Dataset Init Crashes or Hangs + +**Cause**: Insufficient memory, corrupted huge pages +**Solution**: +1. Disable huge pages temporarily: set `huge-pages: false` in config +2. Reduce mining threads +3. Reboot and re-enable huge pages +4. 
Try light mode: `--randomx-mode=light` + +### Out of Memory During Benchmark + +**Cause**: Not enough RAM for dataset + cache + threads +**Solution**: +1. Use light mode: `--randomx-mode=light` +2. Reduce mining threads: `--threads=1` +3. Increase available memory (kill other processes) +4. Check: `free -h` before mining + +## Advanced Tuning + +### Vector Length (VLEN) Detection + +The vector register length (VLEN) of the RISC-V vector extension affects performance: + +```bash +# Check VLEN on your CPU +cat /proc/cpuinfo | grep vlen + +# Expected values: +# - 128 bits (16 bytes) = minimum +# - 256 bits (32 bytes) = common +# - 512 bits (64 bytes) = high performance +``` + +Larger VLEN generally means better performance for vectorized operations. + +### Prefetch Optimization + +The code automatically optimizes memory prefetching for RISC-V: + +``` +scratchpad_prefetch_mode: 0 = disabled (slowest) +scratchpad_prefetch_mode: 1 = prefetch.r (default, recommended) +scratchpad_prefetch_mode: 2 = prefetch.w (experimental) +``` + +### Memory Bandwidth Saturation + +If you are experiencing memory bandwidth saturation (high memory latency): + +1. Reduce the number of mining threads +2. Increase the L2/L3 cache available to each thread by mining with fewer threads per core +3. Enable cache QoS (AMD Ryzen): `cache_qos: true` + +## Building with Custom Flags + +To build with custom RISC-V flags: + +```bash +mkdir build && cd build +cmake -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_C_FLAGS="-march=rv64gcv_zba_zbb_zbc_zbs -O3 -funroll-loops -fomit-frame-pointer" \ + .. +make -j$(nproc) +``` + +## Future Optimizations + +- [ ] Zbk* (crypto) support detection and usage +- [ ] Optimal VLEN-aware algorithm selection +- [ ] Per-core memory affinity (NUMA support) +- [ ] Dynamic thread count adjustment based on thermals +- [ ] Cross-compile optimizations for various RISC-V cores + +## References + +- [RISC-V Vector Extension Spec](https://github.com/riscv/riscv-v-spec) +- [RISC-V Bit Manipulation Spec](https://github.com/riscv/riscv-bitmanip) +- [RISC-V Crypto Spec](https://github.com/riscv/riscv-crypto) +- [XMRig Documentation](https://xmrig.com/docs) + +--- + +For further optimization, enable RVV intrinsics by replacing `sse2rvv.h` with `sse2rvv_optimized.h` in the build. 
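As a companion to the VLEN detection section of the guide above, here is a minimal standalone C sketch (not part of the patch) that queries VLEN at runtime through the standard RVV intrinsics, the same `__riscv_*` API the patch's sse2rvv.h uses. It assumes a toolchain built with `-march=rv64gcv` and a CPU that actually implements the V extension.

```c
/* Minimal standalone sketch (not from the patch): query the vector register
 * length (VLEN) at runtime using the standard RVV intrinsics.
 * Build with: gcc -march=rv64gcv -O2 vlen.c -o vlen */
#include <stdio.h>
#include <riscv_vector.h>

int main(void)
{
    /* VLMAX for SEW=8, LMUL=1 equals VLEN/8, i.e. the register width in bytes. */
    size_t vlen_bytes = __riscv_vsetvlmax_e8m1();

    printf("VLEN = %zu bits (%zu bytes per vector register)\n",
           vlen_bytes * 8, vlen_bytes);
    return 0;
}
```

On hardware without the V extension this binary would trap, so a production check would gate it behind `/proc/cpuinfo` or hwprobe as the guide describes.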
diff --git a/scripts/randomx_boost.sh b/scripts/randomx_boost.sh index 8580229a5..4181a95c0 100755 --- a/scripts/randomx_boost.sh +++ b/scripts/randomx_boost.sh @@ -12,7 +12,7 @@ if grep -E 'AMD Ryzen|AMD EPYC|AuthenticAMD' /proc/cpuinfo > /dev/null; then if grep "cpu family[[:space:]]\{1,\}:[[:space:]]25" /proc/cpuinfo > /dev/null; then - if grep "model[[:space:]]\{1,\}:[[:space:]]97" /proc/cpuinfo > /dev/null; + if grep "model[[:space:]]\{1,\}:[[:space:]]\(97\|117\)" /proc/cpuinfo > /dev/null; then echo "Detected Zen4 CPU" wrmsr -a 0xc0011020 0x4400000000000 diff --git a/src/3rdparty/argon2/CMakeLists.txt b/src/3rdparty/argon2/CMakeLists.txt index a9751fd94..7d09e5172 100644 --- a/src/3rdparty/argon2/CMakeLists.txt +++ b/src/3rdparty/argon2/CMakeLists.txt @@ -35,7 +35,7 @@ if (CMAKE_C_COMPILER_ID MATCHES MSVC) add_feature_impl(xop "" HAVE_XOP) add_feature_impl(avx2 "/arch:AVX2" HAVE_AVX2) add_feature_impl(avx512f "/arch:AVX512F" HAVE_AVX512F) -elseif (NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8) +elseif (NOT XMRIG_ARM AND NOT XMRIG_RISCV AND CMAKE_SIZEOF_VOID_P EQUAL 8) function(add_feature_impl FEATURE GCC_FLAG DEF) add_library(argon2-${FEATURE} STATIC arch/x86_64/lib/argon2-${FEATURE}.c) target_include_directories(argon2-${FEATURE} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../) diff --git a/src/3rdparty/libethash/endian.h b/src/3rdparty/libethash/endian.h index f960d7429..deb57620a 100644 --- a/src/3rdparty/libethash/endian.h +++ b/src/3rdparty/libethash/endian.h @@ -31,7 +31,7 @@ #include #define ethash_swap_u32(input_) OSSwapInt32(input_) #define ethash_swap_u64(input_) OSSwapInt64(input_) -#elif defined(__FreeBSD__) || defined(__DragonFly__) || defined(__NetBSD__) +#elif defined(__FreeBSD__) || defined(__DragonFly__) || defined(__NetBSD__) || defined(__HAIKU__) #define ethash_swap_u32(input_) bswap32(input_) #define ethash_swap_u64(input_) bswap64(input_) #elif defined(__OpenBSD__) diff --git a/src/backend/cpu/CpuWorker.cpp b/src/backend/cpu/CpuWorker.cpp index bef2e898e..cba7e8839 100644 --- a/src/backend/cpu/CpuWorker.cpp +++ b/src/backend/cpu/CpuWorker.cpp @@ -87,14 +87,14 @@ xmrig::CpuWorker::CpuWorker(size_t id, const CpuLaunchData &data) : if (!cn_heavyZen3Memory) { // Round up number of threads to the multiple of 8 const size_t num_threads = ((m_threads + 7) / 8) * 8; - cn_heavyZen3Memory = new VirtualMemory(m_algorithm.l3() * num_threads, data.hugePages, false, false, node()); + cn_heavyZen3Memory = new VirtualMemory(m_algorithm.l3() * num_threads, data.hugePages, false, false, node(), VirtualMemory::kDefaultHugePageSize); } m_memory = cn_heavyZen3Memory; } else # endif { - m_memory = new VirtualMemory(m_algorithm.l3() * N, data.hugePages, false, true, node()); + m_memory = new VirtualMemory(m_algorithm.l3() * N, data.hugePages, false, true, node(), VirtualMemory::kDefaultHugePageSize); } # ifdef XMRIG_ALGO_GHOSTRIDER diff --git a/src/backend/cpu/cpu.cmake b/src/backend/cpu/cpu.cmake index f9a02abd8..3c9d779b0 100644 --- a/src/backend/cpu/cpu.cmake +++ b/src/backend/cpu/cpu.cmake @@ -46,7 +46,12 @@ else() set(CPUID_LIB "") endif() -if (XMRIG_ARM) +if (XMRIG_RISCV) + list(APPEND SOURCES_BACKEND_CPU + src/backend/cpu/platform/lscpu_riscv.cpp + src/backend/cpu/platform/BasicCpuInfo_riscv.cpp + ) +elseif (XMRIG_ARM) list(APPEND SOURCES_BACKEND_CPU src/backend/cpu/platform/BasicCpuInfo_arm.cpp) if (XMRIG_OS_WIN) diff --git a/src/backend/cpu/interfaces/ICpuInfo.h b/src/backend/cpu/interfaces/ICpuInfo.h index 8d10d4d29..e28a14734 100644 --- a/src/backend/cpu/interfaces/ICpuInfo.h 
+++ b/src/backend/cpu/interfaces/ICpuInfo.h @@ -91,7 +91,7 @@ public: ICpuInfo() = default; virtual ~ICpuInfo() = default; -# if defined(__x86_64__) || defined(_M_AMD64) || defined (__arm64__) || defined (__aarch64__) +# if defined(__x86_64__) || defined(_M_AMD64) || defined (__arm64__) || defined (__aarch64__) || defined(__riscv) && (__riscv_xlen == 64) inline constexpr static bool is64bit() { return true; } # else inline constexpr static bool is64bit() { return false; } diff --git a/src/backend/cpu/platform/BasicCpuInfo.cpp b/src/backend/cpu/platform/BasicCpuInfo.cpp index 30a78f828..9f5595aac 100644 --- a/src/backend/cpu/platform/BasicCpuInfo.cpp +++ b/src/backend/cpu/platform/BasicCpuInfo.cpp @@ -250,7 +250,7 @@ xmrig::BasicCpuInfo::BasicCpuInfo() : break; case 0x19: - if (m_model == 0x61) { + if ((m_model == 0x61) || (m_model == 0x75)) { m_arch = ARCH_ZEN4; m_msrMod = MSR_MOD_RYZEN_19H_ZEN4; } diff --git a/src/backend/cpu/platform/BasicCpuInfo.h b/src/backend/cpu/platform/BasicCpuInfo.h index 5ea5661d1..97fe20e1b 100644 --- a/src/backend/cpu/platform/BasicCpuInfo.h +++ b/src/backend/cpu/platform/BasicCpuInfo.h @@ -65,7 +65,7 @@ protected: inline Vendor vendor() const override { return m_vendor; } inline uint32_t model() const override { -# ifndef XMRIG_ARM +# if !defined(XMRIG_ARM) && !defined(XMRIG_RISCV) return m_model; # else return 0; @@ -80,7 +80,7 @@ protected: Vendor m_vendor = VENDOR_UNKNOWN; private: -# ifndef XMRIG_ARM +# if !defined(XMRIG_ARM) && !defined(XMRIG_RISCV) uint32_t m_procInfo = 0; uint32_t m_family = 0; uint32_t m_model = 0; diff --git a/src/backend/cpu/platform/BasicCpuInfo_riscv.cpp b/src/backend/cpu/platform/BasicCpuInfo_riscv.cpp new file mode 100644 index 000000000..fd9c9ce62 --- /dev/null +++ b/src/backend/cpu/platform/BasicCpuInfo_riscv.cpp @@ -0,0 +1,116 @@ +/* XMRig + * Copyright (c) 2025 Slayingripper + * Copyright (c) 2018-2025 SChernykh + * Copyright (c) 2017-2019 XMR-Stak , + * Copyright (c) 2016-2025 XMRig + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ + +#include +#include +#include +#include + + +#include "backend/cpu/platform/BasicCpuInfo.h" +#include "base/tools/String.h" +#include "3rdparty/rapidjson/document.h" + + +namespace xmrig { + + +extern String cpu_name_riscv(); +extern bool has_riscv_vector(); +extern bool has_riscv_crypto(); + + +} // namespace xmrig + + +xmrig::BasicCpuInfo::BasicCpuInfo() : + m_threads(std::thread::hardware_concurrency()) +{ + m_units.resize(m_threads); + for (int32_t i = 0; i < static_cast(m_threads); ++i) { + m_units[i] = i; + } + + memcpy(m_brand, "RISC-V", 6); + + auto name = cpu_name_riscv(); + if (!name.isNull()) { + strncpy(m_brand, name.data(), sizeof(m_brand) - 1); + } + + // Check for crypto extensions (Zknd/Zkne/Zknh - AES and SHA) + m_flags.set(FLAG_AES, has_riscv_crypto()); + + // RISC-V typically supports 1GB huge pages + m_flags.set(FLAG_PDPE1GB, std::ifstream("/sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages").good()); +} + + +const char *xmrig::BasicCpuInfo::backend() const +{ + return "basic/1"; +} + + +xmrig::CpuThreads xmrig::BasicCpuInfo::threads(const Algorithm &algorithm, uint32_t) const +{ +# ifdef XMRIG_ALGO_GHOSTRIDER + if (algorithm.family() == Algorithm::GHOSTRIDER) { + return CpuThreads(threads(), 8); + } +# endif + + return CpuThreads(threads()); +} + + +rapidjson::Value xmrig::BasicCpuInfo::toJSON(rapidjson::Document &doc) const +{ + using namespace rapidjson; + auto &allocator = doc.GetAllocator(); + + Value out(kObjectType); + + out.AddMember("brand", StringRef(brand()), allocator); + out.AddMember("aes", hasAES(), allocator); + out.AddMember("avx2", false, allocator); + out.AddMember("x64", is64bit(), allocator); // DEPRECATED will be removed in the next major release. + out.AddMember("64_bit", is64bit(), allocator); + out.AddMember("l2", static_cast(L2()), allocator); + out.AddMember("l3", static_cast(L3()), allocator); + out.AddMember("cores", static_cast(cores()), allocator); + out.AddMember("threads", static_cast(threads()), allocator); + out.AddMember("packages", static_cast(packages()), allocator); + out.AddMember("nodes", static_cast(nodes()), allocator); + out.AddMember("backend", StringRef(backend()), allocator); + out.AddMember("msr", "none", allocator); + out.AddMember("assembly", "none", allocator); + out.AddMember("arch", "riscv64", allocator); + + Value flags(kArrayType); + + if (hasAES()) { + flags.PushBack("aes", allocator); + } + + out.AddMember("flags", flags, allocator); + + return out; +} diff --git a/src/backend/cpu/platform/HwlocCpuInfo.cpp b/src/backend/cpu/platform/HwlocCpuInfo.cpp index f796416b4..1cb071b7a 100644 --- a/src/backend/cpu/platform/HwlocCpuInfo.cpp +++ b/src/backend/cpu/platform/HwlocCpuInfo.cpp @@ -87,7 +87,7 @@ static inline size_t countByType(hwloc_topology_t topology, hwloc_obj_type_t typ } -#ifndef XMRIG_ARM +#if !defined(XMRIG_ARM) && !defined(XMRIG_RISCV) static inline std::vector findByType(hwloc_obj_t obj, hwloc_obj_type_t type) { std::vector out; @@ -207,7 +207,7 @@ bool xmrig::HwlocCpuInfo::membind(hwloc_const_bitmap_t nodeset) xmrig::CpuThreads xmrig::HwlocCpuInfo::threads(const Algorithm &algorithm, uint32_t limit) const { -# ifndef XMRIG_ARM +# if !defined(XMRIG_ARM) && !defined(XMRIG_RISCV) if (L2() == 0 && L3() == 0) { return BasicCpuInfo::threads(algorithm, limit); } @@ -277,7 +277,7 @@ xmrig::CpuThreads xmrig::HwlocCpuInfo::allThreads(const Algorithm &algorithm, ui void xmrig::HwlocCpuInfo::processTopLevelCache(hwloc_obj_t cache, const Algorithm &algorithm, CpuThreads &threads, size_t limit) const { 
-# ifndef XMRIG_ARM +# if !defined(XMRIG_ARM) && !defined(XMRIG_RISCV) constexpr size_t oneMiB = 1024U * 1024U; size_t PUs = countByType(cache, HWLOC_OBJ_PU); diff --git a/src/backend/cpu/platform/lscpu_riscv.cpp b/src/backend/cpu/platform/lscpu_riscv.cpp new file mode 100644 index 000000000..d19d26a8f --- /dev/null +++ b/src/backend/cpu/platform/lscpu_riscv.cpp @@ -0,0 +1,140 @@ +/* XMRig + * Copyright (c) 2025 Slayingripper + * Copyright (c) 2018-2025 SChernykh + * Copyright (c) 2016-2025 XMRig + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include "base/tools/String.h" +#include "3rdparty/fmt/core.h" + +#include +#include +#include + +namespace xmrig { + +struct riscv_cpu_desc +{ + String model; + String isa; + String uarch; + bool has_vector = false; + bool has_crypto = false; + + inline bool isReady() const { return !model.isNull(); } +}; + +static bool lookup_riscv(char *line, const char *pattern, String &value) +{ + char *p = strstr(line, pattern); + if (!p) { + return false; + } + + p += strlen(pattern); + while (isspace(*p)) { + ++p; + } + + if (*p == ':') { + ++p; + } + + while (isspace(*p)) { + ++p; + } + + // Remove trailing newline + size_t len = strlen(p); + if (len > 0 && p[len - 1] == '\n') { + p[len - 1] = '\0'; + } + + // Ensure we call the const char* assignment (which performs a copy) + // instead of the char* overload (which would take ownership of the pointer) + value = (const char*)p; + return true; +} + +static bool read_riscv_cpuinfo(riscv_cpu_desc *desc) +{ + auto fp = fopen("/proc/cpuinfo", "r"); + if (!fp) { + return false; + } + + char buf[2048]; // Larger buffer for long ISA strings + while (fgets(buf, sizeof(buf), fp) != nullptr) { + lookup_riscv(buf, "model name", desc->model); + + if (lookup_riscv(buf, "isa", desc->isa)) { + // Check for vector extensions + if (strstr(buf, "zve") || strstr(buf, "v_")) { + desc->has_vector = true; + } + // Check for crypto extensions (AES, SHA, etc.) 
+ // zkn* = NIST crypto suite, zks* = SM crypto suite + // Note: zba/zbb/zbc/zbs are bit-manipulation, NOT crypto + if (strstr(buf, "zknd") || strstr(buf, "zkne") || strstr(buf, "zknh") || + strstr(buf, "zksed") || strstr(buf, "zksh")) { + desc->has_crypto = true; + } + } + + lookup_riscv(buf, "uarch", desc->uarch); + + if (desc->isReady() && !desc->isa.isNull()) { + break; + } + } + + fclose(fp); + + return desc->isReady(); +} + +String cpu_name_riscv() +{ + riscv_cpu_desc desc; + if (read_riscv_cpuinfo(&desc)) { + if (!desc.uarch.isNull()) { + return fmt::format("{} ({})", desc.model, desc.uarch).c_str(); + } + return desc.model; + } + + return "RISC-V"; +} + +bool has_riscv_vector() +{ + riscv_cpu_desc desc; + if (read_riscv_cpuinfo(&desc)) { + return desc.has_vector; + } + return false; +} + +bool has_riscv_crypto() +{ + riscv_cpu_desc desc; + if (read_riscv_cpuinfo(&desc)) { + return desc.has_crypto; + } + return false; +} + +} // namespace xmrig diff --git a/src/base/kernel/Platform_unix.cpp b/src/base/kernel/Platform_unix.cpp index 4ffee2140..e53fe58d5 100644 --- a/src/base/kernel/Platform_unix.cpp +++ b/src/base/kernel/Platform_unix.cpp @@ -1,6 +1,6 @@ /* XMRig - * Copyright (c) 2018-2021 SChernykh - * Copyright (c) 2016-2021 XMRig , + * Copyright (c) 2018-2025 SChernykh + * Copyright (c) 2016-2025 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -71,11 +71,11 @@ char *xmrig::Platform::createUserAgent() #ifndef XMRIG_FEATURE_HWLOC -#ifdef __DragonFly__ +#if defined(__DragonFly__) || defined(XMRIG_OS_OPENBSD) || defined(XMRIG_OS_HAIKU) bool xmrig::Platform::setThreadAffinity(uint64_t cpu_id) { - return true; + return false; } #else diff --git a/src/base/net/tls/TlsContext.cpp b/src/base/net/tls/TlsContext.cpp index 54b904eab..410059fb5 100644 --- a/src/base/net/tls/TlsContext.cpp +++ b/src/base/net/tls/TlsContext.cpp @@ -1,7 +1,7 @@ /* XMRig * Copyright (c) 2018 Lee Clagett - * Copyright (c) 2018-2023 SChernykh - * Copyright (c) 2016-2023 XMRig , + * Copyright (c) 2018-2025 SChernykh + * Copyright (c) 2016-2025 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -45,7 +45,7 @@ namespace xmrig { // https://wiki.openssl.org/index.php/Diffie-Hellman_parameters -#if OPENSSL_VERSION_NUMBER < 0x30000000L || defined(LIBRESSL_VERSION_NUMBER) +#if OPENSSL_VERSION_NUMBER < 0x30000000L || (defined(LIBRESSL_VERSION_NUMBER) && !defined(LIBRESSL_HAS_TLS1_3)) static DH *get_dh2048() { static unsigned char dhp_2048[] = { @@ -152,7 +152,7 @@ bool xmrig::TlsContext::load(const TlsConfig &config) SSL_CTX_set_options(m_ctx, SSL_OP_NO_SSLv2 | SSL_OP_NO_SSLv3); SSL_CTX_set_options(m_ctx, SSL_OP_CIPHER_SERVER_PREFERENCE); -# if OPENSSL_VERSION_NUMBER >= 0x1010100fL && !defined(LIBRESSL_VERSION_NUMBER) +# if OPENSSL_VERSION_NUMBER >= 0x1010100fL || defined(LIBRESSL_HAS_TLS1_3) SSL_CTX_set_max_early_data(m_ctx, 0); # endif @@ -180,7 +180,7 @@ bool xmrig::TlsContext::setCipherSuites(const char *ciphersuites) return true; } -# if OPENSSL_VERSION_NUMBER >= 0x1010100fL && !defined(LIBRESSL_VERSION_NUMBER) +# if OPENSSL_VERSION_NUMBER >= 0x1010100fL || defined(LIBRESSL_HAS_TLS1_3) if (SSL_CTX_set_ciphersuites(m_ctx, ciphersuites) == 1) { return true; } @@ -194,7 +194,7 @@ bool xmrig::TlsContext::setCipherSuites(const char *ciphersuites) bool xmrig::TlsContext::setDH(const char *dhparam) { -# if 
OPENSSL_VERSION_NUMBER < 0x30000000L || defined(LIBRESSL_VERSION_NUMBER) +# if OPENSSL_VERSION_NUMBER < 0x30000000L || (defined(LIBRESSL_VERSION_NUMBER) && !defined(LIBRESSL_HAS_TLS1_3)) DH *dh = nullptr; if (dhparam != nullptr) { diff --git a/src/base/tools/cryptonote/BlockTemplate.cpp b/src/base/tools/cryptonote/BlockTemplate.cpp index 310fedf4d..1b64f2ee5 100644 --- a/src/base/tools/cryptonote/BlockTemplate.cpp +++ b/src/base/tools/cryptonote/BlockTemplate.cpp @@ -241,8 +241,13 @@ bool xmrig::BlockTemplate::parse(bool hashes) ar(m_amount); ar(m_outputType); - // output type must be txout_to_key (2) or txout_to_tagged_key (3) - if ((m_outputType != 2) && (m_outputType != 3)) { + const bool is_fcmp_pp = (m_coin == Coin::MONERO) && (m_version.first >= 17); + + // output type must be txout_to_key (2) or txout_to_tagged_key (3) for versions < 17, and txout_to_carrot_v1 (0) for version FCMP++ + if (is_fcmp_pp && (m_outputType == 0)) { + // all good + } + else if ((m_outputType != 2) && (m_outputType != 3)) { return false; } @@ -250,6 +255,11 @@ bool xmrig::BlockTemplate::parse(bool hashes) ar(m_ephPublicKey, kKeySize); + if (is_fcmp_pp) { + ar(m_carrotViewTag); + ar(m_janusAnchor); + } + if (m_coin == Coin::ZEPHYR) { if (m_outputType != 2) { return false; diff --git a/src/base/tools/cryptonote/BlockTemplate.h b/src/base/tools/cryptonote/BlockTemplate.h index c731aad23..a4e75f3ff 100644 --- a/src/base/tools/cryptonote/BlockTemplate.h +++ b/src/base/tools/cryptonote/BlockTemplate.h @@ -148,6 +148,8 @@ private: Buffer m_hashes; Buffer m_minerTxMerkleTreeBranch; uint8_t m_rootHash[kHashSize]{}; + uint8_t m_carrotViewTag[3]{}; + uint8_t m_janusAnchor[16]{}; }; diff --git a/src/crypto/cn/CnHash.cpp b/src/crypto/cn/CnHash.cpp index 4b4b006f3..b1f228b21 100644 --- a/src/crypto/cn/CnHash.cpp +++ b/src/crypto/cn/CnHash.cpp @@ -23,7 +23,7 @@ #include "crypto/common/VirtualMemory.h" -#if defined(XMRIG_ARM) +#if defined(XMRIG_ARM) || defined(XMRIG_RISCV) # include "crypto/cn/CryptoNight_arm.h" #else # include "crypto/cn/CryptoNight_x86.h" diff --git a/src/crypto/cn/CryptoNight.h b/src/crypto/cn/CryptoNight.h index 897890d28..d37c3ea8e 100644 --- a/src/crypto/cn/CryptoNight.h +++ b/src/crypto/cn/CryptoNight.h @@ -30,7 +30,7 @@ #include #include -#if defined _MSC_VER || defined XMRIG_ARM +#if defined _MSC_VER || defined XMRIG_ARM || defined XMRIG_RISCV # define ABI_ATTRIBUTE #else # define ABI_ATTRIBUTE __attribute__((ms_abi)) diff --git a/src/crypto/cn/CryptoNight_arm.h b/src/crypto/cn/CryptoNight_arm.h index 7b47e97da..eeb5bd007 100644 --- a/src/crypto/cn/CryptoNight_arm.h +++ b/src/crypto/cn/CryptoNight_arm.h @@ -27,6 +27,9 @@ #ifndef XMRIG_CRYPTONIGHT_ARM_H #define XMRIG_CRYPTONIGHT_ARM_H +#ifdef XMRIG_RISCV +# include "crypto/cn/sse2rvv.h" +#endif #include "base/crypto/keccak.h" #include "crypto/cn/CnAlgo.h" diff --git a/src/crypto/cn/CryptoNight_monero.h b/src/crypto/cn/CryptoNight_monero.h index a9975e784..6c3d115ed 100644 --- a/src/crypto/cn/CryptoNight_monero.h +++ b/src/crypto/cn/CryptoNight_monero.h @@ -30,7 +30,7 @@ #include // VARIANT ALTERATIONS -#ifndef XMRIG_ARM +#if !defined(XMRIG_ARM) && !defined(XMRIG_RISCV) # define VARIANT1_INIT(part) \ uint64_t tweak1_2_##part = 0; \ if (BASE == Algorithm::CN_1) { \ @@ -60,7 +60,7 @@ } -#ifndef XMRIG_ARM +#if !defined(XMRIG_ARM) && !defined(XMRIG_RISCV) # define VARIANT2_INIT(part) \ __m128i division_result_xmm_##part = _mm_cvtsi64_si128(static_cast(h##part[12])); \ __m128i sqrt_result_xmm_##part = _mm_cvtsi64_si128(static_cast(h##part[13])); diff 
--git a/src/crypto/cn/soft_aes.h b/src/crypto/cn/soft_aes.h index fc3712298..6de0089db 100644 --- a/src/crypto/cn/soft_aes.h +++ b/src/crypto/cn/soft_aes.h @@ -29,6 +29,8 @@ #if defined(XMRIG_ARM) # include "crypto/cn/sse2neon.h" +#elif defined(XMRIG_RISCV) +# include "crypto/cn/sse2rvv.h" #elif defined(__GNUC__) # include #else diff --git a/src/crypto/cn/sse2rvv.h b/src/crypto/cn/sse2rvv.h new file mode 100644 index 000000000..d5b525b51 --- /dev/null +++ b/src/crypto/cn/sse2rvv.h @@ -0,0 +1,748 @@ +/* XMRig + * Copyright (c) 2025 Slayingripper + * Copyright (c) 2018-2025 SChernykh + * Copyright (c) 2016-2025 XMRig + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +/* + * SSE to RISC-V Vector (RVV) optimized compatibility header + * Provides both scalar fallback and vectorized implementations using RVV intrinsics + * + * Based on sse2neon.h concepts, adapted for RISC-V architecture with RVV extensions + * Original sse2neon.h: https://github.com/DLTcollab/sse2neon + */ + +#ifndef XMRIG_SSE2RVV_OPTIMIZED_H +#define XMRIG_SSE2RVV_OPTIMIZED_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +/* Check if RVV is available */ +#if defined(__riscv_vector) +#include +#define USE_RVV_INTRINSICS 1 +#else +#define USE_RVV_INTRINSICS 0 +#endif + +/* 128-bit vector type */ +typedef union { + uint8_t u8[16]; + uint16_t u16[8]; + uint32_t u32[4]; + uint64_t u64[2]; + int8_t i8[16]; + int16_t i16[8]; + int32_t i32[4]; + int64_t i64[2]; +} __m128i_union; + +typedef __m128i_union __m128i; + +/* Set operations */ +static inline __m128i _mm_set_epi32(int e3, int e2, int e1, int e0) +{ + __m128i result; + result.i32[0] = e0; + result.i32[1] = e1; + result.i32[2] = e2; + result.i32[3] = e3; + return result; +} + +static inline __m128i _mm_set_epi64x(int64_t e1, int64_t e0) +{ + __m128i result; + result.i64[0] = e0; + result.i64[1] = e1; + return result; +} + +static inline __m128i _mm_setzero_si128(void) +{ + __m128i result; + memset(&result, 0, sizeof(result)); + return result; +} + +/* Extract/insert operations */ +static inline int _mm_cvtsi128_si32(__m128i a) +{ + return a.i32[0]; +} + +static inline int64_t _mm_cvtsi128_si64(__m128i a) +{ + return a.i64[0]; +} + +static inline __m128i _mm_cvtsi32_si128(int a) +{ + __m128i result = _mm_setzero_si128(); + result.i32[0] = a; + return result; +} + +static inline __m128i _mm_cvtsi64_si128(int64_t a) +{ + __m128i result = _mm_setzero_si128(); + result.i64[0] = a; + return result; +} + +/* Shuffle operations */ +static inline __m128i _mm_shuffle_epi32(__m128i a, int imm8) +{ + __m128i result; + result.u32[0] = a.u32[(imm8 >> 0) & 0x3]; + result.u32[1] = a.u32[(imm8 >> 2) & 0x3]; + result.u32[2] = a.u32[(imm8 >> 4) & 0x3]; + result.u32[3] = a.u32[(imm8 >> 6) & 0x3]; + return result; +} + +/* Logical operations - optimized with RVV when available */ +static inline __m128i _mm_xor_si128(__m128i a, __m128i b) +{ +#if USE_RVV_INTRINSICS + __m128i result; + size_t vl = 
__riscv_vsetvl_e64m1(2); + vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl); + vuint64m1_t vb = __riscv_vle64_v_u64m1(b.u64, vl); + vuint64m1_t vr = __riscv_vxor_vv_u64m1(va, vb, vl); + __riscv_vse64_v_u64m1(result.u64, vr, vl); + return result; +#else + __m128i result; + result.u64[0] = a.u64[0] ^ b.u64[0]; + result.u64[1] = a.u64[1] ^ b.u64[1]; + return result; +#endif +} + +static inline __m128i _mm_or_si128(__m128i a, __m128i b) +{ +#if USE_RVV_INTRINSICS + __m128i result; + size_t vl = __riscv_vsetvl_e64m1(2); + vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl); + vuint64m1_t vb = __riscv_vle64_v_u64m1(b.u64, vl); + vuint64m1_t vr = __riscv_vor_vv_u64m1(va, vb, vl); + __riscv_vse64_v_u64m1(result.u64, vr, vl); + return result; +#else + __m128i result; + result.u64[0] = a.u64[0] | b.u64[0]; + result.u64[1] = a.u64[1] | b.u64[1]; + return result; +#endif +} + +static inline __m128i _mm_and_si128(__m128i a, __m128i b) +{ +#if USE_RVV_INTRINSICS + __m128i result; + size_t vl = __riscv_vsetvl_e64m1(2); + vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl); + vuint64m1_t vb = __riscv_vle64_v_u64m1(b.u64, vl); + vuint64m1_t vr = __riscv_vand_vv_u64m1(va, vb, vl); + __riscv_vse64_v_u64m1(result.u64, vr, vl); + return result; +#else + __m128i result; + result.u64[0] = a.u64[0] & b.u64[0]; + result.u64[1] = a.u64[1] & b.u64[1]; + return result; +#endif +} + +static inline __m128i _mm_andnot_si128(__m128i a, __m128i b) +{ +#if USE_RVV_INTRINSICS + __m128i result; + size_t vl = __riscv_vsetvl_e64m1(2); + vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl); + vuint64m1_t vb = __riscv_vle64_v_u64m1(b.u64, vl); + vuint64m1_t vnot_a = __riscv_vnot_v_u64m1(va, vl); + vuint64m1_t vr = __riscv_vand_vv_u64m1(vnot_a, vb, vl); + __riscv_vse64_v_u64m1(result.u64, vr, vl); + return result; +#else + __m128i result; + result.u64[0] = (~a.u64[0]) & b.u64[0]; + result.u64[1] = (~a.u64[1]) & b.u64[1]; + return result; +#endif +} + +/* Shift operations */ +static inline __m128i _mm_slli_si128(__m128i a, int imm8) +{ +#if USE_RVV_INTRINSICS + __m128i result = _mm_setzero_si128(); + int count = imm8 & 0xFF; + if (count > 15) return result; + + size_t vl = __riscv_vsetvl_e8m1(16); + vuint8m1_t va = __riscv_vle8_v_u8m1(a.u8, vl); + vuint8m1_t vr = __riscv_vslideup_vx_u8m1(__riscv_vmv_v_x_u8m1(0, vl), va, count, vl); + __riscv_vse8_v_u8m1(result.u8, vr, vl); + return result; +#else + __m128i result = _mm_setzero_si128(); + int count = imm8 & 0xFF; + if (count > 15) return result; + + for (int i = 0; i < 16 - count; i++) { + result.u8[i + count] = a.u8[i]; + } + return result; +#endif +} + +static inline __m128i _mm_srli_si128(__m128i a, int imm8) +{ +#if USE_RVV_INTRINSICS + __m128i result = _mm_setzero_si128(); + int count = imm8 & 0xFF; + if (count > 15) return result; + + size_t vl = __riscv_vsetvl_e8m1(16); + vuint8m1_t va = __riscv_vle8_v_u8m1(a.u8, vl); + vuint8m1_t vr = __riscv_vslidedown_vx_u8m1(va, count, vl); + __riscv_vse8_v_u8m1(result.u8, vr, vl); + return result; +#else + __m128i result = _mm_setzero_si128(); + int count = imm8 & 0xFF; + if (count > 15) return result; + + for (int i = count; i < 16; i++) { + result.u8[i - count] = a.u8[i]; + } + return result; +#endif +} + +static inline __m128i _mm_slli_epi64(__m128i a, int imm8) +{ +#if USE_RVV_INTRINSICS + __m128i result; + if (imm8 > 63) { + result.u64[0] = 0; + result.u64[1] = 0; + } else { + size_t vl = __riscv_vsetvl_e64m1(2); + vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl); + vuint64m1_t vr = __riscv_vsll_vx_u64m1(va, imm8, vl); + 
__riscv_vse64_v_u64m1(result.u64, vr, vl); + } + return result; +#else + __m128i result; + if (imm8 > 63) { + result.u64[0] = 0; + result.u64[1] = 0; + } else { + result.u64[0] = a.u64[0] << imm8; + result.u64[1] = a.u64[1] << imm8; + } + return result; +#endif +} + +static inline __m128i _mm_srli_epi64(__m128i a, int imm8) +{ +#if USE_RVV_INTRINSICS + __m128i result; + if (imm8 > 63) { + result.u64[0] = 0; + result.u64[1] = 0; + } else { + size_t vl = __riscv_vsetvl_e64m1(2); + vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl); + vuint64m1_t vr = __riscv_vsrl_vx_u64m1(va, imm8, vl); + __riscv_vse64_v_u64m1(result.u64, vr, vl); + } + return result; +#else + __m128i result; + if (imm8 > 63) { + result.u64[0] = 0; + result.u64[1] = 0; + } else { + result.u64[0] = a.u64[0] >> imm8; + result.u64[1] = a.u64[1] >> imm8; + } + return result; +#endif +} + +/* Load/store operations - optimized with RVV */ +static inline __m128i _mm_load_si128(const __m128i* p) +{ +#if USE_RVV_INTRINSICS + __m128i result; + size_t vl = __riscv_vsetvl_e64m1(2); + vuint64m1_t v = __riscv_vle64_v_u64m1((const uint64_t*)p, vl); + __riscv_vse64_v_u64m1(result.u64, v, vl); + return result; +#else + __m128i result; + memcpy(&result, p, sizeof(__m128i)); + return result; +#endif +} + +static inline __m128i _mm_loadu_si128(const __m128i* p) +{ + __m128i result; + memcpy(&result, p, sizeof(__m128i)); + return result; +} + +static inline void _mm_store_si128(__m128i* p, __m128i a) +{ +#if USE_RVV_INTRINSICS + size_t vl = __riscv_vsetvl_e64m1(2); + vuint64m1_t v = __riscv_vle64_v_u64m1(a.u64, vl); + __riscv_vse64_v_u64m1((uint64_t*)p, v, vl); +#else + memcpy(p, &a, sizeof(__m128i)); +#endif +} + +static inline void _mm_storeu_si128(__m128i* p, __m128i a) +{ + memcpy(p, &a, sizeof(__m128i)); +} + +/* Arithmetic operations - optimized with RVV */ +static inline __m128i _mm_add_epi64(__m128i a, __m128i b) +{ +#if USE_RVV_INTRINSICS + __m128i result; + size_t vl = __riscv_vsetvl_e64m1(2); + vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl); + vuint64m1_t vb = __riscv_vle64_v_u64m1(b.u64, vl); + vuint64m1_t vr = __riscv_vadd_vv_u64m1(va, vb, vl); + __riscv_vse64_v_u64m1(result.u64, vr, vl); + return result; +#else + __m128i result; + result.u64[0] = a.u64[0] + b.u64[0]; + result.u64[1] = a.u64[1] + b.u64[1]; + return result; +#endif +} + +static inline __m128i _mm_add_epi32(__m128i a, __m128i b) +{ +#if USE_RVV_INTRINSICS + __m128i result; + size_t vl = __riscv_vsetvl_e32m1(4); + vuint32m1_t va = __riscv_vle32_v_u32m1(a.u32, vl); + vuint32m1_t vb = __riscv_vle32_v_u32m1(b.u32, vl); + vuint32m1_t vr = __riscv_vadd_vv_u32m1(va, vb, vl); + __riscv_vse32_v_u32m1(result.u32, vr, vl); + return result; +#else + __m128i result; + for (int i = 0; i < 4; i++) { + result.i32[i] = a.i32[i] + b.i32[i]; + } + return result; +#endif +} + +static inline __m128i _mm_sub_epi64(__m128i a, __m128i b) +{ +#if USE_RVV_INTRINSICS + __m128i result; + size_t vl = __riscv_vsetvl_e64m1(2); + vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl); + vuint64m1_t vb = __riscv_vle64_v_u64m1(b.u64, vl); + vuint64m1_t vr = __riscv_vsub_vv_u64m1(va, vb, vl); + __riscv_vse64_v_u64m1(result.u64, vr, vl); + return result; +#else + __m128i result; + result.u64[0] = a.u64[0] - b.u64[0]; + result.u64[1] = a.u64[1] - b.u64[1]; + return result; +#endif +} + +static inline __m128i _mm_mul_epu32(__m128i a, __m128i b) +{ +#if USE_RVV_INTRINSICS + __m128i result; + size_t vl = __riscv_vsetvl_e64m1(2); + vuint64m1_t va_lo = __riscv_vzext_vf2_u64m1(__riscv_vle32_v_u32mf2(&a.u32[0], 
2), vl); + vuint64m1_t vb_lo = __riscv_vzext_vf2_u64m1(__riscv_vle32_v_u32mf2(&b.u32[0], 2), vl); + vuint64m1_t vr = __riscv_vmul_vv_u64m1(va_lo, vb_lo, vl); + __riscv_vse64_v_u64m1(result.u64, vr, vl); + return result; +#else + __m128i result; + result.u64[0] = (uint64_t)a.u32[0] * (uint64_t)b.u32[0]; + result.u64[1] = (uint64_t)a.u32[2] * (uint64_t)b.u32[2]; + return result; +#endif +} + +/* Unpack operations */ +static inline __m128i _mm_unpacklo_epi64(__m128i a, __m128i b) +{ + __m128i result; + result.u64[0] = a.u64[0]; + result.u64[1] = b.u64[0]; + return result; +} + +static inline __m128i _mm_unpackhi_epi64(__m128i a, __m128i b) +{ + __m128i result; + result.u64[0] = a.u64[1]; + result.u64[1] = b.u64[1]; + return result; +} + +/* Pause instruction for spin-wait loops */ +static inline void _mm_pause(void) +{ + /* RISC-V pause hint if available (requires Zihintpause extension) */ +#if defined(__riscv_zihintpause) + __asm__ __volatile__("pause"); +#else + __asm__ __volatile__("nop"); +#endif +} + +/* Memory fence - optimized for RISC-V */ +static inline void _mm_mfence(void) +{ + __asm__ __volatile__("fence rw,rw" ::: "memory"); +} + +static inline void _mm_lfence(void) +{ + __asm__ __volatile__("fence r,r" ::: "memory"); +} + +static inline void _mm_sfence(void) +{ + __asm__ __volatile__("fence w,w" ::: "memory"); +} + +/* Comparison operations */ +static inline __m128i _mm_cmpeq_epi32(__m128i a, __m128i b) +{ + __m128i result; + for (int i = 0; i < 4; i++) { + result.u32[i] = (a.u32[i] == b.u32[i]) ? 0xFFFFFFFF : 0; + } + return result; +} + +static inline __m128i _mm_cmpeq_epi64(__m128i a, __m128i b) +{ + __m128i result; + for (int i = 0; i < 2; i++) { + result.u64[i] = (a.u64[i] == b.u64[i]) ? 0xFFFFFFFFFFFFFFFFULL : 0; + } + return result; +} + +/* Additional shift operations */ +static inline __m128i _mm_slli_epi32(__m128i a, int imm8) +{ +#if USE_RVV_INTRINSICS + __m128i result; + if (imm8 > 31) { + memset(&result, 0, sizeof(result)); + } else { + size_t vl = __riscv_vsetvl_e32m1(4); + vuint32m1_t va = __riscv_vle32_v_u32m1(a.u32, vl); + vuint32m1_t vr = __riscv_vsll_vx_u32m1(va, imm8, vl); + __riscv_vse32_v_u32m1(result.u32, vr, vl); + } + return result; +#else + __m128i result; + if (imm8 > 31) { + for (int i = 0; i < 4; i++) result.u32[i] = 0; + } else { + for (int i = 0; i < 4; i++) { + result.u32[i] = a.u32[i] << imm8; + } + } + return result; +#endif +} + +static inline __m128i _mm_srli_epi32(__m128i a, int imm8) +{ +#if USE_RVV_INTRINSICS + __m128i result; + if (imm8 > 31) { + memset(&result, 0, sizeof(result)); + } else { + size_t vl = __riscv_vsetvl_e32m1(4); + vuint32m1_t va = __riscv_vle32_v_u32m1(a.u32, vl); + vuint32m1_t vr = __riscv_vsrl_vx_u32m1(va, imm8, vl); + __riscv_vse32_v_u32m1(result.u32, vr, vl); + } + return result; +#else + __m128i result; + if (imm8 > 31) { + for (int i = 0; i < 4; i++) result.u32[i] = 0; + } else { + for (int i = 0; i < 4; i++) { + result.u32[i] = a.u32[i] >> imm8; + } + } + return result; +#endif +} + +/* 64-bit integer operations */ +static inline __m128i _mm_set1_epi64x(int64_t a) +{ + __m128i result; + result.i64[0] = a; + result.i64[1] = a; + return result; +} + +/* Float type for compatibility */ +typedef __m128i __m128; + +/* Float operations - simplified scalar implementations */ +static inline __m128 _mm_set1_ps(float a) +{ + __m128 result; + uint32_t val; + memcpy(&val, &a, sizeof(float)); + for (int i = 0; i < 4; i++) { + result.u32[i] = val; + } + return result; +} + +static inline __m128 _mm_setzero_ps(void) +{ + __m128 
result; + memset(&result, 0, sizeof(result)); + return result; +} + +static inline __m128 _mm_add_ps(__m128 a, __m128 b) +{ + __m128 result; + float fa[4], fb[4], fr[4]; + memcpy(fa, &a, sizeof(__m128)); + memcpy(fb, &b, sizeof(__m128)); + for (int i = 0; i < 4; i++) { + fr[i] = fa[i] + fb[i]; + } + memcpy(&result, fr, sizeof(__m128)); + return result; +} + +static inline __m128 _mm_mul_ps(__m128 a, __m128 b) +{ + __m128 result; + float fa[4], fb[4], fr[4]; + memcpy(fa, &a, sizeof(__m128)); + memcpy(fb, &b, sizeof(__m128)); + for (int i = 0; i < 4; i++) { + fr[i] = fa[i] * fb[i]; + } + memcpy(&result, fr, sizeof(__m128)); + return result; +} + +static inline __m128 _mm_and_ps(__m128 a, __m128 b) +{ + __m128 result; + result.u64[0] = a.u64[0] & b.u64[0]; + result.u64[1] = a.u64[1] & b.u64[1]; + return result; +} + +static inline __m128 _mm_or_ps(__m128 a, __m128 b) +{ + __m128 result; + result.u64[0] = a.u64[0] | b.u64[0]; + result.u64[1] = a.u64[1] | b.u64[1]; + return result; +} + +static inline __m128 _mm_cvtepi32_ps(__m128i a) +{ + __m128 result; + float fr[4]; + for (int i = 0; i < 4; i++) { + fr[i] = (float)a.i32[i]; + } + memcpy(&result, fr, sizeof(__m128)); + return result; +} + +static inline __m128i _mm_cvttps_epi32(__m128 a) +{ + __m128i result; + float fa[4]; + memcpy(fa, &a, sizeof(__m128)); + for (int i = 0; i < 4; i++) { + result.i32[i] = (int32_t)fa[i]; + } + return result; +} + +/* Casting operations */ +static inline __m128 _mm_castsi128_ps(__m128i a) +{ + __m128 result; + memcpy(&result, &a, sizeof(__m128)); + return result; +} + +static inline __m128i _mm_castps_si128(__m128 a) +{ + __m128i result; + memcpy(&result, &a, sizeof(__m128)); + return result; +} + +/* Additional set operations */ +static inline __m128i _mm_set1_epi32(int a) +{ + __m128i result; + for (int i = 0; i < 4; i++) { + result.i32[i] = a; + } + return result; +} + +/* AES instructions - placeholders for soft_aes compatibility */ +static inline __m128i _mm_aesenc_si128(__m128i a, __m128i roundkey) +{ + return _mm_xor_si128(a, roundkey); +} + +static inline __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) +{ + return a; +} + +/* Rotate right operation for soft_aes.h */ +static inline uint32_t _rotr(uint32_t value, unsigned int count) +{ + const unsigned int mask = 31; + count &= mask; + return (value >> count) | (value << ((-count) & mask)); +} + +/* ARM NEON compatibility types and intrinsics for RISC-V */ +typedef __m128i_union uint64x2_t; +typedef __m128i_union uint8x16_t; +typedef __m128i_union int64x2_t; +typedef __m128i_union int32x4_t; + +static inline uint64x2_t vld1q_u64(const uint64_t *ptr) +{ + uint64x2_t result; + result.u64[0] = ptr[0]; + result.u64[1] = ptr[1]; + return result; +} + +static inline int64x2_t vld1q_s64(const int64_t *ptr) +{ + int64x2_t result; + result.i64[0] = ptr[0]; + result.i64[1] = ptr[1]; + return result; +} + +static inline void vst1q_u64(uint64_t *ptr, uint64x2_t val) +{ + ptr[0] = val.u64[0]; + ptr[1] = val.u64[1]; +} + +static inline uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b) +{ + return _mm_xor_si128(a, b); +} + +static inline uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b) +{ + return _mm_add_epi64(a, b); +} + +static inline uint64x2_t vreinterpretq_u64_u8(uint8x16_t a) +{ + uint64x2_t result; + memcpy(&result, &a, sizeof(uint64x2_t)); + return result; +} + +static inline uint64_t vgetq_lane_u64(uint64x2_t v, int lane) +{ + return v.u64[lane]; +} + +static inline int64_t vgetq_lane_s64(int64x2_t v, int lane) +{ + return v.i64[lane]; +} + 
+static inline int32_t vgetq_lane_s32(int32x4_t v, int lane) +{ + return v.i32[lane]; +} + +typedef struct { uint64_t val[1]; } uint64x1_t; + +static inline uint64x1_t vcreate_u64(uint64_t a) +{ + uint64x1_t result; + result.val[0] = a; + return result; +} + +static inline uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high) +{ + uint64x2_t result; + result.u64[0] = low.val[0]; + result.u64[1] = high.val[0]; + return result; +} + +#ifdef __cplusplus +} +#endif + +#endif /* XMRIG_SSE2RVV_OPTIMIZED_H */ diff --git a/src/crypto/cn/sse2rvv_optimized.h b/src/crypto/cn/sse2rvv_optimized.h new file mode 100644 index 000000000..f83f1101c --- /dev/null +++ b/src/crypto/cn/sse2rvv_optimized.h @@ -0,0 +1,748 @@ +/* XMRig + * Copyright (c) 2025 XMRig , + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +/* + * SSE to RISC-V Vector (RVV) optimized compatibility header + * Provides both scalar fallback and vectorized implementations using RVV intrinsics + */ + +#ifndef XMRIG_SSE2RVV_OPTIMIZED_H +#define XMRIG_SSE2RVV_OPTIMIZED_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +/* Check if RVV is available */ +#if defined(__riscv_vector) +#include +#define USE_RVV_INTRINSICS 1 +#else +#define USE_RVV_INTRINSICS 0 +#endif + +/* 128-bit vector type */ +typedef union { + uint8_t u8[16]; + uint16_t u16[8]; + uint32_t u32[4]; + uint64_t u64[2]; + int8_t i8[16]; + int16_t i16[8]; + int32_t i32[4]; + int64_t i64[2]; +#if USE_RVV_INTRINSICS + vuint64m1_t rvv_u64; + vuint32m1_t rvv_u32; + vuint8m1_t rvv_u8; +#endif +} __m128i_union; + +typedef __m128i_union __m128i; + +/* Set operations */ +static inline __m128i _mm_set_epi32(int e3, int e2, int e1, int e0) +{ + __m128i result; + result.i32[0] = e0; + result.i32[1] = e1; + result.i32[2] = e2; + result.i32[3] = e3; + return result; +} + +static inline __m128i _mm_set_epi64x(int64_t e1, int64_t e0) +{ + __m128i result; + result.i64[0] = e0; + result.i64[1] = e1; + return result; +} + +static inline __m128i _mm_setzero_si128(void) +{ + __m128i result; + memset(&result, 0, sizeof(result)); + return result; +} + +/* Extract/insert operations */ +static inline int _mm_cvtsi128_si32(__m128i a) +{ + return a.i32[0]; +} + +static inline int64_t _mm_cvtsi128_si64(__m128i a) +{ + return a.i64[0]; +} + +static inline __m128i _mm_cvtsi32_si128(int a) +{ + __m128i result = _mm_setzero_si128(); + result.i32[0] = a; + return result; +} + +static inline __m128i _mm_cvtsi64_si128(int64_t a) +{ + __m128i result = _mm_setzero_si128(); + result.i64[0] = a; + return result; +} + +/* Shuffle operations */ +static inline __m128i _mm_shuffle_epi32(__m128i a, int imm8) +{ + __m128i result; + result.u32[0] = a.u32[(imm8 >> 0) & 0x3]; + result.u32[1] = a.u32[(imm8 >> 2) & 0x3]; + result.u32[2] = a.u32[(imm8 >> 4) & 0x3]; + result.u32[3] = a.u32[(imm8 >> 6) & 0x3]; + return result; +} + +/* Logical operations - optimized with RVV when available */ +static inline __m128i _mm_xor_si128(__m128i 
a, __m128i b) +{ +#if USE_RVV_INTRINSICS + __m128i result; + size_t vl = __riscv_vsetvl_e64m1(2); + vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl); + vuint64m1_t vb = __riscv_vle64_v_u64m1(b.u64, vl); + vuint64m1_t vr = __riscv_vxor_vv_u64m1(va, vb, vl); + __riscv_vse64_v_u64m1(result.u64, vr, vl); + return result; +#else + __m128i result; + result.u64[0] = a.u64[0] ^ b.u64[0]; + result.u64[1] = a.u64[1] ^ b.u64[1]; + return result; +#endif +} + +static inline __m128i _mm_or_si128(__m128i a, __m128i b) +{ +#if USE_RVV_INTRINSICS + __m128i result; + size_t vl = __riscv_vsetvl_e64m1(2); + vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl); + vuint64m1_t vb = __riscv_vle64_v_u64m1(b.u64, vl); + vuint64m1_t vr = __riscv_vor_vv_u64m1(va, vb, vl); + __riscv_vse64_v_u64m1(result.u64, vr, vl); + return result; +#else + __m128i result; + result.u64[0] = a.u64[0] | b.u64[0]; + result.u64[1] = a.u64[1] | b.u64[1]; + return result; +#endif +} + +static inline __m128i _mm_and_si128(__m128i a, __m128i b) +{ +#if USE_RVV_INTRINSICS + __m128i result; + size_t vl = __riscv_vsetvl_e64m1(2); + vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl); + vuint64m1_t vb = __riscv_vle64_v_u64m1(b.u64, vl); + vuint64m1_t vr = __riscv_vand_vv_u64m1(va, vb, vl); + __riscv_vse64_v_u64m1(result.u64, vr, vl); + return result; +#else + __m128i result; + result.u64[0] = a.u64[0] & b.u64[0]; + result.u64[1] = a.u64[1] & b.u64[1]; + return result; +#endif +} + +static inline __m128i _mm_andnot_si128(__m128i a, __m128i b) +{ +#if USE_RVV_INTRINSICS + __m128i result; + size_t vl = __riscv_vsetvl_e64m1(2); + vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl); + vuint64m1_t vb = __riscv_vle64_v_u64m1(b.u64, vl); + vuint64m1_t vnot_a = __riscv_vnot_v_u64m1(va, vl); + vuint64m1_t vr = __riscv_vand_vv_u64m1(vnot_a, vb, vl); + __riscv_vse64_v_u64m1(result.u64, vr, vl); + return result; +#else + __m128i result; + result.u64[0] = (~a.u64[0]) & b.u64[0]; + result.u64[1] = (~a.u64[1]) & b.u64[1]; + return result; +#endif +} + +/* Shift operations */ +static inline __m128i _mm_slli_si128(__m128i a, int imm8) +{ +#if USE_RVV_INTRINSICS + __m128i result = _mm_setzero_si128(); + int count = imm8 & 0xFF; + if (count > 15) return result; + + size_t vl = __riscv_vsetvl_e8m1(16); + vuint8m1_t va = __riscv_vle8_v_u8m1(a.u8, vl); + vuint8m1_t vr = __riscv_vslideup_vx_u8m1(__riscv_vmv_v_x_u8m1(0, vl), va, count, vl); + __riscv_vse8_v_u8m1(result.u8, vr, vl); + return result; +#else + __m128i result = _mm_setzero_si128(); + int count = imm8 & 0xFF; + if (count > 15) return result; + + for (int i = 0; i < 16 - count; i++) { + result.u8[i + count] = a.u8[i]; + } + return result; +#endif +} + +static inline __m128i _mm_srli_si128(__m128i a, int imm8) +{ +#if USE_RVV_INTRINSICS + __m128i result = _mm_setzero_si128(); + int count = imm8 & 0xFF; + if (count > 15) return result; + + size_t vl = __riscv_vsetvl_e8m1(16); + vuint8m1_t va = __riscv_vle8_v_u8m1(a.u8, vl); + vuint8m1_t vr = __riscv_vslidedown_vx_u8m1(va, count, vl); + __riscv_vse8_v_u8m1(result.u8, vr, vl); + return result; +#else + __m128i result = _mm_setzero_si128(); + int count = imm8 & 0xFF; + if (count > 15) return result; + + for (int i = count; i < 16; i++) { + result.u8[i - count] = a.u8[i]; + } + return result; +#endif +} + +static inline __m128i _mm_slli_epi64(__m128i a, int imm8) +{ +#if USE_RVV_INTRINSICS + __m128i result; + if (imm8 > 63) { + result.u64[0] = 0; + result.u64[1] = 0; + } else { + size_t vl = __riscv_vsetvl_e64m1(2); + vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl); 
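/* Typical shape of the RVV-backed wrappers: vl is set to cover the whole 128-bit value, the lanes are loaded from the union, a single vector op is applied, and the result is stored back through the union fields. */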
+ vuint64m1_t vr = __riscv_vsll_vx_u64m1(va, imm8, vl); + __riscv_vse64_v_u64m1(result.u64, vr, vl); + } + return result; +#else + __m128i result; + if (imm8 > 63) { + result.u64[0] = 0; + result.u64[1] = 0; + } else { + result.u64[0] = a.u64[0] << imm8; + result.u64[1] = a.u64[1] << imm8; + } + return result; +#endif +} + +static inline __m128i _mm_srli_epi64(__m128i a, int imm8) +{ +#if USE_RVV_INTRINSICS + __m128i result; + if (imm8 > 63) { + result.u64[0] = 0; + result.u64[1] = 0; + } else { + size_t vl = __riscv_vsetvl_e64m1(2); + vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl); + vuint64m1_t vr = __riscv_vsrl_vx_u64m1(va, imm8, vl); + __riscv_vse64_v_u64m1(result.u64, vr, vl); + } + return result; +#else + __m128i result; + if (imm8 > 63) { + result.u64[0] = 0; + result.u64[1] = 0; + } else { + result.u64[0] = a.u64[0] >> imm8; + result.u64[1] = a.u64[1] >> imm8; + } + return result; +#endif +} + +/* Load/store operations - optimized with RVV */ +static inline __m128i _mm_load_si128(const __m128i* p) +{ +#if USE_RVV_INTRINSICS + __m128i result; + size_t vl = __riscv_vsetvl_e64m1(2); + vuint64m1_t v = __riscv_vle64_v_u64m1((const uint64_t*)p, vl); + __riscv_vse64_v_u64m1(result.u64, v, vl); + return result; +#else + __m128i result; + memcpy(&result, p, sizeof(__m128i)); + return result; +#endif +} + +static inline __m128i _mm_loadu_si128(const __m128i* p) +{ + __m128i result; + memcpy(&result, p, sizeof(__m128i)); + return result; +} + +static inline void _mm_store_si128(__m128i* p, __m128i a) +{ +#if USE_RVV_INTRINSICS + size_t vl = __riscv_vsetvl_e64m1(2); + vuint64m1_t v = __riscv_vle64_v_u64m1(a.u64, vl); + __riscv_vse64_v_u64m1((uint64_t*)p, v, vl); +#else + memcpy(p, &a, sizeof(__m128i)); +#endif +} + +static inline void _mm_storeu_si128(__m128i* p, __m128i a) +{ + memcpy(p, &a, sizeof(__m128i)); +} + +/* Arithmetic operations - optimized with RVV */ +static inline __m128i _mm_add_epi64(__m128i a, __m128i b) +{ +#if USE_RVV_INTRINSICS + __m128i result; + size_t vl = __riscv_vsetvl_e64m1(2); + vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl); + vuint64m1_t vb = __riscv_vle64_v_u64m1(b.u64, vl); + vuint64m1_t vr = __riscv_vadd_vv_u64m1(va, vb, vl); + __riscv_vse64_v_u64m1(result.u64, vr, vl); + return result; +#else + __m128i result; + result.u64[0] = a.u64[0] + b.u64[0]; + result.u64[1] = a.u64[1] + b.u64[1]; + return result; +#endif +} + +static inline __m128i _mm_add_epi32(__m128i a, __m128i b) +{ +#if USE_RVV_INTRINSICS + __m128i result; + size_t vl = __riscv_vsetvl_e32m1(4); + vuint32m1_t va = __riscv_vle32_v_u32m1(a.u32, vl); + vuint32m1_t vb = __riscv_vle32_v_u32m1(b.u32, vl); + vuint32m1_t vr = __riscv_vadd_vv_u32m1(va, vb, vl); + __riscv_vse32_v_u32m1(result.u32, vr, vl); + return result; +#else + __m128i result; + for (int i = 0; i < 4; i++) { + result.i32[i] = a.i32[i] + b.i32[i]; + } + return result; +#endif +} + +static inline __m128i _mm_sub_epi64(__m128i a, __m128i b) +{ +#if USE_RVV_INTRINSICS + __m128i result; + size_t vl = __riscv_vsetvl_e64m1(2); + vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl); + vuint64m1_t vb = __riscv_vle64_v_u64m1(b.u64, vl); + vuint64m1_t vr = __riscv_vsub_vv_u64m1(va, vb, vl); + __riscv_vse64_v_u64m1(result.u64, vr, vl); + return result; +#else + __m128i result; + result.u64[0] = a.u64[0] - b.u64[0]; + result.u64[1] = a.u64[1] - b.u64[1]; + return result; +#endif +} + +static inline __m128i _mm_mul_epu32(__m128i a, __m128i b) +{ +#if USE_RVV_INTRINSICS + __m128i result; + size_t vl = __riscv_vsetvl_e64m1(2); + vuint64m1_t va_lo = 
__riscv_vand_vx_u64m1(__riscv_vle64_v_u64m1(a.u64, vl), 0xFFFFFFFFULL, vl); + vuint64m1_t vb_lo = __riscv_vand_vx_u64m1(__riscv_vle64_v_u64m1(b.u64, vl), 0xFFFFFFFFULL, vl); + /* masking keeps only the low 32 bits of each 64-bit lane (u32[0] and u32[2]), matching PMULUDQ and the scalar fallback below */ + vuint64m1_t vr = __riscv_vmul_vv_u64m1(va_lo, vb_lo, vl); + __riscv_vse64_v_u64m1(result.u64, vr, vl); + return result; +#else + __m128i result; + result.u64[0] = (uint64_t)a.u32[0] * (uint64_t)b.u32[0]; + result.u64[1] = (uint64_t)a.u32[2] * (uint64_t)b.u32[2]; + return result; +#endif +} + +/* Unpack operations */ +static inline __m128i _mm_unpacklo_epi64(__m128i a, __m128i b) +{ + __m128i result; + result.u64[0] = a.u64[0]; + result.u64[1] = b.u64[0]; + return result; +} + +static inline __m128i _mm_unpackhi_epi64(__m128i a, __m128i b) +{ + __m128i result; + result.u64[0] = a.u64[1]; + result.u64[1] = b.u64[1]; + return result; +} + +/* Pause instruction for spin-wait loops */ +static inline void _mm_pause(void) +{ + /* RISC-V pause hint if available (requires Zihintpause extension) */ +#if defined(__riscv_zihintpause) + __asm__ __volatile__("pause"); +#else + __asm__ __volatile__("nop"); +#endif +} + +/* Memory fence - optimized for RISC-V */ +static inline void _mm_mfence(void) +{ + __asm__ __volatile__("fence rw,rw" ::: "memory"); +} + +static inline void _mm_lfence(void) +{ + __asm__ __volatile__("fence r,r" ::: "memory"); +} + +static inline void _mm_sfence(void) +{ + __asm__ __volatile__("fence w,w" ::: "memory"); +} + +/* Comparison operations */ +static inline __m128i _mm_cmpeq_epi32(__m128i a, __m128i b) +{ + __m128i result; + for (int i = 0; i < 4; i++) { + result.u32[i] = (a.u32[i] == b.u32[i]) ? 0xFFFFFFFF : 0; + } + return result; +} + +static inline __m128i _mm_cmpeq_epi64(__m128i a, __m128i b) +{ + __m128i result; + for (int i = 0; i < 2; i++) { + result.u64[i] = (a.u64[i] == b.u64[i]) ? 
0xFFFFFFFFFFFFFFFFULL : 0; + } + return result; +} + +/* Additional shift operations */ +static inline __m128i _mm_slli_epi32(__m128i a, int imm8) +{ +#if USE_RVV_INTRINSICS + __m128i result; + if (imm8 > 31) { + memset(&result, 0, sizeof(result)); + } else { + size_t vl = __riscv_vsetvl_e32m1(4); + vuint32m1_t va = __riscv_vle32_v_u32m1(a.u32, vl); + vuint32m1_t vr = __riscv_vsll_vx_u32m1(va, imm8, vl); + __riscv_vse32_v_u32m1(result.u32, vr, vl); + } + return result; +#else + __m128i result; + if (imm8 > 31) { + for (int i = 0; i < 4; i++) result.u32[i] = 0; + } else { + for (int i = 0; i < 4; i++) { + result.u32[i] = a.u32[i] << imm8; + } + } + return result; +#endif +} + +static inline __m128i _mm_srli_epi32(__m128i a, int imm8) +{ +#if USE_RVV_INTRINSICS + __m128i result; + if (imm8 > 31) { + memset(&result, 0, sizeof(result)); + } else { + size_t vl = __riscv_vsetvl_e32m1(4); + vuint32m1_t va = __riscv_vle32_v_u32m1(a.u32, vl); + vuint32m1_t vr = __riscv_vsrl_vx_u32m1(va, imm8, vl); + __riscv_vse32_v_u32m1(result.u32, vr, vl); + } + return result; +#else + __m128i result; + if (imm8 > 31) { + for (int i = 0; i < 4; i++) result.u32[i] = 0; + } else { + for (int i = 0; i < 4; i++) { + result.u32[i] = a.u32[i] >> imm8; + } + } + return result; +#endif +} + +/* 64-bit integer operations */ +static inline __m128i _mm_set1_epi64x(int64_t a) +{ + __m128i result; + result.i64[0] = a; + result.i64[1] = a; + return result; +} + +/* Float type for compatibility */ +typedef __m128i __m128; + +/* Float operations - simplified scalar implementations */ +static inline __m128 _mm_set1_ps(float a) +{ + __m128 result; + uint32_t val; + memcpy(&val, &a, sizeof(float)); + for (int i = 0; i < 4; i++) { + result.u32[i] = val; + } + return result; +} + +static inline __m128 _mm_setzero_ps(void) +{ + __m128 result; + memset(&result, 0, sizeof(result)); + return result; +} + +static inline __m128 _mm_add_ps(__m128 a, __m128 b) +{ + __m128 result; + float fa[4], fb[4], fr[4]; + memcpy(fa, &a, sizeof(__m128)); + memcpy(fb, &b, sizeof(__m128)); + for (int i = 0; i < 4; i++) { + fr[i] = fa[i] + fb[i]; + } + memcpy(&result, fr, sizeof(__m128)); + return result; +} + +static inline __m128 _mm_mul_ps(__m128 a, __m128 b) +{ + __m128 result; + float fa[4], fb[4], fr[4]; + memcpy(fa, &a, sizeof(__m128)); + memcpy(fb, &b, sizeof(__m128)); + for (int i = 0; i < 4; i++) { + fr[i] = fa[i] * fb[i]; + } + memcpy(&result, fr, sizeof(__m128)); + return result; +} + +static inline __m128 _mm_and_ps(__m128 a, __m128 b) +{ + __m128 result; + result.u64[0] = a.u64[0] & b.u64[0]; + result.u64[1] = a.u64[1] & b.u64[1]; + return result; +} + +static inline __m128 _mm_or_ps(__m128 a, __m128 b) +{ + __m128 result; + result.u64[0] = a.u64[0] | b.u64[0]; + result.u64[1] = a.u64[1] | b.u64[1]; + return result; +} + +static inline __m128 _mm_cvtepi32_ps(__m128i a) +{ + __m128 result; + float fr[4]; + for (int i = 0; i < 4; i++) { + fr[i] = (float)a.i32[i]; + } + memcpy(&result, fr, sizeof(__m128)); + return result; +} + +static inline __m128i _mm_cvttps_epi32(__m128 a) +{ + __m128i result; + float fa[4]; + memcpy(fa, &a, sizeof(__m128)); + for (int i = 0; i < 4; i++) { + result.i32[i] = (int32_t)fa[i]; + } + return result; +} + +/* Casting operations */ +static inline __m128 _mm_castsi128_ps(__m128i a) +{ + __m128 result; + memcpy(&result, &a, sizeof(__m128)); + return result; +} + +static inline __m128i _mm_castps_si128(__m128 a) +{ + __m128i result; + memcpy(&result, &a, sizeof(__m128)); + return result; +} + +/* Additional set 
operations */ +static inline __m128i _mm_set1_epi32(int a) +{ + __m128i result; + for (int i = 0; i < 4; i++) { + result.i32[i] = a; + } + return result; +} + +/* AES instructions - placeholders for soft_aes compatibility */ +static inline __m128i _mm_aesenc_si128(__m128i a, __m128i roundkey) +{ + return _mm_xor_si128(a, roundkey); +} + +static inline __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) +{ + return a; +} + +/* Rotate right operation for soft_aes.h */ +static inline uint32_t _rotr(uint32_t value, unsigned int count) +{ + const unsigned int mask = 31; + count &= mask; + return (value >> count) | (value << ((-count) & mask)); +} + +/* ARM NEON compatibility types and intrinsics for RISC-V */ +typedef __m128i_union uint64x2_t; +typedef __m128i_union uint8x16_t; +typedef __m128i_union int64x2_t; +typedef __m128i_union int32x4_t; + +static inline uint64x2_t vld1q_u64(const uint64_t *ptr) +{ + uint64x2_t result; + result.u64[0] = ptr[0]; + result.u64[1] = ptr[1]; + return result; +} + +static inline int64x2_t vld1q_s64(const int64_t *ptr) +{ + int64x2_t result; + result.i64[0] = ptr[0]; + result.i64[1] = ptr[1]; + return result; +} + +static inline void vst1q_u64(uint64_t *ptr, uint64x2_t val) +{ + ptr[0] = val.u64[0]; + ptr[1] = val.u64[1]; +} + +static inline uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b) +{ + return _mm_xor_si128(a, b); +} + +static inline uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b) +{ + return _mm_add_epi64(a, b); +} + +static inline uint64x2_t vreinterpretq_u64_u8(uint8x16_t a) +{ + uint64x2_t result; + memcpy(&result, &a, sizeof(uint64x2_t)); + return result; +} + +static inline uint64_t vgetq_lane_u64(uint64x2_t v, int lane) +{ + return v.u64[lane]; +} + +static inline int64_t vgetq_lane_s64(int64x2_t v, int lane) +{ + return v.i64[lane]; +} + +static inline int32_t vgetq_lane_s32(int32x4_t v, int lane) +{ + return v.i32[lane]; +} + +typedef struct { uint64_t val[1]; } uint64x1_t; + +static inline uint64x1_t vcreate_u64(uint64_t a) +{ + uint64x1_t result; + result.val[0] = a; + return result; +} + +static inline uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high) +{ + uint64x2_t result; + result.u64[0] = low.val[0]; + result.u64[1] = high.val[0]; + return result; +} + +#ifdef __cplusplus +} +#endif + +#endif /* XMRIG_SSE2RVV_OPTIMIZED_H */ diff --git a/src/crypto/cn/sse2rvv_scalar_backup.h b/src/crypto/cn/sse2rvv_scalar_backup.h new file mode 100644 index 000000000..853adbb88 --- /dev/null +++ b/src/crypto/cn/sse2rvv_scalar_backup.h @@ -0,0 +1,571 @@ +/* XMRig + * Copyright (c) 2025 XMRig , + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ + +/* + * SSE to RISC-V compatibility header + * Provides scalar implementations of SSE intrinsics for RISC-V architecture + */ + +#ifndef XMRIG_SSE2RVV_H +#define XMRIG_SSE2RVV_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +/* 128-bit vector type */ +typedef union { + uint8_t u8[16]; + uint16_t u16[8]; + uint32_t u32[4]; + uint64_t u64[2]; + int8_t i8[16]; + int16_t i16[8]; + int32_t i32[4]; + int64_t i64[2]; +} __m128i_union; + +typedef __m128i_union __m128i; + +/* Set operations */ +static inline __m128i _mm_set_epi32(int e3, int e2, int e1, int e0) +{ + __m128i result; + result.i32[0] = e0; + result.i32[1] = e1; + result.i32[2] = e2; + result.i32[3] = e3; + return result; +} + +static inline __m128i _mm_set_epi64x(int64_t e1, int64_t e0) +{ + __m128i result; + result.i64[0] = e0; + result.i64[1] = e1; + return result; +} + +static inline __m128i _mm_setzero_si128(void) +{ + __m128i result; + memset(&result, 0, sizeof(result)); + return result; +} + +/* Extract/insert operations */ +static inline int _mm_cvtsi128_si32(__m128i a) +{ + return a.i32[0]; +} + +static inline int64_t _mm_cvtsi128_si64(__m128i a) +{ + return a.i64[0]; +} + +static inline __m128i _mm_cvtsi32_si128(int a) +{ + __m128i result = _mm_setzero_si128(); + result.i32[0] = a; + return result; +} + +static inline __m128i _mm_cvtsi64_si128(int64_t a) +{ + __m128i result = _mm_setzero_si128(); + result.i64[0] = a; + return result; +} + +/* Shuffle operations */ +static inline __m128i _mm_shuffle_epi32(__m128i a, int imm8) +{ + __m128i result; + result.u32[0] = a.u32[(imm8 >> 0) & 0x3]; + result.u32[1] = a.u32[(imm8 >> 2) & 0x3]; + result.u32[2] = a.u32[(imm8 >> 4) & 0x3]; + result.u32[3] = a.u32[(imm8 >> 6) & 0x3]; + return result; +} + +/* Logical operations */ +static inline __m128i _mm_xor_si128(__m128i a, __m128i b) +{ + __m128i result; + result.u64[0] = a.u64[0] ^ b.u64[0]; + result.u64[1] = a.u64[1] ^ b.u64[1]; + return result; +} + +static inline __m128i _mm_or_si128(__m128i a, __m128i b) +{ + __m128i result; + result.u64[0] = a.u64[0] | b.u64[0]; + result.u64[1] = a.u64[1] | b.u64[1]; + return result; +} + +static inline __m128i _mm_and_si128(__m128i a, __m128i b) +{ + __m128i result; + result.u64[0] = a.u64[0] & b.u64[0]; + result.u64[1] = a.u64[1] & b.u64[1]; + return result; +} + +static inline __m128i _mm_andnot_si128(__m128i a, __m128i b) +{ + __m128i result; + result.u64[0] = (~a.u64[0]) & b.u64[0]; + result.u64[1] = (~a.u64[1]) & b.u64[1]; + return result; +} + +/* Shift operations */ +static inline __m128i _mm_slli_si128(__m128i a, int imm8) +{ + __m128i result = _mm_setzero_si128(); + int count = imm8 & 0xFF; + if (count > 15) return result; + + for (int i = 0; i < 16 - count; i++) { + result.u8[i + count] = a.u8[i]; + } + return result; +} + +static inline __m128i _mm_srli_si128(__m128i a, int imm8) +{ + __m128i result = _mm_setzero_si128(); + int count = imm8 & 0xFF; + if (count > 15) return result; + + for (int i = count; i < 16; i++) { + result.u8[i - count] = a.u8[i]; + } + return result; +} + +static inline __m128i _mm_slli_epi64(__m128i a, int imm8) +{ + __m128i result; + if (imm8 > 63) { + result.u64[0] = 0; + result.u64[1] = 0; + } else { + result.u64[0] = a.u64[0] << imm8; + result.u64[1] = a.u64[1] << imm8; + } + return result; +} + +static inline __m128i _mm_srli_epi64(__m128i a, int imm8) +{ + __m128i result; + if (imm8 > 63) { + result.u64[0] = 0; + result.u64[1] = 0; + } else { + result.u64[0] = a.u64[0] >> imm8; + result.u64[1] = a.u64[1] >> imm8; + } + return 
result; +} + +/* Load/store operations */ +static inline __m128i _mm_load_si128(const __m128i* p) +{ + __m128i result; + memcpy(&result, p, sizeof(__m128i)); + return result; +} + +static inline __m128i _mm_loadu_si128(const __m128i* p) +{ + __m128i result; + memcpy(&result, p, sizeof(__m128i)); + return result; +} + +static inline void _mm_store_si128(__m128i* p, __m128i a) +{ + memcpy(p, &a, sizeof(__m128i)); +} + +static inline void _mm_storeu_si128(__m128i* p, __m128i a) +{ + memcpy(p, &a, sizeof(__m128i)); +} + +/* Arithmetic operations */ +static inline __m128i _mm_add_epi64(__m128i a, __m128i b) +{ + __m128i result; + result.u64[0] = a.u64[0] + b.u64[0]; + result.u64[1] = a.u64[1] + b.u64[1]; + return result; +} + +static inline __m128i _mm_add_epi32(__m128i a, __m128i b) +{ + __m128i result; + for (int i = 0; i < 4; i++) { + result.i32[i] = a.i32[i] + b.i32[i]; + } + return result; +} + +static inline __m128i _mm_sub_epi64(__m128i a, __m128i b) +{ + __m128i result; + result.u64[0] = a.u64[0] - b.u64[0]; + result.u64[1] = a.u64[1] - b.u64[1]; + return result; +} + +static inline __m128i _mm_mul_epu32(__m128i a, __m128i b) +{ + __m128i result; + result.u64[0] = (uint64_t)a.u32[0] * (uint64_t)b.u32[0]; + result.u64[1] = (uint64_t)a.u32[2] * (uint64_t)b.u32[2]; + return result; +} + +/* Unpack operations */ +static inline __m128i _mm_unpacklo_epi64(__m128i a, __m128i b) +{ + __m128i result; + result.u64[0] = a.u64[0]; + result.u64[1] = b.u64[0]; + return result; +} + +static inline __m128i _mm_unpackhi_epi64(__m128i a, __m128i b) +{ + __m128i result; + result.u64[0] = a.u64[1]; + result.u64[1] = b.u64[1]; + return result; +} + +/* Pause instruction for spin-wait loops */ +static inline void _mm_pause(void) +{ + /* RISC-V doesn't have a direct equivalent to x86 PAUSE + * Use a simple NOP or yield hint */ + __asm__ __volatile__("nop"); +} + +/* Memory fence */ +static inline void _mm_mfence(void) +{ + __asm__ __volatile__("fence" ::: "memory"); +} + +static inline void _mm_lfence(void) +{ + __asm__ __volatile__("fence r,r" ::: "memory"); +} + +static inline void _mm_sfence(void) +{ + __asm__ __volatile__("fence w,w" ::: "memory"); +} + +/* Comparison operations */ +static inline __m128i _mm_cmpeq_epi32(__m128i a, __m128i b) +{ + __m128i result; + for (int i = 0; i < 4; i++) { + result.u32[i] = (a.u32[i] == b.u32[i]) ? 0xFFFFFFFF : 0; + } + return result; +} + +static inline __m128i _mm_cmpeq_epi64(__m128i a, __m128i b) +{ + __m128i result; + for (int i = 0; i < 2; i++) { + result.u64[i] = (a.u64[i] == b.u64[i]) ? 
0xFFFFFFFFFFFFFFFFULL : 0; + } + return result; +} + +/* Additional shift operations */ +static inline __m128i _mm_slli_epi32(__m128i a, int imm8) +{ + __m128i result; + if (imm8 > 31) { + for (int i = 0; i < 4; i++) result.u32[i] = 0; + } else { + for (int i = 0; i < 4; i++) { + result.u32[i] = a.u32[i] << imm8; + } + } + return result; +} + +static inline __m128i _mm_srli_epi32(__m128i a, int imm8) +{ + __m128i result; + if (imm8 > 31) { + for (int i = 0; i < 4; i++) result.u32[i] = 0; + } else { + for (int i = 0; i < 4; i++) { + result.u32[i] = a.u32[i] >> imm8; + } + } + return result; +} + +/* 64-bit integer operations */ +static inline __m128i _mm_set1_epi64x(int64_t a) +{ + __m128i result; + result.i64[0] = a; + result.i64[1] = a; + return result; +} + +/* Float type for compatibility - we'll treat it as int for simplicity */ +typedef __m128i __m128; + +/* Float operations - simplified scalar implementations */ +static inline __m128 _mm_set1_ps(float a) +{ + __m128 result; + uint32_t val; + memcpy(&val, &a, sizeof(float)); + for (int i = 0; i < 4; i++) { + result.u32[i] = val; + } + return result; +} + +static inline __m128 _mm_setzero_ps(void) +{ + __m128 result; + memset(&result, 0, sizeof(result)); + return result; +} + +static inline __m128 _mm_add_ps(__m128 a, __m128 b) +{ + __m128 result; + float fa[4], fb[4], fr[4]; + memcpy(fa, &a, sizeof(__m128)); + memcpy(fb, &b, sizeof(__m128)); + for (int i = 0; i < 4; i++) { + fr[i] = fa[i] + fb[i]; + } + memcpy(&result, fr, sizeof(__m128)); + return result; +} + +static inline __m128 _mm_mul_ps(__m128 a, __m128 b) +{ + __m128 result; + float fa[4], fb[4], fr[4]; + memcpy(fa, &a, sizeof(__m128)); + memcpy(fb, &b, sizeof(__m128)); + for (int i = 0; i < 4; i++) { + fr[i] = fa[i] * fb[i]; + } + memcpy(&result, fr, sizeof(__m128)); + return result; +} + +static inline __m128 _mm_and_ps(__m128 a, __m128 b) +{ + __m128 result; + result.u64[0] = a.u64[0] & b.u64[0]; + result.u64[1] = a.u64[1] & b.u64[1]; + return result; +} + +static inline __m128 _mm_or_ps(__m128 a, __m128 b) +{ + __m128 result; + result.u64[0] = a.u64[0] | b.u64[0]; + result.u64[1] = a.u64[1] | b.u64[1]; + return result; +} + +static inline __m128 _mm_cvtepi32_ps(__m128i a) +{ + __m128 result; + float fr[4]; + for (int i = 0; i < 4; i++) { + fr[i] = (float)a.i32[i]; + } + memcpy(&result, fr, sizeof(__m128)); + return result; +} + +static inline __m128i _mm_cvttps_epi32(__m128 a) +{ + __m128i result; + float fa[4]; + memcpy(fa, &a, sizeof(__m128)); + for (int i = 0; i < 4; i++) { + result.i32[i] = (int32_t)fa[i]; + } + return result; +} + +/* Casting operations */ +static inline __m128 _mm_castsi128_ps(__m128i a) +{ + __m128 result; + memcpy(&result, &a, sizeof(__m128)); + return result; +} + +static inline __m128i _mm_castps_si128(__m128 a) +{ + __m128i result; + memcpy(&result, &a, sizeof(__m128)); + return result; +} + +/* Additional set operations */ +static inline __m128i _mm_set1_epi32(int a) +{ + __m128i result; + for (int i = 0; i < 4; i++) { + result.i32[i] = a; + } + return result; +} + +/* AES instructions - these are placeholders, actual AES is done via soft_aes.h */ +/* On RISC-V without crypto extensions, these should never be called directly */ +/* They are only here for compilation compatibility */ +static inline __m128i _mm_aesenc_si128(__m128i a, __m128i roundkey) +{ + /* This is a placeholder - actual implementation should use soft_aes */ + /* If this function is called, it means SOFT_AES template parameter wasn't used */ + /* We return a XOR as a minimal 
fallback, but proper code should use soft_aesenc */ + return _mm_xor_si128(a, roundkey); +} + +static inline __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) +{ + /* Placeholder for AES key generation - should use soft_aeskeygenassist */ + return a; +} + +/* Rotate right operation for soft_aes.h */ +static inline uint32_t _rotr(uint32_t value, unsigned int count) +{ + const unsigned int mask = 31; + count &= mask; + return (value >> count) | (value << ((-count) & mask)); +} + +/* ARM NEON compatibility types and intrinsics for RISC-V */ +typedef __m128i_union uint64x2_t; +typedef __m128i_union uint8x16_t; +typedef __m128i_union int64x2_t; +typedef __m128i_union int32x4_t; + +static inline uint64x2_t vld1q_u64(const uint64_t *ptr) +{ + uint64x2_t result; + result.u64[0] = ptr[0]; + result.u64[1] = ptr[1]; + return result; +} + +static inline int64x2_t vld1q_s64(const int64_t *ptr) +{ + int64x2_t result; + result.i64[0] = ptr[0]; + result.i64[1] = ptr[1]; + return result; +} + +static inline void vst1q_u64(uint64_t *ptr, uint64x2_t val) +{ + ptr[0] = val.u64[0]; + ptr[1] = val.u64[1]; +} + +static inline uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b) +{ + uint64x2_t result; + result.u64[0] = a.u64[0] ^ b.u64[0]; + result.u64[1] = a.u64[1] ^ b.u64[1]; + return result; +} + +static inline uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b) +{ + uint64x2_t result; + result.u64[0] = a.u64[0] + b.u64[0]; + result.u64[1] = a.u64[1] + b.u64[1]; + return result; +} + +static inline uint64x2_t vreinterpretq_u64_u8(uint8x16_t a) +{ + uint64x2_t result; + memcpy(&result, &a, sizeof(uint64x2_t)); + return result; +} + +static inline uint64_t vgetq_lane_u64(uint64x2_t v, int lane) +{ + return v.u64[lane]; +} + +static inline int64_t vgetq_lane_s64(int64x2_t v, int lane) +{ + return v.i64[lane]; +} + +static inline int32_t vgetq_lane_s32(int32x4_t v, int lane) +{ + return v.i32[lane]; +} + +typedef struct { uint64_t val[1]; } uint64x1_t; + +static inline uint64x1_t vcreate_u64(uint64_t a) +{ + uint64x1_t result; + result.val[0] = a; + return result; +} + +static inline uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high) +{ + uint64x2_t result; + result.u64[0] = low.val[0]; + result.u64[1] = high.val[0]; + return result; +} + +#ifdef __cplusplus +} +#endif + +#endif /* XMRIG_SSE2RVV_H */ diff --git a/src/crypto/common/LinuxMemory.cpp b/src/crypto/common/LinuxMemory.cpp index 8a00e1c36..a09f5a1c7 100644 --- a/src/crypto/common/LinuxMemory.cpp +++ b/src/crypto/common/LinuxMemory.cpp @@ -1,6 +1,6 @@ /* XMRig - * Copyright (c) 2018-2021 SChernykh - * Copyright (c) 2016-2021 XMRig , + * Copyright (c) 2018-2025 SChernykh + * Copyright (c) 2016-2025 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -35,15 +35,69 @@ constexpr size_t twoMiB = 2U * 1024U * 1024U; constexpr size_t oneGiB = 1024U * 1024U * 1024U; -static inline std::string sysfs_path(uint32_t node, size_t hugePageSize, bool nr) +static bool sysfs_write(const std::string &path, uint64_t value) +{ + std::ofstream file(path, std::ios::out | std::ios::binary | std::ios::trunc); + if (!file.is_open()) { + return false; + } + + file << value; + file.flush(); + + return true; +} + + +static int64_t sysfs_read(const std::string &path) +{ + std::ifstream file(path); + if (!file.is_open()) { + return -1; + } + + uint64_t value = 0; + file >> value; + + return value; +} + + +static std::string sysfs_path(uint32_t node, size_t hugePageSize, 
bool nr) { return fmt::format("/sys/devices/system/node/node{}/hugepages/hugepages-{}kB/{}_hugepages", node, hugePageSize / 1024, nr ? "nr" : "free"); } -static inline bool write_nr_hugepages(uint32_t node, size_t hugePageSize, uint64_t count) { return LinuxMemory::write(sysfs_path(node, hugePageSize, true).c_str(), count); } -static inline int64_t free_hugepages(uint32_t node, size_t hugePageSize) { return LinuxMemory::read(sysfs_path(node, hugePageSize, false).c_str()); } -static inline int64_t nr_hugepages(uint32_t node, size_t hugePageSize) { return LinuxMemory::read(sysfs_path(node, hugePageSize, true).c_str()); } +static std::string sysfs_path(size_t hugePageSize, bool nr) +{ + return fmt::format("/sys/kernel/mm/hugepages/hugepages-{}kB/{}_hugepages", hugePageSize / 1024, nr ? "nr" : "free"); +} + + +static bool write_nr_hugepages(uint32_t node, size_t hugePageSize, uint64_t count) +{ + if (sysfs_write(sysfs_path(node, hugePageSize, true), count)) { + return true; + } + + return sysfs_write(sysfs_path(hugePageSize, true), count); +} + + +static int64_t sysfs_read_hugepages(uint32_t node, size_t hugePageSize, bool nr) +{ + const int64_t value = sysfs_read(sysfs_path(node, hugePageSize, nr)); + if (value >= 0) { + return value; + } + + return sysfs_read(sysfs_path(hugePageSize, nr)); +} + + +static inline int64_t free_hugepages(uint32_t node, size_t hugePageSize) { return sysfs_read_hugepages(node, hugePageSize, false); } +static inline int64_t nr_hugepages(uint32_t node, size_t hugePageSize) { return sysfs_read_hugepages(node, hugePageSize, true); } } // namespace xmrig @@ -62,31 +116,3 @@ bool xmrig::LinuxMemory::reserve(size_t size, uint32_t node, size_t hugePageSize return write_nr_hugepages(node, hugePageSize, std::max(nr_hugepages(node, hugePageSize), 0) + (required - available)); } - - -bool xmrig::LinuxMemory::write(const char *path, uint64_t value) -{ - std::ofstream file(path, std::ios::out | std::ios::binary | std::ios::trunc); - if (!file.is_open()) { - return false; - } - - file << value; - file.flush(); - - return true; -} - - -int64_t xmrig::LinuxMemory::read(const char *path) -{ - std::ifstream file(path); - if (!file.is_open()) { - return -1; - } - - uint64_t value = 0; - file >> value; - - return value; -} diff --git a/src/crypto/common/LinuxMemory.h b/src/crypto/common/LinuxMemory.h index 0d71af249..c39f96edc 100644 --- a/src/crypto/common/LinuxMemory.h +++ b/src/crypto/common/LinuxMemory.h @@ -1,6 +1,6 @@ /* XMRig - * Copyright (c) 2018-2021 SChernykh - * Copyright (c) 2016-2021 XMRig , + * Copyright (c) 2018-2025 SChernykh + * Copyright (c) 2016-2025 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -31,13 +31,10 @@ class LinuxMemory { public: static bool reserve(size_t size, uint32_t node, size_t hugePageSize); - - static bool write(const char *path, uint64_t value); - static int64_t read(const char *path); }; -} /* namespace xmrig */ +} // namespace xmrig -#endif /* XMRIG_LINUXMEMORY_H */ +#endif // XMRIG_LINUXMEMORY_H diff --git a/src/crypto/common/MemoryPool.cpp b/src/crypto/common/MemoryPool.cpp index e99757ee9..0e809125a 100644 --- a/src/crypto/common/MemoryPool.cpp +++ b/src/crypto/common/MemoryPool.cpp @@ -49,7 +49,7 @@ xmrig::MemoryPool::MemoryPool(size_t size, bool hugePages, uint32_t node) constexpr size_t alignment = 1 << 24; - m_memory = new VirtualMemory(size * pageSize + alignment, hugePages, false, false, node); + m_memory = new 
VirtualMemory(size * pageSize + alignment, hugePages, false, false, node, VirtualMemory::kDefaultHugePageSize); m_alignOffset = (alignment - (((size_t)m_memory->scratchpad()) % alignment)) % alignment; } diff --git a/src/crypto/common/VirtualMemory.cpp b/src/crypto/common/VirtualMemory.cpp index e425750dd..d7d3a545e 100644 --- a/src/crypto/common/VirtualMemory.cpp +++ b/src/crypto/common/VirtualMemory.cpp @@ -75,6 +75,16 @@ xmrig::VirtualMemory::VirtualMemory(size_t size, bool hugePages, bool oneGbPages } m_scratchpad = static_cast(_mm_malloc(m_size, alignSize)); + + // Huge pages failed to allocate, but try to enable transparent huge pages for the range + if (alignSize >= kDefaultHugePageSize) { + if (m_scratchpad) { + adviseLargePages(m_scratchpad, m_size); + } + else { + m_scratchpad = static_cast(_mm_malloc(m_size, 64)); + } + } } diff --git a/src/crypto/common/VirtualMemory.h b/src/crypto/common/VirtualMemory.h index 3056cbaed..2edd3ae92 100644 --- a/src/crypto/common/VirtualMemory.h +++ b/src/crypto/common/VirtualMemory.h @@ -65,6 +65,7 @@ public: static void *allocateExecutableMemory(size_t size, bool hugePages); static void *allocateLargePagesMemory(size_t size); static void *allocateOneGbPagesMemory(size_t size); + static bool adviseLargePages(void *p, size_t size); static void destroy(); static void flushInstructionCache(void *p, size_t size); static void freeLargePagesMemory(void *p, size_t size); diff --git a/src/crypto/common/VirtualMemory_unix.cpp b/src/crypto/common/VirtualMemory_unix.cpp index 003b92e45..471c9cf07 100644 --- a/src/crypto/common/VirtualMemory_unix.cpp +++ b/src/crypto/common/VirtualMemory_unix.cpp @@ -86,7 +86,7 @@ bool xmrig::VirtualMemory::isHugepagesAvailable() { # ifdef XMRIG_OS_LINUX return std::ifstream("/proc/sys/vm/nr_hugepages").good() || std::ifstream("/sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages").good(); -# elif defined(XMRIG_OS_MACOS) && defined(XMRIG_ARM) +# elif defined(XMRIG_OS_MACOS) && defined(XMRIG_ARM) || defined(XMRIG_OS_HAIKU) return false; # else return true; @@ -156,7 +156,8 @@ void *xmrig::VirtualMemory::allocateExecutableMemory(size_t size, bool hugePages if (!mem) { mem = mmap(0, size, PROT_READ | PROT_WRITE | SECURE_PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); } - +# elif defined(XMRIG_OS_HAIKU) + void *mem = mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); # else void *mem = nullptr; @@ -181,6 +182,8 @@ void *xmrig::VirtualMemory::allocateLargePagesMemory(size_t size) void *mem = mmap(0, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, VM_FLAGS_SUPERPAGE_SIZE_2MB, 0); # elif defined(XMRIG_OS_FREEBSD) void *mem = mmap(0, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_ALIGNED_SUPER | MAP_PREFAULT_READ, -1, 0); +# elif defined(XMRIG_OS_HAIKU) + void *mem = nullptr; # else void *mem = mmap(0, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE | hugePagesFlag(hugePageSize()), 0, 0); # endif @@ -273,6 +276,16 @@ bool xmrig::VirtualMemory::allocateOneGbPagesMemory() } +bool xmrig::VirtualMemory::adviseLargePages(void *p, size_t size) +{ +# ifdef XMRIG_OS_LINUX + return (madvise(p, size, MADV_HUGEPAGE) == 0); +# else + return false; +# endif +} + + void xmrig::VirtualMemory::freeLargePagesMemory() { if (m_flags.test(FLAG_LOCK)) { diff --git a/src/crypto/common/VirtualMemory_win.cpp b/src/crypto/common/VirtualMemory_win.cpp index acf8119fa..28f515bac 100644 --- a/src/crypto/common/VirtualMemory_win.cpp +++ 
b/src/crypto/common/VirtualMemory_win.cpp @@ -260,6 +260,12 @@ bool xmrig::VirtualMemory::allocateOneGbPagesMemory() } +bool xmrig::VirtualMemory::adviseLargePages(void *p, size_t size) +{ + return false; +} + + void xmrig::VirtualMemory::freeLargePagesMemory() { freeLargePagesMemory(m_scratchpad, m_size); diff --git a/src/crypto/common/portable/mm_malloc.h b/src/crypto/common/portable/mm_malloc.h index 34ca7d48b..388da645a 100644 --- a/src/crypto/common/portable/mm_malloc.h +++ b/src/crypto/common/portable/mm_malloc.h @@ -26,7 +26,7 @@ #define XMRIG_MM_MALLOC_PORTABLE_H -#if defined(XMRIG_ARM) && !defined(__clang__) +#if (defined(XMRIG_ARM) || defined(XMRIG_RISCV)) && !defined(__clang__) #include diff --git a/src/crypto/ghostrider/ghostrider.cpp b/src/crypto/ghostrider/ghostrider.cpp index 25bb44e74..4a21ae032 100644 --- a/src/crypto/ghostrider/ghostrider.cpp +++ b/src/crypto/ghostrider/ghostrider.cpp @@ -57,6 +57,9 @@ #if defined(XMRIG_ARM) # include "crypto/cn/sse2neon.h" +#elif defined(XMRIG_RISCV) + // RISC-V doesn't have SSE/NEON, provide minimal compatibility +# define _mm_pause() __asm__ __volatile__("nop") #elif defined(__GNUC__) # include #else @@ -286,7 +289,7 @@ struct HelperThread void benchmark() { -#ifndef XMRIG_ARM +#if !defined(XMRIG_ARM) && !defined(XMRIG_RISCV) static std::atomic done{ 0 }; if (done.exchange(1)) { return; @@ -478,7 +481,7 @@ static inline bool findByType(hwloc_obj_t obj, hwloc_obj_type_t type, func lambd HelperThread* create_helper_thread(int64_t cpu_index, int priority, const std::vector& affinities) { -#ifndef XMRIG_ARM +#if !defined(XMRIG_ARM) && !defined(XMRIG_RISCV) hwloc_bitmap_t helper_cpu_set = hwloc_bitmap_alloc(); hwloc_bitmap_t main_threads_set = hwloc_bitmap_alloc(); @@ -807,7 +810,7 @@ void hash_octa(const uint8_t* data, size_t size, uint8_t* output, cryptonight_ct uint32_t cn_indices[6]; select_indices(cn_indices, seed); -#ifdef XMRIG_ARM +#if defined(XMRIG_ARM) || defined(XMRIG_RISCV) uint32_t step[6] = { 1, 1, 1, 1, 1, 1 }; #else uint32_t step[6] = { 4, 4, 1, 2, 4, 4 }; diff --git a/src/crypto/randomx/aes_hash.cpp b/src/crypto/randomx/aes_hash.cpp index 38eb4d645..04b813b15 100644 --- a/src/crypto/randomx/aes_hash.cpp +++ b/src/crypto/randomx/aes_hash.cpp @@ -235,6 +235,131 @@ void fillAes4Rx4(void *state, size_t outputSize, void *buffer) { template void fillAes4Rx4(void *state, size_t outputSize, void *buffer); template void fillAes4Rx4(void *state, size_t outputSize, void *buffer); +#if defined(XMRIG_RISCV) && defined(XMRIG_RVV_ENABLED) +static constexpr uint32_t AES_HASH_1R_STATE02[8] = { 0x92b52c0d, 0x9fa856de, 0xcc82db47, 0xd7983aad, 0x6a770017, 0xae62c7d0, 0x5079506b, 0xe8a07ce4 }; +static constexpr uint32_t AES_HASH_1R_STATE13[8] = { 0x338d996e, 0x15c7b798, 0xf59e125a, 0xace78057, 0x630a240c, 0x07ad828d, 0x79a10005, 0x7e994948 }; + +static constexpr uint32_t AES_GEN_1R_KEY02[8] = { 0x6daca553, 0x62716609, 0xdbb5552b, 0xb4f44917, 0x3f1262f1, 0x9f947ec6, 0xf4c0794f, 0x3e20e345 }; +static constexpr uint32_t AES_GEN_1R_KEY13[8] = { 0x6d7caf07, 0x846a710d, 0x1725d378, 0x0da1dc4e, 0x6aef8135, 0xb1ba317c, 0x16314c88, 0x49169154 }; + +static constexpr uint32_t AES_HASH_1R_XKEY00[8] = { 0xf6fa8389, 0x8b24949f, 0x90dc56bf, 0x06890201, 0xf6fa8389, 0x8b24949f, 0x90dc56bf, 0x06890201 }; +static constexpr uint32_t AES_HASH_1R_XKEY11[8] = { 0x61b263d1, 0x51f4e03c, 0xee1043c6, 0xed18f99b, 0x61b263d1, 0x51f4e03c, 0xee1043c6, 0xed18f99b }; + +static constexpr uint32_t AES_HASH_STRIDE[8] = { 0, 4, 8, 12, 32, 36, 40, 44 }; + +template +void 
hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state) { + PROFILE_SCOPE(RandomX_AES); + + uint8_t* scratchpadPtr = (uint8_t*)scratchpad; + const uint8_t* scratchpadEnd = scratchpadPtr + scratchpadSize; + + vuint32m1_t hash_state02 = __riscv_vle32_v_u32m1(AES_HASH_1R_STATE02, 8); + vuint32m1_t hash_state13 = __riscv_vle32_v_u32m1(AES_HASH_1R_STATE13, 8); + + const vuint32m1_t key02 = __riscv_vle32_v_u32m1(AES_GEN_1R_KEY02, 8); + const vuint32m1_t key13 = __riscv_vle32_v_u32m1(AES_GEN_1R_KEY13, 8); + + const vuint32m1_t stride = __riscv_vle32_v_u32m1(AES_HASH_STRIDE, 8); + + vuint32m1_t fill_state02 = __riscv_vluxei32_v_u32m1((uint32_t*)fill_state + 0, stride, 8); + vuint32m1_t fill_state13 = __riscv_vluxei32_v_u32m1((uint32_t*)fill_state + 4, stride, 8); + + const vuint8m1_t lutenc_index0 = __riscv_vle8_v_u8m1(lutEncIndex[0], 32); + const vuint8m1_t lutenc_index1 = __riscv_vle8_v_u8m1(lutEncIndex[1], 32); + const vuint8m1_t lutenc_index2 = __riscv_vle8_v_u8m1(lutEncIndex[2], 32); + const vuint8m1_t lutenc_index3 = __riscv_vle8_v_u8m1(lutEncIndex[3], 32); + + const vuint8m1_t& lutdec_index0 = lutenc_index0; + const vuint8m1_t lutdec_index1 = __riscv_vle8_v_u8m1(lutDecIndex[1], 32); + const vuint8m1_t& lutdec_index2 = lutenc_index2; + const vuint8m1_t lutdec_index3 = __riscv_vle8_v_u8m1(lutDecIndex[3], 32); + + //process 64 bytes at a time in 4 lanes + while (scratchpadPtr < scratchpadEnd) { +#define HASH_STATE(k) \ + hash_state02 = softaes_vector_double(hash_state02, __riscv_vluxei32_v_u32m1((uint32_t*)scratchpadPtr + k * 16 + 0, stride, 8), lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3); \ + hash_state13 = softaes_vector_double(hash_state13, __riscv_vluxei32_v_u32m1((uint32_t*)scratchpadPtr + k * 16 + 4, stride, 8), lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3); + +#define FILL_STATE(k) \ + fill_state02 = softaes_vector_double(fill_state02, key02, lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3); \ + fill_state13 = softaes_vector_double(fill_state13, key13, lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3); \ + __riscv_vsuxei32_v_u32m1((uint32_t*)scratchpadPtr + k * 16 + 0, stride, fill_state02, 8); \ + __riscv_vsuxei32_v_u32m1((uint32_t*)scratchpadPtr + k * 16 + 4, stride, fill_state13, 8); + + switch (softAes) { + case 0: + HASH_STATE(0); + HASH_STATE(1); + + FILL_STATE(0); + FILL_STATE(1); + + scratchpadPtr += 128; + break; + + default: + switch (unroll) { + case 4: + HASH_STATE(0); + FILL_STATE(0); + + HASH_STATE(1); + FILL_STATE(1); + + HASH_STATE(2); + FILL_STATE(2); + + HASH_STATE(3); + FILL_STATE(3); + + scratchpadPtr += 64 * 4; + break; + + case 2: + HASH_STATE(0); + FILL_STATE(0); + + HASH_STATE(1); + FILL_STATE(1); + + scratchpadPtr += 64 * 2; + break; + + default: + HASH_STATE(0); + FILL_STATE(0); + + scratchpadPtr += 64; + break; + } + break; + } + } + +#undef HASH_STATE +#undef FILL_STATE + + __riscv_vsuxei32_v_u32m1((uint32_t*)fill_state + 0, stride, fill_state02, 8); + __riscv_vsuxei32_v_u32m1((uint32_t*)fill_state + 4, stride, fill_state13, 8); + + //two extra rounds to achieve full diffusion + const vuint32m1_t xkey00 = __riscv_vle32_v_u32m1(AES_HASH_1R_XKEY00, 8); + const vuint32m1_t xkey11 = __riscv_vle32_v_u32m1(AES_HASH_1R_XKEY11, 8); + + hash_state02 = softaes_vector_double(hash_state02, xkey00, lutenc_index0, lutenc_index1, lutenc_index2, 
lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3); + hash_state13 = softaes_vector_double(hash_state13, xkey00, lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3); + + hash_state02 = softaes_vector_double(hash_state02, xkey11, lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3); + hash_state13 = softaes_vector_double(hash_state13, xkey11, lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3); + + //output hash + __riscv_vsuxei32_v_u32m1((uint32_t*)hash + 0, stride, hash_state02, 8); + __riscv_vsuxei32_v_u32m1((uint32_t*)hash + 4, stride, hash_state13, 8); +} + +#else // defined(XMRIG_RISCV) && defined(XMRIG_RVV_ENABLED) + template void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state) { PROFILE_SCOPE(RandomX_AES); @@ -375,6 +500,7 @@ void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, voi rx_store_vec_i128((rx_vec_i128*)hash + 2, hash_state2); rx_store_vec_i128((rx_vec_i128*)hash + 3, hash_state3); } +#endif // defined(XMRIG_RISCV) && defined(XMRIG_RVV_ENABLED) template void hashAndFillAes1Rx4<0,2>(void* scratchpad, size_t scratchpadSize, void* hash, void* fill_state); template void hashAndFillAes1Rx4<1,1>(void* scratchpad, size_t scratchpadSize, void* hash, void* fill_state); diff --git a/src/crypto/randomx/common.hpp b/src/crypto/randomx/common.hpp index 98f96727b..6fbfb9785 100644 --- a/src/crypto/randomx/common.hpp +++ b/src/crypto/randomx/common.hpp @@ -111,6 +111,10 @@ namespace randomx { #define RANDOMX_HAVE_COMPILER 1 class JitCompilerA64; using JitCompiler = JitCompilerA64; +#elif defined(__riscv) && defined(__riscv_xlen) && (__riscv_xlen == 64) + #define RANDOMX_HAVE_COMPILER 1 + class JitCompilerRV64; + using JitCompiler = JitCompilerRV64; #else #define RANDOMX_HAVE_COMPILER 0 class JitCompilerFallback; diff --git a/src/crypto/randomx/jit_compiler.hpp b/src/crypto/randomx/jit_compiler.hpp index db635c6f4..114ec3bd0 100644 --- a/src/crypto/randomx/jit_compiler.hpp +++ b/src/crypto/randomx/jit_compiler.hpp @@ -32,6 +32,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "crypto/randomx/jit_compiler_x86.hpp" #elif defined(__aarch64__) #include "crypto/randomx/jit_compiler_a64.hpp" +#elif defined(__riscv) && defined(__riscv_xlen) && (__riscv_xlen == 64) +#include "crypto/randomx/jit_compiler_rv64.hpp" #else #include "crypto/randomx/jit_compiler_fallback.hpp" #endif diff --git a/src/crypto/randomx/jit_compiler_a64.cpp b/src/crypto/randomx/jit_compiler_a64.cpp index 860503081..6192cdeca 100644 --- a/src/crypto/randomx/jit_compiler_a64.cpp +++ b/src/crypto/randomx/jit_compiler_a64.cpp @@ -67,7 +67,6 @@ constexpr uint32_t LDR_LITERAL = 0x58000000; constexpr uint32_t ROR = 0x9AC02C00; constexpr uint32_t ROR_IMM = 0x93C00000; constexpr uint32_t MOV_REG = 0xAA0003E0; -constexpr uint32_t MOV_VREG_EL = 0x6E080400; constexpr uint32_t FADD = 0x4E60D400; constexpr uint32_t FSUB = 0x4EE0D400; constexpr uint32_t FEOR = 0x6E201C00; @@ -102,7 +101,7 @@ static size_t CalcDatasetItemSize() ((uint8_t*)randomx_calc_dataset_item_aarch64_end - (uint8_t*)randomx_calc_dataset_item_aarch64_store_result); } -constexpr uint32_t IntRegMap[8] = { 4, 5, 6, 7, 12, 13, 14, 15 }; +constexpr uint8_t IntRegMap[8] = { 4, 5, 6, 7, 12, 13, 14, 15 }; JitCompilerA64::JitCompilerA64(bool hugePagesEnable, bool) : hugePages(hugePagesJIT && hugePagesEnable), @@ -128,11 +127,12 @@ void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& con uint32_t codePos = MainLoopBegin + 4; + uint32_t mask = ((RandomX_CurrentConfig.Log2_ScratchpadL3 - 7) << 10); // and w16, w10, ScratchpadL3Mask64 - emit32(0x121A0000 | 16 | (10 << 5) | ((RandomX_CurrentConfig.Log2_ScratchpadL3 - 7) << 10), code, codePos); + emit32(0x121A0000 | 16 | (10 << 5) | mask, code, codePos); // and w17, w20, ScratchpadL3Mask64 - emit32(0x121A0000 | 17 | (20 << 5) | ((RandomX_CurrentConfig.Log2_ScratchpadL3 - 7) << 10), code, codePos); + emit32(0x121A0000 | 17 | (20 << 5) | mask, code, codePos); codePos = PrologueSize; literalPos = ImulRcpLiteralsEnd; @@ -155,13 +155,14 @@ void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& con const uint32_t offset = (((uint8_t*)randomx_program_aarch64_vm_instructions_end) - ((uint8_t*)randomx_program_aarch64)) - codePos; emit32(ARMV8A::B | (offset / 4), code, codePos); - // and w20, w20, CacheLineAlignMask + mask = ((RandomX_CurrentConfig.Log2_DatasetBaseSize - 7) << 10); + // and w20, w9, CacheLineAlignMask codePos = (((uint8_t*)randomx_program_aarch64_cacheline_align_mask1) - ((uint8_t*)randomx_program_aarch64)); - emit32(0x121A0000 | 20 | (20 << 5) | ((RandomX_CurrentConfig.Log2_DatasetBaseSize - 7) << 10), code, codePos); + emit32(0x121A0000 | 20 | (9 << 5) | mask, code, codePos); // and w10, w10, CacheLineAlignMask codePos = (((uint8_t*)randomx_program_aarch64_cacheline_align_mask2) - ((uint8_t*)randomx_program_aarch64)); - emit32(0x121A0000 | 10 | (10 << 5) | ((RandomX_CurrentConfig.Log2_DatasetBaseSize - 7) << 10), code, codePos); + emit32(0x121A0000 | 10 | (10 << 5) | mask, code, codePos); // Update spMix1 // eor x10, config.readReg0, config.readReg1 @@ -497,9 +498,12 @@ void JitCompilerA64::emitMemLoad(uint32_t dst, uint32_t src, Instruction& instr, if (src != dst) { imm &= instr.getModMem() ? 
(RandomX_CurrentConfig.ScratchpadL1_Size - 1) : (RandomX_CurrentConfig.ScratchpadL2_Size - 1); - emitAddImmediate(tmp_reg, src, imm, code, k); + uint32_t t = 0x927d0000 | tmp_reg | (tmp_reg << 5); + if (imm) + emitAddImmediate(tmp_reg, src, imm, code, k); + else + t = 0x927d0000 | tmp_reg | (src << 5); - constexpr uint32_t t = 0x927d0000 | tmp_reg | (tmp_reg << 5); const uint32_t andInstrL1 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL1 - 4) << 10); const uint32_t andInstrL2 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL2 - 4) << 10); @@ -511,10 +515,18 @@ void JitCompilerA64::emitMemLoad(uint32_t dst, uint32_t src, Instruction& instr, else { imm = (imm & ScratchpadL3Mask) >> 3; - emitMovImmediate(tmp_reg, imm, code, k); + if (imm) + { + emitMovImmediate(tmp_reg, imm, code, k); - // ldr tmp_reg, [x2, tmp_reg, lsl 3] - emit32(0xf8607840 | tmp_reg | (tmp_reg << 16), code, k); + // ldr tmp_reg, [x2, tmp_reg, lsl 3] + emit32(0xf8607840 | tmp_reg | (tmp_reg << 16), code, k); + } + else + { + // ldr tmp_reg, [x2] + emit32(0xf9400040 | tmp_reg, code, k); + } } codePos = k; @@ -529,25 +541,22 @@ void JitCompilerA64::emitMemLoadFP(uint32_t src, Instruction& instr, uint8_t* co constexpr uint32_t tmp_reg = 19; imm &= instr.getModMem() ? (RandomX_CurrentConfig.ScratchpadL1_Size - 1) : (RandomX_CurrentConfig.ScratchpadL2_Size - 1); - emitAddImmediate(tmp_reg, src, imm, code, k); + uint32_t t = 0x927d0000 | tmp_reg | (tmp_reg << 5); + if (imm) + emitAddImmediate(tmp_reg, src, imm, code, k); + else + t = 0x927d0000 | tmp_reg | (src << 5); - constexpr uint32_t t = 0x927d0000 | tmp_reg | (tmp_reg << 5); const uint32_t andInstrL1 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL1 - 4) << 10); const uint32_t andInstrL2 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL2 - 4) << 10); emit32(instr.getModMem() ? 
andInstrL1 : andInstrL2, code, k); - // add tmp_reg, x2, tmp_reg - emit32(ARMV8A::ADD | tmp_reg | (2 << 5) | (tmp_reg << 16), code, k); + // ldr tmp_reg_fp, [x2, tmp_reg] + emit32(0x3ce06800 | tmp_reg_fp | (2 << 5) | (tmp_reg << 16), code, k); - // ldpsw tmp_reg, tmp_reg + 1, [tmp_reg] - emit32(0x69400000 | tmp_reg | (tmp_reg << 5) | ((tmp_reg + 1) << 10), code, k); - - // ins tmp_reg_fp.d[0], tmp_reg - emit32(0x4E081C00 | tmp_reg_fp | (tmp_reg << 5), code, k); - - // ins tmp_reg_fp.d[1], tmp_reg + 1 - emit32(0x4E181C00 | tmp_reg_fp | ((tmp_reg + 1) << 5), code, k); + // sxtl.2d tmp_reg_fp, tmp_reg_fp + emit32(0x0f20a400 | tmp_reg_fp | (tmp_reg_fp << 5), code, k); // scvtf tmp_reg_fp.2d, tmp_reg_fp.2d emit32(0x4E61D800 | tmp_reg_fp | (tmp_reg_fp << 5), code, k); @@ -835,7 +844,8 @@ void JitCompilerA64::h_IROR_R(Instruction& instr, uint32_t& codePos) else { // ror dst, dst, imm - emit32(ARMV8A::ROR_IMM | dst | (dst << 5) | ((instr.getImm32() & 63) << 10) | (dst << 16), code, codePos); + if ((instr.getImm32() & 63)) + emit32(ARMV8A::ROR_IMM | dst | (dst << 5) | ((instr.getImm32() & 63) << 10) | (dst << 16), code, codePos); } reg_changed_offset[instr.dst] = codePos; @@ -861,7 +871,8 @@ void JitCompilerA64::h_IROL_R(Instruction& instr, uint32_t& codePos) else { // ror dst, dst, imm - emit32(ARMV8A::ROR_IMM | dst | (dst << 5) | ((-instr.getImm32() & 63) << 10) | (dst << 16), code, k); + if ((instr.getImm32() & 63)) + emit32(ARMV8A::ROR_IMM | dst | (dst << 5) | ((-instr.getImm32() & 63) << 10) | (dst << 16), code, k); } reg_changed_offset[instr.dst] = k; @@ -894,13 +905,8 @@ void JitCompilerA64::h_FSWAP_R(Instruction& instr, uint32_t& codePos) const uint32_t dst = instr.dst + 16; - constexpr uint32_t tmp_reg_fp = 28; - constexpr uint32_t src_index1 = 1 << 14; - constexpr uint32_t dst_index1 = 1 << 20; - - emit32(ARMV8A::MOV_VREG_EL | tmp_reg_fp | (dst << 5) | src_index1, code, k); - emit32(ARMV8A::MOV_VREG_EL | dst | (dst << 5) | dst_index1, code, k); - emit32(ARMV8A::MOV_VREG_EL | dst | (tmp_reg_fp << 5), code, k); + // ext dst.16b, dst.16b, dst.16b, #0x8 + emit32(0x6e004000 | dst | (dst << 5) | (dst << 16), code, k); codePos = k; } @@ -1029,11 +1035,19 @@ void JitCompilerA64::h_CFROUND(Instruction& instr, uint32_t& codePos) constexpr uint32_t tmp_reg = 20; constexpr uint32_t fpcr_tmp_reg = 8; - // ror tmp_reg, src, imm - emit32(ARMV8A::ROR_IMM | tmp_reg | (src << 5) | ((instr.getImm32() & 63) << 10) | (src << 16), code, k); + if (instr.getImm32() & 63) + { + // ror tmp_reg, src, imm + emit32(ARMV8A::ROR_IMM | tmp_reg | (src << 5) | ((instr.getImm32() & 63) << 10) | (src << 16), code, k); - // bfi fpcr_tmp_reg, tmp_reg, 40, 2 - emit32(0xB3580400 | fpcr_tmp_reg | (tmp_reg << 5), code, k); + // bfi fpcr_tmp_reg, tmp_reg, 40, 2 + emit32(0xB3580400 | fpcr_tmp_reg | (tmp_reg << 5), code, k); + } + else // no rotation + { + // bfi fpcr_tmp_reg, src, 40, 2 + emit32(0xB3580400 | fpcr_tmp_reg | (src << 5), code, k); + } // rbit tmp_reg, fpcr_tmp_reg emit32(0xDAC00000 | tmp_reg | (fpcr_tmp_reg << 5), code, k); @@ -1059,9 +1073,12 @@ void JitCompilerA64::h_ISTORE(Instruction& instr, uint32_t& codePos) else imm &= RandomX_CurrentConfig.ScratchpadL3_Size - 1; - emitAddImmediate(tmp_reg, dst, imm, code, k); + uint32_t t = 0x927d0000 | tmp_reg | (tmp_reg << 5); + if (imm) + emitAddImmediate(tmp_reg, dst, imm, code, k); + else + t = 0x927d0000 | tmp_reg | (dst << 5); - constexpr uint32_t t = 0x927d0000 | tmp_reg | (tmp_reg << 5); const uint32_t andInstrL1 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL1 - 4) << 
10); const uint32_t andInstrL2 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL2 - 4) << 10); const uint32_t andInstrL3 = t | ((RandomX_CurrentConfig.Log2_ScratchpadL3 - 4) << 10); diff --git a/src/crypto/randomx/jit_compiler_a64_static.S b/src/crypto/randomx/jit_compiler_a64_static.S index e019c6b4b..6133b284a 100644 --- a/src/crypto/randomx/jit_compiler_a64_static.S +++ b/src/crypto/randomx/jit_compiler_a64_static.S @@ -100,9 +100,9 @@ # v26 -> "a2" # v27 -> "a3" # v28 -> temporary -# v29 -> E 'and' mask = 0x00ffffffffffffff00ffffffffffffff -# v30 -> E 'or' mask = 0x3*00000000******3*00000000****** -# v31 -> scale mask = 0x81f000000000000081f0000000000000 +# v29 -> E 'and' mask = 0x00ffffffffffffff'00ffffffffffffff +# v30 -> E 'or' mask = 0x3*00000000******'3*00000000****** +# v31 -> scale mask = 0x80f0000000000000'80f0000000000000 .balign 4 DECL(randomx_program_aarch64): @@ -142,17 +142,14 @@ DECL(randomx_program_aarch64): ldp q26, q27, [x0, 224] # Load E 'and' mask - mov x16, 0x00FFFFFFFFFFFFFF - ins v29.d[0], x16 - ins v29.d[1], x16 + movi v29.2d, #0x00FFFFFFFFFFFFFF # Load E 'or' mask (stored in reg.f[0]) ldr q30, [x0, 64] # Load scale mask mov x16, 0x80f0000000000000 - ins v31.d[0], x16 - ins v31.d[1], x16 + dup v31.2d, x16 # Read fpcr mrs x8, fpcr @@ -162,35 +159,22 @@ DECL(randomx_program_aarch64): str x0, [sp, -16]! # Read literals - ldr x0, literal_x0 - ldr x11, literal_x11 - ldr x21, literal_x21 - ldr x22, literal_x22 - ldr x23, literal_x23 - ldr x24, literal_x24 - ldr x25, literal_x25 - ldr x26, literal_x26 - ldr x27, literal_x27 - ldr x28, literal_x28 - ldr x29, literal_x29 - ldr x30, literal_x30 + adr x30, literal_v0 + ldp q0, q1, [x30] + ldp q2, q3, [x30, 32] + ldp q4, q5, [x30, 64] + ldp q6, q7, [x30, 96] + ldp q8, q9, [x30, 128] + ldp q10, q11, [x30, 160] + ldp q12, q13, [x30, 192] + ldp q14, q15, [x30, 224] - ldr q0, literal_v0 - ldr q1, literal_v1 - ldr q2, literal_v2 - ldr q3, literal_v3 - ldr q4, literal_v4 - ldr q5, literal_v5 - ldr q6, literal_v6 - ldr q7, literal_v7 - ldr q8, literal_v8 - ldr q9, literal_v9 - ldr q10, literal_v10 - ldr q11, literal_v11 - ldr q12, literal_v12 - ldr q13, literal_v13 - ldr q14, literal_v14 - ldr q15, literal_v15 + ldp x0, x11, [x30, -96] // literal_x0 + ldp x21, x22, [x30, -80] // literal_x21 + ldp x23, x24, [x30, -64] // literal_x23 + ldp x25, x26, [x30, -48] // literal_x25 + ldp x27, x28, [x30, -32] // literal_x27 + ldp x29, x30, [x30, -16] // literal_x29 DECL(randomx_program_aarch64_main_loop): # spAddr0 = spMix1 & ScratchpadL3Mask64; @@ -221,40 +205,31 @@ DECL(randomx_program_aarch64_main_loop): eor x15, x15, x19 # Load group F registers (spAddr1) - ldpsw x20, x19, [x17] - ins v16.d[0], x20 - ins v16.d[1], x19 - ldpsw x20, x19, [x17, 8] - ins v17.d[0], x20 - ins v17.d[1], x19 - ldpsw x20, x19, [x17, 16] - ins v18.d[0], x20 - ins v18.d[1], x19 - ldpsw x20, x19, [x17, 24] - ins v19.d[0], x20 - ins v19.d[1], x19 + ldr q17, [x17] + sxtl v16.2d, v17.2s scvtf v16.2d, v16.2d + sxtl2 v17.2d, v17.4s scvtf v17.2d, v17.2d + + ldr q19, [x17, 16] + sxtl v18.2d, v19.2s scvtf v18.2d, v18.2d + sxtl2 v19.2d, v19.4s scvtf v19.2d, v19.2d # Load group E registers (spAddr1) - ldpsw x20, x19, [x17, 32] - ins v20.d[0], x20 - ins v20.d[1], x19 - ldpsw x20, x19, [x17, 40] - ins v21.d[0], x20 - ins v21.d[1], x19 - ldpsw x20, x19, [x17, 48] - ins v22.d[0], x20 - ins v22.d[1], x19 - ldpsw x20, x19, [x17, 56] - ins v23.d[0], x20 - ins v23.d[1], x19 + ldr q21, [x17, 32] + sxtl v20.2d, v21.2s scvtf v20.2d, v20.2d + sxtl2 v21.2d, v21.4s scvtf v21.2d, v21.2d + + 
ldr q23, [x17, 48] + sxtl v22.2d, v23.2s scvtf v22.2d, v22.2d + sxtl2 v23.2d, v23.4s scvtf v23.2d, v23.2d + and v20.16b, v20.16b, v29.16b and v21.16b, v21.16b, v29.16b and v22.16b, v22.16b, v29.16b @@ -310,10 +285,9 @@ DECL(randomx_program_aarch64_vm_instructions_end): eor x9, x9, x20 # Calculate dataset pointer for dataset prefetch - mov w20, w9 DECL(randomx_program_aarch64_cacheline_align_mask1): # Actual mask will be inserted by JIT compiler - and x20, x20, 1 + and x20, x9, 1 add x20, x20, x1 # Prefetch dataset data @@ -491,42 +465,39 @@ DECL(randomx_calc_dataset_item_aarch64): stp x10, x11, [sp, 80] stp x12, x13, [sp, 96] - ldr x12, superscalarMul0 + adr x7, superscalarMul0 + # superscalarMul0, superscalarAdd1 + ldp x12, x13, [x7] - mov x8, x0 - mov x9, x1 + ldp x8, x9, [sp] mov x10, x2 # rl[0] = (itemNumber + 1) * superscalarMul0; madd x0, x2, x12, x12 # rl[1] = rl[0] ^ superscalarAdd1; - ldr x12, superscalarAdd1 - eor x1, x0, x12 + eor x1, x0, x13 # rl[2] = rl[0] ^ superscalarAdd2; - ldr x12, superscalarAdd2 + ldp x12, x13, [x7, 16] eor x2, x0, x12 # rl[3] = rl[0] ^ superscalarAdd3; - ldr x12, superscalarAdd3 - eor x3, x0, x12 + eor x3, x0, x13 # rl[4] = rl[0] ^ superscalarAdd4; - ldr x12, superscalarAdd4 + ldp x12, x13, [x7, 32] eor x4, x0, x12 # rl[5] = rl[0] ^ superscalarAdd5; - ldr x12, superscalarAdd5 - eor x5, x0, x12 + eor x5, x0, x13 # rl[6] = rl[0] ^ superscalarAdd6; - ldr x12, superscalarAdd6 + ldp x12, x13, [x7, 48] eor x6, x0, x12 # rl[7] = rl[0] ^ superscalarAdd7; - ldr x12, superscalarAdd7 - eor x7, x0, x12 + eor x7, x0, x13 b DECL(randomx_calc_dataset_item_aarch64_prefetch) diff --git a/src/crypto/randomx/jit_compiler_rv64.cpp b/src/crypto/randomx/jit_compiler_rv64.cpp new file mode 100644 index 000000000..161343471 --- /dev/null +++ b/src/crypto/randomx/jit_compiler_rv64.cpp @@ -0,0 +1,1187 @@ +/* +Copyright (c) 2023 tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#include +#include +#include +#include +#include "crypto/randomx/jit_compiler_rv64.hpp" +#include "crypto/randomx/jit_compiler_rv64_static.hpp" +#include "crypto/randomx/jit_compiler_rv64_vector.h" +#include "crypto/randomx/jit_compiler_rv64_vector_static.h" +#include "crypto/randomx/superscalar.hpp" +#include "crypto/randomx/program.hpp" +#include "crypto/randomx/reciprocal.h" +#include "crypto/randomx/virtual_memory.hpp" +#include "crypto/common/VirtualMemory.h" + + +static bool hugePagesJIT = false; +static int optimizedDatasetInit = -1; + +void randomx_set_huge_pages_jit(bool hugePages) +{ + hugePagesJIT = hugePages; +} + +void randomx_set_optimized_dataset_init(int value) +{ + optimizedDatasetInit = value; +} + +#define alignSize(pos, align) (((pos - 1) / align + 1) * align) + + +namespace rv64 { + constexpr uint16_t C_LUI = 0x6001; + constexpr uint32_t LUI = 0x00000037; + constexpr uint16_t C_ADDI = 0x0001; + constexpr uint32_t ADDI = 0x00000013; + constexpr uint32_t ADDIW = 0x0000001b; + constexpr uint16_t C_ADD = 0x9002; + constexpr uint32_t ADD = 0x00000033; + constexpr uint32_t SHXADD = 0x20000033; //Zba + constexpr uint32_t SLL = 0x00001033; + constexpr uint32_t SRL = 0x00005033; + constexpr uint32_t SLLI = 0x00001013; + constexpr uint32_t C_SLLI = 0x0002; + constexpr uint32_t SRLI = 0x00005013; + constexpr uint32_t AND = 0x00007033; + constexpr uint32_t ANDI = 0x00007013; + constexpr uint16_t C_AND = 0x8c61; + constexpr uint16_t C_ANDI = 0x8801; + constexpr uint32_t OR = 0x00006033; + constexpr uint16_t C_OR = 0x8c41; + constexpr uint32_t XOR = 0x00004033; + constexpr uint16_t C_XOR = 0x8c21; + constexpr uint32_t LD = 0x00003003; + constexpr uint16_t C_LD = 0x6000; + constexpr uint16_t C_LW = 0x4000; + constexpr uint32_t SD = 0x00003023; + constexpr uint32_t SUB = 0x40000033; + constexpr uint16_t C_SUB = 0x8c01; + constexpr uint32_t MUL = 0x02000033; + constexpr uint32_t MULHU = 0x02003033; + constexpr uint32_t MULH = 0x02001033; + constexpr uint16_t C_MV = 0x8002; + constexpr uint32_t ROR = 0x60005033; //Zbb + constexpr uint32_t RORI = 0x60005013; //Zbb + constexpr uint32_t ROL = 0x60001033; //Zbb + constexpr uint32_t FMV_X_D = 0xe2000053; + constexpr uint32_t FMV_D_X = 0xf2000053; + constexpr uint32_t FMV_D = 0x22000053; + constexpr uint32_t FADD_D = 0x02007053; + constexpr uint32_t FSUB_D = 0x0a007053; + constexpr uint32_t FMUL_D = 0x12007053; + constexpr uint32_t FDIV_D = 0x1a007053; + constexpr uint32_t FSQRT_D = 0x5a007053; + constexpr uint32_t FCVT_D_W = 0xd2000053; + constexpr uint32_t FSRM = 0x00201073; + constexpr uint16_t C_BEQZ = 0xc001; + constexpr uint32_t BEQ = 0x00000063; + constexpr uint16_t C_BNEZ = 0xe001; + constexpr uint32_t JAL = 0x0000006f; + constexpr uint16_t C_RET = 0x8082; +} + +namespace randomx { + + constexpr size_t MaxRandomXInstrCodeSize = 56; //FDIV_M requires 56 bytes of rv64 code + constexpr size_t MaxSuperscalarInstrSize = 12; //IXOR_C requires 12 bytes of rv64 code + constexpr size_t SuperscalarProgramHeader = 136; //overhead per superscalar program + constexpr size_t CodeAlign = 4096; //align code size to a multiple of 4 KiB + constexpr size_t LiteralPoolSize = CodeAlign; + constexpr size_t SuperscalarLiteraPoolSize = RANDOMX_CACHE_MAX_ACCESSES * CodeAlign; + constexpr size_t ReserveCodeSize = CodeAlign; //prologue, epilogue + reserve + + constexpr size_t RandomXCodeSize = alignSize(LiteralPoolSize + ReserveCodeSize + MaxRandomXInstrCodeSize * RANDOMX_PROGRAM_MAX_SIZE, CodeAlign); + constexpr size_t SuperscalarSize = 
alignSize(SuperscalarLiteraPoolSize + ReserveCodeSize + (SuperscalarProgramHeader + MaxSuperscalarInstrSize * SuperscalarMaxSize) * RANDOMX_CACHE_MAX_ACCESSES, CodeAlign); + + constexpr uint32_t CodeSize = RandomXCodeSize + SuperscalarSize; + constexpr uint32_t ExecutableSize = CodeSize - LiteralPoolSize; + + constexpr int32_t LiteralPoolOffset = LiteralPoolSize / 2; + constexpr int32_t SuperScalarLiteralPoolOffset = RandomXCodeSize; + constexpr int32_t SuperScalarLiteralPoolRefOffset = RandomXCodeSize + (RANDOMX_CACHE_MAX_ACCESSES - 1) * LiteralPoolSize + LiteralPoolOffset; + constexpr int32_t SuperScalarHashOffset = SuperScalarLiteralPoolOffset + SuperscalarLiteraPoolSize; + + constexpr int32_t unsigned32ToSigned2sCompl(uint32_t x) { + return (-1 == ~0) ? (int32_t)x : (x > INT32_MAX ? (-(int32_t)(UINT32_MAX - x) - 1) : (int32_t)x); + } + +#define MaskL1Shift (32 - RandomX_CurrentConfig.Log2_ScratchpadL1) +#define MaskL2Shift (32 - RandomX_CurrentConfig.Log2_ScratchpadL2) +#define MaskL3Shift (32 - RandomX_CurrentConfig.Log2_ScratchpadL3) + + constexpr int RcpLiteralsOffset = 144; + + constexpr int LiteralPoolReg = 3; //x3 + constexpr int SpadReg = 5; //x5 + constexpr int DataReg = 6; //x6 + constexpr int SuperscalarReg = 7; //x7 + constexpr int SshTmp1Reg = 28; //x28 + constexpr int SshTmp2Reg = 29; //x29 + constexpr int SshPoolReg = 30; //x30 + constexpr int SshRcpReg = 31; //x31 + constexpr int Tmp1Reg = 8; //x8 + constexpr int Tmp2Reg = 9; //x9 + constexpr int Tmp1RegF = 24; //f24 + constexpr int Tmp2RegF = 25; //f25 + constexpr int MaskL1Reg = 10; //x10 + constexpr int MaskL2Reg = 11; //x11 + constexpr int MaskFscalReg = 12; //x12 + constexpr int MaskEclear = 13; //x13 + constexpr int MaskEsetLo = 14; //x14 + constexpr int MaskEsetHi = 15; //x15 + constexpr int MaskL3Reg = 1; //x1 + constexpr int ReturnReg = 1; //x1 + constexpr int SpAddr0Reg = 26; //x26 + constexpr int OffsetXC = -8; //x8-x15 + constexpr int OffsetR = 16; //x16-x23 + constexpr int OffsetF = 0; //f0-f7 + constexpr int OffsetE = 8; //f8-f15 + constexpr int OffsetA = 16; //f16-f23 + constexpr int OffsetRcp = 28; //x28-x31 + constexpr int OffsetRcpF = 22; //f26-f31 + constexpr int OffsetSsh = 8; //x8-x15 + + //destination register (bit 7+) + constexpr int rvrd(int reg) { + return reg << 7; + } + + //first source register (bit 15+) + constexpr int rvrs1(int reg) { + return reg << 15; + } + + //second source register (bit 20+) + constexpr int rvrs2(int reg) { + return reg << 20; + } + + //compressed source register (bit 2+) + constexpr int rvcrs(int reg) { + return reg << 2; + } + + //base instruction: {op} x{rd}, x{rs1}, x{rs2} + constexpr uint32_t rvi(uint32_t op, int rd, int rs1, int rs2 = 0) { + return op | rvrs2(rs2) | rvrs1(rs1) | rvrd(rd); + } + + //compressed instruction: op x{rd}, x{rs} + constexpr uint16_t rvc(uint16_t op, int rd, int rs) { + return op | rvrd(rd) | rvcrs(rs); + } + + //compressed instruction: op x{rd}, imm6 + constexpr uint16_t rvc(uint16_t op, int imm5, int rd, int imm40) { + return op | (imm5 << 12) | rvrd(rd) | (imm40 << 2); + } + + constexpr int regR(int reg) { + return reg + OffsetR; + } + + constexpr int regLoA(int reg) { + return 2 * reg + OffsetA; + } + + constexpr int regHiA(int reg) { + return 2 * reg + OffsetA + 1; + } + + constexpr int regLoF(int reg) { + return 2 * reg + OffsetF; + } + + constexpr int regHiF(int reg) { + return 2 * reg + OffsetF + 1; + } + + constexpr int regLoE(int reg) { + return 2 * reg + OffsetE; + } + + constexpr int regHiE(int reg) { + return 2 * reg + OffsetE 
+ 1; + } + + constexpr int regRcp(int reg) { + return reg + OffsetRcp; + } + + constexpr int regRcpF(int reg) { + return reg + OffsetRcpF; + } + + constexpr int regSS(int reg) { + return reg + OffsetSsh; + } + + static const uint8_t* codeLiterals = (uint8_t*)&randomx_riscv64_literals; + static const uint8_t* codeLiteralsEnd = (uint8_t*)&randomx_riscv64_literals_end; + static const uint8_t* codeDataInit = (uint8_t*)&randomx_riscv64_data_init; + static const uint8_t* codeFixDataCall = (uint8_t*)&randomx_riscv64_fix_data_call; + static const uint8_t* codePrologue = (uint8_t*)&randomx_riscv64_prologue; + static const uint8_t* codeLoopBegin = (uint8_t*)&randomx_riscv64_loop_begin; + static const uint8_t* codeDataRead = (uint8_t*)&randomx_riscv64_data_read; + static const uint8_t* codeDataReadLight = (uint8_t*)&randomx_riscv64_data_read_light; + static const uint8_t* codeFixLoopCall = (uint8_t*)&randomx_riscv64_fix_loop_call; + static const uint8_t* codeSpadStore = (uint8_t*)&randomx_riscv64_spad_store; + static const uint8_t* codeSpadStoreHardAes = (uint8_t*)&randomx_riscv64_spad_store_hardaes; + static const uint8_t* codeSpadStoreSoftAes = (uint8_t*)&randomx_riscv64_spad_store_softaes; + static const uint8_t* codeLoopEnd = (uint8_t*)&randomx_riscv64_loop_end; + static const uint8_t* codeFixContinueLoop = (uint8_t*)&randomx_riscv64_fix_continue_loop; + static const uint8_t* codeEpilogue = (uint8_t*)&randomx_riscv64_epilogue; + static const uint8_t* codeSoftAes = (uint8_t*)&randomx_riscv64_softaes; + static const uint8_t* codeProgramEnd = (uint8_t*)&randomx_riscv64_program_end; + static const uint8_t* codeSshInit = (uint8_t*)&randomx_riscv64_ssh_init; + static const uint8_t* codeSshLoad = (uint8_t*)&randomx_riscv64_ssh_load; + static const uint8_t* codeSshPrefetch = (uint8_t*)&randomx_riscv64_ssh_prefetch; + static const uint8_t* codeSshEnd = (uint8_t*)&randomx_riscv64_ssh_end; + + static const int32_t sizeLiterals = codeLiteralsEnd - codeLiterals; + static const int32_t sizeDataInit = codePrologue - codeDataInit; + static const int32_t sizePrologue = codeLoopBegin - codePrologue; + static const int32_t sizeLoopBegin = codeDataRead - codeLoopBegin; + static const int32_t sizeDataRead = codeDataReadLight - codeDataRead; + static const int32_t sizeDataReadLight = codeSpadStore - codeDataReadLight; + static const int32_t sizeSpadStore = codeSpadStoreHardAes - codeSpadStore; + static const int32_t sizeSpadStoreSoftAes = codeLoopEnd - codeSpadStoreSoftAes; + static const int32_t sizeLoopEnd = codeEpilogue - codeLoopEnd; + static const int32_t sizeEpilogue = codeSoftAes - codeEpilogue; + static const int32_t sizeSoftAes = codeProgramEnd - codeSoftAes; + static const int32_t sizeSshInit = codeSshLoad - codeSshInit; + static const int32_t sizeSshLoad = codeSshPrefetch - codeSshLoad; + static const int32_t sizeSshPrefetch = codeSshEnd - codeSshPrefetch; + + static const int32_t offsetFixDataCall = codeFixDataCall - codeDataInit; + static const int32_t offsetFixLoopCall = codeFixLoopCall - codeDataReadLight; + static const int32_t offsetFixContinueLoop = codeFixContinueLoop - codeLoopEnd; + + static const int32_t LoopTopPos = LiteralPoolSize + sizeDataInit + sizePrologue; + static const int32_t RandomXCodePos = LoopTopPos + sizeLoopBegin; + + static void clearCache(CodeBuffer& buf) { +#ifdef __GNUC__ + __builtin___clear_cache((char*)buf.code, (char*)(buf.code + CodeSize)); +#endif + } + + //emits code to calculate: x{dst} = x{src} + {imm32} + //takes 1-3 isns, 2-10 bytes + static void 
emitImm32(CodeBuffer& buf, int32_t imm, int dst, int src = 0, int tmp = 0) { + + //lower 12 bits + int32_t limm = (imm << 20) >> 20; + //upper 20 bits + int32_t uimm = (imm >> 12) + (limm < 0); + + //If there are no upper bits, the whole thing + //can be done with a single instruction. + if (uimm == 0) { + //addi x{dst}, x{src}, {limm} + buf.emit(rvi(rv64::ADDI, dst, src, limm)); + return; + } + + //dst1 is the register where imm will be materialized + int dst1 = src != dst ? dst : tmp; + assert(dst1 != 0); + //src1 is the register that will be added to the result + int src1 = src != dst ? src : dst1; + + //load upper bits + if (uimm >= -32 && uimm <= 31) { + //c.lui x{dst1}, {uimm} + buf.emit(rvc(rv64::C_LUI, (uimm < 0), dst1, (uimm & 31))); + } + else { + //lui x{dst1}, {uimm} + buf.emit(rv64::LUI | (uimm << 12) | rvrd(dst1)); + } + //load lower bits + if (limm != 0) { + //Note: this must be addiw NOT addi, otherwise the upper 32 bits + //of the 64-bit register will be incorrect. + //addiw x{dst1}, x{dst1}, {limm} + buf.emit(rvi(rv64::ADDIW, dst1, dst1, limm)); + } + //add src + if (src1 != 0) { + //c.add x{dst}, x{src1} + buf.emit(rvc(rv64::C_ADD, dst, src1)); + } + } + + //x9 = &Scratchpad[isn.imm] + //takes 3 isns, 10 bytes + static void genAddressRegImm(CodeBuffer& buf, const Instruction& isn) { + //signed offset 8-byte aligned + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()) & ScratchpadL3Mask; + //x9 = x5 + {imm} + emitImm32(buf, imm, Tmp2Reg, SpadReg, Tmp1Reg); + } + + //x9 = &Scratchpad[isn.src + isn.imm] (for reading) + //takes 5 isns, 12 bytes + static void genAddressReg(CodeBuffer& buf, const Instruction& isn) { + int shift, maskReg; + if (isn.getModMem()) { + shift = MaskL1Shift; + maskReg = MaskL1Reg; + } + else { + shift = MaskL2Shift; + maskReg = MaskL2Reg; + } + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + imm = (imm << shift) >> shift; + //x9 = x{src} + {imm} + emitImm32(buf, imm, Tmp2Reg, regR(isn.src), Tmp1Reg); + //c.and x9, x{maskReg} + buf.emit(rvc(rv64::C_AND, (Tmp2Reg + OffsetXC), (maskReg + OffsetXC))); + //c.add x9, x{spadReg} + buf.emit(rvc(rv64::C_ADD, Tmp2Reg, SpadReg)); + } + + //x8 = Scratchpad[isn] + static void loadFromScratchpad(CodeBuffer& buf, const Instruction& isn) { + if (isn.src != isn.dst) { + //x9 = &Scratchpad[isn.src + isn.imm] + genAddressReg(buf, isn); + } + else { + ///x9 = &Scratchpad[isn.imm] + genAddressRegImm(buf, isn); + } + //c.ld x8, 0(x9) + buf.emit(rvc(rv64::C_LD, Tmp2Reg + OffsetXC, Tmp1Reg + OffsetXC)); + } + + //x9 = &Scratchpad[isn.dst + isn.imm32] (for writing) + //takes 5 isns, 12-16 bytes + static void genAddressRegDst(CodeBuffer& buf, const Instruction& isn) { + if (isn.getModCond() < StoreL3Condition) { + int shift, maskReg; + if (isn.getModMem()) { + shift = MaskL1Shift; + maskReg = MaskL1Reg; + } + else { + shift = MaskL2Shift; + maskReg = MaskL2Reg; + } + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + imm = (imm << shift) >> shift; + //x9 = x{dst} + {imm} + emitImm32(buf, imm, Tmp2Reg, regR(isn.dst), Tmp1Reg); + //c.and x9, x{maskReg} + buf.emit(rvc(rv64::C_AND, Tmp2Reg + OffsetXC, maskReg + OffsetXC)); + //c.add x9, x5 + buf.emit(rvc(rv64::C_ADD, Tmp2Reg, SpadReg)); + } + else { + int shift = MaskL3Shift; + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + imm = (imm << shift) >> shift; + //x9 = x{dst} + {imm} + emitImm32(buf, imm, Tmp2Reg, regR(isn.dst), Tmp1Reg); + //and x9, x9, x1 + buf.emit(rvi(rv64::AND, Tmp2Reg, Tmp2Reg, MaskL3Reg)); + //c.add x9, x5 + 
buf.emit(rvc(rv64::C_ADD, Tmp2Reg, SpadReg)); + } + } + + static void emitRcpLiteral1(CodeBuffer& buf, uint64_t literal) { + //first 238 at positive offsets + if (buf.rcpCount < 238) { + buf.emitAt(LiteralPoolOffset + RcpLiteralsOffset + buf.rcpCount * 8, literal); + buf.rcpCount++; + } + //next 256 at negative offsets + else if (buf.rcpCount < 494) { + buf.emitAt(buf.rcpCount * 8 - (2048 - RcpLiteralsOffset), literal); + buf.rcpCount++; + } + else { + //checked at compile time, but double-check here + throw std::runtime_error("Literal pool overflow"); + } + } + + static void emitRcpLiteral2(CodeBuffer& buf, uint64_t literal, bool lastLiteral) { + //store the current literal in the pool + int32_t offset = 2040 - buf.rcpCount * 8; + buf.emitAt(SuperScalarLiteralPoolRefOffset + offset, literal); + buf.rcpCount++; + if (lastLiteral) { + return; + } + //load the next literal + offset -= 8; + int32_t imm = offset & 0xfff; + //ld x31, {offset}(x30) + buf.emit(rvi(rv64::LD, SshRcpReg, SshPoolReg, imm)); + if (imm == 0x800) { + //move pool pointer back 4KB + //c.lui x29, 0xfffff + buf.emit(rvc(rv64::C_LUI, 1, SshTmp2Reg, 31)); + //c.add x30, x29 + buf.emit(rvc(rv64::C_ADD, SshPoolReg, SshTmp2Reg)); + } + } + + static void emitJump(CodeBuffer& buf, int dst, int32_t codePos, int32_t targetPos) { + int32_t imm = targetPos - codePos; + int32_t imm20 = (imm < 0) << 11; + int32_t imm1912 = (imm >> 7) & 8160; + int32_t imm11 = (imm >> 11) & 1; + int32_t imm101 = imm & 2046; + //jal x{dst}, {imm} + buf.emitAt(codePos, rvi(rv64::JAL, dst + imm1912, 0, imm20 + imm101 + imm11)); + } + + static void emitInstruction(CompilerState& state, Instruction isn, int i) { + state.instructionOffsets[i] = state.codePos; + (*JitCompilerRV64::engine[isn.opcode])(state, isn, i); + } + + static void emitProgramPrefix(CompilerState& state, Program& prog, ProgramConfiguration& pcfg) { + state.codePos = RandomXCodePos; + state.rcpCount = 0; + state.emitAt(LiteralPoolOffset + sizeLiterals, pcfg.eMask[0]); + state.emitAt(LiteralPoolOffset + sizeLiterals + 8, pcfg.eMask[1]); + for (unsigned i = 0; i < RegistersCount; ++i) { + state.registerUsage[i] = -1; + } + for (unsigned i = 0; i < prog.getSize(); ++i) { + Instruction instr = prog(i); + instr.src %= RegistersCount; + instr.dst %= RegistersCount; + emitInstruction(state, instr, i); + } + } + + static void emitProgramSuffix(CompilerState& state, ProgramConfiguration& pcfg) { + state.emit(codeSpadStore, sizeSpadStore); + int32_t fixPos = state.codePos; + state.emit(codeLoopEnd, sizeLoopEnd); + //xor x26, x{readReg0}, x{readReg1} + state.emitAt(fixPos, rvi(rv64::XOR, SpAddr0Reg, regR(pcfg.readReg0), regR(pcfg.readReg1))); + fixPos += offsetFixContinueLoop; + //j LoopTop + emitJump(state, 0, fixPos, LoopTopPos); + state.emit(codeEpilogue, sizeEpilogue); + } + + static void generateSuperscalarCode(CodeBuffer& buf, Instruction isn, bool lastLiteral) { + switch ((SuperscalarInstructionType)isn.opcode) + { + case randomx::SuperscalarInstructionType::ISUB_R: + //c.sub x{dst}, x{src} + buf.emit(rvc(rv64::C_SUB, regSS(isn.dst) + OffsetXC, regSS(isn.src) + OffsetXC)); + break; + case randomx::SuperscalarInstructionType::IXOR_R: + //c.xor x{dst}, x{src} + buf.emit(rvc(rv64::C_XOR, regSS(isn.dst) + OffsetXC, regSS(isn.src) + OffsetXC)); + break; + case randomx::SuperscalarInstructionType::IADD_RS: + { + int shift = isn.getModShift(); + if (shift == 0) { + //c.add x{dst}, x{src} + buf.emit(rvc(rv64::C_ADD, regSS(isn.dst), regSS(isn.src))); + } + else { +#ifdef __riscv_zba + //sh{1,2,3}add 
x{dst}, x{src}, x{dst} + buf.emit(rv64::SHXADD | rvrs2(regSS(isn.dst)) | rvrs1(regSS(isn.src)) | (shift << 13) | rvrd(regSS(isn.dst))); +#else + //slli x28, x{src}, {shift} + buf.emit(rvi(rv64::SLLI, SshTmp1Reg, regSS(isn.src), shift)); + //c.add x{dst}, x28 + buf.emit(rvc(rv64::C_ADD, regSS(isn.dst), SshTmp1Reg)); +#endif + } + } + break; + case randomx::SuperscalarInstructionType::IMUL_R: + //mul x{dst}, x{dst}, x{src} + buf.emit(rvi(rv64::MUL, regSS(isn.dst), regSS(isn.dst), regSS(isn.src))); + break; + case randomx::SuperscalarInstructionType::IROR_C: + { +#ifdef __riscv_zbb + int32_t imm = isn.getImm32() & 63; + //rori x{dst}, x{dst}, {imm} + buf.emit(rvi(rv64::RORI, regSS(isn.dst), regSS(isn.dst), imm)); +#else + int32_t immr = isn.getImm32() & 63; + int32_t imml = -immr & 63; + int32_t imml5 = imml >> 5; + int32_t imml40 = imml & 31; + //srli x28, x{dst}, {immr} + buf.emit(rvi(rv64::SRLI, SshTmp1Reg, regSS(isn.dst), immr)); + //c.slli x{dst}, {imml} + buf.emit(rvc(rv64::C_SLLI, imml5, regSS(isn.dst), imml40)); + //or x{dst}, x{dst}, x28 + buf.emit(rvi(rv64::OR, regSS(isn.dst), regSS(isn.dst), SshTmp1Reg)); +#endif + } + break; + case randomx::SuperscalarInstructionType::IADD_C7: + case randomx::SuperscalarInstructionType::IADD_C8: + case randomx::SuperscalarInstructionType::IADD_C9: + { + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + //x{dst} = x{dst} + {imm} + emitImm32(buf, imm, regSS(isn.dst), regSS(isn.dst), SshTmp1Reg); + } + break; + case randomx::SuperscalarInstructionType::IXOR_C7: + case randomx::SuperscalarInstructionType::IXOR_C8: + case randomx::SuperscalarInstructionType::IXOR_C9: + { + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + //x28 = {imm} + emitImm32(buf, imm, SshTmp1Reg); + //xor x{dst}, x{dst}, x28 + buf.emit(rvi(rv64::XOR, regSS(isn.dst), regSS(isn.dst), SshTmp1Reg)); + } + break; + case randomx::SuperscalarInstructionType::IMULH_R: + //mulhu x{dst}, x{dst}, x{src} + buf.emit(rvi(rv64::MULHU, regSS(isn.dst), regSS(isn.dst), regSS(isn.src))); + break; + case randomx::SuperscalarInstructionType::ISMULH_R: + //mulh x{dst}, x{dst}, x{src} + buf.emit(rvi(rv64::MULH, regSS(isn.dst), regSS(isn.dst), regSS(isn.src))); + break; + case randomx::SuperscalarInstructionType::IMUL_RCP: + //mul x{dst}, x{dst}, x31 + buf.emit(rvi(rv64::MUL, regSS(isn.dst), regSS(isn.dst), SshRcpReg)); + //load the next literal into x31 + emitRcpLiteral2(buf, randomx_reciprocal(isn.getImm32()), lastLiteral); + break; + default: + UNREACHABLE; + } + } + + size_t JitCompilerRV64::getCodeSize() { + return CodeSize; + } + + JitCompilerRV64::JitCompilerRV64(bool hugePagesEnable, bool) { + state.code = static_cast<uint8_t*>(allocExecutableMemory(CodeSize, hugePagesJIT && hugePagesEnable)); + state.emitAt(LiteralPoolOffset, codeLiterals, sizeLiterals); + + const uint32_t L1_Mask = RandomX_CurrentConfig.ScratchpadL1_Size - 8; + const uint32_t L2_Mask = RandomX_CurrentConfig.ScratchpadL2_Size - 8; + const uint32_t L3_Mask = RandomX_CurrentConfig.ScratchpadL3_Size - 64; + const uint32_t DatasetBaseSize_Mask = RandomX_CurrentConfig.DatasetBaseSize - 64; + + state.emitAt(LiteralPoolOffset + 80, reinterpret_cast<const uint8_t*>(&L1_Mask), sizeof(L1_Mask)); + state.emitAt(LiteralPoolOffset + 84, reinterpret_cast<const uint8_t*>(&L2_Mask), sizeof(L2_Mask)); + state.emitAt(LiteralPoolOffset + 88, reinterpret_cast<const uint8_t*>(&L3_Mask), sizeof(L3_Mask)); + state.emitAt(LiteralPoolOffset + 92, reinterpret_cast<const uint8_t*>(&DatasetBaseSize_Mask), sizeof(DatasetBaseSize_Mask)); + + state.emitAt(LiteralPoolSize, codeDataInit, sizeDataInit + sizePrologue
+ sizeLoopBegin); + entryDataInit = state.code + LiteralPoolSize; + entryProgram = state.code + LiteralPoolSize + sizeDataInit; + //jal x1, SuperscalarHash + emitJump(state, ReturnReg, LiteralPoolSize + offsetFixDataCall, SuperScalarHashOffset); + + vectorCodeSize = ((uint8_t*)randomx_riscv64_vector_sshash_end) - ((uint8_t*)randomx_riscv64_vector_sshash_begin); + vectorCode = static_cast<uint8_t*>(allocExecutableMemory(vectorCodeSize, hugePagesJIT && hugePagesEnable)); + } + + JitCompilerRV64::~JitCompilerRV64() { + freePagedMemory(state.code, CodeSize); + freePagedMemory(vectorCode, vectorCodeSize); + } + + void JitCompilerRV64::enableWriting() const + { + xmrig::VirtualMemory::protectRW(entryDataInit, ExecutableSize); + + if (vectorCode) { + xmrig::VirtualMemory::protectRW(vectorCode, vectorCodeSize); + } + } + + void JitCompilerRV64::enableExecution() const + { + xmrig::VirtualMemory::protectRX(entryDataInit, ExecutableSize); + + if (vectorCode) { + xmrig::VirtualMemory::protectRX(vectorCode, vectorCodeSize); + } + } + + void JitCompilerRV64::generateProgram(Program& prog, ProgramConfiguration& pcfg, uint32_t) { + emitProgramPrefix(state, prog, pcfg); + int32_t fixPos = state.codePos; + state.emit(codeDataRead, sizeDataRead); + //xor x8, x{readReg2}, x{readReg3} + state.emitAt(fixPos, rvi(rv64::XOR, Tmp1Reg, regR(pcfg.readReg2), regR(pcfg.readReg3))); + emitProgramSuffix(state, pcfg); + clearCache(state); + } + + void JitCompilerRV64::generateProgramLight(Program& prog, ProgramConfiguration& pcfg, uint32_t datasetOffset) { + emitProgramPrefix(state, prog, pcfg); + int32_t fixPos = state.codePos; + state.emit(codeDataReadLight, sizeDataReadLight); + //xor x8, x{readReg2}, x{readReg3} + state.emitAt(fixPos, rvi(rv64::XOR, Tmp1Reg, regR(pcfg.readReg2), regR(pcfg.readReg3))); + int32_t imm = datasetOffset / CacheLineSize; + int32_t limm = (imm << 20) >> 20; + int32_t uimm = (imm >> 12) + (limm < 0); + //lui x9, {uimm} + state.emitAt(fixPos + 4, rv64::LUI | (uimm << 12) | rvrd(Tmp2Reg)); + //addi x9, x9, {limm} + state.emitAt(fixPos + 8, rvi(rv64::ADDI, Tmp2Reg, Tmp2Reg, limm)); + fixPos += offsetFixLoopCall; + //jal x1, SuperscalarHash + emitJump(state, ReturnReg, fixPos, SuperScalarHashOffset); + emitProgramSuffix(state, pcfg); + clearCache(state); + } + + template<size_t N> + void JitCompilerRV64::generateSuperscalarHash(SuperscalarProgram(&programs)[N]) { + if (optimizedDatasetInit > 0) { + entryDataInitOptimized = generateDatasetInitVectorRV64(vectorCode, vectorCodeSize, programs, RandomX_ConfigurationBase::CacheAccesses); + return; + } + + state.codePos = SuperScalarHashOffset; + state.rcpCount = 0; + state.emit(codeSshInit, sizeSshInit); + + std::pair<uint32_t, uint32_t> lastLiteral{ 0xFFFFFFFFUL, 0xFFFFFFFFUL }; + + for (int j = RandomX_ConfigurationBase::CacheAccesses - 1; (j >= 0) && (lastLiteral.first == 0xFFFFFFFFUL); --j) { + SuperscalarProgram& prog = programs[j]; + for (int i = prog.getSize() - 1; i >= 0; --i) { + if (prog(i).opcode == static_cast<uint8_t>(SuperscalarInstructionType::IMUL_RCP)) { + lastLiteral.first = j; + lastLiteral.second = i; + break; + } + } + } + + for (unsigned j = 0; j < RandomX_ConfigurationBase::CacheAccesses; ++j) { + SuperscalarProgram& prog = programs[j]; + for (unsigned i = 0; i < prog.getSize(); ++i) { + Instruction instr = prog(i); + generateSuperscalarCode(state, instr, (j == lastLiteral.first) && (i == lastLiteral.second)); + } + state.emit(codeSshLoad, sizeSshLoad); + if (j < RandomX_ConfigurationBase::CacheAccesses - 1) { + int32_t fixPos = state.codePos; + state.emit(codeSshPrefetch,
sizeSshPrefetch); + //and x7, x{addrReg}, x7 + state.emitAt(fixPos, rvi(rv64::AND, SuperscalarReg, regSS(prog.getAddressRegister()), SuperscalarReg)); + } + } + state.emit(rvc(rv64::C_RET, 0, 0)); + clearCache(state); + } + + template void JitCompilerRV64::generateSuperscalarHash(SuperscalarProgram(&)[RANDOMX_CACHE_MAX_ACCESSES]); + + DatasetInitFunc* JitCompilerRV64::getDatasetInitFunc() { + return (DatasetInitFunc*)((optimizedDatasetInit > 0) ? entryDataInitOptimized : entryDataInit); + } + + void JitCompilerRV64::v1_IADD_RS(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + int shift = isn.getModShift(); + if (shift == 0) { + //c.add x{dst}, x{src} + state.emit(rvc(rv64::C_ADD, regR(isn.dst), regR(isn.src))); + } + else { +#ifdef __riscv_zba + //sh{1,2,3}add x{dst}, x{src}, x{dst} + state.emit(rv64::SHXADD | rvrs2(regR(isn.dst)) | rvrs1(regR(isn.src)) | (shift << 13) | rvrd(regR(isn.dst))); +#else + //slli x8, x{src}, {shift} + state.emit(rvi(rv64::SLLI, Tmp1Reg, regR(isn.src), shift)); + //c.add x{dst}, x8 + state.emit(rvc(rv64::C_ADD, regR(isn.dst), Tmp1Reg)); +#endif + } + if (isn.dst == RegisterNeedsDisplacement) { + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + //x{dst} = x{dst} + {imm} + emitImm32(state, imm, regR(isn.dst), regR(isn.dst), Tmp1Reg); + } + } + + void JitCompilerRV64::v1_IADD_M(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + loadFromScratchpad(state, isn); + //c.add x{dst}, x8 + state.emit(rvc(rv64::C_ADD, regR(isn.dst), Tmp1Reg)); + } + + void JitCompilerRV64::v1_ISUB_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + if (isn.src != isn.dst) { + //sub x{dst}, x{dst}, x{src} + state.emit(rvi(rv64::SUB, regR(isn.dst), regR(isn.dst), regR(isn.src))); + } + else { + int32_t imm = unsigned32ToSigned2sCompl(-isn.getImm32()); //convert to add + //x{dst} = x{dst} + {-imm} + emitImm32(state, imm, regR(isn.dst), regR(isn.dst), Tmp1Reg); + } + } + + void JitCompilerRV64::v1_ISUB_M(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + loadFromScratchpad(state, isn); + //sub x{dst}, x{dst}, x8 + state.emit(rvi(rv64::SUB, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + + void JitCompilerRV64::v1_IMUL_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + if (isn.src != isn.dst) { + //mul x{dst}, x{dst}, x{src} + state.emit(rvi(rv64::MUL, regR(isn.dst), regR(isn.dst), regR(isn.src))); + } + else { + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + //x8 = {imm} + emitImm32(state, imm, Tmp1Reg); + //mul x{dst}, x{dst}, x8 + state.emit(rvi(rv64::MUL, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + } + + void JitCompilerRV64::v1_IMUL_M(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + loadFromScratchpad(state, isn); + //mul x{dst}, x{dst}, x8 + state.emit(rvi(rv64::MUL, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + + void JitCompilerRV64::v1_IMULH_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + //mulhu x{dst}, x{dst}, x{src} + state.emit(rvi(rv64::MULHU, regR(isn.dst), regR(isn.dst), regR(isn.src))); + } + + void JitCompilerRV64::v1_IMULH_M(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + loadFromScratchpad(state, isn); + //mulhu x{dst}, x{dst}, x8 + state.emit(rvi(rv64::MULHU, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + + void JitCompilerRV64::v1_ISMULH_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + //mulh x{dst}, x{dst}, x{src} + state.emit(rvi(rv64::MULH, regR(isn.dst), regR(isn.dst), regR(isn.src))); + } + + void JitCompilerRV64::v1_ISMULH_M(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + 
loadFromScratchpad(state, isn); + //mulh x{dst}, x{dst}, x8 + state.emit(rvi(rv64::MULH, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + + void JitCompilerRV64::v1_IMUL_RCP(HANDLER_ARGS) { + const uint32_t divisor = isn.getImm32(); + if (!isZeroOrPowerOf2(divisor)) { + state.registerUsage[isn.dst] = i; + if (state.rcpCount < 4) { + //mul x{dst}, x{dst}, x{rcp} + state.emit(rvi(rv64::MUL, regR(isn.dst), regR(isn.dst), regRcp(state.rcpCount))); + } + else if (state.rcpCount < 10) { + //fmv.x.d x8, f{rcp} + state.emit(rvi(rv64::FMV_X_D, Tmp1Reg, regRcpF(state.rcpCount))); + //mul x{dst}, x{dst}, x8 + state.emit(rvi(rv64::MUL, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + else { + int32_t offset = RcpLiteralsOffset + state.rcpCount * 8; + //ld x8, {offset}(x3) + state.emit(rvi(rv64::LD, Tmp1Reg, LiteralPoolReg, offset)); + //mul x{dst}, x{dst}, x8 + state.emit(rvi(rv64::MUL, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + emitRcpLiteral1(state, randomx_reciprocal_fast(divisor)); + } + } + + void JitCompilerRV64::v1_INEG_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + //sub x{dst}, x0, x{dst} + state.emit(rvi(rv64::SUB, regR(isn.dst), 0, regR(isn.dst))); + } + + void JitCompilerRV64::v1_IXOR_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + if (isn.src != isn.dst) { + //xor x{dst}, x{dst}, x{src} + state.emit(rvi(rv64::XOR, regR(isn.dst), regR(isn.dst), regR(isn.src))); + } + else { + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + //x8 = {imm} + emitImm32(state, imm, Tmp1Reg); + //xor x{dst}, x{dst}, x8 + state.emit(rvi(rv64::XOR, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + } + + void JitCompilerRV64::v1_IXOR_M(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + loadFromScratchpad(state, isn); + //xor x{dst}, x{dst}, x8 + state.emit(rvi(rv64::XOR, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + + void JitCompilerRV64::v1_IROR_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; +#ifdef __riscv_zbb + if (isn.src != isn.dst) { + //ror x{dst}, x{dst}, x{src} + state.emit(rvi(rv64::ROR, regR(isn.dst), regR(isn.dst), regR(isn.src))); + } + else { + int32_t imm = isn.getImm32() & 63; + //rori x{dst}, x{dst}, {imm} + state.emit(rvi(rv64::RORI, regR(isn.dst), regR(isn.dst), imm)); + } +#else + if (isn.src != isn.dst) { + //sub x8, x0, x{src} + state.emit(rvi(rv64::SUB, Tmp1Reg, 0, regR(isn.src))); + //srl x9, x{dst}, x{src} + state.emit(rvi(rv64::SRL, Tmp2Reg, regR(isn.dst), regR(isn.src))); + //sll x{dst}, x{dst}, x8 + state.emit(rvi(rv64::SLL, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + //or x{dst}, x{dst}, x9 + state.emit(rvi(rv64::OR, regR(isn.dst), regR(isn.dst), Tmp2Reg)); + } + else { + int32_t immr = isn.getImm32() & 63; + int32_t imml = -immr & 63; + int32_t imml5 = imml >> 5; + int32_t imml40 = imml & 31; + //srli x8, x{dst}, {immr} + state.emit(rvi(rv64::SRLI, Tmp1Reg, regR(isn.dst), immr)); + //c.slli x{dst}, {imml} + state.emit(rvc(rv64::C_SLLI, imml5, regR(isn.dst), imml40)); + //or x{dst}, x{dst}, x8 + state.emit(rvi(rv64::OR, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } +#endif + } + + void JitCompilerRV64::v1_IROL_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; +#ifdef __riscv_zbb + if (isn.src != isn.dst) { + //rol x{dst}, x{dst}, x{src} + state.emit(rvi(rv64::ROL, regR(isn.dst), regR(isn.dst), regR(isn.src))); + } + else { + int32_t imm = -isn.getImm32() & 63; + //rori x{dst}, x{dst}, {imm} + state.emit(rvi(rv64::RORI, regR(isn.dst), regR(isn.dst), imm)); + } +#else + if (isn.src != isn.dst) { + //sub x8, x0, x{src} + state.emit(rvi(rv64::SUB, 
Tmp1Reg, 0, regR(isn.src))); + //sll x9, x{dst}, x{src} + state.emit(rvi(rv64::SLL, Tmp2Reg, regR(isn.dst), regR(isn.src))); + //srl x{dst}, x{dst}, x8 + state.emit(rvi(rv64::SRL, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + //or x{dst}, x{dst}, x9 + state.emit(rvi(rv64::OR, regR(isn.dst), regR(isn.dst), Tmp2Reg)); + } + else { + int32_t imml = isn.getImm32() & 63; + int32_t immr = -imml & 63; + int32_t imml5 = imml >> 5; + int32_t imml40 = imml & 31; + //srli x8, x{dst}, {immr} + state.emit(rvi(rv64::SRLI, Tmp1Reg, regR(isn.dst), immr)); + //c.slli x{dst}, {imml} + state.emit(rvc(rv64::C_SLLI, imml5, regR(isn.dst), imml40)); + //or x{dst}, x{dst}, x8 + state.emit(rvi(rv64::OR, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } +#endif + } + + void JitCompilerRV64::v1_ISWAP_R(HANDLER_ARGS) { + if (isn.src != isn.dst) { + state.registerUsage[isn.dst] = i; + state.registerUsage[isn.src] = i; + //c.mv x8, x{dst} + state.emit(rvc(rv64::C_MV, Tmp1Reg, regR(isn.dst))); + //c.mv x{dst}, x{src} + state.emit(rvc(rv64::C_MV, regR(isn.dst), regR(isn.src))); + //c.mv x{src}, x8 + state.emit(rvc(rv64::C_MV, regR(isn.src), Tmp1Reg)); + } + } + + void JitCompilerRV64::v1_FSWAP_R(HANDLER_ARGS) { + //fmv.d f24, f{dst_lo} + state.emit(rvi(rv64::FMV_D, Tmp1RegF, regLoF(isn.dst), regLoF(isn.dst))); + //fmv.d f{dst_lo}, f{dst_hi} + state.emit(rvi(rv64::FMV_D, regLoF(isn.dst), regHiF(isn.dst), regHiF(isn.dst))); + //fmv.d f{dst_hi}, f24 + state.emit(rvi(rv64::FMV_D, regHiF(isn.dst), Tmp1RegF, Tmp1RegF)); + } + + void JitCompilerRV64::v1_FADD_R(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + isn.src %= RegisterCountFlt; + //fadd.d f{dst_lo}, f{dst_lo}, f{src_lo} + state.emit(rvi(rv64::FADD_D, regLoF(isn.dst), regLoF(isn.dst), regLoA(isn.src))); + //fadd.d f{dst_hi}, f{dst_hi}, f{src_hi} + state.emit(rvi(rv64::FADD_D, regHiF(isn.dst), regHiF(isn.dst), regHiA(isn.src))); + } + + void JitCompilerRV64::v1_FADD_M(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + //x9 = mem + genAddressReg(state, isn); + //lw x8, 0(x9) + state.emit(rvc(rv64::C_LW, Tmp2Reg + OffsetXC, Tmp1Reg + OffsetXC)); + //lw x9, 4(x9) + state.emit(rvc(rv64::C_LW, Tmp2Reg + OffsetXC, 16 + Tmp2Reg + OffsetXC)); + //fcvt.d.w f24, x8 + state.emit(rvi(rv64::FCVT_D_W, Tmp1RegF, Tmp1Reg)); + //fcvt.d.w f25, x9 + state.emit(rvi(rv64::FCVT_D_W, Tmp2RegF, Tmp2Reg)); + //fadd.d f{dst_lo}, f{dst_lo}, f24 + state.emit(rvi(rv64::FADD_D, regLoF(isn.dst), regLoF(isn.dst), Tmp1RegF)); + //fadd.d f{dst_hi}, f{dst_hi}, f25 + state.emit(rvi(rv64::FADD_D, regHiF(isn.dst), regHiF(isn.dst), Tmp2RegF)); + } + + void JitCompilerRV64::v1_FSUB_R(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + isn.src %= RegisterCountFlt; + //fsub.d f{dst_lo}, f{dst_lo}, f{src_lo} + state.emit(rvi(rv64::FSUB_D, regLoF(isn.dst), regLoF(isn.dst), regLoA(isn.src))); + //fsub.d f{dst_hi}, f{dst_hi}, f{src_hi} + state.emit(rvi(rv64::FSUB_D, regHiF(isn.dst), regHiF(isn.dst), regHiA(isn.src))); + } + + void JitCompilerRV64::v1_FSUB_M(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + //x9 = mem + genAddressReg(state, isn); + //c.lw x8, 0(x9) + state.emit(rvc(rv64::C_LW, Tmp2Reg + OffsetXC, Tmp1Reg + OffsetXC)); + //c.lw x9, 4(x9) + state.emit(rvc(rv64::C_LW, Tmp2Reg + OffsetXC, 16 + Tmp2Reg + OffsetXC)); + //fcvt.d.w f24, x8 + state.emit(rvi(rv64::FCVT_D_W, Tmp1RegF, Tmp1Reg)); + //fcvt.d.w f25, x9 + state.emit(rvi(rv64::FCVT_D_W, Tmp2RegF, Tmp2Reg)); + //fsub.d f{dst_lo}, f{dst_lo}, f24 + state.emit(rvi(rv64::FSUB_D, regLoF(isn.dst), regLoF(isn.dst), Tmp1RegF)); + //fsub.d f{dst_hi}, f{dst_hi}, f25 + 
state.emit(rvi(rv64::FSUB_D, regHiF(isn.dst), regHiF(isn.dst), Tmp2RegF)); + } + + void JitCompilerRV64::v1_FSCAL_R(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + //fmv.x.d x8, f{dst_lo} + state.emit(rvi(rv64::FMV_X_D, Tmp1Reg, regLoF(isn.dst))); + //fmv.x.d x9, f{dst_hi} + state.emit(rvi(rv64::FMV_X_D, Tmp2Reg, regHiF(isn.dst))); + //c.xor x8, x12 + state.emit(rvc(rv64::C_XOR, Tmp1Reg + OffsetXC, MaskFscalReg + OffsetXC)); + //c.xor x9, x12 + state.emit(rvc(rv64::C_XOR, Tmp2Reg + OffsetXC, MaskFscalReg + OffsetXC)); + //fmv.d.x f{dst_lo}, x8 + state.emit(rvi(rv64::FMV_D_X, regLoF(isn.dst), Tmp1Reg)); + //fmv.d.x f{dst_hi}, x9 + state.emit(rvi(rv64::FMV_D_X, regHiF(isn.dst), Tmp2Reg)); + } + + void JitCompilerRV64::v1_FMUL_R(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + isn.src %= RegisterCountFlt; + //fmul.d f{dst_lo}, f{dst_lo}, f{src_lo} + state.emit(rvi(rv64::FMUL_D, regLoE(isn.dst), regLoE(isn.dst), regLoA(isn.src))); + //fmul.d f{dst_hi}, f{dst_hi}, f{src_hi} + state.emit(rvi(rv64::FMUL_D, regHiE(isn.dst), regHiE(isn.dst), regHiA(isn.src))); + } + + void JitCompilerRV64::v1_FDIV_M(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + //x9 = mem + genAddressReg(state, isn); + //lw x8, 0(x9) + state.emit(rvc(rv64::C_LW, Tmp2Reg + OffsetXC, Tmp1Reg + OffsetXC)); + //lw x9, 4(x9) + state.emit(rvc(rv64::C_LW, Tmp2Reg + OffsetXC, 16 + Tmp2Reg + OffsetXC)); + //fcvt.d.w f24, x8 + state.emit(rvi(rv64::FCVT_D_W, Tmp1RegF, Tmp1Reg)); + //fcvt.d.w f25, x9 + state.emit(rvi(rv64::FCVT_D_W, Tmp2RegF, Tmp2Reg)); + //fmv.x.d x8, f24 + state.emit(rvi(rv64::FMV_X_D, Tmp1Reg, Tmp1RegF)); + //fmv.x.d x9, f25 + state.emit(rvi(rv64::FMV_X_D, Tmp2Reg, Tmp2RegF)); + //c.and x8, x13 + state.emit(rvc(rv64::C_AND, Tmp1Reg + OffsetXC, MaskEclear + OffsetXC)); + //c.and x9, x13 + state.emit(rvc(rv64::C_AND, Tmp2Reg + OffsetXC, MaskEclear + OffsetXC)); + //c.or x8, x14 + state.emit(rvc(rv64::C_OR, Tmp1Reg + OffsetXC, MaskEsetLo + OffsetXC)); + //c.or x9, x15 + state.emit(rvc(rv64::C_OR, Tmp2Reg + OffsetXC, MaskEsetHi + OffsetXC)); + //fmv.d.x f24, x8 + state.emit(rvi(rv64::FMV_D_X, Tmp1RegF, Tmp1Reg)); + //fmv.d.x f25, x9 + state.emit(rvi(rv64::FMV_D_X, Tmp2RegF, Tmp2Reg)); + //fdiv.d f{dst_lo}, f{dst_lo}, f24 + state.emit(rvi(rv64::FDIV_D, regLoE(isn.dst), regLoE(isn.dst), Tmp1RegF)); + //fdiv.d f{dst_hi}, f{dst_hi}, f25 + state.emit(rvi(rv64::FDIV_D, regHiE(isn.dst), regHiE(isn.dst), Tmp2RegF)); + } + + void JitCompilerRV64::v1_FSQRT_R(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + //fsqrt.d f{dst_lo}, f{dst_lo} + state.emit(rvi(rv64::FSQRT_D, regLoE(isn.dst), regLoE(isn.dst))); + //fsqrt.d f{dst_hi}, f{dst_hi} + state.emit(rvi(rv64::FSQRT_D, regHiE(isn.dst), regHiE(isn.dst))); + } + + void JitCompilerRV64::v1_CBRANCH(HANDLER_ARGS) { + int reg = isn.dst; + int target = state.registerUsage[reg] + 1; + int shift = isn.getModCond() + RandomX_ConfigurationBase::JumpOffset; + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + imm |= (1UL << shift); + if (RandomX_ConfigurationBase::JumpOffset > 0 || shift > 0) + imm &= ~(1UL << (shift - 1)); + //x8 = branchMask + emitImm32(state, (int32_t)((1 << RandomX_ConfigurationBase::JumpBits) - 1) << shift, Tmp1Reg); + //x{dst} += {imm} + emitImm32(state, imm, regR(isn.dst), regR(isn.dst), Tmp2Reg); + //and x8, x8, x{dst} + state.emit(rvi(rv64::AND, Tmp1Reg, Tmp1Reg, regR(isn.dst))); + int32_t targetPos = state.instructionOffsets[target]; + int offset = targetPos - state.codePos; + if (offset >= -256) { //C.BEQZ only has a range of 256B + //c.beqz x8, {offset} + int 
imm8 = 1; //sign bit is always 1 + int imm21 = offset & 6; //offset[2:1] + int imm5 = (offset >> 5) & 1; //offset[5] + int imm43 = offset & 24; //offset[4:3] + int imm76 = (offset >> 3) & 24; //offset[7:6] + state.emit(rvc(rv64::C_BEQZ, imm8, imm43 + (Tmp1Reg + OffsetXC), imm76 + imm21 + imm5)); + } + else if (offset >= -4096) { //BEQ only has a range of 4KB + //beq x8, x0, offset + int imm12 = 1 << 11; //sign bit is always 1 + int imm105 = offset & 2016; //offset[10:5] + int imm41 = offset & 30; //offset[4:1] + int imm11 = (offset >> 11) & 1; //offset[11] + state.emit(rvi(rv64::BEQ, imm41 + imm11, Tmp1Reg, imm12 + imm105)); + } + else { + //c.bnez x8, +6 + state.emit(rvc(rv64::C_BNEZ, Tmp1Reg + OffsetXC, 6)); + //j targetPos + emitJump(state, 0, state.codePos, targetPos); + state.codePos += 4; + } + //mark all registers as used + for (unsigned j = 0; j < RegistersCount; ++j) { + state.registerUsage[j] = i; + } + } + + void JitCompilerRV64::v1_CFROUND(HANDLER_ARGS) { + int32_t imm = (isn.getImm32() - 2) & 63; //-2 to avoid a later left shift to multiply by 4 + if (imm != 0) { +#ifdef __riscv_zbb + //rori x8, x{src}, {imm} + state.emit(rvi(rv64::RORI, Tmp1Reg, regR(isn.src), imm)); +#else + int32_t imml = -imm & 63; + //srli x8, x{src}, {imm} + state.emit(rvi(rv64::SRLI, Tmp1Reg, regR(isn.src), imm)); + //slli x9, x{src}, {imml} + state.emit(rvi(rv64::SLLI, Tmp2Reg, regR(isn.src), imml)); + //c.or x8, x9 + state.emit(rvc(rv64::C_OR, Tmp1Reg + OffsetXC, Tmp2Reg + OffsetXC)); +#endif + //c.andi x8, 12 + state.emit(rvc(rv64::C_ANDI, Tmp1Reg + OffsetXC, 12)); + } + else { + //and x8, x{src}, 12 + state.emit(rvi(rv64::ANDI, Tmp1Reg, regR(isn.src), 12)); + } + //c.add x8, x3 + state.emit(rvc(rv64::C_ADD, Tmp1Reg, LiteralPoolReg)); + //c.lw x8, 64(x8) + state.emit(rvc(rv64::C_LW, Tmp1Reg + OffsetXC, 8 + Tmp1Reg + OffsetXC)); + //fsrm x8 + state.emit(rvi(rv64::FSRM, 0, Tmp1Reg, 0)); + } + + void JitCompilerRV64::v1_ISTORE(HANDLER_ARGS) { + genAddressRegDst(state, isn); + //sd x{src}, 0(x9) + state.emit(rvi(rv64::SD, 0, Tmp2Reg, regR(isn.src))); + } + + void JitCompilerRV64::v1_NOP(HANDLER_ARGS) { + } + +InstructionGeneratorRV64 JitCompilerRV64::engine[256] = {}; +} diff --git a/src/crypto/randomx/jit_compiler_rv64.hpp b/src/crypto/randomx/jit_compiler_rv64.hpp new file mode 100644 index 000000000..dbad88e1b --- /dev/null +++ b/src/crypto/randomx/jit_compiler_rv64.hpp @@ -0,0 +1,147 @@ +/* +Copyright (c) 2023 tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +#include +#include +#include +#include "crypto/randomx/common.hpp" +#include "crypto/randomx/jit_compiler_rv64_static.hpp" + +namespace randomx { + + struct CodeBuffer { + uint8_t* code; + int32_t codePos; + int32_t rcpCount; + + void emit(const uint8_t* src, int32_t len) { + memcpy(&code[codePos], src, len); + codePos += len; + } + + template<typename T> + void emit(T src) { + memcpy(&code[codePos], &src, sizeof(src)); + codePos += sizeof(src); + } + + void emitAt(int32_t codePos, const uint8_t* src, int32_t len) { + memcpy(&code[codePos], src, len); + } + + template<typename T> + void emitAt(int32_t codePos, T src) { + memcpy(&code[codePos], &src, sizeof(src)); + } + }; + + struct CompilerState : public CodeBuffer { + int32_t instructionOffsets[RANDOMX_PROGRAM_MAX_SIZE]; + int registerUsage[RegistersCount]; + }; + + class Program; + struct ProgramConfiguration; + class SuperscalarProgram; + class Instruction; + +#define HANDLER_ARGS randomx::CompilerState& state, randomx::Instruction isn, int i + typedef void(*InstructionGeneratorRV64)(HANDLER_ARGS); + + class JitCompilerRV64 { + public: + JitCompilerRV64(bool hugePagesEnable, bool optimizedInitDatasetEnable); + ~JitCompilerRV64(); + + void prepare() {} + void generateProgram(Program&, ProgramConfiguration&, uint32_t); + void generateProgramLight(Program&, ProgramConfiguration&, uint32_t); + + template<size_t N> + void generateSuperscalarHash(SuperscalarProgram(&programs)[N]); + + void generateDatasetInitCode() {} + + ProgramFunc* getProgramFunc() { + return (ProgramFunc*)entryProgram; + } + DatasetInitFunc* getDatasetInitFunc(); + uint8_t* getCode() { + return state.code; + } + size_t getCodeSize(); + + void enableWriting() const; + void enableExecution() const; + + static InstructionGeneratorRV64 engine[256]; + private: + CompilerState state; + + uint8_t* vectorCode; + size_t vectorCodeSize; + + void* entryDataInit; + void* entryDataInitOptimized; + void* entryProgram; + + public: + static void v1_IADD_RS(HANDLER_ARGS); + static void v1_IADD_M(HANDLER_ARGS); + static void v1_ISUB_R(HANDLER_ARGS); + static void v1_ISUB_M(HANDLER_ARGS); + static void v1_IMUL_R(HANDLER_ARGS); + static void v1_IMUL_M(HANDLER_ARGS); + static void v1_IMULH_R(HANDLER_ARGS); + static void v1_IMULH_M(HANDLER_ARGS); + static void v1_ISMULH_R(HANDLER_ARGS); + static void v1_ISMULH_M(HANDLER_ARGS); + static void v1_IMUL_RCP(HANDLER_ARGS); + static void v1_INEG_R(HANDLER_ARGS); + static void v1_IXOR_R(HANDLER_ARGS); + static void v1_IXOR_M(HANDLER_ARGS); + static void v1_IROR_R(HANDLER_ARGS); + static void v1_IROL_R(HANDLER_ARGS); + static void v1_ISWAP_R(HANDLER_ARGS); + static void v1_FSWAP_R(HANDLER_ARGS); + static void v1_FADD_R(HANDLER_ARGS); + static void v1_FADD_M(HANDLER_ARGS); + static void v1_FSUB_R(HANDLER_ARGS); + static void v1_FSUB_M(HANDLER_ARGS); + static void v1_FSCAL_R(HANDLER_ARGS); + static void v1_FMUL_R(HANDLER_ARGS); + static void v1_FDIV_M(HANDLER_ARGS); + static void v1_FSQRT_R(HANDLER_ARGS); + static void v1_CBRANCH(HANDLER_ARGS); + static
void v1_CFROUND(HANDLER_ARGS); + static void v1_ISTORE(HANDLER_ARGS); + static void v1_NOP(HANDLER_ARGS); + }; +} diff --git a/src/crypto/randomx/jit_compiler_rv64_static.S b/src/crypto/randomx/jit_compiler_rv64_static.S new file mode 100644 index 000000000..c4f341adb --- /dev/null +++ b/src/crypto/randomx/jit_compiler_rv64_static.S @@ -0,0 +1,1236 @@ +/* +Copyright (c) 2023 tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#define DECL(x) x + +.text +.option rvc + +#include "configuration.h" + +.global DECL(randomx_riscv64_literals) +.global DECL(randomx_riscv64_literals_end) +.global DECL(randomx_riscv64_data_init) +.global DECL(randomx_riscv64_fix_data_call) +.global DECL(randomx_riscv64_prologue) +.global DECL(randomx_riscv64_loop_begin) +.global DECL(randomx_riscv64_data_read) +.global DECL(randomx_riscv64_data_read_light) +.global DECL(randomx_riscv64_fix_loop_call) +.global DECL(randomx_riscv64_spad_store) +.global DECL(randomx_riscv64_spad_store_hardaes) +.global DECL(randomx_riscv64_spad_store_softaes) +.global DECL(randomx_riscv64_loop_end) +.global DECL(randomx_riscv64_fix_continue_loop) +.global DECL(randomx_riscv64_epilogue) +.global DECL(randomx_riscv64_softaes) +.global DECL(randomx_riscv64_program_end) +.global DECL(randomx_riscv64_ssh_init) +.global DECL(randomx_riscv64_ssh_load) +.global DECL(randomx_riscv64_ssh_prefetch) +.global DECL(randomx_riscv64_ssh_end) + +/* The literal pool can fit at most 494 IMUL_RCP literals */ +#if RANDOMX_PROGRAM_SIZE > 494 + #error RANDOMX_PROGRAM_SIZE larger than 494 is not supported. 
+#endif + +#define RANDOMX_ARGON_MEMORY 262144 +#define RANDOMX_CACHE_MASK (RANDOMX_ARGON_MEMORY*16-1) + +/* shared literal pool: 4 KB */ + /* space for 256 IMUL_RCP literals -2048 */ + /* filled by JIT compiler */ +DECL(randomx_riscv64_literals): +literal_pool: + /* SuperscalarHash constants +0 */ + .dword 6364136223846793005 + .dword 9298411001130361340 + .dword 12065312585734608966 + .dword 9306329213124626780 + .dword 5281919268842080866 + .dword 10536153434571861004 + .dword 3398623926847679864 + .dword 9549104520008361294 + /* CFROUND lookup table +64 */ + .word 0x00000000 /* RTN */ + .word 0x00000002 /* RDN */ + .word 0x00000003 /* RUP */ + .word 0x00000001 /* RTZ */ + /* mask literals +80,+84,+88,+92,+96,+104 */ + .word (/*RANDOMX_SCRATCHPAD_L1*/0) /* filled by JIT compiler */ + .word (/*RANDOMX_SCRATCHPAD_L2*/0) /* filled by JIT compiler */ + .word (/*RANDOMX_SCRATCHPAD_L3*/0) /* filled by JIT compiler */ + .word (/*RANDOMX_DATASET_BASE_SIZE*/0) /* filled by JIT compiler */ + .dword 0x80f0000000000000 + .dword 0x00ffffffffffffff +DECL(randomx_riscv64_literals_end): + /* E reg. set masks, +112,+120 */ + .dword 0 /* filled by JIT compiler */ + .dword 0 /* filled by JIT compiler */ + /* soft AES table addresses, +128,+136 */ + .dword 0 /* filled by JIT compiler */ + .dword 0 /* filled by JIT compiler */ + /* space for 238 IMUL_RCP literals, +144 */ + .fill 238,8,0 /* filled by JIT compiler */ + +/* ================================= */ +/* Dataset init function entry point */ +/* ================================= */ + +/* Register allocation: + ---------------------- + x0 -> zero + x1 -> temp/return address + x2 -> stack pointer (sp) + x3 -> literal pool pointer + x5 -> dataset pointer + x6 -> cache pointer + x7 -> temp/itemNumber + x8-x15 -> SuperscalarHash registers + x16 -> itemNumber + x17 -> endItem + x28-x31 -> temp + + Stack layout: + ------------------------ + sp+ + 0 -> return address + 8 -> saved x3 + 16 -> saved x8-x9 + 32 -> caller stack +*/ +DECL(randomx_riscv64_data_init): + addi sp, sp, -32 + /* dataset ptr */ + mv x5, x11 + /* cache->memory */ + ld x6, 0(x10) + /* callee saved registers */ + sd x1, 0(sp) + sd x3, 8(sp) + /* literal pool */ + lla x3, literal_pool + sd x8, 16(sp) + sd x9, 24(sp) + /* startItem */ + mv x16, x12 + /* endItem */ + mv x17, x13 +init_item: + mv x7, x16 +DECL(randomx_riscv64_fix_data_call): + jal superscalar_hash /* JIT compiler will adjust the offset */ + sd x8, 0(x5) + sd x9, 8(x5) + sd x10, 16(x5) + sd x11, 24(x5) + sd x12, 32(x5) + sd x13, 40(x5) + sd x14, 48(x5) + sd x15, 56(x5) + addi x5, x5, 64 + addi x16, x16, 1 + bltu x16, x17, init_item + ld x1, 0(sp) + ld x3, 8(sp) + ld x8, 16(sp) + ld x9, 24(sp) + addi sp, sp, 32 + ret + +/* ====================================== */ +/* Program execution function entry point */ +/* ====================================== */ + +/* Register allocation: + ---------------------- + x0 -> zero + x1 -> temp/scratchpad L3 mask + x2 -> stack pointer (sp) + x3 -> literal pool pointer + x5 -> scratchpad pointer + x6 -> dataset/cache pointer + x7 -> temp/next dataset access + x8 -> temp + x9 -> temp + x10 -> scratchpad L1 mask (0x0000000000003ff8) + x11 -> scratchpad L2 mask (0x000000000003fff8) + x12 -> FSCAL_R mask (0x80f0000000000000) + x13 -> E reg. clear mask (0x00ffffffffffffff) + x14 -> E reg. set mask (0x3*00000000******) + x15 -> E reg. 
set mask (0x3*00000000******) + x16-x23 -> VM registers "r0"-"r7" + x24 -> iteration counter "ic" + x25 -> VM registers "mx", "ma" + x26 -> spAddr0 + x27 -> spAddr1 + x28-x31 -> temp/literals for IMUL_RCP (4x) + + (Note: We avoid using x4 because it breaks debugging with gdb.) + + f0-f7 -> VM registers "f0"-"f3" + f8-f15 -> VM registers "e0"-"e3" + f16-f23 -> VM registers "a0"-"a3" + f24-f25 -> temp + f26-f31 -> literals for IMUL_RCP (6x) + + Stack layout: + ------------------------ + sp+ + 0 -> return address + 8 -> register file ptr + 16 -> saved x3-x4 + 32 -> saved x8-x9 + 48 -> saved x18-x27 + 128 -> saved f8-f9 + 144 -> saved f18-f27 + 224 -> caller stack +*/ + +DECL(randomx_riscv64_prologue): + addi sp, sp, -224 + /* scratchpad pointer */ + mv x5, x12 + /* register file pointer */ + sd x10, 8(sp) + /* callee saved registers */ + sd x3, 16(sp) + sd x8, 32(sp) + sd x9, 40(sp) + sd x18, 48(sp) + sd x19, 56(sp) + sd x20, 64(sp) + sd x21, 72(sp) + sd x22, 80(sp) + sd x23, 88(sp) + sd x24, 96(sp) + sd x25, 104(sp) + sd x26, 112(sp) + sd x27, 120(sp) + fsd f8, 128(sp) + fsd f9, 136(sp) + fsd f18, 144(sp) + fsd f19, 152(sp) + fsd f20, 160(sp) + fsd f21, 168(sp) + fsd f22, 176(sp) + fsd f23, 184(sp) + fsd f24, 192(sp) + fsd f25, 200(sp) + fsd f26, 208(sp) + fsd f27, 216(sp) + /* iteration counter */ + mv x24, x13 + /* return address */ + sd x1, 0(sp) + /* literal pool */ + lla x3, literal_pool + /* load (ma, mx) */ + ld x25, 0(x11) + /* dataset ptr */ + ld x6, 8(x11) + /* load dataset mask */ + lwu x1, 92(x3) + /* zero registers r0-r3, load a0-a1 */ + li x16, 0 + fld f16, 192(x10) + li x17, 0 + fld f17, 200(x10) + srli x7, x25, 32 /* x7 = ma */ + li x18, 0 + fld f18, 208(x10) + mv x27, x7 /* x27 = ma */ + li x19, 0 + fld f19, 216(x10) + /* set dataset read address */ + and x7, x7, x1 + add x7, x7, x6 + /* zero registers r4-r7, load a2-a3 */ + li x20, 0 + fld f20, 224(x10) + li x21, 0 + fld f21, 232(x10) + li x22, 0 + fld f22, 240(x10) + li x23, 0 + fld f23, 248(x10) + /* load L3 mask */ + lwu x1, 88(x3) + /* load scratchpad masks */ + lwu x10, 80(x3) + lwu x11, 84(x3) + /* set spAddr0, spAddr1 */ + and x26, x25, x1 + and x27, x27, x1 + add x26, x26, x5 + add x27, x27, x5 + /* align L3 mask */ + addi x1, x1, 56 + /* FSCAL, E reg. 
masks */ + ld x12, 96(x3) + ld x13, 104(x3) + ld x14, 112(x3) + ld x15, 120(x3) + /* IMUL_RCP literals */ + fld f26, 176(x3) + fld f27, 184(x3) + fld f28, 192(x3) + fld f29, 200(x3) + fld f30, 208(x3) + fld f31, 216(x3) + +.balign 4 +DECL(randomx_riscv64_loop_begin): +loop_begin: + /* mix integer registers */ + ld x8, 0(x26) + ld x9, 8(x26) + ld x30, 16(x26) + ld x31, 24(x26) + xor x16, x16, x8 + ld x8, 32(x26) + xor x17, x17, x9 + ld x9, 40(x26) + xor x18, x18, x30 + ld x30, 48(x26) + xor x19, x19, x31 + ld x31, 56(x26) + xor x20, x20, x8 + lw x8, 0(x27) + xor x21, x21, x9 + lw x9, 4(x27) + xor x22, x22, x30 + lw x30, 8(x27) + xor x23, x23, x31 + lw x31, 12(x27) + /* load F registers */ + fcvt.d.w f0, x8 + lw x8, 16(x27) + fcvt.d.w f1, x9 + lw x9, 20(x27) + fcvt.d.w f2, x30 + lw x30, 24(x27) + fcvt.d.w f3, x31 + lw x31, 28(x27) + fcvt.d.w f4, x8 + lw x8, 32(x27) + fcvt.d.w f5, x9 + lw x9, 36(x27) + fcvt.d.w f6, x30 + lw x30, 40(x27) + fcvt.d.w f7, x31 + lw x31, 44(x27) + /* load E registers */ + fcvt.d.w f8, x8 + lw x8, 48(x27) + fcvt.d.w f9, x9 + lw x9, 52(x27) + fcvt.d.w f10, x30 + lw x30, 56(x27) + fcvt.d.w f11, x31 + lw x31, 60(x27) + fcvt.d.w f12, x8 + fmv.x.d x8, f8 + fcvt.d.w f13, x9 + fmv.x.d x9, f9 + fcvt.d.w f14, x30 + fmv.x.d x30, f10 + fcvt.d.w f15, x31 + fmv.x.d x31, f11 + and x8, x8, x13 + and x9, x9, x13 + or x8, x8, x14 + or x9, x9, x15 + and x30, x30, x13 + and x31, x31, x13 + or x30, x30, x14 + or x31, x31, x15 + fmv.d.x f8, x8 + fmv.d.x f9, x9 + fmv.d.x f10, x30 + fmv.d.x f11, x31 + fmv.x.d x8, f12 + fmv.x.d x9, f13 + fmv.x.d x30, f14 + fmv.x.d x31, f15 + and x8, x8, x13 + and x9, x9, x13 + or x8, x8, x14 + or x9, x9, x15 + fmv.d.x f12, x8 + fmv.d.x f13, x9 + and x30, x30, x13 + and x31, x31, x13 + or x30, x30, x14 + or x31, x31, x15 + fmv.d.x f14, x30 + fmv.d.x f15, x31 + /* reload clobbered IMUL_RCP regs */ + ld x28, 144(x3) + ld x29, 152(x3) + ld x30, 160(x3) + ld x31, 168(x3) + +DECL(randomx_riscv64_data_read): + xor x8, x20, x22 /* JIT compiler will adjust the registers */ + /* load dataset mask */ + lwu x1, 92(x3) + /* zero-extend x8 */ +#ifdef __riscv_zba + zext.w x8, x8 +#else + slli x8, x8, 32 + srli x8, x8, 32 +#endif + /* update "mx" */ + xor x25, x25, x8 + /* read dataset and update registers */ + ld x8, 0(x7) + ld x9, 8(x7) + ld x30, 16(x7) + ld x31, 24(x7) + xor x16, x16, x8 + ld x8, 32(x7) + xor x17, x17, x9 + ld x9, 40(x7) + xor x18, x18, x30 + ld x30, 48(x7) + xor x19, x19, x31 + ld x31, 56(x7) + xor x20, x20, x8 + /* calculate the next dataset address */ + and x7, x25, x1 + xor x21, x21, x9 + add x7, x7, x6 + xor x22, x22, x30 + /* prefetch - doesn't seem to have any effect */ + /* ld x0, 0(x7) */ + xor x23, x23, x31 + /* swap mx <-> ma */ +#ifdef __riscv_zbb + rori x25, x25, 32 +#else + srli x9, x25, 32 + slli x25, x25, 32 + or x25, x25, x9 +#endif + +DECL(randomx_riscv64_data_read_light): + xor x8, x20, x22 /* JIT compiler will adjust the registers */ + /* load dataset offset */ + lui x9, 0x02000 /* JIT compiler will adjust the immediate */ + addi x9, x9, -64 + /* load dataset mask */ + lwu x1, 92(x3) + /* swap mx <-> ma */ +#ifdef __riscv_zbb + rori x25, x25, 32 +#else + srli x31, x25, 32 + slli x25, x25, 32 + or x25, x25, x31 +#endif + slli x8, x8, 32 + /* update "mx" */ + xor x25, x25, x8 + /* the next dataset item */ + and x7, x25, x1 + srli x7, x7, 6 + add x7, x7, x9 +DECL(randomx_riscv64_fix_loop_call): + jal superscalar_hash /* JIT compiler will adjust the offset */ + xor x16, x16, x8 + xor x17, x17, x9 + xor x18, x18, x10 + xor x19, x19, x11 
+ xor x20, x20, x12 + xor x21, x21, x13 + xor x22, x22, x14 + xor x23, x23, x15 + /* restore clobbered registers */ + lwu x10, 80(x3) + lwu x11, 84(x3) + ld x12, 96(x3) + ld x13, 104(x3) + ld x14, 112(x3) + ld x15, 120(x3) + +DECL(randomx_riscv64_spad_store): + /* store integer registers */ + sd x16, 0(x27) + sd x17, 8(x27) + sd x18, 16(x27) + sd x19, 24(x27) + sd x20, 32(x27) + sd x21, 40(x27) + sd x22, 48(x27) + sd x23, 56(x27) + /* XOR and store f0,e0 */ + fmv.x.d x8, f0 + fmv.x.d x9, f8 + fmv.x.d x30, f1 + fmv.x.d x31, f9 + xor x8, x8, x9 + xor x30, x30, x31 + sd x8, 0(x26) + fmv.d.x f0, x8 + sd x30, 8(x26) + fmv.d.x f1, x30 + /* XOR and store f1,e1 */ + fmv.x.d x8, f2 + fmv.x.d x9, f10 + fmv.x.d x30, f3 + fmv.x.d x31, f11 + xor x8, x8, x9 + xor x30, x30, x31 + sd x8, 16(x26) + fmv.d.x f2, x8 + sd x30, 24(x26) + fmv.d.x f3, x30 + /* XOR and store f2,e2 */ + fmv.x.d x8, f4 + fmv.x.d x9, f12 + fmv.x.d x30, f5 + fmv.x.d x31, f13 + xor x8, x8, x9 + xor x30, x30, x31 + sd x8, 32(x26) + fmv.d.x f4, x8 + sd x30, 40(x26) + fmv.d.x f5, x30 + /* XOR and store f3,e3 */ + fmv.x.d x8, f6 + fmv.x.d x9, f14 + fmv.x.d x30, f7 + fmv.x.d x31, f15 + xor x8, x8, x9 + xor x30, x30, x31 + sd x8, 48(x26) + fmv.d.x f6, x8 + sd x30, 56(x26) + fmv.d.x f7, x30 + +DECL(randomx_riscv64_spad_store_hardaes): + nop /* not implemented */ + +DECL(randomx_riscv64_spad_store_softaes): + /* store integer registers */ + sd x16, 0(x27) + sd x17, 8(x27) + sd x18, 16(x27) + sd x19, 24(x27) + sd x20, 32(x27) + sd x21, 40(x27) + sd x22, 48(x27) + sd x23, 56(x27) + /* process f0 with 4 AES rounds */ + fmv.x.d x8, f8 + fmv.x.d x10, f9 + fmv.x.d x30, f0 + fmv.x.d x31, f1 + jal softaes_enc + fmv.x.d x8, f10 + fmv.x.d x10, f11 + jal softaes_enc + fmv.x.d x8, f12 + fmv.x.d x10, f13 + jal softaes_enc + fmv.x.d x8, f14 + fmv.x.d x10, f15 + jal softaes_enc + sd x30, 0(x26) + fmv.d.x f0, x30 + sd x31, 8(x26) + fmv.d.x f1, x31 + /* process f1 with 4 AES rounds */ + fmv.x.d x8, f8 + fmv.x.d x10, f9 + fmv.x.d x30, f2 + fmv.x.d x31, f3 + jal softaes_dec + fmv.x.d x8, f10 + fmv.x.d x10, f11 + jal softaes_dec + fmv.x.d x8, f12 + fmv.x.d x10, f13 + jal softaes_dec + fmv.x.d x8, f14 + fmv.x.d x10, f15 + jal softaes_dec + sd x30, 16(x26) + fmv.d.x f2, x30 + sd x31, 24(x26) + fmv.d.x f3, x31 + /* process f2 with 4 AES rounds */ + fmv.x.d x8, f8 + fmv.x.d x10, f9 + fmv.x.d x30, f4 + fmv.x.d x31, f5 + jal softaes_enc + fmv.x.d x8, f10 + fmv.x.d x10, f11 + jal softaes_enc + fmv.x.d x8, f12 + fmv.x.d x10, f13 + jal softaes_enc + fmv.x.d x8, f14 + fmv.x.d x10, f15 + jal softaes_enc + sd x30, 32(x26) + fmv.d.x f4, x30 + sd x31, 40(x26) + fmv.d.x f5, x31 + /* process f3 with 4 AES rounds */ + fmv.x.d x8, f8 + fmv.x.d x10, f9 + fmv.x.d x30, f6 + fmv.x.d x31, f7 + jal softaes_dec + fmv.x.d x8, f10 + fmv.x.d x10, f11 + jal softaes_dec + fmv.x.d x8, f12 + fmv.x.d x10, f13 + jal softaes_dec + fmv.x.d x8, f14 + fmv.x.d x10, f15 + jal softaes_dec + sd x30, 48(x26) + fmv.d.x f6, x30 + sd x31, 56(x26) + fmv.d.x f7, x31 + /* restore clobbered registers */ + lwu x10, 80(x3) + lwu x11, 84(x3) + ld x12, 96(x3) + ld x13, 104(x3) + ld x14, 112(x3) + ld x15, 120(x3) + +DECL(randomx_riscv64_loop_end): + xor x26, x16, x18 /* JIT compiler will adjust the registers */ + /* load L3 mask */ + lwu x1, 88(x3) + addi x24, x24, -1 + srli x27, x26, 32 + /* set spAddr0, spAddr1 */ + and x26, x26, x1 + and x27, x27, x1 + add x26, x26, x5 + add x27, x27, x5 + /* align L3 mask */ + addi x1, x1, 56 + /* conditional branch doesn't have sufficient range */ + j condition_check 
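+ /*
+    Loop-back trampoline: "bnez" is a B-type branch with only a +/-4 KiB reach, which is
+    not enough to jump back across a full generated RandomX program, so the JIT patches
+    an unconditional J-type jump (+/-1 MiB reach) into the "continue_loop" slot below and
+    the conditional branch only has to hop back onto that slot.
+ */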
+DECL(randomx_riscv64_fix_continue_loop): +continue_loop: + .word 0 /* JIT compiler will write a jump to loop_begin */ +condition_check: + bnez x24, continue_loop + +DECL(randomx_riscv64_epilogue): + /* restore callee saved registers */ + ld x10, 8(sp) + ld x1, 0(sp) + ld x3, 16(sp) + ld x8, 32(sp) + ld x9, 40(sp) + ld x24, 96(sp) + ld x25, 104(sp) + ld x26, 112(sp) + ld x27, 120(sp) + fld f18, 144(sp) + fld f19, 152(sp) + fld f20, 160(sp) + fld f21, 168(sp) + fld f22, 176(sp) + fld f23, 184(sp) + fld f24, 192(sp) + fld f25, 200(sp) + fld f26, 208(sp) + fld f27, 216(sp) + /* save VM registers */ + sd x16, 0(x10) + sd x17, 8(x10) + sd x18, 16(x10) + sd x19, 24(x10) + sd x20, 32(x10) + sd x21, 40(x10) + sd x22, 48(x10) + sd x23, 56(x10) + fsd f0, 64(x10) + fsd f1, 72(x10) + fsd f2, 80(x10) + fsd f3, 88(x10) + fsd f4, 96(x10) + fsd f5, 104(x10) + fsd f6, 112(x10) + fsd f7, 120(x10) + fsd f8, 128(x10) + fsd f9, 136(x10) + fsd f10, 144(x10) + fsd f11, 152(x10) + fsd f12, 160(x10) + fsd f13, 168(x10) + fsd f14, 176(x10) + fsd f15, 184(x10) + /* restore callee saved registers */ + ld x18, 48(sp) + ld x19, 56(sp) + ld x20, 64(sp) + ld x21, 72(sp) + ld x22, 80(sp) + ld x23, 88(sp) + fld f8, 128(sp) + fld f9, 136(sp) + /* restore stack pointer */ + addi sp, sp, 224 + /* return */ + ret + +/* + Soft AES subroutines + in: + x3 = literal pool + x8, x10 = round key + x30, x31 = plaintext + out: + x30, x31 = ciphertext + clobbers: + x8-x11 (limbs) + x12-x13 (LUTs) + x14-x15 (temp) +*/ +DECL(randomx_riscv64_softaes): +softaes_enc: + /* enc. lookup table */ + ld x13, 128(x3) + + /* load the round key into x8, x9, x10, x11 */ + srli x9, x8, 32 + srli x11, x10, 32 +#ifdef __riscv_zba + zext.w x8, x8 + zext.w x10, x10 +#else + slli x8, x8, 32 + slli x10, x10, 32 + srli x8, x8, 32 + srli x10, x10, 32 +#endif + + /* byte 0 */ + andi x14, x30, 255 + srli x30, x30, 8 + addi x12, x13, -2048 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, -2048(x14) + + /* byte 1 */ + andi x15, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x8, x8, x14 + + /* byte 2 */ + andi x14, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x11, x11, x15 + + /* byte 3 */ + andi x15, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x10, x10, x14 + + /* byte 4 */ + andi x14, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x14, x14, x12 +#else + slli x14, x14, 2 + add x14, x14, x12 +#endif + lwu x14, 0(x14) + xor x9, x9, x15 + + /* byte 5 */ + andi x15, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x9, x9, x14 + + /* byte 6 */ + andi x14, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x8, x8, x15 + + /* byte 7 */ + andi x15, x30, 255 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x11, x11, x14 + + /* byte 8 */ + andi x14, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x12 +#else + slli x14, x14, 2 + add x14, x14, x12 +#endif + lwu x14, 0(x14) + xor x10, x10, x15 + + /* byte 9 
*/ + andi x15, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x10, x10, x14 + + /* byte 10 */ + andi x14, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x9, x9, x15 + + /* byte 11 */ + andi x15, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x8, x8, x14 + + /* byte 12 */ + andi x14, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x12 +#else + slli x14, x14, 2 + add x14, x14, x12 +#endif + lwu x14, 0(x14) + xor x11, x11, x15 + + /* byte 13 */ + andi x15, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x11, x11, x14 + + /* byte 14 */ + andi x14, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x10, x10, x15 + + /* byte 15 */ + andi x15, x31, 255 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x9, x9, x14 + + slli x11, x11, 32 + slli x9, x9, 32 + or x30, x8, x9 + or x31, x10, x11 + xor x30, x30, x15 + + ret + +softaes_dec: + /* dec. lookup table */ + ld x13, 136(x3) + + /* load the round key into x8, x9, x10, x11 */ + srli x9, x8, 32 + srli x11, x10, 32 +#ifdef __riscv_zba + zext.w x8, x8 + zext.w x10, x10 +#else + slli x8, x8, 32 + slli x10, x10, 32 + srli x8, x8, 32 + srli x10, x10, 32 +#endif + + /* byte 0 */ + andi x14, x30, 255 + srli x30, x30, 8 + addi x12, x13, -2048 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, -2048(x14) + + /* byte 1 */ + andi x15, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x8, x8, x14 + + /* byte 2 */ + andi x14, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x9, x9, x15 + + /* byte 3 */ + andi x15, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x10, x10, x14 + + /* byte 4 */ + andi x14, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x14, x14, x12 +#else + slli x14, x14, 2 + add x14, x14, x12 +#endif + lwu x14, 0(x14) + xor x11, x11, x15 + + /* byte 5 */ + andi x15, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x9, x9, x14 + + /* byte 6 */ + andi x14, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x10, x10, x15 + + /* byte 7 */ + andi x15, x30, 255 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x11, x11, x14 + + /* byte 8 */ + andi x14, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x12 +#else + slli x14, x14, 2 + add x14, x14, x12 +#endif + lwu x14, 0(x14) + xor x8, x8, x15 + + /* byte 9 */ + andi x15, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli 
x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x10, x10, x14 + + /* byte 10 */ + andi x14, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x11, x11, x15 + + /* byte 11 */ + andi x15, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x8, x8, x14 + + /* byte 12 */ + andi x14, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x12 +#else + slli x14, x14, 2 + add x14, x14, x12 +#endif + lwu x14, 0(x14) + xor x9, x9, x15 + + /* byte 13 */ + andi x15, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x11, x11, x14 + + /* byte 14 */ + andi x14, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x8, x8, x15 + + /* byte 15 */ + andi x15, x31, 255 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x9, x9, x14 + + slli x11, x11, 32 + slli x9, x9, 32 + or x30, x8, x9 + or x31, x10, x11 + xor x31, x31, x15 + + ret + +DECL(randomx_riscv64_program_end): + nop + + +/* literal pool for SuperscalarHash */ + /* space for remaining IMUL_RCP literals */ +ssh_literal_pool: + /* space for 256 IMUL_RCP literals */ + .fill 256,8,0 + +/* + SuperscalarHash subroutine + in: + x3 = literal pool + x6 = cache + x7 = itemNumber + out: + x8-x15 = 64-byte hash + clobbers: + x7, x28-x31 +*/ +DECL(randomx_riscv64_ssh_init): +superscalar_hash: + ld x30, 0(x3) /* superscalarMul0 */ + addi x8, x7, 1 + ld x9, 8(x3) + li x31, RANDOMX_CACHE_MASK + ld x10, 16(x3) + ld x11, 24(x3) + mul x8, x8, x30 + ld x12, 32(x3) + ld x13, 40(x3) + lla x30, ssh_literal_pool + ld x14, 48(x3) + and x7, x7, x31 + ld x15, 56(x3) + slli x7, x7, 6 + xor x9, x9, x8 + add x7, x7, x6 + xor x10, x10, x8 + /* load the first IMUL_RCP literal */ + ld x31, 2040(x30) + xor x11, x11, x8 + xor x12, x12, x8 + xor x13, x13, x8 + xor x14, x14, x8 + xor x15, x15, x8 + +DECL(randomx_riscv64_ssh_load): + ld x28, 0(x7) + ld x29, 8(x7) + xor x8, x8, x28 + ld x28, 16(x7) + xor x9, x9, x29 + ld x29, 24(x7) + xor x10, x10, x28 + ld x28, 32(x7) + xor x11, x11, x29 + ld x29, 40(x7) + xor x12, x12, x28 + ld x28, 48(x7) + xor x13, x13, x29 + ld x29, 56(x7) + xor x14, x14, x28 + li x7, RANDOMX_CACHE_MASK + xor x15, x15, x29 + +DECL(randomx_riscv64_ssh_prefetch): + and x7, x8, x7 /* JIT compiler will adjust the register */ + slli x7, x7, 6 + add x7, x7, x6 + /* prefetch - doesn't seem to have any effect */ + /* ld x0, 0(x7) */ + +DECL(randomx_riscv64_ssh_end): + nop diff --git a/src/crypto/randomx/jit_compiler_rv64_static.hpp b/src/crypto/randomx/jit_compiler_rv64_static.hpp new file mode 100644 index 000000000..656623c74 --- /dev/null +++ b/src/crypto/randomx/jit_compiler_rv64_static.hpp @@ -0,0 +1,53 @@ +/* +Copyright (c) 2023 tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +extern "C" { + void randomx_riscv64_literals(); + void randomx_riscv64_literals_end(); + void randomx_riscv64_data_init(); + void randomx_riscv64_fix_data_call(); + void randomx_riscv64_prologue(); + void randomx_riscv64_loop_begin(); + void randomx_riscv64_data_read(); + void randomx_riscv64_data_read_light(); + void randomx_riscv64_fix_loop_call(); + void randomx_riscv64_spad_store(); + void randomx_riscv64_spad_store_hardaes(); + void randomx_riscv64_spad_store_softaes(); + void randomx_riscv64_loop_end(); + void randomx_riscv64_fix_continue_loop(); + void randomx_riscv64_epilogue(); + void randomx_riscv64_softaes(); + void randomx_riscv64_program_end(); + void randomx_riscv64_ssh_init(); + void randomx_riscv64_ssh_load(); + void randomx_riscv64_ssh_prefetch(); + void randomx_riscv64_ssh_end(); +} diff --git a/src/crypto/randomx/jit_compiler_rv64_vector.cpp b/src/crypto/randomx/jit_compiler_rv64_vector.cpp new file mode 100644 index 000000000..8dc95613e --- /dev/null +++ b/src/crypto/randomx/jit_compiler_rv64_vector.cpp @@ -0,0 +1,207 @@ +/* +Copyright (c) 2018-2020, tevador +Copyright (c) 2019-2021, XMRig , +Copyright (c) 2025, SChernykh + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include "crypto/randomx/configuration.h" +#include "crypto/randomx/jit_compiler_rv64_vector.h" +#include "crypto/randomx/jit_compiler_rv64_vector_static.h" +#include "crypto/randomx/reciprocal.h" +#include "crypto/randomx/superscalar.hpp" + +namespace randomx { + +#define ADDR(x) ((uint8_t*) &(x)) +#define DIST(x, y) (ADDR(y) - ADDR(x)) + +void* generateDatasetInitVectorRV64(uint8_t* buf, size_t buf_size, SuperscalarProgram* programs, size_t num_programs) +{ + memcpy(buf, reinterpret_cast(randomx_riscv64_vector_sshash_begin), buf_size); + + uint8_t* p = buf + DIST(randomx_riscv64_vector_sshash_begin, randomx_riscv64_vector_sshash_generated_instructions); + + uint8_t* literals = buf + DIST(randomx_riscv64_vector_sshash_begin, randomx_riscv64_vector_sshash_imul_rcp_literals); + uint8_t* cur_literal = literals; + + for (size_t i = 0; i < num_programs; ++i) { + // Step 4 + size_t k = DIST(randomx_riscv64_vector_sshash_cache_prefetch, randomx_riscv64_vector_sshash_xor); + memcpy(p, reinterpret_cast(randomx_riscv64_vector_sshash_cache_prefetch), k); + p += k; + + // Step 5 + for (uint32_t j = 0; j < programs[i].size; ++j) { + const uint32_t dst = programs[i].programBuffer[j].dst & 7; + const uint32_t src = programs[i].programBuffer[j].src & 7; + const uint32_t modShift = (programs[i].programBuffer[j].mod >> 2) & 3; + const uint32_t imm32 = programs[i].programBuffer[j].imm32; + + uint32_t inst; + #define EMIT(data) inst = (data); memcpy(p, &inst, 4); p += 4 + + switch (static_cast(programs[i].programBuffer[j].opcode)) { + case SuperscalarInstructionType::ISUB_R: + // 57 00 00 0A vsub.vv v0, v0, v0 + EMIT(0x0A000057 | (dst << 7) | (src << 15) | (dst << 20)); + break; + + case SuperscalarInstructionType::IXOR_R: + // 57 00 00 2E vxor.vv v0, v0, v0 + EMIT(0x2E000057 | (dst << 7) | (src << 15) | (dst << 20)); + break; + + case SuperscalarInstructionType::IADD_RS: + // 57 39 00 96 vsll.vi v18, v0, 0 + // 57 00 09 02 vadd.vv v0, v0, v18 + EMIT(0x96003957 | (modShift << 15) | (src << 20)); + EMIT(0x02090057 | (dst << 7) | (dst << 20)); + break; + + case SuperscalarInstructionType::IMUL_R: + // 57 20 00 96 vmul.vv v0, v0, v0 + EMIT(0x96002057 | (dst << 7) | (src << 15) | (dst << 20)); + break; + + case SuperscalarInstructionType::IROR_C: + { + const uint32_t shift_right = imm32 & 63; + const uint32_t shift_left = 64 - shift_right; + + if (shift_right < 32) { + // 57 39 00 A2 vsrl.vi v18, v0, 0 + EMIT(0xA2003957 | (shift_right << 15) | (dst << 20)); + } + else { + // 93 02 00 00 li x5, 0 + // 57 C9 02 A2 vsrl.vx v18, v0, x5 + EMIT(0x00000293 | (shift_right << 20)); + EMIT(0xA202C957 | (dst << 20)); + } + + if (shift_left < 32) { + // 57 30 00 96 vsll.vi v0, v0, 0 + EMIT(0x96003057 | (dst << 7) | (shift_left << 15) | (dst << 20)); + } + else { + // 93 02 00 00 li x5, 0 + // 57 C0 02 96 vsll.vx v0, v0, x5 + EMIT(0x00000293 | (shift_left << 20)); + EMIT(0x9602C057 | (dst << 7) | (dst << 20)); + } + + // 57 00 20 2B vor.vv v0, v18, v0 + EMIT(0x2B200057 | (dst << 7) | (dst << 15)); + } + 
break; + + case SuperscalarInstructionType::IADD_C7: + case SuperscalarInstructionType::IADD_C8: + case SuperscalarInstructionType::IADD_C9: + // B7 02 00 00 lui x5, 0 + // 9B 82 02 00 addiw x5, x5, 0 + // 57 C0 02 02 vadd.vx v0, v0, x5 + EMIT(0x000002B7 | ((imm32 + ((imm32 & 0x800) << 1)) & 0xFFFFF000)); + EMIT(0x0002829B | ((imm32 & 0x00000FFF)) << 20); + EMIT(0x0202C057 | (dst << 7) | (dst << 20)); + break; + + case SuperscalarInstructionType::IXOR_C7: + case SuperscalarInstructionType::IXOR_C8: + case SuperscalarInstructionType::IXOR_C9: + // B7 02 00 00 lui x5, 0 + // 9B 82 02 00 addiw x5, x5, 0 + // 57 C0 02 2E vxor.vx v0, v0, x5 + EMIT(0x000002B7 | ((imm32 + ((imm32 & 0x800) << 1)) & 0xFFFFF000)); + EMIT(0x0002829B | ((imm32 & 0x00000FFF)) << 20); + EMIT(0x2E02C057 | (dst << 7) | (dst << 20)); + break; + + case SuperscalarInstructionType::IMULH_R: + // 57 20 00 92 vmulhu.vv v0, v0, v0 + EMIT(0x92002057 | (dst << 7) | (src << 15) | (dst << 20)); + break; + + case SuperscalarInstructionType::ISMULH_R: + // 57 20 00 9E vmulh.vv v0, v0, v0 + EMIT(0x9E002057 | (dst << 7) | (src << 15) | (dst << 20)); + break; + + case SuperscalarInstructionType::IMUL_RCP: + { + uint32_t offset = cur_literal - literals; + + if (offset == 2040) { + literals += 2040; + offset = 0; + + // 93 87 87 7F add x15, x15, 2040 + EMIT(0x7F878793); + } + + const uint64_t r = randomx_reciprocal_fast(imm32); + memcpy(cur_literal, &r, 8); + cur_literal += 8; + + // 83 B2 07 00 ld x5, (x15) + // 57 E0 02 96 vmul.vx v0, v0, x5 + EMIT(0x0007B283 | (offset << 20)); + EMIT(0x9602E057 | (dst << 7) | (dst << 20)); + } + break; + + default: + break; + } + } + + // Step 6 + k = DIST(randomx_riscv64_vector_sshash_xor, randomx_riscv64_vector_sshash_set_cache_index); + memcpy(p, reinterpret_cast(randomx_riscv64_vector_sshash_xor), k); + p += k; + + // Step 7 + if (i + 1 < num_programs) { + memcpy(p, reinterpret_cast(randomx_riscv64_vector_sshash_set_cache_index) + programs[i].getAddressRegister() * 4, 4); + p += 4; + } + } + + // Emit "J randomx_riscv64_vector_sshash_generated_instructions_end" instruction + const uint8_t* e = buf + DIST(randomx_riscv64_vector_sshash_begin, randomx_riscv64_vector_sshash_generated_instructions_end); + const uint32_t k = e - p; + const uint32_t j = 0x6F | ((k & 0x7FE) << 20) | ((k & 0x800) << 9) | (k & 0xFF000); + memcpy(p, &j, 4); + +#ifdef __GNUC__ + __builtin___clear_cache((char*) buf, (char*)(buf + buf_size)); +#endif + + return buf + DIST(randomx_riscv64_vector_sshash_begin, randomx_riscv64_vector_sshash_dataset_init); +} + +} // namespace randomx diff --git a/src/crypto/randomx/jit_compiler_rv64_vector.h b/src/crypto/randomx/jit_compiler_rv64_vector.h new file mode 100644 index 000000000..ea06862e5 --- /dev/null +++ b/src/crypto/randomx/jit_compiler_rv64_vector.h @@ -0,0 +1,42 @@ +/* +Copyright (c) 2018-2020, tevador +Copyright (c) 2019-2021, XMRig , +Copyright (c) 2025, SChernykh + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +#include +#include + +namespace randomx { + +class SuperscalarProgram; + +void* generateDatasetInitVectorRV64(uint8_t* buf, size_t buf_size, SuperscalarProgram* programs, size_t num_programs); + +} // namespace randomx diff --git a/src/crypto/randomx/jit_compiler_rv64_vector_static.S b/src/crypto/randomx/jit_compiler_rv64_vector_static.S new file mode 100644 index 000000000..ac63c625f --- /dev/null +++ b/src/crypto/randomx/jit_compiler_rv64_vector_static.S @@ -0,0 +1,296 @@ +/* +Copyright (c) 2018-2020, tevador +Copyright (c) 2019-2021, XMRig , +Copyright (c) 2025, SChernykh + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#include "configuration.h" + +// Compatibility macros + +#if !defined(RANDOMX_CACHE_ACCESSES) && defined(RANDOMX_CACHE_MAX_ACCESSES) +#define RANDOMX_CACHE_ACCESSES RANDOMX_CACHE_MAX_ACCESSES +#endif + +#if defined(RANDOMX_ARGON_MEMORY) +#define RANDOMX_CACHE_MASK RANDOMX_ARGON_MEMORY * 1024 / 64 - 1 +#elif defined(RANDOMX_CACHE_MAX_SIZE) +#define RANDOMX_CACHE_MASK RANDOMX_CACHE_MAX_SIZE / 64 - 1 +#endif + +#define DECL(x) x + +.text + +.option arch, rv64gcv_zicbop +.option pic + +.global DECL(randomx_riscv64_vector_sshash_begin) +.global DECL(randomx_riscv64_vector_sshash_imul_rcp_literals) +.global DECL(randomx_riscv64_vector_sshash_dataset_init) +.global DECL(randomx_riscv64_vector_sshash_generated_instructions) +.global DECL(randomx_riscv64_vector_sshash_generated_instructions_end) +.global DECL(randomx_riscv64_vector_sshash_cache_prefetch) +.global DECL(randomx_riscv64_vector_sshash_xor) +.global DECL(randomx_riscv64_vector_sshash_set_cache_index) +.global DECL(randomx_riscv64_vector_sshash_end) + +.balign 8 + +DECL(randomx_riscv64_vector_sshash_begin): + +sshash_constant_0: .dword 6364136223846793005 +sshash_constant_1: .dword 9298411001130361340 +sshash_constant_2: .dword 12065312585734608966 +sshash_constant_3: .dword 9306329213124626780 +sshash_constant_4: .dword 5281919268842080866 +sshash_constant_5: .dword 10536153434571861004 +sshash_constant_6: .dword 3398623926847679864 +sshash_constant_7: .dword 9549104520008361294 +sshash_offsets: .dword 0,1,2,3 +store_offsets: .dword 0,64,128,192 + +DECL(randomx_riscv64_vector_sshash_imul_rcp_literals): .fill 512,8,0 + +/* +Reference: https://github.com/tevador/RandomX/blob/master/doc/specs.md#73-dataset-block-generation + +Register layout +--------------- +x5 = temporary + +x10 = randomx cache +x11 = output buffer +x12 = startBlock +x13 = endBlock + +x14 = cache mask +x15 = imul_rcp literal pointer + +v0-v7 = r0-r7 +v8 = itemNumber +v9 = cacheIndex, then a pointer into cache->memory (for prefetch), then a byte offset into cache->memory + +v10-v17 = sshash constants + +v18 = temporary + +v19 = dataset item store offsets +*/ + +DECL(randomx_riscv64_vector_sshash_dataset_init): + // Process 4 64-bit values at a time + li x5, 4 + vsetvli x5, x5, e64, m1, ta, ma + + // Load cache->memory pointer + ld x10, (x10) + + // Init cache mask + li x14, RANDOMX_CACHE_MASK + + // Init dataset item store offsets + lla x5, store_offsets + vle64.v v19, (x5) + + // Init itemNumber vector to (startBlock, startBlock + 1, startBlock + 2, startBlock + 3) + lla x5, sshash_offsets + vle64.v v8, (x5) + vadd.vx v8, v8, x12 + + // Load constants (stride = x0 = 0, so a 64-bit value will be broadcast into each element of a vector) + lla x5, sshash_constant_0 + vlse64.v v10, (x5), x0 + + lla x5, sshash_constant_1 + vlse64.v v11, (x5), x0 + + lla x5, sshash_constant_2 + vlse64.v v12, (x5), x0 + + lla x5, sshash_constant_3 + vlse64.v v13, (x5), x0 + + lla x5, sshash_constant_4 + vlse64.v v14, (x5), x0 + + lla x5, sshash_constant_5 + vlse64.v v15, (x5), x0 + + lla x5, sshash_constant_6 + vlse64.v v16, (x5), x0 + + lla x5, sshash_constant_7 + vlse64.v v17, (x5), x0 + + // Calculate the end pointer for dataset init + sub x13, x13, x12 + slli x13, x13, 6 + add x13, x13, x11 + +init_item: + // Step 1. 
Init r0-r7 + + // r0 = (itemNumber + 1) * 6364136223846793005 + vmv.v.v v0, v8 + vmadd.vv v0, v10, v10 + + // r_i = r0 ^ c_i for i = 1..7 + vxor.vv v1, v0, v11 + vxor.vv v2, v0, v12 + vxor.vv v3, v0, v13 + vxor.vv v4, v0, v14 + vxor.vv v5, v0, v15 + vxor.vv v6, v0, v16 + vxor.vv v7, v0, v17 + + // Step 2. Let cacheIndex = itemNumber + vmv.v.v v9, v8 + + // Step 3 is implicit (all iterations are inlined, there is no "i") + + // Init imul_rcp literal pointer + lla x15, randomx_riscv64_vector_sshash_imul_rcp_literals + +DECL(randomx_riscv64_vector_sshash_generated_instructions): + // Generated by JIT compiler + // + // Step 4. randomx_riscv64_vector_sshash_cache_prefetch + // Step 5. SuperscalarHash[i] + // Step 6. randomx_riscv64_vector_sshash_xor + // Step 7. randomx_riscv64_vector_sshash_set_cache_index + // + // Above steps will be repeated RANDOMX_CACHE_ACCESSES times + .fill RANDOMX_CACHE_ACCESSES * 2048, 4, 0 + +DECL(randomx_riscv64_vector_sshash_generated_instructions_end): + // Step 9. Concatenate registers r0-r7 in little endian format to get the final Dataset item data. + vsuxei64.v v0, (x11), v19 + + add x5, x11, 8 + vsuxei64.v v1, (x5), v19 + + add x5, x11, 16 + vsuxei64.v v2, (x5), v19 + + add x5, x11, 24 + vsuxei64.v v3, (x5), v19 + + add x5, x11, 32 + vsuxei64.v v4, (x5), v19 + + add x5, x11, 40 + vsuxei64.v v5, (x5), v19 + + add x5, x11, 48 + vsuxei64.v v6, (x5), v19 + + add x5, x11, 56 + vsuxei64.v v7, (x5), v19 + + // Iterate to the next 4 items + vadd.vi v8, v8, 4 + add x11, x11, 256 + bltu x11, x13, init_item + + ret + +// Step 4. Load a 64-byte item from the Cache. The item index is given by cacheIndex modulo the total number of 64-byte items in Cache. +DECL(randomx_riscv64_vector_sshash_cache_prefetch): + // v9 = convert from cacheIndex to a direct pointer into cache->memory + vand.vx v9, v9, x14 + vsll.vi v9, v9, 6 + vadd.vx v9, v9, x10 + + // Prefetch element 0 + vmv.x.s x5, v9 + prefetch.r (x5) + + // Prefetch element 1 + vslidedown.vi v18, v9, 1 + vmv.x.s x5, v18 + prefetch.r (x5) + + // Prefetch element 2 + vslidedown.vi v18, v9, 2 + vmv.x.s x5, v18 + prefetch.r (x5) + + // Prefetch element 3 + vslidedown.vi v18, v9, 3 + vmv.x.s x5, v18 + prefetch.r (x5) + + // v9 = byte offset into cache->memory + vsub.vx v9, v9, x10 + +// Step 6. XOR all registers with data loaded from randomx cache +DECL(randomx_riscv64_vector_sshash_xor): + vluxei64.v v18, (x10), v9 + vxor.vv v0, v0, v18 + + add x5, x10, 8 + vluxei64.v v18, (x5), v9 + vxor.vv v1, v1, v18 + + add x5, x10, 16 + vluxei64.v v18, (x5), v9 + vxor.vv v2, v2, v18 + + add x5, x10, 24 + vluxei64.v v18, (x5), v9 + vxor.vv v3, v3, v18 + + add x5, x10, 32 + vluxei64.v v18, (x5), v9 + vxor.vv v4, v4, v18 + + add x5, x10, 40 + vluxei64.v v18, (x5), v9 + vxor.vv v5, v5, v18 + + add x5, x10, 48 + vluxei64.v v18, (x5), v9 + vxor.vv v6, v6, v18 + + add x5, x10, 56 + vluxei64.v v18, (x5), v9 + vxor.vv v7, v7, v18 + +// Step 7. Set cacheIndex to the value of the register that has the longest dependency chain in the SuperscalarHash function executed in step 5. 
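+ // The eight vmv.v.v instructions below form a template table rather than straight-line
+ // code: for each cache access the JIT copies exactly one of them (4 bytes, selected by
+ // getAddressRegister() * 4), and this step is skipped entirely after the last program.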
+DECL(randomx_riscv64_vector_sshash_set_cache_index): + // JIT compiler will pick a single instruction reading from the required register + vmv.v.v v9, v0 + vmv.v.v v9, v1 + vmv.v.v v9, v2 + vmv.v.v v9, v3 + vmv.v.v v9, v4 + vmv.v.v v9, v5 + vmv.v.v v9, v6 + vmv.v.v v9, v7 + +DECL(randomx_riscv64_vector_sshash_end): diff --git a/src/crypto/randomx/jit_compiler_rv64_vector_static.h b/src/crypto/randomx/jit_compiler_rv64_vector_static.h new file mode 100644 index 000000000..09bab597e --- /dev/null +++ b/src/crypto/randomx/jit_compiler_rv64_vector_static.h @@ -0,0 +1,58 @@ +/* +Copyright (c) 2018-2020, tevador +Copyright (c) 2019-2021, XMRig , +Copyright (c) 2025, SChernykh + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +#if defined(__cplusplus) +#include +#else +#include +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +struct randomx_cache; + +void randomx_riscv64_vector_sshash_begin(); +void randomx_riscv64_vector_sshash_imul_rcp_literals(); +void randomx_riscv64_vector_sshash_dataset_init(struct randomx_cache* cache, uint8_t* output_buf, uint32_t startBlock, uint32_t endBlock); +void randomx_riscv64_vector_sshash_cache_prefetch(); +void randomx_riscv64_vector_sshash_generated_instructions(); +void randomx_riscv64_vector_sshash_generated_instructions_end(); +void randomx_riscv64_vector_sshash_cache_prefetch(); +void randomx_riscv64_vector_sshash_xor(); +void randomx_riscv64_vector_sshash_set_cache_index(); +void randomx_riscv64_vector_sshash_end(); + +#if defined(__cplusplus) +} +#endif diff --git a/src/crypto/randomx/randomx.cpp b/src/crypto/randomx/randomx.cpp index 1126c7a2e..1609a4af3 100644 --- a/src/crypto/randomx/randomx.cpp +++ b/src/crypto/randomx/randomx.cpp @@ -39,6 +39,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "crypto/randomx/jit_compiler_x86_static.hpp" #elif (XMRIG_ARM == 8) #include "crypto/randomx/jit_compiler_a64_static.hpp" +#elif defined(__riscv) && defined(__riscv_xlen) && (__riscv_xlen == 64) +#include "crypto/randomx/jit_compiler_rv64_static.hpp" #endif #include "backend/cpu/Cpu.h" @@ -190,7 +192,7 @@ RandomX_ConfigurationBase::RandomX_ConfigurationBase() # endif } -#if (XMRIG_ARM == 8) +#if (XMRIG_ARM == 8) || defined(XMRIG_RISCV) static uint32_t Log2(size_t value) { return (value > 1) ? (Log2(value / 2) + 1) : 0; } #endif @@ -274,6 +276,14 @@ typedef void(randomx::JitCompilerX86::* InstructionGeneratorX86_2)(const randomx #define JIT_HANDLE(x, prev) randomx::JitCompilerA64::engine[k] = &randomx::JitCompilerA64::h_##x +#elif defined(XMRIG_RISCV) + + Log2_ScratchpadL1 = Log2(ScratchpadL1_Size); + Log2_ScratchpadL2 = Log2(ScratchpadL2_Size); + Log2_ScratchpadL3 = Log2(ScratchpadL3_Size); + +#define JIT_HANDLE(x, prev) randomx::JitCompilerRV64::engine[k] = &randomx::JitCompilerRV64::v1_##x + #else #define JIT_HANDLE(x, prev) #endif diff --git a/src/crypto/randomx/randomx.h b/src/crypto/randomx/randomx.h index c2d244447..70abff348 100644 --- a/src/crypto/randomx/randomx.h +++ b/src/crypto/randomx/randomx.h @@ -133,7 +133,7 @@ struct RandomX_ConfigurationBase uint32_t ScratchpadL3Mask_Calculated; uint32_t ScratchpadL3Mask64_Calculated; -# if (XMRIG_ARM == 8) +# if (XMRIG_ARM == 8) || defined(XMRIG_RISCV) uint32_t Log2_ScratchpadL1; uint32_t Log2_ScratchpadL2; uint32_t Log2_ScratchpadL3; diff --git a/src/crypto/randomx/reciprocal.c b/src/crypto/randomx/reciprocal.c index 87cda2677..ebd7662ca 100644 --- a/src/crypto/randomx/reciprocal.c +++ b/src/crypto/randomx/reciprocal.c @@ -73,8 +73,20 @@ uint64_t randomx_reciprocal(uint64_t divisor) { #if !RANDOMX_HAVE_FAST_RECIPROCAL +#ifdef __GNUC__ +uint64_t randomx_reciprocal_fast(uint64_t divisor) +{ + const uint64_t q = (1ULL << 63) / divisor; + const uint64_t r = (1ULL << 63) % divisor; + + const uint64_t shift = 64 - __builtin_clzll(divisor); + + return (q << shift) + ((r << shift) / divisor); +} +#else uint64_t randomx_reciprocal_fast(uint64_t divisor) { return randomx_reciprocal(divisor); } +#endif #endif diff --git a/src/crypto/randomx/soft_aes.cpp b/src/crypto/randomx/soft_aes.cpp index 04fb7ac0e..aa5cdd494 100644 --- a/src/crypto/randomx/soft_aes.cpp +++ b/src/crypto/randomx/soft_aes.cpp @@ -39,6 +39,9 @@ alignas(64) uint32_t lutDec1[256]; alignas(64) uint32_t lutDec2[256]; alignas(64) uint32_t lutDec3[256]; +alignas(64) uint8_t lutEncIndex[4][32]; +alignas(64) uint8_t lutDecIndex[4][32]; + static uint32_t mul_gf2(uint32_t b, uint32_t c) { uint32_t s = 0; @@ -115,5 +118,49 @@ static struct SAESInitializer lutDec2[i] = w; w = (w << 8) | (w >> 24); lutDec3[i] = w; } + + memset(lutEncIndex, -1, sizeof(lutEncIndex)); + memset(lutDecIndex, -1, sizeof(lutDecIndex)); + + lutEncIndex[0][ 0] = 0; + lutEncIndex[0][ 4] = 4; + lutEncIndex[0][ 8] = 8; + lutEncIndex[0][12] = 12; + lutEncIndex[1][ 0] = 5; + lutEncIndex[1][ 4] = 9; + lutEncIndex[1][ 8] = 13; + lutEncIndex[1][12] = 1; + lutEncIndex[2][ 0] = 10; + lutEncIndex[2][ 4] = 14; + lutEncIndex[2][ 8] = 2; + lutEncIndex[2][12] = 6; + lutEncIndex[3][ 0] = 15; + lutEncIndex[3][ 4] = 3; + lutEncIndex[3][ 8] = 7; + lutEncIndex[3][12] = 11; + + lutDecIndex[0][ 0] = 0; + lutDecIndex[0][ 4] = 4; + lutDecIndex[0][ 8] = 8; + lutDecIndex[0][12] = 12; + lutDecIndex[1][ 0] = 13; + lutDecIndex[1][ 4] = 1; + lutDecIndex[1][ 8] = 5; + lutDecIndex[1][12] = 9; + lutDecIndex[2][ 0] = 10; + lutDecIndex[2][ 4] 
= 14; + lutDecIndex[2][ 8] = 2; + lutDecIndex[2][12] = 6; + lutDecIndex[3][ 0] = 7; + lutDecIndex[3][ 4] = 11; + lutDecIndex[3][ 8] = 15; + lutDecIndex[3][12] = 3; + + for (uint32_t i = 0; i < 4; ++i) { + for (uint32_t j = 0; j < 16; j += 4) { + lutEncIndex[i][j + 16] = lutEncIndex[i][j] + 16; + lutDecIndex[i][j + 16] = lutDecIndex[i][j] + 16; + } + } } } aes_initializer; diff --git a/src/crypto/randomx/soft_aes.h b/src/crypto/randomx/soft_aes.h index 2b7d5a1e9..4e133910d 100644 --- a/src/crypto/randomx/soft_aes.h +++ b/src/crypto/randomx/soft_aes.h @@ -41,6 +41,9 @@ extern uint32_t lutDec1[256]; extern uint32_t lutDec2[256]; extern uint32_t lutDec3[256]; +extern uint8_t lutEncIndex[4][32]; +extern uint8_t lutDecIndex[4][32]; + template rx_vec_i128 aesenc(rx_vec_i128 in, rx_vec_i128 key); template rx_vec_i128 aesdec(rx_vec_i128 in, rx_vec_i128 key); @@ -147,3 +150,32 @@ template<> FORCE_INLINE rx_vec_i128 aesdec<0>(rx_vec_i128 in, rx_vec_i128 key) { return rx_aesdec_vec_i128(in, key); } + +#if defined(XMRIG_RISCV) && defined(XMRIG_RVV_ENABLED) +#include + +FORCE_INLINE vuint32m1_t softaes_vector_double( + vuint32m1_t in, + vuint32m1_t key, + vuint8m1_t i0, vuint8m1_t i1, vuint8m1_t i2, vuint8m1_t i3, + const uint32_t* lut0, const uint32_t* lut1, const uint32_t *lut2, const uint32_t* lut3) +{ + const vuint8m1_t in8 = __riscv_vreinterpret_v_u32m1_u8m1(in); + + const vuint32m1_t index0 = __riscv_vreinterpret_v_u8m1_u32m1(__riscv_vrgather_vv_u8m1(in8, i0, 32)); + const vuint32m1_t index1 = __riscv_vreinterpret_v_u8m1_u32m1(__riscv_vrgather_vv_u8m1(in8, i1, 32)); + const vuint32m1_t index2 = __riscv_vreinterpret_v_u8m1_u32m1(__riscv_vrgather_vv_u8m1(in8, i2, 32)); + const vuint32m1_t index3 = __riscv_vreinterpret_v_u8m1_u32m1(__riscv_vrgather_vv_u8m1(in8, i3, 32)); + + vuint32m1_t s0 = __riscv_vluxei32_v_u32m1(lut0, __riscv_vsll_vx_u32m1(index0, 2, 8), 8); + vuint32m1_t s1 = __riscv_vluxei32_v_u32m1(lut1, __riscv_vsll_vx_u32m1(index1, 2, 8), 8); + vuint32m1_t s2 = __riscv_vluxei32_v_u32m1(lut2, __riscv_vsll_vx_u32m1(index2, 2, 8), 8); + vuint32m1_t s3 = __riscv_vluxei32_v_u32m1(lut3, __riscv_vsll_vx_u32m1(index3, 2, 8), 8); + + s0 = __riscv_vxor_vv_u32m1(s0, s1, 8); + s2 = __riscv_vxor_vv_u32m1(s2, s3, 8); + s0 = __riscv_vxor_vv_u32m1(s0, s2, 8); + + return __riscv_vxor_vv_u32m1(s0, key, 8); +} +#endif // defined(XMRIG_RISCV) && defined(XMRIG_RVV_ENABLED) diff --git a/src/crypto/randomx/tests/riscv64_vector.s b/src/crypto/randomx/tests/riscv64_vector.s new file mode 100644 index 000000000..ee4c234f7 --- /dev/null +++ b/src/crypto/randomx/tests/riscv64_vector.s @@ -0,0 +1,14 @@ +/* RISC-V - test if the vector extension and prefetch instruction are present */ + +.text +.option arch, rv64gcv_zicbop +.global main + +main: + lla x5, main + prefetch.r (x5) + li x5, 4 + vsetvli x6, x5, e64, m1, ta, ma + vxor.vv v0, v0, v0 + sub x10, x5, x6 + ret diff --git a/src/crypto/randomx/tests/riscv64_zba.s b/src/crypto/randomx/tests/riscv64_zba.s new file mode 100644 index 000000000..e1947e7a6 --- /dev/null +++ b/src/crypto/randomx/tests/riscv64_zba.s @@ -0,0 +1,9 @@ +/* RISC-V - test if the Zba extension is present */ + +.text +.global main + +main: + sh1add x6, x6, x7 + li x10, 0 + ret diff --git a/src/crypto/randomx/tests/riscv64_zbb.s b/src/crypto/randomx/tests/riscv64_zbb.s new file mode 100644 index 000000000..d922043f0 --- /dev/null +++ b/src/crypto/randomx/tests/riscv64_zbb.s @@ -0,0 +1,9 @@ +/* RISC-V - test if the Zbb extension is present */ + +.text +.global main + +main: + ror x6, x6, x7 + li x10, 0 
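+ /* return 0 in a0 so CMake's try_run() reports the Zbb test as passing */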
+ ret diff --git a/src/crypto/rx/RxDataset.cpp b/src/crypto/rx/RxDataset.cpp index 86b3a3f6d..3495d7baa 100644 --- a/src/crypto/rx/RxDataset.cpp +++ b/src/crypto/rx/RxDataset.cpp @@ -43,6 +43,12 @@ static void init_dataset_wrapper(randomx_dataset *dataset, randomx_cache *cache, randomx_init_dataset(dataset, cache, startItem, itemCount - (itemCount % 5)); randomx_init_dataset(dataset, cache, startItem + itemCount - 5, 5); } +#ifdef XMRIG_RISCV + else if (itemCount % 4) { + randomx_init_dataset(dataset, cache, startItem, itemCount - (itemCount % 4)); + randomx_init_dataset(dataset, cache, startItem + itemCount - 4, 4); + } +#endif else { randomx_init_dataset(dataset, cache, startItem, itemCount); } @@ -209,7 +215,7 @@ void xmrig::RxDataset::allocate(bool hugePages, bool oneGbPages) return; } - m_memory = new VirtualMemory(maxSize(), hugePages, oneGbPages, false, m_node); + m_memory = new VirtualMemory(maxSize(), hugePages, oneGbPages, false, m_node, VirtualMemory::kDefaultHugePageSize); if (m_memory->isOneGbPages()) { m_scratchpadOffset = maxSize() + RANDOMX_CACHE_MAX_SIZE; diff --git a/src/crypto/rx/RxVm.cpp b/src/crypto/rx/RxVm.cpp index acaa25e05..6ffe210d4 100644 --- a/src/crypto/rx/RxVm.cpp +++ b/src/crypto/rx/RxVm.cpp @@ -29,9 +29,17 @@ randomx_vm *xmrig::RxVm::create(RxDataset *dataset, uint8_t *scratchpad, bool so { int flags = 0; + // On RISC-V, force software AES path even if CPU reports AES capability. + // The RandomX portable intrinsics will throw at runtime when HAVE_AES is not defined + // for this architecture. Until native AES intrinsics are wired for RISC-V, avoid + // setting HARD_AES to prevent "Platform doesn't support hardware AES" aborts. +# ifndef XMRIG_RISCV if (!softAes) { flags |= RANDOMX_FLAG_HARD_AES; } +# else + (void)softAes; // unused on RISC-V to force soft AES +# endif if (dataset->get()) { flags |= RANDOMX_FLAG_FULL_MEM; diff --git a/src/net/JobResults.cpp b/src/net/JobResults.cpp index 19a1dc43c..57e4a8db0 100644 --- a/src/net/JobResults.cpp +++ b/src/net/JobResults.cpp @@ -115,7 +115,7 @@ static inline void checkHash(const JobBundle &bundle, std::vector &re static void getResults(JobBundle &bundle, std::vector &results, uint32_t &errors, bool hwAES) { const auto &algorithm = bundle.job.algorithm(); - auto memory = new VirtualMemory(algorithm.l3(), false, false, false); + auto memory = new VirtualMemory(algorithm.l3(), false, false, false, 0, VirtualMemory::kDefaultHugePageSize); alignas(16) uint8_t hash[32]{ 0 }; if (algorithm.family() == Algorithm::RANDOM_X) { diff --git a/src/version.h b/src/version.h index 9176a3d95..805fe727c 100644 --- a/src/version.h +++ b/src/version.h @@ -2,18 +2,7 @@ * Copyright (c) 2018-2025 SChernykh * Copyright (c) 2016-2025 XMRig , * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . 
+ * SPDX-License-Identifier: GPL-3.0-or-later */ #ifndef XMRIG_VERSION_H @@ -22,18 +11,20 @@ #define APP_ID "xmrig" #define APP_NAME "XMRig" #define APP_DESC "XMRig miner" -#define APP_VERSION "6.24.0" +#define APP_VERSION "6.25.0-dev" #define APP_DOMAIN "xmrig.com" #define APP_SITE "www.xmrig.com" #define APP_COPYRIGHT "Copyright (C) 2016-2025 xmrig.com" #define APP_KIND "miner" #define APP_VER_MAJOR 6 -#define APP_VER_MINOR 24 +#define APP_VER_MINOR 25 #define APP_VER_PATCH 0 #ifdef _MSC_VER -# if (_MSC_VER >= 1930) +# if (_MSC_VER >= 1950) +# define MSVC_VERSION 2026 +# elif (_MSC_VER >=1930 && _MSC_VER < 1950) # define MSVC_VERSION 2022 # elif (_MSC_VER >= 1920 && _MSC_VER < 1930) # define MSVC_VERSION 2019 @@ -64,6 +55,10 @@ # define APP_OS "Linux" #elif defined XMRIG_OS_FREEBSD # define APP_OS "FreeBSD" +#elif defined XMRIG_OS_OPENBSD +# define APP_OS "OpenBSD" +#elif defined XMRIG_OS_HAIKU +# define APP_OS "Haiku" #else # define APP_OS "Unknown OS" #endif @@ -73,6 +68,8 @@ #ifdef XMRIG_ARM # define APP_ARCH "ARMv" STR2(XMRIG_ARM) +#elif defined(XMRIG_RISCV) +# define APP_ARCH "RISC-V" #else # if defined(__x86_64__) || defined(__amd64__) || defined(_M_X64) || defined(_M_AMD64) # define APP_ARCH "x86-64"
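
Note on the jump that generateDatasetInitVectorRV64() writes at the end of the generated
block: the expression 0x6F | ((k & 0x7FE) << 20) | ((k & 0x800) << 9) | (k & 0xFF000) is
the RISC-V J-type encoding of "jal x0, k" with the imm[20] term omitted. A minimal
stand-alone sketch of the same bit packing follows; the encode_j() helper is illustrative
only and is not part of the patch.

    #include <cstdint>

    // Encode "jal x0, offset" (an unconditional jump).
    // J-type immediate layout: imm[20] -> bit 31, imm[10:1] -> bits 30:21,
    //                          imm[11] -> bit 20, imm[19:12] -> bits 19:12.
    static uint32_t encode_j(int32_t offset)
    {
        const uint32_t k = static_cast<uint32_t>(offset);
        return 0x6F                        /* opcode JAL, rd = x0 */
            | ((k & 0x0007FE) << 20)       /* imm[10:1]  */
            | ((k & 0x0000800) << 9)       /* imm[11]    */
            |  (k & 0x00FF000)             /* imm[19:12] */
            | ((k & 0x0100000) << 11);     /* imm[20]    */
    }

The patch can drop the imm[20] term because, for the default configuration, the forward
distance to randomx_riscv64_vector_sshash_generated_instructions_end stays well below the
J-type range.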