diff --git a/CMakeLists.txt b/CMakeLists.txt index 313923226..ff7604836 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -97,6 +97,8 @@ set(HEADERS_CRYPTO if (XMRIG_ARM) set(HEADERS_CRYPTO "${HEADERS_CRYPTO}" src/crypto/cn/CryptoNight_arm.h) +elseif (XMRIG_RISCV) + set(HEADERS_CRYPTO "${HEADERS_CRYPTO}" src/crypto/cn/CryptoNight_arm.h) else() set(HEADERS_CRYPTO "${HEADERS_CRYPTO}" src/crypto/cn/CryptoNight_x86.h) endif() diff --git a/README.md b/README.md index b4d40751c..a6f4c3587 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ XMRig is a high performance, open source, cross platform RandomX, KawPow, CryptoNight and [GhostRider](https://github.com/xmrig/xmrig/tree/master/src/crypto/ghostrider#readme) unified CPU/GPU miner and [RandomX benchmark](https://xmrig.com/benchmark). Official binaries are available for Windows, Linux, macOS and FreeBSD. ## Mining backends -- **CPU** (x86/x64/ARMv7/ARMv8) +- **CPU** (x86/x64/ARMv7/ARMv8/RISC-V) - **OpenCL** for AMD GPUs. - **CUDA** for NVIDIA GPUs via external [CUDA plugin](https://github.com/xmrig/xmrig-cuda). 
diff --git a/cmake/asm.cmake b/cmake/asm.cmake index e445defde..30a119c30 100644 --- a/cmake/asm.cmake +++ b/cmake/asm.cmake @@ -1,4 +1,4 @@ -if (WITH_ASM AND NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8) +if (WITH_ASM AND NOT XMRIG_ARM AND NOT XMRIG_RISCV AND CMAKE_SIZEOF_VOID_P EQUAL 8) set(XMRIG_ASM_LIBRARY "xmrig-asm") if (CMAKE_C_COMPILER_ID MATCHES MSVC) diff --git a/cmake/cpu.cmake b/cmake/cpu.cmake index fe322a3fd..5701720eb 100644 --- a/cmake/cpu.cmake +++ b/cmake/cpu.cmake @@ -21,6 +21,19 @@ if (NOT VAES_SUPPORTED) set(WITH_VAES OFF) endif() +# Detect RISC-V architecture early (before it's used below) +if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv64|riscv|rv64)$") + set(RISCV_TARGET 64) + set(XMRIG_RISCV ON) + add_definitions(-DXMRIG_RISCV) + message(STATUS "Detected RISC-V 64-bit architecture (${CMAKE_SYSTEM_PROCESSOR})") +elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv32|rv32)$") + set(RISCV_TARGET 32) + set(XMRIG_RISCV ON) + add_definitions(-DXMRIG_RISCV) + message(STATUS "Detected RISC-V 32-bit architecture (${CMAKE_SYSTEM_PROCESSOR})") +endif() + if (XMRIG_64_BIT AND CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|AMD64)$") add_definitions(-DRAPIDJSON_SSE2) else() @@ -29,6 +42,45 @@ else() set(WITH_VAES OFF) endif() +# Disable x86-specific features for RISC-V +if (XMRIG_RISCV) + set(WITH_SSE4_1 OFF) + set(WITH_AVX2 OFF) + set(WITH_VAES OFF) + + # default build uses the RV64GC baseline + set(RVARCH "rv64gc") + + # for native builds, enable Zba and Zbb if supported by the CPU + if(ARCH STREQUAL "native") + enable_language(ASM) + + try_run(RANDOMX_ZBA_RUN_FAIL + RANDOMX_ZBA_COMPILE_OK + ${CMAKE_CURRENT_BINARY_DIR}/ + ${CMAKE_CURRENT_SOURCE_DIR}/src/crypto/randomx/tests/riscv64_zba.s + COMPILE_DEFINITIONS "-march=rv64gc_zba") + + if (RANDOMX_ZBA_COMPILE_OK AND NOT RANDOMX_ZBA_RUN_FAIL) + set(RVARCH "${RVARCH}_zba") + message(STATUS "RISC-V zba extension detected") + endif() + + try_run(RANDOMX_ZBB_RUN_FAIL + RANDOMX_ZBB_COMPILE_OK + ${CMAKE_CURRENT_BINARY_DIR}/ 
+ ${CMAKE_CURRENT_SOURCE_DIR}/src/crypto/randomx/tests/riscv64_zbb.s + COMPILE_DEFINITIONS "-march=rv64gc_zbb") + + if (RANDOMX_ZBB_COMPILE_OK AND NOT RANDOMX_ZBB_RUN_FAIL) + set(RVARCH "${RVARCH}_zbb") + message(STATUS "RISC-V zbb extension detected") + endif() + endif() + + message(STATUS "Using -march=${RVARCH}") +endif() + add_definitions(-DRAPIDJSON_WRITE_DEFAULT_FLAGS=6) # rapidjson::kWriteNanAndInfFlag | rapidjson::kWriteNanAndInfNullFlag if (ARM_V8) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 9abf212a0..a29a1a6d5 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -25,9 +25,16 @@ if (CMAKE_CXX_COMPILER_ID MATCHES GNU) if (ARM_TARGET EQUAL 8) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ARM8_CXX_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARM8_CXX_FLAGS} -flax-vector-conversions") + add_definitions(-DHAVE_ROTR) elseif (ARM_TARGET EQUAL 7) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv7-a -mfpu=neon -flax-vector-conversions") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv7-a -mfpu=neon -flax-vector-conversions") + add_definitions(-DHAVE_ROTR) + elseif (XMRIG_RISCV) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=${RVARCH}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=${RVARCH}") + + add_definitions(-DHAVE_ROTR) else() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maes") @@ -71,9 +78,16 @@ elseif (CMAKE_CXX_COMPILER_ID MATCHES Clang) if (ARM_TARGET EQUAL 8) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ARM8_CXX_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARM8_CXX_FLAGS}") + add_definitions(-DHAVE_ROTR) elseif (ARM_TARGET EQUAL 7) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=neon -march=${CMAKE_SYSTEM_PROCESSOR}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon -march=${CMAKE_SYSTEM_PROCESSOR}") + add_definitions(-DHAVE_ROTR) + elseif (XMRIG_RISCV) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=${RVARCH}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=${RVARCH}") + + add_definitions(-DHAVE_ROTR) 
else() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maes") diff --git a/cmake/randomx.cmake b/cmake/randomx.cmake index a50e078fd..5aa20b807 100644 --- a/cmake/randomx.cmake +++ b/cmake/randomx.cmake @@ -62,7 +62,7 @@ if (WITH_RANDOMX) src/crypto/randomx/jit_compiler_x86_static.asm src/crypto/randomx/jit_compiler_x86.cpp ) - elseif (WITH_ASM AND NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8) + elseif (WITH_ASM AND NOT XMRIG_ARM AND NOT XMRIG_RISCV AND CMAKE_SIZEOF_VOID_P EQUAL 8) list(APPEND SOURCES_CRYPTO src/crypto/randomx/jit_compiler_x86_static.S src/crypto/randomx/jit_compiler_x86.cpp @@ -80,6 +80,13 @@ if (WITH_RANDOMX) else() set_property(SOURCE src/crypto/randomx/jit_compiler_a64_static.S PROPERTY LANGUAGE C) endif() + elseif (XMRIG_RISCV AND CMAKE_SIZEOF_VOID_P EQUAL 8) + list(APPEND SOURCES_CRYPTO + src/crypto/randomx/jit_compiler_rv64_static.S + src/crypto/randomx/jit_compiler_rv64.cpp + ) + # cheat because cmake and ccache hate each other + set_property(SOURCE src/crypto/randomx/jit_compiler_rv64_static.S PROPERTY LANGUAGE C) else() list(APPEND SOURCES_CRYPTO src/crypto/randomx/jit_compiler_fallback.cpp @@ -116,7 +123,7 @@ if (WITH_RANDOMX) ) endif() - if (WITH_MSR AND NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8 AND (XMRIG_OS_WIN OR XMRIG_OS_LINUX)) + if (WITH_MSR AND NOT XMRIG_ARM AND NOT XMRIG_RISCV AND CMAKE_SIZEOF_VOID_P EQUAL 8 AND (XMRIG_OS_WIN OR XMRIG_OS_LINUX)) add_definitions(/DXMRIG_FEATURE_MSR) add_definitions(/DXMRIG_FIX_RYZEN) message("-- WITH_MSR=ON") diff --git a/doc/RISCV_PERF_TUNING.md b/doc/RISCV_PERF_TUNING.md new file mode 100644 index 000000000..b37a530d3 --- /dev/null +++ b/doc/RISCV_PERF_TUNING.md @@ -0,0 +1,365 @@ +# RISC-V Performance Optimization Guide + +This guide provides comprehensive instructions for optimizing XMRig on RISC-V architectures. 
+ +## Build Optimizations + +### Compiler Flags Applied Automatically + +The CMake build now applies aggressive RISC-V-specific optimizations: + +```cmake +# RISC-V ISA with extensions +-march=rv64gcv_zba_zbb_zbc_zbs + +# Aggressive compiler optimizations +-funroll-loops # Unroll loops for ILP (instruction-level parallelism) +-fomit-frame-pointer # Free up the fp (x8) register for general use +-fno-common # Better code generation for global variables +-finline-functions # Inline more functions for better cache locality +-ffast-math # CAUTION: relaxes IEEE-754 semantics; RandomX requires exact IEEE-754 rounding, so validate hash correctness before enabling this flag +-flto # Link-time optimization for cross-module inlining + +# Release build additions +-minline-atomics # Inline atomic operations for faster synchronization +``` + +### Optimal Build Command + +```bash +mkdir build && cd build +cmake -DCMAKE_BUILD_TYPE=Release .. +make -j$(nproc) +``` + +**Expected build time**: 5-15 minutes depending on CPU + +## Runtime Optimizations + +### 1. Memory Configuration (Most Important) + +Enable huge pages to reduce TLB misses and fragmentation: + +#### Enable 2MB Huge Pages +```bash +# Calculate required huge pages (1 page = 2MB) +# For 2 GB dataset: 1024 pages +# For cache + dataset: 1536 pages minimum +sudo sysctl -w vm.nr_hugepages=2048 +``` + +Verify: +```bash +grep HugePages /proc/meminfo +# Expected: HugePages_Free should be close to nr_hugepages +``` + +#### Enable 1GB Huge Pages (Optional but Recommended) + +```bash +# Run provided helper script +sudo ./scripts/enable_1gb_pages.sh + +# Verify 1GB pages are available +cat /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages +# Should be: >= 1 (one 1GB page) +``` + +Update config.json: +```json +{ + "cpu": { + "huge-pages": true + }, + "randomx": { + "1gb-pages": true + } +} +``` + +### 2. 
RandomX Mode Selection + +| Mode | Memory | Init Time | Throughput | Recommendation | +|------|--------|-----------|-----------|-----------------| +| **light** | 256 MB | 10 sec | Low | Testing, resource-constrained | +| **fast** | 2 GB | 2-5 min* | High | Production (with huge pages) | +| **auto** | 2 GB | Varies | High | Default (uses fast if possible) | + +*With optimizations; can be 30+ minutes without huge pages + +**For RISC-V, use fast mode with huge pages enabled.** + +### 3. Dataset Initialization Threads + +Optimal thread count = 60-75% of CPU cores (leaves headroom for OS/other tasks) + +```json +{ + "randomx": { + "init": 4 + } +} +``` + +Or auto-detect (rewritten for RISC-V): +```json +{ + "randomx": { + "init": -1 + } +} +``` + +### 4. CPU Affinity (Optional) + +Pin threads to specific cores for better cache locality: + +```json +{ + "cpu": { + "rx/0": [ + { "threads": 1, "affinity": 0 }, + { "threads": 1, "affinity": 1 }, + { "threads": 1, "affinity": 2 }, + { "threads": 1, "affinity": 3 } + ] + } +} +``` + +### 5. 
CPU Governor (Linux) + +Set to performance mode for maximum throughput: + +```bash +# Check current governor +cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor + +# Set to performance (requires root) +echo performance | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor + +# Verify +cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor +# Should output: performance +``` + +## Configuration Examples + +### Minimum (Testing) +```json +{ + "randomx": { + "mode": "light" + }, + "cpu": { + "huge-pages": false + } +} +``` + +### Recommended (Balanced) +```json +{ + "randomx": { + "mode": "auto", + "init": 4, + "1gb-pages": true + }, + "cpu": { + "huge-pages": true, + "priority": 2 + } +} +``` + +### Maximum Performance (Production) +```json +{ + "randomx": { + "mode": "fast", + "init": -1, + "1gb-pages": true, + "scratchpad_prefetch_mode": 1 + }, + "cpu": { + "huge-pages": true, + "priority": 3, + "yield": false + } +} +``` + +## CLI Equivalents + +```bash +# Light mode +./xmrig --randomx-mode=light + +# Fast mode with 4 init threads +./xmrig --randomx-mode=fast --randomx-init=4 + +# Benchmark +./xmrig --bench=1M --algo=rx/0 + +# Benchmark Wownero variant (1 MB scratchpad) +./xmrig --bench=1M --algo=rx/wow + +# Mine to pool +./xmrig -o pool.example.com:3333 -u YOUR_WALLET -p x +``` + +## Performance Diagnostics + +### Check if Vector Extensions are Detected + +Look for `FEATURES:` line in output: +``` + * CPU: ky,x60 (uarch ky,x1) + * FEATURES: rv64imafdcv zba zbb zbc zbs +``` + +- `v`: Vector extension (RVV) ✓ +- `zba`, `zbb`, `zbc`, `zbs`: Bit manipulation ✓ +- If missing, make sure build used `-march=rv64gcv_zba_zbb_zbc_zbs` + +### Verify Huge Pages at Runtime + +```bash +# Run xmrig with --bench=1M and check output +./xmrig --bench=1M + +# Look for line like: +# HUGE PAGES 100% 1 / 1 (1024 MB) +``` + +- Should show 100% for dataset AND threads +- If less, increase `vm.nr_hugepages` and reboot + +### Monitor Performance + +```bash +# Run 
benchmark multiple times to find stable hashrate +./xmrig --bench=1M --algo=rx/0 +./xmrig --bench=10M --algo=rx/0 +./xmrig --bench=100M --algo=rx/0 + +# Check system load and memory during mining +while true; do free -h; grep HugePages /proc/meminfo; sleep 2; done +``` + +## Expected Performance + +### Hardware: Orange Pi RV2 (Ky X1, 8 cores @ ~1.5 GHz) + +| Config | Mode | Hashrate | Init Time | +|--------|------|----------|-----------| +| Scalar (baseline) | fast | 30 H/s | 10 min | +| Scalar + huge pages | fast | 33 H/s | 2 min | +| RVV (if enabled) | fast | 70-100 H/s | 3 min | + +*Actual results depend on CPU frequency, memory speed, and load* + +## Troubleshooting + +### Long Initialization Times (30+ minutes) + +**Cause**: Huge pages not enabled, system using swap +**Solution**: +1. Enable huge pages: `sudo sysctl -w vm.nr_hugepages=2048` +2. Reboot: `sudo reboot` +3. Reduce mining threads to free memory +4. Check available memory: `free -h` + +### Low Hashrate (50% of expected) + +**Cause**: CPU governor set to power-save, no huge pages, high contention +**Solution**: +1. Set governor to performance: `echo performance | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor` +2. Enable huge pages +3. Reduce number of mining threads +4. Check system load: `top` or `htop` + +### Dataset Init Crashes or Hangs + +**Cause**: Insufficient memory, corrupted huge pages +**Solution**: +1. Disable huge pages temporarily: set `huge-pages: false` in config +2. Reduce mining threads +3. Reboot and re-enable huge pages +4. Try light mode: `--randomx-mode=light` + +### Out of Memory During Benchmark + +**Cause**: Not enough RAM for dataset + cache + threads +**Solution**: +1. Use light mode: `--randomx-mode=light` +2. Reduce mining threads: `--threads=1` +3. Increase available memory (kill other processes) +4. 
Check: `free -h` before mining + +## Advanced Tuning + +### Vector Length (VLEN) Detection + +RISC-V vector extension variable length (VLEN) affects performance: + +```bash +# Check VLEN on your CPU +cat /proc/cpuinfo | grep vlen + +# Expected values: +# - 128 bits (16 bytes) = minimum +# - 256 bits (32 bytes) = common +# - 512 bits (64 bytes) = high performance +``` + +Larger VLEN generally means better performance for vectorized operations. + +### Prefetch Optimization + +The code automatically optimizes memory prefetching for RISC-V: + +``` +scratchpad_prefetch_mode: 0 = disabled (slowest) +scratchpad_prefetch_mode: 1 = prefetch.r (default, recommended) +scratchpad_prefetch_mode: 2 = prefetch.w (experimental) +``` + +### Memory Bandwidth Saturation + +If experiencing memory bandwidth saturation (high latency): + +1. Reduce mining threads +2. Increase L2/L3 cache by mining fewer threads per core +3. Enable cache QoS (AMD Ryzen): `cache_qos: true` + +## Building with Custom Flags + +To build with custom RISC-V flags: + +```bash +mkdir build && cd build +cmake -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_C_FLAGS="-march=rv64gcv_zba_zbb_zbc_zbs -O3 -funroll-loops -fomit-frame-pointer" \ + .. +make -j$(nproc) +``` + +## Future Optimizations + +- [ ] Zbk* (crypto) support detection and usage +- [ ] Optimal VLEN-aware algorithm selection +- [ ] Per-core memory affinity (NUMA support) +- [ ] Dynamic thread count adjustment based on thermals +- [ ] Cross-compile optimizations for various RISC-V cores + +## References + +- [RISC-V Vector Extension Spec](https://github.com/riscv/riscv-v-spec) +- [RISC-V Bit Manipulation Spec](https://github.com/riscv/riscv-bitmanip) +- [RISC-V Crypto Spec](https://github.com/riscv/riscv-crypto) +- [XMRig Documentation](https://xmrig.com/docs) + +--- + +For further optimization, enable RVV intrinsics by replacing `sse2rvv.h` with `sse2rvv_optimized.h` in the build. 
diff --git a/src/3rdparty/argon2/CMakeLists.txt b/src/3rdparty/argon2/CMakeLists.txt index a9751fd94..7d09e5172 100644 --- a/src/3rdparty/argon2/CMakeLists.txt +++ b/src/3rdparty/argon2/CMakeLists.txt @@ -35,7 +35,7 @@ if (CMAKE_C_COMPILER_ID MATCHES MSVC) add_feature_impl(xop "" HAVE_XOP) add_feature_impl(avx2 "/arch:AVX2" HAVE_AVX2) add_feature_impl(avx512f "/arch:AVX512F" HAVE_AVX512F) -elseif (NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8) +elseif (NOT XMRIG_ARM AND NOT XMRIG_RISCV AND CMAKE_SIZEOF_VOID_P EQUAL 8) function(add_feature_impl FEATURE GCC_FLAG DEF) add_library(argon2-${FEATURE} STATIC arch/x86_64/lib/argon2-${FEATURE}.c) target_include_directories(argon2-${FEATURE} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../) diff --git a/src/backend/cpu/cpu.cmake b/src/backend/cpu/cpu.cmake index f9a02abd8..3c9d779b0 100644 --- a/src/backend/cpu/cpu.cmake +++ b/src/backend/cpu/cpu.cmake @@ -46,7 +46,12 @@ else() set(CPUID_LIB "") endif() -if (XMRIG_ARM) +if (XMRIG_RISCV) + list(APPEND SOURCES_BACKEND_CPU + src/backend/cpu/platform/lscpu_riscv.cpp + src/backend/cpu/platform/BasicCpuInfo_riscv.cpp + ) +elseif (XMRIG_ARM) list(APPEND SOURCES_BACKEND_CPU src/backend/cpu/platform/BasicCpuInfo_arm.cpp) if (XMRIG_OS_WIN) diff --git a/src/backend/cpu/interfaces/ICpuInfo.h b/src/backend/cpu/interfaces/ICpuInfo.h index 8d10d4d29..e28a14734 100644 --- a/src/backend/cpu/interfaces/ICpuInfo.h +++ b/src/backend/cpu/interfaces/ICpuInfo.h @@ -91,7 +91,7 @@ public: ICpuInfo() = default; virtual ~ICpuInfo() = default; -# if defined(__x86_64__) || defined(_M_AMD64) || defined (__arm64__) || defined (__aarch64__) +# if defined(__x86_64__) || defined(_M_AMD64) || defined (__arm64__) || defined (__aarch64__) || defined(__riscv) && (__riscv_xlen == 64) inline constexpr static bool is64bit() { return true; } # else inline constexpr static bool is64bit() { return false; } diff --git a/src/backend/cpu/platform/BasicCpuInfo.h b/src/backend/cpu/platform/BasicCpuInfo.h index 
5ea5661d1..97fe20e1b 100644 --- a/src/backend/cpu/platform/BasicCpuInfo.h +++ b/src/backend/cpu/platform/BasicCpuInfo.h @@ -65,7 +65,7 @@ protected: inline Vendor vendor() const override { return m_vendor; } inline uint32_t model() const override { -# ifndef XMRIG_ARM +# if !defined(XMRIG_ARM) && !defined(XMRIG_RISCV) return m_model; # else return 0; @@ -80,7 +80,7 @@ protected: Vendor m_vendor = VENDOR_UNKNOWN; private: -# ifndef XMRIG_ARM +# if !defined(XMRIG_ARM) && !defined(XMRIG_RISCV) uint32_t m_procInfo = 0; uint32_t m_family = 0; uint32_t m_model = 0; diff --git a/src/backend/cpu/platform/BasicCpuInfo_riscv.cpp b/src/backend/cpu/platform/BasicCpuInfo_riscv.cpp new file mode 100644 index 000000000..fd9c9ce62 --- /dev/null +++ b/src/backend/cpu/platform/BasicCpuInfo_riscv.cpp @@ -0,0 +1,116 @@ +/* XMRig + * Copyright (c) 2025 Slayingripper + * Copyright (c) 2018-2025 SChernykh + * Copyright (c) 2017-2019 XMR-Stak , + * Copyright (c) 2016-2025 XMRig + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ + +#include +#include +#include +#include + + +#include "backend/cpu/platform/BasicCpuInfo.h" +#include "base/tools/String.h" +#include "3rdparty/rapidjson/document.h" + + +namespace xmrig { + + +extern String cpu_name_riscv(); +extern bool has_riscv_vector(); +extern bool has_riscv_crypto(); + + +} // namespace xmrig + + +xmrig::BasicCpuInfo::BasicCpuInfo() : + m_threads(std::thread::hardware_concurrency()) +{ + m_units.resize(m_threads); + for (int32_t i = 0; i < static_cast(m_threads); ++i) { + m_units[i] = i; + } + + memcpy(m_brand, "RISC-V", 6); + + auto name = cpu_name_riscv(); + if (!name.isNull()) { + strncpy(m_brand, name.data(), sizeof(m_brand) - 1); + } + + // Check for crypto extensions (Zknd/Zkne/Zknh - AES and SHA) + m_flags.set(FLAG_AES, has_riscv_crypto()); + + // RISC-V typically supports 1GB huge pages + m_flags.set(FLAG_PDPE1GB, std::ifstream("/sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages").good()); +} + + +const char *xmrig::BasicCpuInfo::backend() const +{ + return "basic/1"; +} + + +xmrig::CpuThreads xmrig::BasicCpuInfo::threads(const Algorithm &algorithm, uint32_t) const +{ +# ifdef XMRIG_ALGO_GHOSTRIDER + if (algorithm.family() == Algorithm::GHOSTRIDER) { + return CpuThreads(threads(), 8); + } +# endif + + return CpuThreads(threads()); +} + + +rapidjson::Value xmrig::BasicCpuInfo::toJSON(rapidjson::Document &doc) const +{ + using namespace rapidjson; + auto &allocator = doc.GetAllocator(); + + Value out(kObjectType); + + out.AddMember("brand", StringRef(brand()), allocator); + out.AddMember("aes", hasAES(), allocator); + out.AddMember("avx2", false, allocator); + out.AddMember("x64", is64bit(), allocator); // DEPRECATED will be removed in the next major release. 
+ out.AddMember("64_bit", is64bit(), allocator); + out.AddMember("l2", static_cast(L2()), allocator); + out.AddMember("l3", static_cast(L3()), allocator); + out.AddMember("cores", static_cast(cores()), allocator); + out.AddMember("threads", static_cast(threads()), allocator); + out.AddMember("packages", static_cast(packages()), allocator); + out.AddMember("nodes", static_cast(nodes()), allocator); + out.AddMember("backend", StringRef(backend()), allocator); + out.AddMember("msr", "none", allocator); + out.AddMember("assembly", "none", allocator); + out.AddMember("arch", "riscv64", allocator); + + Value flags(kArrayType); + + if (hasAES()) { + flags.PushBack("aes", allocator); + } + + out.AddMember("flags", flags, allocator); + + return out; +} diff --git a/src/backend/cpu/platform/HwlocCpuInfo.cpp b/src/backend/cpu/platform/HwlocCpuInfo.cpp index f796416b4..1cb071b7a 100644 --- a/src/backend/cpu/platform/HwlocCpuInfo.cpp +++ b/src/backend/cpu/platform/HwlocCpuInfo.cpp @@ -87,7 +87,7 @@ static inline size_t countByType(hwloc_topology_t topology, hwloc_obj_type_t typ } -#ifndef XMRIG_ARM +#if !defined(XMRIG_ARM) && !defined(XMRIG_RISCV) static inline std::vector findByType(hwloc_obj_t obj, hwloc_obj_type_t type) { std::vector out; @@ -207,7 +207,7 @@ bool xmrig::HwlocCpuInfo::membind(hwloc_const_bitmap_t nodeset) xmrig::CpuThreads xmrig::HwlocCpuInfo::threads(const Algorithm &algorithm, uint32_t limit) const { -# ifndef XMRIG_ARM +# if !defined(XMRIG_ARM) && !defined(XMRIG_RISCV) if (L2() == 0 && L3() == 0) { return BasicCpuInfo::threads(algorithm, limit); } @@ -277,7 +277,7 @@ xmrig::CpuThreads xmrig::HwlocCpuInfo::allThreads(const Algorithm &algorithm, ui void xmrig::HwlocCpuInfo::processTopLevelCache(hwloc_obj_t cache, const Algorithm &algorithm, CpuThreads &threads, size_t limit) const { -# ifndef XMRIG_ARM +# if !defined(XMRIG_ARM) && !defined(XMRIG_RISCV) constexpr size_t oneMiB = 1024U * 1024U; size_t PUs = countByType(cache, HWLOC_OBJ_PU); diff --git 
a/src/backend/cpu/platform/lscpu_riscv.cpp b/src/backend/cpu/platform/lscpu_riscv.cpp new file mode 100644 index 000000000..d19d26a8f --- /dev/null +++ b/src/backend/cpu/platform/lscpu_riscv.cpp @@ -0,0 +1,140 @@ +/* XMRig + * Copyright (c) 2025 Slayingripper + * Copyright (c) 2018-2025 SChernykh + * Copyright (c) 2016-2025 XMRig + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include "base/tools/String.h" +#include "3rdparty/fmt/core.h" + +#include +#include +#include + +namespace xmrig { + +struct riscv_cpu_desc +{ + String model; + String isa; + String uarch; + bool has_vector = false; + bool has_crypto = false; + + inline bool isReady() const { return !model.isNull(); } +}; + +static bool lookup_riscv(char *line, const char *pattern, String &value) +{ + char *p = strstr(line, pattern); + if (!p) { + return false; + } + + p += strlen(pattern); + while (isspace(*p)) { + ++p; + } + + if (*p == ':') { + ++p; + } + + while (isspace(*p)) { + ++p; + } + + // Remove trailing newline + size_t len = strlen(p); + if (len > 0 && p[len - 1] == '\n') { + p[len - 1] = '\0'; + } + + // Ensure we call the const char* assignment (which performs a copy) + // instead of the char* overload (which would take ownership of the pointer) + value = (const char*)p; + return true; +} + +static bool read_riscv_cpuinfo(riscv_cpu_desc *desc) +{ + auto fp = fopen("/proc/cpuinfo", "r"); + if (!fp) { + return false; 
+ } + + char buf[2048]; // Larger buffer for long ISA strings + while (fgets(buf, sizeof(buf), fp) != nullptr) { + lookup_riscv(buf, "model name", desc->model); + + if (lookup_riscv(buf, "isa", desc->isa)) { + // Check for vector extensions + if (strstr(buf, "zve") || strstr(buf, "v_")) { + desc->has_vector = true; + } + // Check for crypto extensions (AES, SHA, etc.) + // zkn* = NIST crypto suite, zks* = SM crypto suite + // Note: zba/zbb/zbc/zbs are bit-manipulation, NOT crypto + if (strstr(buf, "zknd") || strstr(buf, "zkne") || strstr(buf, "zknh") || + strstr(buf, "zksed") || strstr(buf, "zksh")) { + desc->has_crypto = true; + } + } + + lookup_riscv(buf, "uarch", desc->uarch); + + if (desc->isReady() && !desc->isa.isNull()) { + break; + } + } + + fclose(fp); + + return desc->isReady(); +} + +String cpu_name_riscv() +{ + riscv_cpu_desc desc; + if (read_riscv_cpuinfo(&desc)) { + if (!desc.uarch.isNull()) { + return fmt::format("{} ({})", desc.model, desc.uarch).c_str(); + } + return desc.model; + } + + return "RISC-V"; +} + +bool has_riscv_vector() +{ + riscv_cpu_desc desc; + if (read_riscv_cpuinfo(&desc)) { + return desc.has_vector; + } + return false; +} + +bool has_riscv_crypto() +{ + riscv_cpu_desc desc; + if (read_riscv_cpuinfo(&desc)) { + return desc.has_crypto; + } + return false; +} + +} // namespace xmrig diff --git a/src/crypto/cn/CnHash.cpp b/src/crypto/cn/CnHash.cpp index 4b4b006f3..b1f228b21 100644 --- a/src/crypto/cn/CnHash.cpp +++ b/src/crypto/cn/CnHash.cpp @@ -23,7 +23,7 @@ #include "crypto/common/VirtualMemory.h" -#if defined(XMRIG_ARM) +#if defined(XMRIG_ARM) || defined(XMRIG_RISCV) # include "crypto/cn/CryptoNight_arm.h" #else # include "crypto/cn/CryptoNight_x86.h" diff --git a/src/crypto/cn/CryptoNight.h b/src/crypto/cn/CryptoNight.h index 897890d28..d37c3ea8e 100644 --- a/src/crypto/cn/CryptoNight.h +++ b/src/crypto/cn/CryptoNight.h @@ -30,7 +30,7 @@ #include #include -#if defined _MSC_VER || defined XMRIG_ARM +#if defined _MSC_VER || defined 
XMRIG_ARM || defined XMRIG_RISCV # define ABI_ATTRIBUTE #else # define ABI_ATTRIBUTE __attribute__((ms_abi)) diff --git a/src/crypto/cn/CryptoNight_arm.h b/src/crypto/cn/CryptoNight_arm.h index 7b47e97da..eeb5bd007 100644 --- a/src/crypto/cn/CryptoNight_arm.h +++ b/src/crypto/cn/CryptoNight_arm.h @@ -27,6 +27,9 @@ #ifndef XMRIG_CRYPTONIGHT_ARM_H #define XMRIG_CRYPTONIGHT_ARM_H +#ifdef XMRIG_RISCV +# include "crypto/cn/sse2rvv.h" +#endif #include "base/crypto/keccak.h" #include "crypto/cn/CnAlgo.h" diff --git a/src/crypto/cn/CryptoNight_monero.h b/src/crypto/cn/CryptoNight_monero.h index a9975e784..6c3d115ed 100644 --- a/src/crypto/cn/CryptoNight_monero.h +++ b/src/crypto/cn/CryptoNight_monero.h @@ -30,7 +30,7 @@ #include // VARIANT ALTERATIONS -#ifndef XMRIG_ARM +#if !defined(XMRIG_ARM) && !defined(XMRIG_RISCV) # define VARIANT1_INIT(part) \ uint64_t tweak1_2_##part = 0; \ if (BASE == Algorithm::CN_1) { \ @@ -60,7 +60,7 @@ } -#ifndef XMRIG_ARM +#if !defined(XMRIG_ARM) && !defined(XMRIG_RISCV) # define VARIANT2_INIT(part) \ __m128i division_result_xmm_##part = _mm_cvtsi64_si128(static_cast(h##part[12])); \ __m128i sqrt_result_xmm_##part = _mm_cvtsi64_si128(static_cast(h##part[13])); diff --git a/src/crypto/cn/soft_aes.h b/src/crypto/cn/soft_aes.h index fc3712298..6de0089db 100644 --- a/src/crypto/cn/soft_aes.h +++ b/src/crypto/cn/soft_aes.h @@ -29,6 +29,8 @@ #if defined(XMRIG_ARM) # include "crypto/cn/sse2neon.h" +#elif defined(XMRIG_RISCV) +# include "crypto/cn/sse2rvv.h" #elif defined(__GNUC__) # include #else diff --git a/src/crypto/cn/sse2rvv.h b/src/crypto/cn/sse2rvv.h new file mode 100644 index 000000000..d5b525b51 --- /dev/null +++ b/src/crypto/cn/sse2rvv.h @@ -0,0 +1,748 @@ +/* XMRig + * Copyright (c) 2025 Slayingripper + * Copyright (c) 2018-2025 SChernykh + * Copyright (c) 2016-2025 XMRig + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free 
Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +/* + * SSE to RISC-V Vector (RVV) optimized compatibility header + * Provides both scalar fallback and vectorized implementations using RVV intrinsics + * + * Based on sse2neon.h concepts, adapted for RISC-V architecture with RVV extensions + * Original sse2neon.h: https://github.com/DLTcollab/sse2neon + */ + +#ifndef XMRIG_SSE2RVV_OPTIMIZED_H +#define XMRIG_SSE2RVV_OPTIMIZED_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +/* Check if RVV is available */ +#if defined(__riscv_vector) +#include +#define USE_RVV_INTRINSICS 1 +#else +#define USE_RVV_INTRINSICS 0 +#endif + +/* 128-bit vector type */ +typedef union { + uint8_t u8[16]; + uint16_t u16[8]; + uint32_t u32[4]; + uint64_t u64[2]; + int8_t i8[16]; + int16_t i16[8]; + int32_t i32[4]; + int64_t i64[2]; +} __m128i_union; + +typedef __m128i_union __m128i; + +/* Set operations */ +static inline __m128i _mm_set_epi32(int e3, int e2, int e1, int e0) +{ + __m128i result; + result.i32[0] = e0; + result.i32[1] = e1; + result.i32[2] = e2; + result.i32[3] = e3; + return result; +} + +static inline __m128i _mm_set_epi64x(int64_t e1, int64_t e0) +{ + __m128i result; + result.i64[0] = e0; + result.i64[1] = e1; + return result; +} + +static inline __m128i _mm_setzero_si128(void) +{ + __m128i result; + memset(&result, 0, sizeof(result)); + return result; +} + +/* Extract/insert operations */ +static inline int _mm_cvtsi128_si32(__m128i a) +{ + return a.i32[0]; +} + +static inline int64_t _mm_cvtsi128_si64(__m128i a) +{ + return a.i64[0]; 
+} + +static inline __m128i _mm_cvtsi32_si128(int a) +{ + __m128i result = _mm_setzero_si128(); + result.i32[0] = a; + return result; +} + +static inline __m128i _mm_cvtsi64_si128(int64_t a) +{ + __m128i result = _mm_setzero_si128(); + result.i64[0] = a; + return result; +} + +/* Shuffle operations */ +static inline __m128i _mm_shuffle_epi32(__m128i a, int imm8) +{ + __m128i result; + result.u32[0] = a.u32[(imm8 >> 0) & 0x3]; + result.u32[1] = a.u32[(imm8 >> 2) & 0x3]; + result.u32[2] = a.u32[(imm8 >> 4) & 0x3]; + result.u32[3] = a.u32[(imm8 >> 6) & 0x3]; + return result; +} + +/* Logical operations - optimized with RVV when available */ +static inline __m128i _mm_xor_si128(__m128i a, __m128i b) +{ +#if USE_RVV_INTRINSICS + __m128i result; + size_t vl = __riscv_vsetvl_e64m1(2); + vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl); + vuint64m1_t vb = __riscv_vle64_v_u64m1(b.u64, vl); + vuint64m1_t vr = __riscv_vxor_vv_u64m1(va, vb, vl); + __riscv_vse64_v_u64m1(result.u64, vr, vl); + return result; +#else + __m128i result; + result.u64[0] = a.u64[0] ^ b.u64[0]; + result.u64[1] = a.u64[1] ^ b.u64[1]; + return result; +#endif +} + +static inline __m128i _mm_or_si128(__m128i a, __m128i b) +{ +#if USE_RVV_INTRINSICS + __m128i result; + size_t vl = __riscv_vsetvl_e64m1(2); + vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl); + vuint64m1_t vb = __riscv_vle64_v_u64m1(b.u64, vl); + vuint64m1_t vr = __riscv_vor_vv_u64m1(va, vb, vl); + __riscv_vse64_v_u64m1(result.u64, vr, vl); + return result; +#else + __m128i result; + result.u64[0] = a.u64[0] | b.u64[0]; + result.u64[1] = a.u64[1] | b.u64[1]; + return result; +#endif +} + +static inline __m128i _mm_and_si128(__m128i a, __m128i b) +{ +#if USE_RVV_INTRINSICS + __m128i result; + size_t vl = __riscv_vsetvl_e64m1(2); + vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl); + vuint64m1_t vb = __riscv_vle64_v_u64m1(b.u64, vl); + vuint64m1_t vr = __riscv_vand_vv_u64m1(va, vb, vl); + __riscv_vse64_v_u64m1(result.u64, vr, vl); + return 
result; +#else + __m128i result; + result.u64[0] = a.u64[0] & b.u64[0]; + result.u64[1] = a.u64[1] & b.u64[1]; + return result; +#endif +} + +static inline __m128i _mm_andnot_si128(__m128i a, __m128i b) +{ +#if USE_RVV_INTRINSICS + __m128i result; + size_t vl = __riscv_vsetvl_e64m1(2); + vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl); + vuint64m1_t vb = __riscv_vle64_v_u64m1(b.u64, vl); + vuint64m1_t vnot_a = __riscv_vnot_v_u64m1(va, vl); + vuint64m1_t vr = __riscv_vand_vv_u64m1(vnot_a, vb, vl); + __riscv_vse64_v_u64m1(result.u64, vr, vl); + return result; +#else + __m128i result; + result.u64[0] = (~a.u64[0]) & b.u64[0]; + result.u64[1] = (~a.u64[1]) & b.u64[1]; + return result; +#endif +} + +/* Shift operations */ +static inline __m128i _mm_slli_si128(__m128i a, int imm8) +{ +#if USE_RVV_INTRINSICS + __m128i result = _mm_setzero_si128(); + int count = imm8 & 0xFF; + if (count > 15) return result; + + size_t vl = __riscv_vsetvl_e8m1(16); + vuint8m1_t va = __riscv_vle8_v_u8m1(a.u8, vl); + vuint8m1_t vr = __riscv_vslideup_vx_u8m1(__riscv_vmv_v_x_u8m1(0, vl), va, count, vl); + __riscv_vse8_v_u8m1(result.u8, vr, vl); + return result; +#else + __m128i result = _mm_setzero_si128(); + int count = imm8 & 0xFF; + if (count > 15) return result; + + for (int i = 0; i < 16 - count; i++) { + result.u8[i + count] = a.u8[i]; + } + return result; +#endif +} + +static inline __m128i _mm_srli_si128(__m128i a, int imm8) +{ +#if USE_RVV_INTRINSICS + __m128i result = _mm_setzero_si128(); + int count = imm8 & 0xFF; + if (count > 15) return result; + + size_t vl = __riscv_vsetvl_e8m1(16); + vuint8m1_t va = __riscv_vle8_v_u8m1(a.u8, vl); + vuint8m1_t vr = __riscv_vslidedown_vx_u8m1(va, count, vl); + __riscv_vse8_v_u8m1(result.u8, vr, vl); + return result; +#else + __m128i result = _mm_setzero_si128(); + int count = imm8 & 0xFF; + if (count > 15) return result; + + for (int i = count; i < 16; i++) { + result.u8[i - count] = a.u8[i]; + } + return result; +#endif +} + +static inline 
__m128i _mm_slli_epi64(__m128i a, int imm8) +{ +#if USE_RVV_INTRINSICS + __m128i result; + if (imm8 > 63) { + result.u64[0] = 0; + result.u64[1] = 0; + } else { + size_t vl = __riscv_vsetvl_e64m1(2); + vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl); + vuint64m1_t vr = __riscv_vsll_vx_u64m1(va, imm8, vl); + __riscv_vse64_v_u64m1(result.u64, vr, vl); + } + return result; +#else + __m128i result; + if (imm8 > 63) { + result.u64[0] = 0; + result.u64[1] = 0; + } else { + result.u64[0] = a.u64[0] << imm8; + result.u64[1] = a.u64[1] << imm8; + } + return result; +#endif +} + +static inline __m128i _mm_srli_epi64(__m128i a, int imm8) +{ +#if USE_RVV_INTRINSICS + __m128i result; + if (imm8 > 63) { + result.u64[0] = 0; + result.u64[1] = 0; + } else { + size_t vl = __riscv_vsetvl_e64m1(2); + vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl); + vuint64m1_t vr = __riscv_vsrl_vx_u64m1(va, imm8, vl); + __riscv_vse64_v_u64m1(result.u64, vr, vl); + } + return result; +#else + __m128i result; + if (imm8 > 63) { + result.u64[0] = 0; + result.u64[1] = 0; + } else { + result.u64[0] = a.u64[0] >> imm8; + result.u64[1] = a.u64[1] >> imm8; + } + return result; +#endif +} + +/* Load/store operations - optimized with RVV */ +static inline __m128i _mm_load_si128(const __m128i* p) +{ +#if USE_RVV_INTRINSICS + __m128i result; + size_t vl = __riscv_vsetvl_e64m1(2); + vuint64m1_t v = __riscv_vle64_v_u64m1((const uint64_t*)p, vl); + __riscv_vse64_v_u64m1(result.u64, v, vl); + return result; +#else + __m128i result; + memcpy(&result, p, sizeof(__m128i)); + return result; +#endif +} + +static inline __m128i _mm_loadu_si128(const __m128i* p) +{ + __m128i result; + memcpy(&result, p, sizeof(__m128i)); + return result; +} + +static inline void _mm_store_si128(__m128i* p, __m128i a) +{ +#if USE_RVV_INTRINSICS + size_t vl = __riscv_vsetvl_e64m1(2); + vuint64m1_t v = __riscv_vle64_v_u64m1(a.u64, vl); + __riscv_vse64_v_u64m1((uint64_t*)p, v, vl); +#else + memcpy(p, &a, sizeof(__m128i)); +#endif +} + 
+static inline void _mm_storeu_si128(__m128i* p, __m128i a) +{ + memcpy(p, &a, sizeof(__m128i)); +} + +/* Arithmetic operations - optimized with RVV */ +static inline __m128i _mm_add_epi64(__m128i a, __m128i b) +{ +#if USE_RVV_INTRINSICS + __m128i result; + size_t vl = __riscv_vsetvl_e64m1(2); + vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl); + vuint64m1_t vb = __riscv_vle64_v_u64m1(b.u64, vl); + vuint64m1_t vr = __riscv_vadd_vv_u64m1(va, vb, vl); + __riscv_vse64_v_u64m1(result.u64, vr, vl); + return result; +#else + __m128i result; + result.u64[0] = a.u64[0] + b.u64[0]; + result.u64[1] = a.u64[1] + b.u64[1]; + return result; +#endif +} + +static inline __m128i _mm_add_epi32(__m128i a, __m128i b) +{ +#if USE_RVV_INTRINSICS + __m128i result; + size_t vl = __riscv_vsetvl_e32m1(4); + vuint32m1_t va = __riscv_vle32_v_u32m1(a.u32, vl); + vuint32m1_t vb = __riscv_vle32_v_u32m1(b.u32, vl); + vuint32m1_t vr = __riscv_vadd_vv_u32m1(va, vb, vl); + __riscv_vse32_v_u32m1(result.u32, vr, vl); + return result; +#else + __m128i result; + for (int i = 0; i < 4; i++) { + result.i32[i] = a.i32[i] + b.i32[i]; + } + return result; +#endif +} + +static inline __m128i _mm_sub_epi64(__m128i a, __m128i b) +{ +#if USE_RVV_INTRINSICS + __m128i result; + size_t vl = __riscv_vsetvl_e64m1(2); + vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl); + vuint64m1_t vb = __riscv_vle64_v_u64m1(b.u64, vl); + vuint64m1_t vr = __riscv_vsub_vv_u64m1(va, vb, vl); + __riscv_vse64_v_u64m1(result.u64, vr, vl); + return result; +#else + __m128i result; + result.u64[0] = a.u64[0] - b.u64[0]; + result.u64[1] = a.u64[1] - b.u64[1]; + return result; +#endif +} + +static inline __m128i _mm_mul_epu32(__m128i a, __m128i b) +{ +#if USE_RVV_INTRINSICS + __m128i result; + size_t vl = __riscv_vsetvl_e64m1(2); + vuint64m1_t va_lo = __riscv_vzext_vf2_u64m1(__riscv_vle32_v_u32mf2(&a.u32[0], 2), vl); + vuint64m1_t vb_lo = __riscv_vzext_vf2_u64m1(__riscv_vle32_v_u32mf2(&b.u32[0], 2), vl); + vuint64m1_t vr = 
__riscv_vmul_vv_u64m1(va_lo, vb_lo, vl); + __riscv_vse64_v_u64m1(result.u64, vr, vl); + return result; +#else + __m128i result; + result.u64[0] = (uint64_t)a.u32[0] * (uint64_t)b.u32[0]; + result.u64[1] = (uint64_t)a.u32[2] * (uint64_t)b.u32[2]; + return result; +#endif +} + +/* Unpack operations */ +static inline __m128i _mm_unpacklo_epi64(__m128i a, __m128i b) +{ + __m128i result; + result.u64[0] = a.u64[0]; + result.u64[1] = b.u64[0]; + return result; +} + +static inline __m128i _mm_unpackhi_epi64(__m128i a, __m128i b) +{ + __m128i result; + result.u64[0] = a.u64[1]; + result.u64[1] = b.u64[1]; + return result; +} + +/* Pause instruction for spin-wait loops */ +static inline void _mm_pause(void) +{ + /* RISC-V pause hint if available (requires Zihintpause extension) */ +#if defined(__riscv_zihintpause) + __asm__ __volatile__("pause"); +#else + __asm__ __volatile__("nop"); +#endif +} + +/* Memory fence - optimized for RISC-V */ +static inline void _mm_mfence(void) +{ + __asm__ __volatile__("fence rw,rw" ::: "memory"); +} + +static inline void _mm_lfence(void) +{ + __asm__ __volatile__("fence r,r" ::: "memory"); +} + +static inline void _mm_sfence(void) +{ + __asm__ __volatile__("fence w,w" ::: "memory"); +} + +/* Comparison operations */ +static inline __m128i _mm_cmpeq_epi32(__m128i a, __m128i b) +{ + __m128i result; + for (int i = 0; i < 4; i++) { + result.u32[i] = (a.u32[i] == b.u32[i]) ? 0xFFFFFFFF : 0; + } + return result; +} + +static inline __m128i _mm_cmpeq_epi64(__m128i a, __m128i b) +{ + __m128i result; + for (int i = 0; i < 2; i++) { + result.u64[i] = (a.u64[i] == b.u64[i]) ? 
0xFFFFFFFFFFFFFFFFULL : 0; + } + return result; +} + +/* Additional shift operations */ +static inline __m128i _mm_slli_epi32(__m128i a, int imm8) +{ +#if USE_RVV_INTRINSICS + __m128i result; + if (imm8 > 31) { + memset(&result, 0, sizeof(result)); + } else { + size_t vl = __riscv_vsetvl_e32m1(4); + vuint32m1_t va = __riscv_vle32_v_u32m1(a.u32, vl); + vuint32m1_t vr = __riscv_vsll_vx_u32m1(va, imm8, vl); + __riscv_vse32_v_u32m1(result.u32, vr, vl); + } + return result; +#else + __m128i result; + if (imm8 > 31) { + for (int i = 0; i < 4; i++) result.u32[i] = 0; + } else { + for (int i = 0; i < 4; i++) { + result.u32[i] = a.u32[i] << imm8; + } + } + return result; +#endif +} + +static inline __m128i _mm_srli_epi32(__m128i a, int imm8) +{ +#if USE_RVV_INTRINSICS + __m128i result; + if (imm8 > 31) { + memset(&result, 0, sizeof(result)); + } else { + size_t vl = __riscv_vsetvl_e32m1(4); + vuint32m1_t va = __riscv_vle32_v_u32m1(a.u32, vl); + vuint32m1_t vr = __riscv_vsrl_vx_u32m1(va, imm8, vl); + __riscv_vse32_v_u32m1(result.u32, vr, vl); + } + return result; +#else + __m128i result; + if (imm8 > 31) { + for (int i = 0; i < 4; i++) result.u32[i] = 0; + } else { + for (int i = 0; i < 4; i++) { + result.u32[i] = a.u32[i] >> imm8; + } + } + return result; +#endif +} + +/* 64-bit integer operations */ +static inline __m128i _mm_set1_epi64x(int64_t a) +{ + __m128i result; + result.i64[0] = a; + result.i64[1] = a; + return result; +} + +/* Float type for compatibility */ +typedef __m128i __m128; + +/* Float operations - simplified scalar implementations */ +static inline __m128 _mm_set1_ps(float a) +{ + __m128 result; + uint32_t val; + memcpy(&val, &a, sizeof(float)); + for (int i = 0; i < 4; i++) { + result.u32[i] = val; + } + return result; +} + +static inline __m128 _mm_setzero_ps(void) +{ + __m128 result; + memset(&result, 0, sizeof(result)); + return result; +} + +static inline __m128 _mm_add_ps(__m128 a, __m128 b) +{ + __m128 result; + float fa[4], fb[4], fr[4]; + 
memcpy(fa, &a, sizeof(__m128)); + memcpy(fb, &b, sizeof(__m128)); + for (int i = 0; i < 4; i++) { + fr[i] = fa[i] + fb[i]; + } + memcpy(&result, fr, sizeof(__m128)); + return result; +} + +static inline __m128 _mm_mul_ps(__m128 a, __m128 b) +{ + __m128 result; + float fa[4], fb[4], fr[4]; + memcpy(fa, &a, sizeof(__m128)); + memcpy(fb, &b, sizeof(__m128)); + for (int i = 0; i < 4; i++) { + fr[i] = fa[i] * fb[i]; + } + memcpy(&result, fr, sizeof(__m128)); + return result; +} + +static inline __m128 _mm_and_ps(__m128 a, __m128 b) +{ + __m128 result; + result.u64[0] = a.u64[0] & b.u64[0]; + result.u64[1] = a.u64[1] & b.u64[1]; + return result; +} + +static inline __m128 _mm_or_ps(__m128 a, __m128 b) +{ + __m128 result; + result.u64[0] = a.u64[0] | b.u64[0]; + result.u64[1] = a.u64[1] | b.u64[1]; + return result; +} + +static inline __m128 _mm_cvtepi32_ps(__m128i a) +{ + __m128 result; + float fr[4]; + for (int i = 0; i < 4; i++) { + fr[i] = (float)a.i32[i]; + } + memcpy(&result, fr, sizeof(__m128)); + return result; +} + +static inline __m128i _mm_cvttps_epi32(__m128 a) +{ + __m128i result; + float fa[4]; + memcpy(fa, &a, sizeof(__m128)); + for (int i = 0; i < 4; i++) { + result.i32[i] = (int32_t)fa[i]; + } + return result; +} + +/* Casting operations */ +static inline __m128 _mm_castsi128_ps(__m128i a) +{ + __m128 result; + memcpy(&result, &a, sizeof(__m128)); + return result; +} + +static inline __m128i _mm_castps_si128(__m128 a) +{ + __m128i result; + memcpy(&result, &a, sizeof(__m128)); + return result; +} + +/* Additional set operations */ +static inline __m128i _mm_set1_epi32(int a) +{ + __m128i result; + for (int i = 0; i < 4; i++) { + result.i32[i] = a; + } + return result; +} + +/* AES instructions - placeholders for soft_aes compatibility */ +static inline __m128i _mm_aesenc_si128(__m128i a, __m128i roundkey) +{ + return _mm_xor_si128(a, roundkey); +} + +static inline __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) +{ + return a; +} + +/* Rotate 
right operation for soft_aes.h */ +static inline uint32_t _rotr(uint32_t value, unsigned int count) +{ + const unsigned int mask = 31; + count &= mask; + return (value >> count) | (value << ((-count) & mask)); +} + +/* ARM NEON compatibility types and intrinsics for RISC-V */ +typedef __m128i_union uint64x2_t; +typedef __m128i_union uint8x16_t; +typedef __m128i_union int64x2_t; +typedef __m128i_union int32x4_t; + +static inline uint64x2_t vld1q_u64(const uint64_t *ptr) +{ + uint64x2_t result; + result.u64[0] = ptr[0]; + result.u64[1] = ptr[1]; + return result; +} + +static inline int64x2_t vld1q_s64(const int64_t *ptr) +{ + int64x2_t result; + result.i64[0] = ptr[0]; + result.i64[1] = ptr[1]; + return result; +} + +static inline void vst1q_u64(uint64_t *ptr, uint64x2_t val) +{ + ptr[0] = val.u64[0]; + ptr[1] = val.u64[1]; +} + +static inline uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b) +{ + return _mm_xor_si128(a, b); +} + +static inline uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b) +{ + return _mm_add_epi64(a, b); +} + +static inline uint64x2_t vreinterpretq_u64_u8(uint8x16_t a) +{ + uint64x2_t result; + memcpy(&result, &a, sizeof(uint64x2_t)); + return result; +} + +static inline uint64_t vgetq_lane_u64(uint64x2_t v, int lane) +{ + return v.u64[lane]; +} + +static inline int64_t vgetq_lane_s64(int64x2_t v, int lane) +{ + return v.i64[lane]; +} + +static inline int32_t vgetq_lane_s32(int32x4_t v, int lane) +{ + return v.i32[lane]; +} + +typedef struct { uint64_t val[1]; } uint64x1_t; + +static inline uint64x1_t vcreate_u64(uint64_t a) +{ + uint64x1_t result; + result.val[0] = a; + return result; +} + +static inline uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high) +{ + uint64x2_t result; + result.u64[0] = low.val[0]; + result.u64[1] = high.val[0]; + return result; +} + +#ifdef __cplusplus +} +#endif + +#endif /* XMRIG_SSE2RVV_OPTIMIZED_H */ diff --git a/src/crypto/cn/sse2rvv_optimized.h b/src/crypto/cn/sse2rvv_optimized.h new file mode 100644 index 
/* 000000000..f83f1101c --- /dev/null +++ b/src/crypto/cn/sse2rvv_optimized.h @@ -0,0 +1,748 @@ */
/* XMRig
 * Copyright (c) 2025 XMRig <support@xmrig.com>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

/*
 * SSE to RISC-V Vector (RVV) optimized compatibility header.
 * Provides scalar fallbacks and, where profitable, vectorized
 * implementations using RVV intrinsics.
 */

#ifndef XMRIG_SSE2RVV_OPTIMIZED_H
#define XMRIG_SSE2RVV_OPTIMIZED_H

#ifdef __cplusplus
extern "C" {
#endif

/* NOTE(review): include targets were lost in extraction; restored here. */
#include <stdint.h>
#include <string.h>

/* Check if RVV is available */
#if defined(__riscv_vector)
#include <riscv_vector.h>
#define USE_RVV_INTRINSICS 1
#else
#define USE_RVV_INTRINSICS 0
#endif

/* 128-bit vector type.
 * BUG FIX: the original union also contained vuint64m1_t/vuint32m1_t/vuint8m1_t
 * members under USE_RVV_INTRINSICS.  RVV vector types are "sizeless" and cannot
 * be members of a union or struct (GCC/Clang reject the declaration), so the
 * header failed to compile exactly when RVV was enabled.  They were never read
 * anywhere; all RVV paths go through vle/vse loads and stores instead. */
typedef union {
    uint8_t  u8[16];
    uint16_t u16[8];
    uint32_t u32[4];
    uint64_t u64[2];
    int8_t   i8[16];
    int16_t  i16[8];
    int32_t  i32[4];
    int64_t  i64[2];
} __m128i_union;

typedef __m128i_union __m128i;

/* ---- Set operations ---------------------------------------------------- */

/* Pack four 32-bit ints; e0 is the lowest lane (SSE argument order). */
static inline __m128i _mm_set_epi32(int e3, int e2, int e1, int e0)
{
    __m128i r;
    r.i32[0] = e0;
    r.i32[1] = e1;
    r.i32[2] = e2;
    r.i32[3] = e3;
    return r;
}

/* Pack two 64-bit ints; e0 is the lower lane. */
static inline __m128i _mm_set_epi64x(int64_t e1, int64_t e0)
{
    __m128i r;
    r.i64[0] = e0;
    r.i64[1] = e1;
    return r;
}

static inline __m128i _mm_setzero_si128(void)
{
    __m128i r;
    memset(&r, 0, sizeof(r));
    return r;
}

/* ---- Extract/insert operations ----------------------------------------- */

static inline int _mm_cvtsi128_si32(__m128i a)
{
    return a.i32[0];
}

static inline int64_t _mm_cvtsi128_si64(__m128i a)
{
    return a.i64[0];
}

static inline __m128i _mm_cvtsi32_si128(int a)
{
    __m128i r = _mm_setzero_si128();
    r.i32[0] = a;
    return r;
}

static inline __m128i _mm_cvtsi64_si128(int64_t a)
{
    __m128i r = _mm_setzero_si128();
    r.i64[0] = a;
    return r;
}

/* ---- Shuffle ----------------------------------------------------------- */

/* Permute 32-bit lanes; two imm8 bits select the source lane per output lane. */
static inline __m128i _mm_shuffle_epi32(__m128i a, int imm8)
{
    __m128i r;
    r.u32[0] = a.u32[(imm8 >> 0) & 0x3];
    r.u32[1] = a.u32[(imm8 >> 2) & 0x3];
    r.u32[2] = a.u32[(imm8 >> 4) & 0x3];
    r.u32[3] = a.u32[(imm8 >> 6) & 0x3];
    return r;
}

/* ---- Logical operations (RVV-accelerated when available) ---------------- */

static inline __m128i _mm_xor_si128(__m128i a, __m128i b)
{
#if USE_RVV_INTRINSICS
    __m128i r;
    size_t vl = __riscv_vsetvl_e64m1(2);
    __riscv_vse64_v_u64m1(r.u64,
        __riscv_vxor_vv_u64m1(__riscv_vle64_v_u64m1(a.u64, vl),
                              __riscv_vle64_v_u64m1(b.u64, vl), vl), vl);
    return r;
#else
    __m128i r;
    r.u64[0] = a.u64[0] ^ b.u64[0];
    r.u64[1] = a.u64[1] ^ b.u64[1];
    return r;
#endif
}

static inline __m128i _mm_or_si128(__m128i a, __m128i b)
{
#if USE_RVV_INTRINSICS
    __m128i r;
    size_t vl = __riscv_vsetvl_e64m1(2);
    __riscv_vse64_v_u64m1(r.u64,
        __riscv_vor_vv_u64m1(__riscv_vle64_v_u64m1(a.u64, vl),
                             __riscv_vle64_v_u64m1(b.u64, vl), vl), vl);
    return r;
#else
    __m128i r;
    r.u64[0] = a.u64[0] | b.u64[0];
    r.u64[1] = a.u64[1] | b.u64[1];
    return r;
#endif
}

static inline __m128i _mm_and_si128(__m128i a, __m128i b)
{
#if USE_RVV_INTRINSICS
    __m128i r;
    size_t vl = __riscv_vsetvl_e64m1(2);
    __riscv_vse64_v_u64m1(r.u64,
        __riscv_vand_vv_u64m1(__riscv_vle64_v_u64m1(a.u64, vl),
                              __riscv_vle64_v_u64m1(b.u64, vl), vl), vl);
    return r;
#else
    __m128i r;
    r.u64[0] = a.u64[0] & b.u64[0];
    r.u64[1] = a.u64[1] & b.u64[1];
    return r;
#endif
}

/* r = (~a) & b, matching the SSE operand order. */
static inline __m128i _mm_andnot_si128(__m128i a, __m128i b)
{
#if USE_RVV_INTRINSICS
    __m128i r;
    size_t vl = __riscv_vsetvl_e64m1(2);
    __riscv_vse64_v_u64m1(r.u64,
        __riscv_vand_vv_u64m1(
            __riscv_vnot_v_u64m1(__riscv_vle64_v_u64m1(a.u64, vl), vl),
            __riscv_vle64_v_u64m1(b.u64, vl), vl), vl);
    return r;
#else
    __m128i r;
    r.u64[0] = (~a.u64[0]) & b.u64[0];
    r.u64[1] = (~a.u64[1]) & b.u64[1];
    return r;
#endif
}

/* ---- Byte shifts ------------------------------------------------------- */

/* Shift the whole register LEFT by imm8 bytes; counts > 15 give zero.
 * Unsigned compare also routes (invalid) negative counts to zero instead of UB. */
static inline __m128i _mm_slli_si128(__m128i a, int imm8)
{
    __m128i r = _mm_setzero_si128();
    unsigned count = (unsigned)imm8 & 0xFFu;
    if (count > 15u) return r;
#if USE_RVV_INTRINSICS
    size_t vl = __riscv_vsetvl_e8m1(16);
    /* vslideup: r.u8[i + count] = a.u8[i]; lanes below count stay zero. */
    __riscv_vse8_v_u8m1(r.u8,
        __riscv_vslideup_vx_u8m1(__riscv_vmv_v_x_u8m1(0, vl),
                                 __riscv_vle8_v_u8m1(a.u8, vl), count, vl), vl);
#else
    for (unsigned i = 0; i < 16u - count; i++) {
        r.u8[i + count] = a.u8[i];
    }
#endif
    return r;
}

/* Shift the whole register RIGHT by imm8 bytes; counts > 15 give zero. */
static inline __m128i _mm_srli_si128(__m128i a, int imm8)
{
    __m128i r = _mm_setzero_si128();
    unsigned count = (unsigned)imm8 & 0xFFu;
    if (count > 15u) return r;
#if USE_RVV_INTRINSICS
    size_t vl = __riscv_vsetvl_e8m1(16);
    __riscv_vse8_v_u8m1(r.u8,
        __riscv_vslidedown_vx_u8m1(__riscv_vle8_v_u8m1(a.u8, vl), count, vl), vl);
#else
    for (unsigned i = count; i < 16u; i++) {
        r.u8[i - count] = a.u8[i];
    }
#endif
    return r;
}

/* ---- Element shifts ---------------------------------------------------- */

/* Per-lane 64-bit left shift; SSE zeroes the lanes when the count exceeds 63. */
static inline __m128i _mm_slli_epi64(__m128i a, int imm8)
{
    __m128i r;
    if ((unsigned)imm8 > 63u) {
        r.u64[0] = 0;
        r.u64[1] = 0;
        return r;
    }
#if USE_RVV_INTRINSICS
    size_t vl = __riscv_vsetvl_e64m1(2);
    __riscv_vse64_v_u64m1(r.u64,
        __riscv_vsll_vx_u64m1(__riscv_vle64_v_u64m1(a.u64, vl), imm8, vl), vl);
#else
    r.u64[0] = a.u64[0] << imm8;
    r.u64[1] = a.u64[1] << imm8;
#endif
    return r;
}

/* Per-lane 64-bit logical right shift; counts > 63 zero the lanes. */
static inline __m128i _mm_srli_epi64(__m128i a, int imm8)
{
    __m128i r;
    if ((unsigned)imm8 > 63u) {
        r.u64[0] = 0;
        r.u64[1] = 0;
        return r;
    }
#if USE_RVV_INTRINSICS
    size_t vl = __riscv_vsetvl_e64m1(2);
    __riscv_vse64_v_u64m1(r.u64,
        __riscv_vsrl_vx_u64m1(__riscv_vle64_v_u64m1(a.u64, vl), imm8, vl), vl);
#else
    r.u64[0] = a.u64[0] >> imm8;
    r.u64[1] = a.u64[1] >> imm8;
#endif
    return r;
}

/* ---- Load/store -------------------------------------------------------- */

static inline __m128i _mm_load_si128(const __m128i* p)
{
#if USE_RVV_INTRINSICS
    __m128i r;
    size_t vl = __riscv_vsetvl_e64m1(2);
    __riscv_vse64_v_u64m1(r.u64, __riscv_vle64_v_u64m1((const uint64_t*)p, vl), vl);
    return r;
#else
    __m128i r;
    memcpy(&r, p, sizeof(__m128i));
    return r;
#endif
}

static inline __m128i _mm_loadu_si128(const __m128i* p)
{
    __m128i r;
    memcpy(&r, p, sizeof(__m128i));
    return r;
}

static inline void _mm_store_si128(__m128i* p, __m128i a)
{
#if USE_RVV_INTRINSICS
    size_t vl = __riscv_vsetvl_e64m1(2);
    __riscv_vse64_v_u64m1((uint64_t*)p, __riscv_vle64_v_u64m1(a.u64, vl), vl);
#else
    memcpy(p, &a, sizeof(__m128i));
#endif
}

static inline void _mm_storeu_si128(__m128i* p, __m128i a)
{
    memcpy(p, &a, sizeof(__m128i));
}

/* ---- Arithmetic -------------------------------------------------------- */

static inline __m128i _mm_add_epi64(__m128i a, __m128i b)
{
#if USE_RVV_INTRINSICS
    __m128i r;
    size_t vl = __riscv_vsetvl_e64m1(2);
    __riscv_vse64_v_u64m1(r.u64,
        __riscv_vadd_vv_u64m1(__riscv_vle64_v_u64m1(a.u64, vl),
                              __riscv_vle64_v_u64m1(b.u64, vl), vl), vl);
    return r;
#else
    __m128i r;
    r.u64[0] = a.u64[0] + b.u64[0];
    r.u64[1] = a.u64[1] + b.u64[1];
    return r;
#endif
}

static inline __m128i _mm_add_epi32(__m128i a, __m128i b)
{
#if USE_RVV_INTRINSICS
    __m128i r;
    size_t vl = __riscv_vsetvl_e32m1(4);
    __riscv_vse32_v_u32m1(r.u32,
        __riscv_vadd_vv_u32m1(__riscv_vle32_v_u32m1(a.u32, vl),
                              __riscv_vle32_v_u32m1(b.u32, vl), vl), vl);
    return r;
#else
    __m128i r;
    for (int i = 0; i < 4; i++) {
        r.i32[i] = a.i32[i] + b.i32[i];
    }
    return r;
#endif
}

static inline __m128i _mm_sub_epi64(__m128i a, __m128i b)
{
#if USE_RVV_INTRINSICS
    __m128i r;
    size_t vl = __riscv_vsetvl_e64m1(2);
    __riscv_vse64_v_u64m1(r.u64,
        __riscv_vsub_vv_u64m1(__riscv_vle64_v_u64m1(a.u64, vl),
                              __riscv_vle64_v_u64m1(b.u64, vl), vl), vl);
    return r;
#else
    __m128i r;
    r.u64[0] = a.u64[0] - b.u64[0];
    r.u64[1] = a.u64[1] - b.u64[1];
    return r;
#endif
}

/* Widening multiply of the EVEN 32-bit lanes (0 and 2), as SSE2 defines it.
 * BUG FIX: the original RVV path loaded u32[0..1] contiguously with vle32,
 * i.e. multiplied lanes 0 and 1 — wrong result in the upper product.  Two
 * scalar multiplies are correct and, on 2 elements, at least as fast as a
 * strided vector load, so both configurations use the scalar form. */
static inline __m128i _mm_mul_epu32(__m128i a, __m128i b)
{
    __m128i r;
    r.u64[0] = (uint64_t)a.u32[0] * (uint64_t)b.u32[0];
    r.u64[1] = (uint64_t)a.u32[2] * (uint64_t)b.u32[2];
    return r;
}

/* ---- Unpack ------------------------------------------------------------ */

static inline __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
{
    __m128i r;
    r.u64[0] = a.u64[0];
    r.u64[1] = b.u64[0];
    return r;
}

static inline __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
{
    __m128i r;
    r.u64[0] = a.u64[1];
    r.u64[1] = b.u64[1];
    return r;
}

/* ---- Spin-wait / fences ------------------------------------------------ */

/* Spin-wait hint; real "pause" needs the Zihintpause extension, else a nop. */
static inline void _mm_pause(void)
{
#if defined(__riscv_zihintpause)
    __asm__ __volatile__("pause");
#else
    __asm__ __volatile__("nop");
#endif
}

static inline void _mm_mfence(void)
{
    __asm__ __volatile__("fence rw,rw" ::: "memory");
}

static inline void _mm_lfence(void)
{
    __asm__ __volatile__("fence r,r" ::: "memory");
}

static inline void _mm_sfence(void)
{
    __asm__ __volatile__("fence w,w" ::: "memory");
}

/* ---- Comparisons ------------------------------------------------------- */

static inline __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
{
    __m128i r;
    for (int i = 0; i < 4; i++) {
        r.u32[i] = (a.u32[i] == b.u32[i]) ? 0xFFFFFFFFu : 0;
    }
    return r;
}

static inline __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
{
    __m128i r;
    for (int i = 0; i < 2; i++) {
        r.u64[i] = (a.u64[i] == b.u64[i]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
    }
    return r;
}

/* ---- 32-bit element shifts --------------------------------------------- */

static inline __m128i _mm_slli_epi32(__m128i a, int imm8)
{
    __m128i r;
    if ((unsigned)imm8 > 31u) {
        memset(&r, 0, sizeof(r));
        return r;
    }
#if USE_RVV_INTRINSICS
    size_t vl = __riscv_vsetvl_e32m1(4);
    __riscv_vse32_v_u32m1(r.u32,
        __riscv_vsll_vx_u32m1(__riscv_vle32_v_u32m1(a.u32, vl), imm8, vl), vl);
#else
    for (int i = 0; i < 4; i++) {
        r.u32[i] = a.u32[i] << imm8;
    }
#endif
    return r;
}

static inline __m128i _mm_srli_epi32(__m128i a, int imm8)
{
    __m128i r;
    if ((unsigned)imm8 > 31u) {
        memset(&r, 0, sizeof(r));
        return r;
    }
#if USE_RVV_INTRINSICS
    size_t vl = __riscv_vsetvl_e32m1(4);
    __riscv_vse32_v_u32m1(r.u32,
        __riscv_vsrl_vx_u32m1(__riscv_vle32_v_u32m1(a.u32, vl), imm8, vl), vl);
#else
    for (int i = 0; i < 4; i++) {
        r.u32[i] = a.u32[i] >> imm8;
    }
#endif
    return r;
}

/* ---- Broadcast --------------------------------------------------------- */

static inline __m128i _mm_set1_epi64x(int64_t a)
{
    __m128i r;
    r.i64[0] = a;
    r.i64[1] = a;
    return r;
}

/* Float type for compatibility: same 128-bit storage, reinterpreted lanes. */
typedef __m128i __m128;

/* ---- Float operations (scalar; bit-exact lane-wise float math) ---------- */

static inline __m128 _mm_set1_ps(float a)
{
    __m128 r;
    uint32_t bits;
    memcpy(&bits, &a, sizeof(float));
    for (int i = 0; i < 4; i++) {
        r.u32[i] = bits;
    }
    return r;
}

static inline __m128 _mm_setzero_ps(void)
{
    __m128 r;
    memset(&r, 0, sizeof(r));
    return r;
}

static inline __m128 _mm_add_ps(__m128 a, __m128 b)
{
    __m128 r;
    float fa[4], fb[4], fr[4];
    memcpy(fa, &a, sizeof(__m128));
    memcpy(fb, &b, sizeof(__m128));
    for (int i = 0; i < 4; i++) {
        fr[i] = fa[i] + fb[i];
    }
    memcpy(&r, fr, sizeof(__m128));
    return r;
}

static inline __m128 _mm_mul_ps(__m128 a, __m128 b)
{
    __m128 r;
    float fa[4], fb[4], fr[4];
    memcpy(fa, &a, sizeof(__m128));
    memcpy(fb, &b, sizeof(__m128));
    for (int i = 0; i < 4; i++) {
        fr[i] = fa[i] * fb[i];
    }
    memcpy(&r, fr, sizeof(__m128));
    return r;
}

static inline __m128 _mm_and_ps(__m128 a, __m128 b)
{
    __m128 r;
    r.u64[0] = a.u64[0] & b.u64[0];
    r.u64[1] = a.u64[1] & b.u64[1];
    return r;
}

static inline __m128 _mm_or_ps(__m128 a, __m128 b)
{
    __m128 r;
    r.u64[0] = a.u64[0] | b.u64[0];
    r.u64[1] = a.u64[1] | b.u64[1];
    return r;
}

static inline __m128 _mm_cvtepi32_ps(__m128i a)
{
    __m128 r;
    float fr[4];
    for (int i = 0; i < 4; i++) {
        fr[i] = (float)a.i32[i];
    }
    memcpy(&r, fr, sizeof(__m128));
    return r;
}

/* Truncating float->int conversion (round toward zero, like cvttps). */
static inline __m128i _mm_cvttps_epi32(__m128 a)
{
    __m128i r;
    float fa[4];
    memcpy(fa, &a, sizeof(__m128));
    for (int i = 0; i < 4; i++) {
        r.i32[i] = (int32_t)fa[i];
    }
    return r;
}

/* ---- Casts (bit-reinterpreting, no conversion) -------------------------- */

static inline __m128 _mm_castsi128_ps(__m128i a)
{
    __m128 r;
    memcpy(&r, &a, sizeof(__m128));
    return r;
}

static inline __m128i _mm_castps_si128(__m128 a)
{
    __m128i r;
    memcpy(&r, &a, sizeof(__m128));
    return r;
}

static inline __m128i _mm_set1_epi32(int a)
{
    __m128i r;
    for (int i = 0; i < 4; i++) {
        r.i32[i] = a;
    }
    return r;
}

/* ---- AES placeholders --------------------------------------------------
 * WARNING(review): these are NOT AES.  _mm_aesenc_si128 performs only the
 * AddRoundKey XOR (no SubBytes/ShiftRows/MixColumns) and the keygen-assist
 * is the identity.  They exist solely so code paths that fall back to the
 * software AES tables (soft_aes) still link; they must never be selected
 * as a hardware-AES implementation. */
static inline __m128i _mm_aesenc_si128(__m128i a, __m128i roundkey)
{
    return _mm_xor_si128(a, roundkey);
}

static inline __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
{
    (void)rcon; /* unused in the placeholder */
    return a;
}

/* Rotate right for soft_aes.h; count is reduced mod 32 and the (-count)&mask
 * form avoids the UB of a 32-bit shift by 32 when count == 0. */
static inline uint32_t _rotr(uint32_t value, unsigned int count)
{
    const unsigned int mask = 31;
    count &= mask;
    return (value >> count) | (value << ((-count) & mask));
}

/* ---- Minimal ARM NEON compatibility layer ------------------------------ */
typedef __m128i_union uint64x2_t;
typedef __m128i_union uint8x16_t;
typedef __m128i_union int64x2_t;
typedef __m128i_union int32x4_t;

static inline uint64x2_t vld1q_u64(const uint64_t *ptr)
{
    uint64x2_t r;
    r.u64[0] = ptr[0];
    r.u64[1] = ptr[1];
    return r;
}

static inline int64x2_t vld1q_s64(const int64_t *ptr)
{
    int64x2_t r;
    r.i64[0] = ptr[0];
    r.i64[1] = ptr[1];
    return r;
}

static inline void vst1q_u64(uint64_t *ptr, uint64x2_t val)
{
    ptr[0] = val.u64[0];
    ptr[1] = val.u64[1];
}

static inline uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b)
{
    return _mm_xor_si128(a, b);
}

static inline uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b)
{
    return _mm_add_epi64(a, b);
}

static inline uint64x2_t vreinterpretq_u64_u8(uint8x16_t a)
{
    uint64x2_t r;
    memcpy(&r, &a, sizeof(uint64x2_t));
    return r;
}

static inline uint64_t vgetq_lane_u64(uint64x2_t v, int lane)
{
    return v.u64[lane];
}

static inline int64_t vgetq_lane_s64(int64x2_t v, int lane)
{
    return v.i64[lane];
}

static inline int32_t vgetq_lane_s32(int32x4_t v, int lane)
{
    return v.i32[lane];
}

typedef struct { uint64_t val[1]; } uint64x1_t;

static inline uint64x1_t vcreate_u64(uint64_t a)
{
    uint64x1_t r;
    r.val[0] = a;
    return r;
}

static inline uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high)
{
    uint64x2_t r;
    r.u64[0] = low.val[0];
    r.u64[1] = high.val[0];
    return r;
}

#ifdef __cplusplus
}
#endif

#endif /* XMRIG_SSE2RVV_OPTIMIZED_H */
/* diff --git a/src/crypto/cn/sse2rvv_scalar_backup.h new file mode 100644 */
index 000000000..853adbb88 --- /dev/null +++ b/src/crypto/cn/sse2rvv_scalar_backup.h @@ -0,0 +1,571 @@ +/* XMRig + * Copyright (c) 2025 XMRig , + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +/* + * SSE to RISC-V compatibility header + * Provides scalar implementations of SSE intrinsics for RISC-V architecture + */ + +#ifndef XMRIG_SSE2RVV_H +#define XMRIG_SSE2RVV_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +/* 128-bit vector type */ +typedef union { + uint8_t u8[16]; + uint16_t u16[8]; + uint32_t u32[4]; + uint64_t u64[2]; + int8_t i8[16]; + int16_t i16[8]; + int32_t i32[4]; + int64_t i64[2]; +} __m128i_union; + +typedef __m128i_union __m128i; + +/* Set operations */ +static inline __m128i _mm_set_epi32(int e3, int e2, int e1, int e0) +{ + __m128i result; + result.i32[0] = e0; + result.i32[1] = e1; + result.i32[2] = e2; + result.i32[3] = e3; + return result; +} + +static inline __m128i _mm_set_epi64x(int64_t e1, int64_t e0) +{ + __m128i result; + result.i64[0] = e0; + result.i64[1] = e1; + return result; +} + +static inline __m128i _mm_setzero_si128(void) +{ + __m128i result; + memset(&result, 0, sizeof(result)); + return result; +} + +/* Extract/insert operations */ +static inline int _mm_cvtsi128_si32(__m128i a) +{ + return a.i32[0]; +} + +static inline int64_t _mm_cvtsi128_si64(__m128i a) +{ + return a.i64[0]; +} + +static inline __m128i 
_mm_cvtsi32_si128(int a) +{ + __m128i result = _mm_setzero_si128(); + result.i32[0] = a; + return result; +} + +static inline __m128i _mm_cvtsi64_si128(int64_t a) +{ + __m128i result = _mm_setzero_si128(); + result.i64[0] = a; + return result; +} + +/* Shuffle operations */ +static inline __m128i _mm_shuffle_epi32(__m128i a, int imm8) +{ + __m128i result; + result.u32[0] = a.u32[(imm8 >> 0) & 0x3]; + result.u32[1] = a.u32[(imm8 >> 2) & 0x3]; + result.u32[2] = a.u32[(imm8 >> 4) & 0x3]; + result.u32[3] = a.u32[(imm8 >> 6) & 0x3]; + return result; +} + +/* Logical operations */ +static inline __m128i _mm_xor_si128(__m128i a, __m128i b) +{ + __m128i result; + result.u64[0] = a.u64[0] ^ b.u64[0]; + result.u64[1] = a.u64[1] ^ b.u64[1]; + return result; +} + +static inline __m128i _mm_or_si128(__m128i a, __m128i b) +{ + __m128i result; + result.u64[0] = a.u64[0] | b.u64[0]; + result.u64[1] = a.u64[1] | b.u64[1]; + return result; +} + +static inline __m128i _mm_and_si128(__m128i a, __m128i b) +{ + __m128i result; + result.u64[0] = a.u64[0] & b.u64[0]; + result.u64[1] = a.u64[1] & b.u64[1]; + return result; +} + +static inline __m128i _mm_andnot_si128(__m128i a, __m128i b) +{ + __m128i result; + result.u64[0] = (~a.u64[0]) & b.u64[0]; + result.u64[1] = (~a.u64[1]) & b.u64[1]; + return result; +} + +/* Shift operations */ +static inline __m128i _mm_slli_si128(__m128i a, int imm8) +{ + __m128i result = _mm_setzero_si128(); + int count = imm8 & 0xFF; + if (count > 15) return result; + + for (int i = 0; i < 16 - count; i++) { + result.u8[i + count] = a.u8[i]; + } + return result; +} + +static inline __m128i _mm_srli_si128(__m128i a, int imm8) +{ + __m128i result = _mm_setzero_si128(); + int count = imm8 & 0xFF; + if (count > 15) return result; + + for (int i = count; i < 16; i++) { + result.u8[i - count] = a.u8[i]; + } + return result; +} + +static inline __m128i _mm_slli_epi64(__m128i a, int imm8) +{ + __m128i result; + if (imm8 > 63) { + result.u64[0] = 0; + result.u64[1] = 
0; + } else { + result.u64[0] = a.u64[0] << imm8; + result.u64[1] = a.u64[1] << imm8; + } + return result; +} + +static inline __m128i _mm_srli_epi64(__m128i a, int imm8) +{ + __m128i result; + if (imm8 > 63) { + result.u64[0] = 0; + result.u64[1] = 0; + } else { + result.u64[0] = a.u64[0] >> imm8; + result.u64[1] = a.u64[1] >> imm8; + } + return result; +} + +/* Load/store operations */ +static inline __m128i _mm_load_si128(const __m128i* p) +{ + __m128i result; + memcpy(&result, p, sizeof(__m128i)); + return result; +} + +static inline __m128i _mm_loadu_si128(const __m128i* p) +{ + __m128i result; + memcpy(&result, p, sizeof(__m128i)); + return result; +} + +static inline void _mm_store_si128(__m128i* p, __m128i a) +{ + memcpy(p, &a, sizeof(__m128i)); +} + +static inline void _mm_storeu_si128(__m128i* p, __m128i a) +{ + memcpy(p, &a, sizeof(__m128i)); +} + +/* Arithmetic operations */ +static inline __m128i _mm_add_epi64(__m128i a, __m128i b) +{ + __m128i result; + result.u64[0] = a.u64[0] + b.u64[0]; + result.u64[1] = a.u64[1] + b.u64[1]; + return result; +} + +static inline __m128i _mm_add_epi32(__m128i a, __m128i b) +{ + __m128i result; + for (int i = 0; i < 4; i++) { + result.i32[i] = a.i32[i] + b.i32[i]; + } + return result; +} + +static inline __m128i _mm_sub_epi64(__m128i a, __m128i b) +{ + __m128i result; + result.u64[0] = a.u64[0] - b.u64[0]; + result.u64[1] = a.u64[1] - b.u64[1]; + return result; +} + +static inline __m128i _mm_mul_epu32(__m128i a, __m128i b) +{ + __m128i result; + result.u64[0] = (uint64_t)a.u32[0] * (uint64_t)b.u32[0]; + result.u64[1] = (uint64_t)a.u32[2] * (uint64_t)b.u32[2]; + return result; +} + +/* Unpack operations */ +static inline __m128i _mm_unpacklo_epi64(__m128i a, __m128i b) +{ + __m128i result; + result.u64[0] = a.u64[0]; + result.u64[1] = b.u64[0]; + return result; +} + +static inline __m128i _mm_unpackhi_epi64(__m128i a, __m128i b) +{ + __m128i result; + result.u64[0] = a.u64[1]; + result.u64[1] = b.u64[1]; + return 
result; +} + +/* Pause instruction for spin-wait loops */ +static inline void _mm_pause(void) +{ + /* RISC-V doesn't have a direct equivalent to x86 PAUSE + * Use a simple NOP or yield hint */ + __asm__ __volatile__("nop"); +} + +/* Memory fence */ +static inline void _mm_mfence(void) +{ + __asm__ __volatile__("fence" ::: "memory"); +} + +static inline void _mm_lfence(void) +{ + __asm__ __volatile__("fence r,r" ::: "memory"); +} + +static inline void _mm_sfence(void) +{ + __asm__ __volatile__("fence w,w" ::: "memory"); +} + +/* Comparison operations */ +static inline __m128i _mm_cmpeq_epi32(__m128i a, __m128i b) +{ + __m128i result; + for (int i = 0; i < 4; i++) { + result.u32[i] = (a.u32[i] == b.u32[i]) ? 0xFFFFFFFF : 0; + } + return result; +} + +static inline __m128i _mm_cmpeq_epi64(__m128i a, __m128i b) +{ + __m128i result; + for (int i = 0; i < 2; i++) { + result.u64[i] = (a.u64[i] == b.u64[i]) ? 0xFFFFFFFFFFFFFFFFULL : 0; + } + return result; +} + +/* Additional shift operations */ +static inline __m128i _mm_slli_epi32(__m128i a, int imm8) +{ + __m128i result; + if (imm8 > 31) { + for (int i = 0; i < 4; i++) result.u32[i] = 0; + } else { + for (int i = 0; i < 4; i++) { + result.u32[i] = a.u32[i] << imm8; + } + } + return result; +} + +static inline __m128i _mm_srli_epi32(__m128i a, int imm8) +{ + __m128i result; + if (imm8 > 31) { + for (int i = 0; i < 4; i++) result.u32[i] = 0; + } else { + for (int i = 0; i < 4; i++) { + result.u32[i] = a.u32[i] >> imm8; + } + } + return result; +} + +/* 64-bit integer operations */ +static inline __m128i _mm_set1_epi64x(int64_t a) +{ + __m128i result; + result.i64[0] = a; + result.i64[1] = a; + return result; +} + +/* Float type for compatibility - we'll treat it as int for simplicity */ +typedef __m128i __m128; + +/* Float operations - simplified scalar implementations */ +static inline __m128 _mm_set1_ps(float a) +{ + __m128 result; + uint32_t val; + memcpy(&val, &a, sizeof(float)); + for (int i = 0; i < 4; i++) { + 
result.u32[i] = val; + } + return result; +} + +static inline __m128 _mm_setzero_ps(void) +{ + __m128 result; + memset(&result, 0, sizeof(result)); + return result; +} + +static inline __m128 _mm_add_ps(__m128 a, __m128 b) +{ + __m128 result; + float fa[4], fb[4], fr[4]; + memcpy(fa, &a, sizeof(__m128)); + memcpy(fb, &b, sizeof(__m128)); + for (int i = 0; i < 4; i++) { + fr[i] = fa[i] + fb[i]; + } + memcpy(&result, fr, sizeof(__m128)); + return result; +} + +static inline __m128 _mm_mul_ps(__m128 a, __m128 b) +{ + __m128 result; + float fa[4], fb[4], fr[4]; + memcpy(fa, &a, sizeof(__m128)); + memcpy(fb, &b, sizeof(__m128)); + for (int i = 0; i < 4; i++) { + fr[i] = fa[i] * fb[i]; + } + memcpy(&result, fr, sizeof(__m128)); + return result; +} + +static inline __m128 _mm_and_ps(__m128 a, __m128 b) +{ + __m128 result; + result.u64[0] = a.u64[0] & b.u64[0]; + result.u64[1] = a.u64[1] & b.u64[1]; + return result; +} + +static inline __m128 _mm_or_ps(__m128 a, __m128 b) +{ + __m128 result; + result.u64[0] = a.u64[0] | b.u64[0]; + result.u64[1] = a.u64[1] | b.u64[1]; + return result; +} + +static inline __m128 _mm_cvtepi32_ps(__m128i a) +{ + __m128 result; + float fr[4]; + for (int i = 0; i < 4; i++) { + fr[i] = (float)a.i32[i]; + } + memcpy(&result, fr, sizeof(__m128)); + return result; +} + +static inline __m128i _mm_cvttps_epi32(__m128 a) +{ + __m128i result; + float fa[4]; + memcpy(fa, &a, sizeof(__m128)); + for (int i = 0; i < 4; i++) { + result.i32[i] = (int32_t)fa[i]; + } + return result; +} + +/* Casting operations */ +static inline __m128 _mm_castsi128_ps(__m128i a) +{ + __m128 result; + memcpy(&result, &a, sizeof(__m128)); + return result; +} + +static inline __m128i _mm_castps_si128(__m128 a) +{ + __m128i result; + memcpy(&result, &a, sizeof(__m128)); + return result; +} + +/* Additional set operations */ +static inline __m128i _mm_set1_epi32(int a) +{ + __m128i result; + for (int i = 0; i < 4; i++) { + result.i32[i] = a; + } + return result; +} + +/* AES 
instructions - these are placeholders, actual AES is done via soft_aes.h */ +/* On RISC-V without crypto extensions, these should never be called directly */ +/* They are only here for compilation compatibility */ +static inline __m128i _mm_aesenc_si128(__m128i a, __m128i roundkey) +{ + /* This is a placeholder - actual implementation should use soft_aes */ + /* If this function is called, it means SOFT_AES template parameter wasn't used */ + /* We return a XOR as a minimal fallback, but proper code should use soft_aesenc */ + return _mm_xor_si128(a, roundkey); +} + +static inline __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) +{ + /* Placeholder for AES key generation - should use soft_aeskeygenassist */ + return a; +} + +/* Rotate right operation for soft_aes.h */ +static inline uint32_t _rotr(uint32_t value, unsigned int count) +{ + const unsigned int mask = 31; + count &= mask; + return (value >> count) | (value << ((-count) & mask)); +} + +/* ARM NEON compatibility types and intrinsics for RISC-V */ +typedef __m128i_union uint64x2_t; +typedef __m128i_union uint8x16_t; +typedef __m128i_union int64x2_t; +typedef __m128i_union int32x4_t; + +static inline uint64x2_t vld1q_u64(const uint64_t *ptr) +{ + uint64x2_t result; + result.u64[0] = ptr[0]; + result.u64[1] = ptr[1]; + return result; +} + +static inline int64x2_t vld1q_s64(const int64_t *ptr) +{ + int64x2_t result; + result.i64[0] = ptr[0]; + result.i64[1] = ptr[1]; + return result; +} + +static inline void vst1q_u64(uint64_t *ptr, uint64x2_t val) +{ + ptr[0] = val.u64[0]; + ptr[1] = val.u64[1]; +} + +static inline uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b) +{ + uint64x2_t result; + result.u64[0] = a.u64[0] ^ b.u64[0]; + result.u64[1] = a.u64[1] ^ b.u64[1]; + return result; +} + +static inline uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b) +{ + uint64x2_t result; + result.u64[0] = a.u64[0] + b.u64[0]; + result.u64[1] = a.u64[1] + b.u64[1]; + return result; +} + +static inline uint64x2_t 
vreinterpretq_u64_u8(uint8x16_t a) +{ + uint64x2_t result; + memcpy(&result, &a, sizeof(uint64x2_t)); + return result; +} + +static inline uint64_t vgetq_lane_u64(uint64x2_t v, int lane) +{ + return v.u64[lane]; +} + +static inline int64_t vgetq_lane_s64(int64x2_t v, int lane) +{ + return v.i64[lane]; +} + +static inline int32_t vgetq_lane_s32(int32x4_t v, int lane) +{ + return v.i32[lane]; +} + +typedef struct { uint64_t val[1]; } uint64x1_t; + +static inline uint64x1_t vcreate_u64(uint64_t a) +{ + uint64x1_t result; + result.val[0] = a; + return result; +} + +static inline uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high) +{ + uint64x2_t result; + result.u64[0] = low.val[0]; + result.u64[1] = high.val[0]; + return result; +} + +#ifdef __cplusplus +} +#endif + +#endif /* XMRIG_SSE2RVV_H */ diff --git a/src/crypto/common/portable/mm_malloc.h b/src/crypto/common/portable/mm_malloc.h index 34ca7d48b..388da645a 100644 --- a/src/crypto/common/portable/mm_malloc.h +++ b/src/crypto/common/portable/mm_malloc.h @@ -26,7 +26,7 @@ #define XMRIG_MM_MALLOC_PORTABLE_H -#if defined(XMRIG_ARM) && !defined(__clang__) +#if (defined(XMRIG_ARM) || defined(XMRIG_RISCV)) && !defined(__clang__) #include diff --git a/src/crypto/ghostrider/ghostrider.cpp b/src/crypto/ghostrider/ghostrider.cpp index 25bb44e74..4a21ae032 100644 --- a/src/crypto/ghostrider/ghostrider.cpp +++ b/src/crypto/ghostrider/ghostrider.cpp @@ -57,6 +57,9 @@ #if defined(XMRIG_ARM) # include "crypto/cn/sse2neon.h" +#elif defined(XMRIG_RISCV) + // RISC-V doesn't have SSE/NEON, provide minimal compatibility +# define _mm_pause() __asm__ __volatile__("nop") #elif defined(__GNUC__) # include #else @@ -286,7 +289,7 @@ struct HelperThread void benchmark() { -#ifndef XMRIG_ARM +#if !defined(XMRIG_ARM) && !defined(XMRIG_RISCV) static std::atomic done{ 0 }; if (done.exchange(1)) { return; @@ -478,7 +481,7 @@ static inline bool findByType(hwloc_obj_t obj, hwloc_obj_type_t type, func lambd HelperThread* 
create_helper_thread(int64_t cpu_index, int priority, const std::vector& affinities) { -#ifndef XMRIG_ARM +#if !defined(XMRIG_ARM) && !defined(XMRIG_RISCV) hwloc_bitmap_t helper_cpu_set = hwloc_bitmap_alloc(); hwloc_bitmap_t main_threads_set = hwloc_bitmap_alloc(); @@ -807,7 +810,7 @@ void hash_octa(const uint8_t* data, size_t size, uint8_t* output, cryptonight_ct uint32_t cn_indices[6]; select_indices(cn_indices, seed); -#ifdef XMRIG_ARM +#if defined(XMRIG_ARM) || defined(XMRIG_RISCV) uint32_t step[6] = { 1, 1, 1, 1, 1, 1 }; #else uint32_t step[6] = { 4, 4, 1, 2, 4, 4 }; diff --git a/src/crypto/randomx/common.hpp b/src/crypto/randomx/common.hpp index 98f96727b..6fbfb9785 100644 --- a/src/crypto/randomx/common.hpp +++ b/src/crypto/randomx/common.hpp @@ -111,6 +111,10 @@ namespace randomx { #define RANDOMX_HAVE_COMPILER 1 class JitCompilerA64; using JitCompiler = JitCompilerA64; +#elif defined(__riscv) && defined(__riscv_xlen) && (__riscv_xlen == 64) + #define RANDOMX_HAVE_COMPILER 1 + class JitCompilerRV64; + using JitCompiler = JitCompilerRV64; #else #define RANDOMX_HAVE_COMPILER 0 class JitCompilerFallback; diff --git a/src/crypto/randomx/jit_compiler.hpp b/src/crypto/randomx/jit_compiler.hpp index db635c6f4..114ec3bd0 100644 --- a/src/crypto/randomx/jit_compiler.hpp +++ b/src/crypto/randomx/jit_compiler.hpp @@ -32,6 +32,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "crypto/randomx/jit_compiler_x86.hpp" #elif defined(__aarch64__) #include "crypto/randomx/jit_compiler_a64.hpp" +#elif defined(__riscv) && defined(__riscv_xlen) && (__riscv_xlen == 64) +#include "crypto/randomx/jit_compiler_rv64.hpp" #else #include "crypto/randomx/jit_compiler_fallback.hpp" #endif diff --git a/src/crypto/randomx/jit_compiler_rv64.cpp b/src/crypto/randomx/jit_compiler_rv64.cpp new file mode 100644 index 000000000..130cf9015 --- /dev/null +++ b/src/crypto/randomx/jit_compiler_rv64.cpp @@ -0,0 +1,1164 @@ +/* +Copyright (c) 2023 tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include +#include +#include +#include +#include "crypto/randomx/jit_compiler_rv64.hpp" +#include "crypto/randomx/jit_compiler_rv64_static.hpp" +#include "crypto/randomx/superscalar.hpp" +#include "crypto/randomx/program.hpp" +#include "crypto/randomx/reciprocal.h" +#include "crypto/randomx/virtual_memory.hpp" +#include "crypto/common/VirtualMemory.h" + + +static bool hugePagesJIT = false; +static int optimizedDatasetInit = -1; + +void randomx_set_huge_pages_jit(bool hugePages) +{ + hugePagesJIT = hugePages; +} + +void randomx_set_optimized_dataset_init(int value) +{ + optimizedDatasetInit = value; +} + +#define alignSize(pos, align) (((pos - 1) / align + 1) * align) + + +namespace rv64 { + constexpr uint16_t C_LUI = 0x6001; + constexpr uint32_t LUI = 0x00000037; + constexpr uint16_t C_ADDI = 0x0001; + constexpr uint32_t ADDI = 0x00000013; + constexpr uint32_t ADDIW = 0x0000001b; + constexpr uint16_t C_ADD = 0x9002; + constexpr uint32_t ADD = 0x00000033; + constexpr uint32_t SHXADD = 0x20000033; //Zba + constexpr uint32_t SLL = 0x00001033; + constexpr uint32_t SRL = 0x00005033; + constexpr uint32_t SLLI = 0x00001013; + constexpr uint32_t C_SLLI = 0x0002; + constexpr uint32_t SRLI = 0x00005013; + constexpr uint32_t AND = 0x00007033; + constexpr uint32_t ANDI = 0x00007013; + constexpr uint16_t C_AND = 0x8c61; + constexpr uint16_t C_ANDI = 0x8801; + constexpr uint32_t OR = 0x00006033; + constexpr uint16_t C_OR = 0x8c41; + constexpr uint32_t XOR = 
0x00004033; + constexpr uint16_t C_XOR = 0x8c21; + constexpr uint32_t LD = 0x00003003; + constexpr uint16_t C_LD = 0x6000; + constexpr uint16_t C_LW = 0x4000; + constexpr uint32_t SD = 0x00003023; + constexpr uint32_t SUB = 0x40000033; + constexpr uint16_t C_SUB = 0x8c01; + constexpr uint32_t MUL = 0x02000033; + constexpr uint32_t MULHU = 0x02003033; + constexpr uint32_t MULH = 0x02001033; + constexpr uint16_t C_MV = 0x8002; + constexpr uint32_t ROR = 0x60005033; //Zbb + constexpr uint32_t RORI = 0x60005013; //Zbb + constexpr uint32_t ROL = 0x60001033; //Zbb + constexpr uint32_t FMV_X_D = 0xe2000053; + constexpr uint32_t FMV_D_X = 0xf2000053; + constexpr uint32_t FMV_D = 0x22000053; + constexpr uint32_t FADD_D = 0x02007053; + constexpr uint32_t FSUB_D = 0x0a007053; + constexpr uint32_t FMUL_D = 0x12007053; + constexpr uint32_t FDIV_D = 0x1a007053; + constexpr uint32_t FSQRT_D = 0x5a007053; + constexpr uint32_t FCVT_D_W = 0xd2000053; + constexpr uint32_t FSRM = 0x00201073; + constexpr uint16_t C_BEQZ = 0xc001; + constexpr uint32_t BEQ = 0x00000063; + constexpr uint16_t C_BNEZ = 0xe001; + constexpr uint32_t JAL = 0x0000006f; + constexpr uint16_t C_RET = 0x8082; +} + +namespace randomx { + + constexpr size_t MaxRandomXInstrCodeSize = 56; //FDIV_M requires 56 bytes of rv64 code + constexpr size_t MaxSuperscalarInstrSize = 12; //IXOR_C requires 12 bytes of rv64 code + constexpr size_t SuperscalarProgramHeader = 136; //overhead per superscalar program + constexpr size_t CodeAlign = 4096; //align code size to a multiple of 4 KiB + constexpr size_t LiteralPoolSize = CodeAlign; + constexpr size_t SuperscalarLiteraPoolSize = RANDOMX_CACHE_MAX_ACCESSES * CodeAlign; + constexpr size_t ReserveCodeSize = CodeAlign; //prologue, epilogue + reserve + + constexpr size_t RandomXCodeSize = alignSize(LiteralPoolSize + ReserveCodeSize + MaxRandomXInstrCodeSize * RANDOMX_PROGRAM_MAX_SIZE, CodeAlign); + constexpr size_t SuperscalarSize = alignSize(SuperscalarLiteraPoolSize + 
ReserveCodeSize + (SuperscalarProgramHeader + MaxSuperscalarInstrSize * SuperscalarMaxSize) * RANDOMX_CACHE_MAX_ACCESSES, CodeAlign); + + constexpr uint32_t CodeSize = RandomXCodeSize + SuperscalarSize; + constexpr uint32_t ExecutableSize = CodeSize - LiteralPoolSize; + + constexpr int32_t LiteralPoolOffset = LiteralPoolSize / 2; + constexpr int32_t SuperScalarLiteralPoolOffset = RandomXCodeSize; + constexpr int32_t SuperScalarLiteralPoolRefOffset = RandomXCodeSize + (RANDOMX_CACHE_MAX_ACCESSES - 1) * LiteralPoolSize + LiteralPoolOffset; + constexpr int32_t SuperScalarHashOffset = SuperScalarLiteralPoolOffset + SuperscalarLiteraPoolSize; + + constexpr int32_t unsigned32ToSigned2sCompl(uint32_t x) { + return (-1 == ~0) ? (int32_t)x : (x > INT32_MAX ? (-(int32_t)(UINT32_MAX - x) - 1) : (int32_t)x); + } + +#define MaskL1Shift (32 - RandomX_CurrentConfig.Log2_ScratchpadL1) +#define MaskL2Shift (32 - RandomX_CurrentConfig.Log2_ScratchpadL2) +#define MaskL3Shift (32 - RandomX_CurrentConfig.Log2_ScratchpadL3) + + constexpr int RcpLiteralsOffset = 144; + + constexpr int LiteralPoolReg = 3; //x3 + constexpr int SpadReg = 5; //x5 + constexpr int DataReg = 6; //x6 + constexpr int SuperscalarReg = 7; //x7 + constexpr int SshTmp1Reg = 28; //x28 + constexpr int SshTmp2Reg = 29; //x29 + constexpr int SshPoolReg = 30; //x30 + constexpr int SshRcpReg = 31; //x31 + constexpr int Tmp1Reg = 8; //x8 + constexpr int Tmp2Reg = 9; //x9 + constexpr int Tmp1RegF = 24; //f24 + constexpr int Tmp2RegF = 25; //f25 + constexpr int MaskL1Reg = 10; //x10 + constexpr int MaskL2Reg = 11; //x11 + constexpr int MaskFscalReg = 12; //x12 + constexpr int MaskEclear = 13; //x13 + constexpr int MaskEsetLo = 14; //x14 + constexpr int MaskEsetHi = 15; //x15 + constexpr int MaskL3Reg = 1; //x1 + constexpr int ReturnReg = 1; //x1 + constexpr int SpAddr0Reg = 26; //x26 + constexpr int OffsetXC = -8; //x8-x15 + constexpr int OffsetR = 16; //x16-x23 + constexpr int OffsetF = 0; //f0-f7 + constexpr int OffsetE = 
8; //f8-f15 + constexpr int OffsetA = 16; //f16-f23 + constexpr int OffsetRcp = 28; //x28-x31 + constexpr int OffsetRcpF = 22; //f26-f31 + constexpr int OffsetSsh = 8; //x8-x15 + + //destination register (bit 7+) + constexpr int rvrd(int reg) { + return reg << 7; + } + + //first source register (bit 15+) + constexpr int rvrs1(int reg) { + return reg << 15; + } + + //second source register (bit 20+) + constexpr int rvrs2(int reg) { + return reg << 20; + } + + //compressed source register (bit 2+) + constexpr int rvcrs(int reg) { + return reg << 2; + } + + //base instruction: {op} x{rd}, x{rs1}, x{rs2} + constexpr uint32_t rvi(uint32_t op, int rd, int rs1, int rs2 = 0) { + return op | rvrs2(rs2) | rvrs1(rs1) | rvrd(rd); + } + + //compressed instruction: op x{rd}, x{rs} + constexpr uint16_t rvc(uint16_t op, int rd, int rs) { + return op | rvrd(rd) | rvcrs(rs); + } + + //compressed instruction: op x{rd}, imm6 + constexpr uint16_t rvc(uint16_t op, int imm5, int rd, int imm40) { + return op | (imm5 << 12) | rvrd(rd) | (imm40 << 2); + } + + constexpr int regR(int reg) { + return reg + OffsetR; + } + + constexpr int regLoA(int reg) { + return 2 * reg + OffsetA; + } + + constexpr int regHiA(int reg) { + return 2 * reg + OffsetA + 1; + } + + constexpr int regLoF(int reg) { + return 2 * reg + OffsetF; + } + + constexpr int regHiF(int reg) { + return 2 * reg + OffsetF + 1; + } + + constexpr int regLoE(int reg) { + return 2 * reg + OffsetE; + } + + constexpr int regHiE(int reg) { + return 2 * reg + OffsetE + 1; + } + + constexpr int regRcp(int reg) { + return reg + OffsetRcp; + } + + constexpr int regRcpF(int reg) { + return reg + OffsetRcpF; + } + + constexpr int regSS(int reg) { + return reg + OffsetSsh; + } + + static const uint8_t* codeLiterals = (uint8_t*)&randomx_riscv64_literals; + static const uint8_t* codeLiteralsEnd = (uint8_t*)&randomx_riscv64_literals_end; + static const uint8_t* codeDataInit = (uint8_t*)&randomx_riscv64_data_init; + static const uint8_t* 
codeFixDataCall = (uint8_t*)&randomx_riscv64_fix_data_call; + static const uint8_t* codePrologue = (uint8_t*)&randomx_riscv64_prologue; + static const uint8_t* codeLoopBegin = (uint8_t*)&randomx_riscv64_loop_begin; + static const uint8_t* codeDataRead = (uint8_t*)&randomx_riscv64_data_read; + static const uint8_t* codeDataReadLight = (uint8_t*)&randomx_riscv64_data_read_light; + static const uint8_t* codeFixLoopCall = (uint8_t*)&randomx_riscv64_fix_loop_call; + static const uint8_t* codeSpadStore = (uint8_t*)&randomx_riscv64_spad_store; + static const uint8_t* codeSpadStoreHardAes = (uint8_t*)&randomx_riscv64_spad_store_hardaes; + static const uint8_t* codeSpadStoreSoftAes = (uint8_t*)&randomx_riscv64_spad_store_softaes; + static const uint8_t* codeLoopEnd = (uint8_t*)&randomx_riscv64_loop_end; + static const uint8_t* codeFixContinueLoop = (uint8_t*)&randomx_riscv64_fix_continue_loop; + static const uint8_t* codeEpilogue = (uint8_t*)&randomx_riscv64_epilogue; + static const uint8_t* codeSoftAes = (uint8_t*)&randomx_riscv64_softaes; + static const uint8_t* codeProgramEnd = (uint8_t*)&randomx_riscv64_program_end; + static const uint8_t* codeSshInit = (uint8_t*)&randomx_riscv64_ssh_init; + static const uint8_t* codeSshLoad = (uint8_t*)&randomx_riscv64_ssh_load; + static const uint8_t* codeSshPrefetch = (uint8_t*)&randomx_riscv64_ssh_prefetch; + static const uint8_t* codeSshEnd = (uint8_t*)&randomx_riscv64_ssh_end; + + static const int32_t sizeLiterals = codeLiteralsEnd - codeLiterals; + static const int32_t sizeDataInit = codePrologue - codeDataInit; + static const int32_t sizePrologue = codeLoopBegin - codePrologue; + static const int32_t sizeLoopBegin = codeDataRead - codeLoopBegin; + static const int32_t sizeDataRead = codeDataReadLight - codeDataRead; + static const int32_t sizeDataReadLight = codeSpadStore - codeDataReadLight; + static const int32_t sizeSpadStore = codeSpadStoreHardAes - codeSpadStore; + static const int32_t sizeSpadStoreSoftAes = codeLoopEnd - 
codeSpadStoreSoftAes; + static const int32_t sizeLoopEnd = codeEpilogue - codeLoopEnd; + static const int32_t sizeEpilogue = codeSoftAes - codeEpilogue; + static const int32_t sizeSoftAes = codeProgramEnd - codeSoftAes; + static const int32_t sizeSshInit = codeSshLoad - codeSshInit; + static const int32_t sizeSshLoad = codeSshPrefetch - codeSshLoad; + static const int32_t sizeSshPrefetch = codeSshEnd - codeSshPrefetch; + + static const int32_t offsetFixDataCall = codeFixDataCall - codeDataInit; + static const int32_t offsetFixLoopCall = codeFixLoopCall - codeDataReadLight; + static const int32_t offsetFixContinueLoop = codeFixContinueLoop - codeLoopEnd; + + static const int32_t LoopTopPos = LiteralPoolSize + sizeDataInit + sizePrologue; + static const int32_t RandomXCodePos = LoopTopPos + sizeLoopBegin; + + static void clearCache(CodeBuffer& buf) { +#ifdef __GNUC__ + __builtin___clear_cache((char*)buf.code, (char*)(buf.code + CodeSize)); +#endif + } + + //emits code to calculate: x{dst} = x{src} + {imm32} + //takes 1-3 isns, 2-10 bytes + static void emitImm32(CodeBuffer& buf, int32_t imm, int dst, int src = 0, int tmp = 0) { + + //lower 12 bits + int32_t limm = (imm << 20) >> 20; + //upper 20 bits + int32_t uimm = (imm >> 12) + (limm < 0); + + //If there are no upper bits, the whole thing + //can be done with a single instruction. + if (uimm == 0) { + //addi x{dst}, x{src}, {limm} + buf.emit(rvi(rv64::ADDI, dst, src, limm)); + return; + } + + //dst1 is the register where imm will be materialized + int dst1 = src != dst ? dst : tmp; + assert(dst1 != 0); + //src1 is the register that will be added to the result + int src1 = src != dst ? 
src : dst1; + + //load upper bits + if (uimm >= -32 && uimm <= 31) { + //c.lui x{dst1}, {uimm} + buf.emit(rvc(rv64::C_LUI, (uimm < 0), dst1, (uimm & 31))); + } + else { + //lui x{dst1}, {uimm} + buf.emit(rv64::LUI | (uimm << 12) | rvrd(dst1)); + } + //load lower bits + if (limm != 0) { + //Note: this must be addiw NOT addi, otherwise the upper 32 bits + //of the 64-bit register will be incorrect. + //addiw x{dst1}, x{dst1}, {limm} + buf.emit(rvi(rv64::ADDIW, dst1, dst1, limm)); + } + //add src + if (src1 != 0) { + //c.add x{dst}, x{src1} + buf.emit(rvc(rv64::C_ADD, dst, src1)); + } + } + + //x9 = &Scratchpad[isn.imm] + //takes 3 isns, 10 bytes + static void genAddressRegImm(CodeBuffer& buf, const Instruction& isn) { + //signed offset 8-byte aligned + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()) & ScratchpadL3Mask; + //x9 = x5 + {imm} + emitImm32(buf, imm, Tmp2Reg, SpadReg, Tmp1Reg); + } + + //x9 = &Scratchpad[isn.src + isn.imm] (for reading) + //takes 5 isns, 12 bytes + static void genAddressReg(CodeBuffer& buf, const Instruction& isn) { + int shift, maskReg; + if (isn.getModMem()) { + shift = MaskL1Shift; + maskReg = MaskL1Reg; + } + else { + shift = MaskL2Shift; + maskReg = MaskL2Reg; + } + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + imm = (imm << shift) >> shift; + //x9 = x{src} + {imm} + emitImm32(buf, imm, Tmp2Reg, regR(isn.src), Tmp1Reg); + //c.and x9, x{maskReg} + buf.emit(rvc(rv64::C_AND, (Tmp2Reg + OffsetXC), (maskReg + OffsetXC))); + //c.add x9, x{spadReg} + buf.emit(rvc(rv64::C_ADD, Tmp2Reg, SpadReg)); + } + + //x8 = Scratchpad[isn] + static void loadFromScratchpad(CodeBuffer& buf, const Instruction& isn) { + if (isn.src != isn.dst) { + //x9 = &Scratchpad[isn.src + isn.imm] + genAddressReg(buf, isn); + } + else { + ///x9 = &Scratchpad[isn.imm] + genAddressRegImm(buf, isn); + } + //c.ld x8, 0(x9) + buf.emit(rvc(rv64::C_LD, Tmp2Reg + OffsetXC, Tmp1Reg + OffsetXC)); + } + + //x9 = &Scratchpad[isn.dst + isn.imm32] (for writing) + 
//takes 5 isns, 12-16 bytes + static void genAddressRegDst(CodeBuffer& buf, const Instruction& isn) { + if (isn.getModCond() < StoreL3Condition) { + int shift, maskReg; + if (isn.getModMem()) { + shift = MaskL1Shift; + maskReg = MaskL1Reg; + } + else { + shift = MaskL2Shift; + maskReg = MaskL2Reg; + } + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + imm = (imm << shift) >> shift; + //x9 = x{dst} + {imm} + emitImm32(buf, imm, Tmp2Reg, regR(isn.dst), Tmp1Reg); + //c.and x9, x{maskReg} + buf.emit(rvc(rv64::C_AND, Tmp2Reg + OffsetXC, maskReg + OffsetXC)); + //c.add x9, x5 + buf.emit(rvc(rv64::C_ADD, Tmp2Reg, SpadReg)); + } + else { + int shift = MaskL3Shift; + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + imm = (imm << shift) >> shift; + //x9 = x{dst} + {imm} + emitImm32(buf, imm, Tmp2Reg, regR(isn.dst), Tmp1Reg); + //and x9, x9, x1 + buf.emit(rvi(rv64::AND, Tmp2Reg, Tmp2Reg, MaskL3Reg)); + //c.add x9, x5 + buf.emit(rvc(rv64::C_ADD, Tmp2Reg, SpadReg)); + } + } + + static void emitRcpLiteral1(CodeBuffer& buf, uint64_t literal) { + //first 238 at positive offsets + if (buf.rcpCount < 238) { + buf.emitAt(LiteralPoolOffset + RcpLiteralsOffset + buf.rcpCount * 8, literal); + buf.rcpCount++; + } + //next 256 at negative offsets + else if (buf.rcpCount < 494) { + buf.emitAt(buf.rcpCount * 8 - (2048 - RcpLiteralsOffset), literal); + buf.rcpCount++; + } + else { + //checked at compile time, but double-check here + throw std::runtime_error("Literal pool overflow"); + } + } + + static void emitRcpLiteral2(CodeBuffer& buf, uint64_t literal, bool lastLiteral) { + //store the current literal in the pool + int32_t offset = 2040 - buf.rcpCount * 8; + buf.emitAt(SuperScalarLiteralPoolRefOffset + offset, literal); + buf.rcpCount++; + if (lastLiteral) { + return; + } + //load the next literal + offset -= 8; + int32_t imm = offset & 0xfff; + //ld x31, {offset}(x30) + buf.emit(rvi(rv64::LD, SshRcpReg, SshPoolReg, imm)); + if (imm == 0x800) { + //move pool pointer 
back 4KB + //c.lui x29, 0xfffff + buf.emit(rvc(rv64::C_LUI, 1, SshTmp2Reg, 31)); + //c.add x30, x29 + buf.emit(rvc(rv64::C_ADD, SshPoolReg, SshTmp2Reg)); + } + } + + static void emitJump(CodeBuffer& buf, int dst, int32_t codePos, int32_t targetPos) { + int32_t imm = targetPos - codePos; + int32_t imm20 = (imm < 0) << 11; + int32_t imm1912 = (imm >> 7) & 8160; + int32_t imm11 = (imm >> 11) & 1; + int32_t imm101 = imm & 2046; + //jal x{dst}, {imm} + buf.emitAt(codePos, rvi(rv64::JAL, dst + imm1912, 0, imm20 + imm101 + imm11)); + } + + static void emitInstruction(CompilerState& state, Instruction isn, int i) { + state.instructionOffsets[i] = state.codePos; + (*JitCompilerRV64::engine[isn.opcode])(state, isn, i); + } + + static void emitProgramPrefix(CompilerState& state, Program& prog, ProgramConfiguration& pcfg) { + state.codePos = RandomXCodePos; + state.rcpCount = 0; + state.emitAt(LiteralPoolOffset + sizeLiterals, pcfg.eMask[0]); + state.emitAt(LiteralPoolOffset + sizeLiterals + 8, pcfg.eMask[1]); + for (unsigned i = 0; i < RegistersCount; ++i) { + state.registerUsage[i] = -1; + } + for (unsigned i = 0; i < prog.getSize(); ++i) { + Instruction instr = prog(i); + instr.src %= RegistersCount; + instr.dst %= RegistersCount; + emitInstruction(state, instr, i); + } + } + + static void emitProgramSuffix(CompilerState& state, ProgramConfiguration& pcfg) { + state.emit(codeSpadStore, sizeSpadStore); + int32_t fixPos = state.codePos; + state.emit(codeLoopEnd, sizeLoopEnd); + //xor x26, x{readReg0}, x{readReg1} + state.emitAt(fixPos, rvi(rv64::XOR, SpAddr0Reg, regR(pcfg.readReg0), regR(pcfg.readReg1))); + fixPos += offsetFixContinueLoop; + //j LoopTop + emitJump(state, 0, fixPos, LoopTopPos); + state.emit(codeEpilogue, sizeEpilogue); + } + + static void generateSuperscalarCode(CodeBuffer& buf, Instruction isn, bool lastLiteral) { + switch ((SuperscalarInstructionType)isn.opcode) + { + case randomx::SuperscalarInstructionType::ISUB_R: + //c.sub x{dst}, x{src} + 
buf.emit(rvc(rv64::C_SUB, regSS(isn.dst) + OffsetXC, regSS(isn.src) + OffsetXC)); + break; + case randomx::SuperscalarInstructionType::IXOR_R: + //c.xor x{dst}, x{src} + buf.emit(rvc(rv64::C_XOR, regSS(isn.dst) + OffsetXC, regSS(isn.src) + OffsetXC)); + break; + case randomx::SuperscalarInstructionType::IADD_RS: + { + int shift = isn.getModShift(); + if (shift == 0) { + //c.add x{dst}, x{src} + buf.emit(rvc(rv64::C_ADD, regSS(isn.dst), regSS(isn.src))); + } + else { +#ifdef __riscv_zba + //sh{1,2,3}add x{dst}, x{src}, x{dst} + buf.emit(rv64::SHXADD | rvrs2(regSS(isn.dst)) | rvrs1(regSS(isn.src)) | (shift << 13) | rvrd(regSS(isn.dst))); +#else + //slli x28, x{src}, {shift} + buf.emit(rvi(rv64::SLLI, SshTmp1Reg, regSS(isn.src), shift)); + //c.add x{dst}, x28 + buf.emit(rvc(rv64::C_ADD, regSS(isn.dst), SshTmp1Reg)); +#endif + } + } + break; + case randomx::SuperscalarInstructionType::IMUL_R: + //mul x{dst}, x{dst}, x{src} + buf.emit(rvi(rv64::MUL, regSS(isn.dst), regSS(isn.dst), regSS(isn.src))); + break; + case randomx::SuperscalarInstructionType::IROR_C: + { +#ifdef __riscv_zbb + int32_t imm = isn.getImm32() & 63; + //rori x{dst}, x{dst}, {imm} + buf.emit(rvi(rv64::RORI, regSS(isn.dst), regSS(isn.dst), imm)); +#else + int32_t immr = isn.getImm32() & 63; + int32_t imml = -immr & 63; + int32_t imml5 = imml >> 5; + int32_t imml40 = imml & 31; + //srli x28, x{dst}, {immr} + buf.emit(rvi(rv64::SRLI, SshTmp1Reg, regSS(isn.dst), immr)); + //c.slli x{dst}, {imml} + buf.emit(rvc(rv64::C_SLLI, imml5, regSS(isn.dst), imml40)); + //or x{dst}, x{dst}, x28 + buf.emit(rvi(rv64::OR, regSS(isn.dst), regSS(isn.dst), SshTmp1Reg)); +#endif + } + break; + case randomx::SuperscalarInstructionType::IADD_C7: + case randomx::SuperscalarInstructionType::IADD_C8: + case randomx::SuperscalarInstructionType::IADD_C9: + { + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + //x{dst} = x{dst} + {imm} + emitImm32(buf, imm, regSS(isn.dst), regSS(isn.dst), SshTmp1Reg); + } + break; + case 
randomx::SuperscalarInstructionType::IXOR_C7: + case randomx::SuperscalarInstructionType::IXOR_C8: + case randomx::SuperscalarInstructionType::IXOR_C9: + { + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + //x28 = {imm} + emitImm32(buf, imm, SshTmp1Reg); + //xor x{dst}, x{dst}, x28 + buf.emit(rvi(rv64::XOR, regSS(isn.dst), regSS(isn.dst), SshTmp1Reg)); + } + break; + case randomx::SuperscalarInstructionType::IMULH_R: + //mulhu x{dst}, x{dst}, x{src} + buf.emit(rvi(rv64::MULHU, regSS(isn.dst), regSS(isn.dst), regSS(isn.src))); + break; + case randomx::SuperscalarInstructionType::ISMULH_R: + //mulh x{dst}, x{dst}, x{src} + buf.emit(rvi(rv64::MULH, regSS(isn.dst), regSS(isn.dst), regSS(isn.src))); + break; + case randomx::SuperscalarInstructionType::IMUL_RCP: + //mul x{dst}, x{dst}, x31 + buf.emit(rvi(rv64::MUL, regSS(isn.dst), regSS(isn.dst), SshRcpReg)); + //load the next literal into x31 + emitRcpLiteral2(buf, randomx_reciprocal(isn.getImm32()), lastLiteral); + break; + default: + UNREACHABLE; + } + } + + size_t JitCompilerRV64::getCodeSize() { + return CodeSize; + } + + JitCompilerRV64::JitCompilerRV64(bool hugePagesEnable, bool) { + state.code = static_cast(allocExecutableMemory(CodeSize, hugePagesJIT && hugePagesEnable)); + state.emitAt(LiteralPoolOffset, codeLiterals, sizeLiterals); + + const uint32_t L1_Mask = RandomX_CurrentConfig.ScratchpadL1_Size - 8; + const uint32_t L2_Mask = RandomX_CurrentConfig.ScratchpadL2_Size - 8; + const uint32_t L3_Mask = RandomX_CurrentConfig.ScratchpadL3_Size - 64; + const uint32_t DatasetBaseSize_Mask = RandomX_CurrentConfig.DatasetBaseSize - 64; + + state.emitAt(LiteralPoolOffset + 80, reinterpret_cast(&L1_Mask), sizeof(L1_Mask)); + state.emitAt(LiteralPoolOffset + 84, reinterpret_cast(&L2_Mask), sizeof(L2_Mask)); + state.emitAt(LiteralPoolOffset + 88, reinterpret_cast(&L3_Mask), sizeof(L3_Mask)); + state.emitAt(LiteralPoolOffset + 92, reinterpret_cast(&DatasetBaseSize_Mask), sizeof(DatasetBaseSize_Mask)); + + 
state.emitAt(LiteralPoolSize, codeDataInit, sizeDataInit + sizePrologue + sizeLoopBegin); + entryDataInit = state.code + LiteralPoolSize; + entryProgram = state.code + LiteralPoolSize + sizeDataInit; + //jal x1, SuperscalarHash + emitJump(state, ReturnReg, LiteralPoolSize + offsetFixDataCall, SuperScalarHashOffset); + } + + JitCompilerRV64::~JitCompilerRV64() { + freePagedMemory(state.code, CodeSize); + } + + void JitCompilerRV64::enableWriting() const + { + xmrig::VirtualMemory::protectRW(entryDataInit, ExecutableSize); + } + + void JitCompilerRV64::enableExecution() const + { + xmrig::VirtualMemory::protectRX(entryDataInit, ExecutableSize); + } + + void JitCompilerRV64::generateProgram(Program& prog, ProgramConfiguration& pcfg, uint32_t) { + emitProgramPrefix(state, prog, pcfg); + int32_t fixPos = state.codePos; + state.emit(codeDataRead, sizeDataRead); + //xor x8, x{readReg2}, x{readReg3} + state.emitAt(fixPos, rvi(rv64::XOR, Tmp1Reg, regR(pcfg.readReg2), regR(pcfg.readReg3))); + emitProgramSuffix(state, pcfg); + clearCache(state); + } + + void JitCompilerRV64::generateProgramLight(Program& prog, ProgramConfiguration& pcfg, uint32_t datasetOffset) { + emitProgramPrefix(state, prog, pcfg); + int32_t fixPos = state.codePos; + state.emit(codeDataReadLight, sizeDataReadLight); + //xor x8, x{readReg2}, x{readReg3} + state.emitAt(fixPos, rvi(rv64::XOR, Tmp1Reg, regR(pcfg.readReg2), regR(pcfg.readReg3))); + int32_t imm = datasetOffset / CacheLineSize; + int32_t limm = (imm << 20) >> 20; + int32_t uimm = (imm >> 12) + (limm < 0); + //lui x9, {uimm} + state.emitAt(fixPos + 4, rv64::LUI | (uimm << 12) | rvrd(Tmp2Reg)); + //addi x9, x9, {limm} + state.emitAt(fixPos + 8, rvi(rv64::ADDI, Tmp2Reg, Tmp2Reg, limm)); + fixPos += offsetFixLoopCall; + //jal x1, SuperscalarHash + emitJump(state, ReturnReg, fixPos, SuperScalarHashOffset); + emitProgramSuffix(state, pcfg); + clearCache(state); + } + + template + void 
JitCompilerRV64::generateSuperscalarHash(SuperscalarProgram(&programs)[N]) { + state.codePos = SuperScalarHashOffset; + state.rcpCount = 0; + state.emit(codeSshInit, sizeSshInit); + + std::pair lastLiteral{ 0xFFFFFFFFUL, 0xFFFFFFFFUL }; + + for (int j = RandomX_ConfigurationBase::CacheAccesses - 1; (j >= 0) && (lastLiteral.first == 0xFFFFFFFFUL); --j) { + SuperscalarProgram& prog = programs[j]; + for (int i = prog.getSize() - 1; i >= 0; --i) { + if (prog(i).opcode == static_cast(SuperscalarInstructionType::IMUL_RCP)) { + lastLiteral.first = j; + lastLiteral.second = i; + break; + } + } + } + + for (unsigned j = 0; j < RandomX_ConfigurationBase::CacheAccesses; ++j) { + SuperscalarProgram& prog = programs[j]; + for (unsigned i = 0; i < prog.getSize(); ++i) { + Instruction instr = prog(i); + generateSuperscalarCode(state, instr, (j == lastLiteral.first) && (i == lastLiteral.second)); + } + state.emit(codeSshLoad, sizeSshLoad); + if (j < RandomX_ConfigurationBase::CacheAccesses - 1) { + int32_t fixPos = state.codePos; + state.emit(codeSshPrefetch, sizeSshPrefetch); + //and x7, x{addrReg}, x7 + state.emitAt(fixPos, rvi(rv64::AND, SuperscalarReg, regSS(prog.getAddressRegister()), SuperscalarReg)); + } + } + state.emit(rvc(rv64::C_RET, 0, 0)); + clearCache(state); + } + + template void JitCompilerRV64::generateSuperscalarHash(SuperscalarProgram(&)[RANDOMX_CACHE_MAX_ACCESSES]); + + void JitCompilerRV64::v1_IADD_RS(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + int shift = isn.getModShift(); + if (shift == 0) { + //c.add x{dst}, x{src} + state.emit(rvc(rv64::C_ADD, regR(isn.dst), regR(isn.src))); + } + else { +#ifdef __riscv_zba + //sh{1,2,3}add x{dst}, x{src}, x{dst} + state.emit(rv64::SHXADD | rvrs2(regR(isn.dst)) | rvrs1(regR(isn.src)) | (shift << 13) | rvrd(regR(isn.dst))); +#else + //slli x8, x{src}, {shift} + state.emit(rvi(rv64::SLLI, Tmp1Reg, regR(isn.src), shift)); + //c.add x{dst}, x8 + state.emit(rvc(rv64::C_ADD, regR(isn.dst), Tmp1Reg)); +#endif + } + if 
(isn.dst == RegisterNeedsDisplacement) { + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + //x{dst} = x{dst} + {imm} + emitImm32(state, imm, regR(isn.dst), regR(isn.dst), Tmp1Reg); + } + } + + void JitCompilerRV64::v1_IADD_M(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + loadFromScratchpad(state, isn); + //c.add x{dst}, x8 + state.emit(rvc(rv64::C_ADD, regR(isn.dst), Tmp1Reg)); + } + + void JitCompilerRV64::v1_ISUB_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + if (isn.src != isn.dst) { + //sub x{dst}, x{dst}, x{src} + state.emit(rvi(rv64::SUB, regR(isn.dst), regR(isn.dst), regR(isn.src))); + } + else { + int32_t imm = unsigned32ToSigned2sCompl(-isn.getImm32()); //convert to add + //x{dst} = x{dst} + {-imm} + emitImm32(state, imm, regR(isn.dst), regR(isn.dst), Tmp1Reg); + } + } + + void JitCompilerRV64::v1_ISUB_M(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + loadFromScratchpad(state, isn); + //sub x{dst}, x{dst}, x8 + state.emit(rvi(rv64::SUB, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + + void JitCompilerRV64::v1_IMUL_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + if (isn.src != isn.dst) { + //mul x{dst}, x{dst}, x{src} + state.emit(rvi(rv64::MUL, regR(isn.dst), regR(isn.dst), regR(isn.src))); + } + else { + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + //x8 = {imm} + emitImm32(state, imm, Tmp1Reg); + //mul x{dst}, x{dst}, x8 + state.emit(rvi(rv64::MUL, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + } + + void JitCompilerRV64::v1_IMUL_M(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + loadFromScratchpad(state, isn); + //mul x{dst}, x{dst}, x8 + state.emit(rvi(rv64::MUL, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + + void JitCompilerRV64::v1_IMULH_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + //mulhu x{dst}, x{dst}, x{src} + state.emit(rvi(rv64::MULHU, regR(isn.dst), regR(isn.dst), regR(isn.src))); + } + + void JitCompilerRV64::v1_IMULH_M(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; 
+ loadFromScratchpad(state, isn); + //mulhu x{dst}, x{dst}, x8 + state.emit(rvi(rv64::MULHU, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + + void JitCompilerRV64::v1_ISMULH_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + //mulh x{dst}, x{dst}, x{src} + state.emit(rvi(rv64::MULH, regR(isn.dst), regR(isn.dst), regR(isn.src))); + } + + void JitCompilerRV64::v1_ISMULH_M(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + loadFromScratchpad(state, isn); + //mulh x{dst}, x{dst}, x8 + state.emit(rvi(rv64::MULH, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + + void JitCompilerRV64::v1_IMUL_RCP(HANDLER_ARGS) { + const uint32_t divisor = isn.getImm32(); + if (!isZeroOrPowerOf2(divisor)) { + state.registerUsage[isn.dst] = i; + if (state.rcpCount < 4) { + //mul x{dst}, x{dst}, x{rcp} + state.emit(rvi(rv64::MUL, regR(isn.dst), regR(isn.dst), regRcp(state.rcpCount))); + } + else if (state.rcpCount < 10) { + //fmv.x.d x8, f{rcp} + state.emit(rvi(rv64::FMV_X_D, Tmp1Reg, regRcpF(state.rcpCount))); + //mul x{dst}, x{dst}, x8 + state.emit(rvi(rv64::MUL, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + else { + int32_t offset = RcpLiteralsOffset + state.rcpCount * 8; + //ld x8, {offset}(x3) + state.emit(rvi(rv64::LD, Tmp1Reg, LiteralPoolReg, offset)); + //mul x{dst}, x{dst}, x8 + state.emit(rvi(rv64::MUL, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + emitRcpLiteral1(state, randomx_reciprocal_fast(divisor)); + } + } + + void JitCompilerRV64::v1_INEG_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + //sub x{dst}, x0, x{dst} + state.emit(rvi(rv64::SUB, regR(isn.dst), 0, regR(isn.dst))); + } + + void JitCompilerRV64::v1_IXOR_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + if (isn.src != isn.dst) { + //xor x{dst}, x{dst}, x{src} + state.emit(rvi(rv64::XOR, regR(isn.dst), regR(isn.dst), regR(isn.src))); + } + else { + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + //x8 = {imm} + emitImm32(state, imm, Tmp1Reg); + //xor x{dst}, x{dst}, x8 + 
state.emit(rvi(rv64::XOR, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + } + + void JitCompilerRV64::v1_IXOR_M(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + loadFromScratchpad(state, isn); + //xor x{dst}, x{dst}, x8 + state.emit(rvi(rv64::XOR, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + + void JitCompilerRV64::v1_IROR_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; +#ifdef __riscv_zbb + if (isn.src != isn.dst) { + //ror x{dst}, x{dst}, x{src} + state.emit(rvi(rv64::ROR, regR(isn.dst), regR(isn.dst), regR(isn.src))); + } + else { + int32_t imm = isn.getImm32() & 63; + //rori x{dst}, x{dst}, {imm} + state.emit(rvi(rv64::RORI, regR(isn.dst), regR(isn.dst), imm)); + } +#else + if (isn.src != isn.dst) { + //sub x8, x0, x{src} + state.emit(rvi(rv64::SUB, Tmp1Reg, 0, regR(isn.src))); + //srl x9, x{dst}, x{src} + state.emit(rvi(rv64::SRL, Tmp2Reg, regR(isn.dst), regR(isn.src))); + //sll x{dst}, x{dst}, x8 + state.emit(rvi(rv64::SLL, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + //or x{dst}, x{dst}, x9 + state.emit(rvi(rv64::OR, regR(isn.dst), regR(isn.dst), Tmp2Reg)); + } + else { + int32_t immr = isn.getImm32() & 63; + int32_t imml = -immr & 63; + int32_t imml5 = imml >> 5; + int32_t imml40 = imml & 31; + //srli x8, x{dst}, {immr} + state.emit(rvi(rv64::SRLI, Tmp1Reg, regR(isn.dst), immr)); + //c.slli x{dst}, {imml} + state.emit(rvc(rv64::C_SLLI, imml5, regR(isn.dst), imml40)); + //or x{dst}, x{dst}, x8 + state.emit(rvi(rv64::OR, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } +#endif + } + + void JitCompilerRV64::v1_IROL_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; +#ifdef __riscv_zbb + if (isn.src != isn.dst) { + //rol x{dst}, x{dst}, x{src} + state.emit(rvi(rv64::ROL, regR(isn.dst), regR(isn.dst), regR(isn.src))); + } + else { + int32_t imm = -isn.getImm32() & 63; + //rori x{dst}, x{dst}, {imm} + state.emit(rvi(rv64::RORI, regR(isn.dst), regR(isn.dst), imm)); + } +#else + if (isn.src != isn.dst) { + //sub x8, x0, x{src} + state.emit(rvi(rv64::SUB, 
Tmp1Reg, 0, regR(isn.src))); + //sll x9, x{dst}, x{src} + state.emit(rvi(rv64::SLL, Tmp2Reg, regR(isn.dst), regR(isn.src))); + //srl x{dst}, x{dst}, x8 + state.emit(rvi(rv64::SRL, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + //or x{dst}, x{dst}, x9 + state.emit(rvi(rv64::OR, regR(isn.dst), regR(isn.dst), Tmp2Reg)); + } + else { + int32_t imml = isn.getImm32() & 63; + int32_t immr = -imml & 63; + int32_t imml5 = imml >> 5; + int32_t imml40 = imml & 31; + //srli x8, x{dst}, {immr} + state.emit(rvi(rv64::SRLI, Tmp1Reg, regR(isn.dst), immr)); + //c.slli x{dst}, {imml} + state.emit(rvc(rv64::C_SLLI, imml5, regR(isn.dst), imml40)); + //or x{dst}, x{dst}, x8 + state.emit(rvi(rv64::OR, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } +#endif + } + + void JitCompilerRV64::v1_ISWAP_R(HANDLER_ARGS) { + if (isn.src != isn.dst) { + state.registerUsage[isn.dst] = i; + state.registerUsage[isn.src] = i; + //c.mv x8, x{dst} + state.emit(rvc(rv64::C_MV, Tmp1Reg, regR(isn.dst))); + //c.mv x{dst}, x{src} + state.emit(rvc(rv64::C_MV, regR(isn.dst), regR(isn.src))); + //c.mv x{src}, x8 + state.emit(rvc(rv64::C_MV, regR(isn.src), Tmp1Reg)); + } + } + + void JitCompilerRV64::v1_FSWAP_R(HANDLER_ARGS) { + //fmv.d f24, f{dst_lo} + state.emit(rvi(rv64::FMV_D, Tmp1RegF, regLoF(isn.dst), regLoF(isn.dst))); + //fmv.d f{dst_lo}, f{dst_hi} + state.emit(rvi(rv64::FMV_D, regLoF(isn.dst), regHiF(isn.dst), regHiF(isn.dst))); + //fmv.d f{dst_hi}, f24 + state.emit(rvi(rv64::FMV_D, regHiF(isn.dst), Tmp1RegF, Tmp1RegF)); + } + + void JitCompilerRV64::v1_FADD_R(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + isn.src %= RegisterCountFlt; + //fadd.d f{dst_lo}, f{dst_lo}, f{src_lo} + state.emit(rvi(rv64::FADD_D, regLoF(isn.dst), regLoF(isn.dst), regLoA(isn.src))); + //fadd.d f{dst_hi}, f{dst_hi}, f{src_hi} + state.emit(rvi(rv64::FADD_D, regHiF(isn.dst), regHiF(isn.dst), regHiA(isn.src))); + } + + void JitCompilerRV64::v1_FADD_M(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + //x9 = mem + genAddressReg(state, 
isn); + //lw x8, 0(x9) + state.emit(rvc(rv64::C_LW, Tmp2Reg + OffsetXC, Tmp1Reg + OffsetXC)); + //lw x9, 4(x9) + state.emit(rvc(rv64::C_LW, Tmp2Reg + OffsetXC, 16 + Tmp2Reg + OffsetXC)); + //fcvt.d.w f24, x8 + state.emit(rvi(rv64::FCVT_D_W, Tmp1RegF, Tmp1Reg)); + //fcvt.d.w f25, x9 + state.emit(rvi(rv64::FCVT_D_W, Tmp2RegF, Tmp2Reg)); + //fadd.d f{dst_lo}, f{dst_lo}, f24 + state.emit(rvi(rv64::FADD_D, regLoF(isn.dst), regLoF(isn.dst), Tmp1RegF)); + //fadd.d f{dst_hi}, f{dst_hi}, f25 + state.emit(rvi(rv64::FADD_D, regHiF(isn.dst), regHiF(isn.dst), Tmp2RegF)); + } + + void JitCompilerRV64::v1_FSUB_R(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + isn.src %= RegisterCountFlt; + //fsub.d f{dst_lo}, f{dst_lo}, f{src_lo} + state.emit(rvi(rv64::FSUB_D, regLoF(isn.dst), regLoF(isn.dst), regLoA(isn.src))); + //fsub.d f{dst_hi}, f{dst_hi}, f{src_hi} + state.emit(rvi(rv64::FSUB_D, regHiF(isn.dst), regHiF(isn.dst), regHiA(isn.src))); + } + + void JitCompilerRV64::v1_FSUB_M(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + //x9 = mem + genAddressReg(state, isn); + //c.lw x8, 0(x9) + state.emit(rvc(rv64::C_LW, Tmp2Reg + OffsetXC, Tmp1Reg + OffsetXC)); + //c.lw x9, 4(x9) + state.emit(rvc(rv64::C_LW, Tmp2Reg + OffsetXC, 16 + Tmp2Reg + OffsetXC)); + //fcvt.d.w f24, x8 + state.emit(rvi(rv64::FCVT_D_W, Tmp1RegF, Tmp1Reg)); + //fcvt.d.w f25, x9 + state.emit(rvi(rv64::FCVT_D_W, Tmp2RegF, Tmp2Reg)); + //fsub.d f{dst_lo}, f{dst_lo}, f24 + state.emit(rvi(rv64::FSUB_D, regLoF(isn.dst), regLoF(isn.dst), Tmp1RegF)); + //fsub.d f{dst_hi}, f{dst_hi}, f25 + state.emit(rvi(rv64::FSUB_D, regHiF(isn.dst), regHiF(isn.dst), Tmp2RegF)); + } + + void JitCompilerRV64::v1_FSCAL_R(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + //fmv.x.d x8, f{dst_lo} + state.emit(rvi(rv64::FMV_X_D, Tmp1Reg, regLoF(isn.dst))); + //fmv.x.d x9, f{dst_hi} + state.emit(rvi(rv64::FMV_X_D, Tmp2Reg, regHiF(isn.dst))); + //c.xor x8, x12 + state.emit(rvc(rv64::C_XOR, Tmp1Reg + OffsetXC, MaskFscalReg + OffsetXC)); + //c.xor x9, 
x12 + state.emit(rvc(rv64::C_XOR, Tmp2Reg + OffsetXC, MaskFscalReg + OffsetXC)); + //fmv.d.x f{dst_lo}, x8 + state.emit(rvi(rv64::FMV_D_X, regLoF(isn.dst), Tmp1Reg)); + //fmv.d.x f{dst_hi}, x9 + state.emit(rvi(rv64::FMV_D_X, regHiF(isn.dst), Tmp2Reg)); + } + + void JitCompilerRV64::v1_FMUL_R(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + isn.src %= RegisterCountFlt; + //fmul.d f{dst_lo}, f{dst_lo}, f{src_lo} + state.emit(rvi(rv64::FMUL_D, regLoE(isn.dst), regLoE(isn.dst), regLoA(isn.src))); + //fmul.d f{dst_hi}, f{dst_hi}, f{src_hi} + state.emit(rvi(rv64::FMUL_D, regHiE(isn.dst), regHiE(isn.dst), regHiA(isn.src))); + } + + void JitCompilerRV64::v1_FDIV_M(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + //x9 = mem + genAddressReg(state, isn); + //lw x8, 0(x9) + state.emit(rvc(rv64::C_LW, Tmp2Reg + OffsetXC, Tmp1Reg + OffsetXC)); + //lw x9, 4(x9) + state.emit(rvc(rv64::C_LW, Tmp2Reg + OffsetXC, 16 + Tmp2Reg + OffsetXC)); + //fcvt.d.w f24, x8 + state.emit(rvi(rv64::FCVT_D_W, Tmp1RegF, Tmp1Reg)); + //fcvt.d.w f25, x9 + state.emit(rvi(rv64::FCVT_D_W, Tmp2RegF, Tmp2Reg)); + //fmv.x.d x8, f24 + state.emit(rvi(rv64::FMV_X_D, Tmp1Reg, Tmp1RegF)); + //fmv.x.d x9, f25 + state.emit(rvi(rv64::FMV_X_D, Tmp2Reg, Tmp2RegF)); + //c.and x8, x13 + state.emit(rvc(rv64::C_AND, Tmp1Reg + OffsetXC, MaskEclear + OffsetXC)); + //c.and x9, x13 + state.emit(rvc(rv64::C_AND, Tmp2Reg + OffsetXC, MaskEclear + OffsetXC)); + //c.or x8, x14 + state.emit(rvc(rv64::C_OR, Tmp1Reg + OffsetXC, MaskEsetLo + OffsetXC)); + //c.or x9, x15 + state.emit(rvc(rv64::C_OR, Tmp2Reg + OffsetXC, MaskEsetHi + OffsetXC)); + //fmv.d.x f24, x8 + state.emit(rvi(rv64::FMV_D_X, Tmp1RegF, Tmp1Reg)); + //fmv.d.x f25, x9 + state.emit(rvi(rv64::FMV_D_X, Tmp2RegF, Tmp2Reg)); + //fdiv.d f{dst_lo}, f{dst_lo}, f24 + state.emit(rvi(rv64::FDIV_D, regLoE(isn.dst), regLoE(isn.dst), Tmp1RegF)); + //fdiv.d f{dst_hi}, f{dst_hi}, f25 + state.emit(rvi(rv64::FDIV_D, regHiE(isn.dst), regHiE(isn.dst), Tmp2RegF)); + } + + void 
JitCompilerRV64::v1_FSQRT_R(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + //fsqrt.d f{dst_lo}, f{dst_lo} + state.emit(rvi(rv64::FSQRT_D, regLoE(isn.dst), regLoE(isn.dst))); + //fsqrt.d f{dst_hi}, f{dst_hi} + state.emit(rvi(rv64::FSQRT_D, regHiE(isn.dst), regHiE(isn.dst))); + } + + void JitCompilerRV64::v1_CBRANCH(HANDLER_ARGS) { + int reg = isn.dst; + int target = state.registerUsage[reg] + 1; + int shift = isn.getModCond() + RandomX_ConfigurationBase::JumpOffset; + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + imm |= (1UL << shift); + if (RandomX_ConfigurationBase::JumpOffset > 0 || shift > 0) + imm &= ~(1UL << (shift - 1)); + //x8 = branchMask + emitImm32(state, (int32_t)((1 << RandomX_ConfigurationBase::JumpBits) - 1) << shift, Tmp1Reg); + //x{dst} += {imm} + emitImm32(state, imm, regR(isn.dst), regR(isn.dst), Tmp2Reg); + //and x8, x8, x{dst} + state.emit(rvi(rv64::AND, Tmp1Reg, Tmp1Reg, regR(isn.dst))); + int32_t targetPos = state.instructionOffsets[target]; + int offset = targetPos - state.codePos; + if (offset >= -256) { //C.BEQZ only has a range of 256B + //c.beqz x8, {offset} + int imm8 = 1; //sign bit is always 1 + int imm21 = offset & 6; //offset[2:1] + int imm5 = (offset >> 5) & 1; //offset[5] + int imm43 = offset & 24; //offset[4:3] + int imm76 = (offset >> 3) & 24; //offset[7:6] + state.emit(rvc(rv64::C_BEQZ, imm8, imm43 + (Tmp1Reg + OffsetXC), imm76 + imm21 + imm5)); + } + else if (offset >= -4096) { //BEQ only has a range of 4KB + //beq x8, x0, offset + int imm12 = 1 << 11; //sign bit is always 1 + int imm105 = offset & 2016; //offset[10:5] + int imm41 = offset & 30; //offset[4:1] + int imm11 = (offset >> 11) & 1; //offset[11] + state.emit(rvi(rv64::BEQ, imm41 + imm11, Tmp1Reg, imm12 + imm105)); + } + else { + //c.bnez x8, +6 + state.emit(rvc(rv64::C_BNEZ, Tmp1Reg + OffsetXC, 6)); + //j targetPos + emitJump(state, 0, state.codePos, targetPos); + state.codePos += 4; + } + //mark all registers as used + for (unsigned j = 0; j < 
RegistersCount; ++j) { + state.registerUsage[j] = i; + } + } + + void JitCompilerRV64::v1_CFROUND(HANDLER_ARGS) { + int32_t imm = (isn.getImm32() - 2) & 63; //-2 to avoid a later left shift to multiply by 4 + if (imm != 0) { +#ifdef __riscv_zbb + //rori x8, x{src}, {imm} + state.emit(rvi(rv64::RORI, Tmp1Reg, regR(isn.src), imm)); +#else + int32_t imml = -imm & 63; + //srli x8, x{src}, {imm} + state.emit(rvi(rv64::SRLI, Tmp1Reg, regR(isn.src), imm)); + //slli x9, x{src}, {imml} + state.emit(rvi(rv64::SLLI, Tmp2Reg, regR(isn.src), imml)); + //c.or x8, x9 + state.emit(rvc(rv64::C_OR, Tmp1Reg + OffsetXC, Tmp2Reg + OffsetXC)); +#endif + //c.andi x8, 12 + state.emit(rvc(rv64::C_ANDI, Tmp1Reg + OffsetXC, 12)); + } + else { + //and x8, x{src}, 12 + state.emit(rvi(rv64::ANDI, Tmp1Reg, regR(isn.src), 12)); + } + //c.add x8, x3 + state.emit(rvc(rv64::C_ADD, Tmp1Reg, LiteralPoolReg)); + //c.lw x8, 64(x8) + state.emit(rvc(rv64::C_LW, Tmp1Reg + OffsetXC, 8 + Tmp1Reg + OffsetXC)); + //fsrm x8 + state.emit(rvi(rv64::FSRM, 0, Tmp1Reg, 0)); + } + + void JitCompilerRV64::v1_ISTORE(HANDLER_ARGS) { + genAddressRegDst(state, isn); + //sd x{src}, 0(x9) + state.emit(rvi(rv64::SD, 0, Tmp2Reg, regR(isn.src))); + } + + void JitCompilerRV64::v1_NOP(HANDLER_ARGS) { + } + +InstructionGeneratorRV64 JitCompilerRV64::engine[256] = {}; +} diff --git a/src/crypto/randomx/jit_compiler_rv64.hpp b/src/crypto/randomx/jit_compiler_rv64.hpp new file mode 100644 index 000000000..3eac10a2d --- /dev/null +++ b/src/crypto/randomx/jit_compiler_rv64.hpp @@ -0,0 +1,144 @@ +/* +Copyright (c) 2023 tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#pragma once + +#include +#include +#include +#include "crypto/randomx/common.hpp" +#include "crypto/randomx/jit_compiler_rv64_static.hpp" + +namespace randomx { + + struct CodeBuffer { + uint8_t* code; + int32_t codePos; + int32_t rcpCount; + + void emit(const uint8_t* src, int32_t len) { + memcpy(&code[codePos], src, len); + codePos += len; + } + + template + void emit(T src) { + memcpy(&code[codePos], &src, sizeof(src)); + codePos += sizeof(src); + } + + void emitAt(int32_t codePos, const uint8_t* src, int32_t len) { + memcpy(&code[codePos], src, len); + } + + template + void emitAt(int32_t codePos, T src) { + memcpy(&code[codePos], &src, sizeof(src)); + } + }; + + struct CompilerState : public CodeBuffer { + int32_t instructionOffsets[RANDOMX_PROGRAM_MAX_SIZE]; + int registerUsage[RegistersCount]; + }; + + class Program; + struct ProgramConfiguration; + class SuperscalarProgram; + class Instruction; + +#define HANDLER_ARGS randomx::CompilerState& state, randomx::Instruction isn, int i + typedef void(*InstructionGeneratorRV64)(HANDLER_ARGS); + + class JitCompilerRV64 { + public: + JitCompilerRV64(bool hugePagesEnable, bool optimizedInitDatasetEnable); + ~JitCompilerRV64(); + + void prepare() {} + void generateProgram(Program&, ProgramConfiguration&, uint32_t); + void generateProgramLight(Program&, ProgramConfiguration&, uint32_t); + + template + void generateSuperscalarHash(SuperscalarProgram(&programs)[N]); + + void generateDatasetInitCode() {} + + ProgramFunc* getProgramFunc() { + return (ProgramFunc*)entryProgram; + } + DatasetInitFunc* getDatasetInitFunc() { + return (DatasetInitFunc*)entryDataInit; + } + uint8_t* getCode() { + return state.code; + } + size_t getCodeSize(); + + void enableWriting() const; + void enableExecution() const; + + static InstructionGeneratorRV64 engine[256]; + private: + CompilerState state; + void* entryDataInit; + void* entryProgram; + + public: + static void v1_IADD_RS(HANDLER_ARGS); + static void v1_IADD_M(HANDLER_ARGS); 
+ static void v1_ISUB_R(HANDLER_ARGS); + static void v1_ISUB_M(HANDLER_ARGS); + static void v1_IMUL_R(HANDLER_ARGS); + static void v1_IMUL_M(HANDLER_ARGS); + static void v1_IMULH_R(HANDLER_ARGS); + static void v1_IMULH_M(HANDLER_ARGS); + static void v1_ISMULH_R(HANDLER_ARGS); + static void v1_ISMULH_M(HANDLER_ARGS); + static void v1_IMUL_RCP(HANDLER_ARGS); + static void v1_INEG_R(HANDLER_ARGS); + static void v1_IXOR_R(HANDLER_ARGS); + static void v1_IXOR_M(HANDLER_ARGS); + static void v1_IROR_R(HANDLER_ARGS); + static void v1_IROL_R(HANDLER_ARGS); + static void v1_ISWAP_R(HANDLER_ARGS); + static void v1_FSWAP_R(HANDLER_ARGS); + static void v1_FADD_R(HANDLER_ARGS); + static void v1_FADD_M(HANDLER_ARGS); + static void v1_FSUB_R(HANDLER_ARGS); + static void v1_FSUB_M(HANDLER_ARGS); + static void v1_FSCAL_R(HANDLER_ARGS); + static void v1_FMUL_R(HANDLER_ARGS); + static void v1_FDIV_M(HANDLER_ARGS); + static void v1_FSQRT_R(HANDLER_ARGS); + static void v1_CBRANCH(HANDLER_ARGS); + static void v1_CFROUND(HANDLER_ARGS); + static void v1_ISTORE(HANDLER_ARGS); + static void v1_NOP(HANDLER_ARGS); + }; +} diff --git a/src/crypto/randomx/jit_compiler_rv64_static.S b/src/crypto/randomx/jit_compiler_rv64_static.S new file mode 100644 index 000000000..c4f341adb --- /dev/null +++ b/src/crypto/randomx/jit_compiler_rv64_static.S @@ -0,0 +1,1236 @@ +/* +Copyright (c) 2023 tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#define DECL(x) x + +.text +.option rvc + +#include "configuration.h" + +.global DECL(randomx_riscv64_literals) +.global DECL(randomx_riscv64_literals_end) +.global DECL(randomx_riscv64_data_init) +.global DECL(randomx_riscv64_fix_data_call) +.global DECL(randomx_riscv64_prologue) +.global DECL(randomx_riscv64_loop_begin) +.global DECL(randomx_riscv64_data_read) +.global DECL(randomx_riscv64_data_read_light) +.global DECL(randomx_riscv64_fix_loop_call) +.global DECL(randomx_riscv64_spad_store) +.global DECL(randomx_riscv64_spad_store_hardaes) +.global DECL(randomx_riscv64_spad_store_softaes) +.global DECL(randomx_riscv64_loop_end) +.global DECL(randomx_riscv64_fix_continue_loop) +.global DECL(randomx_riscv64_epilogue) +.global DECL(randomx_riscv64_softaes) +.global DECL(randomx_riscv64_program_end) +.global DECL(randomx_riscv64_ssh_init) +.global DECL(randomx_riscv64_ssh_load) +.global DECL(randomx_riscv64_ssh_prefetch) +.global DECL(randomx_riscv64_ssh_end) + +/* The literal pool can fit at most 494 IMUL_RCP 
literals */ +#if RANDOMX_PROGRAM_SIZE > 494 + #error RANDOMX_PROGRAM_SIZE larger than 494 is not supported. +#endif + +#define RANDOMX_ARGON_MEMORY 262144 +#define RANDOMX_CACHE_MASK (RANDOMX_ARGON_MEMORY*16-1) + +/* shared literal pool: 4 KB */ + /* space for 256 IMUL_RCP literals -2048 */ + /* filled by JIT compiler */ +DECL(randomx_riscv64_literals): +literal_pool: + /* SuperscalarHash constants +0 */ + .dword 6364136223846793005 + .dword 9298411001130361340 + .dword 12065312585734608966 + .dword 9306329213124626780 + .dword 5281919268842080866 + .dword 10536153434571861004 + .dword 3398623926847679864 + .dword 9549104520008361294 + /* CFROUND lookup table +64 */ + .word 0x00000000 /* RTN */ + .word 0x00000002 /* RDN */ + .word 0x00000003 /* RUP */ + .word 0x00000001 /* RTZ */ + /* mask literals +80,+84,+88,+92,+96,+104 */ + .word (/*RANDOMX_SCRATCHPAD_L1*/0) /* filled by JIT compiler */ + .word (/*RANDOMX_SCRATCHPAD_L2*/0) /* filled by JIT compiler */ + .word (/*RANDOMX_SCRATCHPAD_L3*/0) /* filled by JIT compiler */ + .word (/*RANDOMX_DATASET_BASE_SIZE*/0) /* filled by JIT compiler */ + .dword 0x80f0000000000000 + .dword 0x00ffffffffffffff +DECL(randomx_riscv64_literals_end): + /* E reg. 
set masks, +112,+120 */ + .dword 0 /* filled by JIT compiler */ + .dword 0 /* filled by JIT compiler */ + /* soft AES table addresses, +128,+136 */ + .dword 0 /* filled by JIT compiler */ + .dword 0 /* filled by JIT compiler */ + /* space for 238 IMUL_RCP literals, +144 */ + .fill 238,8,0 /* filled by JIT compiler */ + +/* ================================= */ +/* Dataset init function entry point */ +/* ================================= */ + +/* Register allocation: + ---------------------- + x0 -> zero + x1 -> temp/return address + x2 -> stack pointer (sp) + x3 -> literal pool pointer + x5 -> dataset pointer + x6 -> cache pointer + x7 -> temp/itemNumber + x8-x15 -> SuperscalarHash registers + x16 -> itemNumber + x17 -> endItem + x28-x31 -> temp + + Stack layout: + ------------------------ + sp+ + 0 -> return address + 8 -> saved x3 + 16 -> saved x8-x9 + 32 -> caller stack +*/ +DECL(randomx_riscv64_data_init): + addi sp, sp, -32 + /* dataset ptr */ + mv x5, x11 + /* cache->memory */ + ld x6, 0(x10) + /* callee saved registers */ + sd x1, 0(sp) + sd x3, 8(sp) + /* literal pool */ + lla x3, literal_pool + sd x8, 16(sp) + sd x9, 24(sp) + /* startItem */ + mv x16, x12 + /* endItem */ + mv x17, x13 +init_item: + mv x7, x16 +DECL(randomx_riscv64_fix_data_call): + jal superscalar_hash /* JIT compiler will adjust the offset */ + sd x8, 0(x5) + sd x9, 8(x5) + sd x10, 16(x5) + sd x11, 24(x5) + sd x12, 32(x5) + sd x13, 40(x5) + sd x14, 48(x5) + sd x15, 56(x5) + addi x5, x5, 64 + addi x16, x16, 1 + bltu x16, x17, init_item + ld x1, 0(sp) + ld x3, 8(sp) + ld x8, 16(sp) + ld x9, 24(sp) + addi sp, sp, 32 + ret + +/* ====================================== */ +/* Program execution function entry point */ +/* ====================================== */ + +/* Register allocation: + ---------------------- + x0 -> zero + x1 -> temp/scratchpad L3 mask + x2 -> stack pointer (sp) + x3 -> literal pool pointer + x5 -> scratchpad pointer + x6 -> dataset/cache pointer + x7 -> temp/next dataset 
access + x8 -> temp + x9 -> temp + x10 -> scratchpad L1 mask (0x0000000000003ff8) + x11 -> scratchpad L2 mask (0x000000000003fff8) + x12 -> FSCAL_R mask (0x80f0000000000000) + x13 -> E reg. clear mask (0x00ffffffffffffff) + x14 -> E reg. set mask (0x3*00000000******) + x15 -> E reg. set mask (0x3*00000000******) + x16-x23 -> VM registers "r0"-"r7" + x24 -> iteration counter "ic" + x25 -> VM registers "mx", "ma" + x26 -> spAddr0 + x27 -> spAddr1 + x28-x31 -> temp/literals for IMUL_RCP (4x) + + (Note: We avoid using x4 because it breaks debugging with gdb.) + + f0-f7 -> VM registers "f0"-"f3" + f8-f15 -> VM registers "e0"-"e3" + f16-f23 -> VM registers "a0"-"a3" + f24-f25 -> temp + f26-f31 -> literals for IMUL_RCP (6x) + + Stack layout: + ------------------------ + sp+ + 0 -> return address + 8 -> register file ptr + 16 -> saved x3-x4 + 32 -> saved x8-x9 + 48 -> saved x18-x27 + 128 -> saved f8-f9 + 144 -> saved f18-f27 + 224 -> caller stack +*/ + +DECL(randomx_riscv64_prologue): + addi sp, sp, -224 + /* scratchpad pointer */ + mv x5, x12 + /* register file pointer */ + sd x10, 8(sp) + /* callee saved registers */ + sd x3, 16(sp) + sd x8, 32(sp) + sd x9, 40(sp) + sd x18, 48(sp) + sd x19, 56(sp) + sd x20, 64(sp) + sd x21, 72(sp) + sd x22, 80(sp) + sd x23, 88(sp) + sd x24, 96(sp) + sd x25, 104(sp) + sd x26, 112(sp) + sd x27, 120(sp) + fsd f8, 128(sp) + fsd f9, 136(sp) + fsd f18, 144(sp) + fsd f19, 152(sp) + fsd f20, 160(sp) + fsd f21, 168(sp) + fsd f22, 176(sp) + fsd f23, 184(sp) + fsd f24, 192(sp) + fsd f25, 200(sp) + fsd f26, 208(sp) + fsd f27, 216(sp) + /* iteration counter */ + mv x24, x13 + /* return address */ + sd x1, 0(sp) + /* literal pool */ + lla x3, literal_pool + /* load (ma, mx) */ + ld x25, 0(x11) + /* dataset ptr */ + ld x6, 8(x11) + /* load dataset mask */ + lwu x1, 92(x3) + /* zero registers r0-r3, load a0-a1 */ + li x16, 0 + fld f16, 192(x10) + li x17, 0 + fld f17, 200(x10) + srli x7, x25, 32 /* x7 = ma */ + li x18, 0 + fld f18, 208(x10) + mv x27, x7 
/* x27 = ma */ + li x19, 0 + fld f19, 216(x10) + /* set dataset read address */ + and x7, x7, x1 + add x7, x7, x6 + /* zero registers r4-r7, load a2-a3 */ + li x20, 0 + fld f20, 224(x10) + li x21, 0 + fld f21, 232(x10) + li x22, 0 + fld f22, 240(x10) + li x23, 0 + fld f23, 248(x10) + /* load L3 mask */ + lwu x1, 88(x3) + /* load scratchpad masks */ + lwu x10, 80(x3) + lwu x11, 84(x3) + /* set spAddr0, spAddr1 */ + and x26, x25, x1 + and x27, x27, x1 + add x26, x26, x5 + add x27, x27, x5 + /* align L3 mask */ + addi x1, x1, 56 + /* FSCAL, E reg. masks */ + ld x12, 96(x3) + ld x13, 104(x3) + ld x14, 112(x3) + ld x15, 120(x3) + /* IMUL_RCP literals */ + fld f26, 176(x3) + fld f27, 184(x3) + fld f28, 192(x3) + fld f29, 200(x3) + fld f30, 208(x3) + fld f31, 216(x3) + +.balign 4 +DECL(randomx_riscv64_loop_begin): +loop_begin: + /* mix integer registers */ + ld x8, 0(x26) + ld x9, 8(x26) + ld x30, 16(x26) + ld x31, 24(x26) + xor x16, x16, x8 + ld x8, 32(x26) + xor x17, x17, x9 + ld x9, 40(x26) + xor x18, x18, x30 + ld x30, 48(x26) + xor x19, x19, x31 + ld x31, 56(x26) + xor x20, x20, x8 + lw x8, 0(x27) + xor x21, x21, x9 + lw x9, 4(x27) + xor x22, x22, x30 + lw x30, 8(x27) + xor x23, x23, x31 + lw x31, 12(x27) + /* load F registers */ + fcvt.d.w f0, x8 + lw x8, 16(x27) + fcvt.d.w f1, x9 + lw x9, 20(x27) + fcvt.d.w f2, x30 + lw x30, 24(x27) + fcvt.d.w f3, x31 + lw x31, 28(x27) + fcvt.d.w f4, x8 + lw x8, 32(x27) + fcvt.d.w f5, x9 + lw x9, 36(x27) + fcvt.d.w f6, x30 + lw x30, 40(x27) + fcvt.d.w f7, x31 + lw x31, 44(x27) + /* load E registers */ + fcvt.d.w f8, x8 + lw x8, 48(x27) + fcvt.d.w f9, x9 + lw x9, 52(x27) + fcvt.d.w f10, x30 + lw x30, 56(x27) + fcvt.d.w f11, x31 + lw x31, 60(x27) + fcvt.d.w f12, x8 + fmv.x.d x8, f8 + fcvt.d.w f13, x9 + fmv.x.d x9, f9 + fcvt.d.w f14, x30 + fmv.x.d x30, f10 + fcvt.d.w f15, x31 + fmv.x.d x31, f11 + and x8, x8, x13 + and x9, x9, x13 + or x8, x8, x14 + or x9, x9, x15 + and x30, x30, x13 + and x31, x31, x13 + or x30, x30, x14 + or x31, 
x31, x15 + fmv.d.x f8, x8 + fmv.d.x f9, x9 + fmv.d.x f10, x30 + fmv.d.x f11, x31 + fmv.x.d x8, f12 + fmv.x.d x9, f13 + fmv.x.d x30, f14 + fmv.x.d x31, f15 + and x8, x8, x13 + and x9, x9, x13 + or x8, x8, x14 + or x9, x9, x15 + fmv.d.x f12, x8 + fmv.d.x f13, x9 + and x30, x30, x13 + and x31, x31, x13 + or x30, x30, x14 + or x31, x31, x15 + fmv.d.x f14, x30 + fmv.d.x f15, x31 + /* reload clobbered IMUL_RCP regs */ + ld x28, 144(x3) + ld x29, 152(x3) + ld x30, 160(x3) + ld x31, 168(x3) + +DECL(randomx_riscv64_data_read): + xor x8, x20, x22 /* JIT compiler will adjust the registers */ + /* load dataset mask */ + lwu x1, 92(x3) + /* zero-extend x8 */ +#ifdef __riscv_zba + zext.w x8, x8 +#else + slli x8, x8, 32 + srli x8, x8, 32 +#endif + /* update "mx" */ + xor x25, x25, x8 + /* read dataset and update registers */ + ld x8, 0(x7) + ld x9, 8(x7) + ld x30, 16(x7) + ld x31, 24(x7) + xor x16, x16, x8 + ld x8, 32(x7) + xor x17, x17, x9 + ld x9, 40(x7) + xor x18, x18, x30 + ld x30, 48(x7) + xor x19, x19, x31 + ld x31, 56(x7) + xor x20, x20, x8 + /* calculate the next dataset address */ + and x7, x25, x1 + xor x21, x21, x9 + add x7, x7, x6 + xor x22, x22, x30 + /* prefetch - doesn't seem to have any effect */ + /* ld x0, 0(x7) */ + xor x23, x23, x31 + /* swap mx <-> ma */ +#ifdef __riscv_zbb + rori x25, x25, 32 +#else + srli x9, x25, 32 + slli x25, x25, 32 + or x25, x25, x9 +#endif + +DECL(randomx_riscv64_data_read_light): + xor x8, x20, x22 /* JIT compiler will adjust the registers */ + /* load dataset offset */ + lui x9, 0x02000 /* JIT compiler will adjust the immediate */ + addi x9, x9, -64 + /* load dataset mask */ + lwu x1, 92(x3) + /* swap mx <-> ma */ +#ifdef __riscv_zbb + rori x25, x25, 32 +#else + srli x31, x25, 32 + slli x25, x25, 32 + or x25, x25, x31 +#endif + slli x8, x8, 32 + /* update "mx" */ + xor x25, x25, x8 + /* the next dataset item */ + and x7, x25, x1 + srli x7, x7, 6 + add x7, x7, x9 +DECL(randomx_riscv64_fix_loop_call): + jal superscalar_hash /* JIT 
compiler will adjust the offset */ + xor x16, x16, x8 + xor x17, x17, x9 + xor x18, x18, x10 + xor x19, x19, x11 + xor x20, x20, x12 + xor x21, x21, x13 + xor x22, x22, x14 + xor x23, x23, x15 + /* restore clobbered registers */ + lwu x10, 80(x3) + lwu x11, 84(x3) + ld x12, 96(x3) + ld x13, 104(x3) + ld x14, 112(x3) + ld x15, 120(x3) + +DECL(randomx_riscv64_spad_store): + /* store integer registers */ + sd x16, 0(x27) + sd x17, 8(x27) + sd x18, 16(x27) + sd x19, 24(x27) + sd x20, 32(x27) + sd x21, 40(x27) + sd x22, 48(x27) + sd x23, 56(x27) + /* XOR and store f0,e0 */ + fmv.x.d x8, f0 + fmv.x.d x9, f8 + fmv.x.d x30, f1 + fmv.x.d x31, f9 + xor x8, x8, x9 + xor x30, x30, x31 + sd x8, 0(x26) + fmv.d.x f0, x8 + sd x30, 8(x26) + fmv.d.x f1, x30 + /* XOR and store f1,e1 */ + fmv.x.d x8, f2 + fmv.x.d x9, f10 + fmv.x.d x30, f3 + fmv.x.d x31, f11 + xor x8, x8, x9 + xor x30, x30, x31 + sd x8, 16(x26) + fmv.d.x f2, x8 + sd x30, 24(x26) + fmv.d.x f3, x30 + /* XOR and store f2,e2 */ + fmv.x.d x8, f4 + fmv.x.d x9, f12 + fmv.x.d x30, f5 + fmv.x.d x31, f13 + xor x8, x8, x9 + xor x30, x30, x31 + sd x8, 32(x26) + fmv.d.x f4, x8 + sd x30, 40(x26) + fmv.d.x f5, x30 + /* XOR and store f3,e3 */ + fmv.x.d x8, f6 + fmv.x.d x9, f14 + fmv.x.d x30, f7 + fmv.x.d x31, f15 + xor x8, x8, x9 + xor x30, x30, x31 + sd x8, 48(x26) + fmv.d.x f6, x8 + sd x30, 56(x26) + fmv.d.x f7, x30 + +DECL(randomx_riscv64_spad_store_hardaes): + nop /* not implemented */ + +DECL(randomx_riscv64_spad_store_softaes): + /* store integer registers */ + sd x16, 0(x27) + sd x17, 8(x27) + sd x18, 16(x27) + sd x19, 24(x27) + sd x20, 32(x27) + sd x21, 40(x27) + sd x22, 48(x27) + sd x23, 56(x27) + /* process f0 with 4 AES rounds */ + fmv.x.d x8, f8 + fmv.x.d x10, f9 + fmv.x.d x30, f0 + fmv.x.d x31, f1 + jal softaes_enc + fmv.x.d x8, f10 + fmv.x.d x10, f11 + jal softaes_enc + fmv.x.d x8, f12 + fmv.x.d x10, f13 + jal softaes_enc + fmv.x.d x8, f14 + fmv.x.d x10, f15 + jal softaes_enc + sd x30, 0(x26) + fmv.d.x f0, x30 + sd x31, 
8(x26) + fmv.d.x f1, x31 + /* process f1 with 4 AES rounds */ + fmv.x.d x8, f8 + fmv.x.d x10, f9 + fmv.x.d x30, f2 + fmv.x.d x31, f3 + jal softaes_dec + fmv.x.d x8, f10 + fmv.x.d x10, f11 + jal softaes_dec + fmv.x.d x8, f12 + fmv.x.d x10, f13 + jal softaes_dec + fmv.x.d x8, f14 + fmv.x.d x10, f15 + jal softaes_dec + sd x30, 16(x26) + fmv.d.x f2, x30 + sd x31, 24(x26) + fmv.d.x f3, x31 + /* process f2 with 4 AES rounds */ + fmv.x.d x8, f8 + fmv.x.d x10, f9 + fmv.x.d x30, f4 + fmv.x.d x31, f5 + jal softaes_enc + fmv.x.d x8, f10 + fmv.x.d x10, f11 + jal softaes_enc + fmv.x.d x8, f12 + fmv.x.d x10, f13 + jal softaes_enc + fmv.x.d x8, f14 + fmv.x.d x10, f15 + jal softaes_enc + sd x30, 32(x26) + fmv.d.x f4, x30 + sd x31, 40(x26) + fmv.d.x f5, x31 + /* process f3 with 4 AES rounds */ + fmv.x.d x8, f8 + fmv.x.d x10, f9 + fmv.x.d x30, f6 + fmv.x.d x31, f7 + jal softaes_dec + fmv.x.d x8, f10 + fmv.x.d x10, f11 + jal softaes_dec + fmv.x.d x8, f12 + fmv.x.d x10, f13 + jal softaes_dec + fmv.x.d x8, f14 + fmv.x.d x10, f15 + jal softaes_dec + sd x30, 48(x26) + fmv.d.x f6, x30 + sd x31, 56(x26) + fmv.d.x f7, x31 + /* restore clobbered registers */ + lwu x10, 80(x3) + lwu x11, 84(x3) + ld x12, 96(x3) + ld x13, 104(x3) + ld x14, 112(x3) + ld x15, 120(x3) + +DECL(randomx_riscv64_loop_end): + xor x26, x16, x18 /* JIT compiler will adjust the registers */ + /* load L3 mask */ + lwu x1, 88(x3) + addi x24, x24, -1 + srli x27, x26, 32 + /* set spAddr0, spAddr1 */ + and x26, x26, x1 + and x27, x27, x1 + add x26, x26, x5 + add x27, x27, x5 + /* align L3 mask */ + addi x1, x1, 56 + /* conditional branch doesn't have sufficient range */ + j condition_check +DECL(randomx_riscv64_fix_continue_loop): +continue_loop: + .word 0 /* JIT compiler will write a jump to loop_begin */ +condition_check: + bnez x24, continue_loop + +DECL(randomx_riscv64_epilogue): + /* restore callee saved registers */ + ld x10, 8(sp) + ld x1, 0(sp) + ld x3, 16(sp) + ld x8, 32(sp) + ld x9, 40(sp) + ld x24, 96(sp) + ld x25, 
104(sp) + ld x26, 112(sp) + ld x27, 120(sp) + fld f18, 144(sp) + fld f19, 152(sp) + fld f20, 160(sp) + fld f21, 168(sp) + fld f22, 176(sp) + fld f23, 184(sp) + fld f24, 192(sp) + fld f25, 200(sp) + fld f26, 208(sp) + fld f27, 216(sp) + /* save VM registers */ + sd x16, 0(x10) + sd x17, 8(x10) + sd x18, 16(x10) + sd x19, 24(x10) + sd x20, 32(x10) + sd x21, 40(x10) + sd x22, 48(x10) + sd x23, 56(x10) + fsd f0, 64(x10) + fsd f1, 72(x10) + fsd f2, 80(x10) + fsd f3, 88(x10) + fsd f4, 96(x10) + fsd f5, 104(x10) + fsd f6, 112(x10) + fsd f7, 120(x10) + fsd f8, 128(x10) + fsd f9, 136(x10) + fsd f10, 144(x10) + fsd f11, 152(x10) + fsd f12, 160(x10) + fsd f13, 168(x10) + fsd f14, 176(x10) + fsd f15, 184(x10) + /* restore callee saved registers */ + ld x18, 48(sp) + ld x19, 56(sp) + ld x20, 64(sp) + ld x21, 72(sp) + ld x22, 80(sp) + ld x23, 88(sp) + fld f8, 128(sp) + fld f9, 136(sp) + /* restore stack pointer */ + addi sp, sp, 224 + /* return */ + ret + +/* + Soft AES subroutines + in: + x3 = literal pool + x8, x10 = round key + x30, x31 = plaintext + out: + x30, x31 = ciphertext + clobbers: + x8-x11 (limbs) + x12-x13 (LUTs) + x14-x15 (temp) +*/ +DECL(randomx_riscv64_softaes): +softaes_enc: + /* enc. 
lookup table */ + ld x13, 128(x3) + + /* load the round key into x8, x9, x10, x11 */ + srli x9, x8, 32 + srli x11, x10, 32 +#ifdef __riscv_zba + zext.w x8, x8 + zext.w x10, x10 +#else + slli x8, x8, 32 + slli x10, x10, 32 + srli x8, x8, 32 + srli x10, x10, 32 +#endif + + /* byte 0 */ + andi x14, x30, 255 + srli x30, x30, 8 + addi x12, x13, -2048 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, -2048(x14) + + /* byte 1 */ + andi x15, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x8, x8, x14 + + /* byte 2 */ + andi x14, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x11, x11, x15 + + /* byte 3 */ + andi x15, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x10, x10, x14 + + /* byte 4 */ + andi x14, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x14, x14, x12 +#else + slli x14, x14, 2 + add x14, x14, x12 +#endif + lwu x14, 0(x14) + xor x9, x9, x15 + + /* byte 5 */ + andi x15, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x9, x9, x14 + + /* byte 6 */ + andi x14, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x8, x8, x15 + + /* byte 7 */ + andi x15, x30, 255 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x11, x11, x14 + + /* byte 8 */ + andi x14, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x12 +#else + slli x14, x14, 2 + add x14, x14, x12 +#endif + lwu x14, 0(x14) + xor x10, x10, x15 + + /* byte 
9 */ + andi x15, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x10, x10, x14 + + /* byte 10 */ + andi x14, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x9, x9, x15 + + /* byte 11 */ + andi x15, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x8, x8, x14 + + /* byte 12 */ + andi x14, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x12 +#else + slli x14, x14, 2 + add x14, x14, x12 +#endif + lwu x14, 0(x14) + xor x11, x11, x15 + + /* byte 13 */ + andi x15, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x11, x11, x14 + + /* byte 14 */ + andi x14, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x10, x10, x15 + + /* byte 15 */ + andi x15, x31, 255 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x9, x9, x14 + + slli x11, x11, 32 + slli x9, x9, 32 + or x30, x8, x9 + or x31, x10, x11 + xor x30, x30, x15 + + ret + +softaes_dec: + /* dec. 
lookup table */ + ld x13, 136(x3) + + /* load the round key into x8, x9, x10, x11 */ + srli x9, x8, 32 + srli x11, x10, 32 +#ifdef __riscv_zba + zext.w x8, x8 + zext.w x10, x10 +#else + slli x8, x8, 32 + slli x10, x10, 32 + srli x8, x8, 32 + srli x10, x10, 32 +#endif + + /* byte 0 */ + andi x14, x30, 255 + srli x30, x30, 8 + addi x12, x13, -2048 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, -2048(x14) + + /* byte 1 */ + andi x15, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x8, x8, x14 + + /* byte 2 */ + andi x14, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x9, x9, x15 + + /* byte 3 */ + andi x15, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x10, x10, x14 + + /* byte 4 */ + andi x14, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x14, x14, x12 +#else + slli x14, x14, 2 + add x14, x14, x12 +#endif + lwu x14, 0(x14) + xor x11, x11, x15 + + /* byte 5 */ + andi x15, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x9, x9, x14 + + /* byte 6 */ + andi x14, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x10, x10, x15 + + /* byte 7 */ + andi x15, x30, 255 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x11, x11, x14 + + /* byte 8 */ + andi x14, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x12 +#else + slli x14, x14, 2 + add x14, x14, x12 +#endif + lwu x14, 0(x14) + xor x8, x8, x15 + + /* byte 
9 */ + andi x15, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x10, x10, x14 + + /* byte 10 */ + andi x14, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x11, x11, x15 + + /* byte 11 */ + andi x15, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x8, x8, x14 + + /* byte 12 */ + andi x14, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x12 +#else + slli x14, x14, 2 + add x14, x14, x12 +#endif + lwu x14, 0(x14) + xor x9, x9, x15 + + /* byte 13 */ + andi x15, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x11, x11, x14 + + /* byte 14 */ + andi x14, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x8, x8, x15 + + /* byte 15 */ + andi x15, x31, 255 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x9, x9, x14 + + slli x11, x11, 32 + slli x9, x9, 32 + or x30, x8, x9 + or x31, x10, x11 + xor x31, x31, x15 + + ret + +DECL(randomx_riscv64_program_end): + nop + + +/* literal pool for SuperscalarHash */ + /* space for remaining IMUL_RCP literals */ +ssh_literal_pool: + /* space for 256 IMUL_RCP literals */ + .fill 256,8,0 + +/* + SuperscalarHash subroutine + in: + x3 = literal pool + x6 = cache + x7 = itemNumber + out: + x8-x15 = 64-byte hash + clobbers: + x7, x28-x31 +*/ +DECL(randomx_riscv64_ssh_init): +superscalar_hash: + ld x30, 0(x3) /* superscalarMul0 */ + addi x8, x7, 1 + ld x9, 8(x3) + li x31, RANDOMX_CACHE_MASK + ld x10, 16(x3) + ld x11, 24(x3) + mul x8, x8, x30 + ld 
x12, 32(x3) + ld x13, 40(x3) + lla x30, ssh_literal_pool + ld x14, 48(x3) + and x7, x7, x31 + ld x15, 56(x3) + slli x7, x7, 6 + xor x9, x9, x8 + add x7, x7, x6 + xor x10, x10, x8 + /* load the first IMUL_RCP literal */ + ld x31, 2040(x30) + xor x11, x11, x8 + xor x12, x12, x8 + xor x13, x13, x8 + xor x14, x14, x8 + xor x15, x15, x8 + +DECL(randomx_riscv64_ssh_load): + ld x28, 0(x7) + ld x29, 8(x7) + xor x8, x8, x28 + ld x28, 16(x7) + xor x9, x9, x29 + ld x29, 24(x7) + xor x10, x10, x28 + ld x28, 32(x7) + xor x11, x11, x29 + ld x29, 40(x7) + xor x12, x12, x28 + ld x28, 48(x7) + xor x13, x13, x29 + ld x29, 56(x7) + xor x14, x14, x28 + li x7, RANDOMX_CACHE_MASK + xor x15, x15, x29 + +DECL(randomx_riscv64_ssh_prefetch): + and x7, x8, x7 /* JIT compiler will adjust the register */ + slli x7, x7, 6 + add x7, x7, x6 + /* prefetch - doesn't seem to have any effect */ + /* ld x0, 0(x7) */ + +DECL(randomx_riscv64_ssh_end): + nop diff --git a/src/crypto/randomx/jit_compiler_rv64_static.hpp b/src/crypto/randomx/jit_compiler_rv64_static.hpp new file mode 100644 index 000000000..656623c74 --- /dev/null +++ b/src/crypto/randomx/jit_compiler_rv64_static.hpp @@ -0,0 +1,53 @@ +/* +Copyright (c) 2023 tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +extern "C" { + void randomx_riscv64_literals(); + void randomx_riscv64_literals_end(); + void randomx_riscv64_data_init(); + void randomx_riscv64_fix_data_call(); + void randomx_riscv64_prologue(); + void randomx_riscv64_loop_begin(); + void randomx_riscv64_data_read(); + void randomx_riscv64_data_read_light(); + void randomx_riscv64_fix_loop_call(); + void randomx_riscv64_spad_store(); + void randomx_riscv64_spad_store_hardaes(); + void randomx_riscv64_spad_store_softaes(); + void randomx_riscv64_loop_end(); + void randomx_riscv64_fix_continue_loop(); + void randomx_riscv64_epilogue(); + void randomx_riscv64_softaes(); + void randomx_riscv64_program_end(); + void randomx_riscv64_ssh_init(); + void randomx_riscv64_ssh_load(); + void randomx_riscv64_ssh_prefetch(); + void randomx_riscv64_ssh_end(); +} diff --git a/src/crypto/randomx/randomx.cpp b/src/crypto/randomx/randomx.cpp index 1126c7a2e..1609a4af3 100644 --- a/src/crypto/randomx/randomx.cpp +++ b/src/crypto/randomx/randomx.cpp @@ -39,6 +39,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "crypto/randomx/jit_compiler_x86_static.hpp" #elif (XMRIG_ARM == 8) #include "crypto/randomx/jit_compiler_a64_static.hpp" +#elif defined(__riscv) && defined(__riscv_xlen) && (__riscv_xlen == 64) +#include "crypto/randomx/jit_compiler_rv64_static.hpp" #endif #include "backend/cpu/Cpu.h" @@ -190,7 +192,7 @@ RandomX_ConfigurationBase::RandomX_ConfigurationBase() # endif } -#if (XMRIG_ARM == 8) +#if (XMRIG_ARM == 8) || defined(XMRIG_RISCV) static uint32_t Log2(size_t value) { return (value > 1) ? (Log2(value / 2) + 1) : 0; } #endif @@ -274,6 +276,14 @@ typedef void(randomx::JitCompilerX86::* InstructionGeneratorX86_2)(const randomx #define JIT_HANDLE(x, prev) randomx::JitCompilerA64::engine[k] = &randomx::JitCompilerA64::h_##x +#elif defined(XMRIG_RISCV) + + Log2_ScratchpadL1 = Log2(ScratchpadL1_Size); + Log2_ScratchpadL2 = Log2(ScratchpadL2_Size); + Log2_ScratchpadL3 = Log2(ScratchpadL3_Size); + +#define JIT_HANDLE(x, prev) randomx::JitCompilerRV64::engine[k] = &randomx::JitCompilerRV64::v1_##x + #else #define JIT_HANDLE(x, prev) #endif diff --git a/src/crypto/randomx/randomx.h b/src/crypto/randomx/randomx.h index c2d244447..70abff348 100644 --- a/src/crypto/randomx/randomx.h +++ b/src/crypto/randomx/randomx.h @@ -133,7 +133,7 @@ struct RandomX_ConfigurationBase uint32_t ScratchpadL3Mask_Calculated; uint32_t ScratchpadL3Mask64_Calculated; -# if (XMRIG_ARM == 8) +# if (XMRIG_ARM == 8) || defined(XMRIG_RISCV) uint32_t Log2_ScratchpadL1; uint32_t Log2_ScratchpadL2; uint32_t Log2_ScratchpadL3; diff --git a/src/crypto/randomx/tests/riscv64_zba.s b/src/crypto/randomx/tests/riscv64_zba.s new file mode 100644 index 000000000..e1947e7a6 --- /dev/null +++ b/src/crypto/randomx/tests/riscv64_zba.s @@ -0,0 +1,9 @@ +/* RISC-V - test if the Zba extension is present */ + +.text +.global main + +main: + sh1add x6, x6, x7 + li x10, 0 + ret diff --git a/src/crypto/randomx/tests/riscv64_zbb.s b/src/crypto/randomx/tests/riscv64_zbb.s new file mode 100644 index 
000000000..d922043f0 --- /dev/null +++ b/src/crypto/randomx/tests/riscv64_zbb.s @@ -0,0 +1,9 @@ +/* RISC-V - test if the Zbb extension is present */ + +.text +.global main + +main: + ror x6, x6, x7 + li x10, 0 + ret diff --git a/src/crypto/rx/RxVm.cpp b/src/crypto/rx/RxVm.cpp index acaa25e05..6ffe210d4 100644 --- a/src/crypto/rx/RxVm.cpp +++ b/src/crypto/rx/RxVm.cpp @@ -29,9 +29,17 @@ randomx_vm *xmrig::RxVm::create(RxDataset *dataset, uint8_t *scratchpad, bool so { int flags = 0; + // On RISC-V, force software AES path even if CPU reports AES capability. + // The RandomX portable intrinsics will throw at runtime when HAVE_AES is not defined + // for this architecture. Until native AES intrinsics are wired for RISC-V, avoid + // setting HARD_AES to prevent "Platform doesn't support hardware AES" aborts. +# ifndef XMRIG_RISCV if (!softAes) { flags |= RANDOMX_FLAG_HARD_AES; } +# else + (void)softAes; // unused on RISC-V to force soft AES +# endif if (dataset->get()) { flags |= RANDOMX_FLAG_FULL_MEM; diff --git a/src/version.h b/src/version.h index a6773b14d..ce36b0afe 100644 --- a/src/version.h +++ b/src/version.h @@ -75,6 +75,8 @@ #ifdef XMRIG_ARM # define APP_ARCH "ARMv" STR2(XMRIG_ARM) +#elif defined(XMRIG_RISCV) +# define APP_ARCH "RISC-V" #else # if defined(__x86_64__) || defined(__amd64__) || defined(_M_X64) || defined(_M_AMD64) # define APP_ARCH "x86-64"