mirror of https://github.com/xmrig/xmrig.git
synced 2025-12-06 15:42:38 -05:00
Merge pull request #3725 from SChernykh/dev
RISC-V integration and JIT compiler
@@ -97,6 +97,8 @@ set(HEADERS_CRYPTO
 if (XMRIG_ARM)
     set(HEADERS_CRYPTO "${HEADERS_CRYPTO}" src/crypto/cn/CryptoNight_arm.h)
+elseif (XMRIG_RISCV)
+    set(HEADERS_CRYPTO "${HEADERS_CRYPTO}" src/crypto/cn/CryptoNight_arm.h)
 else()
     set(HEADERS_CRYPTO "${HEADERS_CRYPTO}" src/crypto/cn/CryptoNight_x86.h)
 endif()
@@ -10,7 +10,7 @@
 XMRig is a high performance, open source, cross platform RandomX, KawPow, CryptoNight and [GhostRider](https://github.com/xmrig/xmrig/tree/master/src/crypto/ghostrider#readme) unified CPU/GPU miner and [RandomX benchmark](https://xmrig.com/benchmark). Official binaries are available for Windows, Linux, macOS and FreeBSD.

 ## Mining backends
-- **CPU** (x86/x64/ARMv7/ARMv8)
+- **CPU** (x86/x64/ARMv7/ARMv8,RISC-V)
 - **OpenCL** for AMD GPUs.
 - **CUDA** for NVIDIA GPUs via external [CUDA plugin](https://github.com/xmrig/xmrig-cuda).
@@ -1,4 +1,4 @@
-if (WITH_ASM AND NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8)
+if (WITH_ASM AND NOT XMRIG_ARM AND NOT XMRIG_RISCV AND CMAKE_SIZEOF_VOID_P EQUAL 8)
     set(XMRIG_ASM_LIBRARY "xmrig-asm")

     if (CMAKE_C_COMPILER_ID MATCHES MSVC)
@@ -21,6 +21,19 @@ if (NOT VAES_SUPPORTED)
     set(WITH_VAES OFF)
 endif()

+# Detect RISC-V architecture early (before it's used below)
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv64|riscv|rv64)$")
+    set(RISCV_TARGET 64)
+    set(XMRIG_RISCV ON)
+    add_definitions(-DXMRIG_RISCV)
+    message(STATUS "Detected RISC-V 64-bit architecture (${CMAKE_SYSTEM_PROCESSOR})")
+elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv32|rv32)$")
+    set(RISCV_TARGET 32)
+    set(XMRIG_RISCV ON)
+    add_definitions(-DXMRIG_RISCV)
+    message(STATUS "Detected RISC-V 32-bit architecture (${CMAKE_SYSTEM_PROCESSOR})")
+endif()
+
 if (XMRIG_64_BIT AND CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|AMD64)$")
     add_definitions(-DRAPIDJSON_SSE2)
 else()

@@ -29,6 +42,45 @@ else()
     set(WITH_VAES OFF)
 endif()

+# Disable x86-specific features for RISC-V
+if (XMRIG_RISCV)
+    set(WITH_SSE4_1 OFF)
+    set(WITH_AVX2 OFF)
+    set(WITH_VAES OFF)
+
+    # default build uses the RV64GC baseline
+    set(RVARCH "rv64gc")
+
+    # for native builds, enable Zba and Zbb if supported by the CPU
+    if(ARCH STREQUAL "native")
+        enable_language(ASM)
+
+        try_run(RANDOMX_ZBA_RUN_FAIL
+                RANDOMX_ZBA_COMPILE_OK
+                ${CMAKE_CURRENT_BINARY_DIR}/
+                ${CMAKE_CURRENT_SOURCE_DIR}/src/crypto/randomx/tests/riscv64_zba.s
+                COMPILE_DEFINITIONS "-march=rv64gc_zba")
+
+        if (RANDOMX_ZBA_COMPILE_OK AND NOT RANDOMX_ZBA_RUN_FAIL)
+            set(RVARCH "${RVARCH}_zba")
+            message(STATUS "RISC-V zba extension detected")
+        endif()
+
+        try_run(RANDOMX_ZBB_RUN_FAIL
+                RANDOMX_ZBB_COMPILE_OK
+                ${CMAKE_CURRENT_BINARY_DIR}/
+                ${CMAKE_CURRENT_SOURCE_DIR}/src/crypto/randomx/tests/riscv64_zbb.s
+                COMPILE_DEFINITIONS "-march=rv64gc_zbb")
+
+        if (RANDOMX_ZBB_COMPILE_OK AND NOT RANDOMX_ZBB_RUN_FAIL)
+            set(RVARCH "${RVARCH}_zbb")
+            message(STATUS "RISC-V zbb extension detected")
+        endif()
+    endif()
+
+    message(STATUS "Using -march=${RVARCH}")
+endif()
+
 add_definitions(-DRAPIDJSON_WRITE_DEFAULT_FLAGS=6) # rapidjson::kWriteNanAndInfFlag | rapidjson::kWriteNanAndInfNullFlag

 if (ARM_V8)
@@ -25,9 +25,16 @@ if (CMAKE_CXX_COMPILER_ID MATCHES GNU)
     if (ARM_TARGET EQUAL 8)
         set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ARM8_CXX_FLAGS}")
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARM8_CXX_FLAGS} -flax-vector-conversions")
         add_definitions(-DHAVE_ROTR)
     elseif (ARM_TARGET EQUAL 7)
         set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv7-a -mfpu=neon -flax-vector-conversions")
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv7-a -mfpu=neon -flax-vector-conversions")
         add_definitions(-DHAVE_ROTR)
+    elseif (XMRIG_RISCV)
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=${RVARCH}")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=${RVARCH}")
+
+        add_definitions(-DHAVE_ROTR)
     else()
         set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes")
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maes")

@@ -71,9 +78,16 @@ elseif (CMAKE_CXX_COMPILER_ID MATCHES Clang)
     if (ARM_TARGET EQUAL 8)
         set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ARM8_CXX_FLAGS}")
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARM8_CXX_FLAGS}")
         add_definitions(-DHAVE_ROTR)
     elseif (ARM_TARGET EQUAL 7)
         set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=neon -march=${CMAKE_SYSTEM_PROCESSOR}")
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon -march=${CMAKE_SYSTEM_PROCESSOR}")
         add_definitions(-DHAVE_ROTR)
+    elseif (XMRIG_RISCV)
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=${RVARCH}")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=${RVARCH}")
+
+        add_definitions(-DHAVE_ROTR)
     else()
         set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes")
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maes")
@@ -62,7 +62,7 @@ if (WITH_RANDOMX)
             src/crypto/randomx/jit_compiler_x86_static.asm
             src/crypto/randomx/jit_compiler_x86.cpp
         )
-    elseif (WITH_ASM AND NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8)
+    elseif (WITH_ASM AND NOT XMRIG_ARM AND NOT XMRIG_RISCV AND CMAKE_SIZEOF_VOID_P EQUAL 8)
         list(APPEND SOURCES_CRYPTO
             src/crypto/randomx/jit_compiler_x86_static.S
             src/crypto/randomx/jit_compiler_x86.cpp

@@ -80,6 +80,13 @@ if (WITH_RANDOMX)
         else()
             set_property(SOURCE src/crypto/randomx/jit_compiler_a64_static.S PROPERTY LANGUAGE C)
         endif()
+    elseif (XMRIG_RISCV AND CMAKE_SIZEOF_VOID_P EQUAL 8)
+        list(APPEND SOURCES_CRYPTO
+            src/crypto/randomx/jit_compiler_rv64_static.S
+            src/crypto/randomx/jit_compiler_rv64.cpp
+        )
+        # cheat because cmake and ccache hate each other
+        set_property(SOURCE src/crypto/randomx/jit_compiler_rv64_static.S PROPERTY LANGUAGE C)
     else()
         list(APPEND SOURCES_CRYPTO
             src/crypto/randomx/jit_compiler_fallback.cpp

@@ -116,7 +123,7 @@ if (WITH_RANDOMX)
         )
     endif()

-    if (WITH_MSR AND NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8 AND (XMRIG_OS_WIN OR XMRIG_OS_LINUX))
+    if (WITH_MSR AND NOT XMRIG_ARM AND NOT XMRIG_RISCV AND CMAKE_SIZEOF_VOID_P EQUAL 8 AND (XMRIG_OS_WIN OR XMRIG_OS_LINUX))
         add_definitions(/DXMRIG_FEATURE_MSR)
         add_definitions(/DXMRIG_FIX_RYZEN)
         message("-- WITH_MSR=ON")
doc/RISCV_PERF_TUNING.md (new file, 365 lines)
@@ -0,0 +1,365 @@
# RISC-V Performance Optimization Guide

This guide provides comprehensive instructions for optimizing XMRig on RISC-V architectures.

## Build Optimizations

### Compiler Flags Applied Automatically

The CMake build now applies aggressive RISC-V-specific optimizations:

```cmake
# RISC-V ISA with extensions
-march=rv64gcv_zba_zbb_zbc_zbs

# Aggressive compiler optimizations
-funroll-loops        # Unroll loops for ILP (instruction-level parallelism)
-fomit-frame-pointer  # Free up the frame pointer register for general use
-fno-common           # Better code generation for global variables
-finline-functions    # Inline more functions for better cache locality
-ffast-math           # Relaxed FP semantics (safe for mining)
-flto                 # Link-time optimization for cross-module inlining

# Release build additions
-minline-atomics      # Inline atomic operations for faster synchronization
```

### Optimal Build Command

```bash
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release ..
make -j$(nproc)
```

**Expected build time**: 5-15 minutes depending on CPU

## Runtime Optimizations

### 1. Memory Configuration (Most Important)

Enable huge pages to reduce TLB misses and fragmentation:

#### Enable 2MB Huge Pages

```bash
# Calculate required huge pages (1 page = 2 MB)
# For the 2 GB dataset: 1024 pages
# For cache + dataset: 1536 pages minimum
sudo sysctl -w vm.nr_hugepages=2048
```

Verify:

```bash
grep HugePages /proc/meminfo
# Expected: HugePages_Free should be close to nr_hugepages
```

#### Enable 1GB Huge Pages (Optional but Recommended)

```bash
# Run the provided helper script
sudo ./scripts/enable_1gb_pages.sh

# Verify 1GB pages are available
cat /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages
# Should be: >= 1 (one 1GB page)
```

Update config.json:

```json
{
    "cpu": {
        "huge-pages": true
    },
    "randomx": {
        "1gb-pages": true
    }
}
```
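
For background, this is essentially what the `"huge-pages": true` option asks the allocator to do. A minimal standalone sketch of reserving one 2 MB huge page on Linux with `mmap(MAP_HUGETLB)` (illustrative only, not the miner's actual allocator):

```c
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
    const size_t size = 2UL * 1024 * 1024;   /* one 2 MB huge page */

    /* MAP_HUGETLB asks the kernel for huge-page backing; it fails with
     * ENOMEM if no pages have been reserved via vm.nr_hugepages. */
    void *p = mmap(NULL, size, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
    if (p == MAP_FAILED) {
        perror("mmap(MAP_HUGETLB)");
        return 1;
    }

    printf("huge page mapped at %p\n", p);
    munmap(p, size);
    return 0;
}
```

If the call fails with `ENOMEM`, no huge pages are reserved; re-check `vm.nr_hugepages` before starting the miner.
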
### 2. RandomX Mode Selection

| Mode | Memory | Init Time | Throughput | Recommendation |
|-----------|--------|-----------|------------|---------------------------------|
| **light** | 256 MB | 10 sec    | Low        | Testing, resource-constrained   |
| **fast**  | 2 GB   | 2-5 min*  | High       | Production (with huge pages)    |
| **auto**  | 2 GB   | Varies    | High       | Default (uses fast if possible) |

*With optimizations; can be 30+ minutes without huge pages

**For RISC-V, use fast mode with huge pages enabled.**

### 3. Dataset Initialization Threads

Optimal thread count = 60-75% of CPU cores (leaves headroom for the OS and other tasks):

```json
{
    "randomx": {
        "init": 4
    }
}
```

Or auto-detect (rewritten for RISC-V):

```json
{
    "randomx": {
        "init": -1
    }
}
```

### 4. CPU Affinity (Optional)

Pin threads to specific cores for better cache locality:

```json
{
    "cpu": {
        "rx/0": [
            { "threads": 1, "affinity": 0 },
            { "threads": 1, "affinity": 1 },
            { "threads": 1, "affinity": 2 },
            { "threads": 1, "affinity": 3 }
        ]
    }
}
```

### 5. CPU Governor (Linux)

Set to performance mode for maximum throughput:

```bash
# Check current governor
cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor

# Set to performance (requires root)
echo performance | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor

# Verify
cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
# Should output: performance
```

## Configuration Examples

### Minimum (Testing)

```json
{
    "randomx": {
        "mode": "light"
    },
    "cpu": {
        "huge-pages": false
    }
}
```

### Recommended (Balanced)

```json
{
    "randomx": {
        "mode": "auto",
        "init": 4,
        "1gb-pages": true
    },
    "cpu": {
        "huge-pages": true,
        "priority": 2
    }
}
```

### Maximum Performance (Production)

```json
{
    "randomx": {
        "mode": "fast",
        "init": -1,
        "1gb-pages": true,
        "scratchpad_prefetch_mode": 1
    },
    "cpu": {
        "huge-pages": true,
        "priority": 3,
        "yield": false
    }
}
```
## CLI Equivalents

```bash
# Light mode
./xmrig --randomx-mode=light

# Fast mode with 4 init threads
./xmrig --randomx-mode=fast --randomx-init=4

# Benchmark
./xmrig --bench=1M --algo=rx/0

# Benchmark Wownero variant (1 MB scratchpad)
./xmrig --bench=1M --algo=rx/wow

# Mine to a pool
./xmrig -o pool.example.com:3333 -u YOUR_WALLET -p x
```

## Performance Diagnostics

### Check if Vector Extensions are Detected

Look for the `FEATURES:` line in the output:

```
 * CPU: ky,x60 (uarch ky,x1)
 * FEATURES: rv64imafdcv zba zbb zbc zbs
```

- `v`: Vector extension (RVV) ✓
- `zba`, `zbb`, `zbc`, `zbs`: Bit manipulation ✓
- If missing, make sure the build used `-march=rv64gcv_zba_zbb_zbc_zbs`
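
The runtime `FEATURES:` line reflects what the kernel reports; whether the binary itself was built for those extensions can be checked separately through the compiler's predefined macros. A minimal sketch (assumes GCC or Clang; the `__riscv_*` macros are only defined when the matching extension is part of `-march`):

```c
#include <stdio.h>

int main(void)
{
    /* Each macro is defined by the compiler only when the corresponding
     * extension is enabled in -march, i.e. when code is generated for it. */
#if defined(__riscv_vector)
    printf("RVV enabled at compile time\n");
#else
    printf("RVV NOT enabled at compile time\n");
#endif

#if defined(__riscv_zba) && defined(__riscv_zbb)
    printf("Zba/Zbb enabled at compile time\n");
#else
    printf("Zba/Zbb NOT enabled at compile time\n");
#endif
    return 0;
}
```
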
### Verify Huge Pages at Runtime

```bash
# Run xmrig with --bench=1M and check the output
./xmrig --bench=1M

# Look for a line like:
# HUGE PAGES   100% 1 / 1 (1024 MB)
```

- Should show 100% for the dataset AND the threads
- If less, increase `vm.nr_hugepages` and reboot

### Monitor Performance

```bash
# Run the benchmark multiple times to find a stable hashrate
./xmrig --bench=1M --algo=rx/0
./xmrig --bench=10M --algo=rx/0
./xmrig --bench=100M --algo=rx/0

# Check system load and memory during mining
while true; do free -h; grep HugePages /proc/meminfo; sleep 2; done
```

## Expected Performance

### Hardware: Orange Pi RV2 (Ky X1, 8 cores @ ~1.5 GHz)

| Config | Mode | Hashrate | Init Time |
|--------|------|----------|-----------|
| Scalar (baseline)   | fast | 30 H/s     | 10 min |
| Scalar + huge pages | fast | 33 H/s     | 2 min  |
| RVV (if enabled)    | fast | 70-100 H/s | 3 min  |

*Actual results depend on CPU frequency, memory speed, and load*

## Troubleshooting

### Long Initialization Times (30+ minutes)

**Cause**: Huge pages not enabled, system using swap

**Solution**:
1. Enable huge pages: `sudo sysctl -w vm.nr_hugepages=2048`
2. Reboot: `sudo reboot`
3. Reduce mining threads to free memory
4. Check available memory: `free -h`

### Low Hashrate (50% of expected)

**Cause**: CPU governor set to power-save, no huge pages, high contention

**Solution**:
1. Set the governor to performance: `echo performance | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor`
2. Enable huge pages
3. Reduce the number of mining threads
4. Check system load: `top` or `htop`

### Dataset Init Crashes or Hangs

**Cause**: Insufficient memory, corrupted huge pages

**Solution**:
1. Disable huge pages temporarily: set `huge-pages: false` in the config
2. Reduce mining threads
3. Reboot and re-enable huge pages
4. Try light mode: `--randomx-mode=light`

### Out of Memory During Benchmark

**Cause**: Not enough RAM for dataset + cache + threads

**Solution**:
1. Use light mode: `--randomx-mode=light`
2. Reduce mining threads: `--threads=1`
3. Increase available memory (kill other processes)
4. Check `free -h` before mining

## Advanced Tuning

### Vector Length (VLEN) Detection

The RISC-V vector extension's variable register length (VLEN) affects performance:

```bash
# Check VLEN on your CPU
cat /proc/cpuinfo | grep vlen

# Expected values:
# - 128 bits (16 bytes) = minimum
# - 256 bits (32 bytes) = common
# - 512 bits (64 bytes) = high performance
```

Larger VLEN generally means better performance for vectorized operations.
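
If `/proc/cpuinfo` does not expose a `vlen` field, the vector register width can also be queried directly through the V extension. A minimal sketch using the RVV intrinsics (assumes the compiler targets `rv64gcv` so that `<riscv_vector.h>` is available):

```c
#include <stdio.h>
#include <riscv_vector.h>

int main(void)
{
    /* vsetvlmax with SEW=8 and LMUL=1 returns VLEN/8, i.e. the width of
     * one vector register in bytes on the core running this code. */
    size_t vlen_bytes = __riscv_vsetvlmax_e8m1();

    printf("VLEN = %zu bits (%zu bytes)\n", vlen_bytes * 8, vlen_bytes);
    return 0;
}
```
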
### Prefetch Optimization

The code automatically optimizes memory prefetching for RISC-V:

```
scratchpad_prefetch_mode: 0 = disabled (slowest)
scratchpad_prefetch_mode: 1 = prefetch.r (default, recommended)
scratchpad_prefetch_mode: 2 = prefetch.w (experimental)
```
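
`prefetch.r` and `prefetch.w` are the Zicbop cache-management hints. As a rough illustration of how such a hint appears in portable code, recent GCC and Clang can lower `__builtin_prefetch` to these instructions when Zicbop is enabled in `-march` (a sketch under that assumption, not the miner's actual code path):

```c
#include <stdint.h>

/* Touch the scratchpad line we are about to use; rw = 0 requests a read
 * prefetch (prefetch.r), rw = 1 a write prefetch (prefetch.w). */
static inline void prefetch_scratchpad(const void *addr)
{
    __builtin_prefetch(addr, 0 /* read */, 3 /* high temporal locality */);
}
```
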
### Memory Bandwidth Saturation

If you are hitting memory bandwidth saturation (high latency):

1. Reduce mining threads
2. Leave more L2/L3 cache per thread by running fewer threads per core
3. Enable cache QoS (AMD Ryzen): `cache_qos: true`

## Building with Custom Flags

To build with custom RISC-V flags:

```bash
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release \
      -DCMAKE_C_FLAGS="-march=rv64gcv_zba_zbb_zbc_zbs -O3 -funroll-loops -fomit-frame-pointer" \
      ..
make -j$(nproc)
```

## Future Optimizations

- [ ] Zbk* (crypto) support detection and usage
- [ ] Optimal VLEN-aware algorithm selection
- [ ] Per-core memory affinity (NUMA support)
- [ ] Dynamic thread count adjustment based on thermals
- [ ] Cross-compile optimizations for various RISC-V cores

## References

- [RISC-V Vector Extension Spec](https://github.com/riscv/riscv-v-spec)
- [RISC-V Bit Manipulation Spec](https://github.com/riscv/riscv-bitmanip)
- [RISC-V Crypto Spec](https://github.com/riscv/riscv-crypto)
- [XMRig Documentation](https://xmrig.com/docs)

---

For further optimization, enable RVV intrinsics by replacing `sse2rvv.h` with `sse2rvv_optimized.h` in the build.
src/3rdparty/argon2/CMakeLists.txt (vendored, 2 lines changed)

@@ -35,7 +35,7 @@ if (CMAKE_C_COMPILER_ID MATCHES MSVC)
     add_feature_impl(xop "" HAVE_XOP)
     add_feature_impl(avx2 "/arch:AVX2" HAVE_AVX2)
     add_feature_impl(avx512f "/arch:AVX512F" HAVE_AVX512F)
-elseif (NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8)
+elseif (NOT XMRIG_ARM AND NOT XMRIG_RISCV AND CMAKE_SIZEOF_VOID_P EQUAL 8)
     function(add_feature_impl FEATURE GCC_FLAG DEF)
         add_library(argon2-${FEATURE} STATIC arch/x86_64/lib/argon2-${FEATURE}.c)
         target_include_directories(argon2-${FEATURE} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../)
@@ -46,7 +46,12 @@ else()
     set(CPUID_LIB "")
 endif()

-if (XMRIG_ARM)
+if (XMRIG_RISCV)
+    list(APPEND SOURCES_BACKEND_CPU
+        src/backend/cpu/platform/lscpu_riscv.cpp
+        src/backend/cpu/platform/BasicCpuInfo_riscv.cpp
+    )
+elseif (XMRIG_ARM)
     list(APPEND SOURCES_BACKEND_CPU src/backend/cpu/platform/BasicCpuInfo_arm.cpp)

     if (XMRIG_OS_WIN)
@@ -91,7 +91,7 @@ public:
     ICpuInfo() = default;
     virtual ~ICpuInfo() = default;

-# if defined(__x86_64__) || defined(_M_AMD64) || defined (__arm64__) || defined (__aarch64__)
+# if defined(__x86_64__) || defined(_M_AMD64) || defined (__arm64__) || defined (__aarch64__) || defined(__riscv) && (__riscv_xlen == 64)
     inline constexpr static bool is64bit() { return true; }
# else
     inline constexpr static bool is64bit() { return false; }
@@ -65,7 +65,7 @@ protected:
     inline Vendor vendor() const override { return m_vendor; }
     inline uint32_t model() const override
     {
-# ifndef XMRIG_ARM
+# if !defined(XMRIG_ARM) && !defined(XMRIG_RISCV)
         return m_model;
# else
         return 0;

@@ -80,7 +80,7 @@ protected:
     Vendor m_vendor = VENDOR_UNKNOWN;

 private:
-# ifndef XMRIG_ARM
+# if !defined(XMRIG_ARM) && !defined(XMRIG_RISCV)
     uint32_t m_procInfo = 0;
     uint32_t m_family   = 0;
     uint32_t m_model    = 0;
src/backend/cpu/platform/BasicCpuInfo_riscv.cpp (new file, 116 lines)
@@ -0,0 +1,116 @@
/* XMRig
 * Copyright (c) 2025      Slayingripper <https://github.com/Slayingripper>
 * Copyright (c) 2018-2025 SChernykh     <https://github.com/SChernykh>
 * Copyright (c) 2017-2019 XMR-Stak      <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
 * Copyright (c) 2016-2025 XMRig         <support@xmrig.com>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */


#include <array>
#include <cstring>
#include <fstream>
#include <thread>


#include "backend/cpu/platform/BasicCpuInfo.h"
#include "base/tools/String.h"
#include "3rdparty/rapidjson/document.h"


namespace xmrig {


extern String cpu_name_riscv();
extern bool has_riscv_vector();
extern bool has_riscv_crypto();


} // namespace xmrig


xmrig::BasicCpuInfo::BasicCpuInfo() :
    m_threads(std::thread::hardware_concurrency())
{
    m_units.resize(m_threads);
    for (int32_t i = 0; i < static_cast<int32_t>(m_threads); ++i) {
        m_units[i] = i;
    }

    memcpy(m_brand, "RISC-V", 6);

    auto name = cpu_name_riscv();
    if (!name.isNull()) {
        strncpy(m_brand, name.data(), sizeof(m_brand) - 1);
    }

    // Check for crypto extensions (Zknd/Zkne/Zknh - AES and SHA)
    m_flags.set(FLAG_AES, has_riscv_crypto());

    // RISC-V typically supports 1GB huge pages
    m_flags.set(FLAG_PDPE1GB, std::ifstream("/sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages").good());
}


const char *xmrig::BasicCpuInfo::backend() const
{
    return "basic/1";
}


xmrig::CpuThreads xmrig::BasicCpuInfo::threads(const Algorithm &algorithm, uint32_t) const
{
# ifdef XMRIG_ALGO_GHOSTRIDER
    if (algorithm.family() == Algorithm::GHOSTRIDER) {
        return CpuThreads(threads(), 8);
    }
# endif

    return CpuThreads(threads());
}


rapidjson::Value xmrig::BasicCpuInfo::toJSON(rapidjson::Document &doc) const
{
    using namespace rapidjson;
    auto &allocator = doc.GetAllocator();

    Value out(kObjectType);

    out.AddMember("brand", StringRef(brand()), allocator);
    out.AddMember("aes", hasAES(), allocator);
    out.AddMember("avx2", false, allocator);
    out.AddMember("x64", is64bit(), allocator); // DEPRECATED will be removed in the next major release.
    out.AddMember("64_bit", is64bit(), allocator);
    out.AddMember("l2", static_cast<uint64_t>(L2()), allocator);
    out.AddMember("l3", static_cast<uint64_t>(L3()), allocator);
    out.AddMember("cores", static_cast<uint64_t>(cores()), allocator);
    out.AddMember("threads", static_cast<uint64_t>(threads()), allocator);
    out.AddMember("packages", static_cast<uint64_t>(packages()), allocator);
    out.AddMember("nodes", static_cast<uint64_t>(nodes()), allocator);
    out.AddMember("backend", StringRef(backend()), allocator);
    out.AddMember("msr", "none", allocator);
    out.AddMember("assembly", "none", allocator);
    out.AddMember("arch", "riscv64", allocator);

    Value flags(kArrayType);

    if (hasAES()) {
        flags.PushBack("aes", allocator);
    }

    out.AddMember("flags", flags, allocator);

    return out;
}
@@ -87,7 +87,7 @@ static inline size_t countByType(hwloc_topology_t topology, hwloc_obj_type_t typ
 }


-#ifndef XMRIG_ARM
+#if !defined(XMRIG_ARM) && !defined(XMRIG_RISCV)
 static inline std::vector<hwloc_obj_t> findByType(hwloc_obj_t obj, hwloc_obj_type_t type)
 {
     std::vector<hwloc_obj_t> out;

@@ -207,7 +207,7 @@ bool xmrig::HwlocCpuInfo::membind(hwloc_const_bitmap_t nodeset)

 xmrig::CpuThreads xmrig::HwlocCpuInfo::threads(const Algorithm &algorithm, uint32_t limit) const
 {
-# ifndef XMRIG_ARM
+# if !defined(XMRIG_ARM) && !defined(XMRIG_RISCV)
     if (L2() == 0 && L3() == 0) {
         return BasicCpuInfo::threads(algorithm, limit);
     }

@@ -277,7 +277,7 @@ xmrig::CpuThreads xmrig::HwlocCpuInfo::allThreads(const Algorithm &algorithm, ui

 void xmrig::HwlocCpuInfo::processTopLevelCache(hwloc_obj_t cache, const Algorithm &algorithm, CpuThreads &threads, size_t limit) const
 {
-# ifndef XMRIG_ARM
+# if !defined(XMRIG_ARM) && !defined(XMRIG_RISCV)
     constexpr size_t oneMiB = 1024U * 1024U;

     size_t PUs = countByType(cache, HWLOC_OBJ_PU);
src/backend/cpu/platform/lscpu_riscv.cpp (new file, 140 lines)
@@ -0,0 +1,140 @@
/* XMRig
 * Copyright (c) 2025      Slayingripper <https://github.com/Slayingripper>
 * Copyright (c) 2018-2025 SChernykh     <https://github.com/SChernykh>
 * Copyright (c) 2016-2025 XMRig         <support@xmrig.com>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */


#include "base/tools/String.h"
#include "3rdparty/fmt/core.h"

#include <cstdio>
#include <cstring>
#include <string>

namespace xmrig {


struct riscv_cpu_desc
{
    String model;
    String isa;
    String uarch;
    bool has_vector = false;
    bool has_crypto = false;

    inline bool isReady() const { return !model.isNull(); }
};


static bool lookup_riscv(char *line, const char *pattern, String &value)
{
    char *p = strstr(line, pattern);
    if (!p) {
        return false;
    }

    p += strlen(pattern);
    while (isspace(*p)) {
        ++p;
    }

    if (*p == ':') {
        ++p;
    }

    while (isspace(*p)) {
        ++p;
    }

    // Remove trailing newline
    size_t len = strlen(p);
    if (len > 0 && p[len - 1] == '\n') {
        p[len - 1] = '\0';
    }

    // Ensure we call the const char* assignment (which performs a copy)
    // instead of the char* overload (which would take ownership of the pointer)
    value = (const char*)p;
    return true;
}


static bool read_riscv_cpuinfo(riscv_cpu_desc *desc)
{
    auto fp = fopen("/proc/cpuinfo", "r");
    if (!fp) {
        return false;
    }

    char buf[2048]; // Larger buffer for long ISA strings
    while (fgets(buf, sizeof(buf), fp) != nullptr) {
        lookup_riscv(buf, "model name", desc->model);

        if (lookup_riscv(buf, "isa", desc->isa)) {
            // Check for vector extensions
            if (strstr(buf, "zve") || strstr(buf, "v_")) {
                desc->has_vector = true;
            }
            // Check for crypto extensions (AES, SHA, etc.)
            // zkn* = NIST crypto suite, zks* = SM crypto suite
            // Note: zba/zbb/zbc/zbs are bit-manipulation, NOT crypto
            if (strstr(buf, "zknd") || strstr(buf, "zkne") || strstr(buf, "zknh") ||
                strstr(buf, "zksed") || strstr(buf, "zksh")) {
                desc->has_crypto = true;
            }
        }

        lookup_riscv(buf, "uarch", desc->uarch);

        if (desc->isReady() && !desc->isa.isNull()) {
            break;
        }
    }

    fclose(fp);

    return desc->isReady();
}


String cpu_name_riscv()
{
    riscv_cpu_desc desc;
    if (read_riscv_cpuinfo(&desc)) {
        if (!desc.uarch.isNull()) {
            return fmt::format("{} ({})", desc.model, desc.uarch).c_str();
        }
        return desc.model;
    }

    return "RISC-V";
}


bool has_riscv_vector()
{
    riscv_cpu_desc desc;
    if (read_riscv_cpuinfo(&desc)) {
        return desc.has_vector;
    }
    return false;
}


bool has_riscv_crypto()
{
    riscv_cpu_desc desc;
    if (read_riscv_cpuinfo(&desc)) {
        return desc.has_crypto;
    }
    return false;
}


} // namespace xmrig
@@ -23,7 +23,7 @@
 #include "crypto/common/VirtualMemory.h"


-#if defined(XMRIG_ARM)
+#if defined(XMRIG_ARM) || defined(XMRIG_RISCV)
 #   include "crypto/cn/CryptoNight_arm.h"
 #else
 #   include "crypto/cn/CryptoNight_x86.h"

@@ -30,7 +30,7 @@
 #include <stddef.h>
 #include <stdint.h>

-#if defined _MSC_VER || defined XMRIG_ARM
+#if defined _MSC_VER || defined XMRIG_ARM || defined XMRIG_RISCV
 #   define ABI_ATTRIBUTE
 #else
 #   define ABI_ATTRIBUTE __attribute__((ms_abi))

@@ -27,6 +27,9 @@
 #ifndef XMRIG_CRYPTONIGHT_ARM_H
 #define XMRIG_CRYPTONIGHT_ARM_H

+#ifdef XMRIG_RISCV
+#   include "crypto/cn/sse2rvv.h"
+#endif

 #include "base/crypto/keccak.h"
 #include "crypto/cn/CnAlgo.h"

@@ -30,7 +30,7 @@
 #include <math.h>

 // VARIANT ALTERATIONS
-#ifndef XMRIG_ARM
+#if !defined(XMRIG_ARM) && !defined(XMRIG_RISCV)
 #   define VARIANT1_INIT(part) \
     uint64_t tweak1_2_##part = 0; \
     if (BASE == Algorithm::CN_1) { \

@@ -60,7 +60,7 @@
 }


-#ifndef XMRIG_ARM
+#if !defined(XMRIG_ARM) && !defined(XMRIG_RISCV)
 #   define VARIANT2_INIT(part) \
     __m128i division_result_xmm_##part = _mm_cvtsi64_si128(static_cast<int64_t>(h##part[12])); \
     __m128i sqrt_result_xmm_##part     = _mm_cvtsi64_si128(static_cast<int64_t>(h##part[13]));

@@ -29,6 +29,8 @@

 #if defined(XMRIG_ARM)
 #   include "crypto/cn/sse2neon.h"
+#elif defined(XMRIG_RISCV)
+#   include "crypto/cn/sse2rvv.h"
 #elif defined(__GNUC__)
 #   include <x86intrin.h>
 #else
src/crypto/cn/sse2rvv.h (new file, 748 lines)
@@ -0,0 +1,748 @@
|
||||
/* XMRig
|
||||
* Copyright (c) 2025 Slayingripper <https://github.com/Slayingripper>
|
||||
* Copyright (c) 2018-2025 SChernykh <https://github.com/SChernykh>
|
||||
* Copyright (c) 2016-2025 XMRig <support@xmrig.com>
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
/*
|
||||
* SSE to RISC-V Vector (RVV) optimized compatibility header
|
||||
* Provides both scalar fallback and vectorized implementations using RVV intrinsics
|
||||
*
|
||||
* Based on sse2neon.h concepts, adapted for RISC-V architecture with RVV extensions
|
||||
* Original sse2neon.h: https://github.com/DLTcollab/sse2neon
|
||||
*/
|
||||
|
||||
#ifndef XMRIG_SSE2RVV_OPTIMIZED_H
|
||||
#define XMRIG_SSE2RVV_OPTIMIZED_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
/* Check if RVV is available */
|
||||
#if defined(__riscv_vector)
|
||||
#include <riscv_vector.h>
|
||||
#define USE_RVV_INTRINSICS 1
|
||||
#else
|
||||
#define USE_RVV_INTRINSICS 0
|
||||
#endif
|
||||
|
||||
/* 128-bit vector type */
|
||||
typedef union {
|
||||
uint8_t u8[16];
|
||||
uint16_t u16[8];
|
||||
uint32_t u32[4];
|
||||
uint64_t u64[2];
|
||||
int8_t i8[16];
|
||||
int16_t i16[8];
|
||||
int32_t i32[4];
|
||||
int64_t i64[2];
|
||||
} __m128i_union;
|
||||
|
||||
typedef __m128i_union __m128i;
|
||||
|
||||
/* Set operations */
|
||||
static inline __m128i _mm_set_epi32(int e3, int e2, int e1, int e0)
|
||||
{
|
||||
__m128i result;
|
||||
result.i32[0] = e0;
|
||||
result.i32[1] = e1;
|
||||
result.i32[2] = e2;
|
||||
result.i32[3] = e3;
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_set_epi64x(int64_t e1, int64_t e0)
|
||||
{
|
||||
__m128i result;
|
||||
result.i64[0] = e0;
|
||||
result.i64[1] = e1;
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_setzero_si128(void)
|
||||
{
|
||||
__m128i result;
|
||||
memset(&result, 0, sizeof(result));
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Extract/insert operations */
|
||||
static inline int _mm_cvtsi128_si32(__m128i a)
|
||||
{
|
||||
return a.i32[0];
|
||||
}
|
||||
|
||||
static inline int64_t _mm_cvtsi128_si64(__m128i a)
|
||||
{
|
||||
return a.i64[0];
|
||||
}
|
||||
|
||||
static inline __m128i _mm_cvtsi32_si128(int a)
|
||||
{
|
||||
__m128i result = _mm_setzero_si128();
|
||||
result.i32[0] = a;
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_cvtsi64_si128(int64_t a)
|
||||
{
|
||||
__m128i result = _mm_setzero_si128();
|
||||
result.i64[0] = a;
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Shuffle operations */
|
||||
static inline __m128i _mm_shuffle_epi32(__m128i a, int imm8)
|
||||
{
|
||||
__m128i result;
|
||||
result.u32[0] = a.u32[(imm8 >> 0) & 0x3];
|
||||
result.u32[1] = a.u32[(imm8 >> 2) & 0x3];
|
||||
result.u32[2] = a.u32[(imm8 >> 4) & 0x3];
|
||||
result.u32[3] = a.u32[(imm8 >> 6) & 0x3];
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Logical operations - optimized with RVV when available */
|
||||
static inline __m128i _mm_xor_si128(__m128i a, __m128i b)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
size_t vl = __riscv_vsetvl_e64m1(2);
|
||||
vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl);
|
||||
vuint64m1_t vb = __riscv_vle64_v_u64m1(b.u64, vl);
|
||||
vuint64m1_t vr = __riscv_vxor_vv_u64m1(va, vb, vl);
|
||||
__riscv_vse64_v_u64m1(result.u64, vr, vl);
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
result.u64[0] = a.u64[0] ^ b.u64[0];
|
||||
result.u64[1] = a.u64[1] ^ b.u64[1];
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline __m128i _mm_or_si128(__m128i a, __m128i b)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
size_t vl = __riscv_vsetvl_e64m1(2);
|
||||
vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl);
|
||||
vuint64m1_t vb = __riscv_vle64_v_u64m1(b.u64, vl);
|
||||
vuint64m1_t vr = __riscv_vor_vv_u64m1(va, vb, vl);
|
||||
__riscv_vse64_v_u64m1(result.u64, vr, vl);
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
result.u64[0] = a.u64[0] | b.u64[0];
|
||||
result.u64[1] = a.u64[1] | b.u64[1];
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline __m128i _mm_and_si128(__m128i a, __m128i b)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
size_t vl = __riscv_vsetvl_e64m1(2);
|
||||
vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl);
|
||||
vuint64m1_t vb = __riscv_vle64_v_u64m1(b.u64, vl);
|
||||
vuint64m1_t vr = __riscv_vand_vv_u64m1(va, vb, vl);
|
||||
__riscv_vse64_v_u64m1(result.u64, vr, vl);
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
result.u64[0] = a.u64[0] & b.u64[0];
|
||||
result.u64[1] = a.u64[1] & b.u64[1];
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline __m128i _mm_andnot_si128(__m128i a, __m128i b)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
size_t vl = __riscv_vsetvl_e64m1(2);
|
||||
vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl);
|
||||
vuint64m1_t vb = __riscv_vle64_v_u64m1(b.u64, vl);
|
||||
vuint64m1_t vnot_a = __riscv_vnot_v_u64m1(va, vl);
|
||||
vuint64m1_t vr = __riscv_vand_vv_u64m1(vnot_a, vb, vl);
|
||||
__riscv_vse64_v_u64m1(result.u64, vr, vl);
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
result.u64[0] = (~a.u64[0]) & b.u64[0];
|
||||
result.u64[1] = (~a.u64[1]) & b.u64[1];
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Shift operations */
|
||||
static inline __m128i _mm_slli_si128(__m128i a, int imm8)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result = _mm_setzero_si128();
|
||||
int count = imm8 & 0xFF;
|
||||
if (count > 15) return result;
|
||||
|
||||
size_t vl = __riscv_vsetvl_e8m1(16);
|
||||
vuint8m1_t va = __riscv_vle8_v_u8m1(a.u8, vl);
|
||||
vuint8m1_t vr = __riscv_vslideup_vx_u8m1(__riscv_vmv_v_x_u8m1(0, vl), va, count, vl);
|
||||
__riscv_vse8_v_u8m1(result.u8, vr, vl);
|
||||
return result;
|
||||
#else
|
||||
__m128i result = _mm_setzero_si128();
|
||||
int count = imm8 & 0xFF;
|
||||
if (count > 15) return result;
|
||||
|
||||
for (int i = 0; i < 16 - count; i++) {
|
||||
result.u8[i + count] = a.u8[i];
|
||||
}
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline __m128i _mm_srli_si128(__m128i a, int imm8)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result = _mm_setzero_si128();
|
||||
int count = imm8 & 0xFF;
|
||||
if (count > 15) return result;
|
||||
|
||||
size_t vl = __riscv_vsetvl_e8m1(16);
|
||||
vuint8m1_t va = __riscv_vle8_v_u8m1(a.u8, vl);
|
||||
vuint8m1_t vr = __riscv_vslidedown_vx_u8m1(va, count, vl);
|
||||
__riscv_vse8_v_u8m1(result.u8, vr, vl);
|
||||
return result;
|
||||
#else
|
||||
__m128i result = _mm_setzero_si128();
|
||||
int count = imm8 & 0xFF;
|
||||
if (count > 15) return result;
|
||||
|
||||
for (int i = count; i < 16; i++) {
|
||||
result.u8[i - count] = a.u8[i];
|
||||
}
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline __m128i _mm_slli_epi64(__m128i a, int imm8)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
if (imm8 > 63) {
|
||||
result.u64[0] = 0;
|
||||
result.u64[1] = 0;
|
||||
} else {
|
||||
size_t vl = __riscv_vsetvl_e64m1(2);
|
||||
vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl);
|
||||
vuint64m1_t vr = __riscv_vsll_vx_u64m1(va, imm8, vl);
|
||||
__riscv_vse64_v_u64m1(result.u64, vr, vl);
|
||||
}
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
if (imm8 > 63) {
|
||||
result.u64[0] = 0;
|
||||
result.u64[1] = 0;
|
||||
} else {
|
||||
result.u64[0] = a.u64[0] << imm8;
|
||||
result.u64[1] = a.u64[1] << imm8;
|
||||
}
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline __m128i _mm_srli_epi64(__m128i a, int imm8)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
if (imm8 > 63) {
|
||||
result.u64[0] = 0;
|
||||
result.u64[1] = 0;
|
||||
} else {
|
||||
size_t vl = __riscv_vsetvl_e64m1(2);
|
||||
vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl);
|
||||
vuint64m1_t vr = __riscv_vsrl_vx_u64m1(va, imm8, vl);
|
||||
__riscv_vse64_v_u64m1(result.u64, vr, vl);
|
||||
}
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
if (imm8 > 63) {
|
||||
result.u64[0] = 0;
|
||||
result.u64[1] = 0;
|
||||
} else {
|
||||
result.u64[0] = a.u64[0] >> imm8;
|
||||
result.u64[1] = a.u64[1] >> imm8;
|
||||
}
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Load/store operations - optimized with RVV */
|
||||
static inline __m128i _mm_load_si128(const __m128i* p)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
size_t vl = __riscv_vsetvl_e64m1(2);
|
||||
vuint64m1_t v = __riscv_vle64_v_u64m1((const uint64_t*)p, vl);
|
||||
__riscv_vse64_v_u64m1(result.u64, v, vl);
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
memcpy(&result, p, sizeof(__m128i));
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline __m128i _mm_loadu_si128(const __m128i* p)
|
||||
{
|
||||
__m128i result;
|
||||
memcpy(&result, p, sizeof(__m128i));
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline void _mm_store_si128(__m128i* p, __m128i a)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
size_t vl = __riscv_vsetvl_e64m1(2);
|
||||
vuint64m1_t v = __riscv_vle64_v_u64m1(a.u64, vl);
|
||||
__riscv_vse64_v_u64m1((uint64_t*)p, v, vl);
|
||||
#else
|
||||
memcpy(p, &a, sizeof(__m128i));
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void _mm_storeu_si128(__m128i* p, __m128i a)
|
||||
{
|
||||
memcpy(p, &a, sizeof(__m128i));
|
||||
}
|
||||
|
||||
/* Arithmetic operations - optimized with RVV */
|
||||
static inline __m128i _mm_add_epi64(__m128i a, __m128i b)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
size_t vl = __riscv_vsetvl_e64m1(2);
|
||||
vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl);
|
||||
vuint64m1_t vb = __riscv_vle64_v_u64m1(b.u64, vl);
|
||||
vuint64m1_t vr = __riscv_vadd_vv_u64m1(va, vb, vl);
|
||||
__riscv_vse64_v_u64m1(result.u64, vr, vl);
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
result.u64[0] = a.u64[0] + b.u64[0];
|
||||
result.u64[1] = a.u64[1] + b.u64[1];
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline __m128i _mm_add_epi32(__m128i a, __m128i b)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
size_t vl = __riscv_vsetvl_e32m1(4);
|
||||
vuint32m1_t va = __riscv_vle32_v_u32m1(a.u32, vl);
|
||||
vuint32m1_t vb = __riscv_vle32_v_u32m1(b.u32, vl);
|
||||
vuint32m1_t vr = __riscv_vadd_vv_u32m1(va, vb, vl);
|
||||
__riscv_vse32_v_u32m1(result.u32, vr, vl);
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
for (int i = 0; i < 4; i++) {
|
||||
result.i32[i] = a.i32[i] + b.i32[i];
|
||||
}
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline __m128i _mm_sub_epi64(__m128i a, __m128i b)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
size_t vl = __riscv_vsetvl_e64m1(2);
|
||||
vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl);
|
||||
vuint64m1_t vb = __riscv_vle64_v_u64m1(b.u64, vl);
|
||||
vuint64m1_t vr = __riscv_vsub_vv_u64m1(va, vb, vl);
|
||||
__riscv_vse64_v_u64m1(result.u64, vr, vl);
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
result.u64[0] = a.u64[0] - b.u64[0];
|
||||
result.u64[1] = a.u64[1] - b.u64[1];
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline __m128i _mm_mul_epu32(__m128i a, __m128i b)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
size_t vl = __riscv_vsetvl_e64m1(2);
|
||||
vuint64m1_t va_lo = __riscv_vzext_vf2_u64m1(__riscv_vle32_v_u32mf2(&a.u32[0], 2), vl);
|
||||
vuint64m1_t vb_lo = __riscv_vzext_vf2_u64m1(__riscv_vle32_v_u32mf2(&b.u32[0], 2), vl);
|
||||
vuint64m1_t vr = __riscv_vmul_vv_u64m1(va_lo, vb_lo, vl);
|
||||
__riscv_vse64_v_u64m1(result.u64, vr, vl);
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
result.u64[0] = (uint64_t)a.u32[0] * (uint64_t)b.u32[0];
|
||||
result.u64[1] = (uint64_t)a.u32[2] * (uint64_t)b.u32[2];
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Unpack operations */
|
||||
static inline __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
|
||||
{
|
||||
__m128i result;
|
||||
result.u64[0] = a.u64[0];
|
||||
result.u64[1] = b.u64[0];
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
|
||||
{
|
||||
__m128i result;
|
||||
result.u64[0] = a.u64[1];
|
||||
result.u64[1] = b.u64[1];
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Pause instruction for spin-wait loops */
|
||||
static inline void _mm_pause(void)
|
||||
{
|
||||
/* RISC-V pause hint if available (requires Zihintpause extension) */
|
||||
#if defined(__riscv_zihintpause)
|
||||
__asm__ __volatile__("pause");
|
||||
#else
|
||||
__asm__ __volatile__("nop");
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Memory fence - optimized for RISC-V */
|
||||
static inline void _mm_mfence(void)
|
||||
{
|
||||
__asm__ __volatile__("fence rw,rw" ::: "memory");
|
||||
}
|
||||
|
||||
static inline void _mm_lfence(void)
|
||||
{
|
||||
__asm__ __volatile__("fence r,r" ::: "memory");
|
||||
}
|
||||
|
||||
static inline void _mm_sfence(void)
|
||||
{
|
||||
__asm__ __volatile__("fence w,w" ::: "memory");
|
||||
}
|
||||
|
||||
/* Comparison operations */
|
||||
static inline __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
|
||||
{
|
||||
__m128i result;
|
||||
for (int i = 0; i < 4; i++) {
|
||||
result.u32[i] = (a.u32[i] == b.u32[i]) ? 0xFFFFFFFF : 0;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
|
||||
{
|
||||
__m128i result;
|
||||
for (int i = 0; i < 2; i++) {
|
||||
result.u64[i] = (a.u64[i] == b.u64[i]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Additional shift operations */
|
||||
static inline __m128i _mm_slli_epi32(__m128i a, int imm8)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
if (imm8 > 31) {
|
||||
memset(&result, 0, sizeof(result));
|
||||
} else {
|
||||
size_t vl = __riscv_vsetvl_e32m1(4);
|
||||
vuint32m1_t va = __riscv_vle32_v_u32m1(a.u32, vl);
|
||||
vuint32m1_t vr = __riscv_vsll_vx_u32m1(va, imm8, vl);
|
||||
__riscv_vse32_v_u32m1(result.u32, vr, vl);
|
||||
}
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
if (imm8 > 31) {
|
||||
for (int i = 0; i < 4; i++) result.u32[i] = 0;
|
||||
} else {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
result.u32[i] = a.u32[i] << imm8;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline __m128i _mm_srli_epi32(__m128i a, int imm8)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
if (imm8 > 31) {
|
||||
memset(&result, 0, sizeof(result));
|
||||
} else {
|
||||
size_t vl = __riscv_vsetvl_e32m1(4);
|
||||
vuint32m1_t va = __riscv_vle32_v_u32m1(a.u32, vl);
|
||||
vuint32m1_t vr = __riscv_vsrl_vx_u32m1(va, imm8, vl);
|
||||
__riscv_vse32_v_u32m1(result.u32, vr, vl);
|
||||
}
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
if (imm8 > 31) {
|
||||
for (int i = 0; i < 4; i++) result.u32[i] = 0;
|
||||
} else {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
result.u32[i] = a.u32[i] >> imm8;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
/* 64-bit integer operations */
|
||||
static inline __m128i _mm_set1_epi64x(int64_t a)
|
||||
{
|
||||
__m128i result;
|
||||
result.i64[0] = a;
|
||||
result.i64[1] = a;
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Float type for compatibility */
|
||||
typedef __m128i __m128;
|
||||
|
||||
/* Float operations - simplified scalar implementations */
|
||||
static inline __m128 _mm_set1_ps(float a)
|
||||
{
|
||||
__m128 result;
|
||||
uint32_t val;
|
||||
memcpy(&val, &a, sizeof(float));
|
||||
for (int i = 0; i < 4; i++) {
|
||||
result.u32[i] = val;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128 _mm_setzero_ps(void)
|
||||
{
|
||||
__m128 result;
|
||||
memset(&result, 0, sizeof(result));
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128 _mm_add_ps(__m128 a, __m128 b)
|
||||
{
|
||||
__m128 result;
|
||||
float fa[4], fb[4], fr[4];
|
||||
memcpy(fa, &a, sizeof(__m128));
|
||||
memcpy(fb, &b, sizeof(__m128));
|
||||
for (int i = 0; i < 4; i++) {
|
||||
fr[i] = fa[i] + fb[i];
|
||||
}
|
||||
memcpy(&result, fr, sizeof(__m128));
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128 _mm_mul_ps(__m128 a, __m128 b)
|
||||
{
|
||||
__m128 result;
|
||||
float fa[4], fb[4], fr[4];
|
||||
memcpy(fa, &a, sizeof(__m128));
|
||||
memcpy(fb, &b, sizeof(__m128));
|
||||
for (int i = 0; i < 4; i++) {
|
||||
fr[i] = fa[i] * fb[i];
|
||||
}
|
||||
memcpy(&result, fr, sizeof(__m128));
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128 _mm_and_ps(__m128 a, __m128 b)
|
||||
{
|
||||
__m128 result;
|
||||
result.u64[0] = a.u64[0] & b.u64[0];
|
||||
result.u64[1] = a.u64[1] & b.u64[1];
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128 _mm_or_ps(__m128 a, __m128 b)
|
||||
{
|
||||
__m128 result;
|
||||
result.u64[0] = a.u64[0] | b.u64[0];
|
||||
result.u64[1] = a.u64[1] | b.u64[1];
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128 _mm_cvtepi32_ps(__m128i a)
|
||||
{
|
||||
__m128 result;
|
||||
float fr[4];
|
||||
for (int i = 0; i < 4; i++) {
|
||||
fr[i] = (float)a.i32[i];
|
||||
}
|
||||
memcpy(&result, fr, sizeof(__m128));
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_cvttps_epi32(__m128 a)
|
||||
{
|
||||
__m128i result;
|
||||
float fa[4];
|
||||
memcpy(fa, &a, sizeof(__m128));
|
||||
for (int i = 0; i < 4; i++) {
|
||||
result.i32[i] = (int32_t)fa[i];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Casting operations */
|
||||
static inline __m128 _mm_castsi128_ps(__m128i a)
|
||||
{
|
||||
__m128 result;
|
||||
memcpy(&result, &a, sizeof(__m128));
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_castps_si128(__m128 a)
|
||||
{
|
||||
__m128i result;
|
||||
memcpy(&result, &a, sizeof(__m128));
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Additional set operations */
|
||||
static inline __m128i _mm_set1_epi32(int a)
|
||||
{
|
||||
__m128i result;
|
||||
for (int i = 0; i < 4; i++) {
|
||||
result.i32[i] = a;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/* AES instructions - placeholders for soft_aes compatibility */
|
||||
static inline __m128i _mm_aesenc_si128(__m128i a, __m128i roundkey)
|
||||
{
|
||||
return _mm_xor_si128(a, roundkey);
|
||||
}
|
||||
|
||||
static inline __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
|
||||
{
|
||||
return a;
|
||||
}
|
||||
|
||||
/* Rotate right operation for soft_aes.h */
|
||||
static inline uint32_t _rotr(uint32_t value, unsigned int count)
|
||||
{
|
||||
const unsigned int mask = 31;
|
||||
count &= mask;
|
||||
return (value >> count) | (value << ((-count) & mask));
|
||||
}
|
||||
|
||||
/* ARM NEON compatibility types and intrinsics for RISC-V */
|
||||
typedef __m128i_union uint64x2_t;
|
||||
typedef __m128i_union uint8x16_t;
|
||||
typedef __m128i_union int64x2_t;
|
||||
typedef __m128i_union int32x4_t;
|
||||
|
||||
static inline uint64x2_t vld1q_u64(const uint64_t *ptr)
|
||||
{
|
||||
uint64x2_t result;
|
||||
result.u64[0] = ptr[0];
|
||||
result.u64[1] = ptr[1];
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline int64x2_t vld1q_s64(const int64_t *ptr)
|
||||
{
|
||||
int64x2_t result;
|
||||
result.i64[0] = ptr[0];
|
||||
result.i64[1] = ptr[1];
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline void vst1q_u64(uint64_t *ptr, uint64x2_t val)
|
||||
{
|
||||
ptr[0] = val.u64[0];
|
||||
ptr[1] = val.u64[1];
|
||||
}
|
||||
|
||||
static inline uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b)
|
||||
{
|
||||
return _mm_xor_si128(a, b);
|
||||
}
|
||||
|
||||
static inline uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b)
|
||||
{
|
||||
return _mm_add_epi64(a, b);
|
||||
}
|
||||
|
||||
static inline uint64x2_t vreinterpretq_u64_u8(uint8x16_t a)
|
||||
{
|
||||
uint64x2_t result;
|
||||
memcpy(&result, &a, sizeof(uint64x2_t));
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline uint64_t vgetq_lane_u64(uint64x2_t v, int lane)
|
||||
{
|
||||
return v.u64[lane];
|
||||
}
|
||||
|
||||
static inline int64_t vgetq_lane_s64(int64x2_t v, int lane)
|
||||
{
|
||||
return v.i64[lane];
|
||||
}
|
||||
|
||||
static inline int32_t vgetq_lane_s32(int32x4_t v, int lane)
|
||||
{
|
||||
return v.i32[lane];
|
||||
}
|
||||
|
||||
typedef struct { uint64_t val[1]; } uint64x1_t;
|
||||
|
||||
static inline uint64x1_t vcreate_u64(uint64_t a)
|
||||
{
|
||||
uint64x1_t result;
|
||||
result.val[0] = a;
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high)
|
||||
{
|
||||
uint64x2_t result;
|
||||
result.u64[0] = low.val[0];
|
||||
result.u64[1] = high.val[0];
|
||||
return result;
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* XMRIG_SSE2RVV_OPTIMIZED_H */
|
||||
src/crypto/cn/sse2rvv_optimized.h (new file, 748 lines)
@@ -0,0 +1,748 @@
|
||||
/* XMRig
|
||||
* Copyright (c) 2025 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
/*
|
||||
* SSE to RISC-V Vector (RVV) optimized compatibility header
|
||||
* Provides both scalar fallback and vectorized implementations using RVV intrinsics
|
||||
*/
|
||||
|
||||
#ifndef XMRIG_SSE2RVV_OPTIMIZED_H
|
||||
#define XMRIG_SSE2RVV_OPTIMIZED_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
/* Check if RVV is available */
|
||||
#if defined(__riscv_vector)
|
||||
#include <riscv_vector.h>
|
||||
#define USE_RVV_INTRINSICS 1
|
||||
#else
|
||||
#define USE_RVV_INTRINSICS 0
|
||||
#endif
|
||||
|
||||
/* 128-bit vector type */
|
||||
typedef union {
|
||||
uint8_t u8[16];
|
||||
uint16_t u16[8];
|
||||
uint32_t u32[4];
|
||||
uint64_t u64[2];
|
||||
int8_t i8[16];
|
||||
int16_t i16[8];
|
||||
int32_t i32[4];
|
||||
int64_t i64[2];
|
||||
#if USE_RVV_INTRINSICS
|
||||
vuint64m1_t rvv_u64;
|
||||
vuint32m1_t rvv_u32;
|
||||
vuint8m1_t rvv_u8;
|
||||
#endif
|
||||
} __m128i_union;
|
||||
|
||||
typedef __m128i_union __m128i;
|
||||
|
||||
/* Set operations */
|
||||
static inline __m128i _mm_set_epi32(int e3, int e2, int e1, int e0)
|
||||
{
|
||||
__m128i result;
|
||||
result.i32[0] = e0;
|
||||
result.i32[1] = e1;
|
||||
result.i32[2] = e2;
|
||||
result.i32[3] = e3;
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_set_epi64x(int64_t e1, int64_t e0)
|
||||
{
|
||||
__m128i result;
|
||||
result.i64[0] = e0;
|
||||
result.i64[1] = e1;
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_setzero_si128(void)
|
||||
{
|
||||
__m128i result;
|
||||
memset(&result, 0, sizeof(result));
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Extract/insert operations */
|
||||
static inline int _mm_cvtsi128_si32(__m128i a)
|
||||
{
|
||||
return a.i32[0];
|
||||
}
|
||||
|
||||
static inline int64_t _mm_cvtsi128_si64(__m128i a)
|
||||
{
|
||||
return a.i64[0];
|
||||
}
|
||||
|
||||
static inline __m128i _mm_cvtsi32_si128(int a)
|
||||
{
|
||||
__m128i result = _mm_setzero_si128();
|
||||
result.i32[0] = a;
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_cvtsi64_si128(int64_t a)
|
||||
{
|
||||
__m128i result = _mm_setzero_si128();
|
||||
result.i64[0] = a;
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Shuffle operations */
|
||||
static inline __m128i _mm_shuffle_epi32(__m128i a, int imm8)
|
||||
{
|
||||
__m128i result;
|
||||
result.u32[0] = a.u32[(imm8 >> 0) & 0x3];
|
||||
result.u32[1] = a.u32[(imm8 >> 2) & 0x3];
|
||||
result.u32[2] = a.u32[(imm8 >> 4) & 0x3];
|
||||
result.u32[3] = a.u32[(imm8 >> 6) & 0x3];
|
||||
return result;
|
||||
}
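/* Example (illustrative): imm8 = 0x4E selects elements 2,3,0,1, so
   _mm_shuffle_epi32(a, 0x4E) swaps the two 64-bit halves of a. */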
|
||||
|
||||
/* Logical operations - optimized with RVV when available */
|
||||
static inline __m128i _mm_xor_si128(__m128i a, __m128i b)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
size_t vl = __riscv_vsetvl_e64m1(2);
|
||||
vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl);
|
||||
vuint64m1_t vb = __riscv_vle64_v_u64m1(b.u64, vl);
|
||||
vuint64m1_t vr = __riscv_vxor_vv_u64m1(va, vb, vl);
|
||||
__riscv_vse64_v_u64m1(result.u64, vr, vl);
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
result.u64[0] = a.u64[0] ^ b.u64[0];
|
||||
result.u64[1] = a.u64[1] ^ b.u64[1];
|
||||
return result;
|
||||
#endif
|
||||
}
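/* The RVV paths in this header all follow the same pattern (a sketch of the idea,
   not additional API): __riscv_vsetvl_e64m1(2) requests two 64-bit elements, the
   operands are loaded into vector registers, the operation is applied element-wise,
   and the result is stored back into the 16-byte union. With VLEN >= 128 this
   handles the whole 128-bit value in a single vector instruction. */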
|
||||
|
||||
static inline __m128i _mm_or_si128(__m128i a, __m128i b)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
size_t vl = __riscv_vsetvl_e64m1(2);
|
||||
vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl);
|
||||
vuint64m1_t vb = __riscv_vle64_v_u64m1(b.u64, vl);
|
||||
vuint64m1_t vr = __riscv_vor_vv_u64m1(va, vb, vl);
|
||||
__riscv_vse64_v_u64m1(result.u64, vr, vl);
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
result.u64[0] = a.u64[0] | b.u64[0];
|
||||
result.u64[1] = a.u64[1] | b.u64[1];
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline __m128i _mm_and_si128(__m128i a, __m128i b)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
size_t vl = __riscv_vsetvl_e64m1(2);
|
||||
vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl);
|
||||
vuint64m1_t vb = __riscv_vle64_v_u64m1(b.u64, vl);
|
||||
vuint64m1_t vr = __riscv_vand_vv_u64m1(va, vb, vl);
|
||||
__riscv_vse64_v_u64m1(result.u64, vr, vl);
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
result.u64[0] = a.u64[0] & b.u64[0];
|
||||
result.u64[1] = a.u64[1] & b.u64[1];
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline __m128i _mm_andnot_si128(__m128i a, __m128i b)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
size_t vl = __riscv_vsetvl_e64m1(2);
|
||||
vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl);
|
||||
vuint64m1_t vb = __riscv_vle64_v_u64m1(b.u64, vl);
|
||||
vuint64m1_t vnot_a = __riscv_vnot_v_u64m1(va, vl);
|
||||
vuint64m1_t vr = __riscv_vand_vv_u64m1(vnot_a, vb, vl);
|
||||
__riscv_vse64_v_u64m1(result.u64, vr, vl);
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
result.u64[0] = (~a.u64[0]) & b.u64[0];
|
||||
result.u64[1] = (~a.u64[1]) & b.u64[1];
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Shift operations */
|
||||
static inline __m128i _mm_slli_si128(__m128i a, int imm8)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result = _mm_setzero_si128();
|
||||
int count = imm8 & 0xFF;
|
||||
if (count > 15) return result;
|
||||
|
||||
size_t vl = __riscv_vsetvl_e8m1(16);
|
||||
vuint8m1_t va = __riscv_vle8_v_u8m1(a.u8, vl);
|
||||
vuint8m1_t vr = __riscv_vslideup_vx_u8m1(__riscv_vmv_v_x_u8m1(0, vl), va, count, vl);
|
||||
__riscv_vse8_v_u8m1(result.u8, vr, vl);
|
||||
return result;
|
||||
#else
|
||||
__m128i result = _mm_setzero_si128();
|
||||
int count = imm8 & 0xFF;
|
||||
if (count > 15) return result;
|
||||
|
||||
for (int i = 0; i < 16 - count; i++) {
|
||||
result.u8[i + count] = a.u8[i];
|
||||
}
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline __m128i _mm_srli_si128(__m128i a, int imm8)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result = _mm_setzero_si128();
|
||||
int count = imm8 & 0xFF;
|
||||
if (count > 15) return result;
|
||||
|
||||
size_t vl = __riscv_vsetvl_e8m1(16);
|
||||
vuint8m1_t va = __riscv_vle8_v_u8m1(a.u8, vl);
|
||||
vuint8m1_t vr = __riscv_vslidedown_vx_u8m1(va, count, vl);
/* Store only the 16 - count valid bytes; the upper bytes of result stay zero, which
   keeps the SSE2 semantics even when VLEN > 128 and the slide reaches tail elements. */
__riscv_vse8_v_u8m1(result.u8, vr, (size_t)(16 - count));
|
||||
return result;
|
||||
#else
|
||||
__m128i result = _mm_setzero_si128();
|
||||
int count = imm8 & 0xFF;
|
||||
if (count > 15) return result;
|
||||
|
||||
for (int i = count; i < 16; i++) {
|
||||
result.u8[i - count] = a.u8[i];
|
||||
}
|
||||
return result;
|
||||
#endif
|
||||
}
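/* _mm_slli_si128 / _mm_srli_si128 shift the whole 128-bit value by imm8 *bytes*,
   not bits. Example (illustrative): _mm_srli_si128(a, 8) moves the high 64-bit
   lane of a into the low lane and zero-fills the high lane. */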
|
||||
|
||||
static inline __m128i _mm_slli_epi64(__m128i a, int imm8)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
if (imm8 > 63) {
|
||||
result.u64[0] = 0;
|
||||
result.u64[1] = 0;
|
||||
} else {
|
||||
size_t vl = __riscv_vsetvl_e64m1(2);
|
||||
vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl);
|
||||
vuint64m1_t vr = __riscv_vsll_vx_u64m1(va, imm8, vl);
|
||||
__riscv_vse64_v_u64m1(result.u64, vr, vl);
|
||||
}
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
if (imm8 > 63) {
|
||||
result.u64[0] = 0;
|
||||
result.u64[1] = 0;
|
||||
} else {
|
||||
result.u64[0] = a.u64[0] << imm8;
|
||||
result.u64[1] = a.u64[1] << imm8;
|
||||
}
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline __m128i _mm_srli_epi64(__m128i a, int imm8)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
if (imm8 > 63) {
|
||||
result.u64[0] = 0;
|
||||
result.u64[1] = 0;
|
||||
} else {
|
||||
size_t vl = __riscv_vsetvl_e64m1(2);
|
||||
vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl);
|
||||
vuint64m1_t vr = __riscv_vsrl_vx_u64m1(va, imm8, vl);
|
||||
__riscv_vse64_v_u64m1(result.u64, vr, vl);
|
||||
}
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
if (imm8 > 63) {
|
||||
result.u64[0] = 0;
|
||||
result.u64[1] = 0;
|
||||
} else {
|
||||
result.u64[0] = a.u64[0] >> imm8;
|
||||
result.u64[1] = a.u64[1] >> imm8;
|
||||
}
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Load/store operations - optimized with RVV */
|
||||
static inline __m128i _mm_load_si128(const __m128i* p)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
size_t vl = __riscv_vsetvl_e64m1(2);
|
||||
vuint64m1_t v = __riscv_vle64_v_u64m1((const uint64_t*)p, vl);
|
||||
__riscv_vse64_v_u64m1(result.u64, v, vl);
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
memcpy(&result, p, sizeof(__m128i));
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline __m128i _mm_loadu_si128(const __m128i* p)
|
||||
{
|
||||
__m128i result;
|
||||
memcpy(&result, p, sizeof(__m128i));
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline void _mm_store_si128(__m128i* p, __m128i a)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
size_t vl = __riscv_vsetvl_e64m1(2);
|
||||
vuint64m1_t v = __riscv_vle64_v_u64m1(a.u64, vl);
|
||||
__riscv_vse64_v_u64m1((uint64_t*)p, v, vl);
|
||||
#else
|
||||
memcpy(p, &a, sizeof(__m128i));
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void _mm_storeu_si128(__m128i* p, __m128i a)
|
||||
{
|
||||
memcpy(p, &a, sizeof(__m128i));
|
||||
}
|
||||
|
||||
/* Arithmetic operations - optimized with RVV */
|
||||
static inline __m128i _mm_add_epi64(__m128i a, __m128i b)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
size_t vl = __riscv_vsetvl_e64m1(2);
|
||||
vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl);
|
||||
vuint64m1_t vb = __riscv_vle64_v_u64m1(b.u64, vl);
|
||||
vuint64m1_t vr = __riscv_vadd_vv_u64m1(va, vb, vl);
|
||||
__riscv_vse64_v_u64m1(result.u64, vr, vl);
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
result.u64[0] = a.u64[0] + b.u64[0];
|
||||
result.u64[1] = a.u64[1] + b.u64[1];
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline __m128i _mm_add_epi32(__m128i a, __m128i b)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
size_t vl = __riscv_vsetvl_e32m1(4);
|
||||
vuint32m1_t va = __riscv_vle32_v_u32m1(a.u32, vl);
|
||||
vuint32m1_t vb = __riscv_vle32_v_u32m1(b.u32, vl);
|
||||
vuint32m1_t vr = __riscv_vadd_vv_u32m1(va, vb, vl);
|
||||
__riscv_vse32_v_u32m1(result.u32, vr, vl);
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
for (int i = 0; i < 4; i++) {
|
||||
result.i32[i] = a.i32[i] + b.i32[i];
|
||||
}
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline __m128i _mm_sub_epi64(__m128i a, __m128i b)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
size_t vl = __riscv_vsetvl_e64m1(2);
|
||||
vuint64m1_t va = __riscv_vle64_v_u64m1(a.u64, vl);
|
||||
vuint64m1_t vb = __riscv_vle64_v_u64m1(b.u64, vl);
|
||||
vuint64m1_t vr = __riscv_vsub_vv_u64m1(va, vb, vl);
|
||||
__riscv_vse64_v_u64m1(result.u64, vr, vl);
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
result.u64[0] = a.u64[0] - b.u64[0];
|
||||
result.u64[1] = a.u64[1] - b.u64[1];
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline __m128i _mm_mul_epu32(__m128i a, __m128i b)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
size_t vl = __riscv_vsetvl_e64m1(2);
|
||||
/* The even 32-bit elements (lanes 0 and 2) are the multiplicands, so gather them
   with a strided load (8-byte stride) before zero-extending to 64 bits. */
vuint64m1_t va_lo = __riscv_vzext_vf2_u64m1(__riscv_vlse32_v_u32mf2(&a.u32[0], 8, vl), vl);
vuint64m1_t vb_lo = __riscv_vzext_vf2_u64m1(__riscv_vlse32_v_u32mf2(&b.u32[0], 8, vl), vl);
|
||||
vuint64m1_t vr = __riscv_vmul_vv_u64m1(va_lo, vb_lo, vl);
|
||||
__riscv_vse64_v_u64m1(result.u64, vr, vl);
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
result.u64[0] = (uint64_t)a.u32[0] * (uint64_t)b.u32[0];
|
||||
result.u64[1] = (uint64_t)a.u32[2] * (uint64_t)b.u32[2];
|
||||
return result;
|
||||
#endif
|
||||
}
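/* Worked example (illustrative): with a.u32 = {2, 9, 3, 9} and b.u32 = {4, 9, 5, 9},
   _mm_mul_epu32 ignores the odd elements and returns u64 = {2*4, 3*5} = {8, 15}. */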
|
||||
|
||||
/* Unpack operations */
|
||||
static inline __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
|
||||
{
|
||||
__m128i result;
|
||||
result.u64[0] = a.u64[0];
|
||||
result.u64[1] = b.u64[0];
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
|
||||
{
|
||||
__m128i result;
|
||||
result.u64[0] = a.u64[1];
|
||||
result.u64[1] = b.u64[1];
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Pause instruction for spin-wait loops */
|
||||
static inline void _mm_pause(void)
|
||||
{
|
||||
/* RISC-V pause hint if available (requires Zihintpause extension) */
|
||||
#if defined(__riscv_zihintpause)
|
||||
__asm__ __volatile__("pause");
|
||||
#else
|
||||
__asm__ __volatile__("nop");
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Memory fence - optimized for RISC-V */
|
||||
static inline void _mm_mfence(void)
|
||||
{
|
||||
__asm__ __volatile__("fence rw,rw" ::: "memory");
|
||||
}
|
||||
|
||||
static inline void _mm_lfence(void)
|
||||
{
|
||||
__asm__ __volatile__("fence r,r" ::: "memory");
|
||||
}
|
||||
|
||||
static inline void _mm_sfence(void)
|
||||
{
|
||||
__asm__ __volatile__("fence w,w" ::: "memory");
|
||||
}
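/* Mapping note: x86 MFENCE orders all loads and stores, hence "fence rw,rw";
   LFENCE and SFENCE order only loads ("fence r,r") or only stores ("fence w,w").
   RISC-V fences take explicit predecessor/successor access sets. */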
|
||||
|
||||
/* Comparison operations */
|
||||
static inline __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
|
||||
{
|
||||
__m128i result;
|
||||
for (int i = 0; i < 4; i++) {
|
||||
result.u32[i] = (a.u32[i] == b.u32[i]) ? 0xFFFFFFFF : 0;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
|
||||
{
|
||||
__m128i result;
|
||||
for (int i = 0; i < 2; i++) {
|
||||
result.u64[i] = (a.u64[i] == b.u64[i]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Additional shift operations */
|
||||
static inline __m128i _mm_slli_epi32(__m128i a, int imm8)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
if (imm8 > 31) {
|
||||
memset(&result, 0, sizeof(result));
|
||||
} else {
|
||||
size_t vl = __riscv_vsetvl_e32m1(4);
|
||||
vuint32m1_t va = __riscv_vle32_v_u32m1(a.u32, vl);
|
||||
vuint32m1_t vr = __riscv_vsll_vx_u32m1(va, imm8, vl);
|
||||
__riscv_vse32_v_u32m1(result.u32, vr, vl);
|
||||
}
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
if (imm8 > 31) {
|
||||
for (int i = 0; i < 4; i++) result.u32[i] = 0;
|
||||
} else {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
result.u32[i] = a.u32[i] << imm8;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline __m128i _mm_srli_epi32(__m128i a, int imm8)
|
||||
{
|
||||
#if USE_RVV_INTRINSICS
|
||||
__m128i result;
|
||||
if (imm8 > 31) {
|
||||
memset(&result, 0, sizeof(result));
|
||||
} else {
|
||||
size_t vl = __riscv_vsetvl_e32m1(4);
|
||||
vuint32m1_t va = __riscv_vle32_v_u32m1(a.u32, vl);
|
||||
vuint32m1_t vr = __riscv_vsrl_vx_u32m1(va, imm8, vl);
|
||||
__riscv_vse32_v_u32m1(result.u32, vr, vl);
|
||||
}
|
||||
return result;
|
||||
#else
|
||||
__m128i result;
|
||||
if (imm8 > 31) {
|
||||
for (int i = 0; i < 4; i++) result.u32[i] = 0;
|
||||
} else {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
result.u32[i] = a.u32[i] >> imm8;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
/* 64-bit integer operations */
|
||||
static inline __m128i _mm_set1_epi64x(int64_t a)
|
||||
{
|
||||
__m128i result;
|
||||
result.i64[0] = a;
|
||||
result.i64[1] = a;
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Float type for compatibility */
|
||||
typedef __m128i __m128;
|
||||
|
||||
/* Float operations - simplified scalar implementations */
|
||||
static inline __m128 _mm_set1_ps(float a)
|
||||
{
|
||||
__m128 result;
|
||||
uint32_t val;
|
||||
memcpy(&val, &a, sizeof(float));
|
||||
for (int i = 0; i < 4; i++) {
|
||||
result.u32[i] = val;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128 _mm_setzero_ps(void)
|
||||
{
|
||||
__m128 result;
|
||||
memset(&result, 0, sizeof(result));
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128 _mm_add_ps(__m128 a, __m128 b)
|
||||
{
|
||||
__m128 result;
|
||||
float fa[4], fb[4], fr[4];
|
||||
memcpy(fa, &a, sizeof(__m128));
|
||||
memcpy(fb, &b, sizeof(__m128));
|
||||
for (int i = 0; i < 4; i++) {
|
||||
fr[i] = fa[i] + fb[i];
|
||||
}
|
||||
memcpy(&result, fr, sizeof(__m128));
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128 _mm_mul_ps(__m128 a, __m128 b)
|
||||
{
|
||||
__m128 result;
|
||||
float fa[4], fb[4], fr[4];
|
||||
memcpy(fa, &a, sizeof(__m128));
|
||||
memcpy(fb, &b, sizeof(__m128));
|
||||
for (int i = 0; i < 4; i++) {
|
||||
fr[i] = fa[i] * fb[i];
|
||||
}
|
||||
memcpy(&result, fr, sizeof(__m128));
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128 _mm_and_ps(__m128 a, __m128 b)
|
||||
{
|
||||
__m128 result;
|
||||
result.u64[0] = a.u64[0] & b.u64[0];
|
||||
result.u64[1] = a.u64[1] & b.u64[1];
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128 _mm_or_ps(__m128 a, __m128 b)
|
||||
{
|
||||
__m128 result;
|
||||
result.u64[0] = a.u64[0] | b.u64[0];
|
||||
result.u64[1] = a.u64[1] | b.u64[1];
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128 _mm_cvtepi32_ps(__m128i a)
|
||||
{
|
||||
__m128 result;
|
||||
float fr[4];
|
||||
for (int i = 0; i < 4; i++) {
|
||||
fr[i] = (float)a.i32[i];
|
||||
}
|
||||
memcpy(&result, fr, sizeof(__m128));
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_cvttps_epi32(__m128 a)
|
||||
{
|
||||
__m128i result;
|
||||
float fa[4];
|
||||
memcpy(fa, &a, sizeof(__m128));
|
||||
for (int i = 0; i < 4; i++) {
|
||||
result.i32[i] = (int32_t)fa[i];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Casting operations */
|
||||
static inline __m128 _mm_castsi128_ps(__m128i a)
|
||||
{
|
||||
__m128 result;
|
||||
memcpy(&result, &a, sizeof(__m128));
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_castps_si128(__m128 a)
|
||||
{
|
||||
__m128i result;
|
||||
memcpy(&result, &a, sizeof(__m128));
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Additional set operations */
|
||||
static inline __m128i _mm_set1_epi32(int a)
|
||||
{
|
||||
__m128i result;
|
||||
for (int i = 0; i < 4; i++) {
|
||||
result.i32[i] = a;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/* AES instructions - placeholders for soft_aes compatibility */
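/* These stubs exist only so the header compiles without AES hardware; on RISC-V the
   miner forces the soft-AES path (see the RxVm.cpp change below), so they should never
   process real cipher state. */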
|
||||
static inline __m128i _mm_aesenc_si128(__m128i a, __m128i roundkey)
|
||||
{
|
||||
return _mm_xor_si128(a, roundkey);
|
||||
}
|
||||
|
||||
static inline __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
|
||||
{
|
||||
return a;
|
||||
}
|
||||
|
||||
/* Rotate right operation for soft_aes.h */
|
||||
static inline uint32_t _rotr(uint32_t value, unsigned int count)
|
||||
{
|
||||
const unsigned int mask = 31;
|
||||
count &= mask;
|
||||
return (value >> count) | (value << ((-count) & mask));
|
||||
}
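/* Example (illustrative): _rotr(0x80000001u, 1) == 0xC0000000, and the
   "count & 31" mask makes _rotr(x, 32) a no-op rather than undefined behaviour. */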
|
||||
|
||||
/* ARM NEON compatibility types and intrinsics for RISC-V */
|
||||
typedef __m128i_union uint64x2_t;
|
||||
typedef __m128i_union uint8x16_t;
|
||||
typedef __m128i_union int64x2_t;
|
||||
typedef __m128i_union int32x4_t;
|
||||
|
||||
static inline uint64x2_t vld1q_u64(const uint64_t *ptr)
|
||||
{
|
||||
uint64x2_t result;
|
||||
result.u64[0] = ptr[0];
|
||||
result.u64[1] = ptr[1];
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline int64x2_t vld1q_s64(const int64_t *ptr)
|
||||
{
|
||||
int64x2_t result;
|
||||
result.i64[0] = ptr[0];
|
||||
result.i64[1] = ptr[1];
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline void vst1q_u64(uint64_t *ptr, uint64x2_t val)
|
||||
{
|
||||
ptr[0] = val.u64[0];
|
||||
ptr[1] = val.u64[1];
|
||||
}
|
||||
|
||||
static inline uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b)
|
||||
{
|
||||
return _mm_xor_si128(a, b);
|
||||
}
|
||||
|
||||
static inline uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b)
|
||||
{
|
||||
return _mm_add_epi64(a, b);
|
||||
}
|
||||
|
||||
static inline uint64x2_t vreinterpretq_u64_u8(uint8x16_t a)
|
||||
{
|
||||
uint64x2_t result;
|
||||
memcpy(&result, &a, sizeof(uint64x2_t));
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline uint64_t vgetq_lane_u64(uint64x2_t v, int lane)
|
||||
{
|
||||
return v.u64[lane];
|
||||
}
|
||||
|
||||
static inline int64_t vgetq_lane_s64(int64x2_t v, int lane)
|
||||
{
|
||||
return v.i64[lane];
|
||||
}
|
||||
|
||||
static inline int32_t vgetq_lane_s32(int32x4_t v, int lane)
|
||||
{
|
||||
return v.i32[lane];
|
||||
}
|
||||
|
||||
typedef struct { uint64_t val[1]; } uint64x1_t;
|
||||
|
||||
static inline uint64x1_t vcreate_u64(uint64_t a)
|
||||
{
|
||||
uint64x1_t result;
|
||||
result.val[0] = a;
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high)
|
||||
{
|
||||
uint64x2_t result;
|
||||
result.u64[0] = low.val[0];
|
||||
result.u64[1] = high.val[0];
|
||||
return result;
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* XMRIG_SSE2RVV_OPTIMIZED_H */
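The header above is a drop-in shim: code written against the SSE2 intrinsic names compiles unchanged on RISC-V and picks the RVV or scalar path automatically. A minimal sketch of the intended usage (illustrative only, not part of the commit; the buffer layout and helper name are made up for the example):

#include "crypto/cn/sse2rvv_optimized.h"

/* XOR a 64-byte block with a 16-byte key, 128 bits at a time. */
static void xor_block(uint8_t out[64], const uint8_t in[64], const uint8_t key[16])
{
    const __m128i k = _mm_loadu_si128((const __m128i*)key);
    for (int i = 0; i < 4; ++i) {
        __m128i v = _mm_loadu_si128((const __m128i*)(in + 16 * i));
        _mm_storeu_si128((__m128i*)(out + 16 * i), _mm_xor_si128(v, k));
    }
}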
|
||||
571 src/crypto/cn/sse2rvv_scalar_backup.h (new file)
@@ -0,0 +1,571 @@
/* XMRig
|
||||
* Copyright (c) 2025 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
/*
|
||||
* SSE to RISC-V compatibility header
|
||||
* Provides scalar implementations of SSE intrinsics for RISC-V architecture
|
||||
*/
|
||||
|
||||
#ifndef XMRIG_SSE2RVV_H
|
||||
#define XMRIG_SSE2RVV_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
/* 128-bit vector type */
|
||||
typedef union {
|
||||
uint8_t u8[16];
|
||||
uint16_t u16[8];
|
||||
uint32_t u32[4];
|
||||
uint64_t u64[2];
|
||||
int8_t i8[16];
|
||||
int16_t i16[8];
|
||||
int32_t i32[4];
|
||||
int64_t i64[2];
|
||||
} __m128i_union;
|
||||
|
||||
typedef __m128i_union __m128i;
|
||||
|
||||
/* Set operations */
|
||||
static inline __m128i _mm_set_epi32(int e3, int e2, int e1, int e0)
|
||||
{
|
||||
__m128i result;
|
||||
result.i32[0] = e0;
|
||||
result.i32[1] = e1;
|
||||
result.i32[2] = e2;
|
||||
result.i32[3] = e3;
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_set_epi64x(int64_t e1, int64_t e0)
|
||||
{
|
||||
__m128i result;
|
||||
result.i64[0] = e0;
|
||||
result.i64[1] = e1;
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_setzero_si128(void)
|
||||
{
|
||||
__m128i result;
|
||||
memset(&result, 0, sizeof(result));
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Extract/insert operations */
|
||||
static inline int _mm_cvtsi128_si32(__m128i a)
|
||||
{
|
||||
return a.i32[0];
|
||||
}
|
||||
|
||||
static inline int64_t _mm_cvtsi128_si64(__m128i a)
|
||||
{
|
||||
return a.i64[0];
|
||||
}
|
||||
|
||||
static inline __m128i _mm_cvtsi32_si128(int a)
|
||||
{
|
||||
__m128i result = _mm_setzero_si128();
|
||||
result.i32[0] = a;
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_cvtsi64_si128(int64_t a)
|
||||
{
|
||||
__m128i result = _mm_setzero_si128();
|
||||
result.i64[0] = a;
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Shuffle operations */
|
||||
static inline __m128i _mm_shuffle_epi32(__m128i a, int imm8)
|
||||
{
|
||||
__m128i result;
|
||||
result.u32[0] = a.u32[(imm8 >> 0) & 0x3];
|
||||
result.u32[1] = a.u32[(imm8 >> 2) & 0x3];
|
||||
result.u32[2] = a.u32[(imm8 >> 4) & 0x3];
|
||||
result.u32[3] = a.u32[(imm8 >> 6) & 0x3];
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Logical operations */
|
||||
static inline __m128i _mm_xor_si128(__m128i a, __m128i b)
|
||||
{
|
||||
__m128i result;
|
||||
result.u64[0] = a.u64[0] ^ b.u64[0];
|
||||
result.u64[1] = a.u64[1] ^ b.u64[1];
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_or_si128(__m128i a, __m128i b)
|
||||
{
|
||||
__m128i result;
|
||||
result.u64[0] = a.u64[0] | b.u64[0];
|
||||
result.u64[1] = a.u64[1] | b.u64[1];
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_and_si128(__m128i a, __m128i b)
|
||||
{
|
||||
__m128i result;
|
||||
result.u64[0] = a.u64[0] & b.u64[0];
|
||||
result.u64[1] = a.u64[1] & b.u64[1];
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_andnot_si128(__m128i a, __m128i b)
|
||||
{
|
||||
__m128i result;
|
||||
result.u64[0] = (~a.u64[0]) & b.u64[0];
|
||||
result.u64[1] = (~a.u64[1]) & b.u64[1];
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Shift operations */
|
||||
static inline __m128i _mm_slli_si128(__m128i a, int imm8)
|
||||
{
|
||||
__m128i result = _mm_setzero_si128();
|
||||
int count = imm8 & 0xFF;
|
||||
if (count > 15) return result;
|
||||
|
||||
for (int i = 0; i < 16 - count; i++) {
|
||||
result.u8[i + count] = a.u8[i];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_srli_si128(__m128i a, int imm8)
|
||||
{
|
||||
__m128i result = _mm_setzero_si128();
|
||||
int count = imm8 & 0xFF;
|
||||
if (count > 15) return result;
|
||||
|
||||
for (int i = count; i < 16; i++) {
|
||||
result.u8[i - count] = a.u8[i];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_slli_epi64(__m128i a, int imm8)
|
||||
{
|
||||
__m128i result;
|
||||
if (imm8 > 63) {
|
||||
result.u64[0] = 0;
|
||||
result.u64[1] = 0;
|
||||
} else {
|
||||
result.u64[0] = a.u64[0] << imm8;
|
||||
result.u64[1] = a.u64[1] << imm8;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_srli_epi64(__m128i a, int imm8)
|
||||
{
|
||||
__m128i result;
|
||||
if (imm8 > 63) {
|
||||
result.u64[0] = 0;
|
||||
result.u64[1] = 0;
|
||||
} else {
|
||||
result.u64[0] = a.u64[0] >> imm8;
|
||||
result.u64[1] = a.u64[1] >> imm8;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Load/store operations */
|
||||
static inline __m128i _mm_load_si128(const __m128i* p)
|
||||
{
|
||||
__m128i result;
|
||||
memcpy(&result, p, sizeof(__m128i));
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_loadu_si128(const __m128i* p)
|
||||
{
|
||||
__m128i result;
|
||||
memcpy(&result, p, sizeof(__m128i));
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline void _mm_store_si128(__m128i* p, __m128i a)
|
||||
{
|
||||
memcpy(p, &a, sizeof(__m128i));
|
||||
}
|
||||
|
||||
static inline void _mm_storeu_si128(__m128i* p, __m128i a)
|
||||
{
|
||||
memcpy(p, &a, sizeof(__m128i));
|
||||
}
|
||||
|
||||
/* Arithmetic operations */
|
||||
static inline __m128i _mm_add_epi64(__m128i a, __m128i b)
|
||||
{
|
||||
__m128i result;
|
||||
result.u64[0] = a.u64[0] + b.u64[0];
|
||||
result.u64[1] = a.u64[1] + b.u64[1];
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_add_epi32(__m128i a, __m128i b)
|
||||
{
|
||||
__m128i result;
|
||||
for (int i = 0; i < 4; i++) {
|
||||
result.i32[i] = a.i32[i] + b.i32[i];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_sub_epi64(__m128i a, __m128i b)
|
||||
{
|
||||
__m128i result;
|
||||
result.u64[0] = a.u64[0] - b.u64[0];
|
||||
result.u64[1] = a.u64[1] - b.u64[1];
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_mul_epu32(__m128i a, __m128i b)
|
||||
{
|
||||
__m128i result;
|
||||
result.u64[0] = (uint64_t)a.u32[0] * (uint64_t)b.u32[0];
|
||||
result.u64[1] = (uint64_t)a.u32[2] * (uint64_t)b.u32[2];
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Unpack operations */
|
||||
static inline __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
|
||||
{
|
||||
__m128i result;
|
||||
result.u64[0] = a.u64[0];
|
||||
result.u64[1] = b.u64[0];
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
|
||||
{
|
||||
__m128i result;
|
||||
result.u64[0] = a.u64[1];
|
||||
result.u64[1] = b.u64[1];
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Pause instruction for spin-wait loops */
|
||||
static inline void _mm_pause(void)
|
||||
{
|
||||
/* RISC-V doesn't have a direct equivalent to x86 PAUSE
|
||||
* Use a simple NOP or yield hint */
|
||||
__asm__ __volatile__("nop");
|
||||
}
|
||||
|
||||
/* Memory fence */
|
||||
static inline void _mm_mfence(void)
|
||||
{
|
||||
__asm__ __volatile__("fence" ::: "memory");
|
||||
}
|
||||
|
||||
static inline void _mm_lfence(void)
|
||||
{
|
||||
__asm__ __volatile__("fence r,r" ::: "memory");
|
||||
}
|
||||
|
||||
static inline void _mm_sfence(void)
|
||||
{
|
||||
__asm__ __volatile__("fence w,w" ::: "memory");
|
||||
}
|
||||
|
||||
/* Comparison operations */
|
||||
static inline __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
|
||||
{
|
||||
__m128i result;
|
||||
for (int i = 0; i < 4; i++) {
|
||||
result.u32[i] = (a.u32[i] == b.u32[i]) ? 0xFFFFFFFF : 0;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
|
||||
{
|
||||
__m128i result;
|
||||
for (int i = 0; i < 2; i++) {
|
||||
result.u64[i] = (a.u64[i] == b.u64[i]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Additional shift operations */
|
||||
static inline __m128i _mm_slli_epi32(__m128i a, int imm8)
|
||||
{
|
||||
__m128i result;
|
||||
if (imm8 > 31) {
|
||||
for (int i = 0; i < 4; i++) result.u32[i] = 0;
|
||||
} else {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
result.u32[i] = a.u32[i] << imm8;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_srli_epi32(__m128i a, int imm8)
|
||||
{
|
||||
__m128i result;
|
||||
if (imm8 > 31) {
|
||||
for (int i = 0; i < 4; i++) result.u32[i] = 0;
|
||||
} else {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
result.u32[i] = a.u32[i] >> imm8;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/* 64-bit integer operations */
|
||||
static inline __m128i _mm_set1_epi64x(int64_t a)
|
||||
{
|
||||
__m128i result;
|
||||
result.i64[0] = a;
|
||||
result.i64[1] = a;
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Float type for compatibility - we'll treat it as int for simplicity */
|
||||
typedef __m128i __m128;
|
||||
|
||||
/* Float operations - simplified scalar implementations */
|
||||
static inline __m128 _mm_set1_ps(float a)
|
||||
{
|
||||
__m128 result;
|
||||
uint32_t val;
|
||||
memcpy(&val, &a, sizeof(float));
|
||||
for (int i = 0; i < 4; i++) {
|
||||
result.u32[i] = val;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128 _mm_setzero_ps(void)
|
||||
{
|
||||
__m128 result;
|
||||
memset(&result, 0, sizeof(result));
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128 _mm_add_ps(__m128 a, __m128 b)
|
||||
{
|
||||
__m128 result;
|
||||
float fa[4], fb[4], fr[4];
|
||||
memcpy(fa, &a, sizeof(__m128));
|
||||
memcpy(fb, &b, sizeof(__m128));
|
||||
for (int i = 0; i < 4; i++) {
|
||||
fr[i] = fa[i] + fb[i];
|
||||
}
|
||||
memcpy(&result, fr, sizeof(__m128));
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128 _mm_mul_ps(__m128 a, __m128 b)
|
||||
{
|
||||
__m128 result;
|
||||
float fa[4], fb[4], fr[4];
|
||||
memcpy(fa, &a, sizeof(__m128));
|
||||
memcpy(fb, &b, sizeof(__m128));
|
||||
for (int i = 0; i < 4; i++) {
|
||||
fr[i] = fa[i] * fb[i];
|
||||
}
|
||||
memcpy(&result, fr, sizeof(__m128));
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128 _mm_and_ps(__m128 a, __m128 b)
|
||||
{
|
||||
__m128 result;
|
||||
result.u64[0] = a.u64[0] & b.u64[0];
|
||||
result.u64[1] = a.u64[1] & b.u64[1];
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128 _mm_or_ps(__m128 a, __m128 b)
|
||||
{
|
||||
__m128 result;
|
||||
result.u64[0] = a.u64[0] | b.u64[0];
|
||||
result.u64[1] = a.u64[1] | b.u64[1];
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128 _mm_cvtepi32_ps(__m128i a)
|
||||
{
|
||||
__m128 result;
|
||||
float fr[4];
|
||||
for (int i = 0; i < 4; i++) {
|
||||
fr[i] = (float)a.i32[i];
|
||||
}
|
||||
memcpy(&result, fr, sizeof(__m128));
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_cvttps_epi32(__m128 a)
|
||||
{
|
||||
__m128i result;
|
||||
float fa[4];
|
||||
memcpy(fa, &a, sizeof(__m128));
|
||||
for (int i = 0; i < 4; i++) {
|
||||
result.i32[i] = (int32_t)fa[i];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Casting operations */
|
||||
static inline __m128 _mm_castsi128_ps(__m128i a)
|
||||
{
|
||||
__m128 result;
|
||||
memcpy(&result, &a, sizeof(__m128));
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __m128i _mm_castps_si128(__m128 a)
|
||||
{
|
||||
__m128i result;
|
||||
memcpy(&result, &a, sizeof(__m128));
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Additional set operations */
|
||||
static inline __m128i _mm_set1_epi32(int a)
|
||||
{
|
||||
__m128i result;
|
||||
for (int i = 0; i < 4; i++) {
|
||||
result.i32[i] = a;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/* AES instructions - these are placeholders, actual AES is done via soft_aes.h */
|
||||
/* On RISC-V without crypto extensions, these should never be called directly */
|
||||
/* They are only here for compilation compatibility */
|
||||
static inline __m128i _mm_aesenc_si128(__m128i a, __m128i roundkey)
|
||||
{
|
||||
/* This is a placeholder - actual implementation should use soft_aes */
|
||||
/* If this function is called, it means SOFT_AES template parameter wasn't used */
|
||||
/* We return a XOR as a minimal fallback, but proper code should use soft_aesenc */
|
||||
return _mm_xor_si128(a, roundkey);
|
||||
}
|
||||
|
||||
static inline __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
|
||||
{
|
||||
/* Placeholder for AES key generation - should use soft_aeskeygenassist */
|
||||
return a;
|
||||
}
|
||||
|
||||
/* Rotate right operation for soft_aes.h */
|
||||
static inline uint32_t _rotr(uint32_t value, unsigned int count)
|
||||
{
|
||||
const unsigned int mask = 31;
|
||||
count &= mask;
|
||||
return (value >> count) | (value << ((-count) & mask));
|
||||
}
|
||||
|
||||
/* ARM NEON compatibility types and intrinsics for RISC-V */
|
||||
typedef __m128i_union uint64x2_t;
|
||||
typedef __m128i_union uint8x16_t;
|
||||
typedef __m128i_union int64x2_t;
|
||||
typedef __m128i_union int32x4_t;
|
||||
|
||||
static inline uint64x2_t vld1q_u64(const uint64_t *ptr)
|
||||
{
|
||||
uint64x2_t result;
|
||||
result.u64[0] = ptr[0];
|
||||
result.u64[1] = ptr[1];
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline int64x2_t vld1q_s64(const int64_t *ptr)
|
||||
{
|
||||
int64x2_t result;
|
||||
result.i64[0] = ptr[0];
|
||||
result.i64[1] = ptr[1];
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline void vst1q_u64(uint64_t *ptr, uint64x2_t val)
|
||||
{
|
||||
ptr[0] = val.u64[0];
|
||||
ptr[1] = val.u64[1];
|
||||
}
|
||||
|
||||
static inline uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b)
|
||||
{
|
||||
uint64x2_t result;
|
||||
result.u64[0] = a.u64[0] ^ b.u64[0];
|
||||
result.u64[1] = a.u64[1] ^ b.u64[1];
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b)
|
||||
{
|
||||
uint64x2_t result;
|
||||
result.u64[0] = a.u64[0] + b.u64[0];
|
||||
result.u64[1] = a.u64[1] + b.u64[1];
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline uint64x2_t vreinterpretq_u64_u8(uint8x16_t a)
|
||||
{
|
||||
uint64x2_t result;
|
||||
memcpy(&result, &a, sizeof(uint64x2_t));
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline uint64_t vgetq_lane_u64(uint64x2_t v, int lane)
|
||||
{
|
||||
return v.u64[lane];
|
||||
}
|
||||
|
||||
static inline int64_t vgetq_lane_s64(int64x2_t v, int lane)
|
||||
{
|
||||
return v.i64[lane];
|
||||
}
|
||||
|
||||
static inline int32_t vgetq_lane_s32(int32x4_t v, int lane)
|
||||
{
|
||||
return v.i32[lane];
|
||||
}
|
||||
|
||||
typedef struct { uint64_t val[1]; } uint64x1_t;
|
||||
|
||||
static inline uint64x1_t vcreate_u64(uint64_t a)
|
||||
{
|
||||
uint64x1_t result;
|
||||
result.val[0] = a;
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high)
|
||||
{
|
||||
uint64x2_t result;
|
||||
result.u64[0] = low.val[0];
|
||||
result.u64[1] = high.val[0];
|
||||
return result;
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* XMRIG_SSE2RVV_H */
|
||||
@@ -26,7 +26,7 @@
|
||||
#define XMRIG_MM_MALLOC_PORTABLE_H
|
||||
|
||||
|
||||
#if defined(XMRIG_ARM) && !defined(__clang__)
|
||||
#if (defined(XMRIG_ARM) || defined(XMRIG_RISCV)) && !defined(__clang__)
|
||||
#include <stdlib.h>
|
||||
|
||||
|
||||
|
||||
@@ -57,6 +57,9 @@
|
||||
|
||||
#if defined(XMRIG_ARM)
|
||||
# include "crypto/cn/sse2neon.h"
|
||||
#elif defined(XMRIG_RISCV)
|
||||
// RISC-V doesn't have SSE/NEON, provide minimal compatibility
|
||||
# define _mm_pause() __asm__ __volatile__("nop")
|
||||
#elif defined(__GNUC__)
|
||||
# include <x86intrin.h>
|
||||
#else
|
||||
@@ -286,7 +289,7 @@ struct HelperThread
|
||||
|
||||
void benchmark()
|
||||
{
|
||||
#ifndef XMRIG_ARM
|
||||
#if !defined(XMRIG_ARM) && !defined(XMRIG_RISCV)
|
||||
static std::atomic<int> done{ 0 };
|
||||
if (done.exchange(1)) {
|
||||
return;
|
||||
@@ -478,7 +481,7 @@ static inline bool findByType(hwloc_obj_t obj, hwloc_obj_type_t type, func lambd
|
||||
|
||||
HelperThread* create_helper_thread(int64_t cpu_index, int priority, const std::vector<int64_t>& affinities)
|
||||
{
|
||||
#ifndef XMRIG_ARM
|
||||
#if !defined(XMRIG_ARM) && !defined(XMRIG_RISCV)
|
||||
hwloc_bitmap_t helper_cpu_set = hwloc_bitmap_alloc();
|
||||
hwloc_bitmap_t main_threads_set = hwloc_bitmap_alloc();
|
||||
|
||||
@@ -807,7 +810,7 @@ void hash_octa(const uint8_t* data, size_t size, uint8_t* output, cryptonight_ct
|
||||
uint32_t cn_indices[6];
|
||||
select_indices(cn_indices, seed);
|
||||
|
||||
#ifdef XMRIG_ARM
|
||||
#if defined(XMRIG_ARM) || defined(XMRIG_RISCV)
|
||||
uint32_t step[6] = { 1, 1, 1, 1, 1, 1 };
|
||||
#else
|
||||
uint32_t step[6] = { 4, 4, 1, 2, 4, 4 };
|
||||
|
||||
@@ -111,6 +111,10 @@ namespace randomx {
|
||||
#define RANDOMX_HAVE_COMPILER 1
|
||||
class JitCompilerA64;
|
||||
using JitCompiler = JitCompilerA64;
|
||||
#elif defined(__riscv) && defined(__riscv_xlen) && (__riscv_xlen == 64)
|
||||
#define RANDOMX_HAVE_COMPILER 1
|
||||
class JitCompilerRV64;
|
||||
using JitCompiler = JitCompilerRV64;
|
||||
#else
|
||||
#define RANDOMX_HAVE_COMPILER 0
|
||||
class JitCompilerFallback;
|
||||
|
||||
@@ -32,6 +32,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#include "crypto/randomx/jit_compiler_x86.hpp"
|
||||
#elif defined(__aarch64__)
|
||||
#include "crypto/randomx/jit_compiler_a64.hpp"
|
||||
#elif defined(__riscv) && defined(__riscv_xlen) && (__riscv_xlen == 64)
|
||||
#include "crypto/randomx/jit_compiler_rv64.hpp"
|
||||
#else
|
||||
#include "crypto/randomx/jit_compiler_fallback.hpp"
|
||||
#endif
|
||||
|
||||
1164 src/crypto/randomx/jit_compiler_rv64.cpp (new file)
File diff suppressed because it is too large.
144 src/crypto/randomx/jit_compiler_rv64.hpp (new file)
@@ -0,0 +1,144 @@
/*
|
||||
Copyright (c) 2023 tevador <tevador@gmail.com>
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the copyright holder nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <vector>
|
||||
#include "crypto/randomx/common.hpp"
|
||||
#include "crypto/randomx/jit_compiler_rv64_static.hpp"
|
||||
|
||||
namespace randomx {
|
||||
|
||||
struct CodeBuffer {
|
||||
uint8_t* code;
|
||||
int32_t codePos;
|
||||
int32_t rcpCount;
|
||||
|
||||
void emit(const uint8_t* src, int32_t len) {
|
||||
memcpy(&code[codePos], src, len);
|
||||
codePos += len;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
void emit(T src) {
|
||||
memcpy(&code[codePos], &src, sizeof(src));
|
||||
codePos += sizeof(src);
|
||||
}
|
||||
|
||||
void emitAt(int32_t codePos, const uint8_t* src, int32_t len) {
|
||||
memcpy(&code[codePos], src, len);
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
void emitAt(int32_t codePos, T src) {
|
||||
memcpy(&code[codePos], &src, sizeof(src));
|
||||
}
|
||||
};
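// Example (illustrative): state.emit<uint32_t>(0x00000013u) appends the 4-byte
// encoding of "addi x0, x0, 0" (the canonical RISC-V NOP) at code[codePos] and
// advances codePos by sizeof(uint32_t).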
|
||||
|
||||
struct CompilerState : public CodeBuffer {
|
||||
int32_t instructionOffsets[RANDOMX_PROGRAM_MAX_SIZE];
|
||||
int registerUsage[RegistersCount];
|
||||
};
|
||||
|
||||
class Program;
|
||||
struct ProgramConfiguration;
|
||||
class SuperscalarProgram;
|
||||
class Instruction;
|
||||
|
||||
#define HANDLER_ARGS randomx::CompilerState& state, randomx::Instruction isn, int i
|
||||
typedef void(*InstructionGeneratorRV64)(HANDLER_ARGS);
|
||||
|
||||
class JitCompilerRV64 {
|
||||
public:
|
||||
JitCompilerRV64(bool hugePagesEnable, bool optimizedInitDatasetEnable);
|
||||
~JitCompilerRV64();
|
||||
|
||||
void prepare() {}
|
||||
void generateProgram(Program&, ProgramConfiguration&, uint32_t);
|
||||
void generateProgramLight(Program&, ProgramConfiguration&, uint32_t);
|
||||
|
||||
template<size_t N>
|
||||
void generateSuperscalarHash(SuperscalarProgram(&programs)[N]);
|
||||
|
||||
void generateDatasetInitCode() {}
|
||||
|
||||
ProgramFunc* getProgramFunc() {
|
||||
return (ProgramFunc*)entryProgram;
|
||||
}
|
||||
DatasetInitFunc* getDatasetInitFunc() {
|
||||
return (DatasetInitFunc*)entryDataInit;
|
||||
}
|
||||
uint8_t* getCode() {
|
||||
return state.code;
|
||||
}
|
||||
size_t getCodeSize();
|
||||
|
||||
void enableWriting() const;
|
||||
void enableExecution() const;
|
||||
|
||||
static InstructionGeneratorRV64 engine[256];
|
||||
private:
|
||||
CompilerState state;
|
||||
void* entryDataInit;
|
||||
void* entryProgram;
|
||||
|
||||
public:
|
||||
static void v1_IADD_RS(HANDLER_ARGS);
|
||||
static void v1_IADD_M(HANDLER_ARGS);
|
||||
static void v1_ISUB_R(HANDLER_ARGS);
|
||||
static void v1_ISUB_M(HANDLER_ARGS);
|
||||
static void v1_IMUL_R(HANDLER_ARGS);
|
||||
static void v1_IMUL_M(HANDLER_ARGS);
|
||||
static void v1_IMULH_R(HANDLER_ARGS);
|
||||
static void v1_IMULH_M(HANDLER_ARGS);
|
||||
static void v1_ISMULH_R(HANDLER_ARGS);
|
||||
static void v1_ISMULH_M(HANDLER_ARGS);
|
||||
static void v1_IMUL_RCP(HANDLER_ARGS);
|
||||
static void v1_INEG_R(HANDLER_ARGS);
|
||||
static void v1_IXOR_R(HANDLER_ARGS);
|
||||
static void v1_IXOR_M(HANDLER_ARGS);
|
||||
static void v1_IROR_R(HANDLER_ARGS);
|
||||
static void v1_IROL_R(HANDLER_ARGS);
|
||||
static void v1_ISWAP_R(HANDLER_ARGS);
|
||||
static void v1_FSWAP_R(HANDLER_ARGS);
|
||||
static void v1_FADD_R(HANDLER_ARGS);
|
||||
static void v1_FADD_M(HANDLER_ARGS);
|
||||
static void v1_FSUB_R(HANDLER_ARGS);
|
||||
static void v1_FSUB_M(HANDLER_ARGS);
|
||||
static void v1_FSCAL_R(HANDLER_ARGS);
|
||||
static void v1_FMUL_R(HANDLER_ARGS);
|
||||
static void v1_FDIV_M(HANDLER_ARGS);
|
||||
static void v1_FSQRT_R(HANDLER_ARGS);
|
||||
static void v1_CBRANCH(HANDLER_ARGS);
|
||||
static void v1_CFROUND(HANDLER_ARGS);
|
||||
static void v1_ISTORE(HANDLER_ARGS);
|
||||
static void v1_NOP(HANDLER_ARGS);
|
||||
};
|
||||
}
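The engine table declared above maps each RandomX opcode byte to one of the v1_* generators. A rough sketch of how a program is lowered with it (an assumption for illustration only; the real loop lives in jit_compiler_rv64.cpp, and the Program/Instruction accessors are taken from the existing RandomX sources):

// Illustrative sketch, not the actual generateProgram() implementation.
static void compile_program(randomx::CompilerState& state, randomx::Program& prog)
{
    for (unsigned i = 0; i < prog.getSize(); ++i) {
        randomx::Instruction instr = prog(i);            // fetch instruction i
        state.instructionOffsets[i] = state.codePos;     // record its start offset
        randomx::JitCompilerRV64::engine[instr.opcode](state, instr, i);  // emit RV64 code
    }
}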
|
||||
1236 src/crypto/randomx/jit_compiler_rv64_static.S (new file)
File diff suppressed because it is too large.
53 src/crypto/randomx/jit_compiler_rv64_static.hpp (new file)
@@ -0,0 +1,53 @@
|
||||
/*
|
||||
Copyright (c) 2023 tevador <tevador@gmail.com>
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the copyright holder nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
extern "C" {
|
||||
void randomx_riscv64_literals();
|
||||
void randomx_riscv64_literals_end();
|
||||
void randomx_riscv64_data_init();
|
||||
void randomx_riscv64_fix_data_call();
|
||||
void randomx_riscv64_prologue();
|
||||
void randomx_riscv64_loop_begin();
|
||||
void randomx_riscv64_data_read();
|
||||
void randomx_riscv64_data_read_light();
|
||||
void randomx_riscv64_fix_loop_call();
|
||||
void randomx_riscv64_spad_store();
|
||||
void randomx_riscv64_spad_store_hardaes();
|
||||
void randomx_riscv64_spad_store_softaes();
|
||||
void randomx_riscv64_loop_end();
|
||||
void randomx_riscv64_fix_continue_loop();
|
||||
void randomx_riscv64_epilogue();
|
||||
void randomx_riscv64_softaes();
|
||||
void randomx_riscv64_program_end();
|
||||
void randomx_riscv64_ssh_init();
|
||||
void randomx_riscv64_ssh_load();
|
||||
void randomx_riscv64_ssh_prefetch();
|
||||
void randomx_riscv64_ssh_end();
|
||||
}
|
||||
@@ -39,6 +39,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#include "crypto/randomx/jit_compiler_x86_static.hpp"
|
||||
#elif (XMRIG_ARM == 8)
|
||||
#include "crypto/randomx/jit_compiler_a64_static.hpp"
|
||||
#elif defined(__riscv) && defined(__riscv_xlen) && (__riscv_xlen == 64)
|
||||
#include "crypto/randomx/jit_compiler_rv64_static.hpp"
|
||||
#endif
|
||||
|
||||
#include "backend/cpu/Cpu.h"
|
||||
@@ -190,7 +192,7 @@ RandomX_ConfigurationBase::RandomX_ConfigurationBase()
|
||||
# endif
|
||||
}
|
||||
|
||||
#if (XMRIG_ARM == 8)
|
||||
#if (XMRIG_ARM == 8) || defined(XMRIG_RISCV)
|
||||
static uint32_t Log2(size_t value) { return (value > 1) ? (Log2(value / 2) + 1) : 0; }
|
||||
#endif
|
||||
|
||||
@@ -274,6 +276,14 @@ typedef void(randomx::JitCompilerX86::* InstructionGeneratorX86_2)(const randomx
|
||||
|
||||
#define JIT_HANDLE(x, prev) randomx::JitCompilerA64::engine[k] = &randomx::JitCompilerA64::h_##x
|
||||
|
||||
#elif defined(XMRIG_RISCV)
|
||||
|
||||
Log2_ScratchpadL1 = Log2(ScratchpadL1_Size);
|
||||
Log2_ScratchpadL2 = Log2(ScratchpadL2_Size);
|
||||
Log2_ScratchpadL3 = Log2(ScratchpadL3_Size);
|
||||
|
||||
#define JIT_HANDLE(x, prev) randomx::JitCompilerRV64::engine[k] = &randomx::JitCompilerRV64::v1_##x
|
||||
|
||||
#else
|
||||
#define JIT_HANDLE(x, prev)
|
||||
#endif
|
||||
|
||||
@@ -133,7 +133,7 @@ struct RandomX_ConfigurationBase
|
||||
uint32_t ScratchpadL3Mask_Calculated;
|
||||
uint32_t ScratchpadL3Mask64_Calculated;
|
||||
|
||||
# if (XMRIG_ARM == 8)
|
||||
# if (XMRIG_ARM == 8) || defined(XMRIG_RISCV)
|
||||
uint32_t Log2_ScratchpadL1;
|
||||
uint32_t Log2_ScratchpadL2;
|
||||
uint32_t Log2_ScratchpadL3;
|
||||
|
||||
9 src/crypto/randomx/tests/riscv64_zba.s (new file)
@@ -0,0 +1,9 @@
/* RISC-V - test if the Zba extension is present */
|
||||
|
||||
.text
|
||||
.global main
|
||||
|
||||
main:
|
||||
sh1add x6, x6, x7
|
||||
li x10, 0
|
||||
ret
|
||||
9 src/crypto/randomx/tests/riscv64_zbb.s (new file)
@@ -0,0 +1,9 @@
/* RISC-V - test if the Zbb extension is present */
|
||||
|
||||
.text
|
||||
.global main
|
||||
|
||||
main:
|
||||
ror x6, x6, x7
|
||||
li x10, 0
|
||||
ret
|
||||
@@ -29,9 +29,17 @@ randomx_vm *xmrig::RxVm::create(RxDataset *dataset, uint8_t *scratchpad, bool so
|
||||
{
|
||||
int flags = 0;
|
||||
|
||||
// On RISC-V, force software AES path even if CPU reports AES capability.
|
||||
// The RandomX portable intrinsics will throw at runtime when HAVE_AES is not defined
|
||||
// for this architecture. Until native AES intrinsics are wired for RISC-V, avoid
|
||||
// setting HARD_AES to prevent "Platform doesn't support hardware AES" aborts.
|
||||
# ifndef XMRIG_RISCV
|
||||
if (!softAes) {
|
||||
flags |= RANDOMX_FLAG_HARD_AES;
|
||||
}
|
||||
# else
|
||||
(void)softAes; // unused on RISC-V to force soft AES
|
||||
# endif
|
||||
|
||||
if (dataset->get()) {
|
||||
flags |= RANDOMX_FLAG_FULL_MEM;
|
||||
|
||||
@@ -75,6 +75,8 @@
|
||||
|
||||
#ifdef XMRIG_ARM
|
||||
# define APP_ARCH "ARMv" STR2(XMRIG_ARM)
|
||||
#elif defined(XMRIG_RISCV)
|
||||
# define APP_ARCH "RISC-V"
|
||||
#else
|
||||
# if defined(__x86_64__) || defined(__amd64__) || defined(_M_X64) || defined(_M_AMD64)
|
||||
# define APP_ARCH "x86-64"
|
||||