From f661e1eb3070426c74b8efe0062f6b011711fc29 Mon Sep 17 00:00:00 2001 From: SChernykh <15806605+SChernykh@users.noreply.github.com> Date: Fri, 26 Dec 2025 21:11:11 +0100 Subject: [PATCH] RISC-V: vectorized RandomX main loop --- cmake/cpu.cmake | 92 ++- cmake/randomx.cmake | 16 + src/Summary.cpp | 7 +- src/backend/cpu/interfaces/ICpuInfo.h | 2 + src/backend/cpu/platform/BasicCpuInfo.cpp | 4 +- src/backend/cpu/platform/BasicCpuInfo.h | 1 + .../cpu/platform/BasicCpuInfo_riscv.cpp | 3 + src/backend/cpu/platform/lscpu_riscv.cpp | 6 +- src/crypto/randomx/jit_compiler_rv64.cpp | 39 +- src/crypto/randomx/jit_compiler_rv64.hpp | 18 +- .../randomx/jit_compiler_rv64_vector.cpp | 709 +++++++++++++++++- src/crypto/randomx/jit_compiler_rv64_vector.h | 5 +- .../randomx/jit_compiler_rv64_vector_static.S | 612 ++++++++++++++- .../randomx/jit_compiler_rv64_vector_static.h | 19 +- src/crypto/randomx/randomx.cpp | 5 +- src/crypto/randomx/tests/riscv64_vector.s | 6 +- src/crypto/randomx/tests/riscv64_zicbop.s | 11 + src/crypto/randomx/vm_compiled.cpp | 2 +- 18 files changed, 1460 insertions(+), 97 deletions(-) create mode 100644 src/crypto/randomx/tests/riscv64_zicbop.s diff --git a/cmake/cpu.cmake b/cmake/cpu.cmake index 515c2ccbb..e9f1cf847 100644 --- a/cmake/cpu.cmake +++ b/cmake/cpu.cmake @@ -51,42 +51,74 @@ if (XMRIG_RISCV) # default build uses the RV64GC baseline set(RVARCH "rv64gc") + enable_language(ASM) + + try_run(RANDOMX_VECTOR_RUN_FAIL + RANDOMX_VECTOR_COMPILE_OK + ${CMAKE_CURRENT_BINARY_DIR}/ + ${CMAKE_CURRENT_SOURCE_DIR}/src/crypto/randomx/tests/riscv64_vector.s + COMPILE_DEFINITIONS "-march=rv64gcv") + + if (RANDOMX_VECTOR_COMPILE_OK AND NOT RANDOMX_VECTOR_RUN_FAIL) + set(RVARCH_V ON) + message(STATUS "RISC-V vector extension detected") + else() + set(RVARCH_V OFF) + endif() + + try_run(RANDOMX_ZICBOP_RUN_FAIL + RANDOMX_ZICBOP_COMPILE_OK + ${CMAKE_CURRENT_BINARY_DIR}/ + ${CMAKE_CURRENT_SOURCE_DIR}/src/crypto/randomx/tests/riscv64_zicbop.s + COMPILE_DEFINITIONS 
"-march=rv64gc_zicbop") + + if (RANDOMX_ZICBOP_COMPILE_OK AND NOT RANDOMX_ZICBOP_RUN_FAIL) + set(RVARCH_ZICBOP ON) + message(STATUS "RISC-V zicbop extension detected") + else() + set(RVARCH_ZICBOP OFF) + endif() + + try_run(RANDOMX_ZBA_RUN_FAIL + RANDOMX_ZBA_COMPILE_OK + ${CMAKE_CURRENT_BINARY_DIR}/ + ${CMAKE_CURRENT_SOURCE_DIR}/src/crypto/randomx/tests/riscv64_zba.s + COMPILE_DEFINITIONS "-march=rv64gc_zba") + + if (RANDOMX_ZBA_COMPILE_OK AND NOT RANDOMX_ZBA_RUN_FAIL) + set(RVARCH_ZBA ON) + message(STATUS "RISC-V zba extension detected") + else() + set(RVARCH_ZBA OFF) + endif() + + try_run(RANDOMX_ZBB_RUN_FAIL + RANDOMX_ZBB_COMPILE_OK + ${CMAKE_CURRENT_BINARY_DIR}/ + ${CMAKE_CURRENT_SOURCE_DIR}/src/crypto/randomx/tests/riscv64_zbb.s + COMPILE_DEFINITIONS "-march=rv64gc_zbb") + + if (RANDOMX_ZBB_COMPILE_OK AND NOT RANDOMX_ZBB_RUN_FAIL) + set(RVARCH_ZBB ON) + message(STATUS "RISC-V zbb extension detected") + else() + set(RVARCH_ZBB OFF) + endif() + # for native builds, enable Zba and Zbb if supported by the CPU - if(ARCH STREQUAL "native") - enable_language(ASM) - - try_run(RANDOMX_VECTOR_RUN_FAIL - RANDOMX_VECTOR_COMPILE_OK - ${CMAKE_CURRENT_BINARY_DIR}/ - ${CMAKE_CURRENT_SOURCE_DIR}/src/crypto/randomx/tests/riscv64_vector.s - COMPILE_DEFINITIONS "-march=rv64gcv_zicbop") - - if (RANDOMX_VECTOR_COMPILE_OK AND NOT RANDOMX_VECTOR_RUN_FAIL) - set(RVARCH "${RVARCH}v_zicbop") + if (ARCH STREQUAL "native") + if (RVARCH_V) + set(RVARCH "${RVARCH}v") add_definitions(-DXMRIG_RVV_ENABLED) - message(STATUS "RISC-V vector extension detected") endif() - - try_run(RANDOMX_ZBA_RUN_FAIL - RANDOMX_ZBA_COMPILE_OK - ${CMAKE_CURRENT_BINARY_DIR}/ - ${CMAKE_CURRENT_SOURCE_DIR}/src/crypto/randomx/tests/riscv64_zba.s - COMPILE_DEFINITIONS "-march=rv64gc_zba") - - if (RANDOMX_ZBA_COMPILE_OK AND NOT RANDOMX_ZBA_RUN_FAIL) + if (RVARCH_ZICBOP) + set(RVARCH "${RVARCH}_zicbop") + endif() + if (RVARCH_ZBA) set(RVARCH "${RVARCH}_zba") - message(STATUS "RISC-V zba extension detected") endif() - - 
try_run(RANDOMX_ZBB_RUN_FAIL - RANDOMX_ZBB_COMPILE_OK - ${CMAKE_CURRENT_BINARY_DIR}/ - ${CMAKE_CURRENT_SOURCE_DIR}/src/crypto/randomx/tests/riscv64_zbb.s - COMPILE_DEFINITIONS "-march=rv64gc_zbb") - - if (RANDOMX_ZBB_COMPILE_OK AND NOT RANDOMX_ZBB_RUN_FAIL) + if (RVARCH_ZBB) set(RVARCH "${RVARCH}_zbb") - message(STATUS "RISC-V zbb extension detected") endif() endif() diff --git a/cmake/randomx.cmake b/cmake/randomx.cmake index c15024c97..7cfc4ca11 100644 --- a/cmake/randomx.cmake +++ b/cmake/randomx.cmake @@ -90,6 +90,22 @@ if (WITH_RANDOMX) # cheat because cmake and ccache hate each other set_property(SOURCE src/crypto/randomx/jit_compiler_rv64_static.S PROPERTY LANGUAGE C) set_property(SOURCE src/crypto/randomx/jit_compiler_rv64_vector_static.S PROPERTY LANGUAGE C) + + set(RV64_VECTOR_FILE_ARCH "rv64gcv") + + if (ARCH STREQUAL "native") + if (RVARCH_ZICBOP) + set(RV64_VECTOR_FILE_ARCH "${RV64_VECTOR_FILE_ARCH}_zicbop") + endif() + if (RVARCH_ZBA) + set(RV64_VECTOR_FILE_ARCH "${RV64_VECTOR_FILE_ARCH}_zba") + endif() + if (RVARCH_ZBB) + set(RV64_VECTOR_FILE_ARCH "${RV64_VECTOR_FILE_ARCH}_zbb") + endif() + endif() + + set_source_files_properties(src/crypto/randomx/jit_compiler_rv64_vector_static.S PROPERTIES COMPILE_FLAGS "-march=${RV64_VECTOR_FILE_ARCH}") else() list(APPEND SOURCES_CRYPTO src/crypto/randomx/jit_compiler_fallback.cpp diff --git a/src/Summary.cpp b/src/Summary.cpp index 7682398f9..8dae866b6 100644 --- a/src/Summary.cpp +++ b/src/Summary.cpp @@ -89,11 +89,16 @@ static void print_cpu(const Config *) { const auto info = Cpu::info(); - Log::print(GREEN_BOLD(" * ") WHITE_BOLD("%-13s%s (%zu)") " %s %sAES%s", + Log::print(GREEN_BOLD(" * ") WHITE_BOLD("%-13s%s (%zu)") " %s %s%sAES%s", "CPU", info->brand(), info->packages(), ICpuInfo::is64bit() ? GREEN_BOLD("64-bit") : RED_BOLD("32-bit"), +#ifdef XMRIG_RISCV + info->hasRISCV_Vector() ? GREEN_BOLD_S "RVV " : RED_BOLD_S "-RVV ", +#else + "", +#endif info->hasAES() ? GREEN_BOLD_S : RED_BOLD_S "-", info->isVM() ? 
RED_BOLD_S " VM" : "" ); diff --git a/src/backend/cpu/interfaces/ICpuInfo.h b/src/backend/cpu/interfaces/ICpuInfo.h index e28a14734..401c585dc 100644 --- a/src/backend/cpu/interfaces/ICpuInfo.h +++ b/src/backend/cpu/interfaces/ICpuInfo.h @@ -85,6 +85,7 @@ public: FLAG_POPCNT, FLAG_CAT_L3, FLAG_VM, + FLAG_RISCV_VECTOR, FLAG_MAX }; @@ -109,6 +110,7 @@ public: virtual bool hasOneGbPages() const = 0; virtual bool hasXOP() const = 0; virtual bool isVM() const = 0; + virtual bool hasRISCV_Vector() const = 0; virtual bool jccErratum() const = 0; virtual const char *backend() const = 0; virtual const char *brand() const = 0; diff --git a/src/backend/cpu/platform/BasicCpuInfo.cpp b/src/backend/cpu/platform/BasicCpuInfo.cpp index 9f5595aac..c9cb79515 100644 --- a/src/backend/cpu/platform/BasicCpuInfo.cpp +++ b/src/backend/cpu/platform/BasicCpuInfo.cpp @@ -58,8 +58,8 @@ namespace xmrig { -constexpr size_t kCpuFlagsSize = 15; -static const std::array flagNames = { "aes", "vaes", "avx", "avx2", "avx512f", "bmi2", "osxsave", "pdpe1gb", "sse2", "ssse3", "sse4.1", "xop", "popcnt", "cat_l3", "vm" }; +constexpr size_t kCpuFlagsSize = 16; +static const std::array flagNames = { "aes", "vaes", "avx", "avx2", "avx512f", "bmi2", "osxsave", "pdpe1gb", "sse2", "ssse3", "sse4.1", "xop", "popcnt", "cat_l3", "vm", "rvv" }; static_assert(kCpuFlagsSize == ICpuInfo::FLAG_MAX, "kCpuFlagsSize and FLAG_MAX mismatch"); diff --git a/src/backend/cpu/platform/BasicCpuInfo.h b/src/backend/cpu/platform/BasicCpuInfo.h index 97fe20e1b..67c2ac026 100644 --- a/src/backend/cpu/platform/BasicCpuInfo.h +++ b/src/backend/cpu/platform/BasicCpuInfo.h @@ -52,6 +52,7 @@ protected: inline bool hasOneGbPages() const override { return has(FLAG_PDPE1GB); } inline bool hasXOP() const override { return has(FLAG_XOP); } inline bool isVM() const override { return has(FLAG_VM); } + inline bool hasRISCV_Vector() const override { return has(FLAG_RISCV_VECTOR); } inline bool jccErratum() const override { return m_jccErratum; } 
inline const char *brand() const override { return m_brand; } inline const std::vector &units() const override { return m_units; } diff --git a/src/backend/cpu/platform/BasicCpuInfo_riscv.cpp b/src/backend/cpu/platform/BasicCpuInfo_riscv.cpp index fd9c9ce62..e892bbbae 100644 --- a/src/backend/cpu/platform/BasicCpuInfo_riscv.cpp +++ b/src/backend/cpu/platform/BasicCpuInfo_riscv.cpp @@ -55,6 +55,9 @@ xmrig::BasicCpuInfo::BasicCpuInfo() : strncpy(m_brand, name.data(), sizeof(m_brand) - 1); } + // Check for vector extensions + m_flags.set(FLAG_RISCV_VECTOR, has_riscv_vector()); + // Check for crypto extensions (Zknd/Zkne/Zknh - AES and SHA) m_flags.set(FLAG_AES, has_riscv_crypto()); diff --git a/src/backend/cpu/platform/lscpu_riscv.cpp b/src/backend/cpu/platform/lscpu_riscv.cpp index d19d26a8f..1fba9493b 100644 --- a/src/backend/cpu/platform/lscpu_riscv.cpp +++ b/src/backend/cpu/platform/lscpu_riscv.cpp @@ -34,7 +34,7 @@ struct riscv_cpu_desc bool has_vector = false; bool has_crypto = false; - inline bool isReady() const { return !model.isNull(); } + inline bool isReady() const { return !isa.isNull(); } }; static bool lookup_riscv(char *line, const char *pattern, String &value) @@ -82,7 +82,7 @@ static bool read_riscv_cpuinfo(riscv_cpu_desc *desc) if (lookup_riscv(buf, "isa", desc->isa)) { // Check for vector extensions - if (strstr(buf, "zve") || strstr(buf, "v_")) { + if (strstr(buf, "zve64d") || strstr(buf, "v_")) { desc->has_vector = true; } // Check for crypto extensions (AES, SHA, etc.) 
@@ -96,7 +96,7 @@ static bool read_riscv_cpuinfo(riscv_cpu_desc *desc) lookup_riscv(buf, "uarch", desc->uarch); - if (desc->isReady() && !desc->isa.isNull()) { + if (desc->isReady()) { break; } } diff --git a/src/crypto/randomx/jit_compiler_rv64.cpp b/src/crypto/randomx/jit_compiler_rv64.cpp index 161343471..a3871bc84 100644 --- a/src/crypto/randomx/jit_compiler_rv64.cpp +++ b/src/crypto/randomx/jit_compiler_rv64.cpp @@ -30,6 +30,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include "backend/cpu/Cpu.h" #include "crypto/randomx/jit_compiler_rv64.hpp" #include "crypto/randomx/jit_compiler_rv64_static.hpp" #include "crypto/randomx/jit_compiler_rv64_vector.h" @@ -621,13 +622,22 @@ namespace randomx { //jal x1, SuperscalarHash emitJump(state, ReturnReg, LiteralPoolSize + offsetFixDataCall, SuperScalarHashOffset); - vectorCodeSize = ((uint8_t*)randomx_riscv64_vector_sshash_end) - ((uint8_t*)randomx_riscv64_vector_sshash_begin); - vectorCode = static_cast(allocExecutableMemory(vectorCodeSize, hugePagesJIT && hugePagesEnable)); + if (xmrig::Cpu::info()->hasRISCV_Vector()) { + vectorCodeSize = ((uint8_t*)randomx_riscv64_vector_code_end) - ((uint8_t*)randomx_riscv64_vector_code_begin); + vectorCode = static_cast(allocExecutableMemory(vectorCodeSize, hugePagesJIT && hugePagesEnable)); + + if (vectorCode) { + memcpy(vectorCode, reinterpret_cast(randomx_riscv64_vector_code_begin), vectorCodeSize); + entryProgramVector = vectorCode + (((uint8_t*)randomx_riscv64_vector_program_begin) - ((uint8_t*)randomx_riscv64_vector_code_begin)); + } + } } JitCompilerRV64::~JitCompilerRV64() { freePagedMemory(state.code, CodeSize); - freePagedMemory(vectorCode, vectorCodeSize); + if (vectorCode) { + freePagedMemory(vectorCode, vectorCodeSize); + } } void JitCompilerRV64::enableWriting() const @@ -649,6 +659,11 @@ namespace randomx { } void JitCompilerRV64::generateProgram(Program& prog, ProgramConfiguration& pcfg, uint32_t) { + if 
(vectorCode) { + generateProgramVectorRV64(vectorCode, prog, pcfg, inst_map, nullptr, 0); + return; + } + emitProgramPrefix(state, prog, pcfg); int32_t fixPos = state.codePos; state.emit(codeDataRead, sizeDataRead); @@ -659,6 +674,11 @@ namespace randomx { } void JitCompilerRV64::generateProgramLight(Program& prog, ProgramConfiguration& pcfg, uint32_t datasetOffset) { + if (vectorCode) { + generateProgramVectorRV64(vectorCode, prog, pcfg, inst_map, entryDataInit, datasetOffset); + return; + } + emitProgramPrefix(state, prog, pcfg); int32_t fixPos = state.codePos; state.emit(codeDataReadLight, sizeDataReadLight); @@ -680,9 +700,9 @@ namespace randomx { template void JitCompilerRV64::generateSuperscalarHash(SuperscalarProgram(&programs)[N]) { - if (optimizedDatasetInit > 0) { - entryDataInitOptimized = generateDatasetInitVectorRV64(vectorCode, vectorCodeSize, programs, RandomX_ConfigurationBase::CacheAccesses); - return; + if (vectorCode) { + entryDataInitVector = generateDatasetInitVectorRV64(vectorCode, programs, RandomX_ConfigurationBase::CacheAccesses); + // No return here because we also need the scalar dataset init function for the light mode } state.codePos = SuperScalarHashOffset; @@ -722,10 +742,6 @@ namespace randomx { template void JitCompilerRV64::generateSuperscalarHash(SuperscalarProgram(&)[RANDOMX_CACHE_MAX_ACCESSES]); - DatasetInitFunc* JitCompilerRV64::getDatasetInitFunc() { - return (DatasetInitFunc*)((optimizedDatasetInit > 0) ? 
entryDataInitOptimized : entryDataInit); - } - void JitCompilerRV64::v1_IADD_RS(HANDLER_ARGS) { state.registerUsage[isn.dst] = i; int shift = isn.getModShift(); @@ -1183,5 +1199,6 @@ namespace randomx { void JitCompilerRV64::v1_NOP(HANDLER_ARGS) { } -InstructionGeneratorRV64 JitCompilerRV64::engine[256] = {}; +alignas(64) InstructionGeneratorRV64 JitCompilerRV64::engine[256] = {}; +alignas(64) uint8_t JitCompilerRV64::inst_map[256] = {}; } diff --git a/src/crypto/randomx/jit_compiler_rv64.hpp b/src/crypto/randomx/jit_compiler_rv64.hpp index dbad88e1b..62b6d1a29 100644 --- a/src/crypto/randomx/jit_compiler_rv64.hpp +++ b/src/crypto/randomx/jit_compiler_rv64.hpp @@ -90,9 +90,11 @@ namespace randomx { void generateDatasetInitCode() {} ProgramFunc* getProgramFunc() { - return (ProgramFunc*)entryProgram; + return (ProgramFunc*)(vectorCode ? entryProgramVector : entryProgram); + } + DatasetInitFunc* getDatasetInitFunc() { + return (DatasetInitFunc*)(vectorCode ? entryDataInitVector : entryDataInit); } - DatasetInitFunc* getDatasetInitFunc(); uint8_t* getCode() { return state.code; } @@ -102,15 +104,17 @@ namespace randomx { void enableExecution() const; static InstructionGeneratorRV64 engine[256]; + static uint8_t inst_map[256]; private: CompilerState state; - uint8_t* vectorCode; - size_t vectorCodeSize; + uint8_t* vectorCode = nullptr; + size_t vectorCodeSize = 0; - void* entryDataInit; - void* entryDataInitOptimized; - void* entryProgram; + void* entryDataInit = nullptr; + void* entryDataInitVector = nullptr; + void* entryProgram = nullptr; + void* entryProgramVector = nullptr; public: static void v1_IADD_RS(HANDLER_ARGS); diff --git a/src/crypto/randomx/jit_compiler_rv64_vector.cpp b/src/crypto/randomx/jit_compiler_rv64_vector.cpp index 8dc95613e..e7a9c4d24 100644 --- a/src/crypto/randomx/jit_compiler_rv64_vector.cpp +++ b/src/crypto/randomx/jit_compiler_rv64_vector.cpp @@ -33,19 +33,18 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "crypto/randomx/jit_compiler_rv64_vector_static.h" #include "crypto/randomx/reciprocal.h" #include "crypto/randomx/superscalar.hpp" +#include "crypto/randomx/program.hpp" namespace randomx { #define ADDR(x) ((uint8_t*) &(x)) #define DIST(x, y) (ADDR(y) - ADDR(x)) -void* generateDatasetInitVectorRV64(uint8_t* buf, size_t buf_size, SuperscalarProgram* programs, size_t num_programs) +void* generateDatasetInitVectorRV64(uint8_t* buf, SuperscalarProgram* programs, size_t num_programs) { - memcpy(buf, reinterpret_cast(randomx_riscv64_vector_sshash_begin), buf_size); + uint8_t* p = buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_sshash_generated_instructions); - uint8_t* p = buf + DIST(randomx_riscv64_vector_sshash_begin, randomx_riscv64_vector_sshash_generated_instructions); - - uint8_t* literals = buf + DIST(randomx_riscv64_vector_sshash_begin, randomx_riscv64_vector_sshash_imul_rcp_literals); + uint8_t* literals = buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_sshash_imul_rcp_literals); uint8_t* cur_literal = literals; for (size_t i = 0; i < num_programs; ++i) { @@ -76,10 +75,16 @@ void* generateDatasetInitVectorRV64(uint8_t* buf, size_t buf_size, SuperscalarPr break; case SuperscalarInstructionType::IADD_RS: - // 57 39 00 96 vsll.vi v18, v0, 0 - // 57 00 09 02 vadd.vv v0, v0, v18 - EMIT(0x96003957 | (modShift << 15) | (src << 20)); - EMIT(0x02090057 | (dst << 7) | (dst << 20)); + if (modShift == 0) { + // 57 00 00 02 vadd.vv v0, v0, v0 + EMIT(0x02000057 | (dst << 7) | (src << 15) | (dst << 20)); + } + else { + // 57 39 00 96 vsll.vi v18, v0, 0 + // 57 00 09 02 vadd.vv v0, v0, v18 + EMIT(0x96003957 | (modShift << 15) | (src << 20)); + EMIT(0x02090057 | (dst << 7) | (dst << 20)); + } break; case SuperscalarInstructionType::IMUL_R: @@ -126,7 +131,7 @@ void* generateDatasetInitVectorRV64(uint8_t* buf, size_t buf_size, SuperscalarPr // 9B 82 02 00 addiw x5, x5, 0 // 57 C0 02 02 vadd.vx v0, v0, x5 EMIT(0x000002B7 | ((imm32 
+ ((imm32 & 0x800) << 1)) & 0xFFFFF000)); - EMIT(0x0002829B | ((imm32 & 0x00000FFF)) << 20); + EMIT(0x0002829B | ((imm32 & 0x00000FFF) << 20)); EMIT(0x0202C057 | (dst << 7) | (dst << 20)); break; @@ -137,7 +142,7 @@ void* generateDatasetInitVectorRV64(uint8_t* buf, size_t buf_size, SuperscalarPr // 9B 82 02 00 addiw x5, x5, 0 // 57 C0 02 2E vxor.vx v0, v0, x5 EMIT(0x000002B7 | ((imm32 + ((imm32 & 0x800) << 1)) & 0xFFFFF000)); - EMIT(0x0002829B | ((imm32 & 0x00000FFF)) << 20); + EMIT(0x0002829B | ((imm32 & 0x00000FFF) << 20)); EMIT(0x2E02C057 | (dst << 7) | (dst << 20)); break; @@ -175,33 +180,701 @@ void* generateDatasetInitVectorRV64(uint8_t* buf, size_t buf_size, SuperscalarPr break; default: - break; + UNREACHABLE; } } // Step 6 - k = DIST(randomx_riscv64_vector_sshash_xor, randomx_riscv64_vector_sshash_set_cache_index); + k = DIST(randomx_riscv64_vector_sshash_xor, randomx_riscv64_vector_sshash_end); memcpy(p, reinterpret_cast(randomx_riscv64_vector_sshash_xor), k); p += k; - // Step 7 + // Step 7. Set cacheIndex to the value of the register that has the longest dependency chain in the SuperscalarHash function executed in step 5. 
if (i + 1 < num_programs) { - memcpy(p, reinterpret_cast(randomx_riscv64_vector_sshash_set_cache_index) + programs[i].getAddressRegister() * 4, 4); + // vmv.v.v v9, v0 + programs[i].getAddressRegister() + const uint32_t t = 0x5E0004D7 + (static_cast(programs[i].getAddressRegister()) << 15); + memcpy(p, &t, 4); p += 4; } } // Emit "J randomx_riscv64_vector_sshash_generated_instructions_end" instruction - const uint8_t* e = buf + DIST(randomx_riscv64_vector_sshash_begin, randomx_riscv64_vector_sshash_generated_instructions_end); + const uint8_t* e = buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_sshash_generated_instructions_end); const uint32_t k = e - p; const uint32_t j = 0x6F | ((k & 0x7FE) << 20) | ((k & 0x800) << 9) | (k & 0xFF000); memcpy(p, &j, 4); + char* result = (char*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_sshash_dataset_init)); + #ifdef __GNUC__ - __builtin___clear_cache((char*) buf, (char*)(buf + buf_size)); + __builtin___clear_cache(result, (char*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_sshash_end))); #endif - return buf + DIST(randomx_riscv64_vector_sshash_begin, randomx_riscv64_vector_sshash_dataset_init); + return result; +} + +#define emit16(value) { const uint16_t t = value; memcpy(p, &t, 2); p += 2; } +#define emit32(value) { const uint32_t t = value; memcpy(p, &t, 4); p += 4; } +#define emit64(value) { const uint64_t t = value; memcpy(p, &t, 8); p += 8; } +#define emit_data(arr) { memcpy(p, arr, sizeof(arr)); p += sizeof(arr); } + +static void imm_to_x5(uint32_t imm, uint8_t*& p) +{ + const uint32_t imm_hi = (imm + ((imm & 0x800) << 1)) & 0xFFFFF000U; + const uint32_t imm_lo = imm & 0x00000FFFU; + + if (imm_hi == 0) { + // li x5, imm_lo + emit32(0x00000293 + (imm_lo << 20)); + return; + } + + if (imm_lo == 0) { + // lui x5, imm_hi + emit32(0x000002B7 + imm_hi); + return; + } + + // lui x5, imm_hi + // addiw x5, x5, imm_lo + emit64(0x0002829B000002B7ULL | imm_hi 
| (static_cast(imm_lo) << 52)) +} + +static void loadFromScratchpad(uint32_t src, uint32_t dst, uint32_t mod, uint32_t imm, uint8_t*& p) +{ + if (src == dst) { + imm &= RandomX_CurrentConfig.ScratchpadL3Mask_Calculated; + + if (imm <= 2047) { + // ld x5, imm(x12) + emit32(0x00063283 | (imm << 20)); + } + else if (imm <= 2047 * 2) { + // addi x5, x12, 2047 + emit32(0x7FF60293); + // ld x5, (imm - 2047)(x5) + emit32(0x0002B283 | ((imm - 2047) << 20)); + } + else { + // lui x5, imm & 0xFFFFF000U + emit32(0x000002B7 | ((imm + ((imm & 0x800) << 1)) & 0xFFFFF000U)); + // c.add x5, x12 + emit16(0x92B2); + // ld x5, (imm & 0xFFF)(x5) + emit32(0x0002B283 | ((imm & 0xFFF) << 20)); + } + + return; + } + + uint32_t shift = 32; + uint32_t mask_reg; + + if ((mod & 3) == 0) { + shift -= RandomX_CurrentConfig.Log2_ScratchpadL2; + mask_reg = 17; + } + else { + shift -= RandomX_CurrentConfig.Log2_ScratchpadL1; + mask_reg = 16; + } + + imm = static_cast(static_cast(imm << shift) >> shift); + + // 0-0x7FF, 0xFFFFF800-0xFFFFFFFF fit into 12 bit (a single addi instruction) + if (imm - 0xFFFFF800U < 0x1000U) { + // addi x5, x20 + src, imm + emit32(0x000A0293 + (src << 15) + (imm << 20)); + } + else { + imm_to_x5(imm, p); + // c.add x5, x20 + src + emit16(0x92D2 + (src << 2)); + } + + // and x5, x5, mask_reg + emit32(0x0002F2B3 + (mask_reg << 20)); + // c.add x5, x12 + emit16(0x92B2); + // ld x5, 0(x5) + emit32(0x0002B283); +} + +void* generateProgramVectorRV64(uint8_t* buf, Program& prog, ProgramConfiguration& pcfg, const uint8_t (&inst_map)[256], void* entryDataInitScalar, uint32_t datasetOffset) +{ + uint64_t* params = (uint64_t*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_params)); + + params[0] = RandomX_CurrentConfig.ScratchpadL1_Size - 8; + params[1] = RandomX_CurrentConfig.ScratchpadL2_Size - 8; + params[2] = RandomX_CurrentConfig.ScratchpadL3_Size - 8; + params[3] = RandomX_CurrentConfig.DatasetBaseSize - 64; + params[4] = (1 << 
RandomX_ConfigurationBase::JumpBits) - 1; + + uint64_t* imul_rcp_literals = (uint64_t*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_imul_rcp_literals)); + uint64_t* cur_literal = imul_rcp_literals; + + uint32_t* spaddr_xor = (uint32_t*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_main_loop_spaddr_xor)); + uint32_t* spaddr_xor2 = (uint32_t*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_scratchpad_prefetch)); + uint32_t* mx_xor = (uint32_t*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_main_loop_mx_xor)); + uint32_t* mx_xor_light = (uint32_t*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_main_loop_mx_xor_light_mode)); + + *spaddr_xor = 0x014A47B3 + (pcfg.readReg0 << 15) + (pcfg.readReg1 << 20); // xor x15, readReg0, readReg1 + *spaddr_xor2 = 0x014A42B3 + (pcfg.readReg0 << 15) + (pcfg.readReg1 << 20); // xor x5, readReg0, readReg1 + const uint32_t mx_xor_value = 0x014A42B3 + (pcfg.readReg2 << 15) + (pcfg.readReg3 << 20); // xor x5, readReg2, readReg3 + + *mx_xor = mx_xor_value; + *mx_xor_light = mx_xor_value; + + if (entryDataInitScalar) { + void* light_mode_data = buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_main_loop_light_mode_data); + + const uint64_t data[2] = { reinterpret_cast(entryDataInitScalar), datasetOffset }; + memcpy(light_mode_data, &data, sizeof(data)); + } + + uint8_t* p = (uint8_t*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_main_loop_instructions)); + + // 57C8025E vmv.v.x v16, x5 + // 57A9034B vsext.vf2 v18, v16 + // 5798214B vfcvt.f.x.v v16, v18 + static constexpr uint8_t group_f_convert[] = { + 0x57, 0xC8, 0x02, 0x5E, 0x57, 0xA9, 0x03, 0x4B, 0x57, 0x98, 0x21, 0x4B + }; + + // 57080627 vand.vv v16, v16, v12 + // 5788062B vor.vv v16, v16, v13 + static constexpr uint8_t group_e_post_process[] = { 0x57, 0x08, 0x06, 0x27, 0x57, 
0x88, 0x06, 0x2B }; + + uint8_t* last_modified[RegistersCount] = { p, p, p, p, p, p, p, p }; + + uint8_t readReg01[RegistersCount] = {}; + + readReg01[pcfg.readReg0] = 1; + readReg01[pcfg.readReg1] = 1; + + uint32_t scratchpad_prefetch_pos = 0; + + for (int32_t i = static_cast(prog.getSize()) - 1; i >= 0; --i) { + Instruction instr = prog(i); + + const InstructionType inst_type = static_cast(inst_map[instr.opcode]); + + if (inst_type == InstructionType::CBRANCH) { + scratchpad_prefetch_pos = i; + break; + } + + if (inst_type < InstructionType::FSWAP_R) { + const uint32_t src = instr.src % RegistersCount; + const uint32_t dst = instr.dst % RegistersCount; + + if ((inst_type == InstructionType::ISWAP_R) && (src != dst) && (readReg01[src] || readReg01[dst])) { + scratchpad_prefetch_pos = i; + break; + } + + if ((inst_type == InstructionType::IMUL_RCP) && readReg01[dst] && !isZeroOrPowerOf2(instr.getImm32())) { + scratchpad_prefetch_pos = i; + break; + } + + if (readReg01[dst]) { + scratchpad_prefetch_pos = i; + break; + } + } + } + + for (uint32_t i = 0, n = prog.getSize(); i < n; ++i) { + Instruction instr = prog(i); + + uint32_t src = instr.src % RegistersCount; + uint32_t dst = instr.dst % RegistersCount; + const uint32_t shift = instr.getModShift(); + uint32_t imm = instr.getImm32(); + const uint32_t mod = instr.mod; + + switch (static_cast(inst_map[instr.opcode])) { + case InstructionType::IADD_RS: + if (shift == 0) { + // c.add x20 + dst, x20 + src + emit16(0x9A52 + (src << 2) + (dst << 7)); + } + else { +#ifdef __riscv_zba + // sh{shift}add x20 + dst, x20 + src, x20 + dst + emit32(0x214A0A33 + (shift << 13) + (dst << 7) + (src << 15) + (dst << 20)); +#else // __riscv_zba + // slli x5, x20 + src, shift + emit32(0x000A1293 + (src << 15) + (shift << 20)); + // c.add x20 + dst, x5 + emit16(0x9A16 + (dst << 7)); +#endif // __riscv_zba + } + if (dst == RegisterNeedsDisplacement) { + imm_to_x5(imm, p); + + // c.add x20 + dst, x5 + emit16(0x9A16 + (dst << 7)); + } + + 
last_modified[dst] = p; + break; + + case InstructionType::IADD_M: + loadFromScratchpad(src, dst, mod, imm, p); + // c.add x20 + dst, x5 + emit16(0x9A16 + (dst << 7)); + + last_modified[dst] = p; + break; + + case InstructionType::ISUB_R: + if (src != dst) { + // sub x20 + dst, x20 + dst, x20 + src + emit32(0x414A0A33 + (dst << 7) + (dst << 15) + (src << 20)); + } + else { + imm_to_x5(-imm, p); + // c.add x20 + dst, x5 + emit16(0x9A16 + (dst << 7)); + } + + last_modified[dst] = p; + break; + + case InstructionType::ISUB_M: + loadFromScratchpad(src, dst, mod, imm, p); + // sub x20 + dst, x20 + dst, x5 + emit32(0x405A0A33 + (dst << 7) + (dst << 15)); + + last_modified[dst] = p; + break; + + case InstructionType::IMUL_R: + if (src != dst) { + // mul x20 + dst, x20 + dst, x20 + src + emit32(0x034A0A33 + (dst << 7) + (dst << 15) + (src << 20)); + } + else { + imm_to_x5(imm, p); + // mul x20 + dst, x20 + dst, x5 + emit32(0x025A0A33 + (dst << 7) + (dst << 15)); + } + + last_modified[dst] = p; + break; + + case InstructionType::IMUL_M: + loadFromScratchpad(src, dst, mod, imm, p); + // mul x20 + dst, x20 + dst, x5 + emit32(0x025A0A33 + (dst << 7) + (dst << 15)); + + last_modified[dst] = p; + break; + + case InstructionType::IMULH_R: + // mulhu x20 + dst, x20 + dst, x20 + src + emit32(0x034A3A33 + (dst << 7) + (dst << 15) + (src << 20)); + + last_modified[dst] = p; + break; + + case InstructionType::IMULH_M: + loadFromScratchpad(src, dst, mod, imm, p); + // mulhu x20 + dst, x20 + dst, x5 + emit32(0x025A3A33 + (dst << 7) + (dst << 15)); + + last_modified[dst] = p; + break; + + case InstructionType::ISMULH_R: + // mulh x20 + dst, x20 + dst, x20 + src + emit32(0x034A1A33 + (dst << 7) + (dst << 15) + (src << 20)); + + last_modified[dst] = p; + break; + + case InstructionType::ISMULH_M: + loadFromScratchpad(src, dst, mod, imm, p); + // mulh x20 + dst, x20 + dst, x5 + emit32(0x025A1A33 + (dst << 7) + (dst << 15)); + + last_modified[dst] = p; + break; + + case 
InstructionType::IMUL_RCP: + if (!isZeroOrPowerOf2(imm)) { + const uint64_t offset = (cur_literal - imul_rcp_literals) * 8; + *(cur_literal++) = randomx_reciprocal_fast(imm); + + static constexpr uint32_t rcp_regs[26] = { + /* Integer */ 8, 10, 28, 29, 30, 31, + /* Float */ 0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15, 16, 17, 28, 29, 30, 31 + }; + + if (offset < 6 * 8) { + // mul x20 + dst, x20 + dst, rcp_reg + emit32(0x020A0A33 + (dst << 7) + (dst << 15) + (rcp_regs[offset / 8] << 20)); + } + else if (offset < 26 * 8) { + // fmv.x.d x5, rcp_reg + emit32(0xE20002D3 + (rcp_regs[offset / 8] << 15)); + // mul x20 + dst, x20 + dst, x5 + emit32(0x025A0A33 + (dst << 7) + (dst << 15)); + } + else { + // ld x5, offset(x18) + emit32(0x00093283 + (offset << 20)); + // mul x20 + dst, x20 + dst, x5 + emit32(0x025A0A33 + (dst << 7) + (dst << 15)); + } + + last_modified[dst] = p; + } + break; + + case InstructionType::INEG_R: + // sub x20 + dst, x0, x20 + dst + emit32(0x41400A33 + (dst << 7) + (dst << 20)); + + last_modified[dst] = p; + break; + + case InstructionType::IXOR_R: + if (src != dst) { + // xor x20 + dst, x20 + dst, x20 + src + emit32(0x014A4A33 + (dst << 7) + (dst << 15) + (src << 20)); + } + else { + imm_to_x5(imm, p); + // xor x20, x20, x5 + emit32(0x005A4A33 + (dst << 7) + (dst << 15)); + } + + last_modified[dst] = p; + break; + + case InstructionType::IXOR_M: + loadFromScratchpad(src, dst, mod, imm, p); + // xor x20, x20, x5 + emit32(0x005A4A33 + (dst << 7) + (dst << 15)); + + last_modified[dst] = p; + break; + +#ifdef __riscv_zbb + case InstructionType::IROR_R: + if (src != dst) { + // ror x20 + dst, x20 + dst, x20 + src + emit32(0x614A5A33 + (dst << 7) + (dst << 15) + (src << 20)); + } + else { + // rori x20 + dst, x20 + dst, imm + emit32(0x600A5A13 + (dst << 7) + (dst << 15) + ((imm & 63) << 20)); + } + + last_modified[dst] = p; + break; + + case InstructionType::IROL_R: + if (src != dst) { + // rol x20 + dst, x20 + dst, x20 + src + emit32(0x614A1A33 + (dst 
<< 7) + (dst << 15) + (src << 20)); + } + else { + // rori x20 + dst, x20 + dst, -imm + emit32(0x600A5A13 + (dst << 7) + (dst << 15) + ((-imm & 63) << 20)); + } + + last_modified[dst] = p; + break; +#else // __riscv_zbb + case InstructionType::IROR_R: + if (src != dst) { + // sub x5, x0, x20 + src + emit32(0x414002B3 + (src << 20)); + // srl x6, x20 + dst, x20 + src + emit32(0x014A5333 + (dst << 15) + (src << 20)); + // sll x20 + dst, x20 + dst, x5 + emit32(0x005A1A33 + (dst << 7) + (dst << 15)); + // or x20 + dst, x20 + dst, x6 + emit32(0x006A6A33 + (dst << 7) + (dst << 15)); + } + else { + // srli x5, x20 + dst, imm + emit32(0x000A5293 + (dst << 15) + ((imm & 63) << 20)); + // slli x6, x20 + dst, -imm + emit32(0x000A1313 + (dst << 15) + ((-imm & 63) << 20)); + // or x20 + dst, x5, x6 + emit32(0x0062EA33 + (dst << 7)); + } + + last_modified[dst] = p; + break; + + case InstructionType::IROL_R: + if (src != dst) { + // sub x5, x0, x20 + src + emit32(0x414002B3 + (src << 20)); + // sll x6, x20 + dst, x20 + src + emit32(0x014A1333 + (dst << 15) + (src << 20)); + // srl x20 + dst, x20 + dst, x5 + emit32(0x005A5A33 + (dst << 7) + (dst << 15)); + // or x20 + dst, x20 + dst, x6 + emit32(0x006A6A33 + (dst << 7) + (dst << 15)); + } + else { + // srli x5, x20 + dst, -imm + emit32(0x000A5293 + (dst << 15) + ((-imm & 63) << 20)); + // slli x6, x20 + dst, imm + emit32(0x000A1313 + (dst << 15) + ((imm & 63) << 20)); + // or x20 + dst, x5, x6 + emit32(0x0062EA33 + (dst << 7)); + } + + last_modified[dst] = p; + break; +#endif // __riscv_zbb + + case InstructionType::ISWAP_R: + if (src != dst) { + // c.mv x5, x20 + dst + emit16(0x82D2 + (dst << 2)); + // c.mv x20 + dst, x20 + src + emit16(0x8A52 + (src << 2) + (dst << 7)); + // c.mv x20 + src, x5 + emit16(0x8A16 + (src << 7)); + + last_modified[src] = p; + last_modified[dst] = p; + } + break; + + case InstructionType::FSWAP_R: + // vmv.x.s x5, v0 + dst + emit32(0x420022D7 + (dst << 20)); + // vslide1down.vx v0 + dst, v0 + dst, x5 + 
emit32(0x3E02E057 + (dst << 7) + (dst << 20)); + break; + + case InstructionType::FADD_R: + src %= RegisterCountFlt; + dst %= RegisterCountFlt; + + // vfadd.vv v0 + dst, v0 + dst, v8 + src + emit32(0x02041057 + (dst << 7) + (src << 15) + (dst << 20)); + break; + + case InstructionType::FADD_M: + dst %= RegisterCountFlt; + + loadFromScratchpad(src, RegistersCount, mod, imm, p); + emit_data(group_f_convert); + + // vfadd.vv v0 + dst, v0 + dst, v16 + emit32(0x02081057 + (dst << 7) + (dst << 20)); + break; + + case InstructionType::FSUB_R: + src %= RegisterCountFlt; + dst %= RegisterCountFlt; + + // vfsub.vv v0 + dst, v0 + dst, v8 + src + emit32(0x0A041057 + (dst << 7) + (src << 15) + (dst << 20)); + break; + + case InstructionType::FSUB_M: + dst %= RegisterCountFlt; + + loadFromScratchpad(src, RegistersCount, mod, imm, p); + emit_data(group_f_convert); + + // vfsub.vv v0 + dst, v0 + dst, v16 + emit32(0x0A081057 + (dst << 7) + (dst << 20)); + break; + + case InstructionType::FSCAL_R: + dst %= RegisterCountFlt; + + // vxor.vv v0, v0, v14 + emit32(0x2E070057 + (dst << 7) + (dst << 20)); + break; + + case InstructionType::FMUL_R: + src %= RegisterCountFlt; + dst %= RegisterCountFlt; + + // vfmul.vv v4 + dst, v4 + dst, v8 + src + emit32(0x92441257 + (dst << 7) + (src << 15) + (dst << 20)); + break; + + case InstructionType::FDIV_M: + dst %= RegisterCountFlt; + + loadFromScratchpad(src, RegistersCount, mod, imm, p); + emit_data(group_f_convert); + emit_data(group_e_post_process); + + // vfdiv.vv v0 + dst, v0 + dst, v16 + emit32(0x82481257 + (dst << 7) + (dst << 20)); + break; + + case InstructionType::FSQRT_R: + dst %= RegisterCountFlt; + + // vfsqrt.v v4 + dst, v4 + dst + emit32(0x4E401257 + (dst << 7) + (dst << 20)); + break; + + case InstructionType::CBRANCH: + { + const uint32_t shift = (mod >> 4) + RandomX_ConfigurationBase::JumpOffset; + + imm |= (1UL << shift); + + if (RandomX_ConfigurationBase::JumpOffset > 0 || shift > 0) { + imm &= ~(1UL << (shift - 1)); + } + + 
// slli x6, x7, shift + // x6 = branchMask + emit32(0x00039313 + (shift << 20)); + + // x5 = imm + imm_to_x5(imm, p); + + // c.add x20 + dst, x5 + emit16(0x9A16 + (dst << 7)); + + // and x5, x20 + dst, x6 + emit32(0x006A72B3 + (dst << 15)); + + const int offset = static_cast<int>(last_modified[dst] - p); + + if (offset >= -4096) { + // beqz x5, offset + const uint32_t k = static_cast<uint32_t>(offset); + emit32(0x80028063 | ((k & 0x1E) << 7) | ((k & 0x7E0) << 20) | ((k & 0x800) >> 4)); + } + else { + // bnez x5, 8 + emit32(0x00029463); + // j offset + const uint32_t k = static_cast<uint32_t>(offset - 4); + emit32(0x8000006F | ((k & 0x7FE) << 20) | ((k & 0x800) << 9) | (k & 0xFF000)); + } + + for (uint32_t j = 0; j < RegistersCount; ++j) { + last_modified[j] = p; + } + } + break; + + case InstructionType::CFROUND: + if ((imm - 1) & 63) { +#ifdef __riscv_zbb + // rori x5, x20 + src, imm - 1 + emit32(0x600A5293 + (src << 15) + (((imm - 1) & 63) << 20)); +#else // __riscv_zbb + // srli x5, x20 + src, imm - 1 + emit32(0x000A5293 + (src << 15) + (((imm - 1) & 63) << 20)); + // slli x6, x20 + src, 1 - imm + emit32(0x000A1313 + (src << 15) + (((1 - imm) & 63) << 20)); + // or x5, x5, x6 + emit32(0x0062E2B3); +#endif // __riscv_zbb + + // andi x5, x5, 6 + emit32(0x0062F293); + } + else { + // andi x5, x20 + src, 6 + emit32(0x006A7293 + (src << 15)); + } + + // li x6, 01111000b + // x6 = CFROUND lookup table + emit32(0x07800313); + // srl x5, x6, x5 + emit32(0x005352B3); + // andi x5, x5, 3 + emit32(0x0032F293); + // csrw frm, x5 + emit32(0x00229073); + break; + + case InstructionType::ISTORE: + { + uint32_t mask_reg; + uint32_t shift = 32; + + if ((mod >> 4) >= 14) { + shift -= RandomX_CurrentConfig.Log2_ScratchpadL3; + mask_reg = 1; // x1 = L3 mask + } + else { + if ((mod & 3) == 0) { + shift -= RandomX_CurrentConfig.Log2_ScratchpadL2; + mask_reg = 17; // x17 = L2 mask + } + else { + shift -= RandomX_CurrentConfig.Log2_ScratchpadL1; + mask_reg = 16; // x16 = L1 mask + } + } + + imm = 
static_cast<uint32_t>(static_cast<int32_t>(imm << shift) >> shift); + imm_to_x5(imm, p); + + // c.add x5, x20 + dst + emit16(0x92D2 + (dst << 2)); + // and x5, x5, x0 + mask_reg + emit32(0x0002F2B3 + (mask_reg << 20)); + // c.add x5, x12 + emit16(0x92B2); + // sd x20 + src, 0(x5) + emit32(0x0142B023 + (src << 20)); + } + break; + + default: + UNREACHABLE; + } + + // Prefetch scratchpad lines for the next main loop iteration + // scratchpad_prefetch_pos is a conservative estimate of the earliest place in the code where we can do it + if (i == scratchpad_prefetch_pos) { + uint8_t* e = (uint8_t*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_scratchpad_prefetch_end)); + const size_t n = e - ((uint8_t*)spaddr_xor2); + + memcpy(p, spaddr_xor2, n); + p += n; + } + } + + const uint8_t* e; + + if (entryDataInitScalar) { + // Emit "J randomx_riscv64_vector_program_main_loop_instructions_end_light_mode" instruction + e = buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_main_loop_instructions_end_light_mode); + } + else { + // Emit "J randomx_riscv64_vector_program_main_loop_instructions_end" instruction + e = buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_main_loop_instructions_end); + } + + const uint32_t k = e - p; + emit32(0x6F | ((k & 0x7FE) << 20) | ((k & 0x800) << 9) | (k & 0xFF000)); + +#ifdef __GNUC__ + char* p1 = (char*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_params)); + char* p2 = (char*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_end)); + + __builtin___clear_cache(p1, p2); +#endif + + return buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_begin); } } // namespace randomx diff --git a/src/crypto/randomx/jit_compiler_rv64_vector.h b/src/crypto/randomx/jit_compiler_rv64_vector.h index ea06862e5..d7a34ee2c 100644 --- a/src/crypto/randomx/jit_compiler_rv64_vector.h +++ 
b/src/crypto/randomx/jit_compiler_rv64_vector.h @@ -36,7 +36,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace randomx { class SuperscalarProgram; +struct ProgramConfiguration; +class Program; -void* generateDatasetInitVectorRV64(uint8_t* buf, size_t buf_size, SuperscalarProgram* programs, size_t num_programs); +void* generateDatasetInitVectorRV64(uint8_t* buf, SuperscalarProgram* programs, size_t num_programs); +void* generateProgramVectorRV64(uint8_t* buf, Program& prog, ProgramConfiguration& pcfg, const uint8_t (&inst_map)[256], void* entryDataInitScalar, uint32_t datasetOffset); } // namespace randomx diff --git a/src/crypto/randomx/jit_compiler_rv64_vector_static.S b/src/crypto/randomx/jit_compiler_rv64_vector_static.S index ac63c625f..26409724e 100644 --- a/src/crypto/randomx/jit_compiler_rv64_vector_static.S +++ b/src/crypto/randomx/jit_compiler_rv64_vector_static.S @@ -46,9 +46,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .text -.option arch, rv64gcv_zicbop +#ifndef __riscv_v +#error This file requires rv64gcv +#endif + .option pic +.global DECL(randomx_riscv64_vector_code_begin) + .global DECL(randomx_riscv64_vector_sshash_begin) .global DECL(randomx_riscv64_vector_sshash_imul_rcp_literals) .global DECL(randomx_riscv64_vector_sshash_dataset_init) @@ -56,11 +61,30 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.global DECL(randomx_riscv64_vector_sshash_generated_instructions_end) .global DECL(randomx_riscv64_vector_sshash_cache_prefetch) .global DECL(randomx_riscv64_vector_sshash_xor) -.global DECL(randomx_riscv64_vector_sshash_set_cache_index) .global DECL(randomx_riscv64_vector_sshash_end) +.global DECL(randomx_riscv64_vector_program_params) +.global DECL(randomx_riscv64_vector_program_imul_rcp_literals) +.global DECL(randomx_riscv64_vector_program_begin) +.global DECL(randomx_riscv64_vector_program_main_loop_instructions) +.global DECL(randomx_riscv64_vector_program_main_loop_instructions_end) +.global DECL(randomx_riscv64_vector_program_main_loop_mx_xor) +.global DECL(randomx_riscv64_vector_program_main_loop_spaddr_xor) + +.global DECL(randomx_riscv64_vector_program_main_loop_light_mode_data) +.global DECL(randomx_riscv64_vector_program_main_loop_instructions_end_light_mode) +.global DECL(randomx_riscv64_vector_program_main_loop_mx_xor_light_mode) +.global DECL(randomx_riscv64_vector_program_scratchpad_prefetch) +.global DECL(randomx_riscv64_vector_program_scratchpad_prefetch_end) + +.global DECL(randomx_riscv64_vector_program_end) + +.global DECL(randomx_riscv64_vector_code_end) + .balign 8 +DECL(randomx_riscv64_vector_code_begin): + DECL(randomx_riscv64_vector_sshash_begin): sshash_constant_0: .dword 6364136223846793005 @@ -104,8 +128,7 @@ v19 = dataset item store offsets DECL(randomx_riscv64_vector_sshash_dataset_init): // Process 4 64-bit values at a time - li x5, 4 - vsetvli x5, x5, e64, m1, ta, ma + vsetivli zero, 4, e64, m1, ta, ma // Load cache->memory pointer ld x10, (x10) @@ -182,7 +205,6 @@ DECL(randomx_riscv64_vector_sshash_generated_instructions): // Step 4. randomx_riscv64_vector_sshash_cache_prefetch // Step 5. SuperscalarHash[i] // Step 6. randomx_riscv64_vector_sshash_xor - // Step 7. 
randomx_riscv64_vector_sshash_set_cache_index // // Above steps will be repeated RANDOMX_CACHE_ACCESSES times .fill RANDOMX_CACHE_ACCESSES * 2048, 4, 0 @@ -228,22 +250,38 @@ DECL(randomx_riscv64_vector_sshash_cache_prefetch): // Prefetch element 0 vmv.x.s x5, v9 +#ifdef __riscv_zicbop prefetch.r (x5) +#else + ld x5, (x5) +#endif // Prefetch element 1 vslidedown.vi v18, v9, 1 vmv.x.s x5, v18 +#ifdef __riscv_zicbop prefetch.r (x5) +#else + ld x5, (x5) +#endif // Prefetch element 2 vslidedown.vi v18, v9, 2 vmv.x.s x5, v18 +#ifdef __riscv_zicbop prefetch.r (x5) +#else + ld x5, (x5) +#endif // Prefetch element 3 vslidedown.vi v18, v9, 3 vmv.x.s x5, v18 +#ifdef __riscv_zicbop prefetch.r (x5) +#else + ld x5, (x5) +#endif // v9 = byte offset into cache->memory vsub.vx v9, v9, x10 @@ -281,16 +319,556 @@ DECL(randomx_riscv64_vector_sshash_xor): vluxei64.v v18, (x5), v9 vxor.vv v7, v7, v18 -// Step 7. Set cacheIndex to the value of the register that has the longest dependency chain in the SuperscalarHash function executed in step 5. 
-DECL(randomx_riscv64_vector_sshash_set_cache_index): - // JIT compiler will pick a single instruction reading from the required register - vmv.v.v v9, v0 - vmv.v.v v9, v1 - vmv.v.v v9, v2 - vmv.v.v v9, v3 - vmv.v.v v9, v4 - vmv.v.v v9, v5 - vmv.v.v v9, v6 - vmv.v.v v9, v7 - DECL(randomx_riscv64_vector_sshash_end): + +/* +Reference: https://github.com/tevador/RandomX/blob/master/doc/specs.md#46-vm-execution + +C declarations: + +struct RegisterFile { + uint64_t r[8]; + double f[4][2]; + double e[4][2]; + double a[4][2]; +}; + +struct MemoryRegisters { + uint32_t mx, ma; + uint8_t* memory; // dataset (fast mode) or cache (light mode) +}; + +void ProgramFunc(RegisterFile* reg, MemoryRegisters* mem, uint8_t* scratchpad, uint64_t iterations); + +Register layout +--------------- +x0 = zero +x1 = scratchpad L3 mask +x2 = stack pointer +x3 = global pointer (unused) +x4 = thread pointer (unused) +x5 = temporary +x6 = temporary +x7 = branch mask (unshifted) +x8 = frame pointer, also 64-bit literal inside the loop +x9 = scratchpad L3 mask (64-byte aligned) +x10 = RegisterFile* reg, also 64-bit literal inside the loop +x11 = MemoryRegisters* mem, then dataset/cache pointer +x12 = scratchpad +x13 = iterations +x14 = mx, ma (always stored with dataset mask applied) +x15 = spAddr0, spAddr1 +x16 = scratchpad L1 mask +x17 = scratchpad L2 mask +x18 = IMUL_RCP literals pointer +x19 = dataset mask +x20-x27 = r0-r7 +x28-x31 = 64-bit literals + +f0-f7 = 64-bit literals +f10-f17 = 64-bit literals +f28-f31 = 64-bit literals + +v0-v3 = f0-f3 +v4-v7 = e0-e3 +v8-v11 = a0-a3 +v12 = E 'and' mask = 0x00ffffffffffffff'00ffffffffffffff +v13 = E 'or' mask = 0x3*00000000******'3*00000000****** +v14 = scale mask = 0x80f0000000000000'80f0000000000000 + +v15 = unused +v16 = temporary +v17 = unused +v18 = temporary + +v19-v31 = unused +*/ + +.balign 8 + +DECL(randomx_riscv64_vector_program_params): + +// JIT compiler will adjust these values for different RandomX variants +randomx_masks: .dword 16376, 
262136, 2097144, 2147483584, 255 + +DECL(randomx_riscv64_vector_program_imul_rcp_literals): + +imul_rcp_literals: .fill RANDOMX_PROGRAM_MAX_SIZE, 8, 0 + +DECL(randomx_riscv64_vector_program_begin): + addi sp, sp, -112 + sd x8, 96(sp) // save old frame pointer + addi x8, sp, 112 // setup new frame pointer + sd x1, 104(sp) // save return address + + // Save callee-saved registers + sd x9, 0(sp) + sd x18, 8(sp) + sd x19, 16(sp) + sd x20, 24(sp) + sd x21, 32(sp) + sd x22, 40(sp) + sd x23, 48(sp) + sd x24, 56(sp) + sd x25, 64(sp) + sd x26, 72(sp) + sd x27, 80(sp) + + // Save x10 as it will be used as an IMUL_RCP literal + sd x10, 88(sp) + + // Load mx, ma and dataset pointer + ld x14, (x11) + ld x11, 8(x11) + + // Initialize spAddr0-spAddr1 + mv x15, x14 + + // Set registers r0-r7 to zero + li x20, 0 + li x21, 0 + li x22, 0 + li x23, 0 + li x24, 0 + li x25, 0 + li x26, 0 + li x27, 0 + + // Load masks + lla x5, randomx_masks + ld x16, 0(x5) + ld x17, 8(x5) + ld x1, 16(x5) + ld x19, 24(x5) + ld x7, 32(x5) + addi x9, x1, -56 + + // Set vector registers to 2x64 bit + vsetivli zero, 2, e64, m1, ta, ma + + // Apply dataset mask to mx, ma + slli x5, x19, 32 + or x5, x5, x19 + and x14, x14, x5 + + // Load group A registers + addi x5, x10, 192 + vle64.v v8, (x5) + + addi x5, x10, 208 + vle64.v v9, (x5) + + addi x5, x10, 224 + vle64.v v10, (x5) + + addi x5, x10, 240 + vle64.v v11, (x5) + + // Load E 'and' mask + vmv.v.i v12, -1 + vsrl.vi v12, v12, 8 + + // Load E 'or' mask (stored in reg.f[0]) + addi x5, x10, 64 + vle64.v v13, (x5) + + // Load scale mask + lui x5, 0x80f00 + slli x5, x5, 32 + vmv.v.x v14, x5 + + // IMUL_RCP literals pointer + lla x18, imul_rcp_literals + + // Load IMUL_RCP literals + ld x8, 0(x18) + ld x10, 8(x18) + ld x28, 16(x18) + ld x29, 24(x18) + ld x30, 32(x18) + ld x31, 40(x18) + fld f0, 48(x18) + fld f1, 56(x18) + fld f2, 64(x18) + fld f3, 72(x18) + fld f4, 80(x18) + fld f5, 88(x18) + fld f6, 96(x18) + fld f7, 104(x18) + fld f10, 112(x18) + fld f11, 
120(x18) + fld f12, 128(x18) + fld f13, 136(x18) + fld f14, 144(x18) + fld f15, 152(x18) + fld f16, 160(x18) + fld f17, 168(x18) + fld f28, 176(x18) + fld f29, 184(x18) + fld f30, 192(x18) + fld f31, 200(x18) + +randomx_riscv64_vector_program_main_loop: + and x5, x15, x9 // x5 = spAddr0 & 64-byte aligned L3 mask + add x5, x5, x12 // x5 = &scratchpad[spAddr0 & 64-byte aligned L3 mask] + + // read a 64-byte line from scratchpad (indexed by spAddr0) and XOR it with r0-r7 + ld x6, 0(x5) + xor x20, x20, x6 + ld x6, 8(x5) + xor x21, x21, x6 + ld x6, 16(x5) + xor x22, x22, x6 + ld x6, 24(x5) + xor x23, x23, x6 + ld x6, 32(x5) + xor x24, x24, x6 + ld x6, 40(x5) + xor x25, x25, x6 + ld x6, 48(x5) + xor x26, x26, x6 + ld x6, 56(x5) + xor x27, x27, x6 + + srli x5, x15, 32 // x5 = spAddr1 + and x5, x5, x9 // x5 = spAddr1 & 64-byte aligned L3 mask + add x5, x5, x12 // x5 = &scratchpad[spAddr1 & 64-byte aligned L3 mask] + + // read a 64-byte line from scratchpad (indexed by spAddr1) and initialize f0-f3, e0-e3 registers + + // Set vector registers to 2x32 bit + vsetivli zero, 2, e32, m1, ta, ma + + // load f0 + vle32.v v16, (x5) + vfwcvt.f.x.v v0, v16 + + // load f1 + addi x6, x5, 8 + vle32.v v1, (x6) + // Use v16 as an intermediary register because vfwcvt accepts only registers with even numbers here + vfwcvt.f.x.v v16, v1 + vmv1r.v v1, v16 + + // load f2 + addi x6, x5, 16 + vle32.v v16, (x6) + vfwcvt.f.x.v v2, v16 + + // load f3 + addi x6, x5, 24 + vle32.v v3, (x6) + vfwcvt.f.x.v v16, v3 + vmv1r.v v3, v16 + + // load e0 + addi x6, x5, 32 + vle32.v v16, (x6) + vfwcvt.f.x.v v4, v16 + + // load e1 + addi x6, x5, 40 + vle32.v v5, (x6) + vfwcvt.f.x.v v16, v5 + vmv1r.v v5, v16 + + // load e2 + addi x6, x5, 48 + vle32.v v16, (x6) + vfwcvt.f.x.v v6, v16 + + // load e3 + addi x6, x5, 56 + vle32.v v7, (x6) + vfwcvt.f.x.v v16, v7 + vmv1r.v v7, v16 + + // Set vector registers back to 2x64 bit + vsetivli zero, 2, e64, m1, ta, ma + + // post-process e0-e3 + vand.vv v4, v4, v12 + vand.vv v5, 
v5, v12 + vand.vv v6, v6, v12 + vand.vv v7, v7, v12 + + vor.vv v4, v4, v13 + vor.vv v5, v5, v13 + vor.vv v6, v6, v13 + vor.vv v7, v7, v13 + +DECL(randomx_riscv64_vector_program_main_loop_instructions): + // Generated by JIT compiler + // FDIV_M can generate up to 50 bytes of code (round it up to 52 - a multiple of 4) + // +32 bytes for the scratchpad prefetch and the final jump instruction + .fill RANDOMX_PROGRAM_MAX_SIZE * 52 + 32, 1, 0 + +DECL(randomx_riscv64_vector_program_main_loop_instructions_end): + // Calculate dataset pointer for dataset read + // Do it here to break false dependency from readReg2 and readReg3 (see below) + srli x6, x14, 32 // x6 = ma & dataset mask + +DECL(randomx_riscv64_vector_program_main_loop_mx_xor): + xor x5, x24, x26 // x5 = readReg2 ^ readReg3 (JIT compiler will substitute the actual registers) + + and x5, x5, x19 // x5 = (readReg2 ^ readReg3) & dataset mask + xor x14, x14, x5 // mx ^= (readReg2 ^ readReg3) & dataset mask + + and x5, x14, x19 // x5 = mx & dataset mask + add x5, x5, x11 // x5 = &dataset[mx & dataset mask] + +#ifdef __riscv_zicbop + prefetch.r (x5) +#else + ld x5, (x5) +#endif + + add x5, x6, x11 // x5 = &dataset[ma & dataset mask] + + // read a 64-byte line from dataset and XOR it with r0-r7 + ld x6, 0(x5) + xor x20, x20, x6 + ld x6, 8(x5) + xor x21, x21, x6 + ld x6, 16(x5) + xor x22, x22, x6 + ld x6, 24(x5) + xor x23, x23, x6 + ld x6, 32(x5) + xor x24, x24, x6 + ld x6, 40(x5) + xor x25, x25, x6 + ld x6, 48(x5) + xor x26, x26, x6 + ld x6, 56(x5) + xor x27, x27, x6 + +randomx_riscv64_vector_program_main_loop_swap_mx_ma: + // swap mx <-> ma +#ifdef __riscv_zbb + rori x14, x14, 32 +#else + srli x5, x14, 32 + slli x14, x14, 32 + or x14, x14, x5 +#endif + + srli x5, x15, 32 // x5 = spAddr1 + and x5, x5, x9 // x5 = spAddr1 & 64-byte aligned L3 mask + add x5, x5, x12 // x5 = &scratchpad[spAddr1 & 64-byte aligned L3 mask] + + // store registers r0-r7 to the scratchpad + sd x20, 0(x5) + sd x21, 8(x5) + sd x22, 16(x5) + sd 
x23, 24(x5) + sd x24, 32(x5) + sd x25, 40(x5) + sd x26, 48(x5) + sd x27, 56(x5) + + and x5, x15, x9 // x5 = spAddr0 & 64-byte aligned L3 mask + add x5, x5, x12 // x5 = &scratchpad[spAddr0 & 64-byte aligned L3 mask] + +DECL(randomx_riscv64_vector_program_main_loop_spaddr_xor): + xor x15, x20, x22 // spAddr0-spAddr1 = readReg0 ^ readReg1 (JIT compiler will substitute the actual registers) + + // store registers f0-f3 to the scratchpad (f0-f3 are first combined with e0-e3) + vxor.vv v0, v0, v4 + vxor.vv v1, v1, v5 + vxor.vv v2, v2, v6 + vxor.vv v3, v3, v7 + + vse64.v v0, (x5) + + addi x6, x5, 16 + vse64.v v1, (x6) + + addi x6, x5, 32 + vse64.v v2, (x6) + + addi x6, x5, 48 + vse64.v v3, (x6) + + addi x13, x13, -1 + beqz x13, randomx_riscv64_vector_program_main_loop_end + j randomx_riscv64_vector_program_main_loop + +randomx_riscv64_vector_program_main_loop_end: + // Restore x8 and x10 + addi x8, sp, 112 + ld x10, 88(sp) + + // Store integer registers + sd x20, 0(x10) + sd x21, 8(x10) + sd x22, 16(x10) + sd x23, 24(x10) + sd x24, 32(x10) + sd x25, 40(x10) + sd x26, 48(x10) + sd x27, 56(x10) + + // Store FP registers + addi x5, x10, 64 + vse64.v v0, (x5) + + addi x5, x10, 80 + vse64.v v1, (x5) + + addi x5, x10, 96 + vse64.v v2, (x5) + + addi x5, x10, 112 + vse64.v v3, (x5) + + addi x5, x10, 128 + vse64.v v4, (x5) + + addi x5, x10, 144 + vse64.v v5, (x5) + + addi x5, x10, 160 + vse64.v v6, (x5) + + addi x5, x10, 176 + vse64.v v7, (x5) + + // Restore callee-saved registers + ld x9, 0(sp) + ld x18, 8(sp) + ld x19, 16(sp) + ld x20, 24(sp) + ld x21, 32(sp) + ld x22, 40(sp) + ld x23, 48(sp) + ld x24, 56(sp) + ld x25, 64(sp) + ld x26, 72(sp) + ld x27, 80(sp) + + ld x8, 96(sp) // old frame pointer + ld x1, 104(sp) // return address + + addi sp, sp, 112 + + ret + +DECL(randomx_riscv64_vector_program_main_loop_light_mode_data): + // 1) Pointer to the scalar dataset init function + // 2) Dataset offset + .dword 0, 0 + 
+DECL(randomx_riscv64_vector_program_main_loop_instructions_end_light_mode): + // Calculate dataset pointer for dataset read + // Do it here to break false dependency from readReg2 and readReg3 (see below) + srli x6, x14, 32 // x6 = ma & dataset mask + +DECL(randomx_riscv64_vector_program_main_loop_mx_xor_light_mode): + xor x5, x24, x26 // x5 = readReg2 ^ readReg3 (JIT compiler will substitute the actual registers) + and x5, x5, x19 // x5 = (readReg2 ^ readReg3) & dataset mask + xor x14, x14, x5 // mx ^= (readReg2 ^ readReg3) & dataset mask + + // Save all registers modified when calling dataset_init_scalar_func_ptr + addi sp, sp, -192 + + // bytes [0, 127] - saved registers + // bytes [128, 191] - output buffer + + sd x1, 0(sp) + sd x7, 16(sp) + sd x10, 24(sp) + sd x11, 32(sp) + sd x12, 40(sp) + sd x13, 48(sp) + sd x14, 56(sp) + sd x15, 64(sp) + sd x16, 72(sp) + sd x17, 80(sp) + sd x28, 88(sp) + sd x29, 96(sp) + sd x30, 104(sp) + sd x31, 112(sp) + + // setup randomx_riscv64_vector_sshash_dataset_init's parameters + + // x10 = pointer to pointer to cache memory + // pointer to cache memory was saved in "sd x11, 32(sp)", so x10 = sp + 32 + addi x10, sp, 32 + + // x11 = output buffer (64 bytes) + addi x11, sp, 128 + + // x12 = start block + lla x5, randomx_riscv64_vector_program_main_loop_light_mode_data + ld x12, 8(x5) + add x12, x12, x6 + srli x12, x12, 6 + + // x13 = end block + addi x13, x12, 1 + + ld x5, 0(x5) + jalr x1, 0(x5) + + // restore registers + ld x1, 0(sp) + ld x7, 16(sp) + ld x10, 24(sp) + ld x11, 32(sp) + ld x12, 40(sp) + ld x13, 48(sp) + ld x14, 56(sp) + ld x15, 64(sp) + ld x16, 72(sp) + ld x17, 80(sp) + ld x28, 88(sp) + ld x29, 96(sp) + ld x30, 104(sp) + ld x31, 112(sp) + + // read a 64-byte line from dataset and XOR it with r0-r7 + ld x5, 128(sp) + xor x20, x20, x5 + ld x5, 136(sp) + xor x21, x21, x5 + ld x5, 144(sp) + xor x22, x22, x5 + ld x5, 152(sp) + xor x23, x23, x5 + ld x5, 160(sp) + xor x24, x24, x5 + ld x5, 168(sp) + xor x25, x25, x5 + ld 
x5, 176(sp) + xor x26, x26, x5 + ld x5, 184(sp) + xor x27, x27, x5 + + addi sp, sp, 192 + + j randomx_riscv64_vector_program_main_loop_swap_mx_ma + +DECL(randomx_riscv64_vector_program_scratchpad_prefetch): + xor x5, x20, x22 // spAddr0-spAddr1 = readReg0 ^ readReg1 (JIT compiler will substitute the actual registers) + srli x6, x5, 32 // x6 = spAddr1 + + and x5, x5, x9 // x5 = spAddr0 & 64-byte aligned L3 mask + and x6, x6, x9 // x6 = spAddr1 & 64-byte aligned L3 mask + + c.add x5, x12 // x5 = &scratchpad[spAddr0 & 64-byte aligned L3 mask] + c.add x6, x12 // x6 = &scratchpad[spAddr1 & 64-byte aligned L3 mask] + +#ifdef __riscv_zicbop + prefetch.r (x5) + prefetch.r (x6) +#else + ld x5, (x5) + ld x6, (x6) +#endif + +DECL(randomx_riscv64_vector_program_scratchpad_prefetch_end): + +DECL(randomx_riscv64_vector_program_end): + +DECL(randomx_riscv64_vector_code_end): diff --git a/src/crypto/randomx/jit_compiler_rv64_vector_static.h b/src/crypto/randomx/jit_compiler_rv64_vector_static.h index 09bab597e..a616f27f9 100644 --- a/src/crypto/randomx/jit_compiler_rv64_vector_static.h +++ b/src/crypto/randomx/jit_compiler_rv64_vector_static.h @@ -42,6 +42,8 @@ extern "C" { struct randomx_cache; +void randomx_riscv64_vector_code_begin(); + void randomx_riscv64_vector_sshash_begin(); void randomx_riscv64_vector_sshash_imul_rcp_literals(); void randomx_riscv64_vector_sshash_dataset_init(struct randomx_cache* cache, uint8_t* output_buf, uint32_t startBlock, uint32_t endBlock); @@ -50,9 +52,24 @@ void randomx_riscv64_vector_sshash_generated_instructions(); void randomx_riscv64_vector_sshash_generated_instructions_end(); void randomx_riscv64_vector_sshash_cache_prefetch(); void randomx_riscv64_vector_sshash_xor(); -void randomx_riscv64_vector_sshash_set_cache_index(); void randomx_riscv64_vector_sshash_end(); +void randomx_riscv64_vector_program_params(); +void randomx_riscv64_vector_program_imul_rcp_literals(); +void randomx_riscv64_vector_program_begin(); +void 
randomx_riscv64_vector_program_main_loop_instructions(); +void randomx_riscv64_vector_program_main_loop_instructions_end(); +void randomx_riscv64_vector_program_main_loop_mx_xor(); +void randomx_riscv64_vector_program_main_loop_spaddr_xor(); +void randomx_riscv64_vector_program_main_loop_light_mode_data(); +void randomx_riscv64_vector_program_main_loop_instructions_end_light_mode(); +void randomx_riscv64_vector_program_main_loop_mx_xor_light_mode(); +void randomx_riscv64_vector_program_end(); +void randomx_riscv64_vector_program_scratchpad_prefetch(); +void randomx_riscv64_vector_program_scratchpad_prefetch_end(); + +void randomx_riscv64_vector_code_end(); + #if defined(__cplusplus) } #endif diff --git a/src/crypto/randomx/randomx.cpp b/src/crypto/randomx/randomx.cpp index 1609a4af3..745cdf9aa 100644 --- a/src/crypto/randomx/randomx.cpp +++ b/src/crypto/randomx/randomx.cpp @@ -282,7 +282,10 @@ typedef void(randomx::JitCompilerX86::* InstructionGeneratorX86_2)(const randomx Log2_ScratchpadL2 = Log2(ScratchpadL2_Size); Log2_ScratchpadL3 = Log2(ScratchpadL3_Size); -#define JIT_HANDLE(x, prev) randomx::JitCompilerRV64::engine[k] = &randomx::JitCompilerRV64::v1_##x +#define JIT_HANDLE(x, prev) do { \ + randomx::JitCompilerRV64::engine[k] = &randomx::JitCompilerRV64::v1_##x; \ + randomx::JitCompilerRV64::inst_map[k] = static_cast<uint8_t>(randomx::InstructionType::x); \ + } while (0) #else #define JIT_HANDLE(x, prev) diff --git a/src/crypto/randomx/tests/riscv64_vector.s b/src/crypto/randomx/tests/riscv64_vector.s index ee4c234f7..1b4681a6f 100644 --- a/src/crypto/randomx/tests/riscv64_vector.s +++ b/src/crypto/randomx/tests/riscv64_vector.s @@ -1,12 +1,10 @@ -/* RISC-V - test if the vector extension and prefetch instruction are present */ +/* RISC-V - test if the vector extension is present */ .text -.option arch, rv64gcv_zicbop +.option arch, rv64gcv .global main main: - lla x5, main - prefetch.r (x5) li x5, 4 vsetvli x6, x5, e64, m1, ta, ma vxor.vv v0, v0, v0 diff --git 
a/src/crypto/randomx/tests/riscv64_zicbop.s b/src/crypto/randomx/tests/riscv64_zicbop.s new file mode 100644 index 000000000..e64b57d6e --- /dev/null +++ b/src/crypto/randomx/tests/riscv64_zicbop.s @@ -0,0 +1,11 @@ +/* RISC-V - test if the prefetch instruction is present */ + +.text +.option arch, rv64gc_zicbop +.global main + +main: + lla x5, main + prefetch.r (x5) + mv x10, x0 + ret diff --git a/src/crypto/randomx/vm_compiled.cpp b/src/crypto/randomx/vm_compiled.cpp index 842bfe651..1985e7cae 100644 --- a/src/crypto/randomx/vm_compiled.cpp +++ b/src/crypto/randomx/vm_compiled.cpp @@ -58,7 +58,7 @@ namespace randomx { void CompiledVm::execute() { PROFILE_SCOPE(RandomX_JIT_execute); -# ifdef XMRIG_ARM +# if defined(XMRIG_ARM) || defined(XMRIG_RISCV) memcpy(reg.f, config.eMask, sizeof(config.eMask)); # endif compiler.getProgramFunc()(reg, mem, scratchpad, RandomX_CurrentConfig.ProgramIterations);