From d82d7f3f2000e679a948a34f85ea82c9b6fae7f4 Mon Sep 17 00:00:00 2001 From: SChernykh <15806605+SChernykh@users.noreply.github.com> Date: Sat, 31 Jan 2026 21:50:38 +0100 Subject: [PATCH] RandomX v2 (RISC-V) --- cmake/randomx.cmake | 2 +- src/crypto/randomx/aes_hash.cpp | 8 +- src/crypto/randomx/aes_hash_rv64_vector.cpp | 80 ++---- src/crypto/randomx/aes_hash_rv64_vector.hpp | 7 - src/crypto/randomx/jit_compiler_rv64.cpp | 59 ++++- src/crypto/randomx/jit_compiler_rv64_static.S | 22 +- .../randomx/jit_compiler_rv64_static.hpp | 4 +- .../randomx/jit_compiler_rv64_vector.cpp | 76 +++++- .../randomx/jit_compiler_rv64_vector_static.S | 237 +++++++++++++++++- .../randomx/jit_compiler_rv64_vector_static.h | 6 + 10 files changed, 398 insertions(+), 103 deletions(-) diff --git a/cmake/randomx.cmake b/cmake/randomx.cmake index 82156709a..d33286784 100644 --- a/cmake/randomx.cmake +++ b/cmake/randomx.cmake @@ -110,7 +110,7 @@ if (WITH_RANDOMX) endif() endif() - set_source_files_properties(src/crypto/randomx/jit_compiler_rv64_vector_static.S PROPERTIES COMPILE_FLAGS "-march=${RV64_VECTOR_FILE_ARCH}") + set_source_files_properties(src/crypto/randomx/jit_compiler_rv64_vector_static.S PROPERTIES COMPILE_FLAGS "-march=${RV64_VECTOR_FILE_ARCH}_zvkned") set_source_files_properties(src/crypto/randomx/aes_hash_rv64_vector.cpp PROPERTIES COMPILE_FLAGS "-O3 -march=${RV64_VECTOR_FILE_ARCH}") set_source_files_properties(src/crypto/randomx/aes_hash_rv64_zvkned.cpp PROPERTIES COMPILE_FLAGS "-O3 -march=${RV64_VECTOR_FILE_ARCH}_zvkned") else() diff --git a/src/crypto/randomx/aes_hash.cpp b/src/crypto/randomx/aes_hash.cpp index 97b12cdf4..6cb081c17 100644 --- a/src/crypto/randomx/aes_hash.cpp +++ b/src/crypto/randomx/aes_hash.cpp @@ -75,7 +75,7 @@ void hashAes1Rx4(const void *input, size_t inputSize, void *hash) } if (xmrig::Cpu::info()->hasRISCV_Vector()) { - hashAes1Rx4_RVV(input, inputSize, hash); + hashAes1Rx4_RVV(input, inputSize, hash); return; } #endif @@ -156,7 +156,7 @@ void fillAes1Rx4(void *state, size_t outputSize, void *buffer) } if (xmrig::Cpu::info()->hasRISCV_Vector()) { - fillAes1Rx4_RVV(state, outputSize, buffer); + fillAes1Rx4_RVV(state, outputSize, buffer); return; } #endif @@ -213,7 +213,7 @@ void fillAes4Rx4(void *state, size_t outputSize, void *buffer) } if (xmrig::Cpu::info()->hasRISCV_Vector()) { - fillAes4Rx4_RVV(state, outputSize, buffer); + fillAes4Rx4_RVV(state, outputSize, buffer); return; } #endif @@ -297,7 +297,7 @@ void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, voi } if (xmrig::Cpu::info()->hasRISCV_Vector()) { - hashAndFillAes1Rx4_RVV(scratchpad, scratchpadSize, hash, fill_state); + hashAndFillAes1Rx4_RVV(scratchpad, scratchpadSize, hash, fill_state); return; } #endif diff --git a/src/crypto/randomx/aes_hash_rv64_vector.cpp b/src/crypto/randomx/aes_hash_rv64_vector.cpp index 76401f7f9..db541c9f8 100644 --- a/src/crypto/randomx/aes_hash_rv64_vector.cpp +++ b/src/crypto/randomx/aes_hash_rv64_vector.cpp @@ -69,7 +69,16 @@ static constexpr uint32_t AES_HASH_1R_XKEY11[8] = { 0x61b263d1, 0x51f4e03c, 0xee static constexpr uint32_t AES_HASH_STRIDE_X2[8] = { 0, 4, 8, 12, 32, 36, 40, 44 }; static constexpr uint32_t AES_HASH_STRIDE_X4[8] = { 0, 4, 8, 12, 64, 68, 72, 76 }; -template +#define lutEnc0 lutEnc[0] +#define lutEnc1 lutEnc[1] +#define lutEnc2 lutEnc[2] +#define lutEnc3 lutEnc[3] + +#define lutDec0 lutDec[0] +#define lutDec1 lutDec[1] +#define lutDec2 lutDec[2] +#define lutDec3 lutDec[3] + void hashAes1Rx4_RVV(const void *input, size_t inputSize, void *hash) { const uint8_t* inptr = (const uint8_t*)input; const uint8_t* inputEnd = inptr + inputSize; @@ -113,10 +122,6 @@ void hashAes1Rx4_RVV(const void *input, size_t inputSize, void *hash) { __riscv_vsuxei32_v_u32m1((uint32_t*)hash + 4, stride, state13, 8); } -template void hashAes1Rx4_RVV(const void *input, size_t inputSize, void *hash); -template void hashAes1Rx4_RVV(const void *input, size_t inputSize, void *hash); - -template void fillAes1Rx4_RVV(void *state, size_t outputSize, void *buffer) { const uint8_t* outptr = (uint8_t*)buffer; const uint8_t* outputEnd = outptr + outputSize; @@ -153,10 +158,6 @@ void fillAes1Rx4_RVV(void *state, size_t outputSize, void *buffer) { __riscv_vsuxei32_v_u32m1((uint32_t*)state + 4, stride, state13, 8); } -template void fillAes1Rx4_RVV(void *state, size_t outputSize, void *buffer); -template void fillAes1Rx4_RVV(void *state, size_t outputSize, void *buffer); - -template void fillAes4Rx4_RVV(void *state, size_t outputSize, void *buffer) { const uint8_t* outptr = (uint8_t*)buffer; const uint8_t* outputEnd = outptr + outputSize; @@ -203,10 +204,6 @@ void fillAes4Rx4_RVV(void *state, size_t outputSize, void *buffer) { } } -template void fillAes4Rx4_RVV(void *state, size_t outputSize, void *buffer); -template void fillAes4Rx4_RVV(void *state, size_t outputSize, void *buffer); - -template void hashAndFillAes1Rx4_RVV(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state) { uint8_t* scratchpadPtr = (uint8_t*)scratchpad; const uint8_t* scratchpadEnd = scratchpadPtr + scratchpadSize; @@ -244,54 +241,13 @@ void hashAndFillAes1Rx4_RVV(void *scratchpad, size_t scratchpadSize, void *hash, __riscv_vsuxei32_v_u32m1((uint32_t*)scratchpadPtr + k * 16 + 0, stride, fill_state02, 8); \ __riscv_vsuxei32_v_u32m1((uint32_t*)scratchpadPtr + k * 16 + 4, stride, fill_state13, 8); - switch (softAes) { - case 0: - HASH_STATE(0); - HASH_STATE(1); + HASH_STATE(0); + HASH_STATE(1); - FILL_STATE(0); - FILL_STATE(1); + FILL_STATE(0); + FILL_STATE(1); - scratchpadPtr += 128; - break; - - default: - switch (unroll) { - case 4: - HASH_STATE(0); - FILL_STATE(0); - - HASH_STATE(1); - FILL_STATE(1); - - HASH_STATE(2); - FILL_STATE(2); - - HASH_STATE(3); - FILL_STATE(3); - - scratchpadPtr += 64 * 4; - break; - - case 2: - HASH_STATE(0); - FILL_STATE(0); - - HASH_STATE(1); - FILL_STATE(1); - - scratchpadPtr += 64 * 2; - break; - - default: - HASH_STATE(0); - FILL_STATE(0); - - scratchpadPtr += 64; - break; - } - break; - } + scratchpadPtr += 128; } #undef HASH_STATE @@ -314,9 +270,3 @@ void hashAndFillAes1Rx4_RVV(void *scratchpad, size_t scratchpadSize, void *hash, __riscv_vsuxei32_v_u32m1((uint32_t*)hash + 0, stride, hash_state02, 8); __riscv_vsuxei32_v_u32m1((uint32_t*)hash + 4, stride, hash_state13, 8); } - -template void hashAndFillAes1Rx4_RVV<0,2>(void* scratchpad, size_t scratchpadSize, void* hash, void* fill_state); -template void hashAndFillAes1Rx4_RVV<1,1>(void* scratchpad, size_t scratchpadSize, void* hash, void* fill_state); -template void hashAndFillAes1Rx4_RVV<2,1>(void* scratchpad, size_t scratchpadSize, void* hash, void* fill_state); -template void hashAndFillAes1Rx4_RVV<2,2>(void* scratchpad, size_t scratchpadSize, void* hash, void* fill_state); -template void hashAndFillAes1Rx4_RVV<2,4>(void* scratchpad, size_t scratchpadSize, void* hash, void* fill_state); diff --git a/src/crypto/randomx/aes_hash_rv64_vector.hpp b/src/crypto/randomx/aes_hash_rv64_vector.hpp index d0101d662..c63e84ed7 100644 --- a/src/crypto/randomx/aes_hash_rv64_vector.hpp +++ b/src/crypto/randomx/aes_hash_rv64_vector.hpp @@ -29,14 +29,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma once -template void hashAes1Rx4_RVV(const void *input, size_t inputSize, void *hash); - -template void fillAes1Rx4_RVV(void *state, size_t outputSize, void *buffer); - -template void fillAes4Rx4_RVV(void *state, size_t outputSize, void *buffer); - -template void hashAndFillAes1Rx4_RVV(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state); diff --git a/src/crypto/randomx/jit_compiler_rv64.cpp b/src/crypto/randomx/jit_compiler_rv64.cpp index a3871bc84..230b22d1b 100644 --- a/src/crypto/randomx/jit_compiler_rv64.cpp +++ b/src/crypto/randomx/jit_compiler_rv64.cpp @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "crypto/randomx/program.hpp" #include "crypto/randomx/reciprocal.h" #include "crypto/randomx/virtual_memory.hpp" +#include "crypto/randomx/soft_aes.h" #include "crypto/common/VirtualMemory.h" @@ -253,10 +254,12 @@ namespace randomx { static const uint8_t* codePrologue = (uint8_t*)&randomx_riscv64_prologue; static const uint8_t* codeLoopBegin = (uint8_t*)&randomx_riscv64_loop_begin; static const uint8_t* codeDataRead = (uint8_t*)&randomx_riscv64_data_read; + static const uint8_t* codeDataRead2 = (uint8_t*)&randomx_riscv64_data_read_v2_tweak; static const uint8_t* codeDataReadLight = (uint8_t*)&randomx_riscv64_data_read_light; + static const uint8_t* codeDataReadLight1 = (uint8_t*)&randomx_riscv64_data_read_light_v1; + static const uint8_t* codeDataReadLight2 = (uint8_t*)&randomx_riscv64_data_read_light_v2; static const uint8_t* codeFixLoopCall = (uint8_t*)&randomx_riscv64_fix_loop_call; static const uint8_t* codeSpadStore = (uint8_t*)&randomx_riscv64_spad_store; - static const uint8_t* codeSpadStoreHardAes = (uint8_t*)&randomx_riscv64_spad_store_hardaes; static const uint8_t* codeSpadStoreSoftAes = (uint8_t*)&randomx_riscv64_spad_store_softaes; static const uint8_t* codeLoopEnd = (uint8_t*)&randomx_riscv64_loop_end; static const uint8_t* codeFixContinueLoop = (uint8_t*)&randomx_riscv64_fix_continue_loop; @@ -272,9 +275,13 @@ namespace randomx { static const int32_t sizeDataInit = codePrologue - codeDataInit; static const int32_t sizePrologue = codeLoopBegin - codePrologue; static const int32_t sizeLoopBegin = codeDataRead - codeLoopBegin; - static const int32_t sizeDataRead = codeDataReadLight - codeDataRead; - static const int32_t sizeDataReadLight = codeSpadStore - codeDataReadLight; - static const int32_t sizeSpadStore = codeSpadStoreHardAes - codeSpadStore; + static const int32_t sizeDataRead = codeDataRead2 - codeDataRead; + static const int32_t sizeDataRead2 = codeDataReadLight - codeDataRead2; + static const int32_t sizeDataReadLight = codeDataReadLight1 - codeDataReadLight; + static const int32_t sizeDataReadLight1 = codeDataReadLight2 - codeDataReadLight1; + static const int32_t sizeDataReadLight2 = codeFixLoopCall - codeDataReadLight2; + static const int32_t sizeFixLoopCall = codeSpadStore - codeFixLoopCall; + static const int32_t sizeSpadStore = codeSpadStoreSoftAes - codeSpadStore; static const int32_t sizeSpadStoreSoftAes = codeLoopEnd - codeSpadStoreSoftAes; static const int32_t sizeLoopEnd = codeEpilogue - codeLoopEnd; static const int32_t sizeEpilogue = codeSoftAes - codeEpilogue; @@ -284,7 +291,6 @@ namespace randomx { static const int32_t sizeSshPrefetch = codeSshEnd - codeSshPrefetch; static const int32_t offsetFixDataCall = codeFixDataCall - codeDataInit; - static const int32_t offsetFixLoopCall = codeFixLoopCall - codeDataReadLight; static const int32_t offsetFixContinueLoop = codeFixContinueLoop - codeLoopEnd; static const int32_t LoopTopPos = LiteralPoolSize + sizeDataInit + sizePrologue; @@ -479,8 +485,15 @@ namespace randomx { static void emitProgramPrefix(CompilerState& state, Program& prog, ProgramConfiguration& pcfg) { state.codePos = RandomXCodePos; state.rcpCount = 0; + state.emitAt(LiteralPoolOffset + sizeLiterals, pcfg.eMask[0]); state.emitAt(LiteralPoolOffset + sizeLiterals + 8, pcfg.eMask[1]); + + if (RandomX_CurrentConfig.Tweak_V2_AES) { + state.emitAt(LiteralPoolOffset + sizeLiterals + 16, (uint64_t) &lutEnc[2][0]); + state.emitAt(LiteralPoolOffset + sizeLiterals + 24, (uint64_t) &lutDec[2][0]); + } + for (unsigned i = 0; i < RegistersCount; ++i) { state.registerUsage[i] = -1; } @@ -493,7 +506,13 @@ namespace randomx { } static void emitProgramSuffix(CompilerState& state, ProgramConfiguration& pcfg) { - state.emit(codeSpadStore, sizeSpadStore); + if (RandomX_CurrentConfig.Tweak_V2_AES) { + state.emit(codeSpadStoreSoftAes, sizeSpadStoreSoftAes); + } + else { + state.emit(codeSpadStore, sizeSpadStore); + } + int32_t fixPos = state.codePos; state.emit(codeLoopEnd, sizeLoopEnd); //xor x26, x{readReg0}, x{readReg1} @@ -502,6 +521,10 @@ namespace randomx { //j LoopTop emitJump(state, 0, fixPos, LoopTopPos); state.emit(codeEpilogue, sizeEpilogue); + + if (RandomX_CurrentConfig.Tweak_V2_AES) { + state.emit(codeSoftAes, sizeSoftAes); + } } static void generateSuperscalarCode(CodeBuffer& buf, Instruction isn, bool lastLiteral) { @@ -669,6 +692,9 @@ namespace randomx { state.emit(codeDataRead, sizeDataRead); //xor x8, x{readReg2}, x{readReg3} state.emitAt(fixPos, rvi(rv64::XOR, Tmp1Reg, regR(pcfg.readReg2), regR(pcfg.readReg3))); + int32_t fixPos2 = state.codePos; + state.emit(codeDataRead2, sizeDataRead2); + state.emitAt(fixPos2, (uint16_t)(RandomX_CurrentConfig.Tweak_V2_PREFETCH ? 0x1402 : 0x0001)); emitProgramSuffix(state, pcfg); clearCache(state); } @@ -691,7 +717,14 @@ namespace randomx { state.emitAt(fixPos + 4, rv64::LUI | (uimm << 12) | rvrd(Tmp2Reg)); //addi x9, x9, {limm} state.emitAt(fixPos + 8, rvi(rv64::ADDI, Tmp2Reg, Tmp2Reg, limm)); - fixPos += offsetFixLoopCall; + if (RandomX_CurrentConfig.Tweak_V2_PREFETCH) { + state.emit(codeDataReadLight2, sizeDataReadLight2); + } + else { + state.emit(codeDataReadLight1, sizeDataReadLight1); + } + fixPos = state.codePos; + state.emit(codeFixLoopCall, sizeFixLoopCall); //jal x1, SuperscalarHash emitJump(state, ReturnReg, fixPos, SuperScalarHashOffset); emitProgramSuffix(state, pcfg); @@ -1175,10 +1208,22 @@ namespace randomx { //c.or x8, x9 state.emit(rvc(rv64::C_OR, Tmp1Reg + OffsetXC, Tmp2Reg + OffsetXC)); #endif + if (RandomX_CurrentConfig.Tweak_V2_CFROUND) { + //andi x9, x8, 240 + state.emit(rvi(rv64::ANDI, Tmp2Reg, Tmp1Reg, 240)); + //c.bnez x9, +12 + state.emit(uint16_t(0xE491)); + } //c.andi x8, 12 state.emit(rvc(rv64::C_ANDI, Tmp1Reg + OffsetXC, 12)); } else { + if (RandomX_CurrentConfig.Tweak_V2_CFROUND) { + //andi x9, x{src}, 240 + state.emit(rvi(rv64::ANDI, Tmp2Reg, regR(isn.src), 240)); + //c.bnez x9, +14 + state.emit(uint16_t(0xE499)); + } //and x8, x{src}, 12 state.emit(rvi(rv64::ANDI, Tmp1Reg, regR(isn.src), 12)); } diff --git a/src/crypto/randomx/jit_compiler_rv64_static.S b/src/crypto/randomx/jit_compiler_rv64_static.S index c4f341adb..ad1fdf9e1 100644 --- a/src/crypto/randomx/jit_compiler_rv64_static.S +++ b/src/crypto/randomx/jit_compiler_rv64_static.S @@ -40,10 +40,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .global DECL(randomx_riscv64_prologue) .global DECL(randomx_riscv64_loop_begin) .global DECL(randomx_riscv64_data_read) +.global DECL(randomx_riscv64_data_read_v2_tweak) .global DECL(randomx_riscv64_data_read_light) +.global DECL(randomx_riscv64_data_read_light_v1) +.global DECL(randomx_riscv64_data_read_light_v2) .global DECL(randomx_riscv64_fix_loop_call) .global DECL(randomx_riscv64_spad_store) -.global DECL(randomx_riscv64_spad_store_hardaes) .global DECL(randomx_riscv64_spad_store_softaes) .global DECL(randomx_riscv64_loop_end) .global DECL(randomx_riscv64_fix_continue_loop) @@ -408,7 +410,9 @@ DECL(randomx_riscv64_data_read): slli x8, x8, 32 srli x8, x8, 32 #endif - /* update "mx" */ +DECL(randomx_riscv64_data_read_v2_tweak): + slli x8, x8, 32 /* JIT compiler will replace it with "nop" for RandomX v1 */ + /* update "mp" */ xor x25, x25, x8 /* read dataset and update registers */ ld x8, 0(x7) @@ -456,13 +460,22 @@ DECL(randomx_riscv64_data_read_light): slli x25, x25, 32 or x25, x25, x31 #endif +DECL(randomx_riscv64_data_read_light_v1): slli x8, x8, 32 - /* update "mx" */ + /* update "mp" */ xor x25, x25, x8 /* the next dataset item */ and x7, x25, x1 srli x7, x7, 6 add x7, x7, x9 +DECL(randomx_riscv64_data_read_light_v2): + /* the next dataset item */ + and x7, x25, x1 + srli x7, x7, 6 + add x7, x7, x9 + and x8, x8, x1 + /* update "mp" */ + xor x25, x25, x8 DECL(randomx_riscv64_fix_loop_call): jal superscalar_hash /* JIT compiler will adjust the offset */ xor x16, x16, x8 @@ -536,9 +549,6 @@ DECL(randomx_riscv64_spad_store): sd x30, 56(x26) fmv.d.x f7, x30 -DECL(randomx_riscv64_spad_store_hardaes): - nop /* not implemented */ - DECL(randomx_riscv64_spad_store_softaes): /* store integer registers */ sd x16, 0(x27) diff --git a/src/crypto/randomx/jit_compiler_rv64_static.hpp b/src/crypto/randomx/jit_compiler_rv64_static.hpp index 656623c74..26b79caec 100644 --- a/src/crypto/randomx/jit_compiler_rv64_static.hpp +++ b/src/crypto/randomx/jit_compiler_rv64_static.hpp @@ -36,10 +36,12 @@ extern "C" { void randomx_riscv64_prologue(); void randomx_riscv64_loop_begin(); void randomx_riscv64_data_read(); + void randomx_riscv64_data_read_v2_tweak(); void randomx_riscv64_data_read_light(); + void randomx_riscv64_data_read_light_v1(); + void randomx_riscv64_data_read_light_v2(); void randomx_riscv64_fix_loop_call(); void randomx_riscv64_spad_store(); - void randomx_riscv64_spad_store_hardaes(); void randomx_riscv64_spad_store_softaes(); void randomx_riscv64_loop_end(); void randomx_riscv64_fix_continue_loop(); diff --git a/src/crypto/randomx/jit_compiler_rv64_vector.cpp b/src/crypto/randomx/jit_compiler_rv64_vector.cpp index 5725779f8..81d99e600 100644 --- a/src/crypto/randomx/jit_compiler_rv64_vector.cpp +++ b/src/crypto/randomx/jit_compiler_rv64_vector.cpp @@ -34,12 +34,16 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "crypto/randomx/reciprocal.h" #include "crypto/randomx/superscalar.hpp" #include "crypto/randomx/program.hpp" +#include "crypto/randomx/soft_aes.h" +#include "backend/cpu/Cpu.h" namespace randomx { #define ADDR(x) ((uint8_t*) &(x)) #define DIST(x, y) (ADDR(y) - ADDR(x)) +#define JUMP(offset) (0x6F | (((offset) & 0x7FE) << 20) | (((offset) & 0x800) << 9) | ((offset) & 0xFF000)) + void* generateDatasetInitVectorRV64(uint8_t* buf, SuperscalarProgram* programs, size_t num_programs) { uint8_t* p = buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_sshash_generated_instructions); @@ -205,8 +209,7 @@ void* generateDatasetInitVectorRV64(uint8_t* buf, SuperscalarProgram* programs, // Emit "J randomx_riscv64_vector_sshash_generated_instructions_end" instruction const uint8_t* e = buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_sshash_generated_instructions_end); - const uint32_t k = e - p; - const uint32_t j = 0x6F | ((k & 0x7FE) << 20) | ((k & 0x800) << 9) | (k & 0xFF000); + const uint32_t j = JUMP(e - p); memcpy(p, &j, 4); char* result = (char*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_sshash_dataset_init)); @@ -323,6 +326,26 @@ void* generateProgramVectorRV64(uint8_t* buf, Program& prog, ProgramConfiguratio params[3] = RandomX_CurrentConfig.DatasetBaseSize - 64; params[4] = (1 << RandomX_ConfigurationBase::JumpBits) - 1; + const bool hasAES = xmrig::Cpu::info()->hasAES(); + + if (RandomX_CurrentConfig.Tweak_V2_AES && !hasAES) { + params[5] = (uint64_t) &lutEnc[2][0]; + params[6] = (uint64_t) &lutDec[2][0]; + params[7] = (uint64_t) lutEncIndex; + params[8] = (uint64_t) lutDecIndex; + + uint32_t* p1 = (uint32_t*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_v2_soft_aes_init)); + + // Restore vsetivli zero, 4, e32, m1, ta, ma + *p1 = 0xCD027057; + } + else { + uint32_t* p1 = (uint32_t*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_v2_soft_aes_init)); + + // Emit "J randomx_riscv64_vector_program_main_loop" instruction + *p1 = JUMP(DIST(randomx_riscv64_vector_program_v2_soft_aes_init, randomx_riscv64_vector_program_main_loop)); + } + uint64_t* imul_rcp_literals = (uint64_t*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_imul_rcp_literals)); uint64_t* cur_literal = imul_rcp_literals; @@ -338,6 +361,16 @@ void* generateProgramVectorRV64(uint8_t* buf, Program& prog, ProgramConfiguratio *mx_xor = mx_xor_value; *mx_xor_light = mx_xor_value; + // "slli x5, x5, 32" for RandomX v2, "nop" for RandomX v1 + const uint16_t mp_reg_value = RandomX_CurrentConfig.Tweak_V2_PREFETCH ? 0x1282 : 0x0001; + + memcpy(((uint8_t*)mx_xor) + 8, &mp_reg_value, sizeof(mp_reg_value)); + memcpy(((uint8_t*)mx_xor_light) + 8, &mp_reg_value, sizeof(mp_reg_value)); + + // "srli x5, x14, 32" for RandomX v2, "srli x5, x14, 0" for RandomX v1 + const uint32_t mp_reg_value2 = RandomX_CurrentConfig.Tweak_V2_PREFETCH ? 0x02075293 : 0x00075293; + memcpy(((uint8_t*)mx_xor) + 14, &mp_reg_value2, sizeof(mp_reg_value2)); + if (entryDataInitScalar) { void* light_mode_data = buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_main_loop_light_mode_data); @@ -760,10 +793,24 @@ void* generateProgramVectorRV64(uint8_t* buf, Program& prog, ProgramConfiguratio emit32(0x0062E2B3); #endif // __riscv_zbb + if (RandomX_CurrentConfig.Tweak_V2_CFROUND) { + // andi x6, x5, 120 + emit32(0x0782F313); + // bnez x6, +24 + emit32(0x00031C63); + } + // andi x5, x5, 6 emit32(0x0062F293); } else { + if (RandomX_CurrentConfig.Tweak_V2_CFROUND) { + // andi x6, x20 + src, 120 + emit32(0x078A7313 + (src << 15)); + // bnez x6, +24 + emit32(0x00031C63); + } + // andi x5, x20 + src, 6 emit32(0x006A7293 + (src << 15)); } @@ -813,6 +860,9 @@ void* generateProgramVectorRV64(uint8_t* buf, Program& prog, ProgramConfiguratio } break; + case InstructionType::NOP: + break; + default: UNREACHABLE; } @@ -829,8 +879,26 @@ void* generateProgramVectorRV64(uint8_t* buf, Program& prog, ProgramConfiguratio e = buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_main_loop_instructions_end); } - const uint32_t k = e - p; - emit32(0x6F | ((k & 0x7FE) << 20) | ((k & 0x800) << 9) | (k & 0xFF000)); + emit32(JUMP(e - p)); + + if (RandomX_CurrentConfig.Tweak_V2_AES) { + uint32_t* p1 = (uint32_t*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_main_loop_fe_mix)); + + if (hasAES) { + // Restore vsetivli zero, 4, e32, m1, ta, ma + *p1 = 0xCD027057; + } + else { + // Emit "J randomx_riscv64_vector_program_main_loop_fe_mix_v2_soft_aes" instruction + *p1 = JUMP(DIST(randomx_riscv64_vector_program_main_loop_fe_mix, randomx_riscv64_vector_program_main_loop_fe_mix_v2_soft_aes)); + } + } + else { + uint32_t* p1 = (uint32_t*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_main_loop_fe_mix)); + + // Emit "J randomx_riscv64_vector_program_main_loop_fe_mix_v1" instruction + *p1 = JUMP(DIST(randomx_riscv64_vector_program_main_loop_fe_mix, randomx_riscv64_vector_program_main_loop_fe_mix_v1)); + } #ifdef __GNUC__ char* p1 = (char*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_params)); diff --git a/src/crypto/randomx/jit_compiler_rv64_vector_static.S b/src/crypto/randomx/jit_compiler_rv64_vector_static.S index f8c0a7883..2f97d93ef 100644 --- a/src/crypto/randomx/jit_compiler_rv64_vector_static.S +++ b/src/crypto/randomx/jit_compiler_rv64_vector_static.S @@ -66,16 +66,22 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .global DECL(randomx_riscv64_vector_program_params) .global DECL(randomx_riscv64_vector_program_imul_rcp_literals) .global DECL(randomx_riscv64_vector_program_begin) +.global DECL(randomx_riscv64_vector_program_v2_soft_aes_init) +.global DECL(randomx_riscv64_vector_program_main_loop) .global DECL(randomx_riscv64_vector_program_main_loop_instructions) .global DECL(randomx_riscv64_vector_program_main_loop_instructions_end) .global DECL(randomx_riscv64_vector_program_main_loop_mx_xor) .global DECL(randomx_riscv64_vector_program_main_loop_spaddr_xor) +.global DECL(randomx_riscv64_vector_program_main_loop_fe_mix) .global DECL(randomx_riscv64_vector_program_main_loop_light_mode_data) .global DECL(randomx_riscv64_vector_program_main_loop_instructions_end_light_mode) .global DECL(randomx_riscv64_vector_program_main_loop_mx_xor_light_mode) .global DECL(randomx_riscv64_vector_program_scratchpad_prefetch) +.global DECL(randomx_riscv64_vector_program_main_loop_fe_mix_v1) +.global DECL(randomx_riscv64_vector_program_main_loop_fe_mix_v2_soft_aes) + .global DECL(randomx_riscv64_vector_program_end) .global DECL(randomx_riscv64_vector_code_end) @@ -375,12 +381,21 @@ v12 = E 'and' mask = 0x00ffffffffffffff'00ffffffffffffff v13 = E 'or' mask = 0x3*00000000******'3*00000000****** v14 = scale mask = 0x80f0000000000000'80f0000000000000 -v15 = unused +v15 = all zeroes v16 = temporary v17 = unused v18 = temporary -v19-v31 = unused +v19 = unused +v20 = randomx_aes_lut_enc_index[0] +v21 = randomx_aes_lut_enc_index[1] +v22 = randomx_aes_lut_enc_index[2] +v23 = randomx_aes_lut_enc_index[3] +v24 = randomx_aes_lut_dec_index[0] +v25 = randomx_aes_lut_dec_index[1] +v26 = randomx_aes_lut_dec_index[2] +v27 = randomx_aes_lut_dec_index[3] +v28-v31 = temporary in aesenc_soft/aesdec_soft */ .balign 8 @@ -390,6 +405,11 @@ DECL(randomx_riscv64_vector_program_params): // JIT compiler will adjust these values for different RandomX variants randomx_masks: .dword 16376, 262136, 2097144, 2147483584, 255 +randomx_aes_lut_enc_ptr: .dword 0 +randomx_aes_lut_dec_ptr: .dword 0 +randomx_aes_lut_enc_index_ptr: .dword 0 +randomx_aes_lut_dec_index_ptr: .dword 0 + DECL(randomx_riscv64_vector_program_imul_rcp_literals): imul_rcp_literals: .fill RANDOMX_PROGRAM_MAX_SIZE, 8, 0 @@ -507,7 +527,44 @@ DECL(randomx_riscv64_vector_program_begin): fld f30, 192(x18) fld f31, 200(x18) -randomx_riscv64_vector_program_main_loop: + // Set v15 to zero + vxor.vv v15, v15, v15 + +DECL(randomx_riscv64_vector_program_v2_soft_aes_init): + // JIT compiler will place a jump to the main loop here if needed + + // Load randomx_aes_lut_enc_index/randomx_aes_lut_dec_index + vsetivli zero, 4, e32, m1, ta, ma + + lla x5, randomx_aes_lut_enc_index_ptr + ld x5, (x5) + vle32.v v20, (x5) + + addi x6, x5, 32 + vle32.v v21, (x6) + + addi x6, x5, 64 + vle32.v v22, (x6) + + addi x6, x5, 96 + vle32.v v23, (x6) + + lla x5, randomx_aes_lut_dec_index_ptr + ld x5, (x5) + vle32.v v24, (x5) + + addi x6, x5, 32 + vle32.v v25, (x6) + + addi x6, x5, 64 + vle32.v v26, (x6) + + addi x6, x5, 96 + vle32.v v27, (x6) + + vsetivli zero, 2, e64, m1, ta, ma + +DECL(randomx_riscv64_vector_program_main_loop): and x5, x15, x9 // x5 = spAddr0 & 64-byte aligned L3 mask add x5, x5, x12 // x5 = &scratchpad[spAddr0 & 64-byte aligned L3 mask] @@ -609,11 +666,13 @@ DECL(randomx_riscv64_vector_program_main_loop_instructions_end): DECL(randomx_riscv64_vector_program_main_loop_mx_xor): xor x5, x24, x26 // x5 = readReg2 ^ readReg3 (JIT compiler will substitute the actual registers) - and x5, x5, x19 // x5 = (readReg2 ^ readReg3) & dataset mask - xor x14, x14, x5 // mx ^= (readReg2 ^ readReg3) & dataset mask + slli x5, x5, 32 // JIT compiler will replace it with "nop" for v1 + xor x14, x14, x5 // mp ^= (readReg2 ^ readReg3) & dataset mask - add x5, x14, x11 // x5 = &dataset[mx & dataset mask] + srli x5, x14, 32 // JIT compiler will replace it with "srli x5, x14, 0" for v1 + and x5, x5, x19 // x5 = mp & dataset mask + add x5, x5, x11 // x5 = &dataset[mp & dataset mask] #ifdef __riscv_zicbop prefetch.r (x5) @@ -689,11 +748,47 @@ DECL(randomx_riscv64_vector_program_main_loop_spaddr_xor): xor x15, x20, x22 // spAddr0-spAddr1 = readReg0 ^ readReg1 (JIT compiler will substitute the actual registers) // store registers f0-f3 to the scratchpad (f0-f3 are first combined with e0-e3) - vxor.vv v0, v0, v4 + + // v2 FE mix code is the main code path + // JIT compiler will place a jump to v1 or v2 soft AES code here if needed +DECL(randomx_riscv64_vector_program_main_loop_fe_mix): + vsetivli zero, 4, e32, m1, ta, ma + + // f0 = aesenc(f0, e0), f1 = aesdec(f1, e0), f2 = aesenc(f2, e0), f3 = aesdec(f3, e0) + vaesem.vv v0, v4 + vaesdm.vv v1, v15 + vaesem.vv v2, v4 + vaesdm.vv v3, v15 + vxor.vv v1, v1, v4 + vxor.vv v3, v3, v4 + + // f0 = aesenc(f0, e1), f1 = aesdec(f1, e1), f2 = aesenc(f2, e1), f3 = aesdec(f3, e1) + vaesem.vv v0, v5 + vaesdm.vv v1, v15 + vaesem.vv v2, v5 + vaesdm.vv v3, v15 vxor.vv v1, v1, v5 - vxor.vv v2, v2, v6 + vxor.vv v3, v3, v5 + + // f0 = aesenc(f0, e2), f1 = aesdec(f1, e2), f2 = aesenc(f2, e2), f3 = aesdec(f3, e2) + vaesem.vv v0, v6 + vaesdm.vv v1, v15 + vaesem.vv v2, v6 + vaesdm.vv v3, v15 + vxor.vv v1, v1, v6 + vxor.vv v3, v3, v6 + + // f0 = aesenc(f0, e3), f1 = aesdec(f1, e3), f2 = aesenc(f2, e3), f3 = aesdec(f3, e3) + vaesem.vv v0, v7 + vaesdm.vv v1, v15 + vaesem.vv v2, v7 + vaesdm.vv v3, v15 + vxor.vv v1, v1, v7 vxor.vv v3, v3, v7 + vsetivli zero, 2, e64, m1, ta, ma + +randomx_riscv64_vector_program_main_loop_fe_store: vse64.v v0, (x5) addi x6, x5, 16 @@ -782,6 +877,7 @@ DECL(randomx_riscv64_vector_program_main_loop_instructions_end_light_mode): DECL(randomx_riscv64_vector_program_main_loop_mx_xor_light_mode): xor x5, x24, x26 // x5 = readReg2 ^ readReg3 (JIT compiler will substitute the actual registers) and x5, x5, x19 // x5 = (readReg2 ^ readReg3) & dataset mask + slli x5, x5, 32 // JIT compiler will replace it with "nop" for v1 xor x14, x14, x5 // mx ^= (readReg2 ^ readReg3) & dataset mask // Save all registers modified when calling dataset_init_scalar_func_ptr @@ -864,6 +960,131 @@ DECL(randomx_riscv64_vector_program_main_loop_mx_xor_light_mode): j randomx_riscv64_vector_program_scratchpad_prefetch +DECL(randomx_riscv64_vector_program_main_loop_fe_mix_v1): + vxor.vv v0, v0, v4 + vxor.vv v1, v1, v5 + vxor.vv v2, v2, v6 + vxor.vv v3, v3, v7 + j randomx_riscv64_vector_program_main_loop_fe_store + +/* +aesenc middle round + +x5 = pointer to aesenc LUT +v16 = input and return value +*/ +.macro aesenc_soft input, key + vsetivli zero, 16, e8, m1, ta, ma + + vrgather.vv v28, \input, v20 + vrgather.vv v29, \input, v21 + vrgather.vv v30, \input, v22 + vrgather.vv v31, \input, v23 + + vsetivli zero, 4, e32, m1, ta, ma + + vsll.vi v28, v28, 2 + vsll.vi v29, v29, 2 + vsll.vi v30, v30, 2 + vsll.vi v31, v31, 2 + + addi x6, x5, -2048 + vluxei32.v v28, (x6), v28 + + addi x6, x5, -1024 + vluxei32.v v29, (x6), v29 + + vluxei32.v v30, (x5), v30 + + addi x6, x5, 1024 + vluxei32.v v31, (x6), v31 + + vxor.vv v28, v28, v29 + vxor.vv v30, v30, v31 + vxor.vv \input, v28, v30 + vxor.vv \input, \input, \key +.endm + +/* +aesdec middle round + +x5 = pointer to aesdec LUT +v16 = input and return value +*/ +.macro aesdec_soft input, key + vsetivli zero, 16, e8, m1, ta, ma + + vrgather.vv v28, \input, v24 + vrgather.vv v29, \input, v25 + vrgather.vv v30, \input, v26 + vrgather.vv v31, \input, v27 + + vsetivli zero, 4, e32, m1, ta, ma + + vsll.vi v28, v28, 2 + vsll.vi v29, v29, 2 + vsll.vi v30, v30, 2 + vsll.vi v31, v31, 2 + + addi x6, x5, -2048 + vluxei32.v v28, (x6), v28 + + addi x6, x5, -1024 + vluxei32.v v29, (x6), v29 + + vluxei32.v v30, (x5), v30 + + addi x6, x5, 1024 + vluxei32.v v31, (x6), v31 + + vxor.vv v28, v28, v29 + vxor.vv v30, v30, v31 + vxor.vv \input, v28, v30 + vxor.vv \input, \input, \key +.endm + +DECL(randomx_riscv64_vector_program_main_loop_fe_mix_v2_soft_aes): + // save x5 + vmv.s.x v16, x5 + + lla x5, randomx_aes_lut_enc_ptr + ld x5, (x5) + + // f0 = aesenc(f0, e0), f0 = aesenc(f0, e1), f0 = aesenc(f0, e2), f0 = aesenc(f0, e3) + aesenc_soft v0, v4 + aesenc_soft v0, v5 + aesenc_soft v0, v6 + aesenc_soft v0, v7 + + // f2 = aesenc(f2, e0), f2 = aesenc(f2, e1), f2 = aesenc(f2, e2), f2 = aesenc(f2, e3) + aesenc_soft v2, v4 + aesenc_soft v2, v5 + aesenc_soft v2, v6 + aesenc_soft v2, v7 + + lla x5, randomx_aes_lut_dec_ptr + ld x5, (x5) + + // f1 = aesdec(f1, e0), f1 = aesdec(f1, e1), f1 = aesdec(f1, e2), f1 = aesdec(f1, e3) + aesdec_soft v1, v4 + aesdec_soft v1, v5 + aesdec_soft v1, v6 + aesdec_soft v1, v7 + + // f3 = aesdec(f3, e0), f3 = aesdec(f3, e1), f3 = aesdec(f3, e2), f3 = aesdec(f3, e3) + aesdec_soft v3, v4 + aesdec_soft v3, v5 + aesdec_soft v3, v6 + aesdec_soft v3, v7 + + // Set vector registers back to 2x64 bit + vsetivli zero, 2, e64, m1, ta, ma + + // restore x5 + vmv.x.s x5, v16 + + j randomx_riscv64_vector_program_main_loop_fe_store + DECL(randomx_riscv64_vector_program_end): DECL(randomx_riscv64_vector_code_end): diff --git a/src/crypto/randomx/jit_compiler_rv64_vector_static.h b/src/crypto/randomx/jit_compiler_rv64_vector_static.h index 0624cc83a..7a4937e1c 100644 --- a/src/crypto/randomx/jit_compiler_rv64_vector_static.h +++ b/src/crypto/randomx/jit_compiler_rv64_vector_static.h @@ -57,16 +57,22 @@ void randomx_riscv64_vector_sshash_end(); void randomx_riscv64_vector_program_params(); void randomx_riscv64_vector_program_imul_rcp_literals(); void randomx_riscv64_vector_program_begin(); +void randomx_riscv64_vector_program_v2_soft_aes_init(); +void randomx_riscv64_vector_program_main_loop(); void randomx_riscv64_vector_program_main_loop_instructions(); void randomx_riscv64_vector_program_main_loop_instructions_end(); void randomx_riscv64_vector_program_main_loop_mx_xor(); void randomx_riscv64_vector_program_main_loop_spaddr_xor(); +void randomx_riscv64_vector_program_main_loop_fe_mix(); void randomx_riscv64_vector_program_main_loop_light_mode_data(); void randomx_riscv64_vector_program_main_loop_instructions_end_light_mode(); void randomx_riscv64_vector_program_main_loop_mx_xor_light_mode(); void randomx_riscv64_vector_program_end(); void randomx_riscv64_vector_program_scratchpad_prefetch(); +void randomx_riscv64_vector_program_main_loop_fe_mix_v1(); +void randomx_riscv64_vector_program_main_loop_fe_mix_v2_soft_aes(); + void randomx_riscv64_vector_code_end(); #if defined(__cplusplus)