/* Copyright (c) 2018-2020, tevador Copyright (c) 2019-2021, XMRig , Copyright (c) 2025, SChernykh All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "crypto/randomx/configuration.h" #include "crypto/randomx/jit_compiler_rv64_vector.h" #include "crypto/randomx/jit_compiler_rv64_vector_static.h" #include "crypto/randomx/reciprocal.h" #include "crypto/randomx/superscalar.hpp" #include "crypto/randomx/program.hpp" namespace randomx { #define ADDR(x) ((uint8_t*) &(x)) #define DIST(x, y) (ADDR(y) - ADDR(x)) void* generateDatasetInitVectorRV64(uint8_t* buf, SuperscalarProgram* programs, size_t num_programs) { uint8_t* p = buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_sshash_generated_instructions); uint8_t* literals = buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_sshash_imul_rcp_literals); uint8_t* cur_literal = literals; for (size_t i = 0; i < num_programs; ++i) { // Step 4 size_t k = DIST(randomx_riscv64_vector_sshash_cache_prefetch, randomx_riscv64_vector_sshash_xor); memcpy(p, reinterpret_cast(randomx_riscv64_vector_sshash_cache_prefetch), k); p += k; // Step 5 for (uint32_t j = 0; j < programs[i].size; ++j) { const uint32_t dst = programs[i].programBuffer[j].dst & 7; const uint32_t src = programs[i].programBuffer[j].src & 7; const uint32_t modShift = (programs[i].programBuffer[j].mod >> 2) & 3; const uint32_t imm32 = programs[i].programBuffer[j].imm32; uint32_t inst; #define EMIT(data) inst = (data); memcpy(p, &inst, 4); p += 4 switch (static_cast(programs[i].programBuffer[j].opcode)) { case SuperscalarInstructionType::ISUB_R: // 57 00 00 0A vsub.vv v0, v0, v0 EMIT(0x0A000057 | (dst << 7) | (src << 15) | (dst << 20)); break; case SuperscalarInstructionType::IXOR_R: // 57 00 00 2E vxor.vv v0, v0, v0 EMIT(0x2E000057 | (dst << 7) | (src << 15) | (dst << 20)); break; case SuperscalarInstructionType::IADD_RS: if (modShift == 0) { // 57 00 00 02 vadd.vv v0, v0, v0 EMIT(0x02000057 | (dst << 7) | (src << 15) | (dst << 20)); } else { // 57 39 00 96 vsll.vi v18, v0, 0 // 57 00 09 02 vadd.vv v0, v0, v18 EMIT(0x96003957 | (modShift << 15) | (src << 20)); EMIT(0x02090057 | (dst << 7) | (dst << 20)); } break; case SuperscalarInstructionType::IMUL_R: // 57 20 00 96 vmul.vv v0, v0, v0 EMIT(0x96002057 | (dst << 7) | (src << 15) | (dst << 20)); break; case SuperscalarInstructionType::IROR_C: { const uint32_t shift_right = imm32 & 63; const uint32_t shift_left = 64 - shift_right; if (shift_right < 32) { // 57 39 00 A2 vsrl.vi v18, v0, 0 EMIT(0xA2003957 | (shift_right << 15) | (dst << 20)); } else { // 93 02 00 00 li x5, 0 // 57 C9 02 A2 vsrl.vx v18, v0, x5 EMIT(0x00000293 | (shift_right << 20)); EMIT(0xA202C957 | (dst << 20)); } if (shift_left < 32) { // 57 30 00 96 vsll.vi v0, v0, 0 EMIT(0x96003057 | (dst << 7) | (shift_left << 15) | (dst << 20)); } else { // 93 02 00 00 li x5, 0 // 57 C0 02 96 vsll.vx v0, v0, x5 EMIT(0x00000293 | (shift_left << 20)); EMIT(0x9602C057 | (dst << 7) | (dst << 20)); } // 57 00 20 2B vor.vv v0, v18, v0 EMIT(0x2B200057 | (dst << 7) | (dst << 15)); } break; case SuperscalarInstructionType::IADD_C7: case SuperscalarInstructionType::IADD_C8: case SuperscalarInstructionType::IADD_C9: // B7 02 00 00 lui x5, 0 // 9B 82 02 00 addiw x5, x5, 0 // 57 C0 02 02 vadd.vx v0, v0, x5 EMIT(0x000002B7 | ((imm32 + ((imm32 & 0x800) << 1)) & 0xFFFFF000)); EMIT(0x0002829B | ((imm32 & 0x00000FFF) << 20)); EMIT(0x0202C057 | (dst << 7) | (dst << 20)); break; case SuperscalarInstructionType::IXOR_C7: case SuperscalarInstructionType::IXOR_C8: case SuperscalarInstructionType::IXOR_C9: // B7 02 00 00 lui x5, 0 // 9B 82 02 00 addiw x5, x5, 0 // 57 C0 02 2E vxor.vx v0, v0, x5 EMIT(0x000002B7 | ((imm32 + ((imm32 & 0x800) << 1)) & 0xFFFFF000)); EMIT(0x0002829B | ((imm32 & 0x00000FFF) << 20)); EMIT(0x2E02C057 | (dst << 7) | (dst << 20)); break; case SuperscalarInstructionType::IMULH_R: // 57 20 00 92 vmulhu.vv v0, v0, v0 EMIT(0x92002057 | (dst << 7) | (src << 15) | (dst << 20)); break; case SuperscalarInstructionType::ISMULH_R: // 57 20 00 9E vmulh.vv v0, v0, v0 EMIT(0x9E002057 | (dst << 7) | (src << 15) | (dst << 20)); break; case SuperscalarInstructionType::IMUL_RCP: { uint32_t offset = cur_literal - literals; if (offset == 2040) { literals += 2040; offset = 0; // 93 87 87 7F add x15, x15, 2040 EMIT(0x7F878793); } const uint64_t r = randomx_reciprocal_fast(imm32); memcpy(cur_literal, &r, 8); cur_literal += 8; // 83 B2 07 00 ld x5, (x15) // 57 E0 02 96 vmul.vx v0, v0, x5 EMIT(0x0007B283 | (offset << 20)); EMIT(0x9602E057 | (dst << 7) | (dst << 20)); } break; default: UNREACHABLE; } } // Step 6 k = DIST(randomx_riscv64_vector_sshash_xor, randomx_riscv64_vector_sshash_end); memcpy(p, reinterpret_cast(randomx_riscv64_vector_sshash_xor), k); p += k; // Step 7. Set cacheIndex to the value of the register that has the longest dependency chain in the SuperscalarHash function executed in step 5. if (i + 1 < num_programs) { // vmv.v.v v9, v0 + programs[i].getAddressRegister() const uint32_t t = 0x5E0004D7 + (static_cast(programs[i].getAddressRegister()) << 15); memcpy(p, &t, 4); p += 4; } } // Emit "J randomx_riscv64_vector_sshash_generated_instructions_end" instruction const uint8_t* e = buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_sshash_generated_instructions_end); const uint32_t k = e - p; const uint32_t j = 0x6F | ((k & 0x7FE) << 20) | ((k & 0x800) << 9) | (k & 0xFF000); memcpy(p, &j, 4); char* result = (char*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_sshash_dataset_init)); #ifdef __GNUC__ __builtin___clear_cache(result, (char*)(buf + DIST(randomx_riscv64_vector_sshash_begin, randomx_riscv64_vector_sshash_end))); #endif return result; } #define emit16(value) { const uint16_t t = value; memcpy(p, &t, 2); p += 2; } #define emit32(value) { const uint32_t t = value; memcpy(p, &t, 4); p += 4; } #define emit64(value) { const uint64_t t = value; memcpy(p, &t, 8); p += 8; } #define emit_data(arr) { memcpy(p, arr, sizeof(arr)); p += sizeof(arr); } static void imm_to_x5(uint32_t imm, uint8_t*& p) { const uint32_t imm_hi = (imm + ((imm & 0x800) << 1)) & 0xFFFFF000U; const uint32_t imm_lo = imm & 0x00000FFFU; if (imm_hi == 0) { // li x5, imm_lo emit32(0x00000293 + (imm_lo << 20)); return; } if (imm_lo == 0) { // lui x5, imm_hi emit32(0x000002B7 + imm_hi); return; } // lui x5, imm_hi // addiw x5, x5, imm_lo emit64(0x0002829B000002B7ULL | imm_hi | (static_cast(imm_lo) << 52)) } static void loadFromScratchpad(uint32_t src, uint32_t dst, uint32_t mod, uint32_t imm, uint8_t*& p) { if (src == dst) { imm &= RandomX_CurrentConfig.ScratchpadL3Mask_Calculated; if (imm <= 2047) { // ld x5, imm(x12) emit32(0x00063283 | (imm << 20)); } else if (imm <= 2047 * 2) { // addi x5, x12, 2047 emit32(0x7FF60293); // ld x5, (imm - 2047)(x5) emit32(0x0002B283 | ((imm - 2047) << 20)); } else { // lui x5, imm & 0xFFFFF000U emit32(0x000002B7 | ((imm + ((imm & 0x800) << 1)) & 0xFFFFF000U)); // c.add x5, x12 emit16(0x92B2); // ld x5, (imm & 0xFFF)(x5) emit32(0x0002B283 | ((imm & 0xFFF) << 20)); } return; } uint32_t shift = 32; uint32_t mask_reg; if ((mod & 3) == 0) { shift -= RandomX_CurrentConfig.Log2_ScratchpadL2; mask_reg = 17; } else { shift -= RandomX_CurrentConfig.Log2_ScratchpadL1; mask_reg = 16; } imm = static_cast(static_cast(imm << shift) >> shift); // 0-0x7FF, 0xFFFFF800-0xFFFFFFFF fit into 12 bit (a single addi instruction) if (imm - 0xFFFFF800U < 0x1000U) { // addi x5, x20 + src, imm emit32(0x000A0293 + (src << 15) + (imm << 20)); } else { imm_to_x5(imm, p); // c.add x5, x20 + src emit16(0x92D2 + (src << 2)); } // and x5, x5, mask_reg emit32(0x0002F2B3 + (mask_reg << 20)); // c.add x5, x12 emit16(0x92B2); // ld x5, 0(x5) emit32(0x0002B283); } void* generateProgramVectorRV64(uint8_t* buf, Program& prog, ProgramConfiguration& pcfg, const uint8_t (&inst_map)[256], void* entryDataInitScalar, uint32_t datasetOffset) { uint64_t* params = (uint64_t*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_params)); params[0] = RandomX_CurrentConfig.ScratchpadL1_Size - 8; params[1] = RandomX_CurrentConfig.ScratchpadL2_Size - 8; params[2] = RandomX_CurrentConfig.ScratchpadL3_Size - 8; params[3] = RandomX_CurrentConfig.DatasetBaseSize - 64; params[4] = (1 << RandomX_ConfigurationBase::JumpBits) - 1; uint64_t* imul_rcp_literals = (uint64_t*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_imul_rcp_literals)); uint64_t* cur_literal = imul_rcp_literals; uint32_t* spaddr_xor = (uint32_t*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_main_loop_spaddr_xor)); uint32_t* spaddr_xor2 = (uint32_t*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_scratchpad_prefetch)); uint32_t* mx_xor = (uint32_t*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_main_loop_mx_xor)); uint32_t* mx_xor_light = (uint32_t*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_main_loop_mx_xor_light_mode)); *spaddr_xor = 0x014A47B3 + (pcfg.readReg0 << 15) + (pcfg.readReg1 << 20); // xor x15, readReg0, readReg1 *spaddr_xor2 = 0x014A42B3 + (pcfg.readReg0 << 15) + (pcfg.readReg1 << 20); // xor x5, readReg0, readReg1 const uint32_t mx_xor_value = 0x014A42B3 + (pcfg.readReg2 << 15) + (pcfg.readReg3 << 20); // xor x5, readReg2, readReg3 *mx_xor = mx_xor_value; *mx_xor_light = mx_xor_value; if (entryDataInitScalar) { void* light_mode_data = buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_main_loop_light_mode_data); const uint64_t data[2] = { reinterpret_cast(entryDataInitScalar), datasetOffset }; memcpy(light_mode_data, &data, sizeof(data)); } uint8_t* p = (uint8_t*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_main_loop_instructions)); // 57C8025E vmv.v.x v16, x5 // 57A9034B vsext.vf2 v18, v16 // 5798214B vfcvt.f.x.v v16, v18 static constexpr uint8_t group_f_convert[] = { 0x57, 0xC8, 0x02, 0x5E, 0x57, 0xA9, 0x03, 0x4B, 0x57, 0x98, 0x21, 0x4B }; // 57080627 vand.vv v16, v16, v12 // 5788062B vor.vv v16, v16, v13 static constexpr uint8_t group_e_post_process[] = { 0x57, 0x08, 0x06, 0x27, 0x57, 0x88, 0x06, 0x2B }; uint8_t* last_modified[RegistersCount] = { p, p, p, p, p, p, p, p }; uint8_t readReg01[RegistersCount] = {}; readReg01[pcfg.readReg0] = 1; readReg01[pcfg.readReg1] = 1; uint32_t scratchpad_prefetch_pos = 0; for (int32_t i = static_cast(prog.getSize()) - 1; i >= 0; --i) { Instruction instr = prog(i); const InstructionType inst_type = static_cast(inst_map[instr.opcode]); if (inst_type == InstructionType::CBRANCH) { scratchpad_prefetch_pos = i; break; } if (inst_type < InstructionType::FSWAP_R) { const uint32_t src = instr.src % RegistersCount; const uint32_t dst = instr.dst % RegistersCount; if ((inst_type == InstructionType::ISWAP_R) && (src != dst) && (readReg01[src] || readReg01[dst])) { scratchpad_prefetch_pos = i; break; } if ((inst_type == InstructionType::IMUL_RCP) && readReg01[dst] && !isZeroOrPowerOf2(instr.getImm32())) { scratchpad_prefetch_pos = i; break; } if (readReg01[dst]) { scratchpad_prefetch_pos = i; break; } } } for (uint32_t i = 0, n = prog.getSize(); i < n; ++i) { Instruction instr = prog(i); uint32_t src = instr.src % RegistersCount; uint32_t dst = instr.dst % RegistersCount; const uint32_t shift = instr.getModShift(); uint32_t imm = instr.getImm32(); const uint32_t mod = instr.mod; switch (static_cast(inst_map[instr.opcode])) { case InstructionType::IADD_RS: if (shift == 0) { // c.add x20 + dst, x20 + src emit16(0x9A52 + (src << 2) + (dst << 7)); } else { #ifdef __riscv_zba // sh{shift}add x20 + dst, x20 + src, x20 + dst emit32(0x214A0A33 + (shift << 13) + (dst << 7) + (src << 15) + (dst << 20)); #else // __riscv_zba // slli x5, x20 + src, shift emit32(0x000A1293 + (src << 15) + (shift << 20)); // c.add x20 + dst, x5 emit16(0x9A16 + (dst << 7)); #endif // __riscv_zba } if (dst == RegisterNeedsDisplacement) { imm_to_x5(imm, p); // c.add x20 + dst, x5 emit16(0x9A16 + (dst << 7)); } last_modified[dst] = p; break; case InstructionType::IADD_M: loadFromScratchpad(src, dst, mod, imm, p); // c.add x20 + dst, x5 emit16(0x9A16 + (dst << 7)); last_modified[dst] = p; break; case InstructionType::ISUB_R: if (src != dst) { // sub x20 + dst, x20 + dst, x20 + src emit32(0x414A0A33 + (dst << 7) + (dst << 15) + (src << 20)); } else { imm_to_x5(-imm, p); // c.add x20 + dst, x5 emit16(0x9A16 + (dst << 7)); } last_modified[dst] = p; break; case InstructionType::ISUB_M: loadFromScratchpad(src, dst, mod, imm, p); // sub x20 + dst, x20 + dst, x5 emit32(0x405A0A33 + (dst << 7) + (dst << 15)); last_modified[dst] = p; break; case InstructionType::IMUL_R: if (src != dst) { // mul x20 + dst, x20 + dst, x20 + src emit32(0x034A0A33 + (dst << 7) + (dst << 15) + (src << 20)); } else { imm_to_x5(imm, p); // mul x20 + dst, x20 + dst, x5 emit32(0x025A0A33 + (dst << 7) + (dst << 15)); } last_modified[dst] = p; break; case InstructionType::IMUL_M: loadFromScratchpad(src, dst, mod, imm, p); // mul x20 + dst, x20 + dst, x5 emit32(0x025A0A33 + (dst << 7) + (dst << 15)); last_modified[dst] = p; break; case InstructionType::IMULH_R: // mulhu x20 + dst, x20 + dst, x20 + src emit32(0x034A3A33 + (dst << 7) + (dst << 15) + (src << 20)); last_modified[dst] = p; break; case InstructionType::IMULH_M: loadFromScratchpad(src, dst, mod, imm, p); // mulhu x20 + dst, x20 + dst, x5 emit32(0x025A3A33 + (dst << 7) + (dst << 15)); last_modified[dst] = p; break; case InstructionType::ISMULH_R: // mulh x20 + dst, x20 + dst, x20 + src emit32(0x034A1A33 + (dst << 7) + (dst << 15) + (src << 20)); last_modified[dst] = p; break; case InstructionType::ISMULH_M: loadFromScratchpad(src, dst, mod, imm, p); // mulh x20 + dst, x20 + dst, x5 emit32(0x025A1A33 + (dst << 7) + (dst << 15)); last_modified[dst] = p; break; case InstructionType::IMUL_RCP: if (!isZeroOrPowerOf2(imm)) { const uint64_t offset = (cur_literal - imul_rcp_literals) * 8; *(cur_literal++) = randomx_reciprocal_fast(imm); static constexpr uint32_t rcp_regs[26] = { /* Integer */ 8, 10, 28, 29, 30, 31, /* Float */ 0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15, 16, 17, 28, 29, 30, 31 }; if (offset < 6 * 8) { // mul x20 + dst, x20 + dst, rcp_reg emit32(0x020A0A33 + (dst << 7) + (dst << 15) + (rcp_regs[offset / 8] << 20)); } else if (offset < 26 * 8) { // fmv.x.d x5, rcp_reg emit32(0xE20002D3 + (rcp_regs[offset / 8] << 15)); // mul x20 + dst, x20 + dst, x5 emit32(0x025A0A33 + (dst << 7) + (dst << 15)); } else { // ld x5, offset(x18) emit32(0x00093283 + (offset << 20)); // mul x20 + dst, x20 + dst, x5 emit32(0x025A0A33 + (dst << 7) + (dst << 15)); } last_modified[dst] = p; } break; case InstructionType::INEG_R: // sub x20 + dst, x0, x20 + dst emit32(0x41400A33 + (dst << 7) + (dst << 20)); last_modified[dst] = p; break; case InstructionType::IXOR_R: if (src != dst) { // xor x20 + dst, x20 + dst, x20 + src emit32(0x014A4A33 + (dst << 7) + (dst << 15) + (src << 20)); } else { imm_to_x5(imm, p); // xor x20, x20, x5 emit32(0x005A4A33 + (dst << 7) + (dst << 15)); } last_modified[dst] = p; break; case InstructionType::IXOR_M: loadFromScratchpad(src, dst, mod, imm, p); // xor x20, x20, x5 emit32(0x005A4A33 + (dst << 7) + (dst << 15)); last_modified[dst] = p; break; #ifdef __riscv_zbb case InstructionType::IROR_R: if (src != dst) { // ror x20 + dst, x20 + dst, x20 + src emit32(0x614A5A33 + (dst << 7) + (dst << 15) + (src << 20)); } else { // rori x20 + dst, x20 + dst, imm emit32(0x600A5A13 + (dst << 7) + (dst << 15) + ((imm & 63) << 20)); } last_modified[dst] = p; break; case InstructionType::IROL_R: if (src != dst) { // rol x20 + dst, x20 + dst, x20 + src emit32(0x614A1A33 + (dst << 7) + (dst << 15) + (src << 20)); } else { // rori x20 + dst, x20 + dst, -imm emit32(0x600A5A13 + (dst << 7) + (dst << 15) + ((-imm & 63) << 20)); } last_modified[dst] = p; break; #else // __riscv_zbb case InstructionType::IROR_R: if (src != dst) { // sub x5, x0, x20 + src emit32(0x414002B3 + (src << 20)); // srl x6, x20 + dst, x20 + src emit32(0x014A5333 + (dst << 15) + (src << 20)); // sll x20 + dst, x20 + dst, x5 emit32(0x005A1A33 + (dst << 7) + (dst << 15)); // or x20 + dst, x20 + dst, x6 emit32(0x006A6A33 + (dst << 7) + (dst << 15)); } else { // srli x5, x20 + dst, imm emit32(0x000A5293 + (dst << 15) + ((imm & 63) << 20)); // slli x6, x20 + dst, -imm emit32(0x000A1313 + (dst << 15) + ((-imm & 63) << 20)); // or x20 + dst, x5, x6 emit32(0x0062EA33 + (dst << 7)); } last_modified[dst] = p; break; case InstructionType::IROL_R: if (src != dst) { // sub x5, x0, x20 + src emit32(0x414002B3 + (src << 20)); // sll x6, x20 + dst, x20 + src emit32(0x014A1333 + (dst << 15) + (src << 20)); // srl x20 + dst, x20 + dst, x5 emit32(0x005A5A33 + (dst << 7) + (dst << 15)); // or x20 + dst, x20 + dst, x6 emit32(0x006A6A33 + (dst << 7) + (dst << 15)); } else { // srli x5, x20 + dst, -imm emit32(0x000A5293 + (dst << 15) + ((-imm & 63) << 20)); // slli x6, x20 + dst, imm emit32(0x000A1313 + (dst << 15) + ((imm & 63) << 20)); // or x20 + dst, x5, x6 emit32(0x0062EA33 + (dst << 7)); } last_modified[dst] = p; break; #endif // __riscv_zbb case InstructionType::ISWAP_R: if (src != dst) { // c.mv x5, x20 + dst emit16(0x82D2 + (dst << 2)); // c.mv x20 + dst, x20 + src emit16(0x8A52 + (src << 2) + (dst << 7)); // c.mv x20 + src, x5 emit16(0x8A16 + (src << 7)); last_modified[src] = p; last_modified[dst] = p; } break; case InstructionType::FSWAP_R: // vmv.x.s x5, v0 + dst emit32(0x420022D7 + (dst << 20)); // vslide1down.vx v0 + dst, v0 + dst, x5 emit32(0x3E02E057 + (dst << 7) + (dst << 20)); break; case InstructionType::FADD_R: src %= RegisterCountFlt; dst %= RegisterCountFlt; // vfadd.vv v0 + dst, v0 + dst, v8 + src emit32(0x02041057 + (dst << 7) + (src << 15) + (dst << 20)); break; case InstructionType::FADD_M: dst %= RegisterCountFlt; loadFromScratchpad(src, RegistersCount, mod, imm, p); emit_data(group_f_convert); // vfadd.vv v0 + dst, v0 + dst, v16 emit32(0x02081057 + (dst << 7) + (dst << 20)); break; case InstructionType::FSUB_R: src %= RegisterCountFlt; dst %= RegisterCountFlt; // vfsub.vv v0 + dst, v0 + dst, v8 + src emit32(0x0A041057 + (dst << 7) + (src << 15) + (dst << 20)); break; case InstructionType::FSUB_M: dst %= RegisterCountFlt; loadFromScratchpad(src, RegistersCount, mod, imm, p); emit_data(group_f_convert); // vfsub.vv v0 + dst, v0 + dst, v16 emit32(0x0A081057 + (dst << 7) + (dst << 20)); break; case InstructionType::FSCAL_R: dst %= RegisterCountFlt; // vxor.vv v0, v0, v14 emit32(0x2E070057 + (dst << 7) + (dst << 20)); break; case InstructionType::FMUL_R: src %= RegisterCountFlt; dst %= RegisterCountFlt; // vfmul.vv v4 + dst, v4 + dst, v8 + src emit32(0x92441257 + (dst << 7) + (src << 15) + (dst << 20)); break; case InstructionType::FDIV_M: dst %= RegisterCountFlt; loadFromScratchpad(src, RegistersCount, mod, imm, p); emit_data(group_f_convert); emit_data(group_e_post_process); // vfdiv.vv v0 + dst, v0 + dst, v16 emit32(0x82481257 + (dst << 7) + (dst << 20)); break; case InstructionType::FSQRT_R: dst %= RegisterCountFlt; // vfsqrt.v v4 + dst, v4 + dst emit32(0x4E401257 + (dst << 7) + (dst << 20)); break; case InstructionType::CBRANCH: { const uint32_t shift = (mod >> 4) + RandomX_ConfigurationBase::JumpOffset; imm |= (1UL << shift); if (RandomX_ConfigurationBase::JumpOffset > 0 || shift > 0) { imm &= ~(1UL << (shift - 1)); } // slli x6, x7, shift // x6 = branchMask emit32(0x00039313 + (shift << 20)); // x5 = imm imm_to_x5(imm, p); // c.add x20 + dst, x5 emit16(0x9A16 + (dst << 7)); // and x5, x20 + dst, x6 emit32(0x006A72B3 + (dst << 15)); const int offset = static_cast(last_modified[dst] - p); if (offset >= -4096) { // beqz x5, offset const uint32_t k = static_cast(offset); emit32(0x80028063 | ((k & 0x1E) << 7) | ((k & 0x7E0) << 20) | ((k & 0x800) >> 4)); } else { // bnez x5, 8 emit32(0x00029463); // j offset const uint32_t k = static_cast(offset - 4); emit32(0x8000006F | ((k & 0x7FE) << 20) | ((k & 0x800) << 9) | (k & 0xFF000)); } for (uint32_t j = 0; j < RegistersCount; ++j) { last_modified[j] = p; } } break; case InstructionType::CFROUND: if ((imm - 1) & 63) { #ifdef __riscv_zbb // rori x5, x20 + src, imm - 1 emit32(0x600A5293 + (src << 15) + (((imm - 1) & 63) << 20)); #else // __riscv_zbb // srli x5, x20 + src, imm - 1 emit32(0x000A5293 + (src << 15) + (((imm - 1) & 63) << 20)); // slli x6, x20 + src, 1 - imm emit32(0x000A1313 + (src << 15) + (((1 - imm) & 63) << 20)); // or x5, x5, x6 emit32(0x0062E2B3); #endif // __riscv_zbb // andi x5, x5, 6 emit32(0x0062F293); } else { // andi x5, x20 + src, 6 emit32(0x006A7293 + (src << 15)); } // li x6, 01111000b // x6 = CFROUND lookup table emit32(0x07800313); // srl x5, x6, x5 emit32(0x005352B3); // andi x5, x5, 3 emit32(0x0032F293); // csrw frm, x5 emit32(0x00229073); break; case InstructionType::ISTORE: { uint32_t mask_reg; uint32_t shift = 32; if ((mod >> 4) >= 14) { shift -= RandomX_CurrentConfig.Log2_ScratchpadL3; mask_reg = 1; // x1 = L3 mask } else { if ((mod & 3) == 0) { shift -= RandomX_CurrentConfig.Log2_ScratchpadL2; mask_reg = 17; // x17 = L2 mask } else { shift -= RandomX_CurrentConfig.Log2_ScratchpadL1; mask_reg = 16; // x16 = L1 mask } } imm = static_cast(static_cast(imm << shift) >> shift); imm_to_x5(imm, p); // c.add x5, x20 + dst emit16(0x92D2 + (dst << 2)); // and x5, x5, x0 + mask_reg emit32(0x0002F2B3 + (mask_reg << 20)); // c.add x5, x12 emit16(0x92B2); // sd x20 + src, 0(x5) emit32(0x0142B023 + (src << 20)); } break; default: UNREACHABLE; } // Prefetch scratchpad lines for the next main loop iteration // scratchpad_prefetch_pos is a conservative estimate of the earliest place in the code where we can do it if (i == scratchpad_prefetch_pos) { uint8_t* e = (uint8_t*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_scratchpad_prefetch_end)); const size_t n = e - ((uint8_t*)spaddr_xor2); memcpy(p, spaddr_xor2, n); p += n; } } const uint8_t* e; if (entryDataInitScalar) { // Emit "J randomx_riscv64_vector_program_main_loop_instructions_end_light_mode" instruction e = buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_main_loop_instructions_end_light_mode); } else { // Emit "J randomx_riscv64_vector_program_main_loop_instructions_end" instruction e = buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_main_loop_instructions_end); } const uint32_t k = e - p; emit32(0x6F | ((k & 0x7FE) << 20) | ((k & 0x800) << 9) | (k & 0xFF000)); #ifdef __GNUC__ char* p1 = (char*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_params)); char* p2 = (char*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_end)); __builtin___clear_cache(p1, p2); #endif return buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_begin); } } // namespace randomx