diff --git a/src/crypto/randomx/jit_compiler_a64.cpp b/src/crypto/randomx/jit_compiler_a64.cpp
index bae695cf1..dc693d71f 100644
--- a/src/crypto/randomx/jit_compiler_a64.cpp
+++ b/src/crypto/randomx/jit_compiler_a64.cpp
@@ -152,6 +152,20 @@ void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& con
 		(this->*engine[instr.opcode])(instr, codePos);
 	}
 
+	{ // Optional early prefetch, emitted before the regular "Update spMix2" sequence below.
+		const uint32_t rr2Off = reg_changed_offset[config.readReg2]; // presumably the code offset where readReg2 was last modified - confirm
+		const uint32_t rr3Off = reg_changed_offset[config.readReg3]; // same lookup for readReg3
+		const uint32_t maxOff = (rr2Off > rr3Off) ? rr2Off : rr3Off; // the larger (later) of the two offsets
+		if (codePos - maxOff > 40 * 4) { // only when more than 160 bytes (40 four-byte A64 instructions) separate it from codePos
+			const uint32_t datasetMask = ((RandomX_CurrentConfig.Log2_DatasetBaseSize - 7) << 10); // shifted into bits 15:10, which looks like the imms field of a logical immediate - confirm
+			emit32(ARMV8A::EOR32 | 20 | (IntRegMap[config.readReg2] << 5) | (IntRegMap[config.readReg3] << 16), code, codePos); // eor w20, readReg2, readReg3 (same word as the spMix2 update that follows the block)
+			emit32(ARMV8A::EOR32 | 19 | (9 << 5) | (20 << 16), code, codePos); // eor w19, w9, w20
+			emit32(0x121A0000 | 19 | (19 << 5) | datasetMask, code, codePos); // NOTE(review): decodes as and w19, w19, #imm (32-bit AND immediate, immr = 26) - confirm
+			emit32(ARMV8A::ADD | 19 | (19 << 5) | (1 << 16), code, codePos); // add 19, 19, 1 - presumably x19 += x1, like "add x20, x20, x1" in the static code
+			emit32(0xF9800000 | 3 | (19 << 5), code, codePos); // NOTE(review): decodes as prfm pldl2strm, [x19] (PRFM immediate, prfop = 3) - confirm
+		}
+	}
+
 	// Update spMix2
 	// eor w20, config.readReg2, config.readReg3
 	emit32(ARMV8A::EOR32 | 20 | (IntRegMap[config.readReg2] << 5) | (IntRegMap[config.readReg3] << 16), code, codePos);
diff --git a/src/crypto/randomx/jit_compiler_a64_static.S b/src/crypto/randomx/jit_compiler_a64_static.S
index 2a7859cb4..3a2435211 100644
--- a/src/crypto/randomx/jit_compiler_a64_static.S
+++ b/src/crypto/randomx/jit_compiler_a64_static.S
@@ -303,7 +303,7 @@ DECL(randomx_program_aarch64_cacheline_align_mask1):
 	add x20, x20, x1
 
 	# Prefetch dataset data
-	prfm pldl2strm, [x20]
+	prfm pldl1strm, [x20] // prefetch hint now targets L1 instead of L2 (streaming policy unchanged)
 
 DECL(randomx_program_aarch64_cacheline_align_mask2):
 	# Actual mask will be inserted by JIT compiler
@@ -312,16 +312,16 @@ DECL(randomx_program_aarch64_cacheline_align_mask2):
 
 DECL(randomx_program_aarch64_xor_with_dataset_line):
 	# xor integer registers with dataset data
-	ldp x20, x19, [x10]
+	ldnp x20, x19, [x10] // ldnp = load pair with non-temporal hint; same registers and offset as the ldp it replaces
 	eor x4, x4, x20
 	eor x5, x5, x19
-	ldp x20, x19, [x10, 16]
+	ldnp x20, x19, [x10, 16]
 	eor x6, x6, x20
 	eor x7, x7, x19
-	ldp x20, x19, [x10, 32]
+	ldnp x20, x19, [x10, 32]
 	eor x12, x12, x20
 	eor x13, x13, x19
-	ldp x20, x19, [x10, 48]
+	ldnp x20, x19, [x10, 48]
 	eor x14, x14, x20
 	eor x15, x15, x19