From 978720462d4ea84a1d07795514bbc9996bf07092 Mon Sep 17 00:00:00 2001 From: aa022 <117016786+aa022@users.noreply.github.com> Date: Mon, 25 May 2026 12:12:13 +0200 Subject: [PATCH 1/2] ARM64 RandomX JIT: dataset prefetch + non-temporal loads Two Apple-silicon-targeted tweaks to the aarch64 RandomX JIT: - Early dataset prefetch: when readReg2/readReg3 are finalized well before the end of the program body, emit the next iteration's dataset-line prefetch early to hide more DRAM latency on the serial scalar chain. - Non-temporal dataset loads: each 64-byte dataset line is read once and never reused, so ldp -> ldnp avoids evicting the hot scratchpad, and the prefetch hint moves pldl2strm -> pldl1strm to match the longer lead time. Measured ~8% hashrate gain on Apple M4 base over dev (7eadfdc9). --- src/crypto/randomx/jit_compiler_a64.cpp | 14 ++++++++++++++ src/crypto/randomx/jit_compiler_a64_static.S | 10 +++++----- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/src/crypto/randomx/jit_compiler_a64.cpp b/src/crypto/randomx/jit_compiler_a64.cpp index bae695cf1..dc693d71f 100644 --- a/src/crypto/randomx/jit_compiler_a64.cpp +++ b/src/crypto/randomx/jit_compiler_a64.cpp @@ -152,6 +152,20 @@ void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& con (this->*engine[instr.opcode])(instr, codePos); } + { + const uint32_t rr2Off = reg_changed_offset[config.readReg2]; + const uint32_t rr3Off = reg_changed_offset[config.readReg3]; + const uint32_t maxOff = (rr2Off > rr3Off) ? rr2Off : rr3Off; + if (codePos - maxOff > 40 * 4) { + const uint32_t datasetMask = ((RandomX_CurrentConfig.Log2_DatasetBaseSize - 7) << 10); + emit32(ARMV8A::EOR32 | 20 | (IntRegMap[config.readReg2] << 5) | (IntRegMap[config.readReg3] << 16), code, codePos); + emit32(ARMV8A::EOR32 | 19 | (9 << 5) | (20 << 16), code, codePos); + emit32(0x121A0000 | 19 | (19 << 5) | datasetMask, code, codePos); + emit32(ARMV8A::ADD | 19 | (19 << 5) | (1 << 16), code, codePos); + emit32(0xF9800000 | 3 | (19 << 5), code, codePos); + } + } + // Update spMix2 // eor w20, config.readReg2, config.readReg3 emit32(ARMV8A::EOR32 | 20 | (IntRegMap[config.readReg2] << 5) | (IntRegMap[config.readReg3] << 16), code, codePos); diff --git a/src/crypto/randomx/jit_compiler_a64_static.S b/src/crypto/randomx/jit_compiler_a64_static.S index 2a7859cb4..3a2435211 100644 --- a/src/crypto/randomx/jit_compiler_a64_static.S +++ b/src/crypto/randomx/jit_compiler_a64_static.S @@ -303,7 +303,7 @@ DECL(randomx_program_aarch64_cacheline_align_mask1): add x20, x20, x1 # Prefetch dataset data - prfm pldl2strm, [x20] + prfm pldl1strm, [x20] DECL(randomx_program_aarch64_cacheline_align_mask2): # Actual mask will be inserted by JIT compiler @@ -312,16 +312,16 @@ DECL(randomx_program_aarch64_cacheline_align_mask2): DECL(randomx_program_aarch64_xor_with_dataset_line): # xor integer registers with dataset data - ldp x20, x19, [x10] + ldnp x20, x19, [x10] eor x4, x4, x20 eor x5, x5, x19 - ldp x20, x19, [x10, 16] + ldnp x20, x19, [x10, 16] eor x6, x6, x20 eor x7, x7, x19 - ldp x20, x19, [x10, 32] + ldnp x20, x19, [x10, 32] eor x12, x12, x20 eor x13, x13, x19 - ldp x20, x19, [x10, 48] + ldnp x20, x19, [x10, 48] eor x14, x14, x20 eor x15, x15, x19 From 9ac373fea5182b8291f1e61a29de21be358f3536 Mon Sep 17 00:00:00 2001 From: aa022 <117016786+aa022@users.noreply.github.com> Date: Mon, 25 May 2026 18:05:50 +0200 Subject: [PATCH 2/2] ARM64 RandomX JIT: drop early dataset prefetch --- src/crypto/randomx/jit_compiler_a64.cpp | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/src/crypto/randomx/jit_compiler_a64.cpp b/src/crypto/randomx/jit_compiler_a64.cpp index dc693d71f..bae695cf1 100644 --- a/src/crypto/randomx/jit_compiler_a64.cpp +++ b/src/crypto/randomx/jit_compiler_a64.cpp @@ -152,20 +152,6 @@ void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& con (this->*engine[instr.opcode])(instr, codePos); } - { - const uint32_t rr2Off = reg_changed_offset[config.readReg2]; - const uint32_t rr3Off = reg_changed_offset[config.readReg3]; - const uint32_t maxOff = (rr2Off > rr3Off) ? rr2Off : rr3Off; - if (codePos - maxOff > 40 * 4) { - const uint32_t datasetMask = ((RandomX_CurrentConfig.Log2_DatasetBaseSize - 7) << 10); - emit32(ARMV8A::EOR32 | 20 | (IntRegMap[config.readReg2] << 5) | (IntRegMap[config.readReg3] << 16), code, codePos); - emit32(ARMV8A::EOR32 | 19 | (9 << 5) | (20 << 16), code, codePos); - emit32(0x121A0000 | 19 | (19 << 5) | datasetMask, code, codePos); - emit32(ARMV8A::ADD | 19 | (19 << 5) | (1 << 16), code, codePos); - emit32(0xF9800000 | 3 | (19 << 5), code, codePos); - } - } - // Update spMix2 // eor w20, config.readReg2, config.readReg3 emit32(ARMV8A::EOR32 | 20 | (IntRegMap[config.readReg2] << 5) | (IntRegMap[config.readReg3] << 16), code, codePos);