1
0
mirror of https://github.com/xmrig/xmrig.git synced 2026-06-18 10:22:39 -04:00

ARM64 RandomX JIT: dataset prefetch + non-temporal loads

Two Apple-silicon-targeted tweaks to the aarch64 RandomX JIT:

- Early dataset prefetch: when readReg2/readReg3 are finalized well before
  the end of the program body, emit the next iteration's dataset-line prefetch
  early to hide more DRAM latency on the serial scalar chain.
- Non-temporal dataset loads: each 64-byte dataset line is read once and never
  reused, so replacing ldp with ldnp avoids evicting the hot scratchpad, and the
  prefetch hint changes from pldl2strm to pldl1strm to match the longer lead time.

Measured ~8% hashrate gain on a base-model Apple M4, relative to dev (7eadfdc9).
This commit is contained in:
aa022
2026-05-25 12:12:13 +02:00
parent 7eadfdc9c6
commit 978720462d
2 changed files with 19 additions and 5 deletions

View File

@@ -152,6 +152,20 @@ void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& con
(this->*engine[instr.opcode])(instr, codePos); (this->*engine[instr.opcode])(instr, codePos);
} }
{
const uint32_t rr2Off = reg_changed_offset[config.readReg2];
const uint32_t rr3Off = reg_changed_offset[config.readReg3];
const uint32_t maxOff = (rr2Off > rr3Off) ? rr2Off : rr3Off;
if (codePos - maxOff > 40 * 4) {
const uint32_t datasetMask = ((RandomX_CurrentConfig.Log2_DatasetBaseSize - 7) << 10);
emit32(ARMV8A::EOR32 | 20 | (IntRegMap[config.readReg2] << 5) | (IntRegMap[config.readReg3] << 16), code, codePos);
emit32(ARMV8A::EOR32 | 19 | (9 << 5) | (20 << 16), code, codePos);
emit32(0x121A0000 | 19 | (19 << 5) | datasetMask, code, codePos);
emit32(ARMV8A::ADD | 19 | (19 << 5) | (1 << 16), code, codePos);
emit32(0xF9800000 | 3 | (19 << 5), code, codePos);
}
}
// Update spMix2 // Update spMix2
// eor w20, config.readReg2, config.readReg3 // eor w20, config.readReg2, config.readReg3
emit32(ARMV8A::EOR32 | 20 | (IntRegMap[config.readReg2] << 5) | (IntRegMap[config.readReg3] << 16), code, codePos); emit32(ARMV8A::EOR32 | 20 | (IntRegMap[config.readReg2] << 5) | (IntRegMap[config.readReg3] << 16), code, codePos);

View File

@@ -303,7 +303,7 @@ DECL(randomx_program_aarch64_cacheline_align_mask1):
add x20, x20, x1 add x20, x20, x1
# Prefetch dataset data # Prefetch dataset data
prfm pldl2strm, [x20] prfm pldl1strm, [x20]
DECL(randomx_program_aarch64_cacheline_align_mask2): DECL(randomx_program_aarch64_cacheline_align_mask2):
# Actual mask will be inserted by JIT compiler # Actual mask will be inserted by JIT compiler
@@ -312,16 +312,16 @@ DECL(randomx_program_aarch64_cacheline_align_mask2):
DECL(randomx_program_aarch64_xor_with_dataset_line): DECL(randomx_program_aarch64_xor_with_dataset_line):
# xor integer registers with dataset data # xor integer registers with dataset data
ldp x20, x19, [x10] ldnp x20, x19, [x10]
eor x4, x4, x20 eor x4, x4, x20
eor x5, x5, x19 eor x5, x5, x19
ldp x20, x19, [x10, 16] ldnp x20, x19, [x10, 16]
eor x6, x6, x20 eor x6, x6, x20
eor x7, x7, x19 eor x7, x7, x19
ldp x20, x19, [x10, 32] ldnp x20, x19, [x10, 32]
eor x12, x12, x20 eor x12, x12, x20
eor x13, x13, x19 eor x13, x13, x19
ldp x20, x19, [x10, 48] ldnp x20, x19, [x10, 48]
eor x14, x14, x20 eor x14, x14, x20
eor x15, x15, x19 eor x15, x15, x19