1
0
mirror of https://github.com/xmrig/xmrig.git synced 2026-06-28 13:42:43 -04:00

Compare commits

...

5 Commits

Author SHA1 Message Date
xmrig 6dc014f71f Merge pull request #3826 from SChernykh/dev
ISUB_R fix for ARM/RISC-V JIT
2026-06-27 20:33:57 +07:00
SChernykh 906fd4693b ISUB_R fix for ARM/RISC-V JIT 2026-06-27 13:22:48 +02:00
xmrig 3fb851d91d Merge pull request #3820 from aa022/dev
ARM64 RandomX JIT: dataset prefetch + non-temporal loads (+~8% on M4 base)
2026-05-26 00:21:52 +07:00
aa022 9ac373fea5 ARM64 RandomX JIT: drop early dataset prefetch 2026-05-25 18:05:50 +02:00
aa022 978720462d ARM64 RandomX JIT: dataset prefetch + non-temporal loads
Two Apple-silicon-targeted tweaks to the aarch64 RandomX JIT:

- Early dataset prefetch: when readReg2/readReg3 are finalized well before
  the end of the program body, emit the next iteration's dataset-line prefetch
  early to hide more DRAM latency on the serial scalar chain.
- Non-temporal dataset loads: each 64-byte dataset line is read once and never
  reused, so ldp -> ldnp avoids evicting the hot scratchpad, and the prefetch
  hint moves pldl2strm -> pldl1strm to match the longer lead time.

Measured ~8% hashrate gain on Apple M4 base over dev (7eadfdc9).
2026-05-25 13:46:41 +02:00
5 changed files with 32 additions and 10 deletions
+9
View File
@@ -726,8 +726,17 @@ void JitCompilerA64::h_ISUB_R(Instruction& instr, uint32_t& codePos)
}
else
{
const uint32_t imm = instr.getImm32();
if (imm == 0x80000000ul) {
constexpr uint32_t tmp_reg = 20;
emit32(ARMV8A::MOVZ | tmp_reg | (1u << 21) | (0x8000u << 5), code, k);
emit32(ARMV8A::ADD | dst | (dst << 5) | (tmp_reg << 16), code, k);
}
else {
emitAddImmediate(dst, dst, -instr.getImm32(), code, k);
}
}
reg_changed_offset[instr.dst] = k;
codePos = k;
+5 -5
View File
@@ -303,7 +303,7 @@ DECL(randomx_program_aarch64_cacheline_align_mask1):
add x20, x20, x1
# Prefetch dataset data
prfm pldl2strm, [x20]
prfm pldl1strm, [x20]
DECL(randomx_program_aarch64_cacheline_align_mask2):
# Actual mask will be inserted by JIT compiler
@@ -312,16 +312,16 @@ DECL(randomx_program_aarch64_cacheline_align_mask2):
DECL(randomx_program_aarch64_xor_with_dataset_line):
# xor integer registers with dataset data
ldp x20, x19, [x10]
ldnp x20, x19, [x10]
eor x4, x4, x20
eor x5, x5, x19
ldp x20, x19, [x10, 16]
ldnp x20, x19, [x10, 16]
eor x6, x6, x20
eor x7, x7, x19
ldp x20, x19, [x10, 32]
ldnp x20, x19, [x10, 32]
eor x12, x12, x20
eor x13, x13, x19
ldp x20, x19, [x10, 48]
ldnp x20, x19, [x10, 48]
eor x14, x14, x20
eor x15, x15, x19
+8 -1
View File
@@ -814,11 +814,18 @@ namespace randomx {
state.emit(rvi(rv64::SUB, regR(isn.dst), regR(isn.dst), regR(isn.src)));
}
else {
int32_t imm = unsigned32ToSigned2sCompl(-isn.getImm32()); //convert to add
const uint32_t uimm = isn.getImm32();
if (uimm == 0x80000000ul) {
state.emit(rv64::LUI | (0x80000 << 12) | rvrd(Tmp1Reg));
state.emit(rvi(rv64::SUB, regR(isn.dst), regR(isn.dst), Tmp1Reg));
}
else {
int32_t imm = unsigned32ToSigned2sCompl(-uimm); //convert to add
//x{dst} = x{dst} + {-imm}
emitImm32(state, imm, regR(isn.dst), regR(isn.dst), Tmp1Reg);
}
}
}
void JitCompilerRV64::v1_ISUB_M(HANDLER_ARGS) {
state.registerUsage[isn.dst] = i;
@@ -444,6 +444,12 @@ void* generateProgramVectorRV64(uint8_t* buf, Program& prog, ProgramConfiguratio
// sub x20 + dst, x20 + dst, x20 + src
emit32(0x414A0A33 + (dst << 7) + (dst << 15) + (src << 20));
}
else if (imm == 0x80000000U) {
// lui x5, 0x80000 (x5 = 0xFFFFFFFF80000000: upper immediate, sign-extended to 64 bits)
emit32(0x800002B7);
// sub x20 + dst, x20 + dst, x5
emit32(0x405A0A33 + (dst << 7) + (dst << 15));
}
else {
imm_to_x5(-imm, p);
// c.add x20 + dst, x5
+1 -1
View File
@@ -163,7 +163,7 @@ extern RandomX_ConfigurationGraft RandomX_GraftConfig;
extern RandomX_ConfigurationSafex RandomX_SafexConfig;
extern RandomX_ConfigurationYada RandomX_YadaConfig;
extern RandomX_ConfigurationBase RandomX_CurrentConfig;
alignas(64) extern RandomX_ConfigurationBase RandomX_CurrentConfig;
template<typename T>
void randomx_apply_config(const T& config)