1
0
mirror of https://github.com/xmrig/xmrig.git synced 2026-06-20 03:24:37 -04:00

Compare commits

...

7 Commits

Author SHA1 Message Date
xmrig
3fb851d91d Merge pull request #3820 from aa022/dev
ARM64 RandomX JIT: dataset prefetch + non-temporal loads (+~8% on M4 base)
2026-05-26 00:21:52 +07:00
aa022
9ac373fea5 ARM64 RandomX JIT: drop early dataset prefetch 2026-05-25 18:05:50 +02:00
aa022
978720462d ARM64 RandomX JIT: dataset prefetch + non-temporal loads
Two Apple-silicon-targeted tweaks to the aarch64 RandomX JIT:

- Early dataset prefetch: when readReg2/readReg3 are finalized well before
  the end of the program body, emit the next iteration's dataset-line prefetch
  early to hide more DRAM latency on the serial scalar chain.
- Non-temporal dataset loads: each 64-byte dataset line is read once and never
  reused, so ldp -> ldnp avoids evicting the hot scratchpad, and the prefetch
  hint moves pldl2strm -> pldl1strm to match the longer lead time.

Measured ~8% hashrate gain on Apple M4 base over dev (7eadfdc9).
2026-05-25 13:46:41 +02:00
xmrig
7eadfdc9c6 Merge pull request #3816 from SChernykh/dev
ARM64 RandomX optimizations
2026-05-18 02:12:24 +07:00
SChernykh
720325c40f RandomX optimizations:
- ARM64: optimized emitMovImmediate/emitMemLoad
- ARM64: disabled 32-bit literal preloading (it was slower)
- Android and Linux: added MADV_COLLAPSE support to memory allocation
2026-05-17 21:04:02 +02:00
xmrig
27f116e2da Merge pull request #3815 from SChernykh/dev
ARM64 RandomX JIT optimizations
2026-05-15 22:42:02 +07:00
SChernykh
f8dd210531 ARM64 RandomX JIT:
- optimized F/E register loading
- aligned asm code
- optimized emitAddImmediate for small negative values
- v2: optimized CFROUND
- v2: optimized AES in the main loop
2026-05-15 17:30:05 +02:00
4 changed files with 121 additions and 89 deletions

View File

@@ -252,5 +252,5 @@ if (WIN32)
endif()
if (CMAKE_CXX_COMPILER_ID MATCHES Clang AND CMAKE_BUILD_TYPE STREQUAL Release AND NOT CMAKE_GENERATOR STREQUAL Xcode)
add_custom_command(TARGET ${PROJECT_NAME} POST_BUILD COMMAND ${CMAKE_STRIP} "$<TARGET_FILE:${CMAKE_PROJECT_NAME}>")
add_custom_command(TARGET ${CMAKE_PROJECT_NAME} POST_BUILD COMMAND ${CMAKE_STRIP} "$<TARGET_FILE:${CMAKE_PROJECT_NAME}>")
endif()

View File

@@ -74,6 +74,11 @@
#endif
#ifndef MADV_COLLAPSE
# define MADV_COLLAPSE 25
#endif
#if defined(XMRIG_OS_LINUX) || (!defined(XMRIG_OS_APPLE) && !defined(XMRIG_OS_FREEBSD))
static inline int hugePagesFlag(size_t size)
{
@@ -278,8 +283,9 @@ bool xmrig::VirtualMemory::allocateOneGbPagesMemory()
bool xmrig::VirtualMemory::adviseLargePages(void *p, size_t size)
{
# ifdef XMRIG_OS_LINUX
return (madvise(p, size, MADV_HUGEPAGE) == 0);
# if defined(XMRIG_OS_ANDROID) || defined(XMRIG_OS_LINUX)
// MADV_COLLAPSE works even if /sys/kernel/mm/transparent_hugepage/enabled is set to "never", but only on Linux 6.1+
return (madvise(p, size, MADV_COLLAPSE) == 0) || (madvise(p, size, MADV_HUGEPAGE) == 0);
# else
return false;
# endif

View File

@@ -64,6 +64,8 @@ constexpr uint32_t MOVN = 0x92800000;
constexpr uint32_t MOVK = 0xF2800000;
constexpr uint32_t ADD_IMM_LO = 0x91000000;
constexpr uint32_t ADD_IMM_HI = 0x91400000;
constexpr uint32_t SUB_IMM_LO = 0xD1000000;
constexpr uint32_t SUB_IMM_HI = 0xD1400000;
constexpr uint32_t LDR_LITERAL = 0x58000000;
constexpr uint32_t ROR = 0x9AC02C00;
constexpr uint32_t ROR_IMM = 0x93C00000;
@@ -139,7 +141,7 @@ void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& con
codePos = PrologueSize;
literalPos = ImulRcpLiteralsEnd;
num32bitLiterals = 0;
num32bitLiterals = 64; // effectively disabled because it's slower than plain movn/movz+movk
for (uint32_t i = 0; i < RegistersCount; ++i)
reg_changed_offset[i] = codePos;
@@ -235,7 +237,7 @@ void JitCompilerA64::generateProgramLight(Program& program, ProgramConfiguration
codePos = PrologueSize;
literalPos = ImulRcpLiteralsEnd;
num32bitLiterals = 0;
num32bitLiterals = 64; // effectively disabled because it's slower than plain movn/movz+movk
for (uint32_t i = 0; i < RegistersCount; ++i)
reg_changed_offset[i] = codePos;
@@ -486,13 +488,31 @@ void JitCompilerA64::emitMovImmediate(uint32_t dst, uint32_t imm, uint8_t* code,
{
uint32_t k = codePos;
// 196606 different values can be encoded with a single instruction, the rest requires smov/umov load, or movn/movz+movk pair
if (imm < (1 << 16))
{
// Sign-extended 64-bit value: 0x000000000000xxxx
// movz tmp_reg, imm32 (16 low bits)
emit32(ARMV8A::MOVZ | dst | (imm << 5), code, k);
}
else if ((imm >> 16) == 0xFFFF) {
// Sign-extended 64-bit value: 0xFFFFFFFFFFFFxxxx
// movn tmp_reg, ~imm32 (16 low bits)
emit32(ARMV8A::MOVN | dst | ((~imm & 0xFFFF) << 5), code, k);
}
else if (((imm & 0xFFFF) == 0xFFFF) && (static_cast<int32_t>(imm) < 0)) {
// Sign-extended 64-bit value: 0xFFFFFFFFxxxxFFFF
// movn tmp_reg, ~imm32 (16 high bits)
emit32(ARMV8A::MOVN | dst | (1 << 21) | ((~imm >> 16) << 5), code, k);
}
else if (((imm & 0xFFFF) == 0) && (static_cast<int32_t>(imm) >= 0)) {
// Sign-extended 64-bit value: 0x00000000xxxx0000
// movz tmp_reg, imm32 (16 high bits)
emit32(ARMV8A::MOVZ | dst | (1 << 21) | ((imm >> 16) << 5), code, k);
}
else
{
// Full sign-extended 64-bit value: 0x00000000xxxxxxxx or 0xFFFFFFFFxxxxxxxx
if (num32bitLiterals < 64)
{
if (static_cast<int32_t>(imm) < 0)
@@ -534,23 +554,40 @@ void JitCompilerA64::emitAddImmediate(uint32_t dst, uint32_t src, uint32_t imm,
{
uint32_t k = codePos;
if (imm < (1 << 24))
{
const uint32_t imm_lo = imm & ((1 << 12) - 1);
const uint32_t imm_hi = imm >> 12;
if (imm == 0) {
if (dst != src) {
emit32(ARMV8A::MOV_REG | dst | (src << 16), code, k);
}
if (imm_lo && imm_hi)
{
emit32(ARMV8A::ADD_IMM_LO | dst | (src << 5) | (imm_lo << 10), code, k);
emit32(ARMV8A::ADD_IMM_HI | dst | (dst << 5) | (imm_hi << 10), code, k);
}
else if (imm_lo)
{
emit32(ARMV8A::ADD_IMM_LO | dst | (src << 5) | (imm_lo << 10), code, k);
}
else
{
emit32(ARMV8A::ADD_IMM_HI | dst | (src << 5) | (imm_hi << 10), code, k);
codePos = k;
return;
}
const int32_t simm = static_cast<int32_t>(imm);
uint32_t mag, opLo, opHi;
if (simm > 0) {
mag = imm;
opLo = ARMV8A::ADD_IMM_LO;
opHi = ARMV8A::ADD_IMM_HI;
} else {
mag = static_cast<uint32_t>(-static_cast<int64_t>(simm));
opLo = ARMV8A::SUB_IMM_LO;
opHi = ARMV8A::SUB_IMM_HI;
}
if (mag < (1u << 24)) {
const uint32_t lo = mag & ((1u << 12) - 1);
const uint32_t hi = mag >> 12;
if (lo && hi) {
emit32(opLo | dst | (src << 5) | (lo << 10), code, k);
emit32(opHi | dst | (dst << 5) | (hi << 10), code, k);
} else if (lo) {
emit32(opLo | dst | (src << 5) | (lo << 10), code, k);
} else {
emit32(opHi | dst | (src << 5) | (hi << 10), code, k);
}
}
else
@@ -592,18 +629,17 @@ void JitCompilerA64::emitMemLoad(uint32_t dst, uint32_t src, Instruction& instr,
else
{
imm = (imm & ScratchpadL3Mask) >> 3;
if (imm)
if (imm < 4096) {
// ldr tmp_reg, [x2, #imm*8]
emit32(0xf9400040 | tmp_reg | (imm << 10), code, k);
}
else
{
emitMovImmediate(tmp_reg, imm, code, k);
// ldr tmp_reg, [x2, tmp_reg, lsl 3]
emit32(0xf8607840 | tmp_reg | (tmp_reg << 16), code, k);
}
else
{
// ldr tmp_reg, [x2]
emit32(0xf9400040 | tmp_reg, code, k);
}
}
codePos = k;
@@ -744,7 +780,7 @@ void JitCompilerA64::h_IMUL_M(Instruction& instr, uint32_t& codePos)
constexpr uint32_t tmp_reg = 20;
emitMemLoad<tmp_reg>(dst, src, instr, code, k);
// sub dst, dst, tmp_reg
// mul dst, dst, tmp_reg
emit32(ARMV8A::MUL | dst | (dst << 5) | (tmp_reg << 16), code, k);
reg_changed_offset[instr.dst] = k;
@@ -1109,17 +1145,19 @@ void JitCompilerA64::h_CFROUND(Instruction& instr, uint32_t& codePos)
constexpr uint32_t tmp_reg = 20;
constexpr uint32_t fpcr_tmp_reg = 8;
// ror tmp_reg, src, imm
emit32(ARMV8A::ROR_IMM | tmp_reg | (src << 5) | ((instr.getImm32() & 63) << 10) | (src << 16), code, k);
if (RandomX_CurrentConfig.Tweak_V2_CFROUND) {
// tst tmp_reg, 60
emit32(0xF27E0E9F, code, k);
const uint32_t immr = (62 - instr.getImm32()) & 63;
// tst src, ROR(60, -(instr.getImm32() & 63))
emit32(0xF2400C1F | (immr << 16) | (src << 5), code, k);
// bne next
emit32(0x54000081, code, k);
emit32(0x540000A1, code, k);
}
// ror tmp_reg, src, imm
emit32(ARMV8A::ROR_IMM | tmp_reg | (src << 5) | ((instr.getImm32() & 63) << 10) | (src << 16), code, k);
// bfi fpcr_tmp_reg, tmp_reg, 40, 2
emit32(0xB3580400 | fpcr_tmp_reg | (tmp_reg << 5), code, k);

View File

@@ -187,6 +187,7 @@ DECL(randomx_program_aarch64):
ldp x27, x28, [x30, -32] // literal_x27
ldp x29, x30, [x30, -16] // literal_x29
.balign 64
DECL(randomx_program_aarch64_main_loop):
# spAddr0 = spMix1 & ScratchpadL3Mask64;
# spAddr1 = (spMix1 >> 32) & ScratchpadL3Mask64;
@@ -215,30 +216,28 @@ DECL(randomx_program_aarch64_main_loop):
eor x14, x14, x20
eor x15, x15, x19
# Load group F registers (spAddr1)
ldr q17, [x17]
sxtl v16.2d, v17.2s
scvtf v16.2d, v16.2d
sxtl2 v17.2d, v17.4s
scvtf v17.2d, v17.2d
# Load group F/E registers (spAddr1)
ldp q17, q19, [x17]
ldp q21, q23, [x17, 32]
ldr q19, [x17, 16]
sxtl v16.2d, v17.2s
sxtl2 v17.2d, v17.4s
sxtl v18.2d, v19.2s
scvtf v18.2d, v18.2d
sxtl2 v19.2d, v19.4s
scvtf v16.2d, v16.2d
scvtf v17.2d, v17.2d
scvtf v18.2d, v18.2d
scvtf v19.2d, v19.2d
# Load group E registers (spAddr1)
ldr q21, [x17, 32]
sxtl v20.2d, v21.2s
scvtf v20.2d, v20.2d
sxtl2 v21.2d, v21.4s
scvtf v21.2d, v21.2d
ldr q23, [x17, 48]
sxtl v22.2d, v23.2s
scvtf v22.2d, v22.2d
sxtl2 v23.2d, v23.4s
scvtf v20.2d, v20.2d
scvtf v21.2d, v21.2d
scvtf v22.2d, v22.2d
scvtf v23.2d, v23.2d
bif v20.16b, v30.16b, v29.16b
@@ -283,6 +282,7 @@ literal_v13: .fill 2,8,0
literal_v14: .fill 2,8,0
literal_v15: .fill 2,8,0
.balign 64
DECL(randomx_program_aarch64_vm_instructions_end):
# Calculate dataset pointer for dataset read
# Do it here to break false dependency from readReg2 and readReg3 (see next line)
@@ -303,7 +303,7 @@ DECL(randomx_program_aarch64_cacheline_align_mask1):
add x20, x20, x1
# Prefetch dataset data
prfm pldl2strm, [x20]
prfm pldl1strm, [x20]
DECL(randomx_program_aarch64_cacheline_align_mask2):
# Actual mask will be inserted by JIT compiler
@@ -312,16 +312,16 @@ DECL(randomx_program_aarch64_cacheline_align_mask2):
DECL(randomx_program_aarch64_xor_with_dataset_line):
# xor integer registers with dataset data
ldp x20, x19, [x10]
ldnp x20, x19, [x10]
eor x4, x4, x20
eor x5, x5, x19
ldp x20, x19, [x10, 16]
ldnp x20, x19, [x10, 16]
eor x6, x6, x20
eor x7, x7, x19
ldp x20, x19, [x10, 32]
ldnp x20, x19, [x10, 32]
eor x12, x12, x20
eor x13, x13, x19
ldp x20, x19, [x10, 48]
ldnp x20, x19, [x10, 48]
eor x14, x14, x20
eor x15, x15, x19
@@ -345,15 +345,18 @@ DECL(randomx_program_aarch64_v2_FE_mix):
# f0 = aesenc(f0, e0), f1 = aesdec(f1, e0), f2 = aesenc(f2, e0), f3 = aesdec(f3, e0)
aese v16.16b, v28.16b
aesd v17.16b, v28.16b
aese v18.16b, v28.16b
aesd v19.16b, v28.16b
# Quote from ARM optimization guides for basically all ARM Cortex CPUs starting from Cortex-A53:
#
# "pairs of dependent AESE/AESMC and AESD/AESIMC instructions are higher performance
# when they are adjacent in the program code and both instructions use the same
# destination register since they are fused"
#
# Same applies to all Apple silicon CPUs
aesmc v16.16b, v16.16b
aesimc v17.16b, v17.16b
aesmc v18.16b, v18.16b
aesimc v19.16b, v19.16b
aese v16.16b, v28.16b; aesmc v16.16b, v16.16b
aesd v17.16b, v28.16b; aesimc v17.16b, v17.16b
aese v18.16b, v28.16b; aesmc v18.16b, v18.16b
aesd v19.16b, v28.16b; aesimc v19.16b, v19.16b
eor v16.16b, v16.16b, v20.16b
eor v17.16b, v17.16b, v20.16b
@@ -362,15 +365,10 @@ DECL(randomx_program_aarch64_v2_FE_mix):
# f0 = aesenc(f0, e1), f1 = aesdec(f1, e1), f2 = aesenc(f2, e1), f3 = aesdec(f3, e1)
aese v16.16b, v28.16b
aesd v17.16b, v28.16b
aese v18.16b, v28.16b
aesd v19.16b, v28.16b
aesmc v16.16b, v16.16b
aesimc v17.16b, v17.16b
aesmc v18.16b, v18.16b
aesimc v19.16b, v19.16b
aese v16.16b, v28.16b; aesmc v16.16b, v16.16b
aesd v17.16b, v28.16b; aesimc v17.16b, v17.16b
aese v18.16b, v28.16b; aesmc v18.16b, v18.16b
aesd v19.16b, v28.16b; aesimc v19.16b, v19.16b
eor v16.16b, v16.16b, v21.16b
eor v17.16b, v17.16b, v21.16b
@@ -379,15 +377,10 @@ DECL(randomx_program_aarch64_v2_FE_mix):
# f0 = aesenc(f0, e2), f1 = aesdec(f1, e2), f2 = aesenc(f2, e2), f3 = aesdec(f3, e2)
aese v16.16b, v28.16b
aesd v17.16b, v28.16b
aese v18.16b, v28.16b
aesd v19.16b, v28.16b
aesmc v16.16b, v16.16b
aesimc v17.16b, v17.16b
aesmc v18.16b, v18.16b
aesimc v19.16b, v19.16b
aese v16.16b, v28.16b; aesmc v16.16b, v16.16b
aesd v17.16b, v28.16b; aesimc v17.16b, v17.16b
aese v18.16b, v28.16b; aesmc v18.16b, v18.16b
aesd v19.16b, v28.16b; aesimc v19.16b, v19.16b
eor v16.16b, v16.16b, v22.16b
eor v17.16b, v17.16b, v22.16b
@@ -396,15 +389,10 @@ DECL(randomx_program_aarch64_v2_FE_mix):
# f0 = aesenc(f0, e3), f1 = aesdec(f1, e3), f2 = aesenc(f2, e3), f3 = aesdec(f3, e3)
aese v16.16b, v28.16b
aesd v17.16b, v28.16b
aese v18.16b, v28.16b
aesd v19.16b, v28.16b
aesmc v16.16b, v16.16b
aesimc v17.16b, v17.16b
aesmc v18.16b, v18.16b
aesimc v19.16b, v19.16b
aese v16.16b, v28.16b; aesmc v16.16b, v16.16b
aesd v17.16b, v28.16b; aesimc v17.16b, v17.16b
aese v18.16b, v28.16b; aesmc v18.16b, v18.16b
aesd v19.16b, v28.16b; aesimc v19.16b, v19.16b
eor v16.16b, v16.16b, v23.16b
eor v17.16b, v17.16b, v23.16b