1
0
mirror of https://github.com/xmrig/xmrig.git synced 2026-06-18 10:22:39 -04:00

Merge pull request #3815 from SChernykh/dev

ARM64 RandomX JIT optimizations
This commit is contained in:
xmrig
2026-05-15 22:42:02 +07:00
committed by GitHub
3 changed files with 83 additions and 74 deletions

View File

@@ -252,5 +252,5 @@ if (WIN32)
endif() endif()
if (CMAKE_CXX_COMPILER_ID MATCHES Clang AND CMAKE_BUILD_TYPE STREQUAL Release AND NOT CMAKE_GENERATOR STREQUAL Xcode) if (CMAKE_CXX_COMPILER_ID MATCHES Clang AND CMAKE_BUILD_TYPE STREQUAL Release AND NOT CMAKE_GENERATOR STREQUAL Xcode)
add_custom_command(TARGET ${PROJECT_NAME} POST_BUILD COMMAND ${CMAKE_STRIP} "$<TARGET_FILE:${CMAKE_PROJECT_NAME}>") add_custom_command(TARGET ${CMAKE_PROJECT_NAME} POST_BUILD COMMAND ${CMAKE_STRIP} "$<TARGET_FILE:${CMAKE_PROJECT_NAME}>")
endif() endif()

View File

@@ -64,6 +64,8 @@ constexpr uint32_t MOVN = 0x92800000;
constexpr uint32_t MOVK = 0xF2800000; constexpr uint32_t MOVK = 0xF2800000;
constexpr uint32_t ADD_IMM_LO = 0x91000000; constexpr uint32_t ADD_IMM_LO = 0x91000000;
constexpr uint32_t ADD_IMM_HI = 0x91400000; constexpr uint32_t ADD_IMM_HI = 0x91400000;
constexpr uint32_t SUB_IMM_LO = 0xD1000000;
constexpr uint32_t SUB_IMM_HI = 0xD1400000;
constexpr uint32_t LDR_LITERAL = 0x58000000; constexpr uint32_t LDR_LITERAL = 0x58000000;
constexpr uint32_t ROR = 0x9AC02C00; constexpr uint32_t ROR = 0x9AC02C00;
constexpr uint32_t ROR_IMM = 0x93C00000; constexpr uint32_t ROR_IMM = 0x93C00000;
@@ -534,23 +536,40 @@ void JitCompilerA64::emitAddImmediate(uint32_t dst, uint32_t src, uint32_t imm,
{ {
uint32_t k = codePos; uint32_t k = codePos;
if (imm < (1 << 24)) if (imm == 0) {
{ if (dst != src) {
const uint32_t imm_lo = imm & ((1 << 12) - 1); emit32(ARMV8A::MOV_REG | dst | (src << 16), code, k);
const uint32_t imm_hi = imm >> 12; }
if (imm_lo && imm_hi) codePos = k;
{ return;
emit32(ARMV8A::ADD_IMM_LO | dst | (src << 5) | (imm_lo << 10), code, k);
emit32(ARMV8A::ADD_IMM_HI | dst | (dst << 5) | (imm_hi << 10), code, k);
} }
else if (imm_lo)
{ const int32_t simm = static_cast<int32_t>(imm);
emit32(ARMV8A::ADD_IMM_LO | dst | (src << 5) | (imm_lo << 10), code, k);
uint32_t mag, opLo, opHi;
if (simm > 0) {
mag = imm;
opLo = ARMV8A::ADD_IMM_LO;
opHi = ARMV8A::ADD_IMM_HI;
} else {
mag = static_cast<uint32_t>(-static_cast<int64_t>(simm));
opLo = ARMV8A::SUB_IMM_LO;
opHi = ARMV8A::SUB_IMM_HI;
} }
else
{ if (mag < (1u << 24)) {
emit32(ARMV8A::ADD_IMM_HI | dst | (src << 5) | (imm_hi << 10), code, k); const uint32_t lo = mag & ((1u << 12) - 1);
const uint32_t hi = mag >> 12;
if (lo && hi) {
emit32(opLo | dst | (src << 5) | (lo << 10), code, k);
emit32(opHi | dst | (dst << 5) | (hi << 10), code, k);
} else if (lo) {
emit32(opLo | dst | (src << 5) | (lo << 10), code, k);
} else {
emit32(opHi | dst | (src << 5) | (hi << 10), code, k);
} }
} }
else else
@@ -744,7 +763,7 @@ void JitCompilerA64::h_IMUL_M(Instruction& instr, uint32_t& codePos)
constexpr uint32_t tmp_reg = 20; constexpr uint32_t tmp_reg = 20;
emitMemLoad<tmp_reg>(dst, src, instr, code, k); emitMemLoad<tmp_reg>(dst, src, instr, code, k);
// sub dst, dst, tmp_reg // mul dst, dst, tmp_reg
emit32(ARMV8A::MUL | dst | (dst << 5) | (tmp_reg << 16), code, k); emit32(ARMV8A::MUL | dst | (dst << 5) | (tmp_reg << 16), code, k);
reg_changed_offset[instr.dst] = k; reg_changed_offset[instr.dst] = k;
@@ -1109,17 +1128,19 @@ void JitCompilerA64::h_CFROUND(Instruction& instr, uint32_t& codePos)
constexpr uint32_t tmp_reg = 20; constexpr uint32_t tmp_reg = 20;
constexpr uint32_t fpcr_tmp_reg = 8; constexpr uint32_t fpcr_tmp_reg = 8;
// ror tmp_reg, src, imm
emit32(ARMV8A::ROR_IMM | tmp_reg | (src << 5) | ((instr.getImm32() & 63) << 10) | (src << 16), code, k);
if (RandomX_CurrentConfig.Tweak_V2_CFROUND) { if (RandomX_CurrentConfig.Tweak_V2_CFROUND) {
// tst tmp_reg, 60 const uint32_t immr = (62 - instr.getImm32()) & 63;
emit32(0xF27E0E9F, code, k);
// tst src, ROR(60, -(instr.getImm32() & 63))
emit32(0xF2400C1F | (immr << 16) | (src << 5), code, k);
// bne next // bne next
emit32(0x54000081, code, k); emit32(0x540000A1, code, k);
} }
// ror tmp_reg, src, imm
emit32(ARMV8A::ROR_IMM | tmp_reg | (src << 5) | ((instr.getImm32() & 63) << 10) | (src << 16), code, k);
// bfi fpcr_tmp_reg, tmp_reg, 40, 2 // bfi fpcr_tmp_reg, tmp_reg, 40, 2
emit32(0xB3580400 | fpcr_tmp_reg | (tmp_reg << 5), code, k); emit32(0xB3580400 | fpcr_tmp_reg | (tmp_reg << 5), code, k);

View File

@@ -187,6 +187,7 @@ DECL(randomx_program_aarch64):
ldp x27, x28, [x30, -32] // literal_x27 ldp x27, x28, [x30, -32] // literal_x27
ldp x29, x30, [x30, -16] // literal_x29 ldp x29, x30, [x30, -16] // literal_x29
.balign 64
DECL(randomx_program_aarch64_main_loop): DECL(randomx_program_aarch64_main_loop):
# spAddr0 = spMix1 & ScratchpadL3Mask64; # spAddr0 = spMix1 & ScratchpadL3Mask64;
# spAddr1 = (spMix1 >> 32) & ScratchpadL3Mask64; # spAddr1 = (spMix1 >> 32) & ScratchpadL3Mask64;
@@ -215,30 +216,28 @@ DECL(randomx_program_aarch64_main_loop):
eor x14, x14, x20 eor x14, x14, x20
eor x15, x15, x19 eor x15, x15, x19
# Load group F registers (spAddr1) # Load group F/E registers (spAddr1)
ldr q17, [x17] ldp q17, q19, [x17]
sxtl v16.2d, v17.2s ldp q21, q23, [x17, 32]
scvtf v16.2d, v16.2d
sxtl2 v17.2d, v17.4s
scvtf v17.2d, v17.2d
ldr q19, [x17, 16] sxtl v16.2d, v17.2s
sxtl2 v17.2d, v17.4s
sxtl v18.2d, v19.2s sxtl v18.2d, v19.2s
scvtf v18.2d, v18.2d
sxtl2 v19.2d, v19.4s sxtl2 v19.2d, v19.4s
scvtf v16.2d, v16.2d
scvtf v17.2d, v17.2d
scvtf v18.2d, v18.2d
scvtf v19.2d, v19.2d scvtf v19.2d, v19.2d
# Load group E registers (spAddr1)
ldr q21, [x17, 32]
sxtl v20.2d, v21.2s sxtl v20.2d, v21.2s
scvtf v20.2d, v20.2d
sxtl2 v21.2d, v21.4s sxtl2 v21.2d, v21.4s
scvtf v21.2d, v21.2d
ldr q23, [x17, 48]
sxtl v22.2d, v23.2s sxtl v22.2d, v23.2s
scvtf v22.2d, v22.2d
sxtl2 v23.2d, v23.4s sxtl2 v23.2d, v23.4s
scvtf v20.2d, v20.2d
scvtf v21.2d, v21.2d
scvtf v22.2d, v22.2d
scvtf v23.2d, v23.2d scvtf v23.2d, v23.2d
bif v20.16b, v30.16b, v29.16b bif v20.16b, v30.16b, v29.16b
@@ -283,6 +282,7 @@ literal_v13: .fill 2,8,0
literal_v14: .fill 2,8,0 literal_v14: .fill 2,8,0
literal_v15: .fill 2,8,0 literal_v15: .fill 2,8,0
.balign 64
DECL(randomx_program_aarch64_vm_instructions_end): DECL(randomx_program_aarch64_vm_instructions_end):
# Calculate dataset pointer for dataset read # Calculate dataset pointer for dataset read
# Do it here to break false dependency from readReg2 and readReg3 (see next line) # Do it here to break false dependency from readReg2 and readReg3 (see next line)
@@ -345,15 +345,18 @@ DECL(randomx_program_aarch64_v2_FE_mix):
# f0 = aesenc(f0, e0), f1 = aesdec(f1, e0), f2 = aesenc(f2, e0), f3 = aesdec(f3, e0) # f0 = aesenc(f0, e0), f1 = aesdec(f1, e0), f2 = aesenc(f2, e0), f3 = aesdec(f3, e0)
aese v16.16b, v28.16b # Quote from ARM optimization guides for basically all ARM Cortex CPUs starting from Cortex-A53:
aesd v17.16b, v28.16b #
aese v18.16b, v28.16b # "pairs of dependent AESE/AESMC and AESD/AESIMC instructions are higher performance
aesd v19.16b, v28.16b # when they are adjacent in the program code and both instructions use the same
# destination register since they are fused"
#
# Same applies to all Apple silicon CPUs
aesmc v16.16b, v16.16b aese v16.16b, v28.16b; aesmc v16.16b, v16.16b
aesimc v17.16b, v17.16b aesd v17.16b, v28.16b; aesimc v17.16b, v17.16b
aesmc v18.16b, v18.16b aese v18.16b, v28.16b; aesmc v18.16b, v18.16b
aesimc v19.16b, v19.16b aesd v19.16b, v28.16b; aesimc v19.16b, v19.16b
eor v16.16b, v16.16b, v20.16b eor v16.16b, v16.16b, v20.16b
eor v17.16b, v17.16b, v20.16b eor v17.16b, v17.16b, v20.16b
@@ -362,15 +365,10 @@ DECL(randomx_program_aarch64_v2_FE_mix):
# f0 = aesenc(f0, e1), f1 = aesdec(f1, e1), f2 = aesenc(f2, e1), f3 = aesdec(f3, e1) # f0 = aesenc(f0, e1), f1 = aesdec(f1, e1), f2 = aesenc(f2, e1), f3 = aesdec(f3, e1)
aese v16.16b, v28.16b aese v16.16b, v28.16b; aesmc v16.16b, v16.16b
aesd v17.16b, v28.16b aesd v17.16b, v28.16b; aesimc v17.16b, v17.16b
aese v18.16b, v28.16b aese v18.16b, v28.16b; aesmc v18.16b, v18.16b
aesd v19.16b, v28.16b aesd v19.16b, v28.16b; aesimc v19.16b, v19.16b
aesmc v16.16b, v16.16b
aesimc v17.16b, v17.16b
aesmc v18.16b, v18.16b
aesimc v19.16b, v19.16b
eor v16.16b, v16.16b, v21.16b eor v16.16b, v16.16b, v21.16b
eor v17.16b, v17.16b, v21.16b eor v17.16b, v17.16b, v21.16b
@@ -379,15 +377,10 @@ DECL(randomx_program_aarch64_v2_FE_mix):
# f0 = aesenc(f0, e2), f1 = aesdec(f1, e2), f2 = aesenc(f2, e2), f3 = aesdec(f3, e2) # f0 = aesenc(f0, e2), f1 = aesdec(f1, e2), f2 = aesenc(f2, e2), f3 = aesdec(f3, e2)
aese v16.16b, v28.16b aese v16.16b, v28.16b; aesmc v16.16b, v16.16b
aesd v17.16b, v28.16b aesd v17.16b, v28.16b; aesimc v17.16b, v17.16b
aese v18.16b, v28.16b aese v18.16b, v28.16b; aesmc v18.16b, v18.16b
aesd v19.16b, v28.16b aesd v19.16b, v28.16b; aesimc v19.16b, v19.16b
aesmc v16.16b, v16.16b
aesimc v17.16b, v17.16b
aesmc v18.16b, v18.16b
aesimc v19.16b, v19.16b
eor v16.16b, v16.16b, v22.16b eor v16.16b, v16.16b, v22.16b
eor v17.16b, v17.16b, v22.16b eor v17.16b, v17.16b, v22.16b
@@ -396,15 +389,10 @@ DECL(randomx_program_aarch64_v2_FE_mix):
# f0 = aesenc(f0, e3), f1 = aesdec(f1, e3), f2 = aesenc(f2, e3), f3 = aesdec(f3, e3) # f0 = aesenc(f0, e3), f1 = aesdec(f1, e3), f2 = aesenc(f2, e3), f3 = aesdec(f3, e3)
aese v16.16b, v28.16b aese v16.16b, v28.16b; aesmc v16.16b, v16.16b
aesd v17.16b, v28.16b aesd v17.16b, v28.16b; aesimc v17.16b, v17.16b
aese v18.16b, v28.16b aese v18.16b, v28.16b; aesmc v18.16b, v18.16b
aesd v19.16b, v28.16b aesd v19.16b, v28.16b; aesimc v19.16b, v19.16b
aesmc v16.16b, v16.16b
aesimc v17.16b, v17.16b
aesmc v18.16b, v18.16b
aesimc v19.16b, v19.16b
eor v16.16b, v16.16b, v23.16b eor v16.16b, v16.16b, v23.16b
eor v17.16b, v17.16b, v23.16b eor v17.16b, v17.16b, v23.16b