mirror of
https://github.com/xmrig/xmrig.git
synced 2026-06-18 10:22:39 -04:00
ARM64 RandomX JIT:
- optimized F/E register loading
- aligned asm code
- optimized emitAddImmediate for small negative values
- v2: optimized CFROUND
- v2: optimized AES in the main loop
This commit is contained in:
@@ -252,5 +252,5 @@ if (WIN32)
|
||||
endif()
|
||||
|
||||
if (CMAKE_CXX_COMPILER_ID MATCHES Clang AND CMAKE_BUILD_TYPE STREQUAL Release AND NOT CMAKE_GENERATOR STREQUAL Xcode)
|
||||
add_custom_command(TARGET ${PROJECT_NAME} POST_BUILD COMMAND ${CMAKE_STRIP} "$<TARGET_FILE:${CMAKE_PROJECT_NAME}>")
|
||||
add_custom_command(TARGET ${CMAKE_PROJECT_NAME} POST_BUILD COMMAND ${CMAKE_STRIP} "$<TARGET_FILE:${CMAKE_PROJECT_NAME}>")
|
||||
endif()
|
||||
|
||||
@@ -64,6 +64,8 @@ constexpr uint32_t MOVN = 0x92800000;
|
||||
constexpr uint32_t MOVK = 0xF2800000;
|
||||
constexpr uint32_t ADD_IMM_LO = 0x91000000;
|
||||
constexpr uint32_t ADD_IMM_HI = 0x91400000;
|
||||
constexpr uint32_t SUB_IMM_LO = 0xD1000000;
|
||||
constexpr uint32_t SUB_IMM_HI = 0xD1400000;
|
||||
constexpr uint32_t LDR_LITERAL = 0x58000000;
|
||||
constexpr uint32_t ROR = 0x9AC02C00;
|
||||
constexpr uint32_t ROR_IMM = 0x93C00000;
|
||||
@@ -534,23 +536,40 @@ void JitCompilerA64::emitAddImmediate(uint32_t dst, uint32_t src, uint32_t imm,
|
||||
{
|
||||
uint32_t k = codePos;
|
||||
|
||||
if (imm < (1 << 24))
|
||||
{
|
||||
const uint32_t imm_lo = imm & ((1 << 12) - 1);
|
||||
const uint32_t imm_hi = imm >> 12;
|
||||
if (imm == 0) {
|
||||
if (dst != src) {
|
||||
emit32(ARMV8A::MOV_REG | dst | (src << 16), code, k);
|
||||
}
|
||||
|
||||
if (imm_lo && imm_hi)
|
||||
{
|
||||
emit32(ARMV8A::ADD_IMM_LO | dst | (src << 5) | (imm_lo << 10), code, k);
|
||||
emit32(ARMV8A::ADD_IMM_HI | dst | (dst << 5) | (imm_hi << 10), code, k);
|
||||
codePos = k;
|
||||
return;
|
||||
}
|
||||
else if (imm_lo)
|
||||
{
|
||||
emit32(ARMV8A::ADD_IMM_LO | dst | (src << 5) | (imm_lo << 10), code, k);
|
||||
|
||||
const int32_t simm = static_cast<int32_t>(imm);
|
||||
|
||||
uint32_t mag, opLo, opHi;
|
||||
|
||||
if (simm > 0) {
|
||||
mag = imm;
|
||||
opLo = ARMV8A::ADD_IMM_LO;
|
||||
opHi = ARMV8A::ADD_IMM_HI;
|
||||
} else {
|
||||
mag = static_cast<uint32_t>(-static_cast<int64_t>(simm));
|
||||
opLo = ARMV8A::SUB_IMM_LO;
|
||||
opHi = ARMV8A::SUB_IMM_HI;
|
||||
}
|
||||
else
|
||||
{
|
||||
emit32(ARMV8A::ADD_IMM_HI | dst | (src << 5) | (imm_hi << 10), code, k);
|
||||
|
||||
if (mag < (1u << 24)) {
|
||||
const uint32_t lo = mag & ((1u << 12) - 1);
|
||||
const uint32_t hi = mag >> 12;
|
||||
|
||||
if (lo && hi) {
|
||||
emit32(opLo | dst | (src << 5) | (lo << 10), code, k);
|
||||
emit32(opHi | dst | (dst << 5) | (hi << 10), code, k);
|
||||
} else if (lo) {
|
||||
emit32(opLo | dst | (src << 5) | (lo << 10), code, k);
|
||||
} else {
|
||||
emit32(opHi | dst | (src << 5) | (hi << 10), code, k);
|
||||
}
|
||||
}
|
||||
else
|
||||
@@ -744,7 +763,7 @@ void JitCompilerA64::h_IMUL_M(Instruction& instr, uint32_t& codePos)
|
||||
constexpr uint32_t tmp_reg = 20;
|
||||
emitMemLoad<tmp_reg>(dst, src, instr, code, k);
|
||||
|
||||
// sub dst, dst, tmp_reg
|
||||
// mul dst, dst, tmp_reg
|
||||
emit32(ARMV8A::MUL | dst | (dst << 5) | (tmp_reg << 16), code, k);
|
||||
|
||||
reg_changed_offset[instr.dst] = k;
|
||||
@@ -1109,17 +1128,19 @@ void JitCompilerA64::h_CFROUND(Instruction& instr, uint32_t& codePos)
|
||||
constexpr uint32_t tmp_reg = 20;
|
||||
constexpr uint32_t fpcr_tmp_reg = 8;
|
||||
|
||||
// ror tmp_reg, src, imm
|
||||
emit32(ARMV8A::ROR_IMM | tmp_reg | (src << 5) | ((instr.getImm32() & 63) << 10) | (src << 16), code, k);
|
||||
|
||||
if (RandomX_CurrentConfig.Tweak_V2_CFROUND) {
|
||||
// tst tmp_reg, 60
|
||||
emit32(0xF27E0E9F, code, k);
|
||||
const uint32_t immr = (62 - instr.getImm32()) & 63;
|
||||
|
||||
// tst src, ROR(60, -(instr.getImm32() & 63))
|
||||
emit32(0xF2400C1F | (immr << 16) | (src << 5), code, k);
|
||||
|
||||
// bne next
|
||||
emit32(0x54000081, code, k);
|
||||
emit32(0x540000A1, code, k);
|
||||
}
|
||||
|
||||
// ror tmp_reg, src, imm
|
||||
emit32(ARMV8A::ROR_IMM | tmp_reg | (src << 5) | ((instr.getImm32() & 63) << 10) | (src << 16), code, k);
|
||||
|
||||
// bfi fpcr_tmp_reg, tmp_reg, 40, 2
|
||||
emit32(0xB3580400 | fpcr_tmp_reg | (tmp_reg << 5), code, k);
|
||||
|
||||
|
||||
@@ -187,6 +187,7 @@ DECL(randomx_program_aarch64):
|
||||
ldp x27, x28, [x30, -32] // literal_x27
|
||||
ldp x29, x30, [x30, -16] // literal_x29
|
||||
|
||||
.balign 64
|
||||
DECL(randomx_program_aarch64_main_loop):
|
||||
# spAddr0 = spMix1 & ScratchpadL3Mask64;
|
||||
# spAddr1 = (spMix1 >> 32) & ScratchpadL3Mask64;
|
||||
@@ -215,30 +216,28 @@ DECL(randomx_program_aarch64_main_loop):
|
||||
eor x14, x14, x20
|
||||
eor x15, x15, x19
|
||||
|
||||
# Load group F registers (spAddr1)
|
||||
ldr q17, [x17]
|
||||
sxtl v16.2d, v17.2s
|
||||
scvtf v16.2d, v16.2d
|
||||
sxtl2 v17.2d, v17.4s
|
||||
scvtf v17.2d, v17.2d
|
||||
# Load group F/E registers (spAddr1)
|
||||
ldp q17, q19, [x17]
|
||||
ldp q21, q23, [x17, 32]
|
||||
|
||||
ldr q19, [x17, 16]
|
||||
sxtl v16.2d, v17.2s
|
||||
sxtl2 v17.2d, v17.4s
|
||||
sxtl v18.2d, v19.2s
|
||||
scvtf v18.2d, v18.2d
|
||||
sxtl2 v19.2d, v19.4s
|
||||
|
||||
scvtf v16.2d, v16.2d
|
||||
scvtf v17.2d, v17.2d
|
||||
scvtf v18.2d, v18.2d
|
||||
scvtf v19.2d, v19.2d
|
||||
|
||||
# Load group E registers (spAddr1)
|
||||
ldr q21, [x17, 32]
|
||||
sxtl v20.2d, v21.2s
|
||||
scvtf v20.2d, v20.2d
|
||||
sxtl2 v21.2d, v21.4s
|
||||
scvtf v21.2d, v21.2d
|
||||
|
||||
ldr q23, [x17, 48]
|
||||
sxtl v22.2d, v23.2s
|
||||
scvtf v22.2d, v22.2d
|
||||
sxtl2 v23.2d, v23.4s
|
||||
|
||||
scvtf v20.2d, v20.2d
|
||||
scvtf v21.2d, v21.2d
|
||||
scvtf v22.2d, v22.2d
|
||||
scvtf v23.2d, v23.2d
|
||||
|
||||
bif v20.16b, v30.16b, v29.16b
|
||||
@@ -283,6 +282,7 @@ literal_v13: .fill 2,8,0
|
||||
literal_v14: .fill 2,8,0
|
||||
literal_v15: .fill 2,8,0
|
||||
|
||||
.balign 64
|
||||
DECL(randomx_program_aarch64_vm_instructions_end):
|
||||
# Calculate dataset pointer for dataset read
|
||||
# Do it here to break false dependency from readReg2 and readReg3 (see next line)
|
||||
@@ -345,15 +345,18 @@ DECL(randomx_program_aarch64_v2_FE_mix):
|
||||
|
||||
# f0 = aesenc(f0, e0), f1 = aesdec(f1, e0), f2 = aesenc(f2, e0), f3 = aesdec(f3, e0)
|
||||
|
||||
aese v16.16b, v28.16b
|
||||
aesd v17.16b, v28.16b
|
||||
aese v18.16b, v28.16b
|
||||
aesd v19.16b, v28.16b
|
||||
# Quote from ARM optimization guides for basically all ARM Cortex CPUs starting from Cortex-A53:
|
||||
#
|
||||
# "pairs of dependent AESE/AESMC and AESD/AESIMC instructions are higher performance
|
||||
# when they are adjacent in the program code and both instructions use the same
|
||||
# destination register since they are fused"
|
||||
#
|
||||
# Same applies to all Apple silicon CPUs
|
||||
|
||||
aesmc v16.16b, v16.16b
|
||||
aesimc v17.16b, v17.16b
|
||||
aesmc v18.16b, v18.16b
|
||||
aesimc v19.16b, v19.16b
|
||||
aese v16.16b, v28.16b; aesmc v16.16b, v16.16b
|
||||
aesd v17.16b, v28.16b; aesimc v17.16b, v17.16b
|
||||
aese v18.16b, v28.16b; aesmc v18.16b, v18.16b
|
||||
aesd v19.16b, v28.16b; aesimc v19.16b, v19.16b
|
||||
|
||||
eor v16.16b, v16.16b, v20.16b
|
||||
eor v17.16b, v17.16b, v20.16b
|
||||
@@ -362,15 +365,10 @@ DECL(randomx_program_aarch64_v2_FE_mix):
|
||||
|
||||
# f0 = aesenc(f0, e1), f1 = aesdec(f1, e1), f2 = aesenc(f2, e1), f3 = aesdec(f3, e1)
|
||||
|
||||
aese v16.16b, v28.16b
|
||||
aesd v17.16b, v28.16b
|
||||
aese v18.16b, v28.16b
|
||||
aesd v19.16b, v28.16b
|
||||
|
||||
aesmc v16.16b, v16.16b
|
||||
aesimc v17.16b, v17.16b
|
||||
aesmc v18.16b, v18.16b
|
||||
aesimc v19.16b, v19.16b
|
||||
aese v16.16b, v28.16b; aesmc v16.16b, v16.16b
|
||||
aesd v17.16b, v28.16b; aesimc v17.16b, v17.16b
|
||||
aese v18.16b, v28.16b; aesmc v18.16b, v18.16b
|
||||
aesd v19.16b, v28.16b; aesimc v19.16b, v19.16b
|
||||
|
||||
eor v16.16b, v16.16b, v21.16b
|
||||
eor v17.16b, v17.16b, v21.16b
|
||||
@@ -379,15 +377,10 @@ DECL(randomx_program_aarch64_v2_FE_mix):
|
||||
|
||||
# f0 = aesenc(f0, e2), f1 = aesdec(f1, e2), f2 = aesenc(f2, e2), f3 = aesdec(f3, e2)
|
||||
|
||||
aese v16.16b, v28.16b
|
||||
aesd v17.16b, v28.16b
|
||||
aese v18.16b, v28.16b
|
||||
aesd v19.16b, v28.16b
|
||||
|
||||
aesmc v16.16b, v16.16b
|
||||
aesimc v17.16b, v17.16b
|
||||
aesmc v18.16b, v18.16b
|
||||
aesimc v19.16b, v19.16b
|
||||
aese v16.16b, v28.16b; aesmc v16.16b, v16.16b
|
||||
aesd v17.16b, v28.16b; aesimc v17.16b, v17.16b
|
||||
aese v18.16b, v28.16b; aesmc v18.16b, v18.16b
|
||||
aesd v19.16b, v28.16b; aesimc v19.16b, v19.16b
|
||||
|
||||
eor v16.16b, v16.16b, v22.16b
|
||||
eor v17.16b, v17.16b, v22.16b
|
||||
@@ -396,15 +389,10 @@ DECL(randomx_program_aarch64_v2_FE_mix):
|
||||
|
||||
# f0 = aesenc(f0, e3), f1 = aesdec(f1, e3), f2 = aesenc(f2, e3), f3 = aesdec(f3, e3)
|
||||
|
||||
aese v16.16b, v28.16b
|
||||
aesd v17.16b, v28.16b
|
||||
aese v18.16b, v28.16b
|
||||
aesd v19.16b, v28.16b
|
||||
|
||||
aesmc v16.16b, v16.16b
|
||||
aesimc v17.16b, v17.16b
|
||||
aesmc v18.16b, v18.16b
|
||||
aesimc v19.16b, v19.16b
|
||||
aese v16.16b, v28.16b; aesmc v16.16b, v16.16b
|
||||
aesd v17.16b, v28.16b; aesimc v17.16b, v17.16b
|
||||
aese v18.16b, v28.16b; aesmc v18.16b, v18.16b
|
||||
aesd v19.16b, v28.16b; aesimc v19.16b, v19.16b
|
||||
|
||||
eor v16.16b, v16.16b, v23.16b
|
||||
eor v17.16b, v17.16b, v23.16b
|
||||
|
||||
Reference in New Issue
Block a user