diff --git a/CMakeLists.txt b/CMakeLists.txt index 4b36a8dfb..231b18b02 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -252,5 +252,5 @@ if (WIN32) endif() if (CMAKE_CXX_COMPILER_ID MATCHES Clang AND CMAKE_BUILD_TYPE STREQUAL Release AND NOT CMAKE_GENERATOR STREQUAL Xcode) - add_custom_command(TARGET ${PROJECT_NAME} POST_BUILD COMMAND ${CMAKE_STRIP} "$<TARGET_FILE:${CMAKE_PROJECT_NAME}>") + add_custom_command(TARGET ${CMAKE_PROJECT_NAME} POST_BUILD COMMAND ${CMAKE_STRIP} "$<TARGET_FILE:${CMAKE_PROJECT_NAME}>") endif() diff --git a/src/crypto/randomx/jit_compiler_a64.cpp b/src/crypto/randomx/jit_compiler_a64.cpp index fa24380ce..0d624dfa9 100644 --- a/src/crypto/randomx/jit_compiler_a64.cpp +++ b/src/crypto/randomx/jit_compiler_a64.cpp @@ -64,6 +64,8 @@ constexpr uint32_t MOVN = 0x92800000; constexpr uint32_t MOVK = 0xF2800000; constexpr uint32_t ADD_IMM_LO = 0x91000000; constexpr uint32_t ADD_IMM_HI = 0x91400000; +constexpr uint32_t SUB_IMM_LO = 0xD1000000; +constexpr uint32_t SUB_IMM_HI = 0xD1400000; constexpr uint32_t LDR_LITERAL = 0x58000000; constexpr uint32_t ROR = 0x9AC02C00; constexpr uint32_t ROR_IMM = 0x93C00000; @@ -534,23 +536,40 @@ void JitCompilerA64::emitAddImmediate(uint32_t dst, uint32_t src, uint32_t imm, { uint32_t k = codePos; - if (imm < (1 << 24)) - { - const uint32_t imm_lo = imm & ((1 << 12) - 1); - const uint32_t imm_hi = imm >> 12; + if (imm == 0) { + if (dst != src) { + emit32(ARMV8A::MOV_REG | dst | (src << 16), code, k); + } - if (imm_lo && imm_hi) - { - emit32(ARMV8A::ADD_IMM_LO | dst | (src << 5) | (imm_lo << 10), code, k); - emit32(ARMV8A::ADD_IMM_HI | dst | (dst << 5) | (imm_hi << 10), code, k); - } - else if (imm_lo) - { - emit32(ARMV8A::ADD_IMM_LO | dst | (src << 5) | (imm_lo << 10), code, k); - } - else - { - emit32(ARMV8A::ADD_IMM_HI | dst | (src << 5) | (imm_hi << 10), code, k); + codePos = k; + return; + } + + const int32_t simm = static_cast<int32_t>(imm); + + uint32_t mag, opLo, opHi; + + if (simm > 0) { + mag = imm; + opLo = ARMV8A::ADD_IMM_LO; + opHi = ARMV8A::ADD_IMM_HI; + } else { + mag = 
static_cast<uint32_t>(-static_cast<int64_t>(simm)); + opLo = ARMV8A::SUB_IMM_LO; + opHi = ARMV8A::SUB_IMM_HI; + } + + if (mag < (1u << 24)) { + const uint32_t lo = mag & ((1u << 12) - 1); + const uint32_t hi = mag >> 12; + + if (lo && hi) { + emit32(opLo | dst | (src << 5) | (lo << 10), code, k); + emit32(opHi | dst | (dst << 5) | (hi << 10), code, k); + } else if (lo) { + emit32(opLo | dst | (src << 5) | (lo << 10), code, k); + } else { + emit32(opHi | dst | (src << 5) | (hi << 10), code, k); } } else @@ -744,7 +763,7 @@ void JitCompilerA64::h_IMUL_M(Instruction& instr, uint32_t& codePos) constexpr uint32_t tmp_reg = 20; emitMemLoad(dst, src, instr, code, k); - // sub dst, dst, tmp_reg + // mul dst, dst, tmp_reg emit32(ARMV8A::MUL | dst | (dst << 5) | (tmp_reg << 16), code, k); reg_changed_offset[instr.dst] = k; @@ -1109,17 +1128,19 @@ void JitCompilerA64::h_CFROUND(Instruction& instr, uint32_t& codePos) constexpr uint32_t tmp_reg = 20; constexpr uint32_t fpcr_tmp_reg = 8; - // ror tmp_reg, src, imm - emit32(ARMV8A::ROR_IMM | tmp_reg | (src << 5) | ((instr.getImm32() & 63) << 10) | (src << 16), code, k); - if (RandomX_CurrentConfig.Tweak_V2_CFROUND) { - // tst tmp_reg, 60 - emit32(0xF27E0E9F, code, k); + const uint32_t immr = (62 - instr.getImm32()) & 63; + + // tst src, ROR(60, -(instr.getImm32() & 63)) + emit32(0xF2400C1F | (immr << 16) | (src << 5), code, k); // bne next - emit32(0x54000081, code, k); + emit32(0x540000A1, code, k); } + // ror tmp_reg, src, imm + emit32(ARMV8A::ROR_IMM | tmp_reg | (src << 5) | ((instr.getImm32() & 63) << 10) | (src << 16), code, k); + // bfi fpcr_tmp_reg, tmp_reg, 40, 2 emit32(0xB3580400 | fpcr_tmp_reg | (tmp_reg << 5), code, k); diff --git a/src/crypto/randomx/jit_compiler_a64_static.S b/src/crypto/randomx/jit_compiler_a64_static.S index 1bc55ae38..2a7859cb4 100644 --- a/src/crypto/randomx/jit_compiler_a64_static.S +++ b/src/crypto/randomx/jit_compiler_a64_static.S @@ -187,6 +187,7 @@ DECL(randomx_program_aarch64): ldp x27, x28, [x30, -32] // 
literal_x27 ldp x29, x30, [x30, -16] // literal_x29 + .balign 64 DECL(randomx_program_aarch64_main_loop): # spAddr0 = spMix1 & ScratchpadL3Mask64; # spAddr1 = (spMix1 >> 32) & ScratchpadL3Mask64; @@ -215,30 +216,28 @@ DECL(randomx_program_aarch64_main_loop): eor x14, x14, x20 eor x15, x15, x19 - # Load group F registers (spAddr1) - ldr q17, [x17] - sxtl v16.2d, v17.2s - scvtf v16.2d, v16.2d - sxtl2 v17.2d, v17.4s - scvtf v17.2d, v17.2d + # Load group F/E registers (spAddr1) + ldp q17, q19, [x17] + ldp q21, q23, [x17, 32] - ldr q19, [x17, 16] + sxtl v16.2d, v17.2s + sxtl2 v17.2d, v17.4s sxtl v18.2d, v19.2s - scvtf v18.2d, v18.2d sxtl2 v19.2d, v19.4s + + scvtf v16.2d, v16.2d + scvtf v17.2d, v17.2d + scvtf v18.2d, v18.2d scvtf v19.2d, v19.2d - # Load group E registers (spAddr1) - ldr q21, [x17, 32] sxtl v20.2d, v21.2s - scvtf v20.2d, v20.2d sxtl2 v21.2d, v21.4s - scvtf v21.2d, v21.2d - - ldr q23, [x17, 48] sxtl v22.2d, v23.2s - scvtf v22.2d, v22.2d sxtl2 v23.2d, v23.4s + + scvtf v20.2d, v20.2d + scvtf v21.2d, v21.2d + scvtf v22.2d, v22.2d scvtf v23.2d, v23.2d bif v20.16b, v30.16b, v29.16b @@ -283,6 +282,7 @@ literal_v13: .fill 2,8,0 literal_v14: .fill 2,8,0 literal_v15: .fill 2,8,0 + .balign 64 DECL(randomx_program_aarch64_vm_instructions_end): # Calculate dataset pointer for dataset read # Do it here to break false dependency from readReg2 and readReg3 (see next line) @@ -345,15 +345,18 @@ DECL(randomx_program_aarch64_v2_FE_mix): # f0 = aesenc(f0, e0), f1 = aesdec(f1, e0), f2 = aesenc(f2, e0), f3 = aesdec(f3, e0) - aese v16.16b, v28.16b - aesd v17.16b, v28.16b - aese v18.16b, v28.16b - aesd v19.16b, v28.16b + # Quote from ARM optimization guides for basically all ARM Cortex CPUs starting from Cortex-A53: + # + # "pairs of dependent AESE/AESMC and AESD/AESIMC instructions are higher performance + # when they are adjacent in the program code and both instructions use the same + # destination register since they are fused" + # + # Same applies to all Apple silicon CPUs 
- aesmc v16.16b, v16.16b - aesimc v17.16b, v17.16b - aesmc v18.16b, v18.16b - aesimc v19.16b, v19.16b + aese v16.16b, v28.16b; aesmc v16.16b, v16.16b + aesd v17.16b, v28.16b; aesimc v17.16b, v17.16b + aese v18.16b, v28.16b; aesmc v18.16b, v18.16b + aesd v19.16b, v28.16b; aesimc v19.16b, v19.16b eor v16.16b, v16.16b, v20.16b eor v17.16b, v17.16b, v20.16b @@ -362,15 +365,10 @@ DECL(randomx_program_aarch64_v2_FE_mix): # f0 = aesenc(f0, e1), f1 = aesdec(f1, e1), f2 = aesenc(f2, e1), f3 = aesdec(f3, e1) - aese v16.16b, v28.16b - aesd v17.16b, v28.16b - aese v18.16b, v28.16b - aesd v19.16b, v28.16b - - aesmc v16.16b, v16.16b - aesimc v17.16b, v17.16b - aesmc v18.16b, v18.16b - aesimc v19.16b, v19.16b + aese v16.16b, v28.16b; aesmc v16.16b, v16.16b + aesd v17.16b, v28.16b; aesimc v17.16b, v17.16b + aese v18.16b, v28.16b; aesmc v18.16b, v18.16b + aesd v19.16b, v28.16b; aesimc v19.16b, v19.16b eor v16.16b, v16.16b, v21.16b eor v17.16b, v17.16b, v21.16b @@ -379,15 +377,10 @@ DECL(randomx_program_aarch64_v2_FE_mix): # f0 = aesenc(f0, e2), f1 = aesdec(f1, e2), f2 = aesenc(f2, e2), f3 = aesdec(f3, e2) - aese v16.16b, v28.16b - aesd v17.16b, v28.16b - aese v18.16b, v28.16b - aesd v19.16b, v28.16b - - aesmc v16.16b, v16.16b - aesimc v17.16b, v17.16b - aesmc v18.16b, v18.16b - aesimc v19.16b, v19.16b + aese v16.16b, v28.16b; aesmc v16.16b, v16.16b + aesd v17.16b, v28.16b; aesimc v17.16b, v17.16b + aese v18.16b, v28.16b; aesmc v18.16b, v18.16b + aesd v19.16b, v28.16b; aesimc v19.16b, v19.16b eor v16.16b, v16.16b, v22.16b eor v17.16b, v17.16b, v22.16b @@ -396,15 +389,10 @@ DECL(randomx_program_aarch64_v2_FE_mix): # f0 = aesenc(f0, e3), f1 = aesdec(f1, e3), f2 = aesenc(f2, e3), f3 = aesdec(f3, e3) - aese v16.16b, v28.16b - aesd v17.16b, v28.16b - aese v18.16b, v28.16b - aesd v19.16b, v28.16b - - aesmc v16.16b, v16.16b - aesimc v17.16b, v17.16b - aesmc v18.16b, v18.16b - aesimc v19.16b, v19.16b + aese v16.16b, v28.16b; aesmc v16.16b, v16.16b + aesd v17.16b, v28.16b; aesimc v17.16b, 
v17.16b + aese v18.16b, v28.16b; aesmc v18.16b, v18.16b + aesd v19.16b, v28.16b; aesimc v19.16b, v19.16b eor v16.16b, v16.16b, v23.16b eor v17.16b, v17.16b, v23.16b