1
0
mirror of https://github.com/xmrig/xmrig.git synced 2026-06-20 03:24:37 -04:00

Compare commits

...

10 Commits

Author SHA1 Message Date
xmrig
27f116e2da Merge pull request #3815 from SChernykh/dev
ARM64 RandomX JIT optimizations
2026-05-15 22:42:02 +07:00
SChernykh
f8dd210531 ARM64 RandomX JIT:
- optimized F/E register loading
- aligned asm code
- optimized emitAddImmediate for small negative values
- v2: optimized CFROUND
- v2: optimized AES in the main loop
2026-05-15 17:30:05 +02:00
xmrig
ab8f005977 Merge pull request #3812 from SChernykh/dev
RandomX: 2.5% faster dataset init on RISC-V
2026-05-07 23:08:49 +07:00
SChernykh
f91b79681d RandomX: 2.5% faster dataset init on RISC-V
And a couple small improvements in the main loop.
2026-05-07 17:57:16 +02:00
xmrig
a7baa9cb63 Merge pull request #3807 from SChernykh/dev
Update FCMP++ block template layout
2026-04-30 17:31:14 +07:00
SChernykh
c59c03e137 Update FCMP++ block template layout 2026-04-29 15:29:14 +02:00
xmrig
80eff55ed6 Merge pull request #3805 from SChernykh/dev
ARM64 JIT: Optimize Group E register conversion
2026-04-25 17:38:04 +07:00
SChernykh
5347458fc7 ARM64 JIT: Optimize Group E register conversion
Based on https://github.com/tevador/RandomX/pull/324
2026-04-25 11:37:47 +02:00
XMRig
6bf43053f7 v6.26.1-dev 2026-03-28 20:43:46 +07:00
XMRig
69b7e60d35 Merge branch 'master' into dev 2026-03-28 20:42:02 +07:00
7 changed files with 123 additions and 157 deletions

View File

@@ -252,5 +252,5 @@ if (WIN32)
endif()
if (CMAKE_CXX_COMPILER_ID MATCHES Clang AND CMAKE_BUILD_TYPE STREQUAL Release AND NOT CMAKE_GENERATOR STREQUAL Xcode)
add_custom_command(TARGET ${PROJECT_NAME} POST_BUILD COMMAND ${CMAKE_STRIP} "$<TARGET_FILE:${CMAKE_PROJECT_NAME}>")
add_custom_command(TARGET ${CMAKE_PROJECT_NAME} POST_BUILD COMMAND ${CMAKE_STRIP} "$<TARGET_FILE:${CMAKE_PROJECT_NAME}>")
endif()

View File

@@ -406,9 +406,9 @@ bool xmrig::BlockTemplate::parse(bool hashes)
if (hashes) {
// FCMP++ layout:
//
// index 0 fcmp_pp_n_tree_layers + 31 zero bytes
// index 1 fcmp_pp_tree_root
// index 2 coinbase transaction hash
// index 0 coinbase transaction hash
// index 1 fcmp_pp_n_tree_layers + 31 zero bytes
// index 2 fcmp_pp_tree_root
// index 3+ other transaction hashes
//
// pre-FCMP++ layout:
@@ -416,30 +416,28 @@ bool xmrig::BlockTemplate::parse(bool hashes)
// index 0 coinbase transaction hash
// index 1+ other transaction hashes
//
const uint32_t coinbase_tx_index = is_fcmp_pp ? 2 : 0;
// Update: FCMP moved coinbase tx to index 0 to stay consistent with pre-fork layout
m_hashes.clear();
m_hashes.resize((coinbase_tx_index + m_numHashes + 1) * kHashSize);
m_hashes.resize((m_numHashes + (is_fcmp_pp ? 3 : 1)) * kHashSize);
uint8_t* data = m_hashes.data() + coinbase_tx_index * kHashSize;
calculateMinerTxHash(blob(MINER_TX_PREFIX_OFFSET), blob(MINER_TX_PREFIX_END_OFFSET), data);
calculateMinerTxHash(blob(MINER_TX_PREFIX_OFFSET), blob(MINER_TX_PREFIX_END_OFFSET), m_hashes.data());
for (uint64_t i = 1; i <= m_numHashes; ++i) {
Span h;
ar(h, kHashSize);
memcpy(data + i * kHashSize, h.data(), kHashSize);
memcpy(m_hashes.data() + (i + (is_fcmp_pp ? 2 : 0)) * kHashSize, h.data(), kHashSize);
}
if (is_fcmp_pp) {
ar(m_FCMPTreeLayers);
ar(m_FCMPTreeRoot);
m_hashes[0] = m_FCMPTreeLayers;
memcpy(m_hashes.data() + kHashSize, m_FCMPTreeRoot, kHashSize);
m_hashes[kHashSize] = m_FCMPTreeLayers;
memcpy(m_hashes.data() + kHashSize * 2, m_FCMPTreeRoot, kHashSize);
}
calculateMerkleTreeHash(coinbase_tx_index);
calculateMerkleTreeHash(0);
}
return true;

View File

@@ -64,6 +64,8 @@ constexpr uint32_t MOVN = 0x92800000;
constexpr uint32_t MOVK = 0xF2800000;
constexpr uint32_t ADD_IMM_LO = 0x91000000;
constexpr uint32_t ADD_IMM_HI = 0x91400000;
constexpr uint32_t SUB_IMM_LO = 0xD1000000;
constexpr uint32_t SUB_IMM_HI = 0xD1400000;
constexpr uint32_t LDR_LITERAL = 0x58000000;
constexpr uint32_t ROR = 0x9AC02C00;
constexpr uint32_t ROR_IMM = 0x93C00000;
@@ -534,23 +536,40 @@ void JitCompilerA64::emitAddImmediate(uint32_t dst, uint32_t src, uint32_t imm,
{
uint32_t k = codePos;
if (imm < (1 << 24))
{
const uint32_t imm_lo = imm & ((1 << 12) - 1);
const uint32_t imm_hi = imm >> 12;
if (imm == 0) {
if (dst != src) {
emit32(ARMV8A::MOV_REG | dst | (src << 16), code, k);
}
if (imm_lo && imm_hi)
{
emit32(ARMV8A::ADD_IMM_LO | dst | (src << 5) | (imm_lo << 10), code, k);
emit32(ARMV8A::ADD_IMM_HI | dst | (dst << 5) | (imm_hi << 10), code, k);
}
else if (imm_lo)
{
emit32(ARMV8A::ADD_IMM_LO | dst | (src << 5) | (imm_lo << 10), code, k);
}
else
{
emit32(ARMV8A::ADD_IMM_HI | dst | (src << 5) | (imm_hi << 10), code, k);
codePos = k;
return;
}
const int32_t simm = static_cast<int32_t>(imm);
uint32_t mag, opLo, opHi;
if (simm > 0) {
mag = imm;
opLo = ARMV8A::ADD_IMM_LO;
opHi = ARMV8A::ADD_IMM_HI;
} else {
mag = static_cast<uint32_t>(-static_cast<int64_t>(simm));
opLo = ARMV8A::SUB_IMM_LO;
opHi = ARMV8A::SUB_IMM_HI;
}
if (mag < (1u << 24)) {
const uint32_t lo = mag & ((1u << 12) - 1);
const uint32_t hi = mag >> 12;
if (lo && hi) {
emit32(opLo | dst | (src << 5) | (lo << 10), code, k);
emit32(opHi | dst | (dst << 5) | (hi << 10), code, k);
} else if (lo) {
emit32(opLo | dst | (src << 5) | (lo << 10), code, k);
} else {
emit32(opHi | dst | (src << 5) | (hi << 10), code, k);
}
}
else
@@ -744,7 +763,7 @@ void JitCompilerA64::h_IMUL_M(Instruction& instr, uint32_t& codePos)
constexpr uint32_t tmp_reg = 20;
emitMemLoad<tmp_reg>(dst, src, instr, code, k);
// sub dst, dst, tmp_reg
// mul dst, dst, tmp_reg
emit32(ARMV8A::MUL | dst | (dst << 5) | (tmp_reg << 16), code, k);
reg_changed_offset[instr.dst] = k;
@@ -1059,11 +1078,8 @@ void JitCompilerA64::h_FDIV_M(Instruction& instr, uint32_t& codePos)
constexpr uint32_t tmp_reg_fp = 28;
emitMemLoadFP<tmp_reg_fp>(src, instr, code, k);
// and tmp_reg_fp, tmp_reg_fp, and_mask_reg
emit32(0x4E201C00 | tmp_reg_fp | (tmp_reg_fp << 5) | (29 << 16), code, k);
// orr tmp_reg_fp, tmp_reg_fp, or_mask_reg
emit32(0x4EA01C00 | tmp_reg_fp | (tmp_reg_fp << 5) | (30 << 16), code, k);
// bif tmp_reg_fp, or_mask_reg, and_mask_reg
emit32(0x6EE01C00 | tmp_reg_fp | (30 << 5) | (29 << 16), code, k);
emit32(ARMV8A::FDIV | dst | (dst << 5) | (tmp_reg_fp << 16), code, k);
@@ -1112,17 +1128,19 @@ void JitCompilerA64::h_CFROUND(Instruction& instr, uint32_t& codePos)
constexpr uint32_t tmp_reg = 20;
constexpr uint32_t fpcr_tmp_reg = 8;
// ror tmp_reg, src, imm
emit32(ARMV8A::ROR_IMM | tmp_reg | (src << 5) | ((instr.getImm32() & 63) << 10) | (src << 16), code, k);
if (RandomX_CurrentConfig.Tweak_V2_CFROUND) {
// tst tmp_reg, 60
emit32(0xF27E0E9F, code, k);
const uint32_t immr = (62 - instr.getImm32()) & 63;
// tst src, ROR(60, -(instr.getImm32() & 63))
emit32(0xF2400C1F | (immr << 16) | (src << 5), code, k);
// bne next
emit32(0x54000081, code, k);
emit32(0x540000A1, code, k);
}
// ror tmp_reg, src, imm
emit32(ARMV8A::ROR_IMM | tmp_reg | (src << 5) | ((instr.getImm32() & 63) << 10) | (src << 16), code, k);
// bfi fpcr_tmp_reg, tmp_reg, 40, 2
emit32(0xB3580400 | fpcr_tmp_reg | (tmp_reg << 5), code, k);

View File

@@ -109,7 +109,7 @@
# v26 -> "a2"
# v27 -> "a3"
# v28 -> temporary
# v29 -> E 'and' mask = 0x00ffffffffffffff'00ffffffffffffff
# v29 -> E 'and' mask = 0x00ffffffffc00000'00ffffffffc00000
# v30 -> E 'or' mask = 0x3*00000000******'3*00000000******
# v31 -> scale mask = 0x80f0000000000000'80f0000000000000
@@ -151,7 +151,9 @@ DECL(randomx_program_aarch64):
ldp q26, q27, [x0, 224]
# Load E 'and' mask
movi v29.2d, #0x00FFFFFFFFFFFFFF
mov x16, 0x00FFFFFFFFC00000
ins v29.d[0], x16
ins v29.d[1], x16
# Load E 'or' mask (stored in reg.f[0])
ldr q30, [x0, 64]
@@ -185,6 +187,7 @@ DECL(randomx_program_aarch64):
ldp x27, x28, [x30, -32] // literal_x27
ldp x29, x30, [x30, -16] // literal_x29
.balign 64
DECL(randomx_program_aarch64_main_loop):
# spAddr0 = spMix1 & ScratchpadL3Mask64;
# spAddr1 = (spMix1 >> 32) & ScratchpadL3Mask64;
@@ -213,40 +216,34 @@ DECL(randomx_program_aarch64_main_loop):
eor x14, x14, x20
eor x15, x15, x19
# Load group F registers (spAddr1)
ldr q17, [x17]
sxtl v16.2d, v17.2s
scvtf v16.2d, v16.2d
sxtl2 v17.2d, v17.4s
scvtf v17.2d, v17.2d
# Load group F/E registers (spAddr1)
ldp q17, q19, [x17]
ldp q21, q23, [x17, 32]
ldr q19, [x17, 16]
sxtl v16.2d, v17.2s
sxtl2 v17.2d, v17.4s
sxtl v18.2d, v19.2s
scvtf v18.2d, v18.2d
sxtl2 v19.2d, v19.4s
scvtf v16.2d, v16.2d
scvtf v17.2d, v17.2d
scvtf v18.2d, v18.2d
scvtf v19.2d, v19.2d
# Load group E registers (spAddr1)
ldr q21, [x17, 32]
sxtl v20.2d, v21.2s
scvtf v20.2d, v20.2d
sxtl2 v21.2d, v21.4s
scvtf v21.2d, v21.2d
ldr q23, [x17, 48]
sxtl v22.2d, v23.2s
scvtf v22.2d, v22.2d
sxtl2 v23.2d, v23.4s
scvtf v20.2d, v20.2d
scvtf v21.2d, v21.2d
scvtf v22.2d, v22.2d
scvtf v23.2d, v23.2d
and v20.16b, v20.16b, v29.16b
and v21.16b, v21.16b, v29.16b
and v22.16b, v22.16b, v29.16b
and v23.16b, v23.16b, v29.16b
orr v20.16b, v20.16b, v30.16b
orr v21.16b, v21.16b, v30.16b
orr v22.16b, v22.16b, v30.16b
orr v23.16b, v23.16b, v30.16b
bif v20.16b, v30.16b, v29.16b
bif v21.16b, v30.16b, v29.16b
bif v22.16b, v30.16b, v29.16b
bif v23.16b, v30.16b, v29.16b
# Execute VM instructions
DECL(randomx_program_aarch64_vm_instructions):
@@ -285,6 +282,7 @@ literal_v13: .fill 2,8,0
literal_v14: .fill 2,8,0
literal_v15: .fill 2,8,0
.balign 64
DECL(randomx_program_aarch64_vm_instructions_end):
# Calculate dataset pointer for dataset read
# Do it here to break false dependency from readReg2 and readReg3 (see next line)
@@ -347,15 +345,18 @@ DECL(randomx_program_aarch64_v2_FE_mix):
# f0 = aesenc(f0, e0), f1 = aesdec(f1, e0), f2 = aesenc(f2, e0), f3 = aesdec(f3, e0)
aese v16.16b, v28.16b
aesd v17.16b, v28.16b
aese v18.16b, v28.16b
aesd v19.16b, v28.16b
# Quote from ARM optimization guides for basically all ARM Cortex CPUs starting from Cortex-A53:
#
# "pairs of dependent AESE/AESMC and AESD/AESIMC instructions are higher performance
# when they are adjacent in the program code and both instructions use the same
# destination register since they are fused"
#
# Same applies to all Apple silicon CPUs
aesmc v16.16b, v16.16b
aesimc v17.16b, v17.16b
aesmc v18.16b, v18.16b
aesimc v19.16b, v19.16b
aese v16.16b, v28.16b; aesmc v16.16b, v16.16b
aesd v17.16b, v28.16b; aesimc v17.16b, v17.16b
aese v18.16b, v28.16b; aesmc v18.16b, v18.16b
aesd v19.16b, v28.16b; aesimc v19.16b, v19.16b
eor v16.16b, v16.16b, v20.16b
eor v17.16b, v17.16b, v20.16b
@@ -364,15 +365,10 @@ DECL(randomx_program_aarch64_v2_FE_mix):
# f0 = aesenc(f0, e1), f1 = aesdec(f1, e1), f2 = aesenc(f2, e1), f3 = aesdec(f3, e1)
aese v16.16b, v28.16b
aesd v17.16b, v28.16b
aese v18.16b, v28.16b
aesd v19.16b, v28.16b
aesmc v16.16b, v16.16b
aesimc v17.16b, v17.16b
aesmc v18.16b, v18.16b
aesimc v19.16b, v19.16b
aese v16.16b, v28.16b; aesmc v16.16b, v16.16b
aesd v17.16b, v28.16b; aesimc v17.16b, v17.16b
aese v18.16b, v28.16b; aesmc v18.16b, v18.16b
aesd v19.16b, v28.16b; aesimc v19.16b, v19.16b
eor v16.16b, v16.16b, v21.16b
eor v17.16b, v17.16b, v21.16b
@@ -381,15 +377,10 @@ DECL(randomx_program_aarch64_v2_FE_mix):
# f0 = aesenc(f0, e2), f1 = aesdec(f1, e2), f2 = aesenc(f2, e2), f3 = aesdec(f3, e2)
aese v16.16b, v28.16b
aesd v17.16b, v28.16b
aese v18.16b, v28.16b
aesd v19.16b, v28.16b
aesmc v16.16b, v16.16b
aesimc v17.16b, v17.16b
aesmc v18.16b, v18.16b
aesimc v19.16b, v19.16b
aese v16.16b, v28.16b; aesmc v16.16b, v16.16b
aesd v17.16b, v28.16b; aesimc v17.16b, v17.16b
aese v18.16b, v28.16b; aesmc v18.16b, v18.16b
aesd v19.16b, v28.16b; aesimc v19.16b, v19.16b
eor v16.16b, v16.16b, v22.16b
eor v17.16b, v17.16b, v22.16b
@@ -398,15 +389,10 @@ DECL(randomx_program_aarch64_v2_FE_mix):
# f0 = aesenc(f0, e3), f1 = aesdec(f1, e3), f2 = aesenc(f2, e3), f3 = aesdec(f3, e3)
aese v16.16b, v28.16b
aesd v17.16b, v28.16b
aese v18.16b, v28.16b
aesd v19.16b, v28.16b
aesmc v16.16b, v16.16b
aesimc v17.16b, v17.16b
aesmc v18.16b, v18.16b
aesimc v19.16b, v19.16b
aese v16.16b, v28.16b; aesmc v16.16b, v16.16b
aesd v17.16b, v28.16b; aesimc v17.16b, v17.16b
aese v18.16b, v28.16b; aesmc v18.16b, v18.16b
aesd v19.16b, v28.16b; aesimc v19.16b, v19.16b
eor v16.16b, v16.16b, v23.16b
eor v17.16b, v17.16b, v23.16b

View File

@@ -243,9 +243,11 @@ static void imm_to_x5(uint32_t imm, uint8_t*& p)
return;
}
if (imm_hi < (32 << 12)) {
const int32_t simm_hi = static_cast<int32_t>(imm_hi);
if ((simm_hi >= -(32 << 12)) && (simm_hi < (32 << 12))) {
//c.lui x5, imm_hi
emit16(0x6281 + (imm_hi >> 10));
emit16(0x6281 | ((imm_hi & 0x1F000) >> 10) | ((simm_hi < 0) ? 0x1000 : 0));
}
else {
// lui x5, imm_hi

View File

@@ -129,6 +129,8 @@ v10-v17 = sshash constants
v18 = temporary
v19 = dataset item store offsets
v24-v31 = temporary
*/
DECL(randomx_riscv64_vector_sshash_dataset_init):
@@ -180,6 +182,7 @@ DECL(randomx_riscv64_vector_sshash_dataset_init):
slli x13, x13, 6
add x13, x13, x11
.balign 64
init_item:
// Step 1. Init r0-r7
@@ -216,28 +219,7 @@ DECL(randomx_riscv64_vector_sshash_generated_instructions):
DECL(randomx_riscv64_vector_sshash_generated_instructions_end):
// Step 9. Concatenate registers r0-r7 in little endian format to get the final Dataset item data.
vsuxei64.v v0, (x11), v19
add x5, x11, 8
vsuxei64.v v1, (x5), v19
add x5, x11, 16
vsuxei64.v v2, (x5), v19
add x5, x11, 24
vsuxei64.v v3, (x5), v19
add x5, x11, 32
vsuxei64.v v4, (x5), v19
add x5, x11, 40
vsuxei64.v v5, (x5), v19
add x5, x11, 48
vsuxei64.v v6, (x5), v19
add x5, x11, 56
vsuxei64.v v7, (x5), v19
vsuxseg8ei64.v v0, (x11), v19
// Iterate to the next 4 items
vadd.vi v8, v8, 4
@@ -293,36 +275,15 @@ DECL(randomx_riscv64_vector_sshash_cache_prefetch):
// Step 6. XOR all registers with data loaded from randomx cache
DECL(randomx_riscv64_vector_sshash_xor):
vluxei64.v v18, (x10), v9
vxor.vv v0, v0, v18
add x5, x10, 8
vluxei64.v v18, (x5), v9
vxor.vv v1, v1, v18
add x5, x10, 16
vluxei64.v v18, (x5), v9
vxor.vv v2, v2, v18
add x5, x10, 24
vluxei64.v v18, (x5), v9
vxor.vv v3, v3, v18
add x5, x10, 32
vluxei64.v v18, (x5), v9
vxor.vv v4, v4, v18
add x5, x10, 40
vluxei64.v v18, (x5), v9
vxor.vv v5, v5, v18
add x5, x10, 48
vluxei64.v v18, (x5), v9
vxor.vv v6, v6, v18
add x5, x10, 56
vluxei64.v v18, (x5), v9
vxor.vv v7, v7, v18
vluxseg8ei64.v v24, (x10), v9
vxor.vv v0, v0, v24
vxor.vv v1, v1, v25
vxor.vv v2, v2, v26
vxor.vv v3, v3, v27
vxor.vv v4, v4, v28
vxor.vv v5, v5, v29
vxor.vv v6, v6, v30
vxor.vv v7, v7, v31
DECL(randomx_riscv64_vector_sshash_end):
@@ -564,6 +525,7 @@ DECL(randomx_riscv64_vector_program_v2_soft_aes_init):
vsetivli zero, 2, e64, m1, ta, ma
.balign 64
DECL(randomx_riscv64_vector_program_main_loop):
and x5, x15, x9 // x5 = spAddr0 & 64-byte aligned L3 mask
add x5, x5, x12 // x5 = &scratchpad[spAddr0 & 64-byte aligned L3 mask]

View File

@@ -11,7 +11,7 @@
#define APP_ID "xmrig"
#define APP_NAME "XMRig"
#define APP_DESC "XMRig miner"
#define APP_VERSION "6.26.0"
#define APP_VERSION "6.26.1-dev"
#define APP_DOMAIN "xmrig.com"
#define APP_SITE "www.xmrig.com"
#define APP_COPYRIGHT "Copyright (C) 2016-2026 xmrig.com"
@@ -19,7 +19,7 @@
#define APP_VER_MAJOR 6
#define APP_VER_MINOR 26
#define APP_VER_PATCH 0
#define APP_VER_PATCH 1
#ifdef _MSC_VER
# if (_MSC_VER >= 1950)