Merge pull request #3812 from SChernykh/dev

RandomX: 2.5% faster dataset init on RISC-V
2026-06-20 11:32:41 -04:00 · 2026-05-07 23:08:49 +07:00 · 2026-05-07 17:57:16 +02:00 · 2026-04-30 17:31:14 +07:00 · 2026-04-29 15:29:14 +02:00 · 2026-04-25 17:38:04 +07:00
6 changed files with 40 additions and 83 deletions
--- a/src/base/tools/cryptonote/BlockTemplate.cpp
+++ b/src/base/tools/cryptonote/BlockTemplate.cpp
@@ -406,9 +406,9 @@ bool xmrig::BlockTemplate::parse(bool hashes)
    if (hashes) {
        // FCMP++ layout:
        //
-        // index 0  fcmp_pp_n_tree_layers + 31 zero bytes
-        // index 1  fcmp_pp_tree_root
-        // index 2  coinbase transaction hash
+        // index 0  coinbase transaction hash
+        // index 1  fcmp_pp_n_tree_layers + 31 zero bytes
+        // index 2  fcmp_pp_tree_root
        // index 3+ other transaction hashes
        //
        // pre-FCMP++ layout:
@@ -416,30 +416,28 @@ bool xmrig::BlockTemplate::parse(bool hashes)
        // index 0  coinbase transaction hash
        // index 1+ other transaction hashes
        //
-        const uint32_t coinbase_tx_index = is_fcmp_pp ? 2 : 0;
+        // Note: FCMP++ keeps the coinbase tx hash at index 0, consistent with the pre-fork layout

        m_hashes.clear();
-        m_hashes.resize((coinbase_tx_index + m_numHashes + 1) * kHashSize);
+        m_hashes.resize((m_numHashes + (is_fcmp_pp ? 3 : 1)) * kHashSize);

-        uint8_t* data = m_hashes.data() + coinbase_tx_index * kHashSize;
-
-        calculateMinerTxHash(blob(MINER_TX_PREFIX_OFFSET), blob(MINER_TX_PREFIX_END_OFFSET), data);
+        calculateMinerTxHash(blob(MINER_TX_PREFIX_OFFSET), blob(MINER_TX_PREFIX_END_OFFSET), m_hashes.data());

        for (uint64_t i = 1; i <= m_numHashes; ++i) {
            Span h;
            ar(h, kHashSize);
-            memcpy(data + i * kHashSize, h.data(), kHashSize);
+            memcpy(m_hashes.data() + (i + (is_fcmp_pp ? 2 : 0)) * kHashSize, h.data(), kHashSize);
        }

        if (is_fcmp_pp) {
            ar(m_FCMPTreeLayers);
            ar(m_FCMPTreeRoot);

-            m_hashes[0] = m_FCMPTreeLayers;
-            memcpy(m_hashes.data() + kHashSize, m_FCMPTreeRoot, kHashSize);
+            m_hashes[kHashSize] = m_FCMPTreeLayers;
+            memcpy(m_hashes.data() + kHashSize * 2, m_FCMPTreeRoot, kHashSize);
        }

-        calculateMerkleTreeHash(coinbase_tx_index);
+        calculateMerkleTreeHash(0);
    }

    return true;
--- a/src/crypto/randomx/jit_compiler_a64.cpp
+++ b/src/crypto/randomx/jit_compiler_a64.cpp
@@ -1059,11 +1059,8 @@ void JitCompilerA64::h_FDIV_M(Instruction& instr, uint32_t& codePos)
 	constexpr uint32_t tmp_reg_fp = 28;
 	emitMemLoadFP<tmp_reg_fp>(src, instr, code, k);

-	// and tmp_reg_fp, tmp_reg_fp, and_mask_reg
-	emit32(0x4E201C00 | tmp_reg_fp | (tmp_reg_fp << 5) | (29 << 16), code, k);
-
-	// orr tmp_reg_fp, tmp_reg_fp, or_mask_reg
-	emit32(0x4EA01C00 | tmp_reg_fp | (tmp_reg_fp << 5) | (30 << 16), code, k);
+	// bif tmp_reg_fp, or_mask_reg, and_mask_reg
+	emit32(0x6EE01C00 | tmp_reg_fp | (30 << 5) | (29 << 16), code, k);

 	emit32(ARMV8A::FDIV | dst | (dst << 5) | (tmp_reg_fp << 16), code, k);

--- a/src/crypto/randomx/jit_compiler_a64_static.S
+++ b/src/crypto/randomx/jit_compiler_a64_static.S
@@ -109,7 +109,7 @@
 # v26 -> "a2"
 # v27 -> "a3"
 # v28 -> temporary
-# v29 -> E 'and' mask = 0x00ffffffffffffff'00ffffffffffffff
+# v29 -> E 'and' mask = 0x00ffffffffc00000'00ffffffffc00000
 # v30 -> E 'or' mask  = 0x3*00000000******'3*00000000******
 # v31 -> scale mask   = 0x80f0000000000000'80f0000000000000

@@ -151,7 +151,9 @@ DECL(randomx_program_aarch64):
 	ldp	q26, q27, [x0, 224]

 	# Load E 'and' mask
-	movi	v29.2d, #0x00FFFFFFFFFFFFFF
+	mov	x16, 0x00FFFFFFFFC00000
+	ins	v29.d[0], x16
+	ins	v29.d[1], x16

 	# Load E 'or' mask (stored in reg.f[0])
 	ldr	q30, [x0, 64]
@@ -239,14 +241,10 @@ DECL(randomx_program_aarch64_main_loop):
 	sxtl2	v23.2d, v23.4s
 	scvtf	v23.2d, v23.2d

-	and	v20.16b, v20.16b, v29.16b
-	and	v21.16b, v21.16b, v29.16b
-	and	v22.16b, v22.16b, v29.16b
-	and	v23.16b, v23.16b, v29.16b
-	orr	v20.16b, v20.16b, v30.16b
-	orr	v21.16b, v21.16b, v30.16b
-	orr	v22.16b, v22.16b, v30.16b
-	orr	v23.16b, v23.16b, v30.16b
+	bif	v20.16b, v30.16b, v29.16b
+	bif	v21.16b, v30.16b, v29.16b
+	bif	v22.16b, v30.16b, v29.16b
+	bif	v23.16b, v30.16b, v29.16b

 	# Execute VM instructions
 DECL(randomx_program_aarch64_vm_instructions):
--- a/src/crypto/randomx/jit_compiler_rv64_vector.cpp
+++ b/src/crypto/randomx/jit_compiler_rv64_vector.cpp
@@ -243,9 +243,11 @@ static void imm_to_x5(uint32_t imm, uint8_t*& p)
 		return;
 	}

-	if (imm_hi < (32 << 12)) {
+	const int32_t simm_hi = static_cast<int32_t>(imm_hi);
+
+	if ((simm_hi >= -(32 << 12)) && (simm_hi < (32 << 12))) {
 		//c.lui x5, imm_hi
-		emit16(0x6281 + (imm_hi >> 10));
+		emit16(0x6281 | ((imm_hi & 0x1F000) >> 10) | ((simm_hi < 0) ? 0x1000 : 0));
 	}
 	else {
 		// lui x5, imm_hi
--- a/src/crypto/randomx/jit_compiler_rv64_vector_static.S
+++ b/src/crypto/randomx/jit_compiler_rv64_vector_static.S
@@ -129,6 +129,8 @@ v10-v17	= sshash constants
 v18	= temporary

 v19	= dataset item store offsets
+
+v24-v31 = temporary
 */

 DECL(randomx_riscv64_vector_sshash_dataset_init):
@@ -180,6 +182,7 @@ DECL(randomx_riscv64_vector_sshash_dataset_init):
 	slli x13, x13, 6
 	add x13, x13, x11

+.balign 64
 init_item:
 	// Step 1. Init r0-r7

@@ -216,28 +219,7 @@ DECL(randomx_riscv64_vector_sshash_generated_instructions):

 DECL(randomx_riscv64_vector_sshash_generated_instructions_end):
 	// Step 9. Concatenate registers r0-r7 in little endian format to get the final Dataset item data.
-	vsuxei64.v v0, (x11), v19
-
-	add x5, x11, 8
-	vsuxei64.v v1, (x5), v19
-
-	add x5, x11, 16
-	vsuxei64.v v2, (x5), v19
-
-	add x5, x11, 24
-	vsuxei64.v v3, (x5), v19
-
-	add x5, x11, 32
-	vsuxei64.v v4, (x5), v19
-
-	add x5, x11, 40
-	vsuxei64.v v5, (x5), v19
-
-	add x5, x11, 48
-	vsuxei64.v v6, (x5), v19
-
-	add x5, x11, 56
-	vsuxei64.v v7, (x5), v19
+	vsuxseg8ei64.v v0, (x11), v19

 	// Iterate to the next 4 items
 	vadd.vi v8, v8, 4
@@ -293,36 +275,15 @@ DECL(randomx_riscv64_vector_sshash_cache_prefetch):

 // Step 6. XOR all registers with data loaded from randomx cache
 DECL(randomx_riscv64_vector_sshash_xor):
-	vluxei64.v v18, (x10), v9
-	vxor.vv v0, v0, v18
-
-	add x5, x10, 8
-	vluxei64.v v18, (x5), v9
-	vxor.vv v1, v1, v18
-
-	add x5, x10, 16
-	vluxei64.v v18, (x5), v9
-	vxor.vv v2, v2, v18
-
-	add x5, x10, 24
-	vluxei64.v v18, (x5), v9
-	vxor.vv v3, v3, v18
-
-	add x5, x10, 32
-	vluxei64.v v18, (x5), v9
-	vxor.vv v4, v4, v18
-
-	add x5, x10, 40
-	vluxei64.v v18, (x5), v9
-	vxor.vv v5, v5, v18
-
-	add x5, x10, 48
-	vluxei64.v v18, (x5), v9
-	vxor.vv v6, v6, v18
-
-	add x5, x10, 56
-	vluxei64.v v18, (x5), v9
-	vxor.vv v7, v7, v18
+	vluxseg8ei64.v v24, (x10), v9
+	vxor.vv v0, v0, v24
+	vxor.vv v1, v1, v25
+	vxor.vv v2, v2, v26
+	vxor.vv v3, v3, v27
+	vxor.vv v4, v4, v28
+	vxor.vv v5, v5, v29
+	vxor.vv v6, v6, v30
+	vxor.vv v7, v7, v31

 DECL(randomx_riscv64_vector_sshash_end):

@@ -564,6 +525,7 @@ DECL(randomx_riscv64_vector_program_v2_soft_aes_init):

 	vsetivli zero, 2, e64, m1, ta, ma

+.balign 64
 DECL(randomx_riscv64_vector_program_main_loop):
 	and x5, x15, x9		// x5 = spAddr0 & 64-byte aligned L3 mask
 	add x5, x5, x12		// x5 = &scratchpad[spAddr0 & 64-byte aligned L3 mask]
--- a/src/version.h
+++ b/src/version.h
@@ -11,7 +11,7 @@
 #define APP_ID        "xmrig"
 #define APP_NAME      "XMRig"
 #define APP_DESC      "XMRig miner"
-#define APP_VERSION   "6.26.0"
+#define APP_VERSION   "6.26.1-dev"
 #define APP_DOMAIN    "xmrig.com"
 #define APP_SITE      "www.xmrig.com"
 #define APP_COPYRIGHT "Copyright (C) 2016-2026 xmrig.com"
@@ -19,7 +19,7 @@

 #define APP_VER_MAJOR  6
 #define APP_VER_MINOR  26
-#define APP_VER_PATCH  0
+#define APP_VER_PATCH  1

 #ifdef _MSC_VER
 #   if (_MSC_VER >= 1950)
Author	SHA1	Message	Date
xmrig	ab8f005977	Merge pull request #3812 from SChernykh/dev — RandomX: 2.5% faster dataset init on RISC-V	2026-05-07 23:08:49 +07:00
SChernykh	f91b79681d	RandomX: 2.5% faster dataset init on RISC-V — and a couple of small improvements in the main loop.	2026-05-07 17:57:16 +02:00
xmrig	a7baa9cb63	Merge pull request #3807 from SChernykh/dev — Update FCMP++ block template layout	2026-04-30 17:31:14 +07:00
SChernykh	c59c03e137	Update FCMP++ block template layout	2026-04-29 15:29:14 +02:00
xmrig	80eff55ed6	Merge pull request #3805 from SChernykh/dev — ARM64 JIT: Optimize Group E register conversion	2026-04-25 17:38:04 +07:00
SChernykh	5347458fc7	ARM64 JIT: Optimize Group E register conversion — based on https://github.com/tevador/RandomX/pull/324	2026-04-25 11:37:47 +02:00
XMRig	6bf43053f7	v6.26.1-dev	2026-03-28 20:43:46 +07:00
XMRig	69b7e60d35	Merge branch 'master' into dev	2026-03-28 20:42:02 +07:00