Merge pull request #3812 from SChernykh/dev

RandomX: 2.5% faster dataset init on RISC-V
2026-06-20 11:32:41 -04:00 · 2026-05-07 23:08:49 +07:00 · 2026-05-07 17:57:16 +02:00 · 2026-04-30 17:31:14 +07:00 · 2026-04-29 15:29:14 +02:00 · 2026-04-25 17:38:04 +07:00
6 changed files with 40 additions and 83 deletions
--- a/src/base/tools/cryptonote/BlockTemplate.cpp
+++ b/src/base/tools/cryptonote/BlockTemplate.cpp
@@ -406,9 +406,9 @@ bool xmrig::BlockTemplate::parse(bool hashes)
    if (hashes) {
        // FCMP++ layout:
        //
-        // index 0  fcmp_pp_n_tree_layers + 31 zero bytes
+        // index 0  coinbase transaction hash
-        // index 1  fcmp_pp_tree_root
+        // index 1  fcmp_pp_n_tree_layers + 31 zero bytes
-        // index 2  coinbase transaction hash
+        // index 2  fcmp_pp_tree_root
        // index 3+ other transaction hashes
        //
        // pre-FCMP++ layout:
@@ -416,30 +416,28 @@ bool xmrig::BlockTemplate::parse(bool hashes)
        // index 0  coinbase transaction hash
        // index 1+ other transaction hashes
        //
-        const uint32_t coinbase_tx_index = is_fcmp_pp ? 2 : 0;
+        // Update: FCMP++ moved the coinbase tx hash to index 0, consistent with the pre-FCMP++ layout
        m_hashes.clear();
-        m_hashes.resize((coinbase_tx_index + m_numHashes + 1) * kHashSize);
+        m_hashes.resize((m_numHashes + (is_fcmp_pp ? 3 : 1)) * kHashSize);
-        uint8_t* data = m_hashes.data() + coinbase_tx_index * kHashSize;
+        calculateMinerTxHash(blob(MINER_TX_PREFIX_OFFSET), blob(MINER_TX_PREFIX_END_OFFSET), m_hashes.data());
        calculateMinerTxHash(blob(MINER_TX_PREFIX_OFFSET), blob(MINER_TX_PREFIX_END_OFFSET), data);
        for (uint64_t i = 1; i <= m_numHashes; ++i) {
            Span h;
            ar(h, kHashSize);
-            memcpy(data + i * kHashSize, h.data(), kHashSize);
+            memcpy(m_hashes.data() + (i + (is_fcmp_pp ? 2 : 0)) * kHashSize, h.data(), kHashSize);
        }
        if (is_fcmp_pp) {
            ar(m_FCMPTreeLayers);
            ar(m_FCMPTreeRoot);
-            m_hashes[0] = m_FCMPTreeLayers;
+            m_hashes[kHashSize] = m_FCMPTreeLayers;
-            memcpy(m_hashes.data() + kHashSize, m_FCMPTreeRoot, kHashSize);
+            memcpy(m_hashes.data() + kHashSize * 2, m_FCMPTreeRoot, kHashSize);
        }
-        calculateMerkleTreeHash(coinbase_tx_index);
+        calculateMerkleTreeHash(0);
    }
    return true;
--- a/src/crypto/randomx/jit_compiler_a64.cpp
+++ b/src/crypto/randomx/jit_compiler_a64.cpp
@@ -1059,11 +1059,8 @@ void JitCompilerA64::h_FDIV_M(Instruction& instr, uint32_t& codePos)
 	constexpr uint32_t tmp_reg_fp = 28;
 	emitMemLoadFP<tmp_reg_fp>(src, instr, code, k);
-	// and tmp_reg_fp, tmp_reg_fp, and_mask_reg
+	// bif tmp_reg_fp, or_mask_reg, and_mask_reg
-	emit32(0x4E201C00 | tmp_reg_fp | (tmp_reg_fp << 5) | (29 << 16), code, k);
+	emit32(0x6EE01C00 | tmp_reg_fp | (30 << 5) | (29 << 16), code, k);
 	// orr tmp_reg_fp, tmp_reg_fp, or_mask_reg
 	emit32(0x4EA01C00 | tmp_reg_fp | (tmp_reg_fp << 5) | (30 << 16), code, k);
 	emit32(ARMV8A::FDIV | dst | (dst << 5) | (tmp_reg_fp << 16), code, k);
--- a/src/crypto/randomx/jit_compiler_a64_static.S
+++ b/src/crypto/randomx/jit_compiler_a64_static.S
@@ -109,7 +109,7 @@
 # v26 -> "a2"
 # v27 -> "a3"
 # v28 -> temporary
-# v29 -> E 'and' mask = 0x00ffffffffffffff'00ffffffffffffff
+# v29 -> E 'and' mask = 0x00ffffffffc00000'00ffffffffc00000
 # v30 -> E 'or' mask  = 0x3*00000000******'3*00000000******
 # v31 -> scale mask   = 0x80f0000000000000'80f0000000000000
@@ -151,7 +151,9 @@ DECL(randomx_program_aarch64):
 	ldp	q26, q27, [x0, 224]
 	# Load E 'and' mask
-	movi	v29.2d, #0x00FFFFFFFFFFFFFF
+	mov	x16, 0x00FFFFFFFFC00000
 	ins	v29.d[0], x16
 	ins	v29.d[1], x16
 	# Load E 'or' mask (stored in reg.f[0])
 	ldr	q30, [x0, 64]
@@ -239,14 +241,10 @@ DECL(randomx_program_aarch64_main_loop):
 	sxtl2	v23.2d, v23.4s
 	scvtf	v23.2d, v23.2d
-	and	v20.16b, v20.16b, v29.16b
+	bif	v20.16b, v30.16b, v29.16b
-	and	v21.16b, v21.16b, v29.16b
+	bif	v21.16b, v30.16b, v29.16b
-	and	v22.16b, v22.16b, v29.16b
+	bif	v22.16b, v30.16b, v29.16b
-	and	v23.16b, v23.16b, v29.16b
+	bif	v23.16b, v30.16b, v29.16b
 	orr	v20.16b, v20.16b, v30.16b
 	orr	v21.16b, v21.16b, v30.16b
 	orr	v22.16b, v22.16b, v30.16b
 	orr	v23.16b, v23.16b, v30.16b
 	# Execute VM instructions
 DECL(randomx_program_aarch64_vm_instructions):
--- a/src/crypto/randomx/jit_compiler_rv64_vector.cpp
+++ b/src/crypto/randomx/jit_compiler_rv64_vector.cpp
@@ -243,9 +243,11 @@ static void imm_to_x5(uint32_t imm, uint8_t*& p)
 		return;
 	}
-	if (imm_hi < (32 << 12)) {
+	const int32_t simm_hi = static_cast<int32_t>(imm_hi);
 	if ((simm_hi >= -(32 << 12)) && (simm_hi < (32 << 12))) {
 		//c.lui x5, imm_hi
-		emit16(0x6281 + (imm_hi >> 10));
+		emit16(0x6281 | ((imm_hi & 0x1F000) >> 10) | ((simm_hi < 0) ? 0x1000 : 0));
 	}
 	else {
 		// lui x5, imm_hi
--- a/src/crypto/randomx/jit_compiler_rv64_vector_static.S
+++ b/src/crypto/randomx/jit_compiler_rv64_vector_static.S
@@ -129,6 +129,8 @@ v10-v17	= sshash constants
 v18	= temporary
 v19	= dataset item store offsets
 v24-v31 = temporary
 */
 DECL(randomx_riscv64_vector_sshash_dataset_init):
@@ -180,6 +182,7 @@ DECL(randomx_riscv64_vector_sshash_dataset_init):
 	slli x13, x13, 6
 	add x13, x13, x11
 .balign 64
 init_item:
 	// Step 1. Init r0-r7
@@ -216,28 +219,7 @@ DECL(randomx_riscv64_vector_sshash_generated_instructions):
 DECL(randomx_riscv64_vector_sshash_generated_instructions_end):
 	// Step 9. Concatenate registers r0-r7 in little endian format to get the final Dataset item data.
-	vsuxei64.v v0, (x11), v19
+	vsuxseg8ei64.v v0, (x11), v19
 	add x5, x11, 8
 	vsuxei64.v v1, (x5), v19
 	add x5, x11, 16
 	vsuxei64.v v2, (x5), v19
 	add x5, x11, 24
 	vsuxei64.v v3, (x5), v19
 	add x5, x11, 32
 	vsuxei64.v v4, (x5), v19
 	add x5, x11, 40
 	vsuxei64.v v5, (x5), v19
 	add x5, x11, 48
 	vsuxei64.v v6, (x5), v19
 	add x5, x11, 56
 	vsuxei64.v v7, (x5), v19
 	// Iterate to the next 4 items
 	vadd.vi v8, v8, 4
@@ -293,36 +275,15 @@ DECL(randomx_riscv64_vector_sshash_cache_prefetch):
 // Step 6. XOR all registers with data loaded from randomx cache
 DECL(randomx_riscv64_vector_sshash_xor):
-	vluxei64.v v18, (x10), v9
+	vluxseg8ei64.v v24, (x10), v9
-	vxor.vv v0, v0, v18
+	vxor.vv v0, v0, v24
-
+	vxor.vv v1, v1, v25
-	add x5, x10, 8
+	vxor.vv v2, v2, v26
-	vluxei64.v v18, (x5), v9
+	vxor.vv v3, v3, v27
-	vxor.vv v1, v1, v18
+	vxor.vv v4, v4, v28
-
+	vxor.vv v5, v5, v29
-	add x5, x10, 16
+	vxor.vv v6, v6, v30
-	vluxei64.v v18, (x5), v9
+	vxor.vv v7, v7, v31
 	vxor.vv v2, v2, v18
 	add x5, x10, 24
 	vluxei64.v v18, (x5), v9
 	vxor.vv v3, v3, v18
 	add x5, x10, 32
 	vluxei64.v v18, (x5), v9
 	vxor.vv v4, v4, v18
 	add x5, x10, 40
 	vluxei64.v v18, (x5), v9
 	vxor.vv v5, v5, v18
 	add x5, x10, 48
 	vluxei64.v v18, (x5), v9
 	vxor.vv v6, v6, v18
 	add x5, x10, 56
 	vluxei64.v v18, (x5), v9
 	vxor.vv v7, v7, v18
 DECL(randomx_riscv64_vector_sshash_end):
@@ -564,6 +525,7 @@ DECL(randomx_riscv64_vector_program_v2_soft_aes_init):
 	vsetivli zero, 2, e64, m1, ta, ma
 .balign 64
 DECL(randomx_riscv64_vector_program_main_loop):
 	and x5, x15, x9		// x5 = spAddr0 & 64-byte aligned L3 mask
 	add x5, x5, x12		// x5 = &scratchpad[spAddr0 & 64-byte aligned L3 mask]
--- a/src/version.h
+++ b/src/version.h
@@ -11,7 +11,7 @@
 #define APP_ID        "xmrig"
 #define APP_NAME      "XMRig"
 #define APP_DESC      "XMRig miner"
-#define APP_VERSION   "6.26.0"
+#define APP_VERSION   "6.26.1-dev"
 #define APP_DOMAIN    "xmrig.com"
 #define APP_SITE      "www.xmrig.com"
 #define APP_COPYRIGHT "Copyright (C) 2016-2026 xmrig.com"
@@ -19,7 +19,7 @@
 #define APP_VER_MAJOR  6
 #define APP_VER_MINOR  26
-#define APP_VER_PATCH  0
+#define APP_VER_PATCH  1
 #ifdef _MSC_VER
 #   if (_MSC_VER >= 1950)
Author	SHA1	Message	Date
xmrig	ab8f005977	Merge pull request #3812 from SChernykh/dev RandomX: 2.5% faster dataset init on RISC-V	2026-05-07 23:08:49 +07:00
SChernykh	f91b79681d	RandomX: 2.5% faster dataset init on RISC-V And a couple small improvements in the main loop.	2026-05-07 17:57:16 +02:00
xmrig	a7baa9cb63	Merge pull request #3807 from SChernykh/dev Update FCMP++ block template layout	2026-04-30 17:31:14 +07:00
SChernykh	c59c03e137	Update FCMP++ block template layout	2026-04-29 15:29:14 +02:00
xmrig	80eff55ed6	Merge pull request #3805 from SChernykh/dev ARM64 JIT: Optimize Group E register conversion	2026-04-25 17:38:04 +07:00
SChernykh	5347458fc7	ARM64 JIT: Optimize Group E register conversion Based on https://github.com/tevador/RandomX/pull/324	2026-04-25 11:37:47 +02:00
XMRig	6bf43053f7	v6.26.1-dev	2026-03-28 20:43:46 +07:00
XMRig	69b7e60d35	Merge branch 'master' into dev	2026-03-28 20:42:02 +07:00