RandomX: 2.5% faster dataset init on RISC-V

Also includes a couple of small improvements in the main loop.
2026-06-18 10:22:39 -04:00 · 2026-05-07 17:57:16 +02:00
parent a7baa9cb63
commit f91b79681d
2 changed files with 18 additions and 54 deletions
--- a/src/crypto/randomx/jit_compiler_rv64_vector.cpp
+++ b/src/crypto/randomx/jit_compiler_rv64_vector.cpp
@@ -243,9 +243,11 @@ static void imm_to_x5(uint32_t imm, uint8_t*& p)
 		return;
 	}

-	if (imm_hi < (32 << 12)) {
+	const int32_t simm_hi = static_cast<int32_t>(imm_hi);
+
+	if ((simm_hi >= -(32 << 12)) && (simm_hi < (32 << 12))) {
 		//c.lui x5, imm_hi
-		emit16(0x6281 + (imm_hi >> 10));
+		emit16(0x6281 | ((imm_hi & 0x1F000) >> 10) | ((simm_hi < 0) ? 0x1000 : 0));
 	}
 	else {
 		// lui x5, imm_hi
--- a/src/crypto/randomx/jit_compiler_rv64_vector_static.S
+++ b/src/crypto/randomx/jit_compiler_rv64_vector_static.S
@@ -129,6 +129,8 @@ v10-v17	= sshash constants
 v18	= temporary

 v19	= dataset item store offsets
+
+v24-v31 = temporary
 */

 DECL(randomx_riscv64_vector_sshash_dataset_init):
@@ -180,6 +182,7 @@ DECL(randomx_riscv64_vector_sshash_dataset_init):
 	slli x13, x13, 6
 	add x13, x13, x11

+.balign 64
 init_item:
 	// Step 1. Init r0-r7

@@ -216,28 +219,7 @@ DECL(randomx_riscv64_vector_sshash_generated_instructions):

 DECL(randomx_riscv64_vector_sshash_generated_instructions_end):
 	// Step 9. Concatenate registers r0-r7 in little endian format to get the final Dataset item data.
-	vsuxei64.v v0, (x11), v19
-
-	add x5, x11, 8
-	vsuxei64.v v1, (x5), v19
-
-	add x5, x11, 16
-	vsuxei64.v v2, (x5), v19
-
-	add x5, x11, 24
-	vsuxei64.v v3, (x5), v19
-
-	add x5, x11, 32
-	vsuxei64.v v4, (x5), v19
-
-	add x5, x11, 40
-	vsuxei64.v v5, (x5), v19
-
-	add x5, x11, 48
-	vsuxei64.v v6, (x5), v19
-
-	add x5, x11, 56
-	vsuxei64.v v7, (x5), v19
+	vsuxseg8ei64.v v0, (x11), v19

 	// Iterate to the next 4 items
 	vadd.vi v8, v8, 4
@@ -293,36 +275,15 @@ DECL(randomx_riscv64_vector_sshash_cache_prefetch):

 // Step 6. XOR all registers with data loaded from randomx cache
 DECL(randomx_riscv64_vector_sshash_xor):
-	vluxei64.v v18, (x10), v9
-	vxor.vv v0, v0, v18
-
-	add x5, x10, 8
-	vluxei64.v v18, (x5), v9
-	vxor.vv v1, v1, v18
-
-	add x5, x10, 16
-	vluxei64.v v18, (x5), v9
-	vxor.vv v2, v2, v18
-
-	add x5, x10, 24
-	vluxei64.v v18, (x5), v9
-	vxor.vv v3, v3, v18
-
-	add x5, x10, 32
-	vluxei64.v v18, (x5), v9
-	vxor.vv v4, v4, v18
-
-	add x5, x10, 40
-	vluxei64.v v18, (x5), v9
-	vxor.vv v5, v5, v18
-
-	add x5, x10, 48
-	vluxei64.v v18, (x5), v9
-	vxor.vv v6, v6, v18
-
-	add x5, x10, 56
-	vluxei64.v v18, (x5), v9
-	vxor.vv v7, v7, v18
+	vluxseg8ei64.v v24, (x10), v9
+	vxor.vv v0, v0, v24
+	vxor.vv v1, v1, v25
+	vxor.vv v2, v2, v26
+	vxor.vv v3, v3, v27
+	vxor.vv v4, v4, v28
+	vxor.vv v5, v5, v29
+	vxor.vv v6, v6, v30
+	vxor.vv v7, v7, v31

 DECL(randomx_riscv64_vector_sshash_end):

@@ -564,6 +525,7 @@ DECL(randomx_riscv64_vector_program_v2_soft_aes_init):

 	vsetivli zero, 2, e64, m1, ta, ma

+.balign 64
 DECL(randomx_riscv64_vector_program_main_loop):
 	and x5, x15, x9		// x5 = spAddr0 & 64-byte aligned L3 mask
 	add x5, x5, x12		// x5 = &scratchpad[spAddr0 & 64-byte aligned L3 mask]