RISC-V: added vectorized soft AES

2026-04-17 04:59:28 -04:00 · 2025-12-05 21:09:22 +01:00
parent 7ef5142a52
commit 23da1a90f5
5 changed files with 231 additions and 0 deletions
--- a/cmake/cpu.cmake
+++ b/cmake/cpu.cmake
@@ -55,6 +55,18 @@ if (XMRIG_RISCV)
    if(ARCH STREQUAL "native")
        enable_language(ASM)

+        try_run(RANDOMX_VECTOR_RUN_FAIL
+            RANDOMX_VECTOR_COMPILE_OK
+            ${CMAKE_CURRENT_BINARY_DIR}/
+            ${CMAKE_CURRENT_SOURCE_DIR}/src/crypto/randomx/tests/riscv64_vector.s
+            COMPILE_DEFINITIONS "-march=rv64gcv_zicbop")
+
+        if (RANDOMX_VECTOR_COMPILE_OK AND NOT RANDOMX_VECTOR_RUN_FAIL)
+            set(RVARCH "${RVARCH}v_zicbop")
+            add_definitions(-DXMRIG_RVV_ENABLED)
+            message(STATUS "RISC-V vector extension detected")
+        endif()
+
        try_run(RANDOMX_ZBA_RUN_FAIL
            RANDOMX_ZBA_COMPILE_OK
            ${CMAKE_CURRENT_BINARY_DIR}/
--- a/src/crypto/randomx/aes_hash.cpp
+++ b/src/crypto/randomx/aes_hash.cpp
@@ -235,6 +235,131 @@ void fillAes4Rx4(void *state, size_t outputSize, void *buffer) {
 template void fillAes4Rx4<true>(void *state, size_t outputSize, void *buffer);
 template void fillAes4Rx4<false>(void *state, size_t outputSize, void *buffer);

+#if defined(XMRIG_RISCV) && defined(XMRIG_RVV_ENABLED)
+static constexpr uint32_t AES_HASH_1R_STATE02[8] = { 0x92b52c0d, 0x9fa856de, 0xcc82db47, 0xd7983aad, 0x6a770017, 0xae62c7d0, 0x5079506b, 0xe8a07ce4 };
+static constexpr uint32_t AES_HASH_1R_STATE13[8] = { 0x338d996e, 0x15c7b798, 0xf59e125a, 0xace78057, 0x630a240c, 0x07ad828d, 0x79a10005, 0x7e994948 };
+
+static constexpr uint32_t AES_GEN_1R_KEY02[8] = { 0x6daca553, 0x62716609, 0xdbb5552b, 0xb4f44917, 0x3f1262f1, 0x9f947ec6, 0xf4c0794f, 0x3e20e345 };
+static constexpr uint32_t AES_GEN_1R_KEY13[8] = { 0x6d7caf07, 0x846a710d, 0x1725d378, 0x0da1dc4e, 0x6aef8135, 0xb1ba317c, 0x16314c88, 0x49169154 };
+
+static constexpr uint32_t AES_HASH_1R_XKEY00[8] = { 0xf6fa8389, 0x8b24949f, 0x90dc56bf, 0x06890201, 0xf6fa8389, 0x8b24949f, 0x90dc56bf, 0x06890201 };
+static constexpr uint32_t AES_HASH_1R_XKEY11[8] = { 0x61b263d1, 0x51f4e03c, 0xee1043c6, 0xed18f99b, 0x61b263d1, 0x51f4e03c, 0xee1043c6, 0xed18f99b };
+
+static constexpr uint32_t AES_HASH_STRIDE[8] = { 0, 4, 8, 12, 32, 36, 40, 44 };
+
+template<int softAes, int unroll>
+void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state) {
+	PROFILE_SCOPE(RandomX_AES);
+
+	uint8_t* scratchpadPtr = (uint8_t*)scratchpad;
+	const uint8_t* scratchpadEnd = scratchpadPtr + scratchpadSize;
+
+	vuint32m1_t hash_state02 = __riscv_vle32_v_u32m1(AES_HASH_1R_STATE02, 8);
+	vuint32m1_t hash_state13 = __riscv_vle32_v_u32m1(AES_HASH_1R_STATE13, 8);
+
+	const vuint32m1_t key02 = __riscv_vle32_v_u32m1(AES_GEN_1R_KEY02, 8);
+	const vuint32m1_t key13 = __riscv_vle32_v_u32m1(AES_GEN_1R_KEY13, 8);
+
+	const vuint32m1_t stride = __riscv_vle32_v_u32m1(AES_HASH_STRIDE, 8);
+
+	vuint32m1_t fill_state02 = __riscv_vluxei32_v_u32m1((uint32_t*)fill_state + 0, stride, 8);
+	vuint32m1_t fill_state13 = __riscv_vluxei32_v_u32m1((uint32_t*)fill_state + 4, stride, 8);
+
+	const vuint8m1_t lutenc_index0 = __riscv_vle8_v_u8m1(lutEncIndex[0], 32);
+	const vuint8m1_t lutenc_index1 = __riscv_vle8_v_u8m1(lutEncIndex[1], 32);
+	const vuint8m1_t lutenc_index2 = __riscv_vle8_v_u8m1(lutEncIndex[2], 32);
+	const vuint8m1_t lutenc_index3 = __riscv_vle8_v_u8m1(lutEncIndex[3], 32);
+
+	const vuint8m1_t& lutdec_index0 = lutenc_index0;
+	const vuint8m1_t lutdec_index1 = __riscv_vle8_v_u8m1(lutDecIndex[1], 32);
+	const vuint8m1_t& lutdec_index2 = lutenc_index2;
+	const vuint8m1_t lutdec_index3 = __riscv_vle8_v_u8m1(lutDecIndex[3], 32);
+
+	//process 64 bytes at a time in 4 lanes
+	while (scratchpadPtr < scratchpadEnd) {
+#define HASH_STATE(k) \
+		hash_state02 = softaes_vector_double(hash_state02, __riscv_vluxei32_v_u32m1((uint32_t*)scratchpadPtr + k * 16 + 0, stride, 8), lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3); \
+		hash_state13 = softaes_vector_double(hash_state13, __riscv_vluxei32_v_u32m1((uint32_t*)scratchpadPtr + k * 16 + 4, stride, 8), lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3);
+
+#define FILL_STATE(k) \
+		fill_state02 = softaes_vector_double(fill_state02, key02, lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3); \
+		fill_state13 = softaes_vector_double(fill_state13, key13, lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3); \
+		__riscv_vsuxei32_v_u32m1((uint32_t*)scratchpadPtr + k * 16 + 0, stride, fill_state02, 8); \
+		__riscv_vsuxei32_v_u32m1((uint32_t*)scratchpadPtr + k * 16 + 4, stride, fill_state13, 8);
+
+		switch (softAes) {
+			case 0:
+				HASH_STATE(0);
+				HASH_STATE(1);
+
+				FILL_STATE(0);
+				FILL_STATE(1);
+
+				scratchpadPtr += 128;
+				break;
+
+			default:
+				switch (unroll) {
+					case 4:
+						HASH_STATE(0);
+						FILL_STATE(0);
+
+						HASH_STATE(1);
+						FILL_STATE(1);
+
+						HASH_STATE(2);
+						FILL_STATE(2);
+
+						HASH_STATE(3);
+						FILL_STATE(3);
+
+						scratchpadPtr += 64 * 4;
+						break;
+
+					case 2:
+						HASH_STATE(0);
+						FILL_STATE(0);
+
+						HASH_STATE(1);
+						FILL_STATE(1);
+
+						scratchpadPtr += 64 * 2;
+						break;
+
+					default:
+						HASH_STATE(0);
+						FILL_STATE(0);
+
+						scratchpadPtr += 64;
+						break;
+				}
+				break;
+		}
+	}
+
+#undef HASH_STATE
+#undef FILL_STATE
+
+	__riscv_vsuxei32_v_u32m1((uint32_t*)fill_state + 0, stride, fill_state02, 8);
+	__riscv_vsuxei32_v_u32m1((uint32_t*)fill_state + 4, stride, fill_state13, 8);
+
+	//two extra rounds to achieve full diffusion
+	const vuint32m1_t xkey00 = __riscv_vle32_v_u32m1(AES_HASH_1R_XKEY00, 8);
+	const vuint32m1_t xkey11 = __riscv_vle32_v_u32m1(AES_HASH_1R_XKEY11, 8);
+
+	hash_state02 = softaes_vector_double(hash_state02, xkey00, lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3);
+	hash_state13 = softaes_vector_double(hash_state13, xkey00, lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3);
+
+	hash_state02 = softaes_vector_double(hash_state02, xkey11, lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3);
+	hash_state13 = softaes_vector_double(hash_state13, xkey11, lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3);
+
+	//output hash
+	__riscv_vsuxei32_v_u32m1((uint32_t*)hash + 0, stride, hash_state02, 8);
+	__riscv_vsuxei32_v_u32m1((uint32_t*)hash + 4, stride, hash_state13, 8);
+}
+
+#else // defined(XMRIG_RISCV) && defined(XMRIG_RVV_ENABLED)
+
 template<int softAes, int unroll>
 void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state) {
 	PROFILE_SCOPE(RandomX_AES);
@@ -375,6 +500,7 @@ void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, voi
 	rx_store_vec_i128((rx_vec_i128*)hash + 2, hash_state2);
 	rx_store_vec_i128((rx_vec_i128*)hash + 3, hash_state3);
 }
+#endif // defined(XMRIG_RISCV) && defined(XMRIG_RVV_ENABLED)

 template void hashAndFillAes1Rx4<0,2>(void* scratchpad, size_t scratchpadSize, void* hash, void* fill_state);
 template void hashAndFillAes1Rx4<1,1>(void* scratchpad, size_t scratchpadSize, void* hash, void* fill_state);
--- a/src/crypto/randomx/soft_aes.cpp
+++ b/src/crypto/randomx/soft_aes.cpp
@@ -39,6 +39,9 @@ alignas(64) uint32_t lutDec1[256];
 alignas(64) uint32_t lutDec2[256];
 alignas(64) uint32_t lutDec3[256];

+alignas(64) uint8_t lutEncIndex[4][32];
+alignas(64) uint8_t lutDecIndex[4][32];
+
 static uint32_t mul_gf2(uint32_t b, uint32_t c)
 {
 	uint32_t s = 0;
@@ -115,5 +118,49 @@ static struct SAESInitializer
 			lutDec2[i] = w; w = (w << 8) | (w >> 24);
 			lutDec3[i] = w;
 		}
+
+		memset(lutEncIndex, -1, sizeof(lutEncIndex));
+		memset(lutDecIndex, -1, sizeof(lutDecIndex));
+
+		lutEncIndex[0][ 0] =  0;
+		lutEncIndex[0][ 4] =  4;
+		lutEncIndex[0][ 8] =  8;
+		lutEncIndex[0][12] = 12;
+		lutEncIndex[1][ 0] =  5;
+		lutEncIndex[1][ 4] =  9;
+		lutEncIndex[1][ 8] = 13;
+		lutEncIndex[1][12] =  1;
+		lutEncIndex[2][ 0] = 10;
+		lutEncIndex[2][ 4] = 14;
+		lutEncIndex[2][ 8] =  2;
+		lutEncIndex[2][12] =  6;
+		lutEncIndex[3][ 0] = 15;
+		lutEncIndex[3][ 4] =  3;
+		lutEncIndex[3][ 8] =  7;
+		lutEncIndex[3][12] = 11;
+
+		lutDecIndex[0][ 0] =  0;
+		lutDecIndex[0][ 4] =  4;
+		lutDecIndex[0][ 8] =  8;
+		lutDecIndex[0][12] = 12;
+		lutDecIndex[1][ 0] = 13;
+		lutDecIndex[1][ 4] =  1;
+		lutDecIndex[1][ 8] =  5;
+		lutDecIndex[1][12] =  9;
+		lutDecIndex[2][ 0] = 10;
+		lutDecIndex[2][ 4] = 14;
+		lutDecIndex[2][ 8] =  2;
+		lutDecIndex[2][12] =  6;
+		lutDecIndex[3][ 0] =  7;
+		lutDecIndex[3][ 4] = 11;
+		lutDecIndex[3][ 8] = 15;
+		lutDecIndex[3][12] =  3;
+
+		for (uint32_t i = 0; i < 4; ++i) {
+			for (uint32_t j = 0; j < 16; j += 4) {
+				lutEncIndex[i][j + 16] = lutEncIndex[i][j] + 16;
+				lutDecIndex[i][j + 16] = lutDecIndex[i][j] + 16;
+			}
+		}
 	}
 } aes_initializer;
--- a/src/crypto/randomx/soft_aes.h
+++ b/src/crypto/randomx/soft_aes.h
@@ -41,6 +41,9 @@ extern uint32_t lutDec1[256];
 extern uint32_t lutDec2[256];
 extern uint32_t lutDec3[256];

+extern uint8_t lutEncIndex[4][32];
+extern uint8_t lutDecIndex[4][32];
+
 template<int soft> rx_vec_i128 aesenc(rx_vec_i128 in, rx_vec_i128 key);
 template<int soft> rx_vec_i128 aesdec(rx_vec_i128 in, rx_vec_i128 key);

@@ -147,3 +150,32 @@ template<>
 FORCE_INLINE rx_vec_i128 aesdec<0>(rx_vec_i128 in, rx_vec_i128 key) {
 	return rx_aesdec_vec_i128(in, key);
 }
+
+#if defined(XMRIG_RISCV) && defined(XMRIG_RVV_ENABLED)
+#include <riscv_vector.h>
+
+FORCE_INLINE vuint32m1_t softaes_vector_double(
+	vuint32m1_t in,
+	vuint32m1_t key,
+	vuint8m1_t i0, vuint8m1_t i1, vuint8m1_t i2, vuint8m1_t i3,
+	const uint32_t* lut0, const uint32_t* lut1, const uint32_t *lut2, const uint32_t* lut3)
+{
+	const vuint8m1_t in8 = __riscv_vreinterpret_v_u32m1_u8m1(in);
+
+	const vuint32m1_t index0 = __riscv_vreinterpret_v_u8m1_u32m1(__riscv_vrgather_vv_u8m1(in8, i0, 32));
+	const vuint32m1_t index1 = __riscv_vreinterpret_v_u8m1_u32m1(__riscv_vrgather_vv_u8m1(in8, i1, 32));
+	const vuint32m1_t index2 = __riscv_vreinterpret_v_u8m1_u32m1(__riscv_vrgather_vv_u8m1(in8, i2, 32));
+	const vuint32m1_t index3 = __riscv_vreinterpret_v_u8m1_u32m1(__riscv_vrgather_vv_u8m1(in8, i3, 32));
+
+	vuint32m1_t s0 = __riscv_vluxei32_v_u32m1(lut0, __riscv_vsll_vx_u32m1(index0, 2, 8), 8);
+	vuint32m1_t s1 = __riscv_vluxei32_v_u32m1(lut1, __riscv_vsll_vx_u32m1(index1, 2, 8), 8);
+	vuint32m1_t s2 = __riscv_vluxei32_v_u32m1(lut2, __riscv_vsll_vx_u32m1(index2, 2, 8), 8);
+	vuint32m1_t s3 = __riscv_vluxei32_v_u32m1(lut3, __riscv_vsll_vx_u32m1(index3, 2, 8), 8);
+
+	s0 = __riscv_vxor_vv_u32m1(s0, s1, 8);
+	s2 = __riscv_vxor_vv_u32m1(s2, s3, 8);
+	s0 = __riscv_vxor_vv_u32m1(s0, s2, 8);
+
+	return __riscv_vxor_vv_u32m1(s0, key, 8);
+}
+#endif // defined(XMRIG_RISCV) && defined(XMRIG_RVV_ENABLED)
--- a/src/crypto/randomx/tests/riscv64_vector.s
+++ b/src/crypto/randomx/tests/riscv64_vector.s
@@ -0,0 +1,14 @@
+/* RISC-V - test if the vector extension and prefetch instruction are present */
+
+.text
+.option arch, rv64gcv_zicbop
+.global main
+
+main:
+	lla x5, main
+	prefetch.r (x5)
+	li x5, 4
+	vsetvli x6, x5, e64, m1, ta, ma
+	vxor.vv v0, v0, v0
+	sub x10, x5, x6
+	ret