mirror of
https://github.com/xmrig/xmrig.git
synced 2025-12-06 15:42:38 -05:00
RISC-V: added vectorized soft AES
This commit is contained in:
@@ -55,6 +55,18 @@ if (XMRIG_RISCV)
|
||||
if(ARCH STREQUAL "native")
|
||||
enable_language(ASM)
|
||||
|
||||
try_run(RANDOMX_VECTOR_RUN_FAIL
|
||||
RANDOMX_VECTOR_COMPILE_OK
|
||||
${CMAKE_CURRENT_BINARY_DIR}/
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/src/crypto/randomx/tests/riscv64_vector.s
|
||||
COMPILE_DEFINITIONS "-march=rv64gcv_zicbop")
|
||||
|
||||
if (RANDOMX_VECTOR_COMPILE_OK AND NOT RANDOMX_VECTOR_RUN_FAIL)
|
||||
set(RVARCH "${RVARCH}v_zicbop")
|
||||
add_definitions(-DXMRIG_RVV_ENABLED)
|
||||
message(STATUS "RISC-V vector extension detected")
|
||||
endif()
|
||||
|
||||
try_run(RANDOMX_ZBA_RUN_FAIL
|
||||
RANDOMX_ZBA_COMPILE_OK
|
||||
${CMAKE_CURRENT_BINARY_DIR}/
|
||||
|
||||
@@ -235,6 +235,131 @@ void fillAes4Rx4(void *state, size_t outputSize, void *buffer) {
|
||||
template void fillAes4Rx4<true>(void *state, size_t outputSize, void *buffer);
|
||||
template void fillAes4Rx4<false>(void *state, size_t outputSize, void *buffer);
|
||||
|
||||
#if defined(XMRIG_RISCV) && defined(XMRIG_RVV_ENABLED)
|
||||
static constexpr uint32_t AES_HASH_1R_STATE02[8] = { 0x92b52c0d, 0x9fa856de, 0xcc82db47, 0xd7983aad, 0x6a770017, 0xae62c7d0, 0x5079506b, 0xe8a07ce4 };
|
||||
static constexpr uint32_t AES_HASH_1R_STATE13[8] = { 0x338d996e, 0x15c7b798, 0xf59e125a, 0xace78057, 0x630a240c, 0x07ad828d, 0x79a10005, 0x7e994948 };
|
||||
|
||||
static constexpr uint32_t AES_GEN_1R_KEY02[8] = { 0x6daca553, 0x62716609, 0xdbb5552b, 0xb4f44917, 0x3f1262f1, 0x9f947ec6, 0xf4c0794f, 0x3e20e345 };
|
||||
static constexpr uint32_t AES_GEN_1R_KEY13[8] = { 0x6d7caf07, 0x846a710d, 0x1725d378, 0x0da1dc4e, 0x6aef8135, 0xb1ba317c, 0x16314c88, 0x49169154 };
|
||||
|
||||
static constexpr uint32_t AES_HASH_1R_XKEY00[8] = { 0xf6fa8389, 0x8b24949f, 0x90dc56bf, 0x06890201, 0xf6fa8389, 0x8b24949f, 0x90dc56bf, 0x06890201 };
|
||||
static constexpr uint32_t AES_HASH_1R_XKEY11[8] = { 0x61b263d1, 0x51f4e03c, 0xee1043c6, 0xed18f99b, 0x61b263d1, 0x51f4e03c, 0xee1043c6, 0xed18f99b };
|
||||
|
||||
static constexpr uint32_t AES_HASH_STRIDE[8] = { 0, 4, 8, 12, 32, 36, 40, 44 };
|
||||
|
||||
template<int softAes, int unroll>
|
||||
void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state) {
|
||||
PROFILE_SCOPE(RandomX_AES);
|
||||
|
||||
uint8_t* scratchpadPtr = (uint8_t*)scratchpad;
|
||||
const uint8_t* scratchpadEnd = scratchpadPtr + scratchpadSize;
|
||||
|
||||
vuint32m1_t hash_state02 = __riscv_vle32_v_u32m1(AES_HASH_1R_STATE02, 8);
|
||||
vuint32m1_t hash_state13 = __riscv_vle32_v_u32m1(AES_HASH_1R_STATE13, 8);
|
||||
|
||||
const vuint32m1_t key02 = __riscv_vle32_v_u32m1(AES_GEN_1R_KEY02, 8);
|
||||
const vuint32m1_t key13 = __riscv_vle32_v_u32m1(AES_GEN_1R_KEY13, 8);
|
||||
|
||||
const vuint32m1_t stride = __riscv_vle32_v_u32m1(AES_HASH_STRIDE, 8);
|
||||
|
||||
vuint32m1_t fill_state02 = __riscv_vluxei32_v_u32m1((uint32_t*)fill_state + 0, stride, 8);
|
||||
vuint32m1_t fill_state13 = __riscv_vluxei32_v_u32m1((uint32_t*)fill_state + 4, stride, 8);
|
||||
|
||||
const vuint8m1_t lutenc_index0 = __riscv_vle8_v_u8m1(lutEncIndex[0], 32);
|
||||
const vuint8m1_t lutenc_index1 = __riscv_vle8_v_u8m1(lutEncIndex[1], 32);
|
||||
const vuint8m1_t lutenc_index2 = __riscv_vle8_v_u8m1(lutEncIndex[2], 32);
|
||||
const vuint8m1_t lutenc_index3 = __riscv_vle8_v_u8m1(lutEncIndex[3], 32);
|
||||
|
||||
const vuint8m1_t& lutdec_index0 = lutenc_index0;
|
||||
const vuint8m1_t lutdec_index1 = __riscv_vle8_v_u8m1(lutDecIndex[1], 32);
|
||||
const vuint8m1_t& lutdec_index2 = lutenc_index2;
|
||||
const vuint8m1_t lutdec_index3 = __riscv_vle8_v_u8m1(lutDecIndex[3], 32);
|
||||
|
||||
//process 64 bytes at a time in 4 lanes
|
||||
while (scratchpadPtr < scratchpadEnd) {
|
||||
#define HASH_STATE(k) \
|
||||
hash_state02 = softaes_vector_double(hash_state02, __riscv_vluxei32_v_u32m1((uint32_t*)scratchpadPtr + k * 16 + 0, stride, 8), lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3); \
|
||||
hash_state13 = softaes_vector_double(hash_state13, __riscv_vluxei32_v_u32m1((uint32_t*)scratchpadPtr + k * 16 + 4, stride, 8), lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3);
|
||||
|
||||
#define FILL_STATE(k) \
|
||||
fill_state02 = softaes_vector_double(fill_state02, key02, lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3); \
|
||||
fill_state13 = softaes_vector_double(fill_state13, key13, lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3); \
|
||||
__riscv_vsuxei32_v_u32m1((uint32_t*)scratchpadPtr + k * 16 + 0, stride, fill_state02, 8); \
|
||||
__riscv_vsuxei32_v_u32m1((uint32_t*)scratchpadPtr + k * 16 + 4, stride, fill_state13, 8);
|
||||
|
||||
switch (softAes) {
|
||||
case 0:
|
||||
HASH_STATE(0);
|
||||
HASH_STATE(1);
|
||||
|
||||
FILL_STATE(0);
|
||||
FILL_STATE(1);
|
||||
|
||||
scratchpadPtr += 128;
|
||||
break;
|
||||
|
||||
default:
|
||||
switch (unroll) {
|
||||
case 4:
|
||||
HASH_STATE(0);
|
||||
FILL_STATE(0);
|
||||
|
||||
HASH_STATE(1);
|
||||
FILL_STATE(1);
|
||||
|
||||
HASH_STATE(2);
|
||||
FILL_STATE(2);
|
||||
|
||||
HASH_STATE(3);
|
||||
FILL_STATE(3);
|
||||
|
||||
scratchpadPtr += 64 * 4;
|
||||
break;
|
||||
|
||||
case 2:
|
||||
HASH_STATE(0);
|
||||
FILL_STATE(0);
|
||||
|
||||
HASH_STATE(1);
|
||||
FILL_STATE(1);
|
||||
|
||||
scratchpadPtr += 64 * 2;
|
||||
break;
|
||||
|
||||
default:
|
||||
HASH_STATE(0);
|
||||
FILL_STATE(0);
|
||||
|
||||
scratchpadPtr += 64;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
#undef HASH_STATE
|
||||
#undef FILL_STATE
|
||||
|
||||
__riscv_vsuxei32_v_u32m1((uint32_t*)fill_state + 0, stride, fill_state02, 8);
|
||||
__riscv_vsuxei32_v_u32m1((uint32_t*)fill_state + 4, stride, fill_state13, 8);
|
||||
|
||||
//two extra rounds to achieve full diffusion
|
||||
const vuint32m1_t xkey00 = __riscv_vle32_v_u32m1(AES_HASH_1R_XKEY00, 8);
|
||||
const vuint32m1_t xkey11 = __riscv_vle32_v_u32m1(AES_HASH_1R_XKEY11, 8);
|
||||
|
||||
hash_state02 = softaes_vector_double(hash_state02, xkey00, lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3);
|
||||
hash_state13 = softaes_vector_double(hash_state13, xkey00, lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3);
|
||||
|
||||
hash_state02 = softaes_vector_double(hash_state02, xkey11, lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3);
|
||||
hash_state13 = softaes_vector_double(hash_state13, xkey11, lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3);
|
||||
|
||||
//output hash
|
||||
__riscv_vsuxei32_v_u32m1((uint32_t*)hash + 0, stride, hash_state02, 8);
|
||||
__riscv_vsuxei32_v_u32m1((uint32_t*)hash + 4, stride, hash_state13, 8);
|
||||
}
|
||||
|
||||
#else // defined(XMRIG_RISCV) && defined(XMRIG_RVV_ENABLED)
|
||||
|
||||
template<int softAes, int unroll>
|
||||
void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state) {
|
||||
PROFILE_SCOPE(RandomX_AES);
|
||||
@@ -375,6 +500,7 @@ void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, voi
|
||||
rx_store_vec_i128((rx_vec_i128*)hash + 2, hash_state2);
|
||||
rx_store_vec_i128((rx_vec_i128*)hash + 3, hash_state3);
|
||||
}
|
||||
#endif // defined(XMRIG_RISCV) && defined(XMRIG_RVV_ENABLED)
|
||||
|
||||
template void hashAndFillAes1Rx4<0,2>(void* scratchpad, size_t scratchpadSize, void* hash, void* fill_state);
|
||||
template void hashAndFillAes1Rx4<1,1>(void* scratchpad, size_t scratchpadSize, void* hash, void* fill_state);
|
||||
|
||||
@@ -39,6 +39,9 @@ alignas(64) uint32_t lutDec1[256];
|
||||
alignas(64) uint32_t lutDec2[256];
|
||||
alignas(64) uint32_t lutDec3[256];
|
||||
|
||||
alignas(64) uint8_t lutEncIndex[4][32];
|
||||
alignas(64) uint8_t lutDecIndex[4][32];
|
||||
|
||||
static uint32_t mul_gf2(uint32_t b, uint32_t c)
|
||||
{
|
||||
uint32_t s = 0;
|
||||
@@ -115,5 +118,49 @@ static struct SAESInitializer
|
||||
lutDec2[i] = w; w = (w << 8) | (w >> 24);
|
||||
lutDec3[i] = w;
|
||||
}
|
||||
|
||||
memset(lutEncIndex, -1, sizeof(lutEncIndex));
|
||||
memset(lutDecIndex, -1, sizeof(lutDecIndex));
|
||||
|
||||
lutEncIndex[0][ 0] = 0;
|
||||
lutEncIndex[0][ 4] = 4;
|
||||
lutEncIndex[0][ 8] = 8;
|
||||
lutEncIndex[0][12] = 12;
|
||||
lutEncIndex[1][ 0] = 5;
|
||||
lutEncIndex[1][ 4] = 9;
|
||||
lutEncIndex[1][ 8] = 13;
|
||||
lutEncIndex[1][12] = 1;
|
||||
lutEncIndex[2][ 0] = 10;
|
||||
lutEncIndex[2][ 4] = 14;
|
||||
lutEncIndex[2][ 8] = 2;
|
||||
lutEncIndex[2][12] = 6;
|
||||
lutEncIndex[3][ 0] = 15;
|
||||
lutEncIndex[3][ 4] = 3;
|
||||
lutEncIndex[3][ 8] = 7;
|
||||
lutEncIndex[3][12] = 11;
|
||||
|
||||
lutDecIndex[0][ 0] = 0;
|
||||
lutDecIndex[0][ 4] = 4;
|
||||
lutDecIndex[0][ 8] = 8;
|
||||
lutDecIndex[0][12] = 12;
|
||||
lutDecIndex[1][ 0] = 13;
|
||||
lutDecIndex[1][ 4] = 1;
|
||||
lutDecIndex[1][ 8] = 5;
|
||||
lutDecIndex[1][12] = 9;
|
||||
lutDecIndex[2][ 0] = 10;
|
||||
lutDecIndex[2][ 4] = 14;
|
||||
lutDecIndex[2][ 8] = 2;
|
||||
lutDecIndex[2][12] = 6;
|
||||
lutDecIndex[3][ 0] = 7;
|
||||
lutDecIndex[3][ 4] = 11;
|
||||
lutDecIndex[3][ 8] = 15;
|
||||
lutDecIndex[3][12] = 3;
|
||||
|
||||
for (uint32_t i = 0; i < 4; ++i) {
|
||||
for (uint32_t j = 0; j < 16; j += 4) {
|
||||
lutEncIndex[i][j + 16] = lutEncIndex[i][j] + 16;
|
||||
lutDecIndex[i][j + 16] = lutDecIndex[i][j] + 16;
|
||||
}
|
||||
}
|
||||
}
|
||||
} aes_initializer;
|
||||
|
||||
@@ -41,6 +41,9 @@ extern uint32_t lutDec1[256];
|
||||
extern uint32_t lutDec2[256];
|
||||
extern uint32_t lutDec3[256];
|
||||
|
||||
extern uint8_t lutEncIndex[4][32];
|
||||
extern uint8_t lutDecIndex[4][32];
|
||||
|
||||
template<int soft> rx_vec_i128 aesenc(rx_vec_i128 in, rx_vec_i128 key);
|
||||
template<int soft> rx_vec_i128 aesdec(rx_vec_i128 in, rx_vec_i128 key);
|
||||
|
||||
@@ -147,3 +150,32 @@ template<>
|
||||
FORCE_INLINE rx_vec_i128 aesdec<0>(rx_vec_i128 in, rx_vec_i128 key) {
|
||||
return rx_aesdec_vec_i128(in, key);
|
||||
}
|
||||
|
||||
#if defined(XMRIG_RISCV) && defined(XMRIG_RVV_ENABLED)
|
||||
#include <riscv_vector.h>
|
||||
|
||||
FORCE_INLINE vuint32m1_t softaes_vector_double(
|
||||
vuint32m1_t in,
|
||||
vuint32m1_t key,
|
||||
vuint8m1_t i0, vuint8m1_t i1, vuint8m1_t i2, vuint8m1_t i3,
|
||||
const uint32_t* lut0, const uint32_t* lut1, const uint32_t *lut2, const uint32_t* lut3)
|
||||
{
|
||||
const vuint8m1_t in8 = __riscv_vreinterpret_v_u32m1_u8m1(in);
|
||||
|
||||
const vuint32m1_t index0 = __riscv_vreinterpret_v_u8m1_u32m1(__riscv_vrgather_vv_u8m1(in8, i0, 32));
|
||||
const vuint32m1_t index1 = __riscv_vreinterpret_v_u8m1_u32m1(__riscv_vrgather_vv_u8m1(in8, i1, 32));
|
||||
const vuint32m1_t index2 = __riscv_vreinterpret_v_u8m1_u32m1(__riscv_vrgather_vv_u8m1(in8, i2, 32));
|
||||
const vuint32m1_t index3 = __riscv_vreinterpret_v_u8m1_u32m1(__riscv_vrgather_vv_u8m1(in8, i3, 32));
|
||||
|
||||
vuint32m1_t s0 = __riscv_vluxei32_v_u32m1(lut0, __riscv_vsll_vx_u32m1(index0, 2, 8), 8);
|
||||
vuint32m1_t s1 = __riscv_vluxei32_v_u32m1(lut1, __riscv_vsll_vx_u32m1(index1, 2, 8), 8);
|
||||
vuint32m1_t s2 = __riscv_vluxei32_v_u32m1(lut2, __riscv_vsll_vx_u32m1(index2, 2, 8), 8);
|
||||
vuint32m1_t s3 = __riscv_vluxei32_v_u32m1(lut3, __riscv_vsll_vx_u32m1(index3, 2, 8), 8);
|
||||
|
||||
s0 = __riscv_vxor_vv_u32m1(s0, s1, 8);
|
||||
s2 = __riscv_vxor_vv_u32m1(s2, s3, 8);
|
||||
s0 = __riscv_vxor_vv_u32m1(s0, s2, 8);
|
||||
|
||||
return __riscv_vxor_vv_u32m1(s0, key, 8);
|
||||
}
|
||||
#endif // defined(XMRIG_RISCV) && defined(XMRIG_RVV_ENABLED)
|
||||
|
||||
14
src/crypto/randomx/tests/riscv64_vector.s
Normal file
14
src/crypto/randomx/tests/riscv64_vector.s
Normal file
@@ -0,0 +1,14 @@
|
||||
/* RISC-V - test if the vector extension and prefetch instruction are present */
|
||||
|
||||
.text
|
||||
.option arch, rv64gcv_zicbop
|
||||
.global main
|
||||
|
||||
main:
|
||||
lla x5, main
|
||||
prefetch.r (x5)
|
||||
li x5, 4
|
||||
vsetvli x6, x5, e64, m1, ta, ma
|
||||
vxor.vv v0, v0, v0
|
||||
sub x10, x5, x6
|
||||
ret
|
||||
Reference in New Issue
Block a user