1
0
mirror of https://github.com/xmrig/xmrig.git synced 2026-06-20 11:32:41 -04:00

Compare commits

...

9 Commits

Author SHA1 Message Date
xmrig
3fb851d91d Merge pull request #3820 from aa022/dev
ARM64 RandomX JIT: dataset prefetch + non-temporal loads (+~8% on M4 base)
2026-05-26 00:21:52 +07:00
aa022
9ac373fea5 ARM64 RandomX JIT: drop early dataset prefetch 2026-05-25 18:05:50 +02:00
aa022
978720462d ARM64 RandomX JIT: dataset prefetch + non-temporal loads
Two Apple-silicon-targeted tweaks to the aarch64 RandomX JIT:

- Early dataset prefetch: when readReg2/readReg3 are finalized well before
  the end of the program body, emit the next iteration's dataset-line prefetch
  early to hide more DRAM latency on the serial scalar chain.
- Non-temporal dataset loads: each 64-byte dataset line is read once and never
  reused, so ldp -> ldnp avoids evicting the hot scratchpad, and the prefetch
  hint moves pldl2strm -> pldl1strm to match the longer lead time.

Measured ~8% hashrate gain on Apple M4 base over dev (7eadfdc9).
2026-05-25 13:46:41 +02:00
xmrig
7eadfdc9c6 Merge pull request #3816 from SChernykh/dev
ARM64 RandomX optimizations
2026-05-18 02:12:24 +07:00
SChernykh
720325c40f RandomX optimizations:
- ARM64: optimized emitMovImmediate/emitMemLoad
- ARM64: disabled 32-bit literal preloading (it was slower)
- Android and Linux: added MADV_COLLAPSE support to memory allocation
2026-05-17 21:04:02 +02:00
xmrig
27f116e2da Merge pull request #3815 from SChernykh/dev
ARM64 RandomX JIT optimizations
2026-05-15 22:42:02 +07:00
SChernykh
f8dd210531 ARM64 RandomX JIT:
- optimized F/E register loading
- aligned asm code
- optimized emitAddImmediate for small negative values
- v2: optimized CFROUND
- v2: optimized AES in the main loop
2026-05-15 17:30:05 +02:00
xmrig
ab8f005977 Merge pull request #3812 from SChernykh/dev
RandomX: 2.5% faster dataset init on RISC-V
2026-05-07 23:08:49 +07:00
SChernykh
f91b79681d RandomX: 2.5% faster dataset init on RISC-V
And a couple of small improvements in the main loop.
2026-05-07 17:57:16 +02:00
6 changed files with 139 additions and 143 deletions

View File

@@ -252,5 +252,5 @@ if (WIN32)
endif() endif()
if (CMAKE_CXX_COMPILER_ID MATCHES Clang AND CMAKE_BUILD_TYPE STREQUAL Release AND NOT CMAKE_GENERATOR STREQUAL Xcode) if (CMAKE_CXX_COMPILER_ID MATCHES Clang AND CMAKE_BUILD_TYPE STREQUAL Release AND NOT CMAKE_GENERATOR STREQUAL Xcode)
add_custom_command(TARGET ${PROJECT_NAME} POST_BUILD COMMAND ${CMAKE_STRIP} "$<TARGET_FILE:${CMAKE_PROJECT_NAME}>") add_custom_command(TARGET ${CMAKE_PROJECT_NAME} POST_BUILD COMMAND ${CMAKE_STRIP} "$<TARGET_FILE:${CMAKE_PROJECT_NAME}>")
endif() endif()

View File

@@ -74,6 +74,11 @@
#endif #endif
#ifndef MADV_COLLAPSE
# define MADV_COLLAPSE 25
#endif
#if defined(XMRIG_OS_LINUX) || (!defined(XMRIG_OS_APPLE) && !defined(XMRIG_OS_FREEBSD)) #if defined(XMRIG_OS_LINUX) || (!defined(XMRIG_OS_APPLE) && !defined(XMRIG_OS_FREEBSD))
static inline int hugePagesFlag(size_t size) static inline int hugePagesFlag(size_t size)
{ {
@@ -278,8 +283,9 @@ bool xmrig::VirtualMemory::allocateOneGbPagesMemory()
bool xmrig::VirtualMemory::adviseLargePages(void *p, size_t size) bool xmrig::VirtualMemory::adviseLargePages(void *p, size_t size)
{ {
# ifdef XMRIG_OS_LINUX # if defined(XMRIG_OS_ANDROID) || defined(XMRIG_OS_LINUX)
return (madvise(p, size, MADV_HUGEPAGE) == 0); // MADV_COLLAPSE works even if /sys/kernel/mm/transparent_hugepage/enabled is set to "never", but only on Linux 6.1+
return (madvise(p, size, MADV_COLLAPSE) == 0) || (madvise(p, size, MADV_HUGEPAGE) == 0);
# else # else
return false; return false;
# endif # endif

View File

@@ -64,6 +64,8 @@ constexpr uint32_t MOVN = 0x92800000;
constexpr uint32_t MOVK = 0xF2800000; constexpr uint32_t MOVK = 0xF2800000;
constexpr uint32_t ADD_IMM_LO = 0x91000000; constexpr uint32_t ADD_IMM_LO = 0x91000000;
constexpr uint32_t ADD_IMM_HI = 0x91400000; constexpr uint32_t ADD_IMM_HI = 0x91400000;
constexpr uint32_t SUB_IMM_LO = 0xD1000000;
constexpr uint32_t SUB_IMM_HI = 0xD1400000;
constexpr uint32_t LDR_LITERAL = 0x58000000; constexpr uint32_t LDR_LITERAL = 0x58000000;
constexpr uint32_t ROR = 0x9AC02C00; constexpr uint32_t ROR = 0x9AC02C00;
constexpr uint32_t ROR_IMM = 0x93C00000; constexpr uint32_t ROR_IMM = 0x93C00000;
@@ -139,7 +141,7 @@ void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& con
codePos = PrologueSize; codePos = PrologueSize;
literalPos = ImulRcpLiteralsEnd; literalPos = ImulRcpLiteralsEnd;
num32bitLiterals = 0; num32bitLiterals = 64; // effectively disabled because it's slower than plain movn/movz+movk
for (uint32_t i = 0; i < RegistersCount; ++i) for (uint32_t i = 0; i < RegistersCount; ++i)
reg_changed_offset[i] = codePos; reg_changed_offset[i] = codePos;
@@ -235,7 +237,7 @@ void JitCompilerA64::generateProgramLight(Program& program, ProgramConfiguration
codePos = PrologueSize; codePos = PrologueSize;
literalPos = ImulRcpLiteralsEnd; literalPos = ImulRcpLiteralsEnd;
num32bitLiterals = 0; num32bitLiterals = 64; // effectively disabled because it's slower than plain movn/movz+movk
for (uint32_t i = 0; i < RegistersCount; ++i) for (uint32_t i = 0; i < RegistersCount; ++i)
reg_changed_offset[i] = codePos; reg_changed_offset[i] = codePos;
@@ -486,13 +488,31 @@ void JitCompilerA64::emitMovImmediate(uint32_t dst, uint32_t imm, uint8_t* code,
{ {
uint32_t k = codePos; uint32_t k = codePos;
// 196606 different values can be encoded with a single instruction, the rest requires smov/umov load, or movn/movz+movk pair
if (imm < (1 << 16)) if (imm < (1 << 16))
{ {
// Sign-extended 64-bit value: 0x000000000000xxxx
// movz tmp_reg, imm32 (16 low bits) // movz tmp_reg, imm32 (16 low bits)
emit32(ARMV8A::MOVZ | dst | (imm << 5), code, k); emit32(ARMV8A::MOVZ | dst | (imm << 5), code, k);
} }
else if ((imm >> 16) == 0xFFFF) {
// Sign-extended 64-bit value: 0xFFFFFFFFFFFFxxxx
// movn tmp_reg, ~imm32 (16 low bits)
emit32(ARMV8A::MOVN | dst | ((~imm & 0xFFFF) << 5), code, k);
}
else if (((imm & 0xFFFF) == 0xFFFF) && (static_cast<int32_t>(imm) < 0)) {
// Sign-extended 64-bit value: 0xFFFFFFFFxxxxFFFF
// movn tmp_reg, ~imm32 (16 high bits)
emit32(ARMV8A::MOVN | dst | (1 << 21) | ((~imm >> 16) << 5), code, k);
}
else if (((imm & 0xFFFF) == 0) && (static_cast<int32_t>(imm) >= 0)) {
// Sign-extended 64-bit value: 0x00000000xxxx0000
// movz tmp_reg, imm32 (16 high bits)
emit32(ARMV8A::MOVZ | dst | (1 << 21) | ((imm >> 16) << 5), code, k);
}
else else
{ {
// Full sign-extended 64-bit value: 0x00000000xxxxxxxx or 0xFFFFFFFFxxxxxxxx
if (num32bitLiterals < 64) if (num32bitLiterals < 64)
{ {
if (static_cast<int32_t>(imm) < 0) if (static_cast<int32_t>(imm) < 0)
@@ -534,23 +554,40 @@ void JitCompilerA64::emitAddImmediate(uint32_t dst, uint32_t src, uint32_t imm,
{ {
uint32_t k = codePos; uint32_t k = codePos;
if (imm < (1 << 24)) if (imm == 0) {
{ if (dst != src) {
const uint32_t imm_lo = imm & ((1 << 12) - 1); emit32(ARMV8A::MOV_REG | dst | (src << 16), code, k);
const uint32_t imm_hi = imm >> 12; }
if (imm_lo && imm_hi) codePos = k;
{ return;
emit32(ARMV8A::ADD_IMM_LO | dst | (src << 5) | (imm_lo << 10), code, k); }
emit32(ARMV8A::ADD_IMM_HI | dst | (dst << 5) | (imm_hi << 10), code, k);
} const int32_t simm = static_cast<int32_t>(imm);
else if (imm_lo)
{ uint32_t mag, opLo, opHi;
emit32(ARMV8A::ADD_IMM_LO | dst | (src << 5) | (imm_lo << 10), code, k);
} if (simm > 0) {
else mag = imm;
{ opLo = ARMV8A::ADD_IMM_LO;
emit32(ARMV8A::ADD_IMM_HI | dst | (src << 5) | (imm_hi << 10), code, k); opHi = ARMV8A::ADD_IMM_HI;
} else {
mag = static_cast<uint32_t>(-static_cast<int64_t>(simm));
opLo = ARMV8A::SUB_IMM_LO;
opHi = ARMV8A::SUB_IMM_HI;
}
if (mag < (1u << 24)) {
const uint32_t lo = mag & ((1u << 12) - 1);
const uint32_t hi = mag >> 12;
if (lo && hi) {
emit32(opLo | dst | (src << 5) | (lo << 10), code, k);
emit32(opHi | dst | (dst << 5) | (hi << 10), code, k);
} else if (lo) {
emit32(opLo | dst | (src << 5) | (lo << 10), code, k);
} else {
emit32(opHi | dst | (src << 5) | (hi << 10), code, k);
} }
} }
else else
@@ -592,18 +629,17 @@ void JitCompilerA64::emitMemLoad(uint32_t dst, uint32_t src, Instruction& instr,
else else
{ {
imm = (imm & ScratchpadL3Mask) >> 3; imm = (imm & ScratchpadL3Mask) >> 3;
if (imm) if (imm < 4096) {
// ldr tmp_reg, [x2, #imm*8]
emit32(0xf9400040 | tmp_reg | (imm << 10), code, k);
}
else
{ {
emitMovImmediate(tmp_reg, imm, code, k); emitMovImmediate(tmp_reg, imm, code, k);
// ldr tmp_reg, [x2, tmp_reg, lsl 3] // ldr tmp_reg, [x2, tmp_reg, lsl 3]
emit32(0xf8607840 | tmp_reg | (tmp_reg << 16), code, k); emit32(0xf8607840 | tmp_reg | (tmp_reg << 16), code, k);
} }
else
{
// ldr tmp_reg, [x2]
emit32(0xf9400040 | tmp_reg, code, k);
}
} }
codePos = k; codePos = k;
@@ -744,7 +780,7 @@ void JitCompilerA64::h_IMUL_M(Instruction& instr, uint32_t& codePos)
constexpr uint32_t tmp_reg = 20; constexpr uint32_t tmp_reg = 20;
emitMemLoad<tmp_reg>(dst, src, instr, code, k); emitMemLoad<tmp_reg>(dst, src, instr, code, k);
// sub dst, dst, tmp_reg // mul dst, dst, tmp_reg
emit32(ARMV8A::MUL | dst | (dst << 5) | (tmp_reg << 16), code, k); emit32(ARMV8A::MUL | dst | (dst << 5) | (tmp_reg << 16), code, k);
reg_changed_offset[instr.dst] = k; reg_changed_offset[instr.dst] = k;
@@ -1109,17 +1145,19 @@ void JitCompilerA64::h_CFROUND(Instruction& instr, uint32_t& codePos)
constexpr uint32_t tmp_reg = 20; constexpr uint32_t tmp_reg = 20;
constexpr uint32_t fpcr_tmp_reg = 8; constexpr uint32_t fpcr_tmp_reg = 8;
// ror tmp_reg, src, imm
emit32(ARMV8A::ROR_IMM | tmp_reg | (src << 5) | ((instr.getImm32() & 63) << 10) | (src << 16), code, k);
if (RandomX_CurrentConfig.Tweak_V2_CFROUND) { if (RandomX_CurrentConfig.Tweak_V2_CFROUND) {
// tst tmp_reg, 60 const uint32_t immr = (62 - instr.getImm32()) & 63;
emit32(0xF27E0E9F, code, k);
// tst src, ROR(60, -(instr.getImm32() & 63))
emit32(0xF2400C1F | (immr << 16) | (src << 5), code, k);
// bne next // bne next
emit32(0x54000081, code, k); emit32(0x540000A1, code, k);
} }
// ror tmp_reg, src, imm
emit32(ARMV8A::ROR_IMM | tmp_reg | (src << 5) | ((instr.getImm32() & 63) << 10) | (src << 16), code, k);
// bfi fpcr_tmp_reg, tmp_reg, 40, 2 // bfi fpcr_tmp_reg, tmp_reg, 40, 2
emit32(0xB3580400 | fpcr_tmp_reg | (tmp_reg << 5), code, k); emit32(0xB3580400 | fpcr_tmp_reg | (tmp_reg << 5), code, k);

View File

@@ -187,6 +187,7 @@ DECL(randomx_program_aarch64):
ldp x27, x28, [x30, -32] // literal_x27 ldp x27, x28, [x30, -32] // literal_x27
ldp x29, x30, [x30, -16] // literal_x29 ldp x29, x30, [x30, -16] // literal_x29
.balign 64
DECL(randomx_program_aarch64_main_loop): DECL(randomx_program_aarch64_main_loop):
# spAddr0 = spMix1 & ScratchpadL3Mask64; # spAddr0 = spMix1 & ScratchpadL3Mask64;
# spAddr1 = (spMix1 >> 32) & ScratchpadL3Mask64; # spAddr1 = (spMix1 >> 32) & ScratchpadL3Mask64;
@@ -215,30 +216,28 @@ DECL(randomx_program_aarch64_main_loop):
eor x14, x14, x20 eor x14, x14, x20
eor x15, x15, x19 eor x15, x15, x19
# Load group F registers (spAddr1) # Load group F/E registers (spAddr1)
ldr q17, [x17] ldp q17, q19, [x17]
sxtl v16.2d, v17.2s ldp q21, q23, [x17, 32]
scvtf v16.2d, v16.2d
sxtl2 v17.2d, v17.4s
scvtf v17.2d, v17.2d
ldr q19, [x17, 16] sxtl v16.2d, v17.2s
sxtl2 v17.2d, v17.4s
sxtl v18.2d, v19.2s sxtl v18.2d, v19.2s
scvtf v18.2d, v18.2d
sxtl2 v19.2d, v19.4s sxtl2 v19.2d, v19.4s
scvtf v16.2d, v16.2d
scvtf v17.2d, v17.2d
scvtf v18.2d, v18.2d
scvtf v19.2d, v19.2d scvtf v19.2d, v19.2d
# Load group E registers (spAddr1)
ldr q21, [x17, 32]
sxtl v20.2d, v21.2s sxtl v20.2d, v21.2s
scvtf v20.2d, v20.2d
sxtl2 v21.2d, v21.4s sxtl2 v21.2d, v21.4s
scvtf v21.2d, v21.2d
ldr q23, [x17, 48]
sxtl v22.2d, v23.2s sxtl v22.2d, v23.2s
scvtf v22.2d, v22.2d
sxtl2 v23.2d, v23.4s sxtl2 v23.2d, v23.4s
scvtf v20.2d, v20.2d
scvtf v21.2d, v21.2d
scvtf v22.2d, v22.2d
scvtf v23.2d, v23.2d scvtf v23.2d, v23.2d
bif v20.16b, v30.16b, v29.16b bif v20.16b, v30.16b, v29.16b
@@ -283,6 +282,7 @@ literal_v13: .fill 2,8,0
literal_v14: .fill 2,8,0 literal_v14: .fill 2,8,0
literal_v15: .fill 2,8,0 literal_v15: .fill 2,8,0
.balign 64
DECL(randomx_program_aarch64_vm_instructions_end): DECL(randomx_program_aarch64_vm_instructions_end):
# Calculate dataset pointer for dataset read # Calculate dataset pointer for dataset read
# Do it here to break false dependency from readReg2 and readReg3 (see next line) # Do it here to break false dependency from readReg2 and readReg3 (see next line)
@@ -303,7 +303,7 @@ DECL(randomx_program_aarch64_cacheline_align_mask1):
add x20, x20, x1 add x20, x20, x1
# Prefetch dataset data # Prefetch dataset data
prfm pldl2strm, [x20] prfm pldl1strm, [x20]
DECL(randomx_program_aarch64_cacheline_align_mask2): DECL(randomx_program_aarch64_cacheline_align_mask2):
# Actual mask will be inserted by JIT compiler # Actual mask will be inserted by JIT compiler
@@ -312,16 +312,16 @@ DECL(randomx_program_aarch64_cacheline_align_mask2):
DECL(randomx_program_aarch64_xor_with_dataset_line): DECL(randomx_program_aarch64_xor_with_dataset_line):
# xor integer registers with dataset data # xor integer registers with dataset data
ldp x20, x19, [x10] ldnp x20, x19, [x10]
eor x4, x4, x20 eor x4, x4, x20
eor x5, x5, x19 eor x5, x5, x19
ldp x20, x19, [x10, 16] ldnp x20, x19, [x10, 16]
eor x6, x6, x20 eor x6, x6, x20
eor x7, x7, x19 eor x7, x7, x19
ldp x20, x19, [x10, 32] ldnp x20, x19, [x10, 32]
eor x12, x12, x20 eor x12, x12, x20
eor x13, x13, x19 eor x13, x13, x19
ldp x20, x19, [x10, 48] ldnp x20, x19, [x10, 48]
eor x14, x14, x20 eor x14, x14, x20
eor x15, x15, x19 eor x15, x15, x19
@@ -345,15 +345,18 @@ DECL(randomx_program_aarch64_v2_FE_mix):
# f0 = aesenc(f0, e0), f1 = aesdec(f1, e0), f2 = aesenc(f2, e0), f3 = aesdec(f3, e0) # f0 = aesenc(f0, e0), f1 = aesdec(f1, e0), f2 = aesenc(f2, e0), f3 = aesdec(f3, e0)
aese v16.16b, v28.16b # Quote from ARM optimization guides for basically all ARM Cortex CPUs starting from Cortex-A53:
aesd v17.16b, v28.16b #
aese v18.16b, v28.16b # "pairs of dependent AESE/AESMC and AESD/AESIMC instructions are higher performance
aesd v19.16b, v28.16b # when they are adjacent in the program code and both instructions use the same
# destination register since they are fused"
#
# Same applies to all Apple silicon CPUs
aesmc v16.16b, v16.16b aese v16.16b, v28.16b; aesmc v16.16b, v16.16b
aesimc v17.16b, v17.16b aesd v17.16b, v28.16b; aesimc v17.16b, v17.16b
aesmc v18.16b, v18.16b aese v18.16b, v28.16b; aesmc v18.16b, v18.16b
aesimc v19.16b, v19.16b aesd v19.16b, v28.16b; aesimc v19.16b, v19.16b
eor v16.16b, v16.16b, v20.16b eor v16.16b, v16.16b, v20.16b
eor v17.16b, v17.16b, v20.16b eor v17.16b, v17.16b, v20.16b
@@ -362,15 +365,10 @@ DECL(randomx_program_aarch64_v2_FE_mix):
# f0 = aesenc(f0, e1), f1 = aesdec(f1, e1), f2 = aesenc(f2, e1), f3 = aesdec(f3, e1) # f0 = aesenc(f0, e1), f1 = aesdec(f1, e1), f2 = aesenc(f2, e1), f3 = aesdec(f3, e1)
aese v16.16b, v28.16b aese v16.16b, v28.16b; aesmc v16.16b, v16.16b
aesd v17.16b, v28.16b aesd v17.16b, v28.16b; aesimc v17.16b, v17.16b
aese v18.16b, v28.16b aese v18.16b, v28.16b; aesmc v18.16b, v18.16b
aesd v19.16b, v28.16b aesd v19.16b, v28.16b; aesimc v19.16b, v19.16b
aesmc v16.16b, v16.16b
aesimc v17.16b, v17.16b
aesmc v18.16b, v18.16b
aesimc v19.16b, v19.16b
eor v16.16b, v16.16b, v21.16b eor v16.16b, v16.16b, v21.16b
eor v17.16b, v17.16b, v21.16b eor v17.16b, v17.16b, v21.16b
@@ -379,15 +377,10 @@ DECL(randomx_program_aarch64_v2_FE_mix):
# f0 = aesenc(f0, e2), f1 = aesdec(f1, e2), f2 = aesenc(f2, e2), f3 = aesdec(f3, e2) # f0 = aesenc(f0, e2), f1 = aesdec(f1, e2), f2 = aesenc(f2, e2), f3 = aesdec(f3, e2)
aese v16.16b, v28.16b aese v16.16b, v28.16b; aesmc v16.16b, v16.16b
aesd v17.16b, v28.16b aesd v17.16b, v28.16b; aesimc v17.16b, v17.16b
aese v18.16b, v28.16b aese v18.16b, v28.16b; aesmc v18.16b, v18.16b
aesd v19.16b, v28.16b aesd v19.16b, v28.16b; aesimc v19.16b, v19.16b
aesmc v16.16b, v16.16b
aesimc v17.16b, v17.16b
aesmc v18.16b, v18.16b
aesimc v19.16b, v19.16b
eor v16.16b, v16.16b, v22.16b eor v16.16b, v16.16b, v22.16b
eor v17.16b, v17.16b, v22.16b eor v17.16b, v17.16b, v22.16b
@@ -396,15 +389,10 @@ DECL(randomx_program_aarch64_v2_FE_mix):
# f0 = aesenc(f0, e3), f1 = aesdec(f1, e3), f2 = aesenc(f2, e3), f3 = aesdec(f3, e3) # f0 = aesenc(f0, e3), f1 = aesdec(f1, e3), f2 = aesenc(f2, e3), f3 = aesdec(f3, e3)
aese v16.16b, v28.16b aese v16.16b, v28.16b; aesmc v16.16b, v16.16b
aesd v17.16b, v28.16b aesd v17.16b, v28.16b; aesimc v17.16b, v17.16b
aese v18.16b, v28.16b aese v18.16b, v28.16b; aesmc v18.16b, v18.16b
aesd v19.16b, v28.16b aesd v19.16b, v28.16b; aesimc v19.16b, v19.16b
aesmc v16.16b, v16.16b
aesimc v17.16b, v17.16b
aesmc v18.16b, v18.16b
aesimc v19.16b, v19.16b
eor v16.16b, v16.16b, v23.16b eor v16.16b, v16.16b, v23.16b
eor v17.16b, v17.16b, v23.16b eor v17.16b, v17.16b, v23.16b

View File

@@ -243,9 +243,11 @@ static void imm_to_x5(uint32_t imm, uint8_t*& p)
return; return;
} }
if (imm_hi < (32 << 12)) { const int32_t simm_hi = static_cast<int32_t>(imm_hi);
if ((simm_hi >= -(32 << 12)) && (simm_hi < (32 << 12))) {
//c.lui x5, imm_hi //c.lui x5, imm_hi
emit16(0x6281 + (imm_hi >> 10)); emit16(0x6281 | ((imm_hi & 0x1F000) >> 10) | ((simm_hi < 0) ? 0x1000 : 0));
} }
else { else {
// lui x5, imm_hi // lui x5, imm_hi

View File

@@ -129,6 +129,8 @@ v10-v17 = sshash constants
v18 = temporary v18 = temporary
v19 = dataset item store offsets v19 = dataset item store offsets
v24-v31 = temporary
*/ */
DECL(randomx_riscv64_vector_sshash_dataset_init): DECL(randomx_riscv64_vector_sshash_dataset_init):
@@ -180,6 +182,7 @@ DECL(randomx_riscv64_vector_sshash_dataset_init):
slli x13, x13, 6 slli x13, x13, 6
add x13, x13, x11 add x13, x13, x11
.balign 64
init_item: init_item:
// Step 1. Init r0-r7 // Step 1. Init r0-r7
@@ -216,28 +219,7 @@ DECL(randomx_riscv64_vector_sshash_generated_instructions):
DECL(randomx_riscv64_vector_sshash_generated_instructions_end): DECL(randomx_riscv64_vector_sshash_generated_instructions_end):
// Step 9. Concatenate registers r0-r7 in little endian format to get the final Dataset item data. // Step 9. Concatenate registers r0-r7 in little endian format to get the final Dataset item data.
vsuxei64.v v0, (x11), v19 vsuxseg8ei64.v v0, (x11), v19
add x5, x11, 8
vsuxei64.v v1, (x5), v19
add x5, x11, 16
vsuxei64.v v2, (x5), v19
add x5, x11, 24
vsuxei64.v v3, (x5), v19
add x5, x11, 32
vsuxei64.v v4, (x5), v19
add x5, x11, 40
vsuxei64.v v5, (x5), v19
add x5, x11, 48
vsuxei64.v v6, (x5), v19
add x5, x11, 56
vsuxei64.v v7, (x5), v19
// Iterate to the next 4 items // Iterate to the next 4 items
vadd.vi v8, v8, 4 vadd.vi v8, v8, 4
@@ -293,36 +275,15 @@ DECL(randomx_riscv64_vector_sshash_cache_prefetch):
// Step 6. XOR all registers with data loaded from randomx cache // Step 6. XOR all registers with data loaded from randomx cache
DECL(randomx_riscv64_vector_sshash_xor): DECL(randomx_riscv64_vector_sshash_xor):
vluxei64.v v18, (x10), v9 vluxseg8ei64.v v24, (x10), v9
vxor.vv v0, v0, v18 vxor.vv v0, v0, v24
vxor.vv v1, v1, v25
add x5, x10, 8 vxor.vv v2, v2, v26
vluxei64.v v18, (x5), v9 vxor.vv v3, v3, v27
vxor.vv v1, v1, v18 vxor.vv v4, v4, v28
vxor.vv v5, v5, v29
add x5, x10, 16 vxor.vv v6, v6, v30
vluxei64.v v18, (x5), v9 vxor.vv v7, v7, v31
vxor.vv v2, v2, v18
add x5, x10, 24
vluxei64.v v18, (x5), v9
vxor.vv v3, v3, v18
add x5, x10, 32
vluxei64.v v18, (x5), v9
vxor.vv v4, v4, v18
add x5, x10, 40
vluxei64.v v18, (x5), v9
vxor.vv v5, v5, v18
add x5, x10, 48
vluxei64.v v18, (x5), v9
vxor.vv v6, v6, v18
add x5, x10, 56
vluxei64.v v18, (x5), v9
vxor.vv v7, v7, v18
DECL(randomx_riscv64_vector_sshash_end): DECL(randomx_riscv64_vector_sshash_end):
@@ -564,6 +525,7 @@ DECL(randomx_riscv64_vector_program_v2_soft_aes_init):
vsetivli zero, 2, e64, m1, ta, ma vsetivli zero, 2, e64, m1, ta, ma
.balign 64
DECL(randomx_riscv64_vector_program_main_loop): DECL(randomx_riscv64_vector_program_main_loop):
and x5, x15, x9 // x5 = spAddr0 & 64-byte aligned L3 mask and x5, x15, x9 // x5 = spAddr0 & 64-byte aligned L3 mask
add x5, x5, x12 // x5 = &scratchpad[spAddr0 & 64-byte aligned L3 mask] add x5, x5, x12 // x5 = &scratchpad[spAddr0 & 64-byte aligned L3 mask]