mirror of
https://github.com/xmrig/xmrig.git
synced 2026-06-20 11:32:41 -04:00
Compare commits
12 Commits
master
...
7eadfdc9c6
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7eadfdc9c6 | ||
|
|
720325c40f | ||
|
|
27f116e2da | ||
|
|
f8dd210531 | ||
|
|
ab8f005977 | ||
|
|
f91b79681d | ||
|
|
a7baa9cb63 | ||
|
|
c59c03e137 | ||
|
|
80eff55ed6 | ||
|
|
5347458fc7 | ||
|
|
6bf43053f7 | ||
|
|
69b7e60d35 |
@@ -252,5 +252,5 @@ if (WIN32)
|
|||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (CMAKE_CXX_COMPILER_ID MATCHES Clang AND CMAKE_BUILD_TYPE STREQUAL Release AND NOT CMAKE_GENERATOR STREQUAL Xcode)
|
if (CMAKE_CXX_COMPILER_ID MATCHES Clang AND CMAKE_BUILD_TYPE STREQUAL Release AND NOT CMAKE_GENERATOR STREQUAL Xcode)
|
||||||
add_custom_command(TARGET ${PROJECT_NAME} POST_BUILD COMMAND ${CMAKE_STRIP} "$<TARGET_FILE:${CMAKE_PROJECT_NAME}>")
|
add_custom_command(TARGET ${CMAKE_PROJECT_NAME} POST_BUILD COMMAND ${CMAKE_STRIP} "$<TARGET_FILE:${CMAKE_PROJECT_NAME}>")
|
||||||
endif()
|
endif()
|
||||||
|
|||||||
@@ -406,9 +406,9 @@ bool xmrig::BlockTemplate::parse(bool hashes)
|
|||||||
if (hashes) {
|
if (hashes) {
|
||||||
// FCMP++ layout:
|
// FCMP++ layout:
|
||||||
//
|
//
|
||||||
// index 0 fcmp_pp_n_tree_layers + 31 zero bytes
|
// index 0 coinbase transaction hash
|
||||||
// index 1 fcmp_pp_tree_root
|
// index 1 fcmp_pp_n_tree_layers + 31 zero bytes
|
||||||
// index 2 coinbase transaction hash
|
// index 2 fcmp_pp_tree_root
|
||||||
// index 3+ other transaction hashes
|
// index 3+ other transaction hashes
|
||||||
//
|
//
|
||||||
// pre-FCMP++ layout:
|
// pre-FCMP++ layout:
|
||||||
@@ -416,30 +416,28 @@ bool xmrig::BlockTemplate::parse(bool hashes)
|
|||||||
// index 0 coinbase transaction hash
|
// index 0 coinbase transaction hash
|
||||||
// index 1+ other transaction hashes
|
// index 1+ other transaction hashes
|
||||||
//
|
//
|
||||||
const uint32_t coinbase_tx_index = is_fcmp_pp ? 2 : 0;
|
// Update: FCMP moved coinbase tx to index 0 to stay consistent with pre-fork layout
|
||||||
|
|
||||||
m_hashes.clear();
|
m_hashes.clear();
|
||||||
m_hashes.resize((coinbase_tx_index + m_numHashes + 1) * kHashSize);
|
m_hashes.resize((m_numHashes + (is_fcmp_pp ? 3 : 1)) * kHashSize);
|
||||||
|
|
||||||
uint8_t* data = m_hashes.data() + coinbase_tx_index * kHashSize;
|
calculateMinerTxHash(blob(MINER_TX_PREFIX_OFFSET), blob(MINER_TX_PREFIX_END_OFFSET), m_hashes.data());
|
||||||
|
|
||||||
calculateMinerTxHash(blob(MINER_TX_PREFIX_OFFSET), blob(MINER_TX_PREFIX_END_OFFSET), data);
|
|
||||||
|
|
||||||
for (uint64_t i = 1; i <= m_numHashes; ++i) {
|
for (uint64_t i = 1; i <= m_numHashes; ++i) {
|
||||||
Span h;
|
Span h;
|
||||||
ar(h, kHashSize);
|
ar(h, kHashSize);
|
||||||
memcpy(data + i * kHashSize, h.data(), kHashSize);
|
memcpy(m_hashes.data() + (i + (is_fcmp_pp ? 2 : 0)) * kHashSize, h.data(), kHashSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (is_fcmp_pp) {
|
if (is_fcmp_pp) {
|
||||||
ar(m_FCMPTreeLayers);
|
ar(m_FCMPTreeLayers);
|
||||||
ar(m_FCMPTreeRoot);
|
ar(m_FCMPTreeRoot);
|
||||||
|
|
||||||
m_hashes[0] = m_FCMPTreeLayers;
|
m_hashes[kHashSize] = m_FCMPTreeLayers;
|
||||||
memcpy(m_hashes.data() + kHashSize, m_FCMPTreeRoot, kHashSize);
|
memcpy(m_hashes.data() + kHashSize * 2, m_FCMPTreeRoot, kHashSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
calculateMerkleTreeHash(coinbase_tx_index);
|
calculateMerkleTreeHash(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
|
|||||||
@@ -74,6 +74,11 @@
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
#ifndef MADV_COLLAPSE
|
||||||
|
# define MADV_COLLAPSE 25
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#if defined(XMRIG_OS_LINUX) || (!defined(XMRIG_OS_APPLE) && !defined(XMRIG_OS_FREEBSD))
|
#if defined(XMRIG_OS_LINUX) || (!defined(XMRIG_OS_APPLE) && !defined(XMRIG_OS_FREEBSD))
|
||||||
static inline int hugePagesFlag(size_t size)
|
static inline int hugePagesFlag(size_t size)
|
||||||
{
|
{
|
||||||
@@ -278,8 +283,9 @@ bool xmrig::VirtualMemory::allocateOneGbPagesMemory()
|
|||||||
|
|
||||||
bool xmrig::VirtualMemory::adviseLargePages(void *p, size_t size)
|
bool xmrig::VirtualMemory::adviseLargePages(void *p, size_t size)
|
||||||
{
|
{
|
||||||
# ifdef XMRIG_OS_LINUX
|
# if defined(XMRIG_OS_ANDROID) || defined(XMRIG_OS_LINUX)
|
||||||
return (madvise(p, size, MADV_HUGEPAGE) == 0);
|
// MADV_COLLAPSE works even if /sys/kernel/mm/transparent_hugepage/enabled is set to "never", but only on Linux 6.1+
|
||||||
|
return (madvise(p, size, MADV_COLLAPSE) == 0) || (madvise(p, size, MADV_HUGEPAGE) == 0);
|
||||||
# else
|
# else
|
||||||
return false;
|
return false;
|
||||||
# endif
|
# endif
|
||||||
|
|||||||
@@ -64,6 +64,8 @@ constexpr uint32_t MOVN = 0x92800000;
|
|||||||
constexpr uint32_t MOVK = 0xF2800000;
|
constexpr uint32_t MOVK = 0xF2800000;
|
||||||
constexpr uint32_t ADD_IMM_LO = 0x91000000;
|
constexpr uint32_t ADD_IMM_LO = 0x91000000;
|
||||||
constexpr uint32_t ADD_IMM_HI = 0x91400000;
|
constexpr uint32_t ADD_IMM_HI = 0x91400000;
|
||||||
|
constexpr uint32_t SUB_IMM_LO = 0xD1000000;
|
||||||
|
constexpr uint32_t SUB_IMM_HI = 0xD1400000;
|
||||||
constexpr uint32_t LDR_LITERAL = 0x58000000;
|
constexpr uint32_t LDR_LITERAL = 0x58000000;
|
||||||
constexpr uint32_t ROR = 0x9AC02C00;
|
constexpr uint32_t ROR = 0x9AC02C00;
|
||||||
constexpr uint32_t ROR_IMM = 0x93C00000;
|
constexpr uint32_t ROR_IMM = 0x93C00000;
|
||||||
@@ -139,7 +141,7 @@ void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& con
|
|||||||
|
|
||||||
codePos = PrologueSize;
|
codePos = PrologueSize;
|
||||||
literalPos = ImulRcpLiteralsEnd;
|
literalPos = ImulRcpLiteralsEnd;
|
||||||
num32bitLiterals = 0;
|
num32bitLiterals = 64; // effectively disabled because it's slower than plain movn/movz+movk
|
||||||
|
|
||||||
for (uint32_t i = 0; i < RegistersCount; ++i)
|
for (uint32_t i = 0; i < RegistersCount; ++i)
|
||||||
reg_changed_offset[i] = codePos;
|
reg_changed_offset[i] = codePos;
|
||||||
@@ -235,7 +237,7 @@ void JitCompilerA64::generateProgramLight(Program& program, ProgramConfiguration
|
|||||||
|
|
||||||
codePos = PrologueSize;
|
codePos = PrologueSize;
|
||||||
literalPos = ImulRcpLiteralsEnd;
|
literalPos = ImulRcpLiteralsEnd;
|
||||||
num32bitLiterals = 0;
|
num32bitLiterals = 64; // effectively disabled because it's slower than plain movn/movz+movk
|
||||||
|
|
||||||
for (uint32_t i = 0; i < RegistersCount; ++i)
|
for (uint32_t i = 0; i < RegistersCount; ++i)
|
||||||
reg_changed_offset[i] = codePos;
|
reg_changed_offset[i] = codePos;
|
||||||
@@ -486,13 +488,31 @@ void JitCompilerA64::emitMovImmediate(uint32_t dst, uint32_t imm, uint8_t* code,
|
|||||||
{
|
{
|
||||||
uint32_t k = codePos;
|
uint32_t k = codePos;
|
||||||
|
|
||||||
|
// 196606 different values can be encoded with a single instruction, the rest requires smov/umov load, or movn/movz+movk pair
|
||||||
if (imm < (1 << 16))
|
if (imm < (1 << 16))
|
||||||
{
|
{
|
||||||
|
// Sign-extended 64-bit value: 0x000000000000xxxx
|
||||||
// movz tmp_reg, imm32 (16 low bits)
|
// movz tmp_reg, imm32 (16 low bits)
|
||||||
emit32(ARMV8A::MOVZ | dst | (imm << 5), code, k);
|
emit32(ARMV8A::MOVZ | dst | (imm << 5), code, k);
|
||||||
}
|
}
|
||||||
|
else if ((imm >> 16) == 0xFFFF) {
|
||||||
|
// Sign-extended 64-bit value: 0xFFFFFFFFFFFFxxxx
|
||||||
|
// movn tmp_reg, ~imm32 (16 low bits)
|
||||||
|
emit32(ARMV8A::MOVN | dst | ((~imm & 0xFFFF) << 5), code, k);
|
||||||
|
}
|
||||||
|
else if (((imm & 0xFFFF) == 0xFFFF) && (static_cast<int32_t>(imm) < 0)) {
|
||||||
|
// Sign-extended 64-bit value: 0xFFFFFFFFxxxxFFFF
|
||||||
|
// movn tmp_reg, ~imm32 (16 high bits)
|
||||||
|
emit32(ARMV8A::MOVN | dst | (1 << 21) | ((~imm >> 16) << 5), code, k);
|
||||||
|
}
|
||||||
|
else if (((imm & 0xFFFF) == 0) && (static_cast<int32_t>(imm) >= 0)) {
|
||||||
|
// Sign-extended 64-bit value: 0x00000000xxxx0000
|
||||||
|
// movz tmp_reg, imm32 (16 high bits)
|
||||||
|
emit32(ARMV8A::MOVZ | dst | (1 << 21) | ((imm >> 16) << 5), code, k);
|
||||||
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
// Full sign-extended 64-bit value: 0x00000000xxxxxxxx or 0xFFFFFFFFxxxxxxxx
|
||||||
if (num32bitLiterals < 64)
|
if (num32bitLiterals < 64)
|
||||||
{
|
{
|
||||||
if (static_cast<int32_t>(imm) < 0)
|
if (static_cast<int32_t>(imm) < 0)
|
||||||
@@ -534,23 +554,40 @@ void JitCompilerA64::emitAddImmediate(uint32_t dst, uint32_t src, uint32_t imm,
|
|||||||
{
|
{
|
||||||
uint32_t k = codePos;
|
uint32_t k = codePos;
|
||||||
|
|
||||||
if (imm < (1 << 24))
|
if (imm == 0) {
|
||||||
{
|
if (dst != src) {
|
||||||
const uint32_t imm_lo = imm & ((1 << 12) - 1);
|
emit32(ARMV8A::MOV_REG | dst | (src << 16), code, k);
|
||||||
const uint32_t imm_hi = imm >> 12;
|
}
|
||||||
|
|
||||||
if (imm_lo && imm_hi)
|
codePos = k;
|
||||||
{
|
return;
|
||||||
emit32(ARMV8A::ADD_IMM_LO | dst | (src << 5) | (imm_lo << 10), code, k);
|
|
||||||
emit32(ARMV8A::ADD_IMM_HI | dst | (dst << 5) | (imm_hi << 10), code, k);
|
|
||||||
}
|
}
|
||||||
else if (imm_lo)
|
|
||||||
{
|
const int32_t simm = static_cast<int32_t>(imm);
|
||||||
emit32(ARMV8A::ADD_IMM_LO | dst | (src << 5) | (imm_lo << 10), code, k);
|
|
||||||
|
uint32_t mag, opLo, opHi;
|
||||||
|
|
||||||
|
if (simm > 0) {
|
||||||
|
mag = imm;
|
||||||
|
opLo = ARMV8A::ADD_IMM_LO;
|
||||||
|
opHi = ARMV8A::ADD_IMM_HI;
|
||||||
|
} else {
|
||||||
|
mag = static_cast<uint32_t>(-static_cast<int64_t>(simm));
|
||||||
|
opLo = ARMV8A::SUB_IMM_LO;
|
||||||
|
opHi = ARMV8A::SUB_IMM_HI;
|
||||||
}
|
}
|
||||||
else
|
|
||||||
{
|
if (mag < (1u << 24)) {
|
||||||
emit32(ARMV8A::ADD_IMM_HI | dst | (src << 5) | (imm_hi << 10), code, k);
|
const uint32_t lo = mag & ((1u << 12) - 1);
|
||||||
|
const uint32_t hi = mag >> 12;
|
||||||
|
|
||||||
|
if (lo && hi) {
|
||||||
|
emit32(opLo | dst | (src << 5) | (lo << 10), code, k);
|
||||||
|
emit32(opHi | dst | (dst << 5) | (hi << 10), code, k);
|
||||||
|
} else if (lo) {
|
||||||
|
emit32(opLo | dst | (src << 5) | (lo << 10), code, k);
|
||||||
|
} else {
|
||||||
|
emit32(opHi | dst | (src << 5) | (hi << 10), code, k);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@@ -592,18 +629,17 @@ void JitCompilerA64::emitMemLoad(uint32_t dst, uint32_t src, Instruction& instr,
|
|||||||
else
|
else
|
||||||
{
|
{
|
||||||
imm = (imm & ScratchpadL3Mask) >> 3;
|
imm = (imm & ScratchpadL3Mask) >> 3;
|
||||||
if (imm)
|
if (imm < 4096) {
|
||||||
|
// ldr tmp_reg, [x2, #imm*8]
|
||||||
|
emit32(0xf9400040 | tmp_reg | (imm << 10), code, k);
|
||||||
|
}
|
||||||
|
else
|
||||||
{
|
{
|
||||||
emitMovImmediate(tmp_reg, imm, code, k);
|
emitMovImmediate(tmp_reg, imm, code, k);
|
||||||
|
|
||||||
// ldr tmp_reg, [x2, tmp_reg, lsl 3]
|
// ldr tmp_reg, [x2, tmp_reg, lsl 3]
|
||||||
emit32(0xf8607840 | tmp_reg | (tmp_reg << 16), code, k);
|
emit32(0xf8607840 | tmp_reg | (tmp_reg << 16), code, k);
|
||||||
}
|
}
|
||||||
else
|
|
||||||
{
|
|
||||||
// ldr tmp_reg, [x2]
|
|
||||||
emit32(0xf9400040 | tmp_reg, code, k);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
codePos = k;
|
codePos = k;
|
||||||
@@ -744,7 +780,7 @@ void JitCompilerA64::h_IMUL_M(Instruction& instr, uint32_t& codePos)
|
|||||||
constexpr uint32_t tmp_reg = 20;
|
constexpr uint32_t tmp_reg = 20;
|
||||||
emitMemLoad<tmp_reg>(dst, src, instr, code, k);
|
emitMemLoad<tmp_reg>(dst, src, instr, code, k);
|
||||||
|
|
||||||
// sub dst, dst, tmp_reg
|
// mul dst, dst, tmp_reg
|
||||||
emit32(ARMV8A::MUL | dst | (dst << 5) | (tmp_reg << 16), code, k);
|
emit32(ARMV8A::MUL | dst | (dst << 5) | (tmp_reg << 16), code, k);
|
||||||
|
|
||||||
reg_changed_offset[instr.dst] = k;
|
reg_changed_offset[instr.dst] = k;
|
||||||
@@ -1059,11 +1095,8 @@ void JitCompilerA64::h_FDIV_M(Instruction& instr, uint32_t& codePos)
|
|||||||
constexpr uint32_t tmp_reg_fp = 28;
|
constexpr uint32_t tmp_reg_fp = 28;
|
||||||
emitMemLoadFP<tmp_reg_fp>(src, instr, code, k);
|
emitMemLoadFP<tmp_reg_fp>(src, instr, code, k);
|
||||||
|
|
||||||
// and tmp_reg_fp, tmp_reg_fp, and_mask_reg
|
// bif tmp_reg_fp, or_mask_reg, and_mask_reg
|
||||||
emit32(0x4E201C00 | tmp_reg_fp | (tmp_reg_fp << 5) | (29 << 16), code, k);
|
emit32(0x6EE01C00 | tmp_reg_fp | (30 << 5) | (29 << 16), code, k);
|
||||||
|
|
||||||
// orr tmp_reg_fp, tmp_reg_fp, or_mask_reg
|
|
||||||
emit32(0x4EA01C00 | tmp_reg_fp | (tmp_reg_fp << 5) | (30 << 16), code, k);
|
|
||||||
|
|
||||||
emit32(ARMV8A::FDIV | dst | (dst << 5) | (tmp_reg_fp << 16), code, k);
|
emit32(ARMV8A::FDIV | dst | (dst << 5) | (tmp_reg_fp << 16), code, k);
|
||||||
|
|
||||||
@@ -1112,17 +1145,19 @@ void JitCompilerA64::h_CFROUND(Instruction& instr, uint32_t& codePos)
|
|||||||
constexpr uint32_t tmp_reg = 20;
|
constexpr uint32_t tmp_reg = 20;
|
||||||
constexpr uint32_t fpcr_tmp_reg = 8;
|
constexpr uint32_t fpcr_tmp_reg = 8;
|
||||||
|
|
||||||
// ror tmp_reg, src, imm
|
|
||||||
emit32(ARMV8A::ROR_IMM | tmp_reg | (src << 5) | ((instr.getImm32() & 63) << 10) | (src << 16), code, k);
|
|
||||||
|
|
||||||
if (RandomX_CurrentConfig.Tweak_V2_CFROUND) {
|
if (RandomX_CurrentConfig.Tweak_V2_CFROUND) {
|
||||||
// tst tmp_reg, 60
|
const uint32_t immr = (62 - instr.getImm32()) & 63;
|
||||||
emit32(0xF27E0E9F, code, k);
|
|
||||||
|
// tst src, ROR(60, -(instr.getImm32() & 63))
|
||||||
|
emit32(0xF2400C1F | (immr << 16) | (src << 5), code, k);
|
||||||
|
|
||||||
// bne next
|
// bne next
|
||||||
emit32(0x54000081, code, k);
|
emit32(0x540000A1, code, k);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ror tmp_reg, src, imm
|
||||||
|
emit32(ARMV8A::ROR_IMM | tmp_reg | (src << 5) | ((instr.getImm32() & 63) << 10) | (src << 16), code, k);
|
||||||
|
|
||||||
// bfi fpcr_tmp_reg, tmp_reg, 40, 2
|
// bfi fpcr_tmp_reg, tmp_reg, 40, 2
|
||||||
emit32(0xB3580400 | fpcr_tmp_reg | (tmp_reg << 5), code, k);
|
emit32(0xB3580400 | fpcr_tmp_reg | (tmp_reg << 5), code, k);
|
||||||
|
|
||||||
|
|||||||
@@ -109,7 +109,7 @@
|
|||||||
# v26 -> "a2"
|
# v26 -> "a2"
|
||||||
# v27 -> "a3"
|
# v27 -> "a3"
|
||||||
# v28 -> temporary
|
# v28 -> temporary
|
||||||
# v29 -> E 'and' mask = 0x00ffffffffffffff'00ffffffffffffff
|
# v29 -> E 'and' mask = 0x00ffffffffc00000'00ffffffffc00000
|
||||||
# v30 -> E 'or' mask = 0x3*00000000******'3*00000000******
|
# v30 -> E 'or' mask = 0x3*00000000******'3*00000000******
|
||||||
# v31 -> scale mask = 0x80f0000000000000'80f0000000000000
|
# v31 -> scale mask = 0x80f0000000000000'80f0000000000000
|
||||||
|
|
||||||
@@ -151,7 +151,9 @@ DECL(randomx_program_aarch64):
|
|||||||
ldp q26, q27, [x0, 224]
|
ldp q26, q27, [x0, 224]
|
||||||
|
|
||||||
# Load E 'and' mask
|
# Load E 'and' mask
|
||||||
movi v29.2d, #0x00FFFFFFFFFFFFFF
|
mov x16, 0x00FFFFFFFFC00000
|
||||||
|
ins v29.d[0], x16
|
||||||
|
ins v29.d[1], x16
|
||||||
|
|
||||||
# Load E 'or' mask (stored in reg.f[0])
|
# Load E 'or' mask (stored in reg.f[0])
|
||||||
ldr q30, [x0, 64]
|
ldr q30, [x0, 64]
|
||||||
@@ -185,6 +187,7 @@ DECL(randomx_program_aarch64):
|
|||||||
ldp x27, x28, [x30, -32] // literal_x27
|
ldp x27, x28, [x30, -32] // literal_x27
|
||||||
ldp x29, x30, [x30, -16] // literal_x29
|
ldp x29, x30, [x30, -16] // literal_x29
|
||||||
|
|
||||||
|
.balign 64
|
||||||
DECL(randomx_program_aarch64_main_loop):
|
DECL(randomx_program_aarch64_main_loop):
|
||||||
# spAddr0 = spMix1 & ScratchpadL3Mask64;
|
# spAddr0 = spMix1 & ScratchpadL3Mask64;
|
||||||
# spAddr1 = (spMix1 >> 32) & ScratchpadL3Mask64;
|
# spAddr1 = (spMix1 >> 32) & ScratchpadL3Mask64;
|
||||||
@@ -213,40 +216,34 @@ DECL(randomx_program_aarch64_main_loop):
|
|||||||
eor x14, x14, x20
|
eor x14, x14, x20
|
||||||
eor x15, x15, x19
|
eor x15, x15, x19
|
||||||
|
|
||||||
# Load group F registers (spAddr1)
|
# Load group F/E registers (spAddr1)
|
||||||
ldr q17, [x17]
|
ldp q17, q19, [x17]
|
||||||
sxtl v16.2d, v17.2s
|
ldp q21, q23, [x17, 32]
|
||||||
scvtf v16.2d, v16.2d
|
|
||||||
sxtl2 v17.2d, v17.4s
|
|
||||||
scvtf v17.2d, v17.2d
|
|
||||||
|
|
||||||
ldr q19, [x17, 16]
|
sxtl v16.2d, v17.2s
|
||||||
|
sxtl2 v17.2d, v17.4s
|
||||||
sxtl v18.2d, v19.2s
|
sxtl v18.2d, v19.2s
|
||||||
scvtf v18.2d, v18.2d
|
|
||||||
sxtl2 v19.2d, v19.4s
|
sxtl2 v19.2d, v19.4s
|
||||||
|
|
||||||
|
scvtf v16.2d, v16.2d
|
||||||
|
scvtf v17.2d, v17.2d
|
||||||
|
scvtf v18.2d, v18.2d
|
||||||
scvtf v19.2d, v19.2d
|
scvtf v19.2d, v19.2d
|
||||||
|
|
||||||
# Load group E registers (spAddr1)
|
|
||||||
ldr q21, [x17, 32]
|
|
||||||
sxtl v20.2d, v21.2s
|
sxtl v20.2d, v21.2s
|
||||||
scvtf v20.2d, v20.2d
|
|
||||||
sxtl2 v21.2d, v21.4s
|
sxtl2 v21.2d, v21.4s
|
||||||
scvtf v21.2d, v21.2d
|
|
||||||
|
|
||||||
ldr q23, [x17, 48]
|
|
||||||
sxtl v22.2d, v23.2s
|
sxtl v22.2d, v23.2s
|
||||||
scvtf v22.2d, v22.2d
|
|
||||||
sxtl2 v23.2d, v23.4s
|
sxtl2 v23.2d, v23.4s
|
||||||
|
|
||||||
|
scvtf v20.2d, v20.2d
|
||||||
|
scvtf v21.2d, v21.2d
|
||||||
|
scvtf v22.2d, v22.2d
|
||||||
scvtf v23.2d, v23.2d
|
scvtf v23.2d, v23.2d
|
||||||
|
|
||||||
and v20.16b, v20.16b, v29.16b
|
bif v20.16b, v30.16b, v29.16b
|
||||||
and v21.16b, v21.16b, v29.16b
|
bif v21.16b, v30.16b, v29.16b
|
||||||
and v22.16b, v22.16b, v29.16b
|
bif v22.16b, v30.16b, v29.16b
|
||||||
and v23.16b, v23.16b, v29.16b
|
bif v23.16b, v30.16b, v29.16b
|
||||||
orr v20.16b, v20.16b, v30.16b
|
|
||||||
orr v21.16b, v21.16b, v30.16b
|
|
||||||
orr v22.16b, v22.16b, v30.16b
|
|
||||||
orr v23.16b, v23.16b, v30.16b
|
|
||||||
|
|
||||||
# Execute VM instructions
|
# Execute VM instructions
|
||||||
DECL(randomx_program_aarch64_vm_instructions):
|
DECL(randomx_program_aarch64_vm_instructions):
|
||||||
@@ -285,6 +282,7 @@ literal_v13: .fill 2,8,0
|
|||||||
literal_v14: .fill 2,8,0
|
literal_v14: .fill 2,8,0
|
||||||
literal_v15: .fill 2,8,0
|
literal_v15: .fill 2,8,0
|
||||||
|
|
||||||
|
.balign 64
|
||||||
DECL(randomx_program_aarch64_vm_instructions_end):
|
DECL(randomx_program_aarch64_vm_instructions_end):
|
||||||
# Calculate dataset pointer for dataset read
|
# Calculate dataset pointer for dataset read
|
||||||
# Do it here to break false dependency from readReg2 and readReg3 (see next line)
|
# Do it here to break false dependency from readReg2 and readReg3 (see next line)
|
||||||
@@ -347,15 +345,18 @@ DECL(randomx_program_aarch64_v2_FE_mix):
|
|||||||
|
|
||||||
# f0 = aesenc(f0, e0), f1 = aesdec(f1, e0), f2 = aesenc(f2, e0), f3 = aesdec(f3, e0)
|
# f0 = aesenc(f0, e0), f1 = aesdec(f1, e0), f2 = aesenc(f2, e0), f3 = aesdec(f3, e0)
|
||||||
|
|
||||||
aese v16.16b, v28.16b
|
# Quote from ARM optimization guides for basically all ARM Cortex CPUs starting from Cortex-A53:
|
||||||
aesd v17.16b, v28.16b
|
#
|
||||||
aese v18.16b, v28.16b
|
# "pairs of dependent AESE/AESMC and AESD/AESIMC instructions are higher performance
|
||||||
aesd v19.16b, v28.16b
|
# when they are adjacent in the program code and both instructions use the same
|
||||||
|
# destination register since they are fused"
|
||||||
|
#
|
||||||
|
# Same applies to all Apple silicon CPUs
|
||||||
|
|
||||||
aesmc v16.16b, v16.16b
|
aese v16.16b, v28.16b; aesmc v16.16b, v16.16b
|
||||||
aesimc v17.16b, v17.16b
|
aesd v17.16b, v28.16b; aesimc v17.16b, v17.16b
|
||||||
aesmc v18.16b, v18.16b
|
aese v18.16b, v28.16b; aesmc v18.16b, v18.16b
|
||||||
aesimc v19.16b, v19.16b
|
aesd v19.16b, v28.16b; aesimc v19.16b, v19.16b
|
||||||
|
|
||||||
eor v16.16b, v16.16b, v20.16b
|
eor v16.16b, v16.16b, v20.16b
|
||||||
eor v17.16b, v17.16b, v20.16b
|
eor v17.16b, v17.16b, v20.16b
|
||||||
@@ -364,15 +365,10 @@ DECL(randomx_program_aarch64_v2_FE_mix):
|
|||||||
|
|
||||||
# f0 = aesenc(f0, e1), f1 = aesdec(f1, e1), f2 = aesenc(f2, e1), f3 = aesdec(f3, e1)
|
# f0 = aesenc(f0, e1), f1 = aesdec(f1, e1), f2 = aesenc(f2, e1), f3 = aesdec(f3, e1)
|
||||||
|
|
||||||
aese v16.16b, v28.16b
|
aese v16.16b, v28.16b; aesmc v16.16b, v16.16b
|
||||||
aesd v17.16b, v28.16b
|
aesd v17.16b, v28.16b; aesimc v17.16b, v17.16b
|
||||||
aese v18.16b, v28.16b
|
aese v18.16b, v28.16b; aesmc v18.16b, v18.16b
|
||||||
aesd v19.16b, v28.16b
|
aesd v19.16b, v28.16b; aesimc v19.16b, v19.16b
|
||||||
|
|
||||||
aesmc v16.16b, v16.16b
|
|
||||||
aesimc v17.16b, v17.16b
|
|
||||||
aesmc v18.16b, v18.16b
|
|
||||||
aesimc v19.16b, v19.16b
|
|
||||||
|
|
||||||
eor v16.16b, v16.16b, v21.16b
|
eor v16.16b, v16.16b, v21.16b
|
||||||
eor v17.16b, v17.16b, v21.16b
|
eor v17.16b, v17.16b, v21.16b
|
||||||
@@ -381,15 +377,10 @@ DECL(randomx_program_aarch64_v2_FE_mix):
|
|||||||
|
|
||||||
# f0 = aesenc(f0, e2), f1 = aesdec(f1, e2), f2 = aesenc(f2, e2), f3 = aesdec(f3, e2)
|
# f0 = aesenc(f0, e2), f1 = aesdec(f1, e2), f2 = aesenc(f2, e2), f3 = aesdec(f3, e2)
|
||||||
|
|
||||||
aese v16.16b, v28.16b
|
aese v16.16b, v28.16b; aesmc v16.16b, v16.16b
|
||||||
aesd v17.16b, v28.16b
|
aesd v17.16b, v28.16b; aesimc v17.16b, v17.16b
|
||||||
aese v18.16b, v28.16b
|
aese v18.16b, v28.16b; aesmc v18.16b, v18.16b
|
||||||
aesd v19.16b, v28.16b
|
aesd v19.16b, v28.16b; aesimc v19.16b, v19.16b
|
||||||
|
|
||||||
aesmc v16.16b, v16.16b
|
|
||||||
aesimc v17.16b, v17.16b
|
|
||||||
aesmc v18.16b, v18.16b
|
|
||||||
aesimc v19.16b, v19.16b
|
|
||||||
|
|
||||||
eor v16.16b, v16.16b, v22.16b
|
eor v16.16b, v16.16b, v22.16b
|
||||||
eor v17.16b, v17.16b, v22.16b
|
eor v17.16b, v17.16b, v22.16b
|
||||||
@@ -398,15 +389,10 @@ DECL(randomx_program_aarch64_v2_FE_mix):
|
|||||||
|
|
||||||
# f0 = aesenc(f0, e3), f1 = aesdec(f1, e3), f2 = aesenc(f2, e3), f3 = aesdec(f3, e3)
|
# f0 = aesenc(f0, e3), f1 = aesdec(f1, e3), f2 = aesenc(f2, e3), f3 = aesdec(f3, e3)
|
||||||
|
|
||||||
aese v16.16b, v28.16b
|
aese v16.16b, v28.16b; aesmc v16.16b, v16.16b
|
||||||
aesd v17.16b, v28.16b
|
aesd v17.16b, v28.16b; aesimc v17.16b, v17.16b
|
||||||
aese v18.16b, v28.16b
|
aese v18.16b, v28.16b; aesmc v18.16b, v18.16b
|
||||||
aesd v19.16b, v28.16b
|
aesd v19.16b, v28.16b; aesimc v19.16b, v19.16b
|
||||||
|
|
||||||
aesmc v16.16b, v16.16b
|
|
||||||
aesimc v17.16b, v17.16b
|
|
||||||
aesmc v18.16b, v18.16b
|
|
||||||
aesimc v19.16b, v19.16b
|
|
||||||
|
|
||||||
eor v16.16b, v16.16b, v23.16b
|
eor v16.16b, v16.16b, v23.16b
|
||||||
eor v17.16b, v17.16b, v23.16b
|
eor v17.16b, v17.16b, v23.16b
|
||||||
|
|||||||
@@ -243,9 +243,11 @@ static void imm_to_x5(uint32_t imm, uint8_t*& p)
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (imm_hi < (32 << 12)) {
|
const int32_t simm_hi = static_cast<int32_t>(imm_hi);
|
||||||
|
|
||||||
|
if ((simm_hi >= -(32 << 12)) && (simm_hi < (32 << 12))) {
|
||||||
//c.lui x5, imm_hi
|
//c.lui x5, imm_hi
|
||||||
emit16(0x6281 + (imm_hi >> 10));
|
emit16(0x6281 | ((imm_hi & 0x1F000) >> 10) | ((simm_hi < 0) ? 0x1000 : 0));
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
// lui x5, imm_hi
|
// lui x5, imm_hi
|
||||||
|
|||||||
@@ -129,6 +129,8 @@ v10-v17 = sshash constants
|
|||||||
v18 = temporary
|
v18 = temporary
|
||||||
|
|
||||||
v19 = dataset item store offsets
|
v19 = dataset item store offsets
|
||||||
|
|
||||||
|
v24-v31 = temporary
|
||||||
*/
|
*/
|
||||||
|
|
||||||
DECL(randomx_riscv64_vector_sshash_dataset_init):
|
DECL(randomx_riscv64_vector_sshash_dataset_init):
|
||||||
@@ -180,6 +182,7 @@ DECL(randomx_riscv64_vector_sshash_dataset_init):
|
|||||||
slli x13, x13, 6
|
slli x13, x13, 6
|
||||||
add x13, x13, x11
|
add x13, x13, x11
|
||||||
|
|
||||||
|
.balign 64
|
||||||
init_item:
|
init_item:
|
||||||
// Step 1. Init r0-r7
|
// Step 1. Init r0-r7
|
||||||
|
|
||||||
@@ -216,28 +219,7 @@ DECL(randomx_riscv64_vector_sshash_generated_instructions):
|
|||||||
|
|
||||||
DECL(randomx_riscv64_vector_sshash_generated_instructions_end):
|
DECL(randomx_riscv64_vector_sshash_generated_instructions_end):
|
||||||
// Step 9. Concatenate registers r0-r7 in little endian format to get the final Dataset item data.
|
// Step 9. Concatenate registers r0-r7 in little endian format to get the final Dataset item data.
|
||||||
vsuxei64.v v0, (x11), v19
|
vsuxseg8ei64.v v0, (x11), v19
|
||||||
|
|
||||||
add x5, x11, 8
|
|
||||||
vsuxei64.v v1, (x5), v19
|
|
||||||
|
|
||||||
add x5, x11, 16
|
|
||||||
vsuxei64.v v2, (x5), v19
|
|
||||||
|
|
||||||
add x5, x11, 24
|
|
||||||
vsuxei64.v v3, (x5), v19
|
|
||||||
|
|
||||||
add x5, x11, 32
|
|
||||||
vsuxei64.v v4, (x5), v19
|
|
||||||
|
|
||||||
add x5, x11, 40
|
|
||||||
vsuxei64.v v5, (x5), v19
|
|
||||||
|
|
||||||
add x5, x11, 48
|
|
||||||
vsuxei64.v v6, (x5), v19
|
|
||||||
|
|
||||||
add x5, x11, 56
|
|
||||||
vsuxei64.v v7, (x5), v19
|
|
||||||
|
|
||||||
// Iterate to the next 4 items
|
// Iterate to the next 4 items
|
||||||
vadd.vi v8, v8, 4
|
vadd.vi v8, v8, 4
|
||||||
@@ -293,36 +275,15 @@ DECL(randomx_riscv64_vector_sshash_cache_prefetch):
|
|||||||
|
|
||||||
// Step 6. XOR all registers with data loaded from randomx cache
|
// Step 6. XOR all registers with data loaded from randomx cache
|
||||||
DECL(randomx_riscv64_vector_sshash_xor):
|
DECL(randomx_riscv64_vector_sshash_xor):
|
||||||
vluxei64.v v18, (x10), v9
|
vluxseg8ei64.v v24, (x10), v9
|
||||||
vxor.vv v0, v0, v18
|
vxor.vv v0, v0, v24
|
||||||
|
vxor.vv v1, v1, v25
|
||||||
add x5, x10, 8
|
vxor.vv v2, v2, v26
|
||||||
vluxei64.v v18, (x5), v9
|
vxor.vv v3, v3, v27
|
||||||
vxor.vv v1, v1, v18
|
vxor.vv v4, v4, v28
|
||||||
|
vxor.vv v5, v5, v29
|
||||||
add x5, x10, 16
|
vxor.vv v6, v6, v30
|
||||||
vluxei64.v v18, (x5), v9
|
vxor.vv v7, v7, v31
|
||||||
vxor.vv v2, v2, v18
|
|
||||||
|
|
||||||
add x5, x10, 24
|
|
||||||
vluxei64.v v18, (x5), v9
|
|
||||||
vxor.vv v3, v3, v18
|
|
||||||
|
|
||||||
add x5, x10, 32
|
|
||||||
vluxei64.v v18, (x5), v9
|
|
||||||
vxor.vv v4, v4, v18
|
|
||||||
|
|
||||||
add x5, x10, 40
|
|
||||||
vluxei64.v v18, (x5), v9
|
|
||||||
vxor.vv v5, v5, v18
|
|
||||||
|
|
||||||
add x5, x10, 48
|
|
||||||
vluxei64.v v18, (x5), v9
|
|
||||||
vxor.vv v6, v6, v18
|
|
||||||
|
|
||||||
add x5, x10, 56
|
|
||||||
vluxei64.v v18, (x5), v9
|
|
||||||
vxor.vv v7, v7, v18
|
|
||||||
|
|
||||||
DECL(randomx_riscv64_vector_sshash_end):
|
DECL(randomx_riscv64_vector_sshash_end):
|
||||||
|
|
||||||
@@ -564,6 +525,7 @@ DECL(randomx_riscv64_vector_program_v2_soft_aes_init):
|
|||||||
|
|
||||||
vsetivli zero, 2, e64, m1, ta, ma
|
vsetivli zero, 2, e64, m1, ta, ma
|
||||||
|
|
||||||
|
.balign 64
|
||||||
DECL(randomx_riscv64_vector_program_main_loop):
|
DECL(randomx_riscv64_vector_program_main_loop):
|
||||||
and x5, x15, x9 // x5 = spAddr0 & 64-byte aligned L3 mask
|
and x5, x15, x9 // x5 = spAddr0 & 64-byte aligned L3 mask
|
||||||
add x5, x5, x12 // x5 = &scratchpad[spAddr0 & 64-byte aligned L3 mask]
|
add x5, x5, x12 // x5 = &scratchpad[spAddr0 & 64-byte aligned L3 mask]
|
||||||
|
|||||||
@@ -11,7 +11,7 @@
|
|||||||
#define APP_ID "xmrig"
|
#define APP_ID "xmrig"
|
||||||
#define APP_NAME "XMRig"
|
#define APP_NAME "XMRig"
|
||||||
#define APP_DESC "XMRig miner"
|
#define APP_DESC "XMRig miner"
|
||||||
#define APP_VERSION "6.26.0"
|
#define APP_VERSION "6.26.1-dev"
|
||||||
#define APP_DOMAIN "xmrig.com"
|
#define APP_DOMAIN "xmrig.com"
|
||||||
#define APP_SITE "www.xmrig.com"
|
#define APP_SITE "www.xmrig.com"
|
||||||
#define APP_COPYRIGHT "Copyright (C) 2016-2026 xmrig.com"
|
#define APP_COPYRIGHT "Copyright (C) 2016-2026 xmrig.com"
|
||||||
@@ -19,7 +19,7 @@
|
|||||||
|
|
||||||
#define APP_VER_MAJOR 6
|
#define APP_VER_MAJOR 6
|
||||||
#define APP_VER_MINOR 26
|
#define APP_VER_MINOR 26
|
||||||
#define APP_VER_PATCH 0
|
#define APP_VER_PATCH 1
|
||||||
|
|
||||||
#ifdef _MSC_VER
|
#ifdef _MSC_VER
|
||||||
# if (_MSC_VER >= 1950)
|
# if (_MSC_VER >= 1950)
|
||||||
|
|||||||
Reference in New Issue
Block a user