mirror of
https://github.com/xmrig/xmrig.git
synced 2026-01-03 00:22:45 -05:00
Compare commits
21 Commits
5115597e7f
...
dev
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
1bd59129c4 | ||
|
|
8ccf7de304 | ||
|
|
30ffb9cb27 | ||
|
|
d3a84c4b52 | ||
|
|
eb49237aaa | ||
|
|
e1efd3dc7f | ||
|
|
e3d0135708 | ||
|
|
f661e1eb30 | ||
|
|
99488751f1 | ||
|
|
5fb0321c84 | ||
|
|
753859caea | ||
|
|
712a5a5e66 | ||
|
|
290a0de6e5 | ||
|
|
e0564b5fdd | ||
|
|
482a1f0b40 | ||
|
|
856813c1ae | ||
|
|
23da1a90f5 | ||
|
|
7981e4a76a | ||
|
|
7ef5142a52 | ||
|
|
db5c6d9190 | ||
|
|
e88009d575 |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -5,3 +5,4 @@ scripts/deps
|
||||
/.idea
|
||||
/src/backend/opencl/cl/cn/cryptonight_gen.cl
|
||||
.vscode
|
||||
/.qtcreator
|
||||
|
||||
14
CHANGELOG.md
14
CHANGELOG.md
@@ -1,3 +1,17 @@
|
||||
# v6.25.0
|
||||
- [#3680](https://github.com/xmrig/xmrig/pull/3680) Added `armv8l` to the list of 32-bit ARM targets.
|
||||
- [#3708](https://github.com/xmrig/xmrig/pull/3708) Minor Aarch64 JIT changes (better instruction selection, don't emit instructions that add 0, etc).
|
||||
- [#3718](https://github.com/xmrig/xmrig/pull/3718) Solo mining: added support for FCMP++ hardfork.
|
||||
- [#3722](https://github.com/xmrig/xmrig/pull/3722) Added Zen4 (Hawk Point) CPUs detection.
|
||||
- [#3725](https://github.com/xmrig/xmrig/pull/3725) Added **RISC-V** support with JIT compiler.
|
||||
- [#3731](https://github.com/xmrig/xmrig/pull/3731) Added initial Haiku OS support.
|
||||
- [#3733](https://github.com/xmrig/xmrig/pull/3733) Added detection for MSVC/2026.
|
||||
- [#3736](https://github.com/xmrig/xmrig/pull/3736) RISC-V: added vectorized dataset init.
|
||||
- [#3740](https://github.com/xmrig/xmrig/pull/3740) RISC-V: added vectorized soft AES.
|
||||
- [#3743](https://github.com/xmrig/xmrig/pull/3743) Linux: added support for transparent huge pages.
|
||||
- Improved LibreSSL support.
|
||||
- Improved compatibility for automatically enabling huge pages on Linux systems without NUMA support.
|
||||
|
||||
# v6.24.0
|
||||
- [#3671](https://github.com/xmrig/xmrig/pull/3671) Fixed detection of L2 cache size for some complex NUMA topologies.
|
||||
- [#3674](https://github.com/xmrig/xmrig/pull/3674) Fixed ARMv7 build.
|
||||
|
||||
@@ -51,10 +51,34 @@ if (XMRIG_RISCV)
|
||||
# default build uses the RV64GC baseline
|
||||
set(RVARCH "rv64gc")
|
||||
|
||||
# for native builds, enable Zba and Zbb if supported by the CPU
|
||||
if(ARCH STREQUAL "native")
|
||||
enable_language(ASM)
|
||||
|
||||
try_run(RANDOMX_VECTOR_RUN_FAIL
|
||||
RANDOMX_VECTOR_COMPILE_OK
|
||||
${CMAKE_CURRENT_BINARY_DIR}/
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/src/crypto/randomx/tests/riscv64_vector.s
|
||||
COMPILE_DEFINITIONS "-march=rv64gcv")
|
||||
|
||||
if (RANDOMX_VECTOR_COMPILE_OK AND NOT RANDOMX_VECTOR_RUN_FAIL)
|
||||
set(RVARCH_V ON)
|
||||
message(STATUS "RISC-V vector extension detected")
|
||||
else()
|
||||
set(RVARCH_V OFF)
|
||||
endif()
|
||||
|
||||
try_run(RANDOMX_ZICBOP_RUN_FAIL
|
||||
RANDOMX_ZICBOP_COMPILE_OK
|
||||
${CMAKE_CURRENT_BINARY_DIR}/
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/src/crypto/randomx/tests/riscv64_zicbop.s
|
||||
COMPILE_DEFINITIONS "-march=rv64gc_zicbop")
|
||||
|
||||
if (RANDOMX_ZICBOP_COMPILE_OK AND NOT RANDOMX_ZICBOP_RUN_FAIL)
|
||||
set(RVARCH_ZICBOP ON)
|
||||
message(STATUS "RISC-V zicbop extension detected")
|
||||
else()
|
||||
set(RVARCH_ZICBOP OFF)
|
||||
endif()
|
||||
|
||||
try_run(RANDOMX_ZBA_RUN_FAIL
|
||||
RANDOMX_ZBA_COMPILE_OK
|
||||
${CMAKE_CURRENT_BINARY_DIR}/
|
||||
@@ -62,8 +86,10 @@ if (XMRIG_RISCV)
|
||||
COMPILE_DEFINITIONS "-march=rv64gc_zba")
|
||||
|
||||
if (RANDOMX_ZBA_COMPILE_OK AND NOT RANDOMX_ZBA_RUN_FAIL)
|
||||
set(RVARCH "${RVARCH}_zba")
|
||||
set(RVARCH_ZBA ON)
|
||||
message(STATUS "RISC-V zba extension detected")
|
||||
else()
|
||||
set(RVARCH_ZBA OFF)
|
||||
endif()
|
||||
|
||||
try_run(RANDOMX_ZBB_RUN_FAIL
|
||||
@@ -73,8 +99,57 @@ if (XMRIG_RISCV)
|
||||
COMPILE_DEFINITIONS "-march=rv64gc_zbb")
|
||||
|
||||
if (RANDOMX_ZBB_COMPILE_OK AND NOT RANDOMX_ZBB_RUN_FAIL)
|
||||
set(RVARCH "${RVARCH}_zbb")
|
||||
set(RVARCH_ZBB ON)
|
||||
message(STATUS "RISC-V zbb extension detected")
|
||||
else()
|
||||
set(RVARCH_ZBB OFF)
|
||||
endif()
|
||||
|
||||
try_run(RANDOMX_ZVKB_RUN_FAIL
|
||||
RANDOMX_ZVKB_COMPILE_OK
|
||||
${CMAKE_CURRENT_BINARY_DIR}/
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/src/crypto/randomx/tests/riscv64_zvkb.s
|
||||
COMPILE_DEFINITIONS "-march=rv64gcv_zvkb")
|
||||
|
||||
if (RANDOMX_ZVKB_COMPILE_OK AND NOT RANDOMX_ZVKB_RUN_FAIL)
|
||||
set(RVARCH_ZVKB ON)
|
||||
message(STATUS "RISC-V zvkb extension detected")
|
||||
else()
|
||||
set(RVARCH_ZVKB OFF)
|
||||
endif()
|
||||
|
||||
try_run(RANDOMX_ZVKNED_RUN_FAIL
|
||||
RANDOMX_ZVKNED_COMPILE_OK
|
||||
${CMAKE_CURRENT_BINARY_DIR}/
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/src/crypto/randomx/tests/riscv64_zvkned.s
|
||||
COMPILE_DEFINITIONS "-march=rv64gcv_zvkned")
|
||||
|
||||
if (RANDOMX_ZVKNED_COMPILE_OK AND NOT RANDOMX_ZVKNED_RUN_FAIL)
|
||||
set(RVARCH_ZVKNED ON)
|
||||
message(STATUS "RISC-V zvkned extension detected")
|
||||
else()
|
||||
set(RVARCH_ZVKNED OFF)
|
||||
endif()
|
||||
|
||||
# for native builds, enable Zba and Zbb if supported by the CPU
|
||||
if (ARCH STREQUAL "native")
|
||||
if (RVARCH_V)
|
||||
set(RVARCH "${RVARCH}v")
|
||||
endif()
|
||||
if (RVARCH_ZICBOP)
|
||||
set(RVARCH "${RVARCH}_zicbop")
|
||||
endif()
|
||||
if (RVARCH_ZBA)
|
||||
set(RVARCH "${RVARCH}_zba")
|
||||
endif()
|
||||
if (RVARCH_ZBB)
|
||||
set(RVARCH "${RVARCH}_zbb")
|
||||
endif()
|
||||
if (RVARCH_ZVKB)
|
||||
set(RVARCH "${RVARCH}_zvkb")
|
||||
endif()
|
||||
if (RVARCH_ZVKNED)
|
||||
set(RVARCH "${RVARCH}_zvkned")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
|
||||
@@ -83,10 +83,36 @@ if (WITH_RANDOMX)
|
||||
elseif (XMRIG_RISCV AND CMAKE_SIZEOF_VOID_P EQUAL 8)
|
||||
list(APPEND SOURCES_CRYPTO
|
||||
src/crypto/randomx/jit_compiler_rv64_static.S
|
||||
src/crypto/randomx/jit_compiler_rv64_vector_static.S
|
||||
src/crypto/randomx/jit_compiler_rv64.cpp
|
||||
src/crypto/randomx/jit_compiler_rv64_vector.cpp
|
||||
src/crypto/randomx/aes_hash_rv64_vector.cpp
|
||||
src/crypto/randomx/aes_hash_rv64_zvkned.cpp
|
||||
)
|
||||
# cheat because cmake and ccache hate each other
|
||||
set_property(SOURCE src/crypto/randomx/jit_compiler_rv64_static.S PROPERTY LANGUAGE C)
|
||||
set_property(SOURCE src/crypto/randomx/jit_compiler_rv64_vector_static.S PROPERTY LANGUAGE C)
|
||||
|
||||
set(RV64_VECTOR_FILE_ARCH "rv64gcv")
|
||||
|
||||
if (ARCH STREQUAL "native")
|
||||
if (RVARCH_ZICBOP)
|
||||
set(RV64_VECTOR_FILE_ARCH "${RV64_VECTOR_FILE_ARCH}_zicbop")
|
||||
endif()
|
||||
if (RVARCH_ZBA)
|
||||
set(RV64_VECTOR_FILE_ARCH "${RV64_VECTOR_FILE_ARCH}_zba")
|
||||
endif()
|
||||
if (RVARCH_ZBB)
|
||||
set(RV64_VECTOR_FILE_ARCH "${RV64_VECTOR_FILE_ARCH}_zbb")
|
||||
endif()
|
||||
if (RVARCH_ZVKB)
|
||||
set(RV64_VECTOR_FILE_ARCH "${RV64_VECTOR_FILE_ARCH}_zvkb")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
set_source_files_properties(src/crypto/randomx/jit_compiler_rv64_vector_static.S PROPERTIES COMPILE_FLAGS "-march=${RV64_VECTOR_FILE_ARCH}")
|
||||
set_source_files_properties(src/crypto/randomx/aes_hash_rv64_vector.cpp PROPERTIES COMPILE_FLAGS "-O3 -march=${RV64_VECTOR_FILE_ARCH}")
|
||||
set_source_files_properties(src/crypto/randomx/aes_hash_rv64_zvkned.cpp PROPERTIES COMPILE_FLAGS "-O3 -march=${RV64_VECTOR_FILE_ARCH}_zvkned")
|
||||
else()
|
||||
list(APPEND SOURCES_CRYPTO
|
||||
src/crypto/randomx/jit_compiler_fallback.cpp
|
||||
|
||||
@@ -89,11 +89,16 @@ static void print_cpu(const Config *)
|
||||
{
|
||||
const auto info = Cpu::info();
|
||||
|
||||
Log::print(GREEN_BOLD(" * ") WHITE_BOLD("%-13s%s (%zu)") " %s %sAES%s",
|
||||
Log::print(GREEN_BOLD(" * ") WHITE_BOLD("%-13s%s (%zu)") " %s %s%sAES%s",
|
||||
"CPU",
|
||||
info->brand(),
|
||||
info->packages(),
|
||||
ICpuInfo::is64bit() ? GREEN_BOLD("64-bit") : RED_BOLD("32-bit"),
|
||||
#ifdef XMRIG_RISCV
|
||||
info->hasRISCV_Vector() ? GREEN_BOLD_S "RVV " : RED_BOLD_S "-RVV ",
|
||||
#else
|
||||
"",
|
||||
#endif
|
||||
info->hasAES() ? GREEN_BOLD_S : RED_BOLD_S "-",
|
||||
info->isVM() ? RED_BOLD_S " VM" : ""
|
||||
);
|
||||
|
||||
@@ -87,14 +87,14 @@ xmrig::CpuWorker<N>::CpuWorker(size_t id, const CpuLaunchData &data) :
|
||||
if (!cn_heavyZen3Memory) {
|
||||
// Round up number of threads to the multiple of 8
|
||||
const size_t num_threads = ((m_threads + 7) / 8) * 8;
|
||||
cn_heavyZen3Memory = new VirtualMemory(m_algorithm.l3() * num_threads, data.hugePages, false, false, node());
|
||||
cn_heavyZen3Memory = new VirtualMemory(m_algorithm.l3() * num_threads, data.hugePages, false, false, node(), VirtualMemory::kDefaultHugePageSize);
|
||||
}
|
||||
m_memory = cn_heavyZen3Memory;
|
||||
}
|
||||
else
|
||||
# endif
|
||||
{
|
||||
m_memory = new VirtualMemory(m_algorithm.l3() * N, data.hugePages, false, true, node());
|
||||
m_memory = new VirtualMemory(m_algorithm.l3() * N, data.hugePages, false, true, node(), VirtualMemory::kDefaultHugePageSize);
|
||||
}
|
||||
|
||||
# ifdef XMRIG_ALGO_GHOSTRIDER
|
||||
|
||||
@@ -85,6 +85,7 @@ public:
|
||||
FLAG_POPCNT,
|
||||
FLAG_CAT_L3,
|
||||
FLAG_VM,
|
||||
FLAG_RISCV_VECTOR,
|
||||
FLAG_MAX
|
||||
};
|
||||
|
||||
@@ -109,6 +110,7 @@ public:
|
||||
virtual bool hasOneGbPages() const = 0;
|
||||
virtual bool hasXOP() const = 0;
|
||||
virtual bool isVM() const = 0;
|
||||
virtual bool hasRISCV_Vector() const = 0;
|
||||
virtual bool jccErratum() const = 0;
|
||||
virtual const char *backend() const = 0;
|
||||
virtual const char *brand() const = 0;
|
||||
|
||||
@@ -58,8 +58,8 @@
|
||||
namespace xmrig {
|
||||
|
||||
|
||||
constexpr size_t kCpuFlagsSize = 15;
|
||||
static const std::array<const char *, kCpuFlagsSize> flagNames = { "aes", "vaes", "avx", "avx2", "avx512f", "bmi2", "osxsave", "pdpe1gb", "sse2", "ssse3", "sse4.1", "xop", "popcnt", "cat_l3", "vm" };
|
||||
constexpr size_t kCpuFlagsSize = 16;
|
||||
static const std::array<const char *, kCpuFlagsSize> flagNames = { "aes", "vaes", "avx", "avx2", "avx512f", "bmi2", "osxsave", "pdpe1gb", "sse2", "ssse3", "sse4.1", "xop", "popcnt", "cat_l3", "vm", "rvv" };
|
||||
static_assert(kCpuFlagsSize == ICpuInfo::FLAG_MAX, "kCpuFlagsSize and FLAG_MAX mismatch");
|
||||
|
||||
|
||||
|
||||
@@ -52,6 +52,7 @@ protected:
|
||||
inline bool hasOneGbPages() const override { return has(FLAG_PDPE1GB); }
|
||||
inline bool hasXOP() const override { return has(FLAG_XOP); }
|
||||
inline bool isVM() const override { return has(FLAG_VM); }
|
||||
inline bool hasRISCV_Vector() const override { return has(FLAG_RISCV_VECTOR); }
|
||||
inline bool jccErratum() const override { return m_jccErratum; }
|
||||
inline const char *brand() const override { return m_brand; }
|
||||
inline const std::vector<int32_t> &units() const override { return m_units; }
|
||||
|
||||
@@ -34,7 +34,7 @@ namespace xmrig {
|
||||
|
||||
extern String cpu_name_riscv();
|
||||
extern bool has_riscv_vector();
|
||||
extern bool has_riscv_crypto();
|
||||
extern bool has_riscv_aes();
|
||||
|
||||
|
||||
} // namespace xmrig
|
||||
@@ -55,8 +55,11 @@ xmrig::BasicCpuInfo::BasicCpuInfo() :
|
||||
strncpy(m_brand, name.data(), sizeof(m_brand) - 1);
|
||||
}
|
||||
|
||||
// Check for crypto extensions (Zknd/Zkne/Zknh - AES and SHA)
|
||||
m_flags.set(FLAG_AES, has_riscv_crypto());
|
||||
// Check for vector extensions
|
||||
m_flags.set(FLAG_RISCV_VECTOR, has_riscv_vector());
|
||||
|
||||
// Check for AES extensions (Zknd/Zkne)
|
||||
m_flags.set(FLAG_AES, has_riscv_aes());
|
||||
|
||||
// RISC-V typically supports 1GB huge pages
|
||||
m_flags.set(FLAG_PDPE1GB, std::ifstream("/sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages").good());
|
||||
|
||||
@@ -32,9 +32,9 @@ struct riscv_cpu_desc
|
||||
String isa;
|
||||
String uarch;
|
||||
bool has_vector = false;
|
||||
bool has_crypto = false;
|
||||
bool has_aes = false;
|
||||
|
||||
inline bool isReady() const { return !model.isNull(); }
|
||||
inline bool isReady() const { return !isa.isNull(); }
|
||||
};
|
||||
|
||||
static bool lookup_riscv(char *line, const char *pattern, String &value)
|
||||
@@ -81,22 +81,32 @@ static bool read_riscv_cpuinfo(riscv_cpu_desc *desc)
|
||||
lookup_riscv(buf, "model name", desc->model);
|
||||
|
||||
if (lookup_riscv(buf, "isa", desc->isa)) {
|
||||
// Check for vector extensions
|
||||
if (strstr(buf, "zve") || strstr(buf, "v_")) {
|
||||
desc->isa.toLower();
|
||||
|
||||
for (const String& s : desc->isa.split('_')) {
|
||||
const char* p = s.data();
|
||||
const size_t n = s.size();
|
||||
|
||||
if ((s.size() > 4) && (memcmp(p, "rv64", 4) == 0)) {
|
||||
for (size_t i = 4; i < n; ++i) {
|
||||
if (p[i] == 'v') {
|
||||
desc->has_vector = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (s == "zve64d") {
|
||||
desc->has_vector = true;
|
||||
}
|
||||
// Check for crypto extensions (AES, SHA, etc.)
|
||||
// zkn* = NIST crypto suite, zks* = SM crypto suite
|
||||
// Note: zba/zbb/zbc/zbs are bit-manipulation, NOT crypto
|
||||
if (strstr(buf, "zknd") || strstr(buf, "zkne") || strstr(buf, "zknh") ||
|
||||
strstr(buf, "zksed") || strstr(buf, "zksh")) {
|
||||
desc->has_crypto = true;
|
||||
else if ((s == "zvkn") || (s == "zvknc") || (s == "zvkned") || (s == "zvkng")){
|
||||
desc->has_aes = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
lookup_riscv(buf, "uarch", desc->uarch);
|
||||
|
||||
if (desc->isReady() && !desc->isa.isNull()) {
|
||||
if (desc->isReady()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -128,11 +138,11 @@ bool has_riscv_vector()
|
||||
return false;
|
||||
}
|
||||
|
||||
bool has_riscv_crypto()
|
||||
bool has_riscv_aes()
|
||||
{
|
||||
riscv_cpu_desc desc;
|
||||
if (read_riscv_cpuinfo(&desc)) {
|
||||
return desc.has_crypto;
|
||||
return desc.has_aes;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -49,7 +49,7 @@ xmrig::MemoryPool::MemoryPool(size_t size, bool hugePages, uint32_t node)
|
||||
|
||||
constexpr size_t alignment = 1 << 24;
|
||||
|
||||
m_memory = new VirtualMemory(size * pageSize + alignment, hugePages, false, false, node);
|
||||
m_memory = new VirtualMemory(size * pageSize + alignment, hugePages, false, false, node, VirtualMemory::kDefaultHugePageSize);
|
||||
|
||||
m_alignOffset = (alignment - (((size_t)m_memory->scratchpad()) % alignment)) % alignment;
|
||||
}
|
||||
|
||||
@@ -75,6 +75,16 @@ xmrig::VirtualMemory::VirtualMemory(size_t size, bool hugePages, bool oneGbPages
|
||||
}
|
||||
|
||||
m_scratchpad = static_cast<uint8_t*>(_mm_malloc(m_size, alignSize));
|
||||
|
||||
// Huge pages failed to allocate, but try to enable transparent huge pages for the range
|
||||
if (alignSize >= kDefaultHugePageSize) {
|
||||
if (m_scratchpad) {
|
||||
adviseLargePages(m_scratchpad, m_size);
|
||||
}
|
||||
else {
|
||||
m_scratchpad = static_cast<uint8_t*>(_mm_malloc(m_size, 64));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -65,6 +65,7 @@ public:
|
||||
static void *allocateExecutableMemory(size_t size, bool hugePages);
|
||||
static void *allocateLargePagesMemory(size_t size);
|
||||
static void *allocateOneGbPagesMemory(size_t size);
|
||||
static bool adviseLargePages(void *p, size_t size);
|
||||
static void destroy();
|
||||
static void flushInstructionCache(void *p, size_t size);
|
||||
static void freeLargePagesMemory(void *p, size_t size);
|
||||
|
||||
@@ -276,6 +276,16 @@ bool xmrig::VirtualMemory::allocateOneGbPagesMemory()
|
||||
}
|
||||
|
||||
|
||||
bool xmrig::VirtualMemory::adviseLargePages(void *p, size_t size)
|
||||
{
|
||||
# ifdef XMRIG_OS_LINUX
|
||||
return (madvise(p, size, MADV_HUGEPAGE) == 0);
|
||||
# else
|
||||
return false;
|
||||
# endif
|
||||
}
|
||||
|
||||
|
||||
void xmrig::VirtualMemory::freeLargePagesMemory()
|
||||
{
|
||||
if (m_flags.test(FLAG_LOCK)) {
|
||||
|
||||
@@ -260,6 +260,12 @@ bool xmrig::VirtualMemory::allocateOneGbPagesMemory()
|
||||
}
|
||||
|
||||
|
||||
bool xmrig::VirtualMemory::adviseLargePages(void *p, size_t size)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
void xmrig::VirtualMemory::freeLargePagesMemory()
|
||||
{
|
||||
freeLargePagesMemory(m_scratchpad, m_size);
|
||||
|
||||
@@ -38,6 +38,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#include "crypto/randomx/common.hpp"
|
||||
#include "crypto/rx/Profiler.h"
|
||||
|
||||
#ifdef XMRIG_RISCV
|
||||
#include "backend/cpu/Cpu.h"
|
||||
#include "crypto/randomx/aes_hash_rv64_vector.hpp"
|
||||
#include "crypto/randomx/aes_hash_rv64_zvkned.hpp"
|
||||
#endif
|
||||
|
||||
#define AES_HASH_1R_STATE0 0xd7983aad, 0xcc82db47, 0x9fa856de, 0x92b52c0d
|
||||
#define AES_HASH_1R_STATE1 0xace78057, 0xf59e125a, 0x15c7b798, 0x338d996e
|
||||
#define AES_HASH_1R_STATE2 0xe8a07ce4, 0x5079506b, 0xae62c7d0, 0x6a770017
|
||||
@@ -59,7 +65,20 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
Hashing throughput: >20 GiB/s per CPU core with hardware AES
|
||||
*/
|
||||
template<int softAes>
|
||||
void hashAes1Rx4(const void *input, size_t inputSize, void *hash) {
|
||||
void hashAes1Rx4(const void *input, size_t inputSize, void *hash)
|
||||
{
|
||||
#ifdef XMRIG_RISCV
|
||||
if (xmrig::Cpu::info()->hasAES()) {
|
||||
hashAes1Rx4_zvkned(input, inputSize, hash);
|
||||
return;
|
||||
}
|
||||
|
||||
if (xmrig::Cpu::info()->hasRISCV_Vector()) {
|
||||
hashAes1Rx4_RVV<softAes>(input, inputSize, hash);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
const uint8_t* inptr = (uint8_t*)input;
|
||||
const uint8_t* inputEnd = inptr + inputSize;
|
||||
|
||||
@@ -127,7 +146,20 @@ template void hashAes1Rx4<true>(const void *input, size_t inputSize, void *hash)
|
||||
calls to this function.
|
||||
*/
|
||||
template<int softAes>
|
||||
void fillAes1Rx4(void *state, size_t outputSize, void *buffer) {
|
||||
void fillAes1Rx4(void *state, size_t outputSize, void *buffer)
|
||||
{
|
||||
#ifdef XMRIG_RISCV
|
||||
if (xmrig::Cpu::info()->hasAES()) {
|
||||
fillAes1Rx4_zvkned(state, outputSize, buffer);
|
||||
return;
|
||||
}
|
||||
|
||||
if (xmrig::Cpu::info()->hasRISCV_Vector()) {
|
||||
fillAes1Rx4_RVV<softAes>(state, outputSize, buffer);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
const uint8_t* outptr = (uint8_t*)buffer;
|
||||
const uint8_t* outputEnd = outptr + outputSize;
|
||||
|
||||
@@ -171,7 +203,20 @@ static constexpr randomx::Instruction inst{ 0xFF, 7, 7, 0xFF, 0xFFFFFFFFU };
|
||||
alignas(16) static const randomx::Instruction inst_mask[2] = { inst, inst };
|
||||
|
||||
template<int softAes>
|
||||
void fillAes4Rx4(void *state, size_t outputSize, void *buffer) {
|
||||
void fillAes4Rx4(void *state, size_t outputSize, void *buffer)
|
||||
{
|
||||
#ifdef XMRIG_RISCV
|
||||
if (xmrig::Cpu::info()->hasAES()) {
|
||||
fillAes4Rx4_zvkned(state, outputSize, buffer);
|
||||
return;
|
||||
}
|
||||
|
||||
if (xmrig::Cpu::info()->hasRISCV_Vector()) {
|
||||
fillAes4Rx4_RVV<softAes>(state, outputSize, buffer);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
const uint8_t* outptr = (uint8_t*)buffer;
|
||||
const uint8_t* outputEnd = outptr + outputSize;
|
||||
|
||||
@@ -236,9 +281,22 @@ template void fillAes4Rx4<true>(void *state, size_t outputSize, void *buffer);
|
||||
template void fillAes4Rx4<false>(void *state, size_t outputSize, void *buffer);
|
||||
|
||||
template<int softAes, int unroll>
|
||||
void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state) {
|
||||
void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state)
|
||||
{
|
||||
PROFILE_SCOPE(RandomX_AES);
|
||||
|
||||
#ifdef XMRIG_RISCV
|
||||
if (xmrig::Cpu::info()->hasAES()) {
|
||||
hashAndFillAes1Rx4_zvkned(scratchpad, scratchpadSize, hash, fill_state);
|
||||
return;
|
||||
}
|
||||
|
||||
if (xmrig::Cpu::info()->hasRISCV_Vector()) {
|
||||
hashAndFillAes1Rx4_RVV<softAes, unroll>(scratchpad, scratchpadSize, hash, fill_state);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
uint8_t* scratchpadPtr = (uint8_t*)scratchpad;
|
||||
const uint8_t* scratchpadEnd = scratchpadPtr + scratchpadSize;
|
||||
|
||||
@@ -387,37 +445,48 @@ hashAndFillAes1Rx4_impl* softAESImpl = &hashAndFillAes1Rx4<1,1>;
|
||||
void SelectSoftAESImpl(size_t threadsCount)
|
||||
{
|
||||
constexpr uint64_t test_length_ms = 100;
|
||||
|
||||
const std::array<hashAndFillAes1Rx4_impl *, 4> impl = {
|
||||
&hashAndFillAes1Rx4<1,1>,
|
||||
&hashAndFillAes1Rx4<2,1>,
|
||||
&hashAndFillAes1Rx4<2,2>,
|
||||
&hashAndFillAes1Rx4<2,4>,
|
||||
};
|
||||
|
||||
size_t fast_idx = 0;
|
||||
double fast_speed = 0.0;
|
||||
|
||||
for (size_t run = 0; run < 3; ++run) {
|
||||
for (size_t i = 0; i < impl.size(); ++i) {
|
||||
const double t1 = xmrig::Chrono::highResolutionMSecs();
|
||||
|
||||
std::vector<uint32_t> count(threadsCount, 0);
|
||||
std::vector<std::thread> threads;
|
||||
|
||||
for (size_t t = 0; t < threadsCount; ++t) {
|
||||
threads.emplace_back([&, t]() {
|
||||
std::vector<uint8_t> scratchpad(10 * 1024);
|
||||
|
||||
alignas(16) uint8_t hash[64] = {};
|
||||
alignas(16) uint8_t state[64] = {};
|
||||
|
||||
do {
|
||||
(*impl[i])(scratchpad.data(), scratchpad.size(), hash, state);
|
||||
++count[t];
|
||||
} while (xmrig::Chrono::highResolutionMSecs() - t1 < test_length_ms);
|
||||
});
|
||||
}
|
||||
|
||||
uint32_t total = 0;
|
||||
|
||||
for (size_t t = 0; t < threadsCount; ++t) {
|
||||
threads[t].join();
|
||||
total += count[t];
|
||||
}
|
||||
|
||||
const double t2 = xmrig::Chrono::highResolutionMSecs();
|
||||
const double speed = total * 1e3 / (t2 - t1);
|
||||
|
||||
if (speed > fast_speed) {
|
||||
fast_idx = i;
|
||||
fast_speed = speed;
|
||||
|
||||
322
src/crypto/randomx/aes_hash_rv64_vector.cpp
Normal file
322
src/crypto/randomx/aes_hash_rv64_vector.cpp
Normal file
@@ -0,0 +1,322 @@
|
||||
/*
|
||||
Copyright (c) 2025 SChernykh <https://github.com/SChernykh>
|
||||
Copyright (c) 2025 XMRig <support@xmrig.com>
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the copyright holder nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <riscv_vector.h>
|
||||
|
||||
#include "crypto/randomx/soft_aes.h"
|
||||
#include "crypto/randomx/randomx.h"
|
||||
|
||||
static FORCE_INLINE vuint32m1_t softaes_vector_double(
|
||||
vuint32m1_t in,
|
||||
vuint32m1_t key,
|
||||
vuint8m1_t i0, vuint8m1_t i1, vuint8m1_t i2, vuint8m1_t i3,
|
||||
const uint32_t* lut0, const uint32_t* lut1, const uint32_t *lut2, const uint32_t* lut3)
|
||||
{
|
||||
const vuint8m1_t in8 = __riscv_vreinterpret_v_u32m1_u8m1(in);
|
||||
|
||||
const vuint32m1_t index0 = __riscv_vreinterpret_v_u8m1_u32m1(__riscv_vrgather_vv_u8m1(in8, i0, 32));
|
||||
const vuint32m1_t index1 = __riscv_vreinterpret_v_u8m1_u32m1(__riscv_vrgather_vv_u8m1(in8, i1, 32));
|
||||
const vuint32m1_t index2 = __riscv_vreinterpret_v_u8m1_u32m1(__riscv_vrgather_vv_u8m1(in8, i2, 32));
|
||||
const vuint32m1_t index3 = __riscv_vreinterpret_v_u8m1_u32m1(__riscv_vrgather_vv_u8m1(in8, i3, 32));
|
||||
|
||||
vuint32m1_t s0 = __riscv_vluxei32_v_u32m1(lut0, __riscv_vsll_vx_u32m1(index0, 2, 8), 8);
|
||||
vuint32m1_t s1 = __riscv_vluxei32_v_u32m1(lut1, __riscv_vsll_vx_u32m1(index1, 2, 8), 8);
|
||||
vuint32m1_t s2 = __riscv_vluxei32_v_u32m1(lut2, __riscv_vsll_vx_u32m1(index2, 2, 8), 8);
|
||||
vuint32m1_t s3 = __riscv_vluxei32_v_u32m1(lut3, __riscv_vsll_vx_u32m1(index3, 2, 8), 8);
|
||||
|
||||
s0 = __riscv_vxor_vv_u32m1(s0, s1, 8);
|
||||
s2 = __riscv_vxor_vv_u32m1(s2, s3, 8);
|
||||
s0 = __riscv_vxor_vv_u32m1(s0, s2, 8);
|
||||
|
||||
return __riscv_vxor_vv_u32m1(s0, key, 8);
|
||||
}
|
||||
|
||||
static constexpr uint32_t AES_HASH_1R_STATE02[8] = { 0x92b52c0d, 0x9fa856de, 0xcc82db47, 0xd7983aad, 0x6a770017, 0xae62c7d0, 0x5079506b, 0xe8a07ce4 };
|
||||
static constexpr uint32_t AES_HASH_1R_STATE13[8] = { 0x338d996e, 0x15c7b798, 0xf59e125a, 0xace78057, 0x630a240c, 0x07ad828d, 0x79a10005, 0x7e994948 };
|
||||
|
||||
static constexpr uint32_t AES_GEN_1R_KEY02[8] = { 0x6daca553, 0x62716609, 0xdbb5552b, 0xb4f44917, 0x3f1262f1, 0x9f947ec6, 0xf4c0794f, 0x3e20e345 };
|
||||
static constexpr uint32_t AES_GEN_1R_KEY13[8] = { 0x6d7caf07, 0x846a710d, 0x1725d378, 0x0da1dc4e, 0x6aef8135, 0xb1ba317c, 0x16314c88, 0x49169154 };
|
||||
|
||||
static constexpr uint32_t AES_HASH_1R_XKEY00[8] = { 0xf6fa8389, 0x8b24949f, 0x90dc56bf, 0x06890201, 0xf6fa8389, 0x8b24949f, 0x90dc56bf, 0x06890201 };
|
||||
static constexpr uint32_t AES_HASH_1R_XKEY11[8] = { 0x61b263d1, 0x51f4e03c, 0xee1043c6, 0xed18f99b, 0x61b263d1, 0x51f4e03c, 0xee1043c6, 0xed18f99b };
|
||||
|
||||
static constexpr uint32_t AES_HASH_STRIDE_X2[8] = { 0, 4, 8, 12, 32, 36, 40, 44 };
|
||||
static constexpr uint32_t AES_HASH_STRIDE_X4[8] = { 0, 4, 8, 12, 64, 68, 72, 76 };
|
||||
|
||||
template<int softAes>
|
||||
void hashAes1Rx4_RVV(const void *input, size_t inputSize, void *hash) {
|
||||
const uint8_t* inptr = (const uint8_t*)input;
|
||||
const uint8_t* inputEnd = inptr + inputSize;
|
||||
|
||||
//intial state
|
||||
vuint32m1_t state02 = __riscv_vle32_v_u32m1(AES_HASH_1R_STATE02, 8);
|
||||
vuint32m1_t state13 = __riscv_vle32_v_u32m1(AES_HASH_1R_STATE13, 8);
|
||||
|
||||
const vuint32m1_t stride = __riscv_vle32_v_u32m1(AES_HASH_STRIDE_X2, 8);
|
||||
|
||||
const vuint8m1_t lutenc_index0 = __riscv_vle8_v_u8m1(lutEncIndex[0], 32);
|
||||
const vuint8m1_t lutenc_index1 = __riscv_vle8_v_u8m1(lutEncIndex[1], 32);
|
||||
const vuint8m1_t lutenc_index2 = __riscv_vle8_v_u8m1(lutEncIndex[2], 32);
|
||||
const vuint8m1_t lutenc_index3 = __riscv_vle8_v_u8m1(lutEncIndex[3], 32);
|
||||
|
||||
const vuint8m1_t& lutdec_index0 = lutenc_index0;
|
||||
const vuint8m1_t lutdec_index1 = __riscv_vle8_v_u8m1(lutDecIndex[1], 32);
|
||||
const vuint8m1_t& lutdec_index2 = lutenc_index2;
|
||||
const vuint8m1_t lutdec_index3 = __riscv_vle8_v_u8m1(lutDecIndex[3], 32);
|
||||
|
||||
//process 64 bytes at a time in 4 lanes
|
||||
while (inptr < inputEnd) {
|
||||
state02 = softaes_vector_double(state02, __riscv_vluxei32_v_u32m1((uint32_t*)inptr + 0, stride, 8), lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3);
|
||||
state13 = softaes_vector_double(state13, __riscv_vluxei32_v_u32m1((uint32_t*)inptr + 4, stride, 8), lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3);
|
||||
|
||||
inptr += 64;
|
||||
}
|
||||
|
||||
//two extra rounds to achieve full diffusion
|
||||
const vuint32m1_t xkey00 = __riscv_vle32_v_u32m1(AES_HASH_1R_XKEY00, 8);
|
||||
const vuint32m1_t xkey11 = __riscv_vle32_v_u32m1(AES_HASH_1R_XKEY11, 8);
|
||||
|
||||
state02 = softaes_vector_double(state02, xkey00, lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3);
|
||||
state13 = softaes_vector_double(state13, xkey00, lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3);
|
||||
|
||||
state02 = softaes_vector_double(state02, xkey11, lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3);
|
||||
state13 = softaes_vector_double(state13, xkey11, lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3);
|
||||
|
||||
//output hash
|
||||
__riscv_vsuxei32_v_u32m1((uint32_t*)hash + 0, stride, state02, 8);
|
||||
__riscv_vsuxei32_v_u32m1((uint32_t*)hash + 4, stride, state13, 8);
|
||||
}
|
||||
|
||||
template void hashAes1Rx4_RVV<false>(const void *input, size_t inputSize, void *hash);
|
||||
template void hashAes1Rx4_RVV<true>(const void *input, size_t inputSize, void *hash);
|
||||
|
||||
template<int softAes>
|
||||
void fillAes1Rx4_RVV(void *state, size_t outputSize, void *buffer) {
|
||||
const uint8_t* outptr = (uint8_t*)buffer;
|
||||
const uint8_t* outputEnd = outptr + outputSize;
|
||||
|
||||
const vuint32m1_t key02 = __riscv_vle32_v_u32m1(AES_GEN_1R_KEY02, 8);
|
||||
const vuint32m1_t key13 = __riscv_vle32_v_u32m1(AES_GEN_1R_KEY13, 8);
|
||||
|
||||
const vuint32m1_t stride = __riscv_vle32_v_u32m1(AES_HASH_STRIDE_X2, 8);
|
||||
|
||||
vuint32m1_t state02 = __riscv_vluxei32_v_u32m1((uint32_t*)state + 0, stride, 8);
|
||||
vuint32m1_t state13 = __riscv_vluxei32_v_u32m1((uint32_t*)state + 4, stride, 8);
|
||||
|
||||
const vuint8m1_t lutenc_index0 = __riscv_vle8_v_u8m1(lutEncIndex[0], 32);
|
||||
const vuint8m1_t lutenc_index1 = __riscv_vle8_v_u8m1(lutEncIndex[1], 32);
|
||||
const vuint8m1_t lutenc_index2 = __riscv_vle8_v_u8m1(lutEncIndex[2], 32);
|
||||
const vuint8m1_t lutenc_index3 = __riscv_vle8_v_u8m1(lutEncIndex[3], 32);
|
||||
|
||||
const vuint8m1_t& lutdec_index0 = lutenc_index0;
|
||||
const vuint8m1_t lutdec_index1 = __riscv_vle8_v_u8m1(lutDecIndex[1], 32);
|
||||
const vuint8m1_t& lutdec_index2 = lutenc_index2;
|
||||
const vuint8m1_t lutdec_index3 = __riscv_vle8_v_u8m1(lutDecIndex[3], 32);
|
||||
|
||||
while (outptr < outputEnd) {
|
||||
state02 = softaes_vector_double(state02, key02, lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3);
|
||||
state13 = softaes_vector_double(state13, key13, lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3);
|
||||
|
||||
__riscv_vsuxei32_v_u32m1((uint32_t*)outptr + 0, stride, state02, 8);
|
||||
__riscv_vsuxei32_v_u32m1((uint32_t*)outptr + 4, stride, state13, 8);
|
||||
|
||||
outptr += 64;
|
||||
}
|
||||
|
||||
__riscv_vsuxei32_v_u32m1((uint32_t*)state + 0, stride, state02, 8);
|
||||
__riscv_vsuxei32_v_u32m1((uint32_t*)state + 4, stride, state13, 8);
|
||||
}
|
||||
|
||||
template void fillAes1Rx4_RVV<false>(void *state, size_t outputSize, void *buffer);
|
||||
template void fillAes1Rx4_RVV<true>(void *state, size_t outputSize, void *buffer);
|
||||
|
||||
// Soft-AES (RVV) variant of fillAes4Rx4: fills `buffer` with `outputSize`
// bytes of generated data, applying 4 AES rounds per 64-byte block with the
// 8 program-specific keys from RandomX_CurrentConfig.
// Register layout: two 128-bit AES columns per vector register (8 x 32-bit
// lanes, gathered with AES_HASH_STRIDE_X2). Columns 0/2 run decryption-table
// rounds, columns 1/3 run encryption-table rounds.
// NOTE(review): the softAes template parameter is not used inside this body;
// presumably the variant selection happens at the call sites — confirm.
template<int softAes>
void fillAes4Rx4_RVV(void *state, size_t outputSize, void *buffer) {
	const uint8_t* outptr = (uint8_t*)buffer;
	const uint8_t* outputEnd = outptr + outputSize;

	// Index vector to gather key pairs (i, i+4) from the 8-key array.
	const vuint32m1_t stride4 = __riscv_vle32_v_u32m1(AES_HASH_STRIDE_X4, 8);

	const vuint32m1_t key04 = __riscv_vluxei32_v_u32m1((uint32_t*)(RandomX_CurrentConfig.fillAes4Rx4_Key + 0), stride4, 8);
	const vuint32m1_t key15 = __riscv_vluxei32_v_u32m1((uint32_t*)(RandomX_CurrentConfig.fillAes4Rx4_Key + 1), stride4, 8);
	const vuint32m1_t key26 = __riscv_vluxei32_v_u32m1((uint32_t*)(RandomX_CurrentConfig.fillAes4Rx4_Key + 2), stride4, 8);
	const vuint32m1_t key37 = __riscv_vluxei32_v_u32m1((uint32_t*)(RandomX_CurrentConfig.fillAes4Rx4_Key + 3), stride4, 8);

	// Index vector to gather/scatter two 16-byte state columns per register.
	const vuint32m1_t stride = __riscv_vle32_v_u32m1(AES_HASH_STRIDE_X2, 8);

	vuint32m1_t state02 = __riscv_vluxei32_v_u32m1((uint32_t*)state + 0, stride, 8);
	vuint32m1_t state13 = __riscv_vluxei32_v_u32m1((uint32_t*)state + 4, stride, 8);

	// Byte-shuffle index tables for the soft-AES (Inv)ShiftRows steps;
	// tables 0 and 2 are identical between the enc and dec variants, so the
	// dec aliases reuse the enc registers.
	const vuint8m1_t lutenc_index0 = __riscv_vle8_v_u8m1(lutEncIndex[0], 32);
	const vuint8m1_t lutenc_index1 = __riscv_vle8_v_u8m1(lutEncIndex[1], 32);
	const vuint8m1_t lutenc_index2 = __riscv_vle8_v_u8m1(lutEncIndex[2], 32);
	const vuint8m1_t lutenc_index3 = __riscv_vle8_v_u8m1(lutEncIndex[3], 32);

	const vuint8m1_t& lutdec_index0 = lutenc_index0;
	const vuint8m1_t lutdec_index1 = __riscv_vle8_v_u8m1(lutDecIndex[1], 32);
	const vuint8m1_t& lutdec_index2 = lutenc_index2;
	const vuint8m1_t lutdec_index3 = __riscv_vle8_v_u8m1(lutDecIndex[3], 32);

	// 4 rounds per 64-byte output block.
	while (outptr < outputEnd) {
		state02 = softaes_vector_double(state02, key04, lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3);
		state13 = softaes_vector_double(state13, key04, lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3);

		state02 = softaes_vector_double(state02, key15, lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3);
		state13 = softaes_vector_double(state13, key15, lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3);

		state02 = softaes_vector_double(state02, key26, lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3);
		state13 = softaes_vector_double(state13, key26, lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3);

		state02 = softaes_vector_double(state02, key37, lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3);
		state13 = softaes_vector_double(state13, key37, lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3);

		__riscv_vsuxei32_v_u32m1((uint32_t*)outptr + 0, stride, state02, 8);
		__riscv_vsuxei32_v_u32m1((uint32_t*)outptr + 4, stride, state13, 8);

		outptr += 64;
	}
	// The final state is intentionally not written back to `state`
	// (matches fillAes4Rx4_zvkned, which also discards it).
}

// Explicit instantiations for both template arguments.
template void fillAes4Rx4_RVV<false>(void *state, size_t outputSize, void *buffer);
template void fillAes4Rx4_RVV<true>(void *state, size_t outputSize, void *buffer);
|
||||
|
||||
// Soft-AES (RVV) combined scratchpad hash + refill. Walks the scratchpad
// once; for every 64-byte block it (a) absorbs the current contents into the
// running hash state and (b) overwrites the block with one generator round of
// fresh data from fill_state.
// Register layout: two 128-bit AES columns per vector register (gathered with
// AES_HASH_STRIDE_X2); columns 0/2 in one register, columns 1/3 in the other.
// `softAes` selects the main-loop variant (case 0 processes two blocks with
// all hashing before all filling; other values interleave hash/fill per block,
// unrolled by `unroll`) — NOTE(review): the meaning of the individual softAes
// values (see the explicit instantiations below) is decided by callers outside
// this file; confirm there.
template<int softAes, int unroll>
void hashAndFillAes1Rx4_RVV(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state) {
	uint8_t* scratchpadPtr = (uint8_t*)scratchpad;
	const uint8_t* scratchpadEnd = scratchpadPtr + scratchpadSize;

	// Initial hash state constants.
	vuint32m1_t hash_state02 = __riscv_vle32_v_u32m1(AES_HASH_1R_STATE02, 8);
	vuint32m1_t hash_state13 = __riscv_vle32_v_u32m1(AES_HASH_1R_STATE13, 8);

	// Per-round generator keys.
	const vuint32m1_t key02 = __riscv_vle32_v_u32m1(AES_GEN_1R_KEY02, 8);
	const vuint32m1_t key13 = __riscv_vle32_v_u32m1(AES_GEN_1R_KEY13, 8);

	// Index vector to gather/scatter two 16-byte columns per register.
	const vuint32m1_t stride = __riscv_vle32_v_u32m1(AES_HASH_STRIDE_X2, 8);

	// Resume the generator state left by the previous fill call.
	vuint32m1_t fill_state02 = __riscv_vluxei32_v_u32m1((uint32_t*)fill_state + 0, stride, 8);
	vuint32m1_t fill_state13 = __riscv_vluxei32_v_u32m1((uint32_t*)fill_state + 4, stride, 8);

	// Byte-shuffle index tables for soft-AES (Inv)ShiftRows; tables 0 and 2
	// are shared between the enc and dec variants, hence the reference aliases.
	const vuint8m1_t lutenc_index0 = __riscv_vle8_v_u8m1(lutEncIndex[0], 32);
	const vuint8m1_t lutenc_index1 = __riscv_vle8_v_u8m1(lutEncIndex[1], 32);
	const vuint8m1_t lutenc_index2 = __riscv_vle8_v_u8m1(lutEncIndex[2], 32);
	const vuint8m1_t lutenc_index3 = __riscv_vle8_v_u8m1(lutEncIndex[3], 32);

	const vuint8m1_t& lutdec_index0 = lutenc_index0;
	const vuint8m1_t lutdec_index1 = __riscv_vle8_v_u8m1(lutDecIndex[1], 32);
	const vuint8m1_t& lutdec_index2 = lutenc_index2;
	const vuint8m1_t lutdec_index3 = __riscv_vle8_v_u8m1(lutDecIndex[3], 32);

	//process 64 bytes at a time in 4 lanes
	while (scratchpadPtr < scratchpadEnd) {
// Absorb the 64 scratchpad bytes at offset k*64 into the hash state
// (columns 0/2 use the encryption tables, columns 1/3 the decryption tables).
#define HASH_STATE(k) \
	hash_state02 = softaes_vector_double(hash_state02, __riscv_vluxei32_v_u32m1((uint32_t*)scratchpadPtr + k * 16 + 0, stride, 8), lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3); \
	hash_state13 = softaes_vector_double(hash_state13, __riscv_vluxei32_v_u32m1((uint32_t*)scratchpadPtr + k * 16 + 4, stride, 8), lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3);

// Advance the generator by one round and overwrite the same 64-byte block.
#define FILL_STATE(k) \
	fill_state02 = softaes_vector_double(fill_state02, key02, lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3); \
	fill_state13 = softaes_vector_double(fill_state13, key13, lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3); \
	__riscv_vsuxei32_v_u32m1((uint32_t*)scratchpadPtr + k * 16 + 0, stride, fill_state02, 8); \
	__riscv_vsuxei32_v_u32m1((uint32_t*)scratchpadPtr + k * 16 + 4, stride, fill_state13, 8);

		// `softAes` and `unroll` are template parameters, so the compiler
		// resolves both switches at compile time.
		switch (softAes) {
		case 0:
			HASH_STATE(0);
			HASH_STATE(1);

			FILL_STATE(0);
			FILL_STATE(1);

			scratchpadPtr += 128;
			break;

		default:
			switch (unroll) {
			case 4:
				HASH_STATE(0);
				FILL_STATE(0);

				HASH_STATE(1);
				FILL_STATE(1);

				HASH_STATE(2);
				FILL_STATE(2);

				HASH_STATE(3);
				FILL_STATE(3);

				scratchpadPtr += 64 * 4;
				break;

			case 2:
				HASH_STATE(0);
				FILL_STATE(0);

				HASH_STATE(1);
				FILL_STATE(1);

				scratchpadPtr += 64 * 2;
				break;

			default:
				HASH_STATE(0);
				FILL_STATE(0);

				scratchpadPtr += 64;
				break;
			}
			break;
		}
	}

#undef HASH_STATE
#undef FILL_STATE

	// Persist the generator state for the next call.
	__riscv_vsuxei32_v_u32m1((uint32_t*)fill_state + 0, stride, fill_state02, 8);
	__riscv_vsuxei32_v_u32m1((uint32_t*)fill_state + 4, stride, fill_state13, 8);

	//two extra rounds to achieve full diffusion
	const vuint32m1_t xkey00 = __riscv_vle32_v_u32m1(AES_HASH_1R_XKEY00, 8);
	const vuint32m1_t xkey11 = __riscv_vle32_v_u32m1(AES_HASH_1R_XKEY11, 8);

	hash_state02 = softaes_vector_double(hash_state02, xkey00, lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3);
	hash_state13 = softaes_vector_double(hash_state13, xkey00, lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3);

	hash_state02 = softaes_vector_double(hash_state02, xkey11, lutenc_index0, lutenc_index1, lutenc_index2, lutenc_index3, lutEnc0, lutEnc1, lutEnc2, lutEnc3);
	hash_state13 = softaes_vector_double(hash_state13, xkey11, lutdec_index0, lutdec_index1, lutdec_index2, lutdec_index3, lutDec0, lutDec1, lutDec2, lutDec3);

	//output hash
	__riscv_vsuxei32_v_u32m1((uint32_t*)hash + 0, stride, hash_state02, 8);
	__riscv_vsuxei32_v_u32m1((uint32_t*)hash + 4, stride, hash_state13, 8);
}

// Explicit instantiations for the softAes/unroll combinations used by callers.
template void hashAndFillAes1Rx4_RVV<0,2>(void* scratchpad, size_t scratchpadSize, void* hash, void* fill_state);
template void hashAndFillAes1Rx4_RVV<1,1>(void* scratchpad, size_t scratchpadSize, void* hash, void* fill_state);
template void hashAndFillAes1Rx4_RVV<2,1>(void* scratchpad, size_t scratchpadSize, void* hash, void* fill_state);
template void hashAndFillAes1Rx4_RVV<2,2>(void* scratchpad, size_t scratchpadSize, void* hash, void* fill_state);
template void hashAndFillAes1Rx4_RVV<2,4>(void* scratchpad, size_t scratchpadSize, void* hash, void* fill_state);
|
||||
42
src/crypto/randomx/aes_hash_rv64_vector.hpp
Normal file
42
src/crypto/randomx/aes_hash_rv64_vector.hpp
Normal file
@@ -0,0 +1,42 @@
|
||||
/*
|
||||
Copyright (c) 2025 SChernykh <https://github.com/SChernykh>
|
||||
Copyright (c) 2025 XMRig <support@xmrig.com>
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the copyright holder nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once

// Public interface of the RISC-V Vector (RVV) soft-AES implementations of the
// RandomX AES primitives. The softAes template argument selects the code
// variant; `unroll` controls the loop unrolling factor of the combined
// hash-and-fill routine.

// Hash `inputSize` bytes of `input` into a 64-byte result at `hash`.
template<int softAes>
void hashAes1Rx4_RVV(const void *input, size_t inputSize, void *hash);

// Fill `buffer` with `outputSize` bytes generated from the 64-byte `state`
// (one AES round per block); `state` is updated in place.
template<int softAes>
void fillAes1Rx4_RVV(void *state, size_t outputSize, void *buffer);

// Like fillAes1Rx4_RVV, but 4 AES rounds per block with program-specific keys.
template<int softAes>
void fillAes4Rx4_RVV(void *state, size_t outputSize, void *buffer);

// Single-pass scratchpad hash + refill; `fill_state` is updated in place.
template<int softAes, int unroll>
void hashAndFillAes1Rx4_RVV(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state);
|
||||
199
src/crypto/randomx/aes_hash_rv64_zvkned.cpp
Normal file
199
src/crypto/randomx/aes_hash_rv64_zvkned.cpp
Normal file
@@ -0,0 +1,199 @@
|
||||
/*
|
||||
Copyright (c) 2025 SChernykh <https://github.com/SChernykh>
|
||||
Copyright (c) 2025 XMRig <support@xmrig.com>
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the copyright holder nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "crypto/randomx/aes_hash.hpp"
|
||||
#include "crypto/randomx/randomx.h"
|
||||
#include "crypto/rx/Profiler.h"
|
||||
|
||||
#include <riscv_vector.h>
|
||||
|
||||
// One AES encryption round on two packed 128-bit columns (8 x 32-bit lanes).
static FORCE_INLINE vuint32m1_t aesenc_zvkned(vuint32m1_t a, vuint32m1_t b) { return __riscv_vaesem_vv_u32m1(a, b, 8); }
// AES decryption-round equivalent of x86 `aesdec`: vaesdm is run with an
// all-zero round key and the real key `b` is XORed in afterwards.
// NOTE(review): presumably needed because vaesdm and x86 aesdec place the
// AddRoundKey step differently — confirm against the Zvkned specification.
static FORCE_INLINE vuint32m1_t aesdec_zvkned(vuint32m1_t a, vuint32m1_t b, vuint32m1_t zero) { return __riscv_vxor_vv_u32m1(__riscv_vaesdm_vv_u32m1(a, zero, 8), b, 8); }

// RandomX AES hash state and generator key constants, pre-interleaved for the
// two-columns-per-register layout (columns 0&2 in one table, 1&3 in the other).
static constexpr uint32_t AES_HASH_1R_STATE02[8] = { 0x92b52c0d, 0x9fa856de, 0xcc82db47, 0xd7983aad, 0x6a770017, 0xae62c7d0, 0x5079506b, 0xe8a07ce4 };
static constexpr uint32_t AES_HASH_1R_STATE13[8] = { 0x338d996e, 0x15c7b798, 0xf59e125a, 0xace78057, 0x630a240c, 0x07ad828d, 0x79a10005, 0x7e994948 };

static constexpr uint32_t AES_GEN_1R_KEY02[8] = { 0x6daca553, 0x62716609, 0xdbb5552b, 0xb4f44917, 0x3f1262f1, 0x9f947ec6, 0xf4c0794f, 0x3e20e345 };
static constexpr uint32_t AES_GEN_1R_KEY13[8] = { 0x6d7caf07, 0x846a710d, 0x1725d378, 0x0da1dc4e, 0x6aef8135, 0xb1ba317c, 0x16314c88, 0x49169154 };

// Extra-round keys used for final diffusion; the same 128-bit key is
// replicated into both columns.
static constexpr uint32_t AES_HASH_1R_XKEY00[8] = { 0xf6fa8389, 0x8b24949f, 0x90dc56bf, 0x06890201, 0xf6fa8389, 0x8b24949f, 0x90dc56bf, 0x06890201 };
static constexpr uint32_t AES_HASH_1R_XKEY11[8] = { 0x61b263d1, 0x51f4e03c, 0xee1043c6, 0xed18f99b, 0x61b263d1, 0x51f4e03c, 0xee1043c6, 0xed18f99b };

// Byte-offset index tables for indexed loads/stores: each picks two 16-byte
// blocks into one vector register. X2 uses blocks 32 bytes apart (the four
// 16-byte AES states packed as 64 bytes); X4 uses blocks 64 bytes apart
// (the 8-entry key array of fillAes4Rx4).
static constexpr uint32_t AES_HASH_STRIDE_X2[8] = { 0, 4, 8, 12, 32, 36, 40, 44 };
static constexpr uint32_t AES_HASH_STRIDE_X4[8] = { 0, 4, 8, 12, 64, 68, 72, 76 };
|
||||
|
||||
// Hardware-AES (Zvkned) scratchpad hash: absorbs `inputSize` bytes of `input`
// (one AES round per 64-byte block) into a 64-byte hash written to `hash`.
// Two 128-bit columns are packed per vector register; columns 0/2 run
// encryption rounds and columns 1/3 run decryption rounds.
// `inputSize` is expected to be a multiple of 64 — NOTE(review): no tail
// handling here; confirm callers guarantee this.
void hashAes1Rx4_zvkned(const void *input, size_t inputSize, void *hash)
{
	const uint8_t* inptr = (const uint8_t*)input;
	const uint8_t* inputEnd = inptr + inputSize;

	//initial state
	vuint32m1_t state02 = __riscv_vle32_v_u32m1(AES_HASH_1R_STATE02, 8);
	vuint32m1_t state13 = __riscv_vle32_v_u32m1(AES_HASH_1R_STATE13, 8);

	// Index vector to gather two 16-byte columns per register.
	const vuint32m1_t stride = __riscv_vle32_v_u32m1(AES_HASH_STRIDE_X2, 8);
	const vuint32m1_t zero = {};

	//process 64 bytes at a time in 4 lanes
	while (inptr < inputEnd) {
		state02 = aesenc_zvkned(state02, __riscv_vluxei32_v_u32m1((uint32_t*)inptr + 0, stride, 8));
		state13 = aesdec_zvkned(state13, __riscv_vluxei32_v_u32m1((uint32_t*)inptr + 4, stride, 8), zero);

		inptr += 64;
	}

	//two extra rounds to achieve full diffusion
	const vuint32m1_t xkey00 = __riscv_vle32_v_u32m1(AES_HASH_1R_XKEY00, 8);
	const vuint32m1_t xkey11 = __riscv_vle32_v_u32m1(AES_HASH_1R_XKEY11, 8);

	state02 = aesenc_zvkned(state02, xkey00);
	state13 = aesdec_zvkned(state13, xkey00, zero);

	state02 = aesenc_zvkned(state02, xkey11);
	state13 = aesdec_zvkned(state13, xkey11, zero);

	//output hash
	__riscv_vsuxei32_v_u32m1((uint32_t*)hash + 0, stride, state02, 8);
	__riscv_vsuxei32_v_u32m1((uint32_t*)hash + 4, stride, state13, 8);
}
|
||||
|
||||
void fillAes1Rx4_zvkned(void *state, size_t outputSize, void *buffer)
|
||||
{
|
||||
const uint8_t* outptr = (uint8_t*)buffer;
|
||||
const uint8_t* outputEnd = outptr + outputSize;
|
||||
|
||||
const vuint32m1_t key02 = __riscv_vle32_v_u32m1(AES_GEN_1R_KEY02, 8);
|
||||
const vuint32m1_t key13 = __riscv_vle32_v_u32m1(AES_GEN_1R_KEY13, 8);
|
||||
|
||||
const vuint32m1_t stride = __riscv_vle32_v_u32m1(AES_HASH_STRIDE_X2, 8);
|
||||
const vuint32m1_t zero = {};
|
||||
|
||||
vuint32m1_t state02 = __riscv_vluxei32_v_u32m1((uint32_t*)state + 0, stride, 8);
|
||||
vuint32m1_t state13 = __riscv_vluxei32_v_u32m1((uint32_t*)state + 4, stride, 8);
|
||||
|
||||
while (outptr < outputEnd) {
|
||||
state02 = aesdec_zvkned(state02, key02, zero);
|
||||
state13 = aesenc_zvkned(state13, key13);
|
||||
|
||||
__riscv_vsuxei32_v_u32m1((uint32_t*)outptr + 0, stride, state02, 8);
|
||||
__riscv_vsuxei32_v_u32m1((uint32_t*)outptr + 4, stride, state13, 8);
|
||||
|
||||
outptr += 64;
|
||||
}
|
||||
|
||||
__riscv_vsuxei32_v_u32m1((uint32_t*)state + 0, stride, state02, 8);
|
||||
__riscv_vsuxei32_v_u32m1((uint32_t*)state + 4, stride, state13, 8);
|
||||
}
|
||||
|
||||
// Hardware-AES (Zvkned) variant of fillAes4Rx4: fills `buffer` with
// `outputSize` bytes of program data, 4 AES rounds per 64-byte block, using
// the 8 program-derived keys from RandomX_CurrentConfig. Columns 0/2 run
// decryption rounds, columns 1/3 encryption rounds.
// The final state is intentionally not written back to `state` (unlike
// fillAes1Rx4_zvkned) — it is not needed after this call.
void fillAes4Rx4_zvkned(void *state, size_t outputSize, void *buffer)
{
	const uint8_t* outptr = (uint8_t*)buffer;
	const uint8_t* outputEnd = outptr + outputSize;

	// Index vector to gather key pairs (i, i+4) from the 8-key array.
	const vuint32m1_t stride4 = __riscv_vle32_v_u32m1(AES_HASH_STRIDE_X4, 8);

	const vuint32m1_t key04 = __riscv_vluxei32_v_u32m1((uint32_t*)(RandomX_CurrentConfig.fillAes4Rx4_Key + 0), stride4, 8);
	const vuint32m1_t key15 = __riscv_vluxei32_v_u32m1((uint32_t*)(RandomX_CurrentConfig.fillAes4Rx4_Key + 1), stride4, 8);
	const vuint32m1_t key26 = __riscv_vluxei32_v_u32m1((uint32_t*)(RandomX_CurrentConfig.fillAes4Rx4_Key + 2), stride4, 8);
	const vuint32m1_t key37 = __riscv_vluxei32_v_u32m1((uint32_t*)(RandomX_CurrentConfig.fillAes4Rx4_Key + 3), stride4, 8);

	// Index vector to gather/scatter two 16-byte state columns per register.
	const vuint32m1_t stride = __riscv_vle32_v_u32m1(AES_HASH_STRIDE_X2, 8);
	const vuint32m1_t zero = {};

	vuint32m1_t state02 = __riscv_vluxei32_v_u32m1((uint32_t*)state + 0, stride, 8);
	vuint32m1_t state13 = __riscv_vluxei32_v_u32m1((uint32_t*)state + 4, stride, 8);

	// 4 rounds per 64-byte output block.
	while (outptr < outputEnd) {
		state02 = aesdec_zvkned(state02, key04, zero);
		state13 = aesenc_zvkned(state13, key04);

		state02 = aesdec_zvkned(state02, key15, zero);
		state13 = aesenc_zvkned(state13, key15);

		state02 = aesdec_zvkned(state02, key26, zero);
		state13 = aesenc_zvkned(state13, key26);

		state02 = aesdec_zvkned(state02, key37, zero);
		state13 = aesenc_zvkned(state13, key37);

		__riscv_vsuxei32_v_u32m1((uint32_t*)outptr + 0, stride, state02, 8);
		__riscv_vsuxei32_v_u32m1((uint32_t*)outptr + 4, stride, state13, 8);

		outptr += 64;
	}
}
|
||||
|
||||
// Hardware-AES (Zvkned) combined scratchpad hash + refill. Walks the
// scratchpad once; for every 64-byte block it absorbs the current contents
// into the running hash state (enc on columns 0/2, dec on columns 1/3) and
// overwrites the block with one generator round of data from fill_state
// (dec on 0/2, enc on 1/3). The final generator state is written back to
// `fill_state` and the finished 64-byte hash to `hash`.
void hashAndFillAes1Rx4_zvkned(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state)
{
	uint8_t* scratchpadPtr = (uint8_t*)scratchpad;
	const uint8_t* scratchpadEnd = scratchpadPtr + scratchpadSize;

	// Initial hash state constants.
	vuint32m1_t hash_state02 = __riscv_vle32_v_u32m1(AES_HASH_1R_STATE02, 8);
	vuint32m1_t hash_state13 = __riscv_vle32_v_u32m1(AES_HASH_1R_STATE13, 8);

	// Per-round generator keys.
	const vuint32m1_t key02 = __riscv_vle32_v_u32m1(AES_GEN_1R_KEY02, 8);
	const vuint32m1_t key13 = __riscv_vle32_v_u32m1(AES_GEN_1R_KEY13, 8);

	// Index vector to gather/scatter two 16-byte columns per register.
	const vuint32m1_t stride = __riscv_vle32_v_u32m1(AES_HASH_STRIDE_X2, 8);
	const vuint32m1_t zero = {};

	// Resume the generator state left by the previous fill call.
	vuint32m1_t fill_state02 = __riscv_vluxei32_v_u32m1((uint32_t*)fill_state + 0, stride, 8);
	vuint32m1_t fill_state13 = __riscv_vluxei32_v_u32m1((uint32_t*)fill_state + 4, stride, 8);

	//process 64 bytes at a time in 4 lanes
	while (scratchpadPtr < scratchpadEnd) {
		// Hash the block before it is overwritten below.
		hash_state02 = aesenc_zvkned(hash_state02, __riscv_vluxei32_v_u32m1((uint32_t*)scratchpadPtr + 0, stride, 8));
		hash_state13 = aesdec_zvkned(hash_state13, __riscv_vluxei32_v_u32m1((uint32_t*)scratchpadPtr + 4, stride, 8), zero);

		fill_state02 = aesdec_zvkned(fill_state02, key02, zero);
		fill_state13 = aesenc_zvkned(fill_state13, key13);

		__riscv_vsuxei32_v_u32m1((uint32_t*)scratchpadPtr + 0, stride, fill_state02, 8);
		__riscv_vsuxei32_v_u32m1((uint32_t*)scratchpadPtr + 4, stride, fill_state13, 8);

		scratchpadPtr += 64;
	}

	// Persist the generator state for the next call.
	__riscv_vsuxei32_v_u32m1((uint32_t*)fill_state + 0, stride, fill_state02, 8);
	__riscv_vsuxei32_v_u32m1((uint32_t*)fill_state + 4, stride, fill_state13, 8);

	//two extra rounds to achieve full diffusion
	const vuint32m1_t xkey00 = __riscv_vle32_v_u32m1(AES_HASH_1R_XKEY00, 8);
	const vuint32m1_t xkey11 = __riscv_vle32_v_u32m1(AES_HASH_1R_XKEY11, 8);

	hash_state02 = aesenc_zvkned(hash_state02, xkey00);
	hash_state13 = aesdec_zvkned(hash_state13, xkey00, zero);

	hash_state02 = aesenc_zvkned(hash_state02, xkey11);
	hash_state13 = aesdec_zvkned(hash_state13, xkey11, zero);

	//output hash
	__riscv_vsuxei32_v_u32m1((uint32_t*)hash + 0, stride, hash_state02, 8);
	__riscv_vsuxei32_v_u32m1((uint32_t*)hash + 4, stride, hash_state13, 8);
}
|
||||
35
src/crypto/randomx/aes_hash_rv64_zvkned.hpp
Normal file
35
src/crypto/randomx/aes_hash_rv64_zvkned.hpp
Normal file
@@ -0,0 +1,35 @@
|
||||
/*
|
||||
Copyright (c) 2025 SChernykh <https://github.com/SChernykh>
|
||||
Copyright (c) 2025 XMRig <support@xmrig.com>
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the copyright holder nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once

// Public interface of the RISC-V Zvkned (vector AES extension) hardware
// implementations of the RandomX AES primitives. Requires a CPU with the
// Zvkned extension; the soft-AES fallbacks live in aes_hash_rv64_vector.hpp.

// Hash `inputSize` bytes of `input` into a 64-byte result at `hash`.
void hashAes1Rx4_zvkned(const void *input, size_t inputSize, void *hash);
// Fill `buffer` with `outputSize` bytes generated from the 64-byte `state`
// (one AES round per block); `state` is updated in place.
void fillAes1Rx4_zvkned(void *state, size_t outputSize, void *buffer);
// Like fillAes1Rx4_zvkned, but 4 AES rounds per block with program keys.
void fillAes4Rx4_zvkned(void *state, size_t outputSize, void *buffer);
// Single-pass scratchpad hash + refill; `fill_state` is updated in place.
void hashAndFillAes1Rx4_zvkned(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state);
|
||||
@@ -30,8 +30,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#include <cstring>
|
||||
#include <climits>
|
||||
#include <cassert>
|
||||
#include "backend/cpu/Cpu.h"
|
||||
#include "crypto/randomx/jit_compiler_rv64.hpp"
|
||||
#include "crypto/randomx/jit_compiler_rv64_static.hpp"
|
||||
#include "crypto/randomx/jit_compiler_rv64_vector.h"
|
||||
#include "crypto/randomx/jit_compiler_rv64_vector_static.h"
|
||||
#include "crypto/randomx/superscalar.hpp"
|
||||
#include "crypto/randomx/program.hpp"
|
||||
#include "crypto/randomx/reciprocal.h"
|
||||
@@ -618,23 +621,49 @@ namespace randomx {
|
||||
entryProgram = state.code + LiteralPoolSize + sizeDataInit;
|
||||
//jal x1, SuperscalarHash
|
||||
emitJump(state, ReturnReg, LiteralPoolSize + offsetFixDataCall, SuperScalarHashOffset);
|
||||
|
||||
if (xmrig::Cpu::info()->hasRISCV_Vector()) {
|
||||
vectorCodeSize = ((uint8_t*)randomx_riscv64_vector_code_end) - ((uint8_t*)randomx_riscv64_vector_code_begin);
|
||||
vectorCode = static_cast<uint8_t*>(allocExecutableMemory(vectorCodeSize, hugePagesJIT && hugePagesEnable));
|
||||
|
||||
if (vectorCode) {
|
||||
memcpy(vectorCode, reinterpret_cast<uint8_t*>(randomx_riscv64_vector_code_begin), vectorCodeSize);
|
||||
entryProgramVector = vectorCode + (((uint8_t*)randomx_riscv64_vector_program_begin) - ((uint8_t*)randomx_riscv64_vector_code_begin));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
	// Releases both executable code buffers: the main scalar JIT buffer and,
	// when it was allocated (CPU supports the vector extension), the separate
	// RVV code buffer.
	JitCompilerRV64::~JitCompilerRV64() {
		freePagedMemory(state.code, CodeSize);
		if (vectorCode) {
			freePagedMemory(vectorCode, vectorCodeSize);
		}
	}
|
||||
|
||||
	// Switches the JIT code pages to read+write so new programs can be
	// generated (counterpart of enableExecution, for W^X platforms).
	void JitCompilerRV64::enableWriting() const
	{
		xmrig::VirtualMemory::protectRW(entryDataInit, ExecutableSize);

		// The RVV code lives in its own allocation and must be toggled too.
		if (vectorCode) {
			xmrig::VirtualMemory::protectRW(vectorCode, vectorCodeSize);
		}
	}
|
||||
|
||||
	// Switches the JIT code pages back to read+execute after generation
	// (counterpart of enableWriting, for W^X platforms).
	void JitCompilerRV64::enableExecution() const
	{
		xmrig::VirtualMemory::protectRX(entryDataInit, ExecutableSize);

		// The RVV code lives in its own allocation and must be toggled too.
		if (vectorCode) {
			xmrig::VirtualMemory::protectRX(vectorCode, vectorCodeSize);
		}
	}
|
||||
|
||||
void JitCompilerRV64::generateProgram(Program& prog, ProgramConfiguration& pcfg, uint32_t) {
|
||||
if (vectorCode) {
|
||||
generateProgramVectorRV64(vectorCode, prog, pcfg, inst_map, nullptr, 0);
|
||||
return;
|
||||
}
|
||||
|
||||
emitProgramPrefix(state, prog, pcfg);
|
||||
int32_t fixPos = state.codePos;
|
||||
state.emit(codeDataRead, sizeDataRead);
|
||||
@@ -645,6 +674,11 @@ namespace randomx {
|
||||
}
|
||||
|
||||
void JitCompilerRV64::generateProgramLight(Program& prog, ProgramConfiguration& pcfg, uint32_t datasetOffset) {
|
||||
if (vectorCode) {
|
||||
generateProgramVectorRV64(vectorCode, prog, pcfg, inst_map, entryDataInit, datasetOffset);
|
||||
return;
|
||||
}
|
||||
|
||||
emitProgramPrefix(state, prog, pcfg);
|
||||
int32_t fixPos = state.codePos;
|
||||
state.emit(codeDataReadLight, sizeDataReadLight);
|
||||
@@ -666,6 +700,11 @@ namespace randomx {
|
||||
|
||||
template<size_t N>
|
||||
void JitCompilerRV64::generateSuperscalarHash(SuperscalarProgram(&programs)[N]) {
|
||||
if (vectorCode) {
|
||||
entryDataInitVector = generateDatasetInitVectorRV64(vectorCode, programs, RandomX_ConfigurationBase::CacheAccesses);
|
||||
// No return here because we also need the scalar dataset init function for the light mode
|
||||
}
|
||||
|
||||
state.codePos = SuperScalarHashOffset;
|
||||
state.rcpCount = 0;
|
||||
state.emit(codeSshInit, sizeSshInit);
|
||||
@@ -1160,5 +1199,6 @@ namespace randomx {
|
||||
void JitCompilerRV64::v1_NOP(HANDLER_ARGS) {
|
||||
}
|
||||
|
||||
InstructionGeneratorRV64 JitCompilerRV64::engine[256] = {};
|
||||
alignas(64) InstructionGeneratorRV64 JitCompilerRV64::engine[256] = {};
|
||||
alignas(64) uint8_t JitCompilerRV64::inst_map[256] = {};
|
||||
}
|
||||
|
||||
@@ -90,10 +90,10 @@ namespace randomx {
|
||||
void generateDatasetInitCode() {}
|
||||
|
||||
ProgramFunc* getProgramFunc() {
|
||||
return (ProgramFunc*)entryProgram;
|
||||
return (ProgramFunc*)(vectorCode ? entryProgramVector : entryProgram);
|
||||
}
|
||||
DatasetInitFunc* getDatasetInitFunc() {
|
||||
return (DatasetInitFunc*)entryDataInit;
|
||||
return (DatasetInitFunc*)(vectorCode ? entryDataInitVector : entryDataInit);
|
||||
}
|
||||
uint8_t* getCode() {
|
||||
return state.code;
|
||||
@@ -104,10 +104,17 @@ namespace randomx {
|
||||
void enableExecution() const;
|
||||
|
||||
static InstructionGeneratorRV64 engine[256];
|
||||
static uint8_t inst_map[256];
|
||||
private:
|
||||
CompilerState state;
|
||||
void* entryDataInit;
|
||||
void* entryProgram;
|
||||
|
||||
uint8_t* vectorCode = nullptr;
|
||||
size_t vectorCodeSize = 0;
|
||||
|
||||
void* entryDataInit = nullptr;
|
||||
void* entryDataInitVector = nullptr;
|
||||
void* entryProgram = nullptr;
|
||||
void* entryProgramVector = nullptr;
|
||||
|
||||
public:
|
||||
static void v1_IADD_RS(HANDLER_ARGS);
|
||||
|
||||
893
src/crypto/randomx/jit_compiler_rv64_vector.cpp
Normal file
893
src/crypto/randomx/jit_compiler_rv64_vector.cpp
Normal file
@@ -0,0 +1,893 @@
|
||||
/*
|
||||
Copyright (c) 2018-2020, tevador <tevador@gmail.com>
|
||||
Copyright (c) 2019-2021, XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
||||
Copyright (c) 2025, SChernykh <https://github.com/SChernykh>
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the copyright holder nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "crypto/randomx/configuration.h"
|
||||
#include "crypto/randomx/jit_compiler_rv64_vector.h"
|
||||
#include "crypto/randomx/jit_compiler_rv64_vector_static.h"
|
||||
#include "crypto/randomx/reciprocal.h"
|
||||
#include "crypto/randomx/superscalar.hpp"
|
||||
#include "crypto/randomx/program.hpp"
|
||||
|
||||
namespace randomx {
|
||||
|
||||
#define ADDR(x) ((uint8_t*) &(x))
|
||||
#define DIST(x, y) (ADDR(y) - ADDR(x))
|
||||
|
||||
// Patches the pre-assembled vectorized dataset-init code in "buf" with
// JIT-compiled SuperscalarHash instructions.
//
// buf          - writable copy of the code starting at randomx_riscv64_vector_code_begin
// programs     - array of generated SuperscalarHash programs
// num_programs - number of programs in the array
//
// Integer registers r0-r7 of the SuperscalarHash VM are mapped to vector
// registers v0-v7 (one dataset item per vector lane). Returns a pointer to
// the patched dataset-init entry point inside "buf".
void* generateDatasetInitVectorRV64(uint8_t* buf, SuperscalarProgram* programs, size_t num_programs)
{
    // Emission cursor, starting at the placeholder for generated instructions
    uint8_t* p = buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_sshash_generated_instructions);

    // Literal pool for IMUL_RCP reciprocals (8 bytes each, addressed via x15)
    uint8_t* literals = buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_sshash_imul_rcp_literals);
    uint8_t* cur_literal = literals;

    for (size_t i = 0; i < num_programs; ++i) {
        // Step 4
        size_t k = DIST(randomx_riscv64_vector_sshash_cache_prefetch, randomx_riscv64_vector_sshash_xor);
        memcpy(p, reinterpret_cast<void*>(randomx_riscv64_vector_sshash_cache_prefetch), k);
        p += k;

        // Step 5
        for (uint32_t j = 0; j < programs[i].size; ++j) {
            const uint32_t dst = programs[i].programBuffer[j].dst & 7;
            const uint32_t src = programs[i].programBuffer[j].src & 7;
            const uint32_t modShift = (programs[i].programBuffer[j].mod >> 2) & 3;
            const uint32_t imm32 = programs[i].programBuffer[j].imm32;

            uint32_t inst;
// Emit one 4-byte instruction word (memcpy avoids unaligned-store UB)
#define EMIT(data) inst = (data); memcpy(p, &inst, 4); p += 4

            switch (static_cast<SuperscalarInstructionType>(programs[i].programBuffer[j].opcode)) {
            case SuperscalarInstructionType::ISUB_R:
                // 57 00 00 0A    vsub.vv v0, v0, v0
                EMIT(0x0A000057 | (dst << 7) | (src << 15) | (dst << 20));
                break;

            case SuperscalarInstructionType::IXOR_R:
                // 57 00 00 2E    vxor.vv v0, v0, v0
                EMIT(0x2E000057 | (dst << 7) | (src << 15) | (dst << 20));
                break;

            case SuperscalarInstructionType::IADD_RS:
                if (modShift == 0) {
                    // 57 00 00 02    vadd.vv v0, v0, v0
                    EMIT(0x02000057 | (dst << 7) | (src << 15) | (dst << 20));
                }
                else {
                    // v18 is used as a scratch register for the shifted source
                    // 57 39 00 96    vsll.vi v18, v0, 0
                    // 57 00 09 02    vadd.vv v0, v0, v18
                    EMIT(0x96003957 | (modShift << 15) | (src << 20));
                    EMIT(0x02090057 | (dst << 7) | (dst << 20));
                }
                break;

            case SuperscalarInstructionType::IMUL_R:
                // 57 20 00 96    vmul.vv v0, v0, v0
                EMIT(0x96002057 | (dst << 7) | (src << 15) | (dst << 20));
                break;

            case SuperscalarInstructionType::IROR_C:
                {
#ifdef __riscv_zvkb
                    // Zvkb provides a native vector rotate
                    // 57 30 00 52    vror.vi v0, v0, 0
                    // (imm32 & 32) << 21 places imm[5] into instruction bit 26
                    EMIT(0x52003057 | (dst << 7) | (dst << 20) | ((imm32 & 31) << 15) | ((imm32 & 32) << 21));
#else // __riscv_zvkb
                    // No rotate instruction: synthesize it as (x >> r) | (x << (64 - r)).
                    // Shift amounts >= 32 don't fit the 5-bit .vi form and go through x5.
                    const uint32_t shift_right = imm32 & 63;
                    const uint32_t shift_left = 64 - shift_right;

                    if (shift_right < 32) {
                        // 57 39 00 A2    vsrl.vi v18, v0, 0
                        EMIT(0xA2003957 | (shift_right << 15) | (dst << 20));
                    }
                    else {
                        // 93 02 00 00    li x5, 0
                        // 57 C9 02 A2    vsrl.vx v18, v0, x5
                        EMIT(0x00000293 | (shift_right << 20));
                        EMIT(0xA202C957 | (dst << 20));
                    }

                    if (shift_left < 32) {
                        // 57 30 00 96    vsll.vi v0, v0, 0
                        EMIT(0x96003057 | (dst << 7) | (shift_left << 15) | (dst << 20));
                    }
                    else {
                        // 93 02 00 00    li x5, 0
                        // 57 C0 02 96    vsll.vx v0, v0, x5
                        EMIT(0x00000293 | (shift_left << 20));
                        EMIT(0x9602C057 | (dst << 7) | (dst << 20));
                    }

                    // 57 00 20 2B    vor.vv v0, v18, v0
                    EMIT(0x2B200057 | (dst << 7) | (dst << 15));
#endif // __riscv_zvkb
                }
                break;

            case SuperscalarInstructionType::IADD_C7:
            case SuperscalarInstructionType::IADD_C8:
            case SuperscalarInstructionType::IADD_C9:
                // Materialize the sign-extended 32-bit constant in x5, then add it.
                // The lui value is pre-compensated for addiw's sign extension of bit 11.
                // B7 02 00 00    lui x5, 0
                // 9B 82 02 00    addiw x5, x5, 0
                // 57 C0 02 02    vadd.vx v0, v0, x5
                EMIT(0x000002B7 | ((imm32 + ((imm32 & 0x800) << 1)) & 0xFFFFF000));
                EMIT(0x0002829B | ((imm32 & 0x00000FFF) << 20));
                EMIT(0x0202C057 | (dst << 7) | (dst << 20));
                break;

            case SuperscalarInstructionType::IXOR_C7:
            case SuperscalarInstructionType::IXOR_C8:
            case SuperscalarInstructionType::IXOR_C9:
                // B7 02 00 00    lui x5, 0
                // 9B 82 02 00    addiw x5, x5, 0
                // 57 C0 02 2E    vxor.vx v0, v0, x5
                EMIT(0x000002B7 | ((imm32 + ((imm32 & 0x800) << 1)) & 0xFFFFF000));
                EMIT(0x0002829B | ((imm32 & 0x00000FFF) << 20));
                EMIT(0x2E02C057 | (dst << 7) | (dst << 20));
                break;

            case SuperscalarInstructionType::IMULH_R:
                // 57 20 00 92    vmulhu.vv v0, v0, v0
                EMIT(0x92002057 | (dst << 7) | (src << 15) | (dst << 20));
                break;

            case SuperscalarInstructionType::ISMULH_R:
                // 57 20 00 9E    vmulh.vv v0, v0, v0
                EMIT(0x9E002057 | (dst << 7) | (src << 15) | (dst << 20));
                break;

            case SuperscalarInstructionType::IMUL_RCP:
                {
                    // Offset of the next literal from the current x15 base
                    uint32_t offset = cur_literal - literals;

                    // ld's 12-bit immediate tops out below 2048, so advance the
                    // base register once the pool window is exhausted
                    if (offset == 2040) {
                        literals += 2040;
                        offset = 0;

                        // 93 87 87 7F    add x15, x15, 2040
                        EMIT(0x7F878793);
                    }

                    const uint64_t r = randomx_reciprocal_fast(imm32);
                    memcpy(cur_literal, &r, 8);
                    cur_literal += 8;

                    // 83 B2 07 00    ld x5, (x15)
                    // 57 E0 02 96    vmul.vx v0, v0, x5
                    EMIT(0x0007B283 | (offset << 20));
                    EMIT(0x9602E057 | (dst << 7) | (dst << 20));
                }
                break;

            default:
                UNREACHABLE;
            }
        }

        // Step 6
        k = DIST(randomx_riscv64_vector_sshash_xor, randomx_riscv64_vector_sshash_end);
        memcpy(p, reinterpret_cast<void*>(randomx_riscv64_vector_sshash_xor), k);
        p += k;

        // Step 7. Set cacheIndex to the value of the register that has the longest dependency chain in the SuperscalarHash function executed in step 5.
        if (i + 1 < num_programs) {
            // vmv.v.v v9, v0 + programs[i].getAddressRegister()
            const uint32_t t = 0x5E0004D7 + (static_cast<uint32_t>(programs[i].getAddressRegister()) << 15);
            memcpy(p, &t, 4);
            p += 4;
        }
    }

    // Emit "J randomx_riscv64_vector_sshash_generated_instructions_end" instruction
    // (JAL x0 with the offset bits scattered as imm[20|10:1|11|19:12])
    const uint8_t* e = buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_sshash_generated_instructions_end);
    const uint32_t k = e - p;
    const uint32_t j = 0x6F | ((k & 0x7FE) << 20) | ((k & 0x800) << 9) | (k & 0xFF000);
    memcpy(p, &j, 4);

    char* result = (char*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_sshash_dataset_init));

#ifdef __GNUC__
    // Flush the instruction cache for the freshly written code.
    // NOTE(review): the end pointer is based on sshash_begin while every other
    // offset in this function is based on code_begin - correct only if the two
    // labels coincide; verify against jit_compiler_rv64_vector_static.S.
    __builtin___clear_cache(result, (char*)(buf + DIST(randomx_riscv64_vector_sshash_begin, randomx_riscv64_vector_sshash_end)));
#endif

    return result;
}
|
||||
|
||||
// Append raw little-endian data at the emission cursor "p" (captured from the
// enclosing scope) and advance it. memcpy avoids unaligned-store UB.
#define emit16(value) { const uint16_t t = value; memcpy(p, &t, 2); p += 2; }
#define emit32(value) { const uint32_t t = value; memcpy(p, &t, 4); p += 4; }
#define emit64(value) { const uint64_t t = value; memcpy(p, &t, 8); p += 8; }
#define emit_data(arr) { memcpy(p, arr, sizeof(arr)); p += sizeof(arr); }
|
||||
|
||||
// Emits the shortest instruction sequence that loads the sign-extended 32-bit
// immediate "imm" into register x5, advancing the emission cursor "p".
static void imm_to_x5(uint32_t imm, uint8_t*& p)
{
    // addiw sign-extends its 12-bit immediate, so pre-compensate the upper
    // 20 bits when bit 11 of the immediate is set
    const uint32_t imm_hi = (imm + ((imm & 0x800) << 1)) & 0xFFFFF000U;
    const uint32_t imm_lo = imm & 0x00000FFFU;

    if (imm_hi == 0) {
        // li x5, imm_lo (single addi from x0)
        emit32(0x00000293 + (imm_lo << 20));
        return;
    }

    if (imm_lo == 0) {
        // lui x5, imm_hi
        emit32(0x000002B7 + imm_hi);
        return;
    }

    if (imm_hi < (32 << 12)) {
        // c.lui x5, imm_hi (compressed form: non-zero imm fits in nzimm[17:12])
        emit16(0x6281 + (imm_hi >> 10));
    }
    else {
        // lui x5, imm_hi
        emit32(0x000002B7 + imm_hi);
    }

    // addiw x5, x5, imm_lo
    emit32(0x0002829B | (imm_lo << 20));
}
|
||||
|
||||
// Emits code that loads a 64-bit scratchpad value into x5, advancing the
// emission cursor "p".
//
// src, dst, mod, imm come from the RandomX instruction being compiled.
// Register conventions used by the emitted code (set up by the static part):
//   x12      - scratchpad base pointer
//   x16/x17  - L1/L2 address masks
//   x20+reg  - RandomX integer registers r0-r7
// When src == dst, RandomX uses the immediate (masked to L3) as an absolute
// scratchpad offset instead of an address computed from the source register.
static void loadFromScratchpad(uint32_t src, uint32_t dst, uint32_t mod, uint32_t imm, uint8_t*& p)
{
    if (src == dst) {
        imm &= RandomX_CurrentConfig.ScratchpadL3Mask_Calculated;

        // Pick the shortest addressing sequence for the masked offset
        if (imm <= 2047) {
            // ld x5, imm(x12)
            emit32(0x00063283 | (imm << 20));
        }
        else if (imm <= 2047 * 2) {
            // addi x5, x12, 2047
            emit32(0x7FF60293);
            // ld x5, (imm - 2047)(x5)
            emit32(0x0002B283 | ((imm - 2047) << 20));
        }
        else {
            // lui x5, imm & 0xFFFFF000U (pre-compensated for ld's sign-extended imm)
            emit32(0x000002B7 | ((imm + ((imm & 0x800) << 1)) & 0xFFFFF000U));
            // c.add x5, x12
            emit16(0x92B2);
            // ld x5, (imm & 0xFFF)(x5)
            emit32(0x0002B283 | ((imm & 0xFFF) << 20));
        }

        return;
    }

    uint32_t shift = 32;
    uint32_t mask_reg;

    // mod selects the scratchpad level: (mod % 4) == 0 -> L2, otherwise L1
    if ((mod & 3) == 0) {
        shift -= RandomX_CurrentConfig.Log2_ScratchpadL2;
        mask_reg = 17; // x17 = L2 mask
    }
    else {
        shift -= RandomX_CurrentConfig.Log2_ScratchpadL1;
        mask_reg = 16; // x16 = L1 mask
    }

    // Sign-extend the immediate to the selected level's address width
    imm = static_cast<uint32_t>(static_cast<int32_t>(imm << shift) >> shift);

    // 0-0x7FF, 0xFFFFF800-0xFFFFFFFF fit into 12 bit (a single addi instruction)
    if (imm - 0xFFFFF800U < 0x1000U) {
        // addi x5, x20 + src, imm
        emit32(0x000A0293 + (src << 15) + (imm << 20));
    }
    else {
        imm_to_x5(imm, p);
        // c.add x5, x20 + src
        emit16(0x92D2 + (src << 2));
    }

    // and x5, x5, mask_reg
    emit32(0x0002F2B3 + (mask_reg << 20));
    // c.add x5, x12
    emit16(0x92B2);
    // ld x5, 0(x5)
    emit32(0x0002B283);
}
|
||||
|
||||
// Patches the pre-assembled vectorized RandomX program loop in "buf" with
// JIT-compiled code for program "prog".
//
// buf                 - writable copy of the code starting at randomx_riscv64_vector_code_begin
// prog                - the RandomX program to compile
// pcfg                - per-program configuration (readReg0-3 register indices)
// inst_map            - opcode byte -> InstructionType lookup table
// entryDataInitScalar - light-mode dataset item entry point (nullptr in fast mode)
// datasetOffset       - dataset offset used in light mode
//
// Register conventions of the emitted code (established by the static part):
//   x5/x6    - scratch, x7 - jump-condition base, x12 - scratchpad base,
//   x15      - next scratchpad address, x16/x17 - L1/L2 masks,
//   x18      - IMUL_RCP literal pool base, x20+reg - integer registers r0-r7,
//   v0-v3    - group F, v4-v7 - group E, v8+ - group A / scratch vectors.
// Returns a pointer to the patched program entry point inside "buf".
void* generateProgramVectorRV64(uint8_t* buf, Program& prog, ProgramConfiguration& pcfg, const uint8_t (&inst_map)[256], void* entryDataInitScalar, uint32_t datasetOffset)
{
    // Runtime parameters read by the static assembly code
    uint64_t* params = (uint64_t*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_params));

    params[0] = RandomX_CurrentConfig.ScratchpadL1_Size - 8;
    params[1] = RandomX_CurrentConfig.ScratchpadL2_Size - 8;
    params[2] = RandomX_CurrentConfig.ScratchpadL3_Size - 8;
    params[3] = RandomX_CurrentConfig.DatasetBaseSize - 64;
    params[4] = (1 << RandomX_ConfigurationBase::JumpBits) - 1;

    // Literal pool for IMUL_RCP reciprocals (addressed via x18)
    uint64_t* imul_rcp_literals = (uint64_t*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_imul_rcp_literals));
    uint64_t* cur_literal = imul_rcp_literals;

    // Patch points for the XOR instructions that combine readReg0-3
    uint32_t* spaddr_xor = (uint32_t*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_main_loop_spaddr_xor));
    uint32_t* spaddr_xor2 = (uint32_t*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_scratchpad_prefetch));
    uint32_t* mx_xor = (uint32_t*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_main_loop_mx_xor));
    uint32_t* mx_xor_light = (uint32_t*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_main_loop_mx_xor_light_mode));

    *spaddr_xor = 0x014A47B3 + (pcfg.readReg0 << 15) + (pcfg.readReg1 << 20); // xor x15, readReg0, readReg1
    *spaddr_xor2 = 0x014A42B3 + (pcfg.readReg0 << 15) + (pcfg.readReg1 << 20); // xor x5, readReg0, readReg1
    const uint32_t mx_xor_value = 0x014A42B3 + (pcfg.readReg2 << 15) + (pcfg.readReg3 << 20); // xor x5, readReg2, readReg3

    *mx_xor = mx_xor_value;
    *mx_xor_light = mx_xor_value;

    if (entryDataInitScalar) {
        // Light mode: store the scalar dataset-init entry point and the
        // dataset offset where the static code expects them
        void* light_mode_data = buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_main_loop_light_mode_data);

        const uint64_t data[2] = { reinterpret_cast<uint64_t>(entryDataInitScalar), datasetOffset };
        memcpy(light_mode_data, &data, sizeof(data));
    }

    // Emission cursor for the main-loop instruction placeholder
    uint8_t* p = (uint8_t*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_main_loop_instructions));

    // Converts the scalar in x5 into a group-F style floating point vector:
    // 57C8025E    vmv.v.x v16, x5
    // 57A9034B    vsext.vf2 v18, v16
    // 5798214B    vfcvt.f.x.v v16, v18
    static constexpr uint8_t group_f_convert[] = {
        0x57, 0xC8, 0x02, 0x5E, 0x57, 0xA9, 0x03, 0x4B, 0x57, 0x98, 0x21, 0x4B
    };

    // Applies the group-E mask/exponent (kept in v12/v13) to v16:
    // 57080627    vand.vv v16, v16, v12
    // 5788062B    vor.vv v16, v16, v13
    static constexpr uint8_t group_e_post_process[] = { 0x57, 0x08, 0x06, 0x27, 0x57, 0x88, 0x06, 0x2B };

    // Code position right after the last write of each integer register
    // (CBRANCH back-edges jump to these)
    uint8_t* last_modified[RegistersCount] = { p, p, p, p, p, p, p, p };

    // Marks the registers that feed the next scratchpad address
    uint8_t readReg01[RegistersCount] = {};

    readReg01[pcfg.readReg0] = 1;
    readReg01[pcfg.readReg1] = 1;

    uint32_t scratchpad_prefetch_pos = 0;

    // Scan backwards for the last instruction that can still change
    // readReg0 ^ readReg1 - the scratchpad prefetch is inserted after it
    for (int32_t i = static_cast<int32_t>(prog.getSize()) - 1; i >= 0; --i) {
        Instruction instr = prog(i);

        const InstructionType inst_type = static_cast<InstructionType>(inst_map[instr.opcode]);

        if (inst_type == InstructionType::CBRANCH) {
            // A branch can re-run earlier code, so nothing after it is safe
            scratchpad_prefetch_pos = i;
            break;
        }

        if (inst_type < InstructionType::FSWAP_R) {
            // Integer instruction: check whether it writes readReg0/readReg1
            const uint32_t src = instr.src % RegistersCount;
            const uint32_t dst = instr.dst % RegistersCount;

            if ((inst_type == InstructionType::ISWAP_R) && (src != dst) && (readReg01[src] || readReg01[dst])) {
                scratchpad_prefetch_pos = i;
                break;
            }

            if ((inst_type == InstructionType::IMUL_RCP) && readReg01[dst] && !isZeroOrPowerOf2(instr.getImm32())) {
                scratchpad_prefetch_pos = i;
                break;
            }

            if (readReg01[dst]) {
                scratchpad_prefetch_pos = i;
                break;
            }
        }
    }

    for (uint32_t i = 0, n = prog.getSize(); i < n; ++i) {
        Instruction instr = prog(i);

        uint32_t src = instr.src % RegistersCount;
        uint32_t dst = instr.dst % RegistersCount;
        const uint32_t shift = instr.getModShift();
        uint32_t imm = instr.getImm32();
        const uint32_t mod = instr.mod;

        switch (static_cast<InstructionType>(inst_map[instr.opcode])) {
        case InstructionType::IADD_RS:
            if (shift == 0) {
                // c.add x20 + dst, x20 + src
                emit16(0x9A52 + (src << 2) + (dst << 7));
            }
            else {
#ifdef __riscv_zba
                // sh{shift}add x20 + dst, x20 + src, x20 + dst
                emit32(0x214A0A33 + (shift << 13) + (dst << 7) + (src << 15) + (dst << 20));
#else // __riscv_zba
                // slli x5, x20 + src, shift
                emit32(0x000A1293 + (src << 15) + (shift << 20));
                // c.add x20 + dst, x5
                emit16(0x9A16 + (dst << 7));
#endif // __riscv_zba
            }
            // Only register r5 gets the immediate displacement (RandomX IADD_RS rule)
            if (dst == RegisterNeedsDisplacement) {
                imm_to_x5(imm, p);

                // c.add x20 + dst, x5
                emit16(0x9A16 + (dst << 7));
            }

            last_modified[dst] = p;
            break;

        case InstructionType::IADD_M:
            loadFromScratchpad(src, dst, mod, imm, p);
            // c.add x20 + dst, x5
            emit16(0x9A16 + (dst << 7));

            last_modified[dst] = p;
            break;

        case InstructionType::ISUB_R:
            if (src != dst) {
                // sub x20 + dst, x20 + dst, x20 + src
                emit32(0x414A0A33 + (dst << 7) + (dst << 15) + (src << 20));
            }
            else {
                // src == dst: subtract the immediate instead (add its negation)
                imm_to_x5(-imm, p);
                // c.add x20 + dst, x5
                emit16(0x9A16 + (dst << 7));
            }

            last_modified[dst] = p;
            break;

        case InstructionType::ISUB_M:
            loadFromScratchpad(src, dst, mod, imm, p);
            // sub x20 + dst, x20 + dst, x5
            emit32(0x405A0A33 + (dst << 7) + (dst << 15));

            last_modified[dst] = p;
            break;

        case InstructionType::IMUL_R:
            if (src != dst) {
                // mul x20 + dst, x20 + dst, x20 + src
                emit32(0x034A0A33 + (dst << 7) + (dst << 15) + (src << 20));
            }
            else {
                // src == dst: multiply by the immediate
                imm_to_x5(imm, p);
                // mul x20 + dst, x20 + dst, x5
                emit32(0x025A0A33 + (dst << 7) + (dst << 15));
            }

            last_modified[dst] = p;
            break;

        case InstructionType::IMUL_M:
            loadFromScratchpad(src, dst, mod, imm, p);
            // mul x20 + dst, x20 + dst, x5
            emit32(0x025A0A33 + (dst << 7) + (dst << 15));

            last_modified[dst] = p;
            break;

        case InstructionType::IMULH_R:
            // mulhu x20 + dst, x20 + dst, x20 + src
            emit32(0x034A3A33 + (dst << 7) + (dst << 15) + (src << 20));

            last_modified[dst] = p;
            break;

        case InstructionType::IMULH_M:
            loadFromScratchpad(src, dst, mod, imm, p);
            // mulhu x20 + dst, x20 + dst, x5
            emit32(0x025A3A33 + (dst << 7) + (dst << 15));

            last_modified[dst] = p;
            break;

        case InstructionType::ISMULH_R:
            // mulh x20 + dst, x20 + dst, x20 + src
            emit32(0x034A1A33 + (dst << 7) + (dst << 15) + (src << 20));

            last_modified[dst] = p;
            break;

        case InstructionType::ISMULH_M:
            loadFromScratchpad(src, dst, mod, imm, p);
            // mulh x20 + dst, x20 + dst, x5
            emit32(0x025A1A33 + (dst << 7) + (dst << 15));

            last_modified[dst] = p;
            break;

        case InstructionType::IMUL_RCP:
            // Multiplying by the reciprocal of 0 or a power of 2 is a no-op in RandomX
            if (!isZeroOrPowerOf2(imm)) {
                const uint64_t offset = (cur_literal - imul_rcp_literals) * 8;
                *(cur_literal++) = randomx_reciprocal_fast(imm);

                // Registers that presumably hold the first 26 literals (preloaded
                // by the static code - verify against the .S file): 6 integer
                // registers, then 20 FP registers accessed via fmv.x.d
                static constexpr uint32_t rcp_regs[26] = {
                    /* Integer */ 8, 10, 28, 29, 30, 31,
                    /* Float */ 0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15, 16, 17, 28, 29, 30, 31
                };

                if (offset < 6 * 8) {
                    // mul x20 + dst, x20 + dst, rcp_reg
                    emit32(0x020A0A33 + (dst << 7) + (dst << 15) + (rcp_regs[offset / 8] << 20));
                }
                else if (offset < 26 * 8) {
                    // fmv.x.d x5, rcp_reg
                    emit32(0xE20002D3 + (rcp_regs[offset / 8] << 15));
                    // mul x20 + dst, x20 + dst, x5
                    emit32(0x025A0A33 + (dst << 7) + (dst << 15));
                }
                else {
                    // Remaining literals are loaded from the pool
                    // ld x5, offset(x18)
                    emit32(0x00093283 + (offset << 20));
                    // mul x20 + dst, x20 + dst, x5
                    emit32(0x025A0A33 + (dst << 7) + (dst << 15));
                }

                last_modified[dst] = p;
            }
            break;

        case InstructionType::INEG_R:
            // sub x20 + dst, x0, x20 + dst
            emit32(0x41400A33 + (dst << 7) + (dst << 20));

            last_modified[dst] = p;
            break;

        case InstructionType::IXOR_R:
            if (src != dst) {
                // xor x20 + dst, x20 + dst, x20 + src
                emit32(0x014A4A33 + (dst << 7) + (dst << 15) + (src << 20));
            }
            else {
                // src == dst: XOR with the immediate
                imm_to_x5(imm, p);
                // xor x20 + dst, x20 + dst, x5
                emit32(0x005A4A33 + (dst << 7) + (dst << 15));
            }

            last_modified[dst] = p;
            break;

        case InstructionType::IXOR_M:
            loadFromScratchpad(src, dst, mod, imm, p);
            // xor x20 + dst, x20 + dst, x5
            emit32(0x005A4A33 + (dst << 7) + (dst << 15));

            last_modified[dst] = p;
            break;

#ifdef __riscv_zbb
        case InstructionType::IROR_R:
            if (src != dst) {
                // ror x20 + dst, x20 + dst, x20 + src
                emit32(0x614A5A33 + (dst << 7) + (dst << 15) + (src << 20));
            }
            else {
                // rori x20 + dst, x20 + dst, imm
                emit32(0x600A5A13 + (dst << 7) + (dst << 15) + ((imm & 63) << 20));
            }

            last_modified[dst] = p;
            break;

        case InstructionType::IROL_R:
            if (src != dst) {
                // rol x20 + dst, x20 + dst, x20 + src
                emit32(0x614A1A33 + (dst << 7) + (dst << 15) + (src << 20));
            }
            else {
                // Rotate left by imm == rotate right by -imm
                // rori x20 + dst, x20 + dst, -imm
                emit32(0x600A5A13 + (dst << 7) + (dst << 15) + ((-imm & 63) << 20));
            }

            last_modified[dst] = p;
            break;
#else // __riscv_zbb
        // Without Zbb, rotations are synthesized from two shifts and an OR
        case InstructionType::IROR_R:
            if (src != dst) {
                // sub x5, x0, x20 + src
                emit32(0x414002B3 + (src << 20));
                // srl x6, x20 + dst, x20 + src
                emit32(0x014A5333 + (dst << 15) + (src << 20));
                // sll x20 + dst, x20 + dst, x5
                emit32(0x005A1A33 + (dst << 7) + (dst << 15));
                // or x20 + dst, x20 + dst, x6
                emit32(0x006A6A33 + (dst << 7) + (dst << 15));
            }
            else {
                // srli x5, x20 + dst, imm
                emit32(0x000A5293 + (dst << 15) + ((imm & 63) << 20));
                // slli x6, x20 + dst, -imm
                emit32(0x000A1313 + (dst << 15) + ((-imm & 63) << 20));
                // or x20 + dst, x5, x6
                emit32(0x0062EA33 + (dst << 7));
            }

            last_modified[dst] = p;
            break;

        case InstructionType::IROL_R:
            if (src != dst) {
                // sub x5, x0, x20 + src
                emit32(0x414002B3 + (src << 20));
                // sll x6, x20 + dst, x20 + src
                emit32(0x014A1333 + (dst << 15) + (src << 20));
                // srl x20 + dst, x20 + dst, x5
                emit32(0x005A5A33 + (dst << 7) + (dst << 15));
                // or x20 + dst, x20 + dst, x6
                emit32(0x006A6A33 + (dst << 7) + (dst << 15));
            }
            else {
                // srli x5, x20 + dst, -imm
                emit32(0x000A5293 + (dst << 15) + ((-imm & 63) << 20));
                // slli x6, x20 + dst, imm
                emit32(0x000A1313 + (dst << 15) + ((imm & 63) << 20));
                // or x20 + dst, x5, x6
                emit32(0x0062EA33 + (dst << 7));
            }

            last_modified[dst] = p;
            break;
#endif // __riscv_zbb

        case InstructionType::ISWAP_R:
            // Swap via x5; a swap with itself is a no-op and emits nothing
            if (src != dst) {
                // c.mv x5, x20 + dst
                emit16(0x82D2 + (dst << 2));
                // c.mv x20 + dst, x20 + src
                emit16(0x8A52 + (src << 2) + (dst << 7));
                // c.mv x20 + src, x5
                emit16(0x8A16 + (src << 7));

                last_modified[src] = p;
                last_modified[dst] = p;
            }
            break;

        case InstructionType::FSWAP_R:
            // Rotate the vector register by one element (swaps lo/hi halves)
            // vmv.x.s x5, v0 + dst
            emit32(0x420022D7 + (dst << 20));
            // vslide1down.vx v0 + dst, v0 + dst, x5
            emit32(0x3E02E057 + (dst << 7) + (dst << 20));
            break;

        case InstructionType::FADD_R:
            src %= RegisterCountFlt;
            dst %= RegisterCountFlt;

            // vfadd.vv v0 + dst, v0 + dst, v8 + src
            emit32(0x02041057 + (dst << 7) + (src << 15) + (dst << 20));
            break;

        case InstructionType::FADD_M:
            dst %= RegisterCountFlt;

            // dst = RegistersCount forces the src != dst address path
            loadFromScratchpad(src, RegistersCount, mod, imm, p);
            emit_data(group_f_convert);

            // vfadd.vv v0 + dst, v0 + dst, v16
            emit32(0x02081057 + (dst << 7) + (dst << 20));
            break;

        case InstructionType::FSUB_R:
            src %= RegisterCountFlt;
            dst %= RegisterCountFlt;

            // vfsub.vv v0 + dst, v0 + dst, v8 + src
            emit32(0x0A041057 + (dst << 7) + (src << 15) + (dst << 20));
            break;

        case InstructionType::FSUB_M:
            dst %= RegisterCountFlt;

            // dst = RegistersCount forces the src != dst address path
            loadFromScratchpad(src, RegistersCount, mod, imm, p);
            emit_data(group_f_convert);

            // vfsub.vv v0 + dst, v0 + dst, v16
            emit32(0x0A081057 + (dst << 7) + (dst << 20));
            break;

        case InstructionType::FSCAL_R:
            dst %= RegisterCountFlt;

            // XOR with the scale mask kept in v14
            // vxor.vv v0 + dst, v0 + dst, v14
            emit32(0x2E070057 + (dst << 7) + (dst << 20));
            break;

        case InstructionType::FMUL_R:
            src %= RegisterCountFlt;
            dst %= RegisterCountFlt;

            // vfmul.vv v4 + dst, v4 + dst, v8 + src
            emit32(0x92441257 + (dst << 7) + (src << 15) + (dst << 20));
            break;

        case InstructionType::FDIV_M:
            dst %= RegisterCountFlt;

            // dst = RegistersCount forces the src != dst address path
            loadFromScratchpad(src, RegistersCount, mod, imm, p);
            emit_data(group_f_convert);
            emit_data(group_e_post_process);

            // vfdiv.vv v4 + dst, v4 + dst, v16
            emit32(0x82481257 + (dst << 7) + (dst << 20));
            break;

        case InstructionType::FSQRT_R:
            dst %= RegisterCountFlt;

            // vfsqrt.v v4 + dst, v4 + dst
            emit32(0x4E401257 + (dst << 7) + (dst << 20));
            break;

        case InstructionType::CBRANCH:
        {
            const uint32_t shift = (mod >> 4) + RandomX_ConfigurationBase::JumpOffset;

            // Force the condition bit to 1 and clear the bit below it so the
            // branch probability matches the RandomX specification
            imm |= (1UL << shift);

            if (RandomX_ConfigurationBase::JumpOffset > 0 || shift > 0) {
                imm &= ~(1UL << (shift - 1));
            }

            // slli x6, x7, shift
            // x6 = branchMask
            emit32(0x00039313 + (shift << 20));

            // x5 = imm
            imm_to_x5(imm, p);

            // c.add x20 + dst, x5
            emit16(0x9A16 + (dst << 7));

            // and x5, x20 + dst, x6
            emit32(0x006A72B3 + (dst << 15));

            // Branch target: just after the last write of dst
            const int offset = static_cast<int>(last_modified[dst] - p);

            if (offset >= -4096) {
                // Target is within conditional-branch range
                // beqz x5, offset
                const uint32_t k = static_cast<uint32_t>(offset);
                emit32(0x80028063 | ((k & 0x1E) << 7) | ((k & 0x7E0) << 20) | ((k & 0x800) >> 4));
            }
            else {
                // Too far for beqz: skip an unconditional jump instead
                // bnez x5, 8
                emit32(0x00029463);
                // j offset
                const uint32_t k = static_cast<uint32_t>(offset - 4);
                emit32(0x8000006F | ((k & 0x7FE) << 20) | ((k & 0x800) << 9) | (k & 0xFF000));
            }

            // A taken branch re-executes the preceding code, so every register
            // counts as modified at this point
            for (uint32_t j = 0; j < RegistersCount; ++j) {
                last_modified[j] = p;
            }
        }
        break;

        case InstructionType::CFROUND:
            // x5 = rotr(src, imm - 1) & 6, i.e. (rotr(src, imm) & 3) * 2
            if ((imm - 1) & 63) {
#ifdef __riscv_zbb
                // rori x5, x20 + src, imm - 1
                emit32(0x600A5293 + (src << 15) + (((imm - 1) & 63) << 20));
#else // __riscv_zbb
                // srli x5, x20 + src, imm - 1
                emit32(0x000A5293 + (src << 15) + (((imm - 1) & 63) << 20));
                // slli x6, x20 + src, 1 - imm
                emit32(0x000A1313 + (src << 15) + (((1 - imm) & 63) << 20));
                // or x5, x5, x6
                emit32(0x0062E2B3);
#endif // __riscv_zbb

                // andi x5, x5, 6
                emit32(0x0062F293);
            }
            else {
                // Rotation amount is 0: mask the source directly
                // andi x5, x20 + src, 6
                emit32(0x006A7293 + (src << 15));
            }

            // li x6, 01111000b
            // x6 = CFROUND lookup table (2-bit frm values packed into a byte)
            emit32(0x07800313);
            // srl x5, x6, x5
            emit32(0x005352B3);
            // andi x5, x5, 3
            emit32(0x0032F293);
            // csrw frm, x5
            emit32(0x00229073);
            break;

        case InstructionType::ISTORE:
        {
            uint32_t mask_reg;
            uint32_t shift = 32;

            // Condition from the RandomX spec: high mod bits >= 14 select L3
            if ((mod >> 4) >= 14) {
                shift -= RandomX_CurrentConfig.Log2_ScratchpadL3;
                mask_reg = 1; // x1 = L3 mask
            }
            else {
                if ((mod & 3) == 0) {
                    shift -= RandomX_CurrentConfig.Log2_ScratchpadL2;
                    mask_reg = 17; // x17 = L2 mask
                }
                else {
                    shift -= RandomX_CurrentConfig.Log2_ScratchpadL1;
                    mask_reg = 16; // x16 = L1 mask
                }
            }

            // Sign-extend the immediate to the selected level's address width
            imm = static_cast<uint32_t>(static_cast<int32_t>(imm << shift) >> shift);
            imm_to_x5(imm, p);

            // c.add x5, x20 + dst
            emit16(0x92D2 + (dst << 2));
            // and x5, x5, x0 + mask_reg
            emit32(0x0002F2B3 + (mask_reg << 20));
            // c.add x5, x12
            emit16(0x92B2);
            // sd x20 + src, 0(x5)
            emit32(0x0142B023 + (src << 20));
        }
        break;

        default:
            UNREACHABLE;
        }

        // Prefetch scratchpad lines for the next main loop iteration
        // scratchpad_prefetch_pos is a conservative estimate of the earliest place in the code where we can do it
        if (i == scratchpad_prefetch_pos) {
            uint8_t* e = (uint8_t*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_scratchpad_prefetch_end));
            // Note: this "n" shadows the loop bound declared above
            const size_t n = e - ((uint8_t*)spaddr_xor2);

            memcpy(p, spaddr_xor2, n);
            p += n;
        }
    }

    const uint8_t* e;

    if (entryDataInitScalar) {
        // Emit "J randomx_riscv64_vector_program_main_loop_instructions_end_light_mode" instruction
        e = buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_main_loop_instructions_end_light_mode);
    }
    else {
        // Emit "J randomx_riscv64_vector_program_main_loop_instructions_end" instruction
        e = buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_main_loop_instructions_end);
    }

    // JAL x0 with the offset bits scattered as imm[20|10:1|11|19:12]
    const uint32_t k = e - p;
    emit32(0x6F | ((k & 0x7FE) << 20) | ((k & 0x800) << 9) | (k & 0xFF000));

#ifdef __GNUC__
    // Flush the instruction cache over the whole patched region
    char* p1 = (char*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_params));
    char* p2 = (char*)(buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_end));

    __builtin___clear_cache(p1, p2);
#endif

    return buf + DIST(randomx_riscv64_vector_code_begin, randomx_riscv64_vector_program_begin);
}
|
||||
|
||||
} // namespace randomx
|
||||
45
src/crypto/randomx/jit_compiler_rv64_vector.h
Normal file
45
src/crypto/randomx/jit_compiler_rv64_vector.h
Normal file
@@ -0,0 +1,45 @@
|
||||
/*
|
||||
Copyright (c) 2018-2020, tevador <tevador@gmail.com>
|
||||
Copyright (c) 2019-2021, XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
||||
Copyright (c) 2025, SChernykh <https://github.com/SChernykh>
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the copyright holder nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once

#include <cstdint>
#include <cstdlib>

namespace randomx {

// Forward declarations - the generators only take references/pointers
class SuperscalarProgram;
struct ProgramConfiguration;
class Program;

// Patches the pre-assembled vectorized RISC-V dataset-init code in "buf" with
// JIT-compiled SuperscalarHash programs; returns the dataset-init entry point.
void* generateDatasetInitVectorRV64(uint8_t* buf, SuperscalarProgram* programs, size_t num_programs);
// Patches the pre-assembled vectorized RISC-V program loop in "buf" with
// JIT-compiled code for "prog"; returns the program entry point.
// "entryDataInitScalar" is non-null in light mode (no precomputed dataset).
void* generateProgramVectorRV64(uint8_t* buf, Program& prog, ProgramConfiguration& pcfg, const uint8_t (&inst_map)[256], void* entryDataInitScalar, uint32_t datasetOffset);

} // namespace randomx
|
||||
874
src/crypto/randomx/jit_compiler_rv64_vector_static.S
Normal file
874
src/crypto/randomx/jit_compiler_rv64_vector_static.S
Normal file
@@ -0,0 +1,874 @@
|
||||
/*
|
||||
Copyright (c) 2018-2020, tevador <tevador@gmail.com>
|
||||
Copyright (c) 2019-2021, XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
||||
Copyright (c) 2025, SChernykh <https://github.com/SChernykh>
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the copyright holder nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "configuration.h"
|
||||
|
||||
// Compatibility macros
|
||||
|
||||
#if !defined(RANDOMX_CACHE_ACCESSES) && defined(RANDOMX_CACHE_MAX_ACCESSES)
|
||||
#define RANDOMX_CACHE_ACCESSES RANDOMX_CACHE_MAX_ACCESSES
|
||||
#endif
|
||||
|
||||
#if defined(RANDOMX_ARGON_MEMORY)
|
||||
#define RANDOMX_CACHE_MASK RANDOMX_ARGON_MEMORY * 1024 / 64 - 1
|
||||
#elif defined(RANDOMX_CACHE_MAX_SIZE)
|
||||
#define RANDOMX_CACHE_MASK RANDOMX_CACHE_MAX_SIZE / 64 - 1
|
||||
#endif
|
||||
|
||||
#define DECL(x) x
|
||||
|
||||
.text
|
||||
|
||||
#ifndef __riscv_v
|
||||
#error This file requires rv64gcv
|
||||
#endif
|
||||
|
||||
.option pic
|
||||
|
||||
.global DECL(randomx_riscv64_vector_code_begin)
|
||||
|
||||
.global DECL(randomx_riscv64_vector_sshash_begin)
|
||||
.global DECL(randomx_riscv64_vector_sshash_imul_rcp_literals)
|
||||
.global DECL(randomx_riscv64_vector_sshash_dataset_init)
|
||||
.global DECL(randomx_riscv64_vector_sshash_generated_instructions)
|
||||
.global DECL(randomx_riscv64_vector_sshash_generated_instructions_end)
|
||||
.global DECL(randomx_riscv64_vector_sshash_cache_prefetch)
|
||||
.global DECL(randomx_riscv64_vector_sshash_xor)
|
||||
.global DECL(randomx_riscv64_vector_sshash_end)
|
||||
|
||||
.global DECL(randomx_riscv64_vector_program_params)
|
||||
.global DECL(randomx_riscv64_vector_program_imul_rcp_literals)
|
||||
.global DECL(randomx_riscv64_vector_program_begin)
|
||||
.global DECL(randomx_riscv64_vector_program_main_loop_instructions)
|
||||
.global DECL(randomx_riscv64_vector_program_main_loop_instructions_end)
|
||||
.global DECL(randomx_riscv64_vector_program_main_loop_mx_xor)
|
||||
.global DECL(randomx_riscv64_vector_program_main_loop_spaddr_xor)
|
||||
|
||||
.global DECL(randomx_riscv64_vector_program_main_loop_light_mode_data)
|
||||
.global DECL(randomx_riscv64_vector_program_main_loop_instructions_end_light_mode)
|
||||
.global DECL(randomx_riscv64_vector_program_main_loop_mx_xor_light_mode)
|
||||
.global DECL(randomx_riscv64_vector_program_scratchpad_prefetch)
|
||||
.global DECL(randomx_riscv64_vector_program_scratchpad_prefetch_end)
|
||||
|
||||
.global DECL(randomx_riscv64_vector_program_end)
|
||||
|
||||
.global DECL(randomx_riscv64_vector_code_end)
|
||||
|
||||
.balign 8
|
||||
|
||||
DECL(randomx_riscv64_vector_code_begin):
|
||||
|
||||
DECL(randomx_riscv64_vector_sshash_begin):
|
||||
|
||||
sshash_constant_0: .dword 6364136223846793005
|
||||
sshash_constant_1: .dword 9298411001130361340
|
||||
sshash_constant_2: .dword 12065312585734608966
|
||||
sshash_constant_3: .dword 9306329213124626780
|
||||
sshash_constant_4: .dword 5281919268842080866
|
||||
sshash_constant_5: .dword 10536153434571861004
|
||||
sshash_constant_6: .dword 3398623926847679864
|
||||
sshash_constant_7: .dword 9549104520008361294
|
||||
sshash_offsets: .dword 0,1,2,3
|
||||
store_offsets: .dword 0,64,128,192
|
||||
|
||||
DECL(randomx_riscv64_vector_sshash_imul_rcp_literals): .fill 512,8,0
|
||||
|
||||
/*
|
||||
Reference: https://github.com/tevador/RandomX/blob/master/doc/specs.md#73-dataset-block-generation
|
||||
|
||||
Register layout
|
||||
---------------
|
||||
x5 = temporary
|
||||
|
||||
x10 = randomx cache
|
||||
x11 = output buffer
|
||||
x12 = startBlock
|
||||
x13 = endBlock
|
||||
|
||||
x14 = cache mask
|
||||
x15 = imul_rcp literal pointer
|
||||
|
||||
v0-v7 = r0-r7
|
||||
v8 = itemNumber
|
||||
v9 = cacheIndex, then a pointer into cache->memory (for prefetch), then a byte offset into cache->memory
|
||||
|
||||
v10-v17 = sshash constants
|
||||
|
||||
v18 = temporary
|
||||
|
||||
v19 = dataset item store offsets
|
||||
*/
|
||||
|
||||
DECL(randomx_riscv64_vector_sshash_dataset_init):
|
||||
// Process 4 64-bit values at a time
|
||||
vsetivli zero, 4, e64, m1, ta, ma
|
||||
|
||||
// Load cache->memory pointer
|
||||
ld x10, (x10)
|
||||
|
||||
// Init cache mask
|
||||
li x14, RANDOMX_CACHE_MASK
|
||||
|
||||
// Init dataset item store offsets
|
||||
lla x5, store_offsets
|
||||
vle64.v v19, (x5)
|
||||
|
||||
// Init itemNumber vector to (startBlock, startBlock + 1, startBlock + 2, startBlock + 3)
|
||||
lla x5, sshash_offsets
|
||||
vle64.v v8, (x5)
|
||||
vadd.vx v8, v8, x12
|
||||
|
||||
// Load constants (stride = x0 = 0, so a 64-bit value will be broadcast into each element of a vector)
|
||||
lla x5, sshash_constant_0
|
||||
vlse64.v v10, (x5), x0
|
||||
|
||||
lla x5, sshash_constant_1
|
||||
vlse64.v v11, (x5), x0
|
||||
|
||||
lla x5, sshash_constant_2
|
||||
vlse64.v v12, (x5), x0
|
||||
|
||||
lla x5, sshash_constant_3
|
||||
vlse64.v v13, (x5), x0
|
||||
|
||||
lla x5, sshash_constant_4
|
||||
vlse64.v v14, (x5), x0
|
||||
|
||||
lla x5, sshash_constant_5
|
||||
vlse64.v v15, (x5), x0
|
||||
|
||||
lla x5, sshash_constant_6
|
||||
vlse64.v v16, (x5), x0
|
||||
|
||||
lla x5, sshash_constant_7
|
||||
vlse64.v v17, (x5), x0
|
||||
|
||||
// Calculate the end pointer for dataset init
|
||||
sub x13, x13, x12
|
||||
slli x13, x13, 6
|
||||
add x13, x13, x11
|
||||
|
||||
init_item:
|
||||
// Step 1. Init r0-r7
|
||||
|
||||
// r0 = (itemNumber + 1) * 6364136223846793005
|
||||
vmv.v.v v0, v8
|
||||
vmadd.vv v0, v10, v10
|
||||
|
||||
// r_i = r0 ^ c_i for i = 1..7
|
||||
vxor.vv v1, v0, v11
|
||||
vxor.vv v2, v0, v12
|
||||
vxor.vv v3, v0, v13
|
||||
vxor.vv v4, v0, v14
|
||||
vxor.vv v5, v0, v15
|
||||
vxor.vv v6, v0, v16
|
||||
vxor.vv v7, v0, v17
|
||||
|
||||
// Step 2. Let cacheIndex = itemNumber
|
||||
vmv.v.v v9, v8
|
||||
|
||||
// Step 3 is implicit (all iterations are inlined, there is no "i")
|
||||
|
||||
// Init imul_rcp literal pointer
|
||||
lla x15, randomx_riscv64_vector_sshash_imul_rcp_literals
|
||||
|
||||
DECL(randomx_riscv64_vector_sshash_generated_instructions):
|
||||
// Generated by JIT compiler
|
||||
//
|
||||
// Step 4. randomx_riscv64_vector_sshash_cache_prefetch
|
||||
// Step 5. SuperscalarHash[i]
|
||||
// Step 6. randomx_riscv64_vector_sshash_xor
|
||||
//
|
||||
// Above steps will be repeated RANDOMX_CACHE_ACCESSES times
|
||||
.fill RANDOMX_CACHE_ACCESSES * 2048, 4, 0
|
||||
|
||||
DECL(randomx_riscv64_vector_sshash_generated_instructions_end):
|
||||
// Step 9. Concatenate registers r0-r7 in little endian format to get the final Dataset item data.
|
||||
vsuxei64.v v0, (x11), v19
|
||||
|
||||
add x5, x11, 8
|
||||
vsuxei64.v v1, (x5), v19
|
||||
|
||||
add x5, x11, 16
|
||||
vsuxei64.v v2, (x5), v19
|
||||
|
||||
add x5, x11, 24
|
||||
vsuxei64.v v3, (x5), v19
|
||||
|
||||
add x5, x11, 32
|
||||
vsuxei64.v v4, (x5), v19
|
||||
|
||||
add x5, x11, 40
|
||||
vsuxei64.v v5, (x5), v19
|
||||
|
||||
add x5, x11, 48
|
||||
vsuxei64.v v6, (x5), v19
|
||||
|
||||
add x5, x11, 56
|
||||
vsuxei64.v v7, (x5), v19
|
||||
|
||||
// Iterate to the next 4 items
|
||||
vadd.vi v8, v8, 4
|
||||
add x11, x11, 256
|
||||
bltu x11, x13, init_item
|
||||
|
||||
ret
|
||||
|
||||
// Step 4. Load a 64-byte item from the Cache. The item index is given by cacheIndex modulo the total number of 64-byte items in Cache.
|
||||
DECL(randomx_riscv64_vector_sshash_cache_prefetch):
|
||||
// v9 = convert from cacheIndex to a direct pointer into cache->memory
|
||||
vand.vx v9, v9, x14
|
||||
vsll.vi v9, v9, 6
|
||||
vadd.vx v9, v9, x10
|
||||
|
||||
// Prefetch element 0
|
||||
vmv.x.s x5, v9
|
||||
#ifdef __riscv_zicbop
|
||||
prefetch.r (x5)
|
||||
#else
|
||||
ld x5, (x5)
|
||||
#endif
|
||||
|
||||
// Prefetch element 1
|
||||
vslidedown.vi v18, v9, 1
|
||||
vmv.x.s x5, v18
|
||||
#ifdef __riscv_zicbop
|
||||
prefetch.r (x5)
|
||||
#else
|
||||
ld x5, (x5)
|
||||
#endif
|
||||
|
||||
// Prefetch element 2
|
||||
vslidedown.vi v18, v9, 2
|
||||
vmv.x.s x5, v18
|
||||
#ifdef __riscv_zicbop
|
||||
prefetch.r (x5)
|
||||
#else
|
||||
ld x5, (x5)
|
||||
#endif
|
||||
|
||||
// Prefetch element 3
|
||||
vslidedown.vi v18, v9, 3
|
||||
vmv.x.s x5, v18
|
||||
#ifdef __riscv_zicbop
|
||||
prefetch.r (x5)
|
||||
#else
|
||||
ld x5, (x5)
|
||||
#endif
|
||||
|
||||
// v9 = byte offset into cache->memory
|
||||
vsub.vx v9, v9, x10
|
||||
|
||||
// Step 6. XOR all registers with data loaded from randomx cache
|
||||
DECL(randomx_riscv64_vector_sshash_xor):
|
||||
vluxei64.v v18, (x10), v9
|
||||
vxor.vv v0, v0, v18
|
||||
|
||||
add x5, x10, 8
|
||||
vluxei64.v v18, (x5), v9
|
||||
vxor.vv v1, v1, v18
|
||||
|
||||
add x5, x10, 16
|
||||
vluxei64.v v18, (x5), v9
|
||||
vxor.vv v2, v2, v18
|
||||
|
||||
add x5, x10, 24
|
||||
vluxei64.v v18, (x5), v9
|
||||
vxor.vv v3, v3, v18
|
||||
|
||||
add x5, x10, 32
|
||||
vluxei64.v v18, (x5), v9
|
||||
vxor.vv v4, v4, v18
|
||||
|
||||
add x5, x10, 40
|
||||
vluxei64.v v18, (x5), v9
|
||||
vxor.vv v5, v5, v18
|
||||
|
||||
add x5, x10, 48
|
||||
vluxei64.v v18, (x5), v9
|
||||
vxor.vv v6, v6, v18
|
||||
|
||||
add x5, x10, 56
|
||||
vluxei64.v v18, (x5), v9
|
||||
vxor.vv v7, v7, v18
|
||||
|
||||
DECL(randomx_riscv64_vector_sshash_end):
|
||||
|
||||
/*
|
||||
Reference: https://github.com/tevador/RandomX/blob/master/doc/specs.md#46-vm-execution
|
||||
|
||||
C declarations:
|
||||
|
||||
struct RegisterFile {
|
||||
uint64_t r[8];
|
||||
double f[4][2];
|
||||
double e[4][2];
|
||||
double a[4][2];
|
||||
};
|
||||
|
||||
struct MemoryRegisters {
|
||||
uint32_t mx, ma;
|
||||
uint8_t* memory; // dataset (fast mode) or cache (light mode)
|
||||
};
|
||||
|
||||
void ProgramFunc(RegisterFile* reg, MemoryRegisters* mem, uint8_t* scratchpad, uint64_t iterations);
|
||||
|
||||
Register layout
|
||||
---------------
|
||||
x0 = zero
|
||||
x1 = scratchpad L3 mask
|
||||
x2 = stack pointer
|
||||
x3 = global pointer (unused)
|
||||
x4 = thread pointer (unused)
|
||||
x5 = temporary
|
||||
x6 = temporary
|
||||
x7 = branch mask (unshifted)
|
||||
x8 = frame pointer, also 64-bit literal inside the loop
|
||||
x9 = scratchpad L3 mask (64-byte aligned)
|
||||
x10 = RegisterFile* reg, also 64-bit literal inside the loop
|
||||
x11 = MemoryRegisters* mem, then dataset/cache pointer
|
||||
x12 = scratchpad
|
||||
x13 = iterations
|
||||
x14 = mx, ma (always stored with dataset mask applied)
|
||||
x15 = spAddr0, spAddr1
|
||||
x16 = scratchpad L1 mask
|
||||
x17 = scratchpad L2 mask
|
||||
x18 = IMUL_RCP literals pointer
|
||||
x19 = dataset mask
|
||||
x20-x27 = r0-r7
|
||||
x28-x31 = 64-bit literals
|
||||
|
||||
f0-f7 = 64-bit literals
|
||||
f10-f17 = 64-bit literals
|
||||
f28-f31 = 64-bit literals
|
||||
|
||||
v0-v3 = f0-f3
|
||||
v4-v7 = e0-e3
|
||||
v8-v11 = a0-a3
|
||||
v12 = E 'and' mask = 0x00ffffffffffffff'00ffffffffffffff
|
||||
v13 = E 'or' mask = 0x3*00000000******'3*00000000******
|
||||
v14 = scale mask = 0x80f0000000000000'80f0000000000000
|
||||
|
||||
v15 = unused
|
||||
v16 = temporary
|
||||
v17 = unused
|
||||
v18 = temporary
|
||||
|
||||
v19-v31 = unused
|
||||
*/
|
||||
|
||||
.balign 8
|
||||
|
||||
DECL(randomx_riscv64_vector_program_params):
|
||||
|
||||
// JIT compiler will adjust these values for different RandomX variants
|
||||
randomx_masks: .dword 16376, 262136, 2097144, 2147483584, 255
|
||||
|
||||
DECL(randomx_riscv64_vector_program_imul_rcp_literals):
|
||||
|
||||
imul_rcp_literals: .fill RANDOMX_PROGRAM_MAX_SIZE, 8, 0
|
||||
|
||||
DECL(randomx_riscv64_vector_program_begin):
|
||||
addi sp, sp, -112
|
||||
sd x8, 96(sp) // save old frame pointer
|
||||
addi x8, sp, 112 // setup new frame pointer
|
||||
sd x1, 104(sp) // save return address
|
||||
|
||||
// Save callee-saved registers
|
||||
sd x9, 0(sp)
|
||||
sd x18, 8(sp)
|
||||
sd x19, 16(sp)
|
||||
sd x20, 24(sp)
|
||||
sd x21, 32(sp)
|
||||
sd x22, 40(sp)
|
||||
sd x23, 48(sp)
|
||||
sd x24, 56(sp)
|
||||
sd x25, 64(sp)
|
||||
sd x26, 72(sp)
|
||||
sd x27, 80(sp)
|
||||
|
||||
// Save x10 as it will be used as an IMUL_RCP literal
|
||||
sd x10, 88(sp)
|
||||
|
||||
// Load mx, ma and dataset pointer
|
||||
ld x14, (x11)
|
||||
ld x11, 8(x11)
|
||||
|
||||
// Initialize spAddr0-spAddr1
|
||||
mv x15, x14
|
||||
|
||||
// Set registers r0-r7 to zero
|
||||
li x20, 0
|
||||
li x21, 0
|
||||
li x22, 0
|
||||
li x23, 0
|
||||
li x24, 0
|
||||
li x25, 0
|
||||
li x26, 0
|
||||
li x27, 0
|
||||
|
||||
// Load masks
|
||||
lla x5, randomx_masks
|
||||
ld x16, 0(x5)
|
||||
ld x17, 8(x5)
|
||||
ld x1, 16(x5)
|
||||
ld x19, 24(x5)
|
||||
ld x7, 32(x5)
|
||||
addi x9, x1, -56
|
||||
|
||||
// Set vector registers to 2x64 bit
|
||||
vsetivli zero, 2, e64, m1, ta, ma
|
||||
|
||||
// Apply dataset mask to mx, ma
|
||||
slli x5, x19, 32
|
||||
or x5, x5, x19
|
||||
and x14, x14, x5
|
||||
|
||||
// Load group A registers
|
||||
addi x5, x10, 192
|
||||
vle64.v v8, (x5)
|
||||
|
||||
addi x5, x10, 208
|
||||
vle64.v v9, (x5)
|
||||
|
||||
addi x5, x10, 224
|
||||
vle64.v v10, (x5)
|
||||
|
||||
addi x5, x10, 240
|
||||
vle64.v v11, (x5)
|
||||
|
||||
// Load E 'and' mask
|
||||
vmv.v.i v12, -1
|
||||
vsrl.vi v12, v12, 8
|
||||
|
||||
// Load E 'or' mask (stored in reg.f[0])
|
||||
addi x5, x10, 64
|
||||
vle64.v v13, (x5)
|
||||
|
||||
// Load scale mask
|
||||
lui x5, 0x80f00
|
||||
slli x5, x5, 32
|
||||
vmv.v.x v14, x5
|
||||
|
||||
// IMUL_RCP literals pointer
|
||||
lla x18, imul_rcp_literals
|
||||
|
||||
// Load IMUL_RCP literals
|
||||
ld x8, 0(x18)
|
||||
ld x10, 8(x18)
|
||||
ld x28, 16(x18)
|
||||
ld x29, 24(x18)
|
||||
ld x30, 32(x18)
|
||||
ld x31, 40(x18)
|
||||
fld f0, 48(x18)
|
||||
fld f1, 56(x18)
|
||||
fld f2, 64(x18)
|
||||
fld f3, 72(x18)
|
||||
fld f4, 80(x18)
|
||||
fld f5, 88(x18)
|
||||
fld f6, 96(x18)
|
||||
fld f7, 104(x18)
|
||||
fld f10, 112(x18)
|
||||
fld f11, 120(x18)
|
||||
fld f12, 128(x18)
|
||||
fld f13, 136(x18)
|
||||
fld f14, 144(x18)
|
||||
fld f15, 152(x18)
|
||||
fld f16, 160(x18)
|
||||
fld f17, 168(x18)
|
||||
fld f28, 176(x18)
|
||||
fld f29, 184(x18)
|
||||
fld f30, 192(x18)
|
||||
fld f31, 200(x18)
|
||||
|
||||
randomx_riscv64_vector_program_main_loop:
|
||||
and x5, x15, x9 // x5 = spAddr0 & 64-byte aligned L3 mask
|
||||
add x5, x5, x12 // x5 = &scratchpad[spAddr0 & 64-byte aligned L3 mask]
|
||||
|
||||
// read a 64-byte line from scratchpad (indexed by spAddr0) and XOR it with r0-r7
|
||||
ld x6, 0(x5)
|
||||
xor x20, x20, x6
|
||||
ld x6, 8(x5)
|
||||
xor x21, x21, x6
|
||||
ld x6, 16(x5)
|
||||
xor x22, x22, x6
|
||||
ld x6, 24(x5)
|
||||
xor x23, x23, x6
|
||||
ld x6, 32(x5)
|
||||
xor x24, x24, x6
|
||||
ld x6, 40(x5)
|
||||
xor x25, x25, x6
|
||||
ld x6, 48(x5)
|
||||
xor x26, x26, x6
|
||||
ld x6, 56(x5)
|
||||
xor x27, x27, x6
|
||||
|
||||
srli x5, x15, 32 // x5 = spAddr1
|
||||
and x5, x5, x9 // x5 = spAddr1 & 64-byte aligned L3 mask
|
||||
add x5, x5, x12 // x5 = &scratchpad[spAddr1 & 64-byte aligned L3 mask]
|
||||
|
||||
// read a 64-byte line from scratchpad (indexed by spAddr1) and initialize f0-f3, e0-e3 registers
|
||||
|
||||
// Set vector registers to 2x32 bit
|
||||
vsetivli zero, 2, e32, m1, ta, ma
|
||||
|
||||
// load f0
|
||||
vle32.v v16, (x5)
|
||||
vfwcvt.f.x.v v0, v16
|
||||
|
||||
// load f1
|
||||
addi x6, x5, 8
|
||||
vle32.v v1, (x6)
|
||||
// Use v16 as an intermediary register because vfwcvt accepts only registers with even numbers here
|
||||
vfwcvt.f.x.v v16, v1
|
||||
vmv1r.v v1, v16
|
||||
|
||||
// load f2
|
||||
addi x6, x5, 16
|
||||
vle32.v v16, (x6)
|
||||
vfwcvt.f.x.v v2, v16
|
||||
|
||||
// load f3
|
||||
addi x6, x5, 24
|
||||
vle32.v v3, (x6)
|
||||
vfwcvt.f.x.v v16, v3
|
||||
vmv1r.v v3, v16
|
||||
|
||||
// load e0
|
||||
addi x6, x5, 32
|
||||
vle32.v v16, (x6)
|
||||
vfwcvt.f.x.v v4, v16
|
||||
|
||||
// load e1
|
||||
addi x6, x5, 40
|
||||
vle32.v v5, (x6)
|
||||
vfwcvt.f.x.v v16, v5
|
||||
vmv1r.v v5, v16
|
||||
|
||||
// load e2
|
||||
addi x6, x5, 48
|
||||
vle32.v v16, (x6)
|
||||
vfwcvt.f.x.v v6, v16
|
||||
|
||||
// load e3
|
||||
addi x6, x5, 56
|
||||
vle32.v v7, (x6)
|
||||
vfwcvt.f.x.v v16, v7
|
||||
vmv1r.v v7, v16
|
||||
|
||||
// Set vector registers back to 2x64 bit
|
||||
vsetivli zero, 2, e64, m1, ta, ma
|
||||
|
||||
// post-process e0-e3
|
||||
vand.vv v4, v4, v12
|
||||
vand.vv v5, v5, v12
|
||||
vand.vv v6, v6, v12
|
||||
vand.vv v7, v7, v12
|
||||
|
||||
vor.vv v4, v4, v13
|
||||
vor.vv v5, v5, v13
|
||||
vor.vv v6, v6, v13
|
||||
vor.vv v7, v7, v13
|
||||
|
||||
DECL(randomx_riscv64_vector_program_main_loop_instructions):
|
||||
// Generated by JIT compiler
|
||||
// FDIV_M can generate up to 50 bytes of code (round it up to 52 - a multiple of 4)
|
||||
// +32 bytes for the scratchpad prefetch and the final jump instruction
|
||||
.fill RANDOMX_PROGRAM_MAX_SIZE * 52 + 32, 1, 0
|
||||
|
||||
DECL(randomx_riscv64_vector_program_main_loop_instructions_end):
|
||||
// Calculate dataset pointer for dataset read
|
||||
// Do it here to break false dependency from readReg2 and readReg3 (see below)
|
||||
srli x6, x14, 32 // x6 = ma & dataset mask
|
||||
|
||||
DECL(randomx_riscv64_vector_program_main_loop_mx_xor):
|
||||
xor x5, x24, x26 // x5 = readReg2 ^ readReg3 (JIT compiler will substitute the actual registers)
|
||||
|
||||
and x5, x5, x19 // x5 = (readReg2 ^ readReg3) & dataset mask
|
||||
xor x14, x14, x5 // mx ^= (readReg2 ^ readReg3) & dataset mask
|
||||
|
||||
and x5, x14, x19 // x5 = mx & dataset mask
|
||||
add x5, x5, x11 // x5 = &dataset[mx & dataset mask]
|
||||
|
||||
#ifdef __riscv_zicbop
|
||||
prefetch.r (x5)
|
||||
#else
|
||||
ld x5, (x5)
|
||||
#endif
|
||||
|
||||
add x5, x6, x11 // x5 = &dataset[ma & dataset mask]
|
||||
|
||||
// read a 64-byte line from dataset and XOR it with r0-r7
|
||||
ld x6, 0(x5)
|
||||
xor x20, x20, x6
|
||||
ld x6, 8(x5)
|
||||
xor x21, x21, x6
|
||||
ld x6, 16(x5)
|
||||
xor x22, x22, x6
|
||||
ld x6, 24(x5)
|
||||
xor x23, x23, x6
|
||||
ld x6, 32(x5)
|
||||
xor x24, x24, x6
|
||||
ld x6, 40(x5)
|
||||
xor x25, x25, x6
|
||||
ld x6, 48(x5)
|
||||
xor x26, x26, x6
|
||||
ld x6, 56(x5)
|
||||
xor x27, x27, x6
|
||||
|
||||
randomx_riscv64_vector_program_main_loop_swap_mx_ma:
|
||||
// swap mx <-> ma
|
||||
#ifdef __riscv_zbb
|
||||
rori x14, x14, 32
|
||||
#else
|
||||
srli x5, x14, 32
|
||||
slli x14, x14, 32
|
||||
or x14, x14, x5
|
||||
#endif
|
||||
|
||||
srli x5, x15, 32 // x5 = spAddr1
|
||||
and x5, x5, x9 // x5 = spAddr1 & 64-byte aligned L3 mask
|
||||
add x5, x5, x12 // x5 = &scratchpad[spAddr1 & 64-byte aligned L3 mask]
|
||||
|
||||
// store registers r0-r7 to the scratchpad
|
||||
sd x20, 0(x5)
|
||||
sd x21, 8(x5)
|
||||
sd x22, 16(x5)
|
||||
sd x23, 24(x5)
|
||||
sd x24, 32(x5)
|
||||
sd x25, 40(x5)
|
||||
sd x26, 48(x5)
|
||||
sd x27, 56(x5)
|
||||
|
||||
and x5, x15, x9 // x5 = spAddr0 & 64-byte aligned L3 mask
|
||||
add x5, x5, x12 // x5 = &scratchpad[spAddr0 & 64-byte aligned L3 mask]
|
||||
|
||||
DECL(randomx_riscv64_vector_program_main_loop_spaddr_xor):
|
||||
xor x15, x20, x22 // spAddr0-spAddr1 = readReg0 ^ readReg1 (JIT compiler will substitute the actual registers)
|
||||
|
||||
// store registers f0-f3 to the scratchpad (f0-f3 are first combined with e0-e3)
|
||||
vxor.vv v0, v0, v4
|
||||
vxor.vv v1, v1, v5
|
||||
vxor.vv v2, v2, v6
|
||||
vxor.vv v3, v3, v7
|
||||
|
||||
vse64.v v0, (x5)
|
||||
|
||||
addi x6, x5, 16
|
||||
vse64.v v1, (x6)
|
||||
|
||||
addi x6, x5, 32
|
||||
vse64.v v2, (x6)
|
||||
|
||||
addi x6, x5, 48
|
||||
vse64.v v3, (x6)
|
||||
|
||||
addi x13, x13, -1
|
||||
beqz x13, randomx_riscv64_vector_program_main_loop_end
|
||||
j randomx_riscv64_vector_program_main_loop
|
||||
|
||||
randomx_riscv64_vector_program_main_loop_end:
|
||||
// Restore x8 and x10
|
||||
addi x8, sp, 112
|
||||
ld x10, 88(sp)
|
||||
|
||||
// Store integer registers
|
||||
sd x20, 0(x10)
|
||||
sd x21, 8(x10)
|
||||
sd x22, 16(x10)
|
||||
sd x23, 24(x10)
|
||||
sd x24, 32(x10)
|
||||
sd x25, 40(x10)
|
||||
sd x26, 48(x10)
|
||||
sd x27, 56(x10)
|
||||
|
||||
// Store FP registers
|
||||
addi x5, x10, 64
|
||||
vse64.v v0, (x5)
|
||||
|
||||
addi x5, x10, 80
|
||||
vse64.v v1, (x5)
|
||||
|
||||
addi x5, x10, 96
|
||||
vse64.v v2, (x5)
|
||||
|
||||
addi x5, x10, 112
|
||||
vse64.v v3, (x5)
|
||||
|
||||
addi x5, x10, 128
|
||||
vse64.v v4, (x5)
|
||||
|
||||
addi x5, x10, 144
|
||||
vse64.v v5, (x5)
|
||||
|
||||
addi x5, x10, 160
|
||||
vse64.v v6, (x5)
|
||||
|
||||
addi x5, x10, 176
|
||||
vse64.v v7, (x5)
|
||||
|
||||
// Restore callee-saved registers
|
||||
ld x9, 0(sp)
|
||||
ld x18, 8(sp)
|
||||
ld x19, 16(sp)
|
||||
ld x20, 24(sp)
|
||||
ld x21, 32(sp)
|
||||
ld x22, 40(sp)
|
||||
ld x23, 48(sp)
|
||||
ld x24, 56(sp)
|
||||
ld x25, 64(sp)
|
||||
ld x26, 72(sp)
|
||||
ld x27, 80(sp)
|
||||
|
||||
ld x8, 96(sp) // old frame pointer
|
||||
ld x1, 104(sp) // return address
|
||||
|
||||
addi sp, sp, 112
|
||||
|
||||
ret
|
||||
|
||||
DECL(randomx_riscv64_vector_program_main_loop_light_mode_data):
|
||||
// 1) Pointer to the scalar dataset init function
|
||||
// 2) Dataset offset
|
||||
.dword 0, 0
|
||||
|
||||
DECL(randomx_riscv64_vector_program_main_loop_instructions_end_light_mode):
|
||||
// Calculate dataset pointer for dataset read
|
||||
// Do it here to break false dependency from readReg2 and readReg3 (see below)
|
||||
srli x6, x14, 32 // x6 = ma & dataset mask
|
||||
|
||||
DECL(randomx_riscv64_vector_program_main_loop_mx_xor_light_mode):
|
||||
xor x5, x24, x26 // x5 = readReg2 ^ readReg3 (JIT compiler will substitute the actual registers)
|
||||
and x5, x5, x19 // x5 = (readReg2 ^ readReg3) & dataset mask
|
||||
xor x14, x14, x5 // mx ^= (readReg2 ^ readReg3) & dataset mask
|
||||
|
||||
// Save all registers modified when calling dataset_init_scalar_func_ptr
|
||||
addi sp, sp, -192
|
||||
|
||||
// bytes [0, 127] - saved registers
|
||||
// bytes [128, 191] - output buffer
|
||||
|
||||
sd x1, 0(sp)
|
||||
sd x7, 16(sp)
|
||||
sd x10, 24(sp)
|
||||
sd x11, 32(sp)
|
||||
sd x12, 40(sp)
|
||||
sd x13, 48(sp)
|
||||
sd x14, 56(sp)
|
||||
sd x15, 64(sp)
|
||||
sd x16, 72(sp)
|
||||
sd x17, 80(sp)
|
||||
sd x28, 88(sp)
|
||||
sd x29, 96(sp)
|
||||
sd x30, 104(sp)
|
||||
sd x31, 112(sp)
|
||||
|
||||
// setup randomx_riscv64_vector_sshash_dataset_init's parameters
|
||||
|
||||
// x10 = pointer to pointer to cache memory
|
||||
// pointer to cache memory was saved in "sd x11, 32(sp)", so x10 = sp + 32
|
||||
addi x10, sp, 32
|
||||
|
||||
// x11 = output buffer (64 bytes)
|
||||
addi x11, sp, 128
|
||||
|
||||
// x12 = start block
|
||||
lla x5, randomx_riscv64_vector_program_main_loop_light_mode_data
|
||||
ld x12, 8(x5)
|
||||
add x12, x12, x6
|
||||
srli x12, x12, 6
|
||||
|
||||
// x13 = end block
|
||||
addi x13, x12, 1
|
||||
|
||||
ld x5, 0(x5)
|
||||
jalr x1, 0(x5)
|
||||
|
||||
// restore registers
|
||||
ld x1, 0(sp)
|
||||
ld x7, 16(sp)
|
||||
ld x10, 24(sp)
|
||||
ld x11, 32(sp)
|
||||
ld x12, 40(sp)
|
||||
ld x13, 48(sp)
|
||||
ld x14, 56(sp)
|
||||
ld x15, 64(sp)
|
||||
ld x16, 72(sp)
|
||||
ld x17, 80(sp)
|
||||
ld x28, 88(sp)
|
||||
ld x29, 96(sp)
|
||||
ld x30, 104(sp)
|
||||
ld x31, 112(sp)
|
||||
|
||||
// read a 64-byte line from dataset and XOR it with r0-r7
|
||||
ld x5, 128(sp)
|
||||
xor x20, x20, x5
|
||||
ld x5, 136(sp)
|
||||
xor x21, x21, x5
|
||||
ld x5, 144(sp)
|
||||
xor x22, x22, x5
|
||||
ld x5, 152(sp)
|
||||
xor x23, x23, x5
|
||||
ld x5, 160(sp)
|
||||
xor x24, x24, x5
|
||||
ld x5, 168(sp)
|
||||
xor x25, x25, x5
|
||||
ld x5, 176(sp)
|
||||
xor x26, x26, x5
|
||||
ld x5, 184(sp)
|
||||
xor x27, x27, x5
|
||||
|
||||
addi sp, sp, 192
|
||||
|
||||
j randomx_riscv64_vector_program_main_loop_swap_mx_ma
|
||||
|
||||
DECL(randomx_riscv64_vector_program_scratchpad_prefetch):
|
||||
xor x5, x20, x22 // spAddr0-spAddr1 = readReg0 ^ readReg1 (JIT compiler will substitute the actual registers)
|
||||
srli x6, x5, 32 // x6 = spAddr1
|
||||
|
||||
and x5, x5, x9 // x5 = spAddr0 & 64-byte aligned L3 mask
|
||||
and x6, x6, x9 // x6 = spAddr1 & 64-byte aligned L3 mask
|
||||
|
||||
c.add x5, x12 // x5 = &scratchpad[spAddr0 & 64-byte aligned L3 mask]
|
||||
c.add x6, x12 // x6 = &scratchpad[spAddr1 & 64-byte aligned L3 mask]
|
||||
|
||||
#ifdef __riscv_zicbop
|
||||
prefetch.r (x5)
|
||||
prefetch.r (x6)
|
||||
#else
|
||||
ld x5, (x5)
|
||||
ld x6, (x6)
|
||||
#endif
|
||||
|
||||
DECL(randomx_riscv64_vector_program_scratchpad_prefetch_end):
|
||||
|
||||
DECL(randomx_riscv64_vector_program_end):
|
||||
|
||||
DECL(randomx_riscv64_vector_code_end):
|
||||
75
src/crypto/randomx/jit_compiler_rv64_vector_static.h
Normal file
75
src/crypto/randomx/jit_compiler_rv64_vector_static.h
Normal file
@@ -0,0 +1,75 @@
|
||||
/*
|
||||
Copyright (c) 2018-2020, tevador <tevador@gmail.com>
|
||||
Copyright (c) 2019-2021, XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
||||
Copyright (c) 2025, SChernykh <https://github.com/SChernykh>
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the copyright holder nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#if defined(__cplusplus)
|
||||
#include <cstdint>
|
||||
#else
|
||||
#include <stdint.h>
|
||||
#endif
|
||||
|
||||
#if defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
struct randomx_cache;
|
||||
|
||||
void randomx_riscv64_vector_code_begin();
|
||||
|
||||
void randomx_riscv64_vector_sshash_begin();
|
||||
void randomx_riscv64_vector_sshash_imul_rcp_literals();
|
||||
void randomx_riscv64_vector_sshash_dataset_init(struct randomx_cache* cache, uint8_t* output_buf, uint32_t startBlock, uint32_t endBlock);
|
||||
void randomx_riscv64_vector_sshash_cache_prefetch();
|
||||
void randomx_riscv64_vector_sshash_generated_instructions();
|
||||
void randomx_riscv64_vector_sshash_generated_instructions_end();
|
||||
void randomx_riscv64_vector_sshash_cache_prefetch();
|
||||
void randomx_riscv64_vector_sshash_xor();
|
||||
void randomx_riscv64_vector_sshash_end();
|
||||
|
||||
void randomx_riscv64_vector_program_params();
|
||||
void randomx_riscv64_vector_program_imul_rcp_literals();
|
||||
void randomx_riscv64_vector_program_begin();
|
||||
void randomx_riscv64_vector_program_main_loop_instructions();
|
||||
void randomx_riscv64_vector_program_main_loop_instructions_end();
|
||||
void randomx_riscv64_vector_program_main_loop_mx_xor();
|
||||
void randomx_riscv64_vector_program_main_loop_spaddr_xor();
|
||||
void randomx_riscv64_vector_program_main_loop_light_mode_data();
|
||||
void randomx_riscv64_vector_program_main_loop_instructions_end_light_mode();
|
||||
void randomx_riscv64_vector_program_main_loop_mx_xor_light_mode();
|
||||
void randomx_riscv64_vector_program_end();
|
||||
void randomx_riscv64_vector_program_scratchpad_prefetch();
|
||||
void randomx_riscv64_vector_program_scratchpad_prefetch_end();
|
||||
|
||||
void randomx_riscv64_vector_code_end();
|
||||
|
||||
#if defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
@@ -282,7 +282,10 @@ typedef void(randomx::JitCompilerX86::* InstructionGeneratorX86_2)(const randomx
|
||||
Log2_ScratchpadL2 = Log2(ScratchpadL2_Size);
|
||||
Log2_ScratchpadL3 = Log2(ScratchpadL3_Size);
|
||||
|
||||
#define JIT_HANDLE(x, prev) randomx::JitCompilerRV64::engine[k] = &randomx::JitCompilerRV64::v1_##x
|
||||
#define JIT_HANDLE(x, prev) do { \
|
||||
randomx::JitCompilerRV64::engine[k] = &randomx::JitCompilerRV64::v1_##x; \
|
||||
randomx::JitCompilerRV64::inst_map[k] = static_cast<uint8_t>(randomx::InstructionType::x); \
|
||||
} while (0)
|
||||
|
||||
#else
|
||||
#define JIT_HANDLE(x, prev)
|
||||
|
||||
@@ -73,8 +73,20 @@ uint64_t randomx_reciprocal(uint64_t divisor) {
|
||||
|
||||
#if !RANDOMX_HAVE_FAST_RECIPROCAL
|
||||
|
||||
#ifdef __GNUC__
|
||||
uint64_t randomx_reciprocal_fast(uint64_t divisor)
|
||||
{
|
||||
const uint64_t q = (1ULL << 63) / divisor;
|
||||
const uint64_t r = (1ULL << 63) % divisor;
|
||||
|
||||
const uint64_t shift = 64 - __builtin_clzll(divisor);
|
||||
|
||||
return (q << shift) + ((r << shift) / divisor);
|
||||
}
|
||||
#else
|
||||
uint64_t randomx_reciprocal_fast(uint64_t divisor) {
|
||||
return randomx_reciprocal(divisor);
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@@ -39,6 +39,9 @@ alignas(64) uint32_t lutDec1[256];
|
||||
alignas(64) uint32_t lutDec2[256];
|
||||
alignas(64) uint32_t lutDec3[256];
|
||||
|
||||
alignas(64) uint8_t lutEncIndex[4][32];
|
||||
alignas(64) uint8_t lutDecIndex[4][32];
|
||||
|
||||
static uint32_t mul_gf2(uint32_t b, uint32_t c)
|
||||
{
|
||||
uint32_t s = 0;
|
||||
@@ -115,5 +118,49 @@ static struct SAESInitializer
|
||||
lutDec2[i] = w; w = (w << 8) | (w >> 24);
|
||||
lutDec3[i] = w;
|
||||
}
|
||||
|
||||
memset(lutEncIndex, -1, sizeof(lutEncIndex));
|
||||
memset(lutDecIndex, -1, sizeof(lutDecIndex));
|
||||
|
||||
lutEncIndex[0][ 0] = 0;
|
||||
lutEncIndex[0][ 4] = 4;
|
||||
lutEncIndex[0][ 8] = 8;
|
||||
lutEncIndex[0][12] = 12;
|
||||
lutEncIndex[1][ 0] = 5;
|
||||
lutEncIndex[1][ 4] = 9;
|
||||
lutEncIndex[1][ 8] = 13;
|
||||
lutEncIndex[1][12] = 1;
|
||||
lutEncIndex[2][ 0] = 10;
|
||||
lutEncIndex[2][ 4] = 14;
|
||||
lutEncIndex[2][ 8] = 2;
|
||||
lutEncIndex[2][12] = 6;
|
||||
lutEncIndex[3][ 0] = 15;
|
||||
lutEncIndex[3][ 4] = 3;
|
||||
lutEncIndex[3][ 8] = 7;
|
||||
lutEncIndex[3][12] = 11;
|
||||
|
||||
lutDecIndex[0][ 0] = 0;
|
||||
lutDecIndex[0][ 4] = 4;
|
||||
lutDecIndex[0][ 8] = 8;
|
||||
lutDecIndex[0][12] = 12;
|
||||
lutDecIndex[1][ 0] = 13;
|
||||
lutDecIndex[1][ 4] = 1;
|
||||
lutDecIndex[1][ 8] = 5;
|
||||
lutDecIndex[1][12] = 9;
|
||||
lutDecIndex[2][ 0] = 10;
|
||||
lutDecIndex[2][ 4] = 14;
|
||||
lutDecIndex[2][ 8] = 2;
|
||||
lutDecIndex[2][12] = 6;
|
||||
lutDecIndex[3][ 0] = 7;
|
||||
lutDecIndex[3][ 4] = 11;
|
||||
lutDecIndex[3][ 8] = 15;
|
||||
lutDecIndex[3][12] = 3;
|
||||
|
||||
for (uint32_t i = 0; i < 4; ++i) {
|
||||
for (uint32_t j = 0; j < 16; j += 4) {
|
||||
lutEncIndex[i][j + 16] = lutEncIndex[i][j] + 16;
|
||||
lutDecIndex[i][j + 16] = lutDecIndex[i][j] + 16;
|
||||
}
|
||||
}
|
||||
}
|
||||
} aes_initializer;
|
||||
|
||||
@@ -41,6 +41,9 @@ extern uint32_t lutDec1[256];
|
||||
extern uint32_t lutDec2[256];
|
||||
extern uint32_t lutDec3[256];
|
||||
|
||||
extern uint8_t lutEncIndex[4][32];
|
||||
extern uint8_t lutDecIndex[4][32];
|
||||
|
||||
template<int soft> rx_vec_i128 aesenc(rx_vec_i128 in, rx_vec_i128 key);
|
||||
template<int soft> rx_vec_i128 aesdec(rx_vec_i128 in, rx_vec_i128 key);
|
||||
|
||||
|
||||
12
src/crypto/randomx/tests/riscv64_vector.s
Normal file
12
src/crypto/randomx/tests/riscv64_vector.s
Normal file
@@ -0,0 +1,12 @@
|
||||
/* RISC-V - test if the vector extension is present */
|
||||
|
||||
.text
|
||||
.option arch, rv64gcv
|
||||
.global main
|
||||
|
||||
main:
|
||||
li x5, 4
|
||||
vsetvli x6, x5, e64, m1, ta, ma
|
||||
vxor.vv v0, v0, v0
|
||||
sub x10, x5, x6
|
||||
ret
|
||||
11
src/crypto/randomx/tests/riscv64_zicbop.s
Normal file
11
src/crypto/randomx/tests/riscv64_zicbop.s
Normal file
@@ -0,0 +1,11 @@
|
||||
/* RISC-V - test if the prefetch instruction is present */
|
||||
|
||||
.text
|
||||
.option arch, rv64gc_zicbop
|
||||
.global main
|
||||
|
||||
main:
|
||||
lla x5, main
|
||||
prefetch.r (x5)
|
||||
mv x10, x0
|
||||
ret
|
||||
13
src/crypto/randomx/tests/riscv64_zvkb.s
Normal file
13
src/crypto/randomx/tests/riscv64_zvkb.s
Normal file
@@ -0,0 +1,13 @@
|
||||
/* RISC-V - test if the vector bit manipulation extension is present */
|
||||
|
||||
.text
|
||||
.option arch, rv64gcv_zvkb
|
||||
.global main
|
||||
|
||||
main:
|
||||
vsetivli zero, 8, e32, m1, ta, ma
|
||||
vror.vv v0, v0, v0
|
||||
vror.vx v0, v0, x5
|
||||
vror.vi v0, v0, 1
|
||||
li x10, 0
|
||||
ret
|
||||
12
src/crypto/randomx/tests/riscv64_zvkned.s
Normal file
12
src/crypto/randomx/tests/riscv64_zvkned.s
Normal file
@@ -0,0 +1,12 @@
|
||||
/* RISC-V - test if the vector bit manipulation extension is present */
|
||||
|
||||
.text
|
||||
.option arch, rv64gcv_zvkned
|
||||
.global main
|
||||
|
||||
main:
|
||||
vsetivli zero, 8, e32, m1, ta, ma
|
||||
vaesem.vv v0, v0
|
||||
vaesdm.vv v0, v0
|
||||
li x10, 0
|
||||
ret
|
||||
@@ -58,7 +58,7 @@ namespace randomx {
|
||||
void CompiledVm<softAes>::execute() {
|
||||
PROFILE_SCOPE(RandomX_JIT_execute);
|
||||
|
||||
# ifdef XMRIG_ARM
|
||||
# if defined(XMRIG_ARM) || defined(XMRIG_RISCV)
|
||||
memcpy(reg.f, config.eMask, sizeof(config.eMask));
|
||||
# endif
|
||||
compiler.getProgramFunc()(reg, mem, scratchpad, RandomX_CurrentConfig.ProgramIterations);
|
||||
|
||||
@@ -43,6 +43,12 @@ static void init_dataset_wrapper(randomx_dataset *dataset, randomx_cache *cache,
|
||||
randomx_init_dataset(dataset, cache, startItem, itemCount - (itemCount % 5));
|
||||
randomx_init_dataset(dataset, cache, startItem + itemCount - 5, 5);
|
||||
}
|
||||
#ifdef XMRIG_RISCV
|
||||
else if (itemCount % 4) {
|
||||
randomx_init_dataset(dataset, cache, startItem, itemCount - (itemCount % 4));
|
||||
randomx_init_dataset(dataset, cache, startItem + itemCount - 4, 4);
|
||||
}
|
||||
#endif
|
||||
else {
|
||||
randomx_init_dataset(dataset, cache, startItem, itemCount);
|
||||
}
|
||||
@@ -209,7 +215,7 @@ void xmrig::RxDataset::allocate(bool hugePages, bool oneGbPages)
|
||||
return;
|
||||
}
|
||||
|
||||
m_memory = new VirtualMemory(maxSize(), hugePages, oneGbPages, false, m_node);
|
||||
m_memory = new VirtualMemory(maxSize(), hugePages, oneGbPages, false, m_node, VirtualMemory::kDefaultHugePageSize);
|
||||
|
||||
if (m_memory->isOneGbPages()) {
|
||||
m_scratchpadOffset = maxSize() + RANDOMX_CACHE_MAX_SIZE;
|
||||
|
||||
@@ -115,7 +115,7 @@ static inline void checkHash(const JobBundle &bundle, std::vector<JobResult> &re
|
||||
static void getResults(JobBundle &bundle, std::vector<JobResult> &results, uint32_t &errors, bool hwAES)
|
||||
{
|
||||
const auto &algorithm = bundle.job.algorithm();
|
||||
auto memory = new VirtualMemory(algorithm.l3(), false, false, false);
|
||||
auto memory = new VirtualMemory(algorithm.l3(), false, false, false, 0, VirtualMemory::kDefaultHugePageSize);
|
||||
alignas(16) uint8_t hash[32]{ 0 };
|
||||
|
||||
if (algorithm.family() == Algorithm::RANDOM_X) {
|
||||
|
||||
@@ -2,18 +2,7 @@
|
||||
* Copyright (c) 2018-2025 SChernykh <https://github.com/SChernykh>
|
||||
* Copyright (c) 2016-2025 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
* SPDX-License-Identifier: GPL-3.0-or-later
|
||||
*/
|
||||
|
||||
#ifndef XMRIG_VERSION_H
|
||||
@@ -22,18 +11,20 @@
|
||||
#define APP_ID "xmrig"
|
||||
#define APP_NAME "XMRig"
|
||||
#define APP_DESC "XMRig miner"
|
||||
#define APP_VERSION "6.24.1-dev"
|
||||
#define APP_VERSION "6.25.1-dev"
|
||||
#define APP_DOMAIN "xmrig.com"
|
||||
#define APP_SITE "www.xmrig.com"
|
||||
#define APP_COPYRIGHT "Copyright (C) 2016-2025 xmrig.com"
|
||||
#define APP_KIND "miner"
|
||||
|
||||
#define APP_VER_MAJOR 6
|
||||
#define APP_VER_MINOR 24
|
||||
#define APP_VER_MINOR 25
|
||||
#define APP_VER_PATCH 1
|
||||
|
||||
#ifdef _MSC_VER
|
||||
# if (_MSC_VER >= 1930)
|
||||
# if (_MSC_VER >= 1950)
|
||||
# define MSVC_VERSION 2026
|
||||
# elif (_MSC_VER >=1930 && _MSC_VER < 1950)
|
||||
# define MSVC_VERSION 2022
|
||||
# elif (_MSC_VER >= 1920 && _MSC_VER < 1930)
|
||||
# define MSVC_VERSION 2019
|
||||
|
||||
Reference in New Issue
Block a user