Merge pull request #3826 from SChernykh/dev

ISUB_R fix for ARM/RISC-V JIT
2026-06-28 05:37:39 -04:00 · 2026-06-27 20:33:57 +07:00 · 2026-06-27 13:22:48 +02:00 · 2026-05-26 00:21:52 +07:00 · 2026-05-25 18:05:50 +02:00 · 2026-05-25 13:46:41 +02:00
6 changed files with 65 additions and 20 deletions
@@ -74,6 +74,11 @@
 #endif


+#ifndef MADV_COLLAPSE
+#   define MADV_COLLAPSE 25
+#endif
+
+
 #if defined(XMRIG_OS_LINUX) || (!defined(XMRIG_OS_APPLE) && !defined(XMRIG_OS_FREEBSD))
 static inline int hugePagesFlag(size_t size)
 {
@@ -278,8 +283,9 @@ bool xmrig::VirtualMemory::allocateOneGbPagesMemory()

 bool xmrig::VirtualMemory::adviseLargePages(void *p, size_t size)
 {
-#   ifdef XMRIG_OS_LINUX
-    return (madvise(p, size, MADV_HUGEPAGE) == 0);
+#   if defined(XMRIG_OS_ANDROID) || defined(XMRIG_OS_LINUX)
+    // MADV_COLLAPSE works even if /sys/kernel/mm/transparent_hugepage/enabled is set to "never", but only on Linux 6.1+
+    return (madvise(p, size, MADV_COLLAPSE) == 0) || (madvise(p, size, MADV_HUGEPAGE) == 0);
 #   else
    return false;
 #   endif
@@ -141,7 +141,7 @@ void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& con

 	codePos = PrologueSize;
 	literalPos = ImulRcpLiteralsEnd;
-	num32bitLiterals = 0;
+	num32bitLiterals = 64; // effectively disabled because it's slower than plain movn/movz+movk

 	for (uint32_t i = 0; i < RegistersCount; ++i)
 		reg_changed_offset[i] = codePos;
@@ -237,7 +237,7 @@ void JitCompilerA64::generateProgramLight(Program& program, ProgramConfiguration

 	codePos = PrologueSize;
 	literalPos = ImulRcpLiteralsEnd;
-	num32bitLiterals = 0;
+	num32bitLiterals = 64; // effectively disabled because it's slower than plain movn/movz+movk

 	for (uint32_t i = 0; i < RegistersCount; ++i)
 		reg_changed_offset[i] = codePos;
@@ -488,13 +488,31 @@ void JitCompilerA64::emitMovImmediate(uint32_t dst, uint32_t imm, uint8_t* code,
 {
 	uint32_t k = codePos;

+	// 196606 different values can be encoded with a single instruction; the rest require a smov/umov load or a movn/movz+movk pair
 	if (imm < (1 << 16))
 	{
+		// Sign-extended 64-bit value: 0x000000000000xxxx
 		// movz tmp_reg, imm32 (16 low bits)
 		emit32(ARMV8A::MOVZ | dst | (imm << 5), code, k);
 	}
+	else if ((imm >> 16) == 0xFFFF) {
+		// Sign-extended 64-bit value: 0xFFFFFFFFFFFFxxxx
+		// movn tmp_reg, ~imm32 (16 low bits)
+		emit32(ARMV8A::MOVN | dst | ((~imm & 0xFFFF) << 5), code, k);
+	}
+	else if (((imm & 0xFFFF) == 0xFFFF) && (static_cast<int32_t>(imm) < 0)) {
+		// Sign-extended 64-bit value: 0xFFFFFFFFxxxxFFFF
+		// movn tmp_reg, ~imm32 (16 high bits)
+		emit32(ARMV8A::MOVN | dst | (1 << 21) | ((~imm >> 16) << 5), code, k);
+	}
+	else if (((imm & 0xFFFF) == 0) && (static_cast<int32_t>(imm) >= 0)) {
+		// Sign-extended 64-bit value: 0x00000000xxxx0000
+		// movz tmp_reg, imm32 (16 high bits)
+		emit32(ARMV8A::MOVZ | dst | (1 << 21) | ((imm >> 16) << 5), code, k);
+	}
 	else
 	{
+		// Full sign-extended 64-bit value: 0x00000000xxxxxxxx or 0xFFFFFFFFxxxxxxxx
 		if (num32bitLiterals < 64)
 		{
 			if (static_cast<int32_t>(imm) < 0)
@@ -611,18 +629,17 @@ void JitCompilerA64::emitMemLoad(uint32_t dst, uint32_t src, Instruction& instr,
 	else
 	{
 		imm = (imm & ScratchpadL3Mask) >> 3;
-		if (imm)
+		if (imm < 4096) {
+			// ldr tmp_reg, [x2, #imm*8]
+			emit32(0xf9400040 | tmp_reg | (imm << 10), code, k);
+		}
+		else
 		{
 			emitMovImmediate(tmp_reg, imm, code, k);

 			// ldr tmp_reg, [x2, tmp_reg, lsl 3]
 			emit32(0xf8607840 | tmp_reg | (tmp_reg << 16), code, k);
 		}
-		else
-		{
-			// ldr tmp_reg, [x2]
-			emit32(0xf9400040 | tmp_reg, code, k);
-		}
 	}

 	codePos = k;
@@ -709,8 +726,17 @@ void JitCompilerA64::h_ISUB_R(Instruction& instr, uint32_t& codePos)
 	}
 	else
 	{
+		const uint32_t imm = instr.getImm32();
+
+		if (imm == 0x80000000ul) {
+			constexpr uint32_t tmp_reg = 20;
+			emit32(ARMV8A::MOVZ | tmp_reg | (1u << 21) | (0x8000u << 5), code, k);
+			emit32(ARMV8A::ADD | dst | (dst << 5) | (tmp_reg << 16), code, k);
+		}
+		else {
 			emitAddImmediate(dst, dst, -instr.getImm32(), code, k);
 		}
+	}

 	reg_changed_offset[instr.dst] = k;
 	codePos = k;
@@ -303,7 +303,7 @@ DECL(randomx_program_aarch64_cacheline_align_mask1):
 	add	x20, x20, x1

 	# Prefetch dataset data
-	prfm	pldl2strm, [x20]
+	prfm	pldl1strm, [x20]

 DECL(randomx_program_aarch64_cacheline_align_mask2):
 	# Actual mask will be inserted by JIT compiler
@@ -312,16 +312,16 @@ DECL(randomx_program_aarch64_cacheline_align_mask2):

 DECL(randomx_program_aarch64_xor_with_dataset_line):
 	# xor integer registers with dataset data
-	ldp	x20, x19, [x10]
+	ldnp	x20, x19, [x10]
 	eor	x4, x4, x20
 	eor	x5, x5, x19
-	ldp	x20, x19, [x10, 16]
+	ldnp	x20, x19, [x10, 16]
 	eor	x6, x6, x20
 	eor	x7, x7, x19
-	ldp	x20, x19, [x10, 32]
+	ldnp	x20, x19, [x10, 32]
 	eor	x12, x12, x20
 	eor	x13, x13, x19
-	ldp	x20, x19, [x10, 48]
+	ldnp	x20, x19, [x10, 48]
 	eor	x14, x14, x20
 	eor	x15, x15, x19

@@ -814,11 +814,18 @@ namespace randomx {
 			state.emit(rvi(rv64::SUB, regR(isn.dst), regR(isn.dst), regR(isn.src)));
 		}
 		else {
-			int32_t imm = unsigned32ToSigned2sCompl(-isn.getImm32()); //convert to add
+			const uint32_t uimm = isn.getImm32();
+			if (uimm == 0x80000000ul) {
+				state.emit(rv64::LUI | (0x80000 << 12) | rvrd(Tmp1Reg));
+				state.emit(rvi(rv64::SUB, regR(isn.dst), regR(isn.dst), Tmp1Reg));
+			}
+			else {
+				int32_t imm = unsigned32ToSigned2sCompl(-uimm); //convert to add
 				//x{dst} = x{dst} + {-imm}
 				emitImm32(state, imm, regR(isn.dst), regR(isn.dst), Tmp1Reg);
 			}
 		}
+	}

 	void JitCompilerRV64::v1_ISUB_M(HANDLER_ARGS) {
 		state.registerUsage[isn.dst] = i;
@@ -444,6 +444,12 @@ void* generateProgramVectorRV64(uint8_t* buf, Program& prog, ProgramConfiguratio
 				// sub x20 + dst, x20 + dst, x20 + src
 				emit32(0x414A0A33 + (dst << 7) + (dst << 15) + (src << 20));
 			}
+			else if (imm == 0x80000000U) {
+				// lui x5, 0x80000 (x5 = 0x80000000 sign-extended to 64 bits)
+				emit32(0x800002B7);
+				// sub x20 + dst, x20 + dst, x5
+				emit32(0x405A0A33 + (dst << 7) + (dst << 15));
+			}
 			else {
 				imm_to_x5(-imm, p);
 				// c.add x20 + dst, x5
@@ -163,7 +163,7 @@ extern RandomX_ConfigurationGraft RandomX_GraftConfig;
 extern RandomX_ConfigurationSafex RandomX_SafexConfig;
 extern RandomX_ConfigurationYada RandomX_YadaConfig;

-extern RandomX_ConfigurationBase RandomX_CurrentConfig;
+alignas(64) extern RandomX_ConfigurationBase RandomX_CurrentConfig;

 template<typename T>
 void randomx_apply_config(const T& config)
Author	SHA1	Message	Date
xmrig	6dc014f71f	Merge pull request #3826 from SChernykh/dev ISUB_R fix for ARM/RISC-V JIT	2026-06-27 20:33:57 +07:00
SChernykh	906fd4693b	ISUB_R fix for ARM/RISC-V JIT	2026-06-27 13:22:48 +02:00
xmrig	3fb851d91d	Merge pull request #3820 from aa022/dev ARM64 RandomX JIT: dataset prefetch + non-temporal loads (+~8% on M4 base)	2026-05-26 00:21:52 +07:00
aa022	9ac373fea5	ARM64 RandomX JIT: drop early dataset prefetch	2026-05-25 18:05:50 +02:00
aa022	978720462d	ARM64 RandomX JIT: dataset prefetch + non-temporal loads Two Apple-silicon-targeted tweaks to the aarch64 RandomX JIT: - Early dataset prefetch: when readReg2/readReg3 are finalized well before the end of the program body, emit the next iteration's dataset-line prefetch early to hide more DRAM latency on the serial scalar chain. - Non-temporal dataset loads: each 64-byte dataset line is read once and never reused, so ldp -> ldnp avoids evicting the hot scratchpad, and the prefetch hint moves pldl2strm -> pldl1strm to match the longer lead time. Measured ~8% hashrate gain on Apple M4 base over dev (`7eadfdc9`).	2026-05-25 13:46:41 +02:00
xmrig	7eadfdc9c6	Merge pull request #3816 from SChernykh/dev ARM64 RandomX optimizations	2026-05-18 02:12:24 +07:00
SChernykh	720325c40f	RandomX optimizations: - ARM64: optimized emitMovImmediate/emitMemLoad - ARM64: disabled 32-bit literal preloading (it was slower) - Android and Linux: added MADV_COLLAPSE support to memory allocation	2026-05-17 21:04:02 +02:00