mirror of
https://github.com/xmrig/xmrig.git
synced 2026-04-17 21:12:58 -04:00
Minor AArch64 JIT changes (better instruction selection, don't emit instructions that add 0, etc.)
This commit is contained in:
@@ -100,9 +100,9 @@
|
||||
# v26 -> "a2"
|
||||
# v27 -> "a3"
|
||||
# v28 -> temporary
|
||||
# v29 -> E 'and' mask = 0x00ffffffffffffff00ffffffffffffff
|
||||
# v30 -> E 'or' mask = 0x3*00000000******3*00000000******
|
||||
# v31 -> scale mask = 0x81f000000000000081f0000000000000
|
||||
# v29 -> E 'and' mask = 0x00ffffffffffffff'00ffffffffffffff
|
||||
# v30 -> E 'or' mask = 0x3*00000000******'3*00000000******
|
||||
# v31 -> scale mask = 0x80f0000000000000'80f0000000000000
|
||||
|
||||
.balign 4
|
||||
DECL(randomx_program_aarch64):
|
||||
@@ -142,17 +142,14 @@ DECL(randomx_program_aarch64):
|
||||
ldp q26, q27, [x0, 224]
|
||||
|
||||
# Load E 'and' mask
|
||||
mov x16, 0x00FFFFFFFFFFFFFF
|
||||
ins v29.d[0], x16
|
||||
ins v29.d[1], x16
|
||||
movi.2d v29, #0x00FFFFFFFFFFFFFF
|
||||
|
||||
# Load E 'or' mask (stored in reg.f[0])
|
||||
ldr q30, [x0, 64]
|
||||
|
||||
# Load scale mask
|
||||
mov x16, 0x80f0000000000000
|
||||
ins v31.d[0], x16
|
||||
ins v31.d[1], x16
|
||||
dup v31.2d, x16
|
||||
|
||||
# Read fpcr
|
||||
mrs x8, fpcr
|
||||
@@ -162,35 +159,22 @@ DECL(randomx_program_aarch64):
|
||||
str x0, [sp, -16]!
|
||||
|
||||
# Read literals
|
||||
ldr x0, literal_x0
|
||||
ldr x11, literal_x11
|
||||
ldr x21, literal_x21
|
||||
ldr x22, literal_x22
|
||||
ldr x23, literal_x23
|
||||
ldr x24, literal_x24
|
||||
ldr x25, literal_x25
|
||||
ldr x26, literal_x26
|
||||
ldr x27, literal_x27
|
||||
ldr x28, literal_x28
|
||||
ldr x29, literal_x29
|
||||
ldr x30, literal_x30
|
||||
adr x30, literal_v0
|
||||
ldp q0, q1, [x30]
|
||||
ldp q2, q3, [x30, 32]
|
||||
ldp q4, q5, [x30, 64]
|
||||
ldp q6, q7, [x30, 96]
|
||||
ldp q8, q9, [x30, 128]
|
||||
ldp q10, q11, [x30, 160]
|
||||
ldp q12, q13, [x30, 192]
|
||||
ldp q14, q15, [x30, 224]
|
||||
|
||||
ldr q0, literal_v0
|
||||
ldr q1, literal_v1
|
||||
ldr q2, literal_v2
|
||||
ldr q3, literal_v3
|
||||
ldr q4, literal_v4
|
||||
ldr q5, literal_v5
|
||||
ldr q6, literal_v6
|
||||
ldr q7, literal_v7
|
||||
ldr q8, literal_v8
|
||||
ldr q9, literal_v9
|
||||
ldr q10, literal_v10
|
||||
ldr q11, literal_v11
|
||||
ldr q12, literal_v12
|
||||
ldr q13, literal_v13
|
||||
ldr q14, literal_v14
|
||||
ldr q15, literal_v15
|
||||
ldp x0, x11, [x30, -96] // literal_x0
|
||||
ldp x21, x22, [x30, -80] // literal_x21
|
||||
ldp x23, x24, [x30, -64] // literal_x23
|
||||
ldp x25, x26, [x30, -48] // literal_x25
|
||||
ldp x27, x28, [x30, -32] // literal_x27
|
||||
ldp x29, x30, [x30, -16] // literal_x29
|
||||
|
||||
DECL(randomx_program_aarch64_main_loop):
|
||||
# spAddr0 = spMix1 & ScratchpadL3Mask64;
|
||||
@@ -221,40 +205,31 @@ DECL(randomx_program_aarch64_main_loop):
|
||||
eor x15, x15, x19
|
||||
|
||||
# Load group F registers (spAddr1)
|
||||
ldpsw x20, x19, [x17]
|
||||
ins v16.d[0], x20
|
||||
ins v16.d[1], x19
|
||||
ldpsw x20, x19, [x17, 8]
|
||||
ins v17.d[0], x20
|
||||
ins v17.d[1], x19
|
||||
ldpsw x20, x19, [x17, 16]
|
||||
ins v18.d[0], x20
|
||||
ins v18.d[1], x19
|
||||
ldpsw x20, x19, [x17, 24]
|
||||
ins v19.d[0], x20
|
||||
ins v19.d[1], x19
|
||||
scvtf v16.2d, v16.2d
|
||||
scvtf v17.2d, v17.2d
|
||||
scvtf v18.2d, v18.2d
|
||||
scvtf v19.2d, v19.2d
|
||||
ldr q17, [x17]
|
||||
sxtl.2d v16, v17
|
||||
scvtf.2d v16, v16
|
||||
sxtl2.2d v17, v17
|
||||
scvtf.2d v17, v17
|
||||
|
||||
ldr q19, [x17, 16]
|
||||
sxtl.2d v18, v19
|
||||
scvtf.2d v18, v18
|
||||
sxtl2.2d v19, v19
|
||||
scvtf.2d v19, v19
|
||||
|
||||
# Load group E registers (spAddr1)
|
||||
ldpsw x20, x19, [x17, 32]
|
||||
ins v20.d[0], x20
|
||||
ins v20.d[1], x19
|
||||
ldpsw x20, x19, [x17, 40]
|
||||
ins v21.d[0], x20
|
||||
ins v21.d[1], x19
|
||||
ldpsw x20, x19, [x17, 48]
|
||||
ins v22.d[0], x20
|
||||
ins v22.d[1], x19
|
||||
ldpsw x20, x19, [x17, 56]
|
||||
ins v23.d[0], x20
|
||||
ins v23.d[1], x19
|
||||
scvtf v20.2d, v20.2d
|
||||
scvtf v21.2d, v21.2d
|
||||
scvtf v22.2d, v22.2d
|
||||
scvtf v23.2d, v23.2d
|
||||
ldr q21, [x17, 32]
|
||||
sxtl.2d v20, v21
|
||||
scvtf.2d v20, v20
|
||||
sxtl2.2d v21, v21
|
||||
scvtf.2d v21, v21
|
||||
|
||||
ldr q23, [x17, 48]
|
||||
sxtl.2d v22, v23
|
||||
scvtf.2d v22, v22
|
||||
sxtl2.2d v23, v23
|
||||
scvtf.2d v23, v23
|
||||
|
||||
and v20.16b, v20.16b, v29.16b
|
||||
and v21.16b, v21.16b, v29.16b
|
||||
and v22.16b, v22.16b, v29.16b
|
||||
@@ -310,10 +285,9 @@ DECL(randomx_program_aarch64_vm_instructions_end):
|
||||
eor x9, x9, x20
|
||||
|
||||
# Calculate dataset pointer for dataset prefetch
|
||||
mov w20, w9
|
||||
DECL(randomx_program_aarch64_cacheline_align_mask1):
|
||||
# Actual mask will be inserted by JIT compiler
|
||||
and x20, x20, 1
|
||||
and x20, x9, 1
|
||||
add x20, x20, x1
|
||||
|
||||
# Prefetch dataset data
|
||||
@@ -491,42 +465,39 @@ DECL(randomx_calc_dataset_item_aarch64):
|
||||
stp x10, x11, [sp, 80]
|
||||
stp x12, x13, [sp, 96]
|
||||
|
||||
ldr x12, superscalarMul0
|
||||
adr x7, superscalarMul0
|
||||
# superscalarMul0, superscalarAdd1
|
||||
ldp x12, x13, [x7]
|
||||
|
||||
mov x8, x0
|
||||
mov x9, x1
|
||||
ldp x8, x9, [sp]
|
||||
mov x10, x2
|
||||
|
||||
# rl[0] = (itemNumber + 1) * superscalarMul0;
|
||||
madd x0, x2, x12, x12
|
||||
|
||||
# rl[1] = rl[0] ^ superscalarAdd1;
|
||||
ldr x12, superscalarAdd1
|
||||
eor x1, x0, x12
|
||||
eor x1, x0, x13
|
||||
|
||||
# rl[2] = rl[0] ^ superscalarAdd2;
|
||||
ldr x12, superscalarAdd2
|
||||
ldp x12, x13, [x7, 16]
|
||||
eor x2, x0, x12
|
||||
|
||||
# rl[3] = rl[0] ^ superscalarAdd3;
|
||||
ldr x12, superscalarAdd3
|
||||
eor x3, x0, x12
|
||||
eor x3, x0, x13
|
||||
|
||||
# rl[4] = rl[0] ^ superscalarAdd4;
|
||||
ldr x12, superscalarAdd4
|
||||
ldp x12, x13, [x7, 32]
|
||||
eor x4, x0, x12
|
||||
|
||||
# rl[5] = rl[0] ^ superscalarAdd5;
|
||||
ldr x12, superscalarAdd5
|
||||
eor x5, x0, x12
|
||||
eor x5, x0, x13
|
||||
|
||||
# rl[6] = rl[0] ^ superscalarAdd6;
|
||||
ldr x12, superscalarAdd6
|
||||
ldp x12, x13, [x7, 48]
|
||||
eor x6, x0, x12
|
||||
|
||||
# rl[7] = rl[0] ^ superscalarAdd7;
|
||||
ldr x12, superscalarAdd7
|
||||
eor x7, x0, x12
|
||||
eor x7, x0, x13
|
||||
|
||||
b DECL(randomx_calc_dataset_item_aarch64_prefetch)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user