1
0
mirror of https://github.com/xmrig/xmrig.git synced 2025-12-07 16:05:05 -05:00
Files
xmrig/src/backend/opencl/cl/rx/fillAes1Rx4.cl
2019-09-18 23:46:51 +07:00

119 lines
5.6 KiB
Common Lisp

/*
Copyright (c) 2019 SChernykh
This file is part of RandomX OpenCL.
RandomX OpenCL is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
RandomX OpenCL is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with RandomX OpenCL. If not, see <http://www.gnu.org/licenses/>.
*/
__attribute__((reqd_work_group_size(64, 1, 1)))
__kernel void fillAes_name(__global void* state, __global void* out, uint batch_size, uint rx_version)
{
__local uint T[2048];
const uint global_index = get_global_id(0);
if (global_index >= batch_size * 4)
return;
const uint idx = global_index / 4;
const uint sub = global_index % 4;
for (uint i = get_local_id(0), step = get_local_size(0); i < 2048; i += step)
T[i] = AES_TABLE[i];
barrier(CLK_LOCAL_MEM_FENCE);
#if num_rounds != 4
const uint k[4] = { AES_KEY_FILL[sub * 4], AES_KEY_FILL[sub * 4 + 1], AES_KEY_FILL[sub * 4 + 2], AES_KEY_FILL[sub * 4 + 3] };
#else
const bool b1 = (rx_version < 104);
const bool b2 = (sub < 2);
uint k[16];
k[ 0] = b1 ? 0xf890465du : (b2 ? 0x6421aaddu : 0xb5826f73u);
k[ 1] = b1 ? 0x7ffbe4a6u : (b2 ? 0xd1833ddbu : 0xe3d6a7a6u);
k[ 2] = b1 ? 0x141f82b7u : (b2 ? 0x2f546d2bu : 0x3d518b6du);
k[ 3] = b1 ? 0xcf359e95u : (b2 ? 0x99e5d23fu : 0x229effb4u);
k[ 4] = b1 ? 0x6a55c450u : (b2 ? 0xb20e3450u : 0xc7566bf3u);
k[ 5] = b1 ? 0xfee8278au : (b2 ? 0xb6913f55u : 0x9c10b3d9u);
k[ 6] = b1 ? 0xbd5c5ac3u : (b2 ? 0x06f79d53u : 0xe9024d4eu);
k[ 7] = b1 ? 0x6741ffdcu : (b2 ? 0xa5dfcde5u : 0xb272b7d2u);
k[ 8] = b1 ? 0x114c47a4u : (b2 ? 0x5c3ed904u : 0xf273c9e7u);
k[ 9] = b1 ? 0xd524fde4u : (b2 ? 0x515e7bafu : 0xf765a38bu);
k[10] = b1 ? 0xa7279ad2u : (b2 ? 0x0aa4679fu : 0x2ba9660au);
k[11] = b1 ? 0x3d324aacu : (b2 ? 0x171c02bfu : 0xf63befa7u);
k[12] = b1 ? 0x810c3a2au : (b2 ? 0x85623763u : 0x7a7cd609u);
k[13] = b1 ? 0x99a9aeffu : (b2 ? 0xe78f5d08u : 0x915839deu);
k[14] = b1 ? 0x42d3dbd9u : (b2 ? 0xcd673785u : 0x0c06d1fdu);
k[15] = b1 ? 0x76f6db08u : (b2 ? 0xd8ded291u : 0xc0b0762du);
#endif
__global uint* s = ((__global uint*) state) + idx * (64 / sizeof(uint)) + sub * (16 / sizeof(uint));
uint x[4] = { s[0], s[1], s[2], s[3] };
const uint s1 = (sub & 1) ? 8 : 24;
const uint s3 = (sub & 1) ? 24 : 8;
__global uint4* p = ((__global uint4*) out) + idx * (outputSize0 / sizeof(uint4)) + sub;
const __local uint* const t0 = (sub & 1) ? T : (T + 1024);
const __local uint* const t1 = (sub & 1) ? (T + 256) : (T + 1792);
const __local uint* const t2 = (sub & 1) ? (T + 512) : (T + 1536);
const __local uint* const t3 = (sub & 1) ? (T + 768) : (T + 1280);
#pragma unroll unroll_factor
for (uint i = 0; i < outputSize / sizeof(uint4); i += 4, p += 4)
{
uint y[4];
#if num_rounds != 4
y[0] = t0[get_byte32(x[0], 0)] ^ t1[get_byte32(x[1], s1)] ^ t2[get_byte32(x[2], 16)] ^ t3[get_byte32(x[3], s3)] ^ k[0];
y[1] = t0[get_byte32(x[1], 0)] ^ t1[get_byte32(x[2], s1)] ^ t2[get_byte32(x[3], 16)] ^ t3[get_byte32(x[0], s3)] ^ k[1];
y[2] = t0[get_byte32(x[2], 0)] ^ t1[get_byte32(x[3], s1)] ^ t2[get_byte32(x[0], 16)] ^ t3[get_byte32(x[1], s3)] ^ k[2];
y[3] = t0[get_byte32(x[3], 0)] ^ t1[get_byte32(x[0], s1)] ^ t2[get_byte32(x[1], 16)] ^ t3[get_byte32(x[2], s3)] ^ k[3];
*p = *(uint4*)(y);
x[0] = y[0];
x[1] = y[1];
x[2] = y[2];
x[3] = y[3];
#else
y[0] = t0[get_byte32(x[0], 0)] ^ t1[get_byte32(x[1], s1)] ^ t2[get_byte32(x[2], 16)] ^ t3[get_byte32(x[3], s3)] ^ k[ 0];
y[1] = t0[get_byte32(x[1], 0)] ^ t1[get_byte32(x[2], s1)] ^ t2[get_byte32(x[3], 16)] ^ t3[get_byte32(x[0], s3)] ^ k[ 1];
y[2] = t0[get_byte32(x[2], 0)] ^ t1[get_byte32(x[3], s1)] ^ t2[get_byte32(x[0], 16)] ^ t3[get_byte32(x[1], s3)] ^ k[ 2];
y[3] = t0[get_byte32(x[3], 0)] ^ t1[get_byte32(x[0], s1)] ^ t2[get_byte32(x[1], 16)] ^ t3[get_byte32(x[2], s3)] ^ k[ 3];
x[0] = t0[get_byte32(y[0], 0)] ^ t1[get_byte32(y[1], s1)] ^ t2[get_byte32(y[2], 16)] ^ t3[get_byte32(y[3], s3)] ^ k[ 4];
x[1] = t0[get_byte32(y[1], 0)] ^ t1[get_byte32(y[2], s1)] ^ t2[get_byte32(y[3], 16)] ^ t3[get_byte32(y[0], s3)] ^ k[ 5];
x[2] = t0[get_byte32(y[2], 0)] ^ t1[get_byte32(y[3], s1)] ^ t2[get_byte32(y[0], 16)] ^ t3[get_byte32(y[1], s3)] ^ k[ 6];
x[3] = t0[get_byte32(y[3], 0)] ^ t1[get_byte32(y[0], s1)] ^ t2[get_byte32(y[1], 16)] ^ t3[get_byte32(y[2], s3)] ^ k[ 7];
y[0] = t0[get_byte32(x[0], 0)] ^ t1[get_byte32(x[1], s1)] ^ t2[get_byte32(x[2], 16)] ^ t3[get_byte32(x[3], s3)] ^ k[ 8];
y[1] = t0[get_byte32(x[1], 0)] ^ t1[get_byte32(x[2], s1)] ^ t2[get_byte32(x[3], 16)] ^ t3[get_byte32(x[0], s3)] ^ k[ 9];
y[2] = t0[get_byte32(x[2], 0)] ^ t1[get_byte32(x[3], s1)] ^ t2[get_byte32(x[0], 16)] ^ t3[get_byte32(x[1], s3)] ^ k[10];
y[3] = t0[get_byte32(x[3], 0)] ^ t1[get_byte32(x[0], s1)] ^ t2[get_byte32(x[1], 16)] ^ t3[get_byte32(x[2], s3)] ^ k[11];
x[0] = t0[get_byte32(y[0], 0)] ^ t1[get_byte32(y[1], s1)] ^ t2[get_byte32(y[2], 16)] ^ t3[get_byte32(y[3], s3)] ^ k[12];
x[1] = t0[get_byte32(y[1], 0)] ^ t1[get_byte32(y[2], s1)] ^ t2[get_byte32(y[3], 16)] ^ t3[get_byte32(y[0], s3)] ^ k[13];
x[2] = t0[get_byte32(y[2], 0)] ^ t1[get_byte32(y[3], s1)] ^ t2[get_byte32(y[0], 16)] ^ t3[get_byte32(y[1], s3)] ^ k[14];
x[3] = t0[get_byte32(y[3], 0)] ^ t1[get_byte32(y[0], s1)] ^ t2[get_byte32(y[1], 16)] ^ t3[get_byte32(y[2], s3)] ^ k[15];
*p = *(uint4*)(x);
#endif
}
*(__global uint4*)(s) = *(uint4*)(x);
}