mirror of
https://github.com/xmrig/xmrig.git
synced 2026-02-01 01:42:59 -05:00
Added OpenCL source.
This commit is contained in:
120
src/backend/opencl/cl/rx/fillAes1Rx4.cl
Normal file
120
src/backend/opencl/cl/rx/fillAes1Rx4.cl
Normal file
@@ -0,0 +1,120 @@
|
||||
R"===(
|
||||
/*
|
||||
Copyright (c) 2019 SChernykh
|
||||
|
||||
This file is part of RandomX OpenCL.
|
||||
|
||||
RandomX OpenCL is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
RandomX OpenCL is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with RandomX OpenCL. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
__attribute__((reqd_work_group_size(64, 1, 1)))
|
||||
__kernel void fillAes_name(__global void* state, __global void* out, uint batch_size, uint rx_version)
|
||||
{
|
||||
__local uint T[2048];
|
||||
|
||||
const uint global_index = get_global_id(0);
|
||||
if (global_index >= batch_size * 4)
|
||||
return;
|
||||
|
||||
const uint idx = global_index / 4;
|
||||
const uint sub = global_index % 4;
|
||||
|
||||
for (uint i = get_local_id(0), step = get_local_size(0); i < 2048; i += step)
|
||||
T[i] = AES_TABLE[i];
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
#if num_rounds != 4
|
||||
const uint k[4] = { AES_KEY_FILL[sub * 4], AES_KEY_FILL[sub * 4 + 1], AES_KEY_FILL[sub * 4 + 2], AES_KEY_FILL[sub * 4 + 3] };
|
||||
#else
|
||||
const bool b1 = (rx_version < 104);
|
||||
const bool b2 = (sub < 2);
|
||||
|
||||
uint k[16];
|
||||
k[ 0] = b1 ? 0xf890465du : (b2 ? 0x6421aaddu : 0xb5826f73u);
|
||||
k[ 1] = b1 ? 0x7ffbe4a6u : (b2 ? 0xd1833ddbu : 0xe3d6a7a6u);
|
||||
k[ 2] = b1 ? 0x141f82b7u : (b2 ? 0x2f546d2bu : 0x3d518b6du);
|
||||
k[ 3] = b1 ? 0xcf359e95u : (b2 ? 0x99e5d23fu : 0x229effb4u);
|
||||
k[ 4] = b1 ? 0x6a55c450u : (b2 ? 0xb20e3450u : 0xc7566bf3u);
|
||||
k[ 5] = b1 ? 0xfee8278au : (b2 ? 0xb6913f55u : 0x9c10b3d9u);
|
||||
k[ 6] = b1 ? 0xbd5c5ac3u : (b2 ? 0x06f79d53u : 0xe9024d4eu);
|
||||
k[ 7] = b1 ? 0x6741ffdcu : (b2 ? 0xa5dfcde5u : 0xb272b7d2u);
|
||||
k[ 8] = b1 ? 0x114c47a4u : (b2 ? 0x5c3ed904u : 0xf273c9e7u);
|
||||
k[ 9] = b1 ? 0xd524fde4u : (b2 ? 0x515e7bafu : 0xf765a38bu);
|
||||
k[10] = b1 ? 0xa7279ad2u : (b2 ? 0x0aa4679fu : 0x2ba9660au);
|
||||
k[11] = b1 ? 0x3d324aacu : (b2 ? 0x171c02bfu : 0xf63befa7u);
|
||||
k[12] = b1 ? 0x810c3a2au : (b2 ? 0x85623763u : 0x7a7cd609u);
|
||||
k[13] = b1 ? 0x99a9aeffu : (b2 ? 0xe78f5d08u : 0x915839deu);
|
||||
k[14] = b1 ? 0x42d3dbd9u : (b2 ? 0xcd673785u : 0x0c06d1fdu);
|
||||
k[15] = b1 ? 0x76f6db08u : (b2 ? 0xd8ded291u : 0xc0b0762du);
|
||||
#endif
|
||||
|
||||
__global uint* s = ((__global uint*) state) + idx * (64 / sizeof(uint)) + sub * (16 / sizeof(uint));
|
||||
uint x[4] = { s[0], s[1], s[2], s[3] };
|
||||
|
||||
const uint s1 = (sub & 1) ? 8 : 24;
|
||||
const uint s3 = (sub & 1) ? 24 : 8;
|
||||
|
||||
__global uint4* p = ((__global uint4*) out) + idx * (outputSize0 / sizeof(uint4)) + sub;
|
||||
|
||||
const __local uint* const t0 = (sub & 1) ? T : (T + 1024);
|
||||
const __local uint* const t1 = (sub & 1) ? (T + 256) : (T + 1792);
|
||||
const __local uint* const t2 = (sub & 1) ? (T + 512) : (T + 1536);
|
||||
const __local uint* const t3 = (sub & 1) ? (T + 768) : (T + 1280);
|
||||
|
||||
#pragma unroll(unroll_factor)
|
||||
for (uint i = 0; i < outputSize / sizeof(uint4); i += 4, p += 4)
|
||||
{
|
||||
uint y[4];
|
||||
|
||||
#if num_rounds != 4
|
||||
y[0] = t0[get_byte32(x[0], 0)] ^ t1[get_byte32(x[1], s1)] ^ t2[get_byte32(x[2], 16)] ^ t3[get_byte32(x[3], s3)] ^ k[0];
|
||||
y[1] = t0[get_byte32(x[1], 0)] ^ t1[get_byte32(x[2], s1)] ^ t2[get_byte32(x[3], 16)] ^ t3[get_byte32(x[0], s3)] ^ k[1];
|
||||
y[2] = t0[get_byte32(x[2], 0)] ^ t1[get_byte32(x[3], s1)] ^ t2[get_byte32(x[0], 16)] ^ t3[get_byte32(x[1], s3)] ^ k[2];
|
||||
y[3] = t0[get_byte32(x[3], 0)] ^ t1[get_byte32(x[0], s1)] ^ t2[get_byte32(x[1], 16)] ^ t3[get_byte32(x[2], s3)] ^ k[3];
|
||||
|
||||
*p = *(uint4*)(y);
|
||||
|
||||
x[0] = y[0];
|
||||
x[1] = y[1];
|
||||
x[2] = y[2];
|
||||
x[3] = y[3];
|
||||
#else
|
||||
y[0] = t0[get_byte32(x[0], 0)] ^ t1[get_byte32(x[1], s1)] ^ t2[get_byte32(x[2], 16)] ^ t3[get_byte32(x[3], s3)] ^ k[ 0];
|
||||
y[1] = t0[get_byte32(x[1], 0)] ^ t1[get_byte32(x[2], s1)] ^ t2[get_byte32(x[3], 16)] ^ t3[get_byte32(x[0], s3)] ^ k[ 1];
|
||||
y[2] = t0[get_byte32(x[2], 0)] ^ t1[get_byte32(x[3], s1)] ^ t2[get_byte32(x[0], 16)] ^ t3[get_byte32(x[1], s3)] ^ k[ 2];
|
||||
y[3] = t0[get_byte32(x[3], 0)] ^ t1[get_byte32(x[0], s1)] ^ t2[get_byte32(x[1], 16)] ^ t3[get_byte32(x[2], s3)] ^ k[ 3];
|
||||
|
||||
x[0] = t0[get_byte32(y[0], 0)] ^ t1[get_byte32(y[1], s1)] ^ t2[get_byte32(y[2], 16)] ^ t3[get_byte32(y[3], s3)] ^ k[ 4];
|
||||
x[1] = t0[get_byte32(y[1], 0)] ^ t1[get_byte32(y[2], s1)] ^ t2[get_byte32(y[3], 16)] ^ t3[get_byte32(y[0], s3)] ^ k[ 5];
|
||||
x[2] = t0[get_byte32(y[2], 0)] ^ t1[get_byte32(y[3], s1)] ^ t2[get_byte32(y[0], 16)] ^ t3[get_byte32(y[1], s3)] ^ k[ 6];
|
||||
x[3] = t0[get_byte32(y[3], 0)] ^ t1[get_byte32(y[0], s1)] ^ t2[get_byte32(y[1], 16)] ^ t3[get_byte32(y[2], s3)] ^ k[ 7];
|
||||
|
||||
y[0] = t0[get_byte32(x[0], 0)] ^ t1[get_byte32(x[1], s1)] ^ t2[get_byte32(x[2], 16)] ^ t3[get_byte32(x[3], s3)] ^ k[ 8];
|
||||
y[1] = t0[get_byte32(x[1], 0)] ^ t1[get_byte32(x[2], s1)] ^ t2[get_byte32(x[3], 16)] ^ t3[get_byte32(x[0], s3)] ^ k[ 9];
|
||||
y[2] = t0[get_byte32(x[2], 0)] ^ t1[get_byte32(x[3], s1)] ^ t2[get_byte32(x[0], 16)] ^ t3[get_byte32(x[1], s3)] ^ k[10];
|
||||
y[3] = t0[get_byte32(x[3], 0)] ^ t1[get_byte32(x[0], s1)] ^ t2[get_byte32(x[1], 16)] ^ t3[get_byte32(x[2], s3)] ^ k[11];
|
||||
|
||||
x[0] = t0[get_byte32(y[0], 0)] ^ t1[get_byte32(y[1], s1)] ^ t2[get_byte32(y[2], 16)] ^ t3[get_byte32(y[3], s3)] ^ k[12];
|
||||
x[1] = t0[get_byte32(y[1], 0)] ^ t1[get_byte32(y[2], s1)] ^ t2[get_byte32(y[3], 16)] ^ t3[get_byte32(y[0], s3)] ^ k[13];
|
||||
x[2] = t0[get_byte32(y[2], 0)] ^ t1[get_byte32(y[3], s1)] ^ t2[get_byte32(y[0], 16)] ^ t3[get_byte32(y[1], s3)] ^ k[14];
|
||||
x[3] = t0[get_byte32(y[3], 0)] ^ t1[get_byte32(y[0], s1)] ^ t2[get_byte32(y[1], 16)] ^ t3[get_byte32(y[2], s3)] ^ k[15];
|
||||
|
||||
*p = *(uint4*)(x);
|
||||
#endif
|
||||
}
|
||||
|
||||
*(__global uint4*)(s) = *(uint4*)(x);
|
||||
}
|
||||
)==="
|
||||
Reference in New Issue
Block a user