/* XMRig
 * Copyright 2010      Jeff Garzik
 * Copyright 2012-2014 pooler
 * Copyright 2014      Lucas Jones
 * Copyright 2014-2016 Wolf9466
 * Copyright 2016      Jay D Dee
 * Copyright 2017-2018 XMR-Stak
 * Copyright 2018-2020 SChernykh
 * Copyright 2016-2020 XMRig
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

#define STAGE1_SIZE 147253

#define COUNTING_SORT_BITS 11
#define COUNTING_SORT_SIZE (1 << COUNTING_SORT_BITS)

// Burrows-Wheeler transform of one input buffer per work-group.
// Suffixes are ordered by a two-pass counting sort on the top 2*COUNTING_SORT_BITS
// bits of a 64-bit key, followed by a fix-up pass for runs with equal key prefixes.
__attribute__((reqd_work_group_size(BWT_GROUP_SIZE, 1, 1)))
__kernel void BWT(__global uint8_t* datas, __global uint32_t* data_sizes, uint32_t data_stride, __global uint64_t* indices, __global uint64_t* tmp_indices)
{
    const uint32_t tid = get_local_id(0);
    const uint32_t gid = get_group_id(0);

    // Local histograms: counters[bucket][0] counts the lower 11-bit bucket,
    // counters[bucket][1] the upper 11-bit bucket of each sort key.
    __local int counters[COUNTING_SORT_SIZE][2];

    for (uint32_t i = tid; i < COUNTING_SORT_SIZE * 2; i += BWT_GROUP_SIZE)
        ((__local int*)counters)[i] = 0;

    barrier(CLK_LOCAL_MEM_FENCE);

    const uint64_t data_offset = (uint64_t)(gid) * data_stride;

    // Input data for this work-group; the first 128 bytes of each slot are skipped.
    __global uint8_t* input = datas + data_offset + 128;
    const uint32_t N = data_sizes[gid] + 1;

    __global uint64_t* p = (__global uint64_t*)(input);

    // Byte-addressable view of the local counters for the atomic updates below.
    volatile __local uint8_t* counters_atomic = (volatile __local uint8_t*)(counters);

    indices += data_offset;
    tmp_indices += data_offset;

    // Build a 64-bit sort key for every suffix i: the 8 bytes starting at input[i],
    // byte-swapped so that an integer compare matches a lexicographic compare.
    // The low 21 bits of indices[i] store the suffix index itself.
    for (uint32_t i = tid; i < N; i += BWT_GROUP_SIZE)
    {
        const uint32_t index = i >> 3;
        const uint32_t bit_offset = (i & 7) << 3;

        const uint64_t a = p[index];
        uint64_t b = p[index + 1];
        if (bit_offset == 0)
            b = 0;

        uint64_t value = (a >> bit_offset) | (b << (64 - bit_offset));

        // bswap64: reverse the bytes of each 32-bit half and swap the halves.
        uint2 tmp;
        const uchar4 mask = (uchar4)(3, 2, 1, 0);
        tmp.x = as_uint(shuffle(as_uchar4(as_uint2(value).y), mask));
        tmp.y = as_uint(shuffle(as_uchar4(as_uint2(value).x), mask));
        value = as_ulong(tmp);

        indices[i] = (value & ((uint64_t)(-1) << 21)) | i;

        // Histogram the lower and upper COUNTING_SORT_BITS buckets of the key.
        atomic_add((volatile __local int*)(counters_atomic + (((value >> (64 - COUNTING_SORT_BITS * 2)) & (COUNTING_SORT_SIZE - 1)) << 3)), 1);
        atomic_add((volatile __local int*)(counters_atomic + ((value >> (64 - COUNTING_SORT_BITS)) << 3) + 4), 1);
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    // Turn both histograms into prefix sums; counters[i][*] becomes the index
    // of the last output slot for bucket i.
    if (tid == 0)
    {
        int t0 = counters[0][0];
        int t1 = counters[0][1];
        counters[0][0] = t0 - 1;
        counters[0][1] = t1 - 1;

        for (uint32_t i = 1; i < COUNTING_SORT_SIZE; ++i)
        {
            t0 += counters[i][0];
            t1 += counters[i][1];
            counters[i][0] = t0 - 1;
            counters[i][1] = t1 - 1;
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    // Counting sort, pass 1: scatter by the lower 11-bit bucket.
    for (int i = tid; i < N; i += BWT_GROUP_SIZE)
    {
        const uint64_t data = indices[i];
        const int k = atomic_sub((volatile __local int*)(counters_atomic + (((data >> (64 - COUNTING_SORT_BITS * 2)) & (COUNTING_SORT_SIZE - 1)) << 3)), 1);
        tmp_indices[k] = data;
    }

    barrier(CLK_GLOBAL_MEM_FENCE);

    // Counting sort, pass 2: scatter by the upper 11-bit bucket.
    // Done by a single work-item, walking backwards, to keep the sort stable.
    if (tid == 0)
    {
        for (int i = N - 1; i >= 0; --i)
        {
            const uint64_t data = tmp_indices[i];
            const int k = atomic_sub((volatile __local int*)(counters_atomic + ((data >> (64 - COUNTING_SORT_BITS)) << 3) + 4), 1);
            indices[k] = data;
        }
    }

    barrier(CLK_GLOBAL_MEM_FENCE);

    const uint32_t N1 = N - 1;
    __global uint32_t* indices32 = (__global uint32_t*)(indices);

    // Fix-up pass: the counting sort only ordered the top 2*COUNTING_SORT_BITS key bits.
    // Find each run of entries whose top 22 bits are equal and sort it fully.
    for (uint32_t i = tid; i < N1; i += BWT_GROUP_SIZE)
    {
        const uint32_t value = indices32[i * 2 + 1] >> (32 - COUNTING_SORT_BITS * 2);

        if (value != (indices32[i * 2 + 3] >> (32 - COUNTING_SORT_BITS * 2)))
            continue;

        // Only the work-item at the start of a run processes it.
        if ((i > 0) && ((indices32[i * 2 - 1] >> (32 - COUNTING_SORT_BITS * 2)) == value))
            continue;

        uint32_t i1 = i + 2;
        while ((i1 < N) && ((indices32[i1 * 2 + 1] >> (32 - COUNTING_SORT_BITS * 2)) == value))
            ++i1;

        // Selection sort of the run [i, i1).
        for (uint32_t j = i; j < i1; ++j)
        {
            uint64_t t = indices[j];
            for (uint32_t k = j + 1; k < i1; ++k)
            {
                if (t > indices[k])
                {
                    const uint64_t t1 = t;
                    t = indices[k];
                    indices[k] = t1;
                }
            }
            indices[j] = t;
        }
    }

    barrier(CLK_GLOBAL_MEM_FENCE);

    // Emit the BWT: for every sorted suffix, output the byte that precedes it
    // (input is shifted back by one; the low 21 key bits hold the suffix index).
    --input;

    __global uint8_t* output = (__global uint8_t*)(tmp_indices);
    for (int i = tid; i <= N; i += BWT_GROUP_SIZE)
        output[i] = input[indices[i] & ((1 << 21) - 1)];
}

// Keep only the nonces whose stage 2 data fits into the BWT buffer and append
// 36-byte (nonce, 8-word hash) records to filtered_hashes; filtered_hashes[0] is the record count.
__kernel void filter(uint32_t nonce, uint32_t bwt_max_size, __global const uint32_t* hashes, __global uint32_t* filtered_hashes)
{
    const uint32_t global_id = get_global_id(0);

    __global const uint32_t* hash = hashes + global_id * (32 / sizeof(uint32_t));
    const uint32_t stage2_size = STAGE1_SIZE + (*hash & 0xfffff);

    if (stage2_size < bwt_max_size)
    {
        const int index = atomic_add((volatile __global int*)(filtered_hashes), 1) * (36 / sizeof(uint32_t)) + 1;

        filtered_hashes[index] = nonce + global_id;

        #pragma unroll 8
        for (uint32_t i = 0; i < 8; ++i)
            filtered_hashes[index + i + 1] = hash[i];
    }
}

// Pop the last get_global_size(0) filtered records: copy their hashes into `hashes`
// and store each record's stage 2 data size for the following BWT pass.
__kernel void prepare_batch2(__global uint32_t* hashes, __global uint32_t* filtered_hashes, __global uint32_t* data_sizes)
{
    const uint32_t global_id = get_global_id(0);
    const uint32_t N = filtered_hashes[0] - get_global_size(0);

    if (global_id == 0)
        filtered_hashes[0] = N;

    __global uint32_t* hash = hashes + global_id * 8;
    __global uint32_t* filtered_hash = filtered_hashes + (global_id + N) * 9 + 1;

    const uint32_t stage2_size = STAGE1_SIZE + (filtered_hash[1] & 0xfffff);
    data_sizes[global_id] = stage2_size;

    #pragma unroll 8
    for (uint32_t i = 0; i < 8; ++i)
        hash[i] = filtered_hash[i + 1];
}

// Compare each final hash with the job target and record the nonce of every share found.
// shares[0xFF] is the share counter, shares[0..0xFE] hold the nonces.
__kernel void find_shares(__global const uint64_t* hashes, __global const uint32_t* filtered_hashes, uint64_t target, __global uint32_t* shares)
{
    const uint32_t global_index = get_global_id(0);

    if (hashes[global_index * 4 + 3] < target)
    {
        const uint32_t idx = atomic_inc(shares + 0xFF);
        if (idx < 0xFF)
            shares[idx] = filtered_hashes[(filtered_hashes[0] + global_index) * 9 + 1];
    }
}
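/*
 * Rough host-side sketch (illustrative only, not part of this kernel file):
 * one plausible way to launch the BWT kernel with the standard OpenCL C API.
 * The `program`, `queue`, buffer handles, `batch_size` and the BWT_GROUP_SIZE
 * value are assumptions for the example, not definitions taken from this file.
 *
 *   cl_int err;
 *   cl_kernel k = clCreateKernel(program, "BWT", &err);
 *   clSetKernelArg(k, 0, sizeof(cl_mem),  &datas_buf);        // __global uint8_t*  datas
 *   clSetKernelArg(k, 1, sizeof(cl_mem),  &data_sizes_buf);   // __global uint32_t* data_sizes
 *   clSetKernelArg(k, 2, sizeof(cl_uint), &data_stride);      // uint32_t data_stride
 *   clSetKernelArg(k, 3, sizeof(cl_mem),  &indices_buf);      // __global uint64_t* indices
 *   clSetKernelArg(k, 4, sizeof(cl_mem),  &tmp_indices_buf);  // __global uint64_t* tmp_indices
 *
 *   // One work-group of BWT_GROUP_SIZE work-items per hash in the batch:
 *   // the kernel selects its input slot with get_group_id(0).
 *   const size_t local  = BWT_GROUP_SIZE;
 *   const size_t global = (size_t)batch_size * BWT_GROUP_SIZE;
 *   clEnqueueNDRangeKernel(queue, k, 1, NULL, &global, &local, 0, NULL, NULL);
 */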